Merge commit 'v2.6.28-rc2' into x86/pci-ioapic-boot-irq-quirks

author: Ingo Molnar <mingo@elte.hu> 2008-10-28 16:26:12 +0100
committer: Ingo Molnar <mingo@elte.hu> 2008-10-28 16:26:12 +0100
commit: 7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch)
tree: e730a4565e0318140d2fbd2f0415d18a339d7336 /arch/x86/kernel
parent: x86, pci: introduce config option for pci reroute quirks (was: [PATCH 0/3] Bo... (diff)
parent: Linux 2.6.28-rc2 (diff)
download: linux-7a9787e1eba95a166265e6a260cf30af04ef0a99.tar.xz
linux-7a9787e1eba95a166265e6a260cf30af04ef0a99.zip
167 files changed, 16356 insertions, 13064 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index da140611bb57..d7e5a58ee22f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -7,9 +7,10 @@ extra-y                := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
 CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
 
 ifdef CONFIG_FTRACE
-# Do not profile debug utilities
+# Do not profile debug and lowlevel utilities
 CFLAGS_REMOVE_tsc.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
+CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
 endif
 
 #
@@ -22,7 +23,7 @@ CFLAGS_hpet.o		:= $(nostackp)
 CFLAGS_tsc.o		:= $(nostackp)
 
 obj-y			:= process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
-obj-y			+= traps_$(BITS).o irq_$(BITS).o
+obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time_$(BITS).o ioport.o ldt.o
 obj-y			+= setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
 obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o
@@ -37,7 +38,7 @@ obj-y			+= tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
 obj-y				+= process.o
-obj-y				+= i387.o
+obj-y				+= i387.o xsave.o
 obj-y				+= ptrace.o
 obj-y				+= ds.o
 obj-$(CONFIG_X86_32)		+= tls.o
@@ -50,7 +51,6 @@ obj-$(CONFIG_X86_BIOS_REBOOT)	+= reboot.o
 obj-$(CONFIG_MCA)		+= mca_32.o
 obj-$(CONFIG_X86_MSR)		+= msr.o
 obj-$(CONFIG_X86_CPUID)		+= cpuid.o
-obj-$(CONFIG_MICROCODE)		+= microcode.o
 obj-$(CONFIG_PCI)		+= early-quirks.o
 apm-y				:= apm_32.o
 obj-$(CONFIG_APM)		+= apm.o
@@ -60,14 +60,15 @@ obj-$(CONFIG_X86_32_SMP)	+= smpcommon.o
 obj-$(CONFIG_X86_64_SMP)	+= tsc_sync.o smpcommon.o
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline_$(BITS).o
 obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC)	+= apic_$(BITS).o nmi.o
-obj-$(CONFIG_X86_IO_APIC)	+= io_apic_$(BITS).o
+obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
+obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
 obj-$(CONFIG_X86_REBOOTFIXUPS)	+= reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC)		+= relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump_$(BITS).o
 obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o
+obj-$(CONFIG_X86_ES7000)	+= es7000_32.o
 obj-$(CONFIG_X86_SUMMIT_NUMA)	+= summit_32.o
 obj-y				+= vsmp_64.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
@@ -88,7 +89,7 @@ obj-$(CONFIG_DEBUG_NX_TEST)	+= test_nx.o
 obj-$(CONFIG_VMI)		+= vmi_32.o vmiclock_32.o
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o
 obj-$(CONFIG_KVM_CLOCK)		+= kvmclock.o
-obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o
@@ -98,10 +99,18 @@ scx200-y			+= scx200_32.o
 
 obj-$(CONFIG_OLPC)		+= olpc.o
 
+microcode-y				:= microcode_core.o
+microcode-$(CONFIG_MICROCODE_INTEL)	+= microcode_intel.o
+microcode-$(CONFIG_MICROCODE_AMD)	+= microcode_amd.o
+obj-$(CONFIG_MICROCODE)			+= microcode.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
         obj-y				+= genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
+	obj-y				+= bios_uv.o uv_irq.o uv_sysfs.o
+        obj-y				+= genx2apic_cluster.o
+        obj-y				+= genx2apic_phys.o
         obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer_64.o
         obj-$(CONFIG_AUDIT)		+= audit_64.o
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index f489d7a9be92..8c1f76abae9e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -58,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled);
 #ifdef	CONFIG_X86_64
 
 #include <asm/proto.h>
-#include <asm/genapic.h>
 
 #else				/* X86 */
 
@@ -154,10 +153,21 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
 }
 
 #ifdef CONFIG_PCI_MMCONFIG
+
+static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
+
 /* The physical address of the MMCONFIG aperture.  Set from ACPI tables. */
 struct acpi_mcfg_allocation *pci_mmcfg_config;
 int pci_mmcfg_config_num;
 
+static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
+{
+	if (!strcmp(mcfg->header.oem_id, "SGI"))
+		acpi_mcfg_64bit_base_addr = TRUE;
+
+	return 0;
+}
+
 int __init acpi_parse_mcfg(struct acpi_table_header *header)
 {
 	struct acpi_table_mcfg *mcfg;
@@ -190,8 +200,12 @@ int __init acpi_parse_mcfg(struct acpi_table_header *header)
 	}
 
 	memcpy(pci_mmcfg_config, &mcfg[1], config_size);
+
+	acpi_mcfg_oem_check(mcfg);
+
 	for (i = 0; i < pci_mmcfg_config_num; ++i) {
-		if (pci_mmcfg_config[i].address > 0xFFFFFFFF) {
+		if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
+		    !acpi_mcfg_64bit_base_addr) {
 			printk(KERN_ERR PREFIX
 			       "MMCONFIG not in low 4GB of memory\n");
 			kfree(pci_mmcfg_config);
@@ -239,10 +253,8 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
 		return;
 	}
 
-#ifdef CONFIG_X86_32
 	if (boot_cpu_physical_apicid != -1U)
 		ver = apic_version[boot_cpu_physical_apicid];
-#endif
 
 	generic_processor_info(id, ver);
 }
@@ -761,11 +773,9 @@ static void __init acpi_register_lapic_address(unsigned long address)
 
 	set_fixmap_nocache(FIX_APIC_BASE, address);
 	if (boot_cpu_physical_apicid == -1U) {
-		boot_cpu_physical_apicid  = GET_APIC_ID(read_apic_id());
-#ifdef CONFIG_X86_32
+		boot_cpu_physical_apicid  = read_apic_id();
 		apic_version[boot_cpu_physical_apicid] =
 			 GET_APIC_VERSION(apic_read(APIC_LVR));
-#endif
 	}
 }
 
@@ -1021,7 +1031,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 	mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
 #endif
 	set_bit(MP_ISA_BUS, mp_bus_not_pci);
-	Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
+	pr_debug("Bus #%d is ISA\n", MP_ISA_BUS);
 
 #ifdef CONFIG_X86_ES7000
 	/*
@@ -1127,8 +1137,8 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
 		return gsi;
 	}
 	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
-		Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
-			mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+		pr_debug("Pin %d-%d already programmed\n",
+			 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
 #ifdef CONFIG_X86_32
 		return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
 #else
@@ -1247,7 +1257,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 
 	count =
 	    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
-				  NR_IRQ_VECTORS);
+				  nr_irqs);
 	if (count < 0) {
 		printk(KERN_ERR PREFIX
 		       "Error parsing interrupt source overrides entry\n");
@@ -1267,7 +1277,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 
 	count =
 	    acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
-				  NR_IRQ_VECTORS);
+				  nr_irqs);
 	if (count < 0) {
 		printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
 		/* TBD: Cleanup to allow fallback to MPS */
@@ -1337,7 +1347,9 @@ static void __init acpi_process_madt(void)
 				acpi_ioapic = 1;
 
 				smp_found_config = 1;
+#ifdef CONFIG_X86_32
 				setup_apic_routing();
+#endif
 			}
 		}
 		if (error == -EINVAL) {
@@ -1407,8 +1419,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
  */
 static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
 {
-	pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident);
-	acpi_skip_timer_override = 1;
+	/*
+	 * The ati_ixp4x0_rev() early PCI quirk should have set
+	 * the acpi_skip_timer_override flag already:
+	 */
+	if (!acpi_skip_timer_override) {
+		WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
+		pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
+			d->ident);
+		acpi_skip_timer_override = 1;
+	}
 	return 0;
 }
 
@@ -1579,6 +1599,11 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
 		     DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
 		     },
 	 },
+	{}
+};
+
+/* second table for DMI checks that should run after early-quirks */
+static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
 	/*
 	 * HP laptops which use a DSDT reporting as HP/SB400/10000,
 	 * which includes some code which overrides all temperature
@@ -1591,6 +1616,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
 	 */
 	{
 	 .callback = dmi_ignore_irq0_timer_override,
+	 .ident = "HP nx6115 laptop",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"),
+		     },
+	 },
+	{
+	 .callback = dmi_ignore_irq0_timer_override,
 	 .ident = "HP NX6125 laptop",
 	 .matches = {
 		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
@@ -1605,6 +1638,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
 		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
 		     },
 	 },
+	{
+	 .callback = dmi_ignore_irq0_timer_override,
+	 .ident = "HP 6715b laptop",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
+		     },
+	 },
 	{}
 };
 
@@ -1691,6 +1732,9 @@ int __init early_acpi_boot_init(void)
 
 int __init acpi_boot_init(void)
 {
+	/* those are executed after early-quirks are executed */
+	dmi_check_system(acpi_dmi_table_late);
+
 	/*
 	 * If acpi_disabled, bail out
 	 * One exception: acpi=ht continues far enough to enumerate LAPICs
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 868de3d5c39d..806b4e9051b4 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -9,6 +9,8 @@
 #include <linux/bootmem.h>
 #include <linux/dmi.h>
 #include <linux/cpumask.h>
+#include <asm/segment.h>
+#include <asm/desc.h>
 
 #include "realmode/wakeup.h"
 #include "sleep.h"
@@ -19,19 +21,10 @@ unsigned long acpi_realmode_flags;
 /* address in low memory of the wakeup routine. */
 static unsigned long acpi_realmode;
 
-#ifdef CONFIG_64BIT
-static char temp_stack[10240];
+#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
+static char temp_stack[4096];
 #endif
 
-/* XXX: this macro should move to asm-x86/segment.h and be shared with the
-   boot code... */
-#define GDT_ENTRY(flags, base, limit)		\
-	(((u64)(base & 0xff000000) << 32) |	\
-	 ((u64)flags << 40) |			\
-	 ((u64)(limit & 0x00ff0000) << 32) |	\
-	 ((u64)(base & 0x00ffffff) << 16) |	\
-	 ((u64)(limit & 0x0000ffff)))
-
 /**
  * acpi_save_state_mem - save kernel state
  *
@@ -94,7 +87,7 @@ int acpi_save_state_mem(void)
 #endif /* !CONFIG_64BIT */
 
 	header->pmode_cr0 = read_cr0();
-	header->pmode_cr4 = read_cr4();
+	header->pmode_cr4 = read_cr4_safe();
 	header->realmode_flags = acpi_realmode_flags;
 	header->real_magic = 0x12345678;
 
@@ -105,7 +98,9 @@ int acpi_save_state_mem(void)
 #else /* CONFIG_64BIT */
 	header->trampoline_segment = setup_trampoline() >> 4;
 #ifdef CONFIG_SMP
-	stack_start.sp = temp_stack + 4096;
+	stack_start.sp = temp_stack + sizeof(temp_stack);
+	early_gdt_descr.address =
+			(unsigned long)get_cpu_gdt_table(smp_processor_id());
 #endif
 	initial_code = (unsigned long)wakeup_long64;
 	saved_magic = 0x123456789abcdef0;
@@ -158,6 +153,10 @@ static int __init acpi_sleep_setup(char *str)
 			acpi_realmode_flags |= 2;
 		if (strncmp(str, "s3_beep", 7) == 0)
 			acpi_realmode_flags |= 4;
+#ifdef CONFIG_HIBERNATION
+		if (strncmp(str, "s4_nohwsig", 10) == 0)
+			acpi_no_s4_hw_signature();
+#endif
 		if (strncmp(str, "old_ordering", 12) == 0)
 			acpi_old_suspend_ordering();
 		str = strchr(str, ',');
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2763cb37b553..a84ac7b570e6 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -145,35 +145,25 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
 extern char __vsyscall_0;
 const unsigned char *const *find_nop_table(void)
 {
-	return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
-	       boot_cpu_data.x86 < 6 ? k8_nops : p6_nops;
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
+	    boot_cpu_has(X86_FEATURE_NOPL))
+		return p6_nops;
+	else
+		return k8_nops;
 }
 
 #else /* CONFIG_X86_64 */
 
-static const struct nop {
-	int cpuid;
-	const unsigned char *const *noptable;
-} noptypes[] = {
-	{ X86_FEATURE_K8, k8_nops },
-	{ X86_FEATURE_K7, k7_nops },
-	{ X86_FEATURE_P4, p6_nops },
-	{ X86_FEATURE_P3, p6_nops },
-	{ -1, NULL }
-};
-
 const unsigned char *const *find_nop_table(void)
 {
-	const unsigned char *const *noptable = intel_nops;
-	int i;
-
-	for (i = 0; noptypes[i].cpuid >= 0; i++) {
-		if (boot_cpu_has(noptypes[i].cpuid)) {
-			noptable = noptypes[i].noptable;
-			break;
-		}
-	}
-	return noptable;
+	if (boot_cpu_has(X86_FEATURE_K8))
+		return k8_nops;
+	else if (boot_cpu_has(X86_FEATURE_K7))
+		return k7_nops;
+	else if (boot_cpu_has(X86_FEATURE_NOPL))
+		return p6_nops;
+	else
+		return intel_nops;
 }
 
 #endif /* CONFIG_X86_64 */
@@ -241,25 +231,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 			continue;
 		if (*ptr > text_end)
 			continue;
-		text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */
+		/* turn DS segment override prefix into lock prefix */
+		text_poke(*ptr, ((unsigned char []){0xf0}), 1);
 	};
 }
 
 static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 {
 	u8 **ptr;
-	char insn[1];
 
 	if (noreplace_smp)
 		return;
 
-	add_nops(insn, 1);
 	for (ptr = start; ptr < end; ptr++) {
 		if (*ptr < text)
 			continue;
 		if (*ptr > text_end)
 			continue;
-		text_poke(*ptr, insn, 1);
+		/* turn lock prefix into DS segment override prefix */
+		text_poke(*ptr, ((unsigned char []){0x3E}), 1);
 	};
 }
 
@@ -454,7 +444,7 @@ void __init alternative_instructions(void)
 					    _text, _etext);
 
 		/* Only switch to UP mode if we don't immediately boot others */
-		if (num_possible_cpus() == 1 || setup_max_cpus <= 1)
+		if (num_present_cpus() == 1 || setup_max_cpus <= 1)
 			alternatives_smp_switch(0);
 	}
 #endif
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f2766d84c7a0..a8fd9ebdc8e2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -23,36 +23,149 @@
 #include <linux/scatterlist.h>
 #include <linux/iommu-helper.h>
 #include <asm/proto.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/amd_iommu_types.h>
 #include <asm/amd_iommu.h>
 
 #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
 
-#define to_pages(addr, size) \
-	 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
+#define EXIT_LOOP_COUNT 10000000
 
 static DEFINE_RWLOCK(amd_iommu_devtable_lock);
 
-struct command {
+/* A list of preallocated protection domains */
+static LIST_HEAD(iommu_pd_list);
+static DEFINE_SPINLOCK(iommu_pd_list_lock);
+
+/*
+ * general struct to manage commands send to an IOMMU
+ */
+struct iommu_cmd {
 	u32 data[4];
 };
 
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 			     struct unity_map_entry *e);
 
+/* returns !0 if the IOMMU is caching non-present entries in its TLB */
 static int iommu_has_npcache(struct amd_iommu *iommu)
 {
 	return iommu->cap & IOMMU_CAP_NPCACHE;
 }
 
-static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+/****************************************************************************
+ *
+ * Interrupt handling functions
+ *
+ ****************************************************************************/
+
+static void iommu_print_event(void *__evt)
+{
+	u32 *event = __evt;
+	int type  = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
+	int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
+	int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
+	int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
+	u64 address = (u64)(((u64)event[3]) << 32) | event[2];
+
+	printk(KERN_ERR "AMD IOMMU: Event logged [");
+
+	switch (type) {
+	case EVENT_TYPE_ILL_DEV:
+		printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
+		       "address=0x%016llx flags=0x%04x]\n",
+		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+		       address, flags);
+		break;
+	case EVENT_TYPE_IO_FAULT:
+		printk("IO_PAGE_FAULT device=%02x:%02x.%x "
+		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
+		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+		       domid, address, flags);
+		break;
+	case EVENT_TYPE_DEV_TAB_ERR:
+		printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
+		       "address=0x%016llx flags=0x%04x]\n",
+		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+		       address, flags);
+		break;
+	case EVENT_TYPE_PAGE_TAB_ERR:
+		printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
+		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
+		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+		       domid, address, flags);
+		break;
+	case EVENT_TYPE_ILL_CMD:
+		printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
+		break;
+	case EVENT_TYPE_CMD_HARD_ERR:
+		printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
+		       "flags=0x%04x]\n", address, flags);
+		break;
+	case EVENT_TYPE_IOTLB_INV_TO:
+		printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
+		       "address=0x%016llx]\n",
+		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+		       address);
+		break;
+	case EVENT_TYPE_INV_DEV_REQ:
+		printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
+		       "address=0x%016llx flags=0x%04x]\n",
+		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+		       address, flags);
+		break;
+	default:
+		printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
+	}
+}
+
+static void iommu_poll_events(struct amd_iommu *iommu)
+{
+	u32 head, tail;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+
+	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
+
+	while (head != tail) {
+		iommu_print_event(iommu->evt_buf + head);
+		head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
+	}
+
+	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+
+	spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+irqreturn_t amd_iommu_int_handler(int irq, void *data)
+{
+	struct amd_iommu *iommu;
+
+	list_for_each_entry(iommu, &amd_iommu_list, list)
+		iommu_poll_events(iommu);
+
+	return IRQ_HANDLED;
+}
+
+/****************************************************************************
+ *
+ * IOMMU command queuing functions
+ *
+ ****************************************************************************/
+
+/*
+ * Writes the command to the IOMMUs command buffer and informs the
+ * hardware about the new command. Must be called with iommu->lock held.
+ */
+static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
 {
 	u32 tail, head;
 	u8 *target;
 
 	tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
-	target = (iommu->cmd_buf + tail);
+	target = iommu->cmd_buf + tail;
 	memcpy_toio(target, cmd, sizeof(*cmd));
 	tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
 	head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
@@ -63,7 +176,11 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
 	return 0;
 }
 
-static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
+/*
+ * General queuing function for commands. Takes iommu->lock and calls
+ * __iommu_queue_command().
+ */
+static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
 {
 	unsigned long flags;
 	int ret;
@@ -75,35 +192,59 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
 	return ret;
 }
 
+/*
+ * This function is called whenever we need to ensure that the IOMMU has
+ * completed execution of all commands we sent. It sends a
+ * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
+ * us about that by writing a value to a physical address we pass with
+ * the command.
+ */
 static int iommu_completion_wait(struct amd_iommu *iommu)
 {
-	int ret;
-	struct command cmd;
-	volatile u64 ready = 0;
-	unsigned long ready_phys = virt_to_phys(&ready);
+	int ret = 0, ready = 0;
+	unsigned status = 0;
+	struct iommu_cmd cmd;
+	unsigned long flags, i = 0;
 
 	memset(&cmd, 0, sizeof(cmd));
-	cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK;
-	cmd.data[1] = HIGH_U32(ready_phys);
-	cmd.data[2] = 1; /* value written to 'ready' */
+	cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
 	CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
 
 	iommu->need_sync = 0;
 
-	ret = iommu_queue_command(iommu, &cmd);
+	spin_lock_irqsave(&iommu->lock, flags);
+
+	ret = __iommu_queue_command(iommu, &cmd);
 
 	if (ret)
-		return ret;
+		goto out;
+
+	while (!ready && (i < EXIT_LOOP_COUNT)) {
+		++i;
+		/* wait for the bit to become one */
+		status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
+		ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
+	}
+
+	/* set bit back to zero */
+	status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
+	writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
 
-	while (!ready)
-		cpu_relax();
+	if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
+		printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
+out:
+	spin_unlock_irqrestore(&iommu->lock, flags);
 
 	return 0;
 }
 
+/*
+ * Command send function for invalidating a device table entry
+ */
 static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
 {
-	struct command cmd;
+	struct iommu_cmd cmd;
+	int ret;
 
 	BUG_ON(iommu == NULL);
 
@@ -111,37 +252,50 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
 	CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
 	cmd.data[0] = devid;
 
+	ret = iommu_queue_command(iommu, &cmd);
+
 	iommu->need_sync = 1;
 
-	return iommu_queue_command(iommu, &cmd);
+	return ret;
 }
 
+/*
+ * Generic command send function for invalidaing TLB entries
+ */
 static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
 		u64 address, u16 domid, int pde, int s)
 {
-	struct command cmd;
+	struct iommu_cmd cmd;
+	int ret;
 
 	memset(&cmd, 0, sizeof(cmd));
 	address &= PAGE_MASK;
 	CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
 	cmd.data[1] |= domid;
-	cmd.data[2] = LOW_U32(address);
-	cmd.data[3] = HIGH_U32(address);
-	if (s)
+	cmd.data[2] = lower_32_bits(address);
+	cmd.data[3] = upper_32_bits(address);
+	if (s) /* size bit - we flush more than one 4kb page */
 		cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
-	if (pde)
+	if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
 		cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
 
+	ret = iommu_queue_command(iommu, &cmd);
+
 	iommu->need_sync = 1;
 
-	return iommu_queue_command(iommu, &cmd);
+	return ret;
 }
 
+/*
+ * TLB invalidation function which is called from the mapping functions.
+ * It invalidates a single PTE if the range to flush is within a single
+ * page. Otherwise it flushes the whole TLB of the IOMMU.
+ */
 static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
 		u64 address, size_t size)
 {
 	int s = 0;
-	unsigned pages = to_pages(address, size);
+	unsigned pages = iommu_num_pages(address, size, PAGE_SIZE);
 
 	address &= PAGE_MASK;
 
@@ -159,6 +313,28 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
 	return 0;
 }
 
+/* Flush the whole IO/TLB for a given protection domain */
+static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
+{
+	u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+
+	iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
+}
+
+/****************************************************************************
+ *
+ * The functions below are used the create the page table mappings for
+ * unity mapped regions.
+ *
+ ****************************************************************************/
+
+/*
+ * Generic mapping functions. It maps a physical address into a DMA
+ * address space. It allocates the page table pages if necessary.
+ * In the future it can be extended to a generic mapping function
+ * supporting all features of AMD IOMMU page tables like level skipping
+ * and full 64 bit address spaces.
+ */
 static int iommu_map(struct protection_domain *dom,
 		     unsigned long bus_addr,
 		     unsigned long phys_addr,
@@ -209,6 +385,10 @@ static int iommu_map(struct protection_domain *dom,
 	return 0;
 }
 
+/*
+ * This function checks if a specific unity mapping entry is needed for
+ * this specific IOMMU.
+ */
 static int iommu_for_unity_map(struct amd_iommu *iommu,
 			       struct unity_map_entry *entry)
 {
@@ -223,6 +403,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
 	return 0;
 }
 
+/*
+ * Init the unity mappings for a specific IOMMU in the system
+ *
+ * Basically iterates over all unity mapping entries and applies them to
+ * the default domain DMA of that IOMMU if necessary.
+ */
 static int iommu_init_unity_mappings(struct amd_iommu *iommu)
 {
 	struct unity_map_entry *entry;
@@ -239,6 +425,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu)
 	return 0;
 }
 
+/*
+ * This function actually applies the mapping to the page table of the
+ * dma_ops domain.
+ */
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 			     struct unity_map_entry *e)
 {
@@ -261,6 +451,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 	return 0;
 }
 
+/*
+ * Inits the unity mappings required for a specific device
+ */
 static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
 					  u16 devid)
 {
@@ -278,33 +471,48 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
 	return 0;
 }
 
-static unsigned long dma_mask_to_pages(unsigned long mask)
-{
-	return (mask >> PAGE_SHIFT) +
-		(PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
-}
+/****************************************************************************
+ *
+ * The next functions belong to the address allocator for the dma_ops
+ * interface functions. They work like the allocators in the other IOMMU
+ * drivers. Its basically a bitmap which marks the allocated pages in
+ * the aperture. Maybe it could be enhanced in the future to a more
+ * efficient allocator.
+ *
+ ****************************************************************************/
 
+/*
+ * The address allocator core function.
+ *
+ * called with domain->lock held
+ */
 static unsigned long dma_ops_alloc_addresses(struct device *dev,
 					     struct dma_ops_domain *dom,
-					     unsigned int pages)
+					     unsigned int pages,
+					     unsigned long align_mask,
+					     u64 dma_mask)
 {
-	unsigned long limit = dma_mask_to_pages(*dev->dma_mask);
+	unsigned long limit;
 	unsigned long address;
-	unsigned long size = dom->aperture_size >> PAGE_SHIFT;
 	unsigned long boundary_size;
 
 	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
 			PAGE_SIZE) >> PAGE_SHIFT;
-	limit = limit < size ? limit : size;
+	limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
+				       dma_mask >> PAGE_SHIFT);
 
-	if (dom->next_bit >= limit)
+	if (dom->next_bit >= limit) {
 		dom->next_bit = 0;
+		dom->need_flush = true;
+	}
 
 	address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
-			0 , boundary_size, 0);
-	if (address == -1)
+				   0 , boundary_size, align_mask);
+	if (address == -1) {
 		address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
-				0, boundary_size, 0);
+				0, boundary_size, align_mask);
+		dom->need_flush = true;
+	}
 
 	if (likely(address != -1)) {
 		dom->next_bit = address + pages;
@@ -317,6 +525,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
 	return address;
 }
 
+/*
+ * The address free function.
+ *
+ * called with domain->lock held
+ */
 static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 				   unsigned long address,
 				   unsigned int pages)
@@ -325,6 +538,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 	iommu_area_free(dom->bitmap, address, pages);
 }
 
+/****************************************************************************
+ *
+ * The next functions belong to the domain allocation. A domain is
+ * allocated for every IOMMU as the default domain. If device isolation
+ * is enabled, every device get its own domain. The most important thing
+ * about domains is the page table mapping the DMA address space they
+ * contain.
+ *
+ ****************************************************************************/
+
 static u16 domain_id_alloc(void)
 {
 	unsigned long flags;
@@ -342,6 +565,10 @@ static u16 domain_id_alloc(void)
 	return id;
 }
 
+/*
+ * Used to reserve address ranges in the aperture (e.g. for exclusion
+ * ranges.
+ */
 static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 				      unsigned long start_page,
 				      unsigned int pages)
@@ -351,7 +578,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 	if (start_page + pages > last_page)
 		pages = last_page - start_page;
 
-	set_bit_string(dom->bitmap, start_page, pages);
+	iommu_area_reserve(dom->bitmap, start_page, pages);
 }
 
 static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
@@ -382,6 +609,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
 	free_page((unsigned long)p1);
 }
 
+/*
+ * Free a domain, only used if something went wrong in the
+ * allocation path and we need to free an already allocated page table
+ */
 static void dma_ops_domain_free(struct dma_ops_domain *dom)
 {
 	if (!dom)
@@ -396,6 +627,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
 	kfree(dom);
 }
 
+/*
+ * Allocates a new protection domain usable for the dma_ops functions.
+ * It also intializes the page table and the address allocator data
+ * structures required for the dma_ops interface
+ */
 static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 						   unsigned order)
 {
@@ -436,14 +672,24 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->bitmap[0] = 1;
 	dma_dom->next_bit = 0;
 
+	dma_dom->need_flush = false;
+	dma_dom->target_dev = 0xffff;
+
+	/* Intialize the exclusion range if necessary */
 	if (iommu->exclusion_start &&
 	    iommu->exclusion_start < dma_dom->aperture_size) {
 		unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
-		int pages = to_pages(iommu->exclusion_start,
-				iommu->exclusion_length);
+		int pages = iommu_num_pages(iommu->exclusion_start,
+					    iommu->exclusion_length,
+					    PAGE_SIZE);
 		dma_ops_reserve_addresses(dma_dom, startpage, pages);
 	}
 
+	/*
+	 * At the last step, build the page tables so we don't need to
+	 * allocate page table pages in the dma_ops mapping/unmapping
+	 * path.
+	 */
 	num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
 	dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
 			GFP_KERNEL);
@@ -472,6 +718,10 @@ free_dma_dom:
 	return NULL;
 }
 
+/*
+ * Find out the protection domain structure for a given PCI device. This
+ * will give us the pointer to the page table root for example.
+ */
 static struct protection_domain *domain_for_device(u16 devid)
 {
 	struct protection_domain *dom;
@@ -484,6 +734,10 @@ static struct protection_domain *domain_for_device(u16 devid)
 	return dom;
 }
 
+/*
+ * If a device is not yet associated with a domain, this function does
+ * assigns it visible for the hardware
+ */
 static void set_device_domain(struct amd_iommu *iommu,
 			      struct protection_domain *domain,
 			      u16 devid)
@@ -492,12 +746,13 @@ static void set_device_domain(struct amd_iommu *iommu,
 
 	u64 pte_root = virt_to_phys(domain->pt_root);
 
-	pte_root |= (domain->mode & 0x07) << 9;
-	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2;
+	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
+		    << DEV_ENTRY_MODE_SHIFT;
+	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
 
 	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	amd_iommu_dev_table[devid].data[0] = pte_root;
-	amd_iommu_dev_table[devid].data[1] = pte_root >> 32;
+	amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
+	amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
 	amd_iommu_dev_table[devid].data[2] = domain->id;
 
 	amd_iommu_pd_table[devid] = domain;
@@ -508,6 +763,58 @@ static void set_device_domain(struct amd_iommu *iommu,
 	iommu->need_sync = 1;
 }
 
+/*****************************************************************************
+ *
+ * The next functions belong to the dma_ops mapping/unmapping code.
+ *
+ *****************************************************************************/
+
+/*
+ * This function checks if the driver got a valid device from the caller to
+ * avoid dereferencing invalid pointers.
+ */
+static bool check_device(struct device *dev)
+{
+	if (!dev || !dev->dma_mask)
+		return false;
+
+	return true;
+}
+
+/*
+ * In this function the list of preallocated protection domains is traversed to
+ * find the domain for a specific device
+ */
+static struct dma_ops_domain *find_protection_domain(u16 devid)
+{
+	struct dma_ops_domain *entry, *ret = NULL;
+	unsigned long flags;
+
+	if (list_empty(&iommu_pd_list))
+		return NULL;
+
+	spin_lock_irqsave(&iommu_pd_list_lock, flags);
+
+	list_for_each_entry(entry, &iommu_pd_list, list) {
+		if (entry->target_dev == devid) {
+			ret = entry;
+			list_del(&ret->list);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
+
+	return ret;
+}
+
+/*
+ * In the dma_ops path we only have the struct device. This function
+ * finds the corresponding IOMMU, the protection domain and the
+ * requestor id for a given device.
+ * If the device is not yet associated with a domain this is also done
+ * in this function.
+ */
 static int get_device_resources(struct device *dev,
 				struct amd_iommu **iommu,
 				struct protection_domain **domain,
@@ -517,26 +824,30 @@ static int get_device_resources(struct device *dev,
 	struct pci_dev *pcidev;
 	u16 _bdf;
 
-	BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
+	*iommu = NULL;
+	*domain = NULL;
+	*bdf = 0xffff;
+
+	if (dev->bus != &pci_bus_type)
+		return 0;
 
 	pcidev = to_pci_dev(dev);
-	_bdf = (pcidev->bus->number << 8) | pcidev->devfn;
+	_bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
 
-	if (_bdf >= amd_iommu_last_bdf) {
-		*iommu = NULL;
-		*domain = NULL;
-		*bdf = 0xffff;
+	/* device not translated by any IOMMU in the system? */
+	if (_bdf > amd_iommu_last_bdf)
 		return 0;
-	}
 
 	*bdf = amd_iommu_alias_table[_bdf];
 
 	*iommu = amd_iommu_rlookup_table[*bdf];
 	if (*iommu == NULL)
 		return 0;
-	dma_dom = (*iommu)->default_dom;
 	*domain = domain_for_device(*bdf);
 	if (*domain == NULL) {
+		dma_dom = find_protection_domain(*bdf);
+		if (!dma_dom)
+			dma_dom = (*iommu)->default_dom;
 		*domain = &dma_dom->domain;
 		set_device_domain(*iommu, *domain, *bdf);
 		printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
@@ -547,6 +858,10 @@ static int get_device_resources(struct device *dev,
 	return 1;
 }
 
+/*
+ * This is the generic map function. It maps one 4kb page at paddr to
+ * the given address in the DMA address space for the domain.
+ */
 static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
 				     struct dma_ops_domain *dom,
 				     unsigned long address,
@@ -578,6 +893,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
 	return (dma_addr_t)address;
 }
 
+/*
+ * The generic unmapping function for on page in the DMA address space.
+ */
 static void dma_ops_domain_unmap(struct amd_iommu *iommu,
 				 struct dma_ops_domain *dom,
 				 unsigned long address)
@@ -597,22 +915,35 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
 	*pte = 0ULL;
 }
 
+/*
+ * This function contains common code for mapping of a physically
+ * contiguous memory region into DMA address space. It is uses by all
+ * mapping functions provided by this IOMMU driver.
+ * Must be called with the domain lock held.
+ */
 static dma_addr_t __map_single(struct device *dev,
 			       struct amd_iommu *iommu,
 			       struct dma_ops_domain *dma_dom,
 			       phys_addr_t paddr,
 			       size_t size,
-			       int dir)
+			       int dir,
+			       bool align,
+			       u64 dma_mask)
 {
 	dma_addr_t offset = paddr & ~PAGE_MASK;
 	dma_addr_t address, start;
 	unsigned int pages;
+	unsigned long align_mask = 0;
 	int i;
 
-	pages = to_pages(paddr, size);
+	pages = iommu_num_pages(paddr, size, PAGE_SIZE);
 	paddr &= PAGE_MASK;
 
-	address = dma_ops_alloc_addresses(dev, dma_dom, pages);
+	if (align)
+		align_mask = (1UL << get_order(size)) - 1;
+
+	address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
+					  dma_mask);
 	if (unlikely(address == bad_dma_address))
 		goto out;
 
@@ -624,10 +955,20 @@ static dma_addr_t __map_single(struct device *dev,
 	}
 	address += offset;
 
+	if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
+		iommu_flush_tlb(iommu, dma_dom->domain.id);
+		dma_dom->need_flush = false;
+	} else if (unlikely(iommu_has_npcache(iommu)))
+		iommu_flush_pages(iommu, dma_dom->domain.id, address, size);
+
 out:
 	return address;
 }
 
+/*
+ * Does the reverse of the __map_single function. Must be called with
+ * the domain lock held too
+ */
 static void __unmap_single(struct amd_iommu *iommu,
 			   struct dma_ops_domain *dma_dom,
 			   dma_addr_t dma_addr,
@@ -640,7 +981,7 @@ static void __unmap_single(struct amd_iommu *iommu,
 	if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
 		return;
 
-	pages = to_pages(dma_addr, size);
+	pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
 	dma_addr &= PAGE_MASK;
 	start = dma_addr;
 
@@ -650,8 +991,14 @@ static void __unmap_single(struct amd_iommu *iommu,
 	}
 
 	dma_ops_free_addresses(dma_dom, dma_addr, pages);
+
+	if (amd_iommu_unmap_flush)
+		iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
 }
 
+/*
+ * The exported map_single function for dma_ops.
+ */
 static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
 			     size_t size, int dir)
 {
@@ -660,21 +1007,26 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
 	struct protection_domain *domain;
 	u16 devid;
 	dma_addr_t addr;
+	u64 dma_mask;
+
+	if (!check_device(dev))
+		return bad_dma_address;
+
+	dma_mask = *dev->dma_mask;
 
 	get_device_resources(dev, &iommu, &domain, &devid);
 
 	if (iommu == NULL || domain == NULL)
+		/* device not handled by any AMD IOMMU */
 		return (dma_addr_t)paddr;
 
 	spin_lock_irqsave(&domain->lock, flags);
-	addr = __map_single(dev, iommu, domain->priv, paddr, size, dir);
+	addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
+			    dma_mask);
 	if (addr == bad_dma_address)
 		goto out;
 
-	if (iommu_has_npcache(iommu))
-		iommu_flush_pages(iommu, domain->id, addr, size);
-
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 out:
@@ -683,6 +1035,9 @@ out:
 	return addr;
 }
 
+/*
+ * The exported unmap_single function for dma_ops.
+ */
 static void unmap_single(struct device *dev, dma_addr_t dma_addr,
 			 size_t size, int dir)
 {
@@ -691,21 +1046,25 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
 	struct protection_domain *domain;
 	u16 devid;
 
-	if (!get_device_resources(dev, &iommu, &domain, &devid))
+	if (!check_device(dev) ||
+	    !get_device_resources(dev, &iommu, &domain, &devid))
+		/* device not handled by any AMD IOMMU */
 		return;
 
 	spin_lock_irqsave(&domain->lock, flags);
 
 	__unmap_single(iommu, domain->priv, dma_addr, size, dir);
 
-	iommu_flush_pages(iommu, domain->id, dma_addr, size);
-
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 	spin_unlock_irqrestore(&domain->lock, flags);
 }
 
+/*
+ * This is a special map_sg function which is used if we should map a
+ * device which is not handled by an AMD IOMMU in the system.
+ */
 static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
 			   int nelems, int dir)
 {
@@ -720,6 +1079,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
 	return nelems;
 }
 
+/*
+ * The exported map_sg function for dma_ops (handles scatter-gather
+ * lists).
+ */
 static int map_sg(struct device *dev, struct scatterlist *sglist,
 		  int nelems, int dir)
 {
@@ -731,6 +1094,12 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
 	struct scatterlist *s;
 	phys_addr_t paddr;
 	int mapped_elems = 0;
+	u64 dma_mask;
+
+	if (!check_device(dev))
+		return 0;
+
+	dma_mask = *dev->dma_mask;
 
 	get_device_resources(dev, &iommu, &domain, &devid);
 
@@ -743,19 +1112,17 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
 		paddr = sg_phys(s);
 
 		s->dma_address = __map_single(dev, iommu, domain->priv,
-					      paddr, s->length, dir);
+					      paddr, s->length, dir, false,
+					      dma_mask);
 
 		if (s->dma_address) {
 			s->dma_length = s->length;
 			mapped_elems++;
 		} else
 			goto unmap;
-		if (iommu_has_npcache(iommu))
-			iommu_flush_pages(iommu, domain->id, s->dma_address,
-					  s->dma_length);
 	}
 
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 out:
@@ -775,6 +1142,10 @@ unmap:
 	goto out;
 }
 
+/*
+ * The exported map_sg function for dma_ops (handles scatter-gather
+ * lists).
+ */
 static void unmap_sg(struct device *dev, struct scatterlist *sglist,
 		     int nelems, int dir)
 {
@@ -785,7 +1156,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
 	u16 devid;
 	int i;
 
-	if (!get_device_resources(dev, &iommu, &domain, &devid))
+	if (!check_device(dev) ||
+	    !get_device_resources(dev, &iommu, &domain, &devid))
 		return;
 
 	spin_lock_irqsave(&domain->lock, flags);
@@ -793,17 +1165,18 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
 	for_each_sg(sglist, s, nelems, i) {
 		__unmap_single(iommu, domain->priv, s->dma_address,
 			       s->dma_length, dir);
-		iommu_flush_pages(iommu, domain->id, s->dma_address,
-				  s->dma_length);
 		s->dma_address = s->dma_length = 0;
 	}
 
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 	spin_unlock_irqrestore(&domain->lock, flags);
 }
 
+/*
+ * The exported alloc_coherent function for dma_ops.
+ */
 static void *alloc_coherent(struct device *dev, size_t size,
 			    dma_addr_t *dma_addr, gfp_t flag)
 {
@@ -813,25 +1186,33 @@ static void *alloc_coherent(struct device *dev, size_t size,
 	struct protection_domain *domain;
 	u16 devid;
 	phys_addr_t paddr;
+	u64 dma_mask = dev->coherent_dma_mask;
+
+	if (!check_device(dev))
+		return NULL;
+
+	if (!get_device_resources(dev, &iommu, &domain, &devid))
+		flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
 
+	flag |= __GFP_ZERO;
 	virt_addr = (void *)__get_free_pages(flag, get_order(size));
 	if (!virt_addr)
 		return 0;
 
-	memset(virt_addr, 0, size);
 	paddr = virt_to_phys(virt_addr);
 
-	get_device_resources(dev, &iommu, &domain, &devid);
-
 	if (!iommu || !domain) {
 		*dma_addr = (dma_addr_t)paddr;
 		return virt_addr;
 	}
 
+	if (!dma_mask)
+		dma_mask = *dev->dma_mask;
+
 	spin_lock_irqsave(&domain->lock, flags);
 
 	*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
-				 size, DMA_BIDIRECTIONAL);
+				 size, DMA_BIDIRECTIONAL, true, dma_mask);
 
 	if (*dma_addr == bad_dma_address) {
 		free_pages((unsigned long)virt_addr, get_order(size));
@@ -839,10 +1220,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
 		goto out;
 	}
 
-	if (iommu_has_npcache(iommu))
-		iommu_flush_pages(iommu, domain->id, *dma_addr, size);
-
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 out:
@@ -851,6 +1229,9 @@ out:
 	return virt_addr;
 }
 
+/*
+ * The exported free_coherent function for dma_ops.
+ */
 static void free_coherent(struct device *dev, size_t size,
 			  void *virt_addr, dma_addr_t dma_addr)
 {
@@ -859,6 +1240,9 @@ static void free_coherent(struct device *dev, size_t size,
 	struct protection_domain *domain;
 	u16 devid;
 
+	if (!check_device(dev))
+		return;
+
 	get_device_resources(dev, &iommu, &domain, &devid);
 
 	if (!iommu || !domain)
@@ -867,9 +1251,8 @@ static void free_coherent(struct device *dev, size_t size,
 	spin_lock_irqsave(&domain->lock, flags);
 
 	__unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
-	iommu_flush_pages(iommu, domain->id, dma_addr, size);
 
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 	spin_unlock_irqrestore(&domain->lock, flags);
@@ -879,6 +1262,32 @@ free_mem:
 }
 
 /*
+ * This function is called by the DMA layer to find out if we can handle a
+ * particular device. It is part of the dma_ops.
+ */
+static int amd_iommu_dma_supported(struct device *dev, u64 mask)
+{
+	u16 bdf;
+	struct pci_dev *pcidev;
+
+	/* No device or no PCI device */
+	if (!dev || dev->bus != &pci_bus_type)
+		return 0;
+
+	pcidev = to_pci_dev(dev);
+
+	bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
+
+	/* Out of our scope? */
+	if (bdf > amd_iommu_last_bdf)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * The function for pre-allocating protection domains.
+ *
  * If the driver core informs the DMA layer if a driver grabs a device
  * we don't need to preallocate the protection domains anymore.
  * For now we have to.
@@ -893,7 +1302,7 @@ void prealloc_protection_domains(void)
 
 	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
 		devid = (dev->bus->number << 8) | dev->devfn;
-		if (devid >= amd_iommu_last_bdf)
+		if (devid > amd_iommu_last_bdf)
 			continue;
 		devid = amd_iommu_alias_table[devid];
 		if (domain_for_device(devid))
@@ -905,10 +1314,9 @@ void prealloc_protection_domains(void)
 		if (!dma_dom)
 			continue;
 		init_unity_mappings_for_device(dma_dom, devid);
-		set_device_domain(iommu, &dma_dom->domain, devid);
-		printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ",
-		       dma_dom->domain.id);
-		print_devid(devid, 1);
+		dma_dom->target_dev = devid;
+
+		list_add_tail(&dma_dom->list, &iommu_pd_list);
 	}
 }
 
@@ -919,14 +1327,23 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
 	.unmap_single = unmap_single,
 	.map_sg = map_sg,
 	.unmap_sg = unmap_sg,
+	.dma_supported = amd_iommu_dma_supported,
 };
 
+/*
+ * The function which clues the AMD IOMMU driver into dma_ops.
+ */
 int __init amd_iommu_init_dma_ops(void)
 {
 	struct amd_iommu *iommu;
 	int order = amd_iommu_aperture_order;
 	int ret;
 
+	/*
+	 * first allocate a default protection domain for every IOMMU we
+	 * found in the system. Devices not assigned to any other
+	 * protection domain will be assigned to the default one.
+	 */
 	list_for_each_entry(iommu, &amd_iommu_list, list) {
 		iommu->default_dom = dma_ops_domain_alloc(iommu, order);
 		if (iommu->default_dom == NULL)
@@ -936,6 +1353,10 @@ int __init amd_iommu_init_dma_ops(void)
 			goto free_domains;
 	}
 
+	/*
+	 * If device isolation is enabled, pre-allocate the protection
+	 * domains for each device.
+	 */
 	if (amd_iommu_isolate)
 		prealloc_protection_domains();
 
@@ -947,6 +1368,7 @@ int __init amd_iommu_init_dma_ops(void)
 	gart_iommu_aperture = 0;
 #endif
 
+	/* Make the driver finally visible to the drivers */
 	dma_ops = &amd_iommu_dma_ops;
 
 	return 0;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 2a13e430437d..0cdcda35a05f 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -22,23 +22,17 @@
 #include <linux/gfp.h>
 #include <linux/list.h>
 #include <linux/sysdev.h>
+#include <linux/interrupt.h>
+#include <linux/msi.h>
 #include <asm/pci-direct.h>
 #include <asm/amd_iommu_types.h>
 #include <asm/amd_iommu.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
 
 /*
  * definitions for the ACPI scanning code
  */
-#define UPDATE_LAST_BDF(x) do {\
-	if ((x) > amd_iommu_last_bdf) \
-		amd_iommu_last_bdf = (x); \
-	} while (0);
-
-#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
-#define PCI_BUS(x) (((x) >> 8) & 0xff)
 #define IVRS_HEADER_LENGTH 48
-#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x))))
 
 #define ACPI_IVHD_TYPE                  0x10
 #define ACPI_IVMD_TYPE_ALL              0x20
@@ -71,6 +65,17 @@
 #define ACPI_DEVFLAG_LINT1              0x80
 #define ACPI_DEVFLAG_ATSDIS             0x10000000
 
+/*
+ * ACPI table definitions
+ *
+ * These data structures are laid over the table to parse the important values
+ * out of it.
+ */
+
+/*
+ * structure describing one IOMMU in the ACPI table. Typically followed by one
+ * or more ivhd_entrys.
+ */
 struct ivhd_header {
 	u8 type;
 	u8 flags;
@@ -83,6 +88,10 @@ struct ivhd_header {
 	u32 reserved;
 } __attribute__((packed));
 
+/*
+ * A device entry describing which devices a specific IOMMU translates and
+ * which requestor ids they use.
+ */
 struct ivhd_entry {
 	u8 type;
 	u16 devid;
@@ -90,6 +99,10 @@ struct ivhd_entry {
 	u32 ext;
 } __attribute__((packed));
 
+/*
+ * An AMD IOMMU memory definition structure. It defines things like exclusion
+ * ranges for devices and regions that should be unity mapped.
+ */
 struct ivmd_header {
 	u8 type;
 	u8 flags;
@@ -103,22 +116,81 @@ struct ivmd_header {
 
 static int __initdata amd_iommu_detected;
 
-u16 amd_iommu_last_bdf;
-struct list_head amd_iommu_unity_map;
-unsigned amd_iommu_aperture_order = 26;
-int amd_iommu_isolate;
+u16 amd_iommu_last_bdf;			/* largest PCI device id we have
+					   to handle */
+LIST_HEAD(amd_iommu_unity_map);		/* a list of required unity mappings
+					   we find in ACPI */
+unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
+int amd_iommu_isolate;			/* if 1, device isolation is enabled */
+bool amd_iommu_unmap_flush;		/* if true, flush on every unmap */
+
+LIST_HEAD(amd_iommu_list);		/* list of all AMD IOMMUs in the
+					   system */
 
-struct list_head amd_iommu_list;
+/*
+ * Pointer to the device table which is shared by all AMD IOMMUs
+ * it is indexed by the PCI device id or the HT unit id and contains
+ * information about the domain the device belongs to as well as the
+ * page table root pointer.
+ */
 struct dev_table_entry *amd_iommu_dev_table;
+
+/*
+ * The alias table is a driver specific data structure which contains the
+ * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
+ * More than one device can share the same requestor id.
+ */
 u16 *amd_iommu_alias_table;
+
+/*
+ * The rlookup table is used to find the IOMMU which is responsible
+ * for a specific device. It is also indexed by the PCI device id.
+ */
 struct amd_iommu **amd_iommu_rlookup_table;
+
+/*
+ * The pd table (protection domain table) is used to find the protection domain
+ * data structure a device belongs to. Indexed with the PCI device id too.
+ */
 struct protection_domain **amd_iommu_pd_table;
+
+/*
+ * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap
+ * to know which ones are already in use.
+ */
 unsigned long *amd_iommu_pd_alloc_bitmap;
 
-static u32 dev_table_size;
-static u32 alias_table_size;
-static u32 rlookup_table_size;
+static u32 dev_table_size;	/* size of the device table */
+static u32 alias_table_size;	/* size of the alias table */
+static u32 rlookup_table_size;	/* size if the rlookup table */
+
+static inline void update_last_devid(u16 devid)
+{
+	if (devid > amd_iommu_last_bdf)
+		amd_iommu_last_bdf = devid;
+}
+
+static inline unsigned long tbl_size(int entry_size)
+{
+	unsigned shift = PAGE_SHIFT +
+			 get_order(amd_iommu_last_bdf * entry_size);
+
+	return 1UL << shift;
+}
 
+/****************************************************************************
+ *
+ * AMD IOMMU MMIO register space handling functions
+ *
+ * These functions are used to program the IOMMU device registers in
+ * MMIO space required for that driver.
+ *
+ ****************************************************************************/
+
+/*
+ * This function set the exclusion range in the IOMMU. DMA accesses to the
+ * exclusion range are passed through untranslated
+ */
 static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
 {
 	u64 start = iommu->exclusion_start & PAGE_MASK;
@@ -137,9 +209,10 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
 			&entry, sizeof(entry));
 }
 
+/* Programs the physical address of the device table into the IOMMU hardware */
 static void __init iommu_set_device_table(struct amd_iommu *iommu)
 {
-	u32 entry;
+	u64 entry;
 
 	BUG_ON(iommu->mmio_base == NULL);
 
@@ -149,6 +222,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
 			&entry, sizeof(entry));
 }
 
+/* Generic functions to enable/disable certain features of the IOMMU. */
 static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
 {
 	u32 ctrl;
@@ -162,20 +236,35 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
 {
 	u32 ctrl;
 
-	ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
+	ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
 	ctrl &= ~(1 << bit);
 	writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
 }
 
+/* Function to enable the hardware */
 void __init iommu_enable(struct amd_iommu *iommu)
 {
-	printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at ");
-	print_devid(iommu->devid, 0);
-	printk(" cap 0x%hx\n", iommu->cap_ptr);
+	printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
+	       "at %02x:%02x.%x cap 0x%hx\n",
+	       iommu->dev->bus->number,
+	       PCI_SLOT(iommu->dev->devfn),
+	       PCI_FUNC(iommu->dev->devfn),
+	       iommu->cap_ptr);
 
 	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
 }
 
+/* Function to enable IOMMU event logging and event interrupts */
+void __init iommu_enable_event_logging(struct amd_iommu *iommu)
+{
+	iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
+	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+}
+
+/*
+ * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
+ * the system has one.
+ */
 static u8 * __init iommu_map_mmio_space(u64 address)
 {
 	u8 *ret;
@@ -199,16 +288,41 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
 	release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
 }
 
+/****************************************************************************
+ *
+ * The functions below belong to the first pass of AMD IOMMU ACPI table
+ * parsing. In this pass we try to find out the highest device id this
+ * code has to handle. Upon this information the size of the shared data
+ * structures is determined later.
+ *
+ ****************************************************************************/
+
+/*
+ * This function calculates the length of a given IVHD entry
+ */
+static inline int ivhd_entry_length(u8 *ivhd)
+{
+	return 0x04 << (*ivhd >> 6);
+}
+
+/*
+ * This function reads the last device id the IOMMU has to handle from the PCI
+ * capability header for this IOMMU
+ */
 static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
 {
 	u32 cap;
 
 	cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
-	UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
+	update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
 
 	return 0;
 }
 
+/*
+ * After reading the highest device id from the IOMMU PCI capability header
+ * this function looks if there is a higher device id defined in the ACPI table
+ */
 static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
 {
 	u8 *p = (void *)h, *end = (void *)h;
@@ -229,12 +343,13 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
 		case IVHD_DEV_RANGE_END:
 		case IVHD_DEV_ALIAS:
 		case IVHD_DEV_EXT_SELECT:
-			UPDATE_LAST_BDF(dev->devid);
+			/* all the above subfield types refer to device ids */
+			update_last_devid(dev->devid);
 			break;
 		default:
 			break;
 		}
-		p += 0x04 << (*p >> 6);
+		p += ivhd_entry_length(p);
 	}
 
 	WARN_ON(p != end);
@@ -242,6 +357,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
 	return 0;
 }
 
+/*
+ * Iterate over all IVHD entries in the ACPI table and find the highest device
+ * id which we need to handle. This is the first of three functions which parse
+ * the ACPI table. So we check the checksum here.
+ */
 static int __init find_last_devid_acpi(struct acpi_table_header *table)
 {
 	int i;
@@ -277,19 +397,31 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
 	return 0;
 }
 
+/****************************************************************************
+ *
+ * The following functions belong the the code path which parses the ACPI table
+ * the second time. In this ACPI parsing iteration we allocate IOMMU specific
+ * data structures, initialize the device/alias/rlookup table and also
+ * basically initialize the hardware.
+ *
+ ****************************************************************************/
+
+/*
+ * Allocates the command buffer. This buffer is per AMD IOMMU. We can
+ * write commands to that buffer later and the IOMMU will execute them
+ * asynchronously
+ */
 static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
 {
-	u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL,
+	u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
 			get_order(CMD_BUFFER_SIZE));
-	u64 entry = 0;
+	u64 entry;
 
 	if (cmd_buf == NULL)
 		return NULL;
 
 	iommu->cmd_buf_size = CMD_BUFFER_SIZE;
 
-	memset(cmd_buf, 0, CMD_BUFFER_SIZE);
-
 	entry = (u64)virt_to_phys(cmd_buf);
 	entry |= MMIO_CMD_SIZE_512;
 	memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
@@ -302,11 +434,35 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
 
 static void __init free_command_buffer(struct amd_iommu *iommu)
 {
-	if (iommu->cmd_buf)
-		free_pages((unsigned long)iommu->cmd_buf,
-				get_order(CMD_BUFFER_SIZE));
+	free_pages((unsigned long)iommu->cmd_buf,
+		   get_order(iommu->cmd_buf_size));
+}
+
+/* allocates the memory where the IOMMU will log its events to */
+static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
+{
+	u64 entry;
+	iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+						get_order(EVT_BUFFER_SIZE));
+
+	if (iommu->evt_buf == NULL)
+		return NULL;
+
+	entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+	memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
+		    &entry, sizeof(entry));
+
+	iommu->evt_buf_size = EVT_BUFFER_SIZE;
+
+	return iommu->evt_buf;
+}
+
+static void __init free_event_buffer(struct amd_iommu *iommu)
+{
+	free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
 }
 
+/* sets a specific bit in the device table entry. */
 static void set_dev_entry_bit(u16 devid, u8 bit)
 {
 	int i = (bit >> 5) & 0x07;
@@ -315,7 +471,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
 	amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
 }
 
-static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
+/* Writes the specific IOMMU for a device into the rlookup table */
+static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
+{
+	amd_iommu_rlookup_table[devid] = iommu;
+}
+
+/*
+ * This function takes the device specific flags read from the ACPI
+ * table and sets up the device table entry with that information
+ */
+static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
+					   u16 devid, u32 flags, u32 ext_flags)
 {
 	if (flags & ACPI_DEVFLAG_INITPASS)
 		set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
@@ -331,13 +498,14 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
 		set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
 	if (flags & ACPI_DEVFLAG_LINT1)
 		set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
-}
 
-static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
-{
-	amd_iommu_rlookup_table[devid] = iommu;
+	set_iommu_for_device(iommu, devid);
 }
 
+/*
+ * Reads the device exclusion range from ACPI and initialize IOMMU with
+ * it
+ */
 static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
 {
 	struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
@@ -346,27 +514,45 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
 		return;
 
 	if (iommu) {
+		/*
+		 * We only can configure exclusion ranges per IOMMU, not
+		 * per device. But we can enable the exclusion range per
+		 * device. This is done here
+		 */
 		set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
 		iommu->exclusion_start = m->range_start;
 		iommu->exclusion_length = m->range_length;
 	}
 }
 
+/*
+ * This function reads some important data from the IOMMU PCI space and
+ * initializes the driver data structure with it. It reads the hardware
+ * capabilities and the first/last device entries
+ */
 static void __init init_iommu_from_pci(struct amd_iommu *iommu)
 {
-	int bus = PCI_BUS(iommu->devid);
-	int dev = PCI_SLOT(iommu->devid);
-	int fn  = PCI_FUNC(iommu->devid);
 	int cap_ptr = iommu->cap_ptr;
-	u32 range;
-
-	iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET);
-
-	range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
-	iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range));
-	iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range));
+	u32 range, misc;
+
+	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
+			      &iommu->cap);
+	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
+			      &range);
+	pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
+			      &misc);
+
+	iommu->first_device = calc_devid(MMIO_GET_BUS(range),
+					 MMIO_GET_FD(range));
+	iommu->last_device = calc_devid(MMIO_GET_BUS(range),
+					MMIO_GET_LD(range));
+	iommu->evt_msi_num = MMIO_MSI_NUM(misc);
 }
 
+/*
+ * Takes a pointer to an AMD IOMMU entry in the ACPI table and
+ * initializes the hardware and our data structures with it.
+ */
 static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 					struct ivhd_header *h)
 {
@@ -374,7 +560,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 	u8 *end = p, flags = 0;
 	u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
 	u32 ext_flags = 0;
-	bool alias = 0;
+	bool alias = false;
 	struct ivhd_entry *e;
 
 	/*
@@ -414,22 +600,23 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 		case IVHD_DEV_ALL:
 			for (dev_i = iommu->first_device;
 					dev_i <= iommu->last_device; ++dev_i)
-				set_dev_entry_from_acpi(dev_i, e->flags, 0);
+				set_dev_entry_from_acpi(iommu, dev_i,
+							e->flags, 0);
 			break;
 		case IVHD_DEV_SELECT:
 			devid = e->devid;
-			set_dev_entry_from_acpi(devid, e->flags, 0);
+			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
 			break;
 		case IVHD_DEV_SELECT_RANGE_START:
 			devid_start = e->devid;
 			flags = e->flags;
 			ext_flags = 0;
-			alias = 0;
+			alias = false;
 			break;
 		case IVHD_DEV_ALIAS:
 			devid = e->devid;
 			devid_to = e->ext >> 8;
-			set_dev_entry_from_acpi(devid, e->flags, 0);
+			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
 			amd_iommu_alias_table[devid] = devid_to;
 			break;
 		case IVHD_DEV_ALIAS_RANGE:
@@ -437,24 +624,25 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 			flags = e->flags;
 			devid_to = e->ext >> 8;
 			ext_flags = 0;
-			alias = 1;
+			alias = true;
 			break;
 		case IVHD_DEV_EXT_SELECT:
 			devid = e->devid;
-			set_dev_entry_from_acpi(devid, e->flags, e->ext);
+			set_dev_entry_from_acpi(iommu, devid, e->flags,
+						e->ext);
 			break;
 		case IVHD_DEV_EXT_SELECT_RANGE:
 			devid_start = e->devid;
 			flags = e->flags;
 			ext_flags = e->ext;
-			alias = 0;
+			alias = false;
 			break;
 		case IVHD_DEV_RANGE_END:
 			devid = e->devid;
 			for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
 				if (alias)
 					amd_iommu_alias_table[dev_i] = devid_to;
-				set_dev_entry_from_acpi(
+				set_dev_entry_from_acpi(iommu,
 						amd_iommu_alias_table[dev_i],
 						flags, ext_flags);
 			}
@@ -463,10 +651,11 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 			break;
 		}
 
-		p += 0x04 << (e->type >> 6);
+		p += ivhd_entry_length(p);
 	}
 }
 
+/* Initializes the device->iommu mapping for the driver */
 static int __init init_iommu_devices(struct amd_iommu *iommu)
 {
 	u16 i;
@@ -480,6 +669,7 @@ static int __init init_iommu_devices(struct amd_iommu *iommu)
 static void __init free_iommu_one(struct amd_iommu *iommu)
 {
 	free_command_buffer(iommu);
+	free_event_buffer(iommu);
 	iommu_unmap_mmio_space(iommu);
 }
 
@@ -494,6 +684,11 @@ static void __init free_iommu_all(void)
 	}
 }
 
+/*
+ * This function clues the initialization function for one IOMMU
+ * together and also allocates the command buffer and programs the
+ * hardware. It does NOT enable the IOMMU. This is done afterwards.
+ */
 static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
 {
 	spin_lock_init(&iommu->lock);
@@ -502,8 +697,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
 	/*
 	 * Copy data from ACPI table entry to the iommu struct
 	 */
-	iommu->devid = h->devid;
+	iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
+	if (!iommu->dev)
+		return 1;
+
 	iommu->cap_ptr = h->cap_ptr;
+	iommu->pci_seg = h->pci_seg;
 	iommu->mmio_phys = h->mmio_phys;
 	iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
 	if (!iommu->mmio_base)
@@ -514,13 +713,23 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
 	if (!iommu->cmd_buf)
 		return -ENOMEM;
 
+	iommu->evt_buf = alloc_event_buffer(iommu);
+	if (!iommu->evt_buf)
+		return -ENOMEM;
+
+	iommu->int_enabled = false;
+
 	init_iommu_from_pci(iommu);
 	init_iommu_from_acpi(iommu, h);
 	init_iommu_devices(iommu);
 
-	return 0;
+	return pci_enable_device(iommu->dev);
 }
 
+/*
+ * Iterates over all IOMMU entries in the ACPI table, allocates the
+ * IOMMU structure and initializes it with init_iommu_one()
+ */
 static int __init init_iommu_all(struct acpi_table_header *table)
 {
 	u8 *p = (u8 *)table, *end = (u8 *)table;
@@ -528,8 +737,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
 	struct amd_iommu *iommu;
 	int ret;
 
-	INIT_LIST_HEAD(&amd_iommu_list);
-
 	end += table->length;
 	p += IVRS_HEADER_LENGTH;
 
@@ -555,6 +762,103 @@ static int __init init_iommu_all(struct acpi_table_header *table)
 	return 0;
 }
 
+/****************************************************************************
+ *
+ * The following functions initialize the MSI interrupts for all IOMMUs
+ * in the system. Its a bit challenging because there could be multiple
+ * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
+ * pci_dev.
+ *
+ ****************************************************************************/
+
+static int __init iommu_setup_msix(struct amd_iommu *iommu)
+{
+	struct amd_iommu *curr;
+	struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
+	int nvec = 0, i;
+
+	list_for_each_entry(curr, &amd_iommu_list, list) {
+		if (curr->dev == iommu->dev) {
+			entries[nvec].entry = curr->evt_msi_num;
+			entries[nvec].vector = 0;
+			curr->int_enabled = true;
+			nvec++;
+		}
+	}
+
+	if (pci_enable_msix(iommu->dev, entries, nvec)) {
+		pci_disable_msix(iommu->dev);
+		return 1;
+	}
+
+	for (i = 0; i < nvec; ++i) {
+		int r = request_irq(entries->vector, amd_iommu_int_handler,
+				    IRQF_SAMPLE_RANDOM,
+				    "AMD IOMMU",
+				    NULL);
+		if (r)
+			goto out_free;
+	}
+
+	return 0;
+
+out_free:
+	for (i -= 1; i >= 0; --i)
+		free_irq(entries->vector, NULL);
+
+	pci_disable_msix(iommu->dev);
+
+	return 1;
+}
+
+static int __init iommu_setup_msi(struct amd_iommu *iommu)
+{
+	int r;
+	struct amd_iommu *curr;
+
+	list_for_each_entry(curr, &amd_iommu_list, list) {
+		if (curr->dev == iommu->dev)
+			curr->int_enabled = true;
+	}
+
+
+	if (pci_enable_msi(iommu->dev))
+		return 1;
+
+	r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
+			IRQF_SAMPLE_RANDOM,
+			"AMD IOMMU",
+			NULL);
+
+	if (r) {
+		pci_disable_msi(iommu->dev);
+		return 1;
+	}
+
+	return 0;
+}
+
+static int __init iommu_init_msi(struct amd_iommu *iommu)
+{
+	if (iommu->int_enabled)
+		return 0;
+
+	if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
+		return iommu_setup_msix(iommu);
+	else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
+		return iommu_setup_msi(iommu);
+
+	return 1;
+}
+
+/****************************************************************************
+ *
+ * The next functions belong to the third pass of parsing the ACPI
+ * table. In this last pass the memory mapping requirements are
+ * gathered (like exclusion and unity mapping reanges).
+ *
+ ****************************************************************************/
+
 static void __init free_unity_maps(void)
 {
 	struct unity_map_entry *entry, *next;
@@ -565,6 +869,7 @@ static void __init free_unity_maps(void)
 	}
 }
 
+/* called when we find an exclusion range definition in ACPI */
 static int __init init_exclusion_range(struct ivmd_header *m)
 {
 	int i;
@@ -574,7 +879,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
 		set_device_exclusion_range(m->devid, m);
 		break;
 	case ACPI_IVMD_TYPE_ALL:
-		for (i = 0; i < amd_iommu_last_bdf; ++i)
+		for (i = 0; i <= amd_iommu_last_bdf; ++i)
 			set_device_exclusion_range(i, m);
 		break;
 	case ACPI_IVMD_TYPE_RANGE:
@@ -588,6 +893,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
 	return 0;
 }
 
+/* called for unity map ACPI definition */
 static int __init init_unity_map_range(struct ivmd_header *m)
 {
 	struct unity_map_entry *e = 0;
@@ -619,13 +925,12 @@ static int __init init_unity_map_range(struct ivmd_header *m)
 	return 0;
 }
 
+/* iterates over all memory definitions we find in the ACPI table */
 static int __init init_memory_definitions(struct acpi_table_header *table)
 {
 	u8 *p = (u8 *)table, *end = (u8 *)table;
 	struct ivmd_header *m;
 
-	INIT_LIST_HEAD(&amd_iommu_unity_map);
-
 	end += table->length;
 	p += IVRS_HEADER_LENGTH;
 
@@ -642,12 +947,32 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
 	return 0;
 }
 
+/*
+ * Init the device table to not allow DMA access for devices and
+ * suppress all page faults
+ */
+static void init_device_table(void)
+{
+	u16 devid;
+
+	for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
+		set_dev_entry_bit(devid, DEV_ENTRY_VALID);
+		set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
+	}
+}
+
+/*
+ * This function finally enables all IOMMUs found in the system after
+ * they have been initialized
+ */
 static void __init enable_iommus(void)
 {
 	struct amd_iommu *iommu;
 
 	list_for_each_entry(iommu, &amd_iommu_list, list) {
 		iommu_set_exclusion_range(iommu);
+		iommu_init_msi(iommu);
+		iommu_enable_event_logging(iommu);
 		iommu_enable(iommu);
 	}
 }
@@ -678,6 +1003,34 @@ static struct sys_device device_amd_iommu = {
 	.cls = &amd_iommu_sysdev_class,
 };
 
+/*
+ * This is the core init function for AMD IOMMU hardware in the system.
+ * This function is called from the generic x86 DMA layer initialization
+ * code.
+ *
+ * This function basically parses the ACPI table for AMD IOMMU (IVRS)
+ * three times:
+ *
+ *	1 pass) Find the highest PCI device id the driver has to handle.
+ *		Upon this information the size of the data structures is
+ *		determined that needs to be allocated.
+ *
+ *	2 pass) Initialize the data structures just allocated with the
+ *		information in the ACPI table about available AMD IOMMUs
+ *		in the system. It also maps the PCI devices in the
+ *		system to specific IOMMUs
+ *
+ *	3 pass) After the basic data structures are allocated and
+ *		initialized we update them with information about memory
+ *		remapping requirements parsed out of the ACPI table in
+ *		this last pass.
+ *
+ * After that the hardware is initialized and ready to go. In the last
+ * step we do some Linux specific things like registering the driver in
+ * the dma_ops interface and initializing the suspend/resume support
+ * functions. Finally it prints some information about AMD IOMMUs and
+ * the driver state and enables the hardware.
+ */
 int __init amd_iommu_init(void)
 {
 	int i, ret = 0;
@@ -699,14 +1052,14 @@ int __init amd_iommu_init(void)
 	if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
 		return -ENODEV;
 
-	dev_table_size     = TBL_SIZE(DEV_TABLE_ENTRY_SIZE);
-	alias_table_size   = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE);
-	rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE);
+	dev_table_size     = tbl_size(DEV_TABLE_ENTRY_SIZE);
+	alias_table_size   = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
+	rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
 
 	ret = -ENOMEM;
 
 	/* Device table - directly used by all IOMMUs */
-	amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL,
+	amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
 				      get_order(dev_table_size));
 	if (amd_iommu_dev_table == NULL)
 		goto out;
@@ -730,27 +1083,26 @@ int __init amd_iommu_init(void)
 	 * Protection Domain table - maps devices to protection domains
 	 * This table has the same size as the rlookup_table
 	 */
-	amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL,
+	amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
 				     get_order(rlookup_table_size));
 	if (amd_iommu_pd_table == NULL)
 		goto free;
 
-	amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL,
+	amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
+					    GFP_KERNEL | __GFP_ZERO,
 					    get_order(MAX_DOMAIN_ID/8));
 	if (amd_iommu_pd_alloc_bitmap == NULL)
 		goto free;
 
+	/* init the device table */
+	init_device_table();
+
 	/*
-	 * memory is allocated now; initialize the device table with all zeroes
-	 * and let all alias entries point to itself
+	 * let all alias entries point to itself
 	 */
-	memset(amd_iommu_dev_table, 0, dev_table_size);
-	for (i = 0; i < amd_iommu_last_bdf; ++i)
+	for (i = 0; i <= amd_iommu_last_bdf; ++i)
 		amd_iommu_alias_table[i] = i;
 
-	memset(amd_iommu_pd_table, 0, rlookup_table_size);
-	memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
-
 	/*
 	 * never allocate domain 0 because its used as the non-allocated and
 	 * error value placeholder
@@ -768,15 +1120,15 @@ int __init amd_iommu_init(void)
 	if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
 		goto free;
 
-	ret = amd_iommu_init_dma_ops();
+	ret = sysdev_class_register(&amd_iommu_sysdev_class);
 	if (ret)
 		goto free;
 
-	ret = sysdev_class_register(&amd_iommu_sysdev_class);
+	ret = sysdev_register(&device_amd_iommu);
 	if (ret)
 		goto free;
 
-	ret = sysdev_register(&device_amd_iommu);
+	ret = amd_iommu_init_dma_ops();
 	if (ret)
 		goto free;
 
@@ -791,28 +1143,29 @@ int __init amd_iommu_init(void)
 	else
 		printk("disabled\n");
 
+	if (amd_iommu_unmap_flush)
+		printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
+	else
+		printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
+
 out:
 	return ret;
 
 free:
-	if (amd_iommu_pd_alloc_bitmap)
-		free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1);
+	free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
+		   get_order(MAX_DOMAIN_ID/8));
 
-	if (amd_iommu_pd_table)
-		free_pages((unsigned long)amd_iommu_pd_table,
-				get_order(rlookup_table_size));
+	free_pages((unsigned long)amd_iommu_pd_table,
+		   get_order(rlookup_table_size));
 
-	if (amd_iommu_rlookup_table)
-		free_pages((unsigned long)amd_iommu_rlookup_table,
-				get_order(rlookup_table_size));
+	free_pages((unsigned long)amd_iommu_rlookup_table,
+		   get_order(rlookup_table_size));
 
-	if (amd_iommu_alias_table)
-		free_pages((unsigned long)amd_iommu_alias_table,
-				get_order(alias_table_size));
+	free_pages((unsigned long)amd_iommu_alias_table,
+		   get_order(alias_table_size));
 
-	if (amd_iommu_dev_table)
-		free_pages((unsigned long)amd_iommu_dev_table,
-				get_order(dev_table_size));
+	free_pages((unsigned long)amd_iommu_dev_table,
+		   get_order(dev_table_size));
 
 	free_iommu_all();
 
@@ -821,6 +1174,13 @@ free:
 	goto out;
 }
 
+/****************************************************************************
+ *
+ * Early detect code. This code runs at IOMMU detection time in the DMA
+ * layer. It just looks if there is an IVRS ACPI table to detect AMD
+ * IOMMUs
+ *
+ ****************************************************************************/
 static int __init early_amd_iommu_detect(struct acpi_table_header *table)
 {
 	return 0;
@@ -828,7 +1188,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
 
 void __init amd_iommu_detect(void)
 {
-	if (swiotlb || no_iommu || iommu_detected)
+	if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
 		return;
 
 	if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
@@ -841,11 +1201,20 @@ void __init amd_iommu_detect(void)
 	}
 }
 
+/****************************************************************************
+ *
+ * Parsing functions for the AMD IOMMU specific kernel command line
+ * options.
+ *
+ ****************************************************************************/
+
 static int __init parse_amd_iommu_options(char *str)
 {
 	for (; *str; ++str) {
-		if (strcmp(str, "isolate") == 0)
+		if (strncmp(str, "isolate", 7) == 0)
 			amd_iommu_isolate = 1;
+		if (strncmp(str, "fullflush", 11) == 0)
+			amd_iommu_unmap_flush = true;
 	}
 
 	return 1;
@@ -853,20 +1222,10 @@ static int __init parse_amd_iommu_options(char *str)
 
 static int __init parse_amd_iommu_size_options(char *str)
 {
-	for (; *str; ++str) {
-		if (strcmp(str, "32M") == 0)
-			amd_iommu_aperture_order = 25;
-		if (strcmp(str, "64M") == 0)
-			amd_iommu_aperture_order = 26;
-		if (strcmp(str, "128M") == 0)
-			amd_iommu_aperture_order = 27;
-		if (strcmp(str, "256M") == 0)
-			amd_iommu_aperture_order = 28;
-		if (strcmp(str, "512M") == 0)
-			amd_iommu_aperture_order = 29;
-		if (strcmp(str, "1G") == 0)
-			amd_iommu_aperture_order = 30;
-	}
+	unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
+
+	if ((order > 24) && (order < 31))
+		amd_iommu_aperture_order = order;
 
 	return 1;
 }
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9f907806c1a5..9a32b37ee2ee 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -21,6 +21,7 @@
 #include <linux/suspend.h>
 #include <asm/e820.h>
 #include <asm/io.h>
+#include <asm/iommu.h>
 #include <asm/gart.h>
 #include <asm/pci-direct.h>
 #include <asm/dma.h>
@@ -454,11 +455,11 @@ out:
 		   force_iommu ||
 		   valid_agp ||
 		   fallback_aper_force) {
-		printk(KERN_ERR
+		printk(KERN_INFO
 			"Your BIOS doesn't leave a aperture memory hole\n");
-		printk(KERN_ERR
+		printk(KERN_INFO
 			"Please enable the IOMMU option in the BIOS setup\n");
-		printk(KERN_ERR
+		printk(KERN_INFO
 			"This costs you %d MB of RAM\n",
 				32 << fallback_aper_order);
 
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic.c
index a437d027f20b..04a7f960bbc0 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic.c
@@ -23,11 +23,13 @@
 #include <linux/mc146818rtc.h>
 #include <linux/kernel_stat.h>
 #include <linux/sysdev.h>
+#include <linux/ioport.h>
 #include <linux/cpu.h>
 #include <linux/clockchips.h>
 #include <linux/acpi_pmtmr.h>
 #include <linux/module.h>
 #include <linux/dmi.h>
+#include <linux/dmar.h>
 
 #include <asm/atomic.h>
 #include <asm/smp.h>
@@ -36,8 +38,14 @@
 #include <asm/desc.h>
 #include <asm/arch_hooks.h>
 #include <asm/hpet.h>
+#include <asm/pgalloc.h>
 #include <asm/i8253.h>
 #include <asm/nmi.h>
+#include <asm/idle.h>
+#include <asm/proto.h>
+#include <asm/timex.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
 
 #include <mach_apic.h>
 #include <mach_apicdef.h>
@@ -50,20 +58,60 @@
 # error SPURIOUS_APIC_VECTOR definition error
 #endif
 
-unsigned long mp_lapic_addr;
-
+#ifdef CONFIG_X86_32
 /*
  * Knob to control our willingness to enable the local APIC.
  *
  * +1=force-enable
  */
 static int force_enable_local_apic;
-int disable_apic;
+/*
+ * APIC command line parameters
+ */
+static int __init parse_lapic(char *arg)
+{
+	force_enable_local_apic = 1;
+	return 0;
+}
+early_param("lapic", parse_lapic);
+/* Local APIC was disabled by the BIOS and enabled by the kernel */
+static int enabled_via_apicbase;
+
+#endif
 
-/* Local APIC timer verification ok */
-static int local_apic_timer_verify_ok;
+#ifdef CONFIG_X86_64
+static int apic_calibrate_pmtmr __initdata;
+static __init int setup_apicpmtimer(char *s)
+{
+	apic_calibrate_pmtmr = 1;
+	notsc_setup(NULL);
+	return 0;
+}
+__setup("apicpmtimer", setup_apicpmtimer);
+#endif
+
+#ifdef CONFIG_X86_64
+#define HAVE_X2APIC
+#endif
+
+#ifdef HAVE_X2APIC
+int x2apic;
+/* x2apic enabled before OS handover */
+int x2apic_preenabled;
+int disable_x2apic;
+static __init int setup_nox2apic(char *str)
+{
+	disable_x2apic = 1;
+	setup_clear_cpu_cap(X86_FEATURE_X2APIC);
+	return 0;
+}
+early_param("nox2apic", setup_nox2apic);
+#endif
+
+unsigned long mp_lapic_addr;
+int disable_apic;
 /* Disable local APIC timer from the kernel commandline or via dmi quirk */
-static int local_apic_timer_disabled;
+static int disable_apic_timer __cpuinitdata;
 /* Local APIC timer works in C2 */
 int local_apic_timer_c2_ok;
 EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -75,7 +123,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
 /*
  * Debug level, exported for io_apic.c
  */
-int apic_verbosity;
+unsigned int apic_verbosity;
 
 int pic_mode;
 
@@ -112,9 +160,6 @@ static struct clock_event_device lapic_clockevent = {
 };
 static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
 
-/* Local APIC was disabled by the BIOS and enabled by the kernel */
-static int enabled_via_apicbase;
-
 static unsigned long apic_phys;
 
 /*
@@ -130,7 +175,11 @@ static inline int lapic_get_version(void)
  */
 static inline int lapic_is_integrated(void)
 {
+#ifdef CONFIG_X86_64
+	return 1;
+#else
 	return APIC_INTEGRATED(lapic_get_version());
+#endif
 }
 
 /*
@@ -145,13 +194,18 @@ static int modern_apic(void)
 	return lapic_get_version() >= 0x14;
 }
 
-void apic_wait_icr_idle(void)
+/*
+ * Paravirt kernels also might be using these below ops. So we still
+ * use generic apic_read()/apic_write(), which might be pointing to different
+ * ops in PARAVIRT case.
+ */
+void xapic_wait_icr_idle(void)
 {
 	while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
 		cpu_relax();
 }
 
-u32 safe_apic_wait_icr_idle(void)
+u32 safe_xapic_wait_icr_idle(void)
 {
 	u32 send_status;
 	int timeout;
@@ -167,19 +221,88 @@ u32 safe_apic_wait_icr_idle(void)
 	return send_status;
 }
 
+void xapic_icr_write(u32 low, u32 id)
+{
+	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
+	apic_write(APIC_ICR, low);
+}
+
+u64 xapic_icr_read(void)
+{
+	u32 icr1, icr2;
+
+	icr2 = apic_read(APIC_ICR2);
+	icr1 = apic_read(APIC_ICR);
+
+	return icr1 | ((u64)icr2 << 32);
+}
+
+static struct apic_ops xapic_ops = {
+	.read = native_apic_mem_read,
+	.write = native_apic_mem_write,
+	.icr_read = xapic_icr_read,
+	.icr_write = xapic_icr_write,
+	.wait_icr_idle = xapic_wait_icr_idle,
+	.safe_wait_icr_idle = safe_xapic_wait_icr_idle,
+};
+
+struct apic_ops __read_mostly *apic_ops = &xapic_ops;
+EXPORT_SYMBOL_GPL(apic_ops);
+
+#ifdef HAVE_X2APIC
+static void x2apic_wait_icr_idle(void)
+{
+	/* no need to wait for icr idle in x2apic */
+	return;
+}
+
+static u32 safe_x2apic_wait_icr_idle(void)
+{
+	/* no need to wait for icr idle in x2apic */
+	return 0;
+}
+
+void x2apic_icr_write(u32 low, u32 id)
+{
+	wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
+}
+
+u64 x2apic_icr_read(void)
+{
+	unsigned long val;
+
+	rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
+	return val;
+}
+
+static struct apic_ops x2apic_ops = {
+	.read = native_apic_msr_read,
+	.write = native_apic_msr_write,
+	.icr_read = x2apic_icr_read,
+	.icr_write = x2apic_icr_write,
+	.wait_icr_idle = x2apic_wait_icr_idle,
+	.safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
+};
+#endif
+
 /**
  * enable_NMI_through_LVT0 - enable NMI through local vector table 0
  */
 void __cpuinit enable_NMI_through_LVT0(void)
 {
-	unsigned int v = APIC_DM_NMI;
+	unsigned int v;
+
+	/* unmask and set to NMI */
+	v = APIC_DM_NMI;
 
-	/* Level triggered for 82489DX */
+	/* Level triggered for 82489DX (32bit mode) */
 	if (!lapic_is_integrated())
 		v |= APIC_LVT_LEVEL_TRIGGER;
-	apic_write_around(APIC_LVT0, v);
+
+	apic_write(APIC_LVT0, v);
 }
 
+#ifdef CONFIG_X86_32
 /**
  * get_physical_broadcast - Get number of physical broadcast IDs
  */
@@ -187,15 +310,20 @@ int get_physical_broadcast(void)
 {
 	return modern_apic() ? 0xff : 0xf;
 }
+#endif
 
 /**
  * lapic_get_maxlvt - get the maximum number of local vector table entries
  */
 int lapic_get_maxlvt(void)
 {
-	unsigned int v = apic_read(APIC_LVR);
+	unsigned int v;
 
-	/* 82489DXs do not report # of LVT entries. */
+	v = apic_read(APIC_LVR);
+	/*
+	 * - we always have APIC integrated on 64bit mode
+	 * - 82489DXs do not report # of LVT entries
+	 */
 	return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
 }
 
@@ -203,7 +331,7 @@ int lapic_get_maxlvt(void)
  * Local APIC timer
  */
 
-/* Clock divisor is set to 16 */
+/* Clock divisor */
 #define APIC_DIVISOR 16
 
 /*
@@ -229,27 +357,61 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 	if (!irqen)
 		lvtt_value |= APIC_LVT_MASKED;
 
-	apic_write_around(APIC_LVTT, lvtt_value);
+	apic_write(APIC_LVTT, lvtt_value);
 
 	/*
 	 * Divide PICLK by 16
 	 */
 	tmp_value = apic_read(APIC_TDCR);
-	apic_write_around(APIC_TDCR, (tmp_value
-				& ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
-				| APIC_TDR_DIV_16);
+	apic_write(APIC_TDCR,
+		(tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
+		APIC_TDR_DIV_16);
 
 	if (!oneshot)
-		apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
+		apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
 }
 
 /*
+ * Setup extended LVT, AMD specific (K8, family 10h)
+ *
+ * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
+ * MCE interrupts are supported. Thus MCE offset must be set to 0.
+ *
+ * If mask=1, the LVT entry does not generate interrupts while mask=0
+ * enables the vector. See also the BKDGs.
+ */
+
+#define APIC_EILVT_LVTOFF_MCE 0
+#define APIC_EILVT_LVTOFF_IBS 1
+
+static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
+{
+	unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
+	unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;
+
+	apic_write(reg, v);
+}
+
+u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
+{
+	setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
+	return APIC_EILVT_LVTOFF_MCE;
+}
+
+u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
+{
+	setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
+	return APIC_EILVT_LVTOFF_IBS;
+}
+EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
+
+/*
  * Program the next event, relative to now
  */
 static int lapic_next_event(unsigned long delta,
 			    struct clock_event_device *evt)
 {
-	apic_write_around(APIC_TMICT, delta);
+	apic_write(APIC_TMICT, delta);
 	return 0;
 }
 
@@ -262,8 +424,8 @@ static void lapic_timer_setup(enum clock_event_mode mode,
 	unsigned long flags;
 	unsigned int v;
 
-	/* Lapic used for broadcast ? */
-	if (!local_apic_timer_verify_ok)
+	/* Lapic used as dummy for broadcast ? */
+	if (evt->features & CLOCK_EVT_FEAT_DUMMY)
 		return;
 
 	local_irq_save(flags);
@@ -278,7 +440,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
 	case CLOCK_EVT_MODE_SHUTDOWN:
 		v = apic_read(APIC_LVTT);
 		v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-		apic_write_around(APIC_LVTT, v);
+		apic_write(APIC_LVTT, v);
 		break;
 	case CLOCK_EVT_MODE_RESUME:
 		/* Nothing to do here */
@@ -302,7 +464,7 @@ static void lapic_timer_broadcast(cpumask_t mask)
  * Setup the local APIC timer for this CPU. Copy the initilized values
  * of the boot CPU and register the clock event in the framework.
  */
-static void __devinit setup_APIC_timer(void)
+static void __cpuinit setup_APIC_timer(void)
 {
 	struct clock_event_device *levt = &__get_cpu_var(lapic_events);
 
@@ -372,39 +534,53 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
 	}
 }
 
-/*
- * Setup the boot APIC
- *
- * Calibrate and verify the result.
- */
-void __init setup_boot_APIC_clock(void)
+static int __init calibrate_by_pmtimer(long deltapm, long *delta)
+{
+	const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
+	const long pm_thresh = pm_100ms / 100;
+	unsigned long mult;
+	u64 res;
+
+#ifndef CONFIG_X86_PM_TIMER
+	return -1;
+#endif
+
+	apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
+
+	/* Check, if the PM timer is available */
+	if (!deltapm)
+		return -1;
+
+	mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
+
+	if (deltapm > (pm_100ms - pm_thresh) &&
+	    deltapm < (pm_100ms + pm_thresh)) {
+		apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
+	} else {
+		res = (((u64)deltapm) *  mult) >> 22;
+		do_div(res, 1000000);
+		printk(KERN_WARNING "APIC calibration not consistent "
+			"with PM Timer: %ldms instead of 100ms\n",
+			(long)res);
+		/* Correct the lapic counter value */
+		res = (((u64)(*delta)) * pm_100ms);
+		do_div(res, deltapm);
+		printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
+			"%lu (%ld)\n", (unsigned long)res, *delta);
+		*delta = (long)res;
+	}
+
+	return 0;
+}
+
+static int __init calibrate_APIC_clock(void)
 {
 	struct clock_event_device *levt = &__get_cpu_var(lapic_events);
-	const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
-	const long pm_thresh = pm_100ms/100;
 	void (*real_handler)(struct clock_event_device *dev);
 	unsigned long deltaj;
-	long delta, deltapm;
+	long delta;
 	int pm_referenced = 0;
 
-	/*
-	 * The local apic timer can be disabled via the kernel
-	 * commandline or from the CPU detection code. Register the lapic
-	 * timer as a dummy clock event source on SMP systems, so the
-	 * broadcast mechanism is used. On UP systems simply ignore it.
-	 */
-	if (local_apic_timer_disabled) {
-		/* No broadcast on UP ! */
-		if (num_possible_cpus() > 1) {
-			lapic_clockevent.mult = 1;
-			setup_APIC_timer();
-		}
-		return;
-	}
-
-	apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
-		    "calibrating APIC timer ...\n");
-
 	local_irq_disable();
 
 	/* Replace the global interrupt handler */
@@ -412,10 +588,10 @@ void __init setup_boot_APIC_clock(void)
 	global_clock_event->event_handler = lapic_cal_handler;
 
 	/*
-	 * Setup the APIC counter to 1e9. There is no way the lapic
+	 * Setup the APIC counter to maximum. There is no way the lapic
 	 * can underflow in the 100ms detection time frame
 	 */
-	__setup_APIC_LVTT(1000000000, 0, 0);
+	__setup_APIC_LVTT(0xffffffff, 0, 0);
 
 	/* Let the interrupts run */
 	local_irq_enable();
@@ -432,34 +608,9 @@ void __init setup_boot_APIC_clock(void)
 	delta = lapic_cal_t1 - lapic_cal_t2;
 	apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
 
-	/* Check, if the PM timer is available */
-	deltapm = lapic_cal_pm2 - lapic_cal_pm1;
-	apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
-
-	if (deltapm) {
-		unsigned long mult;
-		u64 res;
-
-		mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
-
-		if (deltapm > (pm_100ms - pm_thresh) &&
-		    deltapm < (pm_100ms + pm_thresh)) {
-			apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
-		} else {
-			res = (((u64) deltapm) *  mult) >> 22;
-			do_div(res, 1000000);
-			printk(KERN_WARNING "APIC calibration not consistent "
-			       "with PM Timer: %ldms instead of 100ms\n",
-			       (long)res);
-			/* Correct the lapic counter value */
-			res = (((u64) delta) * pm_100ms);
-			do_div(res, deltapm);
-			printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
-			       "%lu (%ld)\n", (unsigned long) res, delta);
-			delta = (long) res;
-		}
-		pm_referenced = 1;
-	}
+	/* we trust the PM based calibration if possible */
+	pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
+					&delta);
 
 	/* Calculate the scaled math multiplication factor */
 	lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
@@ -489,8 +640,6 @@ void __init setup_boot_APIC_clock(void)
 		    calibration_result / (1000000 / HZ),
 		    calibration_result % (1000000 / HZ));
 
-	local_apic_timer_verify_ok = 1;
-
 	/*
 	 * Do a sanity check on the APIC calibration result
 	 */
@@ -498,13 +647,15 @@ void __init setup_boot_APIC_clock(void)
 		local_irq_enable();
 		printk(KERN_WARNING
 		       "APIC frequency too slow, disabling apic timer\n");
-		/* No broadcast on UP ! */
-		if (num_possible_cpus() > 1)
-			setup_APIC_timer();
-		return;
+		return -1;
 	}
 
-	/* We trust the pm timer based calibration */
+	levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
+
+	/*
+	 * PM timer calibration failed or not turned on
+	 * so lets try APIC timer based calibration
+	 */
 	if (!pm_referenced) {
 		apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
 
@@ -536,34 +687,68 @@ void __init setup_boot_APIC_clock(void)
 		if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
 			apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
 		else
-			local_apic_timer_verify_ok = 0;
+			levt->features |= CLOCK_EVT_FEAT_DUMMY;
 	} else
 		local_irq_enable();
 
-	if (!local_apic_timer_verify_ok) {
+	if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
 		printk(KERN_WARNING
 		       "APIC timer disabled due to verification failure.\n");
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Setup the boot APIC
+ *
+ * Calibrate and verify the result.
+ */
+void __init setup_boot_APIC_clock(void)
+{
+	/*
+	 * The local apic timer can be disabled via the kernel
+	 * commandline or from the CPU detection code. Register the lapic
+	 * timer as a dummy clock event source on SMP systems, so the
+	 * broadcast mechanism is used. On UP systems simply ignore it.
+	 */
+	if (disable_apic_timer) {
+		printk(KERN_INFO "Disabling APIC timer\n");
 		/* No broadcast on UP ! */
-		if (num_possible_cpus() == 1)
-			return;
-	} else {
-		/*
-		 * If nmi_watchdog is set to IO_APIC, we need the
-		 * PIT/HPET going.  Otherwise register lapic as a dummy
-		 * device.
-		 */
-		if (nmi_watchdog != NMI_IO_APIC)
-			lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
-		else
-			printk(KERN_WARNING "APIC timer registered as dummy,"
-				" due to nmi_watchdog=%d!\n", nmi_watchdog);
+		if (num_possible_cpus() > 1) {
+			lapic_clockevent.mult = 1;
+			setup_APIC_timer();
+		}
+		return;
+	}
+
+	apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
+		    "calibrating APIC timer ...\n");
+
+	if (calibrate_APIC_clock()) {
+		/* No broadcast on UP ! */
+		if (num_possible_cpus() > 1)
+			setup_APIC_timer();
+		return;
 	}
 
+	/*
+	 * If nmi_watchdog is set to IO_APIC, we need the
+	 * PIT/HPET going.  Otherwise register lapic as a dummy
+	 * device.
+	 */
+	if (nmi_watchdog != NMI_IO_APIC)
+		lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
+	else
+		printk(KERN_WARNING "APIC timer registered as dummy,"
+			" due to nmi_watchdog=%d!\n", nmi_watchdog);
+
 	/* Setup the lapic or request the broadcast */
 	setup_APIC_timer();
 }
 
-void __devinit setup_secondary_APIC_clock(void)
+void __cpuinit setup_secondary_APIC_clock(void)
 {
 	setup_APIC_timer();
 }
@@ -598,7 +783,11 @@ static void local_apic_timer_interrupt(void)
 	/*
 	 * the NMI deadlock-detector uses this.
 	 */
+#ifdef CONFIG_X86_64
+	add_pda(apic_timer_irqs, 1);
+#else
 	per_cpu(irq_stat, cpu).apic_timer_irqs++;
+#endif
 
 	evt->event_handler(evt);
 }
@@ -625,6 +814,9 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
 	 * Besides, if we don't timer interrupts ignore the global
 	 * interrupt lock, which is the WrongThing (tm) to do.
 	 */
+#ifdef CONFIG_X86_64
+	exit_idle();
+#endif
 	irq_enter();
 	local_apic_timer_interrupt();
 	irq_exit();
@@ -638,35 +830,6 @@ int setup_profiling_timer(unsigned int multiplier)
 }
 
 /*
- * Setup extended LVT, AMD specific (K8, family 10h)
- *
- * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
- * MCE interrupts are supported. Thus MCE offset must be set to 0.
- */
-
-#define APIC_EILVT_LVTOFF_MCE 0
-#define APIC_EILVT_LVTOFF_IBS 1
-
-static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
-{
-	unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
-	unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;
-	apic_write(reg, v);
-}
-
-u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
-{
-	setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
-	return APIC_EILVT_LVTOFF_MCE;
-}
-
-u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
-{
-	setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
-	return APIC_EILVT_LVTOFF_IBS;
-}
-
-/*
  * Local APIC start and shutdown
  */
 
@@ -693,45 +856,41 @@ void clear_local_APIC(void)
 	 */
 	if (maxlvt >= 3) {
 		v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
-		apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED);
+		apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
 	}
 	/*
 	 * Careful: we have to set masks only first to deassert
 	 * any level-triggered sources.
 	 */
 	v = apic_read(APIC_LVTT);
-	apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED);
+	apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
 	v = apic_read(APIC_LVT0);
-	apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
+	apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
 	v = apic_read(APIC_LVT1);
-	apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED);
+	apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
 	if (maxlvt >= 4) {
 		v = apic_read(APIC_LVTPC);
-		apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED);
+		apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
 	}
 
 	/* lets not touch this if we didn't frob it */
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
 	if (maxlvt >= 5) {
 		v = apic_read(APIC_LVTTHMR);
-		apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED);
+		apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
 	}
 #endif
 	/*
 	 * Clean APIC state for other OSs:
 	 */
-	apic_write_around(APIC_LVTT, APIC_LVT_MASKED);
-	apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
-	apic_write_around(APIC_LVT1, APIC_LVT_MASKED);
+	apic_write(APIC_LVTT, APIC_LVT_MASKED);
+	apic_write(APIC_LVT0, APIC_LVT_MASKED);
+	apic_write(APIC_LVT1, APIC_LVT_MASKED);
 	if (maxlvt >= 3)
-		apic_write_around(APIC_LVTERR, APIC_LVT_MASKED);
+		apic_write(APIC_LVTERR, APIC_LVT_MASKED);
 	if (maxlvt >= 4)
-		apic_write_around(APIC_LVTPC, APIC_LVT_MASKED);
+		apic_write(APIC_LVTPC, APIC_LVT_MASKED);
 
-#ifdef CONFIG_X86_MCE_P4THERMAL
-	if (maxlvt >= 5)
-		apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
-#endif
 	/* Integrated APIC (!82489DX) ? */
 	if (lapic_is_integrated()) {
 		if (maxlvt > 3)
@@ -746,7 +905,7 @@ void clear_local_APIC(void)
  */
 void disable_local_APIC(void)
 {
-	unsigned long value;
+	unsigned int value;
 
 	clear_local_APIC();
 
@@ -756,8 +915,9 @@ void disable_local_APIC(void)
 	 */
 	value = apic_read(APIC_SPIV);
 	value &= ~APIC_SPIV_APIC_ENABLED;
-	apic_write_around(APIC_SPIV, value);
+	apic_write(APIC_SPIV, value);
 
+#ifdef CONFIG_X86_32
 	/*
 	 * When LAPIC was disabled by the BIOS and enabled by the kernel,
 	 * restore the disabled state.
@@ -769,6 +929,7 @@ void disable_local_APIC(void)
 		l &= ~MSR_IA32_APICBASE_ENABLE;
 		wrmsr(MSR_IA32_APICBASE, l, h);
 	}
+#endif
 }
 
 /*
@@ -785,11 +946,15 @@ void lapic_shutdown(void)
 		return;
 
 	local_irq_save(flags);
-	clear_local_APIC();
 
-	if (enabled_via_apicbase)
+#ifdef CONFIG_X86_32
+	if (!enabled_via_apicbase)
+		clear_local_APIC();
+	else
+#endif
 		disable_local_APIC();
 
+
 	local_irq_restore(flags);
 }
 
@@ -834,6 +999,12 @@ int __init verify_local_APIC(void)
 	 */
 	reg0 = apic_read(APIC_ID);
 	apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
+	apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
+	reg1 = apic_read(APIC_ID);
+	apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
+	apic_write(APIC_ID, reg0);
+	if (reg1 != (reg0 ^ APIC_ID_MASK))
+		return 0;
 
 	/*
 	 * The next two are just to see if we have sane values.
@@ -859,14 +1030,15 @@ void __init sync_Arb_IDs(void)
 	 */
 	if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
 		return;
+
 	/*
 	 * Wait for idle.
 	 */
 	apic_wait_icr_idle();
 
 	apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
-	apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
-				| APIC_DM_INIT);
+	apic_write(APIC_ICR, APIC_DEST_ALLINC |
+			APIC_INT_LEVELTRIG | APIC_DM_INIT);
 }
 
 /*
@@ -874,7 +1046,7 @@ void __init sync_Arb_IDs(void)
  */
 void __init init_bsp_APIC(void)
 {
-	unsigned long value;
+	unsigned int value;
 
 	/*
 	 * Don't do the setup now if we have a SMP BIOS as the
@@ -895,60 +1067,66 @@ void __init init_bsp_APIC(void)
 	value &= ~APIC_VECTOR_MASK;
 	value |= APIC_SPIV_APIC_ENABLED;
 
+#ifdef CONFIG_X86_32
 	/* This bit is reserved on P4/Xeon and should be cleared */
 	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
 	    (boot_cpu_data.x86 == 15))
 		value &= ~APIC_SPIV_FOCUS_DISABLED;
 	else
+#endif
 		value |= APIC_SPIV_FOCUS_DISABLED;
 	value |= SPURIOUS_APIC_VECTOR;
-	apic_write_around(APIC_SPIV, value);
+	apic_write(APIC_SPIV, value);
 
 	/*
 	 * Set up the virtual wire mode.
 	 */
-	apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
+	apic_write(APIC_LVT0, APIC_DM_EXTINT);
 	value = APIC_DM_NMI;
 	if (!lapic_is_integrated())		/* 82489DX */
 		value |= APIC_LVT_LEVEL_TRIGGER;
-	apic_write_around(APIC_LVT1, value);
+	apic_write(APIC_LVT1, value);
 }
 
 static void __cpuinit lapic_setup_esr(void)
 {
-	unsigned long oldvalue, value, maxlvt;
-	if (lapic_is_integrated() && !esr_disable) {
-		/* !82489DX */
-		maxlvt = lapic_get_maxlvt();
-		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP. */
-			apic_write(APIC_ESR, 0);
-		oldvalue = apic_read(APIC_ESR);
+	unsigned int oldvalue, value, maxlvt;
 
-		/* enables sending errors */
-		value = ERROR_APIC_VECTOR;
-		apic_write_around(APIC_LVTERR, value);
+	if (!lapic_is_integrated()) {
+		printk(KERN_INFO "No ESR for 82489DX.\n");
+		return;
+	}
+
+	if (esr_disable) {
 		/*
-		 * spec says clear errors after enabling vector.
+		 * Something untraceable is creating bad interrupts on
+		 * secondary quads ... for the moment, just leave the
+		 * ESR disabled - we can't do anything useful with the
+		 * errors anyway - mbligh
 		 */
-		if (maxlvt > 3)
-			apic_write(APIC_ESR, 0);
-		value = apic_read(APIC_ESR);
-		if (value != oldvalue)
-			apic_printk(APIC_VERBOSE, "ESR value before enabling "
-				"vector: 0x%08lx  after: 0x%08lx\n",
-				oldvalue, value);
-	} else {
-		if (esr_disable)
-			/*
-			 * Something untraceable is creating bad interrupts on
-			 * secondary quads ... for the moment, just leave the
-			 * ESR disabled - we can't do anything useful with the
-			 * errors anyway - mbligh
-			 */
-			printk(KERN_INFO "Leaving ESR disabled.\n");
-		else
-			printk(KERN_INFO "No ESR for 82489DX.\n");
+		printk(KERN_INFO "Leaving ESR disabled.\n");
+		return;
 	}
+
+	maxlvt = lapic_get_maxlvt();
+	if (maxlvt > 3)		/* Due to the Pentium erratum 3AP. */
+		apic_write(APIC_ESR, 0);
+	oldvalue = apic_read(APIC_ESR);
+
+	/* enables sending errors */
+	value = ERROR_APIC_VECTOR;
+	apic_write(APIC_LVTERR, value);
+
+	/*
+	 * spec says clear errors after enabling vector.
+	 */
+	if (maxlvt > 3)
+		apic_write(APIC_ESR, 0);
+	value = apic_read(APIC_ESR);
+	if (value != oldvalue)
+		apic_printk(APIC_VERBOSE, "ESR value before enabling "
+			"vector: 0x%08x  after: 0x%08x\n",
+			oldvalue, value);
 }
 
 
@@ -957,24 +1135,27 @@ static void __cpuinit lapic_setup_esr(void)
  */
 void __cpuinit setup_local_APIC(void)
 {
-	unsigned long value, integrated;
+	unsigned int value;
 	int i, j;
 
+#ifdef CONFIG_X86_32
 	/* Pound the ESR really hard over the head with a big hammer - mbligh */
-	if (esr_disable) {
+	if (lapic_is_integrated() && esr_disable) {
 		apic_write(APIC_ESR, 0);
 		apic_write(APIC_ESR, 0);
 		apic_write(APIC_ESR, 0);
 		apic_write(APIC_ESR, 0);
 	}
+#endif
 
-	integrated = lapic_is_integrated();
+	preempt_disable();
 
 	/*
 	 * Double-check whether this APIC is really registered.
+	 * This is meaningless in clustered apic mode, so we skip it.
 	 */
 	if (!apic_id_registered())
-		WARN_ON_ONCE(1);
+		BUG();
 
 	/*
 	 * Intel recommends to set DFR, LDR and TPR before enabling
@@ -989,7 +1170,7 @@ void __cpuinit setup_local_APIC(void)
 	 */
 	value = apic_read(APIC_TASKPRI);
 	value &= ~APIC_TPRI_MASK;
-	apic_write_around(APIC_TASKPRI, value);
+	apic_write(APIC_TASKPRI, value);
 
 	/*
 	 * After a crash, we no longer service the interrupts and a pending
@@ -1020,6 +1201,7 @@ void __cpuinit setup_local_APIC(void)
 	 */
 	value |= APIC_SPIV_APIC_ENABLED;
 
+#ifdef CONFIG_X86_32
 	/*
 	 * Some unknown Intel IO/APIC (or APIC) errata is biting us with
 	 * certain networking cards. If high frequency interrupts are
@@ -1040,14 +1222,19 @@ void __cpuinit setup_local_APIC(void)
 	 * See also the comment in end_level_ioapic_irq().  --macro
 	 */
 
-	/* Enable focus processor (bit==0) */
+	/*
+	 * - enable focus processor (bit==0)
+	 * - 64bit mode always use processor focus
+	 *   so no need to set it
+	 */
 	value &= ~APIC_SPIV_FOCUS_DISABLED;
+#endif
 
 	/*
 	 * Set spurious IRQ vector
 	 */
 	value |= SPURIOUS_APIC_VECTOR;
-	apic_write_around(APIC_SPIV, value);
+	apic_write(APIC_SPIV, value);
 
 	/*
 	 * Set up LVT0, LVT1:
@@ -1069,7 +1256,7 @@ void __cpuinit setup_local_APIC(void)
 		apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
 				smp_processor_id());
 	}
-	apic_write_around(APIC_LVT0, value);
+	apic_write(APIC_LVT0, value);
 
 	/*
 	 * only the BP should see the LINT1 NMI signal, obviously.
@@ -1078,25 +1265,178 @@ void __cpuinit setup_local_APIC(void)
 		value = APIC_DM_NMI;
 	else
 		value = APIC_DM_NMI | APIC_LVT_MASKED;
-	if (!integrated)		/* 82489DX */
+	if (!lapic_is_integrated())		/* 82489DX */
 		value |= APIC_LVT_LEVEL_TRIGGER;
-	apic_write_around(APIC_LVT1, value);
+	apic_write(APIC_LVT1, value);
+
+	preempt_enable();
 }
 
 void __cpuinit end_local_APIC_setup(void)
 {
-	unsigned long value;
-
 	lapic_setup_esr();
-	/* Disable the local apic timer */
-	value = apic_read(APIC_LVTT);
-	value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-	apic_write_around(APIC_LVTT, value);
+
+#ifdef CONFIG_X86_32
+	{
+		unsigned int value;
+		/* Disable the local apic timer */
+		value = apic_read(APIC_LVTT);
+		value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+		apic_write(APIC_LVTT, value);
+	}
+#endif
 
 	setup_apic_nmi_watchdog(NULL);
 	apic_pm_activate();
 }
 
+#ifdef HAVE_X2APIC
+void check_x2apic(void)
+{
+	int msr, msr2;
+
+	rdmsr(MSR_IA32_APICBASE, msr, msr2);
+
+	if (msr & X2APIC_ENABLE) {
+		printk("x2apic enabled by BIOS, switching to x2apic ops\n");
+		x2apic_preenabled = x2apic = 1;
+		apic_ops = &x2apic_ops;
+	}
+}
+
+void enable_x2apic(void)
+{
+	int msr, msr2;
+
+	rdmsr(MSR_IA32_APICBASE, msr, msr2);
+	if (!(msr & X2APIC_ENABLE)) {
+		printk("Enabling x2apic\n");
+		wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
+	}
+}
+
+void enable_IR_x2apic(void)
+{
+#ifdef CONFIG_INTR_REMAP
+	int ret;
+	unsigned long flags;
+
+	if (!cpu_has_x2apic)
+		return;
+
+	if (!x2apic_preenabled && disable_x2apic) {
+		printk(KERN_INFO
+		       "Skipped enabling x2apic and Interrupt-remapping "
+		       "because of nox2apic\n");
+		return;
+	}
+
+	if (x2apic_preenabled && disable_x2apic)
+		panic("Bios already enabled x2apic, can't enforce nox2apic");
+
+	if (!x2apic_preenabled && skip_ioapic_setup) {
+		printk(KERN_INFO
+		       "Skipped enabling x2apic and Interrupt-remapping "
+		       "because of skipping io-apic setup\n");
+		return;
+	}
+
+	ret = dmar_table_init();
+	if (ret) {
+		printk(KERN_INFO
+		       "dmar_table_init() failed with %d:\n", ret);
+
+		if (x2apic_preenabled)
+			panic("x2apic enabled by bios. But IR enabling failed");
+		else
+			printk(KERN_INFO
+			       "Not enabling x2apic,Intr-remapping\n");
+		return;
+	}
+
+	local_irq_save(flags);
+	mask_8259A();
+
+	ret = save_mask_IO_APIC_setup();
+	if (ret) {
+		printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret);
+		goto end;
+	}
+
+	ret = enable_intr_remapping(1);
+
+	if (ret && x2apic_preenabled) {
+		local_irq_restore(flags);
+		panic("x2apic enabled by bios. But IR enabling failed");
+	}
+
+	if (ret)
+		goto end_restore;
+
+	if (!x2apic) {
+		x2apic = 1;
+		apic_ops = &x2apic_ops;
+		enable_x2apic();
+	}
+
+end_restore:
+	if (ret)
+		/*
+		 * IR enabling failed
+		 */
+		restore_IO_APIC_setup();
+	else
+		reinit_intr_remapped_IO_APIC(x2apic_preenabled);
+
+end:
+	unmask_8259A();
+	local_irq_restore(flags);
+
+	if (!ret) {
+		if (!x2apic_preenabled)
+			printk(KERN_INFO
+			       "Enabled x2apic and interrupt-remapping\n");
+		else
+			printk(KERN_INFO
+			       "Enabled Interrupt-remapping\n");
+	} else
+		printk(KERN_ERR
+		       "Failed to enable Interrupt-remapping and x2apic\n");
+#else
+	if (!cpu_has_x2apic)
+		return;
+
+	if (x2apic_preenabled)
+		panic("x2apic enabled prior OS handover,"
+		      " enable CONFIG_INTR_REMAP");
+
+	printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
+	       " and x2apic\n");
+#endif
+
+	return;
+}
+#endif /* HAVE_X2APIC */
+
+#ifdef CONFIG_X86_64
+/*
+ * Detect and enable local APICs on non-SMP boards.
+ * Original code written by Keir Fraser.
+ * On AMD64 we trust the BIOS - if it says no APIC it is likely
+ * not correctly set up (usually the APIC timer won't work etc.)
+ */
+static int __init detect_init_APIC(void)
+{
+	if (!cpu_has_apic) {
+		printk(KERN_INFO "No local APIC present\n");
+		return -1;
+	}
+
+	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+	boot_cpu_physical_apicid = 0;
+	return 0;
+}
+#else
 /*
  * Detect and initialize APIC
  */
@@ -1175,12 +1515,46 @@ no_apic:
 	printk(KERN_INFO "No local APIC present or hardware disabled\n");
 	return -1;
 }
+#endif
+
+#ifdef CONFIG_X86_64
+void __init early_init_lapic_mapping(void)
+{
+	unsigned long phys_addr;
+
+	/*
+	 * If no local APIC can be found then go out
+	 * : it means there is no mpatable and MADT
+	 */
+	if (!smp_found_config)
+		return;
+
+	phys_addr = mp_lapic_addr;
+
+	set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
+	apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
+		    APIC_BASE, phys_addr);
+
+	/*
+	 * Fetch the APIC ID of the BSP in case we have a
+	 * default configuration (or the MP table is broken).
+	 */
+	boot_cpu_physical_apicid = read_apic_id();
+}
+#endif
 
 /**
  * init_apic_mappings - initialize APIC mappings
  */
 void __init init_apic_mappings(void)
 {
+#ifdef HAVE_X2APIC
+	if (x2apic) {
+		boot_cpu_physical_apicid = read_apic_id();
+		return;
+	}
+#endif
+
 	/*
 	 * If no local APIC can be found then set up a fake all
 	 * zeroes page to simulate the local APIC and another
@@ -1193,30 +1567,36 @@ void __init init_apic_mappings(void)
 		apic_phys = mp_lapic_addr;
 
 	set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
-	printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE,
-	       apic_phys);
+	apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
+				APIC_BASE, apic_phys);
 
 	/*
 	 * Fetch the APIC ID of the BSP in case we have a
 	 * default configuration (or the MP table is broken).
 	 */
 	if (boot_cpu_physical_apicid == -1U)
-		boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
-
+		boot_cpu_physical_apicid = read_apic_id();
 }
 
 /*
  * This initializes the IO-APIC and APIC hardware if this is
  * a UP kernel.
  */
-
 int apic_version[MAX_APICS];
 
 int __init APIC_init_uniprocessor(void)
 {
-	if (disable_apic)
-		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
-
+#ifdef CONFIG_X86_64
+	if (disable_apic) {
+		printk(KERN_INFO "Apic disabled\n");
+		return -1;
+	}
+	if (!cpu_has_apic) {
+		disable_apic = 1;
+		printk(KERN_INFO "Apic disabled by BIOS\n");
+		return -1;
+	}
+#else
 	if (!smp_found_config && !cpu_has_apic)
 		return -1;
 
@@ -1225,39 +1605,68 @@ int __init APIC_init_uniprocessor(void)
 	 */
 	if (!cpu_has_apic &&
 	    APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
-		printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
+		printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n",
 		       boot_cpu_physical_apicid);
 		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
 		return -1;
 	}
+#endif
 
-	verify_local_APIC();
+#ifdef HAVE_X2APIC
+	enable_IR_x2apic();
+#endif
+#ifdef CONFIG_X86_64
+	setup_apic_routing();
+#endif
 
+	verify_local_APIC();
 	connect_bsp_APIC();
 
+#ifdef CONFIG_X86_64
+	apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
+#else
 	/*
 	 * Hack: In case of kdump, after a crash, kernel might be booting
 	 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
 	 * might be zero if read from MP tables. Get it from LAPIC.
 	 */
-#ifdef CONFIG_CRASH_DUMP
-	boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
+# ifdef CONFIG_CRASH_DUMP
+	boot_cpu_physical_apicid = read_apic_id();
+# endif
 #endif
 	physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
-
 	setup_local_APIC();
 
+#ifdef CONFIG_X86_64
+	/*
+	 * Now enable IO-APICs, actually call clear_IO_APIC
+	 * We need clear_IO_APIC before enabling vector on BP
+	 */
+	if (!skip_ioapic_setup && nr_ioapics)
+		enable_IO_APIC();
+#endif
+
 #ifdef CONFIG_X86_IO_APIC
 	if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
 #endif
 		localise_nmi_watchdog();
 	end_local_APIC_setup();
+
 #ifdef CONFIG_X86_IO_APIC
-	if (smp_found_config)
-		if (!skip_ioapic_setup && nr_ioapics)
-			setup_IO_APIC();
+	if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
+		setup_IO_APIC();
+# ifdef CONFIG_X86_64
+	else
+		nr_ioapics = 0;
+# endif
 #endif
+
+#ifdef CONFIG_X86_64
+	setup_boot_APIC_clock();
+	check_nmi_watchdog();
+#else
 	setup_boot_clock();
+#endif
 
 	return 0;
 }
@@ -1271,8 +1680,11 @@ int __init APIC_init_uniprocessor(void)
  */
 void smp_spurious_interrupt(struct pt_regs *regs)
 {
-	unsigned long v;
+	u32 v;
 
+#ifdef CONFIG_X86_64
+	exit_idle();
+#endif
 	irq_enter();
 	/*
 	 * Check if this really is a spurious interrupt and ACK it
@@ -1283,10 +1695,14 @@ void smp_spurious_interrupt(struct pt_regs *regs)
 	if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
 		ack_APIC_irq();
 
+#ifdef CONFIG_X86_64
+	add_pda(irq_spurious_count, 1);
+#else
 	/* see sw-dev-man vol 3, chapter 7.4.13.5 */
 	printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
 	       "should never happen.\n", smp_processor_id());
 	__get_cpu_var(irq_stat).irq_spurious_count++;
+#endif
 	irq_exit();
 }
 
@@ -1295,8 +1711,11 @@ void smp_spurious_interrupt(struct pt_regs *regs)
  */
 void smp_error_interrupt(struct pt_regs *regs)
 {
-	unsigned long v, v1;
+	u32 v, v1;
 
+#ifdef CONFIG_X86_64
+	exit_idle();
+#endif
 	irq_enter();
 	/* First tickle the hardware, only then report what went on. -- REW */
 	v = apic_read(APIC_ESR);
@@ -1315,64 +1734,17 @@ void smp_error_interrupt(struct pt_regs *regs)
 	   6: Received illegal vector
 	   7: Illegal register address
 	*/
-	printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
+	printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
 		smp_processor_id(), v , v1);
 	irq_exit();
 }
 
-#ifdef CONFIG_SMP
-void __init smp_intr_init(void)
-{
-	/*
-	 * IRQ0 must be given a fixed assignment and initialized,
-	 * because it's used before the IO-APIC is set up.
-	 */
-	set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
-
-	/*
-	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
-	 * IPI, driven by wakeup.
-	 */
-	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
-	/* IPI for invalidation */
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
-
-	/* IPI for generic function call */
-	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
-	/* IPI for single call function */
-	set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
-				call_function_single_interrupt);
-}
-#endif
-
-/*
- * Initialize APIC interrupts
- */
-void __init apic_intr_init(void)
-{
-#ifdef CONFIG_SMP
-	smp_intr_init();
-#endif
-	/* self generated IPI for local APIC timer */
-	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
-	/* IPI vectors for APIC spurious and error interrupts */
-	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-
-	/* thermal monitor LVT interrupt */
-#ifdef CONFIG_X86_MCE_P4THERMAL
-	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
-}
-
 /**
  * connect_bsp_APIC - attach the APIC to the interrupt system
  */
 void __init connect_bsp_APIC(void)
 {
+#ifdef CONFIG_X86_32
 	if (pic_mode) {
 		/*
 		 * Do not trust the local APIC being empty at bootup.
@@ -1387,6 +1759,7 @@ void __init connect_bsp_APIC(void)
 		outb(0x70, 0x22);
 		outb(0x01, 0x23);
 	}
+#endif
 	enable_apic_mode();
 }
 
@@ -1399,6 +1772,9 @@ void __init connect_bsp_APIC(void)
  */
 void disconnect_bsp_APIC(int virt_wire_setup)
 {
+	unsigned int value;
+
+#ifdef CONFIG_X86_32
 	if (pic_mode) {
 		/*
 		 * Put the board back into PIC mode (has an effect only on
@@ -1410,56 +1786,53 @@ void disconnect_bsp_APIC(int virt_wire_setup)
 				"entering PIC mode.\n");
 		outb(0x70, 0x22);
 		outb(0x00, 0x23);
-	} else {
-		/* Go back to Virtual Wire compatibility mode */
-		unsigned long value;
-
-		/* For the spurious interrupt use vector F, and enable it */
-		value = apic_read(APIC_SPIV);
-		value &= ~APIC_VECTOR_MASK;
-		value |= APIC_SPIV_APIC_ENABLED;
-		value |= 0xf;
-		apic_write_around(APIC_SPIV, value);
-
-		if (!virt_wire_setup) {
-			/*
-			 * For LVT0 make it edge triggered, active high,
-			 * external and enabled
-			 */
-			value = apic_read(APIC_LVT0);
-			value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
-				APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
-				APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
-			value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
-			value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
-			apic_write_around(APIC_LVT0, value);
-		} else {
-			/* Disable LVT0 */
-			apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
-		}
+		return;
+	}
+#endif
+
+	/* Go back to Virtual Wire compatibility mode */
+
+	/* For the spurious interrupt use vector F, and enable it */
+	value = apic_read(APIC_SPIV);
+	value &= ~APIC_VECTOR_MASK;
+	value |= APIC_SPIV_APIC_ENABLED;
+	value |= 0xf;
+	apic_write(APIC_SPIV, value);
 
+	if (!virt_wire_setup) {
 		/*
-		 * For LVT1 make it edge triggered, active high, nmi and
-		 * enabled
+		 * For LVT0 make it edge triggered, active high,
+		 * external and enabled
 		 */
-		value = apic_read(APIC_LVT1);
-		value &= ~(
-			APIC_MODE_MASK | APIC_SEND_PENDING |
+		value = apic_read(APIC_LVT0);
+		value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
 			APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
 			APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
 		value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
-		value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
-		apic_write_around(APIC_LVT1, value);
+		value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+		apic_write(APIC_LVT0, value);
+	} else {
+		/* Disable LVT0 */
+		apic_write(APIC_LVT0, APIC_LVT_MASKED);
 	}
-}
 
-unsigned int __cpuinitdata maxcpus = NR_CPUS;
+	/*
+	 * For LVT1 make it edge triggered, active high,
+	 * nmi and enabled
+	 */
+	value = apic_read(APIC_LVT1);
+	value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+			APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+			APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+	value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+	value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+	apic_write(APIC_LVT1, value);
+}
 
 void __cpuinit generic_processor_info(int apicid, int version)
 {
 	int cpu;
 	cpumask_t tmp_map;
-	physid_mask_t phys_cpu;
 
 	/*
 	 * Validate version
@@ -1472,36 +1845,29 @@ void __cpuinit generic_processor_info(int apicid, int version)
 	}
 	apic_version[apicid] = version;
 
-	phys_cpu = apicid_to_cpu_present(apicid);
-	physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
-
 	if (num_processors >= NR_CPUS) {
 		printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
 			"  Processor ignored.\n", NR_CPUS);
 		return;
 	}
 
-	if (num_processors >= maxcpus) {
-		printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
-			" Processor ignored.\n", maxcpus);
-		return;
-	}
-
 	num_processors++;
 	cpus_complement(tmp_map, cpu_present_map);
 	cpu = first_cpu(tmp_map);
 
-	if (apicid == boot_cpu_physical_apicid)
+	physid_set(apicid, phys_cpu_present_map);
+	if (apicid == boot_cpu_physical_apicid) {
 		/*
 		 * x86_bios_cpu_apicid is required to have processors listed
 		 * in same order as logical cpu numbers. Hence the first
 		 * entry is BSP, and so on.
 		 */
 		cpu = 0;
-
+	}
 	if (apicid > max_physical_apicid)
 		max_physical_apicid = apicid;
 
+#ifdef CONFIG_X86_32
 	/*
 	 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
 	 * but we need to work other dependencies like SMP_SUSPEND etc
@@ -1521,7 +1887,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
 			def_to_bigsmp = 1;
 		}
 	}
-#ifdef CONFIG_SMP
+#endif
+
+#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
 	/* are we being called early in kernel startup? */
 	if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
 		u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1534,16 +1902,29 @@ void __cpuinit generic_processor_info(int apicid, int version)
 		per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
 	}
 #endif
+
 	cpu_set(cpu, cpu_possible_map);
 	cpu_set(cpu, cpu_present_map);
 }
 
+#ifdef CONFIG_X86_64
+int hard_smp_processor_id(void)
+{
+	return read_apic_id();
+}
+#endif
+
 /*
  * Power management
  */
 #ifdef CONFIG_PM
 
 static struct {
+	/*
+	 * 'active' is true if the local APIC was enabled by us and
+	 * not the BIOS; this signifies that we are also responsible
+	 * for disabling it before entering apm/acpi suspend
+	 */
 	int active;
 	/* r/w apic fields */
 	unsigned int apic_id;
@@ -1584,7 +1965,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
 	apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
 	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
 	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
 	if (maxlvt >= 5)
 		apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
 #endif
@@ -1608,16 +1989,23 @@ static int lapic_resume(struct sys_device *dev)
 
 	local_irq_save(flags);
 
-	/*
-	 * Make sure the APICBASE points to the right address
-	 *
-	 * FIXME! This will be wrong if we ever support suspend on
-	 * SMP! We'll need to do this as part of the CPU restore!
-	 */
-	rdmsr(MSR_IA32_APICBASE, l, h);
-	l &= ~MSR_IA32_APICBASE_BASE;
-	l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
-	wrmsr(MSR_IA32_APICBASE, l, h);
+#ifdef HAVE_X2APIC
+	if (x2apic)
+		enable_x2apic();
+	else
+#endif
+	{
+		/*
+		 * Make sure the APICBASE points to the right address
+		 *
+		 * FIXME! This will be wrong if we ever support suspend on
+		 * SMP! We'll need to do this as part of the CPU restore!
+		 */
+		rdmsr(MSR_IA32_APICBASE, l, h);
+		l &= ~MSR_IA32_APICBASE_BASE;
+		l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
+		wrmsr(MSR_IA32_APICBASE, l, h);
+	}
 
 	apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
 	apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -1627,7 +2015,7 @@ static int lapic_resume(struct sys_device *dev)
 	apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
 	apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
 	apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#ifdef CONFIG_X86_MCE_P4THERMAL
+#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
 	if (maxlvt >= 5)
 		apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
 #endif
@@ -1641,7 +2029,9 @@ static int lapic_resume(struct sys_device *dev)
 	apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
 	apic_write(APIC_ESR, 0);
 	apic_read(APIC_ESR);
+
 	local_irq_restore(flags);
+
 	return 0;
 }
 
@@ -1661,7 +2051,7 @@ static struct sys_device device_lapic = {
 	.cls	= &lapic_sysclass,
 };
 
-static void __devinit apic_pm_activate(void)
+static void __cpuinit apic_pm_activate(void)
 {
 	apic_pm_state.active = 1;
 }
@@ -1687,30 +2077,101 @@ static void apic_pm_activate(void) { }
 
 #endif	/* CONFIG_PM */
 
+#ifdef CONFIG_X86_64
 /*
- * APIC command line parameters
+ * apic_is_clustered_box() -- Check if we can expect good TSC
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ *
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis. Use available data to take a good guess.
+ * If in doubt, go HPET.
  */
-static int __init parse_lapic(char *arg)
+__cpuinit int apic_is_clustered_box(void)
 {
-	force_enable_local_apic = 1;
-	return 0;
+	int i, clusters, zeros;
+	unsigned id;
+	u16 *bios_cpu_apicid;
+	DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
+
+	/*
+	 * there is not this kind of box with AMD CPU yet.
+	 * Some AMD box with quadcore cpu and 8 sockets apicid
+	 * will be [4, 0x23] or [8, 0x27] could be thought to
+	 * vsmp box still need checking...
+	 */
+	if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
+		return 0;
+
+	bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
+	bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
+
+	for (i = 0; i < NR_CPUS; i++) {
+		/* are we being called early in kernel startup? */
+		if (bios_cpu_apicid) {
+			id = bios_cpu_apicid[i];
+		}
+		else if (i < nr_cpu_ids) {
+			if (cpu_present(i))
+				id = per_cpu(x86_bios_cpu_apicid, i);
+			else
+				continue;
+		}
+		else
+			break;
+
+		if (id != BAD_APICID)
+			__set_bit(APIC_CLUSTERID(id), clustermap);
+	}
+
+	/* Problem:  Partially populated chassis may not have CPUs in some of
+	 * the APIC clusters they have been allocated.  Only present CPUs have
+	 * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
+	 * Since clusters are allocated sequentially, count zeros only if
+	 * they are bounded by ones.
+	 */
+	clusters = 0;
+	zeros = 0;
+	for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
+		if (test_bit(i, clustermap)) {
+			clusters += 1 + zeros;
+			zeros = 0;
+		} else
+			++zeros;
+	}
+
+	/* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
+	 * not guaranteed to be synced between boards
+	 */
+	if (is_vsmp_box() && clusters > 1)
+		return 1;
+
+	/*
+	 * If clusters > 2, then should be multi-chassis.
+	 * May have to revisit this when multi-core + hyperthreaded CPUs come
+	 * out, but AFAIK this will work even for them.
+	 */
+	return (clusters > 2);
 }
-early_param("lapic", parse_lapic);
+#endif
 
-static int __init parse_nolapic(char *arg)
+/*
+ * APIC command line parameters
+ */
+static int __init setup_disableapic(char *arg)
 {
 	disable_apic = 1;
-	clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+	setup_clear_cpu_cap(X86_FEATURE_APIC);
 	return 0;
 }
-early_param("nolapic", parse_nolapic);
+early_param("disableapic", setup_disableapic);
 
-static int __init parse_disable_lapic_timer(char *arg)
+/* same as disableapic, for compatibility */
+static int __init setup_nolapic(char *arg)
 {
-	local_apic_timer_disabled = 1;
-	return 0;
+	return setup_disableapic(arg);
 }
-early_param("nolapic_timer", parse_disable_lapic_timer);
+early_param("nolapic", setup_nolapic);
 
 static int __init parse_lapic_timer_c2_ok(char *arg)
 {
@@ -1719,15 +2180,43 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
 }
 early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
 
-static int __init apic_set_verbosity(char *str)
+static int __init parse_disable_apic_timer(char *arg)
 {
-	if (strcmp("debug", str) == 0)
+	disable_apic_timer = 1;
+	return 0;
+}
+early_param("noapictimer", parse_disable_apic_timer);
+
+static int __init parse_nolapic_timer(char *arg)
+{
+	disable_apic_timer = 1;
+	return 0;
+}
+early_param("nolapic_timer", parse_nolapic_timer);
+
+static int __init apic_set_verbosity(char *arg)
+{
+	if (!arg)  {
+#ifdef CONFIG_X86_64
+		skip_ioapic_setup = 0;
+		return 0;
+#endif
+		return -EINVAL;
+	}
+
+	if (strcmp("debug", arg) == 0)
 		apic_verbosity = APIC_DEBUG;
-	else if (strcmp("verbose", str) == 0)
+	else if (strcmp("verbose", arg) == 0)
 		apic_verbosity = APIC_VERBOSE;
-	return 1;
+	else {
+		printk(KERN_WARNING "APIC Verbosity level %s not recognised"
+			" use apic=verbose or apic=debug\n", arg);
+		return -EINVAL;
+	}
+
+	return 0;
 }
-__setup("apic=", apic_set_verbosity);
+early_param("apic", apic_set_verbosity);
 
 static int __init lapic_insert_resource(void)
 {
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
deleted file mode 100644
index 1e3d32e27c14..000000000000
--- a/arch/x86/kernel/apic_64.c
+++ /dev/null
@@ -1,1393 +0,0 @@
-/*
- *	Local APIC handling, local APIC timers
- *
- *	(c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
- *
- *	Fixes
- *	Maciej W. Rozycki	:	Bits for genuine 82489DX APICs;
- *					thanks to Eric Gilmore
- *					and Rolf G. Tews
- *					for testing these extensively.
- *	Maciej W. Rozycki	:	Various updates and fixes.
- *	Mikael Pettersson	:	Power Management for UP-APIC.
- *	Pavel Machek and
- *	Mikael Pettersson	:	PM converted to driver model.
- */
-
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/ioport.h>
-#include <linux/clockchips.h>
-#include <linux/acpi_pmtmr.h>
-#include <linux/module.h>
-
-#include <asm/atomic.h>
-#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
-#include <asm/hpet.h>
-#include <asm/pgalloc.h>
-#include <asm/nmi.h>
-#include <asm/idle.h>
-#include <asm/proto.h>
-#include <asm/timex.h>
-#include <asm/apic.h>
-
-#include <mach_ipi.h>
-#include <mach_apic.h>
-
-static int disable_apic_timer __cpuinitdata;
-static int apic_calibrate_pmtmr __initdata;
-int disable_apic;
-
-/* Local APIC timer works in C2 */
-int local_apic_timer_c2_ok;
-EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
-
-/*
- * Debug level, exported for io_apic.c
- */
-int apic_verbosity;
-
-/* Have we found an MP table */
-int smp_found_config;
-
-static struct resource lapic_resource = {
-	.name = "Local APIC",
-	.flags = IORESOURCE_MEM | IORESOURCE_BUSY,
-};
-
-static unsigned int calibration_result;
-
-static int lapic_next_event(unsigned long delta,
-			    struct clock_event_device *evt);
-static void lapic_timer_setup(enum clock_event_mode mode,
-			      struct clock_event_device *evt);
-static void lapic_timer_broadcast(cpumask_t mask);
-static void apic_pm_activate(void);
-
-static struct clock_event_device lapic_clockevent = {
-	.name		= "lapic",
-	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
-			| CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
-	.shift		= 32,
-	.set_mode	= lapic_timer_setup,
-	.set_next_event	= lapic_next_event,
-	.broadcast	= lapic_timer_broadcast,
-	.rating		= 100,
-	.irq		= -1,
-};
-static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
-
-static unsigned long apic_phys;
-
-unsigned long mp_lapic_addr;
-
-unsigned int __cpuinitdata maxcpus = NR_CPUS;
-/*
- * Get the LAPIC version
- */
-static inline int lapic_get_version(void)
-{
-	return GET_APIC_VERSION(apic_read(APIC_LVR));
-}
-
-/*
- * Check, if the APIC is integrated or a seperate chip
- */
-static inline int lapic_is_integrated(void)
-{
-	return 1;
-}
-
-/*
- * Check, whether this is a modern or a first generation APIC
- */
-static int modern_apic(void)
-{
-	/* AMD systems use old APIC versions, so check the CPU */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
-	    boot_cpu_data.x86 >= 0xf)
-		return 1;
-	return lapic_get_version() >= 0x14;
-}
-
-void apic_wait_icr_idle(void)
-{
-	while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
-		cpu_relax();
-}
-
-u32 safe_apic_wait_icr_idle(void)
-{
-	u32 send_status;
-	int timeout;
-
-	timeout = 0;
-	do {
-		send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
-		if (!send_status)
-			break;
-		udelay(100);
-	} while (timeout++ < 1000);
-
-	return send_status;
-}
-
-/**
- * enable_NMI_through_LVT0 - enable NMI through local vector table 0
- */
-void __cpuinit enable_NMI_through_LVT0(void)
-{
-	unsigned int v;
-
-	/* unmask and set to NMI */
-	v = APIC_DM_NMI;
-	apic_write(APIC_LVT0, v);
-}
-
-/**
- * lapic_get_maxlvt - get the maximum number of local vector table entries
- */
-int lapic_get_maxlvt(void)
-{
-	unsigned int v, maxlvt;
-
-	v = apic_read(APIC_LVR);
-	maxlvt = GET_APIC_MAXLVT(v);
-	return maxlvt;
-}
-
-/*
- * This function sets up the local APIC timer, with a timeout of
- * 'clocks' APIC bus clock. During calibration we actually call
- * this function twice on the boot CPU, once with a bogus timeout
- * value, second time for real. The other (noncalibrating) CPUs
- * call this function only once, with the real, calibrated value.
- *
- * We do reads before writes even if unnecessary, to get around the
- * P5 APIC double write bug.
- */
-
-static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
-{
-	unsigned int lvtt_value, tmp_value;
-
-	lvtt_value = LOCAL_TIMER_VECTOR;
-	if (!oneshot)
-		lvtt_value |= APIC_LVT_TIMER_PERIODIC;
-	if (!irqen)
-		lvtt_value |= APIC_LVT_MASKED;
-
-	apic_write(APIC_LVTT, lvtt_value);
-
-	/*
-	 * Divide PICLK by 16
-	 */
-	tmp_value = apic_read(APIC_TDCR);
-	apic_write(APIC_TDCR, (tmp_value
-				& ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
-				| APIC_TDR_DIV_16);
-
-	if (!oneshot)
-		apic_write(APIC_TMICT, clocks);
-}
-
-/*
- * Setup extended LVT, AMD specific (K8, family 10h)
- *
- * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
- * MCE interrupts are supported. Thus MCE offset must be set to 0.
- */
-
-#define APIC_EILVT_LVTOFF_MCE 0
-#define APIC_EILVT_LVTOFF_IBS 1
-
-static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
-{
-	unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
-	unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;
-
-	apic_write(reg, v);
-}
-
-u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
-{
-	setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
-	return APIC_EILVT_LVTOFF_MCE;
-}
-
-u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
-{
-	setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
-	return APIC_EILVT_LVTOFF_IBS;
-}
-
-/*
- * Program the next event, relative to now
- */
-static int lapic_next_event(unsigned long delta,
-			    struct clock_event_device *evt)
-{
-	apic_write(APIC_TMICT, delta);
-	return 0;
-}
-
-/*
- * Setup the lapic timer in periodic or oneshot mode
- */
-static void lapic_timer_setup(enum clock_event_mode mode,
-			      struct clock_event_device *evt)
-{
-	unsigned long flags;
-	unsigned int v;
-
-	/* Lapic used as dummy for broadcast ? */
-	if (evt->features & CLOCK_EVT_FEAT_DUMMY)
-		return;
-
-	local_irq_save(flags);
-
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-	case CLOCK_EVT_MODE_ONESHOT:
-		__setup_APIC_LVTT(calibration_result,
-				  mode != CLOCK_EVT_MODE_PERIODIC, 1);
-		break;
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-		v = apic_read(APIC_LVTT);
-		v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-		apic_write(APIC_LVTT, v);
-		break;
-	case CLOCK_EVT_MODE_RESUME:
-		/* Nothing to do here */
-		break;
-	}
-
-	local_irq_restore(flags);
-}
-
-/*
- * Local APIC timer broadcast function
- */
-static void lapic_timer_broadcast(cpumask_t mask)
-{
-#ifdef CONFIG_SMP
-	send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
-#endif
-}
-
-/*
- * Setup the local APIC timer for this CPU. Copy the initilized values
- * of the boot CPU and register the clock event in the framework.
- */
-static void setup_APIC_timer(void)
-{
-	struct clock_event_device *levt = &__get_cpu_var(lapic_events);
-
-	memcpy(levt, &lapic_clockevent, sizeof(*levt));
-	levt->cpumask = cpumask_of_cpu(smp_processor_id());
-
-	clockevents_register_device(levt);
-}
-
-/*
- * In this function we calibrate APIC bus clocks to the external
- * timer. Unfortunately we cannot use jiffies and the timer irq
- * to calibrate, since some later bootup code depends on getting
- * the first irq? Ugh.
- *
- * We want to do the calibration only once since we
- * want to have local timer irqs syncron. CPUs connected
- * by the same APIC bus have the very same bus frequency.
- * And we want to have irqs off anyways, no accidental
- * APIC irq that way.
- */
-
-#define TICK_COUNT 100000000
-
-static void __init calibrate_APIC_clock(void)
-{
-	unsigned apic, apic_start;
-	unsigned long tsc, tsc_start;
-	int result;
-
-	local_irq_disable();
-
-	/*
-	 * Put whatever arbitrary (but long enough) timeout
-	 * value into the APIC clock, we just want to get the
-	 * counter running for calibration.
-	 *
-	 * No interrupt enable !
-	 */
-	__setup_APIC_LVTT(250000000, 0, 0);
-
-	apic_start = apic_read(APIC_TMCCT);
-#ifdef CONFIG_X86_PM_TIMER
-	if (apic_calibrate_pmtmr && pmtmr_ioport) {
-		pmtimer_wait(5000);  /* 5ms wait */
-		apic = apic_read(APIC_TMCCT);
-		result = (apic_start - apic) * 1000L / 5;
-	} else
-#endif
-	{
-		rdtscll(tsc_start);
-
-		do {
-			apic = apic_read(APIC_TMCCT);
-			rdtscll(tsc);
-		} while ((tsc - tsc_start) < TICK_COUNT &&
-				(apic_start - apic) < TICK_COUNT);
-
-		result = (apic_start - apic) * 1000L * tsc_khz /
-					(tsc - tsc_start);
-	}
-
-	local_irq_enable();
-
-	printk(KERN_DEBUG "APIC timer calibration result %d\n", result);
-
-	printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
-		result / 1000 / 1000, result / 1000 % 1000);
-
-	/* Calculate the scaled math multiplication factor */
-	lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC,
-				       lapic_clockevent.shift);
-	lapic_clockevent.max_delta_ns =
-		clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
-	lapic_clockevent.min_delta_ns =
-		clockevent_delta2ns(0xF, &lapic_clockevent);
-
-	calibration_result = result / HZ;
-}
-
-/*
- * Setup the boot APIC
- *
- * Calibrate and verify the result.
- */
-void __init setup_boot_APIC_clock(void)
-{
-	/*
-	 * The local apic timer can be disabled via the kernel commandline.
-	 * Register the lapic timer as a dummy clock event source on SMP
-	 * systems, so the broadcast mechanism is used. On UP systems simply
-	 * ignore it.
-	 */
-	if (disable_apic_timer) {
-		printk(KERN_INFO "Disabling APIC timer\n");
-		/* No broadcast on UP ! */
-		if (num_possible_cpus() > 1) {
-			lapic_clockevent.mult = 1;
-			setup_APIC_timer();
-		}
-		return;
-	}
-
-	printk(KERN_INFO "Using local APIC timer interrupts.\n");
-	calibrate_APIC_clock();
-
-	/*
-	 * Do a sanity check on the APIC calibration result
-	 */
-	if (calibration_result < (1000000 / HZ)) {
-		printk(KERN_WARNING
-		       "APIC frequency too slow, disabling apic timer\n");
-		/* No broadcast on UP ! */
-		if (num_possible_cpus() > 1)
-			setup_APIC_timer();
-		return;
-	}
-
-	/*
-	 * If nmi_watchdog is set to IO_APIC, we need the
-	 * PIT/HPET going.  Otherwise register lapic as a dummy
-	 * device.
-	 */
-	if (nmi_watchdog != NMI_IO_APIC)
-		lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
-	else
-		printk(KERN_WARNING "APIC timer registered as dummy,"
-			" due to nmi_watchdog=%d!\n", nmi_watchdog);
-
-	setup_APIC_timer();
-}
-
-void __cpuinit setup_secondary_APIC_clock(void)
-{
-	setup_APIC_timer();
-}
-
-/*
- * The guts of the apic timer interrupt
- */
-static void local_apic_timer_interrupt(void)
-{
-	int cpu = smp_processor_id();
-	struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
-
-	/*
-	 * Normally we should not be here till LAPIC has been initialized but
-	 * in some cases like kdump, its possible that there is a pending LAPIC
-	 * timer interrupt from previous kernel's context and is delivered in
-	 * new kernel the moment interrupts are enabled.
-	 *
-	 * Interrupts are enabled early and LAPIC is setup much later, hence
-	 * its possible that when we get here evt->event_handler is NULL.
-	 * Check for event_handler being NULL and discard the interrupt as
-	 * spurious.
-	 */
-	if (!evt->event_handler) {
-		printk(KERN_WARNING
-		       "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
-		/* Switch it off */
-		lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
-		return;
-	}
-
-	/*
-	 * the NMI deadlock-detector uses this.
-	 */
-	add_pda(apic_timer_irqs, 1);
-
-	evt->event_handler(evt);
-}
-
-/*
- * Local APIC timer interrupt. This is the most natural way for doing
- * local interrupts, but local timer interrupts can be emulated by
- * broadcast interrupts too. [in case the hw doesn't support APIC timers]
- *
- * [ if a single-CPU system runs an SMP kernel then we call the local
- *   interrupt as well. Thus we cannot inline the local irq ... ]
- */
-void smp_apic_timer_interrupt(struct pt_regs *regs)
-{
-	struct pt_regs *old_regs = set_irq_regs(regs);
-
-	/*
-	 * NOTE! We'd better ACK the irq immediately,
-	 * because timer handling can be slow.
-	 */
-	ack_APIC_irq();
-	/*
-	 * update_process_times() expects us to have done irq_enter().
-	 * Besides, if we don't timer interrupts ignore the global
-	 * interrupt lock, which is the WrongThing (tm) to do.
-	 */
-	exit_idle();
-	irq_enter();
-	local_apic_timer_interrupt();
-	irq_exit();
-	set_irq_regs(old_regs);
-}
-
-int setup_profiling_timer(unsigned int multiplier)
-{
-	return -EINVAL;
-}
-
-
-/*
- * Local APIC start and shutdown
- */
-
-/**
- * clear_local_APIC - shutdown the local APIC
- *
- * This is called, when a CPU is disabled and before rebooting, so the state of
- * the local APIC has no dangling leftovers. Also used to cleanout any BIOS
- * leftovers during boot.
- */
-void clear_local_APIC(void)
-{
-	int maxlvt;
-	u32 v;
-
-	/* APIC hasn't been mapped yet */
-	if (!apic_phys)
-		return;
-
-	maxlvt = lapic_get_maxlvt();
-	/*
-	 * Masking an LVT entry can trigger a local APIC error
-	 * if the vector is zero. Mask LVTERR first to prevent this.
-	 */
-	if (maxlvt >= 3) {
-		v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
-		apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
-	}
-	/*
-	 * Careful: we have to set masks only first to deassert
-	 * any level-triggered sources.
-	 */
-	v = apic_read(APIC_LVTT);
-	apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
-	v = apic_read(APIC_LVT0);
-	apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
-	v = apic_read(APIC_LVT1);
-	apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
-	if (maxlvt >= 4) {
-		v = apic_read(APIC_LVTPC);
-		apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
-	}
-
-	/*
-	 * Clean APIC state for other OSs:
-	 */
-	apic_write(APIC_LVTT, APIC_LVT_MASKED);
-	apic_write(APIC_LVT0, APIC_LVT_MASKED);
-	apic_write(APIC_LVT1, APIC_LVT_MASKED);
-	if (maxlvt >= 3)
-		apic_write(APIC_LVTERR, APIC_LVT_MASKED);
-	if (maxlvt >= 4)
-		apic_write(APIC_LVTPC, APIC_LVT_MASKED);
-	apic_write(APIC_ESR, 0);
-	apic_read(APIC_ESR);
-}
-
-/**
- * disable_local_APIC - clear and disable the local APIC
- */
-void disable_local_APIC(void)
-{
-	unsigned int value;
-
-	clear_local_APIC();
-
-	/*
-	 * Disable APIC (implies clearing of registers
-	 * for 82489DX!).
-	 */
-	value = apic_read(APIC_SPIV);
-	value &= ~APIC_SPIV_APIC_ENABLED;
-	apic_write(APIC_SPIV, value);
-}
-
-void lapic_shutdown(void)
-{
-	unsigned long flags;
-
-	if (!cpu_has_apic)
-		return;
-
-	local_irq_save(flags);
-
-	disable_local_APIC();
-
-	local_irq_restore(flags);
-}
-
-/*
- * This is to verify that we're looking at a real local APIC.
- * Check these against your board if the CPUs aren't getting
- * started for no apparent reason.
- */
-int __init verify_local_APIC(void)
-{
-	unsigned int reg0, reg1;
-
-	/*
-	 * The version register is read-only in a real APIC.
-	 */
-	reg0 = apic_read(APIC_LVR);
-	apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
-	apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
-	reg1 = apic_read(APIC_LVR);
-	apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
-
-	/*
-	 * The two version reads above should print the same
-	 * numbers.  If the second one is different, then we
-	 * poke at a non-APIC.
-	 */
-	if (reg1 != reg0)
-		return 0;
-
-	/*
-	 * Check if the version looks reasonably.
-	 */
-	reg1 = GET_APIC_VERSION(reg0);
-	if (reg1 == 0x00 || reg1 == 0xff)
-		return 0;
-	reg1 = lapic_get_maxlvt();
-	if (reg1 < 0x02 || reg1 == 0xff)
-		return 0;
-
-	/*
-	 * The ID register is read/write in a real APIC.
-	 */
-	reg0 = read_apic_id();
-	apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
-	apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
-	reg1 = read_apic_id();
-	apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
-	apic_write(APIC_ID, reg0);
-	if (reg1 != (reg0 ^ APIC_ID_MASK))
-		return 0;
-
-	/*
-	 * The next two are just to see if we have sane values.
-	 * They're only really relevant if we're in Virtual Wire
-	 * compatibility mode, but most boxes are anymore.
-	 */
-	reg0 = apic_read(APIC_LVT0);
-	apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
-	reg1 = apic_read(APIC_LVT1);
-	apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
-
-	return 1;
-}
-
-/**
- * sync_Arb_IDs - synchronize APIC bus arbitration IDs
- */
-void __init sync_Arb_IDs(void)
-{
-	/* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
-	if (modern_apic())
-		return;
-
-	/*
-	 * Wait for idle.
-	 */
-	apic_wait_icr_idle();
-
-	apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
-	apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
-				| APIC_DM_INIT);
-}
-
-/*
- * An initial setup of the virtual wire mode.
- */
-void __init init_bsp_APIC(void)
-{
-	unsigned int value;
-
-	/*
-	 * Don't do the setup now if we have a SMP BIOS as the
-	 * through-I/O-APIC virtual wire mode might be active.
-	 */
-	if (smp_found_config || !cpu_has_apic)
-		return;
-
-	value = apic_read(APIC_LVR);
-
-	/*
-	 * Do not trust the local APIC being empty at bootup.
-	 */
-	clear_local_APIC();
-
-	/*
-	 * Enable APIC.
-	 */
-	value = apic_read(APIC_SPIV);
-	value &= ~APIC_VECTOR_MASK;
-	value |= APIC_SPIV_APIC_ENABLED;
-	value |= APIC_SPIV_FOCUS_DISABLED;
-	value |= SPURIOUS_APIC_VECTOR;
-	apic_write(APIC_SPIV, value);
-
-	/*
-	 * Set up the virtual wire mode.
-	 */
-	apic_write(APIC_LVT0, APIC_DM_EXTINT);
-	value = APIC_DM_NMI;
-	apic_write(APIC_LVT1, value);
-}
-
-/**
- * setup_local_APIC - setup the local APIC
- */
-void __cpuinit setup_local_APIC(void)
-{
-	unsigned int value;
-	int i, j;
-
-	preempt_disable();
-	value = apic_read(APIC_LVR);
-
-	BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
-
-	/*
-	 * Double-check whether this APIC is really registered.
-	 * This is meaningless in clustered apic mode, so we skip it.
-	 */
-	if (!apic_id_registered())
-		BUG();
-
-	/*
-	 * Intel recommends to set DFR, LDR and TPR before enabling
-	 * an APIC.  See e.g. "AP-388 82489DX User's Manual" (Intel
-	 * document number 292116).  So here it goes...
-	 */
-	init_apic_ldr();
-
-	/*
-	 * Set Task Priority to 'accept all'. We never change this
-	 * later on.
-	 */
-	value = apic_read(APIC_TASKPRI);
-	value &= ~APIC_TPRI_MASK;
-	apic_write(APIC_TASKPRI, value);
-
-	/*
-	 * After a crash, we no longer service the interrupts and a pending
-	 * interrupt from previous kernel might still have ISR bit set.
-	 *
-	 * Most probably by now CPU has serviced that pending interrupt and
-	 * it might not have done the ack_APIC_irq() because it thought,
-	 * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it
-	 * does not clear the ISR bit and cpu thinks it has already serivced
-	 * the interrupt. Hence a vector might get locked. It was noticed
-	 * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
-	 */
-	for (i = APIC_ISR_NR - 1; i >= 0; i--) {
-		value = apic_read(APIC_ISR + i*0x10);
-		for (j = 31; j >= 0; j--) {
-			if (value & (1<<j))
-				ack_APIC_irq();
-		}
-	}
-
-	/*
-	 * Now that we are all set up, enable the APIC
-	 */
-	value = apic_read(APIC_SPIV);
-	value &= ~APIC_VECTOR_MASK;
-	/*
-	 * Enable APIC
-	 */
-	value |= APIC_SPIV_APIC_ENABLED;
-
-	/* We always use processor focus */
-
-	/*
-	 * Set spurious IRQ vector
-	 */
-	value |= SPURIOUS_APIC_VECTOR;
-	apic_write(APIC_SPIV, value);
-
-	/*
-	 * Set up LVT0, LVT1:
-	 *
-	 * set up through-local-APIC on the BP's LINT0. This is not
-	 * strictly necessary in pure symmetric-IO mode, but sometimes
-	 * we delegate interrupts to the 8259A.
-	 */
-	/*
-	 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
-	 */
-	value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
-	if (!smp_processor_id() && !value) {
-		value = APIC_DM_EXTINT;
-		apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
-			    smp_processor_id());
-	} else {
-		value = APIC_DM_EXTINT | APIC_LVT_MASKED;
-		apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
-			    smp_processor_id());
-	}
-	apic_write(APIC_LVT0, value);
-
-	/*
-	 * only the BP should see the LINT1 NMI signal, obviously.
-	 */
-	if (!smp_processor_id())
-		value = APIC_DM_NMI;
-	else
-		value = APIC_DM_NMI | APIC_LVT_MASKED;
-	apic_write(APIC_LVT1, value);
-	preempt_enable();
-}
-
-static void __cpuinit lapic_setup_esr(void)
-{
-	unsigned maxlvt = lapic_get_maxlvt();
-
-	apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
-	/*
-	 * spec says clear errors after enabling vector.
-	 */
-	if (maxlvt > 3)
-		apic_write(APIC_ESR, 0);
-}
-
-void __cpuinit end_local_APIC_setup(void)
-{
-	lapic_setup_esr();
-	setup_apic_nmi_watchdog(NULL);
-	apic_pm_activate();
-}
-
-/*
- * Detect and enable local APICs on non-SMP boards.
- * Original code written by Keir Fraser.
- * On AMD64 we trust the BIOS - if it says no APIC it is likely
- * not correctly set up (usually the APIC timer won't work etc.)
- */
-static int __init detect_init_APIC(void)
-{
-	if (!cpu_has_apic) {
-		printk(KERN_INFO "No local APIC present\n");
-		return -1;
-	}
-
-	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-	boot_cpu_physical_apicid = 0;
-	return 0;
-}
-
-void __init early_init_lapic_mapping(void)
-{
-	unsigned long phys_addr;
-
-	/*
-	 * If no local APIC can be found then go out
-	 * : it means there is no mpatable and MADT
-	 */
-	if (!smp_found_config)
-		return;
-
-	phys_addr = mp_lapic_addr;
-
-	set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
-	apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
-		    APIC_BASE, phys_addr);
-
-	/*
-	 * Fetch the APIC ID of the BSP in case we have a
-	 * default configuration (or the MP table is broken).
-	 */
-	boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
-}
-
-/**
- * init_apic_mappings - initialize APIC mappings
- */
-void __init init_apic_mappings(void)
-{
-	/*
-	 * If no local APIC can be found then set up a fake all
-	 * zeroes page to simulate the local APIC and another
-	 * one for the IO-APIC.
-	 */
-	if (!smp_found_config && detect_init_APIC()) {
-		apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
-		apic_phys = __pa(apic_phys);
-	} else
-		apic_phys = mp_lapic_addr;
-
-	set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
-	apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
-				APIC_BASE, apic_phys);
-
-	/*
-	 * Fetch the APIC ID of the BSP in case we have a
-	 * default configuration (or the MP table is broken).
-	 */
-	boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
-}
-
-/*
- * This initializes the IO-APIC and APIC hardware if this is
- * a UP kernel.
- */
-int __init APIC_init_uniprocessor(void)
-{
-	if (disable_apic) {
-		printk(KERN_INFO "Apic disabled\n");
-		return -1;
-	}
-	if (!cpu_has_apic) {
-		disable_apic = 1;
-		printk(KERN_INFO "Apic disabled by BIOS\n");
-		return -1;
-	}
-
-	verify_local_APIC();
-
-	connect_bsp_APIC();
-
-	physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
-	apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
-
-	setup_local_APIC();
-
-	/*
-	 * Now enable IO-APICs, actually call clear_IO_APIC
-	 * We need clear_IO_APIC before enabling vector on BP
-	 */
-	if (!skip_ioapic_setup && nr_ioapics)
-		enable_IO_APIC();
-
-	if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
-		localise_nmi_watchdog();
-	end_local_APIC_setup();
-
-	if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
-		setup_IO_APIC();
-	else
-		nr_ioapics = 0;
-	setup_boot_APIC_clock();
-	check_nmi_watchdog();
-	return 0;
-}
-
-/*
- * Local APIC interrupts
- */
-
-/*
- * This interrupt should _never_ happen with our APIC/SMP architecture
- */
-asmlinkage void smp_spurious_interrupt(void)
-{
-	unsigned int v;
-	exit_idle();
-	irq_enter();
-	/*
-	 * Check if this really is a spurious interrupt and ACK it
-	 * if it is a vectored one.  Just in case...
-	 * Spurious interrupts should not be ACKed.
-	 */
-	v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
-	if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
-		ack_APIC_irq();
-
-	add_pda(irq_spurious_count, 1);
-	irq_exit();
-}
-
-/*
- * This interrupt should never happen with our APIC/SMP architecture
- */
-asmlinkage void smp_error_interrupt(void)
-{
-	unsigned int v, v1;
-
-	exit_idle();
-	irq_enter();
-	/* First tickle the hardware, only then report what went on. -- REW */
-	v = apic_read(APIC_ESR);
-	apic_write(APIC_ESR, 0);
-	v1 = apic_read(APIC_ESR);
-	ack_APIC_irq();
-	atomic_inc(&irq_err_count);
-
-	/* Here is what the APIC error bits mean:
-	   0: Send CS error
-	   1: Receive CS error
-	   2: Send accept error
-	   3: Receive accept error
-	   4: Reserved
-	   5: Send illegal vector
-	   6: Received illegal vector
-	   7: Illegal register address
-	*/
-	printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
-		smp_processor_id(), v , v1);
-	irq_exit();
-}
-
-/**
- *  * connect_bsp_APIC - attach the APIC to the interrupt system
- *   */
-void __init connect_bsp_APIC(void)
-{
-	enable_apic_mode();
-}
-
-void disconnect_bsp_APIC(int virt_wire_setup)
-{
-	/* Go back to Virtual Wire compatibility mode */
-	unsigned long value;
-
-	/* For the spurious interrupt use vector F, and enable it */
-	value = apic_read(APIC_SPIV);
-	value &= ~APIC_VECTOR_MASK;
-	value |= APIC_SPIV_APIC_ENABLED;
-	value |= 0xf;
-	apic_write(APIC_SPIV, value);
-
-	if (!virt_wire_setup) {
-		/*
-		 * For LVT0 make it edge triggered, active high,
-		 * external and enabled
-		 */
-		value = apic_read(APIC_LVT0);
-		value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
-			APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
-			APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
-		value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
-		value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
-		apic_write(APIC_LVT0, value);
-	} else {
-		/* Disable LVT0 */
-		apic_write(APIC_LVT0, APIC_LVT_MASKED);
-	}
-
-	/* For LVT1 make it edge triggered, active high, nmi and enabled */
-	value = apic_read(APIC_LVT1);
-	value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
-			APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
-			APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
-	value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
-	value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
-	apic_write(APIC_LVT1, value);
-}
-
-void __cpuinit generic_processor_info(int apicid, int version)
-{
-	int cpu;
-	cpumask_t tmp_map;
-
-	if (num_processors >= NR_CPUS) {
-		printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
-		       " Processor ignored.\n", NR_CPUS);
-		return;
-	}
-
-	if (num_processors >= maxcpus) {
-		printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
-		       " Processor ignored.\n", maxcpus);
-		return;
-	}
-
-	num_processors++;
-	cpus_complement(tmp_map, cpu_present_map);
-	cpu = first_cpu(tmp_map);
-
-	physid_set(apicid, phys_cpu_present_map);
-	if (apicid == boot_cpu_physical_apicid) {
-		/*
-		 * x86_bios_cpu_apicid is required to have processors listed
-		 * in same order as logical cpu numbers. Hence the first
-		 * entry is BSP, and so on.
-		 */
-		cpu = 0;
-	}
-	if (apicid > max_physical_apicid)
-		max_physical_apicid = apicid;
-
-	/* are we being called early in kernel startup? */
-	if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
-		u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
-		u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
-
-		cpu_to_apicid[cpu] = apicid;
-		bios_cpu_apicid[cpu] = apicid;
-	} else {
-		per_cpu(x86_cpu_to_apicid, cpu) = apicid;
-		per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
-	}
-
-	cpu_set(cpu, cpu_possible_map);
-	cpu_set(cpu, cpu_present_map);
-}
-
-/*
- * Power management
- */
-#ifdef CONFIG_PM
-
-static struct {
-	/* 'active' is true if the local APIC was enabled by us and
-	   not the BIOS; this signifies that we are also responsible
-	   for disabling it before entering apm/acpi suspend */
-	int active;
-	/* r/w apic fields */
-	unsigned int apic_id;
-	unsigned int apic_taskpri;
-	unsigned int apic_ldr;
-	unsigned int apic_dfr;
-	unsigned int apic_spiv;
-	unsigned int apic_lvtt;
-	unsigned int apic_lvtpc;
-	unsigned int apic_lvt0;
-	unsigned int apic_lvt1;
-	unsigned int apic_lvterr;
-	unsigned int apic_tmict;
-	unsigned int apic_tdcr;
-	unsigned int apic_thmr;
-} apic_pm_state;
-
-static int lapic_suspend(struct sys_device *dev, pm_message_t state)
-{
-	unsigned long flags;
-	int maxlvt;
-
-	if (!apic_pm_state.active)
-		return 0;
-
-	maxlvt = lapic_get_maxlvt();
-
-	apic_pm_state.apic_id = read_apic_id();
-	apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
-	apic_pm_state.apic_ldr = apic_read(APIC_LDR);
-	apic_pm_state.apic_dfr = apic_read(APIC_DFR);
-	apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
-	apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
-	if (maxlvt >= 4)
-		apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
-	apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
-	apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
-	apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
-	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
-	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
-#ifdef CONFIG_X86_MCE_INTEL
-	if (maxlvt >= 5)
-		apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
-#endif
-	local_irq_save(flags);
-	disable_local_APIC();
-	local_irq_restore(flags);
-	return 0;
-}
-
-static int lapic_resume(struct sys_device *dev)
-{
-	unsigned int l, h;
-	unsigned long flags;
-	int maxlvt;
-
-	if (!apic_pm_state.active)
-		return 0;
-
-	maxlvt = lapic_get_maxlvt();
-
-	local_irq_save(flags);
-	rdmsr(MSR_IA32_APICBASE, l, h);
-	l &= ~MSR_IA32_APICBASE_BASE;
-	l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
-	wrmsr(MSR_IA32_APICBASE, l, h);
-	apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
-	apic_write(APIC_ID, apic_pm_state.apic_id);
-	apic_write(APIC_DFR, apic_pm_state.apic_dfr);
-	apic_write(APIC_LDR, apic_pm_state.apic_ldr);
-	apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
-	apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
-	apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
-	apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
-#ifdef CONFIG_X86_MCE_INTEL
-	if (maxlvt >= 5)
-		apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
-#endif
-	if (maxlvt >= 4)
-		apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
-	apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
-	apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
-	apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
-	apic_write(APIC_ESR, 0);
-	apic_read(APIC_ESR);
-	apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
-	apic_write(APIC_ESR, 0);
-	apic_read(APIC_ESR);
-	local_irq_restore(flags);
-	return 0;
-}
-
-static struct sysdev_class lapic_sysclass = {
-	.name		= "lapic",
-	.resume		= lapic_resume,
-	.suspend	= lapic_suspend,
-};
-
-static struct sys_device device_lapic = {
-	.id	= 0,
-	.cls	= &lapic_sysclass,
-};
-
-static void __cpuinit apic_pm_activate(void)
-{
-	apic_pm_state.active = 1;
-}
-
-static int __init init_lapic_sysfs(void)
-{
-	int error;
-
-	if (!cpu_has_apic)
-		return 0;
-	/* XXX: remove suspend/resume procs if !apic_pm_state.active? */
-
-	error = sysdev_class_register(&lapic_sysclass);
-	if (!error)
-		error = sysdev_register(&device_lapic);
-	return error;
-}
-device_initcall(init_lapic_sysfs);
-
-#else	/* CONFIG_PM */
-
-static void apic_pm_activate(void) { }
-
-#endif	/* CONFIG_PM */
-
-/*
- * apic_is_clustered_box() -- Check if we can expect good TSC
- *
- * Thus far, the major user of this is IBM's Summit2 series:
- *
- * Clustered boxes may have unsynced TSC problems if they are
- * multi-chassis. Use available data to take a good guess.
- * If in doubt, go HPET.
- */
-__cpuinit int apic_is_clustered_box(void)
-{
-	int i, clusters, zeros;
-	unsigned id;
-	u16 *bios_cpu_apicid;
-	DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
-
-	/*
-	 * there is not this kind of box with AMD CPU yet.
-	 * Some AMD box with quadcore cpu and 8 sockets apicid
-	 * will be [4, 0x23] or [8, 0x27] could be thought to
-	 * vsmp box still need checking...
-	 */
-	if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
-		return 0;
-
-	bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
-	bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
-
-	for (i = 0; i < NR_CPUS; i++) {
-		/* are we being called early in kernel startup? */
-		if (bios_cpu_apicid) {
-			id = bios_cpu_apicid[i];
-		}
-		else if (i < nr_cpu_ids) {
-			if (cpu_present(i))
-				id = per_cpu(x86_bios_cpu_apicid, i);
-			else
-				continue;
-		}
-		else
-			break;
-
-		if (id != BAD_APICID)
-			__set_bit(APIC_CLUSTERID(id), clustermap);
-	}
-
-	/* Problem:  Partially populated chassis may not have CPUs in some of
-	 * the APIC clusters they have been allocated.  Only present CPUs have
-	 * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
-	 * Since clusters are allocated sequentially, count zeros only if
-	 * they are bounded by ones.
-	 */
-	clusters = 0;
-	zeros = 0;
-	for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
-		if (test_bit(i, clustermap)) {
-			clusters += 1 + zeros;
-			zeros = 0;
-		} else
-			++zeros;
-	}
-
-	/* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
-	 * not guaranteed to be synced between boards
-	 */
-	if (is_vsmp_box() && clusters > 1)
-		return 1;
-
-	/*
-	 * If clusters > 2, then should be multi-chassis.
-	 * May have to revisit this when multi-core + hyperthreaded CPUs come
-	 * out, but AFAIK this will work even for them.
-	 */
-	return (clusters > 2);
-}
-
-/*
- * APIC command line parameters
- */
-static int __init apic_set_verbosity(char *str)
-{
-	if (str == NULL)  {
-		skip_ioapic_setup = 0;
-		ioapic_force = 1;
-		return 0;
-	}
-	if (strcmp("debug", str) == 0)
-		apic_verbosity = APIC_DEBUG;
-	else if (strcmp("verbose", str) == 0)
-		apic_verbosity = APIC_VERBOSE;
-	else {
-		printk(KERN_WARNING "APIC Verbosity level %s not recognised"
-				" use apic=verbose or apic=debug\n", str);
-		return -EINVAL;
-	}
-
-	return 0;
-}
-early_param("apic", apic_set_verbosity);
-
-static __init int setup_disableapic(char *str)
-{
-	disable_apic = 1;
-	clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
-	return 0;
-}
-early_param("disableapic", setup_disableapic);
-
-/* same as disableapic, for compatibility */
-static __init int setup_nolapic(char *str)
-{
-	return setup_disableapic(str);
-}
-early_param("nolapic", setup_nolapic);
-
-static int __init parse_lapic_timer_c2_ok(char *arg)
-{
-	local_apic_timer_c2_ok = 1;
-	return 0;
-}
-early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
-
-static __init int setup_noapictimer(char *str)
-{
-	if (str[0] != ' ' && str[0] != 0)
-		return 0;
-	disable_apic_timer = 1;
-	return 1;
-}
-__setup("noapictimer", setup_noapictimer);
-
-static __init int setup_apicpmtimer(char *s)
-{
-	apic_calibrate_pmtmr = 1;
-	notsc_setup(NULL);
-	return 0;
-}
-__setup("apicpmtimer", setup_apicpmtimer);
-
-static int __init lapic_insert_resource(void)
-{
-	if (!apic_phys)
-		return -1;
-
-	/* Put local APIC into the resource map. */
-	lapic_resource.start = apic_phys;
-	lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
-	insert_resource(&iomem_resource, &lapic_resource);
-
-	return 0;
-}
-
-/*
- * need call insert after e820_reserve_resources()
- * that is using request_resource
- */
-late_initcall(lapic_insert_resource);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index bf9b441331e9..5145a6e72bbb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -219,7 +219,6 @@
 #include <linux/time.h>
 #include <linux/sched.h>
 #include <linux/pm.h>
-#include <linux/pm_legacy.h>
 #include <linux/capability.h>
 #include <linux/device.h>
 #include <linux/kernel.h>
@@ -229,12 +228,12 @@
 #include <linux/suspend.h>
 #include <linux/kthread.h>
 #include <linux/jiffies.h>
-#include <linux/smp_lock.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/desc.h>
 #include <asm/i8253.h>
+#include <asm/olpc.h>
 #include <asm/paravirt.h>
 #include <asm/reboot.h>
 
@@ -2218,7 +2217,7 @@ static int __init apm_init(void)
 
 	dmi_check_system(apm_dmi_table);
 
-	if (apm_info.bios.version == 0 || paravirt_enabled()) {
+	if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) {
 		printk(KERN_INFO "apm: BIOS not found.\n");
 		return -ENODEV;
 	}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bacf5deeec2d..7fcf63d22f8b 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -18,9 +18,11 @@
 #include <asm/ia32.h>
 #include <asm/bootparam.h>
 
+#include <xen/interface/xen.h>
+
 #define __NO_STUBS 1
 #undef __SYSCALL
-#undef _ASM_X86_64_UNISTD_H_
+#undef _ASM_X86_UNISTD_64_H
 #define __SYSCALL(nr, sym) [nr] = 1,
 static char syscalls[] = {
 #include <asm/unistd.h>
@@ -131,5 +133,14 @@ int main(void)
 	OFFSET(BP_loadflags, boot_params, hdr.loadflags);
 	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
 	OFFSET(BP_version, boot_params, hdr.version);
+
+	BLANK();
+	DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
+#ifdef CONFIG_XEN
+	BLANK();
+	OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+	OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#undef ENTRY
+#endif
 	return 0;
 }
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
new file mode 100644
index 000000000000..f0dfe6f17e7e
--- /dev/null
+++ b/arch/x86/kernel/bios_uv.c
@@ -0,0 +1,141 @@
+/*
+ * BIOS run time interface routines.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *  Copyright (c) Russ Anderson
+ */
+
+#include <linux/efi.h>
+#include <asm/efi.h>
+#include <linux/io.h>
+#include <asm/uv/bios.h>
+#include <asm/uv/uv_hub.h>
+
+struct uv_systab uv_systab;
+
+s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
+{
+	struct uv_systab *tab = &uv_systab;
+
+	if (!tab->function)
+		/*
+		 * BIOS does not support UV systab
+		 */
+		return BIOS_STATUS_UNIMPLEMENTED;
+
+	return efi_call6((void *)__va(tab->function),
+					(u64)which, a1, a2, a3, a4, a5);
+}
+
+s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
+					u64 a4, u64 a5)
+{
+	unsigned long bios_flags;
+	s64 ret;
+
+	local_irq_save(bios_flags);
+	ret = uv_bios_call(which, a1, a2, a3, a4, a5);
+	local_irq_restore(bios_flags);
+
+	return ret;
+}
+
+s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
+					u64 a4, u64 a5)
+{
+	s64 ret;
+
+	preempt_disable();
+	ret = uv_bios_call(which, a1, a2, a3, a4, a5);
+	preempt_enable();
+
+	return ret;
+}
+
+
+long sn_partition_id;
+EXPORT_SYMBOL_GPL(sn_partition_id);
+long uv_coherency_id;
+EXPORT_SYMBOL_GPL(uv_coherency_id);
+long uv_region_size;
+EXPORT_SYMBOL_GPL(uv_region_size);
+int uv_type;
+
+
+s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
+		long *region)
+{
+	s64 ret;
+	u64 v0, v1;
+	union partition_info_u part;
+
+	ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc,
+				(u64)(&v0), (u64)(&v1), 0, 0);
+	if (ret != BIOS_STATUS_SUCCESS)
+		return ret;
+
+	part.val = v0;
+	if (uvtype)
+		*uvtype = part.hub_version;
+	if (partid)
+		*partid = part.partition_id;
+	if (coher)
+		*coher = part.coherence_id;
+	if (region)
+		*region = part.region_size;
+	return ret;
+}
+
+
+s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
+{
+	return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type,
+			   (u64)ticks_per_second, 0, 0, 0);
+}
+EXPORT_SYMBOL_GPL(uv_bios_freq_base);
+
+
+#ifdef CONFIG_EFI
+void uv_bios_init(void)
+{
+	struct uv_systab *tab;
+
+	if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) ||
+	    (efi.uv_systab == (unsigned long)NULL)) {
+		printk(KERN_CRIT "No EFI UV System Table.\n");
+		uv_systab.function = (unsigned long)NULL;
+		return;
+	}
+
+	tab = (struct uv_systab *)ioremap(efi.uv_systab,
+					sizeof(struct uv_systab));
+	if (strncmp(tab->signature, "UVST", 4) != 0)
+		printk(KERN_ERR "bad signature in UV system table!");
+
+	/*
+	 * Copy table to permanent spot for later use.
+	 */
+	memcpy(&uv_systab, tab, sizeof(struct uv_systab));
+	iounmap(tab);
+
+	printk(KERN_INFO "EFI UV System Table Revision %d\n", tab->revision);
+}
+#else	/* !CONFIG_EFI */
+
+void uv_bios_init(void) { }
+#endif
+
diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore
new file mode 100644
index 000000000000..667df55a4399
--- /dev/null
+++ b/arch/x86/kernel/cpu/.gitignore
@@ -0,0 +1 @@
+capflags.c
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index ee76eaad3001..82ec6075c057 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -3,22 +3,30 @@
 #
 
 obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
-obj-y			+= proc.o feature_names.o
-
-obj-$(CONFIG_X86_32)	+= common.o bugs.o
-obj-$(CONFIG_X86_64)	+= common_64.o bugs_64.o
-obj-$(CONFIG_X86_32)	+= amd.o
-obj-$(CONFIG_X86_64)	+= amd_64.o
-obj-$(CONFIG_X86_32)	+= cyrix.o
-obj-$(CONFIG_X86_32)	+= centaur.o
-obj-$(CONFIG_X86_64)	+= centaur_64.o
-obj-$(CONFIG_X86_32)	+= transmeta.o
-obj-$(CONFIG_X86_32)	+= intel.o
-obj-$(CONFIG_X86_64)	+= intel_64.o
-obj-$(CONFIG_X86_32)	+= umc.o
+obj-y			+= proc.o capflags.o powerflags.o common.o
+
+obj-$(CONFIG_X86_32)	+= bugs.o cmpxchg.o
+obj-$(CONFIG_X86_64)	+= bugs_64.o
+
+obj-$(CONFIG_CPU_SUP_INTEL)		+= intel.o
+obj-$(CONFIG_CPU_SUP_AMD)		+= amd.o
+obj-$(CONFIG_CPU_SUP_CYRIX_32)		+= cyrix.o
+obj-$(CONFIG_CPU_SUP_CENTAUR_32)	+= centaur.o
+obj-$(CONFIG_CPU_SUP_CENTAUR_64)	+= centaur_64.o
+obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
+obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
 obj-$(CONFIG_X86_MCE)	+= mcheck/
 obj-$(CONFIG_MTRR)	+= mtrr/
 obj-$(CONFIG_CPU_FREQ)	+= cpufreq/
 
 obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+
+quiet_cmd_mkcapflags = MKCAP   $@
+      cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
+
+cpufeature = $(src)/../../include/asm/cpufeature.h
+
+targets += capflags.c
+$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE
+	$(call if_changed,mkcapflags)
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 84a8220a6072..0d9c993aa93e 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -7,6 +7,8 @@
 #include <asm/pat.h>
 #include <asm/processor.h>
 
+#include <mach_apic.h>
+
 struct cpuid_bit {
 	u16 feature;
 	u8 reg;
@@ -48,6 +50,92 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 	}
 }
 
+/* leaf 0xb SMT level */
+#define SMT_LEVEL	0
+
+/* leaf 0xb sub-leaf types */
+#define INVALID_TYPE	0
+#define SMT_TYPE	1
+#define CORE_TYPE	2
+
+#define LEAFB_SUBTYPE(ecx)		(((ecx) >> 8) & 0xff)
+#define BITS_SHIFT_NEXT_LEVEL(eax)	((eax) & 0x1f)
+#define LEVEL_MAX_SIBLINGS(ebx)		((ebx) & 0xffff)
+
+/*
+ * Check for extended topology enumeration cpuid leaf 0xb and if it
+ * exists, use it for populating initial_apicid and cpu topology
+ * detection.
+ */
+void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_SMP
+	unsigned int eax, ebx, ecx, edx, sub_index;
+	unsigned int ht_mask_width, core_plus_mask_width;
+	unsigned int core_select_mask, core_level_siblings;
+
+	if (c->cpuid_level < 0xb)
+		return;
+
+	cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
+
+	/*
+	 * check if the cpuid leaf 0xb is actually implemented.
+	 */
+	if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
+		return;
+
+	set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
+
+	/*
+	 * initial apic id, which also represents 32-bit extended x2apic id.
+	 */
+	c->initial_apicid = edx;
+
+	/*
+	 * Populate HT related information from sub-leaf level 0.
+	 */
+	core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
+	core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+
+	sub_index = 1;
+	do {
+		cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
+
+		/*
+		 * Check for the Core type in the implemented sub leaves.
+		 */
+		if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
+			core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
+			core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
+			break;
+		}
+
+		sub_index++;
+	} while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
+
+	core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
+
+#ifdef CONFIG_X86_32
+	c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
+						 & core_select_mask;
+	c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
+#else
+	c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
+	c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
+#endif
+	c->x86_max_cores = (core_level_siblings / smp_num_siblings);
+
+
+	printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
+	       c->phys_proc_id);
+	if (c->x86_max_cores > 1)
+		printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
+		       c->cpu_core_id);
+	return;
+#endif
+}
+
 #ifdef CONFIG_X86_PAT
 void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
 {
@@ -56,9 +144,22 @@ void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
 
 	switch (c->x86_vendor) {
 	case X86_VENDOR_INTEL:
-		if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15))
+		/*
+		 * There is a known erratum on Pentium III and Core Solo
+		 * and Core Duo CPUs.
+		 * " Page with PAT set to WC while associated MTRR is UC
+		 *   may consolidate to UC "
+		 * Because of this erratum, it is better to stick with
+		 * setting WC in MTRR rather than using PAT on these CPUs.
+		 *
+		 * Enable PAT WC only on P4, Core 2 or later CPUs.
+		 */
+		if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15))
 			return;
-		break;
+
+		pat_disable("PAT WC disabled due to known CPU erratum.");
+		return;
+
 	case X86_VENDOR_AMD:
 	case X86_VENDOR_CENTAUR:
 	case X86_VENDOR_TRANSMETA:
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 81a07ca65d44..8f1e31db2ad5 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,13 +1,22 @@
 #include <linux/init.h>
 #include <linux/bitops.h>
 #include <linux/mm.h>
+
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
 
+#ifdef CONFIG_X86_64
+# include <asm/numa_64.h>
+# include <asm/mmconfig.h>
+# include <asm/cacheflush.h>
+#endif
+
 #include <mach_apic.h>
+
 #include "cpu.h"
 
+#ifdef CONFIG_X86_32
 /*
  *	B step AMD K6 before B 9730xxxx have hardware bugs that can cause
  *	misexecution of code under Linux. Owners of such processors should
@@ -24,23 +33,273 @@
 extern void vide(void);
 __asm__(".align 4\nvide: ret");
 
-int force_mwait __cpuinitdata;
-
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
 {
-	if (cpuid_eax(0x80000000) >= 0x80000007) {
-		c->x86_power = cpuid_edx(0x80000007);
-		if (c->x86_power & (1<<8))
-			set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+/*
+ * General Systems BIOSen alias the cpu frequency registers
+ * of the Elan at 0x000df000. Unfortuantly, one of the Linux
+ * drivers subsequently pokes it, and changes the CPU speed.
+ * Workaround : Remove the unneeded alias.
+ */
+#define CBAR		(0xfffc) /* Configuration Base Address  (32-bit) */
+#define CBAR_ENB	(0x80000000)
+#define CBAR_KEY	(0X000000CB)
+	if (c->x86_model == 9 || c->x86_model == 10) {
+		if (inl (CBAR) & CBAR_ENB)
+			outl (0 | CBAR_KEY, CBAR);
 	}
 }
 
-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
+
+static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
 {
 	u32 l, h;
 	int mbytes = num_physpages >> (20-PAGE_SHIFT);
-	int r;
 
+	if (c->x86_model < 6) {
+		/* Based on AMD doc 20734R - June 2000 */
+		if (c->x86_model == 0) {
+			clear_cpu_cap(c, X86_FEATURE_APIC);
+			set_cpu_cap(c, X86_FEATURE_PGE);
+		}
+		return;
+	}
+
+	if (c->x86_model == 6 && c->x86_mask == 1) {
+		const int K6_BUG_LOOP = 1000000;
+		int n;
+		void (*f_vide)(void);
+		unsigned long d, d2;
+
+		printk(KERN_INFO "AMD K6 stepping B detected - ");
+
+		/*
+		 * It looks like AMD fixed the 2.6.2 bug and improved indirect
+		 * calls at the same time.
+		 */
+
+		n = K6_BUG_LOOP;
+		f_vide = vide;
+		rdtscl(d);
+		while (n--)
+			f_vide();
+		rdtscl(d2);
+		d = d2-d;
+
+		if (d > 20*K6_BUG_LOOP)
+			printk("system stability may be impaired when more than 32 MB are used.\n");
+		else
+			printk("probably OK (after B9730xxxx).\n");
+		printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
+	}
+
+	/* K6 with old style WHCR */
+	if (c->x86_model < 8 ||
+	   (c->x86_model == 8 && c->x86_mask < 8)) {
+		/* We can only write allocate on the low 508Mb */
+		if (mbytes > 508)
+			mbytes = 508;
+
+		rdmsr(MSR_K6_WHCR, l, h);
+		if ((l&0x0000FFFF) == 0) {
+			unsigned long flags;
+			l = (1<<0)|((mbytes/4)<<1);
+			local_irq_save(flags);
+			wbinvd();
+			wrmsr(MSR_K6_WHCR, l, h);
+			local_irq_restore(flags);
+			printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
+				mbytes);
+		}
+		return;
+	}
+
+	if ((c->x86_model == 8 && c->x86_mask > 7) ||
+	     c->x86_model == 9 || c->x86_model == 13) {
+		/* The more serious chips .. */
+
+		if (mbytes > 4092)
+			mbytes = 4092;
+
+		rdmsr(MSR_K6_WHCR, l, h);
+		if ((l&0xFFFF0000) == 0) {
+			unsigned long flags;
+			l = ((mbytes>>2)<<22)|(1<<16);
+			local_irq_save(flags);
+			wbinvd();
+			wrmsr(MSR_K6_WHCR, l, h);
+			local_irq_restore(flags);
+			printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
+				mbytes);
+		}
+
+		return;
+	}
+
+	if (c->x86_model == 10) {
+		/* AMD Geode LX is model 10 */
+		/* placeholder for any needed mods */
+		return;
+	}
+}
+
+static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
+{
+	u32 l, h;
+
+	/*
+	 * Bit 15 of Athlon specific MSR 15, needs to be 0
+	 * to enable SSE on Palomino/Morgan/Barton CPU's.
+	 * If the BIOS didn't enable it already, enable it here.
+	 */
+	if (c->x86_model >= 6 && c->x86_model <= 10) {
+		if (!cpu_has(c, X86_FEATURE_XMM)) {
+			printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
+			rdmsr(MSR_K7_HWCR, l, h);
+			l &= ~0x00008000;
+			wrmsr(MSR_K7_HWCR, l, h);
+			set_cpu_cap(c, X86_FEATURE_XMM);
+		}
+	}
+
+	/*
+	 * It's been determined by AMD that Athlons since model 8 stepping 1
+	 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
+	 * As per AMD technical note 27212 0.2
+	 */
+	if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
+		rdmsr(MSR_K7_CLK_CTL, l, h);
+		if ((l & 0xfff00000) != 0x20000000) {
+			printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
+				((l & 0x000fffff)|0x20000000));
+			wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
+		}
+	}
+
+	set_cpu_cap(c, X86_FEATURE_K7);
+}
+#endif
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+static int __cpuinit nearby_node(int apicid)
+{
+	int i, node;
+
+	for (i = apicid - 1; i >= 0; i--) {
+		node = apicid_to_node[i];
+		if (node != NUMA_NO_NODE && node_online(node))
+			return node;
+	}
+	for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
+		node = apicid_to_node[i];
+		if (node != NUMA_NO_NODE && node_online(node))
+			return node;
+	}
+	return first_node(node_online_map); /* Shouldn't happen */
+}
+#endif
+
+/*
+ * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
+ * Assumes number of cores is a power of two.
+ */
+static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_HT
+	unsigned bits;
+
+	bits = c->x86_coreid_bits;
+
+	/* Low order bits define the core id (index of core in socket) */
+	c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+	/* Convert the initial APIC ID into the socket ID */
+	c->phys_proc_id = c->initial_apicid >> bits;
+#endif
+}
+
+static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
+{
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+	int cpu = smp_processor_id();
+	int node;
+	unsigned apicid = hard_smp_processor_id();
+
+	node = c->phys_proc_id;
+	if (apicid_to_node[apicid] != NUMA_NO_NODE)
+		node = apicid_to_node[apicid];
+	if (!node_online(node)) {
+		/* Two possibilities here:
+		   - The CPU is missing memory and no node was created.
+		   In that case try picking one from a nearby CPU
+		   - The APIC IDs differ from the HyperTransport node IDs
+		   which the K8 northbridge parsing fills in.
+		   Assume they are all increased by a constant offset,
+		   but in the same order as the HT nodeids.
+		   If that doesn't result in a usable node fall back to the
+		   path for the previous case.  */
+
+		int ht_nodeid = c->initial_apicid;
+
+		if (ht_nodeid >= 0 &&
+		    apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+			node = apicid_to_node[ht_nodeid];
+		/* Pick a nearby node */
+		if (!node_online(node))
+			node = nearby_node(apicid);
+	}
+	numa_set_node(cpu, node);
+
+	printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
+#endif
+}
+
+static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_HT
+	unsigned bits, ecx;
+
+	/* Multi core CPU? */
+	if (c->extended_cpuid_level < 0x80000008)
+		return;
+
+	ecx = cpuid_ecx(0x80000008);
+
+	c->x86_max_cores = (ecx & 0xff) + 1;
+
+	/* CPU telling us the core id bits shift? */
+	bits = (ecx >> 12) & 0xF;
+
+	/* Otherwise recompute */
+	if (bits == 0) {
+		while ((1 << bits) < c->x86_max_cores)
+			bits++;
+	}
+
+	c->x86_coreid_bits = bits;
+#endif
+}
+
+static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+{
+	early_init_amd_mc(c);
+
+	/* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
+	if (c->x86_power & (1<<8))
+		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+#ifdef CONFIG_X86_64
+	set_cpu_cap(c, X86_FEATURE_SYSCALL32);
+#else
+	/*  Set MTRR capability flag if appropriate */
+	if (c->x86 == 5)
+		if (c->x86_model == 13 || c->x86_model == 9 ||
+		    (c->x86_model == 8 && c->x86_mask >= 8))
+			set_cpu_cap(c, X86_FEATURE_K6_MTRR);
+#endif
+}
+
+static void __cpuinit init_amd(struct cpuinfo_x86 *c)
+{
 #ifdef CONFIG_SMP
 	unsigned long long value;
 
@@ -51,7 +310,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	 * Errata 63 for SH-B3 steppings
 	 * Errata 122 for all steppings (F+ have it disabled by default)
 	 */
-	if (c->x86 == 15) {
+	if (c->x86 == 0xf) {
 		rdmsrl(MSR_K7_HWCR, value);
 		value |= 1 << 6;
 		wrmsrl(MSR_K7_HWCR, value);
@@ -61,213 +320,119 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	early_init_amd(c);
 
 	/*
-	 *	FIXME: We should handle the K5 here. Set up the write
-	 *	range and also turn on MSR 83 bits 4 and 31 (write alloc,
-	 *	no bus pipeline)
-	 */
-
-	/*
 	 * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
 	 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
 	 */
 	clear_cpu_cap(c, 0*32+31);
 
-	r = get_model_name(c);
-
-	switch (c->x86) {
-	case 4:
-		/*
-		 * General Systems BIOSen alias the cpu frequency registers
-		 * of the Elan at 0x000df000. Unfortuantly, one of the Linux
-		 * drivers subsequently pokes it, and changes the CPU speed.
-		 * Workaround : Remove the unneeded alias.
-		 */
-#define CBAR		(0xfffc) /* Configuration Base Address  (32-bit) */
-#define CBAR_ENB	(0x80000000)
-#define CBAR_KEY	(0X000000CB)
-			if (c->x86_model == 9 || c->x86_model == 10) {
-				if (inl (CBAR) & CBAR_ENB)
-					outl (0 | CBAR_KEY, CBAR);
-			}
-			break;
-	case 5:
-			if (c->x86_model < 6) {
-				/* Based on AMD doc 20734R - June 2000 */
-				if (c->x86_model == 0) {
-					clear_cpu_cap(c, X86_FEATURE_APIC);
-					set_cpu_cap(c, X86_FEATURE_PGE);
-				}
-				break;
-			}
-
-			if (c->x86_model == 6 && c->x86_mask == 1) {
-				const int K6_BUG_LOOP = 1000000;
-				int n;
-				void (*f_vide)(void);
-				unsigned long d, d2;
-
-				printk(KERN_INFO "AMD K6 stepping B detected - ");
-
-				/*
-				 * It looks like AMD fixed the 2.6.2 bug and improved indirect
-				 * calls at the same time.
-				 */
-
-				n = K6_BUG_LOOP;
-				f_vide = vide;
-				rdtscl(d);
-				while (n--)
-					f_vide();
-				rdtscl(d2);
-				d = d2-d;
-
-				if (d > 20*K6_BUG_LOOP)
-					printk("system stability may be impaired when more than 32 MB are used.\n");
-				else
-					printk("probably OK (after B9730xxxx).\n");
-				printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
-			}
-
-			/* K6 with old style WHCR */
-			if (c->x86_model < 8 ||
-			   (c->x86_model == 8 && c->x86_mask < 8)) {
-				/* We can only write allocate on the low 508Mb */
-				if (mbytes > 508)
-					mbytes = 508;
-
-				rdmsr(MSR_K6_WHCR, l, h);
-				if ((l&0x0000FFFF) == 0) {
-					unsigned long flags;
-					l = (1<<0)|((mbytes/4)<<1);
-					local_irq_save(flags);
-					wbinvd();
-					wrmsr(MSR_K6_WHCR, l, h);
-					local_irq_restore(flags);
-					printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
-						mbytes);
-				}
-				break;
-			}
-
-			if ((c->x86_model == 8 && c->x86_mask > 7) ||
-			     c->x86_model == 9 || c->x86_model == 13) {
-				/* The more serious chips .. */
-
-				if (mbytes > 4092)
-					mbytes = 4092;
-
-				rdmsr(MSR_K6_WHCR, l, h);
-				if ((l&0xFFFF0000) == 0) {
-					unsigned long flags;
-					l = ((mbytes>>2)<<22)|(1<<16);
-					local_irq_save(flags);
-					wbinvd();
-					wrmsr(MSR_K6_WHCR, l, h);
-					local_irq_restore(flags);
-					printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
-						mbytes);
-				}
-
-				/*  Set MTRR capability flag if appropriate */
-				if (c->x86_model == 13 || c->x86_model == 9 ||
-				   (c->x86_model == 8 && c->x86_mask >= 8))
-					set_cpu_cap(c, X86_FEATURE_K6_MTRR);
-				break;
-			}
-
-			if (c->x86_model == 10) {
-				/* AMD Geode LX is model 10 */
-				/* placeholder for any needed mods */
-				break;
-			}
-			break;
-	case 6: /* An Athlon/Duron */
+#ifdef CONFIG_X86_64
+	/* On C+ stepping K8 rep microcode works well for copy/memset */
+	if (c->x86 == 0xf) {
+		u32 level;
 
-			/*
-			 * Bit 15 of Athlon specific MSR 15, needs to be 0
-			 * to enable SSE on Palomino/Morgan/Barton CPU's.
-			 * If the BIOS didn't enable it already, enable it here.
-			 */
-			if (c->x86_model >= 6 && c->x86_model <= 10) {
-				if (!cpu_has(c, X86_FEATURE_XMM)) {
-					printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
-					rdmsr(MSR_K7_HWCR, l, h);
-					l &= ~0x00008000;
-					wrmsr(MSR_K7_HWCR, l, h);
-					set_cpu_cap(c, X86_FEATURE_XMM);
-				}
-			}
-
-			/*
-			 * It's been determined by AMD that Athlons since model 8 stepping 1
-			 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
-			 * As per AMD technical note 27212 0.2
-			 */
-			if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
-				rdmsr(MSR_K7_CLK_CTL, l, h);
-				if ((l & 0xfff00000) != 0x20000000) {
-					printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
-						((l & 0x000fffff)|0x20000000));
-					wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
-				}
-			}
-			break;
+		level = cpuid_eax(1);
+		if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
+			set_cpu_cap(c, X86_FEATURE_REP_GOOD);
 	}
+	if (c->x86 == 0x10 || c->x86 == 0x11)
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+#else
+
+	/*
+	 *	FIXME: We should handle the K5 here. Set up the write
+	 *	range and also turn on MSR 83 bits 4 and 31 (write alloc,
+	 *	no bus pipeline)
+	 */
 
 	switch (c->x86) {
-	case 15:
-	/* Use K8 tuning for Fam10h and Fam11h */
-	case 0x10:
-	case 0x11:
-		set_cpu_cap(c, X86_FEATURE_K8);
+	case 4:
+		init_amd_k5(c);
 		break;
-	case 6:
-		set_cpu_cap(c, X86_FEATURE_K7);
+	case 5:
+		init_amd_k6(c);
+		break;
+	case 6: /* An Athlon/Duron */
+		init_amd_k7(c);
 		break;
 	}
+
+	/* K6s reports MCEs but don't actually have all the MSRs */
+	if (c->x86 < 6)
+		clear_cpu_cap(c, X86_FEATURE_MCE);
+#endif
+
+	/* Enable workaround for FXSAVE leak */
 	if (c->x86 >= 6)
 		set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
 
-	display_cacheinfo(c);
-
-	if (cpuid_eax(0x80000000) >= 0x80000008)
-		c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
+	if (!c->x86_model_id[0]) {
+		switch (c->x86) {
+		case 0xf:
+			/* Should distinguish Models here, but this is only
+			   a fallback anyways. */
+			strcpy(c->x86_model_id, "Hammer");
+			break;
+		}
+	}
 
-#ifdef CONFIG_X86_HT
-	/*
-	 * On a AMD multi core setup the lower bits of the APIC id
-	 * distinguish the cores.
-	 */
-	if (c->x86_max_cores > 1) {
-		int cpu = smp_processor_id();
-		unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
+	display_cacheinfo(c);
 
-		if (bits == 0) {
-			while ((1 << bits) < c->x86_max_cores)
-				bits++;
-		}
-		c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
-		c->phys_proc_id >>= bits;
-		printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
-		       cpu, c->x86_max_cores, c->cpu_core_id);
+	/* Multi core CPU? */
+	if (c->extended_cpuid_level >= 0x80000008) {
+		amd_detect_cmp(c);
+		srat_detect_node(c);
 	}
+
+#ifdef CONFIG_X86_32
+	detect_ht(c);
 #endif
 
-	if (cpuid_eax(0x80000000) >= 0x80000006) {
-		if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000))
+	if (c->extended_cpuid_level >= 0x80000006) {
+		if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
 			num_cache_leaves = 4;
 		else
 			num_cache_leaves = 3;
 	}
 
-	/* K6s reports MCEs but don't actually have all the MSRs */
-	if (c->x86 < 6)
-		clear_cpu_cap(c, X86_FEATURE_MCE);
+	if (c->x86 >= 0xf && c->x86 <= 0x11)
+		set_cpu_cap(c, X86_FEATURE_K8);
 
-	if (cpu_has_xmm2)
+	if (cpu_has_xmm2) {
+		/* MFENCE stops RDTSC speculation */
 		set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+	}
+
+#ifdef CONFIG_X86_64
+	if (c->x86 == 0x10) {
+		/* do this for boot cpu */
+		if (c == &boot_cpu_data)
+			check_enable_amd_mmconf_dmi();
+
+		fam10h_check_enable_mmcfg();
+	}
+
+	if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+		unsigned long long tseg;
+
+		/*
+		 * Split up direct mapping around the TSEG SMM area.
+		 * Don't do it for gbpages because there seems very little
+		 * benefit in doing so.
+		 */
+		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
+		    printk(KERN_DEBUG "tseg: %010llx\n", tseg);
+		    if ((tseg>>PMD_SHIFT) <
+				(max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
+			((tseg>>PMD_SHIFT) <
+				(max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
+			 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
+			set_memory_4k((unsigned long)__va(tseg), 1);
+		}
+	}
+#endif
 }
 
+#ifdef CONFIG_X86_32
 static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 {
 	/* AMD errata T13 (order #21922) */
@@ -280,10 +445,12 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
 	}
 	return size;
 }
+#endif
 
 static struct cpu_dev amd_cpu_dev __cpuinitdata = {
 	.c_vendor	= "AMD",
 	.c_ident	= { "AuthenticAMD" },
+#ifdef CONFIG_X86_32
 	.c_models = {
 		{ .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
 		  {
@@ -296,9 +463,11 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = {
 		  }
 		},
 	},
+	.c_size_cache	= amd_size_cache,
+#endif
 	.c_early_init   = early_init_amd,
 	.c_init		= init_amd,
-	.c_size_cache	= amd_size_cache,
+	.c_x86_vendor	= X86_VENDOR_AMD,
 };
 
-cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
+cpu_dev_register(amd_cpu_dev);
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
deleted file mode 100644
index 7c36fb8a28d4..000000000000
--- a/arch/x86/kernel/cpu/amd_64.c
+++ /dev/null
@@ -1,222 +0,0 @@
-#include <linux/init.h>
-#include <linux/mm.h>
-
-#include <asm/numa_64.h>
-#include <asm/mmconfig.h>
-#include <asm/cacheflush.h>
-
-#include <mach_apic.h>
-
-#include "cpu.h"
-
-int force_mwait __cpuinitdata;
-
-#ifdef CONFIG_NUMA
-static int __cpuinit nearby_node(int apicid)
-{
-	int i, node;
-
-	for (i = apicid - 1; i >= 0; i--) {
-		node = apicid_to_node[i];
-		if (node != NUMA_NO_NODE && node_online(node))
-			return node;
-	}
-	for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
-		node = apicid_to_node[i];
-		if (node != NUMA_NO_NODE && node_online(node))
-			return node;
-	}
-	return first_node(node_online_map); /* Shouldn't happen */
-}
-#endif
-
-/*
- * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
- * Assumes number of cores is a power of two.
- */
-static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
-	unsigned bits;
-#ifdef CONFIG_NUMA
-	int cpu = smp_processor_id();
-	int node = 0;
-	unsigned apicid = hard_smp_processor_id();
-#endif
-	bits = c->x86_coreid_bits;
-
-	/* Low order bits define the core id (index of core in socket) */
-	c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
-	/* Convert the initial APIC ID into the socket ID */
-	c->phys_proc_id = c->initial_apicid >> bits;
-
-#ifdef CONFIG_NUMA
-	node = c->phys_proc_id;
-	if (apicid_to_node[apicid] != NUMA_NO_NODE)
-		node = apicid_to_node[apicid];
-	if (!node_online(node)) {
-		/* Two possibilities here:
-		   - The CPU is missing memory and no node was created.
-		   In that case try picking one from a nearby CPU
-		   - The APIC IDs differ from the HyperTransport node IDs
-		   which the K8 northbridge parsing fills in.
-		   Assume they are all increased by a constant offset,
-		   but in the same order as the HT nodeids.
-		   If that doesn't result in a usable node fall back to the
-		   path for the previous case.  */
-
-		int ht_nodeid = c->initial_apicid;
-
-		if (ht_nodeid >= 0 &&
-		    apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
-			node = apicid_to_node[ht_nodeid];
-		/* Pick a nearby node */
-		if (!node_online(node))
-			node = nearby_node(apicid);
-	}
-	numa_set_node(cpu, node);
-
-	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-#endif
-}
-
-static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
-	unsigned bits, ecx;
-
-	/* Multi core CPU? */
-	if (c->extended_cpuid_level < 0x80000008)
-		return;
-
-	ecx = cpuid_ecx(0x80000008);
-
-	c->x86_max_cores = (ecx & 0xff) + 1;
-
-	/* CPU telling us the core id bits shift? */
-	bits = (ecx >> 12) & 0xF;
-
-	/* Otherwise recompute */
-	if (bits == 0) {
-		while ((1 << bits) < c->x86_max_cores)
-			bits++;
-	}
-
-	c->x86_coreid_bits = bits;
-
-#endif
-}
-
-static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
-{
-	early_init_amd_mc(c);
-
-	/* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
-	if (c->x86_power & (1<<8))
-		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-}
-
-static void __cpuinit init_amd(struct cpuinfo_x86 *c)
-{
-	unsigned level;
-
-#ifdef CONFIG_SMP
-	unsigned long value;
-
-	/*
-	 * Disable TLB flush filter by setting HWCR.FFDIS on K8
-	 * bit 6 of msr C001_0015
-	 *
-	 * Errata 63 for SH-B3 steppings
-	 * Errata 122 for all steppings (F+ have it disabled by default)
-	 */
-	if (c->x86 == 0xf) {
-		rdmsrl(MSR_K8_HWCR, value);
-		value |= 1 << 6;
-		wrmsrl(MSR_K8_HWCR, value);
-	}
-#endif
-
-	/* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
-	   3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
-	clear_cpu_cap(c, 0*32+31);
-
-	/* On C+ stepping K8 rep microcode works well for copy/memset */
-	if (c->x86 == 0xf) {
-		level = cpuid_eax(1);
-		if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
-			set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-	}
-	if (c->x86 == 0x10 || c->x86 == 0x11)
-		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-
-	/* Enable workaround for FXSAVE leak */
-	if (c->x86 >= 6)
-		set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
-
-	level = get_model_name(c);
-	if (!level) {
-		switch (c->x86) {
-		case 0xf:
-			/* Should distinguish Models here, but this is only
-			   a fallback anyways. */
-			strcpy(c->x86_model_id, "Hammer");
-			break;
-		}
-	}
-	display_cacheinfo(c);
-
-	/* Multi core CPU? */
-	if (c->extended_cpuid_level >= 0x80000008)
-		amd_detect_cmp(c);
-
-	if (c->extended_cpuid_level >= 0x80000006 &&
-		(cpuid_edx(0x80000006) & 0xf000))
-		num_cache_leaves = 4;
-	else
-		num_cache_leaves = 3;
-
-	if (c->x86 >= 0xf && c->x86 <= 0x11)
-		set_cpu_cap(c, X86_FEATURE_K8);
-
-	/* MFENCE stops RDTSC speculation */
-	set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
-
-	if (c->x86 == 0x10) {
-		/* do this for boot cpu */
-		if (c == &boot_cpu_data)
-			check_enable_amd_mmconf_dmi();
-
-		fam10h_check_enable_mmcfg();
-	}
-
-	if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
-		unsigned long long tseg;
-
-		/*
-		 * Split up direct mapping around the TSEG SMM area.
-		 * Don't do it for gbpages because there seems very little
-		 * benefit in doing so.
-		 */
-		if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
-		    printk(KERN_DEBUG "tseg: %010llx\n", tseg);
-		    if ((tseg>>PMD_SHIFT) <
-				(max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
-			((tseg>>PMD_SHIFT) <
-				(max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
-			 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
-			set_memory_4k((unsigned long)__va(tseg), 1);
-		}
-	}
-}
-
-static struct cpu_dev amd_cpu_dev __cpuinitdata = {
-	.c_vendor	= "AMD",
-	.c_ident	= { "AuthenticAMD" },
-	.c_early_init   = early_init_amd,
-	.c_init		= init_amd,
-};
-
-cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
-
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 1b1c56bb338f..c8e315f1aa83 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -50,6 +50,8 @@ static double __initdata y = 3145727.0;
  */
 static void __init check_fpu(void)
 {
+	s32 fdiv_bug;
+
 	if (!boot_cpu_data.hard_math) {
 #ifndef CONFIG_MATH_EMULATION
 		printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
@@ -74,8 +76,10 @@ static void __init check_fpu(void)
 		"fistpl %0\n\t"
 		"fwait\n\t"
 		"fninit"
-		: "=m" (*&boot_cpu_data.fdiv_bug)
+		: "=m" (*&fdiv_bug)
 		: "m" (*&x), "m" (*&y));
+
+	boot_cpu_data.fdiv_bug = fdiv_bug;
 	if (boot_cpu_data.fdiv_bug)
 		printk("Hmm, FPU with FDIV bug.\n");
 }
@@ -131,13 +135,7 @@ static void __init check_popad(void)
  *   (for due to lack of "invlpg" and working WP on a i386)
  * - In order to run on anything without a TSC, we need to be
  *   compiled for a i486.
- * - In order to support the local APIC on a buggy Pentium machine,
- *   we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
- *   which happens implicitly if compiled for a Pentium or lower
- *   (unless an advanced selection of CPU features is used) as an
- *   otherwise config implies a properly working local APIC without
- *   the need to do extra reads from the APIC.
-*/
+ */
 
 static void __init check_config(void)
 {
@@ -151,21 +149,6 @@ static void __init check_config(void)
 	if (boot_cpu_data.x86 == 3)
 		panic("Kernel requires i486+ for 'invlpg' and other features");
 #endif
-
-/*
- * If we were told we had a good local APIC, check for buggy Pentia,
- * i.e. all B steppings and the C2 stepping of P54C when using their
- * integrated APIC (see 11AP erratum in "Pentium Processor
- * Specification Update").
- */
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
-	    && cpu_has_apic
-	    && boot_cpu_data.x86 == 5
-	    && boot_cpu_data.x86_model == 2
-	    && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
-		panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
-#endif
 }
 
 
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e0f45edd6a55..89bfdd9cacc6 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -289,7 +289,6 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
 	if (c->x86_model >= 6 && c->x86_model < 9)
 		set_cpu_cap(c, X86_FEATURE_3DNOW);
 
-	get_model_name(c);
 	display_cacheinfo(c);
 }
 
@@ -314,6 +313,16 @@ enum {
 		EAMD3D		= 1<<20,
 };
 
+static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
+{
+	switch (c->x86) {
+	case 5:
+		/* Emulate MTRRs using Centaur's MCR. */
+		set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
+		break;
+	}
+}
+
 static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
 {
 
@@ -462,8 +471,10 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
 	.c_vendor	= "Centaur",
 	.c_ident	= { "CentaurHauls" },
+	.c_early_init	= early_init_centaur,
 	.c_init		= init_centaur,
 	.c_size_cache	= centaur_size_cache,
+	.c_x86_vendor	= X86_VENDOR_CENTAUR,
 };
 
-cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev);
+cpu_dev_register(centaur_cpu_dev);
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index 1d181c40e2e1..a1625f5a1e78 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -16,9 +16,10 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
 
 static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
 {
+	early_init_centaur(c);
+
 	if (c->x86 == 0x6 && c->x86_model >= 0xf) {
 		c->x86_cache_alignment = c->x86_clflush_size * 2;
-		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
 		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
 	}
 	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -29,7 +30,8 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
 	.c_ident	= { "CentaurHauls" },
 	.c_early_init	= early_init_centaur,
 	.c_init		= init_centaur,
+	.c_x86_vendor	= X86_VENDOR_CENTAUR,
 };
 
-cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev);
+cpu_dev_register(centaur_cpu_dev);
 
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
new file mode 100644
index 000000000000..2056ccf572cc
--- /dev/null
+++ b/arch/x86/kernel/cpu/cmpxchg.c
@@ -0,0 +1,72 @@
+/*
+ * cmpxchg*() fallbacks for CPU not supporting these instructions
+ */
+
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+
+#ifndef CONFIG_X86_CMPXCHG
+unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
+{
+	u8 prev;
+	unsigned long flags;
+
+	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
+	local_irq_save(flags);
+	prev = *(u8 *)ptr;
+	if (prev == old)
+		*(u8 *)ptr = new;
+	local_irq_restore(flags);
+	return prev;
+}
+EXPORT_SYMBOL(cmpxchg_386_u8);
+
+unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
+{
+	u16 prev;
+	unsigned long flags;
+
+	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
+	local_irq_save(flags);
+	prev = *(u16 *)ptr;
+	if (prev == old)
+		*(u16 *)ptr = new;
+	local_irq_restore(flags);
+	return prev;
+}
+EXPORT_SYMBOL(cmpxchg_386_u16);
+
+unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
+{
+	u32 prev;
+	unsigned long flags;
+
+	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
+	local_irq_save(flags);
+	prev = *(u32 *)ptr;
+	if (prev == old)
+		*(u32 *)ptr = new;
+	local_irq_restore(flags);
+	return prev;
+}
+EXPORT_SYMBOL(cmpxchg_386_u32);
+#endif
+
+#ifndef CONFIG_X86_CMPXCHG64
+unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
+{
+	u64 prev;
+	unsigned long flags;
+
+	/* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
+	local_irq_save(flags);
+	prev = *(u64 *)ptr;
+	if (prev == old)
+		*(u64 *)ptr = new;
+	local_irq_restore(flags);
+	return prev;
+}
+EXPORT_SYMBOL(cmpxchg_486_u64);
+#endif
+
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 80ab20d4fa39..25581dcb280e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,27 +1,62 @@
 #include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/string.h>
+#include <linux/bootmem.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/kgdb.h>
+#include <linux/topology.h>
 #include <linux/delay.h>
 #include <linux/smp.h>
-#include <linux/module.h>
 #include <linux/percpu.h>
-#include <linux/bootmem.h>
-#include <asm/processor.h>
 #include <asm/i387.h>
 #include <asm/msr.h>
 #include <asm/io.h>
+#include <asm/linkage.h>
 #include <asm/mmu_context.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
 #include <asm/pat.h>
+#include <asm/asm.h>
+#include <asm/numa.h>
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/mpspec.h>
 #include <asm/apic.h>
 #include <mach_apic.h>
+#include <asm/genapic.h>
 #endif
 
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+#include <asm/desc.h>
+#include <asm/atomic.h>
+#include <asm/proto.h>
+#include <asm/sections.h>
+#include <asm/setup.h>
+
 #include "cpu.h"
 
+static struct cpu_dev *this_cpu __cpuinitdata;
+
+#ifdef CONFIG_X86_64
+/* We need valid kernel segments for data and code in long mode too
+ * IRET will check the segment types  kkeil 2000/10/28
+ * Also sysret mandates a special GDT layout
+ */
+/* The TLS descriptors are currently at a different place compared to i386.
+   Hopefully nobody expects them at a fixed place (Wine?) */
 DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
+	[GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
+	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
+	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
+	[GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
+	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
+	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
+} };
+#else
+DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
 	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
 	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
 	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -55,17 +90,157 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
 	[GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
 	[GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
 } };
+#endif
 EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
 
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
-
+#ifdef CONFIG_X86_32
 static int cachesize_override __cpuinitdata = -1;
 static int disable_x86_serial_nr __cpuinitdata = 1;
 
-struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
+static int __init cachesize_setup(char *str)
+{
+	get_option(&str, &cachesize_override);
+	return 1;
+}
+__setup("cachesize=", cachesize_setup);
+
+static int __init x86_fxsr_setup(char *s)
+{
+	setup_clear_cpu_cap(X86_FEATURE_FXSR);
+	setup_clear_cpu_cap(X86_FEATURE_XMM);
+	return 1;
+}
+__setup("nofxsr", x86_fxsr_setup);
+
+static int __init x86_sep_setup(char *s)
+{
+	setup_clear_cpu_cap(X86_FEATURE_SEP);
+	return 1;
+}
+__setup("nosep", x86_sep_setup);
+
+/* Standard macro to see if a specific flag is changeable */
+static inline int flag_is_changeable_p(u32 flag)
+{
+	u32 f1, f2;
+
+	/*
+	 * Cyrix and IDT cpus allow disabling of CPUID
+	 * so the code below may return different results
+	 * when it is executed before and after enabling
+	 * the CPUID. Add "volatile" to not allow gcc to
+	 * optimize the subsequent calls to this function.
+	 */
+	asm volatile ("pushfl\n\t"
+		      "pushfl\n\t"
+		      "popl %0\n\t"
+		      "movl %0,%1\n\t"
+		      "xorl %2,%0\n\t"
+		      "pushl %0\n\t"
+		      "popfl\n\t"
+		      "pushfl\n\t"
+		      "popl %0\n\t"
+		      "popfl\n\t"
+		      : "=&r" (f1), "=&r" (f2)
+		      : "ir" (flag));
+
+	return ((f1^f2) & flag) != 0;
+}
+
+/* Probe for the CPUID instruction */
+static int __cpuinit have_cpuid_p(void)
+{
+	return flag_is_changeable_p(X86_EFLAGS_ID);
+}
+
+static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+	if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
+		/* Disable processor serial number */
+		unsigned long lo, hi;
+		rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+		lo |= 0x200000;
+		wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
+		printk(KERN_NOTICE "CPU serial number disabled.\n");
+		clear_cpu_cap(c, X86_FEATURE_PN);
+
+		/* Disabling the serial number may affect the cpuid level */
+		c->cpuid_level = cpuid_eax(0);
+	}
+}
+
+static int __init x86_serial_nr_setup(char *s)
+{
+	disable_x86_serial_nr = 0;
+	return 1;
+}
+__setup("serialnumber", x86_serial_nr_setup);
+#else
+static inline int flag_is_changeable_p(u32 flag)
+{
+	return 1;
+}
+/* Probe for the CPUID instruction */
+static inline int have_cpuid_p(void)
+{
+	return 1;
+}
+static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+{
+}
+#endif
+
+/*
+ * Naming convention should be: <Name> [(<Codename>)]
+ * This table only is used unless init_<vendor>() below doesn't set it;
+ * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
+ *
+ */
+
+/* Look up CPU names by table lookup. */
+static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
+{
+	struct cpu_model_info *info;
+
+	if (c->x86_model >= 16)
+		return NULL;	/* Range check */
+
+	if (!this_cpu)
+		return NULL;
+
+	info = this_cpu->c_models;
+
+	while (info && info->family) {
+		if (info->family == c->x86)
+			return info->model_names[c->x86_model];
+		info++;
+	}
+	return NULL;		/* Not found */
+}
+
+__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+
+/* Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one. */
+void switch_to_new_gdt(void)
+{
+	struct desc_ptr gdt_descr;
+
+	gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+	gdt_descr.size = GDT_SIZE - 1;
+	load_gdt(&gdt_descr);
+#ifdef CONFIG_X86_32
+	asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
+#endif
+}
+
+static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
 
 static void __cpuinit default_init(struct cpuinfo_x86 *c)
 {
+#ifdef CONFIG_X86_64
+	display_cacheinfo(c);
+#else
 	/* Not much we can do here... */
 	/* Check if at least it has cpuid */
 	if (c->cpuid_level == -1) {
@@ -75,28 +250,22 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
 		else if (c->x86 == 3)
 			strcpy(c->x86_model_id, "386");
 	}
+#endif
 }
 
 static struct cpu_dev __cpuinitdata default_cpu = {
 	.c_init	= default_init,
 	.c_vendor = "Unknown",
+	.c_x86_vendor = X86_VENDOR_UNKNOWN,
 };
-static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
-
-static int __init cachesize_setup(char *str)
-{
-	get_option(&str, &cachesize_override);
-	return 1;
-}
-__setup("cachesize=", cachesize_setup);
 
-int __cpuinit get_model_name(struct cpuinfo_x86 *c)
+static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
 {
 	unsigned int *v;
 	char *p, *q;
 
-	if (cpuid_eax(0x80000000) < 0x80000004)
-		return 0;
+	if (c->extended_cpuid_level < 0x80000004)
+		return;
 
 	v = (unsigned int *) c->x86_model_id;
 	cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
@@ -115,30 +284,34 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c)
 	     while (q <= &c->x86_model_id[48])
 		  *q++ = '\0';	/* Zero-pad the rest */
 	}
-
-	return 1;
 }
 
-
 void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
 {
-	unsigned int n, dummy, ecx, edx, l2size;
+	unsigned int n, dummy, ebx, ecx, edx, l2size;
 
-	n = cpuid_eax(0x80000000);
+	n = c->extended_cpuid_level;
 
 	if (n >= 0x80000005) {
-		cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
+		cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
 		printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
-			edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
-		c->x86_cache_size = (ecx>>24)+(edx>>24);
+				edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
+		c->x86_cache_size = (ecx>>24) + (edx>>24);
+#ifdef CONFIG_X86_64
+		/* On K8 L1 TLB is inclusive, so don't count it */
+		c->x86_tlbsize = 0;
+#endif
 	}
 
 	if (n < 0x80000006)	/* Some chips just has a large L1. */
 		return;
 
-	ecx = cpuid_ecx(0x80000006);
+	cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
 	l2size = ecx >> 16;
 
+#ifdef CONFIG_X86_64
+	c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
+#else
 	/* do processor-specific cache resizing */
 	if (this_cpu->c_size_cache)
 		l2size = this_cpu->c_size_cache(c, l2size);
@@ -149,116 +322,106 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
 
 	if (l2size == 0)
 		return;		/* Again, no L2 cache is possible */
+#endif
 
 	c->x86_cache_size = l2size;
 
 	printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
-	       l2size, ecx & 0xFF);
+			l2size, ecx & 0xFF);
 }
 
-/*
- * Naming convention should be: <Name> [(<Codename>)]
- * This table only is used unless init_<vendor>() below doesn't set it;
- * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
- *
- */
-
-/* Look up CPU names by table lookup. */
-static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
+void __cpuinit detect_ht(struct cpuinfo_x86 *c)
 {
-	struct cpu_model_info *info;
+#ifdef CONFIG_X86_HT
+	u32 eax, ebx, ecx, edx;
+	int index_msb, core_bits;
 
-	if (c->x86_model >= 16)
-		return NULL;	/* Range check */
+	if (!cpu_has(c, X86_FEATURE_HT))
+		return;
 
-	if (!this_cpu)
-		return NULL;
+	if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
+		goto out;
 
-	info = this_cpu->c_models;
+	if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
+		return;
 
-	while (info && info->family) {
-		if (info->family == c->x86)
-			return info->model_names[c->x86_model];
-		info++;
+	cpuid(1, &eax, &ebx, &ecx, &edx);
+
+	smp_num_siblings = (ebx & 0xff0000) >> 16;
+
+	if (smp_num_siblings == 1) {
+		printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
+	} else if (smp_num_siblings > 1) {
+
+		if (smp_num_siblings > NR_CPUS) {
+			printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
+					smp_num_siblings);
+			smp_num_siblings = 1;
+			return;
+		}
+
+		index_msb = get_count_order(smp_num_siblings);
+#ifdef CONFIG_X86_64
+		c->phys_proc_id = phys_pkg_id(index_msb);
+#else
+		c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
+#endif
+
+		smp_num_siblings = smp_num_siblings / c->x86_max_cores;
+
+		index_msb = get_count_order(smp_num_siblings);
+
+		core_bits = get_count_order(c->x86_max_cores);
+
+#ifdef CONFIG_X86_64
+		c->cpu_core_id = phys_pkg_id(index_msb) &
+					       ((1 << core_bits) - 1);
+#else
+		c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
+					       ((1 << core_bits) - 1);
+#endif
 	}
-	return NULL;		/* Not found */
-}
 
+out:
+	if ((c->x86_max_cores * smp_num_siblings) > 1) {
+		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
+		       c->phys_proc_id);
+		printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
+		       c->cpu_core_id);
+	}
+#endif
+}
 
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
+static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
 {
 	char *v = c->x86_vendor_id;
 	int i;
 	static int printed;
 
 	for (i = 0; i < X86_VENDOR_NUM; i++) {
-		if (cpu_devs[i]) {
-			if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
-			    (cpu_devs[i]->c_ident[1] &&
-			     !strcmp(v, cpu_devs[i]->c_ident[1]))) {
-				c->x86_vendor = i;
-				if (!early)
-					this_cpu = cpu_devs[i];
-				return;
-			}
+		if (!cpu_devs[i])
+			break;
+
+		if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+		    (cpu_devs[i]->c_ident[1] &&
+		     !strcmp(v, cpu_devs[i]->c_ident[1]))) {
+			this_cpu = cpu_devs[i];
+			c->x86_vendor = this_cpu->c_x86_vendor;
+			return;
 		}
 	}
+
 	if (!printed) {
 		printed++;
-		printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
+		printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
 		printk(KERN_ERR "CPU: Your system may be unstable.\n");
 	}
+
 	c->x86_vendor = X86_VENDOR_UNKNOWN;
 	this_cpu = &default_cpu;
 }
 
-
-static int __init x86_fxsr_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_FXSR);
-	setup_clear_cpu_cap(X86_FEATURE_XMM);
-	return 1;
-}
-__setup("nofxsr", x86_fxsr_setup);
-
-
-static int __init x86_sep_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_SEP);
-	return 1;
-}
-__setup("nosep", x86_sep_setup);
-
-
-/* Standard macro to see if a specific flag is changeable */
-static inline int flag_is_changeable_p(u32 flag)
-{
-	u32 f1, f2;
-
-	asm("pushfl\n\t"
-	    "pushfl\n\t"
-	    "popl %0\n\t"
-	    "movl %0,%1\n\t"
-	    "xorl %2,%0\n\t"
-	    "pushl %0\n\t"
-	    "popfl\n\t"
-	    "pushfl\n\t"
-	    "popl %0\n\t"
-	    "popfl\n\t"
-	    : "=&r" (f1), "=&r" (f2)
-	    : "ir" (flag));
-
-	return ((f1^f2) & flag) != 0;
-}
-
-
-/* Probe for the CPUID instruction */
-static int __cpuinit have_cpuid_p(void)
-{
-	return flag_is_changeable_p(X86_EFLAGS_ID);
-}
-
-void __init cpu_detect(struct cpuinfo_x86 *c)
+void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
 {
 	/* Get vendor name */
 	cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
@@ -267,50 +430,87 @@ void __init cpu_detect(struct cpuinfo_x86 *c)
 	      (unsigned int *)&c->x86_vendor_id[4]);
 
 	c->x86 = 4;
+	/* Intel-defined flags: level 0x00000001 */
 	if (c->cpuid_level >= 0x00000001) {
 		u32 junk, tfms, cap0, misc;
 		cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
-		c->x86 = (tfms >> 8) & 15;
-		c->x86_model = (tfms >> 4) & 15;
+		c->x86 = (tfms >> 8) & 0xf;
+		c->x86_model = (tfms >> 4) & 0xf;
+		c->x86_mask = tfms & 0xf;
 		if (c->x86 == 0xf)
 			c->x86 += (tfms >> 20) & 0xff;
 		if (c->x86 >= 0x6)
-			c->x86_model += ((tfms >> 16) & 0xF) << 4;
-		c->x86_mask = tfms & 15;
+			c->x86_model += ((tfms >> 16) & 0xf) << 4;
 		if (cap0 & (1<<19)) {
-			c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
 			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
+			c->x86_cache_alignment = c->x86_clflush_size;
 		}
 	}
 }
-static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
+
+static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
 {
 	u32 tfms, xlvl;
-	unsigned int ebx;
+	u32 ebx;
 
-	memset(&c->x86_capability, 0, sizeof c->x86_capability);
-	if (have_cpuid_p()) {
-		/* Intel-defined flags: level 0x00000001 */
-		if (c->cpuid_level >= 0x00000001) {
-			u32 capability, excap;
-			cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
-			c->x86_capability[0] = capability;
-			c->x86_capability[4] = excap;
-		}
+	/* Intel-defined flags: level 0x00000001 */
+	if (c->cpuid_level >= 0x00000001) {
+		u32 capability, excap;
+		cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
+		c->x86_capability[0] = capability;
+		c->x86_capability[4] = excap;
+	}
 
-		/* AMD-defined flags: level 0x80000001 */
-		xlvl = cpuid_eax(0x80000000);
-		if ((xlvl & 0xffff0000) == 0x80000000) {
-			if (xlvl >= 0x80000001) {
-				c->x86_capability[1] = cpuid_edx(0x80000001);
-				c->x86_capability[6] = cpuid_ecx(0x80000001);
-			}
+	/* AMD-defined flags: level 0x80000001 */
+	xlvl = cpuid_eax(0x80000000);
+	c->extended_cpuid_level = xlvl;
+	if ((xlvl & 0xffff0000) == 0x80000000) {
+		if (xlvl >= 0x80000001) {
+			c->x86_capability[1] = cpuid_edx(0x80000001);
+			c->x86_capability[6] = cpuid_ecx(0x80000001);
 		}
+	}
+
+#ifdef CONFIG_X86_64
+	if (c->extended_cpuid_level >= 0x80000008) {
+		u32 eax = cpuid_eax(0x80000008);
 
+		c->x86_virt_bits = (eax >> 8) & 0xff;
+		c->x86_phys_bits = eax & 0xff;
 	}
+#endif
+
+	if (c->extended_cpuid_level >= 0x80000007)
+		c->x86_power = cpuid_edx(0x80000007);
 
 }
 
+static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+	int i;
+
+	/*
+	 * First of all, decide if this is a 486 or higher
+	 * It's a 486 if we can modify the AC flag
+	 */
+	if (flag_is_changeable_p(X86_EFLAGS_AC))
+		c->x86 = 4;
+	else
+		c->x86 = 3;
+
+	for (i = 0; i < X86_VENDOR_NUM; i++)
+		if (cpu_devs[i] && cpu_devs[i]->c_identify) {
+			c->x86_vendor_id[0] = 0;
+			cpu_devs[i]->c_identify(c);
+			if (c->x86_vendor_id[0]) {
+				get_cpu_vendor(c);
+				break;
+			}
+		}
+#endif
+}
+
 /*
  * Do minimum CPU detection early.
  * Fields really needed: vendor, cpuid_level, family, model, mask,
@@ -320,109 +520,113 @@ static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
  * WARNING: this function is only called on the BP.  Don't add code here
  * that is supposed to run on all CPUs.
  */
-static void __init early_cpu_detect(void)
+static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 {
-	struct cpuinfo_x86 *c = &boot_cpu_data;
-
-	c->x86_cache_alignment = 32;
+#ifdef CONFIG_X86_64
+	c->x86_clflush_size = 64;
+#else
 	c->x86_clflush_size = 32;
+#endif
+	c->x86_cache_alignment = c->x86_clflush_size;
+
+	memset(&c->x86_capability, 0, sizeof c->x86_capability);
+	c->extended_cpuid_level = 0;
+
+	if (!have_cpuid_p())
+		identify_cpu_without_cpuid(c);
 
+	/* cyrix could have cpuid enabled via c_identify()*/
 	if (!have_cpuid_p())
 		return;
 
 	cpu_detect(c);
 
-	get_cpu_vendor(c, 1);
+	get_cpu_vendor(c);
+
+	get_cpu_cap(c);
 
-	if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
-	    cpu_devs[c->x86_vendor]->c_early_init)
-		cpu_devs[c->x86_vendor]->c_early_init(c);
+	if (this_cpu->c_early_init)
+		this_cpu->c_early_init(c);
 
-	early_get_cap(c);
+	validate_pat_support(c);
 }
 
-static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
+void __init early_cpu_init(void)
 {
-	u32 tfms, xlvl;
-	unsigned int ebx;
-
-	if (have_cpuid_p()) {
-		/* Get vendor name */
-		cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
-		      (unsigned int *)&c->x86_vendor_id[0],
-		      (unsigned int *)&c->x86_vendor_id[8],
-		      (unsigned int *)&c->x86_vendor_id[4]);
-
-		get_cpu_vendor(c, 0);
-		/* Initialize the standard set of capabilities */
-		/* Note that the vendor-specific code below might override */
-		/* Intel-defined flags: level 0x00000001 */
-		if (c->cpuid_level >= 0x00000001) {
-			u32 capability, excap;
-			cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
-			c->x86_capability[0] = capability;
-			c->x86_capability[4] = excap;
-			c->x86 = (tfms >> 8) & 15;
-			c->x86_model = (tfms >> 4) & 15;
-			if (c->x86 == 0xf)
-				c->x86 += (tfms >> 20) & 0xff;
-			if (c->x86 >= 0x6)
-				c->x86_model += ((tfms >> 16) & 0xF) << 4;
-			c->x86_mask = tfms & 15;
-			c->initial_apicid = (ebx >> 24) & 0xFF;
-#ifdef CONFIG_X86_HT
-			c->apicid = phys_pkg_id(c->initial_apicid, 0);
-			c->phys_proc_id = c->initial_apicid;
-#else
-			c->apicid = c->initial_apicid;
-#endif
-			if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
-				c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
-		} else {
-			/* Have CPUID level 0 only - unheard of */
-			c->x86 = 4;
-		}
-
-		/* AMD-defined flags: level 0x80000001 */
-		xlvl = cpuid_eax(0x80000000);
-		if ((xlvl & 0xffff0000) == 0x80000000) {
-			if (xlvl >= 0x80000001) {
-				c->x86_capability[1] = cpuid_edx(0x80000001);
-				c->x86_capability[6] = cpuid_ecx(0x80000001);
-			}
-			if (xlvl >= 0x80000004)
-				get_model_name(c); /* Default name */
+	struct cpu_dev **cdev;
+	int count = 0;
+
+	printk("KERNEL supported cpus:\n");
+	for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
+		struct cpu_dev *cpudev = *cdev;
+		unsigned int j;
+
+		if (count >= X86_VENDOR_NUM)
+			break;
+		cpu_devs[count] = cpudev;
+		count++;
+
+		for (j = 0; j < 2; j++) {
+			if (!cpudev->c_ident[j])
+				continue;
+			printk("  %s %s\n", cpudev->c_vendor,
+				cpudev->c_ident[j]);
 		}
-
-		init_scattered_cpuid_features(c);
 	}
 
+	early_identify_cpu(&boot_cpu_data);
 }
 
-static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
+/*
+ * The NOPL instruction is supposed to exist on all CPUs with
+ * family >= 6; unfortunately, that's not true in practice because
+ * of early VIA chips and (more importantly) broken virtualizers that
+ * are not easy to detect.  In the latter case it doesn't even *fail*
+ * reliably, so probing for it doesn't even work.  Disable it completely
+ * unless we can find a reliable way to detect all the broken cases.
+ */
+static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
 {
-	if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
-		/* Disable processor serial number */
-		unsigned long lo, hi;
-		rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
-		lo |= 0x200000;
-		wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
-		printk(KERN_NOTICE "CPU serial number disabled.\n");
-		clear_cpu_cap(c, X86_FEATURE_PN);
-
-		/* Disabling the serial number may affect the cpuid level */
-		c->cpuid_level = cpuid_eax(0);
-	}
+	clear_cpu_cap(c, X86_FEATURE_NOPL);
 }
 
-static int __init x86_serial_nr_setup(char *s)
+static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
 {
-	disable_x86_serial_nr = 0;
-	return 1;
-}
-__setup("serialnumber", x86_serial_nr_setup);
+	c->extended_cpuid_level = 0;
+
+	if (!have_cpuid_p())
+		identify_cpu_without_cpuid(c);
+
+	/* cyrix could have cpuid enabled via c_identify()*/
+	if (!have_cpuid_p())
+		return;
+
+	cpu_detect(c);
+
+	get_cpu_vendor(c);
 
+	get_cpu_cap(c);
 
+	if (c->cpuid_level >= 0x00000001) {
+		c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
+#ifdef CONFIG_X86_32
+# ifdef CONFIG_X86_HT
+		c->apicid = phys_pkg_id(c->initial_apicid, 0);
+# else
+		c->apicid = c->initial_apicid;
+# endif
+#endif
+
+#ifdef CONFIG_X86_HT
+		c->phys_proc_id = c->initial_apicid;
+#endif
+	}
+
+	get_model_name(c); /* Default name */
+
+	init_scattered_cpuid_features(c);
+	detect_nopl(c);
+}
 
 /*
  * This does the hard work of actually picking apart the CPU stuff...
@@ -434,30 +638,29 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	c->loops_per_jiffy = loops_per_jiffy;
 	c->x86_cache_size = -1;
 	c->x86_vendor = X86_VENDOR_UNKNOWN;
-	c->cpuid_level = -1;	/* CPUID not detected */
 	c->x86_model = c->x86_mask = 0;	/* So far unknown... */
 	c->x86_vendor_id[0] = '\0'; /* Unset */
 	c->x86_model_id[0] = '\0';  /* Unset */
 	c->x86_max_cores = 1;
+	c->x86_coreid_bits = 0;
+#ifdef CONFIG_X86_64
+	c->x86_clflush_size = 64;
+#else
+	c->cpuid_level = -1;	/* CPUID not detected */
 	c->x86_clflush_size = 32;
+#endif
+	c->x86_cache_alignment = c->x86_clflush_size;
 	memset(&c->x86_capability, 0, sizeof c->x86_capability);
 
-	if (!have_cpuid_p()) {
-		/*
-		 * First of all, decide if this is a 486 or higher
-		 * It's a 486 if we can modify the AC flag
-		 */
-		if (flag_is_changeable_p(X86_EFLAGS_AC))
-			c->x86 = 4;
-		else
-			c->x86 = 3;
-	}
-
 	generic_identify(c);
 
 	if (this_cpu->c_identify)
 		this_cpu->c_identify(c);
 
+#ifdef CONFIG_X86_64
+	c->apicid = phys_pkg_id(0);
+#endif
+
 	/*
 	 * Vendor-specific initialization.  In this section we
 	 * canonicalize the feature flags, meaning if there are
@@ -491,6 +694,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 				c->x86, c->x86_model);
 	}
 
+#ifdef CONFIG_X86_64
+	detect_ht(c);
+#endif
+
 	/*
 	 * On SMP, boot_cpu_data holds the common feature set between
 	 * all CPUs; so make sure that we indicate which features are
@@ -499,7 +706,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	 */
 	if (c != &boot_cpu_data) {
 		/* AND the already accumulated flags with these */
-		for (i = 0 ; i < NCAPINTS ; i++)
+		for (i = 0; i < NCAPINTS; i++)
 			boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
 	}
 
@@ -507,72 +714,91 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	for (i = 0; i < NCAPINTS; i++)
 		c->x86_capability[i] &= ~cleared_cpu_caps[i];
 
+#ifdef CONFIG_X86_MCE
 	/* Init Machine Check Exception if available. */
 	mcheck_init(c);
+#endif
 
 	select_idle_routine(c);
+
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+	numa_add_cpu(smp_processor_id());
+#endif
 }
 
+#ifdef CONFIG_X86_64
+static void vgetcpu_set_mode(void)
+{
+	if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
+		vgetcpu_mode = VGETCPU_RDTSCP;
+	else
+		vgetcpu_mode = VGETCPU_LSL;
+}
+#endif
+
 void __init identify_boot_cpu(void)
 {
 	identify_cpu(&boot_cpu_data);
+#ifdef CONFIG_X86_32
 	sysenter_setup();
 	enable_sep_cpu();
+#else
+	vgetcpu_set_mode();
+#endif
 }
 
 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
 {
 	BUG_ON(c == &boot_cpu_data);
 	identify_cpu(c);
+#ifdef CONFIG_X86_32
 	enable_sep_cpu();
+#endif
 	mtrr_ap_init();
 }
 
-#ifdef CONFIG_X86_HT
-void __cpuinit detect_ht(struct cpuinfo_x86 *c)
-{
-	u32 	eax, ebx, ecx, edx;
-	int 	index_msb, core_bits;
-
-	cpuid(1, &eax, &ebx, &ecx, &edx);
-
-	if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
-		return;
-
-	smp_num_siblings = (ebx & 0xff0000) >> 16;
+struct msr_range {
+	unsigned min;
+	unsigned max;
+};
 
-	if (smp_num_siblings == 1) {
-		printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
-	} else if (smp_num_siblings > 1) {
+static struct msr_range msr_range_array[] __cpuinitdata = {
+	{ 0x00000000, 0x00000418},
+	{ 0xc0000000, 0xc000040b},
+	{ 0xc0010000, 0xc0010142},
+	{ 0xc0011000, 0xc001103b},
+};
 
-		if (smp_num_siblings > NR_CPUS) {
-			printk(KERN_WARNING "CPU: Unsupported number of the "
-					"siblings %d", smp_num_siblings);
-			smp_num_siblings = 1;
-			return;
+static void __cpuinit print_cpu_msr(void)
+{
+	unsigned index;
+	u64 val;
+	int i;
+	unsigned index_min, index_max;
+
+	for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
+		index_min = msr_range_array[i].min;
+		index_max = msr_range_array[i].max;
+		for (index = index_min; index < index_max; index++) {
+			if (rdmsrl_amd_safe(index, &val))
+				continue;
+			printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
 		}
+	}
+}
 
-		index_msb = get_count_order(smp_num_siblings);
-		c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
-
-		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
-		       c->phys_proc_id);
-
-		smp_num_siblings = smp_num_siblings / c->x86_max_cores;
-
-		index_msb = get_count_order(smp_num_siblings) ;
-
-		core_bits = get_count_order(c->x86_max_cores);
+static int show_msr __cpuinitdata;
+static __init int setup_show_msr(char *arg)
+{
+	int num;
 
-		c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
-					       ((1 << core_bits) - 1);
+	get_option(&arg, &num);
 
-		if (c->x86_max_cores > 1)
-			printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
-			       c->cpu_core_id);
-	}
+	if (num > 0)
+		show_msr = num;
+	return 1;
 }
-#endif
+__setup("show_msr=", setup_show_msr);
 
 static __init int setup_noclflush(char *arg)
 {
@@ -590,18 +816,26 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
 	else if (c->cpuid_level >= 0)
 		vendor = c->x86_vendor_id;
 
-	if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor)))
-		printk("%s ", vendor);
+	if (vendor && !strstr(c->x86_model_id, vendor))
+		printk(KERN_CONT "%s ", vendor);
 
-	if (!c->x86_model_id[0])
-		printk("%d86", c->x86);
+	if (c->x86_model_id[0])
+		printk(KERN_CONT "%s", c->x86_model_id);
 	else
-		printk("%s", c->x86_model_id);
+		printk(KERN_CONT "%d86", c->x86);
 
 	if (c->x86_mask || c->cpuid_level >= 0)
-		printk(" stepping %02x\n", c->x86_mask);
+		printk(KERN_CONT " stepping %02x\n", c->x86_mask);
 	else
-		printk("\n");
+		printk(KERN_CONT "\n");
+
+#ifdef CONFIG_SMP
+	if (c->cpu_index < show_msr)
+		print_cpu_msr();
+#else
+	if (show_msr)
+		print_cpu_msr();
+#endif
 }
 
 static __init int setup_disablecpuid(char *arg)
@@ -617,19 +851,89 @@ __setup("clearcpuid=", setup_disablecpuid);
 
 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
 
-void __init early_cpu_init(void)
+#ifdef CONFIG_X86_64
+struct x8664_pda **_cpu_pda __read_mostly;
+EXPORT_SYMBOL(_cpu_pda);
+
+struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
+
+char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+
+void __cpuinit pda_init(int cpu)
 {
-	struct cpu_vendor_dev *cvdev;
+	struct x8664_pda *pda = cpu_pda(cpu);
+
+	/* Setup up data that may be needed in __get_free_pages early */
+	loadsegment(fs, 0);
+	loadsegment(gs, 0);
+	/* Memory clobbers used to order PDA accessed */
+	mb();
+	wrmsrl(MSR_GS_BASE, pda);
+	mb();
+
+	pda->cpunumber = cpu;
+	pda->irqcount = -1;
+	pda->kernelstack = (unsigned long)stack_thread_info() -
+				 PDA_STACKOFFSET + THREAD_SIZE;
+	pda->active_mm = &init_mm;
+	pda->mmu_state = 0;
+
+	if (cpu == 0) {
+		/* others are initialized in smpboot.c */
+		pda->pcurrent = &init_task;
+		pda->irqstackptr = boot_cpu_stack;
+		pda->irqstackptr += IRQSTACKSIZE - 64;
+	} else {
+		if (!pda->irqstackptr) {
+			pda->irqstackptr = (char *)
+				__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
+			if (!pda->irqstackptr)
+				panic("cannot allocate irqstack for cpu %d",
+				      cpu);
+			pda->irqstackptr += IRQSTACKSIZE - 64;
+		}
+
+		if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
+			pda->nodenumber = cpu_to_node(cpu);
+	}
+}
+
+char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
+			   DEBUG_STKSZ] __page_aligned_bss;
 
-	for (cvdev = __x86cpuvendor_start ;
-	     cvdev < __x86cpuvendor_end   ;
-	     cvdev++)
-		cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
+extern asmlinkage void ignore_sysret(void);
 
-	early_cpu_detect();
-	validate_pat_support(&boot_cpu_data);
+/* May not be marked __init: used by software suspend */
+void syscall_init(void)
+{
+	/*
+	 * LSTAR and STAR live in a bit strange symbiosis.
+	 * They both write to the same internal register. STAR allows to
+	 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
+	 */
+	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
+	wrmsrl(MSR_LSTAR, system_call);
+	wrmsrl(MSR_CSTAR, ignore_sysret);
+
+#ifdef CONFIG_IA32_EMULATION
+	syscall32_cpu_init();
+#endif
+
+	/* Flags to clear on syscall */
+	wrmsrl(MSR_SYSCALL_MASK,
+	       X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
 }
 
+unsigned long kernel_eflags;
+
+/*
+ * Copies of the original ist values from the tss are only accessed during
+ * debugging, no special alignment required.
+ */
+DEFINE_PER_CPU(struct orig_ist, orig_ist);
+
+#else
+
 /* Make sure %fs is initialized properly in idle threads */
 struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
 {
@@ -637,25 +941,136 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
 	regs->fs = __KERNEL_PERCPU;
 	return regs;
 }
-
-/* Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one. */
-void switch_to_new_gdt(void)
-{
-	struct desc_ptr gdt_descr;
-
-	gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
-	gdt_descr.size = GDT_SIZE - 1;
-	load_gdt(&gdt_descr);
-	asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
-}
+#endif
 
 /*
  * cpu_init() initializes state that is per-CPU. Some data is already
  * initialized (naturally) in the bootstrap process, such as the GDT
  * and IDT. We reload them nevertheless, this function acts as a
  * 'CPU state barrier', nothing should get across.
+ * A lot of state is already set up in PDA init for 64 bit
  */
+#ifdef CONFIG_X86_64
+void __cpuinit cpu_init(void)
+{
+	int cpu = stack_smp_processor_id();
+	struct tss_struct *t = &per_cpu(init_tss, cpu);
+	struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
+	unsigned long v;
+	char *estacks = NULL;
+	struct task_struct *me;
+	int i;
+
+	/* CPU 0 is initialised in head64.c */
+	if (cpu != 0)
+		pda_init(cpu);
+	else
+		estacks = boot_exception_stacks;
+
+	me = current;
+
+	if (cpu_test_and_set(cpu, cpu_initialized))
+		panic("CPU#%d already initialized!\n", cpu);
+
+	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
+
+	clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
+
+	/*
+	 * Initialize the per-CPU GDT with the boot GDT,
+	 * and set up the GDT descriptor:
+	 */
+
+	switch_to_new_gdt();
+	load_idt((const struct desc_ptr *)&idt_descr);
+
+	memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
+	syscall_init();
+
+	wrmsrl(MSR_FS_BASE, 0);
+	wrmsrl(MSR_KERNEL_GS_BASE, 0);
+	barrier();
+
+	check_efer();
+	if (cpu != 0 && x2apic)
+		enable_x2apic();
+
+	/*
+	 * set up and load the per-CPU TSS
+	 */
+	if (!orig_ist->ist[0]) {
+		static const unsigned int order[N_EXCEPTION_STACKS] = {
+		  [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
+		  [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+		};
+		for (v = 0; v < N_EXCEPTION_STACKS; v++) {
+			if (cpu) {
+				estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
+				if (!estacks)
+					panic("Cannot allocate exception "
+					      "stack %ld %d\n", v, cpu);
+			}
+			estacks += PAGE_SIZE << order[v];
+			orig_ist->ist[v] = t->x86_tss.ist[v] =
+					(unsigned long)estacks;
+		}
+	}
+
+	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
+	/*
+	 * <= is required because the CPU will access up to
+	 * 8 bits beyond the end of the IO permission bitmap.
+	 */
+	for (i = 0; i <= IO_BITMAP_LONGS; i++)
+		t->io_bitmap[i] = ~0UL;
+
+	atomic_inc(&init_mm.mm_count);
+	me->active_mm = &init_mm;
+	if (me->mm)
+		BUG();
+	enter_lazy_tlb(&init_mm, me);
+
+	load_sp0(t, &current->thread);
+	set_tss_desc(cpu, t);
+	load_TR_desc();
+	load_LDT(&init_mm.context);
+
+#ifdef CONFIG_KGDB
+	/*
+	 * If the kgdb is connected no debug regs should be altered.  This
+	 * is only applicable when KGDB and a KGDB I/O module are built
+	 * into the kernel and you are using early debugging with
+	 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
+	 */
+	if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
+		arch_kgdb_ops.correct_hw_break();
+	else {
+#endif
+	/*
+	 * Clear all 6 debug registers:
+	 */
+
+	set_debugreg(0UL, 0);
+	set_debugreg(0UL, 1);
+	set_debugreg(0UL, 2);
+	set_debugreg(0UL, 3);
+	set_debugreg(0UL, 6);
+	set_debugreg(0UL, 7);
+#ifdef CONFIG_KGDB
+	/* If the kgdb is connected no debug regs should be altered. */
+	}
+#endif
+
+	fpu_init();
+
+	raw_local_save_flags(kernel_eflags);
+
+	if (is_uv_system())
+		uv_cpu_init();
+}
+
+#else
+
 void __cpuinit cpu_init(void)
 {
 	int cpu = smp_processor_id();
@@ -709,19 +1124,21 @@ void __cpuinit cpu_init(void)
 	/*
 	 * Force FPU initialization:
 	 */
-	current_thread_info()->status = 0;
+	if (cpu_has_xsave)
+		current_thread_info()->status = TS_XSAVE;
+	else
+		current_thread_info()->status = 0;
 	clear_used_math();
 	mxcsr_feature_mask_init();
-}
 
-#ifdef CONFIG_HOTPLUG_CPU
-void __cpuinit cpu_uninit(void)
-{
-	int cpu = raw_smp_processor_id();
-	cpu_clear(cpu, cpu_initialized);
+	/*
+	 * Boot processor to setup the FP and extended state context info.
+	 */
+	if (!smp_processor_id())
+		init_thread_xstate();
 
-	/* lazy TLB state */
-	per_cpu(cpu_tlbstate, cpu).state = 0;
-	per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
+	xsave_init();
 }
+
+
 #endif
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
deleted file mode 100644
index 7b8cc72feb40..000000000000
--- a/arch/x86/kernel/cpu/common_64.c
+++ /dev/null
@@ -1,681 +0,0 @@
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/string.h>
-#include <linux/bootmem.h>
-#include <linux/bitops.h>
-#include <linux/module.h>
-#include <linux/kgdb.h>
-#include <linux/topology.h>
-#include <linux/string.h>
-#include <linux/delay.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <asm/processor.h>
-#include <asm/i387.h>
-#include <asm/msr.h>
-#include <asm/io.h>
-#include <asm/mmu_context.h>
-#include <asm/mtrr.h>
-#include <asm/mce.h>
-#include <asm/pat.h>
-#include <asm/numa.h>
-#ifdef CONFIG_X86_LOCAL_APIC
-#include <asm/mpspec.h>
-#include <asm/apic.h>
-#include <mach_apic.h>
-#endif
-#include <asm/pda.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-#include <asm/desc.h>
-#include <asm/atomic.h>
-#include <asm/proto.h>
-#include <asm/sections.h>
-#include <asm/setup.h>
-#include <asm/genapic.h>
-
-#include "cpu.h"
-
-/* We need valid kernel segments for data and code in long mode too
- * IRET will check the segment types  kkeil 2000/10/28
- * Also sysret mandates a special GDT layout
- */
-/* The TLS descriptors are currently at a different place compared to i386.
-   Hopefully nobody expects them at a fixed place (Wine?) */
-DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
-	[GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
-	[GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
-	[GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
-	[GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
-	[GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
-	[GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
-} };
-EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
-
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
-
-/* Current gdt points %fs at the "master" per-cpu area: after this,
- * it's on the real one. */
-void switch_to_new_gdt(void)
-{
-	struct desc_ptr gdt_descr;
-
-	gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
-	gdt_descr.size = GDT_SIZE - 1;
-	load_gdt(&gdt_descr);
-}
-
-struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
-
-static void __cpuinit default_init(struct cpuinfo_x86 *c)
-{
-	display_cacheinfo(c);
-}
-
-static struct cpu_dev __cpuinitdata default_cpu = {
-	.c_init	= default_init,
-	.c_vendor = "Unknown",
-};
-static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
-
-int __cpuinit get_model_name(struct cpuinfo_x86 *c)
-{
-	unsigned int *v;
-
-	if (c->extended_cpuid_level < 0x80000004)
-		return 0;
-
-	v = (unsigned int *) c->x86_model_id;
-	cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
-	cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
-	cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
-	c->x86_model_id[48] = 0;
-	return 1;
-}
-
-
-void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
-{
-	unsigned int n, dummy, ebx, ecx, edx;
-
-	n = c->extended_cpuid_level;
-
-	if (n >= 0x80000005) {
-		cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
-		printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
-		       "D cache %dK (%d bytes/line)\n",
-		       edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
-		c->x86_cache_size = (ecx>>24) + (edx>>24);
-		/* On K8 L1 TLB is inclusive, so don't count it */
-		c->x86_tlbsize = 0;
-	}
-
-	if (n >= 0x80000006) {
-		cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
-		ecx = cpuid_ecx(0x80000006);
-		c->x86_cache_size = ecx >> 16;
-		c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
-
-		printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
-		c->x86_cache_size, ecx & 0xFF);
-	}
-}
-
-void __cpuinit detect_ht(struct cpuinfo_x86 *c)
-{
-#ifdef CONFIG_SMP
-	u32 eax, ebx, ecx, edx;
-	int index_msb, core_bits;
-
-	cpuid(1, &eax, &ebx, &ecx, &edx);
-
-
-	if (!cpu_has(c, X86_FEATURE_HT))
-		return;
-	if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
-		goto out;
-
-	smp_num_siblings = (ebx & 0xff0000) >> 16;
-
-	if (smp_num_siblings == 1) {
-		printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
-	} else if (smp_num_siblings > 1) {
-
-		if (smp_num_siblings > NR_CPUS) {
-			printk(KERN_WARNING "CPU: Unsupported number of "
-			       "siblings %d", smp_num_siblings);
-			smp_num_siblings = 1;
-			return;
-		}
-
-		index_msb = get_count_order(smp_num_siblings);
-		c->phys_proc_id = phys_pkg_id(index_msb);
-
-		smp_num_siblings = smp_num_siblings / c->x86_max_cores;
-
-		index_msb = get_count_order(smp_num_siblings);
-
-		core_bits = get_count_order(c->x86_max_cores);
-
-		c->cpu_core_id = phys_pkg_id(index_msb) &
-					       ((1 << core_bits) - 1);
-	}
-out:
-	if ((c->x86_max_cores * smp_num_siblings) > 1) {
-		printk(KERN_INFO  "CPU: Physical Processor ID: %d\n",
-		       c->phys_proc_id);
-		printk(KERN_INFO  "CPU: Processor Core ID: %d\n",
-		       c->cpu_core_id);
-	}
-
-#endif
-}
-
-static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
-{
-	char *v = c->x86_vendor_id;
-	int i;
-	static int printed;
-
-	for (i = 0; i < X86_VENDOR_NUM; i++) {
-		if (cpu_devs[i]) {
-			if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
-			    (cpu_devs[i]->c_ident[1] &&
-			    !strcmp(v, cpu_devs[i]->c_ident[1]))) {
-				c->x86_vendor = i;
-				this_cpu = cpu_devs[i];
-				return;
-			}
-		}
-	}
-	if (!printed) {
-		printed++;
-		printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
-		printk(KERN_ERR "CPU: Your system may be unstable.\n");
-	}
-	c->x86_vendor = X86_VENDOR_UNKNOWN;
-}
-
-static void __init early_cpu_support_print(void)
-{
-	int i,j;
-	struct cpu_dev *cpu_devx;
-
-	printk("KERNEL supported cpus:\n");
-	for (i = 0; i < X86_VENDOR_NUM; i++) {
-		cpu_devx = cpu_devs[i];
-		if (!cpu_devx)
-			continue;
-		for (j = 0; j < 2; j++) {
-			if (!cpu_devx->c_ident[j])
-				continue;
-			printk("  %s %s\n", cpu_devx->c_vendor,
-				cpu_devx->c_ident[j]);
-		}
-	}
-}
-
-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
-
-void __init early_cpu_init(void)
-{
-        struct cpu_vendor_dev *cvdev;
-
-        for (cvdev = __x86cpuvendor_start ;
-             cvdev < __x86cpuvendor_end   ;
-             cvdev++)
-                cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
-	early_cpu_support_print();
-	early_identify_cpu(&boot_cpu_data);
-}
-
-/* Do some early cpuid on the boot CPU to get some parameter that are
-   needed before check_bugs. Everything advanced is in identify_cpu
-   below. */
-static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
-{
-	u32 tfms, xlvl;
-
-	c->loops_per_jiffy = loops_per_jiffy;
-	c->x86_cache_size = -1;
-	c->x86_vendor = X86_VENDOR_UNKNOWN;
-	c->x86_model = c->x86_mask = 0;	/* So far unknown... */
-	c->x86_vendor_id[0] = '\0'; /* Unset */
-	c->x86_model_id[0] = '\0';  /* Unset */
-	c->x86_clflush_size = 64;
-	c->x86_cache_alignment = c->x86_clflush_size;
-	c->x86_max_cores = 1;
-	c->x86_coreid_bits = 0;
-	c->extended_cpuid_level = 0;
-	memset(&c->x86_capability, 0, sizeof c->x86_capability);
-
-	/* Get vendor name */
-	cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
-	      (unsigned int *)&c->x86_vendor_id[0],
-	      (unsigned int *)&c->x86_vendor_id[8],
-	      (unsigned int *)&c->x86_vendor_id[4]);
-
-	get_cpu_vendor(c);
-
-	/* Initialize the standard set of capabilities */
-	/* Note that the vendor-specific code below might override */
-
-	/* Intel-defined flags: level 0x00000001 */
-	if (c->cpuid_level >= 0x00000001) {
-		__u32 misc;
-		cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
-		      &c->x86_capability[0]);
-		c->x86 = (tfms >> 8) & 0xf;
-		c->x86_model = (tfms >> 4) & 0xf;
-		c->x86_mask = tfms & 0xf;
-		if (c->x86 == 0xf)
-			c->x86 += (tfms >> 20) & 0xff;
-		if (c->x86 >= 0x6)
-			c->x86_model += ((tfms >> 16) & 0xF) << 4;
-		if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
-			c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
-	} else {
-		/* Have CPUID level 0 only - unheard of */
-		c->x86 = 4;
-	}
-
-	c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
-#ifdef CONFIG_SMP
-	c->phys_proc_id = c->initial_apicid;
-#endif
-	/* AMD-defined flags: level 0x80000001 */
-	xlvl = cpuid_eax(0x80000000);
-	c->extended_cpuid_level = xlvl;
-	if ((xlvl & 0xffff0000) == 0x80000000) {
-		if (xlvl >= 0x80000001) {
-			c->x86_capability[1] = cpuid_edx(0x80000001);
-			c->x86_capability[6] = cpuid_ecx(0x80000001);
-		}
-		if (xlvl >= 0x80000004)
-			get_model_name(c); /* Default name */
-	}
-
-	/* Transmeta-defined flags: level 0x80860001 */
-	xlvl = cpuid_eax(0x80860000);
-	if ((xlvl & 0xffff0000) == 0x80860000) {
-		/* Don't set x86_cpuid_level here for now to not confuse. */
-		if (xlvl >= 0x80860001)
-			c->x86_capability[2] = cpuid_edx(0x80860001);
-	}
-
-	c->extended_cpuid_level = cpuid_eax(0x80000000);
-	if (c->extended_cpuid_level >= 0x80000007)
-		c->x86_power = cpuid_edx(0x80000007);
-
-	if (c->extended_cpuid_level >= 0x80000008) {
-		u32 eax = cpuid_eax(0x80000008);
-
-		c->x86_virt_bits = (eax >> 8) & 0xff;
-		c->x86_phys_bits = eax & 0xff;
-	}
-
-	/* Assume all 64-bit CPUs support 32-bit syscall */
-	set_cpu_cap(c, X86_FEATURE_SYSCALL32);
-
-	if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
-	    cpu_devs[c->x86_vendor]->c_early_init)
-		cpu_devs[c->x86_vendor]->c_early_init(c);
-
-	validate_pat_support(c);
-
-	/* early_param could clear that, but recall get it set again */
-	if (disable_apic)
-		clear_cpu_cap(c, X86_FEATURE_APIC);
-}
-
-/*
- * This does the hard work of actually picking apart the CPU stuff...
- */
-static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
-{
-	int i;
-
-	early_identify_cpu(c);
-
-	init_scattered_cpuid_features(c);
-
-	c->apicid = phys_pkg_id(0);
-
-	/*
-	 * Vendor-specific initialization.  In this section we
-	 * canonicalize the feature flags, meaning if there are
-	 * features a certain CPU supports which CPUID doesn't
-	 * tell us, CPUID claiming incorrect flags, or other bugs,
-	 * we handle them here.
-	 *
-	 * At the end of this section, c->x86_capability better
-	 * indicate the features this CPU genuinely supports!
-	 */
-	if (this_cpu->c_init)
-		this_cpu->c_init(c);
-
-	detect_ht(c);
-
-	/*
-	 * On SMP, boot_cpu_data holds the common feature set between
-	 * all CPUs; so make sure that we indicate which features are
-	 * common between the CPUs.  The first time this routine gets
-	 * executed, c == &boot_cpu_data.
-	 */
-	if (c != &boot_cpu_data) {
-		/* AND the already accumulated flags with these */
-		for (i = 0; i < NCAPINTS; i++)
-			boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
-	}
-
-	/* Clear all flags overriden by options */
-	for (i = 0; i < NCAPINTS; i++)
-		c->x86_capability[i] &= ~cleared_cpu_caps[i];
-
-#ifdef CONFIG_X86_MCE
-	mcheck_init(c);
-#endif
-	select_idle_routine(c);
-
-#ifdef CONFIG_NUMA
-	numa_add_cpu(smp_processor_id());
-#endif
-
-}
-
-void __cpuinit identify_boot_cpu(void)
-{
-	identify_cpu(&boot_cpu_data);
-}
-
-void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
-{
-	BUG_ON(c == &boot_cpu_data);
-	identify_cpu(c);
-	mtrr_ap_init();
-}
-
-static __init int setup_noclflush(char *arg)
-{
-	setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
-	return 1;
-}
-__setup("noclflush", setup_noclflush);
-
-void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
-{
-	if (c->x86_model_id[0])
-		printk(KERN_CONT "%s", c->x86_model_id);
-
-	if (c->x86_mask || c->cpuid_level >= 0)
-		printk(KERN_CONT " stepping %02x\n", c->x86_mask);
-	else
-		printk(KERN_CONT "\n");
-}
-
-static __init int setup_disablecpuid(char *arg)
-{
-	int bit;
-	if (get_option(&arg, &bit) && bit < NCAPINTS*32)
-		setup_clear_cpu_cap(bit);
-	else
-		return 0;
-	return 1;
-}
-__setup("clearcpuid=", setup_disablecpuid);
-
-cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
-
-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
-
-struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
-
-char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
-
-unsigned long __supported_pte_mask __read_mostly = ~0UL;
-EXPORT_SYMBOL_GPL(__supported_pte_mask);
-
-static int do_not_nx __cpuinitdata;
-
-/* noexec=on|off
-Control non executable mappings for 64bit processes.
-
-on	Enable(default)
-off	Disable
-*/
-static int __init nonx_setup(char *str)
-{
-	if (!str)
-		return -EINVAL;
-	if (!strncmp(str, "on", 2)) {
-		__supported_pte_mask |= _PAGE_NX;
-		do_not_nx = 0;
-	} else if (!strncmp(str, "off", 3)) {
-		do_not_nx = 1;
-		__supported_pte_mask &= ~_PAGE_NX;
-	}
-	return 0;
-}
-early_param("noexec", nonx_setup);
-
-int force_personality32;
-
-/* noexec32=on|off
-Control non executable heap for 32bit processes.
-To control the stack too use noexec=off
-
-on	PROT_READ does not imply PROT_EXEC for 32bit processes (default)
-off	PROT_READ implies PROT_EXEC
-*/
-static int __init nonx32_setup(char *str)
-{
-	if (!strcmp(str, "on"))
-		force_personality32 &= ~READ_IMPLIES_EXEC;
-	else if (!strcmp(str, "off"))
-		force_personality32 |= READ_IMPLIES_EXEC;
-	return 1;
-}
-__setup("noexec32=", nonx32_setup);
-
-void pda_init(int cpu)
-{
-	struct x8664_pda *pda = cpu_pda(cpu);
-
-	/* Setup up data that may be needed in __get_free_pages early */
-	loadsegment(fs, 0);
-	loadsegment(gs, 0);
-	/* Memory clobbers used to order PDA accessed */
-	mb();
-	wrmsrl(MSR_GS_BASE, pda);
-	mb();
-
-	pda->cpunumber = cpu;
-	pda->irqcount = -1;
-	pda->kernelstack = (unsigned long)stack_thread_info() -
-				 PDA_STACKOFFSET + THREAD_SIZE;
-	pda->active_mm = &init_mm;
-	pda->mmu_state = 0;
-
-	if (cpu == 0) {
-		/* others are initialized in smpboot.c */
-		pda->pcurrent = &init_task;
-		pda->irqstackptr = boot_cpu_stack;
-	} else {
-		pda->irqstackptr = (char *)
-			__get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
-		if (!pda->irqstackptr)
-			panic("cannot allocate irqstack for cpu %d", cpu);
-
-		if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
-			pda->nodenumber = cpu_to_node(cpu);
-	}
-
-	pda->irqstackptr += IRQSTACKSIZE-64;
-}
-
-char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
-			   DEBUG_STKSZ]
-__attribute__((section(".bss.page_aligned")));
-
-extern asmlinkage void ignore_sysret(void);
-
-/* May not be marked __init: used by software suspend */
-void syscall_init(void)
-{
-	/*
-	 * LSTAR and STAR live in a bit strange symbiosis.
-	 * They both write to the same internal register. STAR allows to
-	 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
-	 */
-	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
-	wrmsrl(MSR_LSTAR, system_call);
-	wrmsrl(MSR_CSTAR, ignore_sysret);
-
-#ifdef CONFIG_IA32_EMULATION
-	syscall32_cpu_init();
-#endif
-
-	/* Flags to clear on syscall */
-	wrmsrl(MSR_SYSCALL_MASK,
-	       X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
-}
-
-void __cpuinit check_efer(void)
-{
-	unsigned long efer;
-
-	rdmsrl(MSR_EFER, efer);
-	if (!(efer & EFER_NX) || do_not_nx)
-		__supported_pte_mask &= ~_PAGE_NX;
-}
-
-unsigned long kernel_eflags;
-
-/*
- * Copies of the original ist values from the tss are only accessed during
- * debugging, no special alignment required.
- */
-DEFINE_PER_CPU(struct orig_ist, orig_ist);
-
-/*
- * cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- * A lot of state is already set up in PDA init.
- */
-void __cpuinit cpu_init(void)
-{
-	int cpu = stack_smp_processor_id();
-	struct tss_struct *t = &per_cpu(init_tss, cpu);
-	struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
-	unsigned long v;
-	char *estacks = NULL;
-	struct task_struct *me;
-	int i;
-
-	/* CPU 0 is initialised in head64.c */
-	if (cpu != 0)
-		pda_init(cpu);
-	else
-		estacks = boot_exception_stacks;
-
-	me = current;
-
-	if (cpu_test_and_set(cpu, cpu_initialized))
-		panic("CPU#%d already initialized!\n", cpu);
-
-	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
-
-	clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
-
-	/*
-	 * Initialize the per-CPU GDT with the boot GDT,
-	 * and set up the GDT descriptor:
-	 */
-
-	switch_to_new_gdt();
-	load_idt((const struct desc_ptr *)&idt_descr);
-
-	memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
-	syscall_init();
-
-	wrmsrl(MSR_FS_BASE, 0);
-	wrmsrl(MSR_KERNEL_GS_BASE, 0);
-	barrier();
-
-	check_efer();
-
-	/*
-	 * set up and load the per-CPU TSS
-	 */
-	for (v = 0; v < N_EXCEPTION_STACKS; v++) {
-		static const unsigned int order[N_EXCEPTION_STACKS] = {
-			[0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
-			[DEBUG_STACK - 1] = DEBUG_STACK_ORDER
-		};
-		if (cpu) {
-			estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
-			if (!estacks)
-				panic("Cannot allocate exception stack %ld %d\n",
-				      v, cpu);
-		}
-		estacks += PAGE_SIZE << order[v];
-		orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
-	}
-
-	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
-	/*
-	 * <= is required because the CPU will access up to
-	 * 8 bits beyond the end of the IO permission bitmap.
-	 */
-	for (i = 0; i <= IO_BITMAP_LONGS; i++)
-		t->io_bitmap[i] = ~0UL;
-
-	atomic_inc(&init_mm.mm_count);
-	me->active_mm = &init_mm;
-	if (me->mm)
-		BUG();
-	enter_lazy_tlb(&init_mm, me);
-
-	load_sp0(t, &current->thread);
-	set_tss_desc(cpu, t);
-	load_TR_desc();
-	load_LDT(&init_mm.context);
-
-#ifdef CONFIG_KGDB
-	/*
-	 * If the kgdb is connected no debug regs should be altered.  This
-	 * is only applicable when KGDB and a KGDB I/O module are built
-	 * into the kernel and you are using early debugging with
-	 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
-	 */
-	if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
-		arch_kgdb_ops.correct_hw_break();
-	else {
-#endif
-	/*
-	 * Clear all 6 debug registers:
-	 */
-
-	set_debugreg(0UL, 0);
-	set_debugreg(0UL, 1);
-	set_debugreg(0UL, 2);
-	set_debugreg(0UL, 3);
-	set_debugreg(0UL, 6);
-	set_debugreg(0UL, 7);
-#ifdef CONFIG_KGDB
-	/* If the kgdb is connected no debug regs should be altered. */
-	}
-#endif
-
-	fpu_init();
-
-	raw_local_save_flags(kernel_eflags);
-
-	if (is_uv_system())
-		uv_cpu_init();
-}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 4d894e8565fe..de4094a39210 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -21,23 +21,16 @@ struct cpu_dev {
 	void		(*c_init)(struct cpuinfo_x86 * c);
 	void		(*c_identify)(struct cpuinfo_x86 * c);
 	unsigned int	(*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
+	int	c_x86_vendor;
 };
 
-extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM];
+#define cpu_dev_register(cpu_devX) \
+	static struct cpu_dev *__cpu_dev_##cpu_devX __used \
+	__attribute__((__section__(".x86_cpu_dev.init"))) = \
+	&cpu_devX;
 
-struct cpu_vendor_dev {
-	int vendor;
-	struct cpu_dev *cpu_dev;
-};
-
-#define cpu_vendor_dev_register(cpu_vendor_id, cpu_dev) \
-	static struct cpu_vendor_dev __cpu_vendor_dev_##cpu_vendor_id __used \
-	__attribute__((__section__(".x86cpuvendor.init"))) = \
-	{ cpu_vendor_id, cpu_dev }
-
-extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[];
+extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
 
-extern int get_model_name(struct cpuinfo_x86 *c);
 extern void display_cacheinfo(struct cpuinfo_x86 *c);
 
 #endif
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index cb7a5715596d..efae3b22a0ff 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -235,9 +235,9 @@ config X86_LONGHAUL
 	  If in doubt, say N.
 
 config X86_E_POWERSAVER
-	tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)"
+	tristate "VIA C7 Enhanced PowerSaver"
 	select CPU_FREQ_TABLE
-	depends on X86_32 && EXPERIMENTAL
+	depends on X86_32
 	help
 	  This adds the CPUFreq driver for VIA C7 processors.
 
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index b0c8208df9fa..8e48c5d4467d 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -202,7 +202,7 @@ static void drv_write(struct drv_cmd *cmd)
 	cpumask_t saved_mask = current->cpus_allowed;
 	unsigned int i;
 
-	for_each_cpu_mask(i, cmd->mask) {
+	for_each_cpu_mask_nr(i, cmd->mask) {
 		set_cpus_allowed_ptr(current, &cpumask_of_cpu(i));
 		do_drv_write(cmd);
 	}
@@ -256,7 +256,8 @@ static u32 get_cur_val(const cpumask_t *mask)
  * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
  * no meaning should be associated with absolute values of these MSRs.
  */
-static unsigned int get_measured_perf(unsigned int cpu)
+static unsigned int get_measured_perf(struct cpufreq_policy *policy,
+				      unsigned int cpu)
 {
 	union {
 		struct {
@@ -326,7 +327,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
 
 #endif
 
-	retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100;
+	retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100;
 
 	put_cpu();
 	set_cpus_allowed_ptr(current, &saved_mask);
@@ -451,7 +452,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 
 	freqs.old = perf->states[perf->state].core_frequency * 1000;
 	freqs.new = data->freq_table[next_state].frequency;
-	for_each_cpu_mask(i, cmd.mask) {
+	for_each_cpu_mask_nr(i, cmd.mask) {
 		freqs.cpu = i;
 		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 	}
@@ -466,7 +467,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		}
 	}
 
-	for_each_cpu_mask(i, cmd.mask) {
+	for_each_cpu_mask_nr(i, cmd.mask) {
 		freqs.cpu = i;
 		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 	}
@@ -779,13 +780,20 @@ static int __init acpi_cpufreq_init(void)
 {
 	int ret;
 
+	if (acpi_disabled)
+		return 0;
+
 	dprintk("acpi_cpufreq_init\n");
 
 	ret = acpi_cpufreq_early_init();
 	if (ret)
 		return ret;
 
-	return cpufreq_register_driver(&acpi_cpufreq_driver);
+	ret = cpufreq_register_driver(&acpi_cpufreq_driver);
+	if (ret)
+		free_percpu(acpi_perf_data);
+
+	return ret;
 }
 
 static void __exit acpi_cpufreq_exit(void)
@@ -795,8 +803,6 @@ static void __exit acpi_cpufreq_exit(void)
 	cpufreq_unregister_driver(&acpi_cpufreq_driver);
 
 	free_percpu(acpi_perf_data);
-
-	return;
 }
 
 module_param(acpi_pstate_strict, uint, 0644);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index 94619c22f563..fe613c93b366 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -25,8 +25,8 @@
 #include <linux/cpufreq.h>
 
 #include <asm/msr.h>
-#include <asm/timex.h>
-#include <asm/io.h>
+#include <linux/timex.h>
+#include <linux/io.h>
 
 #define REG_CSCIR 0x22		/* Chip Setup and Control Index Register    */
 #define REG_CSCDR 0x23		/* Chip Setup and Control Data  Register    */
@@ -44,7 +44,7 @@ struct s_elan_multiplier {
  * It is important that the frequencies
  * are listed in ascending order here!
  */
-struct s_elan_multiplier elan_multiplier[] = {
+static struct s_elan_multiplier elan_multiplier[] = {
 	{1000,	0x02,	0x18},
 	{2000,	0x02,	0x10},
 	{4000,	0x02,	0x08},
@@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
 	u8 clockspeed_reg;    /* Clock Speed Register */
 
 	local_irq_disable();
-	outb_p(0x80,REG_CSCIR);
+	outb_p(0x80, REG_CSCIR);
 	clockspeed_reg = inb_p(REG_CSCDR);
 	local_irq_enable();
 
@@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
 	}
 
 	/* 33 MHz is not 32 MHz... */
-	if ((clockspeed_reg & 0xE0)==0xA0)
+	if ((clockspeed_reg & 0xE0) == 0xA0)
 		return 33000;
 
-	return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000);
+	return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
 }
 
 
@@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
  *	There is no return value.
  */
 
-static void elanfreq_set_cpu_state (unsigned int state)
+static void elanfreq_set_cpu_state(unsigned int state)
 {
 	struct cpufreq_freqs    freqs;
 
@@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state (unsigned int state)
 	 */
 
 	local_irq_disable();
-	outb_p(0x40,REG_CSCIR);		/* Disable hyperspeed mode */
-	outb_p(0x00,REG_CSCDR);
+	outb_p(0x40, REG_CSCIR);		/* Disable hyperspeed mode */
+	outb_p(0x00, REG_CSCDR);
 	local_irq_enable();		/* wait till internal pipelines and */
 	udelay(1000);			/* buffers have cleaned up          */
 
 	local_irq_disable();
 
 	/* now, set the CPU clock speed register (0x80) */
-	outb_p(0x80,REG_CSCIR);
-	outb_p(elan_multiplier[state].val80h,REG_CSCDR);
+	outb_p(0x80, REG_CSCIR);
+	outb_p(elan_multiplier[state].val80h, REG_CSCDR);
 
 	/* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
-	outb_p(0x40,REG_CSCIR);
-	outb_p(elan_multiplier[state].val40h,REG_CSCDR);
+	outb_p(0x40, REG_CSCIR);
+	outb_p(elan_multiplier[state].val40h, REG_CSCDR);
 	udelay(10000);
 	local_irq_enable();
 
@@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state (unsigned int state)
  *	for the hardware supported by the driver.
  */
 
-static int elanfreq_verify (struct cpufreq_policy *policy)
+static int elanfreq_verify(struct cpufreq_policy *policy)
 {
 	return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
 }
 
-static int elanfreq_target (struct cpufreq_policy *policy,
+static int elanfreq_target(struct cpufreq_policy *policy,
 			    unsigned int target_freq,
 			    unsigned int relation)
 {
@@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
 
 	/* capability check */
 	if ((c->x86_vendor != X86_VENDOR_AMD) ||
-	    (c->x86 != 4) || (c->x86_model!=10))
+	    (c->x86 != 4) || (c->x86_model != 10))
 		return -ENODEV;
 
 	/* max freq */
@@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
 		max_freq = elanfreq_get_cpu_frequency(0);
 
 	/* table init */
-	for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
+	for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
 		if (elanfreq_table[i].frequency > max_freq)
 			elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
 	}
@@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
 
 	result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
 	if (result)
-		return (result);
+		return result;
 
 	cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
 	return 0;
@@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup);
 #endif
 
 
-static struct freq_attr* elanfreq_attr[] = {
+static struct freq_attr *elanfreq_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
 	NULL,
 };
@@ -284,9 +284,9 @@ static int __init elanfreq_init(void)
 
 	/* Test if we have the right hardware */
 	if ((c->x86_vendor != X86_VENDOR_AMD) ||
-		(c->x86 != 4) || (c->x86_model!=10)) {
+		(c->x86 != 4) || (c->x86_model != 10)) {
 		printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
-                return -ENODEV;
+		return -ENODEV;
 	}
 	return cpufreq_register_driver(&elanfreq_driver);
 }
@@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void)
 }
 
 
-module_param (max_freq, int, 0444);
+module_param(max_freq, int, 0444);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>");
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 06fcce516d51..b0461856acfb 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -1,5 +1,5 @@
 /*
- *  (C) 2001-2004  Dave Jones. <davej@codemonkey.org.uk>
+ *  (C) 2001-2004  Dave Jones. <davej@redhat.com>
  *  (C) 2002  Padraig Brady. <padraig@antefacto.com>
  *
  *  Licensed under the terms of the GNU GPL License version 2.
@@ -1019,7 +1019,7 @@ MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
 module_param(revid_errata, int, 0644);
 MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
 
-MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
+MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
 MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
 MODULE_LICENSE ("GPL");
 
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 199e4e05e5dc..b8e05ee4f736 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
 		return 0;
 
 	/* notifiers */
-	for_each_cpu_mask(i, policy->cpus) {
+	for_each_cpu_mask_nr(i, policy->cpus) {
 		freqs.cpu = i;
 		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 	}
@@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
 	/* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software
 	 * Developer's Manual, Volume 3
 	 */
-	for_each_cpu_mask(i, policy->cpus)
+	for_each_cpu_mask_nr(i, policy->cpus)
 		cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
 
 	/* notifiers */
-	for_each_cpu_mask(i, policy->cpus) {
+	for_each_cpu_mask_nr(i, policy->cpus) {
 		freqs.cpu = i;
 		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 	}
@@ -171,7 +171,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
 	}
 
 	if (c->x86 != 0xF) {
-		printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n");
+		printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n");
 		return 0;
 	}
 
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index eb9b62b0830c..c1ac5790c63e 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -15,12 +15,11 @@
 #include <linux/slab.h>
 
 #include <asm/msr.h>
-#include <asm/timex.h>
-#include <asm/io.h>
+#include <linux/timex.h>
+#include <linux/io.h>
 
-
-#define POWERNOW_IOPORT 0xfff0         /* it doesn't matter where, as long
-					  as it is unused */
+#define POWERNOW_IOPORT 0xfff0          /* it doesn't matter where, as long
+					   as it is unused */
 
 static unsigned int                     busfreq;   /* FSB, in 10 kHz */
 static unsigned int                     max_multiplier;
@@ -53,7 +52,7 @@ static int powernow_k6_get_cpu_multiplier(void)
 
 	msrval = POWERNOW_IOPORT + 0x1;
 	wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
-	invalue=inl(POWERNOW_IOPORT + 0x8);
+	invalue = inl(POWERNOW_IOPORT + 0x8);
 	msrval = POWERNOW_IOPORT + 0x0;
 	wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
 
@@ -67,9 +66,9 @@ static int powernow_k6_get_cpu_multiplier(void)
  *
  *   Tries to change the PowerNow! multiplier
  */
-static void powernow_k6_set_state (unsigned int best_i)
+static void powernow_k6_set_state(unsigned int best_i)
 {
-	unsigned long           outvalue=0, invalue=0;
+	unsigned long           outvalue = 0, invalue = 0;
 	unsigned long           msrval;
 	struct cpufreq_freqs    freqs;
 
@@ -90,10 +89,10 @@ static void powernow_k6_set_state (unsigned int best_i)
 
 	msrval = POWERNOW_IOPORT + 0x1;
 	wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
-	invalue=inl(POWERNOW_IOPORT + 0x8);
+	invalue = inl(POWERNOW_IOPORT + 0x8);
 	invalue = invalue & 0xf;
 	outvalue = outvalue | invalue;
-	outl(outvalue ,(POWERNOW_IOPORT + 0x8));
+	outl(outvalue , (POWERNOW_IOPORT + 0x8));
 	msrval = POWERNOW_IOPORT + 0x0;
 	wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
 
@@ -124,7 +123,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy)
  *
  * sets a new CPUFreq policy
  */
-static int powernow_k6_target (struct cpufreq_policy *policy,
+static int powernow_k6_target(struct cpufreq_policy *policy,
 			       unsigned int target_freq,
 			       unsigned int relation)
 {
@@ -152,7 +151,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
 	busfreq = cpu_khz / max_multiplier;
 
 	/* table init */
-	for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
+	for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
 		if (clock_ratio[i].index > max_multiplier)
 			clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
 		else
@@ -165,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
 
 	result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
 	if (result)
-		return (result);
+		return result;
 
 	cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
 
@@ -176,8 +175,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
 static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
 {
 	unsigned int i;
-	for (i=0; i<8; i++) {
-		if (i==max_multiplier)
+	for (i = 0; i < 8; i++) {
+		if (i == max_multiplier)
 			powernow_k6_set_state(i);
 	}
 	cpufreq_frequency_table_put_attr(policy->cpu);
@@ -189,7 +188,7 @@ static unsigned int powernow_k6_get(unsigned int cpu)
 	return busfreq * powernow_k6_get_cpu_multiplier();
 }
 
-static struct freq_attr* powernow_k6_attr[] = {
+static struct freq_attr *powernow_k6_attr[] = {
 	&cpufreq_freq_attr_scaling_available_freqs,
 	NULL,
 };
@@ -227,7 +226,7 @@ static int __init powernow_k6_init(void)
 	}
 
 	if (cpufreq_register_driver(&powernow_k6_driver)) {
-		release_region (POWERNOW_IOPORT, 16);
+		release_region(POWERNOW_IOPORT, 16);
 		return -EINVAL;
 	}
 
@@ -243,13 +242,13 @@ static int __init powernow_k6_init(void)
 static void __exit powernow_k6_exit(void)
 {
 	cpufreq_unregister_driver(&powernow_k6_driver);
-	release_region (POWERNOW_IOPORT, 16);
+	release_region(POWERNOW_IOPORT, 16);
 }
 
 
-MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
-MODULE_LICENSE ("GPL");
+MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
+MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
+MODULE_LICENSE("GPL");
 
 module_init(powernow_k6_init);
 module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 0a61159d7b71..7c7d56b43136 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -1,6 +1,6 @@
 /*
  *  AMD K7 Powernow driver.
- *  (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs.
+ *  (C) 2003 Dave Jones on behalf of SuSE Labs.
  *  (C) 2003-2004 Dave Jones <davej@redhat.com>
  *
  *  Licensed under the terms of the GNU GPL License version 2.
@@ -692,7 +692,7 @@ static void __exit powernow_exit (void)
 module_param(acpi_force,  int, 0444);
 MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
 
-MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>");
+MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
 MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors.");
 MODULE_LICENSE ("GPL");
 
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
index f8a63b3664e3..35fb4eaf6e1c 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
@@ -1,5 +1,4 @@
 /*
- *  $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $
  *  (C) 2003 Dave Jones.
  *
  *  Licensed under the terms of the GNU GPL License version 2.
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 206791eb46e3..d3dcd58b87cd 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -7,7 +7,7 @@
  *  Support : mark.langsdorf@amd.com
  *
  *  Based on the powernow-k7.c module written by Dave Jones.
- *  (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs
+ *  (C) 2003 Dave Jones on behalf of SuSE Labs
  *  (C) 2004 Dominik Brodowski <linux@brodo.de>
  *  (C) 2004 Pavel Machek <pavel@suse.cz>
  *  Licensed under the terms of the GNU GPL License version 2.
@@ -45,7 +45,6 @@
 #endif
 
 #define PFX "powernow-k8: "
-#define BFX PFX "BIOS error: "
 #define VERSION "version 2.20.00"
 #include "powernow-k8.h"
 
@@ -66,7 +65,6 @@ static u32 find_freq_from_fid(u32 fid)
 	return 800 + (fid * 100);
 }
 
-
 /* Return a frequency in KHz, given an input fid */
 static u32 find_khz_freq_from_fid(u32 fid)
 {
@@ -78,7 +76,6 @@ static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 p
 	return data[pstate].frequency;
 }
 
-
 /* Return the vco fid for an input fid
  *
  * Each "low" fid has corresponding "high" fid, and you can get to "low" fids
@@ -166,7 +163,6 @@ static void fidvid_msr_init(void)
 	wrmsr(MSR_FIDVID_CTL, lo, hi);
 }
 
-
 /* write the new fid value along with the other control fields to the msr */
 static int write_new_fid(struct powernow_k8_data *data, u32 fid)
 {
@@ -539,35 +535,40 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8
 
 	for (j = 0; j < data->numps; j++) {
 		if (pst[j].vid > LEAST_VID) {
-			printk(KERN_ERR PFX "vid %d invalid : 0x%x\n", j, pst[j].vid);
+			printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
+			       j, pst[j].vid);
 			return -EINVAL;
 		}
 		if (pst[j].vid < data->rvo) {	/* vid + rvo >= 0 */
-			printk(KERN_ERR BFX "0 vid exceeded with pstate %d\n", j);
+			printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
+			       " %d\n", j);
 			return -ENODEV;
 		}
 		if (pst[j].vid < maxvid + data->rvo) {	/* vid + rvo >= maxvid */
-			printk(KERN_ERR BFX "maxvid exceeded with pstate %d\n", j);
+			printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
+			       " %d\n", j);
 			return -ENODEV;
 		}
 		if (pst[j].fid > MAX_FID) {
-			printk(KERN_ERR BFX "maxfid exceeded with pstate %d\n", j);
+			printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
+			       " %d\n", j);
 			return -ENODEV;
 		}
 		if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
 			/* Only first fid is allowed to be in "low" range */
-			printk(KERN_ERR BFX "two low fids - %d : 0x%x\n", j, pst[j].fid);
+			printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
+			       "0x%x\n", j, pst[j].fid);
 			return -EINVAL;
 		}
 		if (pst[j].fid < lastfid)
 			lastfid = pst[j].fid;
 	}
 	if (lastfid & 1) {
-		printk(KERN_ERR BFX "lastfid invalid\n");
+		printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
 		return -EINVAL;
 	}
 	if (lastfid > LO_FID_TABLE_TOP)
-		printk(KERN_INFO BFX  "first fid not from lo freq table\n");
+		printk(KERN_INFO FW_BUG PFX  "first fid not from lo freq table\n");
 
 	return 0;
 }
@@ -675,13 +676,13 @@ static int find_psb_table(struct powernow_k8_data *data)
 
 		dprintk("table vers: 0x%x\n", psb->tableversion);
 		if (psb->tableversion != PSB_VERSION_1_4) {
-			printk(KERN_ERR BFX "PSB table is not v1.4\n");
+			printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
 			return -ENODEV;
 		}
 
 		dprintk("flags: 0x%x\n", psb->flags1);
 		if (psb->flags1) {
-			printk(KERN_ERR BFX "unknown flags\n");
+			printk(KERN_ERR FW_BUG PFX "unknown flags\n");
 			return -ENODEV;
 		}
 
@@ -708,7 +709,7 @@ static int find_psb_table(struct powernow_k8_data *data)
 			}
 		}
 		if (cpst != 1) {
-			printk(KERN_ERR BFX "numpst must be 1\n");
+			printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
 			return -ENODEV;
 		}
 
@@ -966,7 +967,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
 	freqs.old = find_khz_freq_from_fid(data->currfid);
 	freqs.new = find_khz_freq_from_fid(fid);
 
-	for_each_cpu_mask(i, *(data->available_cores)) {
+	for_each_cpu_mask_nr(i, *(data->available_cores)) {
 		freqs.cpu = i;
 		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 	}
@@ -974,7 +975,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
 	res = transition_fid_vid(data, fid, vid);
 	freqs.new = find_khz_freq_from_fid(data->currfid);
 
-	for_each_cpu_mask(i, *(data->available_cores)) {
+	for_each_cpu_mask_nr(i, *(data->available_cores)) {
 		freqs.cpu = i;
 		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 	}
@@ -997,7 +998,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
 	freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate);
 	freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
 
-	for_each_cpu_mask(i, *(data->available_cores)) {
+	for_each_cpu_mask_nr(i, *(data->available_cores)) {
 		freqs.cpu = i;
 		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 	}
@@ -1005,7 +1006,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
 	res = transition_pstate(data, pstate);
 	freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
 
-	for_each_cpu_mask(i, *(data->available_cores)) {
+	for_each_cpu_mask_nr(i, *(data->available_cores)) {
 		freqs.cpu = i;
 		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 	}
@@ -1133,17 +1134,19 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 			       "ACPI Processor module before starting this "
 			       "driver.\n");
 #else
-			printk(KERN_ERR PFX "Your BIOS does not provide ACPI "
-			       "_PSS objects in a way that Linux understands. "
-			       "Please report this to the Linux ACPI maintainers"
-			       " and complain to your BIOS vendor.\n");
+			printk(KERN_ERR FW_BUG PFX "Your BIOS does not provide"
+			       " ACPI _PSS objects in a way that Linux "
+			       "understands. Please report this to the Linux "
+			       "ACPI maintainers and complain to your BIOS "
+			       "vendor.\n");
 #endif
 			kfree(data);
 			return -ENODEV;
 		}
 		if (pol->cpu != 0) {
-			printk(KERN_ERR PFX "No ACPI _PSS objects for CPU other than "
-			       "CPU0. Complain to your BIOS vendor.\n");
+			printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
+			       "CPU other than CPU0. Complain to your BIOS "
+			       "vendor.\n");
 			kfree(data);
 			return -ENODEV;
 		}
@@ -1196,7 +1199,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
 
 	/* min/max the cpu is capable of */
 	if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
-		printk(KERN_ERR PFX "invalid powernow_table\n");
+		printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
 		powernow_k8_cpu_exit_acpi(data);
 		kfree(data->powernow_table);
 		kfree(data);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 908dd347c67e..3b5f06423e77 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -26,9 +26,10 @@
 #include <asm/cpufeature.h>
 
 #define PFX		"speedstep-centrino: "
-#define MAINTAINER	"cpufreq@lists.linux.org.uk"
+#define MAINTAINER	"cpufreq@vger.kernel.org"
 
-#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
+#define dprintk(msg...) \
+	cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
 
 #define INTEL_MSR_RANGE	(0xffff)
 
@@ -66,11 +67,12 @@ struct cpu_model
 
 	struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
 };
-static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x);
+static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
+				  const struct cpu_id *x);
 
 /* Operating points for current CPU */
-static struct cpu_model *centrino_model[NR_CPUS];
-static const struct cpu_id *centrino_cpu[NR_CPUS];
+static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
+static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
 
 static struct cpufreq_driver centrino_driver;
 
@@ -255,7 +257,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
 		return -ENOENT;
 	}
 
-	centrino_model[policy->cpu] = model;
+	per_cpu(centrino_model, policy->cpu) = model;
 
 	dprintk("found \"%s\": max frequency: %dkHz\n",
 	       model->model_name, model->max_freq);
@@ -264,10 +266,14 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
 }
 
 #else
-static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; }
+static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
+{
+	return -ENODEV;
+}
 #endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
 
-static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x)
+static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
+				  const struct cpu_id *x)
 {
 	if ((c->x86 == x->x86) &&
 	    (c->x86_model == x->x86_model) &&
@@ -286,23 +292,28 @@ static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
 	 * for centrino, as some DSDTs are buggy.
 	 * Ideally, this can be done using the acpi_data structure.
 	 */
-	if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) ||
-	    (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) ||
-	    (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) {
+	if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
+	    (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
+	    (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
 		msr = (msr >> 8) & 0xff;
 		return msr * 100000;
 	}
 
-	if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points))
+	if ((!per_cpu(centrino_model, cpu)) ||
+	    (!per_cpu(centrino_model, cpu)->op_points))
 		return 0;
 
 	msr &= 0xffff;
-	for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) {
-		if (msr == centrino_model[cpu]->op_points[i].index)
-			return centrino_model[cpu]->op_points[i].frequency;
+	for (i = 0;
+		per_cpu(centrino_model, cpu)->op_points[i].frequency
+							!= CPUFREQ_TABLE_END;
+	     i++) {
+		if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
+			return per_cpu(centrino_model, cpu)->
+							op_points[i].frequency;
 	}
 	if (failsafe)
-		return centrino_model[cpu]->op_points[i-1].frequency;
+		return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
 	else
 		return 0;
 }
@@ -347,7 +358,8 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
 	int i;
 
 	/* Only Intel makes Enhanced Speedstep-capable CPUs */
-	if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST))
+	if (cpu->x86_vendor != X86_VENDOR_INTEL ||
+	    !cpu_has(cpu, X86_FEATURE_EST))
 		return -ENODEV;
 
 	if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
@@ -361,9 +373,9 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
 			break;
 
 	if (i != N_IDS)
-		centrino_cpu[policy->cpu] = &cpu_ids[i];
+		per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
 
-	if (!centrino_cpu[policy->cpu]) {
+	if (!per_cpu(centrino_cpu, policy->cpu)) {
 		dprintk("found unsupported CPU with "
 		"Enhanced SpeedStep: send /proc/cpuinfo to "
 		MAINTAINER "\n");
@@ -386,23 +398,26 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
 		/* check to see if it stuck */
 		rdmsr(MSR_IA32_MISC_ENABLE, l, h);
 		if (!(l & (1<<16))) {
-			printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n");
+			printk(KERN_INFO PFX
+				"couldn't enable Enhanced SpeedStep\n");
 			return -ENODEV;
 		}
 	}
 
 	freq = get_cur_freq(policy->cpu);
-
-	policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */
+	policy->cpuinfo.transition_latency = 10000;
+						/* 10uS transition latency */
 	policy->cur = freq;
 
 	dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
 
-	ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points);
+	ret = cpufreq_frequency_table_cpuinfo(policy,
+		per_cpu(centrino_model, policy->cpu)->op_points);
 	if (ret)
 		return (ret);
 
-	cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu);
+	cpufreq_frequency_table_get_attr(
+		per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
 
 	return 0;
 }
@@ -411,12 +426,12 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
 {
 	unsigned int cpu = policy->cpu;
 
-	if (!centrino_model[cpu])
+	if (!per_cpu(centrino_model, cpu))
 		return -ENODEV;
 
 	cpufreq_frequency_table_put_attr(cpu);
 
-	centrino_model[cpu] = NULL;
+	per_cpu(centrino_model, cpu) = NULL;
 
 	return 0;
 }
@@ -430,17 +445,26 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
  */
 static int centrino_verify (struct cpufreq_policy *policy)
 {
-	return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points);
+	return cpufreq_frequency_table_verify(policy,
+			per_cpu(centrino_model, policy->cpu)->op_points);
 }
 
 /**
  * centrino_setpolicy - set a new CPUFreq policy
  * @policy: new policy
  * @target_freq: the target frequency
- * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
+ * @relation: how that frequency relates to achieved frequency
+ *	(CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
  *
  * Sets a new CPUFreq policy.
  */
+struct allmasks {
+	cpumask_t		online_policy_cpus;
+	cpumask_t		saved_mask;
+	cpumask_t		set_mask;
+	cpumask_t		covered_cpus;
+};
+
 static int centrino_target (struct cpufreq_policy *policy,
 			    unsigned int target_freq,
 			    unsigned int relation)
@@ -448,48 +472,55 @@ static int centrino_target (struct cpufreq_policy *policy,
 	unsigned int    newstate = 0;
 	unsigned int	msr, oldmsr = 0, h = 0, cpu = policy->cpu;
 	struct cpufreq_freqs	freqs;
-	cpumask_t		online_policy_cpus;
-	cpumask_t		saved_mask;
-	cpumask_t		set_mask;
-	cpumask_t		covered_cpus;
 	int			retval = 0;
 	unsigned int		j, k, first_cpu, tmp;
-
-	if (unlikely(centrino_model[cpu] == NULL))
-		return -ENODEV;
+	CPUMASK_ALLOC(allmasks);
+	CPUMASK_PTR(online_policy_cpus, allmasks);
+	CPUMASK_PTR(saved_mask, allmasks);
+	CPUMASK_PTR(set_mask, allmasks);
+	CPUMASK_PTR(covered_cpus, allmasks);
+
+	if (unlikely(allmasks == NULL))
+		return -ENOMEM;
+
+	if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
+		retval = -ENODEV;
+		goto out;
+	}
 
 	if (unlikely(cpufreq_frequency_table_target(policy,
-			centrino_model[cpu]->op_points,
+			per_cpu(centrino_model, cpu)->op_points,
 			target_freq,
 			relation,
 			&newstate))) {
-		return -EINVAL;
+		retval = -EINVAL;
+		goto out;
 	}
 
 #ifdef CONFIG_HOTPLUG_CPU
 	/* cpufreq holds the hotplug lock, so we are safe from here on */
-	cpus_and(online_policy_cpus, cpu_online_map, policy->cpus);
+	cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus);
 #else
-	online_policy_cpus = policy->cpus;
+	*online_policy_cpus = policy->cpus;
 #endif
 
-	saved_mask = current->cpus_allowed;
+	*saved_mask = current->cpus_allowed;
 	first_cpu = 1;
-	cpus_clear(covered_cpus);
-	for_each_cpu_mask(j, online_policy_cpus) {
+	cpus_clear(*covered_cpus);
+	for_each_cpu_mask_nr(j, *online_policy_cpus) {
 		/*
 		 * Support for SMP systems.
 		 * Make sure we are running on CPU that wants to change freq
 		 */
-		cpus_clear(set_mask);
+		cpus_clear(*set_mask);
 		if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
-			cpus_or(set_mask, set_mask, online_policy_cpus);
+			cpus_or(*set_mask, *set_mask, *online_policy_cpus);
 		else
-			cpu_set(j, set_mask);
+			cpu_set(j, *set_mask);
 
-		set_cpus_allowed_ptr(current, &set_mask);
+		set_cpus_allowed_ptr(current, set_mask);
 		preempt_disable();
-		if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) {
+		if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) {
 			dprintk("couldn't limit to CPUs in this domain\n");
 			retval = -EAGAIN;
 			if (first_cpu) {
@@ -500,7 +531,7 @@ static int centrino_target (struct cpufreq_policy *policy,
 			break;
 		}
 
-		msr = centrino_model[cpu]->op_points[newstate].index;
+		msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
 
 		if (first_cpu) {
 			rdmsr(MSR_IA32_PERF_CTL, oldmsr, h);
@@ -517,7 +548,7 @@ static int centrino_target (struct cpufreq_policy *policy,
 			dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
 				target_freq, freqs.old, freqs.new, msr);
 
-			for_each_cpu_mask(k, online_policy_cpus) {
+			for_each_cpu_mask_nr(k, *online_policy_cpus) {
 				freqs.cpu = k;
 				cpufreq_notify_transition(&freqs,
 					CPUFREQ_PRECHANGE);
@@ -536,11 +567,11 @@ static int centrino_target (struct cpufreq_policy *policy,
 			break;
 		}
 
-		cpu_set(j, covered_cpus);
+		cpu_set(j, *covered_cpus);
 		preempt_enable();
 	}
 
-	for_each_cpu_mask(k, online_policy_cpus) {
+	for_each_cpu_mask_nr(k, *online_policy_cpus) {
 		freqs.cpu = k;
 		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 	}
@@ -553,30 +584,32 @@ static int centrino_target (struct cpufreq_policy *policy,
 		 * Best effort undo..
 		 */
 
-		if (!cpus_empty(covered_cpus)) {
-			for_each_cpu_mask(j, covered_cpus) {
+		if (!cpus_empty(*covered_cpus))
+			for_each_cpu_mask_nr(j, *covered_cpus) {
 				set_cpus_allowed_ptr(current,
 						     &cpumask_of_cpu(j));
 				wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
 			}
-		}
 
 		tmp = freqs.new;
 		freqs.new = freqs.old;
 		freqs.old = tmp;
-		for_each_cpu_mask(j, online_policy_cpus) {
+		for_each_cpu_mask_nr(j, *online_policy_cpus) {
 			freqs.cpu = j;
 			cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 			cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 		}
 	}
-	set_cpus_allowed_ptr(current, &saved_mask);
-	return 0;
+	set_cpus_allowed_ptr(current, saved_mask);
+	retval = 0;
+	goto out;
 
 migrate_end:
 	preempt_enable();
-	set_cpus_allowed_ptr(current, &saved_mask);
-	return 0;
+	set_cpus_allowed_ptr(current, saved_mask);
+out:
+	CPUMASK_FREE(allmasks);
+	return retval;
 }
 
 static struct freq_attr* centrino_attr[] = {
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 1b50244b1fdf..04d0376b64b0 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -279,7 +279,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
 
 	cpus_allowed = current->cpus_allowed;
 
-	for_each_cpu_mask(i, policy->cpus) {
+	for_each_cpu_mask_nr(i, policy->cpus) {
 		freqs.cpu = i;
 		cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
 	}
@@ -292,7 +292,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
 	/* allow to be run on all CPUs */
 	set_cpus_allowed_ptr(current, &cpus_allowed);
 
-	for_each_cpu_mask(i, policy->cpus) {
+	for_each_cpu_mask_nr(i, policy->cpus) {
 		freqs.cpu = i;
 		cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 	}
@@ -431,7 +431,7 @@ static void __exit speedstep_exit(void)
 }
 
 
-MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>");
+MODULE_AUTHOR ("Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
 MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges.");
 MODULE_LICENSE ("GPL");
 
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 3fd7a67bb06a..ffd0f5ed071a 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -15,13 +15,11 @@
 /*
  * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU
  */
-static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
 {
 	unsigned char ccr2, ccr3;
-	unsigned long flags;
 
 	/* we test for DEVID by checking whether CCR3 is writable */
-	local_irq_save(flags);
 	ccr3 = getCx86(CX86_CCR3);
 	setCx86(CX86_CCR3, ccr3 ^ 0x80);
 	getCx86(0xc0);   /* dummy to change bus */
@@ -44,9 +42,16 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
 		*dir0 = getCx86(CX86_DIR0);
 		*dir1 = getCx86(CX86_DIR1);
 	}
-	local_irq_restore(flags);
 }
 
+static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__do_cyrix_devid(dir0, dir1);
+	local_irq_restore(flags);
+}
 /*
  * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in
  * order to identify the Cyrix CPU model after we're out of setup.c
@@ -116,7 +121,7 @@ static void __cpuinit set_cx86_reorder(void)
 	setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
 
 	/* Load/Store Serialize to mem access disable (=reorder it) */
-	setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80);
+	setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
 	/* set load/store serialize from 1GB to 4GB */
 	ccr3 |= 0xe0;
 	setCx86(CX86_CCR3, ccr3);
@@ -127,28 +132,11 @@ static void __cpuinit set_cx86_memwb(void)
 	printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
 
 	/* CCR2 bit 2: unlock NW bit */
-	setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
+	setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
 	/* set 'Not Write-through' */
 	write_cr0(read_cr0() | X86_CR0_NW);
 	/* CCR2 bit 2: lock NW bit and set WT1 */
-	setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14);
-}
-
-static void __cpuinit set_cx86_inc(void)
-{
-	unsigned char ccr3;
-
-	printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n");
-
-	ccr3 = getCx86(CX86_CCR3);
-	setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
-	/* PCR1 -- Performance Control */
-	/* Incrementor on, whatever that is */
-	setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02);
-	/* PCR0 -- Performance Control */
-	/* Incrementor Margin 10 */
-	setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04);
-	setCx86(CX86_CCR3, ccr3);	/* disable MAPEN */
+	setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
 }
 
 /*
@@ -162,23 +150,40 @@ static void __cpuinit geode_configure(void)
 	local_irq_save(flags);
 
 	/* Suspend on halt power saving and enable #SUSP pin */
-	setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
+	setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
 
 	ccr3 = getCx86(CX86_CCR3);
 	setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);	/* enable MAPEN */
 
 
 	/* FPU fast, DTE cache, Mem bypass */
-	setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
+	setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
 	setCx86(CX86_CCR3, ccr3);			/* disable MAPEN */
 
 	set_cx86_memwb();
 	set_cx86_reorder();
-	set_cx86_inc();
 
 	local_irq_restore(flags);
 }
 
+static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c)
+{
+	unsigned char dir0, dir0_msn, dir1 = 0;
+
+	__do_cyrix_devid(&dir0, &dir1);
+	dir0_msn = dir0 >> 4; /* identifies CPU "family"   */
+
+	switch (dir0_msn) {
+	case 3: /* 6x86/6x86L */
+		/* Emulate MTRRs using Cyrix's ARRs. */
+		set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
+		break;
+	case 5: /* 6x86MX/M II */
+		/* Emulate MTRRs using Cyrix's ARRs. */
+		set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
+		break;
+	}
+}
 
 static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
 {
@@ -286,7 +291,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
 		/* GXm supports extended cpuid levels 'ala' AMD */
 		if (c->cpuid_level == 2) {
 			/* Enable cxMMX extensions (GX1 Datasheet 54) */
-			setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
+			setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
 
 			/*
 			 * GXm : 0x30 ... 0x5f GXm  datasheet 51
@@ -296,7 +301,6 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
 			 */
 			if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
 				geode_configure();
-			get_model_name(c);  /* get CPU marketing name */
 			return;
 		} else { /* MediaGX */
 			Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
@@ -309,7 +313,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
 		if (dir1 > 7) {
 			dir0_msn++;  /* M II */
 			/* Enable MMX extensions (App note 108) */
-			setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
+			setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
 		} else {
 			c->coma_bug = 1;      /* 6x86MX, it has the bug. */
 		}
@@ -424,7 +428,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
 			local_irq_save(flags);
 			ccr3 = getCx86(CX86_CCR3);
 			setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);       /* enable MAPEN  */
-			setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80);  /* enable cpuid  */
+			setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80);  /* enable cpuid  */
 			setCx86(CX86_CCR3, ccr3);                       /* disable MAPEN */
 			local_irq_restore(flags);
 		}
@@ -434,16 +438,19 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
 static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
 	.c_vendor	= "Cyrix",
 	.c_ident	= { "CyrixInstead" },
+	.c_early_init	= early_init_cyrix,
 	.c_init		= init_cyrix,
 	.c_identify	= cyrix_identify,
+	.c_x86_vendor	= X86_VENDOR_CYRIX,
 };
 
-cpu_vendor_dev_register(X86_VENDOR_CYRIX, &cyrix_cpu_dev);
+cpu_dev_register(cyrix_cpu_dev);
 
 static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
 	.c_vendor	= "NSC",
 	.c_ident	= { "Geode by NSC" },
 	.c_init		= init_nsc,
+	.c_x86_vendor	= X86_VENDOR_NSC,
 };
 
-cpu_vendor_dev_register(X86_VENDOR_NSC, &nsc_cpu_dev);
+cpu_dev_register(nsc_cpu_dev);
diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c
deleted file mode 100644
index e43ad4ad4cba..000000000000
--- a/arch/x86/kernel/cpu/feature_names.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Strings for the various x86 capability flags.
- *
- * This file must not contain any executable code.
- */
-
-#include <asm/cpufeature.h>
-
-/*
- * These flag bits must match the definitions in <asm/cpufeature.h>.
- * NULL means this bit is undefined or reserved; either way it doesn't
- * have meaning as far as Linux is concerned.  Note that it's important
- * to realize there is a difference between this table and CPUID -- if
- * applications want to get the raw CPUID data, they should access
- * /dev/cpu/<cpu_nr>/cpuid instead.
- */
-const char * const x86_cap_flags[NCAPINTS*32] = {
-	/* Intel-defined */
-	"fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
-	"cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
-	"pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
-	"fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
-
-	/* AMD-defined */
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
-	NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
-	"3dnowext", "3dnow",
-
-	/* Transmeta-defined */
-	"recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-	/* Other (Linux-defined) */
-	"cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
-	NULL, NULL, NULL, NULL,
-	"constant_tsc", "up", NULL, "arch_perfmon",
-	"pebs", "bts", NULL, NULL,
-	"rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-	/* Intel-defined (#2) */
-	"pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
-	"tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
-	NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-	/* VIA/Cyrix/Centaur-defined */
-	NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
-	"ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-	/* AMD-defined (#2) */
-	"lahf_lm", "cmp_legacy", "svm", "extapic",
-	"cr8_legacy", "abm", "sse4a", "misalignsse",
-	"3dnowprefetch", "osvw", "ibs", "sse5",
-	"skinit", "wdt", NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-
-	/* Auxiliary (Linux-defined) */
-	"ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-	NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
-};
-
-const char *const x86_power_flags[32] = {
-	"ts",	/* temperature sensor */
-	"fid",  /* frequency id control */
-	"vid",  /* voltage id control */
-	"ttp",  /* thermal trip */
-	"tm",
-	"stc",
-	"100mhzsteps",
-	"hwpstate",
-	"",	/* tsc invariant mapped to constant_tsc */
-		/* nothing */
-};
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 70609efdf1da..cce0b6118d55 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -15,6 +15,11 @@
 #include <asm/ds.h>
 #include <asm/bugs.h>
 
+#ifdef CONFIG_X86_64
+#include <asm/topology.h>
+#include <asm/numa_64.h>
+#endif
+
 #include "cpu.h"
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -23,23 +28,22 @@
 #include <mach_apic.h>
 #endif
 
-#ifdef CONFIG_X86_INTEL_USERCOPY
-/*
- * Alignment at which movsl is preferred for bulk memory copies.
- */
-struct movsl_mask movsl_mask __read_mostly;
-#endif
-
 static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
 {
-	/* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
-	if (c->x86 == 15 && c->x86_cache_alignment == 64)
-		c->x86_cache_alignment = 128;
 	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
 		(c->x86 == 0x6 && c->x86_model >= 0x0e))
 		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
+#ifdef CONFIG_X86_64
+	set_cpu_cap(c, X86_FEATURE_SYSENTER32);
+#else
+	/* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
+	if (c->x86 == 15 && c->x86_cache_alignment == 64)
+		c->x86_cache_alignment = 128;
+#endif
 }
 
+#ifdef CONFIG_X86_32
 /*
  *	Early probe support logic for ppro memory erratum #50
  *
@@ -59,15 +63,54 @@ int __cpuinit ppro_with_ram_bug(void)
 	return 0;
 }
 
+#ifdef CONFIG_X86_F00F_BUG
+static void __cpuinit trap_init_f00f_bug(void)
+{
+	__set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
 
-/*
- * P4 Xeon errata 037 workaround.
- * Hardware prefetcher may cause stale data to be loaded into the cache.
- */
-static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
+	/*
+	 * Update the IDT descriptor and reload the IDT so that
+	 * it uses the read-only mapped virtual address.
+	 */
+	idt_descr.address = fix_to_virt(FIX_F00F_IDT);
+	load_idt(&idt_descr);
+}
+#endif
+
+static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 {
 	unsigned long lo, hi;
 
+#ifdef CONFIG_X86_F00F_BUG
+	/*
+	 * All current models of Pentium and Pentium with MMX technology CPUs
+	 * have the F0 0F bug, which lets nonprivileged users lock up the system.
+	 * Note that the workaround only should be initialized once...
+	 */
+	c->f00f_bug = 0;
+	if (!paravirt_enabled() && c->x86 == 5) {
+		static int f00f_workaround_enabled;
+
+		c->f00f_bug = 1;
+		if (!f00f_workaround_enabled) {
+			trap_init_f00f_bug();
+			printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
+			f00f_workaround_enabled = 1;
+		}
+	}
+#endif
+
+	/*
+	 * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
+	 * model 3 mask 3
+	 */
+	if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
+		clear_cpu_cap(c, X86_FEATURE_SEP);
+
+	/*
+	 * P4 Xeon errata 037 workaround.
+	 * Hardware prefetcher may cause stale data to be loaded into the cache.
+	 */
 	if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
 		rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
 		if ((lo & (1<<9)) == 0) {
@@ -77,13 +120,68 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
 			wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
 		}
 	}
+
+	/*
+	 * See if we have a good local APIC by checking for buggy Pentia,
+	 * i.e. all B steppings and the C2 stepping of P54C when using their
+	 * integrated APIC (see 11AP erratum in "Pentium Processor
+	 * Specification Update").
+	 */
+	if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
+	    (c->x86_mask < 0x6 || c->x86_mask == 0xb))
+		set_cpu_cap(c, X86_FEATURE_11AP);
+
+
+#ifdef CONFIG_X86_INTEL_USERCOPY
+	/*
+	 * Set up the preferred alignment for movsl bulk memory moves
+	 */
+	switch (c->x86) {
+	case 4:		/* 486: untested */
+		break;
+	case 5:		/* Old Pentia: untested */
+		break;
+	case 6:		/* PII/PIII only like movsl with 8-byte alignment */
+		movsl_mask.mask = 7;
+		break;
+	case 15:	/* P4 is OK down to 8-byte alignment */
+		movsl_mask.mask = 7;
+		break;
+	}
+#endif
+
+#ifdef CONFIG_X86_NUMAQ
+	numaq_tsc_disable();
+#endif
 }
+#else
+static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
+{
+}
+#endif
 
+static void __cpuinit srat_detect_node(void)
+{
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
+	unsigned node;
+	int cpu = smp_processor_id();
+	int apicid = hard_smp_processor_id();
+
+	/* Don't do the funky fallback heuristics the AMD version employs
+	   for now. */
+	node = apicid_to_node[apicid];
+	if (node == NUMA_NO_NODE || !node_online(node))
+		node = first_node(node_online_map);
+	numa_set_node(cpu, node);
+
+	printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
+#endif
+}
 
 /*
  * find out the number of processor cores on the die
  */
-static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c)
+static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
 {
 	unsigned int eax, ebx, ecx, edx;
 
@@ -98,45 +196,51 @@ static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c)
 		return 1;
 }
 
-#ifdef CONFIG_X86_F00F_BUG
-static void __cpuinit trap_init_f00f_bug(void)
+static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
 {
-	__set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
-
-	/*
-	 * Update the IDT descriptor and reload the IDT so that
-	 * it uses the read-only mapped virtual address.
-	 */
-	idt_descr.address = fix_to_virt(FIX_F00F_IDT);
-	load_idt(&idt_descr);
+	/* Intel VMX MSR indicated features */
+#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW	0x00200000
+#define X86_VMX_FEATURE_PROC_CTLS_VNMI		0x00400000
+#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS	0x80000000
+#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC	0x00000001
+#define X86_VMX_FEATURE_PROC_CTLS2_EPT		0x00000002
+#define X86_VMX_FEATURE_PROC_CTLS2_VPID		0x00000020
+
+	u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
+
+	clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+	clear_cpu_cap(c, X86_FEATURE_VNMI);
+	clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+	clear_cpu_cap(c, X86_FEATURE_EPT);
+	clear_cpu_cap(c, X86_FEATURE_VPID);
+
+	rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
+	msr_ctl = vmx_msr_high | vmx_msr_low;
+	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
+		set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
+	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
+		set_cpu_cap(c, X86_FEATURE_VNMI);
+	if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
+		rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
+		      vmx_msr_low, vmx_msr_high);
+		msr_ctl2 = vmx_msr_high | vmx_msr_low;
+		if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
+		    (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
+			set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
+		if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
+			set_cpu_cap(c, X86_FEATURE_EPT);
+		if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
+			set_cpu_cap(c, X86_FEATURE_VPID);
+	}
 }
-#endif
 
 static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 {
 	unsigned int l2 = 0;
-	char *p = NULL;
 
 	early_init_intel(c);
 
-#ifdef CONFIG_X86_F00F_BUG
-	/*
-	 * All current models of Pentium and Pentium with MMX technology CPUs
-	 * have the F0 0F bug, which lets nonprivileged users lock up the system.
-	 * Note that the workaround only should be initialized once...
-	 */
-	c->f00f_bug = 0;
-	if (!paravirt_enabled() && c->x86 == 5) {
-		static int f00f_workaround_enabled;
-
-		c->f00f_bug = 1;
-		if (!f00f_workaround_enabled) {
-			trap_init_f00f_bug();
-			printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
-			f00f_workaround_enabled = 1;
-		}
-	}
-#endif
+	intel_workarounds(c);
 
 	l2 = init_intel_cacheinfo(c);
 	if (c->cpuid_level > 9) {
@@ -146,16 +250,32 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
 	}
 
-	/* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */
-	if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
-		clear_cpu_cap(c, X86_FEATURE_SEP);
+	if (cpu_has_xmm2)
+		set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+	if (cpu_has_ds) {
+		unsigned int l1;
+		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
+		if (!(l1 & (1<<11)))
+			set_cpu_cap(c, X86_FEATURE_BTS);
+		if (!(l1 & (1<<12)))
+			set_cpu_cap(c, X86_FEATURE_PEBS);
+		ds_init_intel(c);
+	}
 
+#ifdef CONFIG_X86_64
+	if (c->x86 == 15)
+		c->x86_cache_alignment = c->x86_clflush_size * 2;
+	if (c->x86 == 6)
+		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+#else
 	/*
 	 * Names for the Pentium II/Celeron processors
 	 * detectable only by also checking the cache size.
 	 * Dixon is NOT a Celeron.
 	 */
 	if (c->x86 == 6) {
+		char *p = NULL;
+
 		switch (c->x86_model) {
 		case 5:
 			if (c->x86_mask == 0) {
@@ -178,60 +298,41 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 				p = "Celeron (Coppermine)";
 			break;
 		}
-	}
 
-	if (p)
-		strcpy(c->x86_model_id, p);
-
-	c->x86_max_cores = num_cpu_cores(c);
-
-	detect_ht(c);
-
-	/* Work around errata */
-	Intel_errata_workarounds(c);
-
-#ifdef CONFIG_X86_INTEL_USERCOPY
-	/*
-	 * Set up the preferred alignment for movsl bulk memory moves
-	 */
-	switch (c->x86) {
-	case 4:		/* 486: untested */
-		break;
-	case 5:		/* Old Pentia: untested */
-		break;
-	case 6:		/* PII/PIII only like movsl with 8-byte alignment */
-		movsl_mask.mask = 7;
-		break;
-	case 15:	/* P4 is OK down to 8-byte alignment */
-		movsl_mask.mask = 7;
-		break;
+		if (p)
+			strcpy(c->x86_model_id, p);
 	}
-#endif
 
-	if (cpu_has_xmm2)
-		set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-	if (c->x86 == 15) {
+	if (c->x86 == 15)
 		set_cpu_cap(c, X86_FEATURE_P4);
-	}
 	if (c->x86 == 6)
 		set_cpu_cap(c, X86_FEATURE_P3);
-	if (cpu_has_ds) {
-		unsigned int l1;
-		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
-		if (!(l1 & (1<<11)))
-			set_cpu_cap(c, X86_FEATURE_BTS);
-		if (!(l1 & (1<<12)))
-			set_cpu_cap(c, X86_FEATURE_PEBS);
-	}
 
 	if (cpu_has_bts)
-		ds_init_intel(c);
+		ptrace_bts_init_intel(c);
 
-#ifdef CONFIG_X86_NUMAQ
-	numaq_tsc_disable();
 #endif
+
+	detect_extended_topology(c);
+	if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
+		/*
+		 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
+		 * detection.
+		 */
+		c->x86_max_cores = intel_num_cpu_cores(c);
+#ifdef CONFIG_X86_32
+		detect_ht(c);
+#endif
+	}
+
+	/* Work around errata */
+	srat_detect_node();
+
+	if (cpu_has(c, X86_FEATURE_VMX))
+		detect_vmx_virtcap(c);
 }
 
+#ifdef CONFIG_X86_32
 static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
 {
 	/*
@@ -244,10 +345,12 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
 		size = 256;
 	return size;
 }
+#endif
 
 static struct cpu_dev intel_cpu_dev __cpuinitdata = {
 	.c_vendor	= "Intel",
 	.c_ident	= { "GenuineIntel" },
+#ifdef CONFIG_X86_32
 	.c_models = {
 		{ .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
 		  {
@@ -297,76 +400,12 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = {
 		  }
 		},
 	},
+	.c_size_cache	= intel_size_cache,
+#endif
 	.c_early_init   = early_init_intel,
 	.c_init		= init_intel,
-	.c_size_cache	= intel_size_cache,
+	.c_x86_vendor	= X86_VENDOR_INTEL,
 };
 
-cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
-
-#ifndef CONFIG_X86_CMPXCHG
-unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
-{
-	u8 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u8 *)ptr;
-	if (prev == old)
-		*(u8 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u8);
-
-unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
-{
-	u16 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u16 *)ptr;
-	if (prev == old)
-		*(u16 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u16);
-
-unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
-{
-	u32 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg for 386. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u32 *)ptr;
-	if (prev == old)
-		*(u32 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u32);
-#endif
-
-#ifndef CONFIG_X86_CMPXCHG64
-unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
-{
-	u64 prev;
-	unsigned long flags;
-
-	/* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
-	local_irq_save(flags);
-	prev = *(u64 *)ptr;
-	if (prev == old)
-		*(u64 *)ptr = new;
-	local_irq_restore(flags);
-	return prev;
-}
-EXPORT_SYMBOL(cmpxchg_486_u64);
-#endif
-
-/* arch_initcall(intel_cpu_init); */
+cpu_dev_register(intel_cpu_dev);
 
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
deleted file mode 100644
index 1019c58d39f0..000000000000
--- a/arch/x86/kernel/cpu/intel_64.c
+++ /dev/null
@@ -1,95 +0,0 @@
-#include <linux/init.h>
-#include <linux/smp.h>
-#include <asm/processor.h>
-#include <asm/ptrace.h>
-#include <asm/topology.h>
-#include <asm/numa_64.h>
-
-#include "cpu.h"
-
-static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
-{
-	if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
-	    (c->x86 == 0x6 && c->x86_model >= 0x0e))
-		set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
-
-	set_cpu_cap(c, X86_FEATURE_SYSENTER32);
-}
-
-/*
- * find out the number of processor cores on the die
- */
-static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
-{
-	unsigned int eax, t;
-
-	if (c->cpuid_level < 4)
-		return 1;
-
-	cpuid_count(4, 0, &eax, &t, &t, &t);
-
-	if (eax & 0x1f)
-		return ((eax >> 26) + 1);
-	else
-		return 1;
-}
-
-static void __cpuinit srat_detect_node(void)
-{
-#ifdef CONFIG_NUMA
-	unsigned node;
-	int cpu = smp_processor_id();
-	int apicid = hard_smp_processor_id();
-
-	/* Don't do the funky fallback heuristics the AMD version employs
-	   for now. */
-	node = apicid_to_node[apicid];
-	if (node == NUMA_NO_NODE || !node_online(node))
-		node = first_node(node_online_map);
-	numa_set_node(cpu, node);
-
-	printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
-#endif
-}
-
-static void __cpuinit init_intel(struct cpuinfo_x86 *c)
-{
-	init_intel_cacheinfo(c);
-	if (c->cpuid_level > 9) {
-		unsigned eax = cpuid_eax(10);
-		/* Check for version and the number of counters */
-		if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
-			set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
-	}
-
-	if (cpu_has_ds) {
-		unsigned int l1, l2;
-		rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
-		if (!(l1 & (1<<11)))
-			set_cpu_cap(c, X86_FEATURE_BTS);
-		if (!(l1 & (1<<12)))
-			set_cpu_cap(c, X86_FEATURE_PEBS);
-	}
-
-
-	if (cpu_has_bts)
-		ds_init_intel(c);
-
-	if (c->x86 == 15)
-		c->x86_cache_alignment = c->x86_clflush_size * 2;
-	if (c->x86 == 6)
-		set_cpu_cap(c, X86_FEATURE_REP_GOOD);
-	set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
-	c->x86_max_cores = intel_num_cpu_cores(c);
-
-	srat_detect_node();
-}
-
-static struct cpu_dev intel_cpu_dev __cpuinitdata = {
-	.c_vendor	= "Intel",
-	.c_ident	= { "GenuineIntel" },
-	.c_early_init   = early_init_intel,
-	.c_init		= init_intel,
-};
-cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
-
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 2c8afafa18e8..3f46afbb1cf1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -1,8 +1,8 @@
 /*
- *      Routines to indentify caches on Intel CPU.
+ *	Routines to indentify caches on Intel CPU.
  *
- *      Changes:
- *      Venkatesh Pallipadi	: Adding cache identification through cpuid(4)
+ *	Changes:
+ *	Venkatesh Pallipadi	: Adding cache identification through cpuid(4)
  *		Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
  *	Andi Kleen / Andreas Herrmann	: CPUID4 emulation on AMD.
  */
@@ -13,6 +13,7 @@
 #include <linux/compiler.h>
 #include <linux/cpu.h>
 #include <linux/sched.h>
+#include <linux/pci.h>
 
 #include <asm/processor.h>
 #include <asm/smp.h>
@@ -130,9 +131,18 @@ struct _cpuid4_info {
 	union _cpuid4_leaf_ebx ebx;
 	union _cpuid4_leaf_ecx ecx;
 	unsigned long size;
+	unsigned long can_disable;
 	cpumask_t shared_cpu_map;	/* future?: only cpus/node is needed */
 };
 
+#ifdef CONFIG_PCI
+static struct pci_device_id k8_nb_id[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
+	{}
+};
+#endif
+
 unsigned short			num_cache_leaves;
 
 /* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -182,9 +192,10 @@ static unsigned short assocs[] __cpuinitdata = {
 static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 };
 static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 };
 
-static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
-		       union _cpuid4_leaf_ebx *ebx,
-		       union _cpuid4_leaf_ecx *ecx)
+static void __cpuinit
+amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
+		     union _cpuid4_leaf_ebx *ebx,
+		     union _cpuid4_leaf_ecx *ecx)
 {
 	unsigned dummy;
 	unsigned line_size, lines_per_tag, assoc, size_in_kb;
@@ -251,27 +262,40 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
 		(ebx->split.ways_of_associativity + 1) - 1;
 }
 
-static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
+static void __cpuinit
+amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
+{
+	if (index < 3)
+		return;
+	this_leaf->can_disable = 1;
+}
+
+static int
+__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
 {
 	union _cpuid4_leaf_eax 	eax;
 	union _cpuid4_leaf_ebx 	ebx;
 	union _cpuid4_leaf_ecx 	ecx;
 	unsigned		edx;
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
 		amd_cpuid4(index, &eax, &ebx, &ecx);
-	else
-		cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full,  &edx);
+		if (boot_cpu_data.x86 >= 0x10)
+			amd_check_l3_disable(index, this_leaf);
+	} else {
+		cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
+	}
+
 	if (eax.split.type == CACHE_TYPE_NULL)
 		return -EIO; /* better error ? */
 
 	this_leaf->eax = eax;
 	this_leaf->ebx = ebx;
 	this_leaf->ecx = ecx;
-	this_leaf->size = (ecx.split.number_of_sets + 1) *
-		(ebx.split.coherency_line_size + 1) *
-		(ebx.split.physical_line_partition + 1) *
-		(ebx.split.ways_of_associativity + 1);
+	this_leaf->size = (ecx.split.number_of_sets          + 1) *
+			  (ebx.split.coherency_line_size     + 1) *
+			  (ebx.split.physical_line_partition + 1) *
+			  (ebx.split.ways_of_associativity   + 1);
 	return 0;
 }
 
@@ -453,7 +477,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
 
 /* pointer to _cpuid4_info array (for each cache leaf) */
 static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
-#define CPUID4_INFO_IDX(x, y)    (&((per_cpu(cpuid4_info, x))[y]))
+#define CPUID4_INFO_IDX(x, y)	(&((per_cpu(cpuid4_info, x))[y]))
 
 #ifdef CONFIG_SMP
 static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
@@ -489,8 +513,8 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
 	int sibling;
 
 	this_leaf = CPUID4_INFO_IDX(cpu, index);
-	for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) {
-		sibling_leaf = CPUID4_INFO_IDX(sibling, index);	
+	for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
+		sibling_leaf = CPUID4_INFO_IDX(sibling, index);
 		cpu_clear(cpu, sibling_leaf->shared_cpu_map);
 	}
 }
@@ -572,7 +596,7 @@ struct _index_kobject {
 
 /* pointer to array of kobjects for cpuX/cache/indexY */
 static DEFINE_PER_CPU(struct _index_kobject *, index_kobject);
-#define INDEX_KOBJECT_PTR(x, y)    (&((per_cpu(index_kobject, x))[y]))
+#define INDEX_KOBJECT_PTR(x, y)		(&((per_cpu(index_kobject, x))[y]))
 
 #define show_one_plus(file_name, object, val)				\
 static ssize_t show_##file_name						\
@@ -637,6 +661,99 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
 	}
 }
 
+#define to_object(k)	container_of(k, struct _index_kobject, kobj)
+#define to_attr(a)	container_of(a, struct _cache_attr, attr)
+
+#ifdef CONFIG_PCI
+static struct pci_dev *get_k8_northbridge(int node)
+{
+	struct pci_dev *dev = NULL;
+	int i;
+
+	for (i = 0; i <= node; i++) {
+		do {
+			dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
+			if (!dev)
+				break;
+		} while (!pci_match_id(&k8_nb_id[0], dev));
+		if (!dev)
+			break;
+	}
+	return dev;
+}
+#else
+static struct pci_dev *get_k8_northbridge(int node)
+{
+	return NULL;
+}
+#endif
+
+static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
+{
+	int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
+	struct pci_dev *dev = NULL;
+	ssize_t ret = 0;
+	int i;
+
+	if (!this_leaf->can_disable)
+		return sprintf(buf, "Feature not enabled\n");
+
+	dev = get_k8_northbridge(node);
+	if (!dev) {
+		printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < 2; i++) {
+		unsigned int reg;
+
+		pci_read_config_dword(dev, 0x1BC + i * 4, &reg);
+
+		ret += sprintf(buf, "%sEntry: %d\n", buf, i);
+		ret += sprintf(buf, "%sReads:  %s\tNew Entries: %s\n",  
+			buf,
+			reg & 0x80000000 ? "Disabled" : "Allowed",
+			reg & 0x40000000 ? "Disabled" : "Allowed");
+		ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
+			buf, (reg & 0x30000) >> 16, reg & 0xfff);
+	}
+	return ret;
+}
+
+static ssize_t
+store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
+		    size_t count)
+{
+	int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
+	struct pci_dev *dev = NULL;
+	unsigned int ret, index, val;
+
+	if (!this_leaf->can_disable)
+		return 0;
+
+	if (strlen(buf) > 15)
+		return -EINVAL;
+
+	ret = sscanf(buf, "%x %x", &index, &val);
+	if (ret != 2)
+		return -EINVAL;
+	if (index > 1)
+		return -EINVAL;
+
+	val |= 0xc0000000;
+	dev = get_k8_northbridge(node);
+	if (!dev) {
+		printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
+		return -EINVAL;
+	}
+
+	pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
+	wbinvd();
+	pci_write_config_dword(dev, 0x1BC + index * 4, val);
+
+	return 1;
+}
+
 struct _cache_attr {
 	struct attribute attr;
 	ssize_t (*show)(struct _cpuid4_info *, char *);
@@ -657,6 +774,8 @@ define_one_ro(size);
 define_one_ro(shared_cpu_map);
 define_one_ro(shared_cpu_list);
 
+static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable);
+
 static struct attribute * default_attrs[] = {
 	&type.attr,
 	&level.attr,
@@ -667,12 +786,10 @@ static struct attribute * default_attrs[] = {
 	&size.attr,
 	&shared_cpu_map.attr,
 	&shared_cpu_list.attr,
+	&cache_disable.attr,
 	NULL
 };
 
-#define to_object(k) container_of(k, struct _index_kobject, kobj)
-#define to_attr(a) container_of(a, struct _cache_attr, attr)
-
 static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
 {
 	struct _cache_attr *fattr = to_attr(attr);
@@ -682,14 +799,22 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
 	ret = fattr->show ?
 		fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
 			buf) :
-	       	0;
+		0;
 	return ret;
 }
 
 static ssize_t store(struct kobject * kobj, struct attribute * attr,
 		     const char * buf, size_t count)
 {
-	return 0;
+	struct _cache_attr *fattr = to_attr(attr);
+	struct _index_kobject *this_leaf = to_object(kobj);
+	ssize_t ret;
+
+	ret = fattr->store ?
+		fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
+			buf, count) :
+		0;
+	return ret;
 }
 
 static struct sysfs_ops sysfs_ops = {
@@ -780,15 +905,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
 			}
 			kobject_put(per_cpu(cache_kobject, cpu));
 			cpuid4_cache_sysfs_exit(cpu);
-			break;
+			return retval;
 		}
 		kobject_uevent(&(this_object->kobj), KOBJ_ADD);
 	}
-	if (!retval)
-		cpu_set(cpu, cache_dev_map);
+	cpu_set(cpu, cache_dev_map);
 
 	kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
-	return retval;
+	return 0;
 }
 
 static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index f390c9f66351..dd3af6e7b39a 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -1,6 +1,6 @@
 /*
- * Athlon/Hammer specific Machine Check Exception Reporting
- * (C) Copyright 2002 Dave Jones <davej@codemonkey.org.uk>
+ * Athlon specific Machine Check Exception Reporting
+ * (C) Copyright 2002 Dave Jones <davej@redhat.com>
  */
 
 #include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index 774d87cfd8cd..0ebf3fc6a610 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -1,6 +1,6 @@
 /*
  * mce.c - x86 Machine Check Exception Reporting
- * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk>
+ * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@redhat.com>
  */
 
 #include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index c4a7ec31394c..4b031a4ac856 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -580,7 +580,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
 	char __user *buf = ubuf;
 	int i, err;
 
-	cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL);
+	cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
 	if (!cpu_tsc)
 		return -ENOMEM;
 
@@ -759,13 +759,18 @@ static struct sysdev_class mce_sysclass = {
 };
 
 DEFINE_PER_CPU(struct sys_device, device_mce);
+void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
 
 /* Why are there no generic functions for this? */
 #define ACCESSOR(name, var, start) \
-	static ssize_t show_ ## name(struct sys_device *s, char *buf) {	\
+	static ssize_t show_ ## name(struct sys_device *s,		\
+				     struct sysdev_attribute *attr,	\
+				     char *buf) {			\
 		return sprintf(buf, "%lx\n", (unsigned long)var);	\
 	}								\
-	static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \
+	static ssize_t set_ ## name(struct sys_device *s,		\
+				    struct sysdev_attribute *attr,	\
+				    const char *buf, size_t siz) {	\
 		char *end;						\
 		unsigned long new = simple_strtoul(buf, &end, 0);	\
 		if (end == buf) return -EINVAL;				\
@@ -786,14 +791,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart())
 ACCESSOR(bank4ctl,bank[4],mce_restart())
 ACCESSOR(bank5ctl,bank[5],mce_restart())
 
-static ssize_t show_trigger(struct sys_device *s, char *buf)
+static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+				char *buf)
 {
 	strcpy(buf, trigger);
 	strcat(buf, "\n");
 	return strlen(trigger) + 1;
 }
 
-static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
+static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
+				const char *buf,size_t siz)
 {
 	char *p;
 	int len;
@@ -806,12 +813,12 @@ static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
 }
 
 static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
-ACCESSOR(tolerant,tolerant,)
+static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
 ACCESSOR(check_interval,check_interval,mce_restart())
 static struct sysdev_attribute *mce_attributes[] = {
 	&attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
 	&attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
-	&attr_tolerant, &attr_check_interval, &attr_trigger,
+	&attr_tolerant.attr, &attr_check_interval, &attr_trigger,
 	NULL
 };
 
@@ -853,7 +860,7 @@ error:
 	return err;
 }
 
-static void mce_remove_device(unsigned int cpu)
+static __cpuinit void mce_remove_device(unsigned int cpu)
 {
 	int i;
 
@@ -877,9 +884,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 		mce_create_device(cpu);
+		if (threshold_cpu_callback)
+			threshold_cpu_callback(action, cpu);
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
+		if (threshold_cpu_callback)
+			threshold_cpu_callback(action, cpu);
 		mce_remove_device(cpu);
 		break;
 	}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 7c9a813e1193..5eb390a4b2e9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -527,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
 	if (err)
 		goto out_free;
 
-	for_each_cpu_mask(i, b->cpus) {
+	for_each_cpu_mask_nr(i, b->cpus) {
 		if (i == cpu)
 			continue;
 
@@ -617,7 +617,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 #endif
 
 	/* remove all sibling symlinks before unregistering */
-	for_each_cpu_mask(i, b->cpus) {
+	for_each_cpu_mask_nr(i, b->cpus) {
 		if (i == cpu)
 			continue;
 
@@ -628,6 +628,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
 	deallocate_threshold_block(cpu, bank);
 
 free_out:
+	kobject_del(b->kobj);
 	kobject_put(b->kobj);
 	kfree(b);
 	per_cpu(threshold_banks, cpu)[bank] = NULL;
@@ -645,14 +646,11 @@ static void threshold_remove_device(unsigned int cpu)
 }
 
 /* get notified when a cpu comes on/off */
-static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
-					    unsigned long action, void *hcpu)
+static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action,
+						     unsigned int cpu)
 {
-	/* cpu was unsigned int to begin with */
-	unsigned int cpu = (unsigned long)hcpu;
-
 	if (cpu >= NR_CPUS)
-		goto out;
+		return;
 
 	switch (action) {
 	case CPU_ONLINE:
@@ -666,14 +664,8 @@ static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
 	default:
 		break;
 	}
-      out:
-	return NOTIFY_OK;
 }
 
-static struct notifier_block threshold_cpu_notifier __cpuinitdata = {
-	.notifier_call = threshold_cpu_callback,
-};
-
 static __init int threshold_init_device(void)
 {
 	unsigned lcpu = 0;
@@ -684,7 +676,7 @@ static __init int threshold_init_device(void)
 		if (err)
 			return err;
 	}
-	register_hotcpu_notifier(&threshold_cpu_notifier);
+	threshold_cpu_callback = amd_64_threshold_cpu_callback;
 	return 0;
 }
 
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index cc1fccdd31e0..a74af128efc9 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -1,7 +1,7 @@
 /*
  * Non Fatal Machine Check Exception Reporting
  *
- * (C) Copyright 2002 Dave Jones. <davej@codemonkey.org.uk>
+ * (C) Copyright 2002 Dave Jones. <davej@redhat.com>
  *
  * This file contains routines to check for non-fatal MCEs every 15s
  *
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index eef001ad3bde..9b60fce09f75 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	/* The temperature transition interrupt handler setup */
 	h = THERMAL_APIC_VECTOR;		/* our delivery vector */
 	h |= (APIC_DM_FIXED | APIC_LVT_MASKED);	/* we'll mask till we're ready */
-	apic_write_around(APIC_LVTTHMR, h);
+	apic_write(APIC_LVTTHMR, h);
 
 	rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
 	wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
@@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
 	wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
 
 	l = apic_read(APIC_LVTTHMR);
-	apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
+	apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
 	printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
 
 	/* enable thermal throttle processing */
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 1f4cc48c14c6..d5ae2243f0b9 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0);
 
 #define define_therm_throt_sysdev_show_func(name)                            \
 static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev,        \
+					struct sysdev_attribute *attr,	     \
                                               char *buf)                     \
 {                                                                            \
 	unsigned int cpu = dev->id;                                          \
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
new file mode 100644
index 000000000000..dfea390e1608
--- /dev/null
+++ b/arch/x86/kernel/cpu/mkcapflags.pl
@@ -0,0 +1,32 @@
+#!/usr/bin/perl
+#
+# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
+#
+
+($in, $out) = @ARGV;
+
+open(IN, "< $in\0")   or die "$0: cannot open: $in: $!\n";
+open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
+
+print OUT "#include <asm/cpufeature.h>\n\n";
+print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
+
+while (defined($line = <IN>)) {
+	if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
+		$macro = $1;
+		$feature = $2;
+		$tail = $3;
+		if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
+			$feature = $1;
+		}
+
+		if ($feature ne '') {
+			printf OUT "\t%-32s = \"%s\",\n",
+				"[$macro]", "\L$feature";
+		}
+	}
+}
+print OUT "};\n";
+
+close(IN);
+close(OUT);
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 509bd3d9eacd..4e8d77f01eeb 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -379,6 +379,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
 			     unsigned long *size, mtrr_type *type)
 {
 	unsigned int mask_lo, mask_hi, base_lo, base_hi;
+	unsigned int tmp, hi;
 
 	rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
 	if ((mask_lo & 0x800) == 0) {
@@ -392,8 +393,18 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
 	rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
 
 	/* Work out the shifted address mask. */
-	mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT)
-	    | mask_lo >> PAGE_SHIFT;
+	tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
+	mask_lo = size_or_mask | tmp;
+	/* Expand tmp with high bits to all 1s*/
+	hi = fls(tmp);
+	if (hi > 0) {
+		tmp |= ~((1<<(hi - 1)) - 1);
+
+		if (tmp != mask_lo) {
+			WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
+			mask_lo = tmp;
+		}
+	}
 
 	/* This works correctly if size is a power of two, i.e. a
 	   contiguous range. */
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 84c480bb3715..4c4214690dd1 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
 			}
 			/* RED-PEN: base can be > 32bit */ 
 			len += seq_printf(seq, 
-				   "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n",
+				   "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
 			     i, base, base >> (20 - PAGE_SHIFT), size, factor,
-			     mtrr_attrib_to_str(type), mtrr_usage_table[i]);
+			     mtrr_usage_table[i], mtrr_attrib_to_str(type));
 		}
 	}
 	return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 6f23969c8faf..c78c04821ea1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -729,7 +729,7 @@ struct var_mtrr_range_state {
 	mtrr_type type;
 };
 
-struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
+static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
 static int __initdata debug_print;
 
 static int __init
@@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
 	/* take out UC ranges */
 	for (i = 0; i < num_var_ranges; i++) {
 		type = range_state[i].type;
-		if (type != MTRR_TYPE_UNCACHABLE)
+		if (type != MTRR_TYPE_UNCACHABLE &&
+		    type != MTRR_TYPE_WRPROT)
 			continue;
 		size = range_state[i].size_pfn;
 		if (!size)
@@ -834,7 +835,14 @@ static int __init enable_mtrr_cleanup_setup(char *str)
 		enable_mtrr_cleanup = 1;
 	return 0;
 }
-early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup);
+early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
+
+static int __init mtrr_cleanup_debug_setup(char *str)
+{
+	debug_print = 1;
+	return 0;
+}
+early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
 
 struct var_mtrr_state {
 	unsigned long	range_startk;
@@ -898,6 +906,27 @@ set_var_mtrr_all(unsigned int address_bits)
 	}
 }
 
+static unsigned long to_size_factor(unsigned long sizek, char *factorp)
+{
+	char factor;
+	unsigned long base = sizek;
+
+	if (base & ((1<<10) - 1)) {
+		/* not MB alignment */
+		factor = 'K';
+	} else if (base & ((1<<20) - 1)){
+		factor = 'M';
+		base >>= 10;
+	} else {
+		factor = 'G';
+		base >>= 20;
+	}
+
+	*factorp = factor;
+
+	return base;
+}
+
 static unsigned int __init
 range_to_mtrr(unsigned int reg, unsigned long range_startk,
 	      unsigned long range_sizek, unsigned char type)
@@ -919,13 +948,21 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
 			align = max_align;
 
 		sizek = 1 << align;
-		if (debug_print)
+		if (debug_print) {
+			char start_factor = 'K', size_factor = 'K';
+			unsigned long start_base, size_base;
+
+			start_base = to_size_factor(range_startk, &start_factor),
+			size_base = to_size_factor(sizek, &size_factor),
+
 			printk(KERN_DEBUG "Setting variable MTRR %d, "
-				"base: %ldMB, range: %ldMB, type %s\n",
-				reg, range_startk >> 10, sizek >> 10,
+				"base: %ld%cB, range: %ld%cB, type %s\n",
+				reg, start_base, start_factor,
+				size_base, size_factor,
 				(type == MTRR_TYPE_UNCACHABLE)?"UC":
 				    ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
 				);
+		}
 		save_var_mtrr(reg++, range_startk, sizek, type);
 		range_startk += sizek;
 		range_sizek -= sizek;
@@ -970,6 +1007,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
 	/* try to append some small hole */
 	range0_basek = state->range_startk;
 	range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
+
+	/* no increase */
 	if (range0_sizek == state->range_sizek) {
 		if (debug_print)
 			printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
@@ -980,13 +1019,40 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
 		return 0;
 	}
 
-	range0_sizek -= chunk_sizek;
-	if (range0_sizek && sizek) {
-	    while (range0_basek + range0_sizek > (basek + sizek)) {
-		range0_sizek -= chunk_sizek;
-		if (!range0_sizek)
-			break;
-	    }
+	/* only cut back, when it is not the last */
+	if (sizek) {
+		while (range0_basek + range0_sizek > (basek + sizek)) {
+			if (range0_sizek >= chunk_sizek)
+				range0_sizek -= chunk_sizek;
+			else
+				range0_sizek = 0;
+
+			if (!range0_sizek)
+				break;
+		}
+	}
+
+second_try:
+	range_basek = range0_basek + range0_sizek;
+
+	/* one hole in the middle */
+	if (range_basek > basek && range_basek <= (basek + sizek))
+		second_sizek = range_basek - basek;
+
+	if (range0_sizek > state->range_sizek) {
+
+		/* one hole in middle or at end */
+		hole_sizek = range0_sizek - state->range_sizek - second_sizek;
+
+		/* hole size should be less than half of range0 size */
+		if (hole_sizek >= (range0_sizek >> 1) &&
+		    range0_sizek >= chunk_sizek) {
+			range0_sizek -= chunk_sizek;
+			second_sizek = 0;
+			hole_sizek = 0;
+
+			goto second_try;
+		}
 	}
 
 	if (range0_sizek) {
@@ -996,50 +1062,28 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
 				(range0_basek + range0_sizek)<<10);
 		state->reg = range_to_mtrr(state->reg, range0_basek,
 				range0_sizek, MTRR_TYPE_WRBACK);
-
-	}
-
-	range_basek = range0_basek + range0_sizek;
-	range_sizek = chunk_sizek;
-
-	if (range_basek + range_sizek > basek &&
-	    range_basek + range_sizek <= (basek + sizek)) {
-		/* one hole */
-		second_basek = basek;
-		second_sizek = range_basek + range_sizek - basek;
 	}
 
-	/* if last piece, only could one hole near end */
-	if ((second_basek || !basek) &&
-	    range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
-	    (chunk_sizek >> 1)) {
-		/*
-		 * one hole in middle (second_sizek is 0) or at end
-		 * (second_sizek is 0 )
-		 */
-		hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
-				 - second_sizek;
-		hole_basek = range_basek + range_sizek - hole_sizek
-				 - second_sizek;
-	} else {
-		/* fallback for big hole, or several holes */
+	if (range0_sizek < state->range_sizek) {
+		/* need to handle left over */
 		range_sizek = state->range_sizek - range0_sizek;
-		second_basek = 0;
-		second_sizek = 0;
+
+		if (debug_print)
+			printk(KERN_DEBUG "range: %016lx - %016lx\n",
+				 range_basek<<10,
+				 (range_basek + range_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, range_basek,
+				 range_sizek, MTRR_TYPE_WRBACK);
 	}
 
-	if (debug_print)
-		printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
-			 (range_basek + range_sizek)<<10);
-	state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
-					 MTRR_TYPE_WRBACK);
 	if (hole_sizek) {
+		hole_basek = range_basek - hole_sizek - second_sizek;
 		if (debug_print)
 			printk(KERN_DEBUG "hole: %016lx - %016lx\n",
-				 hole_basek<<10, (hole_basek + hole_sizek)<<10);
-		state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek,
-						 MTRR_TYPE_UNCACHABLE);
-
+				 hole_basek<<10,
+				 (hole_basek + hole_sizek)<<10);
+		state->reg = range_to_mtrr(state->reg, hole_basek,
+				 hole_sizek, MTRR_TYPE_UNCACHABLE);
 	}
 
 	return second_sizek;
@@ -1154,11 +1198,11 @@ struct mtrr_cleanup_result {
 };
 
 /*
- * gran_size: 1M, 2M, ..., 2G
- * chunk size: gran_size, ..., 4G
- * so we need (2+13)*6
+ * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
+ * chunk size: gran_size, ..., 2G
+ * so we need (1+16)*8
  */
-#define NUM_RESULT	90
+#define NUM_RESULT	136
 #define PSHIFT		(PAGE_SHIFT - 10)
 
 static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
@@ -1168,13 +1212,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
 static int __init mtrr_cleanup(unsigned address_bits)
 {
 	unsigned long extra_remove_base, extra_remove_size;
-	unsigned long i, base, size, def, dummy;
+	unsigned long base, size, def, dummy;
 	mtrr_type type;
 	int nr_range, nr_range_new;
 	u64 chunk_size, gran_size;
 	unsigned long range_sums, range_sums_new;
 	int index_good;
 	int num_reg_good;
+	int i;
 
 	/* extra one for all 0 */
 	int num[MTRR_NUM_TYPES + 1];
@@ -1204,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
 			continue;
 		if (!size)
 			type = MTRR_NUM_TYPES;
+		if (type == MTRR_TYPE_WRPROT)
+			type = MTRR_TYPE_UNCACHABLE;
 		num[type]++;
 	}
 
@@ -1216,23 +1263,57 @@ static int __init mtrr_cleanup(unsigned address_bits)
 		num_var_ranges - num[MTRR_NUM_TYPES])
 		return 0;
 
+	/* print original var MTRRs at first, for debugging: */
+	printk(KERN_DEBUG "original variable MTRRs\n");
+	for (i = 0; i < num_var_ranges; i++) {
+		char start_factor = 'K', size_factor = 'K';
+		unsigned long start_base, size_base;
+
+		size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
+		if (!size_base)
+			continue;
+
+		size_base = to_size_factor(size_base, &size_factor),
+		start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
+		start_base = to_size_factor(start_base, &start_factor),
+		type = range_state[i].type;
+
+		printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
+			i, start_base, start_factor,
+			size_base, size_factor,
+			(type == MTRR_TYPE_UNCACHABLE) ? "UC" :
+			    ((type == MTRR_TYPE_WRPROT) ? "WP" :
+			     ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
+			);
+	}
+
 	memset(range, 0, sizeof(range));
 	extra_remove_size = 0;
-	if (mtrr_tom2) {
-		extra_remove_base = 1 << (32 - PAGE_SHIFT);
+	extra_remove_base = 1 << (32 - PAGE_SHIFT);
+	if (mtrr_tom2)
 		extra_remove_size =
 			(mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
-	}
 	nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
 					  extra_remove_size);
+	/*
+	 * [0, 1M) should always be coverred by var mtrr with WB
+	 * and fixed mtrrs should take effective before var mtrr for it
+	 */
+	nr_range = add_range_with_merge(range, nr_range, 0,
+					(1ULL<<(20 - PAGE_SHIFT)) - 1);
+	/* sort the ranges */
+	sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
+
 	range_sums = sum_ranges(range, nr_range);
 	printk(KERN_INFO "total RAM coverred: %ldM\n",
 	       range_sums >> (20 - PAGE_SHIFT));
 
 	if (mtrr_chunk_size && mtrr_gran_size) {
 		int num_reg;
+		char gran_factor, chunk_factor, lose_factor;
+		unsigned long gran_base, chunk_base, lose_base;
 
-		debug_print = 1;
+		debug_print++;
 		/* convert ranges to var ranges state */
 		num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
 					      mtrr_gran_size);
@@ -1256,34 +1337,48 @@ static int __init mtrr_cleanup(unsigned address_bits)
 			result[i].lose_cover_sizek =
 				(range_sums - range_sums_new) << PSHIFT;
 
-		printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
-			 result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10,
-			 result[i].chunk_sizek >> 10);
-		printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %s%ldM \n",
+		gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+		chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+		lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+		printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+			 result[i].bad?"*BAD*":" ",
+			 gran_base, gran_factor, chunk_base, chunk_factor);
+		printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %s%ld%c\n",
 			 result[i].num_reg, result[i].bad?"-":"",
-			 result[i].lose_cover_sizek >> 10);
+			 lose_base, lose_factor);
 		if (!result[i].bad) {
 			set_var_mtrr_all(address_bits);
 			return 1;
 		}
 		printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
 		       "will find optimal one\n");
-		debug_print = 0;
+		debug_print--;
 		memset(result, 0, sizeof(result[0]));
 	}
 
 	i = 0;
 	memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
 	memset(result, 0, sizeof(result));
-	for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) {
-		for (chunk_size = gran_size; chunk_size < (1ULL<<33);
+	for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
+		char gran_factor;
+		unsigned long gran_base;
+
+		if (debug_print)
+			gran_base = to_size_factor(gran_size >> 10, &gran_factor);
+
+		for (chunk_size = gran_size; chunk_size < (1ULL<<32);
 		     chunk_size <<= 1) {
 			int num_reg;
 
-			if (debug_print)
-				printk(KERN_INFO
-			       "\ngran_size: %lldM   chunk_size_size: %lldM\n",
-				       gran_size >> 20, chunk_size >> 20);
+			if (debug_print) {
+				char chunk_factor;
+				unsigned long chunk_base;
+
+				chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
+				printk(KERN_INFO "\n");
+				printk(KERN_INFO "gran_size: %ld%c   chunk_size: %ld%c \n",
+				       gran_base, gran_factor, chunk_base, chunk_factor);
+			}
 			if (i >= NUM_RESULT)
 				continue;
 
@@ -1326,12 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits)
 
 	/* print out all */
 	for (i = 0; i < NUM_RESULT; i++) {
-		printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t",
-		       result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10,
-		       result[i].chunk_sizek >> 10);
-		printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n",
-		       result[i].num_reg, result[i].bad?"-":"",
-		       result[i].lose_cover_sizek >> 10);
+		char gran_factor, chunk_factor, lose_factor;
+		unsigned long gran_base, chunk_base, lose_base;
+
+		gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+		chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+		lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+		printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
+			 result[i].bad?"*BAD*":" ",
+			 gran_base, gran_factor, chunk_base, chunk_factor);
+		printk(KERN_CONT "num_reg: %d  \tlose cover RAM: %s%ld%c\n",
+			 result[i].num_reg, result[i].bad?"-":"",
+			 lose_base, lose_factor);
 	}
 
 	/* try to find the optimal index */
@@ -1339,10 +1440,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
 		nr_mtrr_spare_reg = num_var_ranges - 1;
 	num_reg_good = -1;
 	for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
-		if (!min_loss_pfn[i]) {
+		if (!min_loss_pfn[i])
 			num_reg_good = i;
-			break;
-		}
 	}
 
 	index_good = -1;
@@ -1358,21 +1457,26 @@ static int __init mtrr_cleanup(unsigned address_bits)
 	}
 
 	if (index_good != -1) {
+		char gran_factor, chunk_factor, lose_factor;
+		unsigned long gran_base, chunk_base, lose_base;
+
 		printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
 		i = index_good;
-		printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t",
-				result[i].gran_sizek >> 10,
-				result[i].chunk_sizek >> 10);
-		printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n",
-				result[i].num_reg,
-				result[i].lose_cover_sizek >> 10);
+		gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
+		chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
+		lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+		printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
+			 gran_base, gran_factor, chunk_base, chunk_factor);
+		printk(KERN_CONT "num_reg: %d  \tlose RAM: %ld%c\n",
+			 result[i].num_reg, lose_base, lose_factor);
 		/* convert ranges to var ranges state */
 		chunk_size = result[i].chunk_sizek;
 		chunk_size <<= 10;
 		gran_size = result[i].gran_sizek;
 		gran_size <<= 10;
-		debug_print = 1;
+		debug_print++;
 		x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
+		debug_print--;
 		set_var_mtrr_all(address_bits);
 		return 1;
 	}
@@ -1496,11 +1600,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 
 	/* kvm/qemu doesn't have mtrr set right, don't trim them all */
 	if (!highest_pfn) {
-		if (!kvm_para_available()) {
-			printk(KERN_WARNING
+		WARN(!kvm_para_available(), KERN_WARNING
 				"WARNING: strange, CPU MTRRs all blank?\n");
-			WARN_ON(1);
-		}
 		return 0;
 	}
 
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 6d4bdc02388a..9abd48b22674 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -17,6 +17,8 @@
 #include <linux/bitops.h>
 #include <linux/smp.h>
 #include <linux/nmi.h>
+#include <linux/kprobes.h>
+
 #include <asm/apic.h>
 #include <asm/intel_arch_perfmon.h>
 
@@ -250,7 +252,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr,
 
 	do_div(count, nmi_hz);
 	if(descr)
-		Dprintk("setting %s to -0x%08Lx\n", descr, count);
+		pr_debug("setting %s to -0x%08Lx\n", descr, count);
 	wrmsrl(perfctr_msr, 0 - count);
 }
 
@@ -261,7 +263,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr,
 
 	do_div(count, nmi_hz);
 	if(descr)
-		Dprintk("setting %s to -0x%08Lx\n", descr, count);
+		pr_debug("setting %s to -0x%08Lx\n", descr, count);
 	wrmsr(perfctr_msr, (u32)(-count), 0);
 }
 
@@ -295,13 +297,19 @@ static int setup_k7_watchdog(unsigned nmi_hz)
 	/* setup the timer */
 	wrmsr(evntsel_msr, evntsel, 0);
 	write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	evntsel |= K7_EVNTSEL_ENABLE;
-	wrmsr(evntsel_msr, evntsel, 0);
 
+	/* initialize the wd struct before enabling */
 	wd->perfctr_msr = perfctr_msr;
 	wd->evntsel_msr = evntsel_msr;
 	wd->cccr_msr = 0;  /* unused */
+
+	/* ok, everything is initialized, announce that we're set */
+	cpu_nmi_set_wd_enabled();
+
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	evntsel |= K7_EVNTSEL_ENABLE;
+	wrmsr(evntsel_msr, evntsel, 0);
+
 	return 1;
 }
 
@@ -330,7 +338,8 @@ static void single_msr_unreserve(void)
 	release_perfctr_nmi(wd_ops->perfctr);
 }
 
-static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+static void __kprobes
+single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
 {
 	/* start the cycle over again */
 	write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
@@ -379,17 +388,23 @@ static int setup_p6_watchdog(unsigned nmi_hz)
 	wrmsr(evntsel_msr, evntsel, 0);
 	nmi_hz = adjust_for_32bit_ctr(nmi_hz);
 	write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	evntsel |= P6_EVNTSEL0_ENABLE;
-	wrmsr(evntsel_msr, evntsel, 0);
 
+	/* initialize the wd struct before enabling */
 	wd->perfctr_msr = perfctr_msr;
 	wd->evntsel_msr = evntsel_msr;
 	wd->cccr_msr = 0;  /* unused */
+
+	/* ok, everything is initialized, announce that we're set */
+	cpu_nmi_set_wd_enabled();
+
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	evntsel |= P6_EVNTSEL0_ENABLE;
+	wrmsr(evntsel_msr, evntsel, 0);
+
 	return 1;
 }
 
-static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
 {
 	/*
 	 * P6 based Pentium M need to re-unmask
@@ -432,6 +447,27 @@ static const struct wd_ops p6_wd_ops = {
 #define P4_CCCR_ENABLE		(1 << 12)
 #define P4_CCCR_OVF 		(1 << 31)
 
+#define P4_CONTROLS 18
+static unsigned int p4_controls[18] = {
+	MSR_P4_BPU_CCCR0,
+	MSR_P4_BPU_CCCR1,
+	MSR_P4_BPU_CCCR2,
+	MSR_P4_BPU_CCCR3,
+	MSR_P4_MS_CCCR0,
+	MSR_P4_MS_CCCR1,
+	MSR_P4_MS_CCCR2,
+	MSR_P4_MS_CCCR3,
+	MSR_P4_FLAME_CCCR0,
+	MSR_P4_FLAME_CCCR1,
+	MSR_P4_FLAME_CCCR2,
+	MSR_P4_FLAME_CCCR3,
+	MSR_P4_IQ_CCCR0,
+	MSR_P4_IQ_CCCR1,
+	MSR_P4_IQ_CCCR2,
+	MSR_P4_IQ_CCCR3,
+	MSR_P4_IQ_CCCR4,
+	MSR_P4_IQ_CCCR5,
+};
 /*
  * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
  * CRU_ESCR0 (with any non-null event selector) through a complemented
@@ -473,12 +509,38 @@ static int setup_p4_watchdog(unsigned nmi_hz)
 		evntsel_msr = MSR_P4_CRU_ESCR0;
 		cccr_msr = MSR_P4_IQ_CCCR0;
 		cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
+
+		/*
+		 * If we're on the kdump kernel or other situation, we may
+		 * still have other performance counter registers set to
+		 * interrupt and they'll keep interrupting forever because
+		 * of the P4_CCCR_OVF quirk. So we need to ACK all the
+		 * pending interrupts and disable all the registers here,
+		 * before reenabling the NMI delivery. Refer to p4_rearm()
+		 * about the P4_CCCR_OVF quirk.
+		 */
+		if (reset_devices) {
+			unsigned int low, high;
+			int i;
+
+			for (i = 0; i < P4_CONTROLS; i++) {
+				rdmsr(p4_controls[i], low, high);
+				low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
+				wrmsr(p4_controls[i], low, high);
+			}
+		}
 	} else {
 		/* logical cpu 1 */
 		perfctr_msr = MSR_P4_IQ_PERFCTR1;
 		evntsel_msr = MSR_P4_CRU_ESCR0;
 		cccr_msr = MSR_P4_IQ_CCCR1;
-		cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4);
+
+		/* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
+		if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
+			cccr_val = P4_CCCR_OVF_PMI0;
+		else
+			cccr_val = P4_CCCR_OVF_PMI1;
+		cccr_val |= P4_CCCR_ESCR_SELECT(4);
 	}
 
 	evntsel = P4_ESCR_EVENT_SELECT(0x3F)
@@ -493,12 +555,17 @@ static int setup_p4_watchdog(unsigned nmi_hz)
 	wrmsr(evntsel_msr, evntsel, 0);
 	wrmsr(cccr_msr, cccr_val, 0);
 	write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	cccr_val |= P4_CCCR_ENABLE;
-	wrmsr(cccr_msr, cccr_val, 0);
+
 	wd->perfctr_msr = perfctr_msr;
 	wd->evntsel_msr = evntsel_msr;
 	wd->cccr_msr = cccr_msr;
+
+	/* ok, everything is initialized, announce that we're set */
+	cpu_nmi_set_wd_enabled();
+
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	cccr_val |= P4_CCCR_ENABLE;
+	wrmsr(cccr_msr, cccr_val, 0);
 	return 1;
 }
 
@@ -541,7 +608,7 @@ static void p4_unreserve(void)
 	release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
 }
 
-static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
+static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
 {
 	unsigned dummy;
 	/*
@@ -614,13 +681,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
 	wrmsr(evntsel_msr, evntsel, 0);
 	nmi_hz = adjust_for_32bit_ctr(nmi_hz);
 	write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
-	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-	wrmsr(evntsel_msr, evntsel, 0);
 
 	wd->perfctr_msr = perfctr_msr;
 	wd->evntsel_msr = evntsel_msr;
 	wd->cccr_msr = 0;  /* unused */
+
+	/* ok, everything is initialized, announce that we're set */
+	cpu_nmi_set_wd_enabled();
+
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+	wrmsr(evntsel_msr, evntsel, 0);
 	intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
 	return 1;
 }
@@ -716,7 +787,7 @@ unsigned lapic_adjust_nmi_hz(unsigned hz)
 	return hz;
 }
 
-int lapic_wd_event(unsigned nmi_hz)
+int __kprobes lapic_wd_event(unsigned nmi_hz)
 {
 	struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
 	u64 ctr;
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
new file mode 100644
index 000000000000..5abbea297e0c
--- /dev/null
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -0,0 +1,20 @@
+/*
+ * Strings for the various x86 power flags
+ *
+ * This file must not contain any executable code.
+ */
+
+#include <asm/cpufeature.h>
+
+const char *const x86_power_flags[32] = {
+	"ts",	/* temperature sensor */
+	"fid",  /* frequency id control */
+	"vid",  /* voltage id control */
+	"ttp",  /* thermal trip */
+	"tm",
+	"stc",
+	"100mhzsteps",
+	"hwpstate",
+	"",	/* tsc invariant mapped to constant_tsc */
+		/* nothing */
+};
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 0d0d9057e7c0..01b1244ef1c0 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -160,14 +160,16 @@ static void *c_start(struct seq_file *m, loff_t *pos)
 {
 	if (*pos == 0)	/* just in case, cpu 0 is not the first */
 		*pos = first_cpu(cpu_online_map);
-	if ((*pos) < NR_CPUS && cpu_online(*pos))
+	else
+		*pos = next_cpu_nr(*pos - 1, cpu_online_map);
+	if ((*pos) < nr_cpu_ids)
 		return &cpu_data(*pos);
 	return NULL;
 }
 
 static void *c_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	*pos = next_cpu(*pos, cpu_online_map);
+	(*pos)++;
 	return c_start(m, pos);
 }
 
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index b911a2c61b8f..52b3fefbd5af 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -5,6 +5,18 @@
 #include <asm/msr.h>
 #include "cpu.h"
 
+static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
+{
+	u32 xlvl;
+
+	/* Transmeta-defined flags: level 0x80860001 */
+	xlvl = cpuid_eax(0x80860000);
+	if ((xlvl & 0xffff0000) == 0x80860000) {
+		if (xlvl >= 0x80860001)
+			c->x86_capability[2] = cpuid_edx(0x80860001);
+	}
+}
+
 static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
 {
 	unsigned int cap_mask, uk, max, dummy;
@@ -12,7 +24,8 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
 	unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
 	char cpu_info[65];
 
-	get_model_name(c);	/* Same as AMD/Cyrix */
+	early_init_transmeta(c);
+
 	display_cacheinfo(c);
 
 	/* Print CMS and CPU revision */
@@ -85,23 +98,12 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
 #endif
 }
 
-static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c)
-{
-	u32 xlvl;
-
-	/* Transmeta-defined flags: level 0x80860001 */
-	xlvl = cpuid_eax(0x80860000);
-	if ((xlvl & 0xffff0000) == 0x80860000) {
-		if (xlvl >= 0x80860001)
-			c->x86_capability[2] = cpuid_edx(0x80860001);
-	}
-}
-
 static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
 	.c_vendor	= "Transmeta",
 	.c_ident	= { "GenuineTMx86", "TransmetaCPU" },
+	.c_early_init	= early_init_transmeta,
 	.c_init		= init_transmeta,
-	.c_identify	= transmeta_identify,
+	.c_x86_vendor	= X86_VENDOR_TRANSMETA,
 };
 
-cpu_vendor_dev_register(X86_VENDOR_TRANSMETA, &transmeta_cpu_dev);
+cpu_dev_register(transmeta_cpu_dev);
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index b1fc90989d75..e777f79e0960 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -19,7 +19,8 @@ static struct cpu_dev umc_cpu_dev __cpuinitdata = {
 		  }
 		},
 	},
+	.c_x86_vendor	= X86_VENDOR_UMC,
 };
 
-cpu_vendor_dev_register(X86_VENDOR_UMC, &umc_cpu_dev);
+cpu_dev_register(umc_cpu_dev);
 
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2de5fa2bbf77..72cefd1e649b 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -36,7 +36,6 @@
 #include <linux/smp_lock.h>
 #include <linux/major.h>
 #include <linux/fs.h>
-#include <linux/smp_lock.h>
 #include <linux/device.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
@@ -89,6 +88,8 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
 	struct cpuid_regs cmd;
 	int cpu = iminor(file->f_path.dentry->d_inode);
 	u64 pos = *ppos;
+	ssize_t bytes = 0;
+	int err = 0;
 
 	if (count % 16)
 		return -EINVAL;	/* Invalid chunk size */
@@ -96,14 +97,19 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
 	for (; count; count -= 16) {
 		cmd.eax = pos;
 		cmd.ecx = pos >> 32;
-		smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1);
-		if (copy_to_user(tmp, &cmd, 16))
-			return -EFAULT;
+		err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1);
+		if (err)
+			break;
+		if (copy_to_user(tmp, &cmd, 16)) {
+			err = -EFAULT;
+			break;
+		}
 		tmp += 16;
+		bytes += 16;
 		*ppos = ++pos;
 	}
 
-	return tmp - buf;
+	return bytes ? bytes : err;
 }
 
 static int cpuid_open(struct inode *inode, struct file *file)
@@ -141,7 +147,7 @@ static __cpuinit int cpuid_device_create(int cpu)
 {
 	struct device *dev;
 
-	dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu),
+	dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
 			    "cpu%d", cpu);
 	return IS_ERR(dev) ? PTR_ERR(dev) : 0;
 }
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 72d0c56c1b48..f7cdb3b457aa 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -13,6 +13,9 @@
 
 static void *kdump_buf_page;
 
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
 /**
  * copy_oldmem_page - copy one page from "oldmem"
  * @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 15e6c6bc4a46..045b36cada65 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -7,9 +7,11 @@
 
 #include <linux/errno.h>
 #include <linux/crash_dump.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
 
-#include <asm/uaccess.h>
-#include <asm/io.h>
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
 
 /**
  * copy_oldmem_page - copy one page from "oldmem"
@@ -25,7 +27,7 @@
  * in the current kernel. We stitch up a pte, similar to kmap_atomic.
  */
 ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
-                               size_t csize, unsigned long offset, int userbuf)
+		size_t csize, unsigned long offset, int userbuf)
 {
 	void  *vaddr;
 
@@ -33,14 +35,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
 		return 0;
 
 	vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
+	if (!vaddr)
+		return -ENOMEM;
 
 	if (userbuf) {
-		if (copy_to_user(buf, (vaddr + offset), csize)) {
+		if (copy_to_user(buf, vaddr + offset, csize)) {
 			iounmap(vaddr);
 			return -EFAULT;
 		}
 	} else
-	memcpy(buf, (vaddr + offset), csize);
+		memcpy(buf, vaddr + offset, csize);
 
 	iounmap(vaddr);
 	return csize;
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index a47798b59f07..b4f14c6c09d9 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = {
 		.ds		= __USER_DS,
 		.fs		= __KERNEL_PERCPU,
 
-		.__cr3		= __pa(swapper_pg_dir)
+		.__cr3		= __pa_nodebug(swapper_pg_dir),
 	}
 };
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 11c11b8ec48d..2b69994fd3a8 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -2,26 +2,49 @@
  * Debug Store support
  *
  * This provides a low-level interface to the hardware's Debug Store
- * feature that is used for last branch recording (LBR) and
+ * feature that is used for branch trace store (BTS) and
  * precise-event based sampling (PEBS).
  *
- * Different architectures use a different DS layout/pointer size.
- * The below functions therefore work on a void*.
+ * It manages:
+ * - per-thread and per-cpu allocation of BTS and PEBS
+ * - buffer memory allocation (optional)
+ * - buffer overflow handling
+ * - buffer access
  *
+ * It assumes:
+ * - get_task_struct on all parameter tasks
+ * - current is allowed to trace parameter tasks
  *
- * Since there is no user for PEBS, yet, only LBR (or branch
- * trace store, BTS) is supported.
  *
- *
- * Copyright (C) 2007 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
+ * Copyright (C) 2007-2008 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
  */
 
+
+#ifdef CONFIG_X86_DS
+
 #include <asm/ds.h>
 
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+
+
+/*
+ * The configuration for a particular DS hardware implementation.
+ */
+struct ds_configuration {
+	/* the size of the DS structure in bytes */
+	unsigned char  sizeof_ds;
+	/* the size of one pointer-typed field in the DS structure in bytes;
+	   this covers the first 8 fields related to buffer management. */
+	unsigned char  sizeof_field;
+	/* the size of a BTS/PEBS record in bytes */
+	unsigned char  sizeof_rec[2];
+};
+static struct ds_configuration ds_cfg;
 
 
 /*
@@ -44,378 +67,747 @@
  *   (interrupt occurs when write pointer passes interrupt pointer)
  * - value to which counter is reset following counter overflow
  *
- * On later architectures, the last branch recording hardware uses
- * 64bit pointers even in 32bit mode.
- *
- *
- * Branch Trace Store (BTS) records store information about control
- * flow changes. They at least provide the following information:
- * - source linear address
- * - destination linear address
+ * Later architectures use 64bit pointers throughout, whereas earlier
+ * architectures use 32bit pointers in 32bit mode.
  *
- * Netburst supported a predicated bit that had been dropped in later
- * architectures. We do not suppor it.
  *
+ * We compute the base address for the first 8 fields based on:
+ * - the field size stored in the DS configuration
+ * - the relative field position
+ * - an offset giving the start of the respective region
  *
- * In order to abstract from the actual DS and BTS layout, we describe
- * the access to the relevant fields.
- * Thanks to Andi Kleen for proposing this design.
+ * This offset is further used to index various arrays holding
+ * information for BTS and PEBS at the respective index.
  *
- * The implementation, however, is not as general as it might seem. In
- * order to stay somewhat simple and efficient, we assume an
- * underlying unsigned type (mostly a pointer type) and we expect the
- * field to be at least as big as that type.
+ * On later 32bit processors, we only access the lower 32bit of the
+ * 64bit pointer fields. The upper halves will be zeroed out.
  */
 
-/*
- * A special from_ip address to indicate that the BTS record is an
- * info record that needs to be interpreted or skipped.
- */
-#define BTS_ESCAPE_ADDRESS (-1)
+enum ds_field {
+	ds_buffer_base = 0,
+	ds_index,
+	ds_absolute_maximum,
+	ds_interrupt_threshold,
+};
 
-/*
- * A field access descriptor
- */
-struct access_desc {
-	unsigned char offset;
-	unsigned char size;
+enum ds_qualifier {
+	ds_bts  = 0,
+	ds_pebs
 };
 
+static inline unsigned long ds_get(const unsigned char *base,
+				   enum ds_qualifier qual, enum ds_field field)
+{
+	base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+	return *(unsigned long *)base;
+}
+
+static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
+			  enum ds_field field, unsigned long value)
+{
+	base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+	(*(unsigned long *)base) = value;
+}
+
+
 /*
- * The configuration for a particular DS/BTS hardware implementation.
+ * Locking is done only for allocating BTS or PEBS resources and for
+ * guarding context and buffer memory allocation.
+ *
+ * Most functions require the current task to own the ds context part
+ * they are going to access. All the locking is done when validating
+ * access to the context.
  */
-struct ds_configuration {
-	/* the DS configuration */
-	unsigned char  sizeof_ds;
-	struct access_desc bts_buffer_base;
-	struct access_desc bts_index;
-	struct access_desc bts_absolute_maximum;
-	struct access_desc bts_interrupt_threshold;
-	/* the BTS configuration */
-	unsigned char  sizeof_bts;
-	struct access_desc from_ip;
-	struct access_desc to_ip;
-	/* BTS variants used to store additional information like
-	   timestamps */
-	struct access_desc info_type;
-	struct access_desc info_data;
-	unsigned long debugctl_mask;
-};
+static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
 
 /*
- * The global configuration used by the below accessor functions
+ * Validate that the current task is allowed to access the BTS/PEBS
+ * buffer of the parameter task.
+ *
+ * Returns 0, if access is granted; -Eerrno, otherwise.
  */
-static struct ds_configuration ds_cfg;
+static inline int ds_validate_access(struct ds_context *context,
+				     enum ds_qualifier qual)
+{
+	if (!context)
+		return -EPERM;
+
+	if (context->owner[qual] == current)
+		return 0;
+
+	return -EPERM;
+}
+
 
 /*
- * Accessor functions for some DS and BTS fields using the above
- * global ptrace_bts_cfg.
+ * We either support (system-wide) per-cpu or per-thread allocation.
+ * We distinguish the two based on the task_struct pointer, where a
+ * NULL pointer indicates per-cpu allocation for the current cpu.
+ *
+ * Allocations are use-counted. As soon as resources are allocated,
+ * further allocations must be of the same type (per-cpu or
+ * per-thread). We model this by counting allocations (i.e. the number
+ * of tracers of a certain type) for one type negatively:
+ *   =0  no tracers
+ *   >0  number of per-thread tracers
+ *   <0  number of per-cpu tracers
+ *
+ * The below functions to get and put tracers and to check the
+ * allocation type require the ds_lock to be held by the caller.
+ *
+ * Tracers essentially gives the number of ds contexts for a certain
+ * type of allocation.
  */
-static inline unsigned long get_bts_buffer_base(char *base)
+static long tracers;
+
+static inline void get_tracer(struct task_struct *task)
 {
-	return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset);
+	tracers += (task ? 1 : -1);
 }
-static inline void set_bts_buffer_base(char *base, unsigned long value)
+
+static inline void put_tracer(struct task_struct *task)
 {
-	(*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value;
+	tracers -= (task ? 1 : -1);
 }
-static inline unsigned long get_bts_index(char *base)
+
+static inline int check_tracer(struct task_struct *task)
 {
-	return *(unsigned long *)(base + ds_cfg.bts_index.offset);
+	return (task ? (tracers >= 0) : (tracers <= 0));
 }
-static inline void set_bts_index(char *base, unsigned long value)
+
+
+/*
+ * The DS context is either attached to a thread or to a cpu:
+ * - in the former case, the thread_struct contains a pointer to the
+ *   attached context.
+ * - in the latter case, we use a static array of per-cpu context
+ *   pointers.
+ *
+ * Contexts are use-counted. They are allocated on first access and
+ * deallocated when the last user puts the context.
+ *
+ * We distinguish between an allocating and a non-allocating get of a
+ * context:
+ * - the allocating get is used for requesting BTS/PEBS resources. It
+ *   requires the caller to hold the global ds_lock.
+ * - the non-allocating get is used for all other cases. A
+ *   non-existing context indicates an error. It acquires and releases
+ *   the ds_lock itself for obtaining the context.
+ *
+ * A context and its DS configuration are allocated and deallocated
+ * together. A context always has a DS configuration of the
+ * appropriate size.
+ */
+static DEFINE_PER_CPU(struct ds_context *, system_context);
+
+#define this_system_context per_cpu(system_context, smp_processor_id())
+
+/*
+ * Returns the pointer to the parameter task's context or to the
+ * system-wide context, if task is NULL.
+ *
+ * Increases the use count of the returned context, if not NULL.
+ */
+static inline struct ds_context *ds_get_context(struct task_struct *task)
 {
-	(*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value;
+	struct ds_context *context;
+
+	spin_lock(&ds_lock);
+
+	context = (task ? task->thread.ds_ctx : this_system_context);
+	if (context)
+		context->count++;
+
+	spin_unlock(&ds_lock);
+
+	return context;
 }
-static inline unsigned long get_bts_absolute_maximum(char *base)
+
+/*
+ * Same as ds_get_context, but allocates the context and it's DS
+ * structure, if necessary; returns NULL; if out of memory.
+ *
+ * pre: requires ds_lock to be held
+ */
+static inline struct ds_context *ds_alloc_context(struct task_struct *task)
 {
-	return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset);
+	struct ds_context **p_context =
+		(task ? &task->thread.ds_ctx : &this_system_context);
+	struct ds_context *context = *p_context;
+
+	if (!context) {
+		context = kzalloc(sizeof(*context), GFP_KERNEL);
+
+		if (!context)
+			return NULL;
+
+		context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
+		if (!context->ds) {
+			kfree(context);
+			return NULL;
+		}
+
+		*p_context = context;
+
+		context->this = p_context;
+		context->task = task;
+
+		if (task)
+			set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
+
+		if (!task || (task == current))
+			wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
+
+		get_tracer(task);
+	}
+
+	context->count++;
+
+	return context;
 }
-static inline void set_bts_absolute_maximum(char *base, unsigned long value)
+
+/*
+ * Decreases the use count of the parameter context, if not NULL.
+ * Deallocates the context, if the use count reaches zero.
+ */
+static inline void ds_put_context(struct ds_context *context)
 {
-	(*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value;
+	if (!context)
+		return;
+
+	spin_lock(&ds_lock);
+
+	if (--context->count)
+		goto out;
+
+	*(context->this) = NULL;
+
+	if (context->task)
+		clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+
+	if (!context->task || (context->task == current))
+		wrmsrl(MSR_IA32_DS_AREA, 0);
+
+	put_tracer(context->task);
+
+	/* free any leftover buffers from tracers that did not
+	 * deallocate them properly. */
+	kfree(context->buffer[ds_bts]);
+	kfree(context->buffer[ds_pebs]);
+	kfree(context->ds);
+	kfree(context);
+ out:
+	spin_unlock(&ds_lock);
 }
-static inline unsigned long get_bts_interrupt_threshold(char *base)
+
+
+/*
+ * Handle a buffer overflow
+ *
+ * task: the task whose buffers are overflowing;
+ *       NULL for a buffer overflow on the current cpu
+ * context: the ds context
+ * qual: the buffer type
+ */
+static void ds_overflow(struct task_struct *task, struct ds_context *context,
+			enum ds_qualifier qual)
 {
-	return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset);
+	if (!context)
+		return;
+
+	if (context->callback[qual])
+		(*context->callback[qual])(task);
+
+	/* todo: do some more overflow handling */
 }
-static inline void set_bts_interrupt_threshold(char *base, unsigned long value)
+
+
+/*
+ * Allocate a non-pageable buffer of the parameter size.
+ * Checks the memory and the locked memory rlimit.
+ *
+ * Returns the buffer, if successful;
+ *         NULL, if out of memory or rlimit exceeded.
+ *
+ * size: the requested buffer size in bytes
+ * pages (out): if not NULL, contains the number of pages reserved
+ */
+static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
 {
-	(*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value;
+	unsigned long rlim, vm, pgsz;
+	void *buffer;
+
+	pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
+
+	rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+	vm   = current->mm->total_vm  + pgsz;
+	if (rlim < vm)
+		return NULL;
+
+	rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+	vm   = current->mm->locked_vm  + pgsz;
+	if (rlim < vm)
+		return NULL;
+
+	buffer = kzalloc(size, GFP_KERNEL);
+	if (!buffer)
+		return NULL;
+
+	current->mm->total_vm  += pgsz;
+	current->mm->locked_vm += pgsz;
+
+	if (pages)
+		*pages = pgsz;
+
+	return buffer;
 }
-static inline unsigned long get_from_ip(char *base)
+
+static int ds_request(struct task_struct *task, void *base, size_t size,
+		      ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
 {
-	return *(unsigned long *)(base + ds_cfg.from_ip.offset);
+	struct ds_context *context;
+	unsigned long buffer, adj;
+	const unsigned long alignment = (1 << 3);
+	int error = 0;
+
+	if (!ds_cfg.sizeof_ds)
+		return -EOPNOTSUPP;
+
+	/* we require some space to do alignment adjustments below */
+	if (size < (alignment + ds_cfg.sizeof_rec[qual]))
+		return -EINVAL;
+
+	/* buffer overflow notification is not yet implemented */
+	if (ovfl)
+		return -EOPNOTSUPP;
+
+
+	spin_lock(&ds_lock);
+
+	if (!check_tracer(task))
+		return -EPERM;
+
+	error = -ENOMEM;
+	context = ds_alloc_context(task);
+	if (!context)
+		goto out_unlock;
+
+	error = -EALREADY;
+	if (context->owner[qual] == current)
+		goto out_unlock;
+	error = -EPERM;
+	if (context->owner[qual] != NULL)
+		goto out_unlock;
+	context->owner[qual] = current;
+
+	spin_unlock(&ds_lock);
+
+
+	error = -ENOMEM;
+	if (!base) {
+		base = ds_allocate_buffer(size, &context->pages[qual]);
+		if (!base)
+			goto out_release;
+
+		context->buffer[qual]   = base;
+	}
+	error = 0;
+
+	context->callback[qual] = ovfl;
+
+	/* adjust the buffer address and size to meet alignment
+	 * constraints:
+	 * - buffer is double-word aligned
+	 * - size is multiple of record size
+	 *
+	 * We checked the size at the very beginning; we have enough
+	 * space to do the adjustment.
+	 */
+	buffer = (unsigned long)base;
+
+	adj = ALIGN(buffer, alignment) - buffer;
+	buffer += adj;
+	size   -= adj;
+
+	size /= ds_cfg.sizeof_rec[qual];
+	size *= ds_cfg.sizeof_rec[qual];
+
+	ds_set(context->ds, qual, ds_buffer_base, buffer);
+	ds_set(context->ds, qual, ds_index, buffer);
+	ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
+
+	if (ovfl) {
+		/* todo: select a suitable interrupt threshold */
+	} else
+		ds_set(context->ds, qual,
+		       ds_interrupt_threshold, buffer + size + 1);
+
+	/* we keep the context until ds_release */
+	return error;
+
+ out_release:
+	context->owner[qual] = NULL;
+	ds_put_context(context);
+	return error;
+
+ out_unlock:
+	spin_unlock(&ds_lock);
+	ds_put_context(context);
+	return error;
 }
-static inline void set_from_ip(char *base, unsigned long value)
+
+int ds_request_bts(struct task_struct *task, void *base, size_t size,
+		   ds_ovfl_callback_t ovfl)
 {
-	(*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value;
+	return ds_request(task, base, size, ovfl, ds_bts);
 }
-static inline unsigned long get_to_ip(char *base)
+
+int ds_request_pebs(struct task_struct *task, void *base, size_t size,
+		    ds_ovfl_callback_t ovfl)
 {
-	return *(unsigned long *)(base + ds_cfg.to_ip.offset);
+	return ds_request(task, base, size, ovfl, ds_pebs);
 }
-static inline void set_to_ip(char *base, unsigned long value)
+
+static int ds_release(struct task_struct *task, enum ds_qualifier qual)
 {
-	(*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value;
+	struct ds_context *context;
+	int error;
+
+	context = ds_get_context(task);
+	error = ds_validate_access(context, qual);
+	if (error < 0)
+		goto out;
+
+	kfree(context->buffer[qual]);
+	context->buffer[qual] = NULL;
+
+	current->mm->total_vm  -= context->pages[qual];
+	current->mm->locked_vm -= context->pages[qual];
+	context->pages[qual] = 0;
+	context->owner[qual] = NULL;
+
+	/*
+	 * we put the context twice:
+	 *   once for the ds_get_context
+	 *   once for the corresponding ds_request
+	 */
+	ds_put_context(context);
+ out:
+	ds_put_context(context);
+	return error;
 }
-static inline unsigned char get_info_type(char *base)
+
+int ds_release_bts(struct task_struct *task)
 {
-	return *(unsigned char *)(base + ds_cfg.info_type.offset);
+	return ds_release(task, ds_bts);
 }
-static inline void set_info_type(char *base, unsigned char value)
+
+int ds_release_pebs(struct task_struct *task)
 {
-	(*(unsigned char *)(base + ds_cfg.info_type.offset)) = value;
+	return ds_release(task, ds_pebs);
 }
-static inline unsigned long get_info_data(char *base)
+
+static int ds_get_index(struct task_struct *task, size_t *pos,
+			enum ds_qualifier qual)
 {
-	return *(unsigned long *)(base + ds_cfg.info_data.offset);
+	struct ds_context *context;
+	unsigned long base, index;
+	int error;
+
+	context = ds_get_context(task);
+	error = ds_validate_access(context, qual);
+	if (error < 0)
+		goto out;
+
+	base  = ds_get(context->ds, qual, ds_buffer_base);
+	index = ds_get(context->ds, qual, ds_index);
+
+	error = ((index - base) / ds_cfg.sizeof_rec[qual]);
+	if (pos)
+		*pos = error;
+ out:
+	ds_put_context(context);
+	return error;
 }
-static inline void set_info_data(char *base, unsigned long value)
+
+int ds_get_bts_index(struct task_struct *task, size_t *pos)
 {
-	(*(unsigned long *)(base + ds_cfg.info_data.offset)) = value;
+	return ds_get_index(task, pos, ds_bts);
 }
 
+int ds_get_pebs_index(struct task_struct *task, size_t *pos)
+{
+	return ds_get_index(task, pos, ds_pebs);
+}
 
-int ds_allocate(void **dsp, size_t bts_size_in_bytes)
+static int ds_get_end(struct task_struct *task, size_t *pos,
+		      enum ds_qualifier qual)
 {
-	size_t bts_size_in_records;
-	unsigned long bts;
-	void *ds;
+	struct ds_context *context;
+	unsigned long base, end;
+	int error;
+
+	context = ds_get_context(task);
+	error = ds_validate_access(context, qual);
+	if (error < 0)
+		goto out;
+
+	base = ds_get(context->ds, qual, ds_buffer_base);
+	end  = ds_get(context->ds, qual, ds_absolute_maximum);
+
+	error = ((end - base) / ds_cfg.sizeof_rec[qual]);
+	if (pos)
+		*pos = error;
+ out:
+	ds_put_context(context);
+	return error;
+}
 
-	if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
-		return -EOPNOTSUPP;
+int ds_get_bts_end(struct task_struct *task, size_t *pos)
+{
+	return ds_get_end(task, pos, ds_bts);
+}
 
-	if (bts_size_in_bytes < 0)
-		return -EINVAL;
+int ds_get_pebs_end(struct task_struct *task, size_t *pos)
+{
+	return ds_get_end(task, pos, ds_pebs);
+}
 
-	bts_size_in_records =
-		bts_size_in_bytes / ds_cfg.sizeof_bts;
-	bts_size_in_bytes =
-		bts_size_in_records * ds_cfg.sizeof_bts;
+static int ds_access(struct task_struct *task, size_t index,
+		     const void **record, enum ds_qualifier qual)
+{
+	struct ds_context *context;
+	unsigned long base, idx;
+	int error;
 
-	if (bts_size_in_bytes <= 0)
+	if (!record)
 		return -EINVAL;
 
-	bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL);
-
-	if (!bts)
-		return -ENOMEM;
+	context = ds_get_context(task);
+	error = ds_validate_access(context, qual);
+	if (error < 0)
+		goto out;
 
-	ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
+	base = ds_get(context->ds, qual, ds_buffer_base);
+	idx = base + (index * ds_cfg.sizeof_rec[qual]);
 
-	if (!ds) {
-		kfree((void *)bts);
-		return -ENOMEM;
-	}
-
-	set_bts_buffer_base(ds, bts);
-	set_bts_index(ds, bts);
-	set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
-	set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
+	error = -EINVAL;
+	if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
+		goto out;
 
-	*dsp = ds;
-	return 0;
+	*record = (const void *)idx;
+	error = ds_cfg.sizeof_rec[qual];
+ out:
+	ds_put_context(context);
+	return error;
 }
 
-int ds_free(void **dsp)
+int ds_access_bts(struct task_struct *task, size_t index, const void **record)
 {
-	if (*dsp) {
-		kfree((void *)get_bts_buffer_base(*dsp));
-		kfree(*dsp);
-		*dsp = NULL;
-	}
-	return 0;
+	return ds_access(task, index, record, ds_bts);
 }
 
-int ds_get_bts_size(void *ds)
+int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
 {
-	int size_in_bytes;
-
-	if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
-		return -EOPNOTSUPP;
-
-	if (!ds)
-		return 0;
-
-	size_in_bytes =
-		get_bts_absolute_maximum(ds) -
-		get_bts_buffer_base(ds);
-	return size_in_bytes;
+	return ds_access(task, index, record, ds_pebs);
 }
 
-int ds_get_bts_end(void *ds)
+static int ds_write(struct task_struct *task, const void *record, size_t size,
+		    enum ds_qualifier qual, int force)
 {
-	int size_in_bytes = ds_get_bts_size(ds);
-
-	if (size_in_bytes <= 0)
-		return size_in_bytes;
+	struct ds_context *context;
+	int error;
 
-	return size_in_bytes / ds_cfg.sizeof_bts;
-}
+	if (!record)
+		return -EINVAL;
 
-int ds_get_bts_index(void *ds)
-{
-	int index_offset_in_bytes;
+	error = -EPERM;
+	context = ds_get_context(task);
+	if (!context)
+		goto out;
 
-	if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
-		return -EOPNOTSUPP;
+	if (!force) {
+		error = ds_validate_access(context, qual);
+		if (error < 0)
+			goto out;
+	}
 
-	index_offset_in_bytes =
-		get_bts_index(ds) -
-		get_bts_buffer_base(ds);
+	error = 0;
+	while (size) {
+		unsigned long base, index, end, write_end, int_th;
+		unsigned long write_size, adj_write_size;
+
+		/*
+		 * write as much as possible without producing an
+		 * overflow interrupt.
+		 *
+		 * interrupt_threshold must either be
+		 * - bigger than absolute_maximum or
+		 * - point to a record between buffer_base and absolute_maximum
+		 *
+		 * index points to a valid record.
+		 */
+		base   = ds_get(context->ds, qual, ds_buffer_base);
+		index  = ds_get(context->ds, qual, ds_index);
+		end    = ds_get(context->ds, qual, ds_absolute_maximum);
+		int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
+
+		write_end = min(end, int_th);
+
+		/* if we are already beyond the interrupt threshold,
+		 * we fill the entire buffer */
+		if (write_end <= index)
+			write_end = end;
+
+		if (write_end <= index)
+			goto out;
+
+		write_size = min((unsigned long) size, write_end - index);
+		memcpy((void *)index, record, write_size);
+
+		record = (const char *)record + write_size;
+		size  -= write_size;
+		error += write_size;
+
+		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
+		adj_write_size *= ds_cfg.sizeof_rec[qual];
+
+		/* zero out trailing bytes */
+		memset((char *)index + write_size, 0,
+		       adj_write_size - write_size);
+		index += adj_write_size;
+
+		if (index >= end)
+			index = base;
+		ds_set(context->ds, qual, ds_index, index);
+
+		if (index >= int_th)
+			ds_overflow(task, context, qual);
+	}
 
-	return index_offset_in_bytes / ds_cfg.sizeof_bts;
+ out:
+	ds_put_context(context);
+	return error;
 }
 
-int ds_set_overflow(void *ds, int method)
+int ds_write_bts(struct task_struct *task, const void *record, size_t size)
 {
-	switch (method) {
-	case DS_O_SIGNAL:
-		return -EOPNOTSUPP;
-	case DS_O_WRAP:
-		return 0;
-	default:
-		return -EINVAL;
-	}
+	return ds_write(task, record, size, ds_bts, /* force = */ 0);
 }
 
-int ds_get_overflow(void *ds)
+int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
 {
-	return DS_O_WRAP;
+	return ds_write(task, record, size, ds_pebs, /* force = */ 0);
 }
 
-int ds_clear(void *ds)
+int ds_unchecked_write_bts(struct task_struct *task,
+			   const void *record, size_t size)
 {
-	int bts_size = ds_get_bts_size(ds);
-	unsigned long bts_base;
-
-	if (bts_size <= 0)
-		return bts_size;
-
-	bts_base = get_bts_buffer_base(ds);
-	memset((void *)bts_base, 0, bts_size);
-
-	set_bts_index(ds, bts_base);
-	return 0;
+	return ds_write(task, record, size, ds_bts, /* force = */ 1);
 }
 
-int ds_read_bts(void *ds, int index, struct bts_struct *out)
+int ds_unchecked_write_pebs(struct task_struct *task,
+			    const void *record, size_t size)
 {
-	void *bts;
+	return ds_write(task, record, size, ds_pebs, /* force = */ 1);
+}
 
-	if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
-		return -EOPNOTSUPP;
+static int ds_reset_or_clear(struct task_struct *task,
+			     enum ds_qualifier qual, int clear)
+{
+	struct ds_context *context;
+	unsigned long base, end;
+	int error;
 
-	if (index < 0)
-		return -EINVAL;
+	context = ds_get_context(task);
+	error = ds_validate_access(context, qual);
+	if (error < 0)
+		goto out;
 
-	if (index >= ds_get_bts_size(ds))
-		return -EINVAL;
+	base = ds_get(context->ds, qual, ds_buffer_base);
+	end  = ds_get(context->ds, qual, ds_absolute_maximum);
 
-	bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts));
+	if (clear)
+		memset((void *)base, 0, end - base);
 
-	memset(out, 0, sizeof(*out));
-	if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
-		out->qualifier       = get_info_type(bts);
-		out->variant.jiffies = get_info_data(bts);
-	} else {
-		out->qualifier = BTS_BRANCH;
-		out->variant.lbr.from_ip = get_from_ip(bts);
-		out->variant.lbr.to_ip   = get_to_ip(bts);
-	}
+	ds_set(context->ds, qual, ds_index, base);
 
-	return sizeof(*out);;
+	error = 0;
+ out:
+	ds_put_context(context);
+	return error;
 }
 
-int ds_write_bts(void *ds, const struct bts_struct *in)
+int ds_reset_bts(struct task_struct *task)
 {
-	unsigned long bts;
-
-	if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
-		return -EOPNOTSUPP;
-
-	if (ds_get_bts_size(ds) <= 0)
-		return -ENXIO;
+	return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
+}
 
-	bts = get_bts_index(ds);
+int ds_reset_pebs(struct task_struct *task)
+{
+	return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
+}
 
-	memset((void *)bts, 0, ds_cfg.sizeof_bts);
-	switch (in->qualifier) {
-	case BTS_INVALID:
-		break;
+int ds_clear_bts(struct task_struct *task)
+{
+	return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
+}
 
-	case BTS_BRANCH:
-		set_from_ip((void *)bts, in->variant.lbr.from_ip);
-		set_to_ip((void *)bts, in->variant.lbr.to_ip);
-		break;
+int ds_clear_pebs(struct task_struct *task)
+{
+	return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
+}
 
-	case BTS_TASK_ARRIVES:
-	case BTS_TASK_DEPARTS:
-		set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS);
-		set_info_type((void *)bts, in->qualifier);
-		set_info_data((void *)bts, in->variant.jiffies);
-		break;
+int ds_get_pebs_reset(struct task_struct *task, u64 *value)
+{
+	struct ds_context *context;
+	int error;
 
-	default:
+	if (!value)
 		return -EINVAL;
-	}
 
-	bts = bts + ds_cfg.sizeof_bts;
-	if (bts >= get_bts_absolute_maximum(ds))
-		bts = get_bts_buffer_base(ds);
-	set_bts_index(ds, bts);
+	context = ds_get_context(task);
+	error = ds_validate_access(context, ds_pebs);
+	if (error < 0)
+		goto out;
 
-	return ds_cfg.sizeof_bts;
+	*value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
+
+	error = 0;
+ out:
+	ds_put_context(context);
+	return error;
 }
 
-unsigned long ds_debugctl_mask(void)
+int ds_set_pebs_reset(struct task_struct *task, u64 value)
 {
-	return ds_cfg.debugctl_mask;
-}
+	struct ds_context *context;
+	int error;
 
-#ifdef __i386__
-static const struct ds_configuration ds_cfg_netburst = {
-	.sizeof_ds = 9 * 4,
-	.bts_buffer_base = { 0, 4 },
-	.bts_index = { 4, 4 },
-	.bts_absolute_maximum = { 8, 4 },
-	.bts_interrupt_threshold = { 12, 4 },
-	.sizeof_bts = 3 * 4,
-	.from_ip = { 0, 4 },
-	.to_ip = { 4, 4 },
-	.info_type = { 4, 1 },
-	.info_data = { 8, 4 },
-	.debugctl_mask = (1<<2)|(1<<3)
-};
+	context = ds_get_context(task);
+	error = ds_validate_access(context, ds_pebs);
+	if (error < 0)
+		goto out;
 
-static const struct ds_configuration ds_cfg_pentium_m = {
-	.sizeof_ds = 9 * 4,
-	.bts_buffer_base = { 0, 4 },
-	.bts_index = { 4, 4 },
-	.bts_absolute_maximum = { 8, 4 },
-	.bts_interrupt_threshold = { 12, 4 },
-	.sizeof_bts = 3 * 4,
-	.from_ip = { 0, 4 },
-	.to_ip = { 4, 4 },
-	.info_type = { 4, 1 },
-	.info_data = { 8, 4 },
-	.debugctl_mask = (1<<6)|(1<<7)
+	*(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
+
+	error = 0;
+ out:
+	ds_put_context(context);
+	return error;
+}
+
+static const struct ds_configuration ds_cfg_var = {
+	.sizeof_ds    = sizeof(long) * 12,
+	.sizeof_field = sizeof(long),
+	.sizeof_rec[ds_bts]   = sizeof(long) * 3,
+	.sizeof_rec[ds_pebs]  = sizeof(long) * 10
 };
-#endif /* _i386_ */
-
-static const struct ds_configuration ds_cfg_core2 = {
-	.sizeof_ds = 9 * 8,
-	.bts_buffer_base = { 0, 8 },
-	.bts_index = { 8, 8 },
-	.bts_absolute_maximum = { 16, 8 },
-	.bts_interrupt_threshold = { 24, 8 },
-	.sizeof_bts = 3 * 8,
-	.from_ip = { 0, 8 },
-	.to_ip = { 8, 8 },
-	.info_type = { 8, 1 },
-	.info_data = { 16, 8 },
-	.debugctl_mask = (1<<6)|(1<<7)|(1<<9)
+static const struct ds_configuration ds_cfg_64 = {
+	.sizeof_ds    = 8 * 12,
+	.sizeof_field = 8,
+	.sizeof_rec[ds_bts]   = 8 * 3,
+	.sizeof_rec[ds_pebs]  = 8 * 10
 };
 
 static inline void
@@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 	switch (c->x86) {
 	case 0x6:
 		switch (c->x86_model) {
-#ifdef __i386__
 		case 0xD:
 		case 0xE: /* Pentium M */
-			ds_configure(&ds_cfg_pentium_m);
+			ds_configure(&ds_cfg_var);
 			break;
-#endif /* _i386_ */
 		case 0xF: /* Core2 */
-			ds_configure(&ds_cfg_core2);
+		case 0x1C: /* Atom */
+			ds_configure(&ds_cfg_64);
 			break;
 		default:
 			/* sorry, don't know about them */
@@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 		break;
 	case 0xF:
 		switch (c->x86_model) {
-#ifdef __i386__
 		case 0x0:
 		case 0x1:
 		case 0x2: /* Netburst */
-			ds_configure(&ds_cfg_netburst);
+			ds_configure(&ds_cfg_var);
 			break;
-#endif /* _i386_ */
 		default:
 			/* sorry, don't know about them */
 			break;
@@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 		break;
 	}
 }
+
+void ds_free(struct ds_context *context)
+{
+	/* This is called when the task owning the parameter context
+	 * is dying. There should not be any user of that context left
+	 * to disturb us, anymore. */
+	unsigned long leftovers = context->count;
+	while (leftovers--)
+		ds_put_context(context);
+}
+#endif /* CONFIG_X86_DS */
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
new file mode 100644
index 000000000000..b3614752197b
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -0,0 +1,449 @@
+/*
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/sysfs.h>
+
+#include <asm/stacktrace.h>
+
+#define STACKSLOTS_PER_LINE 8
+#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
+
+int panic_on_unrecovered_nmi;
+int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
+static unsigned int code_bytes = 64;
+static int die_counter;
+
+void printk_address(unsigned long address, int reliable)
+{
+	printk(" [<%p>] %s%pS\n", (void *) address,
+			reliable ? "" : "? ", (void *) address);
+}
+
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+			void *p, unsigned int size, void *end)
+{
+	void *t = tinfo;
+	if (end) {
+		if (p < end && p >= (end-THREAD_SIZE))
+			return 1;
+		else
+			return 0;
+	}
+	return p > t && p < t + THREAD_SIZE - size;
+}
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+	struct stack_frame *next_frame;
+	unsigned long return_address;
+};
+
+static inline unsigned long
+print_context_stack(struct thread_info *tinfo,
+		unsigned long *stack, unsigned long bp,
+		const struct stacktrace_ops *ops, void *data,
+		unsigned long *end)
+{
+	struct stack_frame *frame = (struct stack_frame *)bp;
+
+	while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
+		unsigned long addr;
+
+		addr = *stack;
+		if (__kernel_text_address(addr)) {
+			if ((unsigned long) stack == bp + sizeof(long)) {
+				ops->address(data, addr, 1);
+				frame = frame->next_frame;
+				bp = (unsigned long) frame;
+			} else {
+				ops->address(data, addr, bp == 0);
+			}
+		}
+		stack++;
+	}
+	return bp;
+}
+
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp,
+		const struct stacktrace_ops *ops, void *data)
+{
+	if (!task)
+		task = current;
+
+	if (!stack) {
+		unsigned long dummy;
+		stack = &dummy;
+		if (task && task != current)
+			stack = (unsigned long *)task->thread.sp;
+	}
+
+#ifdef CONFIG_FRAME_POINTER
+	if (!bp) {
+		if (task == current) {
+			/* Grab bp right from our regs */
+			get_bp(bp);
+		} else {
+			/* bp is the last reg pushed by switch_to */
+			bp = *(unsigned long *) task->thread.sp;
+		}
+	}
+#endif
+
+	for (;;) {
+		struct thread_info *context;
+
+		context = (struct thread_info *)
+			((unsigned long)stack & (~(THREAD_SIZE - 1)));
+		bp = print_context_stack(context, stack, bp, ops, data, NULL);
+
+		stack = (unsigned long *)context->previous_esp;
+		if (!stack)
+			break;
+		if (ops->stack(data, "IRQ") < 0)
+			break;
+		touch_nmi_watchdog();
+	}
+}
+EXPORT_SYMBOL(dump_trace);
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+	printk(data);
+	print_symbol(msg, symbol);
+	printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+	printk("%s%s\n", (char *)data, msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+	printk("%s <%s> ", (char *)data, name);
+	return 0;
+}
+
+/*
+ * Print one address/symbol entries per line.
+ */
+static void print_trace_address(void *data, unsigned long addr, int reliable)
+{
+	touch_nmi_watchdog();
+	printk(data);
+	printk_address(addr, reliable);
+}
+
+static const struct stacktrace_ops print_trace_ops = {
+	.warning = print_trace_warning,
+	.warning_symbol = print_trace_warning_symbol,
+	.stack = print_trace_stack,
+	.address = print_trace_address,
+};
+
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp, char *log_lvl)
+{
+	printk("%sCall Trace:\n", log_lvl);
+	dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+}
+
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp)
+{
+	show_trace_log_lvl(task, regs, stack, bp, "");
+}
+
+static void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *sp, unsigned long bp, char *log_lvl)
+{
+	unsigned long *stack;
+	int i;
+
+	if (sp == NULL) {
+		if (task)
+			sp = (unsigned long *)task->thread.sp;
+		else
+			sp = (unsigned long *)&sp;
+	}
+
+	stack = sp;
+	for (i = 0; i < kstack_depth_to_print; i++) {
+		if (kstack_end(stack))
+			break;
+		if (i && ((i % STACKSLOTS_PER_LINE) == 0))
+			printk("\n%s", log_lvl);
+		printk(" %08lx", *stack++);
+		touch_nmi_watchdog();
+	}
+	printk("\n");
+	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+}
+
+void show_stack(struct task_struct *task, unsigned long *sp)
+{
+	show_stack_log_lvl(task, NULL, sp, 0, "");
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+	unsigned long bp = 0;
+	unsigned long stack;
+
+#ifdef CONFIG_FRAME_POINTER
+	if (!bp)
+		get_bp(bp);
+#endif
+
+	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+		current->pid, current->comm, print_tainted(),
+		init_utsname()->release,
+		(int)strcspn(init_utsname()->version, " "),
+		init_utsname()->version);
+	show_trace(NULL, NULL, &stack, bp);
+}
+
+EXPORT_SYMBOL(dump_stack);
+
+void show_registers(struct pt_regs *regs)
+{
+	int i;
+
+	print_modules();
+	__show_regs(regs, 0);
+
+	printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
+		TASK_COMM_LEN, current->comm, task_pid_nr(current),
+		current_thread_info(), current, task_thread_info(current));
+	/*
+	 * When in-kernel, we also print out the stack and code at the
+	 * time of the fault..
+	 */
+	if (!user_mode_vm(regs)) {
+		unsigned int code_prologue = code_bytes * 43 / 64;
+		unsigned int code_len = code_bytes;
+		unsigned char c;
+		u8 *ip;
+
+		printk(KERN_EMERG "Stack:\n");
+		show_stack_log_lvl(NULL, regs, &regs->sp,
+				0, KERN_EMERG);
+
+		printk(KERN_EMERG "Code: ");
+
+		ip = (u8 *)regs->ip - code_prologue;
+		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
+			/* try starting at IP */
+			ip = (u8 *)regs->ip;
+			code_len = code_len - code_prologue + 1;
+		}
+		for (i = 0; i < code_len; i++, ip++) {
+			if (ip < (u8 *)PAGE_OFFSET ||
+					probe_kernel_address(ip, c)) {
+				printk(" Bad EIP value.");
+				break;
+			}
+			if (ip == (u8 *)regs->ip)
+				printk("<%02x> ", c);
+			else
+				printk("%02x ", c);
+		}
+	}
+	printk("\n");
+}
+
+int is_valid_bugaddr(unsigned long ip)
+{
+	unsigned short ud2;
+
+	if (ip < PAGE_OFFSET)
+		return 0;
+	if (probe_kernel_address((unsigned short *)ip, ud2))
+		return 0;
+
+	return ud2 == 0x0b0f;
+}
+
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned __kprobes long oops_begin(void)
+{
+	unsigned long flags;
+
+	oops_enter();
+
+	if (die_owner != raw_smp_processor_id()) {
+		console_verbose();
+		raw_local_irq_save(flags);
+		__raw_spin_lock(&die_lock);
+		die_owner = smp_processor_id();
+		die_nest_count = 0;
+		bust_spinlocks(1);
+	} else {
+		raw_local_irq_save(flags);
+	}
+	die_nest_count++;
+	return flags;
+}
+
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+	bust_spinlocks(0);
+	die_owner = -1;
+	add_taint(TAINT_DIE);
+	__raw_spin_unlock(&die_lock);
+	raw_local_irq_restore(flags);
+
+	if (!regs)
+		return;
+
+	if (kexec_should_crash(current))
+		crash_kexec(regs);
+	if (in_interrupt())
+		panic("Fatal exception in interrupt");
+	if (panic_on_oops)
+		panic("Fatal exception");
+	oops_exit();
+	do_exit(signr);
+}
+
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+{
+	unsigned short ss;
+	unsigned long sp;
+
+	printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
+#ifdef CONFIG_PREEMPT
+	printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+	printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	printk("DEBUG_PAGEALLOC");
+#endif
+	printk("\n");
+	sysfs_printk_last_file();
+	if (notify_die(DIE_OOPS, str, regs, err,
+			current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+		return 1;
+
+	show_registers(regs);
+	/* Executive summary in case the oops scrolled away */
+	sp = (unsigned long) (&regs->sp);
+	savesegment(ss, ss);
+	if (user_mode(regs)) {
+		sp = regs->sp;
+		ss = regs->ss & 0xffff;
+	}
+	printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
+	print_symbol("%s", regs->ip);
+	printk(" SS:ESP %04x:%08lx\n", ss, sp);
+	return 0;
+}
+
+/*
+ * This is gone through when something in the kernel has done something bad
+ * and is about to be terminated:
+ */
+void die(const char *str, struct pt_regs *regs, long err)
+{
+	unsigned long flags = oops_begin();
+
+	if (die_nest_count < 3) {
+		report_bug(regs->ip, regs);
+
+		if (__die(str, regs, err))
+			regs = NULL;
+	} else {
+		printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
+	}
+
+	oops_end(flags, regs, SIGSEGV);
+}
+
+static DEFINE_SPINLOCK(nmi_print_lock);
+
+void notrace __kprobes
+die_nmi(char *str, struct pt_regs *regs, int do_panic)
+{
+	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
+		return;
+
+	spin_lock(&nmi_print_lock);
+	/*
+	* We are in trouble anyway, lets at least try
+	* to get a message out:
+	*/
+	bust_spinlocks(1);
+	printk(KERN_EMERG "%s", str);
+	printk(" on CPU%d, ip %08lx, registers:\n",
+		smp_processor_id(), regs->ip);
+	show_registers(regs);
+	if (do_panic)
+		panic("Non maskable interrupt");
+	console_silent();
+	spin_unlock(&nmi_print_lock);
+
+	/*
+	 * If we are in kernel we are probably nested up pretty bad
+	 * and might aswell get out now while we still can:
+	 */
+	if (!user_mode_vm(regs)) {
+		current->thread.trap_no = 2;
+		crash_kexec(regs);
+	}
+
+	bust_spinlocks(0);
+	do_exit(SIGSEGV);
+}
+
+static int __init oops_setup(char *s)
+{
+	if (!s)
+		return -EINVAL;
+	if (!strcmp(s, "panic"))
+		panic_on_oops = 1;
+	return 0;
+}
+early_param("oops", oops_setup);
+
+static int __init kstack_setup(char *s)
+{
+	if (!s)
+		return -EINVAL;
+	kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+	return 0;
+}
+early_param("kstack", kstack_setup);
+
+static int __init code_bytes_setup(char *s)
+{
+	code_bytes = simple_strtoul(s, NULL, 0);
+	if (code_bytes > 8192)
+		code_bytes = 8192;
+
+	return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
new file mode 100644
index 000000000000..96a5db7da8a7
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -0,0 +1,575 @@
+/*
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
+ */
+#include <linux/kallsyms.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/hardirq.h>
+#include <linux/kdebug.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/kexec.h>
+#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/sysfs.h>
+
+#include <asm/stacktrace.h>
+
+#define STACKSLOTS_PER_LINE 4
+#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
+
+int panic_on_unrecovered_nmi;
+int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
+static unsigned int code_bytes = 64;
+static int die_counter;
+
+void printk_address(unsigned long address, int reliable)
+{
+	printk(" [<%p>] %s%pS\n", (void *) address,
+			reliable ? "" : "? ", (void *) address);
+}
+
+static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
+					unsigned *usedp, char **idp)
+{
+	static char ids[][8] = {
+		[DEBUG_STACK - 1] = "#DB",
+		[NMI_STACK - 1] = "NMI",
+		[DOUBLEFAULT_STACK - 1] = "#DF",
+		[STACKFAULT_STACK - 1] = "#SS",
+		[MCE_STACK - 1] = "#MC",
+#if DEBUG_STKSZ > EXCEPTION_STKSZ
+		[N_EXCEPTION_STACKS ...
+			N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
+#endif
+	};
+	unsigned k;
+
+	/*
+	 * Iterate over all exception stacks, and figure out whether
+	 * 'stack' is in one of them:
+	 */
+	for (k = 0; k < N_EXCEPTION_STACKS; k++) {
+		unsigned long end = per_cpu(orig_ist, cpu).ist[k];
+		/*
+		 * Is 'stack' above this exception frame's end?
+		 * If yes then skip to the next frame.
+		 */
+		if (stack >= end)
+			continue;
+		/*
+		 * Is 'stack' above this exception frame's start address?
+		 * If yes then we found the right frame.
+		 */
+		if (stack >= end - EXCEPTION_STKSZ) {
+			/*
+			 * Make sure we only iterate through an exception
+			 * stack once. If it comes up for the second time
+			 * then there's something wrong going on - just
+			 * break out and return NULL:
+			 */
+			if (*usedp & (1U << k))
+				break;
+			*usedp |= 1U << k;
+			*idp = ids[k];
+			return (unsigned long *)end;
+		}
+		/*
+		 * If this is a debug stack, and if it has a larger size than
+		 * the usual exception stacks, then 'stack' might still
+		 * be within the lower portion of the debug stack:
+		 */
+#if DEBUG_STKSZ > EXCEPTION_STKSZ
+		if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
+			unsigned j = N_EXCEPTION_STACKS - 1;
+
+			/*
+			 * Black magic. A large debug stack is composed of
+			 * multiple exception stack entries, which we
+			 * iterate through now. Dont look:
+			 */
+			do {
+				++j;
+				end -= EXCEPTION_STKSZ;
+				ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
+			} while (stack < end - EXCEPTION_STKSZ);
+			if (*usedp & (1U << j))
+				break;
+			*usedp |= 1U << j;
+			*idp = ids[j];
+			return (unsigned long *)end;
+		}
+#endif
+	}
+	return NULL;
+}
+
+/*
+ * x86-64 can have up to three kernel stacks:
+ * process stack
+ * interrupt stack
+ * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
+ */
+
+static inline int valid_stack_ptr(struct thread_info *tinfo,
+			void *p, unsigned int size, void *end)
+{
+	void *t = tinfo;
+	if (end) {
+		if (p < end && p >= (end-THREAD_SIZE))
+			return 1;
+		else
+			return 0;
+	}
+	return p > t && p < t + THREAD_SIZE - size;
+}
+
+/* The form of the top of the frame on the stack */
+struct stack_frame {
+	struct stack_frame *next_frame;
+	unsigned long return_address;
+};
+
+static inline unsigned long
+print_context_stack(struct thread_info *tinfo,
+		unsigned long *stack, unsigned long bp,
+		const struct stacktrace_ops *ops, void *data,
+		unsigned long *end)
+{
+	struct stack_frame *frame = (struct stack_frame *)bp;
+
+	while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
+		unsigned long addr;
+
+		addr = *stack;
+		if (__kernel_text_address(addr)) {
+			if ((unsigned long) stack == bp + sizeof(long)) {
+				ops->address(data, addr, 1);
+				frame = frame->next_frame;
+				bp = (unsigned long) frame;
+			} else {
+				ops->address(data, addr, bp == 0);
+			}
+		}
+		stack++;
+	}
+	return bp;
+}
+
+void dump_trace(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp,
+		const struct stacktrace_ops *ops, void *data)
+{
+	const unsigned cpu = get_cpu();
+	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
+	unsigned used = 0;
+	struct thread_info *tinfo;
+
+	if (!task)
+		task = current;
+
+	if (!stack) {
+		unsigned long dummy;
+		stack = &dummy;
+		if (task && task != current)
+			stack = (unsigned long *)task->thread.sp;
+	}
+
+#ifdef CONFIG_FRAME_POINTER
+	if (!bp) {
+		if (task == current) {
+			/* Grab bp right from our regs */
+			get_bp(bp);
+		} else {
+			/* bp is the last reg pushed by switch_to */
+			bp = *(unsigned long *) task->thread.sp;
+		}
+	}
+#endif
+
+	/*
+	 * Print function call entries in all stacks, starting at the
+	 * current stack address. If the stacks consist of nested
+	 * exceptions
+	 */
+	tinfo = task_thread_info(task);
+	for (;;) {
+		char *id;
+		unsigned long *estack_end;
+		estack_end = in_exception_stack(cpu, (unsigned long)stack,
+						&used, &id);
+
+		if (estack_end) {
+			if (ops->stack(data, id) < 0)
+				break;
+
+			bp = print_context_stack(tinfo, stack, bp, ops,
+							data, estack_end);
+			ops->stack(data, "<EOE>");
+			/*
+			 * We link to the next stack via the
+			 * second-to-last pointer (index -2 to end) in the
+			 * exception stack:
+			 */
+			stack = (unsigned long *) estack_end[-2];
+			continue;
+		}
+		if (irqstack_end) {
+			unsigned long *irqstack;
+			irqstack = irqstack_end -
+				(IRQSTACKSIZE - 64) / sizeof(*irqstack);
+
+			if (stack >= irqstack && stack < irqstack_end) {
+				if (ops->stack(data, "IRQ") < 0)
+					break;
+				bp = print_context_stack(tinfo, stack, bp,
+						ops, data, irqstack_end);
+				/*
+				 * We link to the next stack (which would be
+				 * the process stack normally) the last
+				 * pointer (index -1 to end) in the IRQ stack:
+				 */
+				stack = (unsigned long *) (irqstack_end[-1]);
+				irqstack_end = NULL;
+				ops->stack(data, "EOI");
+				continue;
+			}
+		}
+		break;
+	}
+
+	/*
+	 * This handles the process stack:
+	 */
+	bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
+	put_cpu();
+}
+EXPORT_SYMBOL(dump_trace);
+
+static void
+print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+	printk(data);
+	print_symbol(msg, symbol);
+	printk("\n");
+}
+
+static void print_trace_warning(void *data, char *msg)
+{
+	printk("%s%s\n", (char *)data, msg);
+}
+
+static int print_trace_stack(void *data, char *name)
+{
+	printk("%s <%s> ", (char *)data, name);
+	return 0;
+}
+
+/*
+ * Print one address/symbol entries per line.
+ */
+static void print_trace_address(void *data, unsigned long addr, int reliable)
+{
+	touch_nmi_watchdog();
+	printk(data);
+	printk_address(addr, reliable);
+}
+
+static const struct stacktrace_ops print_trace_ops = {
+	.warning = print_trace_warning,
+	.warning_symbol = print_trace_warning_symbol,
+	.stack = print_trace_stack,
+	.address = print_trace_address,
+};
+
+static void
+show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp, char *log_lvl)
+{
+	printk("%sCall Trace:\n", log_lvl);
+	dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+}
+
+void show_trace(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp)
+{
+	show_trace_log_lvl(task, regs, stack, bp, "");
+}
+
+static void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+		unsigned long *sp, unsigned long bp, char *log_lvl)
+{
+	unsigned long *stack;
+	int i;
+	const int cpu = smp_processor_id();
+	unsigned long *irqstack_end =
+		(unsigned long *) (cpu_pda(cpu)->irqstackptr);
+	unsigned long *irqstack =
+		(unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+
+	/*
+	 * debugging aid: "show_stack(NULL, NULL);" prints the
+	 * back trace for this cpu.
+	 */
+
+	if (sp == NULL) {
+		if (task)
+			sp = (unsigned long *)task->thread.sp;
+		else
+			sp = (unsigned long *)&sp;
+	}
+
+	stack = sp;
+	for (i = 0; i < kstack_depth_to_print; i++) {
+		if (stack >= irqstack && stack <= irqstack_end) {
+			if (stack == irqstack_end) {
+				stack = (unsigned long *) (irqstack_end[-1]);
+				printk(" <EOI> ");
+			}
+		} else {
+		if (((long) stack & (THREAD_SIZE-1)) == 0)
+			break;
+		}
+		if (i && ((i % STACKSLOTS_PER_LINE) == 0))
+			printk("\n%s", log_lvl);
+		printk(" %016lx", *stack++);
+		touch_nmi_watchdog();
+	}
+	printk("\n");
+	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+}
+
+void show_stack(struct task_struct *task, unsigned long *sp)
+{
+	show_stack_log_lvl(task, NULL, sp, 0, "");
+}
+
+/*
+ * The architecture-independent dump_stack generator
+ */
+void dump_stack(void)
+{
+	unsigned long bp = 0;
+	unsigned long stack;
+
+#ifdef CONFIG_FRAME_POINTER
+	if (!bp)
+		get_bp(bp);
+#endif
+
+	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+		current->pid, current->comm, print_tainted(),
+		init_utsname()->release,
+		(int)strcspn(init_utsname()->version, " "),
+		init_utsname()->version);
+	show_trace(NULL, NULL, &stack, bp);
+}
+EXPORT_SYMBOL(dump_stack);
+
+void show_registers(struct pt_regs *regs)
+{
+	int i;
+	unsigned long sp;
+	const int cpu = smp_processor_id();
+	struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+
+	sp = regs->sp;
+	printk("CPU %d ", cpu);
+	__show_regs(regs, 1);
+	printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
+		cur->comm, cur->pid, task_thread_info(cur), cur);
+
+	/*
+	 * When in-kernel, we also print out the stack and code at the
+	 * time of the fault..
+	 */
+	if (!user_mode(regs)) {
+		unsigned int code_prologue = code_bytes * 43 / 64;
+		unsigned int code_len = code_bytes;
+		unsigned char c;
+		u8 *ip;
+
+		printk(KERN_EMERG "Stack:\n");
+		show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
+				regs->bp, KERN_EMERG);
+
+		printk(KERN_EMERG "Code: ");
+
+		ip = (u8 *)regs->ip - code_prologue;
+		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
+			/* try starting at IP */
+			ip = (u8 *)regs->ip;
+			code_len = code_len - code_prologue + 1;
+		}
+		for (i = 0; i < code_len; i++, ip++) {
+			if (ip < (u8 *)PAGE_OFFSET ||
+					probe_kernel_address(ip, c)) {
+				printk(" Bad RIP value.");
+				break;
+			}
+			if (ip == (u8 *)regs->ip)
+				printk("<%02x> ", c);
+			else
+				printk("%02x ", c);
+		}
+	}
+	printk("\n");
+}
+
+int is_valid_bugaddr(unsigned long ip)
+{
+	unsigned short ud2;
+
+	if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
+		return 0;
+
+	return ud2 == 0x0b0f;
+}
+
+static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
+static int die_owner = -1;
+static unsigned int die_nest_count;
+
+unsigned __kprobes long oops_begin(void)
+{
+	int cpu;
+	unsigned long flags;
+
+	oops_enter();
+
+	/* racy, but better than risking deadlock. */
+	raw_local_irq_save(flags);
+	cpu = smp_processor_id();
+	if (!__raw_spin_trylock(&die_lock)) {
+		if (cpu == die_owner)
+			/* nested oops. should stop eventually */;
+		else
+			__raw_spin_lock(&die_lock);
+	}
+	die_nest_count++;
+	die_owner = cpu;
+	console_verbose();
+	bust_spinlocks(1);
+	return flags;
+}
+
+void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
+{
+	die_owner = -1;
+	bust_spinlocks(0);
+	die_nest_count--;
+	if (!die_nest_count)
+		/* Nest count reaches zero, release the lock. */
+		__raw_spin_unlock(&die_lock);
+	raw_local_irq_restore(flags);
+	if (!regs) {
+		oops_exit();
+		return;
+	}
+	if (in_interrupt())
+		panic("Fatal exception in interrupt");
+	if (panic_on_oops)
+		panic("Fatal exception");
+	oops_exit();
+	do_exit(signr);
+}
+
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+{
+	printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
+#ifdef CONFIG_PREEMPT
+	printk("PREEMPT ");
+#endif
+#ifdef CONFIG_SMP
+	printk("SMP ");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	printk("DEBUG_PAGEALLOC");
+#endif
+	printk("\n");
+	sysfs_printk_last_file();
+	if (notify_die(DIE_OOPS, str, regs, err,
+			current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
+		return 1;
+
+	show_registers(regs);
+	add_taint(TAINT_DIE);
+	/* Executive summary in case the oops scrolled away */
+	printk(KERN_ALERT "RIP ");
+	printk_address(regs->ip, 1);
+	printk(" RSP <%016lx>\n", regs->sp);
+	if (kexec_should_crash(current))
+		crash_kexec(regs);
+	return 0;
+}
+
+void die(const char *str, struct pt_regs *regs, long err)
+{
+	unsigned long flags = oops_begin();
+
+	if (!user_mode(regs))
+		report_bug(regs->ip, regs);
+
+	if (__die(str, regs, err))
+		regs = NULL;
+	oops_end(flags, regs, SIGSEGV);
+}
+
+notrace __kprobes void
+die_nmi(char *str, struct pt_regs *regs, int do_panic)
+{
+	unsigned long flags;
+
+	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
+		return;
+
+	flags = oops_begin();
+	/*
+	 * We are in trouble anyway, lets at least try
+	 * to get a message out.
+	 */
+	printk(KERN_EMERG "%s", str);
+	printk(" on CPU%d, ip %08lx, registers:\n",
+		smp_processor_id(), regs->ip);
+	show_registers(regs);
+	if (kexec_should_crash(current))
+		crash_kexec(regs);
+	if (do_panic || panic_on_oops)
+		panic("Non maskable interrupt");
+	oops_end(flags, NULL, SIGBUS);
+	nmi_exit();
+	local_irq_enable();
+	do_exit(SIGBUS);
+}
+
+static int __init oops_setup(char *s)
+{
+	if (!s)
+		return -EINVAL;
+	if (!strcmp(s, "panic"))
+		panic_on_oops = 1;
+	return 0;
+}
+early_param("oops", oops_setup);
+
+static int __init kstack_setup(char *s)
+{
+	if (!s)
+		return -EINVAL;
+	kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+	return 0;
+}
+early_param("kstack", kstack_setup);
+
+static int __init code_bytes_setup(char *s)
+{
+	code_bytes = simple_strtoul(s, NULL, 0);
+	if (code_bytes > 8192)
+		code_bytes = 8192;
+
+	return 1;
+}
+__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 28c29180b380..ce97bf3bed12 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -148,6 +148,9 @@ void __init e820_print_map(char *who)
 		case E820_NVS:
 			printk(KERN_CONT "(ACPI NVS)\n");
 			break;
+		case E820_UNUSABLE:
+			printk("(unusable)\n");
+			break;
 		default:
 			printk(KERN_CONT "type %u\n", e820.map[i].type);
 			break;
@@ -877,7 +880,8 @@ void __init early_res_to_bootmem(u64 start, u64 end)
 	for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
 		count++;
 
-	printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count);
+	printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
+			 count, start, end);
 	for (i = 0; i < count; i++) {
 		struct early_res *r = &early_res[i];
 		printk(KERN_INFO "  #%d [%010llx - %010llx] %16s", i,
@@ -1202,7 +1206,7 @@ static int __init parse_memmap_opt(char *p)
 	if (!p)
 		return -EINVAL;
 
-	if (!strcmp(p, "exactmap")) {
+	if (!strncmp(p, "exactmap", 8)) {
 #ifdef CONFIG_CRASH_DUMP
 		/*
 		 * If we are doing a crash dump, we still need to know
@@ -1259,6 +1263,7 @@ static inline const char *e820_type_to_string(int e820_type)
 	case E820_RAM:	return "System RAM";
 	case E820_ACPI:	return "ACPI Tables";
 	case E820_NVS:	return "ACPI Non-volatile Storage";
+	case E820_UNUSABLE:	return "Unusable memory";
 	default:	return "reserved";
 	}
 }
@@ -1266,6 +1271,7 @@ static inline const char *e820_type_to_string(int e820_type)
 /*
  * Mark e820 reserved areas as busy for the resource manager.
  */
+static struct resource __initdata *e820_res;
 void __init e820_reserve_resources(void)
 {
 	int i;
@@ -1273,20 +1279,26 @@ void __init e820_reserve_resources(void)
 	u64 end;
 
 	res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
+	e820_res = res;
 	for (i = 0; i < e820.nr_map; i++) {
 		end = e820.map[i].addr + e820.map[i].size - 1;
-#ifndef CONFIG_RESOURCES_64BIT
-		if (end > 0x100000000ULL) {
+		if (end != (resource_size_t)end) {
 			res++;
 			continue;
 		}
-#endif
 		res->name = e820_type_to_string(e820.map[i].type);
 		res->start = e820.map[i].addr;
 		res->end = end;
 
 		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
-		insert_resource(&iomem_resource, res);
+
+		/*
+		 * don't register the region that could be conflicted with
+		 * pci device BAR resource and insert them later in
+		 * pcibios_resource_survey()
+		 */
+		if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20))
+			insert_resource(&iomem_resource, res);
 		res++;
 	}
 
@@ -1298,10 +1310,18 @@ void __init e820_reserve_resources(void)
 	}
 }
 
-/*
- * Non-standard memory setup can be specified via this quirk:
- */
-char * (*arch_memory_setup_quirk)(void);
+void __init e820_reserve_resources_late(void)
+{
+	int i;
+	struct resource *res;
+
+	res = e820_res;
+	for (i = 0; i < e820.nr_map; i++) {
+		if (!res->parent && res->end)
+			reserve_region_with_split(&iomem_resource, res->start, res->end, res->name);
+		res++;
+	}
+}
 
 char *__init default_machine_specific_memory_setup(void)
 {
@@ -1343,8 +1363,8 @@ char *__init default_machine_specific_memory_setup(void)
 
 char *__init __attribute__((weak)) machine_specific_memory_setup(void)
 {
-	if (arch_memory_setup_quirk) {
-		char *who = arch_memory_setup_quirk();
+	if (x86_quirks->arch_memory_setup) {
+		char *who = x86_quirks->arch_memory_setup();
 
 		if (who)
 			return who;
@@ -1367,24 +1387,3 @@ void __init setup_memory_map(void)
 	printk(KERN_INFO "BIOS-provided physical RAM map:\n");
 	e820_print_map(who);
 }
-
-#ifdef CONFIG_X86_64
-int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
-{
-	int i;
-
-	if (slot < 0 || slot >= e820.nr_map)
-		return -1;
-	for (i = slot; i < e820.nr_map; i++) {
-		if (e820.map[i].type != E820_RAM)
-			continue;
-		break;
-	}
-	if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
-		return -1;
-	*addr = e820.map[i].addr;
-	*size = min_t(u64, e820.map[i].size + e820.map[i].addr,
-		max_pfn << PAGE_SHIFT) - *addr;
-	return i + 1;
-}
-#endif
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a0e11c0cc872..3ce029ffaa55 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -16,10 +16,7 @@
 #include <asm/dma.h>
 #include <asm/io_apic.h>
 #include <asm/apic.h>
-
-#ifdef CONFIG_GART_IOMMU
-#include <asm/gart.h>
-#endif
+#include <asm/iommu.h>
 
 static void __init fix_hypertransport_config(int num, int slot, int func)
 {
@@ -98,6 +95,113 @@ static void __init nvidia_bugs(int num, int slot, int func)
 
 }
 
+#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
+static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
+{
+	u32 d;
+	u8  b;
+
+	b = read_pci_config_byte(num, slot, func, 0xac);
+	b &= ~(1<<5);
+	write_pci_config_byte(num, slot, func, 0xac, b);
+
+	d = read_pci_config(num, slot, func, 0x70);
+	d |= 1<<8;
+	write_pci_config(num, slot, func, 0x70, d);
+
+	d = read_pci_config(num, slot, func, 0x8);
+	d &= 0xff;
+	return d;
+}
+
+static void __init ati_bugs(int num, int slot, int func)
+{
+	u32 d;
+	u8  b;
+
+	if (acpi_use_timer_override)
+		return;
+
+	d = ati_ixp4x0_rev(num, slot, func);
+	if (d  < 0x82)
+		acpi_skip_timer_override = 1;
+	else {
+		/* check for IRQ0 interrupt swap */
+		outb(0x72, 0xcd6); b = inb(0xcd7);
+		if (!(b & 0x2))
+			acpi_skip_timer_override = 1;
+	}
+
+	if (acpi_skip_timer_override) {
+		printk(KERN_INFO "SB4X0 revision 0x%x\n", d);
+		printk(KERN_INFO "Ignoring ACPI timer override.\n");
+		printk(KERN_INFO "If you got timer trouble "
+		       "try acpi_use_timer_override\n");
+	}
+}
+
+static u32 __init ati_sbx00_rev(int num, int slot, int func)
+{
+	u32 old, d;
+
+	d = read_pci_config(num, slot, func, 0x70);
+	old = d;
+	d &= ~(1<<8);
+	write_pci_config(num, slot, func, 0x70, d);
+	d = read_pci_config(num, slot, func, 0x8);
+	d &= 0xff;
+	write_pci_config(num, slot, func, 0x70, old);
+
+	return d;
+}
+
+static void __init ati_bugs_contd(int num, int slot, int func)
+{
+	u32 d, rev;
+
+	if (acpi_use_timer_override)
+		return;
+
+	rev = ati_sbx00_rev(num, slot, func);
+	if (rev > 0x13)
+		return;
+
+	/* check for IRQ0 interrupt swap */
+	d = read_pci_config(num, slot, func, 0x64);
+	if (!(d & (1<<14)))
+		acpi_skip_timer_override = 1;
+
+	if (acpi_skip_timer_override) {
+		printk(KERN_INFO "SB600 revision 0x%x\n", rev);
+		printk(KERN_INFO "Ignoring ACPI timer override.\n");
+		printk(KERN_INFO "If you got timer trouble "
+		       "try acpi_use_timer_override\n");
+	}
+}
+#else
+static void __init ati_bugs(int num, int slot, int func)
+{
+}
+
+static void __init ati_bugs_contd(int num, int slot, int func)
+{
+}
+#endif
+
+#ifdef CONFIG_DMAR
+static void __init intel_g33_dmar(int num, int slot, int func)
+{
+	struct acpi_table_header *dmar_tbl;
+	acpi_status status;
+
+	status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
+	if (ACPI_SUCCESS(status)) {
+		printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
+		dmar_disabled = 1;
+	}
+}
+#endif
+
 #define QFLAG_APPLY_ONCE 	0x1
 #define QFLAG_APPLIED		0x2
 #define QFLAG_DONE		(QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -117,6 +221,14 @@ static struct chipset early_qrk[] __initdata = {
 	  PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
 	{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
 	  PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
+	{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
+	  PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
+	{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
+	  PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
+#ifdef CONFIG_DMAR
+	{ PCI_VENDOR_ID_INTEL, 0x29c0,
+	  PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
+#endif
 	{}
 };
 
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index ff9e7350da54..34ad997d3834 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -3,11 +3,19 @@
 #include <linux/init.h>
 #include <linux/string.h>
 #include <linux/screen_info.h>
+#include <linux/usb/ch9.h>
+#include <linux/pci_regs.h>
+#include <linux/pci_ids.h>
+#include <linux/errno.h>
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/fcntl.h>
 #include <asm/setup.h>
 #include <xen/hvc-console.h>
+#include <asm/pci-direct.h>
+#include <asm/pgtable.h>
+#include <asm/fixmap.h>
+#include <linux/usb/ehci_def.h>
 
 /* Simple VGA output */
 #define VGABASE		(__ISA_IO_base + 0xb8000)
@@ -78,6 +86,7 @@ static int early_serial_base = 0x3f8;  /* ttyS0 */
 static int early_serial_putc(unsigned char ch)
 {
 	unsigned timeout = 0xffff;
+
 	while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
 		cpu_relax();
 	outb(ch, early_serial_base + TXR);
@@ -111,7 +120,7 @@ static __init void early_serial_init(char *s)
 		if (!strncmp(s, "0x", 2)) {
 			early_serial_base = simple_strtoul(s, &e, 16);
 		} else {
-			static int bases[] = { 0x3f8, 0x2f8 };
+			static const int __initconst bases[] = { 0x3f8, 0x2f8 };
 
 			if (!strncmp(s, "ttyS", 4))
 				s += 4;
@@ -151,6 +160,721 @@ static struct console early_serial_console = {
 	.index =	-1,
 };
 
+#ifdef CONFIG_EARLY_PRINTK_DBGP
+
+static struct ehci_caps __iomem *ehci_caps;
+static struct ehci_regs __iomem *ehci_regs;
+static struct ehci_dbg_port __iomem *ehci_debug;
+static unsigned int dbgp_endpoint_out;
+
+struct ehci_dev {
+	u32 bus;
+	u32 slot;
+	u32 func;
+};
+
+static struct ehci_dev ehci_dev;
+
+#define USB_DEBUG_DEVNUM 127
+
+#define DBGP_DATA_TOGGLE	0x8800
+
+static inline u32 dbgp_pid_update(u32 x, u32 tok)
+{
+	return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
+}
+
+static inline u32 dbgp_len_update(u32 x, u32 len)
+{
+	return (x & ~0x0f) | (len & 0x0f);
+}
+
+/*
+ * USB Packet IDs (PIDs)
+ */
+
+/* token */
+#define USB_PID_OUT		0xe1
+#define USB_PID_IN		0x69
+#define USB_PID_SOF		0xa5
+#define USB_PID_SETUP		0x2d
+/* handshake */
+#define USB_PID_ACK		0xd2
+#define USB_PID_NAK		0x5a
+#define USB_PID_STALL		0x1e
+#define USB_PID_NYET		0x96
+/* data */
+#define USB_PID_DATA0		0xc3
+#define USB_PID_DATA1		0x4b
+#define USB_PID_DATA2		0x87
+#define USB_PID_MDATA		0x0f
+/* Special */
+#define USB_PID_PREAMBLE	0x3c
+#define USB_PID_ERR		0x3c
+#define USB_PID_SPLIT		0x78
+#define USB_PID_PING		0xb4
+#define USB_PID_UNDEF_0		0xf0
+
+#define USB_PID_DATA_TOGGLE	0x88
+#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
+
+#define PCI_CAP_ID_EHCI_DEBUG	0xa
+
+#define HUB_ROOT_RESET_TIME	50	/* times are in msec */
+#define HUB_SHORT_RESET_TIME	10
+#define HUB_LONG_RESET_TIME	200
+#define HUB_RESET_TIMEOUT	500
+
+#define DBGP_MAX_PACKET		8
+
+static int dbgp_wait_until_complete(void)
+{
+	u32 ctrl;
+	int loop = 0x100000;
+
+	do {
+		ctrl = readl(&ehci_debug->control);
+		/* Stop when the transaction is finished */
+		if (ctrl & DBGP_DONE)
+			break;
+	} while (--loop > 0);
+
+	if (!loop)
+		return -1;
+
+	/*
+	 * Now that we have observed the completed transaction,
+	 * clear the done bit.
+	 */
+	writel(ctrl | DBGP_DONE, &ehci_debug->control);
+	return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
+}
+
+static void dbgp_mdelay(int ms)
+{
+	int i;
+
+	while (ms--) {
+		for (i = 0; i < 1000; i++)
+			outb(0x1, 0x80);
+	}
+}
+
+static void dbgp_breath(void)
+{
+	/* Sleep to give the debug port a chance to breathe */
+}
+
+static int dbgp_wait_until_done(unsigned ctrl)
+{
+	u32 pids, lpid;
+	int ret;
+	int loop = 3;
+
+retry:
+	writel(ctrl | DBGP_GO, &ehci_debug->control);
+	ret = dbgp_wait_until_complete();
+	pids = readl(&ehci_debug->pids);
+	lpid = DBGP_PID_GET(pids);
+
+	if (ret < 0)
+		return ret;
+
+	/*
+	 * If the port is getting full or it has dropped data
+	 * start pacing ourselves, not necessary but it's friendly.
+	 */
+	if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
+		dbgp_breath();
+
+	/* If I get a NACK reissue the transmission */
+	if (lpid == USB_PID_NAK) {
+		if (--loop > 0)
+			goto retry;
+	}
+
+	return ret;
+}
+
+static void dbgp_set_data(const void *buf, int size)
+{
+	const unsigned char *bytes = buf;
+	u32 lo, hi;
+	int i;
+
+	lo = hi = 0;
+	for (i = 0; i < 4 && i < size; i++)
+		lo |= bytes[i] << (8*i);
+	for (; i < 8 && i < size; i++)
+		hi |= bytes[i] << (8*(i - 4));
+	writel(lo, &ehci_debug->data03);
+	writel(hi, &ehci_debug->data47);
+}
+
+static void dbgp_get_data(void *buf, int size)
+{
+	unsigned char *bytes = buf;
+	u32 lo, hi;
+	int i;
+
+	lo = readl(&ehci_debug->data03);
+	hi = readl(&ehci_debug->data47);
+	for (i = 0; i < 4 && i < size; i++)
+		bytes[i] = (lo >> (8*i)) & 0xff;
+	for (; i < 8 && i < size; i++)
+		bytes[i] = (hi >> (8*(i - 4))) & 0xff;
+}
+
+static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
+			 const char *bytes, int size)
+{
+	u32 pids, addr, ctrl;
+	int ret;
+
+	if (size > DBGP_MAX_PACKET)
+		return -1;
+
+	addr = DBGP_EPADDR(devnum, endpoint);
+
+	pids = readl(&ehci_debug->pids);
+	pids = dbgp_pid_update(pids, USB_PID_OUT);
+
+	ctrl = readl(&ehci_debug->control);
+	ctrl = dbgp_len_update(ctrl, size);
+	ctrl |= DBGP_OUT;
+	ctrl |= DBGP_GO;
+
+	dbgp_set_data(bytes, size);
+	writel(addr, &ehci_debug->address);
+	writel(pids, &ehci_debug->pids);
+
+	ret = dbgp_wait_until_done(ctrl);
+	if (ret < 0)
+		return ret;
+
+	return ret;
+}
+
+static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
+				 int size)
+{
+	u32 pids, addr, ctrl;
+	int ret;
+
+	if (size > DBGP_MAX_PACKET)
+		return -1;
+
+	addr = DBGP_EPADDR(devnum, endpoint);
+
+	pids = readl(&ehci_debug->pids);
+	pids = dbgp_pid_update(pids, USB_PID_IN);
+
+	ctrl = readl(&ehci_debug->control);
+	ctrl = dbgp_len_update(ctrl, size);
+	ctrl &= ~DBGP_OUT;
+	ctrl |= DBGP_GO;
+
+	writel(addr, &ehci_debug->address);
+	writel(pids, &ehci_debug->pids);
+	ret = dbgp_wait_until_done(ctrl);
+	if (ret < 0)
+		return ret;
+
+	if (size > ret)
+		size = ret;
+	dbgp_get_data(data, size);
+	return ret;
+}
+
+static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
+	int value, int index, void *data, int size)
+{
+	u32 pids, addr, ctrl;
+	struct usb_ctrlrequest req;
+	int read;
+	int ret;
+
+	read = (requesttype & USB_DIR_IN) != 0;
+	if (size > (read ? DBGP_MAX_PACKET:0))
+		return -1;
+
+	/* Compute the control message */
+	req.bRequestType = requesttype;
+	req.bRequest = request;
+	req.wValue = cpu_to_le16(value);
+	req.wIndex = cpu_to_le16(index);
+	req.wLength = cpu_to_le16(size);
+
+	pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
+	addr = DBGP_EPADDR(devnum, 0);
+
+	ctrl = readl(&ehci_debug->control);
+	ctrl = dbgp_len_update(ctrl, sizeof(req));
+	ctrl |= DBGP_OUT;
+	ctrl |= DBGP_GO;
+
+	/* Send the setup message */
+	dbgp_set_data(&req, sizeof(req));
+	writel(addr, &ehci_debug->address);
+	writel(pids, &ehci_debug->pids);
+	ret = dbgp_wait_until_done(ctrl);
+	if (ret < 0)
+		return ret;
+
+	/* Read the result */
+	return dbgp_bulk_read(devnum, 0, data, size);
+}
+
+
+/* Find a PCI capability */
+static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
+{
+	u8 pos;
+	int bytes;
+
+	if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
+		PCI_STATUS_CAP_LIST))
+		return 0;
+
+	pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
+	for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
+		u8 id;
+
+		pos &= ~3;
+		id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
+		if (id == 0xff)
+			break;
+		if (id == cap)
+			return pos;
+
+		pos = read_pci_config_byte(num, slot, func,
+						 pos+PCI_CAP_LIST_NEXT);
+	}
+	return 0;
+}
+
+static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
+{
+	u32 class;
+
+	class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
+	if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
+		return 0;
+
+	return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
+}
+
+static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
+{
+	u32 bus, slot, func;
+
+	for (bus = 0; bus < 256; bus++) {
+		for (slot = 0; slot < 32; slot++) {
+			for (func = 0; func < 8; func++) {
+				unsigned cap;
+
+				cap = __find_dbgp(bus, slot, func);
+
+				if (!cap)
+					continue;
+				if (ehci_num-- != 0)
+					continue;
+				*rbus = bus;
+				*rslot = slot;
+				*rfunc = func;
+				return cap;
+			}
+		}
+	}
+	return 0;
+}
+
+static int ehci_reset_port(int port)
+{
+	u32 portsc;
+	u32 delay_time, delay;
+	int loop;
+
+	/* Reset the usb debug port */
+	portsc = readl(&ehci_regs->port_status[port - 1]);
+	portsc &= ~PORT_PE;
+	portsc |= PORT_RESET;
+	writel(portsc, &ehci_regs->port_status[port - 1]);
+
+	delay = HUB_ROOT_RESET_TIME;
+	for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
+	     delay_time += delay) {
+		dbgp_mdelay(delay);
+
+		portsc = readl(&ehci_regs->port_status[port - 1]);
+		if (portsc & PORT_RESET) {
+			/* force reset to complete */
+			loop = 2;
+			writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
+				&ehci_regs->port_status[port - 1]);
+			do {
+				portsc = readl(&ehci_regs->port_status[port-1]);
+			} while ((portsc & PORT_RESET) && (--loop > 0));
+		}
+
+		/* Device went away? */
+		if (!(portsc & PORT_CONNECT))
+			return -ENOTCONN;
+
+		/* bomb out completely if something weird happend */
+		if ((portsc & PORT_CSC))
+			return -EINVAL;
+
+		/* If we've finished resetting, then break out of the loop */
+		if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
+			return 0;
+	}
+	return -EBUSY;
+}
+
+static int ehci_wait_for_port(int port)
+{
+	u32 status;
+	int ret, reps;
+
+	for (reps = 0; reps < 3; reps++) {
+		dbgp_mdelay(100);
+		status = readl(&ehci_regs->status);
+		if (status & STS_PCD) {
+			ret = ehci_reset_port(port);
+			if (ret == 0)
+				return 0;
+		}
+	}
+	return -ENOTCONN;
+}
+
+#ifdef DBGP_DEBUG
+# define dbgp_printk early_printk
+#else
+static inline void dbgp_printk(const char *fmt, ...) { }
+#endif
+
+typedef void (*set_debug_port_t)(int port);
+
+static void default_set_debug_port(int port)
+{
+}
+
+static set_debug_port_t set_debug_port = default_set_debug_port;
+
+static void nvidia_set_debug_port(int port)
+{
+	u32 dword;
+	dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
+				 0x74);
+	dword &= ~(0x0f<<12);
+	dword |= ((port & 0x0f)<<12);
+	write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
+				 dword);
+	dbgp_printk("set debug port to %d\n", port);
+}
+
+static void __init detect_set_debug_port(void)
+{
+	u32 vendorid;
+
+	vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
+		 0x00);
+
+	if ((vendorid & 0xffff) == 0x10de) {
+		dbgp_printk("using nvidia set_debug_port\n");
+		set_debug_port = nvidia_set_debug_port;
+	}
+}
+
+static int __init ehci_setup(void)
+{
+	struct usb_debug_descriptor dbgp_desc;
+	u32 cmd, ctrl, status, portsc, hcs_params;
+	u32 debug_port, new_debug_port = 0, n_ports;
+	u32  devnum;
+	int ret, i;
+	int loop;
+	int port_map_tried;
+	int playtimes = 3;
+
+try_next_time:
+	port_map_tried = 0;
+
+try_next_port:
+
+	hcs_params = readl(&ehci_caps->hcs_params);
+	debug_port = HCS_DEBUG_PORT(hcs_params);
+	n_ports    = HCS_N_PORTS(hcs_params);
+
+	dbgp_printk("debug_port: %d\n", debug_port);
+	dbgp_printk("n_ports:    %d\n", n_ports);
+
+	for (i = 1; i <= n_ports; i++) {
+		portsc = readl(&ehci_regs->port_status[i-1]);
+		dbgp_printk("portstatus%d: %08x\n", i, portsc);
+	}
+
+	if (port_map_tried && (new_debug_port != debug_port)) {
+		if (--playtimes) {
+			set_debug_port(new_debug_port);
+			goto try_next_time;
+		}
+		return -1;
+	}
+
+	loop = 10;
+	/* Reset the EHCI controller */
+	cmd = readl(&ehci_regs->command);
+	cmd |= CMD_RESET;
+	writel(cmd, &ehci_regs->command);
+	do {
+		cmd = readl(&ehci_regs->command);
+	} while ((cmd & CMD_RESET) && (--loop > 0));
+
+	if (!loop) {
+		dbgp_printk("can not reset ehci\n");
+		return -1;
+	}
+	dbgp_printk("ehci reset done\n");
+
+	/* Claim ownership, but do not enable yet */
+	ctrl = readl(&ehci_debug->control);
+	ctrl |= DBGP_OWNER;
+	ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
+	writel(ctrl, &ehci_debug->control);
+
+	/* Start the ehci running */
+	cmd = readl(&ehci_regs->command);
+	cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
+	cmd |= CMD_RUN;
+	writel(cmd, &ehci_regs->command);
+
+	/* Ensure everything is routed to the EHCI */
+	writel(FLAG_CF, &ehci_regs->configured_flag);
+
+	/* Wait until the controller is no longer halted */
+	loop = 10;
+	do {
+		status = readl(&ehci_regs->status);
+	} while ((status & STS_HALT) && (--loop > 0));
+
+	if (!loop) {
+		dbgp_printk("ehci can be started\n");
+		return -1;
+	}
+	dbgp_printk("ehci started\n");
+
+	/* Wait for a device to show up in the debug port */
+	ret = ehci_wait_for_port(debug_port);
+	if (ret < 0) {
+		dbgp_printk("No device found in debug port\n");
+		goto next_debug_port;
+	}
+	dbgp_printk("ehci wait for port done\n");
+
+	/* Enable the debug port */
+	ctrl = readl(&ehci_debug->control);
+	ctrl |= DBGP_CLAIM;
+	writel(ctrl, &ehci_debug->control);
+	ctrl = readl(&ehci_debug->control);
+	if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
+		dbgp_printk("No device in debug port\n");
+		writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
+		goto err;
+	}
+	dbgp_printk("debug ported enabled\n");
+
+	/* Completely transfer the debug device to the debug controller */
+	portsc = readl(&ehci_regs->port_status[debug_port - 1]);
+	portsc &= ~PORT_PE;
+	writel(portsc, &ehci_regs->port_status[debug_port - 1]);
+
+	dbgp_mdelay(100);
+
+	/* Find the debug device and make it device number 127 */
+	for (devnum = 0; devnum <= 127; devnum++) {
+		ret = dbgp_control_msg(devnum,
+			USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+			USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
+			&dbgp_desc, sizeof(dbgp_desc));
+		if (ret > 0)
+			break;
+	}
+	if (devnum > 127) {
+		dbgp_printk("Could not find attached debug device\n");
+		goto err;
+	}
+	if (ret < 0) {
+		dbgp_printk("Attached device is not a debug device\n");
+		goto err;
+	}
+	dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
+
+	/* Move the device to 127 if it isn't already there */
+	if (devnum != USB_DEBUG_DEVNUM) {
+		ret = dbgp_control_msg(devnum,
+			USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+			USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
+		if (ret < 0) {
+			dbgp_printk("Could not move attached device to %d\n",
+				USB_DEBUG_DEVNUM);
+			goto err;
+		}
+		devnum = USB_DEBUG_DEVNUM;
+		dbgp_printk("debug device renamed to 127\n");
+	}
+
+	/* Enable the debug interface */
+	ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
+		USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
+		USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
+	if (ret < 0) {
+		dbgp_printk(" Could not enable the debug device\n");
+		goto err;
+	}
+	dbgp_printk("debug interface enabled\n");
+
+	/* Perform a small write to get the even/odd data state in sync
+	 */
+	ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
+	if (ret < 0) {
+		dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
+		goto err;
+	}
+	dbgp_printk("small write doned\n");
+
+	return 0;
+err:
+	/* Things didn't work so remove my claim */
+	ctrl = readl(&ehci_debug->control);
+	ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
+	writel(ctrl, &ehci_debug->control);
+	return -1;
+
+next_debug_port:
+	port_map_tried |= (1<<(debug_port - 1));
+	new_debug_port = ((debug_port-1+1)%n_ports) + 1;
+	if (port_map_tried != ((1<<n_ports) - 1)) {
+		set_debug_port(new_debug_port);
+		goto try_next_port;
+	}
+	if (--playtimes) {
+		set_debug_port(new_debug_port);
+		goto try_next_time;
+	}
+
+	return -1;
+}
+
+static int __init early_dbgp_init(char *s)
+{
+	u32 debug_port, bar, offset;
+	u32 bus, slot, func, cap;
+	void __iomem *ehci_bar;
+	u32 dbgp_num;
+	u32 bar_val;
+	char *e;
+	int ret;
+	u8 byte;
+
+	if (!early_pci_allowed())
+		return -1;
+
+	dbgp_num = 0;
+	if (*s)
+		dbgp_num = simple_strtoul(s, &e, 10);
+	dbgp_printk("dbgp_num: %d\n", dbgp_num);
+
+	cap = find_dbgp(dbgp_num, &bus, &slot, &func);
+	if (!cap)
+		return -1;
+
+	dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
+			 func);
+
+	debug_port = read_pci_config(bus, slot, func, cap);
+	bar = (debug_port >> 29) & 0x7;
+	bar = (bar * 4) + 0xc;
+	offset = (debug_port >> 16) & 0xfff;
+	dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
+	if (bar != PCI_BASE_ADDRESS_0) {
+		dbgp_printk("only debug ports on bar 1 handled.\n");
+
+		return -1;
+	}
+
+	bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
+	dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
+	if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
+		dbgp_printk("only simple 32bit mmio bars supported\n");
+
+		return -1;
+	}
+
+	/* double check if the mem space is enabled */
+	byte = read_pci_config_byte(bus, slot, func, 0x04);
+	if (!(byte & 0x2)) {
+		byte  |= 0x02;
+		write_pci_config_byte(bus, slot, func, 0x04, byte);
+		dbgp_printk("mmio for ehci enabled\n");
+	}
+
+	/*
+	 * FIXME I don't have the bar size so just guess PAGE_SIZE is more
+	 * than enough.  1K is the biggest I have seen.
+	 */
+	set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
+	ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
+	ehci_bar += bar_val & ~PAGE_MASK;
+	dbgp_printk("ehci_bar: %p\n", ehci_bar);
+
+	ehci_caps  = ehci_bar;
+	ehci_regs  = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
+	ehci_debug = ehci_bar + offset;
+	ehci_dev.bus = bus;
+	ehci_dev.slot = slot;
+	ehci_dev.func = func;
+
+	detect_set_debug_port();
+
+	ret = ehci_setup();
+	if (ret < 0) {
+		dbgp_printk("ehci_setup failed\n");
+		ehci_debug = NULL;
+
+		return -1;
+	}
+
+	return 0;
+}
+
+static void early_dbgp_write(struct console *con, const char *str, u32 n)
+{
+	int chunk, ret;
+
+	if (!ehci_debug)
+		return;
+	while (n > 0) {
+		chunk = n;
+		if (chunk > DBGP_MAX_PACKET)
+			chunk = DBGP_MAX_PACKET;
+		ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
+			dbgp_endpoint_out, str, chunk);
+		str += chunk;
+		n -= chunk;
+	}
+}
+
+static struct console early_dbgp_console = {
+	.name =		"earlydbg",
+	.write =	early_dbgp_write,
+	.flags =	CON_PRINTBUFFER,
+	.index =	-1,
+};
+#endif
+
 /* Console interface to a host file on AMD's SimNow! */
 
 static int simnow_fd;
@@ -165,6 +889,7 @@ enum {
 static noinline long simnow(long cmd, long a, long b, long c)
 {
 	long ret;
+
 	asm volatile("cpuid" :
 		     "=a" (ret) :
 		     "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
@@ -174,6 +899,7 @@ static noinline long simnow(long cmd, long a, long b, long c)
 static void __init simnow_init(char *str)
 {
 	char *fn = "klog";
+
 	if (*str == '=')
 		fn = ++str;
 	/* error ignored */
@@ -194,7 +920,7 @@ static struct console simnow_console = {
 
 /* Direct interface for emergencies */
 static struct console *early_console = &early_vga_console;
-static int early_console_initialized;
+static int __initdata early_console_initialized;
 
 asmlinkage void early_printk(const char *fmt, ...)
 {
@@ -208,10 +934,11 @@ asmlinkage void early_printk(const char *fmt, ...)
 	va_end(ap);
 }
 
-static int __initdata keep_early;
 
 static int __init setup_early_printk(char *buf)
 {
+	int keep_early;
+
 	if (!buf)
 		return 0;
 
@@ -219,8 +946,7 @@ static int __init setup_early_printk(char *buf)
 		return 0;
 	early_console_initialized = 1;
 
-	if (strstr(buf, "keep"))
-		keep_early = 1;
+	keep_early = (strstr(buf, "keep") != NULL);
 
 	if (!strncmp(buf, "serial", 6)) {
 		early_serial_init(buf + 6);
@@ -238,6 +964,17 @@ static int __init setup_early_printk(char *buf)
 		simnow_init(buf + 6);
 		early_console = &simnow_console;
 		keep_early = 1;
+#ifdef CONFIG_EARLY_PRINTK_DBGP
+	} else if (!strncmp(buf, "dbgp", 4)) {
+		if (early_dbgp_init(buf+4) < 0)
+			return 0;
+		early_console = &early_dbgp_console;
+		/*
+		 * usb subsys will reset ehci controller, so don't keep
+		 * that early console
+		 */
+		keep_early = 0;
+#endif
 #ifdef CONFIG_HVC_XEN
 	} else if (!strncmp(buf, "xen", 3)) {
 		early_console = &xenboot_console;
@@ -251,4 +988,5 @@ static int __init setup_early_printk(char *buf)
 	register_console(early_console);
 	return 0;
 }
+
 early_param("earlyprintk", setup_early_printk);
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 06cc8d4254b1..1119d247fe11 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -367,6 +367,10 @@ void __init efi_init(void)
 			efi.smbios = config_tables[i].table;
 			printk(" SMBIOS=0x%lx ", config_tables[i].table);
 		} else if (!efi_guidcmp(config_tables[i].guid,
+					UV_SYSTEM_TABLE_GUID)) {
+			efi.uv_systab = config_tables[i].table;
+			printk(" UVsystab=0x%lx ", config_tables[i].table);
+		} else if (!efi_guidcmp(config_tables[i].guid,
 					HCDP_TABLE_GUID)) {
 			efi.hcdp = config_tables[i].table;
 			printk(" HCDP=0x%lx ", config_tables[i].table);
@@ -414,9 +418,11 @@ void __init efi_init(void)
 	if (memmap.map == NULL)
 		printk(KERN_ERR "Could not map the EFI memory map!\n");
 	memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
+
 	if (memmap.desc_size != sizeof(efi_memory_desc_t))
-		printk(KERN_WARNING "Kernel-defined memdesc"
-		       "doesn't match the one from EFI!\n");
+		printk(KERN_WARNING
+		  "Kernel-defined memdesc doesn't match the one from EFI!\n");
+
 	if (add_efi_memmap)
 		do_add_efi_memmap();
 
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
index 4b63c8e1f13b..5cab48ee61a4 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/kernel/efi_32.c
@@ -53,7 +53,7 @@ void efi_call_phys_prelog(void)
 	 * directory. If I have PAE, I just need to duplicate one entry in
 	 * page directory.
 	 */
-	cr4 = read_cr4();
+	cr4 = read_cr4_safe();
 
 	if (cr4 & X86_CR4_PAE) {
 		efi_bak_pg_dir_pointer[0].pgd =
@@ -91,7 +91,7 @@ void efi_call_phys_epilog(void)
 	gdt_descr.size = GDT_SIZE - 1;
 	load_gdt(&gdt_descr);
 
-	cr4 = read_cr4();
+	cr4 = read_cr4_safe();
 
 	if (cr4 & X86_CR4_PAE) {
 		swapper_pg_dir[pgd_index(0)].pgd =
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 6bc07f0f1202..dd65143941a8 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -54,6 +54,16 @@
 #include <asm/ftrace.h>
 #include <asm/irq_vectors.h>
 
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_LE	   0x40000000
+
+#ifndef CONFIG_AUDITSYSCALL
+#define sysenter_audit	syscall_trace_entry
+#define sysexit_audit	syscall_exit_work
+#endif
+
 /*
  * We use macros for low-level operations which need to be overridden
  * for paravirtualization.  The following will never clobber any registers:
@@ -332,8 +342,9 @@ sysenter_past_esp:
 	GET_THREAD_INFO(%ebp)
 
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
-	jnz syscall_trace_entry
+	testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
+	jnz sysenter_audit
+sysenter_do_call:
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 	call *sys_call_table(,%eax,4)
@@ -343,7 +354,8 @@ sysenter_past_esp:
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx
-	jne syscall_exit_work
+	jne sysexit_audit
+sysenter_exit:
 /* if something modifies registers it must also disable sysexit */
 	movl PT_EIP(%esp), %edx
 	movl PT_OLDESP(%esp), %ecx
@@ -351,6 +363,45 @@ sysenter_past_esp:
 	TRACE_IRQS_ON
 1:	mov  PT_FS(%esp), %fs
 	ENABLE_INTERRUPTS_SYSEXIT
+
+#ifdef CONFIG_AUDITSYSCALL
+sysenter_audit:
+	testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+	jnz syscall_trace_entry
+	addl $4,%esp
+	CFI_ADJUST_CFA_OFFSET -4
+	/* %esi already in 8(%esp)	   6th arg: 4th syscall arg */
+	/* %edx already in 4(%esp)	   5th arg: 3rd syscall arg */
+	/* %ecx already in 0(%esp)	   4th arg: 2nd syscall arg */
+	movl %ebx,%ecx			/* 3rd arg: 1st syscall arg */
+	movl %eax,%edx			/* 2nd arg: syscall number */
+	movl $AUDIT_ARCH_I386,%eax	/* 1st arg: audit arch */
+	call audit_syscall_entry
+	pushl %ebx
+	CFI_ADJUST_CFA_OFFSET 4
+	movl PT_EAX(%esp),%eax		/* reload syscall number */
+	jmp sysenter_do_call
+
+sysexit_audit:
+	testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+	jne syscall_exit_work
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_ANY)
+	movl %eax,%edx		/* second arg, syscall return value */
+	cmpl $0,%eax		/* is it < 0? */
+	setl %al		/* 1 if so, 0 if not */
+	movzbl %al,%eax		/* zero-extend that */
+	inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
+	call audit_syscall_exit
+	DISABLE_INTERRUPTS(CLBR_ANY)
+	TRACE_IRQS_OFF
+	movl TI_flags(%ebp), %ecx
+	testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
+	jne syscall_exit_work
+	movl PT_EAX(%esp),%eax	/* reload syscall return value */
+	jmp sysenter_exit
+#endif
+
 	CFI_ENDPROC
 .pushsection .fixup,"ax"
 2:	movl $0,PT_FS(%esp)
@@ -370,7 +421,7 @@ ENTRY(system_call)
 	GET_THREAD_INFO(%ebp)
 					# system call tracing in operation / emulation
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
+	testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
 	jnz syscall_trace_entry
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
@@ -383,10 +434,6 @@ syscall_exit:
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	TRACE_IRQS_OFF
-	testl $X86_EFLAGS_TF,PT_EFLAGS(%esp)	# If tracing set singlestep flag on exit
-	jz no_singlestep
-	orl $_TIF_SINGLESTEP,TI_flags(%ebp)
-no_singlestep:
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx	# current->work
 	jne syscall_exit_work
@@ -514,12 +561,8 @@ END(work_pending)
 syscall_trace_entry:
 	movl $-ENOSYS,PT_EAX(%esp)
 	movl %esp, %eax
-	xorl %edx,%edx
-	call do_syscall_trace
-	cmpl $0, %eax
-	jne resume_userspace		# ret != 0 -> running under PTRACE_SYSEMU,
-					# so must skip actual syscall
-	movl PT_ORIG_EAX(%esp), %eax
+	call syscall_trace_enter
+	/* What it returned is what we'll actually use.  */
 	cmpl $(nr_syscalls), %eax
 	jnae syscall_call
 	jmp syscall_exit
@@ -528,14 +571,13 @@ END(syscall_trace_entry)
 	# perform syscall exit tracing
 	ALIGN
 syscall_exit_work:
-	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
+	testb $_TIF_WORK_SYSCALL_EXIT, %cl
 	jz work_pending
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_ANY)	# could let do_syscall_trace() call
+	ENABLE_INTERRUPTS(CLBR_ANY)	# could let syscall_trace_leave() call
 					# schedule() instead
 	movl %esp, %eax
-	movl $1, %edx
-	call do_syscall_trace
+	call syscall_trace_leave
 	jmp resume_userspace
 END(syscall_exit_work)
 	CFI_ENDPROC
@@ -587,7 +629,7 @@ ENTRY(interrupt)
 ENTRY(irq_entries_start)
 	RING0_INT_FRAME
 vector=0
-.rept NR_IRQS
+.rept NR_VECTORS
 	ALIGN
  .if vector
 	CFI_ADJUST_CFA_OFFSET -4
@@ -688,6 +730,7 @@ error_code:
 	movl $(__USER_DS), %ecx
 	movl %ecx, %ds
 	movl %ecx, %es
+	TRACE_IRQS_OFF
 	movl %esp,%eax			# pt_regs pointer
 	call *%edi
 	jmp ret_from_exception
@@ -718,20 +761,9 @@ ENTRY(device_not_available)
 	RING0_INT_FRAME
 	pushl $-1			# mark this as an int
 	CFI_ADJUST_CFA_OFFSET 4
-	SAVE_ALL
-	GET_CR0_INTO_EAX
-	testl $0x4, %eax		# EM (math emulation bit)
-	jne device_not_available_emulate
-	preempt_stop(CLBR_ANY)
-	call math_state_restore
-	jmp ret_from_exception
-device_not_available_emulate:
-	pushl $0			# temporary storage for ORIG_EIP
+	pushl $do_device_not_available
 	CFI_ADJUST_CFA_OFFSET 4
-	call math_emulate
-	addl $4, %esp
-	CFI_ADJUST_CFA_OFFSET -4
-	jmp ret_from_exception
+	jmp error_code
 	CFI_ENDPROC
 END(device_not_available)
 
@@ -772,6 +804,7 @@ debug_stack_correct:
 	pushl $-1			# mark this as an int
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
+	TRACE_IRQS_OFF
 	xorl %edx,%edx			# error code 0
 	movl %esp,%eax			# pt_regs pointer
 	call do_debug
@@ -816,6 +849,7 @@ nmi_stack_correct:
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
+	TRACE_IRQS_OFF
 	xorl %edx,%edx		# zero error code
 	movl %esp,%eax		# pt_regs pointer
 	call do_nmi
@@ -856,6 +890,7 @@ nmi_espfix_stack:
 	pushl %eax
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
+	TRACE_IRQS_OFF
 	FIXUP_ESPFIX_STACK		# %eax == %esp
 	xorl %edx,%edx			# zero error code
 	call do_nmi
@@ -886,6 +921,7 @@ KPROBE_ENTRY(int3)
 	pushl $-1			# mark this as an int
 	CFI_ADJUST_CFA_OFFSET 4
 	SAVE_ALL
+	TRACE_IRQS_OFF
 	xorl %edx,%edx		# zero error code
 	movl %esp,%eax		# pt_regs pointer
 	call do_int3
@@ -1024,6 +1060,7 @@ ENDPROC(kernel_thread_helper)
 ENTRY(xen_sysenter_target)
 	RING0_INT_FRAME
 	addl $5*4, %esp		/* remove xen-provided frame */
+	CFI_ADJUST_CFA_OFFSET -5*4
 	jmp sysenter_past_esp
 	CFI_ENDPROC
 
@@ -1116,20 +1153,6 @@ ENDPROC(xen_failsafe_callback)
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 ENTRY(mcount)
-	pushl %eax
-	pushl %ecx
-	pushl %edx
-	movl 0xc(%esp), %eax
-	subl $MCOUNT_INSN_SIZE, %eax
-
-.globl mcount_call
-mcount_call:
-	call ftrace_stub
-
-	popl %edx
-	popl %ecx
-	popl %eax
-
 	ret
 END(mcount)
 
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ae63e584c340..09e7145484c5 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -53,37 +53,17 @@
 #include <asm/paravirt.h>
 #include <asm/ftrace.h>
 
+/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
+#include <linux/elf-em.h>
+#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
+#define __AUDIT_ARCH_64BIT 0x80000000
+#define __AUDIT_ARCH_LE	   0x40000000
+
 	.code64
 
 #ifdef CONFIG_FTRACE
 #ifdef CONFIG_DYNAMIC_FTRACE
 ENTRY(mcount)
-
-	subq $0x38, %rsp
-	movq %rax, (%rsp)
-	movq %rcx, 8(%rsp)
-	movq %rdx, 16(%rsp)
-	movq %rsi, 24(%rsp)
-	movq %rdi, 32(%rsp)
-	movq %r8, 40(%rsp)
-	movq %r9, 48(%rsp)
-
-	movq 0x38(%rsp), %rdi
-	subq $MCOUNT_INSN_SIZE, %rdi
-
-.globl mcount_call
-mcount_call:
-	call ftrace_stub
-
-	movq 48(%rsp), %r9
-	movq 40(%rsp), %r8
-	movq 32(%rsp), %rdi
-	movq 24(%rsp), %rsi
-	movq 16(%rsp), %rdx
-	movq 8(%rsp), %rcx
-	movq (%rsp), %rax
-	addq $0x38, %rsp
-
 	retq
 END(mcount)
 
@@ -269,9 +249,9 @@ ENTRY(native_usergs_sysret64)
 ENTRY(ret_from_fork)
 	CFI_DEFAULT_STACK
 	push kernel_eflags(%rip)
-	CFI_ADJUST_CFA_OFFSET 4
+	CFI_ADJUST_CFA_OFFSET 8
 	popf				# reset kernel eflags
-	CFI_ADJUST_CFA_OFFSET -4
+	CFI_ADJUST_CFA_OFFSET -8
 	call schedule_tail
 	GET_THREAD_INFO(%rcx)
 	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
@@ -349,9 +329,9 @@ ENTRY(system_call_after_swapgs)
 	movq  %rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	GET_THREAD_INFO(%rcx)
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \
-		TI_flags(%rcx)
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
 	jnz tracesys
+system_call_fastpath:
 	cmpq $__NR_syscall_max,%rax
 	ja badsys
 	movq %r10,%rcx
@@ -403,16 +383,16 @@ sysret_careful:
 sysret_signal:
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	testl $_TIF_DO_NOTIFY_MASK,%edx
-	jz    1f
-
-	/* Really a signal */
+#ifdef CONFIG_AUDITSYSCALL
+	bt $TIF_SYSCALL_AUDIT,%edx
+	jc sysret_audit
+#endif
 	/* edx:	work flags (arg3) */
 	leaq do_notify_resume(%rip),%rax
 	leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
 	xorl %esi,%esi # oldset -> arg2
 	call ptregscall_common
-1:	movl $_TIF_WORK_MASK,%edi
+	movl $_TIF_WORK_MASK,%edi
 	/* Use IRET because user could have changed frame. This
 	   works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
 	DISABLE_INTERRUPTS(CLBR_NONE)
@@ -423,14 +403,56 @@ badsys:
 	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
 	jmp ret_from_sys_call
 
+#ifdef CONFIG_AUDITSYSCALL
+	/*
+	 * Fast path for syscall audit without full syscall trace.
+	 * We just call audit_syscall_entry() directly, and then
+	 * jump back to the normal fast path.
+	 */
+auditsys:
+	movq %r10,%r9			/* 6th arg: 4th syscall arg */
+	movq %rdx,%r8			/* 5th arg: 3rd syscall arg */
+	movq %rsi,%rcx			/* 4th arg: 2nd syscall arg */
+	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
+	movq %rax,%rsi			/* 2nd arg: syscall number */
+	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
+	call audit_syscall_entry
+	LOAD_ARGS 0		/* reload call-clobbered registers */
+	jmp system_call_fastpath
+
+	/*
+	 * Return fast path for syscall audit.  Call audit_syscall_exit()
+	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
+	 * masked off.
+	 */
+sysret_audit:
+	movq %rax,%rsi		/* second arg, syscall return value */
+	cmpq $0,%rax		/* is it < 0? */
+	setl %al		/* 1 if so, 0 if not */
+	movzbl %al,%edi		/* zero-extend that into %edi */
+	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
+	call audit_syscall_exit
+	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
+	jmp sysret_check
+#endif	/* CONFIG_AUDITSYSCALL */
+
 	/* Do syscall tracing */
 tracesys:			 
+#ifdef CONFIG_AUDITSYSCALL
+	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
+	jz auditsys
+#endif
 	SAVE_REST
 	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
 	FIXUP_TOP_OF_STACK %rdi
 	movq %rsp,%rdi
 	call syscall_trace_enter
-	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	/*
+	 * Reload arg registers from stack in case ptrace changed them.
+	 * We don't reload %rax because syscall_trace_enter() returned
+	 * the value it wants us to use in the table lookup.
+	 */
+	LOAD_ARGS ARGOFFSET, 1
 	RESTORE_REST
 	cmpq $__NR_syscall_max,%rax
 	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
@@ -444,6 +466,7 @@ tracesys:
  * Has correct top of stack, but partial stack frame.
  */
 	.globl int_ret_from_sys_call
+	.globl int_with_check
 int_ret_from_sys_call:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
@@ -483,7 +506,7 @@ int_very_careful:
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_REST
 	/* Check for syscall exit trace */	
-	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
+	testl $_TIF_WORK_SYSCALL_EXIT,%edx
 	jz int_signal
 	pushq %rdi
 	CFI_ADJUST_CFA_OFFSET 8
@@ -491,7 +514,7 @@ int_very_careful:
 	call syscall_trace_leave
 	popq %rdi
 	CFI_ADJUST_CFA_OFFSET -8
-	andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi
+	andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
 	jmp int_restore_rest
 	
 int_signal:
@@ -618,6 +641,13 @@ END(stub_rt_sigreturn)
 	SAVE_ARGS
 	leaq -ARGOFFSET(%rsp),%rdi	# arg1 for handler
 	pushq %rbp
+	/*
+	 * Save rbp twice: One is for marking the stack frame, as usual, and the
+	 * other, to fill pt_regs properly. This is because bx comes right
+	 * before the last saved register in that structure, and not bp. If the
+	 * base pointer were in the place bx is today, this would not be needed.
+	 */
+	movq %rbp, -8(%rsp)
 	CFI_ADJUST_CFA_OFFSET	8
 	CFI_REL_OFFSET		rbp, 0
 	movq %rsp,%rbp
@@ -883,6 +913,9 @@ END(spurious_interrupt)
 	.if \ist
 	movq	%gs:pda_data_offset, %rbp
 	.endif
+	.if \irqtrace
+	TRACE_IRQS_OFF
+	.endif
 	movq %rsp,%rdi
 	movq ORIG_RAX(%rsp),%rsi
 	movq $-1,ORIG_RAX(%rsp)
@@ -1009,7 +1042,8 @@ KPROBE_ENTRY(error_entry)
 	je  error_kernelspace
 error_swapgs:	
 	SWAPGS
-error_sti:	
+error_sti:
+	TRACE_IRQS_OFF
 	movq %rdi,RDI(%rsp) 	
 	CFI_REL_OFFSET	rdi,RDI
 	movq %rsp,%rdi
@@ -1183,12 +1217,13 @@ ENTRY(simd_coprocessor_error)
 END(simd_coprocessor_error)
 
 ENTRY(device_not_available)
-	zeroentry math_state_restore
+	zeroentry do_device_not_available
 END(device_not_available)
 
 	/* runs on exception stack */
 KPROBE_ENTRY(debug)
  	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8		
 	paranoidentry do_debug, DEBUG_STACK
@@ -1198,6 +1233,7 @@ KPROBE_END(debug)
 	/* runs on exception stack */	
 KPROBE_ENTRY(nmi)
 	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq $-1
 	CFI_ADJUST_CFA_OFFSET 8
 	paranoidentry do_nmi, 0, 0
@@ -1211,6 +1247,7 @@ KPROBE_END(nmi)
 
 KPROBE_ENTRY(int3)
  	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
  	pushq $0
  	CFI_ADJUST_CFA_OFFSET 8
  	paranoidentry do_int3, DEBUG_STACK
@@ -1237,6 +1274,7 @@ END(coprocessor_segment_overrun)
 	/* runs on exception stack */
 ENTRY(double_fault)
 	XCPT_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	paranoidentry do_double_fault
 	jmp paranoid_exit1
 	CFI_ENDPROC
@@ -1253,6 +1291,7 @@ END(segment_not_present)
 	/* runs on exception stack */
 ENTRY(stack_segment)
 	XCPT_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	paranoidentry do_stack_segment
 	jmp paranoid_exit1
 	CFI_ENDPROC
@@ -1278,6 +1317,7 @@ END(spurious_interrupt_bug)
 	/* runs on exception stack */
 ENTRY(machine_check)
 	INTR_FRAME
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	pushq $0
 	CFI_ADJUST_CFA_OFFSET 8	
 	paranoidentry do_machine_check
@@ -1312,3 +1352,103 @@ KPROBE_ENTRY(ignore_sysret)
 	sysret
 	CFI_ENDPROC
 ENDPROC(ignore_sysret)
+
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+	zeroentry xen_do_hypervisor_callback
+END(xen_hypervisor_callback)
+
+/*
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+*/
+ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
+	CFI_STARTPROC
+/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
+   see the correct pointer to the pt_regs */
+	movq %rdi, %rsp            # we don't return, adjust the stack frame
+	CFI_ENDPROC
+	CFI_DEFAULT_STACK
+11:	incl %gs:pda_irqcount
+	movq %rsp,%rbp
+	CFI_DEF_CFA_REGISTER rbp
+	cmovzq %gs:pda_irqstackptr,%rsp
+	pushq %rbp			# backlink for old unwinder
+	call xen_evtchn_do_upcall
+	popq %rsp
+	CFI_DEF_CFA_REGISTER rsp
+	decl %gs:pda_irqcount
+	jmp  error_exit
+	CFI_ENDPROC
+END(do_hypervisor_callback)
+
+/*
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+#  1. Fault while reloading DS, ES, FS or GS
+#  2. Fault while executing IRET
+# Category 1 we do not need to fix up as Xen has already reloaded all segment
+# registers that could be reloaded and zeroed the others.
+# Category 2 we fix up by killing the current process. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by comparing each saved segment register
+# with its current contents: any discrepancy means we in category 1.
+*/
+ENTRY(xen_failsafe_callback)
+	framesz = (RIP-0x30)	/* workaround buggy gas */
+	_frame framesz
+	CFI_REL_OFFSET rcx, 0
+	CFI_REL_OFFSET r11, 8
+	movw %ds,%cx
+	cmpw %cx,0x10(%rsp)
+	CFI_REMEMBER_STATE
+	jne 1f
+	movw %es,%cx
+	cmpw %cx,0x18(%rsp)
+	jne 1f
+	movw %fs,%cx
+	cmpw %cx,0x20(%rsp)
+	jne 1f
+	movw %gs,%cx
+	cmpw %cx,0x28(%rsp)
+	jne 1f
+	/* All segments match their saved values => Category 2 (Bad IRET). */
+	movq (%rsp),%rcx
+	CFI_RESTORE rcx
+	movq 8(%rsp),%r11
+	CFI_RESTORE r11
+	addq $0x30,%rsp
+	CFI_ADJUST_CFA_OFFSET -0x30
+	pushq $0
+	CFI_ADJUST_CFA_OFFSET 8
+	pushq %r11
+	CFI_ADJUST_CFA_OFFSET 8
+	pushq %rcx
+	CFI_ADJUST_CFA_OFFSET 8
+	jmp general_protection
+	CFI_RESTORE_STATE
+1:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
+	movq (%rsp),%rcx
+	CFI_RESTORE rcx
+	movq 8(%rsp),%r11
+	CFI_RESTORE r11
+	addq $0x30,%rsp
+	CFI_ADJUST_CFA_OFFSET -0x30
+	pushq $0
+	CFI_ADJUST_CFA_OFFSET 8
+	SAVE_ALL
+	jmp error_exit
+	CFI_ENDPROC
+END(xen_failsafe_callback)
+
+#endif /* CONFIG_XEN */
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
new file mode 100644
index 000000000000..f454c78fcef6
--- /dev/null
+++ b/arch/x86/kernel/es7000_32.c
@@ -0,0 +1,363 @@
+/*
+ * Written by: Garry Forsgren, Unisys Corporation
+ *             Natalie Protasevich, Unisys Corporation
+ * This file contains the code to configure and interface
+ * with Unisys ES7000 series hardware system manager.
+ *
+ * Copyright (c) 2003 Unisys Corporation.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Unisys Corporation, Township Line & Union Meeting
+ * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
+ *
+ * http://www.unisys.com
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/init.h>
+#include <linux/acpi.h>
+#include <asm/io.h>
+#include <asm/nmi.h>
+#include <asm/smp.h>
+#include <asm/apicdef.h>
+#include <mach_mpparse.h>
+
+/*
+ * ES7000 chipsets
+ */
+
+#define NON_UNISYS		0
+#define ES7000_CLASSIC		1
+#define ES7000_ZORRO		2
+
+
+#define	MIP_REG			1
+#define	MIP_PSAI_REG		4
+
+#define	MIP_BUSY		1
+#define	MIP_SPIN		0xf0000
+#define	MIP_VALID		0x0100000000000000ULL
+#define	MIP_PORT(VALUE)	((VALUE >> 32) & 0xffff)
+
+#define	MIP_RD_LO(VALUE)	(VALUE & 0xffffffff)
+
+struct mip_reg_info {
+	unsigned long long mip_info;
+	unsigned long long delivery_info;
+	unsigned long long host_reg;
+	unsigned long long mip_reg;
+};
+
+struct part_info {
+	unsigned char type;
+	unsigned char length;
+	unsigned char part_id;
+	unsigned char apic_mode;
+	unsigned long snum;
+	char ptype[16];
+	char sname[64];
+	char pname[64];
+};
+
+struct psai {
+	unsigned long long entry_type;
+	unsigned long long addr;
+	unsigned long long bep_addr;
+};
+
+struct es7000_mem_info {
+	unsigned char type;
+	unsigned char length;
+	unsigned char resv[6];
+	unsigned long long  start;
+	unsigned long long  size;
+};
+
+struct es7000_oem_table {
+	unsigned long long hdr;
+	struct mip_reg_info mip;
+	struct part_info pif;
+	struct es7000_mem_info shm;
+	struct psai psai;
+};
+
+#ifdef CONFIG_ACPI
+
+struct oem_table {
+	struct acpi_table_header Header;
+	u32 OEMTableAddr;
+	u32 OEMTableSize;
+};
+
+extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
+extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
+#endif
+
+struct mip_reg {
+	unsigned long long off_0;
+	unsigned long long off_8;
+	unsigned long long off_10;
+	unsigned long long off_18;
+	unsigned long long off_20;
+	unsigned long long off_28;
+	unsigned long long off_30;
+	unsigned long long off_38;
+};
+
+#define	MIP_SW_APIC		0x1020b
+#define	MIP_FUNC(VALUE)		(VALUE & 0xff)
+
+/*
+ * ES7000 Globals
+ */
+
+static volatile unsigned long	*psai = NULL;
+static struct mip_reg		*mip_reg;
+static struct mip_reg		*host_reg;
+static int 			mip_port;
+static unsigned long		mip_addr, host_addr;
+
+int es7000_plat;
+
+/*
+ * GSI override for ES7000 platforms.
+ */
+
+static unsigned int base;
+
+static int
+es7000_rename_gsi(int ioapic, int gsi)
+{
+	if (es7000_plat == ES7000_ZORRO)
+		return gsi;
+
+	if (!base) {
+		int i;
+		for (i = 0; i < nr_ioapics; i++)
+			base += nr_ioapic_registers[i];
+	}
+
+	if (!ioapic && (gsi < 16))
+		gsi += base;
+	return gsi;
+}
+
+void __init
+setup_unisys(void)
+{
+	/*
+	 * Determine the generation of the ES7000 currently running.
+	 *
+	 * es7000_plat = 1 if the machine is a 5xx ES7000 box
+	 * es7000_plat = 2 if the machine is a x86_64 ES7000 box
+	 *
+	 */
+	if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2))
+		es7000_plat = ES7000_ZORRO;
+	else
+		es7000_plat = ES7000_CLASSIC;
+	ioapic_renumber_irq = es7000_rename_gsi;
+}
+
+/*
+ * Parse the OEM Table
+ */
+
+int __init
+parse_unisys_oem (char *oemptr)
+{
+	int                     i;
+	int 			success = 0;
+	unsigned char           type, size;
+	unsigned long           val;
+	char                    *tp = NULL;
+	struct psai             *psaip = NULL;
+	struct mip_reg_info 	*mi;
+	struct mip_reg		*host, *mip;
+
+	tp = oemptr;
+
+	tp += 8;
+
+	for (i=0; i <= 6; i++) {
+		type = *tp++;
+		size = *tp++;
+		tp -= 2;
+		switch (type) {
+		case MIP_REG:
+			mi = (struct mip_reg_info *)tp;
+			val = MIP_RD_LO(mi->host_reg);
+			host_addr = val;
+			host = (struct mip_reg *)val;
+			host_reg = __va(host);
+			val = MIP_RD_LO(mi->mip_reg);
+			mip_port = MIP_PORT(mi->mip_info);
+			mip_addr = val;
+			mip = (struct mip_reg *)val;
+			mip_reg = __va(mip);
+			pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
+				 (unsigned long)host_reg);
+			pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
+				 (unsigned long)mip_reg);
+			success++;
+			break;
+		case MIP_PSAI_REG:
+			psaip = (struct psai *)tp;
+			if (tp != NULL) {
+				if (psaip->addr)
+					psai = __va(psaip->addr);
+				else
+					psai = NULL;
+				success++;
+			}
+			break;
+		default:
+			break;
+		}
+		tp += size;
+	}
+
+	if (success < 2) {
+		es7000_plat = NON_UNISYS;
+	} else
+		setup_unisys();
+	return es7000_plat;
+}
+
+#ifdef CONFIG_ACPI
+static unsigned long oem_addrX;
+static unsigned long oem_size;
+int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
+{
+	struct acpi_table_header *header = NULL;
+	int i = 0;
+	acpi_size tbl_size;
+
+	while (ACPI_SUCCESS(acpi_get_table_with_size("OEM1", i++, &header, &tbl_size))) {
+		if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
+			struct oem_table *t = (struct oem_table *)header;
+
+			oem_addrX = t->OEMTableAddr;
+			oem_size = t->OEMTableSize;
+			early_acpi_os_unmap_memory(header, tbl_size);
+
+			*oem_addr = (unsigned long)__acpi_map_table(oem_addrX,
+								    oem_size);
+			return 0;
+		}
+		early_acpi_os_unmap_memory(header, tbl_size);
+	}
+	return -1;
+}
+
+void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
+{
+	if (!oem_addr)
+		return;
+
+	__acpi_unmap_table((char *)oem_addr, oem_size);
+}
+#endif
+
+static void
+es7000_spin(int n)
+{
+	int i = 0;
+
+	while (i++ < n)
+		rep_nop();
+}
+
+static int __init
+es7000_mip_write(struct mip_reg *mip_reg)
+{
+	int			status = 0;
+	int			spin;
+
+	spin = MIP_SPIN;
+	while (((unsigned long long)host_reg->off_38 &
+		(unsigned long long)MIP_VALID) != 0) {
+			if (--spin <= 0) {
+				printk("es7000_mip_write: Timeout waiting for Host Valid Flag");
+				return -1;
+			}
+		es7000_spin(MIP_SPIN);
+	}
+
+	memcpy(host_reg, mip_reg, sizeof(struct mip_reg));
+	outb(1, mip_port);
+
+	spin = MIP_SPIN;
+
+	while (((unsigned long long)mip_reg->off_38 &
+		(unsigned long long)MIP_VALID) == 0) {
+		if (--spin <= 0) {
+			printk("es7000_mip_write: Timeout waiting for MIP Valid Flag");
+			return -1;
+		}
+		es7000_spin(MIP_SPIN);
+	}
+
+	status = ((unsigned long long)mip_reg->off_0 &
+		(unsigned long long)0xffff0000000000ULL) >> 48;
+	mip_reg->off_38 = ((unsigned long long)mip_reg->off_38 &
+		(unsigned long long)~MIP_VALID);
+	return status;
+}
+
+int
+es7000_start_cpu(int cpu, unsigned long eip)
+{
+	unsigned long vect = 0, psaival = 0;
+
+	if (psai == NULL)
+		return -1;
+
+	vect = ((unsigned long)__pa(eip)/0x1000) << 16;
+	psaival = (0x1000000 | vect | cpu);
+
+	while (*psai & 0x1000000)
+                ;
+
+	*psai = psaival;
+
+	return 0;
+
+}
+
+void __init
+es7000_sw_apic(void)
+{
+	if (es7000_plat) {
+		int mip_status;
+		struct mip_reg es7000_mip_reg;
+
+		printk("ES7000: Enabling APIC mode.\n");
+        	memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
+        	es7000_mip_reg.off_0 = MIP_SW_APIC;
+        	es7000_mip_reg.off_38 = (MIP_VALID);
+        	while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0)
+              		printk("es7000_sw_apic: command failed, status = %x\n",
+				mip_status);
+		return;
+	}
+}
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ab115cd15fdf..d073d981a730 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -11,17 +11,18 @@
 
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
+#include <linux/uaccess.h>
 #include <linux/ftrace.h>
 #include <linux/percpu.h>
 #include <linux/init.h>
 #include <linux/list.h>
 
-#include <asm/alternative.h>
 #include <asm/ftrace.h>
+#include <asm/nops.h>
 
 
 /* Long is fine, even if it is only 4 bytes ;-) */
-static long *ftrace_nop;
+static unsigned long *ftrace_nop;
 
 union ftrace_code_union {
 	char code[MCOUNT_INSN_SIZE];
@@ -60,11 +61,7 @@ notrace int
 ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 		   unsigned char *new_code)
 {
-	unsigned replaced;
-	unsigned old = *(unsigned *)old_code; /* 4 bytes */
-	unsigned new = *(unsigned *)new_code; /* 4 bytes */
-	unsigned char newch = new_code[4];
-	int faulted = 0;
+	unsigned char replaced[MCOUNT_INSN_SIZE];
 
 	/*
 	 * Note: Due to modules and __init, code can
@@ -72,29 +69,20 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
 	 *  as well as code changing.
 	 *
 	 * No real locking needed, this code is run through
-	 * kstop_machine.
+	 * kstop_machine, or before SMP starts.
 	 */
-	asm volatile (
-		"1: lock\n"
-		"   cmpxchg %3, (%2)\n"
-		"   jnz 2f\n"
-		"   movb %b4, 4(%2)\n"
-		"2:\n"
-		".section .fixup, \"ax\"\n"
-		"3:	movl $1, %0\n"
-		"	jmp 2b\n"
-		".previous\n"
-		_ASM_EXTABLE(1b, 3b)
-		: "=r"(faulted), "=a"(replaced)
-		: "r"(ip), "r"(new), "c"(newch),
-		  "0"(faulted), "a"(old)
-		: "memory");
-	sync_core();
+	if (__copy_from_user_inatomic(replaced, (char __user *)ip, MCOUNT_INSN_SIZE))
+		return 1;
+
+	if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
+		return 2;
 
-	if (replaced != old && replaced != new)
-		faulted = 2;
+	WARN_ON_ONCE(__copy_to_user_inatomic((char __user *)ip, new_code,
+				    MCOUNT_INSN_SIZE));
 
-	return faulted;
+	sync_core();
+
+	return 0;
 }
 
 notrace int ftrace_update_ftrace_func(ftrace_func_t func)
@@ -112,30 +100,76 @@ notrace int ftrace_update_ftrace_func(ftrace_func_t func)
 
 notrace int ftrace_mcount_set(unsigned long *data)
 {
-	unsigned long ip = (long)(&mcount_call);
-	unsigned long *addr = data;
-	unsigned char old[MCOUNT_INSN_SIZE], *new;
-
-	/*
-	 * Replace the mcount stub with a pointer to the
-	 * ip recorder function.
-	 */
-	memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
-	new = ftrace_call_replace(ip, *addr);
-	*addr = ftrace_modify_code(ip, old, new);
-
+	/* mcount is initialized as a nop */
+	*data = 0;
 	return 0;
 }
 
 int __init ftrace_dyn_arch_init(void *data)
 {
-	const unsigned char *const *noptable = find_nop_table();
-
-	/* This is running in kstop_machine */
-
-	ftrace_mcount_set(data);
+	extern const unsigned char ftrace_test_p6nop[];
+	extern const unsigned char ftrace_test_nop5[];
+	extern const unsigned char ftrace_test_jmp[];
+	int faulted = 0;
 
-	ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE];
+	/*
+	 * There is no good nop for all x86 archs.
+	 * We will default to using the P6_NOP5, but first we
+	 * will test to make sure that the nop will actually
+	 * work on this CPU. If it faults, we will then
+	 * go to a lesser efficient 5 byte nop. If that fails
+	 * we then just use a jmp as our nop. This isn't the most
+	 * efficient nop, but we can not use a multi part nop
+	 * since we would then risk being preempted in the middle
+	 * of that nop, and if we enabled tracing then, it might
+	 * cause a system crash.
+	 *
+	 * TODO: check the cpuid to determine the best nop.
+	 */
+	asm volatile (
+		"jmp ftrace_test_jmp\n"
+		/* This code needs to stay around */
+		".section .text, \"ax\"\n"
+		"ftrace_test_jmp:"
+		"jmp ftrace_test_p6nop\n"
+		"nop\n"
+		"nop\n"
+		"nop\n"  /* 2 byte jmp + 3 bytes */
+		"ftrace_test_p6nop:"
+		P6_NOP5
+		"jmp 1f\n"
+		"ftrace_test_nop5:"
+		".byte 0x66,0x66,0x66,0x66,0x90\n"
+		"jmp 1f\n"
+		".previous\n"
+		"1:"
+		".section .fixup, \"ax\"\n"
+		"2:	movl $1, %0\n"
+		"	jmp ftrace_test_nop5\n"
+		"3:	movl $2, %0\n"
+		"	jmp 1b\n"
+		".previous\n"
+		_ASM_EXTABLE(ftrace_test_p6nop, 2b)
+		_ASM_EXTABLE(ftrace_test_nop5, 3b)
+		: "=r"(faulted) : "0" (faulted));
+
+	switch (faulted) {
+	case 0:
+		pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n");
+		ftrace_nop = (unsigned long *)ftrace_test_p6nop;
+		break;
+	case 1:
+		pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n");
+		ftrace_nop = (unsigned long *)ftrace_test_nop5;
+		break;
+	case 2:
+		pr_info("ftrace: converting mcount calls to jmp . + 5\n");
+		ftrace_nop = (unsigned long *)ftrace_test_jmp;
+		break;
+	}
+
+	/* The return code is retured via data */
+	*(unsigned long *)data = 0;
 
 	return 0;
 }
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 1fa8be5bd217..6c9bfc9e1e95 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -16,86 +16,63 @@
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/hardirq.h>
+#include <linux/dmar.h>
 
 #include <asm/smp.h>
 #include <asm/ipi.h>
 #include <asm/genapic.h>
 
-#ifdef CONFIG_ACPI
-#include <acpi/acpi_bus.h>
-#endif
-
-DEFINE_PER_CPU(int, x2apic_extra_bits);
+extern struct genapic apic_flat;
+extern struct genapic apic_physflat;
+extern struct genapic apic_x2xpic_uv_x;
+extern struct genapic apic_x2apic_phys;
+extern struct genapic apic_x2apic_cluster;
 
 struct genapic __read_mostly *genapic = &apic_flat;
 
-static enum uv_system_type uv_system_type;
+static struct genapic *apic_probe[] __initdata = {
+	&apic_x2apic_uv_x,
+	&apic_x2apic_phys,
+	&apic_x2apic_cluster,
+	&apic_physflat,
+	NULL,
+};
 
 /*
  * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
  */
 void __init setup_apic_routing(void)
 {
-	if (uv_system_type == UV_NON_UNIQUE_APIC)
-		genapic = &apic_x2apic_uv_x;
-	else
-#ifdef CONFIG_ACPI
-	/*
-	 * Quirk: some x86_64 machines can only use physical APIC mode
-	 * regardless of how many processors are present (x86_64 ES7000
-	 * is an example).
-	 */
-	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
-			(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
-		genapic = &apic_physflat;
-	else
-#endif
-
-	if (max_physical_apicid < 8)
-		genapic = &apic_flat;
-	else
-		genapic = &apic_physflat;
+	if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) {
+		if (!intr_remapping_enabled)
+			genapic = &apic_flat;
+	}
 
-	printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
+	if (genapic == &apic_flat) {
+		if (max_physical_apicid >= 8)
+			genapic = &apic_physflat;
+		printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
+	}
 }
 
 /* Same for both flat and physical. */
 
-void send_IPI_self(int vector)
+void apic_send_IPI_self(int vector)
 {
 	__send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
 }
 
 int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
-	if (!strcmp(oem_id, "SGI")) {
-		if (!strcmp(oem_table_id, "UVL"))
-			uv_system_type = UV_LEGACY_APIC;
-		else if (!strcmp(oem_table_id, "UVX"))
-			uv_system_type = UV_X2APIC;
-		else if (!strcmp(oem_table_id, "UVH"))
-			uv_system_type = UV_NON_UNIQUE_APIC;
+	int i;
+
+	for (i = 0; apic_probe[i]; ++i) {
+		if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
+			genapic = apic_probe[i];
+			printk(KERN_INFO "Setting APIC routing to %s.\n",
+				genapic->name);
+			return 1;
+		}
 	}
 	return 0;
 }
-
-unsigned int read_apic_id(void)
-{
-	unsigned int id;
-
-	WARN_ON(preemptible() && num_online_cpus() > 1);
-	id = apic_read(APIC_ID);
-	if (uv_system_type >= UV_X2APIC)
-		id  |= __get_cpu_var(x2apic_extra_bits);
-	return id;
-}
-
-enum uv_system_type get_uv_system_type(void)
-{
-	return uv_system_type;
-}
-
-int is_uv_system(void)
-{
-	return uv_system_type != UV_NONE;
-}
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 1a9c68845ee8..c0262791bda4 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -15,9 +15,20 @@
 #include <linux/kernel.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
+#include <linux/hardirq.h>
 #include <asm/smp.h>
 #include <asm/ipi.h>
 #include <asm/genapic.h>
+#include <mach_apicdef.h>
+
+#ifdef CONFIG_ACPI
+#include <acpi/acpi_bus.h>
+#endif
+
+static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	return 1;
+}
 
 static cpumask_t flat_target_cpus(void)
 {
@@ -95,9 +106,33 @@ static void flat_send_IPI_all(int vector)
 		__send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
 }
 
+static unsigned int get_apic_id(unsigned long x)
+{
+	unsigned int id;
+
+	id = (((x)>>24) & 0xFFu);
+	return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+	unsigned long x;
+
+	x = ((id & 0xFFu)<<24);
+	return x;
+}
+
+static unsigned int read_xapic_id(void)
+{
+	unsigned int id;
+
+	id = get_apic_id(apic_read(APIC_ID));
+	return id;
+}
+
 static int flat_apic_id_registered(void)
 {
-	return physid_isset(GET_APIC_ID(read_apic_id()), phys_cpu_present_map);
+	return physid_isset(read_xapic_id(), phys_cpu_present_map);
 }
 
 static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
@@ -112,6 +147,7 @@ static unsigned int phys_pkg_id(int index_msb)
 
 struct genapic apic_flat =  {
 	.name = "flat",
+	.acpi_madt_oem_check = flat_acpi_madt_oem_check,
 	.int_delivery_mode = dest_LowestPrio,
 	.int_dest_mode = (APIC_DEST_LOGICAL != 0),
 	.target_cpus = flat_target_cpus,
@@ -121,8 +157,12 @@ struct genapic apic_flat =  {
 	.send_IPI_all = flat_send_IPI_all,
 	.send_IPI_allbutself = flat_send_IPI_allbutself,
 	.send_IPI_mask = flat_send_IPI_mask,
+	.send_IPI_self = apic_send_IPI_self,
 	.cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
 	.phys_pkg_id = phys_pkg_id,
+	.get_apic_id = get_apic_id,
+	.set_apic_id = set_apic_id,
+	.apic_id_mask = (0xFFu<<24),
 };
 
 /*
@@ -130,6 +170,23 @@ struct genapic apic_flat =  {
  * We cannot use logical delivery in this case because the mask
  * overflows, so use physical mode.
  */
+static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+#ifdef CONFIG_ACPI
+	/*
+	 * Quirk: some x86_64 machines can only use physical APIC mode
+	 * regardless of how many processors are present (x86_64 ES7000
+	 * is an example).
+	 */
+	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
+		(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
+		printk(KERN_DEBUG "system APIC only can use physical flat");
+		return 1;
+	}
+#endif
+
+	return 0;
+}
 
 static cpumask_t physflat_target_cpus(void)
 {
@@ -168,7 +225,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
 	 * May as well be the first.
 	 */
 	cpu = first_cpu(cpumask);
-	if ((unsigned)cpu < NR_CPUS)
+	if ((unsigned)cpu < nr_cpu_ids)
 		return per_cpu(x86_cpu_to_apicid, cpu);
 	else
 		return BAD_APICID;
@@ -176,6 +233,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
 
 struct genapic apic_physflat =  {
 	.name = "physical flat",
+	.acpi_madt_oem_check = physflat_acpi_madt_oem_check,
 	.int_delivery_mode = dest_Fixed,
 	.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
 	.target_cpus = physflat_target_cpus,
@@ -185,6 +243,10 @@ struct genapic apic_physflat =  {
 	.send_IPI_all = physflat_send_IPI_all,
 	.send_IPI_allbutself = physflat_send_IPI_allbutself,
 	.send_IPI_mask = physflat_send_IPI_mask,
+	.send_IPI_self = apic_send_IPI_self,
 	.cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
 	.phys_pkg_id = phys_pkg_id,
+	.get_apic_id = get_apic_id,
+	.set_apic_id = set_apic_id,
+	.apic_id_mask = (0xFFu<<24),
 };
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
new file mode 100644
index 000000000000..f6a2c8eb48a6
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -0,0 +1,159 @@
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/dmar.h>
+
+#include <asm/smp.h>
+#include <asm/ipi.h>
+#include <asm/genapic.h>
+
+DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	if (cpu_has_x2apic)
+		return 1;
+
+	return 0;
+}
+
+/* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
+
+static cpumask_t x2apic_target_cpus(void)
+{
+	return cpumask_of_cpu(0);
+}
+
+/*
+ * for now each logical cpu is in its own vector allocation domain.
+ */
+static cpumask_t x2apic_vector_allocation_domain(int cpu)
+{
+	cpumask_t domain = CPU_MASK_NONE;
+	cpu_set(cpu, domain);
+	return domain;
+}
+
+static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
+				   unsigned int dest)
+{
+	unsigned long cfg;
+
+	cfg = __prepare_ICR(0, vector, dest);
+
+	/*
+	 * send the IPI.
+	 */
+	x2apic_icr_write(cfg, apicid);
+}
+
+/*
+ * for now, we send the IPI's one by one in the cpumask.
+ * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
+ * at once. We have 16 cpu's in a cluster. This will minimize IPI register
+ * writes.
+ */
+static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
+{
+	unsigned long flags;
+	unsigned long query_cpu;
+
+	local_irq_save(flags);
+	for_each_cpu_mask(query_cpu, mask) {
+		__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu),
+				       vector, APIC_DEST_LOGICAL);
+	}
+	local_irq_restore(flags);
+}
+
+static void x2apic_send_IPI_allbutself(int vector)
+{
+	cpumask_t mask = cpu_online_map;
+
+	cpu_clear(smp_processor_id(), mask);
+
+	if (!cpus_empty(mask))
+		x2apic_send_IPI_mask(mask, vector);
+}
+
+static void x2apic_send_IPI_all(int vector)
+{
+	x2apic_send_IPI_mask(cpu_online_map, vector);
+}
+
+static int x2apic_apic_id_registered(void)
+{
+	return 1;
+}
+
+static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+	int cpu;
+
+	/*
+	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
+	 * May as well be the first.
+	 */
+	cpu = first_cpu(cpumask);
+	if ((unsigned)cpu < NR_CPUS)
+		return per_cpu(x86_cpu_to_logical_apicid, cpu);
+	else
+		return BAD_APICID;
+}
+
+static unsigned int get_apic_id(unsigned long x)
+{
+	unsigned int id;
+
+	id = x;
+	return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+	unsigned long x;
+
+	x = id;
+	return x;
+}
+
+static unsigned int phys_pkg_id(int index_msb)
+{
+	return current_cpu_data.initial_apicid >> index_msb;
+}
+
+static void x2apic_send_IPI_self(int vector)
+{
+	apic_write(APIC_SELF_IPI, vector);
+}
+
+static void init_x2apic_ldr(void)
+{
+	int cpu = smp_processor_id();
+
+	per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
+	return;
+}
+
+struct genapic apic_x2apic_cluster = {
+	.name = "cluster x2apic",
+	.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+	.int_delivery_mode = dest_LowestPrio,
+	.int_dest_mode = (APIC_DEST_LOGICAL != 0),
+	.target_cpus = x2apic_target_cpus,
+	.vector_allocation_domain = x2apic_vector_allocation_domain,
+	.apic_id_registered = x2apic_apic_id_registered,
+	.init_apic_ldr = init_x2apic_ldr,
+	.send_IPI_all = x2apic_send_IPI_all,
+	.send_IPI_allbutself = x2apic_send_IPI_allbutself,
+	.send_IPI_mask = x2apic_send_IPI_mask,
+	.send_IPI_self = x2apic_send_IPI_self,
+	.cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+	.phys_pkg_id = phys_pkg_id,
+	.get_apic_id = get_apic_id,
+	.set_apic_id = set_apic_id,
+	.apic_id_mask = (0xFFFFFFFFu),
+};
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
new file mode 100644
index 000000000000..d042211768b7
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -0,0 +1,154 @@
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/dmar.h>
+
+#include <asm/smp.h>
+#include <asm/ipi.h>
+#include <asm/genapic.h>
+
+static int x2apic_phys;
+
+static int set_x2apic_phys_mode(char *arg)
+{
+	x2apic_phys = 1;
+	return 0;
+}
+early_param("x2apic_phys", set_x2apic_phys_mode);
+
+static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	if (cpu_has_x2apic && x2apic_phys)
+		return 1;
+
+	return 0;
+}
+
+/* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
+
+static cpumask_t x2apic_target_cpus(void)
+{
+	return cpumask_of_cpu(0);
+}
+
+static cpumask_t x2apic_vector_allocation_domain(int cpu)
+{
+	cpumask_t domain = CPU_MASK_NONE;
+	cpu_set(cpu, domain);
+	return domain;
+}
+
+static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
+				   unsigned int dest)
+{
+	unsigned long cfg;
+
+	cfg = __prepare_ICR(0, vector, dest);
+
+	/*
+	 * send the IPI.
+	 */
+	x2apic_icr_write(cfg, apicid);
+}
+
+static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
+{
+	unsigned long flags;
+	unsigned long query_cpu;
+
+	local_irq_save(flags);
+	for_each_cpu_mask(query_cpu, mask) {
+		__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
+				       vector, APIC_DEST_PHYSICAL);
+	}
+	local_irq_restore(flags);
+}
+
+static void x2apic_send_IPI_allbutself(int vector)
+{
+	cpumask_t mask = cpu_online_map;
+
+	cpu_clear(smp_processor_id(), mask);
+
+	if (!cpus_empty(mask))
+		x2apic_send_IPI_mask(mask, vector);
+}
+
+static void x2apic_send_IPI_all(int vector)
+{
+	x2apic_send_IPI_mask(cpu_online_map, vector);
+}
+
+static int x2apic_apic_id_registered(void)
+{
+	return 1;
+}
+
+static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+	int cpu;
+
+	/*
+	 * We're using fixed IRQ delivery, can only return one phys APIC ID.
+	 * May as well be the first.
+	 */
+	cpu = first_cpu(cpumask);
+	if ((unsigned)cpu < NR_CPUS)
+		return per_cpu(x86_cpu_to_apicid, cpu);
+	else
+		return BAD_APICID;
+}
+
+static unsigned int get_apic_id(unsigned long x)
+{
+	unsigned int id;
+
+	id = x;
+	return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+	unsigned long x;
+
+	x = id;
+	return x;
+}
+
+static unsigned int phys_pkg_id(int index_msb)
+{
+	return current_cpu_data.initial_apicid >> index_msb;
+}
+
+void x2apic_send_IPI_self(int vector)
+{
+	apic_write(APIC_SELF_IPI, vector);
+}
+
+void init_x2apic_ldr(void)
+{
+	return;
+}
+
+struct genapic apic_x2apic_phys = {
+	.name = "physical x2apic",
+	.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
+	.int_delivery_mode = dest_Fixed,
+	.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
+	.target_cpus = x2apic_target_cpus,
+	.vector_allocation_domain = x2apic_vector_allocation_domain,
+	.apic_id_registered = x2apic_apic_id_registered,
+	.init_apic_ldr = init_x2apic_ldr,
+	.send_IPI_all = x2apic_send_IPI_all,
+	.send_IPI_allbutself = x2apic_send_IPI_allbutself,
+	.send_IPI_mask = x2apic_send_IPI_mask,
+	.send_IPI_self = x2apic_send_IPI_self,
+	.cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
+	.phys_pkg_id = phys_pkg_id,
+	.get_apic_id = get_apic_id,
+	.set_apic_id = set_apic_id,
+	.apic_id_mask = (0xFFFFFFFFu),
+};
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 711f11c30b06..680a06557c5e 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -12,18 +12,49 @@
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <linux/string.h>
-#include <linux/kernel.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/bootmem.h>
 #include <linux/module.h>
+#include <linux/hardirq.h>
 #include <asm/smp.h>
 #include <asm/ipi.h>
 #include <asm/genapic.h>
 #include <asm/pgtable.h>
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_hub.h>
+#include <asm/uv/bios.h>
+
+DEFINE_PER_CPU(int, x2apic_extra_bits);
+
+static enum uv_system_type uv_system_type;
+
+static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	if (!strcmp(oem_id, "SGI")) {
+		if (!strcmp(oem_table_id, "UVL"))
+			uv_system_type = UV_LEGACY_APIC;
+		else if (!strcmp(oem_table_id, "UVX"))
+			uv_system_type = UV_X2APIC;
+		else if (!strcmp(oem_table_id, "UVH")) {
+			uv_system_type = UV_NON_UNIQUE_APIC;
+			return 1;
+		}
+	}
+	return 0;
+}
+
+enum uv_system_type get_uv_system_type(void)
+{
+	return uv_system_type;
+}
+
+int is_uv_system(void)
+{
+	return uv_system_type != UV_NONE;
+}
+EXPORT_SYMBOL_GPL(is_uv_system);
 
 DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
@@ -40,6 +71,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
 short uv_possible_blades;
 EXPORT_SYMBOL_GPL(uv_possible_blades);
 
+unsigned long sn_rtc_cycles_per_second;
+EXPORT_SYMBOL(sn_rtc_cycles_per_second);
+
 /* Start with all IRQs pointing to boot CPU.  IRQ balancing will shift them. */
 
 static cpumask_t uv_target_cpus(void)
@@ -80,7 +114,7 @@ static void uv_send_IPI_one(int cpu, int vector)
 	unsigned long val, apicid, lapicid;
 	int pnode;
 
-	apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */
+	apicid = per_cpu(x86_cpu_to_apicid, cpu);
 	lapicid = apicid & 0x3f;		/* ZZZ macro needed */
 	pnode = uv_apicid_to_pnode(apicid);
 	val =
@@ -94,7 +128,7 @@ static void uv_send_IPI_mask(cpumask_t mask, int vector)
 {
 	unsigned int cpu;
 
-	for (cpu = 0; cpu < NR_CPUS; ++cpu)
+	for_each_possible_cpu(cpu)
 		if (cpu_isset(cpu, mask))
 			uv_send_IPI_one(cpu, vector);
 }
@@ -119,6 +153,10 @@ static int uv_apic_id_registered(void)
 	return 1;
 }
 
+static void uv_init_apic_ldr(void)
+{
+}
+
 static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
 {
 	int cpu;
@@ -128,37 +166,65 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
 	 * May as well be the first.
 	 */
 	cpu = first_cpu(cpumask);
-	if ((unsigned)cpu < NR_CPUS)
+	if ((unsigned)cpu < nr_cpu_ids)
 		return per_cpu(x86_cpu_to_apicid, cpu);
 	else
 		return BAD_APICID;
 }
 
+static unsigned int get_apic_id(unsigned long x)
+{
+	unsigned int id;
+
+	WARN_ON(preemptible() && num_online_cpus() > 1);
+	id = x | __get_cpu_var(x2apic_extra_bits);
+
+	return id;
+}
+
+static unsigned long set_apic_id(unsigned int id)
+{
+	unsigned long x;
+
+	/* maskout x2apic_extra_bits ? */
+	x = id;
+	return x;
+}
+
+static unsigned int uv_read_apic_id(void)
+{
+
+	return get_apic_id(apic_read(APIC_ID));
+}
+
 static unsigned int phys_pkg_id(int index_msb)
 {
-	return GET_APIC_ID(read_apic_id()) >> index_msb;
+	return uv_read_apic_id() >> index_msb;
 }
 
-#ifdef ZZZ		/* Needs x2apic patch */
 static void uv_send_IPI_self(int vector)
 {
 	apic_write(APIC_SELF_IPI, vector);
 }
-#endif
 
 struct genapic apic_x2apic_uv_x = {
 	.name = "UV large system",
+	.acpi_madt_oem_check = uv_acpi_madt_oem_check,
 	.int_delivery_mode = dest_Fixed,
 	.int_dest_mode = (APIC_DEST_PHYSICAL != 0),
 	.target_cpus = uv_target_cpus,
-	.vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */
+	.vector_allocation_domain = uv_vector_allocation_domain,
 	.apic_id_registered = uv_apic_id_registered,
+	.init_apic_ldr = uv_init_apic_ldr,
 	.send_IPI_all = uv_send_IPI_all,
 	.send_IPI_allbutself = uv_send_IPI_allbutself,
 	.send_IPI_mask = uv_send_IPI_mask,
-	/* ZZZ.send_IPI_self = uv_send_IPI_self, */
+	.send_IPI_self = uv_send_IPI_self,
 	.cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
-	.phys_pkg_id = phys_pkg_id,	/* Fixme ZZZ */
+	.phys_pkg_id = phys_pkg_id,
+	.get_apic_id = get_apic_id,
+	.set_apic_id = set_apic_id,
+	.apic_id_mask = (0xFFFFFFFFu),
 };
 
 static __cpuinit void set_x2apic_extra_bits(int pnode)
@@ -218,12 +284,13 @@ static __init void map_low_mmrs(void)
 
 enum map_type {map_wb, map_uc};
 
-static void map_high(char *id, unsigned long base, int shift, enum map_type map_type)
+static __init void map_high(char *id, unsigned long base, int shift,
+			    int max_pnode, enum map_type map_type)
 {
 	unsigned long bytes, paddr;
 
 	paddr = base << shift;
-	bytes = (1UL << shift);
+	bytes = (1UL << shift) * (max_pnode + 1);
 	printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
 	       					paddr + bytes);
 	if (map_type == map_uc)
@@ -239,7 +306,7 @@ static __init void map_gru_high(int max_pnode)
 
 	gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
 	if (gru.s.enable)
-		map_high("GRU", gru.s.base, shift, map_wb);
+		map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
 }
 
 static __init void map_config_high(int max_pnode)
@@ -249,7 +316,7 @@ static __init void map_config_high(int max_pnode)
 
 	cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
 	if (cfg.s.enable)
-		map_high("CONFIG", cfg.s.base, shift, map_uc);
+		map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc);
 }
 
 static __init void map_mmr_high(int max_pnode)
@@ -259,7 +326,7 @@ static __init void map_mmr_high(int max_pnode)
 
 	mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
 	if (mmr.s.enable)
-		map_high("MMR", mmr.s.base, shift, map_uc);
+		map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
 }
 
 static __init void map_mmioh_high(int max_pnode)
@@ -269,10 +336,44 @@ static __init void map_mmioh_high(int max_pnode)
 
 	mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
 	if (mmioh.s.enable)
-		map_high("MMIOH", mmioh.s.base, shift, map_uc);
+		map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
+}
+
+static __init void uv_rtc_init(void)
+{
+	long status;
+	u64 ticks_per_sec;
+
+	status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
+					&ticks_per_sec);
+	if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) {
+		printk(KERN_WARNING
+			"unable to determine platform RTC clock frequency, "
+			"guessing.\n");
+		/* BIOS gives wrong value for clock freq. so guess */
+		sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
+	} else
+		sn_rtc_cycles_per_second = ticks_per_sec;
 }
 
-static __init void uv_system_init(void)
+/*
+ * Called on each cpu to initialize the per_cpu UV data area.
+ * 	ZZZ hotplug not supported yet
+ */
+void __cpuinit uv_cpu_init(void)
+{
+	/* CPU 0 initilization will be done via uv_system_init. */
+	if (!uv_blade_info)
+		return;
+
+	uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
+
+	if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
+		set_x2apic_extra_bits(uv_hub_info->pnode);
+}
+
+
+void __init uv_system_init(void)
 {
 	union uvh_si_addr_map_config_u m_n_config;
 	union uvh_node_id_u node_id;
@@ -326,6 +427,11 @@ static __init void uv_system_init(void)
 	gnode_upper = (((unsigned long)node_id.s.node_id) &
 		       ~((1 << n_val) - 1)) << m_val;
 
+	uv_bios_init();
+	uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
+			    &uv_coherency_id, &uv_region_size);
+	uv_rtc_init();
+
 	for_each_present_cpu(cpu) {
 		nid = cpu_to_node(cpu);
 		pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
@@ -345,7 +451,7 @@ static __init void uv_system_init(void)
 		uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
 		uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
 		uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
-		uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */
+		uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id;
 		uv_node_to_blade[nid] = blade;
 		uv_cpu_to_blade[cpu] = blade;
 		max_pnode = max(pnode, max_pnode);
@@ -360,19 +466,6 @@ static __init void uv_system_init(void)
 	map_mmr_high(max_pnode);
 	map_config_high(max_pnode);
 	map_mmioh_high(max_pnode);
-}
 
-/*
- * Called on each cpu to initialize the per_cpu UV data area.
- * 	ZZZ hotplug not supported yet
- */
-void __cpuinit uv_cpu_init(void)
-{
-	if (!uv_node_to_blade)
-		uv_system_init();
-
-	uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
-
-	if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
-		set_x2apic_extra_bits(uv_hub_info->pnode);
+	uv_cpu_init();
 }
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 3e66bd364a9d..1dcb0f13897e 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -35,6 +35,7 @@ void __init reserve_ebda_region(void)
 
 	/* start of EBDA area */
 	ebda_addr = get_bios_ebda();
+	printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
 
 	/* Fixup: bios puts an EBDA in the top 64K segment */
 	/* of conventional memory, but does not adjust lowmem. */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c97819829146..d16084f90649 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
 static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
 #endif
 
+void __init x86_64_init_pda(void)
+{
+	_cpu_pda = __cpu_pda;
+	cpu_pda(0) = &_boot_cpu_pda;
+	pda_init(0);
+}
+
 static void __init zap_identity_mappings(void)
 {
 	pgd_t *pgd = pgd_offset_k(0UL);
@@ -81,6 +88,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
 	BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
 				(__START_KERNEL & PGDIR_MASK)));
+	BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
 
 	/* clear bss before set_intr_gate with early_idt_handler */
 	clear_bss();
@@ -100,13 +108,10 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	}
 	load_idt((const struct desc_ptr *)&idt_descr);
 
-	early_printk("Kernel alive\n");
-
-	_cpu_pda = __cpu_pda;
-	cpu_pda(0) = &_boot_cpu_pda;
-	pda_init(0);
+	if (console_loglevel == 10)
+		early_printk("Kernel alive\n");
 
-	early_printk("Kernel really alive\n");
+	x86_64_init_pda();
 
 	x86_64_start_reservations(real_mode_data);
 }
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f67e93441caf..e835b4eea70b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4
  *
  * Note that the stack is not yet set up!
  */
-#define PTE_ATTR	0x007		/* PRESENT+RW+USER */
-#define PDE_ATTR	0x067		/* PRESENT+RW+USER+DIRTY+ACCESSED */
-#define PGD_ATTR	0x001		/* PRESENT (no other attributes) */
-
 default_entry:
 #ifdef CONFIG_X86_PAE
 
@@ -196,9 +192,9 @@ default_entry:
 	movl $pa(pg0), %edi
 	movl %edi, pa(init_pg_tables_start)
 	movl $pa(swapper_pg_pmd), %edx
-	movl $PTE_ATTR, %eax
+	movl $PTE_IDENT_ATTR, %eax
 10:
-	leal PDE_ATTR(%edi),%ecx		/* Create PMD entry */
+	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PMD entry */
 	movl %ecx,(%edx)			/* Store PMD entry */
 						/* Upper half already zero */
 	addl $8,%edx
@@ -215,7 +211,7 @@ default_entry:
 	 * End condition: we must map up to and including INIT_MAP_BEYOND_END
 	 * bytes beyond the end of our own page tables.
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp
+	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
 	cmpl %ebp,%eax
 	jb 10b
 1:
@@ -224,7 +220,7 @@ default_entry:
 	movl %eax, pa(max_pfn_mapped)
 
 	/* Do early initialization of the fixmap area */
-	movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
+	movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
 	movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
 #else	/* Not PAE */
 
@@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	movl $pa(pg0), %edi
 	movl %edi, pa(init_pg_tables_start)
 	movl $pa(swapper_pg_dir), %edx
-	movl $PTE_ATTR, %eax
+	movl $PTE_IDENT_ATTR, %eax
 10:
-	leal PDE_ATTR(%edi),%ecx		/* Create PDE entry */
+	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PDE entry */
 	movl %ecx,(%edx)			/* Store identity PDE entry */
 	movl %ecx,page_pde_offset(%edx)		/* Store kernel PDE entry */
 	addl $4,%edx
@@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	 * bytes beyond the end of our own page tables; the +0x007 is
 	 * the attribute bits
 	 */
-	leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp
+	leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
 	cmpl %ebp,%eax
 	jb 10b
 	movl %edi,pa(init_pg_tables_end)
@@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 	movl %eax, pa(max_pfn_mapped)
 
 	/* Do early initialization of the fixmap area */
-	movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax
+	movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
 	movl %eax,pa(swapper_pg_dir+0xffc)
 #endif
 	jmp 3f
@@ -456,9 +452,6 @@ is386:	movl $2,%ecx		# set MP
 1:
 #endif /* CONFIG_SMP */
 	jmp *(initial_code)
-.align 4
-ENTRY(initial_code)
-	.long i386_start_kernel
 
 /*
  * We depend on ET to be correct. This checks for 287/387.
@@ -601,6 +594,11 @@ ignore_int:
 #endif
 	iret
 
+.section .cpuinit.data,"wa"
+.align 4
+ENTRY(initial_code)
+	.long i386_start_kernel
+
 .section .text
 /*
  * Real beginning of normal "text" segment
@@ -632,19 +630,19 @@ ENTRY(empty_zero_page)
 	/* Page-aligned for the benefit of paravirt? */
 	.align PAGE_SIZE_asm
 ENTRY(swapper_pg_dir)
-	.long	pa(swapper_pg_pmd+PGD_ATTR),0		/* low identity map */
+	.long	pa(swapper_pg_pmd+PGD_IDENT_ATTR),0	/* low identity map */
 # if KPMDS == 3
-	.long	pa(swapper_pg_pmd+PGD_ATTR),0
-	.long	pa(swapper_pg_pmd+PGD_ATTR+0x1000),0
-	.long	pa(swapper_pg_pmd+PGD_ATTR+0x2000),0
+	.long	pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
+	.long	pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+	.long	pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
 # elif KPMDS == 2
 	.long	0,0
-	.long	pa(swapper_pg_pmd+PGD_ATTR),0
-	.long	pa(swapper_pg_pmd+PGD_ATTR+0x1000),0
+	.long	pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
+	.long	pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
 # elif KPMDS == 1
 	.long	0,0
 	.long	0,0
-	.long	pa(swapper_pg_pmd+PGD_ATTR),0
+	.long	pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
 # else
 #  error "Kernel PMDs should be 1, 2 or 3"
 # endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b07ac7b217cb..26cfdc1d7c7f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -110,7 +110,7 @@ startup_64:
 	movq	%rdi, %rax
 	shrq	$PMD_SHIFT, %rax
 	andq	$(PTRS_PER_PMD - 1), %rax
-	leaq	__PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx
+	leaq	__PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
 	leaq	level2_spare_pgt(%rip), %rbx
 	movq	%rdx, 0(%rbx, %rax, 8)
 ident_complete:
@@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt)
 	/* Since I easily can, map the first 1G.
 	 * Don't set NX because code runs from these pages.
 	 */
-	PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD)
+	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
 
 NEXT_PAGE(level2_kernel_pgt)
 	/*
@@ -407,6 +407,7 @@ ENTRY(phys_base)
 	/* This must match the first entry in level2_kernel_pgt */
 	.quad   0x0000000000000000
 
+#include "../../x86/xen/xen-head.S"
 	
 	.section .bss, "aw", @nobits
 	.align L1_CACHE_BYTES
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 0ea6a19bfdfe..77017e834cf7 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1,29 +1,49 @@
 #include <linux/clocksource.h>
 #include <linux/clockchips.h>
+#include <linux/interrupt.h>
+#include <linux/sysdev.h>
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/hpet.h>
 #include <linux/init.h>
-#include <linux/sysdev.h>
+#include <linux/cpu.h>
 #include <linux/pm.h>
+#include <linux/io.h>
 
 #include <asm/fixmap.h>
-#include <asm/hpet.h>
 #include <asm/i8253.h>
-#include <asm/io.h>
+#include <asm/hpet.h>
 
-#define HPET_MASK	CLOCKSOURCE_MASK(32)
-#define HPET_SHIFT	22
+#define HPET_MASK			CLOCKSOURCE_MASK(32)
+#define HPET_SHIFT			22
 
 /* FSEC = 10^-15
    NSEC = 10^-9 */
-#define FSEC_PER_NSEC	1000000L
+#define FSEC_PER_NSEC			1000000L
+
+#define HPET_DEV_USED_BIT		2
+#define HPET_DEV_USED			(1 << HPET_DEV_USED_BIT)
+#define HPET_DEV_VALID			0x8
+#define HPET_DEV_FSB_CAP		0x1000
+#define HPET_DEV_PERI_CAP		0x2000
+
+#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
 
 /*
  * HPET address is set in acpi/boot.c, when an ACPI entry exists
  */
-unsigned long hpet_address;
-static void __iomem *hpet_virt_address;
+unsigned long				hpet_address;
+unsigned long				hpet_num_timers;
+static void __iomem			*hpet_virt_address;
+
+struct hpet_dev {
+	struct clock_event_device	evt;
+	unsigned int			num;
+	int				cpu;
+	unsigned int			irq;
+	unsigned int			flags;
+	char				name[10];
+};
 
 unsigned long hpet_readl(unsigned long a)
 {
@@ -59,7 +79,7 @@ static inline void hpet_clear_mapping(void)
 static int boot_hpet_disable;
 int hpet_force_user;
 
-static int __init hpet_setup(char* str)
+static int __init hpet_setup(char *str)
 {
 	if (str) {
 		if (!strncmp("disable", str, 7))
@@ -80,7 +100,7 @@ __setup("nohpet", disable_hpet);
 
 static inline int is_hpet_capable(void)
 {
-	return (!boot_hpet_disable && hpet_address);
+	return !boot_hpet_disable && hpet_address;
 }
 
 /*
@@ -102,6 +122,9 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled);
  * timer 0 and timer 1 in case of RTC emulation.
  */
 #ifdef CONFIG_HPET
+
+static void hpet_reserve_msi_timers(struct hpet_data *hd);
+
 static void hpet_reserve_platform_timers(unsigned long id)
 {
 	struct hpet __iomem *hpet = hpet_virt_address;
@@ -111,25 +134,31 @@ static void hpet_reserve_platform_timers(unsigned long id)
 
 	nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
 
-	memset(&hd, 0, sizeof (hd));
-	hd.hd_phys_address = hpet_address;
-	hd.hd_address = hpet;
-	hd.hd_nirqs = nrtimers;
-	hd.hd_flags = HPET_DATA_PLATFORM;
+	memset(&hd, 0, sizeof(hd));
+	hd.hd_phys_address	= hpet_address;
+	hd.hd_address		= hpet;
+	hd.hd_nirqs		= nrtimers;
 	hpet_reserve_timer(&hd, 0);
 
 #ifdef CONFIG_HPET_EMULATE_RTC
 	hpet_reserve_timer(&hd, 1);
 #endif
 
+	/*
+	 * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254
+	 * is wrong for i8259!) not the output IRQ.  Many BIOS writers
+	 * don't bother configuring *any* comparator interrupts.
+	 */
 	hd.hd_irq[0] = HPET_LEGACY_8254;
 	hd.hd_irq[1] = HPET_LEGACY_RTC;
 
 	for (i = 2; i < nrtimers; timer++, i++) {
-		hd.hd_irq[i] = (readl(&timer->hpet_config) & Tn_INT_ROUTE_CNF_MASK) >>
-			Tn_INT_ROUTE_CNF_SHIFT;
+		hd.hd_irq[i] = (readl(&timer->hpet_config) &
+			Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
 	}
 
+	hpet_reserve_msi_timers(&hd);
+
 	hpet_alloc(&hd);
 
 }
@@ -210,8 +239,8 @@ static void hpet_legacy_clockevent_register(void)
 	/* Calculate the min / max delta */
 	hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
 							   &hpet_clockevent);
-	hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30,
-							   &hpet_clockevent);
+	/* 5 usec minimum reprogramming delta. */
+	hpet_clockevent.min_delta_ns = 5000;
 
 	/*
 	 * Start hpet with the boot cpu mask and make it
@@ -223,63 +252,421 @@ static void hpet_legacy_clockevent_register(void)
 	printk(KERN_DEBUG "hpet clockevent registered\n");
 }
 
-static void hpet_legacy_set_mode(enum clock_event_mode mode,
-			  struct clock_event_device *evt)
+static int hpet_setup_msi_irq(unsigned int irq);
+
+static void hpet_set_mode(enum clock_event_mode mode,
+			  struct clock_event_device *evt, int timer)
 {
 	unsigned long cfg, cmp, now;
 	uint64_t delta;
 
-	switch(mode) {
+	switch (mode) {
 	case CLOCK_EVT_MODE_PERIODIC:
-		delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult;
-		delta >>= hpet_clockevent.shift;
+		delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
+		delta >>= evt->shift;
 		now = hpet_readl(HPET_COUNTER);
 		cmp = now + (unsigned long) delta;
-		cfg = hpet_readl(HPET_T0_CFG);
+		cfg = hpet_readl(HPET_Tn_CFG(timer));
 		cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
 		       HPET_TN_SETVAL | HPET_TN_32BIT;
-		hpet_writel(cfg, HPET_T0_CFG);
+		hpet_writel(cfg, HPET_Tn_CFG(timer));
 		/*
 		 * The first write after writing TN_SETVAL to the
 		 * config register sets the counter value, the second
 		 * write sets the period.
 		 */
-		hpet_writel(cmp, HPET_T0_CMP);
+		hpet_writel(cmp, HPET_Tn_CMP(timer));
 		udelay(1);
-		hpet_writel((unsigned long) delta, HPET_T0_CMP);
+		hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
 		break;
 
 	case CLOCK_EVT_MODE_ONESHOT:
-		cfg = hpet_readl(HPET_T0_CFG);
+		cfg = hpet_readl(HPET_Tn_CFG(timer));
 		cfg &= ~HPET_TN_PERIODIC;
 		cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
-		hpet_writel(cfg, HPET_T0_CFG);
+		hpet_writel(cfg, HPET_Tn_CFG(timer));
 		break;
 
 	case CLOCK_EVT_MODE_UNUSED:
 	case CLOCK_EVT_MODE_SHUTDOWN:
-		cfg = hpet_readl(HPET_T0_CFG);
+		cfg = hpet_readl(HPET_Tn_CFG(timer));
 		cfg &= ~HPET_TN_ENABLE;
-		hpet_writel(cfg, HPET_T0_CFG);
+		hpet_writel(cfg, HPET_Tn_CFG(timer));
 		break;
 
 	case CLOCK_EVT_MODE_RESUME:
-		hpet_enable_legacy_int();
+		if (timer == 0) {
+			hpet_enable_legacy_int();
+		} else {
+			struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+			hpet_setup_msi_irq(hdev->irq);
+			disable_irq(hdev->irq);
+			irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu));
+			enable_irq(hdev->irq);
+		}
 		break;
 	}
 }
 
-static int hpet_legacy_next_event(unsigned long delta,
-			   struct clock_event_device *evt)
+static int hpet_next_event(unsigned long delta,
+			   struct clock_event_device *evt, int timer)
 {
-	unsigned long cnt;
+	u32 cnt;
 
 	cnt = hpet_readl(HPET_COUNTER);
-	cnt += delta;
-	hpet_writel(cnt, HPET_T0_CMP);
+	cnt += (u32) delta;
+	hpet_writel(cnt, HPET_Tn_CMP(timer));
+
+	/*
+	 * We need to read back the CMP register to make sure that
+	 * what we wrote hit the chip before we compare it to the
+	 * counter.
+	 */
+	WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt);
+
+	return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
+}
+
+static void hpet_legacy_set_mode(enum clock_event_mode mode,
+			struct clock_event_device *evt)
+{
+	hpet_set_mode(mode, evt, 0);
+}
+
+static int hpet_legacy_next_event(unsigned long delta,
+			struct clock_event_device *evt)
+{
+	return hpet_next_event(delta, evt, 0);
+}
+
+/*
+ * HPET MSI Support
+ */
+#ifdef CONFIG_PCI_MSI
+
+static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
+static struct hpet_dev	*hpet_devs;
+
+void hpet_msi_unmask(unsigned int irq)
+{
+	struct hpet_dev *hdev = get_irq_data(irq);
+	unsigned long cfg;
+
+	/* unmask it */
+	cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
+	cfg |= HPET_TN_FSB;
+	hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+}
+
+void hpet_msi_mask(unsigned int irq)
+{
+	unsigned long cfg;
+	struct hpet_dev *hdev = get_irq_data(irq);
+
+	/* mask it */
+	cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
+	cfg &= ~HPET_TN_FSB;
+	hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
+}
+
+void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
+{
+	struct hpet_dev *hdev = get_irq_data(irq);
+
+	hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
+	hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
+}
+
+void hpet_msi_read(unsigned int irq, struct msi_msg *msg)
+{
+	struct hpet_dev *hdev = get_irq_data(irq);
+
+	msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
+	msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
+	msg->address_hi = 0;
+}
+
+static void hpet_msi_set_mode(enum clock_event_mode mode,
+				struct clock_event_device *evt)
+{
+	struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+	hpet_set_mode(mode, evt, hdev->num);
+}
+
+static int hpet_msi_next_event(unsigned long delta,
+				struct clock_event_device *evt)
+{
+	struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+	return hpet_next_event(delta, evt, hdev->num);
+}
+
+static int hpet_setup_msi_irq(unsigned int irq)
+{
+	if (arch_setup_hpet_msi(irq)) {
+		destroy_irq(irq);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int hpet_assign_irq(struct hpet_dev *dev)
+{
+	unsigned int irq;
+
+	irq = create_irq();
+	if (!irq)
+		return -EINVAL;
+
+	set_irq_data(irq, dev);
+
+	if (hpet_setup_msi_irq(irq))
+		return -EINVAL;
+
+	dev->irq = irq;
+	return 0;
+}
+
+static irqreturn_t hpet_interrupt_handler(int irq, void *data)
+{
+	struct hpet_dev *dev = (struct hpet_dev *)data;
+	struct clock_event_device *hevt = &dev->evt;
+
+	if (!hevt->event_handler) {
+		printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n",
+				dev->num);
+		return IRQ_HANDLED;
+	}
+
+	hevt->event_handler(hevt);
+	return IRQ_HANDLED;
+}
+
+static int hpet_setup_irq(struct hpet_dev *dev)
+{
+
+	if (request_irq(dev->irq, hpet_interrupt_handler,
+			IRQF_SHARED|IRQF_NOBALANCING, dev->name, dev))
+		return -1;
+
+	disable_irq(dev->irq);
+	irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu));
+	enable_irq(dev->irq);
+
+	printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
+			 dev->name, dev->irq);
+
+	return 0;
+}
+
+/* This should be called in specific @cpu */
+static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
+{
+	struct clock_event_device *evt = &hdev->evt;
+	uint64_t hpet_freq;
+
+	WARN_ON(cpu != smp_processor_id());
+	if (!(hdev->flags & HPET_DEV_VALID))
+		return;
 
-	return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0;
+	if (hpet_setup_msi_irq(hdev->irq))
+		return;
+
+	hdev->cpu = cpu;
+	per_cpu(cpu_hpet_dev, cpu) = hdev;
+	evt->name = hdev->name;
+	hpet_setup_irq(hdev);
+	evt->irq = hdev->irq;
+
+	evt->rating = 110;
+	evt->features = CLOCK_EVT_FEAT_ONESHOT;
+	if (hdev->flags & HPET_DEV_PERI_CAP)
+		evt->features |= CLOCK_EVT_FEAT_PERIODIC;
+
+	evt->set_mode = hpet_msi_set_mode;
+	evt->set_next_event = hpet_msi_next_event;
+	evt->shift = 32;
+
+	/*
+	 * The period is a femto seconds value. We need to calculate the
+	 * scaled math multiplication factor for nanosecond to hpet tick
+	 * conversion.
+	 */
+	hpet_freq = 1000000000000000ULL;
+	do_div(hpet_freq, hpet_period);
+	evt->mult = div_sc((unsigned long) hpet_freq,
+				      NSEC_PER_SEC, evt->shift);
+	/* Calculate the max delta */
+	evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
+	/* 5 usec minimum reprogramming delta. */
+	evt->min_delta_ns = 5000;
+
+	evt->cpumask = cpumask_of_cpu(hdev->cpu);
+	clockevents_register_device(evt);
+}
+
+#ifdef CONFIG_HPET
+/* Reserve at least one timer for userspace (/dev/hpet) */
+#define RESERVE_TIMERS 1
+#else
+#define RESERVE_TIMERS 0
+#endif
+
+static void hpet_msi_capability_lookup(unsigned int start_timer)
+{
+	unsigned int id;
+	unsigned int num_timers;
+	unsigned int num_timers_used = 0;
+	int i;
+
+	id = hpet_readl(HPET_ID);
+
+	num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
+	num_timers++; /* Value read out starts from 0 */
+
+	hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
+	if (!hpet_devs)
+		return;
+
+	hpet_num_timers = num_timers;
+
+	for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
+		struct hpet_dev *hdev = &hpet_devs[num_timers_used];
+		unsigned long cfg = hpet_readl(HPET_Tn_CFG(i));
+
+		/* Only consider HPET timer with MSI support */
+		if (!(cfg & HPET_TN_FSB_CAP))
+			continue;
+
+		hdev->flags = 0;
+		if (cfg & HPET_TN_PERIODIC_CAP)
+			hdev->flags |= HPET_DEV_PERI_CAP;
+		hdev->num = i;
+
+		sprintf(hdev->name, "hpet%d", i);
+		if (hpet_assign_irq(hdev))
+			continue;
+
+		hdev->flags |= HPET_DEV_FSB_CAP;
+		hdev->flags |= HPET_DEV_VALID;
+		num_timers_used++;
+		if (num_timers_used == num_possible_cpus())
+			break;
+	}
+
+	printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n",
+		num_timers, num_timers_used);
+}
+
+#ifdef CONFIG_HPET
+static void hpet_reserve_msi_timers(struct hpet_data *hd)
+{
+	int i;
+
+	if (!hpet_devs)
+		return;
+
+	for (i = 0; i < hpet_num_timers; i++) {
+		struct hpet_dev *hdev = &hpet_devs[i];
+
+		if (!(hdev->flags & HPET_DEV_VALID))
+			continue;
+
+		hd->hd_irq[hdev->num] = hdev->irq;
+		hpet_reserve_timer(hd, hdev->num);
+	}
+}
+#endif
+
+static struct hpet_dev *hpet_get_unused_timer(void)
+{
+	int i;
+
+	if (!hpet_devs)
+		return NULL;
+
+	for (i = 0; i < hpet_num_timers; i++) {
+		struct hpet_dev *hdev = &hpet_devs[i];
+
+		if (!(hdev->flags & HPET_DEV_VALID))
+			continue;
+		if (test_and_set_bit(HPET_DEV_USED_BIT,
+			(unsigned long *)&hdev->flags))
+			continue;
+		return hdev;
+	}
+	return NULL;
+}
+
+struct hpet_work_struct {
+	struct delayed_work work;
+	struct completion complete;
+};
+
+static void hpet_work(struct work_struct *w)
+{
+	struct hpet_dev *hdev;
+	int cpu = smp_processor_id();
+	struct hpet_work_struct *hpet_work;
+
+	hpet_work = container_of(w, struct hpet_work_struct, work.work);
+
+	hdev = hpet_get_unused_timer();
+	if (hdev)
+		init_one_hpet_msi_clockevent(hdev, cpu);
+
+	complete(&hpet_work->complete);
+}
+
+static int hpet_cpuhp_notify(struct notifier_block *n,
+		unsigned long action, void *hcpu)
+{
+	unsigned long cpu = (unsigned long)hcpu;
+	struct hpet_work_struct work;
+	struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu);
+
+	switch (action & 0xf) {
+	case CPU_ONLINE:
+		INIT_DELAYED_WORK(&work.work, hpet_work);
+		init_completion(&work.complete);
+		/* FIXME: add schedule_work_on() */
+		schedule_delayed_work_on(cpu, &work.work, 0);
+		wait_for_completion(&work.complete);
+		break;
+	case CPU_DEAD:
+		if (hdev) {
+			free_irq(hdev->irq, hdev);
+			hdev->flags &= ~HPET_DEV_USED;
+			per_cpu(cpu_hpet_dev, cpu) = NULL;
+		}
+		break;
+	}
+	return NOTIFY_OK;
+}
+#else
+
+static int hpet_setup_msi_irq(unsigned int irq)
+{
+	return 0;
+}
+static void hpet_msi_capability_lookup(unsigned int start_timer)
+{
+	return;
+}
+
+#ifdef CONFIG_HPET
+static void hpet_reserve_msi_timers(struct hpet_data *hd)
+{
+	return;
 }
+#endif
+
+static int hpet_cpuhp_notify(struct notifier_block *n,
+		unsigned long action, void *hcpu)
+{
+	return NOTIFY_OK;
+}
+
+#endif
 
 /*
  * Clock source related code
@@ -359,6 +746,7 @@ static int hpet_clocksource_register(void)
 int __init hpet_enable(void)
 {
 	unsigned long id;
+	int i;
 
 	if (!is_hpet_capable())
 		return 0;
@@ -369,6 +757,29 @@ int __init hpet_enable(void)
 	 * Read the period and check for a sane value:
 	 */
 	hpet_period = hpet_readl(HPET_PERIOD);
+
+	/*
+	 * AMD SB700 based systems with spread spectrum enabled use a
+	 * SMM based HPET emulation to provide proper frequency
+	 * setting. The SMM code is initialized with the first HPET
+	 * register access and takes some time to complete. During
+	 * this time the config register reads 0xffffffff. We check
+	 * for max. 1000 loops whether the config register reads a non
+	 * 0xffffffff value to make sure that HPET is up and running
+	 * before we go further. A counting loop is safe, as the HPET
+	 * access takes thousands of CPU cycles. On non SB700 based
+	 * machines this check is only done once and has no side
+	 * effects.
+	 */
+	for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) {
+		if (i == 1000) {
+			printk(KERN_WARNING
+			       "HPET config register value = 0xFFFFFFFF. "
+			       "Disabling HPET\n");
+			goto out_nohpet;
+		}
+	}
+
 	if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
 		goto out_nohpet;
 
@@ -392,8 +803,10 @@ int __init hpet_enable(void)
 
 	if (id & HPET_ID_LEGSUP) {
 		hpet_legacy_clockevent_register();
+		hpet_msi_capability_lookup(2);
 		return 1;
 	}
+	hpet_msi_capability_lookup(0);
 	return 0;
 
 out_nohpet:
@@ -410,6 +823,8 @@ out_nohpet:
  */
 static __init int hpet_late_init(void)
 {
+	int cpu;
+
 	if (boot_hpet_disable)
 		return -ENODEV;
 
@@ -425,6 +840,13 @@ static __init int hpet_late_init(void)
 
 	hpet_reserve_platform_timers(hpet_readl(HPET_ID));
 
+	for_each_online_cpu(cpu) {
+		hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
+	}
+
+	/* This notifier should be called after workqueue is ready */
+	hotcpu_notifier(hpet_cpuhp_notify, -20);
+
 	return 0;
 }
 fs_initcall(hpet_late_init);
@@ -468,7 +890,7 @@ void hpet_disable(void)
 #define RTC_NUM_INTS		1
 
 static unsigned long hpet_rtc_flags;
-static unsigned long hpet_prev_update_sec;
+static int hpet_prev_update_sec;
 static struct rtc_time hpet_alarm_time;
 static unsigned long hpet_pie_count;
 static unsigned long hpet_t1_cmp;
@@ -575,6 +997,9 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask)
 
 	hpet_rtc_flags |= bit_mask;
 
+	if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE))
+		hpet_prev_update_sec = -1;
+
 	if (!oldbits)
 		hpet_rtc_timer_init();
 
@@ -652,7 +1077,7 @@ static void hpet_rtc_timer_reinit(void)
 		if (hpet_rtc_flags & RTC_PIE)
 			hpet_pie_count += lost_ints;
 		if (printk_ratelimit())
-			printk(KERN_WARNING "rtc: lost %d interrupts\n",
+			printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n",
 				lost_ints);
 	}
 }
@@ -670,7 +1095,8 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
 
 	if (hpet_rtc_flags & RTC_UIE &&
 	    curr_time.tm_sec != hpet_prev_update_sec) {
-		rtc_int_flag = RTC_UF;
+		if (hpet_prev_update_sec >= 0)
+			rtc_int_flag = RTC_UF;
 		hpet_prev_update_sec = curr_time.tm_sec;
 	}
 
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index eb9ddd8efb82..1f20608d4ca8 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -21,9 +21,12 @@
 # include <asm/sigcontext32.h>
 # include <asm/user32.h>
 #else
-# define save_i387_ia32		save_i387
-# define restore_i387_ia32	restore_i387
+# define save_i387_xstate_ia32		save_i387_xstate
+# define restore_i387_xstate_ia32	restore_i387_xstate
 # define _fpstate_ia32		_fpstate
+# define _xstate_ia32		_xstate
+# define sig_xstate_ia32_size   sig_xstate_size
+# define fx_sw_reserved_ia32	fx_sw_reserved
 # define user_i387_ia32_struct	user_i387_struct
 # define user32_fxsr_struct	user_fxsr_struct
 #endif
@@ -36,6 +39,7 @@
 
 static unsigned int		mxcsr_feature_mask __read_mostly = 0xffffffffu;
 unsigned int xstate_size;
+unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
 static struct i387_fxsave_struct fx_scratch __cpuinitdata;
 
 void __cpuinit mxcsr_feature_mask_init(void)
@@ -61,6 +65,11 @@ void __init init_thread_xstate(void)
 		return;
 	}
 
+	if (cpu_has_xsave) {
+		xsave_cntxt_init();
+		return;
+	}
+
 	if (cpu_has_fxsr)
 		xstate_size = sizeof(struct i387_fxsave_struct);
 #ifdef CONFIG_X86_32
@@ -83,9 +92,19 @@ void __cpuinit fpu_init(void)
 
 	write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
 
+	/*
+	 * Boot processor to setup the FP and extended state context info.
+	 */
+	if (!smp_processor_id())
+		init_thread_xstate();
+	xsave_init();
+
 	mxcsr_feature_mask_init();
 	/* clean state in init */
-	current_thread_info()->status = 0;
+	if (cpu_has_xsave)
+		current_thread_info()->status = TS_XSAVE;
+	else
+		current_thread_info()->status = 0;
 	clear_used_math();
 }
 #endif	/* CONFIG_X86_64 */
@@ -195,6 +214,13 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
 	 */
 	target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
 
+	/*
+	 * update the header bits in the xsave header, indicating the
+	 * presence of FP and SSE state.
+	 */
+	if (cpu_has_xsave)
+		target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+
 	return ret;
 }
 
@@ -395,6 +421,12 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
 	if (!ret)
 		convert_to_fxsr(target, &env);
 
+	/*
+	 * update the header bit in the xsave header, indicating the
+	 * presence of FP.
+	 */
+	if (cpu_has_xsave)
+		target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
 	return ret;
 }
 
@@ -407,7 +439,6 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
 	struct task_struct *tsk = current;
 	struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
 
-	unlazy_fpu(tsk);
 	fp->status = fp->swd;
 	if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
 		return -1;
@@ -421,8 +452,6 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
 	struct user_i387_ia32_struct env;
 	int err = 0;
 
-	unlazy_fpu(tsk);
-
 	convert_from_fxsr(&env, tsk);
 	if (__copy_to_user(buf, &env, sizeof(env)))
 		return -1;
@@ -432,16 +461,54 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
 	if (err)
 		return -1;
 
-	if (__copy_to_user(&buf->_fxsr_env[0], fx,
-			   sizeof(struct i387_fxsave_struct)))
+	if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size))
+		return -1;
+	return 1;
+}
+
+static int save_i387_xsave(void __user *buf)
+{
+	struct task_struct *tsk = current;
+	struct _fpstate_ia32 __user *fx = buf;
+	int err = 0;
+
+	/*
+	 * For legacy compatible, we always set FP/SSE bits in the bit
+	 * vector while saving the state to the user context.
+	 * This will enable us capturing any changes(during sigreturn) to
+	 * the FP/SSE bits by the legacy applications which don't touch
+	 * xstate_bv in the xsave header.
+	 *
+	 * xsave aware applications can change the xstate_bv in the xsave
+	 * header as well as change any contents in the memory layout.
+	 * xrestore as part of sigreturn will capture all the changes.
+	 */
+	tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+
+	if (save_i387_fxsave(fx) < 0)
+		return -1;
+
+	err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32,
+			     sizeof(struct _fpx_sw_bytes));
+	err |= __put_user(FP_XSTATE_MAGIC2,
+			  (__u32 __user *) (buf + sig_xstate_ia32_size
+					    - FP_XSTATE_MAGIC2_SIZE));
+	if (err)
 		return -1;
+
 	return 1;
 }
 
-int save_i387_ia32(struct _fpstate_ia32 __user *buf)
+int save_i387_xstate_ia32(void __user *buf)
 {
+	struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
+	struct task_struct *tsk = current;
+
 	if (!used_math())
 		return 0;
+
+	if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size))
+		return -EACCES;
 	/*
 	 * This will cause a "finit" to be triggered by the next
 	 * attempted FPU operation by the 'current' process.
@@ -451,13 +518,17 @@ int save_i387_ia32(struct _fpstate_ia32 __user *buf)
 	if (!HAVE_HWFP) {
 		return fpregs_soft_get(current, NULL,
 				       0, sizeof(struct user_i387_ia32_struct),
-				       NULL, buf) ? -1 : 1;
+				       NULL, fp) ? -1 : 1;
 	}
 
+	unlazy_fpu(tsk);
+
+	if (cpu_has_xsave)
+		return save_i387_xsave(fp);
 	if (cpu_has_fxsr)
-		return save_i387_fxsave(buf);
+		return save_i387_fxsave(fp);
 	else
-		return save_i387_fsave(buf);
+		return save_i387_fsave(fp);
 }
 
 static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
@@ -468,14 +539,15 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
 				sizeof(struct i387_fsave_struct));
 }
 
-static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
+static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
+			       unsigned int size)
 {
 	struct task_struct *tsk = current;
 	struct user_i387_ia32_struct env;
 	int err;
 
 	err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
-			       sizeof(struct i387_fxsave_struct));
+			       size);
 	/* mxcsr reserved bits must be masked to zero for security reasons */
 	tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
 	if (err || __copy_from_user(&env, buf, sizeof(env)))
@@ -485,14 +557,69 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
 	return 0;
 }
 
-int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
+static int restore_i387_xsave(void __user *buf)
+{
+	struct _fpx_sw_bytes fx_sw_user;
+	struct _fpstate_ia32 __user *fx_user =
+			((struct _fpstate_ia32 __user *) buf);
+	struct i387_fxsave_struct __user *fx =
+		(struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
+	struct xsave_hdr_struct *xsave_hdr =
+				&current->thread.xstate->xsave.xsave_hdr;
+	u64 mask;
+	int err;
+
+	if (check_for_xstate(fx, buf, &fx_sw_user))
+		goto fx_only;
+
+	mask = fx_sw_user.xstate_bv;
+
+	err = restore_i387_fxsave(buf, fx_sw_user.xstate_size);
+
+	xsave_hdr->xstate_bv &= pcntxt_mask;
+	/*
+	 * These bits must be zero.
+	 */
+	xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
+
+	/*
+	 * Init the state that is not present in the memory layout
+	 * and enabled by the OS.
+	 */
+	mask = ~(pcntxt_mask & ~mask);
+	xsave_hdr->xstate_bv &= mask;
+
+	return err;
+fx_only:
+	/*
+	 * Couldn't find the extended state information in the memory
+	 * layout. Restore the FP/SSE and init the other extended state
+	 * enabled by the OS.
+	 */
+	xsave_hdr->xstate_bv = XSTATE_FPSSE;
+	return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct));
+}
+
+int restore_i387_xstate_ia32(void __user *buf)
 {
 	int err;
 	struct task_struct *tsk = current;
+	struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
 
 	if (HAVE_HWFP)
 		clear_fpu(tsk);
 
+	if (!buf) {
+		if (used_math()) {
+			clear_fpu(tsk);
+			clear_used_math();
+		}
+
+		return 0;
+	} else
+		if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size))
+			return -EACCES;
+
 	if (!used_math()) {
 		err = init_fpu(tsk);
 		if (err)
@@ -500,14 +627,17 @@ int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
 	}
 
 	if (HAVE_HWFP) {
-		if (cpu_has_fxsr)
-			err = restore_i387_fxsave(buf);
+		if (cpu_has_xsave)
+			err = restore_i387_xsave(buf);
+		else if (cpu_has_fxsr)
+			err = restore_i387_fxsave(fp, sizeof(struct
+							   i387_fxsave_struct));
 		else
-			err = restore_i387_fsave(buf);
+			err = restore_i387_fsave(fp);
 	} else {
 		err = fpregs_soft_set(current, NULL,
 				      0, sizeof(struct user_i387_ia32_struct),
-				      NULL, buf) != 0;
+				      NULL, fp) != 0;
 	}
 	set_used_math();
 
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index dc92b49d9204..4b8a53d841f7 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -282,6 +282,30 @@ static int __init i8259A_init_sysfs(void)
 
 device_initcall(i8259A_init_sysfs);
 
+void mask_8259A(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+
+	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */
+	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-2 */
+
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
+void unmask_8259A(void)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8259A_lock, flags);
+
+	outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
+	outb(cached_slave_mask, PIC_SLAVE_IMR);	  /* restore slave IRQ mask */
+
+	spin_unlock_irqrestore(&i8259A_lock, flags);
+}
+
 void init_8259A(int auto_eoi)
 {
 	unsigned long flags;
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic.c
index 6510cde36b35..b764d7429c61 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic.c
@@ -27,16 +27,21 @@
 #include <linux/sched.h>
 #include <linux/pci.h>
 #include <linux/mc146818rtc.h>
+#include <linux/compiler.h>
 #include <linux/acpi.h>
+#include <linux/module.h>
 #include <linux/sysdev.h>
 #include <linux/msi.h>
 #include <linux/htirq.h>
-#include <linux/dmar.h>
-#include <linux/jiffies.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/jiffies.h>	/* time_after() */
 #ifdef CONFIG_ACPI
 #include <acpi/acpi_bus.h>
 #endif
 #include <linux/bootmem.h>
+#include <linux/dmar.h>
+#include <linux/hpet.h>
 
 #include <asm/idle.h>
 #include <asm/io.h>
@@ -45,62 +50,31 @@
 #include <asm/proto.h>
 #include <asm/acpi.h>
 #include <asm/dma.h>
+#include <asm/timer.h>
+#include <asm/i8259.h>
 #include <asm/nmi.h>
 #include <asm/msidef.h>
 #include <asm/hypertransport.h>
+#include <asm/setup.h>
+#include <asm/irq_remapping.h>
+#include <asm/hpet.h>
+#include <asm/uv/uv_hub.h>
+#include <asm/uv/uv_irq.h>
 
 #include <mach_ipi.h>
 #include <mach_apic.h>
+#include <mach_apicdef.h>
 
-struct irq_cfg {
-	cpumask_t domain;
-	cpumask_t old_domain;
-	unsigned move_cleanup_count;
-	u8 vector;
-	u8 move_in_progress : 1;
-};
-
-/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
-	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
-	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
-	[2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
-	[3]  = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
-	[4]  = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
-	[5]  = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
-	[6]  = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
-	[7]  = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
-	[8]  = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
-	[9]  = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
-	[10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
-	[11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
-	[12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
-	[13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
-	[14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
-	[15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
-};
-
-static int assign_irq_vector(int irq, cpumask_t mask);
+#define __apicdebuginit(type) static type __init
 
-int first_system_vector = 0xfe;
-
-char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
-
-#define __apicdebuginit  __init
-
-int sis_apic_bug; /* not actually supported, dummy for compile */
-
-static int no_timer_check;
-
-static int disable_timer_pin_1 __initdata;
-
-int timer_through_8259 __initdata;
-
-/* Where if anywhere is the i8259 connect in external int mode */
-static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
+/*
+ *      Is the SiS APIC rmw bug present ?
+ *      -1 = don't know, 0 = no, 1 = yes
+ */
+int sis_apic_bug = -1;
 
 static DEFINE_SPINLOCK(ioapic_lock);
-DEFINE_SPINLOCK(vector_lock);
+static DEFINE_SPINLOCK(vector_lock);
 
 /*
  * # of IRQ routing registers
@@ -117,11 +91,69 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
 /* # of MP IRQ source entries */
 int mp_irq_entries;
 
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
+
 DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
 
+int skip_ioapic_setup;
+
+static int __init parse_noapic(char *str)
+{
+	/* disable IO-APIC */
+	disable_ioapic_setup();
+	return 0;
+}
+early_param("noapic", parse_noapic);
+
+struct irq_pin_list;
+struct irq_cfg {
+	unsigned int irq;
+	struct irq_pin_list *irq_2_pin;
+	cpumask_t domain;
+	cpumask_t old_domain;
+	unsigned move_cleanup_count;
+	u8 vector;
+	u8 move_in_progress : 1;
+};
+
+/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
+static struct irq_cfg irq_cfgx[NR_IRQS] = {
+	[0]  = { .irq =  0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
+	[1]  = { .irq =  1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
+	[2]  = { .irq =  2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
+	[3]  = { .irq =  3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR,  },
+	[4]  = { .irq =  4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR,  },
+	[5]  = { .irq =  5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR,  },
+	[6]  = { .irq =  6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR,  },
+	[7]  = { .irq =  7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR,  },
+	[8]  = { .irq =  8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR,  },
+	[9]  = { .irq =  9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR,  },
+	[10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
+	[11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
+	[12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
+	[13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
+	[14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
+	[15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
+};
+
+#define for_each_irq_cfg(irq, cfg)		\
+	for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
+
+static struct irq_cfg *irq_cfg(unsigned int irq)
+{
+	return irq < nr_irqs ? irq_cfgx + irq : NULL;
+}
+
+static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
+{
+	return irq_cfg(irq);
+}
+
 /*
- * Rough estimation of how many shared IRQs there are, can
- * be changed anytime.
+ * Rough estimation of how many shared IRQs there are, can be changed
+ * anytime.
  */
 #define MAX_PLUS_SHARED_IRQS NR_IRQS
 #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
@@ -133,9 +165,36 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
  * between pins and IRQs.
  */
 
-static struct irq_pin_list {
-	short apic, pin, next;
-} irq_2_pin[PIN_MAP_SIZE];
+struct irq_pin_list {
+	int apic, pin;
+	struct irq_pin_list *next;
+};
+
+static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
+static struct irq_pin_list *irq_2_pin_ptr;
+
+static void __init irq_2_pin_init(void)
+{
+	struct irq_pin_list *pin = irq_2_pin_head;
+	int i;
+
+	for (i = 1; i < PIN_MAP_SIZE; i++)
+		pin[i-1].next = &pin[i];
+
+	irq_2_pin_ptr = &pin[0];
+}
+
+static struct irq_pin_list *get_one_free_irq_2_pin(void)
+{
+	struct irq_pin_list *pin = irq_2_pin_ptr;
+
+	if (!pin)
+		panic("can not get more irq_2_pin\n");
+
+	irq_2_pin_ptr = pin->next;
+	pin->next = NULL;
+	return pin;
+}
 
 struct io_apic {
 	unsigned int index;
@@ -166,10 +225,15 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
 /*
  * Re-write a value: to be used for read-modify-write
  * cycles where the read already set up the index register.
+ *
+ * Older SiS APIC requires we rewrite the index register
  */
-static inline void io_apic_modify(unsigned int apic, unsigned int value)
+static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
 {
 	struct io_apic __iomem *io_apic = io_apic_base(apic);
+
+	if (sis_apic_bug)
+		writel(reg, &io_apic->index);
 	writel(value, &io_apic->data);
 }
 
@@ -177,16 +241,17 @@ static bool io_apic_level_ack_pending(unsigned int irq)
 {
 	struct irq_pin_list *entry;
 	unsigned long flags;
+	struct irq_cfg *cfg = irq_cfg(irq);
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	entry = irq_2_pin + irq;
+	entry = cfg->irq_2_pin;
 	for (;;) {
 		unsigned int reg;
 		int pin;
 
-		pin = entry->pin;
-		if (pin == -1)
+		if (!entry)
 			break;
+		pin = entry->pin;
 		reg = io_apic_read(entry->apic, 0x10 + pin*2);
 		/* Is the remote IRR bit set? */
 		if (reg & IO_APIC_REDIR_REMOTE_IRR) {
@@ -195,45 +260,13 @@ static bool io_apic_level_ack_pending(unsigned int irq)
 		}
 		if (!entry->next)
 			break;
-		entry = irq_2_pin + entry->next;
+		entry = entry->next;
 	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return false;
 }
 
-/*
- * Synchronize the IO-APIC and the CPU by doing
- * a dummy read from the IO-APIC
- */
-static inline void io_apic_sync(unsigned int apic)
-{
-	struct io_apic __iomem *io_apic = io_apic_base(apic);
-	readl(&io_apic->data);
-}
-
-#define __DO_ACTION(R, ACTION, FINAL)					\
-									\
-{									\
-	int pin;							\
-	struct irq_pin_list *entry = irq_2_pin + irq;			\
-									\
-	BUG_ON(irq >= NR_IRQS);						\
-	for (;;) {							\
-		unsigned int reg;					\
-		pin = entry->pin;					\
-		if (pin == -1)						\
-			break;						\
-		reg = io_apic_read(entry->apic, 0x10 + R + pin*2);	\
-		reg ACTION;						\
-		io_apic_modify(entry->apic, reg);			\
-		FINAL;							\
-		if (!entry->next)					\
-			break;						\
-		entry = irq_2_pin + entry->next;			\
-	}								\
-}
-
 union entry_union {
 	struct { u32 w1, w2; };
 	struct IO_APIC_route_entry entry;
@@ -293,54 +326,71 @@ static void ioapic_mask_entry(int apic, int pin)
 static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
 {
 	int apic, pin;
-	struct irq_pin_list *entry = irq_2_pin + irq;
+	struct irq_cfg *cfg;
+	struct irq_pin_list *entry;
 
-	BUG_ON(irq >= NR_IRQS);
+	cfg = irq_cfg(irq);
+	entry = cfg->irq_2_pin;
 	for (;;) {
 		unsigned int reg;
+
+		if (!entry)
+			break;
+
 		apic = entry->apic;
 		pin = entry->pin;
-		if (pin == -1)
-			break;
+#ifdef CONFIG_INTR_REMAP
+		/*
+		 * With interrupt-remapping, destination information comes
+		 * from interrupt-remapping table entry.
+		 */
+		if (!irq_remapped(irq))
+			io_apic_write(apic, 0x11 + pin*2, dest);
+#else
 		io_apic_write(apic, 0x11 + pin*2, dest);
+#endif
 		reg = io_apic_read(apic, 0x10 + pin*2);
 		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
 		reg |= vector;
-		io_apic_modify(apic, reg);
+		io_apic_modify(apic, 0x10 + pin*2, reg);
 		if (!entry->next)
 			break;
-		entry = irq_2_pin + entry->next;
+		entry = entry->next;
 	}
 }
 
+static int assign_irq_vector(int irq, cpumask_t mask);
+
 static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
 {
-	struct irq_cfg *cfg = irq_cfg + irq;
+	struct irq_cfg *cfg;
 	unsigned long flags;
 	unsigned int dest;
 	cpumask_t tmp;
+	struct irq_desc *desc;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
 		return;
 
+	cfg = irq_cfg(irq);
 	if (assign_irq_vector(irq, mask))
 		return;
 
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
-
 	/*
 	 * Only the high 8 bits are valid.
 	 */
 	dest = SET_APIC_LOGICAL_ID(dest);
 
+	desc = irq_to_desc(irq);
 	spin_lock_irqsave(&ioapic_lock, flags);
 	__target_IO_APIC_irq(irq, dest, cfg->vector);
-	irq_desc[irq].affinity = mask;
+	desc->affinity = mask;
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
-#endif
+#endif /* CONFIG_SMP */
 
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
@@ -349,19 +399,30 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
  */
 static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 {
-	static int first_free_entry = NR_IRQS;
-	struct irq_pin_list *entry = irq_2_pin + irq;
+	struct irq_cfg *cfg;
+	struct irq_pin_list *entry;
+
+	/* first time to refer irq_cfg, so with new */
+	cfg = irq_cfg_alloc(irq);
+	entry = cfg->irq_2_pin;
+	if (!entry) {
+		entry = get_one_free_irq_2_pin();
+		cfg->irq_2_pin = entry;
+		entry->apic = apic;
+		entry->pin = pin;
+		return;
+	}
 
-	BUG_ON(irq >= NR_IRQS);
-	while (entry->next)
-		entry = irq_2_pin + entry->next;
+	while (entry->next) {
+		/* not again, please */
+		if (entry->apic == apic && entry->pin == pin)
+			return;
 
-	if (entry->pin != -1) {
-		entry->next = first_free_entry;
-		entry = irq_2_pin + entry->next;
-		if (++first_free_entry >= PIN_MAP_SIZE)
-			panic("io_apic.c: ran out of irq_2_pin entries!");
+		entry = entry->next;
 	}
+
+	entry->next = get_one_free_irq_2_pin();
+	entry = entry->next;
 	entry->apic = apic;
 	entry->pin = pin;
 }
@@ -373,30 +434,86 @@ static void __init replace_pin_at_irq(unsigned int irq,
 				      int oldapic, int oldpin,
 				      int newapic, int newpin)
 {
-	struct irq_pin_list *entry = irq_2_pin + irq;
+	struct irq_cfg *cfg = irq_cfg(irq);
+	struct irq_pin_list *entry = cfg->irq_2_pin;
+	int replaced = 0;
 
-	while (1) {
+	while (entry) {
 		if (entry->apic == oldapic && entry->pin == oldpin) {
 			entry->apic = newapic;
 			entry->pin = newpin;
-		}
-		if (!entry->next)
+			replaced = 1;
+			/* every one is different, right? */
 			break;
-		entry = irq_2_pin + entry->next;
+		}
+		entry = entry->next;
 	}
+
+	/* why? call replace before add? */
+	if (!replaced)
+		add_pin_to_irq(irq, newapic, newpin);
 }
 
+static inline void io_apic_modify_irq(unsigned int irq,
+				int mask_and, int mask_or,
+				void (*final)(struct irq_pin_list *entry))
+{
+	int pin;
+	struct irq_cfg *cfg;
+	struct irq_pin_list *entry;
 
-#define DO_ACTION(name,R,ACTION, FINAL)					\
-									\
-	static void name##_IO_APIC_irq (unsigned int irq)		\
-	__DO_ACTION(R, ACTION, FINAL)
+	cfg = irq_cfg(irq);
+	for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
+		unsigned int reg;
+		pin = entry->pin;
+		reg = io_apic_read(entry->apic, 0x10 + pin * 2);
+		reg &= mask_and;
+		reg |= mask_or;
+		io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
+		if (final)
+			final(entry);
+	}
+}
 
-/* mask = 1 */
-DO_ACTION(__mask,	0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic))
+static void __unmask_IO_APIC_irq(unsigned int irq)
+{
+	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
+}
 
-/* mask = 0 */
-DO_ACTION(__unmask,	0, &= ~IO_APIC_REDIR_MASKED, )
+#ifdef CONFIG_X86_64
+void io_apic_sync(struct irq_pin_list *entry)
+{
+	/*
+	 * Synchronize the IO-APIC and the CPU by doing
+	 * a dummy read from the IO-APIC
+	 */
+	struct io_apic __iomem *io_apic;
+	io_apic = io_apic_base(entry->apic);
+	readl(&io_apic->data);
+}
+
+static void __mask_IO_APIC_irq(unsigned int irq)
+{
+	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
+}
+#else /* CONFIG_X86_32 */
+static void __mask_IO_APIC_irq(unsigned int irq)
+{
+	io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
+}
+
+static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
+{
+	io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
+			IO_APIC_REDIR_MASKED, NULL);
+}
+
+static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
+{
+	io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
+			IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
+}
+#endif /* CONFIG_X86_32 */
 
 static void mask_IO_APIC_irq (unsigned int irq)
 {
@@ -439,24 +556,145 @@ static void clear_IO_APIC (void)
 			clear_IO_APIC_pin(apic, pin);
 }
 
-int skip_ioapic_setup;
-int ioapic_force;
+#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32)
+void send_IPI_self(int vector)
+{
+	unsigned int cfg;
 
-static int __init parse_noapic(char *str)
+	/*
+	 * Wait for idle.
+	 */
+	apic_wait_icr_idle();
+	cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
+	/*
+	 * Send the IPI. The write to APIC_ICR fires this off.
+	 */
+	apic_write(APIC_ICR, cfg);
+}
+#endif /* !CONFIG_SMP && CONFIG_X86_32*/
+
+#ifdef CONFIG_X86_32
+/*
+ * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
+ * specific CPU-side IRQs.
+ */
+
+#define MAX_PIRQS 8
+static int pirq_entries [MAX_PIRQS];
+static int pirqs_enabled;
+
+static int __init ioapic_pirq_setup(char *str)
 {
-	disable_ioapic_setup();
+	int i, max;
+	int ints[MAX_PIRQS+1];
+
+	get_options(str, ARRAY_SIZE(ints), ints);
+
+	for (i = 0; i < MAX_PIRQS; i++)
+		pirq_entries[i] = -1;
+
+	pirqs_enabled = 1;
+	apic_printk(APIC_VERBOSE, KERN_INFO
+			"PIRQ redirection, working around broken MP-BIOS.\n");
+	max = MAX_PIRQS;
+	if (ints[0] < MAX_PIRQS)
+		max = ints[0];
+
+	for (i = 0; i < max; i++) {
+		apic_printk(APIC_VERBOSE, KERN_DEBUG
+				"... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
+		/*
+		 * PIRQs are mapped upside down, usually.
+		 */
+		pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
+	}
+	return 1;
+}
+
+__setup("pirq=", ioapic_pirq_setup);
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_INTR_REMAP
+/* I/O APIC RTE contents at the OS boot up */
+static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
+
+/*
+ * Saves and masks all the unmasked IO-APIC RTE's
+ */
+int save_mask_IO_APIC_setup(void)
+{
+	union IO_APIC_reg_01 reg_01;
+	unsigned long flags;
+	int apic, pin;
+
+	/*
+	 * The number of IO-APIC IRQ registers (== #pins):
+	 */
+	for (apic = 0; apic < nr_ioapics; apic++) {
+		spin_lock_irqsave(&ioapic_lock, flags);
+		reg_01.raw = io_apic_read(apic, 1);
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+		nr_ioapic_registers[apic] = reg_01.bits.entries+1;
+	}
+
+	for (apic = 0; apic < nr_ioapics; apic++) {
+		early_ioapic_entries[apic] =
+			kzalloc(sizeof(struct IO_APIC_route_entry) *
+				nr_ioapic_registers[apic], GFP_KERNEL);
+		if (!early_ioapic_entries[apic])
+			goto nomem;
+	}
+
+	for (apic = 0; apic < nr_ioapics; apic++)
+		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+			struct IO_APIC_route_entry entry;
+
+			entry = early_ioapic_entries[apic][pin] =
+				ioapic_read_entry(apic, pin);
+			if (!entry.mask) {
+				entry.mask = 1;
+				ioapic_write_entry(apic, pin, entry);
+			}
+		}
+
 	return 0;
+
+nomem:
+	while (apic >= 0)
+		kfree(early_ioapic_entries[apic--]);
+	memset(early_ioapic_entries, 0,
+		ARRAY_SIZE(early_ioapic_entries));
+
+	return -ENOMEM;
 }
-early_param("noapic", parse_noapic);
 
-/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
-static int __init disable_timer_pin_setup(char *arg)
+void restore_IO_APIC_setup(void)
 {
-	disable_timer_pin_1 = 1;
-	return 1;
+	int apic, pin;
+
+	for (apic = 0; apic < nr_ioapics; apic++) {
+		if (!early_ioapic_entries[apic])
+			break;
+		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
+			ioapic_write_entry(apic, pin,
+					   early_ioapic_entries[apic][pin]);
+		kfree(early_ioapic_entries[apic]);
+		early_ioapic_entries[apic] = NULL;
+	}
 }
-__setup("disable_timer_pin_1", disable_timer_pin_setup);
 
+void reinit_intr_remapped_IO_APIC(int intr_remapping)
+{
+	/*
+	 * for now plain restore of previous settings.
+	 * TBD: In the case of OS enabling interrupt-remapping,
+	 * IO-APIC RTE's need to be setup to point to interrupt-remapping
+	 * table entries. for now, do a plain restore, and wait for
+	 * the setup_IO_APIC_irqs() to do proper initialization.
+	 */
+	restore_IO_APIC_setup();
+}
+#endif
 
 /*
  * Find the IRQ entry number of a certain pin.
@@ -560,22 +798,54 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
 				best_guess = irq;
 		}
 	}
-	BUG_ON(best_guess >= NR_IRQS);
 	return best_guess;
 }
 
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+/*
+ * EISA Edge/Level control register, ELCR
+ */
+static int EISA_ELCR(unsigned int irq)
+{
+	if (irq < 16) {
+		unsigned int port = 0x4d0 + (irq >> 3);
+		return (inb(port) >> (irq & 7)) & 1;
+	}
+	apic_printk(APIC_VERBOSE, KERN_INFO
+			"Broken MPtable reports ISA irq %d\n", irq);
+	return 0;
+}
+
+#endif
+
 /* ISA interrupts are always polarity zero edge triggered,
  * when listed as conforming in the MP table. */
 
 #define default_ISA_trigger(idx)	(0)
 #define default_ISA_polarity(idx)	(0)
 
+/* EISA interrupts are always polarity zero and can be edge or level
+ * trigger depending on the ELCR value.  If an interrupt is listed as
+ * EISA conforming in the MP table, that means its trigger type must
+ * be read in from the ELCR */
+
+#define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
+#define default_EISA_polarity(idx)	default_ISA_polarity(idx)
+
 /* PCI interrupts are always polarity one level triggered,
  * when listed as conforming in the MP table. */
 
 #define default_PCI_trigger(idx)	(1)
 #define default_PCI_polarity(idx)	(1)
 
+/* MCA interrupts are always polarity zero level triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_MCA_trigger(idx)	(1)
+#define default_MCA_polarity(idx)	default_ISA_polarity(idx)
+
 static int MPBIOS_polarity(int idx)
 {
 	int bus = mp_irqs[idx].mp_srcbus;
@@ -633,6 +903,36 @@ static int MPBIOS_trigger(int idx)
 				trigger = default_ISA_trigger(idx);
 			else
 				trigger = default_PCI_trigger(idx);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
+			switch (mp_bus_id_to_type[bus]) {
+				case MP_BUS_ISA: /* ISA pin */
+				{
+					/* set before the switch */
+					break;
+				}
+				case MP_BUS_EISA: /* EISA pin */
+				{
+					trigger = default_EISA_trigger(idx);
+					break;
+				}
+				case MP_BUS_PCI: /* PCI pin */
+				{
+					/* set before the switch */
+					break;
+				}
+				case MP_BUS_MCA: /* MCA pin */
+				{
+					trigger = default_MCA_trigger(idx);
+					break;
+				}
+				default:
+				{
+					printk(KERN_WARNING "broken BIOS!!\n");
+					trigger = 1;
+					break;
+				}
+			}
+#endif
 			break;
 		case 1: /* edge */
 		{
@@ -670,6 +970,7 @@ static inline int irq_trigger(int idx)
 	return MPBIOS_trigger(idx);
 }
 
+int (*ioapic_renumber_irq)(int ioapic, int irq);
 static int pin_2_irq(int idx, int apic, int pin)
 {
 	int irq, i;
@@ -691,11 +992,48 @@ static int pin_2_irq(int idx, int apic, int pin)
 		while (i < apic)
 			irq += nr_ioapic_registers[i++];
 		irq += pin;
+		/*
+                 * For MPS mode, so far only needed by ES7000 platform
+                 */
+		if (ioapic_renumber_irq)
+			irq = ioapic_renumber_irq(apic, irq);
 	}
-	BUG_ON(irq >= NR_IRQS);
+
+#ifdef CONFIG_X86_32
+	/*
+	 * PCI IRQ command line redirection. Yes, limits are hardcoded.
+	 */
+	if ((pin >= 16) && (pin <= 23)) {
+		if (pirq_entries[pin-16] != -1) {
+			if (!pirq_entries[pin-16]) {
+				apic_printk(APIC_VERBOSE, KERN_DEBUG
+						"disabling PIRQ%d\n", pin-16);
+			} else {
+				irq = pirq_entries[pin-16];
+				apic_printk(APIC_VERBOSE, KERN_DEBUG
+						"using PIRQ%d -> IRQ %d\n",
+						pin-16, irq);
+			}
+		}
+	}
+#endif
+
 	return irq;
 }
 
+void lock_vector_lock(void)
+{
+	/* Used to the online set of cpus does not change
+	 * during assign_irq_vector.
+	 */
+	spin_lock(&vector_lock);
+}
+
+void unlock_vector_lock(void)
+{
+	spin_unlock(&vector_lock);
+}
+
 static int __assign_irq_vector(int irq, cpumask_t mask)
 {
 	/*
@@ -714,8 +1052,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
 	int cpu;
 	struct irq_cfg *cfg;
 
-	BUG_ON((unsigned)irq >= NR_IRQS);
-	cfg = &irq_cfg[irq];
+	cfg = irq_cfg(irq);
 
 	/* Only try and allocate irqs on cpus that are present */
 	cpus_and(mask, mask, cpu_online_map);
@@ -731,7 +1068,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
 			return 0;
 	}
 
-	for_each_cpu_mask(cpu, mask) {
+	for_each_cpu_mask_nr(cpu, mask) {
 		cpumask_t domain, new_mask;
 		int new_cpu;
 		int vector, offset;
@@ -750,9 +1087,14 @@ next:
 		}
 		if (unlikely(current_vector == vector))
 			continue;
+#ifdef CONFIG_X86_64
 		if (vector == IA32_SYSCALL_VECTOR)
 			goto next;
-		for_each_cpu_mask(new_cpu, new_mask)
+#else
+		if (vector == SYSCALL_VECTOR)
+			goto next;
+#endif
+		for_each_cpu_mask_nr(new_cpu, new_mask)
 			if (per_cpu(vector_irq, new_cpu)[vector] != -1)
 				goto next;
 		/* Found one! */
@@ -762,7 +1104,7 @@ next:
 			cfg->move_in_progress = 1;
 			cfg->old_domain = cfg->domain;
 		}
-		for_each_cpu_mask(new_cpu, new_mask)
+		for_each_cpu_mask_nr(new_cpu, new_mask)
 			per_cpu(vector_irq, new_cpu)[vector] = irq;
 		cfg->vector = vector;
 		cfg->domain = domain;
@@ -788,30 +1130,30 @@ static void __clear_irq_vector(int irq)
 	cpumask_t mask;
 	int cpu, vector;
 
-	BUG_ON((unsigned)irq >= NR_IRQS);
-	cfg = &irq_cfg[irq];
+	cfg = irq_cfg(irq);
 	BUG_ON(!cfg->vector);
 
 	vector = cfg->vector;
 	cpus_and(mask, cfg->domain, cpu_online_map);
-	for_each_cpu_mask(cpu, mask)
+	for_each_cpu_mask_nr(cpu, mask)
 		per_cpu(vector_irq, cpu)[vector] = -1;
 
 	cfg->vector = 0;
 	cpus_clear(cfg->domain);
 }
 
-static void __setup_vector_irq(int cpu)
+void __setup_vector_irq(int cpu)
 {
 	/* Initialize vector_irq on a new cpu */
 	/* This function must be called with vector_lock held */
 	int irq, vector;
+	struct irq_cfg *cfg;
 
 	/* Mark the inuse vectors */
-	for (irq = 0; irq < NR_IRQS; ++irq) {
-		if (!cpu_isset(cpu, irq_cfg[irq].domain))
+	for_each_irq_cfg(irq, cfg) {
+		if (!cpu_isset(cpu, cfg->domain))
 			continue;
-		vector = irq_cfg[irq].vector;
+		vector = cfg->vector;
 		per_cpu(vector_irq, cpu)[vector] = irq;
 	}
 	/* Mark the free vectors */
@@ -819,44 +1161,154 @@ static void __setup_vector_irq(int cpu)
 		irq = per_cpu(vector_irq, cpu)[vector];
 		if (irq < 0)
 			continue;
-		if (!cpu_isset(cpu, irq_cfg[irq].domain))
+
+		cfg = irq_cfg(irq);
+		if (!cpu_isset(cpu, cfg->domain))
 			per_cpu(vector_irq, cpu)[vector] = -1;
 	}
 }
 
-void setup_vector_irq(int cpu)
-{
-	spin_lock(&vector_lock);
-	__setup_vector_irq(smp_processor_id());
-	spin_unlock(&vector_lock);
-}
+static struct irq_chip ioapic_chip;
+#ifdef CONFIG_INTR_REMAP
+static struct irq_chip ir_ioapic_chip;
+#endif
 
+#define IOAPIC_AUTO     -1
+#define IOAPIC_EDGE     0
+#define IOAPIC_LEVEL    1
 
-static struct irq_chip ioapic_chip;
+#ifdef CONFIG_X86_32
+static inline int IO_APIC_irq_trigger(int irq)
+{
+	int apic, idx, pin;
+
+	for (apic = 0; apic < nr_ioapics; apic++) {
+		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+			idx = find_irq_entry(apic, pin, mp_INT);
+			if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
+				return irq_trigger(idx);
+		}
+	}
+	/*
+         * nonexistent IRQs are edge default
+         */
+	return 0;
+}
+#else
+static inline int IO_APIC_irq_trigger(int irq)
+{
+	return 1;
+}
+#endif
 
 static void ioapic_register_intr(int irq, unsigned long trigger)
 {
-	if (trigger) {
-		irq_desc[irq].status |= IRQ_LEVEL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+	    trigger == IOAPIC_LEVEL)
+		desc->status |= IRQ_LEVEL;
+	else
+		desc->status &= ~IRQ_LEVEL;
+
+#ifdef CONFIG_INTR_REMAP
+	if (irq_remapped(irq)) {
+		desc->status |= IRQ_MOVE_PCNTXT;
+		if (trigger)
+			set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
+						      handle_fasteoi_irq,
+						     "fasteoi");
+		else
+			set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
+						      handle_edge_irq, "edge");
+		return;
+	}
+#endif
+	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+	    trigger == IOAPIC_LEVEL)
 		set_irq_chip_and_handler_name(irq, &ioapic_chip,
-					      handle_fasteoi_irq, "fasteoi");
-	} else {
-		irq_desc[irq].status &= ~IRQ_LEVEL;
+					      handle_fasteoi_irq,
+					      "fasteoi");
+	else
 		set_irq_chip_and_handler_name(irq, &ioapic_chip,
 					      handle_edge_irq, "edge");
+}
+
+static int setup_ioapic_entry(int apic, int irq,
+			      struct IO_APIC_route_entry *entry,
+			      unsigned int destination, int trigger,
+			      int polarity, int vector)
+{
+	/*
+	 * add it to the IO-APIC irq-routing table:
+	 */
+	memset(entry,0,sizeof(*entry));
+
+#ifdef CONFIG_INTR_REMAP
+	if (intr_remapping_enabled) {
+		struct intel_iommu *iommu = map_ioapic_to_ir(apic);
+		struct irte irte;
+		struct IR_IO_APIC_route_entry *ir_entry =
+			(struct IR_IO_APIC_route_entry *) entry;
+		int index;
+
+		if (!iommu)
+			panic("No mapping iommu for ioapic %d\n", apic);
+
+		index = alloc_irte(iommu, irq, 1);
+		if (index < 0)
+			panic("Failed to allocate IRTE for ioapic %d\n", apic);
+
+		memset(&irte, 0, sizeof(irte));
+
+		irte.present = 1;
+		irte.dst_mode = INT_DEST_MODE;
+		irte.trigger_mode = trigger;
+		irte.dlvry_mode = INT_DELIVERY_MODE;
+		irte.vector = vector;
+		irte.dest_id = IRTE_DEST(destination);
+
+		modify_irte(irq, &irte);
+
+		ir_entry->index2 = (index >> 15) & 0x1;
+		ir_entry->zero = 0;
+		ir_entry->format = 1;
+		ir_entry->index = (index & 0x7fff);
+	} else
+#endif
+	{
+		entry->delivery_mode = INT_DELIVERY_MODE;
+		entry->dest_mode = INT_DEST_MODE;
+		entry->dest = destination;
 	}
+
+	entry->mask = 0;				/* enable IRQ */
+	entry->trigger = trigger;
+	entry->polarity = polarity;
+	entry->vector = vector;
+
+	/* Mask level triggered irqs.
+	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
+	 */
+	if (trigger)
+		entry->mask = 1;
+	return 0;
 }
 
 static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
 			      int trigger, int polarity)
 {
-	struct irq_cfg *cfg = irq_cfg + irq;
+	struct irq_cfg *cfg;
 	struct IO_APIC_route_entry entry;
 	cpumask_t mask;
 
 	if (!IO_APIC_IRQ(irq))
 		return;
 
+	cfg = irq_cfg(irq);
+
 	mask = TARGET_CPUS;
 	if (assign_irq_vector(irq, mask))
 		return;
@@ -869,24 +1321,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
 		    apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
 		    irq, trigger, polarity);
 
-	/*
-	 * add it to the IO-APIC irq-routing table:
-	 */
-	memset(&entry,0,sizeof(entry));
 
-	entry.delivery_mode = INT_DELIVERY_MODE;
-	entry.dest_mode = INT_DEST_MODE;
-	entry.dest = cpu_mask_to_apicid(mask);
-	entry.mask = 0;				/* enable IRQ */
-	entry.trigger = trigger;
-	entry.polarity = polarity;
-	entry.vector = cfg->vector;
-
-	/* Mask level triggered irqs.
-	 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
-	 */
-	if (trigger)
-		entry.mask = 1;
+	if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
+			       cpu_mask_to_apicid(mask), trigger, polarity,
+			       cfg->vector)) {
+		printk("Failed to setup ioapic entry for ioapic  %d, pin %d\n",
+		       mp_ioapics[apic].mp_apicid, pin);
+		__clear_irq_vector(irq);
+		return;
+	}
 
 	ioapic_register_intr(irq, trigger);
 	if (irq < 16)
@@ -897,37 +1340,49 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
 
 static void __init setup_IO_APIC_irqs(void)
 {
-	int apic, pin, idx, irq, first_notcon = 1;
+	int apic, pin, idx, irq;
+	int notcon = 0;
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
 	for (apic = 0; apic < nr_ioapics; apic++) {
-	for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-
-		idx = find_irq_entry(apic,pin,mp_INT);
-		if (idx == -1) {
-			if (first_notcon) {
-				apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
-				first_notcon = 0;
-			} else
-				apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
-			continue;
-		}
-		if (!first_notcon) {
-			apic_printk(APIC_VERBOSE, " not connected.\n");
-			first_notcon = 1;
-		}
+		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+
+			idx = find_irq_entry(apic, pin, mp_INT);
+			if (idx == -1) {
+				if (!notcon) {
+					notcon = 1;
+					apic_printk(APIC_VERBOSE,
+						KERN_DEBUG " %d-%d",
+						mp_ioapics[apic].mp_apicid,
+						pin);
+				} else
+					apic_printk(APIC_VERBOSE, " %d-%d",
+						mp_ioapics[apic].mp_apicid,
+						pin);
+				continue;
+			}
+			if (notcon) {
+				apic_printk(APIC_VERBOSE,
+					" (apicid-pin) not connected\n");
+				notcon = 0;
+			}
 
-		irq = pin_2_irq(idx, apic, pin);
-		add_pin_to_irq(irq, apic, pin);
+			irq = pin_2_irq(idx, apic, pin);
+#ifdef CONFIG_X86_32
+			if (multi_timer_check(apic, irq))
+				continue;
+#endif
+			add_pin_to_irq(irq, apic, pin);
 
-		setup_IO_APIC_irq(apic, pin, irq,
-				  irq_trigger(idx), irq_polarity(idx));
-	}
+			setup_IO_APIC_irq(apic, pin, irq,
+					irq_trigger(idx), irq_polarity(idx));
+		}
 	}
 
-	if (!first_notcon)
-		apic_printk(APIC_VERBOSE, " not connected.\n");
+	if (notcon)
+		apic_printk(APIC_VERBOSE,
+			" (apicid-pin) not connected\n");
 }
 
 /*
@@ -938,6 +1393,11 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
 {
 	struct IO_APIC_route_entry entry;
 
+#ifdef CONFIG_INTR_REMAP
+	if (intr_remapping_enabled)
+		return;
+#endif
+
 	memset(&entry, 0, sizeof(entry));
 
 	/*
@@ -964,13 +1424,17 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
 	ioapic_write_entry(apic, pin, entry);
 }
 
-void __apicdebuginit print_IO_APIC(void)
+
+__apicdebuginit(void) print_IO_APIC(void)
 {
 	int apic, i;
 	union IO_APIC_reg_00 reg_00;
 	union IO_APIC_reg_01 reg_01;
 	union IO_APIC_reg_02 reg_02;
+	union IO_APIC_reg_03 reg_03;
 	unsigned long flags;
+	struct irq_cfg *cfg;
+	unsigned int irq;
 
 	if (apic_verbosity == APIC_QUIET)
 		return;
@@ -993,12 +1457,16 @@ void __apicdebuginit print_IO_APIC(void)
 	reg_01.raw = io_apic_read(apic, 1);
 	if (reg_01.bits.version >= 0x10)
 		reg_02.raw = io_apic_read(apic, 2);
+	if (reg_01.bits.version >= 0x20)
+		reg_03.raw = io_apic_read(apic, 3);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	printk("\n");
 	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
 	printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
 	printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
+	printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
+	printk(KERN_DEBUG ".......    : LTS          : %X\n", reg_00.bits.LTS);
 
 	printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
 	printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
@@ -1006,11 +1474,27 @@ void __apicdebuginit print_IO_APIC(void)
 	printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
 	printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
 
-	if (reg_01.bits.version >= 0x10) {
+	/*
+	 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
+	 * but the value of reg_02 is read as the previous read register
+	 * value, so ignore it if reg_02 == reg_01.
+	 */
+	if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
 		printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
 		printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
 	}
 
+	/*
+	 * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
+	 * or reg_03, but the value of reg_0[23] is read as the previous read
+	 * register value, so ignore it if reg_03 == reg_0[12].
+	 */
+	if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
+	    reg_03.raw != reg_01.raw) {
+		printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
+		printk(KERN_DEBUG ".......     : Boot DT    : %X\n", reg_03.bits.boot_DT);
+	}
+
 	printk(KERN_DEBUG ".... IRQ redirection table:\n");
 
 	printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
@@ -1039,16 +1523,16 @@ void __apicdebuginit print_IO_APIC(void)
 	}
 	}
 	printk(KERN_DEBUG "IRQ to pin mappings:\n");
-	for (i = 0; i < NR_IRQS; i++) {
-		struct irq_pin_list *entry = irq_2_pin + i;
-		if (entry->pin < 0)
+	for_each_irq_cfg(irq, cfg) {
+		struct irq_pin_list *entry = cfg->irq_2_pin;
+		if (!entry)
 			continue;
-		printk(KERN_DEBUG "IRQ%d ", i);
+		printk(KERN_DEBUG "IRQ%d ", irq);
 		for (;;) {
 			printk("-> %d:%d", entry->apic, entry->pin);
 			if (!entry->next)
 				break;
-			entry = irq_2_pin + entry->next;
+			entry = entry->next;
 		}
 		printk("\n");
 	}
@@ -1058,9 +1542,7 @@ void __apicdebuginit print_IO_APIC(void)
 	return;
 }
 
-#if 0
-
-static __apicdebuginit void print_APIC_bitfield (int base)
+__apicdebuginit(void) print_APIC_bitfield(int base)
 {
 	unsigned int v;
 	int i, j;
@@ -1081,9 +1563,10 @@ static __apicdebuginit void print_APIC_bitfield (int base)
 	}
 }
 
-void __apicdebuginit print_local_APIC(void * dummy)
+__apicdebuginit(void) print_local_APIC(void *dummy)
 {
 	unsigned int v, ver, maxlvt;
+	u64 icr;
 
 	if (apic_verbosity == APIC_QUIET)
 		return;
@@ -1091,7 +1574,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
 	printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
 		smp_processor_id(), hard_smp_processor_id());
 	v = apic_read(APIC_ID);
-	printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
+	printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v, read_apic_id());
 	v = apic_read(APIC_LVR);
 	printk(KERN_INFO "... APIC VERSION: %08x\n", v);
 	ver = GET_APIC_VERSION(v);
@@ -1100,20 +1583,31 @@ void __apicdebuginit print_local_APIC(void * dummy)
 	v = apic_read(APIC_TASKPRI);
 	printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
 
-	v = apic_read(APIC_ARBPRI);
-	printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
-		v & APIC_ARBPRI_MASK);
-	v = apic_read(APIC_PROCPRI);
-	printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+	if (APIC_INTEGRATED(ver)) {                     /* !82489DX */
+		if (!APIC_XAPIC(ver)) {
+			v = apic_read(APIC_ARBPRI);
+			printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
+			       v & APIC_ARBPRI_MASK);
+		}
+		v = apic_read(APIC_PROCPRI);
+		printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
+	}
+
+	/*
+	 * Remote read supported only in the 82489DX and local APIC for
+	 * Pentium processors.
+	 */
+	if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
+		v = apic_read(APIC_RRR);
+		printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
+	}
 
-	v = apic_read(APIC_EOI);
-	printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
-	v = apic_read(APIC_RRR);
-	printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
 	v = apic_read(APIC_LDR);
 	printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
-	v = apic_read(APIC_DFR);
-	printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+	if (!x2apic_enabled()) {
+		v = apic_read(APIC_DFR);
+		printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
+	}
 	v = apic_read(APIC_SPIV);
 	printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
 
@@ -1124,13 +1618,17 @@ void __apicdebuginit print_local_APIC(void * dummy)
 	printk(KERN_DEBUG "... APIC IRR field:\n");
 	print_APIC_bitfield(APIC_IRR);
 
-	v = apic_read(APIC_ESR);
-	printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+	if (APIC_INTEGRATED(ver)) {             /* !82489DX */
+		if (maxlvt > 3)         /* Due to the Pentium erratum 3AP. */
+			apic_write(APIC_ESR, 0);
 
-	v = apic_read(APIC_ICR);
-	printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
-	v = apic_read(APIC_ICR2);
-	printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
+		v = apic_read(APIC_ESR);
+		printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
+	}
+
+	icr = apic_icr_read();
+	printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
+	printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
 
 	v = apic_read(APIC_LVTT);
 	printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
@@ -1158,12 +1656,17 @@ void __apicdebuginit print_local_APIC(void * dummy)
 	printk("\n");
 }
 
-void print_all_local_APICs (void)
+__apicdebuginit(void) print_all_local_APICs(void)
 {
-	on_each_cpu(print_local_APIC, NULL, 1);
+	int cpu;
+
+	preempt_disable();
+	for_each_online_cpu(cpu)
+		smp_call_function_single(cpu, print_local_APIC, NULL, 1);
+	preempt_enable();
 }
 
-void __apicdebuginit print_PIC(void)
+__apicdebuginit(void) print_PIC(void)
 {
 	unsigned int v;
 	unsigned long flags;
@@ -1195,19 +1698,34 @@ void __apicdebuginit print_PIC(void)
 	printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
 }
 
-#endif  /*  0  */
+__apicdebuginit(int) print_all_ICs(void)
+{
+	print_PIC();
+	print_all_local_APICs();
+	print_IO_APIC();
+
+	return 0;
+}
+
+fs_initcall(print_all_ICs);
+
+
+/* Where if anywhere is the i8259 connect in external int mode */
+static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
 
 void __init enable_IO_APIC(void)
 {
 	union IO_APIC_reg_01 reg_01;
 	int i8259_apic, i8259_pin;
-	int i, apic;
+	int apic;
 	unsigned long flags;
 
-	for (i = 0; i < PIN_MAP_SIZE; i++) {
-		irq_2_pin[i].pin = -1;
-		irq_2_pin[i].next = 0;
-	}
+#ifdef CONFIG_X86_32
+	int i;
+	if (!pirqs_enabled)
+		for (i = 0; i < MAX_PIRQS; i++)
+			pirq_entries[i] = -1;
+#endif
 
 	/*
 	 * The number of IO-APIC IRQ registers (== #pins):
@@ -1237,6 +1755,10 @@ void __init enable_IO_APIC(void)
 	}
  found_i8259:
 	/* Look to see what if the MP table has reported the ExtINT */
+	/* If we could not find the appropriate pin by looking at the ioapic
+	 * the i8259 probably is not connected the ioapic but give the
+	 * mptable a chance anyway.
+	 */
 	i8259_pin  = find_isa_irq_pin(0, mp_ExtINT);
 	i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
 	/* Trust the MP table if nothing is setup in the hardware */
@@ -1285,7 +1807,7 @@ void disable_IO_APIC(void)
 		entry.dest_mode       = 0; /* Physical */
 		entry.delivery_mode   = dest_ExtINT; /* ExtInt */
 		entry.vector          = 0;
-		entry.dest          = GET_APIC_ID(read_apic_id());
+		entry.dest            = read_apic_id();
 
 		/*
 		 * Add it to the IO-APIC irq-routing table:
@@ -1296,6 +1818,133 @@ void disable_IO_APIC(void)
 	disconnect_bsp_APIC(ioapic_i8259.pin != -1);
 }
 
+#ifdef CONFIG_X86_32
+/*
+ * function to set the IO-APIC physical IDs based on the
+ * values stored in the MPC table.
+ *
+ * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
+ */
+
+static void __init setup_ioapic_ids_from_mpc(void)
+{
+	union IO_APIC_reg_00 reg_00;
+	physid_mask_t phys_id_present_map;
+	int apic;
+	int i;
+	unsigned char old_id;
+	unsigned long flags;
+
+	if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
+		return;
+
+	/*
+	 * Don't check I/O APIC IDs for xAPIC systems.  They have
+	 * no meaning without the serial APIC bus.
+	 */
+	if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		|| APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+		return;
+	/*
+	 * This is broken; anything with a real cpu count has to
+	 * circumvent this idiocy regardless.
+	 */
+	phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+	/*
+	 * Set the IOAPIC ID to the value stored in the MPC table.
+	 */
+	for (apic = 0; apic < nr_ioapics; apic++) {
+
+		/* Read the register 0 value */
+		spin_lock_irqsave(&ioapic_lock, flags);
+		reg_00.raw = io_apic_read(apic, 0);
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+
+		old_id = mp_ioapics[apic].mp_apicid;
+
+		if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
+			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
+				apic, mp_ioapics[apic].mp_apicid);
+			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+				reg_00.bits.ID);
+			mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
+		}
+
+		/*
+		 * Sanity check, is the ID really free? Every APIC in a
+		 * system must have a unique ID or we get lots of nice
+		 * 'stuck on smp_invalidate_needed IPI wait' messages.
+		 */
+		if (check_apicid_used(phys_id_present_map,
+					mp_ioapics[apic].mp_apicid)) {
+			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
+				apic, mp_ioapics[apic].mp_apicid);
+			for (i = 0; i < get_physical_broadcast(); i++)
+				if (!physid_isset(i, phys_id_present_map))
+					break;
+			if (i >= get_physical_broadcast())
+				panic("Max APIC ID exceeded!\n");
+			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
+				i);
+			physid_set(i, phys_id_present_map);
+			mp_ioapics[apic].mp_apicid = i;
+		} else {
+			physid_mask_t tmp;
+			tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
+			apic_printk(APIC_VERBOSE, "Setting %d in the "
+					"phys_id_present_map\n",
+					mp_ioapics[apic].mp_apicid);
+			physids_or(phys_id_present_map, phys_id_present_map, tmp);
+		}
+
+
+		/*
+		 * We need to adjust the IRQ routing table
+		 * if the ID changed.
+		 */
+		if (old_id != mp_ioapics[apic].mp_apicid)
+			for (i = 0; i < mp_irq_entries; i++)
+				if (mp_irqs[i].mp_dstapic == old_id)
+					mp_irqs[i].mp_dstapic
+						= mp_ioapics[apic].mp_apicid;
+
+		/*
+		 * Read the right value from the MPC table and
+		 * write it into the ID register.
+		 */
+		apic_printk(APIC_VERBOSE, KERN_INFO
+			"...changing IO-APIC physical APIC ID to %d ...",
+			mp_ioapics[apic].mp_apicid);
+
+		reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
+		spin_lock_irqsave(&ioapic_lock, flags);
+		io_apic_write(apic, 0, reg_00.raw);
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+
+		/*
+		 * Sanity check
+		 */
+		spin_lock_irqsave(&ioapic_lock, flags);
+		reg_00.raw = io_apic_read(apic, 0);
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+		if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
+			printk("could not set ID!\n");
+		else
+			apic_printk(APIC_VERBOSE, " ok.\n");
+	}
+}
+#endif
+
+int no_timer_check __initdata;
+
+static int __init notimercheck(char *s)
+{
+	no_timer_check = 1;
+	return 1;
+}
+__setup("no_timer_check", notimercheck);
+
 /*
  * There is a nasty bug in some older SMP boards, their mptable lies
  * about the timer IRQ. We do the following to work around the situation:
@@ -1309,6 +1958,9 @@ static int __init timer_irq_works(void)
 	unsigned long t1 = jiffies;
 	unsigned long flags;
 
+	if (no_timer_check)
+		return 1;
+
 	local_save_flags(flags);
 	local_irq_enable();
 	/* Let ten ticks pass... */
@@ -1369,19 +2021,27 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
 	return was_pending;
 }
 
+#ifdef CONFIG_X86_64
 static int ioapic_retrigger_irq(unsigned int irq)
 {
-	struct irq_cfg *cfg = &irq_cfg[irq];
-	cpumask_t mask;
+
+	struct irq_cfg *cfg = irq_cfg(irq);
 	unsigned long flags;
 
 	spin_lock_irqsave(&vector_lock, flags);
-	mask = cpumask_of_cpu(first_cpu(cfg->domain));
-	send_IPI_mask(mask, cfg->vector);
+	send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
 	spin_unlock_irqrestore(&vector_lock, flags);
 
 	return 1;
 }
+#else
+static int ioapic_retrigger_irq(unsigned int irq)
+{
+	send_IPI_self(irq_cfg(irq)->vector);
+
+	return 1;
+}
+#endif
 
 /*
  * Level and edge triggered IO-APIC interrupts need different handling,
@@ -1393,11 +2053,159 @@ static int ioapic_retrigger_irq(unsigned int irq)
  */
 
 #ifdef CONFIG_SMP
+
+#ifdef CONFIG_INTR_REMAP
+static void ir_irq_migration(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
+
+/*
+ * Migrate the IO-APIC irq in the presence of intr-remapping.
+ *
+ * For edge triggered, irq migration is a simple atomic update(of vector
+ * and cpu destination) of IRTE and flush the hardware cache.
+ *
+ * For level triggered, we need to modify the io-apic RTE aswell with the update
+ * vector information, along with modifying IRTE with vector and destination.
+ * So irq migration for level triggered is little  bit more complex compared to
+ * edge triggered migration. But the good news is, we use the same algorithm
+ * for level triggered migration as we have today, only difference being,
+ * we now initiate the irq migration from process context instead of the
+ * interrupt context.
+ *
+ * In future, when we do a directed EOI (combined with cpu EOI broadcast
+ * suppression) to the IO-APIC, level triggered irq migration will also be
+ * as simple as edge triggered migration and we can do the irq migration
+ * with a simple atomic update to IO-APIC RTE.
+ */
+static void migrate_ioapic_irq(int irq, cpumask_t mask)
+{
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
+	cpumask_t tmp, cleanup_mask;
+	struct irte irte;
+	int modify_ioapic_rte;
+	unsigned int dest;
+	unsigned long flags;
+
+	cpus_and(tmp, mask, cpu_online_map);
+	if (cpus_empty(tmp))
+		return;
+
+	if (get_irte(irq, &irte))
+		return;
+
+	if (assign_irq_vector(irq, mask))
+		return;
+
+	cfg = irq_cfg(irq);
+	cpus_and(tmp, cfg->domain, mask);
+	dest = cpu_mask_to_apicid(tmp);
+
+	desc = irq_to_desc(irq);
+	modify_ioapic_rte = desc->status & IRQ_LEVEL;
+	if (modify_ioapic_rte) {
+		spin_lock_irqsave(&ioapic_lock, flags);
+		__target_IO_APIC_irq(irq, dest, cfg->vector);
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+	}
+
+	irte.vector = cfg->vector;
+	irte.dest_id = IRTE_DEST(dest);
+
+	/*
+	 * Modified the IRTE and flushes the Interrupt entry cache.
+	 */
+	modify_irte(irq, &irte);
+
+	if (cfg->move_in_progress) {
+		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
+		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+		cfg->move_in_progress = 0;
+	}
+
+	desc->affinity = mask;
+}
+
+static int migrate_irq_remapped_level(int irq)
+{
+	int ret = -1;
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	mask_IO_APIC_irq(irq);
+
+	if (io_apic_level_ack_pending(irq)) {
+		/*
+		 * Interrupt in progress. Migrating irq now will change the
+		 * vector information in the IO-APIC RTE and that will confuse
+		 * the EOI broadcast performed by cpu.
+		 * So, delay the irq migration to the next instance.
+		 */
+		schedule_delayed_work(&ir_migration_work, 1);
+		goto unmask;
+	}
+
+	/* everthing is clear. we have right of way */
+	migrate_ioapic_irq(irq, desc->pending_mask);
+
+	ret = 0;
+	desc->status &= ~IRQ_MOVE_PENDING;
+	cpus_clear(desc->pending_mask);
+
+unmask:
+	unmask_IO_APIC_irq(irq);
+	return ret;
+}
+
+static void ir_irq_migration(struct work_struct *work)
+{
+	unsigned int irq;
+	struct irq_desc *desc;
+
+	for_each_irq_desc(irq, desc) {
+		if (desc->status & IRQ_MOVE_PENDING) {
+			unsigned long flags;
+
+			spin_lock_irqsave(&desc->lock, flags);
+			if (!desc->chip->set_affinity ||
+			    !(desc->status & IRQ_MOVE_PENDING)) {
+				desc->status &= ~IRQ_MOVE_PENDING;
+				spin_unlock_irqrestore(&desc->lock, flags);
+				continue;
+			}
+
+			desc->chip->set_affinity(irq, desc->pending_mask);
+			spin_unlock_irqrestore(&desc->lock, flags);
+		}
+	}
+}
+
+/*
+ * Migrates the IRQ destination in the process context.
+ */
+static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	if (desc->status & IRQ_LEVEL) {
+		desc->status |= IRQ_MOVE_PENDING;
+		desc->pending_mask = mask;
+		migrate_irq_remapped_level(irq);
+		return;
+	}
+
+	migrate_ioapic_irq(irq, mask);
+}
+#endif
+
 asmlinkage void smp_irq_move_cleanup_interrupt(void)
 {
 	unsigned vector, me;
 	ack_APIC_irq();
+#ifdef CONFIG_X86_64
 	exit_idle();
+#endif
 	irq_enter();
 
 	me = smp_processor_id();
@@ -1406,11 +2214,12 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 		struct irq_desc *desc;
 		struct irq_cfg *cfg;
 		irq = __get_cpu_var(vector_irq)[vector];
-		if (irq >= NR_IRQS)
+
+		desc = irq_to_desc(irq);
+		if (!desc)
 			continue;
 
-		desc = irq_desc + irq;
-		cfg = irq_cfg + irq;
+		cfg = irq_cfg(irq);
 		spin_lock(&desc->lock);
 		if (!cfg->move_cleanup_count)
 			goto unlock;
@@ -1429,7 +2238,7 @@ unlock:
 
 static void irq_complete_move(unsigned int irq)
 {
-	struct irq_cfg *cfg = irq_cfg + irq;
+	struct irq_cfg *cfg = irq_cfg(irq);
 	unsigned vector, me;
 
 	if (likely(!cfg->move_in_progress))
@@ -1449,6 +2258,17 @@ static void irq_complete_move(unsigned int irq)
 #else
 static inline void irq_complete_move(unsigned int irq) {}
 #endif
+#ifdef CONFIG_INTR_REMAP
+static void ack_x2apic_level(unsigned int irq)
+{
+	ack_x2APIC_irq();
+}
+
+static void ack_x2apic_edge(unsigned int irq)
+{
+	ack_x2APIC_irq();
+}
+#endif
 
 static void ack_apic_edge(unsigned int irq)
 {
@@ -1457,19 +2277,50 @@ static void ack_apic_edge(unsigned int irq)
 	ack_APIC_irq();
 }
 
+atomic_t irq_mis_count;
+
 static void ack_apic_level(unsigned int irq)
 {
+#ifdef CONFIG_X86_32
+	unsigned long v;
+	int i;
+#endif
 	int do_unmask_irq = 0;
 
 	irq_complete_move(irq);
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	/* If we are moving the irq we need to mask it */
-	if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) {
+	if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
 		do_unmask_irq = 1;
 		mask_IO_APIC_irq(irq);
 	}
 #endif
 
+#ifdef CONFIG_X86_32
+	/*
+	* It appears there is an erratum which affects at least version 0x11
+	* of I/O APIC (that's the 82093AA and cores integrated into various
+	* chipsets).  Under certain conditions a level-triggered interrupt is
+	* erroneously delivered as edge-triggered one but the respective IRR
+	* bit gets set nevertheless.  As a result the I/O unit expects an EOI
+	* message but it will never arrive and further interrupts are blocked
+	* from the source.  The exact reason is so far unknown, but the
+	* phenomenon was observed when two consecutive interrupt requests
+	* from a given source get delivered to the same CPU and the source is
+	* temporarily disabled in between.
+	*
+	* A workaround is to simulate an EOI message manually.  We achieve it
+	* by setting the trigger mode to edge and then to level when the edge
+	* trigger mode gets detected in the TMR of a local APIC for a
+	* level-triggered interrupt.  We mask the source for the time of the
+	* operation to prevent an edge-triggered interrupt escaping meanwhile.
+	* The idea is from Manfred Spraul.  --macro
+	*/
+	i = irq_cfg(irq)->vector;
+
+	v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
+#endif
+
 	/*
 	 * We must acknowledge the irq before we move it or the acknowledge will
 	 * not propagate properly.
@@ -1508,24 +2359,51 @@ static void ack_apic_level(unsigned int irq)
 			move_masked_irq(irq);
 		unmask_IO_APIC_irq(irq);
 	}
+
+#ifdef CONFIG_X86_32
+	if (!(v & (1 << (i & 0x1f)))) {
+		atomic_inc(&irq_mis_count);
+		spin_lock(&ioapic_lock);
+		__mask_and_edge_IO_APIC_irq(irq);
+		__unmask_and_level_IO_APIC_irq(irq);
+		spin_unlock(&ioapic_lock);
+	}
+#endif
 }
 
 static struct irq_chip ioapic_chip __read_mostly = {
-	.name 		= "IO-APIC",
-	.startup 	= startup_ioapic_irq,
-	.mask	 	= mask_IO_APIC_irq,
-	.unmask	 	= unmask_IO_APIC_irq,
-	.ack 		= ack_apic_edge,
-	.eoi 		= ack_apic_level,
+	.name		= "IO-APIC",
+	.startup	= startup_ioapic_irq,
+	.mask		= mask_IO_APIC_irq,
+	.unmask		= unmask_IO_APIC_irq,
+	.ack		= ack_apic_edge,
+	.eoi		= ack_apic_level,
 #ifdef CONFIG_SMP
-	.set_affinity 	= set_ioapic_affinity_irq,
+	.set_affinity	= set_ioapic_affinity_irq,
 #endif
 	.retrigger	= ioapic_retrigger_irq,
 };
 
+#ifdef CONFIG_INTR_REMAP
+static struct irq_chip ir_ioapic_chip __read_mostly = {
+	.name		= "IR-IO-APIC",
+	.startup	= startup_ioapic_irq,
+	.mask		= mask_IO_APIC_irq,
+	.unmask		= unmask_IO_APIC_irq,
+	.ack		= ack_x2apic_edge,
+	.eoi		= ack_x2apic_level,
+#ifdef CONFIG_SMP
+	.set_affinity	= set_ir_ioapic_affinity_irq,
+#endif
+	.retrigger	= ioapic_retrigger_irq,
+};
+#endif
+
 static inline void init_IO_APIC_traps(void)
 {
 	int irq;
+	struct irq_desc *desc;
+	struct irq_cfg *cfg;
 
 	/*
 	 * NOTE! The local APIC isn't very good at handling
@@ -1538,8 +2416,8 @@ static inline void init_IO_APIC_traps(void)
 	 * Also, we've got to be careful not to trash gate
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
-	for (irq = 0; irq < NR_IRQS ; irq++) {
-		if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) {
+	for_each_irq_cfg(irq, cfg) {
+		if (IO_APIC_IRQ(irq) && !cfg->vector) {
 			/*
 			 * Hmm.. We don't have an entry for this,
 			 * so default to an old-fashioned 8259
@@ -1547,27 +2425,33 @@ static inline void init_IO_APIC_traps(void)
 			 */
 			if (irq < 16)
 				make_8259A_irq(irq);
-			else
+			else {
+				desc = irq_to_desc(irq);
 				/* Strange. Oh, well.. */
-				irq_desc[irq].chip = &no_irq_chip;
+				desc->chip = &no_irq_chip;
+			}
 		}
 	}
 }
 
-static void unmask_lapic_irq(unsigned int irq)
+/*
+ * The local APIC irq-chip implementation:
+ */
+
+static void mask_lapic_irq(unsigned int irq)
 {
 	unsigned long v;
 
 	v = apic_read(APIC_LVT0);
-	apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
+	apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
 }
 
-static void mask_lapic_irq(unsigned int irq)
+static void unmask_lapic_irq(unsigned int irq)
 {
 	unsigned long v;
 
 	v = apic_read(APIC_LVT0);
-	apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
+	apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
 }
 
 static void ack_lapic_irq (unsigned int irq)
@@ -1584,7 +2468,10 @@ static struct irq_chip lapic_chip __read_mostly = {
 
 static void lapic_register_intr(int irq)
 {
-	irq_desc[irq].status &= ~IRQ_LEVEL;
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+	desc->status &= ~IRQ_LEVEL;
 	set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
 				      "edge");
 }
@@ -1592,19 +2479,19 @@ static void lapic_register_intr(int irq)
 static void __init setup_nmi(void)
 {
 	/*
- 	 * Dirty trick to enable the NMI watchdog ...
+	 * Dirty trick to enable the NMI watchdog ...
 	 * We put the 8259A master into AEOI mode and
 	 * unmask on all local APICs LVT0 as NMI.
 	 *
 	 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
 	 * is from Maciej W. Rozycki - so we do not have to EOI from
 	 * the NMI handler or the timer interrupt.
-	 */ 
-	printk(KERN_INFO "activating NMI Watchdog ...");
+	 */
+	apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
 
 	enable_NMI_through_LVT0();
 
-	printk(" done.\n");
+	apic_printk(APIC_VERBOSE, " done.\n");
 }
 
 /*
@@ -1621,12 +2508,17 @@ static inline void __init unlock_ExtINT_logic(void)
 	unsigned char save_control, save_freq_select;
 
 	pin  = find_isa_irq_pin(8, mp_INT);
+	if (pin == -1) {
+		WARN_ON_ONCE(1);
+		return;
+	}
 	apic = find_isa_irq_apic(8, mp_INT);
-	if (pin == -1)
+	if (apic == -1) {
+		WARN_ON_ONCE(1);
 		return;
+	}
 
 	entry0 = ioapic_read_entry(apic, pin);
-
 	clear_IO_APIC_pin(apic, pin);
 
 	memset(&entry1, 0, sizeof(entry1));
@@ -1661,23 +2553,38 @@ static inline void __init unlock_ExtINT_logic(void)
 	ioapic_write_entry(apic, pin, entry0);
 }
 
+static int disable_timer_pin_1 __initdata;
+/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
+static int __init disable_timer_pin_setup(char *arg)
+{
+	disable_timer_pin_1 = 1;
+	return 0;
+}
+early_param("disable_timer_pin_1", disable_timer_pin_setup);
+
+int timer_through_8259 __initdata;
+
 /*
  * This code may look a bit paranoid, but it's supposed to cooperate with
  * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
  * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
  * fanatically on his truly buggy board.
  *
- * FIXME: really need to revamp this for modern platforms only.
+ * FIXME: really need to revamp this for all platforms.
  */
 static inline void __init check_timer(void)
 {
-	struct irq_cfg *cfg = irq_cfg + 0;
+	struct irq_cfg *cfg = irq_cfg(0);
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
+	unsigned int ver;
 	int no_pin1 = 0;
 
 	local_irq_save(flags);
 
+	ver = apic_read(APIC_LVR);
+	ver = GET_APIC_VERSION(ver);
+
 	/*
 	 * get/set the timer IRQ vector:
 	 */
@@ -1686,18 +2593,27 @@ static inline void __init check_timer(void)
 
 	/*
 	 * As IRQ0 is to be enabled in the 8259A, the virtual
-	 * wire has to be disabled in the local APIC.
+	 * wire has to be disabled in the local APIC.  Also
+	 * timer interrupts need to be acknowledged manually in
+	 * the 8259A for the i82489DX when using the NMI
+	 * watchdog as that APIC treats NMIs as level-triggered.
+	 * The AEOI mode will finish them in the 8259A
+	 * automatically.
 	 */
 	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 	init_8259A(1);
+#ifdef CONFIG_X86_32
+	timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
+#endif
 
 	pin1  = find_isa_irq_pin(0, mp_INT);
 	apic1 = find_isa_irq_apic(0, mp_INT);
 	pin2  = ioapic_i8259.pin;
 	apic2 = ioapic_i8259.apic;
 
-	apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
-		cfg->vector, apic1, pin1, apic2, pin2);
+	apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
+		    "apic1=%d pin1=%d apic2=%d pin2=%d\n",
+		    cfg->vector, apic1, pin1, apic2, pin2);
 
 	/*
 	 * Some BIOS writers are clueless and report the ExtINTA
@@ -1707,6 +2623,10 @@ static inline void __init check_timer(void)
 	 * 8259A.
 	 */
 	if (pin1 == -1) {
+#ifdef CONFIG_INTR_REMAP
+		if (intr_remapping_enabled)
+			panic("BIOS bug: timer not connected to IO-APIC");
+#endif
 		pin1 = pin2;
 		apic1 = apic2;
 		no_pin1 = 1;
@@ -1724,7 +2644,7 @@ static inline void __init check_timer(void)
 			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
 		}
 		unmask_IO_APIC_irq(0);
-		if (!no_timer_check && timer_irq_works()) {
+		if (timer_irq_works()) {
 			if (nmi_watchdog == NMI_IO_APIC) {
 				setup_nmi();
 				enable_8259A_irq(0);
@@ -1733,16 +2653,19 @@ static inline void __init check_timer(void)
 				clear_IO_APIC_pin(0, pin1);
 			goto out;
 		}
+#ifdef CONFIG_INTR_REMAP
+		if (intr_remapping_enabled)
+			panic("timer doesn't work through Interrupt-remapped IO-APIC");
+#endif
 		clear_IO_APIC_pin(apic1, pin1);
 		if (!no_pin1)
-			apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: "
+			apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
 				    "8254 timer not connected to IO-APIC\n");
 
-		apic_printk(APIC_VERBOSE,KERN_INFO
-			"...trying to set up timer (IRQ0) "
-			"through the 8259A ... ");
-		apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...",
-			apic2, pin2);
+		apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
+			    "(IRQ0) through the 8259A ...\n");
+		apic_printk(APIC_QUIET, KERN_INFO
+			    "..... (found apic %d pin %d) ...\n", apic2, pin2);
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
@@ -1751,7 +2674,7 @@ static inline void __init check_timer(void)
 		unmask_IO_APIC_irq(0);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
-			apic_printk(APIC_VERBOSE," works.\n");
+			apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
 			timer_through_8259 = 1;
 			if (nmi_watchdog == NMI_IO_APIC) {
 				disable_8259A_irq(0);
@@ -1765,29 +2688,35 @@ static inline void __init check_timer(void)
 		 */
 		disable_8259A_irq(0);
 		clear_IO_APIC_pin(apic2, pin2);
-		apic_printk(APIC_VERBOSE," failed.\n");
+		apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
 	}
 
 	if (nmi_watchdog == NMI_IO_APIC) {
-		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
+		apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
+			    "through the IO-APIC - disabling NMI Watchdog!\n");
 		nmi_watchdog = NMI_NONE;
 	}
+#ifdef CONFIG_X86_32
+	timer_ack = 0;
+#endif
 
-	apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
+	apic_printk(APIC_QUIET, KERN_INFO
+		    "...trying to set up timer as Virtual Wire IRQ...\n");
 
 	lapic_register_intr(0);
 	apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector);	/* Fixed mode */
 	enable_8259A_irq(0);
 
 	if (timer_irq_works()) {
-		apic_printk(APIC_VERBOSE," works.\n");
+		apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
 		goto out;
 	}
 	disable_8259A_irq(0);
 	apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
-	apic_printk(APIC_VERBOSE," failed.\n");
+	apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
 
-	apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ...");
+	apic_printk(APIC_QUIET, KERN_INFO
+		    "...trying to set up timer as ExtINT IRQ...\n");
 
 	init_8259A(0);
 	make_8259A_irq(0);
@@ -1796,22 +2725,16 @@ static inline void __init check_timer(void)
 	unlock_ExtINT_logic();
 
 	if (timer_irq_works()) {
-		apic_printk(APIC_VERBOSE," works.\n");
+		apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
 		goto out;
 	}
-	apic_printk(APIC_VERBOSE," failed :(.\n");
-	panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n");
+	apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
+	panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
+		"report.  Then try booting with the 'noapic' option.\n");
 out:
 	local_irq_restore(flags);
 }
 
-static int __init notimercheck(char *s)
-{
-	no_timer_check = 1;
-	return 1;
-}
-__setup("no_timer_check", notimercheck);
-
 /*
  * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
  * to devices.  However there may be an I/O APIC pin available for
@@ -1829,27 +2752,49 @@ __setup("no_timer_check", notimercheck);
  * the I/O APIC in all cases now.  No actual device should request
  * it anyway.  --macro
  */
-#define PIC_IRQS	(1<<2)
+#define PIC_IRQS	(1 << PIC_CASCADE_IR)
 
 void __init setup_IO_APIC(void)
 {
 
+#ifdef CONFIG_X86_32
+	enable_IO_APIC();
+#else
 	/*
 	 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
 	 */
+#endif
 
 	io_apic_irqs = ~PIC_IRQS;
 
 	apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
-
+	/*
+         * Set up IO-APIC IRQ routing.
+         */
+#ifdef CONFIG_X86_32
+	if (!acpi_ioapic)
+		setup_ioapic_ids_from_mpc();
+#endif
 	sync_Arb_IDs();
 	setup_IO_APIC_irqs();
 	init_IO_APIC_traps();
 	check_timer();
-	if (!acpi_ioapic)
-		print_IO_APIC();
 }
 
+/*
+ *      Called after all the initialization is done. If we didnt find any
+ *      APIC bugs then we can allow the modify fast path
+ */
+
+static int __init io_apic_bug_finalize(void)
+{
+	if (sis_apic_bug == -1)
+		sis_apic_bug = 0;
+	return 0;
+}
+
+late_initcall(io_apic_bug_finalize);
+
 struct sysfs_ioapic_data {
 	struct sys_device dev;
 	struct IO_APIC_route_entry entry[0];
@@ -1937,38 +2882,60 @@ device_initcall(ioapic_init_sysfs);
 /*
  * Dynamic irq allocate and deallocation
  */
-int create_irq(void)
+unsigned int create_irq_nr(unsigned int irq_want)
 {
 	/* Allocate an unused irq */
-	int irq;
-	int new;
+	unsigned int irq;
+	unsigned int new;
 	unsigned long flags;
+	struct irq_cfg *cfg_new;
+
+	irq_want = nr_irqs - 1;
 
-	irq = -ENOSPC;
+	irq = 0;
 	spin_lock_irqsave(&vector_lock, flags);
-	for (new = (NR_IRQS - 1); new >= 0; new--) {
+	for (new = irq_want; new > 0; new--) {
 		if (platform_legacy_irq(new))
 			continue;
-		if (irq_cfg[new].vector != 0)
+		cfg_new = irq_cfg(new);
+		if (cfg_new && cfg_new->vector != 0)
 			continue;
+		/* check if need to create one */
+		if (!cfg_new)
+			cfg_new = irq_cfg_alloc(new);
 		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
 			irq = new;
 		break;
 	}
 	spin_unlock_irqrestore(&vector_lock, flags);
 
-	if (irq >= 0) {
+	if (irq > 0) {
 		dynamic_irq_init(irq);
 	}
 	return irq;
 }
 
+int create_irq(void)
+{
+	int irq;
+
+	irq = create_irq_nr(nr_irqs - 1);
+
+	if (irq == 0)
+		irq = -1;
+
+	return irq;
+}
+
 void destroy_irq(unsigned int irq)
 {
 	unsigned long flags;
 
 	dynamic_irq_cleanup(irq);
 
+#ifdef CONFIG_INTR_REMAP
+	free_irte(irq);
+#endif
 	spin_lock_irqsave(&vector_lock, flags);
 	__clear_irq_vector(irq);
 	spin_unlock_irqrestore(&vector_lock, flags);
@@ -1980,18 +2947,50 @@ void destroy_irq(unsigned int irq)
 #ifdef CONFIG_PCI_MSI
 static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
 {
-	struct irq_cfg *cfg = irq_cfg + irq;
+	struct irq_cfg *cfg;
 	int err;
 	unsigned dest;
 	cpumask_t tmp;
 
 	tmp = TARGET_CPUS;
 	err = assign_irq_vector(irq, tmp);
-	if (!err) {
-		cpus_and(tmp, cfg->domain, tmp);
-		dest = cpu_mask_to_apicid(tmp);
+	if (err)
+		return err;
+
+	cfg = irq_cfg(irq);
+	cpus_and(tmp, cfg->domain, tmp);
+	dest = cpu_mask_to_apicid(tmp);
+
+#ifdef CONFIG_INTR_REMAP
+	if (irq_remapped(irq)) {
+		struct irte irte;
+		int ir_index;
+		u16 sub_handle;
+
+		ir_index = map_irq_to_irte_handle(irq, &sub_handle);
+		BUG_ON(ir_index == -1);
+
+		memset (&irte, 0, sizeof(irte));
+
+		irte.present = 1;
+		irte.dst_mode = INT_DEST_MODE;
+		irte.trigger_mode = 0; /* edge */
+		irte.dlvry_mode = INT_DELIVERY_MODE;
+		irte.vector = cfg->vector;
+		irte.dest_id = IRTE_DEST(dest);
+
+		modify_irte(irq, &irte);
 
 		msg->address_hi = MSI_ADDR_BASE_HI;
+		msg->data = sub_handle;
+		msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
+				  MSI_ADDR_IR_SHV |
+				  MSI_ADDR_IR_INDEX1(ir_index) |
+				  MSI_ADDR_IR_INDEX2(ir_index);
+	} else
+#endif
+	{
+		msg->address_hi = MSI_ADDR_BASE_HI;
 		msg->address_lo =
 			MSI_ADDR_BASE_LO |
 			((INT_DEST_MODE == 0) ?
@@ -2016,10 +3015,11 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 #ifdef CONFIG_SMP
 static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 {
-	struct irq_cfg *cfg = irq_cfg + irq;
+	struct irq_cfg *cfg;
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
+	struct irq_desc *desc;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
@@ -2028,6 +3028,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 	if (assign_irq_vector(irq, mask))
 		return;
 
+	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -2039,8 +3040,61 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	write_msi_msg(irq, &msg);
-	irq_desc[irq].affinity = mask;
+	desc = irq_to_desc(irq);
+	desc->affinity = mask;
 }
+
+#ifdef CONFIG_INTR_REMAP
+/*
+ * Migrate the MSI irq to another cpumask. This migration is
+ * done in the process context using interrupt-remapping hardware.
+ */
+static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
+{
+	struct irq_cfg *cfg;
+	unsigned int dest;
+	cpumask_t tmp, cleanup_mask;
+	struct irte irte;
+	struct irq_desc *desc;
+
+	cpus_and(tmp, mask, cpu_online_map);
+	if (cpus_empty(tmp))
+		return;
+
+	if (get_irte(irq, &irte))
+		return;
+
+	if (assign_irq_vector(irq, mask))
+		return;
+
+	cfg = irq_cfg(irq);
+	cpus_and(tmp, cfg->domain, mask);
+	dest = cpu_mask_to_apicid(tmp);
+
+	irte.vector = cfg->vector;
+	irte.dest_id = IRTE_DEST(dest);
+
+	/*
+	 * atomically update the IRTE with the new destination and vector.
+	 */
+	modify_irte(irq, &irte);
+
+	/*
+	 * After this point, all the interrupts will start arriving
+	 * at the new destination. So, time to cleanup the previous
+	 * vector allocation.
+	 */
+	if (cfg->move_in_progress) {
+		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
+		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
+		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+		cfg->move_in_progress = 0;
+	}
+
+	desc = irq_to_desc(irq);
+	desc->affinity = mask;
+}
+#endif
 #endif /* CONFIG_SMP */
 
 /*
@@ -2058,26 +3112,179 @@ static struct irq_chip msi_chip = {
 	.retrigger	= ioapic_retrigger_irq,
 };
 
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+#ifdef CONFIG_INTR_REMAP
+static struct irq_chip msi_ir_chip = {
+	.name		= "IR-PCI-MSI",
+	.unmask		= unmask_msi_irq,
+	.mask		= mask_msi_irq,
+	.ack		= ack_x2apic_edge,
+#ifdef CONFIG_SMP
+	.set_affinity	= ir_set_msi_irq_affinity,
+#endif
+	.retrigger	= ioapic_retrigger_irq,
+};
+
+/*
+ * Map the PCI dev to the corresponding remapping hardware unit
+ * and allocate 'nvec' consecutive interrupt-remapping table entries
+ * in it.
+ */
+static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
 {
+	struct intel_iommu *iommu;
+	int index;
+
+	iommu = map_dev_to_ir(dev);
+	if (!iommu) {
+		printk(KERN_ERR
+		       "Unable to map PCI %s to iommu\n", pci_name(dev));
+		return -ENOENT;
+	}
+
+	index = alloc_irte(iommu, irq, nvec);
+	if (index < 0) {
+		printk(KERN_ERR
+		       "Unable to allocate %d IRTE for PCI %s\n", nvec,
+		       pci_name(dev));
+		return -ENOSPC;
+	}
+	return index;
+}
+#endif
+
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
+{
+	int ret;
 	struct msi_msg msg;
-	int irq, ret;
-	irq = create_irq();
-	if (irq < 0)
-		return irq;
 
 	ret = msi_compose_msg(dev, irq, &msg);
+	if (ret < 0)
+		return ret;
+
+	set_irq_msi(irq, desc);
+	write_msi_msg(irq, &msg);
+
+#ifdef CONFIG_INTR_REMAP
+	if (irq_remapped(irq)) {
+		struct irq_desc *desc = irq_to_desc(irq);
+		/*
+		 * irq migration in process context
+		 */
+		desc->status |= IRQ_MOVE_PCNTXT;
+		set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
+	} else
+#endif
+		set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+
+	dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+
+	return 0;
+}
+
+static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
+{
+	unsigned int irq;
+
+	irq = dev->bus->number;
+	irq <<= 8;
+	irq |= dev->devfn;
+	irq <<= 12;
+
+	return irq;
+}
+
+int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+{
+	unsigned int irq;
+	int ret;
+	unsigned int irq_want;
+
+	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+
+	irq = create_irq_nr(irq_want);
+	if (irq == 0)
+		return -1;
+
+#ifdef CONFIG_INTR_REMAP
+	if (!intr_remapping_enabled)
+		goto no_ir;
+
+	ret = msi_alloc_irte(dev, irq, 1);
+	if (ret < 0)
+		goto error;
+no_ir:
+#endif
+	ret = setup_msi_irq(dev, desc, irq);
 	if (ret < 0) {
 		destroy_irq(irq);
 		return ret;
 	}
+	return 0;
 
-	set_irq_msi(irq, desc);
-	write_msi_msg(irq, &msg);
+#ifdef CONFIG_INTR_REMAP
+error:
+	destroy_irq(irq);
+	return ret;
+#endif
+}
 
-	set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	unsigned int irq;
+	int ret, sub_handle;
+	struct msi_desc *desc;
+	unsigned int irq_want;
 
+#ifdef CONFIG_INTR_REMAP
+	struct intel_iommu *iommu = 0;
+	int index = 0;
+#endif
+
+	irq_want = build_irq_for_pci_dev(dev) + 0x100;
+	sub_handle = 0;
+	list_for_each_entry(desc, &dev->msi_list, list) {
+		irq = create_irq_nr(irq_want--);
+		if (irq == 0)
+			return -1;
+#ifdef CONFIG_INTR_REMAP
+		if (!intr_remapping_enabled)
+			goto no_ir;
+
+		if (!sub_handle) {
+			/*
+			 * allocate the consecutive block of IRTE's
+			 * for 'nvec'
+			 */
+			index = msi_alloc_irte(dev, irq, nvec);
+			if (index < 0) {
+				ret = index;
+				goto error;
+			}
+		} else {
+			iommu = map_dev_to_ir(dev);
+			if (!iommu) {
+				ret = -ENOENT;
+				goto error;
+			}
+			/*
+			 * setup the mapping between the irq and the IRTE
+			 * base index, the sub_handle pointing to the
+			 * appropriate interrupt remap table entry.
+			 */
+			set_irte_irq(irq, iommu, index, sub_handle);
+		}
+no_ir:
+#endif
+		ret = setup_msi_irq(dev, desc, irq);
+		if (ret < 0)
+			goto error;
+		sub_handle++;
+	}
 	return 0;
+
+error:
+	destroy_irq(irq);
+	return ret;
 }
 
 void arch_teardown_msi_irq(unsigned int irq)
@@ -2089,10 +3296,11 @@ void arch_teardown_msi_irq(unsigned int irq)
 #ifdef CONFIG_SMP
 static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
 {
-	struct irq_cfg *cfg = irq_cfg + irq;
+	struct irq_cfg *cfg;
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
+	struct irq_desc *desc;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
@@ -2101,6 +3309,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
 	if (assign_irq_vector(irq, mask))
 		return;
 
+	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
@@ -2112,7 +3321,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	dmar_msi_write(irq, &msg);
-	irq_desc[irq].affinity = mask;
+	desc = irq_to_desc(irq);
+	desc->affinity = mask;
 }
 #endif /* CONFIG_SMP */
 
@@ -2142,6 +3352,69 @@ int arch_setup_dmar_msi(unsigned int irq)
 }
 #endif
 
+#ifdef CONFIG_HPET_TIMER
+
+#ifdef CONFIG_SMP
+static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
+{
+	struct irq_cfg *cfg;
+	struct irq_desc *desc;
+	struct msi_msg msg;
+	unsigned int dest;
+	cpumask_t tmp;
+
+	cpus_and(tmp, mask, cpu_online_map);
+	if (cpus_empty(tmp))
+		return;
+
+	if (assign_irq_vector(irq, mask))
+		return;
+
+	cfg = irq_cfg(irq);
+	cpus_and(tmp, cfg->domain, mask);
+	dest = cpu_mask_to_apicid(tmp);
+
+	hpet_msi_read(irq, &msg);
+
+	msg.data &= ~MSI_DATA_VECTOR_MASK;
+	msg.data |= MSI_DATA_VECTOR(cfg->vector);
+	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+	hpet_msi_write(irq, &msg);
+	desc = irq_to_desc(irq);
+	desc->affinity = mask;
+}
+#endif /* CONFIG_SMP */
+
+struct irq_chip hpet_msi_type = {
+	.name = "HPET_MSI",
+	.unmask = hpet_msi_unmask,
+	.mask = hpet_msi_mask,
+	.ack = ack_apic_edge,
+#ifdef CONFIG_SMP
+	.set_affinity = hpet_msi_set_affinity,
+#endif
+	.retrigger = ioapic_retrigger_irq,
+};
+
+int arch_setup_hpet_msi(unsigned int irq)
+{
+	int ret;
+	struct msi_msg msg;
+
+	ret = msi_compose_msg(NULL, irq, &msg);
+	if (ret < 0)
+		return ret;
+
+	hpet_msi_write(irq, &msg);
+	set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
+		"edge");
+
+	return 0;
+}
+#endif
+
 #endif /* CONFIG_PCI_MSI */
 /*
  * Hypertransport interrupt support
@@ -2166,9 +3439,10 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
 
 static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
 {
-	struct irq_cfg *cfg = irq_cfg + irq;
+	struct irq_cfg *cfg;
 	unsigned int dest;
 	cpumask_t tmp;
+	struct irq_desc *desc;
 
 	cpus_and(tmp, mask, cpu_online_map);
 	if (cpus_empty(tmp))
@@ -2177,11 +3451,13 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
 	if (assign_irq_vector(irq, mask))
 		return;
 
+	cfg = irq_cfg(irq);
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
 
 	target_ht_irq(irq, dest, cfg->vector);
-	irq_desc[irq].affinity = mask;
+	desc = irq_to_desc(irq);
+	desc->affinity = mask;
 }
 #endif
 
@@ -2198,7 +3474,7 @@ static struct irq_chip ht_irq_chip = {
 
 int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 {
-	struct irq_cfg *cfg = irq_cfg + irq;
+	struct irq_cfg *cfg;
 	int err;
 	cpumask_t tmp;
 
@@ -2208,6 +3484,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 		struct ht_irq_msg msg;
 		unsigned dest;
 
+		cfg = irq_cfg(irq);
 		cpus_and(tmp, cfg->domain, tmp);
 		dest = cpu_mask_to_apicid(tmp);
 
@@ -2230,20 +3507,196 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
 
 		set_irq_chip_and_handler_name(irq, &ht_irq_chip,
 					      handle_edge_irq, "edge");
+
+		dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
 	}
 	return err;
 }
 #endif /* CONFIG_HT_IRQ */
 
+#ifdef CONFIG_X86_64
+/*
+ * Re-target the irq to the specified CPU and enable the specified MMR located
+ * on the specified blade to allow the sending of MSIs to the specified CPU.
+ */
+int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
+		       unsigned long mmr_offset)
+{
+	const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
+	struct irq_cfg *cfg;
+	int mmr_pnode;
+	unsigned long mmr_value;
+	struct uv_IO_APIC_route_entry *entry;
+	unsigned long flags;
+	int err;
+
+	err = assign_irq_vector(irq, *eligible_cpu);
+	if (err != 0)
+		return err;
+
+	spin_lock_irqsave(&vector_lock, flags);
+	set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
+				      irq_name);
+	spin_unlock_irqrestore(&vector_lock, flags);
+
+	cfg = irq_cfg(irq);
+
+	mmr_value = 0;
+	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
+	entry->vector = cfg->vector;
+	entry->delivery_mode = INT_DELIVERY_MODE;
+	entry->dest_mode = INT_DEST_MODE;
+	entry->polarity = 0;
+	entry->trigger = 0;
+	entry->mask = 0;
+	entry->dest = cpu_mask_to_apicid(*eligible_cpu);
+
+	mmr_pnode = uv_blade_to_pnode(mmr_blade);
+	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+
+	return irq;
+}
+
+/*
+ * Disable the specified MMR located on the specified blade so that MSIs are
+ * longer allowed to be sent.
+ */
+void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
+{
+	unsigned long mmr_value;
+	struct uv_IO_APIC_route_entry *entry;
+	int mmr_pnode;
+
+	mmr_value = 0;
+	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
+	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
+	entry->mask = 1;
+
+	mmr_pnode = uv_blade_to_pnode(mmr_blade);
+	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
+}
+#endif /* CONFIG_X86_64 */
+
+int __init io_apic_get_redir_entries (int ioapic)
+{
+	union IO_APIC_reg_01	reg_01;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	reg_01.raw = io_apic_read(ioapic, 1);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return reg_01.bits.entries;
+}
+
+int __init probe_nr_irqs(void)
+{
+	int idx;
+	int nr = 0;
+#ifndef CONFIG_XEN
+	int nr_min = 32;
+#else
+	int nr_min = NR_IRQS;
+#endif
+
+	for (idx = 0; idx < nr_ioapics; idx++)
+		nr += io_apic_get_redir_entries(idx) + 1;
+
+	/* double it for hotplug and msi and nmi */
+	nr <<= 1;
+
+	/* something wrong ? */
+	if (nr < nr_min)
+		nr = nr_min;
+
+	return nr;
+}
+
 /* --------------------------------------------------------------------------
                           ACPI-based IOAPIC Configuration
    -------------------------------------------------------------------------- */
 
 #ifdef CONFIG_ACPI
 
-#define IO_APIC_MAX_ID		0xFE
+#ifdef CONFIG_X86_32
+int __init io_apic_get_unique_id(int ioapic, int apic_id)
+{
+	union IO_APIC_reg_00 reg_00;
+	static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
+	physid_mask_t tmp;
+	unsigned long flags;
+	int i = 0;
 
-int __init io_apic_get_redir_entries (int ioapic)
+	/*
+	 * The P4 platform supports up to 256 APIC IDs on two separate APIC
+	 * buses (one for LAPICs, one for IOAPICs), where predecessors only
+	 * supports up to 16 on one shared APIC bus.
+	 *
+	 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
+	 *      advantage of new APIC bus architecture.
+	 */
+
+	if (physids_empty(apic_id_map))
+		apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	reg_00.raw = io_apic_read(ioapic, 0);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	if (apic_id >= get_physical_broadcast()) {
+		printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
+			"%d\n", ioapic, apic_id, reg_00.bits.ID);
+		apic_id = reg_00.bits.ID;
+	}
+
+	/*
+	 * Every APIC in a system must have a unique ID or we get lots of nice
+	 * 'stuck on smp_invalidate_needed IPI wait' messages.
+	 */
+	if (check_apicid_used(apic_id_map, apic_id)) {
+
+		for (i = 0; i < get_physical_broadcast(); i++) {
+			if (!check_apicid_used(apic_id_map, i))
+				break;
+		}
+
+		if (i == get_physical_broadcast())
+			panic("Max apic_id exceeded!\n");
+
+		printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
+			"trying %d\n", ioapic, apic_id, i);
+
+		apic_id = i;
+	}
+
+	tmp = apicid_to_cpu_present(apic_id);
+	physids_or(apic_id_map, apic_id_map, tmp);
+
+	if (reg_00.bits.ID != apic_id) {
+		reg_00.bits.ID = apic_id;
+
+		spin_lock_irqsave(&ioapic_lock, flags);
+		io_apic_write(ioapic, 0, reg_00.raw);
+		reg_00.raw = io_apic_read(ioapic, 0);
+		spin_unlock_irqrestore(&ioapic_lock, flags);
+
+		/* Sanity check */
+		if (reg_00.bits.ID != apic_id) {
+			printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
+			return -1;
+		}
+	}
+
+	apic_printk(APIC_VERBOSE, KERN_INFO
+			"IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+
+	return apic_id;
+}
+
+int __init io_apic_get_version(int ioapic)
 {
 	union IO_APIC_reg_01	reg_01;
 	unsigned long flags;
@@ -2252,9 +3705,9 @@ int __init io_apic_get_redir_entries (int ioapic)
 	reg_01.raw = io_apic_read(ioapic, 1);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
-	return reg_01.bits.entries;
+	return reg_01.bits.version;
 }
-
+#endif
 
 int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
 {
@@ -2306,6 +3759,7 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 void __init setup_ioapic_dest(void)
 {
 	int pin, ioapic, irq, irq_entry;
+	struct irq_cfg *cfg;
 
 	if (skip_ioapic_setup == 1)
 		return;
@@ -2321,10 +3775,15 @@ void __init setup_ioapic_dest(void)
 			 * when you have too many devices, because at that time only boot
 			 * cpu is online.
 			 */
-			if (!irq_cfg[irq].vector)
+			cfg = irq_cfg(irq);
+			if (!cfg->vector)
 				setup_IO_APIC_irq(ioapic, pin, irq,
 						  irq_trigger(irq_entry),
 						  irq_polarity(irq_entry));
+#ifdef CONFIG_INTR_REMAP
+			else if (intr_remapping_enabled)
+				set_ir_ioapic_affinity_irq(irq, TARGET_CPUS);
+#endif
 			else
 				set_ioapic_affinity_irq(irq, TARGET_CPUS);
 		}
@@ -2375,18 +3834,33 @@ void __init ioapic_init_mappings(void)
 	struct resource *ioapic_res;
 	int i;
 
+	irq_2_pin_init();
 	ioapic_res = ioapic_setup_resources();
 	for (i = 0; i < nr_ioapics; i++) {
 		if (smp_found_config) {
 			ioapic_phys = mp_ioapics[i].mp_apicaddr;
+#ifdef CONFIG_X86_32
+			if (!ioapic_phys) {
+				printk(KERN_ERR
+				       "WARNING: bogus zero IO-APIC "
+				       "address found in MPTABLE, "
+				       "disabling IO/APIC support!\n");
+				smp_found_config = 0;
+				skip_ioapic_setup = 1;
+				goto fake_ioapic_page;
+			}
+#endif
 		} else {
+#ifdef CONFIG_X86_32
+fake_ioapic_page:
+#endif
 			ioapic_phys = (unsigned long)
 				alloc_bootmem_pages(PAGE_SIZE);
 			ioapic_phys = __pa(ioapic_phys);
 		}
 		set_fixmap_nocache(idx, ioapic_phys);
 		apic_printk(APIC_VERBOSE,
-			    "mapped IOAPIC to %016lx (%016lx)\n",
+			    "mapped IOAPIC to %08lx (%08lx)\n",
 			    __fix_to_virt(idx), ioapic_phys);
 		idx++;
 
@@ -2420,4 +3894,3 @@ static int __init ioapic_insert_resources(void)
 /* Insert the IO APIC resources after PCI initialization has occured to handle
  * IO APICS that are mapped in on a BAR in PCI space. */
 late_initcall(ioapic_insert_resources);
-
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
deleted file mode 100644
index 558abf4c796a..000000000000
--- a/arch/x86/kernel/io_apic_32.c
+++ /dev/null
@@ -1,2900 +0,0 @@
-/*
- *	Intel IO-APIC support for multi-Pentium hosts.
- *
- *	Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
- *
- *	Many thanks to Stig Venaas for trying out countless experimental
- *	patches and reporting/debugging problems patiently!
- *
- *	(c) 1999, Multiple IO-APIC support, developed by
- *	Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
- *      Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
- *	further tested and cleaned up by Zach Brown <zab@redhat.com>
- *	and Ingo Molnar <mingo@redhat.com>
- *
- *	Fixes
- *	Maciej W. Rozycki	:	Bits for genuine 82489DX APICs;
- *					thanks to Eric Gilmore
- *					and Rolf G. Tews
- *					for testing these extensively
- *	Paul Diefenbaugh	:	Added full ACPI support
- */
-
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/sched.h>
-#include <linux/bootmem.h>
-#include <linux/mc146818rtc.h>
-#include <linux/compiler.h>
-#include <linux/acpi.h>
-#include <linux/module.h>
-#include <linux/sysdev.h>
-#include <linux/pci.h>
-#include <linux/msi.h>
-#include <linux/htirq.h>
-#include <linux/freezer.h>
-#include <linux/kthread.h>
-#include <linux/jiffies.h>	/* time_after() */
-
-#include <asm/io.h>
-#include <asm/smp.h>
-#include <asm/desc.h>
-#include <asm/timer.h>
-#include <asm/i8259.h>
-#include <asm/nmi.h>
-#include <asm/msidef.h>
-#include <asm/hypertransport.h>
-
-#include <mach_apic.h>
-#include <mach_apicdef.h>
-
-int (*ioapic_renumber_irq)(int ioapic, int irq);
-atomic_t irq_mis_count;
-
-/* Where if anywhere is the i8259 connect in external int mode */
-static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
-
-static DEFINE_SPINLOCK(ioapic_lock);
-static DEFINE_SPINLOCK(vector_lock);
-
-int timer_through_8259 __initdata;
-
-/*
- *	Is the SiS APIC rmw bug present ?
- *	-1 = don't know, 0 = no, 1 = yes
- */
-int sis_apic_bug = -1;
-
-/*
- * # of IRQ routing registers
- */
-int nr_ioapic_registers[MAX_IO_APICS];
-
-/* I/O APIC entries */
-struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
-int nr_ioapics;
-
-/* MP IRQ source entries */
-struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
-
-/* # of MP IRQ source entries */
-int mp_irq_entries;
-
-#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
-int mp_bus_id_to_type[MAX_MP_BUSSES];
-#endif
-
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-
-static int disable_timer_pin_1 __initdata;
-
-/*
- * Rough estimation of how many shared IRQs there are, can
- * be changed anytime.
- */
-#define MAX_PLUS_SHARED_IRQS NR_IRQS
-#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
-
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * the indexing order of this array favors 1:1 mappings
- * between pins and IRQs.
- */
-
-static struct irq_pin_list {
-	int apic, pin, next;
-} irq_2_pin[PIN_MAP_SIZE];
-
-struct io_apic {
-	unsigned int index;
-	unsigned int unused[3];
-	unsigned int data;
-};
-
-static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
-{
-	return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
-		+ (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
-}
-
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
-{
-	struct io_apic __iomem *io_apic = io_apic_base(apic);
-	writel(reg, &io_apic->index);
-	return readl(&io_apic->data);
-}
-
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	struct io_apic __iomem *io_apic = io_apic_base(apic);
-	writel(reg, &io_apic->index);
-	writel(value, &io_apic->data);
-}
-
-/*
- * Re-write a value: to be used for read-modify-write
- * cycles where the read already set up the index register.
- *
- * Older SiS APIC requires we rewrite the index register
- */
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
-	if (sis_apic_bug)
-		writel(reg, &io_apic->index);
-	writel(value, &io_apic->data);
-}
-
-union entry_union {
-	struct { u32 w1, w2; };
-	struct IO_APIC_route_entry entry;
-};
-
-static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
-{
-	union entry_union eu;
-	unsigned long flags;
-	spin_lock_irqsave(&ioapic_lock, flags);
-	eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
-	eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-	return eu.entry;
-}
-
-/*
- * When we write a new IO APIC routing entry, we need to write the high
- * word first! If the mask bit in the low word is clear, we will enable
- * the interrupt, and we need to make sure the entry is fully populated
- * before that happens.
- */
-static void
-__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
-{
-	union entry_union eu;
-	eu.entry = e;
-	io_apic_write(apic, 0x11 + 2*pin, eu.w2);
-	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
-}
-
-static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
-{
-	unsigned long flags;
-	spin_lock_irqsave(&ioapic_lock, flags);
-	__ioapic_write_entry(apic, pin, e);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-/*
- * When we mask an IO APIC routing entry, we need to write the low
- * word first, in order to set the mask bit before we change the
- * high bits!
- */
-static void ioapic_mask_entry(int apic, int pin)
-{
-	unsigned long flags;
-	union entry_union eu = { .entry.mask = 1 };
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(apic, 0x10 + 2*pin, eu.w1);
-	io_apic_write(apic, 0x11 + 2*pin, eu.w2);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-/*
- * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
- * shared ISA-space IRQs, so we have to support them. We are super
- * fast in the common case, and fast for shared ISA-space IRQs.
- */
-static void add_pin_to_irq(unsigned int irq, int apic, int pin)
-{
-	static int first_free_entry = NR_IRQS;
-	struct irq_pin_list *entry = irq_2_pin + irq;
-
-	while (entry->next)
-		entry = irq_2_pin + entry->next;
-
-	if (entry->pin != -1) {
-		entry->next = first_free_entry;
-		entry = irq_2_pin + entry->next;
-		if (++first_free_entry >= PIN_MAP_SIZE)
-			panic("io_apic.c: whoops");
-	}
-	entry->apic = apic;
-	entry->pin = pin;
-}
-
-/*
- * Reroute an IRQ to a different pin.
- */
-static void __init replace_pin_at_irq(unsigned int irq,
-				      int oldapic, int oldpin,
-				      int newapic, int newpin)
-{
-	struct irq_pin_list *entry = irq_2_pin + irq;
-
-	while (1) {
-		if (entry->apic == oldapic && entry->pin == oldpin) {
-			entry->apic = newapic;
-			entry->pin = newpin;
-		}
-		if (!entry->next)
-			break;
-		entry = irq_2_pin + entry->next;
-	}
-}
-
-static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
-{
-	struct irq_pin_list *entry = irq_2_pin + irq;
-	unsigned int pin, reg;
-
-	for (;;) {
-		pin = entry->pin;
-		if (pin == -1)
-			break;
-		reg = io_apic_read(entry->apic, 0x10 + pin*2);
-		reg &= ~disable;
-		reg |= enable;
-		io_apic_modify(entry->apic, 0x10 + pin*2, reg);
-		if (!entry->next)
-			break;
-		entry = irq_2_pin + entry->next;
-	}
-}
-
-/* mask = 1 */
-static void __mask_IO_APIC_irq(unsigned int irq)
-{
-	__modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
-}
-
-/* mask = 0 */
-static void __unmask_IO_APIC_irq(unsigned int irq)
-{
-	__modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
-}
-
-/* mask = 1, trigger = 0 */
-static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
-{
-	__modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
-				IO_APIC_REDIR_LEVEL_TRIGGER);
-}
-
-/* mask = 0, trigger = 1 */
-static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
-{
-	__modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
-				IO_APIC_REDIR_MASKED);
-}
-
-static void mask_IO_APIC_irq(unsigned int irq)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	__mask_IO_APIC_irq(irq);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void unmask_IO_APIC_irq(unsigned int irq)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	__unmask_IO_APIC_irq(irq);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
-{
-	struct IO_APIC_route_entry entry;
-
-	/* Check delivery_mode to be sure we're not clearing an SMI pin */
-	entry = ioapic_read_entry(apic, pin);
-	if (entry.delivery_mode == dest_SMI)
-		return;
-
-	/*
-	 * Disable it in the IO-APIC irq-routing table:
-	 */
-	ioapic_mask_entry(apic, pin);
-}
-
-static void clear_IO_APIC(void)
-{
-	int apic, pin;
-
-	for (apic = 0; apic < nr_ioapics; apic++)
-		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
-			clear_IO_APIC_pin(apic, pin);
-}
-
-#ifdef CONFIG_SMP
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
-{
-	unsigned long flags;
-	int pin;
-	struct irq_pin_list *entry = irq_2_pin + irq;
-	unsigned int apicid_value;
-	cpumask_t tmp;
-
-	cpus_and(tmp, cpumask, cpu_online_map);
-	if (cpus_empty(tmp))
-		tmp = TARGET_CPUS;
-
-	cpus_and(cpumask, tmp, CPU_MASK_ALL);
-
-	apicid_value = cpu_mask_to_apicid(cpumask);
-	/* Prepare to do the io_apic_write */
-	apicid_value = apicid_value << 24;
-	spin_lock_irqsave(&ioapic_lock, flags);
-	for (;;) {
-		pin = entry->pin;
-		if (pin == -1)
-			break;
-		io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
-		if (!entry->next)
-			break;
-		entry = irq_2_pin + entry->next;
-	}
-	irq_desc[irq].affinity = cpumask;
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-#if defined(CONFIG_IRQBALANCE)
-# include <asm/processor.h>	/* kernel_thread() */
-# include <linux/kernel_stat.h>	/* kstat */
-# include <linux/slab.h>		/* kmalloc() */
-# include <linux/timer.h>
-
-#define IRQBALANCE_CHECK_ARCH -999
-#define MAX_BALANCED_IRQ_INTERVAL	(5*HZ)
-#define MIN_BALANCED_IRQ_INTERVAL	(HZ/2)
-#define BALANCED_IRQ_MORE_DELTA		(HZ/10)
-#define BALANCED_IRQ_LESS_DELTA		(HZ)
-
-static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
-static int physical_balance __read_mostly;
-static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
-
-static struct irq_cpu_info {
-	unsigned long *last_irq;
-	unsigned long *irq_delta;
-	unsigned long irq;
-} irq_cpu_data[NR_CPUS];
-
-#define CPU_IRQ(cpu)		(irq_cpu_data[cpu].irq)
-#define LAST_CPU_IRQ(cpu, irq)   (irq_cpu_data[cpu].last_irq[irq])
-#define IRQ_DELTA(cpu, irq) 	(irq_cpu_data[cpu].irq_delta[irq])
-
-#define IDLE_ENOUGH(cpu,now) \
-	(idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
-
-#define IRQ_ALLOWED(cpu, allowed_mask)	cpu_isset(cpu, allowed_mask)
-
-#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i)))
-
-static cpumask_t balance_irq_affinity[NR_IRQS] = {
-	[0 ... NR_IRQS-1] = CPU_MASK_ALL
-};
-
-void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
-{
-	balance_irq_affinity[irq] = mask;
-}
-
-static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
-			unsigned long now, int direction)
-{
-	int search_idle = 1;
-	int cpu = curr_cpu;
-
-	goto inside;
-
-	do {
-		if (unlikely(cpu == curr_cpu))
-			search_idle = 0;
-inside:
-		if (direction == 1) {
-			cpu++;
-			if (cpu >= NR_CPUS)
-				cpu = 0;
-		} else {
-			cpu--;
-			if (cpu == -1)
-				cpu = NR_CPUS-1;
-		}
-	} while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
-			(search_idle && !IDLE_ENOUGH(cpu, now)));
-
-	return cpu;
-}
-
-static inline void balance_irq(int cpu, int irq)
-{
-	unsigned long now = jiffies;
-	cpumask_t allowed_mask;
-	unsigned int new_cpu;
-
-	if (irqbalance_disabled)
-		return;
-
-	cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
-	new_cpu = move(cpu, allowed_mask, now, 1);
-	if (cpu != new_cpu)
-		set_pending_irq(irq, cpumask_of_cpu(new_cpu));
-}
-
-static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
-{
-	int i, j;
-
-	for_each_online_cpu(i) {
-		for (j = 0; j < NR_IRQS; j++) {
-			if (!irq_desc[j].action)
-				continue;
-			/* Is it a significant load ?  */
-			if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
-						useful_load_threshold)
-				continue;
-			balance_irq(i, j);
-		}
-	}
-	balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
-		balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
-	return;
-}
-
-static void do_irq_balance(void)
-{
-	int i, j;
-	unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
-	unsigned long move_this_load = 0;
-	int max_loaded = 0, min_loaded = 0;
-	int load;
-	unsigned long useful_load_threshold = balanced_irq_interval + 10;
-	int selected_irq;
-	int tmp_loaded, first_attempt = 1;
-	unsigned long tmp_cpu_irq;
-	unsigned long imbalance = 0;
-	cpumask_t allowed_mask, target_cpu_mask, tmp;
-
-	for_each_possible_cpu(i) {
-		int package_index;
-		CPU_IRQ(i) = 0;
-		if (!cpu_online(i))
-			continue;
-		package_index = CPU_TO_PACKAGEINDEX(i);
-		for (j = 0; j < NR_IRQS; j++) {
-			unsigned long value_now, delta;
-			/* Is this an active IRQ or balancing disabled ? */
-			if (!irq_desc[j].action || irq_balancing_disabled(j))
-				continue;
-			if (package_index == i)
-				IRQ_DELTA(package_index, j) = 0;
-			/* Determine the total count per processor per IRQ */
-			value_now = (unsigned long) kstat_cpu(i).irqs[j];
-
-			/* Determine the activity per processor per IRQ */
-			delta = value_now - LAST_CPU_IRQ(i, j);
-
-			/* Update last_cpu_irq[][] for the next time */
-			LAST_CPU_IRQ(i, j) = value_now;
-
-			/* Ignore IRQs whose rate is less than the clock */
-			if (delta < useful_load_threshold)
-				continue;
-			/* update the load for the processor or package total */
-			IRQ_DELTA(package_index, j) += delta;
-
-			/* Keep track of the higher numbered sibling as well */
-			if (i != package_index)
-				CPU_IRQ(i) += delta;
-			/*
-			 * We have sibling A and sibling B in the package
-			 *
-			 * cpu_irq[A] = load for cpu A + load for cpu B
-			 * cpu_irq[B] = load for cpu B
-			 */
-			CPU_IRQ(package_index) += delta;
-		}
-	}
-	/* Find the least loaded processor package */
-	for_each_online_cpu(i) {
-		if (i != CPU_TO_PACKAGEINDEX(i))
-			continue;
-		if (min_cpu_irq > CPU_IRQ(i)) {
-			min_cpu_irq = CPU_IRQ(i);
-			min_loaded = i;
-		}
-	}
-	max_cpu_irq = ULONG_MAX;
-
-tryanothercpu:
-	/*
-	 * Look for heaviest loaded processor.
-	 * We may come back to get the next heaviest loaded processor.
-	 * Skip processors with trivial loads.
-	 */
-	tmp_cpu_irq = 0;
-	tmp_loaded = -1;
-	for_each_online_cpu(i) {
-		if (i != CPU_TO_PACKAGEINDEX(i))
-			continue;
-		if (max_cpu_irq <= CPU_IRQ(i))
-			continue;
-		if (tmp_cpu_irq < CPU_IRQ(i)) {
-			tmp_cpu_irq = CPU_IRQ(i);
-			tmp_loaded = i;
-		}
-	}
-
-	if (tmp_loaded == -1) {
-	 /*
-	  * In the case of small number of heavy interrupt sources,
-	  * loading some of the cpus too much. We use Ingo's original
-	  * approach to rotate them around.
-	  */
-		if (!first_attempt && imbalance >= useful_load_threshold) {
-			rotate_irqs_among_cpus(useful_load_threshold);
-			return;
-		}
-		goto not_worth_the_effort;
-	}
-
-	first_attempt = 0;		/* heaviest search */
-	max_cpu_irq = tmp_cpu_irq;	/* load */
-	max_loaded = tmp_loaded;	/* processor */
-	imbalance = (max_cpu_irq - min_cpu_irq) / 2;
-
-	/*
-	 * if imbalance is less than approx 10% of max load, then
-	 * observe diminishing returns action. - quit
-	 */
-	if (imbalance < (max_cpu_irq >> 3))
-		goto not_worth_the_effort;
-
-tryanotherirq:
-	/* if we select an IRQ to move that can't go where we want, then
-	 * see if there is another one to try.
-	 */
-	move_this_load = 0;
-	selected_irq = -1;
-	for (j = 0; j < NR_IRQS; j++) {
-		/* Is this an active IRQ? */
-		if (!irq_desc[j].action)
-			continue;
-		if (imbalance <= IRQ_DELTA(max_loaded, j))
-			continue;
-		/* Try to find the IRQ that is closest to the imbalance
-		 * without going over.
-		 */
-		if (move_this_load < IRQ_DELTA(max_loaded, j)) {
-			move_this_load = IRQ_DELTA(max_loaded, j);
-			selected_irq = j;
-		}
-	}
-	if (selected_irq == -1)
-		goto tryanothercpu;
-
-	imbalance = move_this_load;
-
-	/* For physical_balance case, we accumulated both load
-	 * values in the one of the siblings cpu_irq[],
-	 * to use the same code for physical and logical processors
-	 * as much as possible.
-	 *
-	 * NOTE: the cpu_irq[] array holds the sum of the load for
-	 * sibling A and sibling B in the slot for the lowest numbered
-	 * sibling (A), _AND_ the load for sibling B in the slot for
-	 * the higher numbered sibling.
-	 *
-	 * We seek the least loaded sibling by making the comparison
-	 * (A+B)/2 vs B
-	 */
-	load = CPU_IRQ(min_loaded) >> 1;
-	for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) {
-		if (load > CPU_IRQ(j)) {
-			/* This won't change cpu_sibling_map[min_loaded] */
-			load = CPU_IRQ(j);
-			min_loaded = j;
-		}
-	}
-
-	cpus_and(allowed_mask,
-		cpu_online_map,
-		balance_irq_affinity[selected_irq]);
-	target_cpu_mask = cpumask_of_cpu(min_loaded);
-	cpus_and(tmp, target_cpu_mask, allowed_mask);
-
-	if (!cpus_empty(tmp)) {
-		/* mark for change destination */
-		set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
-
-		/* Since we made a change, come back sooner to
-		 * check for more variation.
-		 */
-		balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
-			balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
-		return;
-	}
-	goto tryanotherirq;
-
-not_worth_the_effort:
-	/*
-	 * if we did not find an IRQ to move, then adjust the time interval
-	 * upward
-	 */
-	balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
-		balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
-	return;
-}
-
-static int balanced_irq(void *unused)
-{
-	int i;
-	unsigned long prev_balance_time = jiffies;
-	long time_remaining = balanced_irq_interval;
-
-	/* push everything to CPU 0 to give us a starting point.  */
-	for (i = 0 ; i < NR_IRQS ; i++) {
-		irq_desc[i].pending_mask = cpumask_of_cpu(0);
-		set_pending_irq(i, cpumask_of_cpu(0));
-	}
-
-	set_freezable();
-	for ( ; ; ) {
-		time_remaining = schedule_timeout_interruptible(time_remaining);
-		try_to_freeze();
-		if (time_after(jiffies,
-				prev_balance_time+balanced_irq_interval)) {
-			preempt_disable();
-			do_irq_balance();
-			prev_balance_time = jiffies;
-			time_remaining = balanced_irq_interval;
-			preempt_enable();
-		}
-	}
-	return 0;
-}
-
-static int __init balanced_irq_init(void)
-{
-	int i;
-	struct cpuinfo_x86 *c;
-	cpumask_t tmp;
-
-	cpus_shift_right(tmp, cpu_online_map, 2);
-	c = &boot_cpu_data;
-	/* When not overwritten by the command line ask subarchitecture. */
-	if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
-		irqbalance_disabled = NO_BALANCE_IRQ;
-	if (irqbalance_disabled)
-		return 0;
-
-	 /* disable irqbalance completely if there is only one processor online */
-	if (num_online_cpus() < 2) {
-		irqbalance_disabled = 1;
-		return 0;
-	}
-	/*
-	 * Enable physical balance only if more than 1 physical processor
-	 * is present
-	 */
-	if (smp_num_siblings > 1 && !cpus_empty(tmp))
-		physical_balance = 1;
-
-	for_each_online_cpu(i) {
-		irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
-		irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
-		if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
-			printk(KERN_ERR "balanced_irq_init: out of memory");
-			goto failed;
-		}
-	}
-
-	printk(KERN_INFO "Starting balanced_irq\n");
-	if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
-		return 0;
-	printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
-failed:
-	for_each_possible_cpu(i) {
-		kfree(irq_cpu_data[i].irq_delta);
-		irq_cpu_data[i].irq_delta = NULL;
-		kfree(irq_cpu_data[i].last_irq);
-		irq_cpu_data[i].last_irq = NULL;
-	}
-	return 0;
-}
-
-int __devinit irqbalance_disable(char *str)
-{
-	irqbalance_disabled = 1;
-	return 1;
-}
-
-__setup("noirqbalance", irqbalance_disable);
-
-late_initcall(balanced_irq_init);
-#endif /* CONFIG_IRQBALANCE */
-#endif /* CONFIG_SMP */
-
-#ifndef CONFIG_SMP
-void send_IPI_self(int vector)
-{
-	unsigned int cfg;
-
-	/*
-	 * Wait for idle.
-	 */
-	apic_wait_icr_idle();
-	cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
-	/*
-	 * Send the IPI. The write to APIC_ICR fires this off.
-	 */
-	apic_write_around(APIC_ICR, cfg);
-}
-#endif /* !CONFIG_SMP */
-
-
-/*
- * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
- * specific CPU-side IRQs.
- */
-
-#define MAX_PIRQS 8
-static int pirq_entries [MAX_PIRQS];
-static int pirqs_enabled;
-int skip_ioapic_setup;
-
-static int __init ioapic_pirq_setup(char *str)
-{
-	int i, max;
-	int ints[MAX_PIRQS+1];
-
-	get_options(str, ARRAY_SIZE(ints), ints);
-
-	for (i = 0; i < MAX_PIRQS; i++)
-		pirq_entries[i] = -1;
-
-	pirqs_enabled = 1;
-	apic_printk(APIC_VERBOSE, KERN_INFO
-			"PIRQ redirection, working around broken MP-BIOS.\n");
-	max = MAX_PIRQS;
-	if (ints[0] < MAX_PIRQS)
-		max = ints[0];
-
-	for (i = 0; i < max; i++) {
-		apic_printk(APIC_VERBOSE, KERN_DEBUG
-				"... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
-		/*
-		 * PIRQs are mapped upside down, usually.
-		 */
-		pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
-	}
-	return 1;
-}
-
-__setup("pirq=", ioapic_pirq_setup);
-
-/*
- * Find the IRQ entry number of a certain pin.
- */
-static int find_irq_entry(int apic, int pin, int type)
-{
-	int i;
-
-	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].mp_irqtype == type &&
-		    (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
-		     mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
-		    mp_irqs[i].mp_dstirq == pin)
-			return i;
-
-	return -1;
-}
-
-/*
- * Find the pin to which IRQ[irq] (ISA) is connected
- */
-static int __init find_isa_irq_pin(int irq, int type)
-{
-	int i;
-
-	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mp_srcbus;
-
-		if (test_bit(lbus, mp_bus_not_pci) &&
-		    (mp_irqs[i].mp_irqtype == type) &&
-		    (mp_irqs[i].mp_srcbusirq == irq))
-
-			return mp_irqs[i].mp_dstirq;
-	}
-	return -1;
-}
-
-static int __init find_isa_irq_apic(int irq, int type)
-{
-	int i;
-
-	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mp_srcbus;
-
-		if (test_bit(lbus, mp_bus_not_pci) &&
-		    (mp_irqs[i].mp_irqtype == type) &&
-		    (mp_irqs[i].mp_srcbusirq == irq))
-			break;
-	}
-	if (i < mp_irq_entries) {
-		int apic;
-		for (apic = 0; apic < nr_ioapics; apic++) {
-			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
-				return apic;
-		}
-	}
-
-	return -1;
-}
-
-/*
- * Find a specific PCI IRQ entry.
- * Not an __init, possibly needed by modules
- */
-static int pin_2_irq(int idx, int apic, int pin);
-
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
-{
-	int apic, i, best_guess = -1;
-
-	apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
-		"slot:%d, pin:%d.\n", bus, slot, pin);
-	if (test_bit(bus, mp_bus_not_pci)) {
-		printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
-		return -1;
-	}
-	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].mp_srcbus;
-
-		for (apic = 0; apic < nr_ioapics; apic++)
-			if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
-			    mp_irqs[i].mp_dstapic == MP_APIC_ALL)
-				break;
-
-		if (!test_bit(lbus, mp_bus_not_pci) &&
-		    !mp_irqs[i].mp_irqtype &&
-		    (bus == lbus) &&
-		    (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
-			int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
-
-			if (!(apic || IO_APIC_IRQ(irq)))
-				continue;
-
-			if (pin == (mp_irqs[i].mp_srcbusirq & 3))
-				return irq;
-			/*
-			 * Use the first all-but-pin matching entry as a
-			 * best-guess fuzzy result for broken mptables.
-			 */
-			if (best_guess < 0)
-				best_guess = irq;
-		}
-	}
-	return best_guess;
-}
-EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-
-/*
- * This function currently is only a helper for the i386 smp boot process where
- * we need to reprogram the ioredtbls to cater for the cpus which have come online
- * so mask in all cases should simply be TARGET_CPUS
- */
-#ifdef CONFIG_SMP
-void __init setup_ioapic_dest(void)
-{
-	int pin, ioapic, irq, irq_entry;
-
-	if (skip_ioapic_setup == 1)
-		return;
-
-	for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
-		for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
-			irq_entry = find_irq_entry(ioapic, pin, mp_INT);
-			if (irq_entry == -1)
-				continue;
-			irq = pin_2_irq(irq_entry, ioapic, pin);
-			set_ioapic_affinity_irq(irq, TARGET_CPUS);
-		}
-
-	}
-}
-#endif
-
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
-/*
- * EISA Edge/Level control register, ELCR
- */
-static int EISA_ELCR(unsigned int irq)
-{
-	if (irq < 16) {
-		unsigned int port = 0x4d0 + (irq >> 3);
-		return (inb(port) >> (irq & 7)) & 1;
-	}
-	apic_printk(APIC_VERBOSE, KERN_INFO
-			"Broken MPtable reports ISA irq %d\n", irq);
-	return 0;
-}
-#endif
-
-/* ISA interrupts are always polarity zero edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_ISA_trigger(idx)	(0)
-#define default_ISA_polarity(idx)	(0)
-
-/* EISA interrupts are always polarity zero and can be edge or level
- * trigger depending on the ELCR value.  If an interrupt is listed as
- * EISA conforming in the MP table, that means its trigger type must
- * be read in from the ELCR */
-
-#define default_EISA_trigger(idx)	(EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
-#define default_EISA_polarity(idx)	default_ISA_polarity(idx)
-
-/* PCI interrupts are always polarity one level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_PCI_trigger(idx)	(1)
-#define default_PCI_polarity(idx)	(1)
-
-/* MCA interrupts are always polarity zero level triggered,
- * when listed as conforming in the MP table. */
-
-#define default_MCA_trigger(idx)	(1)
-#define default_MCA_polarity(idx)	default_ISA_polarity(idx)
-
-static int MPBIOS_polarity(int idx)
-{
-	int bus = mp_irqs[idx].mp_srcbus;
-	int polarity;
-
-	/*
-	 * Determine IRQ line polarity (high active or low active):
-	 */
-	switch (mp_irqs[idx].mp_irqflag & 3) {
-	case 0: /* conforms, ie. bus-type dependent polarity */
-	{
-		polarity = test_bit(bus, mp_bus_not_pci)?
-			default_ISA_polarity(idx):
-			default_PCI_polarity(idx);
-		break;
-	}
-	case 1: /* high active */
-	{
-		polarity = 0;
-		break;
-	}
-	case 2: /* reserved */
-	{
-		printk(KERN_WARNING "broken BIOS!!\n");
-		polarity = 1;
-		break;
-	}
-	case 3: /* low active */
-	{
-		polarity = 1;
-		break;
-	}
-	default: /* invalid */
-	{
-		printk(KERN_WARNING "broken BIOS!!\n");
-		polarity = 1;
-		break;
-	}
-	}
-	return polarity;
-}
-
-static int MPBIOS_trigger(int idx)
-{
-	int bus = mp_irqs[idx].mp_srcbus;
-	int trigger;
-
-	/*
-	 * Determine IRQ trigger mode (edge or level sensitive):
-	 */
-	switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
-	case 0: /* conforms, ie. bus-type dependent */
-	{
-		trigger = test_bit(bus, mp_bus_not_pci)?
-				default_ISA_trigger(idx):
-				default_PCI_trigger(idx);
-#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
-		switch (mp_bus_id_to_type[bus]) {
-		case MP_BUS_ISA: /* ISA pin */
-		{
-			/* set before the switch */
-			break;
-		}
-		case MP_BUS_EISA: /* EISA pin */
-		{
-			trigger = default_EISA_trigger(idx);
-			break;
-		}
-		case MP_BUS_PCI: /* PCI pin */
-		{
-			/* set before the switch */
-			break;
-		}
-		case MP_BUS_MCA: /* MCA pin */
-		{
-			trigger = default_MCA_trigger(idx);
-			break;
-		}
-		default:
-		{
-			printk(KERN_WARNING "broken BIOS!!\n");
-			trigger = 1;
-			break;
-		}
-	}
-#endif
-		break;
-	}
-	case 1: /* edge */
-	{
-		trigger = 0;
-		break;
-	}
-	case 2: /* reserved */
-	{
-		printk(KERN_WARNING "broken BIOS!!\n");
-		trigger = 1;
-		break;
-	}
-	case 3: /* level */
-	{
-		trigger = 1;
-		break;
-	}
-	default: /* invalid */
-	{
-		printk(KERN_WARNING "broken BIOS!!\n");
-		trigger = 0;
-		break;
-	}
-	}
-	return trigger;
-}
-
-static inline int irq_polarity(int idx)
-{
-	return MPBIOS_polarity(idx);
-}
-
-static inline int irq_trigger(int idx)
-{
-	return MPBIOS_trigger(idx);
-}
-
-static int pin_2_irq(int idx, int apic, int pin)
-{
-	int irq, i;
-	int bus = mp_irqs[idx].mp_srcbus;
-
-	/*
-	 * Debugging check, we are in big trouble if this message pops up!
-	 */
-	if (mp_irqs[idx].mp_dstirq != pin)
-		printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
-
-	if (test_bit(bus, mp_bus_not_pci))
-		irq = mp_irqs[idx].mp_srcbusirq;
-	else {
-		/*
-		 * PCI IRQs are mapped in order
-		 */
-		i = irq = 0;
-		while (i < apic)
-			irq += nr_ioapic_registers[i++];
-		irq += pin;
-
-		/*
-		 * For MPS mode, so far only needed by ES7000 platform
-		 */
-		if (ioapic_renumber_irq)
-			irq = ioapic_renumber_irq(apic, irq);
-	}
-
-	/*
-	 * PCI IRQ command line redirection. Yes, limits are hardcoded.
-	 */
-	if ((pin >= 16) && (pin <= 23)) {
-		if (pirq_entries[pin-16] != -1) {
-			if (!pirq_entries[pin-16]) {
-				apic_printk(APIC_VERBOSE, KERN_DEBUG
-						"disabling PIRQ%d\n", pin-16);
-			} else {
-				irq = pirq_entries[pin-16];
-				apic_printk(APIC_VERBOSE, KERN_DEBUG
-						"using PIRQ%d -> IRQ %d\n",
-						pin-16, irq);
-			}
-		}
-	}
-	return irq;
-}
-
-static inline int IO_APIC_irq_trigger(int irq)
-{
-	int apic, idx, pin;
-
-	for (apic = 0; apic < nr_ioapics; apic++) {
-		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-			idx = find_irq_entry(apic, pin, mp_INT);
-			if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
-				return irq_trigger(idx);
-		}
-	}
-	/*
-	 * nonexistent IRQs are edge default
-	 */
-	return 0;
-}
-
-/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
-
-static int __assign_irq_vector(int irq)
-{
-	static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
-	int vector, offset;
-
-	BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
-
-	if (irq_vector[irq] > 0)
-		return irq_vector[irq];
-
-	vector = current_vector;
-	offset = current_offset;
-next:
-	vector += 8;
-	if (vector >= first_system_vector) {
-		offset = (offset + 1) % 8;
-		vector = FIRST_DEVICE_VECTOR + offset;
-	}
-	if (vector == current_vector)
-		return -ENOSPC;
-	if (test_and_set_bit(vector, used_vectors))
-		goto next;
-
-	current_vector = vector;
-	current_offset = offset;
-	irq_vector[irq] = vector;
-
-	return vector;
-}
-
-static int assign_irq_vector(int irq)
-{
-	unsigned long flags;
-	int vector;
-
-	spin_lock_irqsave(&vector_lock, flags);
-	vector = __assign_irq_vector(irq);
-	spin_unlock_irqrestore(&vector_lock, flags);
-
-	return vector;
-}
-
-void setup_vector_irq(int cpu)
-{
-}
-
-static struct irq_chip ioapic_chip;
-
-#define IOAPIC_AUTO	-1
-#define IOAPIC_EDGE	0
-#define IOAPIC_LEVEL	1
-
-static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
-{
-	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
-	    trigger == IOAPIC_LEVEL) {
-		irq_desc[irq].status |= IRQ_LEVEL;
-		set_irq_chip_and_handler_name(irq, &ioapic_chip,
-					 handle_fasteoi_irq, "fasteoi");
-	} else {
-		irq_desc[irq].status &= ~IRQ_LEVEL;
-		set_irq_chip_and_handler_name(irq, &ioapic_chip,
-					 handle_edge_irq, "edge");
-	}
-	set_intr_gate(vector, interrupt[irq]);
-}
-
-static void __init setup_IO_APIC_irqs(void)
-{
-	struct IO_APIC_route_entry entry;
-	int apic, pin, idx, irq, first_notcon = 1, vector;
-
-	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
-
-	for (apic = 0; apic < nr_ioapics; apic++) {
-	for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-
-		/*
-		 * add it to the IO-APIC irq-routing table:
-		 */
-		memset(&entry, 0, sizeof(entry));
-
-		entry.delivery_mode = INT_DELIVERY_MODE;
-		entry.dest_mode = INT_DEST_MODE;
-		entry.mask = 0;				/* enable IRQ */
-		entry.dest.logical.logical_dest =
-					cpu_mask_to_apicid(TARGET_CPUS);
-
-		idx = find_irq_entry(apic, pin, mp_INT);
-		if (idx == -1) {
-			if (first_notcon) {
-				apic_printk(APIC_VERBOSE, KERN_DEBUG
-						" IO-APIC (apicid-pin) %d-%d",
-						mp_ioapics[apic].mp_apicid,
-						pin);
-				first_notcon = 0;
-			} else
-				apic_printk(APIC_VERBOSE, ", %d-%d",
-					mp_ioapics[apic].mp_apicid, pin);
-			continue;
-		}
-
-		if (!first_notcon) {
-			apic_printk(APIC_VERBOSE, " not connected.\n");
-			first_notcon = 1;
-		}
-
-		entry.trigger = irq_trigger(idx);
-		entry.polarity = irq_polarity(idx);
-
-		if (irq_trigger(idx)) {
-			entry.trigger = 1;
-			entry.mask = 1;
-		}
-
-		irq = pin_2_irq(idx, apic, pin);
-		/*
-		 * skip adding the timer int on secondary nodes, which causes
-		 * a small but painful rift in the time-space continuum
-		 */
-		if (multi_timer_check(apic, irq))
-			continue;
-		else
-			add_pin_to_irq(irq, apic, pin);
-
-		if (!apic && !IO_APIC_IRQ(irq))
-			continue;
-
-		if (IO_APIC_IRQ(irq)) {
-			vector = assign_irq_vector(irq);
-			entry.vector = vector;
-			ioapic_register_intr(irq, vector, IOAPIC_AUTO);
-
-			if (!apic && (irq < 16))
-				disable_8259A_irq(irq);
-		}
-		ioapic_write_entry(apic, pin, entry);
-	}
-	}
-
-	if (!first_notcon)
-		apic_printk(APIC_VERBOSE, " not connected.\n");
-}
-
-/*
- * Set up the timer pin, possibly with the 8259A-master behind.
- */
-static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
-					int vector)
-{
-	struct IO_APIC_route_entry entry;
-
-	memset(&entry, 0, sizeof(entry));
-
-	/*
-	 * We use logical delivery to get the timer IRQ
-	 * to the first CPU.
-	 */
-	entry.dest_mode = INT_DEST_MODE;
-	entry.mask = 1;					/* mask IRQ now */
-	entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
-	entry.delivery_mode = INT_DELIVERY_MODE;
-	entry.polarity = 0;
-	entry.trigger = 0;
-	entry.vector = vector;
-
-	/*
-	 * The timer IRQ doesn't have to know that behind the
-	 * scene we may have a 8259A-master in AEOI mode ...
-	 */
-	ioapic_register_intr(0, vector, IOAPIC_EDGE);
-
-	/*
-	 * Add it to the IO-APIC irq-routing table:
-	 */
-	ioapic_write_entry(apic, pin, entry);
-}
-
-void __init print_IO_APIC(void)
-{
-	int apic, i;
-	union IO_APIC_reg_00 reg_00;
-	union IO_APIC_reg_01 reg_01;
-	union IO_APIC_reg_02 reg_02;
-	union IO_APIC_reg_03 reg_03;
-	unsigned long flags;
-
-	if (apic_verbosity == APIC_QUIET)
-		return;
-
-	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
-	for (i = 0; i < nr_ioapics; i++)
-		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
-		       mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
-
-	/*
-	 * We are a bit conservative about what we expect.  We have to
-	 * know about every hardware change ASAP.
-	 */
-	printk(KERN_INFO "testing the IO APIC.......................\n");
-
-	for (apic = 0; apic < nr_ioapics; apic++) {
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	reg_00.raw = io_apic_read(apic, 0);
-	reg_01.raw = io_apic_read(apic, 1);
-	if (reg_01.bits.version >= 0x10)
-		reg_02.raw = io_apic_read(apic, 2);
-	if (reg_01.bits.version >= 0x20)
-		reg_03.raw = io_apic_read(apic, 3);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-
-	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
-	printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
-	printk(KERN_DEBUG ".......    : physical APIC id: %02X\n", reg_00.bits.ID);
-	printk(KERN_DEBUG ".......    : Delivery Type: %X\n", reg_00.bits.delivery_type);
-	printk(KERN_DEBUG ".......    : LTS          : %X\n", reg_00.bits.LTS);
-
-	printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
-	printk(KERN_DEBUG ".......     : max redirection entries: %04X\n", reg_01.bits.entries);
-
-	printk(KERN_DEBUG ".......     : PRQ implemented: %X\n", reg_01.bits.PRQ);
-	printk(KERN_DEBUG ".......     : IO APIC version: %04X\n", reg_01.bits.version);
-
-	/*
-	 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
-	 * but the value of reg_02 is read as the previous read register
-	 * value, so ignore it if reg_02 == reg_01.
-	 */
-	if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
-		printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
-		printk(KERN_DEBUG ".......     : arbitration: %02X\n", reg_02.bits.arbitration);
-	}
-
-	/*
-	 * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
-	 * or reg_03, but the value of reg_0[23] is read as the previous read
-	 * register value, so ignore it if reg_03 == reg_0[12].
-	 */
-	if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
-	    reg_03.raw != reg_01.raw) {
-		printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
-		printk(KERN_DEBUG ".......     : Boot DT    : %X\n", reg_03.bits.boot_DT);
-	}
-
-	printk(KERN_DEBUG ".... IRQ redirection table:\n");
-
-	printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
-			  " Stat Dest Deli Vect:   \n");
-
-	for (i = 0; i <= reg_01.bits.entries; i++) {
-		struct IO_APIC_route_entry entry;
-
-		entry = ioapic_read_entry(apic, i);
-
-		printk(KERN_DEBUG " %02x %03X %02X  ",
-			i,
-			entry.dest.logical.logical_dest,
-			entry.dest.physical.physical_dest
-		);
-
-		printk("%1d    %1d    %1d   %1d   %1d    %1d    %1d    %02X\n",
-			entry.mask,
-			entry.trigger,
-			entry.irr,
-			entry.polarity,
-			entry.delivery_status,
-			entry.dest_mode,
-			entry.delivery_mode,
-			entry.vector
-		);
-	}
-	}
-	printk(KERN_DEBUG "IRQ to pin mappings:\n");
-	for (i = 0; i < NR_IRQS; i++) {
-		struct irq_pin_list *entry = irq_2_pin + i;
-		if (entry->pin < 0)
-			continue;
-		printk(KERN_DEBUG "IRQ%d ", i);
-		for (;;) {
-			printk("-> %d:%d", entry->apic, entry->pin);
-			if (!entry->next)
-				break;
-			entry = irq_2_pin + entry->next;
-		}
-		printk("\n");
-	}
-
-	printk(KERN_INFO ".................................... done.\n");
-
-	return;
-}
-
-#if 0
-
-static void print_APIC_bitfield(int base)
-{
-	unsigned int v;
-	int i, j;
-
-	if (apic_verbosity == APIC_QUIET)
-		return;
-
-	printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
-	for (i = 0; i < 8; i++) {
-		v = apic_read(base + i*0x10);
-		for (j = 0; j < 32; j++) {
-			if (v & (1<<j))
-				printk("1");
-			else
-				printk("0");
-		}
-		printk("\n");
-	}
-}
-
-void /*__init*/ print_local_APIC(void *dummy)
-{
-	unsigned int v, ver, maxlvt;
-
-	if (apic_verbosity == APIC_QUIET)
-		return;
-
-	printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
-		smp_processor_id(), hard_smp_processor_id());
-	v = apic_read(APIC_ID);
-	printk(KERN_INFO "... APIC ID:      %08x (%01x)\n", v,
-			GET_APIC_ID(read_apic_id()));
-	v = apic_read(APIC_LVR);
-	printk(KERN_INFO "... APIC VERSION: %08x\n", v);
-	ver = GET_APIC_VERSION(v);
-	maxlvt = lapic_get_maxlvt();
-
-	v = apic_read(APIC_TASKPRI);
-	printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
-
-	if (APIC_INTEGRATED(ver)) {			/* !82489DX */
-		v = apic_read(APIC_ARBPRI);
-		printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
-			v & APIC_ARBPRI_MASK);
-		v = apic_read(APIC_PROCPRI);
-		printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
-	}
-
-	v = apic_read(APIC_EOI);
-	printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
-	v = apic_read(APIC_RRR);
-	printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
-	v = apic_read(APIC_LDR);
-	printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
-	v = apic_read(APIC_DFR);
-	printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
-	v = apic_read(APIC_SPIV);
-	printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
-
-	printk(KERN_DEBUG "... APIC ISR field:\n");
-	print_APIC_bitfield(APIC_ISR);
-	printk(KERN_DEBUG "... APIC TMR field:\n");
-	print_APIC_bitfield(APIC_TMR);
-	printk(KERN_DEBUG "... APIC IRR field:\n");
-	print_APIC_bitfield(APIC_IRR);
-
-	if (APIC_INTEGRATED(ver)) {		/* !82489DX */
-		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP. */
-			apic_write(APIC_ESR, 0);
-		v = apic_read(APIC_ESR);
-		printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
-	}
-
-	v = apic_read(APIC_ICR);
-	printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
-	v = apic_read(APIC_ICR2);
-	printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
-
-	v = apic_read(APIC_LVTT);
-	printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
-
-	if (maxlvt > 3) {                       /* PC is LVT#4. */
-		v = apic_read(APIC_LVTPC);
-		printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
-	}
-	v = apic_read(APIC_LVT0);
-	printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
-	v = apic_read(APIC_LVT1);
-	printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
-
-	if (maxlvt > 2) {			/* ERR is LVT#3. */
-		v = apic_read(APIC_LVTERR);
-		printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
-	}
-
-	v = apic_read(APIC_TMICT);
-	printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
-	v = apic_read(APIC_TMCCT);
-	printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
-	v = apic_read(APIC_TDCR);
-	printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
-	printk("\n");
-}
-
-void print_all_local_APICs(void)
-{
-	on_each_cpu(print_local_APIC, NULL, 1);
-}
-
-void /*__init*/ print_PIC(void)
-{
-	unsigned int v;
-	unsigned long flags;
-
-	if (apic_verbosity == APIC_QUIET)
-		return;
-
-	printk(KERN_DEBUG "\nprinting PIC contents\n");
-
-	spin_lock_irqsave(&i8259A_lock, flags);
-
-	v = inb(0xa1) << 8 | inb(0x21);
-	printk(KERN_DEBUG "... PIC  IMR: %04x\n", v);
-
-	v = inb(0xa0) << 8 | inb(0x20);
-	printk(KERN_DEBUG "... PIC  IRR: %04x\n", v);
-
-	outb(0x0b, 0xa0);
-	outb(0x0b, 0x20);
-	v = inb(0xa0) << 8 | inb(0x20);
-	outb(0x0a, 0xa0);
-	outb(0x0a, 0x20);
-
-	spin_unlock_irqrestore(&i8259A_lock, flags);
-
-	printk(KERN_DEBUG "... PIC  ISR: %04x\n", v);
-
-	v = inb(0x4d1) << 8 | inb(0x4d0);
-	printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
-}
-
-#endif  /*  0  */
-
-static void __init enable_IO_APIC(void)
-{
-	union IO_APIC_reg_01 reg_01;
-	int i8259_apic, i8259_pin;
-	int i, apic;
-	unsigned long flags;
-
-	for (i = 0; i < PIN_MAP_SIZE; i++) {
-		irq_2_pin[i].pin = -1;
-		irq_2_pin[i].next = 0;
-	}
-	if (!pirqs_enabled)
-		for (i = 0; i < MAX_PIRQS; i++)
-			pirq_entries[i] = -1;
-
-	/*
-	 * The number of IO-APIC IRQ registers (== #pins):
-	 */
-	for (apic = 0; apic < nr_ioapics; apic++) {
-		spin_lock_irqsave(&ioapic_lock, flags);
-		reg_01.raw = io_apic_read(apic, 1);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
-		nr_ioapic_registers[apic] = reg_01.bits.entries+1;
-	}
-	for (apic = 0; apic < nr_ioapics; apic++) {
-		int pin;
-		/* See if any of the pins is in ExtINT mode */
-		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
-			struct IO_APIC_route_entry entry;
-			entry = ioapic_read_entry(apic, pin);
-
-
-			/* If the interrupt line is enabled and in ExtInt mode
-			 * I have found the pin where the i8259 is connected.
-			 */
-			if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
-				ioapic_i8259.apic = apic;
-				ioapic_i8259.pin  = pin;
-				goto found_i8259;
-			}
-		}
-	}
- found_i8259:
-	/* Look to see what if the MP table has reported the ExtINT */
-	/* If we could not find the appropriate pin by looking at the ioapic
-	 * the i8259 probably is not connected the ioapic but give the
-	 * mptable a chance anyway.
-	 */
-	i8259_pin  = find_isa_irq_pin(0, mp_ExtINT);
-	i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
-	/* Trust the MP table if nothing is setup in the hardware */
-	if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
-		printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
-		ioapic_i8259.pin  = i8259_pin;
-		ioapic_i8259.apic = i8259_apic;
-	}
-	/* Complain if the MP table and the hardware disagree */
-	if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
-		(i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
-	{
-		printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
-	}
-
-	/*
-	 * Do not trust the IO-APIC being empty at bootup
-	 */
-	clear_IO_APIC();
-}
-
-/*
- * Not an __init, needed by the reboot code
- */
-void disable_IO_APIC(void)
-{
-	/*
-	 * Clear the IO-APIC before rebooting:
-	 */
-	clear_IO_APIC();
-
-	/*
-	 * If the i8259 is routed through an IOAPIC
-	 * Put that IOAPIC in virtual wire mode
-	 * so legacy interrupts can be delivered.
-	 */
-	if (ioapic_i8259.pin != -1) {
-		struct IO_APIC_route_entry entry;
-
-		memset(&entry, 0, sizeof(entry));
-		entry.mask            = 0; /* Enabled */
-		entry.trigger         = 0; /* Edge */
-		entry.irr             = 0;
-		entry.polarity        = 0; /* High */
-		entry.delivery_status = 0;
-		entry.dest_mode       = 0; /* Physical */
-		entry.delivery_mode   = dest_ExtINT; /* ExtInt */
-		entry.vector          = 0;
-		entry.dest.physical.physical_dest =
-					GET_APIC_ID(read_apic_id());
-
-		/*
-		 * Add it to the IO-APIC irq-routing table:
-		 */
-		ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
-	}
-	disconnect_bsp_APIC(ioapic_i8259.pin != -1);
-}
-
-/*
- * function to set the IO-APIC physical IDs based on the
- * values stored in the MPC table.
- *
- * by Matt Domsch <Matt_Domsch@dell.com>  Tue Dec 21 12:25:05 CST 1999
- */
-
-static void __init setup_ioapic_ids_from_mpc(void)
-{
-	union IO_APIC_reg_00 reg_00;
-	physid_mask_t phys_id_present_map;
-	int apic;
-	int i;
-	unsigned char old_id;
-	unsigned long flags;
-
-#ifdef CONFIG_X86_NUMAQ
-	if (found_numaq)
-		return;
-#endif
-
-	/*
-	 * Don't check I/O APIC IDs for xAPIC systems.  They have
-	 * no meaning without the serial APIC bus.
-	 */
-	if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-		|| APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
-		return;
-	/*
-	 * This is broken; anything with a real cpu count has to
-	 * circumvent this idiocy regardless.
-	 */
-	phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
-
-	/*
-	 * Set the IOAPIC ID to the value stored in the MPC table.
-	 */
-	for (apic = 0; apic < nr_ioapics; apic++) {
-
-		/* Read the register 0 value */
-		spin_lock_irqsave(&ioapic_lock, flags);
-		reg_00.raw = io_apic_read(apic, 0);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
-
-		old_id = mp_ioapics[apic].mp_apicid;
-
-		if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
-			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
-				apic, mp_ioapics[apic].mp_apicid);
-			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
-				reg_00.bits.ID);
-			mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
-		}
-
-		/*
-		 * Sanity check, is the ID really free? Every APIC in a
-		 * system must have a unique ID or we get lots of nice
-		 * 'stuck on smp_invalidate_needed IPI wait' messages.
-		 */
-		if (check_apicid_used(phys_id_present_map,
-					mp_ioapics[apic].mp_apicid)) {
-			printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
-				apic, mp_ioapics[apic].mp_apicid);
-			for (i = 0; i < get_physical_broadcast(); i++)
-				if (!physid_isset(i, phys_id_present_map))
-					break;
-			if (i >= get_physical_broadcast())
-				panic("Max APIC ID exceeded!\n");
-			printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
-				i);
-			physid_set(i, phys_id_present_map);
-			mp_ioapics[apic].mp_apicid = i;
-		} else {
-			physid_mask_t tmp;
-			tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
-			apic_printk(APIC_VERBOSE, "Setting %d in the "
-					"phys_id_present_map\n",
-					mp_ioapics[apic].mp_apicid);
-			physids_or(phys_id_present_map, phys_id_present_map, tmp);
-		}
-
-
-		/*
-		 * We need to adjust the IRQ routing table
-		 * if the ID changed.
-		 */
-		if (old_id != mp_ioapics[apic].mp_apicid)
-			for (i = 0; i < mp_irq_entries; i++)
-				if (mp_irqs[i].mp_dstapic == old_id)
-					mp_irqs[i].mp_dstapic
-						= mp_ioapics[apic].mp_apicid;
-
-		/*
-		 * Read the right value from the MPC table and
-		 * write it into the ID register.
-		 */
-		apic_printk(APIC_VERBOSE, KERN_INFO
-			"...changing IO-APIC physical APIC ID to %d ...",
-			mp_ioapics[apic].mp_apicid);
-
-		reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
-		spin_lock_irqsave(&ioapic_lock, flags);
-		io_apic_write(apic, 0, reg_00.raw);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
-
-		/*
-		 * Sanity check
-		 */
-		spin_lock_irqsave(&ioapic_lock, flags);
-		reg_00.raw = io_apic_read(apic, 0);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
-		if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
-			printk("could not set ID!\n");
-		else
-			apic_printk(APIC_VERBOSE, " ok.\n");
-	}
-}
-
-int no_timer_check __initdata;
-
-static int __init notimercheck(char *s)
-{
-	no_timer_check = 1;
-	return 1;
-}
-__setup("no_timer_check", notimercheck);
-
-/*
- * There is a nasty bug in some older SMP boards, their mptable lies
- * about the timer IRQ. We do the following to work around the situation:
- *
- *	- timer IRQ defaults to IO-APIC IRQ
- *	- if this function detects that timer IRQs are defunct, then we fall
- *	  back to ISA timer IRQs
- */
-static int __init timer_irq_works(void)
-{
-	unsigned long t1 = jiffies;
-	unsigned long flags;
-
-	if (no_timer_check)
-		return 1;
-
-	local_save_flags(flags);
-	local_irq_enable();
-	/* Let ten ticks pass... */
-	mdelay((10 * 1000) / HZ);
-	local_irq_restore(flags);
-
-	/*
-	 * Expect a few ticks at least, to be sure some possible
-	 * glue logic does not lock up after one or two first
-	 * ticks in a non-ExtINT mode.  Also the local APIC
-	 * might have cached one ExtINT interrupt.  Finally, at
-	 * least one tick may be lost due to delays.
-	 */
-	if (time_after(jiffies, t1 + 4))
-		return 1;
-
-	return 0;
-}
-
-/*
- * In the SMP+IOAPIC case it might happen that there are an unspecified
- * number of pending IRQ events unhandled. These cases are very rare,
- * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
- * better to do it this way as thus we do not have to be aware of
- * 'pending' interrupts in the IRQ path, except at this point.
- */
-/*
- * Edge triggered needs to resend any interrupt
- * that was delayed but this is now handled in the device
- * independent code.
- */
-
-/*
- * Startup quirk:
- *
- * Starting up a edge-triggered IO-APIC interrupt is
- * nasty - we need to make sure that we get the edge.
- * If it is already asserted for some reason, we need
- * return 1 to indicate that is was pending.
- *
- * This is not complete - we should be able to fake
- * an edge even if it isn't on the 8259A...
- *
- * (We do this for level-triggered IRQs too - it cannot hurt.)
- */
-static unsigned int startup_ioapic_irq(unsigned int irq)
-{
-	int was_pending = 0;
-	unsigned long flags;
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	if (irq < 16) {
-		disable_8259A_irq(irq);
-		if (i8259A_irq_pending(irq))
-			was_pending = 1;
-	}
-	__unmask_IO_APIC_irq(irq);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-
-	return was_pending;
-}
-
-static void ack_ioapic_irq(unsigned int irq)
-{
-	move_native_irq(irq);
-	ack_APIC_irq();
-}
-
-static void ack_ioapic_quirk_irq(unsigned int irq)
-{
-	unsigned long v;
-	int i;
-
-	move_native_irq(irq);
-/*
- * It appears there is an erratum which affects at least version 0x11
- * of I/O APIC (that's the 82093AA and cores integrated into various
- * chipsets).  Under certain conditions a level-triggered interrupt is
- * erroneously delivered as edge-triggered one but the respective IRR
- * bit gets set nevertheless.  As a result the I/O unit expects an EOI
- * message but it will never arrive and further interrupts are blocked
- * from the source.  The exact reason is so far unknown, but the
- * phenomenon was observed when two consecutive interrupt requests
- * from a given source get delivered to the same CPU and the source is
- * temporarily disabled in between.
- *
- * A workaround is to simulate an EOI message manually.  We achieve it
- * by setting the trigger mode to edge and then to level when the edge
- * trigger mode gets detected in the TMR of a local APIC for a
- * level-triggered interrupt.  We mask the source for the time of the
- * operation to prevent an edge-triggered interrupt escaping meanwhile.
- * The idea is from Manfred Spraul.  --macro
- */
-	i = irq_vector[irq];
-
-	v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
-
-	ack_APIC_irq();
-
-	if (!(v & (1 << (i & 0x1f)))) {
-		atomic_inc(&irq_mis_count);
-		spin_lock(&ioapic_lock);
-		__mask_and_edge_IO_APIC_irq(irq);
-		__unmask_and_level_IO_APIC_irq(irq);
-		spin_unlock(&ioapic_lock);
-	}
-}
-
-static int ioapic_retrigger_irq(unsigned int irq)
-{
-	send_IPI_self(irq_vector[irq]);
-
-	return 1;
-}
-
-static struct irq_chip ioapic_chip __read_mostly = {
-	.name 		= "IO-APIC",
-	.startup 	= startup_ioapic_irq,
-	.mask	 	= mask_IO_APIC_irq,
-	.unmask	 	= unmask_IO_APIC_irq,
-	.ack 		= ack_ioapic_irq,
-	.eoi 		= ack_ioapic_quirk_irq,
-#ifdef CONFIG_SMP
-	.set_affinity 	= set_ioapic_affinity_irq,
-#endif
-	.retrigger	= ioapic_retrigger_irq,
-};
-
-
-static inline void init_IO_APIC_traps(void)
-{
-	int irq;
-
-	/*
-	 * NOTE! The local APIC isn't very good at handling
-	 * multiple interrupts at the same interrupt level.
-	 * As the interrupt level is determined by taking the
-	 * vector number and shifting that right by 4, we
-	 * want to spread these out a bit so that they don't
-	 * all fall in the same interrupt level.
-	 *
-	 * Also, we've got to be careful not to trash gate
-	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
-	 */
-	for (irq = 0; irq < NR_IRQS ; irq++) {
-		if (IO_APIC_IRQ(irq) && !irq_vector[irq]) {
-			/*
-			 * Hmm.. We don't have an entry for this,
-			 * so default to an old-fashioned 8259
-			 * interrupt if we can..
-			 */
-			if (irq < 16)
-				make_8259A_irq(irq);
-			else
-				/* Strange. Oh, well.. */
-				irq_desc[irq].chip = &no_irq_chip;
-		}
-	}
-}
-
-/*
- * The local APIC irq-chip implementation:
- */
-
-static void ack_lapic_irq(unsigned int irq)
-{
-	ack_APIC_irq();
-}
-
-static void mask_lapic_irq(unsigned int irq)
-{
-	unsigned long v;
-
-	v = apic_read(APIC_LVT0);
-	apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
-}
-
-static void unmask_lapic_irq(unsigned int irq)
-{
-	unsigned long v;
-
-	v = apic_read(APIC_LVT0);
-	apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
-}
-
-static struct irq_chip lapic_chip __read_mostly = {
-	.name		= "local-APIC",
-	.mask		= mask_lapic_irq,
-	.unmask		= unmask_lapic_irq,
-	.ack		= ack_lapic_irq,
-};
-
-static void lapic_register_intr(int irq, int vector)
-{
-	irq_desc[irq].status &= ~IRQ_LEVEL;
-	set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
-				      "edge");
-	set_intr_gate(vector, interrupt[irq]);
-}
-
-static void __init setup_nmi(void)
-{
-	/*
-	 * Dirty trick to enable the NMI watchdog ...
-	 * We put the 8259A master into AEOI mode and
-	 * unmask on all local APICs LVT0 as NMI.
-	 *
-	 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
-	 * is from Maciej W. Rozycki - so we do not have to EOI from
-	 * the NMI handler or the timer interrupt.
-	 */
-	apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
-
-	enable_NMI_through_LVT0();
-
-	apic_printk(APIC_VERBOSE, " done.\n");
-}
-
-/*
- * This looks a bit hackish but it's about the only one way of sending
- * a few INTA cycles to 8259As and any associated glue logic.  ICR does
- * not support the ExtINT mode, unfortunately.  We need to send these
- * cycles as some i82489DX-based boards have glue logic that keeps the
- * 8259A interrupt line asserted until INTA.  --macro
- */
-static inline void __init unlock_ExtINT_logic(void)
-{
-	int apic, pin, i;
-	struct IO_APIC_route_entry entry0, entry1;
-	unsigned char save_control, save_freq_select;
-
-	pin  = find_isa_irq_pin(8, mp_INT);
-	if (pin == -1) {
-		WARN_ON_ONCE(1);
-		return;
-	}
-	apic = find_isa_irq_apic(8, mp_INT);
-	if (apic == -1) {
-		WARN_ON_ONCE(1);
-		return;
-	}
-
-	entry0 = ioapic_read_entry(apic, pin);
-	clear_IO_APIC_pin(apic, pin);
-
-	memset(&entry1, 0, sizeof(entry1));
-
-	entry1.dest_mode = 0;			/* physical delivery */
-	entry1.mask = 0;			/* unmask IRQ now */
-	entry1.dest.physical.physical_dest = hard_smp_processor_id();
-	entry1.delivery_mode = dest_ExtINT;
-	entry1.polarity = entry0.polarity;
-	entry1.trigger = 0;
-	entry1.vector = 0;
-
-	ioapic_write_entry(apic, pin, entry1);
-
-	save_control = CMOS_READ(RTC_CONTROL);
-	save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
-	CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
-		   RTC_FREQ_SELECT);
-	CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
-
-	i = 100;
-	while (i-- > 0) {
-		mdelay(10);
-		if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
-			i -= 10;
-	}
-
-	CMOS_WRITE(save_control, RTC_CONTROL);
-	CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
-	clear_IO_APIC_pin(apic, pin);
-
-	ioapic_write_entry(apic, pin, entry0);
-}
-
-/*
- * This code may look a bit paranoid, but it's supposed to cooperate with
- * a wide range of boards and BIOS bugs.  Fortunately only the timer IRQ
- * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
- * fanatically on his truly buggy board.
- */
-static inline void __init check_timer(void)
-{
-	int apic1, pin1, apic2, pin2;
-	int no_pin1 = 0;
-	int vector;
-	unsigned int ver;
-	unsigned long flags;
-
-	local_irq_save(flags);
-
-	ver = apic_read(APIC_LVR);
-	ver = GET_APIC_VERSION(ver);
-
-	/*
-	 * get/set the timer IRQ vector:
-	 */
-	disable_8259A_irq(0);
-	vector = assign_irq_vector(0);
-	set_intr_gate(vector, interrupt[0]);
-
-	/*
-	 * As IRQ0 is to be enabled in the 8259A, the virtual
-	 * wire has to be disabled in the local APIC.  Also
-	 * timer interrupts need to be acknowledged manually in
-	 * the 8259A for the i82489DX when using the NMI
-	 * watchdog as that APIC treats NMIs as level-triggered.
-	 * The AEOI mode will finish them in the 8259A
-	 * automatically.
-	 */
-	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
-	init_8259A(1);
-	timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
-
-	pin1  = find_isa_irq_pin(0, mp_INT);
-	apic1 = find_isa_irq_apic(0, mp_INT);
-	pin2  = ioapic_i8259.pin;
-	apic2 = ioapic_i8259.apic;
-
-	printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
-		vector, apic1, pin1, apic2, pin2);
-
-	/*
-	 * Some BIOS writers are clueless and report the ExtINTA
-	 * I/O APIC input from the cascaded 8259A as the timer
-	 * interrupt input.  So just in case, if only one pin
-	 * was found above, try it both directly and through the
-	 * 8259A.
-	 */
-	if (pin1 == -1) {
-		pin1 = pin2;
-		apic1 = apic2;
-		no_pin1 = 1;
-	} else if (pin2 == -1) {
-		pin2 = pin1;
-		apic2 = apic1;
-	}
-
-	if (pin1 != -1) {
-		/*
-		 * Ok, does IRQ0 through the IOAPIC work?
-		 */
-		if (no_pin1) {
-			add_pin_to_irq(0, apic1, pin1);
-			setup_timer_IRQ0_pin(apic1, pin1, vector);
-		}
-		unmask_IO_APIC_irq(0);
-		if (timer_irq_works()) {
-			if (nmi_watchdog == NMI_IO_APIC) {
-				setup_nmi();
-				enable_8259A_irq(0);
-			}
-			if (disable_timer_pin_1 > 0)
-				clear_IO_APIC_pin(0, pin1);
-			goto out;
-		}
-		clear_IO_APIC_pin(apic1, pin1);
-		if (!no_pin1)
-			printk(KERN_ERR "..MP-BIOS bug: "
-			       "8254 timer not connected to IO-APIC\n");
-
-		printk(KERN_INFO "...trying to set up timer (IRQ0) "
-		       "through the 8259A ... ");
-		printk("\n..... (found pin %d) ...", pin2);
-		/*
-		 * legacy devices should be connected to IO APIC #0
-		 */
-		replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
-		setup_timer_IRQ0_pin(apic2, pin2, vector);
-		unmask_IO_APIC_irq(0);
-		enable_8259A_irq(0);
-		if (timer_irq_works()) {
-			printk("works.\n");
-			timer_through_8259 = 1;
-			if (nmi_watchdog == NMI_IO_APIC) {
-				disable_8259A_irq(0);
-				setup_nmi();
-				enable_8259A_irq(0);
-			}
-			goto out;
-		}
-		/*
-		 * Cleanup, just in case ...
-		 */
-		disable_8259A_irq(0);
-		clear_IO_APIC_pin(apic2, pin2);
-		printk(" failed.\n");
-	}
-
-	if (nmi_watchdog == NMI_IO_APIC) {
-		printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
-		nmi_watchdog = NMI_NONE;
-	}
-	timer_ack = 0;
-
-	printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
-
-	lapic_register_intr(0, vector);
-	apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector);	/* Fixed mode */
-	enable_8259A_irq(0);
-
-	if (timer_irq_works()) {
-		printk(" works.\n");
-		goto out;
-	}
-	disable_8259A_irq(0);
-	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
-	printk(" failed.\n");
-
-	printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
-
-	init_8259A(0);
-	make_8259A_irq(0);
-	apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
-
-	unlock_ExtINT_logic();
-
-	if (timer_irq_works()) {
-		printk(" works.\n");
-		goto out;
-	}
-	printk(" failed :(.\n");
-	panic("IO-APIC + timer doesn't work!  Boot with apic=debug and send a "
-		"report.  Then try booting with the 'noapic' option");
-out:
-	local_irq_restore(flags);
-}
-
-/*
- * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
- * to devices.  However there may be an I/O APIC pin available for
- * this interrupt regardless.  The pin may be left unconnected, but
- * typically it will be reused as an ExtINT cascade interrupt for
- * the master 8259A.  In the MPS case such a pin will normally be
- * reported as an ExtINT interrupt in the MP table.  With ACPI
- * there is no provision for ExtINT interrupts, and in the absence
- * of an override it would be treated as an ordinary ISA I/O APIC
- * interrupt, that is edge-triggered and unmasked by default.  We
- * used to do this, but it caused problems on some systems because
- * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
- * the same ExtINT cascade interrupt to drive the local APIC of the
- * bootstrap processor.  Therefore we refrain from routing IRQ2 to
- * the I/O APIC in all cases now.  No actual device should request
- * it anyway.  --macro
- */
-#define PIC_IRQS	(1 << PIC_CASCADE_IR)
-
-void __init setup_IO_APIC(void)
-{
-	int i;
-
-	/* Reserve all the system vectors. */
-	for (i = first_system_vector; i < NR_VECTORS; i++)
-		set_bit(i, used_vectors);
-
-	enable_IO_APIC();
-
-	io_apic_irqs = ~PIC_IRQS;
-
-	printk("ENABLING IO-APIC IRQs\n");
-
-	/*
-	 * Set up IO-APIC IRQ routing.
-	 */
-	if (!acpi_ioapic)
-		setup_ioapic_ids_from_mpc();
-	sync_Arb_IDs();
-	setup_IO_APIC_irqs();
-	init_IO_APIC_traps();
-	check_timer();
-	if (!acpi_ioapic)
-		print_IO_APIC();
-}
-
-/*
- *	Called after all the initialization is done. If we didnt find any
- *	APIC bugs then we can allow the modify fast path
- */
-
-static int __init io_apic_bug_finalize(void)
-{
-	if (sis_apic_bug == -1)
-		sis_apic_bug = 0;
-	return 0;
-}
-
-late_initcall(io_apic_bug_finalize);
-
-struct sysfs_ioapic_data {
-	struct sys_device dev;
-	struct IO_APIC_route_entry entry[0];
-};
-static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
-
-static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
-{
-	struct IO_APIC_route_entry *entry;
-	struct sysfs_ioapic_data *data;
-	int i;
-
-	data = container_of(dev, struct sysfs_ioapic_data, dev);
-	entry = data->entry;
-	for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
-		entry[i] = ioapic_read_entry(dev->id, i);
-
-	return 0;
-}
-
-static int ioapic_resume(struct sys_device *dev)
-{
-	struct IO_APIC_route_entry *entry;
-	struct sysfs_ioapic_data *data;
-	unsigned long flags;
-	union IO_APIC_reg_00 reg_00;
-	int i;
-
-	data = container_of(dev, struct sysfs_ioapic_data, dev);
-	entry = data->entry;
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	reg_00.raw = io_apic_read(dev->id, 0);
-	if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
-		reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
-		io_apic_write(dev->id, 0, reg_00.raw);
-	}
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-	for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
-		ioapic_write_entry(dev->id, i, entry[i]);
-
-	return 0;
-}
-
-static struct sysdev_class ioapic_sysdev_class = {
-	.name = "ioapic",
-	.suspend = ioapic_suspend,
-	.resume = ioapic_resume,
-};
-
-static int __init ioapic_init_sysfs(void)
-{
-	struct sys_device *dev;
-	int i, size, error = 0;
-
-	error = sysdev_class_register(&ioapic_sysdev_class);
-	if (error)
-		return error;
-
-	for (i = 0; i < nr_ioapics; i++) {
-		size = sizeof(struct sys_device) + nr_ioapic_registers[i]
-			* sizeof(struct IO_APIC_route_entry);
-		mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
-		if (!mp_ioapic_data[i]) {
-			printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
-			continue;
-		}
-		dev = &mp_ioapic_data[i]->dev;
-		dev->id = i;
-		dev->cls = &ioapic_sysdev_class;
-		error = sysdev_register(dev);
-		if (error) {
-			kfree(mp_ioapic_data[i]);
-			mp_ioapic_data[i] = NULL;
-			printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
-			continue;
-		}
-	}
-
-	return 0;
-}
-
-device_initcall(ioapic_init_sysfs);
-
-/*
- * Dynamic irq allocate and deallocation
- */
-int create_irq(void)
-{
-	/* Allocate an unused irq */
-	int irq, new, vector = 0;
-	unsigned long flags;
-
-	irq = -ENOSPC;
-	spin_lock_irqsave(&vector_lock, flags);
-	for (new = (NR_IRQS - 1); new >= 0; new--) {
-		if (platform_legacy_irq(new))
-			continue;
-		if (irq_vector[new] != 0)
-			continue;
-		vector = __assign_irq_vector(new);
-		if (likely(vector > 0))
-			irq = new;
-		break;
-	}
-	spin_unlock_irqrestore(&vector_lock, flags);
-
-	if (irq >= 0) {
-		set_intr_gate(vector, interrupt[irq]);
-		dynamic_irq_init(irq);
-	}
-	return irq;
-}
-
-void destroy_irq(unsigned int irq)
-{
-	unsigned long flags;
-
-	dynamic_irq_cleanup(irq);
-
-	spin_lock_irqsave(&vector_lock, flags);
-	clear_bit(irq_vector[irq], used_vectors);
-	irq_vector[irq] = 0;
-	spin_unlock_irqrestore(&vector_lock, flags);
-}
-
-/*
- * MSI message composition
- */
-#ifdef CONFIG_PCI_MSI
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
-{
-	int vector;
-	unsigned dest;
-
-	vector = assign_irq_vector(irq);
-	if (vector >= 0) {
-		dest = cpu_mask_to_apicid(TARGET_CPUS);
-
-		msg->address_hi = MSI_ADDR_BASE_HI;
-		msg->address_lo =
-			MSI_ADDR_BASE_LO |
-			((INT_DEST_MODE == 0) ?
-MSI_ADDR_DEST_MODE_PHYSICAL:
-				MSI_ADDR_DEST_MODE_LOGICAL) |
-			((INT_DELIVERY_MODE != dest_LowestPrio) ?
-				MSI_ADDR_REDIRECTION_CPU:
-				MSI_ADDR_REDIRECTION_LOWPRI) |
-			MSI_ADDR_DEST_ID(dest);
-
-		msg->data =
-			MSI_DATA_TRIGGER_EDGE |
-			MSI_DATA_LEVEL_ASSERT |
-			((INT_DELIVERY_MODE != dest_LowestPrio) ?
-MSI_DATA_DELIVERY_FIXED:
-				MSI_DATA_DELIVERY_LOWPRI) |
-			MSI_DATA_VECTOR(vector);
-	}
-	return vector;
-}
-
-#ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
-{
-	struct msi_msg msg;
-	unsigned int dest;
-	cpumask_t tmp;
-	int vector;
-
-	cpus_and(tmp, mask, cpu_online_map);
-	if (cpus_empty(tmp))
-		tmp = TARGET_CPUS;
-
-	vector = assign_irq_vector(irq);
-	if (vector < 0)
-		return;
-
-	dest = cpu_mask_to_apicid(mask);
-
-	read_msi_msg(irq, &msg);
-
-	msg.data &= ~MSI_DATA_VECTOR_MASK;
-	msg.data |= MSI_DATA_VECTOR(vector);
-	msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
-	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-
-	write_msi_msg(irq, &msg);
-	irq_desc[irq].affinity = mask;
-}
-#endif /* CONFIG_SMP */
-
-/*
- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
- * which implement the MSI or MSI-X Capability Structure.
- */
-static struct irq_chip msi_chip = {
-	.name		= "PCI-MSI",
-	.unmask		= unmask_msi_irq,
-	.mask		= mask_msi_irq,
-	.ack		= ack_ioapic_irq,
-#ifdef CONFIG_SMP
-	.set_affinity	= set_msi_irq_affinity,
-#endif
-	.retrigger	= ioapic_retrigger_irq,
-};
-
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
-{
-	struct msi_msg msg;
-	int irq, ret;
-	irq = create_irq();
-	if (irq < 0)
-		return irq;
-
-	ret = msi_compose_msg(dev, irq, &msg);
-	if (ret < 0) {
-		destroy_irq(irq);
-		return ret;
-	}
-
-	set_irq_msi(irq, desc);
-	write_msi_msg(irq, &msg);
-
-	set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
-				      "edge");
-
-	return 0;
-}
-
-void arch_teardown_msi_irq(unsigned int irq)
-{
-	destroy_irq(irq);
-}
-
-#endif /* CONFIG_PCI_MSI */
-
-/*
- * Hypertransport interrupt support
- */
-#ifdef CONFIG_HT_IRQ
-
-#ifdef CONFIG_SMP
-
-static void target_ht_irq(unsigned int irq, unsigned int dest)
-{
-	struct ht_irq_msg msg;
-	fetch_ht_irq_msg(irq, &msg);
-
-	msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
-	msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
-
-	msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
-	msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
-
-	write_ht_irq_msg(irq, &msg);
-}
-
-static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
-{
-	unsigned int dest;
-	cpumask_t tmp;
-
-	cpus_and(tmp, mask, cpu_online_map);
-	if (cpus_empty(tmp))
-		tmp = TARGET_CPUS;
-
-	cpus_and(mask, tmp, CPU_MASK_ALL);
-
-	dest = cpu_mask_to_apicid(mask);
-
-	target_ht_irq(irq, dest);
-	irq_desc[irq].affinity = mask;
-}
-#endif
-
-static struct irq_chip ht_irq_chip = {
-	.name		= "PCI-HT",
-	.mask		= mask_ht_irq,
-	.unmask		= unmask_ht_irq,
-	.ack		= ack_ioapic_irq,
-#ifdef CONFIG_SMP
-	.set_affinity	= set_ht_irq_affinity,
-#endif
-	.retrigger	= ioapic_retrigger_irq,
-};
-
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
-{
-	int vector;
-
-	vector = assign_irq_vector(irq);
-	if (vector >= 0) {
-		struct ht_irq_msg msg;
-		unsigned dest;
-		cpumask_t tmp;
-
-		cpus_clear(tmp);
-		cpu_set(vector >> 8, tmp);
-		dest = cpu_mask_to_apicid(tmp);
-
-		msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
-
-		msg.address_lo =
-			HT_IRQ_LOW_BASE |
-			HT_IRQ_LOW_DEST_ID(dest) |
-			HT_IRQ_LOW_VECTOR(vector) |
-			((INT_DEST_MODE == 0) ?
-				HT_IRQ_LOW_DM_PHYSICAL :
-				HT_IRQ_LOW_DM_LOGICAL) |
-			HT_IRQ_LOW_RQEOI_EDGE |
-			((INT_DELIVERY_MODE != dest_LowestPrio) ?
-				HT_IRQ_LOW_MT_FIXED :
-				HT_IRQ_LOW_MT_ARBITRATED) |
-			HT_IRQ_LOW_IRQ_MASKED;
-
-		write_ht_irq_msg(irq, &msg);
-
-		set_irq_chip_and_handler_name(irq, &ht_irq_chip,
-					      handle_edge_irq, "edge");
-	}
-	return vector;
-}
-#endif /* CONFIG_HT_IRQ */
-
-/* --------------------------------------------------------------------------
-			ACPI-based IOAPIC Configuration
-   -------------------------------------------------------------------------- */
-
-#ifdef CONFIG_ACPI
-
-int __init io_apic_get_unique_id(int ioapic, int apic_id)
-{
-	union IO_APIC_reg_00 reg_00;
-	static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
-	physid_mask_t tmp;
-	unsigned long flags;
-	int i = 0;
-
-	/*
-	 * The P4 platform supports up to 256 APIC IDs on two separate APIC
-	 * buses (one for LAPICs, one for IOAPICs), where predecessors only
-	 * supports up to 16 on one shared APIC bus.
-	 *
-	 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
-	 *      advantage of new APIC bus architecture.
-	 */
-
-	if (physids_empty(apic_id_map))
-		apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	reg_00.raw = io_apic_read(ioapic, 0);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-
-	if (apic_id >= get_physical_broadcast()) {
-		printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
-			"%d\n", ioapic, apic_id, reg_00.bits.ID);
-		apic_id = reg_00.bits.ID;
-	}
-
-	/*
-	 * Every APIC in a system must have a unique ID or we get lots of nice
-	 * 'stuck on smp_invalidate_needed IPI wait' messages.
-	 */
-	if (check_apicid_used(apic_id_map, apic_id)) {
-
-		for (i = 0; i < get_physical_broadcast(); i++) {
-			if (!check_apicid_used(apic_id_map, i))
-				break;
-		}
-
-		if (i == get_physical_broadcast())
-			panic("Max apic_id exceeded!\n");
-
-		printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
-			"trying %d\n", ioapic, apic_id, i);
-
-		apic_id = i;
-	}
-
-	tmp = apicid_to_cpu_present(apic_id);
-	physids_or(apic_id_map, apic_id_map, tmp);
-
-	if (reg_00.bits.ID != apic_id) {
-		reg_00.bits.ID = apic_id;
-
-		spin_lock_irqsave(&ioapic_lock, flags);
-		io_apic_write(ioapic, 0, reg_00.raw);
-		reg_00.raw = io_apic_read(ioapic, 0);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
-
-		/* Sanity check */
-		if (reg_00.bits.ID != apic_id) {
-			printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
-			return -1;
-		}
-	}
-
-	apic_printk(APIC_VERBOSE, KERN_INFO
-			"IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
-
-	return apic_id;
-}
-
-
-int __init io_apic_get_version(int ioapic)
-{
-	union IO_APIC_reg_01	reg_01;
-	unsigned long flags;
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	reg_01.raw = io_apic_read(ioapic, 1);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-
-	return reg_01.bits.version;
-}
-
-
-int __init io_apic_get_redir_entries(int ioapic)
-{
-	union IO_APIC_reg_01	reg_01;
-	unsigned long flags;
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	reg_01.raw = io_apic_read(ioapic, 1);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-
-	return reg_01.bits.entries;
-}
-
-
-int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
-{
-	struct IO_APIC_route_entry entry;
-
-	if (!IO_APIC_IRQ(irq)) {
-		printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
-			ioapic);
-		return -EINVAL;
-	}
-
-	/*
-	 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
-	 * Note that we mask (disable) IRQs now -- these get enabled when the
-	 * corresponding device driver registers for this IRQ.
-	 */
-
-	memset(&entry, 0, sizeof(entry));
-
-	entry.delivery_mode = INT_DELIVERY_MODE;
-	entry.dest_mode = INT_DEST_MODE;
-	entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
-	entry.trigger = edge_level;
-	entry.polarity = active_high_low;
-	entry.mask  = 1;
-
-	/*
-	 * IRQs < 16 are already in the irq_2_pin[] map
-	 */
-	if (irq >= 16)
-		add_pin_to_irq(irq, ioapic, pin);
-
-	entry.vector = assign_irq_vector(irq);
-
-	apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
-		"(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
-		mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
-		edge_level, active_high_low);
-
-	ioapic_register_intr(irq, entry.vector, edge_level);
-
-	if (!ioapic && (irq < 16))
-		disable_8259A_irq(irq);
-
-	ioapic_write_entry(ioapic, pin, entry);
-
-	return 0;
-}
-
-int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
-{
-	int i;
-
-	if (skip_ioapic_setup)
-		return -1;
-
-	for (i = 0; i < mp_irq_entries; i++)
-		if (mp_irqs[i].mp_irqtype == mp_INT &&
-		    mp_irqs[i].mp_srcbusirq == bus_irq)
-			break;
-	if (i >= mp_irq_entries)
-		return -1;
-
-	*trigger = irq_trigger(i);
-	*polarity = irq_polarity(i);
-	return 0;
-}
-
-#endif /* CONFIG_ACPI */
-
-static int __init parse_disable_timer_pin_1(char *arg)
-{
-	disable_timer_pin_1 = 1;
-	return 0;
-}
-early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
-
-static int __init parse_enable_timer_pin_1(char *arg)
-{
-	disable_timer_pin_1 = -1;
-	return 0;
-}
-early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
-
-static int __init parse_noapic(char *arg)
-{
-	/* disable IO-APIC */
-	disable_ioapic_setup();
-	return 0;
-}
-early_param("noapic", parse_noapic);
-
-void __init ioapic_init_mappings(void)
-{
-	unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
-	int i;
-
-	for (i = 0; i < nr_ioapics; i++) {
-		if (smp_found_config) {
-			ioapic_phys = mp_ioapics[i].mp_apicaddr;
-			if (!ioapic_phys) {
-				printk(KERN_ERR
-				       "WARNING: bogus zero IO-APIC "
-				       "address found in MPTABLE, "
-				       "disabling IO/APIC support!\n");
-				smp_found_config = 0;
-				skip_ioapic_setup = 1;
-				goto fake_ioapic_page;
-			}
-		} else {
-fake_ioapic_page:
-			ioapic_phys = (unsigned long)
-				      alloc_bootmem_pages(PAGE_SIZE);
-			ioapic_phys = __pa(ioapic_phys);
-		}
-		set_fixmap_nocache(idx, ioapic_phys);
-		printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
-		       __fix_to_virt(idx), ioapic_phys);
-		idx++;
-	}
-}
-
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 5921e5f0a640..720d2607aacb 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -92,6 +92,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "30BF")
 		}
 	},
+	{
+		.callback	= dmi_io_delay_0xed_port,
+		.ident		= "Presario F700",
+		.matches	= {
+			DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
+			DMI_MATCH(DMI_BOARD_NAME, "30D3")
+		}
+	},
 	{ }
 };
 
@@ -103,6 +111,9 @@ void __init io_delay_init(void)
 
 static int __init io_delay_param(char *s)
 {
+	if (!s)
+		return -EINVAL;
+
 	if (!strcmp(s, "0x80"))
 		io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
 	else if (!strcmp(s, "0xed"))
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 50e5e4a31c85..191914302744 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/thread_info.h>
 #include <linux/syscalls.h>
+#include <asm/syscalls.h>
 
 /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
 static void set_bitmap(unsigned long *bitmap, unsigned int base,
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index 9d98cda39ad9..f1c688e46f35 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -20,6 +20,8 @@
 
 #ifdef CONFIG_X86_32
 #include <mach_apic.h>
+#include <mach_ipi.h>
+
 /*
  * the following functions deal with sending IPIs between CPUs.
  *
@@ -70,7 +72,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
 	/*
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 */
-	apic_write_around(APIC_ICR, cfg);
+	apic_write(APIC_ICR, cfg);
 }
 
 void send_IPI_self(int vector)
@@ -98,7 +100,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
 	 * prepare target chip field
 	 */
 	cfg = __prepare_ICR2(mask);
-	apic_write_around(APIC_ICR2, cfg);
+	apic_write(APIC_ICR2, cfg);
 
 	/*
 	 * program the ICR
@@ -108,7 +110,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
 	/*
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 */
-	apic_write_around(APIC_ICR, cfg);
+	apic_write(APIC_ICR, cfg);
 }
 
 /*
@@ -147,7 +149,6 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
 }
 
 /* must come after the send_IPI functions above for inlining */
-#include <mach_ipi.h>
 static int convert_apicid_to_cpu(int apic_id)
 {
 	int i;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
new file mode 100644
index 000000000000..d1d4dc52f649
--- /dev/null
+++ b/arch/x86/kernel/irq.c
@@ -0,0 +1,189 @@
+/*
+ * Common interrupt code for 32 and 64 bit
+ */
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/seq_file.h>
+
+#include <asm/apic.h>
+#include <asm/io_apic.h>
+#include <asm/smp.h>
+
+atomic_t irq_err_count;
+
+/*
+ * 'what should we do if we get a hw irq event on an illegal vector'.
+ * each architecture has to answer this themselves.
+ */
+void ack_bad_irq(unsigned int irq)
+{
+	printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	/*
+	 * Currently unexpected vectors happen only on SMP and APIC.
+	 * We _must_ ack these because every local APIC has only N
+	 * irq slots per priority level, and a 'hanging, unacked' IRQ
+	 * holds up an irq slot - in excessive cases (when multiple
+	 * unexpected vectors occur) that might lock up the APIC
+	 * completely.
+	 * But only ack when the APIC is enabled -AK
+	 */
+	if (cpu_has_apic)
+		ack_APIC_irq();
+#endif
+}
+
+#ifdef CONFIG_X86_32
+# define irq_stats(x)		(&per_cpu(irq_stat, x))
+#else
+# define irq_stats(x)		cpu_pda(x)
+#endif
+/*
+ * /proc/interrupts printing:
+ */
+static int show_other_interrupts(struct seq_file *p)
+{
+	int j;
+
+	seq_printf(p, "NMI: ");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
+	seq_printf(p, "  Non-maskable interrupts\n");
+#ifdef CONFIG_X86_LOCAL_APIC
+	seq_printf(p, "LOC: ");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
+	seq_printf(p, "  Local timer interrupts\n");
+#endif
+#ifdef CONFIG_SMP
+	seq_printf(p, "RES: ");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
+	seq_printf(p, "  Rescheduling interrupts\n");
+	seq_printf(p, "CAL: ");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
+	seq_printf(p, "  Function call interrupts\n");
+	seq_printf(p, "TLB: ");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
+	seq_printf(p, "  TLB shootdowns\n");
+#endif
+#ifdef CONFIG_X86_MCE
+	seq_printf(p, "TRM: ");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
+	seq_printf(p, "  Thermal event interrupts\n");
+# ifdef CONFIG_X86_64
+	seq_printf(p, "THR: ");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
+	seq_printf(p, "  Threshold APIC interrupts\n");
+# endif
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+	seq_printf(p, "SPU: ");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
+	seq_printf(p, "  Spurious interrupts\n");
+#endif
+	seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+#if defined(CONFIG_X86_IO_APIC)
+	seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+#endif
+	return 0;
+}
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+	unsigned long flags, any_count = 0;
+	int i = *(loff_t *) v, j;
+	struct irqaction *action;
+	struct irq_desc *desc;
+
+	if (i > nr_irqs)
+		return 0;
+
+	if (i == nr_irqs)
+		return show_other_interrupts(p);
+
+	/* print header */
+	if (i == 0) {
+		seq_printf(p, "           ");
+		for_each_online_cpu(j)
+			seq_printf(p, "CPU%-8d", j);
+		seq_putc(p, '\n');
+	}
+
+	desc = irq_to_desc(i);
+	spin_lock_irqsave(&desc->lock, flags);
+#ifndef CONFIG_SMP
+	any_count = kstat_irqs(i);
+#else
+	for_each_online_cpu(j)
+		any_count |= kstat_irqs_cpu(i, j);
+#endif
+	action = desc->action;
+	if (!action && !any_count)
+		goto out;
+
+	seq_printf(p, "%3d: ", i);
+#ifndef CONFIG_SMP
+	seq_printf(p, "%10u ", kstat_irqs(i));
+#else
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
+#endif
+	seq_printf(p, " %8s", desc->chip->name);
+	seq_printf(p, "-%-8s", desc->name);
+
+	if (action) {
+		seq_printf(p, "  %s", action->name);
+		while ((action = action->next) != NULL)
+			seq_printf(p, ", %s", action->name);
+	}
+
+	seq_putc(p, '\n');
+out:
+	spin_unlock_irqrestore(&desc->lock, flags);
+	return 0;
+}
+
+/*
+ * /proc/stat helpers
+ */
+u64 arch_irq_stat_cpu(unsigned int cpu)
+{
+	u64 sum = irq_stats(cpu)->__nmi_count;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	sum += irq_stats(cpu)->apic_timer_irqs;
+#endif
+#ifdef CONFIG_SMP
+	sum += irq_stats(cpu)->irq_resched_count;
+	sum += irq_stats(cpu)->irq_call_count;
+	sum += irq_stats(cpu)->irq_tlb_count;
+#endif
+#ifdef CONFIG_X86_MCE
+	sum += irq_stats(cpu)->irq_thermal_count;
+# ifdef CONFIG_X86_64
+	sum += irq_stats(cpu)->irq_threshold_count;
+#endif
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+	sum += irq_stats(cpu)->irq_spurious_count;
+#endif
+	return sum;
+}
+
+u64 arch_irq_stat(void)
+{
+	u64 sum = atomic_read(&irq_err_count);
+
+#ifdef CONFIG_X86_IO_APIC
+	sum += atomic_read(&irq_mis_count);
+#endif
+	return sum;
+}
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 47a6f6f12478..a51382672de0 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -25,29 +25,6 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
 DEFINE_PER_CPU(struct pt_regs *, irq_regs);
 EXPORT_PER_CPU_SYMBOL(irq_regs);
 
-/*
- * 'what should we do if we get a hw irq event on an illegal vector'.
- * each architecture has to answer this themselves.
- */
-void ack_bad_irq(unsigned int irq)
-{
-	printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	/*
-	 * Currently unexpected vectors happen only on SMP and APIC.
-	 * We _must_ ack these because every local APIC has only N
-	 * irq slots per priority level, and a 'hanging, unacked' IRQ
-	 * holds up an irq slot - in excessive cases (when multiple
-	 * unexpected vectors occur) that might lock up the APIC
-	 * completely.
-	 * But only ack when the APIC is enabled -AK
-	 */
-	if (cpu_has_apic)
-		ack_APIC_irq();
-#endif
-}
-
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 /* Debugging check for stack overflow: is there less than 1KB free? */
 static int check_stack_overflow(void)
@@ -83,11 +60,8 @@ union irq_ctx {
 static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
 static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
 
-static char softirq_stack[NR_CPUS * THREAD_SIZE]
-		__attribute__((__section__(".bss.page_aligned")));
-
-static char hardirq_stack[NR_CPUS * THREAD_SIZE]
-		__attribute__((__section__(".bss.page_aligned")));
+static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
 
 static void call_on_stack(void *func, void *stack)
 {
@@ -226,20 +200,25 @@ unsigned int do_IRQ(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs;
 	/* high bit used in ret_from_ code */
-	int overflow, irq = ~regs->orig_ax;
-	struct irq_desc *desc = irq_desc + irq;
+	int overflow;
+	unsigned vector = ~regs->orig_ax;
+	struct irq_desc *desc;
+	unsigned irq;
 
-	if (unlikely((unsigned)irq >= NR_IRQS)) {
-		printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
-					__func__, irq);
-		BUG();
-	}
 
 	old_regs = set_irq_regs(regs);
 	irq_enter();
+	irq = __get_cpu_var(vector_irq)[vector];
 
 	overflow = check_stack_overflow();
 
+	desc = irq_to_desc(irq);
+	if (unlikely(!desc)) {
+		printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n",
+					__func__, irq, vector, smp_processor_id());
+		BUG();
+	}
+
 	if (!execute_on_irq_stack(overflow, desc, irq)) {
 		if (unlikely(overflow))
 			print_stack_overflow();
@@ -251,146 +230,6 @@ unsigned int do_IRQ(struct pt_regs *regs)
 	return 1;
 }
 
-/*
- * Interrupt statistics:
- */
-
-atomic_t irq_err_count;
-
-/*
- * /proc/interrupts printing:
- */
-
-int show_interrupts(struct seq_file *p, void *v)
-{
-	int i = *(loff_t *) v, j;
-	struct irqaction * action;
-	unsigned long flags;
-
-	if (i == 0) {
-		seq_printf(p, "           ");
-		for_each_online_cpu(j)
-			seq_printf(p, "CPU%-8d",j);
-		seq_putc(p, '\n');
-	}
-
-	if (i < NR_IRQS) {
-		unsigned any_count = 0;
-
-		spin_lock_irqsave(&irq_desc[i].lock, flags);
-#ifndef CONFIG_SMP
-		any_count = kstat_irqs(i);
-#else
-		for_each_online_cpu(j)
-			any_count |= kstat_cpu(j).irqs[i];
-#endif
-		action = irq_desc[i].action;
-		if (!action && !any_count)
-			goto skip;
-		seq_printf(p, "%3d: ",i);
-#ifndef CONFIG_SMP
-		seq_printf(p, "%10u ", kstat_irqs(i));
-#else
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
-#endif
-		seq_printf(p, " %8s", irq_desc[i].chip->name);
-		seq_printf(p, "-%-8s", irq_desc[i].name);
-
-		if (action) {
-			seq_printf(p, "  %s", action->name);
-			while ((action = action->next) != NULL)
-				seq_printf(p, ", %s", action->name);
-		}
-
-		seq_putc(p, '\n');
-skip:
-		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
-	} else if (i == NR_IRQS) {
-		seq_printf(p, "NMI: ");
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", nmi_count(j));
-		seq_printf(p, "  Non-maskable interrupts\n");
-#ifdef CONFIG_X86_LOCAL_APIC
-		seq_printf(p, "LOC: ");
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ",
-				per_cpu(irq_stat,j).apic_timer_irqs);
-		seq_printf(p, "  Local timer interrupts\n");
-#endif
-#ifdef CONFIG_SMP
-		seq_printf(p, "RES: ");
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ",
-				per_cpu(irq_stat,j).irq_resched_count);
-		seq_printf(p, "  Rescheduling interrupts\n");
-		seq_printf(p, "CAL: ");
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ",
-				per_cpu(irq_stat,j).irq_call_count);
-		seq_printf(p, "  function call interrupts\n");
-		seq_printf(p, "TLB: ");
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ",
-				per_cpu(irq_stat,j).irq_tlb_count);
-		seq_printf(p, "  TLB shootdowns\n");
-#endif
-#ifdef CONFIG_X86_MCE
-		seq_printf(p, "TRM: ");
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ",
-				per_cpu(irq_stat,j).irq_thermal_count);
-		seq_printf(p, "  Thermal event interrupts\n");
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
-		seq_printf(p, "SPU: ");
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ",
-				per_cpu(irq_stat,j).irq_spurious_count);
-		seq_printf(p, "  Spurious interrupts\n");
-#endif
-		seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
-#if defined(CONFIG_X86_IO_APIC)
-		seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
-#endif
-	}
-	return 0;
-}
-
-/*
- * /proc/stat helpers
- */
-u64 arch_irq_stat_cpu(unsigned int cpu)
-{
-	u64 sum = nmi_count(cpu);
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
-#endif
-#ifdef CONFIG_SMP
-	sum += per_cpu(irq_stat, cpu).irq_resched_count;
-	sum += per_cpu(irq_stat, cpu).irq_call_count;
-	sum += per_cpu(irq_stat, cpu).irq_tlb_count;
-#endif
-#ifdef CONFIG_X86_MCE
-	sum += per_cpu(irq_stat, cpu).irq_thermal_count;
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
-	sum += per_cpu(irq_stat, cpu).irq_spurious_count;
-#endif
-	return sum;
-}
-
-u64 arch_irq_stat(void)
-{
-	u64 sum = atomic_read(&irq_err_count);
-
-#ifdef CONFIG_X86_IO_APIC
-	sum += atomic_read(&irq_mis_count);
-#endif
-	return sum;
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 #include <mach_apic.h>
 
@@ -398,20 +237,22 @@ void fixup_irqs(cpumask_t map)
 {
 	unsigned int irq;
 	static int warned;
+	struct irq_desc *desc;
 
-	for (irq = 0; irq < NR_IRQS; irq++) {
+	for_each_irq_desc(irq, desc) {
 		cpumask_t mask;
+
 		if (irq == 2)
 			continue;
 
-		cpus_and(mask, irq_desc[irq].affinity, map);
+		cpus_and(mask, desc->affinity, map);
 		if (any_online_cpu(mask) == NR_CPUS) {
 			printk("Breaking affinity for irq %i\n", irq);
 			mask = map;
 		}
-		if (irq_desc[irq].chip->set_affinity)
-			irq_desc[irq].chip->set_affinity(irq, mask);
-		else if (irq_desc[irq].action && !(warned++))
+		if (desc->chip->set_affinity)
+			desc->chip->set_affinity(irq, mask);
+		else if (desc->action && !(warned++))
 			printk("Cannot set affinity for irq %i\n", irq);
 	}
 
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1f78b238d8d2..60eb84eb77a0 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,28 +18,6 @@
 #include <asm/idle.h>
 #include <asm/smp.h>
 
-atomic_t irq_err_count;
-
-/*
- * 'what should we do if we get a hw irq event on an illegal vector'.
- * each architecture has to answer this themselves.
- */
-void ack_bad_irq(unsigned int irq)
-{
-	printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq);
-	/*
-	 * Currently unexpected vectors happen only on SMP and APIC.
-	 * We _must_ ack these because every local APIC has only N
-	 * irq slots per priority level, and a 'hanging, unacked' IRQ
-	 * holds up an irq slot - in excessive cases (when multiple
-	 * unexpected vectors occur) that might lock up the APIC
-	 * completely.
-	 * But don't ack when the APIC is disabled. -AK
-	 */
-	if (!disable_apic)
-		ack_APIC_irq();
-}
-
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 /*
  * Probabilistic stack overflow check:
@@ -65,122 +43,6 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 #endif
 
 /*
- * Generic, controller-independent functions:
- */
-
-int show_interrupts(struct seq_file *p, void *v)
-{
-	int i = *(loff_t *) v, j;
-	struct irqaction * action;
-	unsigned long flags;
-
-	if (i == 0) {
-		seq_printf(p, "           ");
-		for_each_online_cpu(j)
-			seq_printf(p, "CPU%-8d",j);
-		seq_putc(p, '\n');
-	}
-
-	if (i < NR_IRQS) {
-		unsigned any_count = 0;
-
-		spin_lock_irqsave(&irq_desc[i].lock, flags);
-#ifndef CONFIG_SMP
-		any_count = kstat_irqs(i);
-#else
-		for_each_online_cpu(j)
-			any_count |= kstat_cpu(j).irqs[i];
-#endif
-		action = irq_desc[i].action;
-		if (!action && !any_count)
-			goto skip;
-		seq_printf(p, "%3d: ",i);
-#ifndef CONFIG_SMP
-		seq_printf(p, "%10u ", kstat_irqs(i));
-#else
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
-#endif
-		seq_printf(p, " %8s", irq_desc[i].chip->name);
-		seq_printf(p, "-%-8s", irq_desc[i].name);
-
-		if (action) {
-			seq_printf(p, "  %s", action->name);
-			while ((action = action->next) != NULL)
-				seq_printf(p, ", %s", action->name);
-		}
-		seq_putc(p, '\n');
-skip:
-		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
-	} else if (i == NR_IRQS) {
-		seq_printf(p, "NMI: ");
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
-		seq_printf(p, "  Non-maskable interrupts\n");
-		seq_printf(p, "LOC: ");
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
-		seq_printf(p, "  Local timer interrupts\n");
-#ifdef CONFIG_SMP
-		seq_printf(p, "RES: ");
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count);
-		seq_printf(p, "  Rescheduling interrupts\n");
-		seq_printf(p, "CAL: ");
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
-		seq_printf(p, "  function call interrupts\n");
-		seq_printf(p, "TLB: ");
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
-		seq_printf(p, "  TLB shootdowns\n");
-#endif
-#ifdef CONFIG_X86_MCE
-		seq_printf(p, "TRM: ");
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
-		seq_printf(p, "  Thermal event interrupts\n");
-		seq_printf(p, "THR: ");
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
-		seq_printf(p, "  Threshold APIC interrupts\n");
-#endif
-		seq_printf(p, "SPU: ");
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
-		seq_printf(p, "  Spurious interrupts\n");
-		seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
-	}
-	return 0;
-}
-
-/*
- * /proc/stat helpers
- */
-u64 arch_irq_stat_cpu(unsigned int cpu)
-{
-	u64 sum = cpu_pda(cpu)->__nmi_count;
-
-	sum += cpu_pda(cpu)->apic_timer_irqs;
-#ifdef CONFIG_SMP
-	sum += cpu_pda(cpu)->irq_resched_count;
-	sum += cpu_pda(cpu)->irq_call_count;
-	sum += cpu_pda(cpu)->irq_tlb_count;
-#endif
-#ifdef CONFIG_X86_MCE
-	sum += cpu_pda(cpu)->irq_thermal_count;
-	sum += cpu_pda(cpu)->irq_threshold_count;
-#endif
-	sum += cpu_pda(cpu)->irq_spurious_count;
-	return sum;
-}
-
-u64 arch_irq_stat(void)
-{
-	return atomic_read(&irq_err_count);
-}
-
-/*
  * do_IRQ handles all normal device IRQ's (the special
  * SMP cross-CPU interrupts have their own specific
  * handlers).
@@ -188,6 +50,7 @@ u64 arch_irq_stat(void)
 asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
+	struct irq_desc *desc;
 
 	/* high bit used in ret_from_ code  */
 	unsigned vector = ~regs->orig_ax;
@@ -201,8 +64,9 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
 	stack_overflow_check(regs);
 #endif
 
-	if (likely(irq < NR_IRQS))
-		generic_handle_irq(irq);
+	desc = irq_to_desc(irq);
+	if (likely(desc))
+		generic_handle_irq_desc(irq, desc);
 	else {
 		if (!disable_apic)
 			ack_APIC_irq();
@@ -223,8 +87,9 @@ void fixup_irqs(cpumask_t map)
 {
 	unsigned int irq;
 	static int warned;
+	struct irq_desc *desc;
 
-	for (irq = 0; irq < NR_IRQS; irq++) {
+	for_each_irq_desc(irq, desc) {
 		cpumask_t mask;
 		int break_affinity = 0;
 		int set_affinity = 1;
@@ -233,32 +98,32 @@ void fixup_irqs(cpumask_t map)
 			continue;
 
 		/* interrupt's are disabled at this point */
-		spin_lock(&irq_desc[irq].lock);
+		spin_lock(&desc->lock);
 
 		if (!irq_has_action(irq) ||
-		    cpus_equal(irq_desc[irq].affinity, map)) {
-			spin_unlock(&irq_desc[irq].lock);
+		    cpus_equal(desc->affinity, map)) {
+			spin_unlock(&desc->lock);
 			continue;
 		}
 
-		cpus_and(mask, irq_desc[irq].affinity, map);
+		cpus_and(mask, desc->affinity, map);
 		if (cpus_empty(mask)) {
 			break_affinity = 1;
 			mask = map;
 		}
 
-		if (irq_desc[irq].chip->mask)
-			irq_desc[irq].chip->mask(irq);
+		if (desc->chip->mask)
+			desc->chip->mask(irq);
 
-		if (irq_desc[irq].chip->set_affinity)
-			irq_desc[irq].chip->set_affinity(irq, mask);
+		if (desc->chip->set_affinity)
+			desc->chip->set_affinity(irq, mask);
 		else if (!(warned++))
 			set_affinity = 0;
 
-		if (irq_desc[irq].chip->unmask)
-			irq_desc[irq].chip->unmask(irq);
+		if (desc->chip->unmask)
+			desc->chip->unmask(irq);
 
-		spin_unlock(&irq_desc[irq].lock);
+		spin_unlock(&desc->lock);
 
 		if (break_affinity && set_affinity)
 			printk("Broke affinity for irq %i\n", irq);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index d66914287ee1..845aa9803e80 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -69,11 +69,48 @@ void __init init_ISA_irqs (void)
 	 * 16 old-style INTA-cycle interrupts:
 	 */
 	for (i = 0; i < 16; i++) {
+		/* first time call this irq_desc */
+		struct irq_desc *desc = irq_to_desc(i);
+
+		desc->status = IRQ_DISABLED;
+		desc->action = NULL;
+		desc->depth = 1;
+
 		set_irq_chip_and_handler_name(i, &i8259A_chip,
 					      handle_level_irq, "XT");
 	}
 }
 
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
+static struct irqaction irq2 = {
+	.handler = no_action,
+	.mask = CPU_MASK_NONE,
+	.name = "cascade",
+};
+
+DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+	[0 ... IRQ0_VECTOR - 1] = -1,
+	[IRQ0_VECTOR] = 0,
+	[IRQ1_VECTOR] = 1,
+	[IRQ2_VECTOR] = 2,
+	[IRQ3_VECTOR] = 3,
+	[IRQ4_VECTOR] = 4,
+	[IRQ5_VECTOR] = 5,
+	[IRQ6_VECTOR] = 6,
+	[IRQ7_VECTOR] = 7,
+	[IRQ8_VECTOR] = 8,
+	[IRQ9_VECTOR] = 9,
+	[IRQ10_VECTOR] = 10,
+	[IRQ11_VECTOR] = 11,
+	[IRQ12_VECTOR] = 12,
+	[IRQ13_VECTOR] = 13,
+	[IRQ14_VECTOR] = 14,
+	[IRQ15_VECTOR] = 15,
+	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
+};
+
 /* Overridden in paravirt.c */
 void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 
@@ -89,15 +126,50 @@ void __init native_init_IRQ(void)
 	 * us. (some of these will be overridden and become
 	 * 'special' SMP interrupts)
 	 */
-	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
-		int vector = FIRST_EXTERNAL_VECTOR + i;
-		if (i >= NR_IRQS)
-			break;
+	for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
 		/* SYSCALL_VECTOR was reserved in trap_init. */
-		if (!test_bit(vector, used_vectors))
-			set_intr_gate(vector, interrupt[i]);
+		if (i != SYSCALL_VECTOR)
+			set_intr_gate(i, interrupt[i]);
 	}
 
+
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
+	/*
+	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+	 * IPI, driven by wakeup.
+	 */
+	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+	/* IPI for invalidation */
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+
+	/* IPI for generic function call */
+	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+
+	/* IPI for single call function */
+	set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
+
+	/* Low priority IPI to cleanup after moving an irq */
+	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	/* self generated IPI for local APIC timer */
+	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+
+	/* IPI vectors for APIC spurious and error interrupts */
+	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+#endif
+
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
+	/* thermal monitor LVT interrupt */
+	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+
+	if (!acpi_ioapic)
+		setup_irq(2, &irq2);
+
 	/* setup after call gates are initialised (usually add in
 	 * the architecture specific gates)
 	 */
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 0373e88de95a..ff0235391285 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -43,10 +43,11 @@
 
 #define BUILD_IRQ(nr)				\
 	asmlinkage void IRQ_NAME(nr);		\
-	asm("\n.p2align\n"			\
+	asm("\n.text\n.p2align\n"		\
 	    "IRQ" #nr "_interrupt:\n\t"		\
 	    "push $~(" #nr ") ; "		\
-	    "jmp common_interrupt");
+	    "jmp common_interrupt\n"		\
+	    ".previous");
 
 #define BI(x,y) \
 	BUILD_IRQ(x##y)
@@ -134,51 +135,33 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
 	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
 };
 
-static void __init init_ISA_irqs (void)
+void __init init_ISA_irqs(void)
 {
 	int i;
 
 	init_bsp_APIC();
 	init_8259A(0);
 
-	for (i = 0; i < NR_IRQS; i++) {
-		irq_desc[i].status = IRQ_DISABLED;
-		irq_desc[i].action = NULL;
-		irq_desc[i].depth = 1;
+	for (i = 0; i < 16; i++) {
+		/* first time call this irq_desc */
+		struct irq_desc *desc = irq_to_desc(i);
 
-		if (i < 16) {
-			/*
-			 * 16 old-style INTA-cycle interrupts:
-			 */
-			set_irq_chip_and_handler_name(i, &i8259A_chip,
+		desc->status = IRQ_DISABLED;
+		desc->action = NULL;
+		desc->depth = 1;
+
+		/*
+		 * 16 old-style INTA-cycle interrupts:
+		 */
+		set_irq_chip_and_handler_name(i, &i8259A_chip,
 						      handle_level_irq, "XT");
-		} else {
-			/*
-			 * 'high' PCI IRQs filled in on demand
-			 */
-			irq_desc[i].chip = &no_irq_chip;
-		}
 	}
 }
 
 void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 
-void __init native_init_IRQ(void)
+static void __init smp_intr_init(void)
 {
-	int i;
-
-	init_ISA_irqs();
-	/*
-	 * Cover the whole vector space, no vector can escape
-	 * us. (some of these will be overridden and become
-	 * 'special' SMP interrupts)
-	 */
-	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
-		int vector = FIRST_EXTERNAL_VECTOR + i;
-		if (vector != IA32_SYSCALL_VECTOR)
-			set_intr_gate(vector, interrupt[i]);
-	}
-
 #ifdef CONFIG_SMP
 	/*
 	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
@@ -206,6 +189,12 @@ void __init native_init_IRQ(void)
 	/* Low priority IPI to cleanup after moving an irq */
 	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
 #endif
+}
+
+static void __init apic_intr_init(void)
+{
+	smp_intr_init();
+
 	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
 
@@ -215,6 +204,25 @@ void __init native_init_IRQ(void)
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+}
+
+void __init native_init_IRQ(void)
+{
+	int i;
+
+	init_ISA_irqs();
+	/*
+	 * Cover the whole vector space, no vector can escape
+	 * us. (some of these will be overridden and become
+	 * 'special' SMP interrupts)
+	 */
+	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		if (vector != IA32_SYSCALL_VECTOR)
+			set_intr_gate(vector, interrupt[i]);
+	}
+
+	apic_intr_init();
 
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index 7377ccb21335..304d8bad6559 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -16,8 +16,9 @@ EXPORT_SYMBOL(num_k8_northbridges);
 static u32 *flush_words;
 
 struct pci_device_id k8_nb_ids[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
 	{}
 };
 EXPORT_SYMBOL(k8_nb_ids);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index c03205991718..ff7d3b0124f1 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -12,9 +12,13 @@
 #include <linux/init.h>
 #include <linux/io.h>
 #include <linux/mm.h>
+#include <linux/module.h>
 
 #include <asm/setup.h>
 
+struct dentry *arch_debugfs_dir;
+EXPORT_SYMBOL(arch_debugfs_dir);
+
 #ifdef CONFIG_DEBUG_BOOT_PARAMS
 struct setup_data_node {
 	u64 paddr;
@@ -135,6 +139,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
 		if (PageHighMem(pg)) {
 			data = ioremap_cache(pa_data, sizeof(*data));
 			if (!data) {
+				kfree(node);
 				error = -ENXIO;
 				goto err_dir;
 			}
@@ -209,6 +214,10 @@ static int __init arch_kdebugfs_init(void)
 {
 	int error = 0;
 
+	arch_debugfs_dir = debugfs_create_dir("x86", NULL);
+	if (!arch_debugfs_dir)
+		return -ENOMEM;
+
 #ifdef CONFIG_DEBUG_BOOT_PARAMS
 	error = boot_params_kdebugfs_init();
 #endif
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f47f0eb886b8..10435a120d22 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -69,6 +69,9 @@ static int gdb_x86vector = -1;
  */
 void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
 {
+#ifndef CONFIG_X86_32
+	u32 *gdb_regs32 = (u32 *)gdb_regs;
+#endif
 	gdb_regs[GDB_AX]	= regs->ax;
 	gdb_regs[GDB_BX]	= regs->bx;
 	gdb_regs[GDB_CX]	= regs->cx;
@@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
 	gdb_regs[GDB_SI]	= regs->si;
 	gdb_regs[GDB_DI]	= regs->di;
 	gdb_regs[GDB_BP]	= regs->bp;
-	gdb_regs[GDB_PS]	= regs->flags;
 	gdb_regs[GDB_PC]	= regs->ip;
 #ifdef CONFIG_X86_32
+	gdb_regs[GDB_PS]	= regs->flags;
 	gdb_regs[GDB_DS]	= regs->ds;
 	gdb_regs[GDB_ES]	= regs->es;
 	gdb_regs[GDB_CS]	= regs->cs;
@@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
 	gdb_regs[GDB_R13]	= regs->r13;
 	gdb_regs[GDB_R14]	= regs->r14;
 	gdb_regs[GDB_R15]	= regs->r15;
+	gdb_regs32[GDB_PS]	= regs->flags;
+	gdb_regs32[GDB_CS]	= regs->cs;
+	gdb_regs32[GDB_SS]	= regs->ss;
 #endif
 	gdb_regs[GDB_SP]	= regs->sp;
 }
@@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
  */
 void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
 {
+#ifndef CONFIG_X86_32
+	u32 *gdb_regs32 = (u32 *)gdb_regs;
+#endif
 	gdb_regs[GDB_AX]	= 0;
 	gdb_regs[GDB_BX]	= 0;
 	gdb_regs[GDB_CX]	= 0;
@@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
 	gdb_regs[GDB_FS]	= 0xFFFF;
 	gdb_regs[GDB_GS]	= 0xFFFF;
 #else
-	gdb_regs[GDB_PS]	= *(unsigned long *)(p->thread.sp + 8);
-	gdb_regs[GDB_PC]	= 0;
+	gdb_regs32[GDB_PS]	= *(unsigned long *)(p->thread.sp + 8);
+	gdb_regs32[GDB_CS]	= __KERNEL_CS;
+	gdb_regs32[GDB_SS]	= __KERNEL_DS;
+	gdb_regs[GDB_PC]	= p->thread.ip;
 	gdb_regs[GDB_R8]	= 0;
 	gdb_regs[GDB_R9]	= 0;
 	gdb_regs[GDB_R10]	= 0;
@@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
  */
 void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
 {
+#ifndef CONFIG_X86_32
+	u32 *gdb_regs32 = (u32 *)gdb_regs;
+#endif
 	regs->ax		= gdb_regs[GDB_AX];
 	regs->bx		= gdb_regs[GDB_BX];
 	regs->cx		= gdb_regs[GDB_CX];
@@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
 	regs->si		= gdb_regs[GDB_SI];
 	regs->di		= gdb_regs[GDB_DI];
 	regs->bp		= gdb_regs[GDB_BP];
-	regs->flags		= gdb_regs[GDB_PS];
 	regs->ip		= gdb_regs[GDB_PC];
 #ifdef CONFIG_X86_32
+	regs->flags		= gdb_regs[GDB_PS];
 	regs->ds		= gdb_regs[GDB_DS];
 	regs->es		= gdb_regs[GDB_ES];
 	regs->cs		= gdb_regs[GDB_CS];
@@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
 	regs->r13		= gdb_regs[GDB_R13];
 	regs->r14		= gdb_regs[GDB_R14];
 	regs->r15		= gdb_regs[GDB_R15];
+	regs->flags		= gdb_regs32[GDB_PS];
+	regs->cs		= gdb_regs32[GDB_CS];
+	regs->ss		= gdb_regs32[GDB_SS];
 #endif
 }
 
@@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
 		if (remcomInBuffer[0] == 's') {
 			linux_regs->flags |= X86_EFLAGS_TF;
 			kgdb_single_step = 1;
-			if (kgdb_contthread) {
-				atomic_set(&kgdb_cpu_doing_single_step,
-					   raw_smp_processor_id());
-			}
+			atomic_set(&kgdb_cpu_doing_single_step,
+				   raw_smp_processor_id());
 		}
 
 		get_debugreg(dr6, 6);
@@ -440,12 +455,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
 		return NOTIFY_DONE;
 
 	case DIE_NMI_IPI:
-		if (atomic_read(&kgdb_active) != -1) {
-			/* KGDB CPU roundup */
-			kgdb_nmicallback(raw_smp_processor_id(), regs);
-			was_in_debug_nmi[raw_smp_processor_id()] = 1;
-			touch_nmi_watchdog();
-		}
+		/* Just ignore, we will handle the roundup on DIE_NMI. */
 		return NOTIFY_DONE;
 
 	case DIE_NMIUNKNOWN:
@@ -466,9 +476,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
 
 	case DIE_DEBUG:
 		if (atomic_read(&kgdb_cpu_doing_single_step) ==
-			raw_smp_processor_id() &&
-			user_mode(regs))
-			return single_step_cont(regs, args);
+		    raw_smp_processor_id()) {
+			if (user_mode(regs))
+				return single_step_cont(regs, args);
+			break;
+		} else if (test_thread_flag(TIF_SINGLESTEP))
+			/* This means a user thread is single stepping
+			 * a system call which should be ignored
+			 */
+			return NOTIFY_DONE;
 		/* fall through */
 	default:
 		if (user_mode(regs))
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b8c6743a13da..6c27679ec6aa 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -431,7 +431,6 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 		regs->ip = (unsigned long)p->ainsn.insn;
 }
 
-/* Called with kretprobe_lock held */
 void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
 				      struct pt_regs *regs)
 {
@@ -682,8 +681,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
 	unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
 
 	INIT_HLIST_HEAD(&empty_rp);
-	spin_lock_irqsave(&kretprobe_lock, flags);
-	head = kretprobe_inst_table_head(current);
+	kretprobe_hash_lock(current, &head, &flags);
 	/* fixup registers */
 #ifdef CONFIG_X86_64
 	regs->cs = __KERNEL_CS;
@@ -732,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
 
 	kretprobe_assert(ri, orig_ret_address, trampoline_address);
 
-	spin_unlock_irqrestore(&kretprobe_lock, flags);
+	kretprobe_hash_unlock(current, &flags);
 
 	hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
 		hlist_del(&ri->hlist);
@@ -860,7 +858,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs)
 
 	resume_execution(cur, regs, kcb);
 	regs->flags |= kcb->kprobe_saved_flags;
-	trace_hardirqs_fixup_flags(regs->flags);
 
 	if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
 		kcb->kprobe_status = KPROBE_HIT_SSDONE;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8b7a3cf37d2b..478bca986eca 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -178,7 +178,7 @@ static void kvm_flush_tlb(void)
 	kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
 }
 
-static void kvm_release_pt(u32 pfn)
+static void kvm_release_pt(unsigned long pfn)
 {
 	struct kvm_mmu_op_release_pt rpt = {
 		.header.op = KVM_MMU_OP_RELEASE_PT,
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 87edf1ceb1df..774ac4991568 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void)
 	return ret;
 }
 
+/*
+ * If we don't do that, there is the possibility that the guest
+ * will calibrate under heavy load - thus, getting a lower lpj -
+ * and execute the delays themselves without load. This is wrong,
+ * because no delay loop can finish beforehand.
+ * Any heuristics is subject to fail, because ultimately, a large
+ * poll of guests can be running and trouble each other. So we preset
+ * lpj here
+ */
+static unsigned long kvm_get_tsc_khz(void)
+{
+	return preset_lpj;
+}
+
+static void kvm_get_preset_lpj(void)
+{
+	struct pvclock_vcpu_time_info *src;
+	unsigned long khz;
+	u64 lpj;
+
+	src = &per_cpu(hv_clock, 0);
+	khz = pvclock_tsc_khz(src);
+
+	lpj = ((u64)khz * 1000);
+	do_div(lpj, HZ);
+	preset_lpj = lpj;
+}
+
 static struct clocksource kvm_clock = {
 	.name = "kvm-clock",
 	.read = kvm_clock_read,
@@ -113,7 +141,7 @@ static void kvm_setup_secondary_clock(void)
 #endif
 
 #ifdef CONFIG_SMP
-void __init kvm_smp_prepare_boot_cpu(void)
+static void __init kvm_smp_prepare_boot_cpu(void)
 {
 	WARN_ON(kvm_register_clock("primary cpu clock"));
 	native_smp_prepare_boot_cpu();
@@ -153,6 +181,7 @@ void __init kvmclock_init(void)
 		pv_time_ops.get_wallclock = kvm_get_wallclock;
 		pv_time_ops.set_wallclock = kvm_set_wallclock;
 		pv_time_ops.sched_clock = kvm_clock_read;
+		pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
 #ifdef CONFIG_X86_LOCAL_APIC
 		pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
 #endif
@@ -163,6 +192,7 @@ void __init kvmclock_init(void)
 #ifdef CONFIG_KEXEC
 		machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
+		kvm_get_preset_lpj();
 		clocksource_register(&kvm_clock);
 	}
 }
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a8449571858a..eee32b43fee3 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -18,6 +18,7 @@
 #include <asm/ldt.h>
 #include <asm/desc.h>
 #include <asm/mmu_context.h>
+#include <asm/syscalls.h>
 
 #ifdef CONFIG_SMP
 static void flush_ldt(void *current_mm)
@@ -51,6 +52,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 	memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
 	       (mincount - oldsize) * LDT_ENTRY_SIZE);
 
+	paravirt_alloc_ldt(newldt, mincount);
+
 #ifdef CONFIG_X86_64
 	/* CHECKME: Do we really need this ? */
 	wmb();
@@ -62,12 +65,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 
 	if (reload) {
 #ifdef CONFIG_SMP
-		cpumask_t mask;
-
 		preempt_disable();
 		load_LDT(pc);
-		mask = cpumask_of_cpu(smp_processor_id());
-		if (!cpus_equal(current->mm->cpu_vm_mask, mask))
+		if (!cpus_equal(current->mm->cpu_vm_mask,
+				cpumask_of_cpu(smp_processor_id())))
 			smp_call_function(flush_ldt, current->mm, 1);
 		preempt_enable();
 #else
@@ -75,6 +76,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 #endif
 	}
 	if (oldsize) {
+		paravirt_free_ldt(oldldt, oldsize);
 		if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
 			vfree(oldldt);
 		else
@@ -86,10 +88,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
 {
 	int err = alloc_ldt(new, old->size, 0);
+	int i;
 
 	if (err < 0)
 		return err;
-	memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE);
+
+	for(i = 0; i < old->size; i++)
+		write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
 	return 0;
 }
 
@@ -126,6 +131,7 @@ void destroy_context(struct mm_struct *mm)
 		if (mm == current->active_mm)
 			clear_LDT();
 #endif
+		paravirt_free_ldt(mm->context.ldt, mm->context.size);
 		if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
 			vfree(mm->context.ldt);
 		else
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8864230d55af..0732adba05ca 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/numa.h>
 #include <linux/ftrace.h>
+#include <linux/suspend.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -22,6 +23,7 @@
 #include <asm/cpufeature.h>
 #include <asm/desc.h>
 #include <asm/system.h>
+#include <asm/cacheflush.h>
 
 #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
 static u32 kexec_pgd[1024] PAGE_ALIGNED;
@@ -77,7 +79,7 @@ static void load_segments(void)
 /*
  * A architecture hook called to validate the
  * proposed image and prepare the control pages
- * as needed.  The pages for KEXEC_CONTROL_CODE_SIZE
+ * as needed.  The pages for KEXEC_CONTROL_PAGE_SIZE
  * have been allocated, but the segments have yet
  * been copied into the kernel.
  *
@@ -85,10 +87,12 @@ static void load_segments(void)
  * reboot code buffer to allow us to avoid allocations
  * later.
  *
- * Currently nothing.
+ * Make control page executable.
  */
 int machine_kexec_prepare(struct kimage *image)
 {
+	if (nx_enabled)
+		set_pages_x(image->control_code_page, 1);
 	return 0;
 }
 
@@ -98,27 +102,54 @@ int machine_kexec_prepare(struct kimage *image)
  */
 void machine_kexec_cleanup(struct kimage *image)
 {
+	if (nx_enabled)
+		set_pages_nx(image->control_code_page, 1);
 }
 
 /*
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
  */
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 	unsigned long page_list[PAGES_NR];
 	void *control_page;
+	int save_ftrace_enabled;
+	asmlinkage unsigned long
+		(*relocate_kernel_ptr)(unsigned long indirection_page,
+				       unsigned long control_page,
+				       unsigned long start_address,
+				       unsigned int has_pae,
+				       unsigned int preserve_context);
+
+#ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context)
+		save_processor_state();
+#endif
 
-	tracer_disable();
+	save_ftrace_enabled = __ftrace_enabled_save();
 
 	/* Interrupts aren't acceptable while we reboot */
 	local_irq_disable();
 
+	if (image->preserve_context) {
+#ifdef CONFIG_X86_IO_APIC
+		/* We need to put APICs in legacy mode so that we can
+		 * get timer interrupts in second kernel. kexec/kdump
+		 * paths already have calls to disable_IO_APIC() in
+		 * one form or other. kexec jump path also need
+		 * one.
+		 */
+		disable_IO_APIC();
+#endif
+	}
+
 	control_page = page_address(image->control_code_page);
-	memcpy(control_page, relocate_kernel, PAGE_SIZE);
+	memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
 
+	relocate_kernel_ptr = control_page;
 	page_list[PA_CONTROL_PAGE] = __pa(control_page);
-	page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
+	page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
 	page_list[PA_PGD] = __pa(kexec_pgd);
 	page_list[VA_PGD] = (unsigned long)kexec_pgd;
 #ifdef CONFIG_X86_PAE
@@ -131,6 +162,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
 	page_list[PA_PTE_1] = __pa(kexec_pte1);
 	page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
+	page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT);
 
 	/* The segment registers are funny things, they have both a
 	 * visible and an invisible part.  Whenever the visible part is
@@ -149,8 +181,17 @@ NORET_TYPE void machine_kexec(struct kimage *image)
 	set_idt(phys_to_virt(0),0);
 
 	/* now call it */
-	relocate_kernel((unsigned long)image->head, (unsigned long)page_list,
-			image->start, cpu_has_pae);
+	image->start = relocate_kernel_ptr((unsigned long)image->head,
+					   (unsigned long)page_list,
+					   image->start, cpu_has_pae,
+					   image->preserve_context);
+
+#ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context)
+		restore_processor_state();
+#endif
+
+	__ftrace_enabled_restore(save_ftrace_enabled);
 }
 
 void arch_crash_save_vmcoreinfo(void)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9dd9262693a3..c43caa3a91f3 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image)
  * Do not allocate memory (or fail in any way) in machine_kexec().
  * We are past the point of no return, committed to rebooting now.
  */
-NORET_TYPE void machine_kexec(struct kimage *image)
+void machine_kexec(struct kimage *image)
 {
 	unsigned long page_list[PAGES_NR];
 	void *control_page;
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 07c0f828f488..3b599518c322 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -33,6 +33,8 @@
 #include <linux/module.h>
 #include <asm/geode.h>
 
+#define MFGPT_DEFAULT_IRQ	7
+
 static struct mfgpt_timer_t {
 	unsigned int avail:1;
 } mfgpt_timers[MFGPT_MAX_TIMERS];
@@ -157,29 +159,48 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
 }
 EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
 
-int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable)
+int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
 {
-	u32 val, dummy;
-	int offset;
+	u32 zsel, lpc, dummy;
+	int shift;
 
 	if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
 		return -EIO;
 
-	if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
+	/*
+	 * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
+	 * is using the same CMP of the timer's Siamese twin, the IRQ is set to
+	 * 2, and we mustn't use nor change it.
+	 * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the
+	 * IRQ of the 1st. This can only happen if forcing an IRQ, calling this
+	 * with *irq==0 is safe. Currently there _are_ no 2 drivers.
+	 */
+	rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
+	shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
+	if (((zsel >> shift) & 0xF) == 2)
 		return -EIO;
 
-	rdmsr(MSR_PIC_ZSEL_LOW, val, dummy);
+	/* Choose IRQ: if none supplied, keep IRQ already set or use default */
+	if (!*irq)
+		*irq = (zsel >> shift) & 0xF;
+	if (!*irq)
+		*irq = MFGPT_DEFAULT_IRQ;
 
-	offset = (timer % 4) * 4;
-
-	val &= ~((0xF << offset) | (0xF << (offset + 16)));
+	/* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
+	if (*irq < 1 || *irq == 2 || *irq > 15)
+		return -EIO;
+	rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
+	if (lpc & (1 << *irq))
+		return -EIO;
 
+	/* All chosen and checked - go for it */
+	if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
+		return -EIO;
 	if (enable) {
-		val |= (irq & 0x0F) << (offset);
-		val |= (irq & 0x0F) << (offset + 16);
+		zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
+		wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
 	}
 
-	wrmsr(MSR_PIC_ZSEL_LOW, val, dummy);
 	return 0;
 }
 
@@ -242,7 +263,7 @@ EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
 static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
 static u16 mfgpt_event_clock;
 
-static int irq = 7;
+static int irq;
 static int __init mfgpt_setup(char *str)
 {
 	get_option(&str, &irq);
@@ -346,7 +367,7 @@ int __init mfgpt_timer_setup(void)
 	mfgpt_event_clock = timer;
 
 	/* Set up the IRQ on the MFGPT side */
-	if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, irq)) {
+	if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
 		printk(KERN_ERR "mfgpt-timer:  Could not set up IRQ %d\n", irq);
 		return -EIO;
 	}
@@ -374,13 +395,14 @@ int __init mfgpt_timer_setup(void)
 			&mfgpt_clockevent);
 
 	printk(KERN_INFO
-	       "mfgpt-timer:  registering the MFGPT timer as a clock event.\n");
+	       "mfgpt-timer:  Registering MFGPT timer %d as a clock event, using IRQ %d\n",
+	       timer, irq);
 	clockevents_register_device(&mfgpt_clockevent);
 
 	return 0;
 
 err:
-	geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, irq);
+	geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
 	printk(KERN_ERR
 	       "mfgpt-timer:  Unable to set up the MFGPT clock source\n");
 	return -EIO;
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
deleted file mode 100644
index 56b933119a04..000000000000
--- a/arch/x86/kernel/microcode.c
+++ /dev/null
@@ -1,851 +0,0 @@
-/*
- *	Intel CPU Microcode Update Driver for Linux
- *
- *	Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
- *		      2006	Shaohua Li <shaohua.li@intel.com>
- *
- *	This driver allows to upgrade microcode on Intel processors
- *	belonging to IA-32 family - PentiumPro, Pentium II,
- *	Pentium III, Xeon, Pentium 4, etc.
- *
- *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
- *	Software Developer's Manual
- *	Order Number 253668 or free download from:
- *
- *	http://developer.intel.com/design/pentium4/manuals/253668.htm
- *
- *	For more information, go to http://www.urbanmyth.org/microcode
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- *	1.0	16 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Initial release.
- *	1.01	18 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Added read() support + cleanups.
- *	1.02	21 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Added 'device trimming' support. open(O_WRONLY) zeroes
- *		and frees the saved copy of applied microcode.
- *	1.03	29 Feb 2000, Tigran Aivazian <tigran@sco.com>
- *		Made to use devfs (/dev/cpu/microcode) + cleanups.
- *	1.04	06 Jun 2000, Simon Trimmer <simon@veritas.com>
- *		Added misc device support (now uses both devfs and misc).
- *		Added MICROCODE_IOCFREE ioctl to clear memory.
- *	1.05	09 Jun 2000, Simon Trimmer <simon@veritas.com>
- *		Messages for error cases (non Intel & no suitable microcode).
- *	1.06	03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
- *		Removed ->release(). Removed exclusive open and status bitmap.
- *		Added microcode_rwsem to serialize read()/write()/ioctl().
- *		Removed global kernel lock usage.
- *	1.07	07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
- *		Write 0 to 0x8B msr and then cpuid before reading revision,
- *		so that it works even if there were no update done by the
- *		BIOS. Otherwise, reading from 0x8B gives junk (which happened
- *		to be 0 on my machine which is why it worked even when I
- *		disabled update by the BIOS)
- *		Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
- *	1.08	11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
- *			     Tigran Aivazian <tigran@veritas.com>
- *		Intel Pentium 4 processor support and bugfixes.
- *	1.09	30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
- *		Bugfix for HT (Hyper-Threading) enabled processors
- *		whereby processor resources are shared by all logical processors
- *		in a single CPU package.
- *	1.10	28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
- *		Tigran Aivazian <tigran@veritas.com>,
- *		Serialize updates as required on HT processors due to speculative
- *		nature of implementation.
- *	1.11	22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
- *		Fix the panic when writing zero-length microcode chunk.
- *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
- *		Jun Nakajima <jun.nakajima@intel.com>
- *		Support for the microcode updates in the new format.
- *	1.13	10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
- *		Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
- *		because we no longer hold a copy of applied microcode
- *		in kernel memory.
- *	1.14	25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
- *		Fix sigmatch() macro to handle old CPUs with pf == 0.
- *		Thanks to Stuart Swales for pointing out this bug.
- */
-
-//#define DEBUG /* pr_debug */
-#include <linux/capability.h>
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/smp_lock.h>
-#include <linux/cpumask.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/miscdevice.h>
-#include <linux/spinlock.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/mutex.h>
-#include <linux/cpu.h>
-#include <linux/firmware.h>
-#include <linux/platform_device.h>
-
-#include <asm/msr.h>
-#include <asm/uaccess.h>
-#include <asm/processor.h>
-
-MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
-MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
-MODULE_LICENSE("GPL");
-
-#define MICROCODE_VERSION 	"1.14a"
-
-#define DEFAULT_UCODE_DATASIZE 	(2000) 	  /* 2000 bytes */
-#define MC_HEADER_SIZE		(sizeof (microcode_header_t))  	  /* 48 bytes */
-#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
-#define EXT_HEADER_SIZE		(sizeof (struct extended_sigtable)) /* 20 bytes */
-#define EXT_SIGNATURE_SIZE	(sizeof (struct extended_signature)) /* 12 bytes */
-#define DWSIZE			(sizeof (u32))
-#define get_totalsize(mc) \
-	(((microcode_t *)mc)->hdr.totalsize ? \
-	 ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
-#define get_datasize(mc) \
-	(((microcode_t *)mc)->hdr.datasize ? \
-	 ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
-
-#define sigmatch(s1, s2, p1, p2) \
-	(((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
-
-#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
-
-/* serialize access to the physical write to MSR 0x79 */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
-/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
-static DEFINE_MUTEX(microcode_mutex);
-
-static struct ucode_cpu_info {
-	int valid;
-	unsigned int sig;
-	unsigned int pf;
-	unsigned int rev;
-	microcode_t *mc;
-} ucode_cpu_info[NR_CPUS];
-
-static void collect_cpu_info(int cpu_num)
-{
-	struct cpuinfo_x86 *c = &cpu_data(cpu_num);
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-	unsigned int val[2];
-
-	/* We should bind the task to the CPU */
-	BUG_ON(raw_smp_processor_id() != cpu_num);
-	uci->pf = uci->rev = 0;
-	uci->mc = NULL;
-	uci->valid = 1;
-
-	if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
-	    	cpu_has(c, X86_FEATURE_IA64)) {
-		printk(KERN_ERR "microcode: CPU%d not a capable Intel "
-			"processor\n", cpu_num);
-		uci->valid = 0;
-		return;
-	}
-
-	uci->sig = cpuid_eax(0x00000001);
-
-	if ((c->x86_model >= 5) || (c->x86 > 6)) {
-		/* get processor flags from MSR 0x17 */
-		rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
-		uci->pf = 1 << ((val[1] >> 18) & 7);
-	}
-
-	wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-	/* see notes above for revision 1.07.  Apparent chip bug */
-	sync_core();
-	/* get the current revision from MSR 0x8B */
-	rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
-	pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
-			uci->sig, uci->pf, uci->rev);
-}
-
-static inline int microcode_update_match(int cpu_num,
-	microcode_header_t *mc_header, int sig, int pf)
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
-	if (!sigmatch(sig, uci->sig, pf, uci->pf)
-		|| mc_header->rev <= uci->rev)
-		return 0;
-	return 1;
-}
-
-static int microcode_sanity_check(void *mc)
-{
-	microcode_header_t *mc_header = mc;
-	struct extended_sigtable *ext_header = NULL;
-	struct extended_signature *ext_sig;
-	unsigned long total_size, data_size, ext_table_size;
-	int sum, orig_sum, ext_sigcount = 0, i;
-
-	total_size = get_totalsize(mc_header);
-	data_size = get_datasize(mc_header);
-	if (data_size + MC_HEADER_SIZE > total_size) {
-		printk(KERN_ERR "microcode: error! "
-			"Bad data size in microcode data file\n");
-		return -EINVAL;
-	}
-
-	if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
-		printk(KERN_ERR "microcode: error! "
-			"Unknown microcode update format\n");
-		return -EINVAL;
-	}
-	ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
-	if (ext_table_size) {
-		if ((ext_table_size < EXT_HEADER_SIZE)
-		 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
-			printk(KERN_ERR "microcode: error! "
-				"Small exttable size in microcode data file\n");
-			return -EINVAL;
-		}
-		ext_header = mc + MC_HEADER_SIZE + data_size;
-		if (ext_table_size != exttable_size(ext_header)) {
-			printk(KERN_ERR "microcode: error! "
-				"Bad exttable size in microcode data file\n");
-			return -EFAULT;
-		}
-		ext_sigcount = ext_header->count;
-	}
-
-	/* check extended table checksum */
-	if (ext_table_size) {
-		int ext_table_sum = 0;
-		int *ext_tablep = (int *)ext_header;
-
-		i = ext_table_size / DWSIZE;
-		while (i--)
-			ext_table_sum += ext_tablep[i];
-		if (ext_table_sum) {
-			printk(KERN_WARNING "microcode: aborting, "
-				"bad extended signature table checksum\n");
-			return -EINVAL;
-		}
-	}
-
-	/* calculate the checksum */
-	orig_sum = 0;
-	i = (MC_HEADER_SIZE + data_size) / DWSIZE;
-	while (i--)
-		orig_sum += ((int *)mc)[i];
-	if (orig_sum) {
-		printk(KERN_ERR "microcode: aborting, bad checksum\n");
-		return -EINVAL;
-	}
-	if (!ext_table_size)
-		return 0;
-	/* check extended signature checksum */
-	for (i = 0; i < ext_sigcount; i++) {
-		ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
-			  EXT_SIGNATURE_SIZE * i;
-		sum = orig_sum
-			- (mc_header->sig + mc_header->pf + mc_header->cksum)
-			+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
-		if (sum) {
-			printk(KERN_ERR "microcode: aborting, bad checksum\n");
-			return -EINVAL;
-		}
-	}
-	return 0;
-}
-
-/*
- * return 0 - no update found
- * return 1 - found update
- * return < 0 - error
- */
-static int get_maching_microcode(void *mc, int cpu)
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-	microcode_header_t *mc_header = mc;
-	struct extended_sigtable *ext_header;
-	unsigned long total_size = get_totalsize(mc_header);
-	int ext_sigcount, i;
-	struct extended_signature *ext_sig;
-	void *new_mc;
-
-	if (microcode_update_match(cpu, mc_header,
-			mc_header->sig, mc_header->pf))
-		goto find;
-
-	if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
-		return 0;
-
-	ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
-	ext_sigcount = ext_header->count;
-	ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
-	for (i = 0; i < ext_sigcount; i++) {
-		if (microcode_update_match(cpu, mc_header,
-				ext_sig->sig, ext_sig->pf))
-			goto find;
-		ext_sig++;
-	}
-	return 0;
-find:
-	pr_debug("microcode: CPU%d found a matching microcode update with"
-		" version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
-	new_mc = vmalloc(total_size);
-	if (!new_mc) {
-		printk(KERN_ERR "microcode: error! Can not allocate memory\n");
-		return -ENOMEM;
-	}
-
-	/* free previous update file */
-	vfree(uci->mc);
-
-	memcpy(new_mc, mc, total_size);
-	uci->mc = new_mc;
-	return 1;
-}
-
-static void apply_microcode(int cpu)
-{
-	unsigned long flags;
-	unsigned int val[2];
-	int cpu_num = raw_smp_processor_id();
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
-
-	/* We should bind the task to the CPU */
-	BUG_ON(cpu_num != cpu);
-
-	if (uci->mc == NULL)
-		return;
-
-	/* serialize access to the physical write to MSR 0x79 */
-	spin_lock_irqsave(&microcode_update_lock, flags);
-
-	/* write microcode via MSR 0x79 */
-	wrmsr(MSR_IA32_UCODE_WRITE,
-		(unsigned long) uci->mc->bits,
-		(unsigned long) uci->mc->bits >> 16 >> 16);
-	wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-
-	/* see notes above for revision 1.07.  Apparent chip bug */
-	sync_core();
-
-	/* get the current revision from MSR 0x8B */
-	rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-
-	spin_unlock_irqrestore(&microcode_update_lock, flags);
-	if (val[1] != uci->mc->hdr.rev) {
-		printk(KERN_ERR "microcode: CPU%d update from revision "
-			"0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
-		return;
-	}
-	printk(KERN_INFO "microcode: CPU%d updated from revision "
-	       "0x%x to 0x%x, date = %08x \n",
-	       cpu_num, uci->rev, val[1], uci->mc->hdr.date);
-	uci->rev = val[1];
-}
-
-#ifdef CONFIG_MICROCODE_OLD_INTERFACE
-static void __user *user_buffer;	/* user area microcode data buffer */
-static unsigned int user_buffer_size;	/* it's size */
-
-static long get_next_ucode(void **mc, long offset)
-{
-	microcode_header_t mc_header;
-	unsigned long total_size;
-
-	/* No more data */
-	if (offset >= user_buffer_size)
-		return 0;
-	if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
-		printk(KERN_ERR "microcode: error! Can not read user data\n");
-		return -EFAULT;
-	}
-	total_size = get_totalsize(&mc_header);
-	if (offset + total_size > user_buffer_size) {
-		printk(KERN_ERR "microcode: error! Bad total size in microcode "
-				"data file\n");
-		return -EINVAL;
-	}
-	*mc = vmalloc(total_size);
-	if (!*mc)
-		return -ENOMEM;
-	if (copy_from_user(*mc, user_buffer + offset, total_size)) {
-		printk(KERN_ERR "microcode: error! Can not read user data\n");
-		vfree(*mc);
-		return -EFAULT;
-	}
-	return offset + total_size;
-}
-
-static int do_microcode_update (void)
-{
-	long cursor = 0;
-	int error = 0;
-	void *new_mc = NULL;
-	int cpu;
-	cpumask_t old;
-
-	old = current->cpus_allowed;
-
-	while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
-		error = microcode_sanity_check(new_mc);
-		if (error)
-			goto out;
-		/*
-		 * It's possible the data file has multiple matching ucode,
-		 * lets keep searching till the latest version
-		 */
-		for_each_online_cpu(cpu) {
-			struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-			if (!uci->valid)
-				continue;
-			set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-			error = get_maching_microcode(new_mc, cpu);
-			if (error < 0)
-				goto out;
-			if (error == 1)
-				apply_microcode(cpu);
-		}
-		vfree(new_mc);
-	}
-out:
-	if (cursor > 0)
-		vfree(new_mc);
-	if (cursor < 0)
-		error = cursor;
-	set_cpus_allowed_ptr(current, &old);
-	return error;
-}
-
-static int microcode_open (struct inode *unused1, struct file *unused2)
-{
-	cycle_kernel_lock();
-	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
-}
-
-static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
-{
-	ssize_t ret;
-
-	if ((len >> PAGE_SHIFT) > num_physpages) {
-		printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
-		return -EINVAL;
-	}
-
-	get_online_cpus();
-	mutex_lock(&microcode_mutex);
-
-	user_buffer = (void __user *) buf;
-	user_buffer_size = (int) len;
-
-	ret = do_microcode_update();
-	if (!ret)
-		ret = (ssize_t)len;
-
-	mutex_unlock(&microcode_mutex);
-	put_online_cpus();
-
-	return ret;
-}
-
-static const struct file_operations microcode_fops = {
-	.owner		= THIS_MODULE,
-	.write		= microcode_write,
-	.open		= microcode_open,
-};
-
-static struct miscdevice microcode_dev = {
-	.minor		= MICROCODE_MINOR,
-	.name		= "microcode",
-	.fops		= &microcode_fops,
-};
-
-static int __init microcode_dev_init (void)
-{
-	int error;
-
-	error = misc_register(&microcode_dev);
-	if (error) {
-		printk(KERN_ERR
-			"microcode: can't misc_register on minor=%d\n",
-			MICROCODE_MINOR);
-		return error;
-	}
-
-	return 0;
-}
-
-static void microcode_dev_exit (void)
-{
-	misc_deregister(&microcode_dev);
-}
-
-MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
-#else
-#define microcode_dev_init() 0
-#define microcode_dev_exit() do { } while(0)
-#endif
-
-static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
-	unsigned long size, long offset)
-{
-	microcode_header_t *mc_header;
-	unsigned long total_size;
-
-	/* No more data */
-	if (offset >= size)
-		return 0;
-	mc_header = (microcode_header_t *)(buf + offset);
-	total_size = get_totalsize(mc_header);
-
-	if (offset + total_size > size) {
-		printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
-		return -EINVAL;
-	}
-
-	*mc = vmalloc(total_size);
-	if (!*mc) {
-		printk(KERN_ERR "microcode: error! Can not allocate memory\n");
-		return -ENOMEM;
-	}
-	memcpy(*mc, buf + offset, total_size);
-	return offset + total_size;
-}
-
-/* fake device for request_firmware */
-static struct platform_device *microcode_pdev;
-
-static int cpu_request_microcode(int cpu)
-{
-	char name[30];
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
-	const struct firmware *firmware;
-	const u8 *buf;
-	unsigned long size;
-	long offset = 0;
-	int error;
-	void *mc;
-
-	/* We should bind the task to the CPU */
-	BUG_ON(cpu != raw_smp_processor_id());
-	sprintf(name,"intel-ucode/%02x-%02x-%02x",
-		c->x86, c->x86_model, c->x86_mask);
-	error = request_firmware(&firmware, name, &microcode_pdev->dev);
-	if (error) {
-		pr_debug("microcode: data file %s load failed\n", name);
-		return error;
-	}
-	buf = firmware->data;
-	size = firmware->size;
-	while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
-			> 0) {
-		error = microcode_sanity_check(mc);
-		if (error)
-			break;
-		error = get_maching_microcode(mc, cpu);
-		if (error < 0)
-			break;
-		/*
-		 * It's possible the data file has multiple matching ucode,
-		 * lets keep searching till the latest version
-		 */
-		if (error == 1) {
-			apply_microcode(cpu);
-			error = 0;
-		}
-		vfree(mc);
-	}
-	if (offset > 0)
-		vfree(mc);
-	if (offset < 0)
-		error = offset;
-	release_firmware(firmware);
-
-	return error;
-}
-
-static int apply_microcode_check_cpu(int cpu)
-{
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-	cpumask_t old;
-	unsigned int val[2];
-	int err = 0;
-
-	/* Check if the microcode is available */
-	if (!uci->mc)
-		return 0;
-
-	old = current->cpus_allowed;
-	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-
-	/* Check if the microcode we have in memory matches the CPU */
-	if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
-	    cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001))
-		err = -EINVAL;
-
-	if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) {
-		/* get processor flags from MSR 0x17 */
-		rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
-		if (uci->pf != (1 << ((val[1] >> 18) & 7)))
-			err = -EINVAL;
-	}
-
-	if (!err) {
-		wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-		/* see notes above for revision 1.07.  Apparent chip bug */
-		sync_core();
-		/* get the current revision from MSR 0x8B */
-		rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
-		if (uci->rev != val[1])
-			err = -EINVAL;
-	}
-
-	if (!err)
-		apply_microcode(cpu);
-	else
-		printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
-			" sig=0x%x, pf=0x%x, rev=0x%x\n",
-			cpu, uci->sig, uci->pf, uci->rev);
-
-	set_cpus_allowed_ptr(current, &old);
-	return err;
-}
-
-static void microcode_init_cpu(int cpu, int resume)
-{
-	cpumask_t old;
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-	old = current->cpus_allowed;
-
-	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-	mutex_lock(&microcode_mutex);
-	collect_cpu_info(cpu);
-	if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
-		cpu_request_microcode(cpu);
-	mutex_unlock(&microcode_mutex);
-	set_cpus_allowed_ptr(current, &old);
-}
-
-static void microcode_fini_cpu(int cpu)
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-	mutex_lock(&microcode_mutex);
-	uci->valid = 0;
-	vfree(uci->mc);
-	uci->mc = NULL;
-	mutex_unlock(&microcode_mutex);
-}
-
-static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-	char *end;
-	unsigned long val = simple_strtoul(buf, &end, 0);
-	int err = 0;
-	int cpu = dev->id;
-
-	if (end == buf)
-		return -EINVAL;
-	if (val == 1) {
-		cpumask_t old;
-
-		old = current->cpus_allowed;
-
-		get_online_cpus();
-		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-
-		mutex_lock(&microcode_mutex);
-		if (uci->valid)
-			err = cpu_request_microcode(cpu);
-		mutex_unlock(&microcode_mutex);
-		put_online_cpus();
-		set_cpus_allowed_ptr(current, &old);
-	}
-	if (err)
-		return err;
-	return sz;
-}
-
-static ssize_t version_show(struct sys_device *dev, char *buf)
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-
-	return sprintf(buf, "0x%x\n", uci->rev);
-}
-
-static ssize_t pf_show(struct sys_device *dev, char *buf)
-{
-	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
-
-	return sprintf(buf, "0x%x\n", uci->pf);
-}
-
-static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
-static SYSDEV_ATTR(version, 0400, version_show, NULL);
-static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
-
-static struct attribute *mc_default_attrs[] = {
-	&attr_reload.attr,
-	&attr_version.attr,
-	&attr_processor_flags.attr,
-	NULL
-};
-
-static struct attribute_group mc_attr_group = {
-	.attrs = mc_default_attrs,
-	.name = "microcode",
-};
-
-static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
-{
-	int err, cpu = sys_dev->id;
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-
-	if (!cpu_online(cpu))
-		return 0;
-
-	pr_debug("microcode: CPU%d added\n", cpu);
-	memset(uci, 0, sizeof(*uci));
-
-	err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
-	if (err)
-		return err;
-
-	microcode_init_cpu(cpu, resume);
-
-	return 0;
-}
-
-static int mc_sysdev_add(struct sys_device *sys_dev)
-{
-	return __mc_sysdev_add(sys_dev, 0);
-}
-
-static int mc_sysdev_remove(struct sys_device *sys_dev)
-{
-	int cpu = sys_dev->id;
-
-	if (!cpu_online(cpu))
-		return 0;
-
-	pr_debug("microcode: CPU%d removed\n", cpu);
-	microcode_fini_cpu(cpu);
-	sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
-	return 0;
-}
-
-static int mc_sysdev_resume(struct sys_device *dev)
-{
-	int cpu = dev->id;
-
-	if (!cpu_online(cpu))
-		return 0;
-	pr_debug("microcode: CPU%d resumed\n", cpu);
-	/* only CPU 0 will apply ucode here */
-	apply_microcode(0);
-	return 0;
-}
-
-static struct sysdev_driver mc_sysdev_driver = {
-	.add = mc_sysdev_add,
-	.remove = mc_sysdev_remove,
-	.resume = mc_sysdev_resume,
-};
-
-static __cpuinit int
-mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
-{
-	unsigned int cpu = (unsigned long)hcpu;
-	struct sys_device *sys_dev;
-
-	sys_dev = get_cpu_sysdev(cpu);
-	switch (action) {
-	case CPU_UP_CANCELED_FROZEN:
-		/* The CPU refused to come up during a system resume */
-		microcode_fini_cpu(cpu);
-		break;
-	case CPU_ONLINE:
-	case CPU_DOWN_FAILED:
-		mc_sysdev_add(sys_dev);
-		break;
-	case CPU_ONLINE_FROZEN:
-		/* System-wide resume is in progress, try to apply microcode */
-		if (apply_microcode_check_cpu(cpu)) {
-			/* The application of microcode failed */
-			microcode_fini_cpu(cpu);
-			__mc_sysdev_add(sys_dev, 1);
-			break;
-		}
-	case CPU_DOWN_FAILED_FROZEN:
-		if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
-			printk(KERN_ERR "microcode: Failed to create the sysfs "
-				"group for CPU%d\n", cpu);
-		break;
-	case CPU_DOWN_PREPARE:
-		mc_sysdev_remove(sys_dev);
-		break;
-	case CPU_DOWN_PREPARE_FROZEN:
-		/* Suspend is in progress, only remove the interface */
-		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-static struct notifier_block __refdata mc_cpu_notifier = {
-	.notifier_call = mc_cpu_callback,
-};
-
-static int __init microcode_init (void)
-{
-	int error;
-
-	printk(KERN_INFO
-		"IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
-
-	error = microcode_dev_init();
-	if (error)
-		return error;
-	microcode_pdev = platform_device_register_simple("microcode", -1,
-							 NULL, 0);
-	if (IS_ERR(microcode_pdev)) {
-		microcode_dev_exit();
-		return PTR_ERR(microcode_pdev);
-	}
-
-	get_online_cpus();
-	error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
-	put_online_cpus();
-	if (error) {
-		microcode_dev_exit();
-		platform_device_unregister(microcode_pdev);
-		return error;
-	}
-
-	register_hotcpu_notifier(&mc_cpu_notifier);
-	return 0;
-}
-
-static void __exit microcode_exit (void)
-{
-	microcode_dev_exit();
-
-	unregister_hotcpu_notifier(&mc_cpu_notifier);
-
-	get_online_cpus();
-	sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
-	put_online_cpus();
-
-	platform_device_unregister(microcode_pdev);
-}
-
-module_init(microcode_init)
-module_exit(microcode_exit)
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
new file mode 100644
index 000000000000..7a1f8eeac2c7
--- /dev/null
+++ b/arch/x86/kernel/microcode_amd.c
@@ -0,0 +1,435 @@
+/*
+ *  AMD CPU Microcode Update Driver for Linux
+ *  Copyright (C) 2008 Advanced Micro Devices Inc.
+ *
+ *  Author: Peter Oruba <peter.oruba@amd.com>
+ *
+ *  Based on work by:
+ *  Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *
+ *  This driver allows to upgrade microcode on AMD
+ *  family 0x10 and 0x11 processors.
+ *
+ *  Licensed unter the terms of the GNU General Public
+ *  License version 2. See file COPYING for details.
+*/
+
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/microcode.h>
+
+MODULE_DESCRIPTION("AMD Microcode Update Driver");
+MODULE_AUTHOR("Peter Oruba <peter.oruba@amd.com>");
+MODULE_LICENSE("GPL v2");
+
+#define UCODE_MAGIC                0x00414d44
+#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
+#define UCODE_UCODE_TYPE           0x00000001
+
+struct equiv_cpu_entry {
+	unsigned int installed_cpu;
+	unsigned int fixed_errata_mask;
+	unsigned int fixed_errata_compare;
+	unsigned int equiv_cpu;
+};
+
+struct microcode_header_amd {
+	unsigned int  data_code;
+	unsigned int  patch_id;
+	unsigned char mc_patch_data_id[2];
+	unsigned char mc_patch_data_len;
+	unsigned char init_flag;
+	unsigned int  mc_patch_data_checksum;
+	unsigned int  nb_dev_id;
+	unsigned int  sb_dev_id;
+	unsigned char processor_rev_id[2];
+	unsigned char nb_rev_id;
+	unsigned char sb_rev_id;
+	unsigned char bios_api_rev;
+	unsigned char reserved1[3];
+	unsigned int  match_reg[8];
+};
+
+struct microcode_amd {
+	struct microcode_header_amd hdr;
+	unsigned int mpb[0];
+};
+
+#define UCODE_MAX_SIZE          (2048)
+#define DEFAULT_UCODE_DATASIZE	(896)
+#define MC_HEADER_SIZE		(sizeof(struct microcode_header_amd))
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
+#define DWSIZE			(sizeof(u32))
+/* For now we support a fixed ucode total size only */
+#define get_totalsize(mc) \
+	((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
+	 + MC_HEADER_SIZE)
+
+/* serialize access to the physical write */
+static DEFINE_SPINLOCK(microcode_update_lock);
+
+static struct equiv_cpu_entry *equiv_cpu_table;
+
+static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+	memset(csig, 0, sizeof(*csig));
+
+	if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
+		printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n",
+		       cpu);
+		return -1;
+	}
+
+	asm volatile("movl %1, %%ecx; rdmsr"
+		     : "=a" (csig->rev)
+		     : "i" (0x0000008B) : "ecx");
+
+	printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n",
+		csig->rev);
+
+	return 0;
+}
+
+static int get_matching_microcode(int cpu, void *mc, int rev)
+{
+	struct microcode_header_amd *mc_header = mc;
+	struct pci_dev *nb_pci_dev, *sb_pci_dev;
+	unsigned int current_cpu_id;
+	unsigned int equiv_cpu_id = 0x00;
+	unsigned int i = 0;
+
+	BUG_ON(equiv_cpu_table == NULL);
+	current_cpu_id = cpuid_eax(0x00000001);
+
+	while (equiv_cpu_table[i].installed_cpu != 0) {
+		if (current_cpu_id == equiv_cpu_table[i].installed_cpu) {
+			equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
+			break;
+		}
+		i++;
+	}
+
+	if (!equiv_cpu_id) {
+		printk(KERN_ERR "microcode: CPU%d cpu_id "
+		       "not found in equivalent cpu table \n", cpu);
+		return 0;
+	}
+
+	if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) {
+		printk(KERN_ERR
+			"microcode: CPU%d patch does not match "
+			"(patch is %x, cpu extended is %x) \n",
+			cpu, mc_header->processor_rev_id[0],
+			(equiv_cpu_id & 0xff));
+		return 0;
+	}
+
+	if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) {
+		printk(KERN_ERR "microcode: CPU%d patch does not match "
+			"(patch is %x, cpu base id is %x) \n",
+			cpu, mc_header->processor_rev_id[1],
+			((equiv_cpu_id >> 16) & 0xff));
+
+		return 0;
+	}
+
+	/* ucode may be northbridge specific */
+	if (mc_header->nb_dev_id) {
+		nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
+					    (mc_header->nb_dev_id & 0xff),
+					    NULL);
+		if ((!nb_pci_dev) ||
+		    (mc_header->nb_rev_id != nb_pci_dev->revision)) {
+			printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu);
+			pci_dev_put(nb_pci_dev);
+			return 0;
+		}
+		pci_dev_put(nb_pci_dev);
+	}
+
+	/* ucode may be southbridge specific */
+	if (mc_header->sb_dev_id) {
+		sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
+					    (mc_header->sb_dev_id & 0xff),
+					    NULL);
+		if ((!sb_pci_dev) ||
+		    (mc_header->sb_rev_id != sb_pci_dev->revision)) {
+			printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu);
+			pci_dev_put(sb_pci_dev);
+			return 0;
+		}
+		pci_dev_put(sb_pci_dev);
+	}
+
+	if (mc_header->patch_id <= rev)
+		return 0;
+
+	return 1;
+}
+
+static void apply_microcode_amd(int cpu)
+{
+	unsigned long flags;
+	unsigned int eax, edx;
+	unsigned int rev;
+	int cpu_num = raw_smp_processor_id();
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
+	struct microcode_amd *mc_amd = uci->mc;
+	unsigned long addr;
+
+	/* We should bind the task to the CPU */
+	BUG_ON(cpu_num != cpu);
+
+	if (mc_amd == NULL)
+		return;
+
+	spin_lock_irqsave(&microcode_update_lock, flags);
+
+	addr = (unsigned long)&mc_amd->hdr.data_code;
+	edx = (unsigned int)(((unsigned long)upper_32_bits(addr)));
+	eax = (unsigned int)(((unsigned long)lower_32_bits(addr)));
+
+	asm volatile("movl %0, %%ecx; wrmsr" :
+		     : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx");
+
+	/* get patch id after patching */
+	asm volatile("movl %1, %%ecx; rdmsr"
+		     : "=a" (rev)
+		     : "i" (0x0000008B) : "ecx");
+
+	spin_unlock_irqrestore(&microcode_update_lock, flags);
+
+	/* check current patch id and patch's id for match */
+	if (rev != mc_amd->hdr.patch_id) {
+		printk(KERN_ERR "microcode: CPU%d update from revision "
+		       "0x%x to 0x%x failed\n", cpu_num,
+		       mc_amd->hdr.patch_id, rev);
+		return;
+	}
+
+	printk(KERN_INFO "microcode: CPU%d updated from revision "
+	       "0x%x to 0x%x \n",
+	       cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
+
+	uci->cpu_sig.rev = rev;
+}
+
+static void * get_next_ucode(u8 *buf, unsigned int size,
+			int (*get_ucode_data)(void *, const void *, size_t),
+			unsigned int *mc_size)
+{
+	unsigned int total_size;
+#define UCODE_CONTAINER_SECTION_HDR	8
+	u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
+	void *mc;
+
+	if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR))
+		return NULL;
+
+	if (section_hdr[0] != UCODE_UCODE_TYPE) {
+		printk(KERN_ERR "microcode: error! "
+		       "Wrong microcode payload type field\n");
+		return NULL;
+	}
+
+	total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
+
+	printk(KERN_INFO "microcode: size %u, total_size %u\n",
+		size, total_size);
+
+	if (total_size > size || total_size > UCODE_MAX_SIZE) {
+		printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
+		return NULL;
+	}
+
+	mc = vmalloc(UCODE_MAX_SIZE);
+	if (mc) {
+		memset(mc, 0, UCODE_MAX_SIZE);
+		if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) {
+			vfree(mc);
+			mc = NULL;
+		} else
+			*mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
+	}
+#undef UCODE_CONTAINER_SECTION_HDR
+	return mc;
+}
+
+
+static int install_equiv_cpu_table(u8 *buf,
+		int (*get_ucode_data)(void *, const void *, size_t))
+{
+#define UCODE_CONTAINER_HEADER_SIZE	12
+	u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
+	unsigned int *buf_pos = (unsigned int *)container_hdr;
+	unsigned long size;
+
+	if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE))
+		return 0;
+
+	size = buf_pos[2];
+
+	if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
+		printk(KERN_ERR "microcode: error! "
+		       "Wrong microcode equivalnet cpu table\n");
+		return 0;
+	}
+
+	equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
+	if (!equiv_cpu_table) {
+		printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n");
+		return 0;
+	}
+
+	buf += UCODE_CONTAINER_HEADER_SIZE;
+	if (get_ucode_data(equiv_cpu_table, buf, size)) {
+		vfree(equiv_cpu_table);
+		return 0;
+	}
+
+	return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
+#undef UCODE_CONTAINER_HEADER_SIZE
+}
+
+static void free_equiv_cpu_table(void)
+{
+	if (equiv_cpu_table) {
+		vfree(equiv_cpu_table);
+		equiv_cpu_table = NULL;
+	}
+}
+
+static int generic_load_microcode(int cpu, void *data, size_t size,
+		int (*get_ucode_data)(void *, const void *, size_t))
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	u8 *ucode_ptr = data, *new_mc = NULL, *mc;
+	int new_rev = uci->cpu_sig.rev;
+	unsigned int leftover;
+	unsigned long offset;
+
+	offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data);
+	if (!offset) {
+		printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
+		return -EINVAL;
+	}
+
+	ucode_ptr += offset;
+	leftover = size - offset;
+
+	while (leftover) {
+		unsigned int uninitialized_var(mc_size);
+		struct microcode_header_amd *mc_header;
+
+		mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size);
+		if (!mc)
+			break;
+
+		mc_header = (struct microcode_header_amd *)mc;
+		if (get_matching_microcode(cpu, mc, new_rev)) {
+			if (new_mc)
+				vfree(new_mc);
+			new_rev = mc_header->patch_id;
+			new_mc  = mc;
+		} else 
+			vfree(mc);
+
+		ucode_ptr += mc_size;
+		leftover  -= mc_size;
+	}
+
+	if (new_mc) {
+		if (!leftover) {
+			if (uci->mc)
+				vfree(uci->mc);
+			uci->mc = new_mc;
+			pr_debug("microcode: CPU%d found a matching microcode update with"
+				" version 0x%x (current=0x%x)\n",
+				cpu, new_rev, uci->cpu_sig.rev);
+		} else
+			vfree(new_mc);
+	}
+
+	free_equiv_cpu_table();
+
+	return (int)leftover;
+}
+
+static int get_ucode_fw(void *to, const void *from, size_t n)
+{
+	memcpy(to, from, n);
+	return 0;
+}
+
+static int request_microcode_fw(int cpu, struct device *device)
+{
+	const char *fw_name = "amd-ucode/microcode_amd.bin";
+	const struct firmware *firmware;
+	int ret;
+
+	/* We should bind the task to the CPU */
+	BUG_ON(cpu != raw_smp_processor_id());
+
+	ret = request_firmware(&firmware, fw_name, device);
+	if (ret) {
+		printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name);
+		return ret;
+	}
+
+	ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
+			&get_ucode_fw);
+
+	release_firmware(firmware);
+
+	return ret;
+}
+
+static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+{
+	printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode"
+			"is not supported\n");
+	return -1;
+}
+
+static void microcode_fini_cpu_amd(int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+	vfree(uci->mc);
+	uci->mc = NULL;
+}
+
+static struct microcode_ops microcode_amd_ops = {
+	.request_microcode_user           = request_microcode_user,
+	.request_microcode_fw             = request_microcode_fw,
+	.collect_cpu_info                 = collect_cpu_info_amd,
+	.apply_microcode                  = apply_microcode_amd,
+	.microcode_fini_cpu               = microcode_fini_cpu_amd,
+};
+
+struct microcode_ops * __init init_amd_microcode(void)
+{
+	return &microcode_amd_ops;
+}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
new file mode 100644
index 000000000000..936d8d55f230
--- /dev/null
+++ b/arch/x86/kernel/microcode_core.c
@@ -0,0 +1,508 @@
+/*
+ *	Intel CPU Microcode Update Driver for Linux
+ *
+ *	Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *		      2006	Shaohua Li <shaohua.li@intel.com>
+ *
+ *	This driver allows to upgrade microcode on Intel processors
+ *	belonging to IA-32 family - PentiumPro, Pentium II,
+ *	Pentium III, Xeon, Pentium 4, etc.
+ *
+ *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
+ *	Software Developer's Manual
+ *	Order Number 253668 or free download from:
+ *
+ *	http://developer.intel.com/design/pentium4/manuals/253668.htm
+ *
+ *	For more information, go to http://www.urbanmyth.org/microcode
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	1.0	16 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *		Initial release.
+ *	1.01	18 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *		Added read() support + cleanups.
+ *	1.02	21 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *		Added 'device trimming' support. open(O_WRONLY) zeroes
+ *		and frees the saved copy of applied microcode.
+ *	1.03	29 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *		Made to use devfs (/dev/cpu/microcode) + cleanups.
+ *	1.04	06 Jun 2000, Simon Trimmer <simon@veritas.com>
+ *		Added misc device support (now uses both devfs and misc).
+ *		Added MICROCODE_IOCFREE ioctl to clear memory.
+ *	1.05	09 Jun 2000, Simon Trimmer <simon@veritas.com>
+ *		Messages for error cases (non Intel & no suitable microcode).
+ *	1.06	03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
+ *		Removed ->release(). Removed exclusive open and status bitmap.
+ *		Added microcode_rwsem to serialize read()/write()/ioctl().
+ *		Removed global kernel lock usage.
+ *	1.07	07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
+ *		Write 0 to 0x8B msr and then cpuid before reading revision,
+ *		so that it works even if there were no update done by the
+ *		BIOS. Otherwise, reading from 0x8B gives junk (which happened
+ *		to be 0 on my machine which is why it worked even when I
+ *		disabled update by the BIOS)
+ *		Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
+ *	1.08	11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
+ *			     Tigran Aivazian <tigran@veritas.com>
+ *		Intel Pentium 4 processor support and bugfixes.
+ *	1.09	30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
+ *		Bugfix for HT (Hyper-Threading) enabled processors
+ *		whereby processor resources are shared by all logical processors
+ *		in a single CPU package.
+ *	1.10	28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
+ *		Tigran Aivazian <tigran@veritas.com>,
+ *		Serialize updates as required on HT processors due to
+ *		speculative nature of implementation.
+ *	1.11	22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
+ *		Fix the panic when writing zero-length microcode chunk.
+ *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
+ *		Jun Nakajima <jun.nakajima@intel.com>
+ *		Support for the microcode updates in the new format.
+ *	1.13	10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
+ *		Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
+ *		because we no longer hold a copy of applied microcode
+ *		in kernel memory.
+ *	1.14	25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
+ *		Fix sigmatch() macro to handle old CPUs with pf == 0.
+ *		Thanks to Stuart Swales for pointing out this bug.
+ */
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
+
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/microcode.h>
+
+MODULE_DESCRIPTION("Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+MODULE_LICENSE("GPL");
+
+#define MICROCODE_VERSION 	"2.00"
+
+struct microcode_ops *microcode_ops;
+
+/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
+static DEFINE_MUTEX(microcode_mutex);
+
+struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
+EXPORT_SYMBOL_GPL(ucode_cpu_info);
+
+#ifdef CONFIG_MICROCODE_OLD_INTERFACE
+static int do_microcode_update(const void __user *buf, size_t size)
+{
+	cpumask_t old;
+	int error = 0;
+	int cpu;
+
+	old = current->cpus_allowed;
+
+	for_each_online_cpu(cpu) {
+		struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+		if (!uci->valid)
+			continue;
+
+		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+		error = microcode_ops->request_microcode_user(cpu, buf, size);
+		if (error < 0)
+			goto out;
+		if (!error)
+			microcode_ops->apply_microcode(cpu);
+	}
+out:
+	set_cpus_allowed_ptr(current, &old);
+	return error;
+}
+
+static int microcode_open(struct inode *unused1, struct file *unused2)
+{
+	cycle_kernel_lock();
+	return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+}
+
+static ssize_t microcode_write(struct file *file, const char __user *buf,
+			       size_t len, loff_t *ppos)
+{
+	ssize_t ret;
+
+	if ((len >> PAGE_SHIFT) > num_physpages) {
+		printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
+		       num_physpages);
+		return -EINVAL;
+	}
+
+	get_online_cpus();
+	mutex_lock(&microcode_mutex);
+
+	ret = do_microcode_update(buf, len);
+	if (!ret)
+		ret = (ssize_t)len;
+
+	mutex_unlock(&microcode_mutex);
+	put_online_cpus();
+
+	return ret;
+}
+
+static const struct file_operations microcode_fops = {
+	.owner		= THIS_MODULE,
+	.write		= microcode_write,
+	.open		= microcode_open,
+};
+
+static struct miscdevice microcode_dev = {
+	.minor		= MICROCODE_MINOR,
+	.name		= "microcode",
+	.fops		= &microcode_fops,
+};
+
+static int __init microcode_dev_init(void)
+{
+	int error;
+
+	error = misc_register(&microcode_dev);
+	if (error) {
+		printk(KERN_ERR
+			"microcode: can't misc_register on minor=%d\n",
+			MICROCODE_MINOR);
+		return error;
+	}
+
+	return 0;
+}
+
+static void microcode_dev_exit(void)
+{
+	misc_deregister(&microcode_dev);
+}
+
+MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+#else
+#define microcode_dev_init() 0
+#define microcode_dev_exit() do { } while (0)
+#endif
+
+/* fake device for request_firmware */
+struct platform_device *microcode_pdev;
+
+static ssize_t reload_store(struct sys_device *dev,
+			    struct sysdev_attribute *attr,
+			    const char *buf, size_t sz)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+	char *end;
+	unsigned long val = simple_strtoul(buf, &end, 0);
+	int err = 0;
+	int cpu = dev->id;
+
+	if (end == buf)
+		return -EINVAL;
+	if (val == 1) {
+		cpumask_t old = current->cpus_allowed;
+
+		get_online_cpus();
+		if (cpu_online(cpu)) {
+			set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+			mutex_lock(&microcode_mutex);
+			if (uci->valid) {
+				err = microcode_ops->request_microcode_fw(cpu,
+						&microcode_pdev->dev);
+				if (!err)
+					microcode_ops->apply_microcode(cpu);
+			}
+			mutex_unlock(&microcode_mutex);
+			set_cpus_allowed_ptr(current, &old);
+		}
+		put_online_cpus();
+	}
+	if (err)
+		return err;
+	return sz;
+}
+
+static ssize_t version_show(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+	return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
+}
+
+static ssize_t pf_show(struct sys_device *dev,
+			struct sysdev_attribute *attr, char *buf)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
+
+	return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
+}
+
+static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
+static SYSDEV_ATTR(version, 0400, version_show, NULL);
+static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
+
+static struct attribute *mc_default_attrs[] = {
+	&attr_reload.attr,
+	&attr_version.attr,
+	&attr_processor_flags.attr,
+	NULL
+};
+
+static struct attribute_group mc_attr_group = {
+	.attrs = mc_default_attrs,
+	.name = "microcode",
+};
+
+static void microcode_fini_cpu(int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+	mutex_lock(&microcode_mutex);
+	microcode_ops->microcode_fini_cpu(cpu);
+	uci->valid = 0;
+	mutex_unlock(&microcode_mutex);
+}
+
+static void collect_cpu_info(int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+	memset(uci, 0, sizeof(*uci));
+	if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig))
+		uci->valid = 1;
+}
+
+static int microcode_resume_cpu(int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	struct cpu_signature nsig;
+
+	pr_debug("microcode: CPU%d resumed\n", cpu);
+
+	if (!uci->mc)
+		return 1;
+
+	/*
+	 * Let's verify that the 'cached' ucode does belong
+	 * to this cpu (a bit of paranoia):
+	 */
+	if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
+		microcode_fini_cpu(cpu);
+		return -1;
+	}
+
+	if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) {
+		microcode_fini_cpu(cpu);
+		/* Should we look for a new ucode here? */
+		return 1;
+	}
+
+	return 0;
+}
+
+void microcode_update_cpu(int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	int err = 0;
+
+	/*
+	 * Check if the system resume is in progress (uci->valid != NULL),
+	 * otherwise just request a firmware:
+	 */
+	if (uci->valid) {
+		err = microcode_resume_cpu(cpu);
+	} else {	
+		collect_cpu_info(cpu);
+		if (uci->valid && system_state == SYSTEM_RUNNING)
+			err = microcode_ops->request_microcode_fw(cpu,
+					&microcode_pdev->dev);
+	}
+	if (!err)
+		microcode_ops->apply_microcode(cpu);
+}
+
+static void microcode_init_cpu(int cpu)
+{
+	cpumask_t old = current->cpus_allowed;
+
+	set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
+	/* We should bind the task to the CPU */
+	BUG_ON(raw_smp_processor_id() != cpu);
+
+	mutex_lock(&microcode_mutex);
+	microcode_update_cpu(cpu);
+	mutex_unlock(&microcode_mutex);
+
+	set_cpus_allowed_ptr(current, &old);
+}
+
+static int mc_sysdev_add(struct sys_device *sys_dev)
+{
+	int err, cpu = sys_dev->id;
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+	if (!cpu_online(cpu))
+		return 0;
+
+	pr_debug("microcode: CPU%d added\n", cpu);
+	memset(uci, 0, sizeof(*uci));
+
+	err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
+	if (err)
+		return err;
+
+	microcode_init_cpu(cpu);
+	return 0;
+}
+
+static int mc_sysdev_remove(struct sys_device *sys_dev)
+{
+	int cpu = sys_dev->id;
+
+	if (!cpu_online(cpu))
+		return 0;
+
+	pr_debug("microcode: CPU%d removed\n", cpu);
+	microcode_fini_cpu(cpu);
+	sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+	return 0;
+}
+
+static int mc_sysdev_resume(struct sys_device *dev)
+{
+	int cpu = dev->id;
+
+	if (!cpu_online(cpu))
+		return 0;
+
+	/* only CPU 0 will apply ucode here */
+	microcode_update_cpu(0);
+	return 0;
+}
+
+static struct sysdev_driver mc_sysdev_driver = {
+	.add = mc_sysdev_add,
+	.remove = mc_sysdev_remove,
+	.resume = mc_sysdev_resume,
+};
+
+static __cpuinit int
+mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	struct sys_device *sys_dev;
+
+	sys_dev = get_cpu_sysdev(cpu);
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+		microcode_init_cpu(cpu);
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		pr_debug("microcode: CPU%d added\n", cpu);
+		if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
+			printk(KERN_ERR "microcode: Failed to create the sysfs "
+				"group for CPU%d\n", cpu);
+		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		/* Suspend is in progress, only remove the interface */
+		sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
+		pr_debug("microcode: CPU%d removed\n", cpu);
+		break;
+	case CPU_DEAD:
+	case CPU_UP_CANCELED_FROZEN:
+		/* The CPU refused to come up during a system resume */
+		microcode_fini_cpu(cpu);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __refdata mc_cpu_notifier = {
+	.notifier_call = mc_cpu_callback,
+};
+
+static int __init microcode_init(void)
+{
+	struct cpuinfo_x86 *c = &cpu_data(0);
+	int error;
+
+	if (c->x86_vendor == X86_VENDOR_INTEL)
+		microcode_ops = init_intel_microcode();
+	else if (c->x86_vendor == X86_VENDOR_AMD)
+		microcode_ops = init_amd_microcode();
+
+	if (!microcode_ops) {
+		printk(KERN_ERR "microcode: no support for this CPU vendor\n");
+		return -ENODEV;
+	}
+
+	error = microcode_dev_init();
+	if (error)
+		return error;
+	microcode_pdev = platform_device_register_simple("microcode", -1,
+							 NULL, 0);
+	if (IS_ERR(microcode_pdev)) {
+		microcode_dev_exit();
+		return PTR_ERR(microcode_pdev);
+	}
+
+	get_online_cpus();
+	error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+	put_online_cpus();
+	if (error) {
+		microcode_dev_exit();
+		platform_device_unregister(microcode_pdev);
+		return error;
+	}
+
+	register_hotcpu_notifier(&mc_cpu_notifier);
+
+	printk(KERN_INFO
+	       "Microcode Update Driver: v" MICROCODE_VERSION
+	       " <tigran@aivazian.fsnet.co.uk>"
+	       " <peter.oruba@amd.com>\n");
+
+	return 0;
+}
+
+static void __exit microcode_exit(void)
+{
+	microcode_dev_exit();
+
+	unregister_hotcpu_notifier(&mc_cpu_notifier);
+
+	get_online_cpus();
+	sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+	put_online_cpus();
+
+	platform_device_unregister(microcode_pdev);
+
+	microcode_ops = NULL;
+
+	printk(KERN_INFO
+	       "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
+}
+
+module_init(microcode_init);
+module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
new file mode 100644
index 000000000000..622dc4a21784
--- /dev/null
+++ b/arch/x86/kernel/microcode_intel.c
@@ -0,0 +1,480 @@
+/*
+ *	Intel CPU Microcode Update Driver for Linux
+ *
+ *	Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
+ *		      2006	Shaohua Li <shaohua.li@intel.com>
+ *
+ *	This driver allows to upgrade microcode on Intel processors
+ *	belonging to IA-32 family - PentiumPro, Pentium II,
+ *	Pentium III, Xeon, Pentium 4, etc.
+ *
+ *	Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture
+ *	Software Developer's Manual
+ *	Order Number 253668 or free download from:
+ *
+ *	http://developer.intel.com/design/pentium4/manuals/253668.htm
+ *
+ *	For more information, go to http://www.urbanmyth.org/microcode
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	1.0	16 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *		Initial release.
+ *	1.01	18 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *		Added read() support + cleanups.
+ *	1.02	21 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *		Added 'device trimming' support. open(O_WRONLY) zeroes
+ *		and frees the saved copy of applied microcode.
+ *	1.03	29 Feb 2000, Tigran Aivazian <tigran@sco.com>
+ *		Made to use devfs (/dev/cpu/microcode) + cleanups.
+ *	1.04	06 Jun 2000, Simon Trimmer <simon@veritas.com>
+ *		Added misc device support (now uses both devfs and misc).
+ *		Added MICROCODE_IOCFREE ioctl to clear memory.
+ *	1.05	09 Jun 2000, Simon Trimmer <simon@veritas.com>
+ *		Messages for error cases (non Intel & no suitable microcode).
+ *	1.06	03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
+ *		Removed ->release(). Removed exclusive open and status bitmap.
+ *		Added microcode_rwsem to serialize read()/write()/ioctl().
+ *		Removed global kernel lock usage.
+ *	1.07	07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
+ *		Write 0 to 0x8B msr and then cpuid before reading revision,
+ *		so that it works even if there were no update done by the
+ *		BIOS. Otherwise, reading from 0x8B gives junk (which happened
+ *		to be 0 on my machine which is why it worked even when I
+ *		disabled update by the BIOS)
+ *		Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
+ *	1.08	11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
+ *			     Tigran Aivazian <tigran@veritas.com>
+ *		Intel Pentium 4 processor support and bugfixes.
+ *	1.09	30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
+ *		Bugfix for HT (Hyper-Threading) enabled processors
+ *		whereby processor resources are shared by all logical processors
+ *		in a single CPU package.
+ *	1.10	28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
+ *		Tigran Aivazian <tigran@veritas.com>,
+ *		Serialize updates as required on HT processors due to
+ *		speculative nature of implementation.
+ *	1.11	22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
+ *		Fix the panic when writing zero-length microcode chunk.
+ *	1.12	29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
+ *		Jun Nakajima <jun.nakajima@intel.com>
+ *		Support for the microcode updates in the new format.
+ *	1.13	10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
+ *		Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
+ *		because we no longer hold a copy of applied microcode
+ *		in kernel memory.
+ *	1.14	25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
+ *		Fix sigmatch() macro to handle old CPUs with pf == 0.
+ *		Thanks to Stuart Swales for pointing out this bug.
+ */
+#include <linux/capability.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/smp_lock.h>
+#include <linux/cpumask.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/miscdevice.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/firmware.h>
+#include <linux/platform_device.h>
+
+#include <asm/msr.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+#include <asm/microcode.h>
+
+MODULE_DESCRIPTION("Microcode Update Driver");
+MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
+MODULE_LICENSE("GPL");
+
+struct microcode_header_intel {
+	unsigned int            hdrver;
+	unsigned int            rev;
+	unsigned int            date;
+	unsigned int            sig;
+	unsigned int            cksum;
+	unsigned int            ldrver;
+	unsigned int            pf;
+	unsigned int            datasize;
+	unsigned int            totalsize;
+	unsigned int            reserved[3];
+};
+
+struct microcode_intel {
+	struct microcode_header_intel hdr;
+	unsigned int            bits[0];
+};
+
+/* microcode format is extended from prescott processors */
+struct extended_signature {
+	unsigned int            sig;
+	unsigned int            pf;
+	unsigned int            cksum;
+};
+
+struct extended_sigtable {
+	unsigned int            count;
+	unsigned int            cksum;
+	unsigned int            reserved[3];
+	struct extended_signature sigs[0];
+};
+
+#define DEFAULT_UCODE_DATASIZE 	(2000)
+#define MC_HEADER_SIZE		(sizeof(struct microcode_header_intel))
+#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
+#define EXT_HEADER_SIZE		(sizeof(struct extended_sigtable))
+#define EXT_SIGNATURE_SIZE	(sizeof(struct extended_signature))
+#define DWSIZE			(sizeof(u32))
+#define get_totalsize(mc) \
+	(((struct microcode_intel *)mc)->hdr.totalsize ? \
+	 ((struct microcode_intel *)mc)->hdr.totalsize : \
+	 DEFAULT_UCODE_TOTALSIZE)
+
+#define get_datasize(mc) \
+	(((struct microcode_intel *)mc)->hdr.datasize ? \
+	 ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
+
+#define sigmatch(s1, s2, p1, p2) \
+	(((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
+
+#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
+
+/* serialize access to the physical write to MSR 0x79 */
+static DEFINE_SPINLOCK(microcode_update_lock);
+
+static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
+{
+	struct cpuinfo_x86 *c = &cpu_data(cpu_num);
+	unsigned int val[2];
+
+	memset(csig, 0, sizeof(*csig));
+
+	if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
+	    cpu_has(c, X86_FEATURE_IA64)) {
+		printk(KERN_ERR "microcode: CPU%d not a capable Intel "
+			"processor\n", cpu_num);
+		return -1;
+	}
+
+	csig->sig = cpuid_eax(0x00000001);
+
+	if ((c->x86_model >= 5) || (c->x86 > 6)) {
+		/* get processor flags from MSR 0x17 */
+		rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
+		csig->pf = 1 << ((val[1] >> 18) & 7);
+	}
+
+	wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+	/* see notes above for revision 1.07.  Apparent chip bug */
+	sync_core();
+	/* get the current revision from MSR 0x8B */
+	rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
+	pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
+			csig->sig, csig->pf, csig->rev);
+
+	return 0;
+}
+
+static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
+{
+	return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
+}
+
+static inline int 
+update_match_revision(struct microcode_header_intel *mc_header,	int rev)
+{
+	return (mc_header->rev <= rev) ? 0 : 1;
+}
+
+static int microcode_sanity_check(void *mc)
+{
+	struct microcode_header_intel *mc_header = mc;
+	struct extended_sigtable *ext_header = NULL;
+	struct extended_signature *ext_sig;
+	unsigned long total_size, data_size, ext_table_size;
+	int sum, orig_sum, ext_sigcount = 0, i;
+
+	total_size = get_totalsize(mc_header);
+	data_size = get_datasize(mc_header);
+	if (data_size + MC_HEADER_SIZE > total_size) {
+		printk(KERN_ERR "microcode: error! "
+			"Bad data size in microcode data file\n");
+		return -EINVAL;
+	}
+
+	if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
+		printk(KERN_ERR "microcode: error! "
+			"Unknown microcode update format\n");
+		return -EINVAL;
+	}
+	ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
+	if (ext_table_size) {
+		if ((ext_table_size < EXT_HEADER_SIZE)
+		 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
+			printk(KERN_ERR "microcode: error! "
+				"Small exttable size in microcode data file\n");
+			return -EINVAL;
+		}
+		ext_header = mc + MC_HEADER_SIZE + data_size;
+		if (ext_table_size != exttable_size(ext_header)) {
+			printk(KERN_ERR "microcode: error! "
+				"Bad exttable size in microcode data file\n");
+			return -EFAULT;
+		}
+		ext_sigcount = ext_header->count;
+	}
+
+	/* check extended table checksum */
+	if (ext_table_size) {
+		int ext_table_sum = 0;
+		int *ext_tablep = (int *)ext_header;
+
+		i = ext_table_size / DWSIZE;
+		while (i--)
+			ext_table_sum += ext_tablep[i];
+		if (ext_table_sum) {
+			printk(KERN_WARNING "microcode: aborting, "
+				"bad extended signature table checksum\n");
+			return -EINVAL;
+		}
+	}
+
+	/* calculate the checksum */
+	orig_sum = 0;
+	i = (MC_HEADER_SIZE + data_size) / DWSIZE;
+	while (i--)
+		orig_sum += ((int *)mc)[i];
+	if (orig_sum) {
+		printk(KERN_ERR "microcode: aborting, bad checksum\n");
+		return -EINVAL;
+	}
+	if (!ext_table_size)
+		return 0;
+	/* check extended signature checksum */
+	for (i = 0; i < ext_sigcount; i++) {
+		ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
+			  EXT_SIGNATURE_SIZE * i;
+		sum = orig_sum
+			- (mc_header->sig + mc_header->pf + mc_header->cksum)
+			+ (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
+		if (sum) {
+			printk(KERN_ERR "microcode: aborting, bad checksum\n");
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+/*
+ * return 0 - no update found
+ * return 1 - found update
+ */
+static int
+get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
+{
+	struct microcode_header_intel *mc_header = mc;
+	struct extended_sigtable *ext_header;
+	unsigned long total_size = get_totalsize(mc_header);
+	int ext_sigcount, i;
+	struct extended_signature *ext_sig;
+
+	if (!update_match_revision(mc_header, rev))
+		return 0;
+
+	if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf))
+		return 1;
+
+	/* Look for ext. headers: */
+	if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
+		return 0;
+
+	ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
+	ext_sigcount = ext_header->count;
+	ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
+
+	for (i = 0; i < ext_sigcount; i++) {
+		if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf))
+			return 1;
+		ext_sig++;
+	}
+	return 0;
+}
+
+static void apply_microcode(int cpu)
+{
+	unsigned long flags;
+	unsigned int val[2];
+	int cpu_num = raw_smp_processor_id();
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	struct microcode_intel *mc_intel = uci->mc;
+
+	/* We should bind the task to the CPU */
+	BUG_ON(cpu_num != cpu);
+
+	if (mc_intel == NULL)
+		return;
+
+	/* serialize access to the physical write to MSR 0x79 */
+	spin_lock_irqsave(&microcode_update_lock, flags);
+
+	/* write microcode via MSR 0x79 */
+	wrmsr(MSR_IA32_UCODE_WRITE,
+	      (unsigned long) mc_intel->bits,
+	      (unsigned long) mc_intel->bits >> 16 >> 16);
+	wrmsr(MSR_IA32_UCODE_REV, 0, 0);
+
+	/* see notes above for revision 1.07.  Apparent chip bug */
+	sync_core();
+
+	/* get the current revision from MSR 0x8B */
+	rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
+
+	spin_unlock_irqrestore(&microcode_update_lock, flags);
+	if (val[1] != mc_intel->hdr.rev) {
+		printk(KERN_ERR "microcode: CPU%d update from revision "
+			"0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]);
+		return;
+	}
+	printk(KERN_INFO "microcode: CPU%d updated from revision "
+	       "0x%x to 0x%x, date = %04x-%02x-%02x \n",
+		cpu_num, uci->cpu_sig.rev, val[1],
+		mc_intel->hdr.date & 0xffff,
+		mc_intel->hdr.date >> 24,
+		(mc_intel->hdr.date >> 16) & 0xff);
+	uci->cpu_sig.rev = val[1];
+}
+
+static int generic_load_microcode(int cpu, void *data, size_t size,
+		int (*get_ucode_data)(void *, const void *, size_t))
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	u8 *ucode_ptr = data, *new_mc = NULL, *mc;
+	int new_rev = uci->cpu_sig.rev;
+	unsigned int leftover = size;
+
+	while (leftover) {
+		struct microcode_header_intel mc_header;
+		unsigned int mc_size;
+
+		if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
+			break;
+
+		mc_size = get_totalsize(&mc_header);
+		if (!mc_size || mc_size > leftover) {
+			printk(KERN_ERR "microcode: error!"
+					"Bad data in microcode data file\n");
+			break;
+		}
+
+		mc = vmalloc(mc_size);
+		if (!mc)
+			break;
+
+		if (get_ucode_data(mc, ucode_ptr, mc_size) ||
+		    microcode_sanity_check(mc) < 0) {
+			vfree(mc);
+			break;
+		}
+
+		if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
+			if (new_mc)
+				vfree(new_mc);
+			new_rev = mc_header.rev;
+			new_mc  = mc;
+		} else
+			vfree(mc);
+
+		ucode_ptr += mc_size;
+		leftover  -= mc_size;
+	}
+
+	if (new_mc) {
+		if (!leftover) {
+			if (uci->mc)
+				vfree(uci->mc);
+			uci->mc = (struct microcode_intel *)new_mc;
+			pr_debug("microcode: CPU%d found a matching microcode update with"
+				 " version 0x%x (current=0x%x)\n",
+				cpu, new_rev, uci->cpu_sig.rev);
+		} else
+			vfree(new_mc);
+	}
+
+	return (int)leftover;
+}
+
+static int get_ucode_fw(void *to, const void *from, size_t n)
+{
+	memcpy(to, from, n);
+	return 0;
+}
+
+static int request_microcode_fw(int cpu, struct device *device)
+{
+	char name[30];
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	const struct firmware *firmware;
+	int ret;
+
+	/* We should bind the task to the CPU */
+	BUG_ON(cpu != raw_smp_processor_id());
+	sprintf(name, "intel-ucode/%02x-%02x-%02x",
+		c->x86, c->x86_model, c->x86_mask);
+	ret = request_firmware(&firmware, name, device);
+	if (ret) {
+		pr_debug("microcode: data file %s load failed\n", name);
+		return ret;
+	}
+
+	ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
+			&get_ucode_fw);
+
+	release_firmware(firmware);
+
+	return ret;
+}
+
+static int get_ucode_user(void *to, const void *from, size_t n)
+{
+	return copy_from_user(to, from, n);
+}
+
+static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+{
+	/* We should bind the task to the CPU */
+	BUG_ON(cpu != raw_smp_processor_id());
+
+	return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user);
+}
+
+static void microcode_fini_cpu(int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+
+	vfree(uci->mc);
+	uci->mc = NULL;
+}
+
+struct microcode_ops microcode_intel_ops = {
+	.request_microcode_user		  = request_microcode_user,
+	.request_microcode_fw             = request_microcode_fw,
+	.collect_cpu_info                 = collect_cpu_info,
+	.apply_microcode                  = apply_microcode,
+	.microcode_fini_cpu               = microcode_fini_cpu,
+};
+
+struct microcode_ops * __init init_intel_microcode(void)
+{
+	return &microcode_intel_ops;
+}
+
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index fdfdc550b366..efc2f361fe85 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -238,7 +238,7 @@ static struct dmi_system_id __devinitdata mmconf_dmi_table[] = {
 	{}
 };
 
-void __init check_enable_amd_mmconf_dmi(void)
+void __cpuinit check_enable_amd_mmconf_dmi(void)
 {
 	dmi_check_system(mmconf_dmi_table);
 }
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index a888e67f5874..6ba87830d4b1 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -22,6 +22,7 @@
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/bug.h>
 
@@ -150,7 +151,8 @@ int module_finalize(const Elf_Ehdr *hdr,
                     const Elf_Shdr *sechdrs,
                     struct module *me)
 {
-	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
+	const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
+		*para = NULL;
 	char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
 
 	for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -160,6 +162,8 @@ int module_finalize(const Elf_Ehdr *hdr,
 			alt = s;
 		if (!strcmp(".smp_locks", secstrings + s->sh_name))
 			locks= s;
+		if (!strcmp(".parainstructions", secstrings + s->sh_name))
+			para = s;
 	}
 
 	if (alt) {
@@ -175,6 +179,11 @@ int module_finalize(const Elf_Ehdr *hdr,
 					    tseg, tseg + text->sh_size);
 	}
 
+	if (para) {
+		void *pseg = (void *)para->sh_addr;
+		apply_paravirt(pseg, pseg + para->sh_size);
+	}
+
 	return module_bug_finalize(hdr, sechdrs, me);
 }
 
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 3b25e49380c6..f98f4e1dba09 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,6 +27,7 @@
 #include <asm/bios_ebda.h>
 #include <asm/e820.h>
 #include <asm/trampoline.h>
+#include <asm/setup.h>
 
 #include <mach_apic.h>
 #ifdef CONFIG_X86_32
@@ -48,77 +49,7 @@ static int __init mpf_checksum(unsigned char *mp, int len)
 	return sum & 0xFF;
 }
 
-#ifdef CONFIG_X86_NUMAQ
-int found_numaq;
-/*
- * Have to match translation table entries to main table entries by counter
- * hence the mpc_record variable .... can't see a less disgusting way of
- * doing this ....
- */
-struct mpc_config_translation {
-	unsigned char mpc_type;
-	unsigned char trans_len;
-	unsigned char trans_type;
-	unsigned char trans_quad;
-	unsigned char trans_global;
-	unsigned char trans_local;
-	unsigned short trans_reserved;
-};
-
-
-static int mpc_record;
-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
-    __cpuinitdata;
-
-static inline int generate_logical_apicid(int quad, int phys_apicid)
-{
-	return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
-}
-
-
-static inline int mpc_apic_id(struct mpc_config_processor *m,
-			struct mpc_config_translation *translation_record)
-{
-	int quad = translation_record->trans_quad;
-	int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
-
-	printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
-	       m->mpc_apicid,
-	       (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
-	       (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
-	       m->mpc_apicver, quad, logical_apicid);
-	return logical_apicid;
-}
-
-int mp_bus_id_to_node[MAX_MP_BUSSES];
-
-int mp_bus_id_to_local[MAX_MP_BUSSES];
-
-static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
-	struct mpc_config_translation *translation)
-{
-	int quad = translation->trans_quad;
-	int local = translation->trans_local;
-
-	mp_bus_id_to_node[m->mpc_busid] = quad;
-	mp_bus_id_to_local[m->mpc_busid] = local;
-	printk(KERN_INFO "Bus #%d is %s (node %d)\n",
-	       m->mpc_busid, name, quad);
-}
-
-int quad_local_to_mp_bus_id [NR_CPUS/4][4];
-static void mpc_oem_pci_bus(struct mpc_config_bus *m,
-	struct mpc_config_translation *translation)
-{
-	int quad = translation->trans_quad;
-	int local = translation->trans_local;
-
-	quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
-}
-
-#endif
-
-static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
+static void __init MP_processor_info(struct mpc_config_processor *m)
 {
 	int apicid;
 	char *bootup_cpu = "";
@@ -127,14 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
 		disabled_cpus++;
 		return;
 	}
-#ifdef CONFIG_X86_NUMAQ
-	if (found_numaq)
-		apicid = mpc_apic_id(m, translation_table[mpc_record]);
+
+	if (x86_quirks->mpc_apic_id)
+		apicid = x86_quirks->mpc_apic_id(m);
 	else
 		apicid = m->mpc_apicid;
-#else
-	apicid = m->mpc_apicid;
-#endif
+
 	if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
 		bootup_cpu = " (Bootup-CPU)";
 		boot_cpu_physical_apicid = m->mpc_apicid;
@@ -151,12 +80,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
 	memcpy(str, m->mpc_bustype, 6);
 	str[6] = 0;
 
-#ifdef CONFIG_X86_NUMAQ
-	if (found_numaq)
-		mpc_oem_bus_info(m, str, translation_table[mpc_record]);
-#else
-	printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
-#endif
+	if (x86_quirks->mpc_oem_bus_info)
+		x86_quirks->mpc_oem_bus_info(m, str);
+	else
+		apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
 
 #if MAX_MP_BUSSES < 256
 	if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -173,10 +100,9 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
 #endif
 	} else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
-#ifdef CONFIG_X86_NUMAQ
-		if (found_numaq)
-			mpc_oem_pci_bus(m, translation_table[mpc_record]);
-#endif
+		if (x86_quirks->mpc_oem_pci_bus)
+			x86_quirks->mpc_oem_pci_bus(m);
+
 		clear_bit(m->mpc_busid, mp_bus_not_pci);
 #if defined(CONFIG_EISA) || defined (CONFIG_MCA)
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
@@ -228,7 +154,7 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
 
 static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
 {
-	printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
+	apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
 		" IRQ %02x, APIC ID %x, APIC INT %02x\n",
 		m->mpc_irqtype, m->mpc_irqflag & 3,
 		(m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
@@ -237,7 +163,7 @@ static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
 
 static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
 {
-	printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x,"
+	apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
 		" IRQ %02x, APIC ID %x, APIC INT %02x\n",
 		mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
 		(mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
@@ -309,90 +235,13 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
 
 static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
 {
-	printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x,"
+	apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
 		" IRQ %02x, APIC ID %x, APIC LINT %02x\n",
 		m->mpc_irqtype, m->mpc_irqflag & 3,
 		(m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
 		m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
 }
 
-#ifdef CONFIG_X86_NUMAQ
-static void __init MP_translation_info(struct mpc_config_translation *m)
-{
-	printk(KERN_INFO
-	       "Translation: record %d, type %d, quad %d, global %d, local %d\n",
-	       mpc_record, m->trans_type, m->trans_quad, m->trans_global,
-	       m->trans_local);
-
-	if (mpc_record >= MAX_MPC_ENTRY)
-		printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
-	else
-		translation_table[mpc_record] = m;	/* stash this for later */
-	if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
-		node_set_online(m->trans_quad);
-}
-
-/*
- * Read/parse the MPC oem tables
- */
-
-static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
-				    unsigned short oemsize)
-{
-	int count = sizeof(*oemtable);	/* the header size */
-	unsigned char *oemptr = ((unsigned char *)oemtable) + count;
-
-	mpc_record = 0;
-	printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
-	       oemtable);
-	if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
-		printk(KERN_WARNING
-		       "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
-		       oemtable->oem_signature[0], oemtable->oem_signature[1],
-		       oemtable->oem_signature[2], oemtable->oem_signature[3]);
-		return;
-	}
-	if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
-		printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
-		return;
-	}
-	while (count < oemtable->oem_length) {
-		switch (*oemptr) {
-		case MP_TRANSLATION:
-			{
-				struct mpc_config_translation *m =
-				    (struct mpc_config_translation *)oemptr;
-				MP_translation_info(m);
-				oemptr += sizeof(*m);
-				count += sizeof(*m);
-				++mpc_record;
-				break;
-			}
-		default:
-			{
-				printk(KERN_WARNING
-				       "Unrecognised OEM table entry type! - %d\n",
-				       (int)*oemptr);
-				return;
-			}
-		}
-	}
-}
-
-void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
-				 char *productid)
-{
-	if (strncmp(oem, "IBM NUMA", 8))
-		printk("Warning!  Not a NUMA-Q system!\n");
-	else
-		found_numaq = 1;
-
-	if (mpc->mpc_oemptr)
-		smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
-				 mpc->mpc_oemsize);
-}
-#endif /* CONFIG_X86_NUMAQ */
-
 /*
  * Read/parse the MPC
  */
@@ -457,7 +306,6 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 	} else
 		mps_oem_check(mpc, oem, str);
 #endif
-
 	/* save the local APIC address, it might be non-default */
 	if (!acpi_lapic)
 		mp_lapic_addr = mpc->mpc_lapic;
@@ -465,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 	if (early)
 		return 1;
 
+	if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
+		struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
+		x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
+	}
+
 	/*
 	 *      Now process the configuration blocks.
 	 */
-#ifdef CONFIG_X86_NUMAQ
-	mpc_record = 0;
-#endif
+	if (x86_quirks->mpc_record)
+		*x86_quirks->mpc_record = 0;
+
 	while (count < mpc->mpc_length) {
 		switch (*mpt) {
 		case MP_PROCESSOR:
@@ -536,16 +389,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
 			count = mpc->mpc_length;
 			break;
 		}
-#ifdef CONFIG_X86_NUMAQ
-		++mpc_record;
-#endif
+		if (x86_quirks->mpc_record)
+			(*x86_quirks->mpc_record)++;
 	}
 
 #ifdef CONFIG_X86_GENERICARCH
        generic_bigsmp_probe();
 #endif
 
+#ifdef CONFIG_X86_32
 	setup_apic_routing();
+#endif
 	if (!num_processors)
 		printk(KERN_ERR "MPTABLE: no processors registered!\n");
 	return num_processors;
@@ -632,7 +486,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
 }
 
 
-static void construct_ioapic_table(int mpc_default_type)
+static void __init construct_ioapic_table(int mpc_default_type)
 {
 	struct mpc_config_ioapic ioapic;
 	struct mpc_config_bus bus;
@@ -677,7 +531,7 @@ static void construct_ioapic_table(int mpc_default_type)
 	construct_default_ioirq_mptable(mpc_default_type);
 }
 #else
-static inline void construct_ioapic_table(int mpc_default_type) { }
+static inline void __init construct_ioapic_table(int mpc_default_type) { }
 #endif
 
 static inline void __init construct_default_ISA_mptable(int mpc_default_type)
@@ -726,20 +580,14 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
 static struct intel_mp_floating *mpf_found;
 
 /*
- * Machine specific quirk for finding the SMP config before other setup
- * activities destroy the table:
- */
-int (*mach_get_smp_config_quirk)(unsigned int early);
-
-/*
  * Scan the memory blocks for an SMP configuration block.
  */
 static void __init __get_smp_config(unsigned int early)
 {
 	struct intel_mp_floating *mpf = mpf_found;
 
-	if (mach_get_smp_config_quirk) {
-		if (mach_get_smp_config_quirk(early))
+	if (x86_quirks->mach_get_smp_config) {
+		if (x86_quirks->mach_get_smp_config(early))
 			return;
 	}
 	if (acpi_lapic && early)
@@ -849,7 +697,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 	unsigned int *bp = phys_to_virt(base);
 	struct intel_mp_floating *mpf;
 
-	printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length);
+	apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
+			bp, length);
 	BUILD_BUG_ON(sizeof(*mpf) != 16);
 
 	while (length > 0) {
@@ -899,14 +748,12 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
 	return 0;
 }
 
-int (*mach_find_smp_config_quirk)(unsigned int reserve);
-
 static void __init __find_smp_config(unsigned int reserve)
 {
 	unsigned int address;
 
-	if (mach_find_smp_config_quirk) {
-		if (mach_find_smp_config_quirk(reserve))
+	if (x86_quirks->mach_find_smp_config) {
+		if (x86_quirks->mach_find_smp_config(reserve))
 			return;
 	}
 	/*
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index a153b3905f60..82a7c7ed6d45 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -72,21 +72,28 @@ static ssize_t msr_read(struct file *file, char __user *buf,
 	u32 data[2];
 	u32 reg = *ppos;
 	int cpu = iminor(file->f_path.dentry->d_inode);
-	int err;
+	int err = 0;
+	ssize_t bytes = 0;
 
 	if (count % 8)
 		return -EINVAL;	/* Invalid chunk size */
 
 	for (; count; count -= 8) {
 		err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
-		if (err)
-			return -EIO;
-		if (copy_to_user(tmp, &data, 8))
-			return -EFAULT;
+		if (err) {
+			if (err == -EFAULT) /* Fix idiotic error code */
+				err = -EIO;
+			break;
+		}
+		if (copy_to_user(tmp, &data, 8)) {
+			err = -EFAULT;
+			break;
+		}
 		tmp += 2;
+		bytes += 8;
 	}
 
-	return ((char __user *)tmp) - buf;
+	return bytes ? bytes : err;
 }
 
 static ssize_t msr_write(struct file *file, const char __user *buf,
@@ -96,21 +103,28 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
 	u32 data[2];
 	u32 reg = *ppos;
 	int cpu = iminor(file->f_path.dentry->d_inode);
-	int err;
+	int err = 0;
+	ssize_t bytes = 0;
 
 	if (count % 8)
 		return -EINVAL;	/* Invalid chunk size */
 
 	for (; count; count -= 8) {
-		if (copy_from_user(&data, tmp, 8))
-			return -EFAULT;
+		if (copy_from_user(&data, tmp, 8)) {
+			err = -EFAULT;
+			break;
+		}
 		err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
-		if (err)
-			return -EIO;
+		if (err) {
+			if (err == -EFAULT) /* Fix idiotic error code */
+				err = -EIO;
+			break;
+		}
 		tmp += 2;
+		bytes += 8;
 	}
 
-	return ((char __user *)tmp) - buf;
+	return bytes ? bytes : err;
 }
 
 static int msr_open(struct inode *inode, struct file *file)
@@ -131,7 +145,7 @@ static int msr_open(struct inode *inode, struct file *file)
 		ret = -EIO;	/* MSR not supported */
 out:
 	unlock_kernel();
-	return 0;
+	return ret;
 }
 
 /*
@@ -149,7 +163,7 @@ static int __cpuinit msr_device_create(int cpu)
 {
 	struct device *dev;
 
-	dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu),
+	dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL,
 			    "msr%d", cpu);
 	return IS_ERR(dev) ? PTR_ERR(dev) : 0;
 }
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ec024b3baad0..2c97f07f1c2c 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -114,6 +114,23 @@ static __init void nmi_cpu_busy(void *data)
 }
 #endif
 
+static void report_broken_nmi(int cpu, int *prev_nmi_count)
+{
+	printk(KERN_CONT "\n");
+
+	printk(KERN_WARNING
+		"WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
+			cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
+
+	printk(KERN_WARNING
+		"Please report this to bugzilla.kernel.org,\n");
+	printk(KERN_WARNING
+		"and attach the output of the 'dmesg' command.\n");
+
+	per_cpu(wd_enabled, cpu) = 0;
+	atomic_dec(&nmi_active);
+}
+
 int __init check_nmi_watchdog(void)
 {
 	unsigned int *prev_nmi_count;
@@ -141,15 +158,8 @@ int __init check_nmi_watchdog(void)
 	for_each_online_cpu(cpu) {
 		if (!per_cpu(wd_enabled, cpu))
 			continue;
-		if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
-			printk(KERN_WARNING "WARNING: CPU#%d: NMI "
-				"appears to be stuck (%d->%d)!\n",
-				cpu,
-				prev_nmi_count[cpu],
-				get_nmi_count(cpu));
-			per_cpu(wd_enabled, cpu) = 0;
-			atomic_dec(&nmi_active);
-		}
+		if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
+			report_broken_nmi(cpu, prev_nmi_count);
 	}
 	endflag = 1;
 	if (!atomic_read(&nmi_active)) {
@@ -263,7 +273,7 @@ late_initcall(init_lapic_nmi_sysfs);
 
 static void __acpi_nmi_enable(void *__unused)
 {
-	apic_write_around(APIC_LVT0, APIC_DM_NMI);
+	apic_write(APIC_LVT0, APIC_DM_NMI);
 }
 
 /*
@@ -277,7 +287,7 @@ void acpi_nmi_enable(void)
 
 static void __acpi_nmi_disable(void *__unused)
 {
-	apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
+	apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
 }
 
 /*
@@ -289,6 +299,15 @@ void acpi_nmi_disable(void)
 		on_each_cpu(__acpi_nmi_disable, NULL, 1);
 }
 
+/*
+ * This function is called as soon the LAPIC NMI watchdog driver has everything
+ * in place and it's ready to check if the NMIs belong to the NMI watchdog
+ */
+void cpu_nmi_set_wd_enabled(void)
+{
+	__get_cpu_var(wd_enabled) = 1;
+}
+
 void setup_apic_nmi_watchdog(void *unused)
 {
 	if (__get_cpu_var(wd_enabled))
@@ -301,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused)
 
 	switch (nmi_watchdog) {
 	case NMI_LOCAL_APIC:
-		 /* enable it before to avoid race with handler */
-		__get_cpu_var(wd_enabled) = 1;
 		if (lapic_watchdog_init(nmi_hz) < 0) {
 			__get_cpu_var(wd_enabled) = 0;
 			return;
@@ -448,6 +465,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
 
 #ifdef CONFIG_SYSCTL
 
+static int __init setup_unknown_nmi_panic(char *str)
+{
+	unknown_nmi_panic = 1;
+	return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
+
 static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
 {
 	unsigned char reason = get_nmi_reason();
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index a23e8233b9ac..4caff39078e0 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -33,6 +33,7 @@
 #include <asm/processor.h>
 #include <asm/mpspec.h>
 #include <asm/e820.h>
+#include <asm/setup.h>
 
 #define	MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
 
@@ -71,6 +72,195 @@ static void __init smp_dump_qct(void)
 	}
 }
 
+
+void __cpuinit numaq_tsc_disable(void)
+{
+	if (!found_numaq)
+		return;
+
+	if (num_online_nodes() > 1) {
+		printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
+		setup_clear_cpu_cap(X86_FEATURE_TSC);
+	}
+}
+
+static int __init numaq_pre_time_init(void)
+{
+	numaq_tsc_disable();
+	return 0;
+}
+
+int found_numaq;
+/*
+ * Have to match translation table entries to main table entries by counter
+ * hence the mpc_record variable .... can't see a less disgusting way of
+ * doing this ....
+ */
+struct mpc_config_translation {
+	unsigned char mpc_type;
+	unsigned char trans_len;
+	unsigned char trans_type;
+	unsigned char trans_quad;
+	unsigned char trans_global;
+	unsigned char trans_local;
+	unsigned short trans_reserved;
+};
+
+/* x86_quirks member */
+static int mpc_record;
+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
+    __cpuinitdata;
+
+static inline int generate_logical_apicid(int quad, int phys_apicid)
+{
+	return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
+}
+
+/* x86_quirks member */
+static int mpc_apic_id(struct mpc_config_processor *m)
+{
+	int quad = translation_table[mpc_record]->trans_quad;
+	int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
+
+	printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
+	       m->mpc_apicid,
+	       (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
+	       (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
+	       m->mpc_apicver, quad, logical_apicid);
+	return logical_apicid;
+}
+
+int mp_bus_id_to_node[MAX_MP_BUSSES];
+
+int mp_bus_id_to_local[MAX_MP_BUSSES];
+
+/* x86_quirks member */
+static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name)
+{
+	int quad = translation_table[mpc_record]->trans_quad;
+	int local = translation_table[mpc_record]->trans_local;
+
+	mp_bus_id_to_node[m->mpc_busid] = quad;
+	mp_bus_id_to_local[m->mpc_busid] = local;
+	printk(KERN_INFO "Bus #%d is %s (node %d)\n",
+	       m->mpc_busid, name, quad);
+}
+
+int quad_local_to_mp_bus_id [NR_CPUS/4][4];
+
+/* x86_quirks member */
+static void mpc_oem_pci_bus(struct mpc_config_bus *m)
+{
+	int quad = translation_table[mpc_record]->trans_quad;
+	int local = translation_table[mpc_record]->trans_local;
+
+	quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
+}
+
+static void __init MP_translation_info(struct mpc_config_translation *m)
+{
+	printk(KERN_INFO
+	       "Translation: record %d, type %d, quad %d, global %d, local %d\n",
+	       mpc_record, m->trans_type, m->trans_quad, m->trans_global,
+	       m->trans_local);
+
+	if (mpc_record >= MAX_MPC_ENTRY)
+		printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
+	else
+		translation_table[mpc_record] = m;	/* stash this for later */
+	if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
+		node_set_online(m->trans_quad);
+}
+
+static int __init mpf_checksum(unsigned char *mp, int len)
+{
+	int sum = 0;
+
+	while (len--)
+		sum += *mp++;
+
+	return sum & 0xFF;
+}
+
+/*
+ * Read/parse the MPC oem tables
+ */
+
+static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
+				    unsigned short oemsize)
+{
+	int count = sizeof(*oemtable);	/* the header size */
+	unsigned char *oemptr = ((unsigned char *)oemtable) + count;
+
+	mpc_record = 0;
+	printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
+	       oemtable);
+	if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
+		printk(KERN_WARNING
+		       "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
+		       oemtable->oem_signature[0], oemtable->oem_signature[1],
+		       oemtable->oem_signature[2], oemtable->oem_signature[3]);
+		return;
+	}
+	if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
+		printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
+		return;
+	}
+	while (count < oemtable->oem_length) {
+		switch (*oemptr) {
+		case MP_TRANSLATION:
+			{
+				struct mpc_config_translation *m =
+				    (struct mpc_config_translation *)oemptr;
+				MP_translation_info(m);
+				oemptr += sizeof(*m);
+				count += sizeof(*m);
+				++mpc_record;
+				break;
+			}
+		default:
+			{
+				printk(KERN_WARNING
+				       "Unrecognised OEM table entry type! - %d\n",
+				       (int)*oemptr);
+				return;
+			}
+		}
+	}
+}
+
+static int __init numaq_setup_ioapic_ids(void)
+{
+	/* so can skip it */
+	return 1;
+}
+
+static struct x86_quirks numaq_x86_quirks __initdata = {
+	.arch_pre_time_init	= numaq_pre_time_init,
+	.arch_time_init		= NULL,
+	.arch_pre_intr_init	= NULL,
+	.arch_memory_setup	= NULL,
+	.arch_intr_init		= NULL,
+	.arch_trap_init		= NULL,
+	.mach_get_smp_config	= NULL,
+	.mach_find_smp_config	= NULL,
+	.mpc_record		= &mpc_record,
+	.mpc_apic_id		= mpc_apic_id,
+	.mpc_oem_bus_info	= mpc_oem_bus_info,
+	.mpc_oem_pci_bus	= mpc_oem_pci_bus,
+	.smp_read_mpc_oem	= smp_read_mpc_oem,
+	.setup_ioapic_ids	= numaq_setup_ioapic_ids,
+};
+
+void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
+				 char *productid)
+{
+	if (strncmp(oem, "IBM NUMA", 8))
+		printk("Warning!  Not a NUMA-Q system!\n");
+	else
+		found_numaq = 1;
+}
+
 static __init void early_check_numaq(void)
 {
 	/*
@@ -82,6 +272,9 @@ static __init void early_check_numaq(void)
 	 */
 	if (smp_found_config)
 		early_get_smp_config();
+
+	if (found_numaq)
+		x86_quirks = &numaq_x86_quirks;
 }
 
 int __init get_memcfg_numaq(void)
@@ -92,14 +285,3 @@ int __init get_memcfg_numaq(void)
 	smp_dump_qct();
 	return 1;
 }
-
-void __init numaq_tsc_disable(void)
-{
-	if (!found_numaq)
-		return;
-
-	if (num_online_nodes() > 1) {
-		printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
-		setup_clear_cpu_cap(X86_FEATURE_TSC);
-	}
-}
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 3e6672274807..7a13fac63a1f 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd);
 static void __init platform_detect(void)
 {
 	size_t propsize;
-	u32 rev;
+	__be32 rev;
 
 	if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
 			&propsize) || propsize != 4) {
 		printk(KERN_ERR "ofw: getprop call failed!\n");
-		rev = 0;
+		rev = cpu_to_be32(0);
 	}
 	olpc_platform_info.boardrev = be32_to_cpu(rev);
 }
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
 static void __init platform_detect(void)
 {
 	/* stopgap until OFW support is added to the kernel */
-	olpc_platform_info.boardrev = be32_to_cpu(0xc2);
+	olpc_platform_info.boardrev = 0xc2;
 }
 #endif
 
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
new file mode 100644
index 000000000000..0e9f1982b1dd
--- /dev/null
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -0,0 +1,37 @@
+/*
+ * Split spinlock implementation out into its own file, so it can be
+ * compiled in a FTRACE-compatible way.
+ */
+#include <linux/spinlock.h>
+#include <linux/module.h>
+
+#include <asm/paravirt.h>
+
+static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
+{
+	__raw_spin_lock(lock);
+}
+
+struct pv_lock_ops pv_lock_ops = {
+#ifdef CONFIG_SMP
+	.spin_is_locked = __ticket_spin_is_locked,
+	.spin_is_contended = __ticket_spin_is_contended,
+
+	.spin_lock = __ticket_spin_lock,
+	.spin_lock_flags = default_spin_lock_flags,
+	.spin_trylock = __ticket_spin_trylock,
+	.spin_unlock = __ticket_spin_unlock,
+#endif
+};
+EXPORT_SYMBOL(pv_lock_ops);
+
+void __init paravirt_use_bytelocks(void)
+{
+#ifdef CONFIG_SMP
+	pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
+	pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
+	pv_lock_ops.spin_lock = __byte_spin_lock;
+	pv_lock_ops.spin_trylock = __byte_spin_trylock;
+	pv_lock_ops.spin_unlock = __byte_spin_unlock;
+#endif
+}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e0f571d58c19..e4c8fb608873 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -29,6 +29,7 @@
 #include <asm/desc.h>
 #include <asm/setup.h>
 #include <asm/arch_hooks.h>
+#include <asm/pgtable.h>
 #include <asm/time.h>
 #include <asm/pgalloc.h>
 #include <asm/irq.h>
@@ -123,6 +124,7 @@ static void *get_call_destination(u8 type)
 		.pv_irq_ops = pv_irq_ops,
 		.pv_apic_ops = pv_apic_ops,
 		.pv_mmu_ops = pv_mmu_ops,
+		.pv_lock_ops = pv_lock_ops,
 	};
 	return *((void **)&tmpl + type);
 }
@@ -317,6 +319,7 @@ struct pv_cpu_ops pv_cpu_ops = {
 #endif
 	.wbinvd = native_wbinvd,
 	.read_msr = native_read_msr_safe,
+	.read_msr_amd = native_read_msr_amd_safe,
 	.write_msr = native_write_msr_safe,
 	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
@@ -335,6 +338,10 @@ struct pv_cpu_ops pv_cpu_ops = {
 	.write_ldt_entry = native_write_ldt_entry,
 	.write_gdt_entry = native_write_gdt_entry,
 	.write_idt_entry = native_write_idt_entry,
+
+	.alloc_ldt = paravirt_nop,
+	.free_ldt = paravirt_nop,
+
 	.load_sp0 = native_load_sp0,
 
 #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
@@ -360,9 +367,6 @@ struct pv_cpu_ops pv_cpu_ops = {
 
 struct pv_apic_ops pv_apic_ops = {
 #ifdef CONFIG_X86_LOCAL_APIC
-	.apic_write = native_apic_write,
-	.apic_write_atomic = native_apic_write_atomic,
-	.apic_read = native_apic_read,
 	.setup_boot_clock = setup_boot_APIC_clock,
 	.setup_secondary_clock = setup_secondary_APIC_clock,
 	.startup_ipi_hook = paravirt_nop,
@@ -373,6 +377,9 @@ struct pv_mmu_ops pv_mmu_ops = {
 #ifndef CONFIG_X86_64
 	.pagetable_setup_start = native_pagetable_setup_start,
 	.pagetable_setup_done = native_pagetable_setup_done,
+#else
+	.pagetable_setup_start = paravirt_nop,
+	.pagetable_setup_done = paravirt_nop,
 #endif
 
 	.read_cr2 = native_read_cr2,
@@ -428,7 +435,7 @@ struct pv_mmu_ops pv_mmu_ops = {
 #endif /* PAGETABLE_LEVELS >= 3 */
 
 	.pte_val = native_pte_val,
-	.pte_flags = native_pte_val,
+	.pte_flags = native_pte_flags,
 	.pgd_val = native_pgd_val,
 
 	.make_pte = native_make_pte,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 58262218781b..9fe644f4861d 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 			start = start_##ops##_##x;		\
 			end = end_##ops##_##x;			\
 			goto patch_site
-	switch(type) {
+	switch (type) {
 		PATCH_SITE(pv_irq_ops, irq_disable);
 		PATCH_SITE(pv_irq_ops, irq_enable);
 		PATCH_SITE(pv_irq_ops, restore_fl);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 6959b5c45df4..e1e731d78f38 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -29,6 +29,7 @@
 #include <linux/mm.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
+#include <linux/crash_dump.h>
 #include <linux/dma-mapping.h>
 #include <linux/bitops.h>
 #include <linux/pci_ids.h>
@@ -36,7 +37,8 @@
 #include <linux/delay.h>
 #include <linux/scatterlist.h>
 #include <linux/iommu-helper.h>
-#include <asm/gart.h>
+
+#include <asm/iommu.h>
 #include <asm/calgary.h>
 #include <asm/tce.h>
 #include <asm/pci-direct.h>
@@ -167,6 +169,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl);
 static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
 static void calioc2_tce_cache_blast(struct iommu_table *tbl);
 static void calioc2_dump_error_regs(struct iommu_table *tbl);
+static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
+static void get_tce_space_from_tar(void);
 
 static struct cal_chipset_ops calgary_chip_ops = {
 	.handle_quirks = calgary_handle_quirks,
@@ -213,16 +217,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap,
 
 #endif /* CONFIG_IOMMU_DEBUG */
 
-static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
-{
-	unsigned int npages;
-
-	npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK);
-	npages >>= PAGE_SHIFT;
-
-	return npages;
-}
-
 static inline int translation_enabled(struct iommu_table *tbl)
 {
 	/* only PHBs with translation enabled have an IOMMU table */
@@ -257,7 +251,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
 			       badbit, tbl, start_addr, npages);
 	}
 
-	set_bit_string(tbl->it_map, index, npages);
+	iommu_area_reserve(tbl->it_map, index, npages);
 
 	spin_unlock_irqrestore(&tbl->it_lock, flags);
 }
@@ -339,9 +333,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	/* were we called with bad_dma_address? */
 	badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
 	if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
-		printk(KERN_ERR "Calgary: driver tried unmapping bad DMA "
+		WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
 		       "address 0x%Lx\n", dma_addr);
-		WARN_ON(1);
 		return;
 	}
 
@@ -405,27 +398,11 @@ static void calgary_unmap_sg(struct device *dev,
 		if (dmalen == 0)
 			break;
 
-		npages = num_dma_pages(dma, dmalen);
+		npages = iommu_num_pages(dma, dmalen, PAGE_SIZE);
 		iommu_free(tbl, dma, npages);
 	}
 }
 
-static int calgary_nontranslate_map_sg(struct device* dev,
-	struct scatterlist *sg, int nelems, int direction)
-{
-	struct scatterlist *s;
-	int i;
-
-	for_each_sg(sg, s, nelems, i) {
-		struct page *p = sg_page(s);
-
-		BUG_ON(!p);
-		s->dma_address = virt_to_bus(sg_virt(s));
-		s->dma_length = s->length;
-	}
-	return nelems;
-}
-
 static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
 	int nelems, int direction)
 {
@@ -436,14 +413,11 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
 	unsigned long entry;
 	int i;
 
-	if (!translation_enabled(tbl))
-		return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
-
 	for_each_sg(sg, s, nelems, i) {
 		BUG_ON(!sg_page(s));
 
 		vaddr = (unsigned long) sg_virt(s);
-		npages = num_dma_pages(vaddr, s->length);
+		npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
 
 		entry = iommu_range_alloc(dev, tbl, npages);
 		if (entry == bad_dma_address) {
@@ -474,21 +448,15 @@ error:
 static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
 	size_t size, int direction)
 {
-	dma_addr_t dma_handle = bad_dma_address;
 	void *vaddr = phys_to_virt(paddr);
 	unsigned long uaddr;
 	unsigned int npages;
 	struct iommu_table *tbl = find_iommu_table(dev);
 
 	uaddr = (unsigned long)vaddr;
-	npages = num_dma_pages(uaddr, size);
+	npages = iommu_num_pages(uaddr, size, PAGE_SIZE);
 
-	if (translation_enabled(tbl))
-		dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
-	else
-		dma_handle = virt_to_bus(vaddr);
-
-	return dma_handle;
+	return iommu_alloc(dev, tbl, vaddr, npages, direction);
 }
 
 static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
@@ -497,10 +465,7 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
 	struct iommu_table *tbl = find_iommu_table(dev);
 	unsigned int npages;
 
-	if (!translation_enabled(tbl))
-		return;
-
-	npages = num_dma_pages(dma_handle, size);
+	npages = iommu_num_pages(dma_handle, size, PAGE_SIZE);
 	iommu_free(tbl, dma_handle, npages);
 }
 
@@ -516,24 +481,20 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
 	npages = size >> PAGE_SHIFT;
 	order = get_order(size);
 
+	flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
 	/* alloc enough pages (and possibly more) */
 	ret = (void *)__get_free_pages(flag, order);
 	if (!ret)
 		goto error;
 	memset(ret, 0, size);
 
-	if (translation_enabled(tbl)) {
-		/* set up tces to cover the allocated range */
-		mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
-		if (mapping == bad_dma_address)
-			goto free;
-
-		*dma_handle = mapping;
-	} else /* non translated slot */
-		*dma_handle = virt_to_bus(ret);
-
+	/* set up tces to cover the allocated range */
+	mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
+	if (mapping == bad_dma_address)
+		goto free;
+	*dma_handle = mapping;
 	return ret;
-
 free:
 	free_pages((unsigned long)ret, get_order(size));
 	ret = NULL;
@@ -541,8 +502,22 @@ error:
 	return ret;
 }
 
-static const struct dma_mapping_ops calgary_dma_ops = {
+static void calgary_free_coherent(struct device *dev, size_t size,
+				  void *vaddr, dma_addr_t dma_handle)
+{
+	unsigned int npages;
+	struct iommu_table *tbl = find_iommu_table(dev);
+
+	size = PAGE_ALIGN(size);
+	npages = size >> PAGE_SHIFT;
+
+	iommu_free(tbl, dma_handle, npages);
+	free_pages((unsigned long)vaddr, get_order(size));
+}
+
+static struct dma_mapping_ops calgary_dma_ops = {
 	.alloc_coherent = calgary_alloc_coherent,
+	.free_coherent = calgary_free_coherent,
 	.map_single = calgary_map_single,
 	.unmap_single = calgary_unmap_single,
 	.map_sg = calgary_map_sg,
@@ -830,7 +805,11 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
 
 	tbl = pci_iommu(dev->bus);
 	tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
-	tce_free(tbl, 0, tbl->it_size);
+
+	if (is_kdump_kernel())
+		calgary_init_bitmap_from_tce_table(tbl);
+	else
+		tce_free(tbl, 0, tbl->it_size);
 
 	if (is_calgary(dev->device))
 		tbl->chip_ops = &calgary_chip_ops;
@@ -1209,6 +1188,10 @@ static int __init calgary_init(void)
 	if (ret)
 		return ret;
 
+	/* Purely for kdump kernel case */
+	if (is_kdump_kernel())
+		get_tce_space_from_tar();
+
 	do {
 		dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
 		if (!dev)
@@ -1230,6 +1213,16 @@ static int __init calgary_init(void)
 			goto error;
 	} while (1);
 
+	dev = NULL;
+	for_each_pci_dev(dev) {
+		struct iommu_table *tbl;
+
+		tbl = find_iommu_table(&dev->dev);
+
+		if (translation_enabled(tbl))
+			dev->dev.archdata.dma_ops = &calgary_dma_ops;
+	}
+
 	return ret;
 
 error:
@@ -1251,6 +1244,7 @@ error:
 		calgary_disable_translation(dev);
 		calgary_free_bus(dev);
 		pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
+		dev->dev.archdata.dma_ops = NULL;
 	} while (1);
 
 	return ret;
@@ -1280,13 +1274,15 @@ static inline int __init determine_tce_table_size(u64 ram)
 static int __init build_detail_arrays(void)
 {
 	unsigned long ptr;
-	int i, scal_detail_size, rio_detail_size;
+	unsigned numnodes, i;
+	int scal_detail_size, rio_detail_size;
 
-	if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
+	numnodes = rio_table_hdr->num_scal_dev;
+	if (numnodes > MAX_NUMNODES){
 		printk(KERN_WARNING
 			"Calgary: MAX_NUMNODES too low! Defined as %d, "
 			"but system has %d nodes.\n",
-			MAX_NUMNODES, rio_table_hdr->num_scal_dev);
+			MAX_NUMNODES, numnodes);
 		return -ENODEV;
 	}
 
@@ -1307,8 +1303,7 @@ static int __init build_detail_arrays(void)
 	}
 
 	ptr = ((unsigned long)rio_table_hdr) + 3;
-	for (i = 0; i < rio_table_hdr->num_scal_dev;
-		    i++, ptr += scal_detail_size)
+	for (i = 0; i < numnodes; i++, ptr += scal_detail_size)
 		scal_devs[i] = (struct scal_detail *)ptr;
 
 	for (i = 0; i < rio_table_hdr->num_rio_dev;
@@ -1339,6 +1334,61 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
 	return (val != 0xffffffff);
 }
 
+/*
+ * calgary_init_bitmap_from_tce_table():
+ * Funtion for kdump case. In the second/kdump kernel initialize
+ * the bitmap based on the tce table entries obtained from first kernel
+ */
+static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
+{
+	u64 *tp;
+	unsigned int index;
+	tp = ((u64 *)tbl->it_base);
+	for (index = 0 ; index < tbl->it_size; index++) {
+		if (*tp != 0x0)
+			set_bit(index, tbl->it_map);
+		tp++;
+	}
+}
+
+/*
+ * get_tce_space_from_tar():
+ * Function for kdump case. Get the tce tables from first kernel
+ * by reading the contents of the base adress register of calgary iommu
+ */
+static void __init get_tce_space_from_tar(void)
+{
+	int bus;
+	void __iomem *target;
+	unsigned long tce_space;
+
+	for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
+		struct calgary_bus_info *info = &bus_info[bus];
+		unsigned short pci_device;
+		u32 val;
+
+		val = read_pci_config(bus, 0, 0, 0);
+		pci_device = (val & 0xFFFF0000) >> 16;
+
+		if (!is_cal_pci_dev(pci_device))
+			continue;
+		if (info->translation_disabled)
+			continue;
+
+		if (calgary_bus_has_devices(bus, pci_device) ||
+						translate_empty_slots) {
+			target = calgary_reg(bus_info[bus].bbar,
+						tar_offset(bus));
+			tce_space = be64_to_cpu(readq(target));
+			tce_space = tce_space & TAR_SW_BITS;
+
+			tce_space = tce_space & (~specified_table_size);
+			info->tce_space = (u64 *)__va(tce_space);
+		}
+	}
+	return;
+}
+
 void __init detect_calgary(void)
 {
 	int bus;
@@ -1394,7 +1444,8 @@ void __init detect_calgary(void)
 		return;
 	}
 
-	specified_table_size = determine_tce_table_size(max_pfn * PAGE_SIZE);
+	specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
+					saved_max_pfn : max_pfn) * PAGE_SIZE);
 
 	for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
 		struct calgary_bus_info *info = &bus_info[bus];
@@ -1412,10 +1463,16 @@ void __init detect_calgary(void)
 
 		if (calgary_bus_has_devices(bus, pci_device) ||
 		    translate_empty_slots) {
-			tbl = alloc_tce_table();
-			if (!tbl)
-				goto cleanup;
-			info->tce_space = tbl;
+			/*
+			 * If it is kdump kernel, find and use tce tables
+			 * from first kernel, else allocate tce tables here
+			 */
+			if (!is_kdump_kernel()) {
+				tbl = alloc_tce_table();
+				if (!tbl)
+					goto cleanup;
+				info->tce_space = tbl;
+			}
 			calgary_found = 1;
 		}
 	}
@@ -1430,6 +1487,10 @@ void __init detect_calgary(void)
 		printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
 		       "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
 		       debugging ? "enabled" : "disabled");
+
+		/* swiotlb for devices that aren't behind the Calgary. */
+		if (max_pfn > MAX_DMA32_PFN)
+			swiotlb = 1;
 	}
 	return;
 
@@ -1446,7 +1507,7 @@ int __init calgary_iommu_init(void)
 {
 	int ret;
 
-	if (no_iommu || swiotlb)
+	if (no_iommu || (swiotlb && !calgary_detected))
 		return -ENODEV;
 
 	if (!calgary_detected)
@@ -1459,15 +1520,14 @@ int __init calgary_iommu_init(void)
 	if (ret) {
 		printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
 		       "falling back to no_iommu\n", ret);
-		if (max_pfn > MAX_DMA32_PFN)
-			printk(KERN_ERR "WARNING more than 4GB of memory, "
-					"32bit PCI may malfunction.\n");
 		return ret;
 	}
 
 	force_iommu = 1;
 	bad_dma_address = 0x0;
-	dma_ops = &calgary_dma_ops;
+	/* dma_ops is set to swiotlb or nommu */
+	if (!dma_ops)
+		dma_ops = &nommu_dma_ops;
 
 	return 0;
 }
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 8467ec2320f1..1972266e8ba5 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -5,14 +5,11 @@
 
 #include <asm/proto.h>
 #include <asm/dma.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/calgary.h>
 #include <asm/amd_iommu.h>
 
-int forbid_dac __read_mostly;
-EXPORT_SYMBOL(forbid_dac);
-
-const struct dma_mapping_ops *dma_ops;
+struct dma_mapping_ops *dma_ops;
 EXPORT_SYMBOL(dma_ops);
 
 static int iommu_sac_force __read_mostly;
@@ -42,11 +39,12 @@ EXPORT_SYMBOL(bad_dma_address);
 /* Dummy device used for NULL arguments (normally ISA). Better would
    be probably a smaller DMA mask, but this is bug-to-bug compatible
    to older i386. */
-struct device fallback_dev = {
+struct device x86_dma_fallback_dev = {
 	.bus_id = "fallback device",
 	.coherent_dma_mask = DMA_32BIT_MASK,
-	.dma_mask = &fallback_dev.coherent_dma_mask,
+	.dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
 };
+EXPORT_SYMBOL(x86_dma_fallback_dev);
 
 int dma_set_mask(struct device *dev, u64 mask)
 {
@@ -83,7 +81,7 @@ void __init dma32_reserve_bootmem(void)
 	 * using 512M as goal
 	 */
 	align = 64ULL<<20;
-	size = round_up(dma32_bootmem_size, align);
+	size = roundup(dma32_bootmem_size, align);
 	dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
 				 512ULL<<20);
 	if (dma32_bootmem_ptr)
@@ -114,24 +112,57 @@ void __init pci_iommu_alloc(void)
 	 * The order of these functions is important for
 	 * fall-back/fail-over reasons
 	 */
-#ifdef CONFIG_GART_IOMMU
 	gart_iommu_hole_init();
-#endif
 
-#ifdef CONFIG_CALGARY_IOMMU
 	detect_calgary();
-#endif
 
 	detect_intel_iommu();
 
 	amd_iommu_detect();
 
-#ifdef CONFIG_SWIOTLB
 	pci_swiotlb_init();
-#endif
 }
+
+unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
+{
+	unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
+
+	return size >> PAGE_SHIFT;
+}
+EXPORT_SYMBOL(iommu_nr_pages);
 #endif
 
+void *dma_generic_alloc_coherent(struct device *dev, size_t size,
+				 dma_addr_t *dma_addr, gfp_t flag)
+{
+	unsigned long dma_mask;
+	struct page *page;
+	dma_addr_t addr;
+
+	dma_mask = dma_alloc_coherent_mask(dev, flag);
+
+	flag |= __GFP_ZERO;
+again:
+	page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
+	if (!page)
+		return NULL;
+
+	addr = page_to_phys(page);
+	if (!is_buffer_dma_capable(dma_mask, addr, size)) {
+		__free_pages(page, get_order(size));
+
+		if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) {
+			flag = (flag & ~GFP_DMA32) | GFP_DMA;
+			goto again;
+		}
+
+		return NULL;
+	}
+
+	*dma_addr = addr;
+	return page_address(page);
+}
+
 /*
  * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
  * documentation.
@@ -184,9 +215,7 @@ static __init int iommu_setup(char *p)
 			swiotlb = 1;
 #endif
 
-#ifdef CONFIG_GART_IOMMU
 		gart_parse_options(p);
-#endif
 
 #ifdef CONFIG_CALGARY_IOMMU
 		if (!strncmp(p, "calgary", 7))
@@ -201,136 +230,19 @@ static __init int iommu_setup(char *p)
 }
 early_param("iommu", iommu_setup);
 
-#ifdef CONFIG_X86_32
-int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
-				dma_addr_t device_addr, size_t size, int flags)
-{
-	void __iomem *mem_base = NULL;
-	int pages = size >> PAGE_SHIFT;
-	int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
-
-	if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
-		goto out;
-	if (!size)
-		goto out;
-	if (dev->dma_mem)
-		goto out;
-
-	/* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
-
-	mem_base = ioremap(bus_addr, size);
-	if (!mem_base)
-		goto out;
-
-	dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
-	if (!dev->dma_mem)
-		goto out;
-	dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
-	if (!dev->dma_mem->bitmap)
-		goto free1_out;
-
-	dev->dma_mem->virt_base = mem_base;
-	dev->dma_mem->device_base = device_addr;
-	dev->dma_mem->size = pages;
-	dev->dma_mem->flags = flags;
-
-	if (flags & DMA_MEMORY_MAP)
-		return DMA_MEMORY_MAP;
-
-	return DMA_MEMORY_IO;
-
- free1_out:
-	kfree(dev->dma_mem);
- out:
-	if (mem_base)
-		iounmap(mem_base);
-	return 0;
-}
-EXPORT_SYMBOL(dma_declare_coherent_memory);
-
-void dma_release_declared_memory(struct device *dev)
-{
-	struct dma_coherent_mem *mem = dev->dma_mem;
-
-	if (!mem)
-		return;
-	dev->dma_mem = NULL;
-	iounmap(mem->virt_base);
-	kfree(mem->bitmap);
-	kfree(mem);
-}
-EXPORT_SYMBOL(dma_release_declared_memory);
-
-void *dma_mark_declared_memory_occupied(struct device *dev,
-					dma_addr_t device_addr, size_t size)
-{
-	struct dma_coherent_mem *mem = dev->dma_mem;
-	int pos, err;
-	int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
-
-	pages >>= PAGE_SHIFT;
-
-	if (!mem)
-		return ERR_PTR(-EINVAL);
-
-	pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
-	err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
-	if (err != 0)
-		return ERR_PTR(err);
-	return mem->virt_base + (pos << PAGE_SHIFT);
-}
-EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
-
-static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
-				       dma_addr_t *dma_handle, void **ret)
-{
-	struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
-	int order = get_order(size);
-
-	if (mem) {
-		int page = bitmap_find_free_region(mem->bitmap, mem->size,
-						     order);
-		if (page >= 0) {
-			*dma_handle = mem->device_base + (page << PAGE_SHIFT);
-			*ret = mem->virt_base + (page << PAGE_SHIFT);
-			memset(*ret, 0, size);
-		}
-		if (mem->flags & DMA_MEMORY_EXCLUSIVE)
-			*ret = NULL;
-	}
-	return (mem != NULL);
-}
-
-static int dma_release_coherent(struct device *dev, int order, void *vaddr)
-{
-	struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
-
-	if (mem && vaddr >= mem->virt_base && vaddr <
-		   (mem->virt_base + (mem->size << PAGE_SHIFT))) {
-		int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
-
-		bitmap_release_region(mem->bitmap, page, order);
-		return 1;
-	}
-	return 0;
-}
-#else
-#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
-#define dma_release_coherent(dev, order, vaddr) (0)
-#endif /* CONFIG_X86_32 */
-
 int dma_supported(struct device *dev, u64 mask)
 {
+	struct dma_mapping_ops *ops = get_dma_ops(dev);
+
 #ifdef CONFIG_PCI
 	if (mask > 0xffffffff && forbid_dac > 0) {
-		printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
-				 dev->bus_id);
+		dev_info(dev, "PCI: Disallowing DAC for device\n");
 		return 0;
 	}
 #endif
 
-	if (dma_ops->dma_supported)
-		return dma_ops->dma_supported(dev, mask);
+	if (ops->dma_supported)
+		return ops->dma_supported(dev, mask);
 
 	/* Copied from i386. Doesn't make much sense, because it will
 	   only work for pci_alloc_coherent.
@@ -351,8 +263,7 @@ int dma_supported(struct device *dev, u64 mask)
 	   type. Normally this doesn't make any difference, but gives
 	   more gentle handling of IOMMU overflow. */
 	if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
-		printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
-				 dev->bus_id, mask);
+		dev_info(dev, "Force SAC with mask %Lx\n", mask);
 		return 0;
 	}
 
@@ -360,157 +271,15 @@ int dma_supported(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_supported);
 
-/* Allocate DMA memory on node near device */
-static noinline struct page *
-dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
-{
-	int node;
-
-	node = dev_to_node(dev);
-
-	return alloc_pages_node(node, gfp, order);
-}
-
-/*
- * Allocate memory for a coherent mapping.
- */
-void *
-dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		   gfp_t gfp)
-{
-	void *memory = NULL;
-	struct page *page;
-	unsigned long dma_mask = 0;
-	dma_addr_t bus;
-	int noretry = 0;
-
-	/* ignore region specifiers */
-	gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
-
-	if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
-		return memory;
-
-	if (!dev) {
-		dev = &fallback_dev;
-		gfp |= GFP_DMA;
-	}
-	dma_mask = dev->coherent_dma_mask;
-	if (dma_mask == 0)
-		dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
-
-	/* Device not DMA able */
-	if (dev->dma_mask == NULL)
-		return NULL;
-
-	/* Don't invoke OOM killer or retry in lower 16MB DMA zone */
-	if (gfp & __GFP_DMA)
-		noretry = 1;
-
-#ifdef CONFIG_X86_64
-	/* Why <=? Even when the mask is smaller than 4GB it is often
-	   larger than 16MB and in this case we have a chance of
-	   finding fitting memory in the next higher zone first. If
-	   not retry with true GFP_DMA. -AK */
-	if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
-		gfp |= GFP_DMA32;
-		if (dma_mask < DMA_32BIT_MASK)
-			noretry = 1;
-	}
-#endif
-
- again:
-	page = dma_alloc_pages(dev,
-		noretry ? gfp | __GFP_NORETRY : gfp, get_order(size));
-	if (page == NULL)
-		return NULL;
-
-	{
-		int high, mmu;
-		bus = page_to_phys(page);
-		memory = page_address(page);
-		high = (bus + size) >= dma_mask;
-		mmu = high;
-		if (force_iommu && !(gfp & GFP_DMA))
-			mmu = 1;
-		else if (high) {
-			free_pages((unsigned long)memory,
-				   get_order(size));
-
-			/* Don't use the 16MB ZONE_DMA unless absolutely
-			   needed. It's better to use remapping first. */
-			if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
-				gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
-				goto again;
-			}
-
-			/* Let low level make its own zone decisions */
-			gfp &= ~(GFP_DMA32|GFP_DMA);
-
-			if (dma_ops->alloc_coherent)
-				return dma_ops->alloc_coherent(dev, size,
-							   dma_handle, gfp);
-			return NULL;
-		}
-
-		memset(memory, 0, size);
-		if (!mmu) {
-			*dma_handle = bus;
-			return memory;
-		}
-	}
-
-	if (dma_ops->alloc_coherent) {
-		free_pages((unsigned long)memory, get_order(size));
-		gfp &= ~(GFP_DMA|GFP_DMA32);
-		return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
-	}
-
-	if (dma_ops->map_simple) {
-		*dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory),
-					      size,
-					      PCI_DMA_BIDIRECTIONAL);
-		if (*dma_handle != bad_dma_address)
-			return memory;
-	}
-
-	if (panic_on_overflow)
-		panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
-		      (unsigned long)size);
-	free_pages((unsigned long)memory, get_order(size));
-	return NULL;
-}
-EXPORT_SYMBOL(dma_alloc_coherent);
-
-/*
- * Unmap coherent memory.
- * The caller must ensure that the device has finished accessing the mapping.
- */
-void dma_free_coherent(struct device *dev, size_t size,
-			 void *vaddr, dma_addr_t bus)
-{
-	int order = get_order(size);
-	WARN_ON(irqs_disabled());	/* for portability */
-	if (dma_release_coherent(dev, order, vaddr))
-		return;
-	if (dma_ops->unmap_single)
-		dma_ops->unmap_single(dev, bus, size, 0);
-	free_pages((unsigned long)vaddr, order);
-}
-EXPORT_SYMBOL(dma_free_coherent);
-
 static int __init pci_iommu_init(void)
 {
-#ifdef CONFIG_CALGARY_IOMMU
 	calgary_iommu_init();
-#endif
 
 	intel_iommu_init();
 
 	amd_iommu_init();
 
-#ifdef CONFIG_GART_IOMMU
 	gart_iommu_init();
-#endif
 
 	no_iommu_init();
 	return 0;
@@ -522,17 +291,3 @@ void pci_iommu_shutdown(void)
 }
 /* Must execute after PCI subsystem */
 fs_initcall(pci_iommu_init);
-
-#ifdef CONFIG_PCI
-/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
-
-static __devinit void via_no_dac(struct pci_dev *dev)
-{
-	if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
-		printk(KERN_INFO "PCI: VIA PCI bridge detected."
-				 "Disabling DAC.\n");
-		forbid_dac = 1;
-	}
-}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
-#endif
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index c3fe78406d18..e3f75bbcedea 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -27,11 +27,12 @@
 #include <linux/scatterlist.h>
 #include <linux/iommu-helper.h>
 #include <linux/sysdev.h>
+#include <linux/io.h>
 #include <asm/atomic.h>
-#include <asm/io.h>
 #include <asm/mtrr.h>
 #include <asm/pgtable.h>
 #include <asm/proto.h>
+#include <asm/iommu.h>
 #include <asm/gart.h>
 #include <asm/cacheflush.h>
 #include <asm/swiotlb.h>
@@ -66,9 +67,6 @@ static u32 gart_unmapped_entry;
 	(((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
 #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
 
-#define to_pages(addr, size) \
-	(round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
-
 #define EMERGENCY_PAGES 32 /* = 128KB */
 
 #ifdef CONFIG_AGP
@@ -82,9 +80,10 @@ AGPEXTERN int agp_memory_reserved;
 AGPEXTERN __u32 *agp_gatt_table;
 
 static unsigned long next_bit;  /* protected by iommu_bitmap_lock */
-static int need_flush;		/* global flush state. set for each gart wrap */
+static bool need_flush;		/* global flush state. set for each gart wrap */
 
-static unsigned long alloc_iommu(struct device *dev, int size)
+static unsigned long alloc_iommu(struct device *dev, int size,
+				 unsigned long align_mask)
 {
 	unsigned long offset, flags;
 	unsigned long boundary_size;
@@ -92,26 +91,27 @@ static unsigned long alloc_iommu(struct device *dev, int size)
 
 	base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
 			   PAGE_SIZE) >> PAGE_SHIFT;
-	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+	boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
 			      PAGE_SIZE) >> PAGE_SHIFT;
 
 	spin_lock_irqsave(&iommu_bitmap_lock, flags);
 	offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
-				  size, base_index, boundary_size, 0);
+				  size, base_index, boundary_size, align_mask);
 	if (offset == -1) {
-		need_flush = 1;
+		need_flush = true;
 		offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
-					  size, base_index, boundary_size, 0);
+					  size, base_index, boundary_size,
+					  align_mask);
 	}
 	if (offset != -1) {
 		next_bit = offset+size;
 		if (next_bit >= iommu_pages) {
 			next_bit = 0;
-			need_flush = 1;
+			need_flush = true;
 		}
 	}
 	if (iommu_fullflush)
-		need_flush = 1;
+		need_flush = true;
 	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
 
 	return offset;
@@ -136,7 +136,7 @@ static void flush_gart(void)
 	spin_lock_irqsave(&iommu_bitmap_lock, flags);
 	if (need_flush) {
 		k8_flush_garts();
-		need_flush = 0;
+		need_flush = false;
 	}
 	spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
 }
@@ -175,7 +175,8 @@ static void dump_leak(void)
 	       iommu_leak_pages);
 	for (i = 0; i < iommu_leak_pages; i += 2) {
 		printk(KERN_DEBUG "%lu: ", iommu_pages-i);
-		printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0);
+		printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
+				0);
 		printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
 	}
 	printk(KERN_DEBUG "\n");
@@ -197,9 +198,7 @@ static void iommu_full(struct device *dev, size_t size, int dir)
 	 * out. Hopefully no network devices use single mappings that big.
 	 */
 
-	printk(KERN_ERR
-		"PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
-		size, dev->bus_id);
+	dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size);
 
 	if (size > PAGE_SIZE*EMERGENCY_PAGES) {
 		if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
@@ -216,34 +215,24 @@ static void iommu_full(struct device *dev, size_t size, int dir)
 static inline int
 need_iommu(struct device *dev, unsigned long addr, size_t size)
 {
-	u64 mask = *dev->dma_mask;
-	int high = addr + size > mask;
-	int mmu = high;
-
-	if (force_iommu)
-		mmu = 1;
-
-	return mmu;
+	return force_iommu ||
+		!is_buffer_dma_capable(*dev->dma_mask, addr, size);
 }
 
 static inline int
 nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
 {
-	u64 mask = *dev->dma_mask;
-	int high = addr + size > mask;
-	int mmu = high;
-
-	return mmu;
+	return !is_buffer_dma_capable(*dev->dma_mask, addr, size);
 }
 
 /* Map a single continuous physical area into the IOMMU.
  * Caller needs to check if the iommu is needed and flush.
  */
 static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
-				size_t size, int dir)
+				size_t size, int dir, unsigned long align_mask)
 {
-	unsigned long npages = to_pages(phys_mem, size);
-	unsigned long iommu_page = alloc_iommu(dev, npages);
+	unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
+	unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
 	int i;
 
 	if (iommu_page == -1) {
@@ -263,16 +252,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
 	return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
 }
 
-static dma_addr_t
-gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir)
-{
-	dma_addr_t map = dma_map_area(dev, paddr, size, dir);
-
-	flush_gart();
-
-	return map;
-}
-
 /* Map a single area into the IOMMU */
 static dma_addr_t
 gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
@@ -280,12 +259,13 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
 	unsigned long bus;
 
 	if (!dev)
-		dev = &fallback_dev;
+		dev = &x86_dma_fallback_dev;
 
 	if (!need_iommu(dev, paddr, size))
 		return paddr;
 
-	bus = gart_map_simple(dev, paddr, size, dir);
+	bus = dma_map_area(dev, paddr, size, dir, 0);
+	flush_gart();
 
 	return bus;
 }
@@ -305,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
 		return;
 
 	iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
-	npages = to_pages(dma_addr, size);
+	npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
 	for (i = 0; i < npages; i++) {
 		iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
 		CLEAR_LEAK(iommu_page + i);
@@ -344,7 +324,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
 		unsigned long addr = sg_phys(s);
 
 		if (nonforced_iommu(dev, addr, s->length)) {
-			addr = dma_map_area(dev, addr, s->length, dir);
+			addr = dma_map_area(dev, addr, s->length, dir, 0);
 			if (addr == bad_dma_address) {
 				if (i > 0)
 					gart_unmap_sg(dev, sg, i, dir);
@@ -366,7 +346,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
 			  int nelems, struct scatterlist *sout,
 			  unsigned long pages)
 {
-	unsigned long iommu_start = alloc_iommu(dev, pages);
+	unsigned long iommu_start = alloc_iommu(dev, pages, 0);
 	unsigned long iommu_page = iommu_start;
 	struct scatterlist *s;
 	int i;
@@ -388,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
 		}
 
 		addr = phys_addr;
-		pages = to_pages(s->offset, s->length);
+		pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
 		while (pages--) {
 			iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
 			SET_LEAK(iommu_page);
@@ -431,7 +411,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
 		return 0;
 
 	if (!dev)
-		dev = &fallback_dev;
+		dev = &x86_dma_fallback_dev;
 
 	out = 0;
 	start = 0;
@@ -471,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
 
 		seg_size += s->length;
 		need = nextneed;
-		pages += to_pages(s->offset, s->length);
+		pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
 		ps = s;
 	}
 	if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
@@ -503,6 +483,46 @@ error:
 	return 0;
 }
 
+/* allocate and map a coherent mapping */
+static void *
+gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
+		    gfp_t flag)
+{
+	dma_addr_t paddr;
+	unsigned long align_mask;
+	struct page *page;
+
+	if (force_iommu && !(flag & GFP_DMA)) {
+		flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+		page = alloc_pages(flag | __GFP_ZERO, get_order(size));
+		if (!page)
+			return NULL;
+
+		align_mask = (1UL << get_order(size)) - 1;
+		paddr = dma_map_area(dev, page_to_phys(page), size,
+				     DMA_BIDIRECTIONAL, align_mask);
+
+		flush_gart();
+		if (paddr != bad_dma_address) {
+			*dma_addr = paddr;
+			return page_address(page);
+		}
+		__free_pages(page, get_order(size));
+	} else
+		return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
+
+	return NULL;
+}
+
+/* free a coherent mapping */
+static void
+gart_free_coherent(struct device *dev, size_t size, void *vaddr,
+		   dma_addr_t dma_addr)
+{
+	gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL);
+	free_pages((unsigned long)vaddr, get_order(size));
+}
+
 static int no_agp;
 
 static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -630,7 +650,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 	struct pci_dev *dev;
 	void *gatt;
 	int i, error;
-	unsigned long start_pfn, end_pfn;
 
 	printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
 	aper_size = aper_base = info->aper_size = 0;
@@ -654,13 +673,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 	info->aper_size = aper_size >> 20;
 
 	gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
-	gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size));
+	gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+					get_order(gatt_size));
 	if (!gatt)
 		panic("Cannot allocate GATT table");
 	if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
 		panic("Could not set GART PTEs to uncacheable pages");
 
-	memset(gatt, 0, gatt_size);
 	agp_gatt_table = gatt;
 
 	enable_gart_translations();
@@ -669,19 +688,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 	if (!error)
 		error = sysdev_register(&device_gart);
 	if (error)
-		panic("Could not register gart_sysdev -- would corrupt data on next suspend");
+		panic("Could not register gart_sysdev -- "
+		      "would corrupt data on next suspend");
 
 	flush_gart();
 
 	printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
 	       aper_base, aper_size>>10);
 
-	/* need to map that range */
-	end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
-	if (end_pfn > max_low_pfn_mapped) {
-		start_pfn = (aper_base>>PAGE_SHIFT);
-		init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
-	}
 	return 0;
 
  nommu:
@@ -691,21 +705,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 	return -1;
 }
 
-extern int agp_amd64_init(void);
-
-static const struct dma_mapping_ops gart_dma_ops = {
-	.mapping_error			= NULL,
+static struct dma_mapping_ops gart_dma_ops = {
 	.map_single			= gart_map_single,
-	.map_simple			= gart_map_simple,
 	.unmap_single			= gart_unmap_single,
-	.sync_single_for_cpu		= NULL,
-	.sync_single_for_device		= NULL,
-	.sync_single_range_for_cpu	= NULL,
-	.sync_single_range_for_device	= NULL,
-	.sync_sg_for_cpu		= NULL,
-	.sync_sg_for_device		= NULL,
 	.map_sg				= gart_map_sg,
 	.unmap_sg			= gart_unmap_sg,
+	.alloc_coherent			= gart_alloc_coherent,
+	.free_coherent			= gart_free_coherent,
 };
 
 void gart_iommu_shutdown(void)
@@ -732,7 +738,8 @@ void __init gart_iommu_init(void)
 {
 	struct agp_kern_info info;
 	unsigned long iommu_start;
-	unsigned long aper_size;
+	unsigned long aper_base, aper_size;
+	unsigned long start_pfn, end_pfn;
 	unsigned long scratch;
 	long i;
 
@@ -764,30 +771,35 @@ void __init gart_iommu_init(void)
 	    (no_agp && init_k8_gatt(&info) < 0)) {
 		if (max_pfn > MAX_DMA32_PFN) {
 			printk(KERN_WARNING "More than 4GB of memory "
-			       	          "but GART IOMMU not available.\n"
-			       KERN_WARNING "falling back to iommu=soft.\n");
+			       "but GART IOMMU not available.\n");
+			printk(KERN_WARNING "falling back to iommu=soft.\n");
 		}
 		return;
 	}
 
+	/* need to map that range */
+	aper_size = info.aper_size << 20;
+	aper_base = info.aper_base;
+	end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
+	if (end_pfn > max_low_pfn_mapped) {
+		start_pfn = (aper_base>>PAGE_SHIFT);
+		init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+	}
+
 	printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
-	aper_size = info.aper_size * 1024 * 1024;
 	iommu_size = check_iommu_size(info.aper_base, aper_size);
 	iommu_pages = iommu_size >> PAGE_SHIFT;
 
-	iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL,
+	iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
 						      get_order(iommu_pages/8));
 	if (!iommu_gart_bitmap)
 		panic("Cannot allocate iommu bitmap\n");
-	memset(iommu_gart_bitmap, 0, iommu_pages/8);
 
 #ifdef CONFIG_IOMMU_LEAK
 	if (leak_trace) {
-		iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL,
+		iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
 				  get_order(iommu_pages*sizeof(void *)));
-		if (iommu_leak_tab)
-			memset(iommu_leak_tab, 0, iommu_pages * 8);
-		else
+		if (!iommu_leak_tab)
 			printk(KERN_DEBUG
 			       "PCI-DMA: Cannot allocate leak trace area\n");
 	}
@@ -797,7 +809,7 @@ void __init gart_iommu_init(void)
 	 * Out of IOMMU space handling.
 	 * Reserve some invalid pages at the beginning of the GART.
 	 */
-	set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
+	iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
 
 	agp_memory_reserved = iommu_size;
 	printk(KERN_INFO
@@ -855,7 +867,8 @@ void __init gart_parse_options(char *p)
 	if (!strncmp(p, "leak", 4)) {
 		leak_trace = 1;
 		p += 4;
-		if (*p == '=') ++p;
+		if (*p == '=')
+			++p;
 		if (isdigit(*p) && get_option(&p, &arg))
 			iommu_leak_pages = arg;
 	}
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index aec43d56f49c..c70ab5a5d4c8 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -7,14 +7,14 @@
 #include <linux/dma-mapping.h>
 #include <linux/scatterlist.h>
 
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/processor.h>
 #include <asm/dma.h>
 
 static int
 check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
 {
-	if (hwdev && bus + size > *hwdev->dma_mask) {
+	if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) {
 		if (*hwdev->dma_mask >= DMA_32BIT_MASK)
 			printk(KERN_ERR
 			    "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -72,21 +72,17 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
 	return nents;
 }
 
-/* Make sure we keep the same behaviour */
-static int nommu_mapping_error(dma_addr_t dma_addr)
+static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
+				dma_addr_t dma_addr)
 {
-#ifdef CONFIG_X86_32
-	return 0;
-#else
-	return (dma_addr == bad_dma_address);
-#endif
+	free_pages((unsigned long)vaddr, get_order(size));
 }
 
-
-const struct dma_mapping_ops nommu_dma_ops = {
+struct dma_mapping_ops nommu_dma_ops = {
+	.alloc_coherent = dma_generic_alloc_coherent,
+	.free_coherent = nommu_free_coherent,
 	.map_single = nommu_map_single,
 	.map_sg = nommu_map_sg,
-	.mapping_error = nommu_mapping_error,
 	.is_phys = 1,
 };
 
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 82299cd1d04d..c4ce0332759e 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -5,7 +5,7 @@
 #include <linux/module.h>
 #include <linux/dma-mapping.h>
 
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/swiotlb.h>
 #include <asm/dma.h>
 
@@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
 	return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
 }
 
-const struct dma_mapping_ops swiotlb_dma_ops = {
+struct dma_mapping_ops swiotlb_dma_ops = {
 	.mapping_error = swiotlb_dma_mapping_error,
 	.alloc_coherent = swiotlb_alloc_coherent,
 	.free_coherent = swiotlb_free_coherent,
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
index bc1f2d3ea277..a311ffcaad16 100644
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -1,20 +1,13 @@
 #include <linux/platform_device.h>
-#include <linux/errno.h>
+#include <linux/err.h>
 #include <linux/init.h>
 
 static __init int add_pcspkr(void)
 {
 	struct platform_device *pd;
-	int ret;
 
-	pd = platform_device_alloc("pcspkr", -1);
-	if (!pd)
-		return -ENOMEM;
+	pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
 
-	ret = platform_device_add(pd);
-	if (ret)
-		platform_device_put(pd);
-
-	return ret;
+	return IS_ERR(pd) ? PTR_ERR(pd) : 0;
 }
 device_initcall(add_pcspkr);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4d629c62f4f8..c622772744d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -184,7 +184,8 @@ static void mwait_idle(void)
 static void poll_idle(void)
 {
 	local_irq_enable();
-	cpu_relax();
+	while (!need_resched())
+		cpu_relax();
 }
 
 /*
@@ -199,6 +200,7 @@ static void poll_idle(void)
  *
  * idle=mwait overrides this decision and forces the usage of mwait.
  */
+static int __cpuinitdata force_mwait;
 
 #define MWAIT_INFO			0x05
 #define MWAIT_ECX_EXTENDED_INFO		0x01
@@ -244,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
 	return 1;
 }
 
+static cpumask_t c1e_mask = CPU_MASK_NONE;
+static int c1e_detected;
+
+void c1e_remove_cpu(int cpu)
+{
+	cpu_clear(cpu, c1e_mask);
+}
+
 /*
  * C1E aware idle routine. We check for C1E active in the interrupt
  * pending message MSR. If we detect C1E, then we handle it the same
@@ -251,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
  */
 static void c1e_idle(void)
 {
-	static cpumask_t c1e_mask = CPU_MASK_NONE;
-	static int c1e_detected;
-
 	if (need_resched())
 		return;
 
@@ -263,8 +270,10 @@ static void c1e_idle(void)
 		rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
 		if (lo & K8_INTP_C1E_ACTIVE_MASK) {
 			c1e_detected = 1;
-			mark_tsc_unstable("TSC halt in C1E");
-			printk(KERN_INFO "System has C1E enabled\n");
+			if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+				mark_tsc_unstable("TSC halt in AMD C1E");
+			printk(KERN_INFO "System has AMD C1E enabled\n");
+			set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
 		}
 	}
 
@@ -326,6 +335,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
 
 static int __init idle_setup(char *str)
 {
+	if (!str)
+		return -EINVAL;
+
 	if (!strcmp(str, "poll")) {
 		printk("using polling idle threads.\n");
 		pm_idle = poll_idle;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0c3927accb00..0a1302fe6d45 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
 #include <linux/tick.h>
 #include <linux/percpu.h>
 #include <linux/prctl.h>
+#include <linux/dmi.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -55,6 +56,9 @@
 #include <asm/tlbflush.h>
 #include <asm/cpu.h>
 #include <asm/kdebug.h>
+#include <asm/idle.h>
+#include <asm/syscalls.h>
+#include <asm/smp.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
@@ -72,47 +76,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 	return ((unsigned long *)tsk->thread.sp)[3];
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-#include <asm/nmi.h>
-
-static void cpu_exit_clear(void)
-{
-	int cpu = raw_smp_processor_id();
-
-	idle_task_exit();
-
-	cpu_uninit();
-	irq_ctx_exit(cpu);
-
-	cpu_clear(cpu, cpu_callout_map);
-	cpu_clear(cpu, cpu_callin_map);
-
-	numa_remove_cpu(cpu);
-}
-
-/* We don't actually take CPU down, just spin without interrupts. */
-static inline void play_dead(void)
-{
-	/* This must be done before dead CPU ack */
-	cpu_exit_clear();
-	wbinvd();
-	mb();
-	/* Ack it */
-	__get_cpu_var(cpu_state) = CPU_DEAD;
-
-	/*
-	 * With physical CPU hotplug, we should halt the cpu
-	 */
-	local_irq_disable();
-	while (1)
-		halt();
-}
-#else
+#ifndef CONFIG_SMP
 static inline void play_dead(void)
 {
 	BUG();
 }
-#endif /* CONFIG_HOTPLUG_CPU */
+#endif
 
 /*
  * The idle thread. There's no useful work to be
@@ -128,7 +97,7 @@ void cpu_idle(void)
 
 	/* endless idle loop with no priority at all */
 	while (1) {
-		tick_nohz_stop_sched_tick();
+		tick_nohz_stop_sched_tick(1);
 		while (!need_resched()) {
 
 			check_pgt_cache();
@@ -154,12 +123,13 @@ void cpu_idle(void)
 	}
 }
 
-void __show_registers(struct pt_regs *regs, int all)
+void __show_regs(struct pt_regs *regs, int all)
 {
 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
 	unsigned long d0, d1, d2, d3, d6, d7;
 	unsigned long sp;
 	unsigned short ss, gs;
+	const char *board;
 
 	if (user_mode_vm(regs)) {
 		sp = regs->sp;
@@ -172,11 +142,15 @@ void __show_registers(struct pt_regs *regs, int all)
 	}
 
 	printk("\n");
-	printk("Pid: %d, comm: %s %s (%s %.*s)\n",
+
+	board = dmi_get_system_info(DMI_PRODUCT_NAME);
+	if (!board)
+		board = "";
+	printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
 			task_pid_nr(current), current->comm,
 			print_tainted(), init_utsname()->release,
 			(int)strcspn(init_utsname()->version, " "),
-			init_utsname()->version);
+			init_utsname()->version, board);
 
 	printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
 			(u16)regs->cs, regs->ip, regs->flags,
@@ -215,7 +189,7 @@ void __show_registers(struct pt_regs *regs, int all)
 
 void show_regs(struct pt_regs *regs)
 {
-	__show_registers(regs, 1);
+	__show_regs(regs, 1);
 	show_trace(NULL, regs, &regs->sp, regs->bp);
 }
 
@@ -276,6 +250,14 @@ void exit_thread(void)
 		tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
 		put_cpu();
 	}
+#ifdef CONFIG_X86_DS
+	/* Free any DS contexts that have not been properly released. */
+	if (unlikely(current->thread.ds_ctx)) {
+		/* we clear debugctl to make sure DS is not used. */
+		update_debugctlmsr(0);
+		ds_free(current->thread.ds_ctx);
+	}
+#endif /* CONFIG_X86_DS */
 }
 
 void flush_thread(void)
@@ -437,6 +419,35 @@ int set_tsc_mode(unsigned int val)
 	return 0;
 }
 
+#ifdef CONFIG_X86_DS
+static int update_debugctl(struct thread_struct *prev,
+			struct thread_struct *next, unsigned long debugctl)
+{
+	unsigned long ds_prev = 0;
+	unsigned long ds_next = 0;
+
+	if (prev->ds_ctx)
+		ds_prev = (unsigned long)prev->ds_ctx->ds;
+	if (next->ds_ctx)
+		ds_next = (unsigned long)next->ds_ctx->ds;
+
+	if (ds_next != ds_prev) {
+		/* we clear debugctl to make sure DS
+		 * is not in use when we change it */
+		debugctl = 0;
+		update_debugctlmsr(0);
+		wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
+	}
+	return debugctl;
+}
+#else
+static int update_debugctl(struct thread_struct *prev,
+			struct thread_struct *next, unsigned long debugctl)
+{
+	return debugctl;
+}
+#endif /* CONFIG_X86_DS */
+
 static noinline void
 __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		 struct tss_struct *tss)
@@ -447,14 +458,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 	prev = &prev_p->thread;
 	next = &next_p->thread;
 
-	debugctl = prev->debugctlmsr;
-	if (next->ds_area_msr != prev->ds_area_msr) {
-		/* we clear debugctl to make sure DS
-		 * is not in use when we change it */
-		debugctl = 0;
-		update_debugctlmsr(0);
-		wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
-	}
+	debugctl = update_debugctl(prev, next, prev->debugctlmsr);
 
 	if (next->debugctlmsr != debugctl)
 		update_debugctlmsr(next->debugctlmsr);
@@ -478,13 +482,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 			hard_enable_TSC();
 	}
 
-#ifdef X86_BTS
+#ifdef CONFIG_X86_PTRACE_BTS
 	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
 		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
 
 	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
 		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif
+#endif /* CONFIG_X86_PTRACE_BTS */
 
 
 	if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a8e53626ac9a..c958120fb1b6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,11 +37,11 @@
 #include <linux/kdebug.h>
 #include <linux/tick.h>
 #include <linux/prctl.h>
+#include <linux/uaccess.h>
+#include <linux/io.h>
 
-#include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/system.h>
-#include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/i387.h>
 #include <asm/mmu_context.h>
@@ -51,6 +51,7 @@
 #include <asm/proto.h>
 #include <asm/ia32.h>
 #include <asm/idle.h>
+#include <asm/syscalls.h>
 
 asmlinkage extern void ret_from_fork(void);
 
@@ -62,6 +63,13 @@ void idle_notifier_register(struct notifier_block *n)
 {
 	atomic_notifier_chain_register(&idle_notifier, n);
 }
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+void idle_notifier_unregister(struct notifier_block *n)
+{
+	atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_unregister);
 
 void enter_idle(void)
 {
@@ -85,29 +93,12 @@ void exit_idle(void)
 	__exit_idle();
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-DECLARE_PER_CPU(int, cpu_state);
-
-#include <asm/nmi.h>
-/* We halt the CPU with physical CPU hotplug */
-static inline void play_dead(void)
-{
-	idle_task_exit();
-	wbinvd();
-	mb();
-	/* Ack it */
-	__get_cpu_var(cpu_state) = CPU_DEAD;
-
-	local_irq_disable();
-	while (1)
-		halt();
-}
-#else
+#ifndef CONFIG_SMP
 static inline void play_dead(void)
 {
 	BUG();
 }
-#endif /* CONFIG_HOTPLUG_CPU */
+#endif
 
 /*
  * The idle thread. There's no useful work to be
@@ -120,7 +111,7 @@ void cpu_idle(void)
 	current_thread_info()->status |= TS_POLLING;
 	/* endless idle loop with no priority at all */
 	while (1) {
-		tick_nohz_stop_sched_tick();
+		tick_nohz_stop_sched_tick(1);
 		while (!need_resched()) {
 
 			rmb();
@@ -152,7 +143,7 @@ void cpu_idle(void)
 }
 
 /* Prints also some state that isn't saved in the pt_regs */
-void __show_regs(struct pt_regs * regs)
+void __show_regs(struct pt_regs *regs, int all)
 {
 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
 	unsigned long d0, d1, d2, d3, d6, d7;
@@ -161,60 +152,65 @@ void __show_regs(struct pt_regs * regs)
 
 	printk("\n");
 	print_modules();
-	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
+	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
 		current->pid, current->comm, print_tainted(),
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
-	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
+	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
 	printk_address(regs->ip, 1);
-	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->sp,
-		regs->flags);
-	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
+	printk(KERN_INFO "RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss,
+			regs->sp, regs->flags);
+	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
 	       regs->ax, regs->bx, regs->cx);
-	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
+	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
 	       regs->dx, regs->si, regs->di);
-	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
+	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
 	       regs->bp, regs->r8, regs->r9);
-	printk("R10: %016lx R11: %016lx R12: %016lx\n",
-	       regs->r10, regs->r11, regs->r12); 
-	printk("R13: %016lx R14: %016lx R15: %016lx\n",
-	       regs->r13, regs->r14, regs->r15); 
-
-	asm("movl %%ds,%0" : "=r" (ds)); 
-	asm("movl %%cs,%0" : "=r" (cs)); 
-	asm("movl %%es,%0" : "=r" (es)); 
+	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
+	       regs->r10, regs->r11, regs->r12);
+	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
+	       regs->r13, regs->r14, regs->r15);
+
+	asm("movl %%ds,%0" : "=r" (ds));
+	asm("movl %%cs,%0" : "=r" (cs));
+	asm("movl %%es,%0" : "=r" (es));
 	asm("movl %%fs,%0" : "=r" (fsindex));
 	asm("movl %%gs,%0" : "=r" (gsindex));
 
 	rdmsrl(MSR_FS_BASE, fs);
-	rdmsrl(MSR_GS_BASE, gs); 
-	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 
+	rdmsrl(MSR_GS_BASE, gs);
+	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
+
+	if (!all)
+		return;
 
 	cr0 = read_cr0();
 	cr2 = read_cr2();
 	cr3 = read_cr3();
 	cr4 = read_cr4();
 
-	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 
-	       fs,fsindex,gs,gsindex,shadowgs); 
-	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 
-	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
+	printk(KERN_INFO "FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
+	       fs, fsindex, gs, gsindex, shadowgs);
+	printk(KERN_INFO "CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
+			es, cr0);
+	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
+			cr4);
 
 	get_debugreg(d0, 0);
 	get_debugreg(d1, 1);
 	get_debugreg(d2, 2);
-	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
+	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
 	get_debugreg(d3, 3);
 	get_debugreg(d6, 6);
 	get_debugreg(d7, 7);
-	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
+	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
 }
 
 void show_regs(struct pt_regs *regs)
 {
-	printk("CPU %d:", smp_processor_id());
-	__show_regs(regs);
+	printk(KERN_INFO "CPU %d:", smp_processor_id());
+	__show_regs(regs, 1);
 	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
 }
 
@@ -239,6 +235,14 @@ void exit_thread(void)
 		t->io_bitmap_max = 0;
 		put_cpu();
 	}
+#ifdef CONFIG_X86_DS
+	/* Free any DS contexts that have not been properly released. */
+	if (unlikely(t->ds_ctx)) {
+		/* we clear debugctl to make sure DS is not used. */
+		update_debugctlmsr(0);
+		ds_free(t->ds_ctx);
+	}
+#endif /* CONFIG_X86_DS */
 }
 
 void flush_thread(void)
@@ -314,10 +318,10 @@ void prepare_to_copy(struct task_struct *tsk)
 
 int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
 		unsigned long unused,
-	struct task_struct * p, struct pt_regs * regs)
+	struct task_struct *p, struct pt_regs *regs)
 {
 	int err;
-	struct pt_regs * childregs;
+	struct pt_regs *childregs;
 	struct task_struct *me = current;
 
 	childregs = ((struct pt_regs *)
@@ -362,10 +366,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
 		if (test_thread_flag(TIF_IA32))
 			err = do_set_thread_area(p, -1,
 				(struct user_desc __user *)childregs->si, 0);
-		else 			
-#endif	 
-			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 
-		if (err) 
+		else
+#endif
+			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
+		if (err)
 			goto out;
 	}
 	err = 0;
@@ -472,13 +476,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
 	next = &next_p->thread;
 
 	debugctl = prev->debugctlmsr;
-	if (next->ds_area_msr != prev->ds_area_msr) {
-		/* we clear debugctl to make sure DS
-		 * is not in use when we change it */
-		debugctl = 0;
-		update_debugctlmsr(0);
-		wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
+
+#ifdef CONFIG_X86_DS
+	{
+		unsigned long ds_prev = 0, ds_next = 0;
+
+		if (prev->ds_ctx)
+			ds_prev = (unsigned long)prev->ds_ctx->ds;
+		if (next->ds_ctx)
+			ds_next = (unsigned long)next->ds_ctx->ds;
+
+		if (ds_next != ds_prev) {
+			/*
+			 * We clear debugctl to make sure DS
+			 * is not in use when we change it:
+			 */
+			debugctl = 0;
+			update_debugctlmsr(0);
+			wrmsrl(MSR_IA32_DS_AREA, ds_next);
+		}
 	}
+#endif /* CONFIG_X86_DS */
 
 	if (next->debugctlmsr != debugctl)
 		update_debugctlmsr(next->debugctlmsr);
@@ -516,13 +534,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
 		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
 	}
 
-#ifdef X86_BTS
+#ifdef CONFIG_X86_PTRACE_BTS
 	if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
 		ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
 
 	if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
 		ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
-#endif
+#endif /* CONFIG_X86_PTRACE_BTS */
 }
 
 /*
@@ -537,14 +555,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
 struct task_struct *
 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 {
-	struct thread_struct *prev = &prev_p->thread,
-				 *next = &next_p->thread;
+	struct thread_struct *prev = &prev_p->thread;
+	struct thread_struct *next = &next_p->thread;
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
 	unsigned fsindex, gsindex;
 
 	/* we're going to use this soon, after a few expensive things */
-	if (next_p->fpu_counter>5)
+	if (next_p->fpu_counter > 5)
 		prefetch(next->xstate);
 
 	/*
@@ -552,13 +570,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 */
 	load_sp0(tss, next);
 
-	/* 
+	/*
 	 * Switch DS and ES.
 	 * This won't pick up thread selector changes, but I guess that is ok.
 	 */
 	savesegment(es, prev->es);
 	if (unlikely(next->es | prev->es))
-		loadsegment(es, next->es); 
+		loadsegment(es, next->es);
 
 	savesegment(ds, prev->ds);
 	if (unlikely(next->ds | prev->ds))
@@ -584,50 +602,50 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 */
 	arch_leave_lazy_cpu_mode();
 
-	/* 
+	/*
 	 * Switch FS and GS.
+	 *
+	 * Segment register != 0 always requires a reload.  Also
+	 * reload when it has changed.  When prev process used 64bit
+	 * base always reload to avoid an information leak.
 	 */
-	{ 
-		/* segment register != 0 always requires a reload. 
-		   also reload when it has changed. 
-		   when prev process used 64bit base always reload
-		   to avoid an information leak. */
-		if (unlikely(fsindex | next->fsindex | prev->fs)) {
-			loadsegment(fs, next->fsindex);
-			/* check if the user used a selector != 0
-	                 * if yes clear 64bit base, since overloaded base
-                         * is always mapped to the Null selector
-                         */
-			if (fsindex)
-			prev->fs = 0;				
-		}
-		/* when next process has a 64bit base use it */
-		if (next->fs) 
-			wrmsrl(MSR_FS_BASE, next->fs); 
-		prev->fsindex = fsindex;
-
-		if (unlikely(gsindex | next->gsindex | prev->gs)) {
-			load_gs_index(next->gsindex);
-			if (gsindex)
-			prev->gs = 0;				
-		}
-		if (next->gs)
-			wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 
-		prev->gsindex = gsindex;
+	if (unlikely(fsindex | next->fsindex | prev->fs)) {
+		loadsegment(fs, next->fsindex);
+		/*
+		 * Check if the user used a selector != 0; if yes
+		 *  clear 64bit base, since overloaded base is always
+		 *  mapped to the Null selector
+		 */
+		if (fsindex)
+			prev->fs = 0;
+	}
+	/* when next process has a 64bit base use it */
+	if (next->fs)
+		wrmsrl(MSR_FS_BASE, next->fs);
+	prev->fsindex = fsindex;
+
+	if (unlikely(gsindex | next->gsindex | prev->gs)) {
+		load_gs_index(next->gsindex);
+		if (gsindex)
+			prev->gs = 0;
 	}
+	if (next->gs)
+		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
+	prev->gsindex = gsindex;
 
 	/* Must be after DS reload */
 	unlazy_fpu(prev_p);
 
-	/* 
+	/*
 	 * Switch the PDA and FPU contexts.
 	 */
 	prev->usersp = read_pda(oldrsp);
 	write_pda(oldrsp, next->usersp);
-	write_pda(pcurrent, next_p); 
+	write_pda(pcurrent, next_p);
 
 	write_pda(kernelstack,
-	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+		  (unsigned long)task_stack_page(next_p) +
+		  THREAD_SIZE - PDA_STACKOFFSET);
 #ifdef CONFIG_CC_STACKPROTECTOR
 	write_pda(stack_canary, next_p->stack_canary);
 	/*
@@ -664,7 +682,7 @@ long sys_execve(char __user *name, char __user * __user *argv,
 		char __user * __user *envp, struct pt_regs *regs)
 {
 	long error;
-	char * filename;
+	char *filename;
 
 	filename = getname(name);
 	error = PTR_ERR(filename);
@@ -722,55 +740,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs)
 unsigned long get_wchan(struct task_struct *p)
 {
 	unsigned long stack;
-	u64 fp,ip;
+	u64 fp, ip;
 	int count = 0;
 
-	if (!p || p == current || p->state==TASK_RUNNING)
-		return 0; 
+	if (!p || p == current || p->state == TASK_RUNNING)
+		return 0;
 	stack = (unsigned long)task_stack_page(p);
-	if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE)
+	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
 		return 0;
 	fp = *(u64 *)(p->thread.sp);
-	do { 
+	do {
 		if (fp < (unsigned long)stack ||
-		    fp > (unsigned long)stack+THREAD_SIZE)
-			return 0; 
+		    fp >= (unsigned long)stack+THREAD_SIZE)
+			return 0;
 		ip = *(u64 *)(fp+8);
 		if (!in_sched_functions(ip))
 			return ip;
-		fp = *(u64 *)fp; 
-	} while (count++ < 16); 
+		fp = *(u64 *)fp;
+	} while (count++ < 16);
 	return 0;
 }
 
 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
-{ 
-	int ret = 0; 
+{
+	int ret = 0;
 	int doit = task == current;
 	int cpu;
 
-	switch (code) { 
+	switch (code) {
 	case ARCH_SET_GS:
 		if (addr >= TASK_SIZE_OF(task))
-			return -EPERM; 
+			return -EPERM;
 		cpu = get_cpu();
-		/* handle small bases via the GDT because that's faster to 
+		/* handle small bases via the GDT because that's faster to
 		   switch. */
-		if (addr <= 0xffffffff) {  
-			set_32bit_tls(task, GS_TLS, addr); 
-			if (doit) { 
+		if (addr <= 0xffffffff) {
+			set_32bit_tls(task, GS_TLS, addr);
+			if (doit) {
 				load_TLS(&task->thread, cpu);
-				load_gs_index(GS_TLS_SEL); 
+				load_gs_index(GS_TLS_SEL);
 			}
-			task->thread.gsindex = GS_TLS_SEL; 
+			task->thread.gsindex = GS_TLS_SEL;
 			task->thread.gs = 0;
-		} else { 
+		} else {
 			task->thread.gsindex = 0;
 			task->thread.gs = addr;
 			if (doit) {
 				load_gs_index(0);
 				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
-			} 
+			}
 		}
 		put_cpu();
 		break;
@@ -824,8 +842,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 				rdmsrl(MSR_KERNEL_GS_BASE, base);
 			else
 				base = task->thread.gs;
-		}
-		else
+		} else
 			base = task->thread.gs;
 		ret = put_user(base, (unsigned long __user *)addr);
 		break;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 77040b6070e1..0a6d8c12e10d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -14,6 +14,7 @@
 #include <linux/errno.h>
 #include <linux/ptrace.h>
 #include <linux/regset.h>
+#include <linux/tracehook.h>
 #include <linux/user.h>
 #include <linux/elf.h>
 #include <linux/security.h>
@@ -39,7 +40,9 @@ enum x86_regset {
 	REGSET_GENERAL,
 	REGSET_FP,
 	REGSET_XFP,
+	REGSET_IOPERM64 = REGSET_XFP,
 	REGSET_TLS,
+	REGSET_IOPERM32,
 };
 
 /*
@@ -69,7 +72,7 @@ static inline bool invalid_selector(u16 value)
 
 #define FLAG_MASK		FLAG_MASK_32
 
-static long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
+static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
 {
 	BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
 	regno >>= 2;
@@ -554,45 +557,138 @@ static int ptrace_set_debugreg(struct task_struct *child,
 	return 0;
 }
 
-#ifdef X86_BTS
+/*
+ * These access the current or another (stopped) task's io permission
+ * bitmap for debugging or core dump.
+ */
+static int ioperm_active(struct task_struct *target,
+			 const struct user_regset *regset)
+{
+	return target->thread.io_bitmap_max / regset->size;
+}
 
-static int ptrace_bts_get_size(struct task_struct *child)
+static int ioperm_get(struct task_struct *target,
+		      const struct user_regset *regset,
+		      unsigned int pos, unsigned int count,
+		      void *kbuf, void __user *ubuf)
 {
-	if (!child->thread.ds_area_msr)
+	if (!target->thread.io_bitmap_ptr)
 		return -ENXIO;
 
-	return ds_get_bts_index((void *)child->thread.ds_area_msr);
+	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				   target->thread.io_bitmap_ptr,
+				   0, IO_BITMAP_BYTES);
+}
+
+#ifdef CONFIG_X86_PTRACE_BTS
+/*
+ * The configuration for a particular BTS hardware implementation.
+ */
+struct bts_configuration {
+	/* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
+	unsigned char  sizeof_bts;
+	/* the size of a field in the BTS record in bytes */
+	unsigned char  sizeof_field;
+	/* a bitmask to enable/disable BTS in DEBUGCTL MSR */
+	unsigned long debugctl_mask;
+};
+static struct bts_configuration bts_cfg;
+
+#define BTS_MAX_RECORD_SIZE (8 * 3)
+
+
+/*
+ * Branch Trace Store (BTS) uses the following format. Different
+ * architectures vary in the size of those fields.
+ * - source linear address
+ * - destination linear address
+ * - flags
+ *
+ * Later architectures use 64bit pointers throughout, whereas earlier
+ * architectures use 32bit pointers in 32bit mode.
+ *
+ * We compute the base address for the first 8 fields based on:
+ * - the field size stored in the DS configuration
+ * - the relative field position
+ *
+ * In order to store additional information in the BTS buffer, we use
+ * a special source address to indicate that the record requires
+ * special interpretation.
+ *
+ * Netburst indicated via a bit in the flags field whether the branch
+ * was predicted; this is ignored.
+ */
+
+enum bts_field {
+	bts_from = 0,
+	bts_to,
+	bts_flags,
+
+	bts_escape = (unsigned long)-1,
+	bts_qual = bts_to,
+	bts_jiffies = bts_flags
+};
+
+static inline unsigned long bts_get(const char *base, enum bts_field field)
+{
+	base += (bts_cfg.sizeof_field * field);
+	return *(unsigned long *)base;
 }
 
-static int ptrace_bts_read_record(struct task_struct *child,
-				  long index,
+static inline void bts_set(char *base, enum bts_field field, unsigned long val)
+{
+	base += (bts_cfg.sizeof_field * field);;
+	(*(unsigned long *)base) = val;
+}
+
+/*
+ * Translate a BTS record from the raw format into the bts_struct format
+ *
+ * out (out): bts_struct interpretation
+ * raw: raw BTS record
+ */
+static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
+{
+	memset(out, 0, sizeof(*out));
+	if (bts_get(raw, bts_from) == bts_escape) {
+		out->qualifier       = bts_get(raw, bts_qual);
+		out->variant.jiffies = bts_get(raw, bts_jiffies);
+	} else {
+		out->qualifier = BTS_BRANCH;
+		out->variant.lbr.from_ip = bts_get(raw, bts_from);
+		out->variant.lbr.to_ip   = bts_get(raw, bts_to);
+	}
+}
+
+static int ptrace_bts_read_record(struct task_struct *child, size_t index,
 				  struct bts_struct __user *out)
 {
 	struct bts_struct ret;
-	int retval;
-	int bts_end;
-	int bts_index;
+	const void *bts_record;
+	size_t bts_index, bts_end;
+	int error;
 
-	if (!child->thread.ds_area_msr)
-		return -ENXIO;
+	error = ds_get_bts_end(child, &bts_end);
+	if (error < 0)
+		return error;
 
-	if (index < 0)
-		return -EINVAL;
-
-	bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
 	if (bts_end <= index)
 		return -EINVAL;
 
+	error = ds_get_bts_index(child, &bts_index);
+	if (error < 0)
+		return error;
+
 	/* translate the ptrace bts index into the ds bts index */
-	bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr);
-	bts_index -= (index + 1);
-	if (bts_index < 0)
-		bts_index += bts_end;
+	bts_index += bts_end - (index + 1);
+	if (bts_end <= bts_index)
+		bts_index -= bts_end;
+
+	error = ds_access_bts(child, bts_index, &bts_record);
+	if (error < 0)
+		return error;
 
-	retval = ds_read_bts((void *)child->thread.ds_area_msr,
-			     bts_index, &ret);
-	if (retval < 0)
-		return retval;
+	ptrace_bts_translate_record(&ret, bts_record);
 
 	if (copy_to_user(out, &ret, sizeof(ret)))
 		return -EFAULT;
@@ -600,101 +696,106 @@ static int ptrace_bts_read_record(struct task_struct *child,
 	return sizeof(ret);
 }
 
-static int ptrace_bts_clear(struct task_struct *child)
-{
-	if (!child->thread.ds_area_msr)
-		return -ENXIO;
-
-	return ds_clear((void *)child->thread.ds_area_msr);
-}
-
 static int ptrace_bts_drain(struct task_struct *child,
 			    long size,
 			    struct bts_struct __user *out)
 {
-	int end, i;
-	void *ds = (void *)child->thread.ds_area_msr;
-
-	if (!ds)
-		return -ENXIO;
+	struct bts_struct ret;
+	const unsigned char *raw;
+	size_t end, i;
+	int error;
 
-	end = ds_get_bts_index(ds);
-	if (end <= 0)
-		return end;
+	error = ds_get_bts_index(child, &end);
+	if (error < 0)
+		return error;
 
 	if (size < (end * sizeof(struct bts_struct)))
 		return -EIO;
 
-	for (i = 0; i < end; i++, out++) {
-		struct bts_struct ret;
-		int retval;
+	error = ds_access_bts(child, 0, (const void **)&raw);
+	if (error < 0)
+		return error;
 
-		retval = ds_read_bts(ds, i, &ret);
-		if (retval < 0)
-			return retval;
+	for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
+		ptrace_bts_translate_record(&ret, raw);
 
 		if (copy_to_user(out, &ret, sizeof(ret)))
 			return -EFAULT;
 	}
 
-	ds_clear(ds);
+	error = ds_clear_bts(child);
+	if (error < 0)
+		return error;
 
 	return end;
 }
 
+static void ptrace_bts_ovfl(struct task_struct *child)
+{
+	send_sig(child->thread.bts_ovfl_signal, child, 0);
+}
+
 static int ptrace_bts_config(struct task_struct *child,
 			     long cfg_size,
 			     const struct ptrace_bts_config __user *ucfg)
 {
 	struct ptrace_bts_config cfg;
-	int bts_size, ret = 0;
-	void *ds;
+	int error = 0;
 
+	error = -EOPNOTSUPP;
+	if (!bts_cfg.sizeof_bts)
+		goto errout;
+
+	error = -EIO;
 	if (cfg_size < sizeof(cfg))
-		return -EIO;
+		goto errout;
 
+	error = -EFAULT;
 	if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
-		return -EFAULT;
+		goto errout;
 
-	if ((int)cfg.size < 0)
-		return -EINVAL;
+	error = -EINVAL;
+	if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
+	    !(cfg.flags & PTRACE_BTS_O_ALLOC))
+		goto errout;
 
-	bts_size = 0;
-	ds = (void *)child->thread.ds_area_msr;
-	if (ds) {
-		bts_size = ds_get_bts_size(ds);
-		if (bts_size < 0)
-			return bts_size;
-	}
-	cfg.size = PAGE_ALIGN(cfg.size);
+	if (cfg.flags & PTRACE_BTS_O_ALLOC) {
+		ds_ovfl_callback_t ovfl = NULL;
+		unsigned int sig = 0;
 
-	if (bts_size != cfg.size) {
-		ret = ptrace_bts_realloc(child, cfg.size,
-					 cfg.flags & PTRACE_BTS_O_CUT_SIZE);
-		if (ret < 0)
+		/* we ignore the error in case we were not tracing child */
+		(void)ds_release_bts(child);
+
+		if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
+			if (!cfg.signal)
+				goto errout;
+
+			sig  = cfg.signal;
+			ovfl = ptrace_bts_ovfl;
+		}
+
+		error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
+		if (error < 0)
 			goto errout;
 
-		ds = (void *)child->thread.ds_area_msr;
+		child->thread.bts_ovfl_signal = sig;
 	}
 
-	if (cfg.flags & PTRACE_BTS_O_SIGNAL)
-		ret = ds_set_overflow(ds, DS_O_SIGNAL);
-	else
-		ret = ds_set_overflow(ds, DS_O_WRAP);
-	if (ret < 0)
+	error = -EINVAL;
+	if (!child->thread.ds_ctx && cfg.flags)
 		goto errout;
 
 	if (cfg.flags & PTRACE_BTS_O_TRACE)
-		child->thread.debugctlmsr |= ds_debugctl_mask();
+		child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
 	else
-		child->thread.debugctlmsr &= ~ds_debugctl_mask();
+		child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
 
 	if (cfg.flags & PTRACE_BTS_O_SCHED)
 		set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
 	else
 		clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
 
-	ret = sizeof(cfg);
+	error = sizeof(cfg);
 
 out:
 	if (child->thread.debugctlmsr)
@@ -702,10 +803,10 @@ out:
 	else
 		clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
 
-	return ret;
+	return error;
 
 errout:
-	child->thread.debugctlmsr &= ~ds_debugctl_mask();
+	child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
 	clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
 	goto out;
 }
@@ -714,29 +815,40 @@ static int ptrace_bts_status(struct task_struct *child,
 			     long cfg_size,
 			     struct ptrace_bts_config __user *ucfg)
 {
-	void *ds = (void *)child->thread.ds_area_msr;
 	struct ptrace_bts_config cfg;
+	size_t end;
+	const void *base, *max;
+	int error;
 
 	if (cfg_size < sizeof(cfg))
 		return -EIO;
 
-	memset(&cfg, 0, sizeof(cfg));
+	error = ds_get_bts_end(child, &end);
+	if (error < 0)
+		return error;
+
+	error = ds_access_bts(child, /* index = */ 0, &base);
+	if (error < 0)
+		return error;
 
-	if (ds) {
-		cfg.size = ds_get_bts_size(ds);
+	error = ds_access_bts(child, /* index = */ end, &max);
+	if (error < 0)
+		return error;
 
-		if (ds_get_overflow(ds) == DS_O_SIGNAL)
-			cfg.flags |= PTRACE_BTS_O_SIGNAL;
+	memset(&cfg, 0, sizeof(cfg));
+	cfg.size = (max - base);
+	cfg.signal = child->thread.bts_ovfl_signal;
+	cfg.bts_size = sizeof(struct bts_struct);
 
-		if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
-		    child->thread.debugctlmsr & ds_debugctl_mask())
-			cfg.flags |= PTRACE_BTS_O_TRACE;
+	if (cfg.signal)
+		cfg.flags |= PTRACE_BTS_O_SIGNAL;
 
-		if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
-			cfg.flags |= PTRACE_BTS_O_SCHED;
-	}
+	if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
+	    child->thread.debugctlmsr & bts_cfg.debugctl_mask)
+		cfg.flags |= PTRACE_BTS_O_TRACE;
 
-	cfg.bts_size = sizeof(struct bts_struct);
+	if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
+		cfg.flags |= PTRACE_BTS_O_SCHED;
 
 	if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
 		return -EFAULT;
@@ -744,89 +856,38 @@ static int ptrace_bts_status(struct task_struct *child,
 	return sizeof(cfg);
 }
 
-
 static int ptrace_bts_write_record(struct task_struct *child,
 				   const struct bts_struct *in)
 {
-	int retval;
+	unsigned char bts_record[BTS_MAX_RECORD_SIZE];
 
-	if (!child->thread.ds_area_msr)
-		return -ENXIO;
+	BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
 
-	retval = ds_write_bts((void *)child->thread.ds_area_msr, in);
-	if (retval)
-		return retval;
+	memset(bts_record, 0, bts_cfg.sizeof_bts);
+	switch (in->qualifier) {
+	case BTS_INVALID:
+		break;
 
-	return sizeof(*in);
-}
+	case BTS_BRANCH:
+		bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
+		bts_set(bts_record, bts_to,   in->variant.lbr.to_ip);
+		break;
 
-static int ptrace_bts_realloc(struct task_struct *child,
-			      int size, int reduce_size)
-{
-	unsigned long rlim, vm;
-	int ret, old_size;
+	case BTS_TASK_ARRIVES:
+	case BTS_TASK_DEPARTS:
+		bts_set(bts_record, bts_from,    bts_escape);
+		bts_set(bts_record, bts_qual,    in->qualifier);
+		bts_set(bts_record, bts_jiffies, in->variant.jiffies);
+		break;
 
-	if (size < 0)
+	default:
 		return -EINVAL;
-
-	old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
-	if (old_size < 0)
-		return old_size;
-
-	ret = ds_free((void **)&child->thread.ds_area_msr);
-	if (ret < 0)
-		goto out;
-
-	size >>= PAGE_SHIFT;
-	old_size >>= PAGE_SHIFT;
-
-	current->mm->total_vm  -= old_size;
-	current->mm->locked_vm -= old_size;
-
-	if (size == 0)
-		goto out;
-
-	rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
-	vm = current->mm->total_vm  + size;
-	if (rlim < vm) {
-		ret = -ENOMEM;
-
-		if (!reduce_size)
-			goto out;
-
-		size = rlim - current->mm->total_vm;
-		if (size <= 0)
-			goto out;
 	}
 
-	rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
-	vm = current->mm->locked_vm  + size;
-	if (rlim < vm) {
-		ret = -ENOMEM;
-
-		if (!reduce_size)
-			goto out;
-
-		size = rlim - current->mm->locked_vm;
-		if (size <= 0)
-			goto out;
-	}
-
-	ret = ds_allocate((void **)&child->thread.ds_area_msr,
-			  size << PAGE_SHIFT);
-	if (ret < 0)
-		goto out;
-
-	current->mm->total_vm  += size;
-	current->mm->locked_vm += size;
-
-out:
-	if (child->thread.ds_area_msr)
-		set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
-	else
-		clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
-
-	return ret;
+	/* The writing task will be the switched-to task on a context
+	 * switch. It needs to write into the switched-from task's BTS
+	 * buffer. */
+	return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
 }
 
 void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -839,7 +900,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
 
 	ptrace_bts_write_record(tsk, &rec);
 }
-#endif /* X86_BTS */
+
+static const struct bts_configuration bts_cfg_netburst = {
+	.sizeof_bts    = sizeof(long) * 3,
+	.sizeof_field  = sizeof(long),
+	.debugctl_mask = (1<<2)|(1<<3)|(1<<5)
+};
+
+static const struct bts_configuration bts_cfg_pentium_m = {
+	.sizeof_bts    = sizeof(long) * 3,
+	.sizeof_field  = sizeof(long),
+	.debugctl_mask = (1<<6)|(1<<7)
+};
+
+static const struct bts_configuration bts_cfg_core2 = {
+	.sizeof_bts    = 8 * 3,
+	.sizeof_field  = 8,
+	.debugctl_mask = (1<<6)|(1<<7)|(1<<9)
+};
+
+static inline void bts_configure(const struct bts_configuration *cfg)
+{
+	bts_cfg = *cfg;
+}
+
+void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
+{
+	switch (c->x86) {
+	case 0x6:
+		switch (c->x86_model) {
+		case 0xD:
+		case 0xE: /* Pentium M */
+			bts_configure(&bts_cfg_pentium_m);
+			break;
+		case 0xF: /* Core2 */
+        case 0x1C: /* Atom */
+			bts_configure(&bts_cfg_core2);
+			break;
+		default:
+			/* sorry, don't know about them */
+			break;
+		}
+		break;
+	case 0xF:
+		switch (c->x86_model) {
+		case 0x0:
+		case 0x1:
+		case 0x2: /* Netburst */
+			bts_configure(&bts_cfg_netburst);
+			break;
+		default:
+			/* sorry, don't know about them */
+			break;
+		}
+		break;
+	default:
+		/* sorry, don't know about them */
+		break;
+	}
+}
+#endif /* CONFIG_X86_PTRACE_BTS */
 
 /*
  * Called by kernel/ptrace.c when detaching..
@@ -852,15 +972,15 @@ void ptrace_disable(struct task_struct *child)
 #ifdef TIF_SYSCALL_EMU
 	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 #endif
-	if (child->thread.ds_area_msr) {
-#ifdef X86_BTS
-		ptrace_bts_realloc(child, 0, 0);
-#endif
-		child->thread.debugctlmsr &= ~ds_debugctl_mask();
-		if (!child->thread.debugctlmsr)
-			clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
-		clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
-	}
+#ifdef CONFIG_X86_PTRACE_BTS
+	(void)ds_release_bts(child);
+
+	child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
+	if (!child->thread.debugctlmsr)
+		clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+
+	clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
+#endif /* CONFIG_X86_PTRACE_BTS */
 }
 
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -980,7 +1100,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	/*
 	 * These bits need more cooking - not enabled yet:
 	 */
-#ifdef X86_BTS
+#ifdef CONFIG_X86_PTRACE_BTS
 	case PTRACE_BTS_CONFIG:
 		ret = ptrace_bts_config
 			(child, data, (struct ptrace_bts_config __user *)addr);
@@ -992,7 +1112,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		break;
 
 	case PTRACE_BTS_SIZE:
-		ret = ptrace_bts_get_size(child);
+		ret = ds_get_bts_index(child, /* pos = */ NULL);
 		break;
 
 	case PTRACE_BTS_GET:
@@ -1001,14 +1121,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		break;
 
 	case PTRACE_BTS_CLEAR:
-		ret = ptrace_bts_clear(child);
+		ret = ds_clear_bts(child);
 		break;
 
 	case PTRACE_BTS_DRAIN:
 		ret = ptrace_bts_drain
 			(child, data, (struct bts_struct __user *) addr);
 		break;
-#endif
+#endif /* CONFIG_X86_PTRACE_BTS */
 
 	default:
 		ret = ptrace_request(child, request, addr, data);
@@ -1290,6 +1410,12 @@ static const struct user_regset x86_64_regsets[] = {
 		.size = sizeof(long), .align = sizeof(long),
 		.active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
 	},
+	[REGSET_IOPERM64] = {
+		.core_note_type = NT_386_IOPERM,
+		.n = IO_BITMAP_LONGS,
+		.size = sizeof(long), .align = sizeof(long),
+		.active = ioperm_active, .get = ioperm_get
+	},
 };
 
 static const struct user_regset_view user_x86_64_view = {
@@ -1336,6 +1462,12 @@ static const struct user_regset x86_32_regsets[] = {
 		.active = regset_tls_active,
 		.get = regset_tls_get, .set = regset_tls_set
 	},
+	[REGSET_IOPERM32] = {
+		.core_note_type = NT_386_IOPERM,
+		.n = IO_BITMAP_BYTES / sizeof(u32),
+		.size = sizeof(u32), .align = sizeof(u32),
+		.active = ioperm_active, .get = ioperm_get
+	},
 };
 
 static const struct user_regset_view user_x86_32_view = {
@@ -1357,9 +1489,8 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
 #endif
 }
 
-#ifdef CONFIG_X86_32
-
-void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
+void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
+					 int error_code, int si_code)
 {
 	struct siginfo info;
 
@@ -1368,7 +1499,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
 
 	memset(&info, 0, sizeof(info));
 	info.si_signo = SIGTRAP;
-	info.si_code = TRAP_BRKPT;
+	info.si_code = si_code;
 
 	/* User-mode ip? */
 	info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
@@ -1377,143 +1508,83 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
 	force_sig_info(SIGTRAP, &info, tsk);
 }
 
-/* notification of system call entry/exit
- * - triggered by current->work.syscall_trace
- */
-int do_syscall_trace(struct pt_regs *regs, int entryexit)
-{
-	int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
-	/*
-	 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
-	 * interception
-	 */
-	int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
-	int ret = 0;
-
-	/* do the secure computing check first */
-	if (!entryexit)
-		secure_computing(regs->orig_ax);
-
-	if (unlikely(current->audit_context)) {
-		if (entryexit)
-			audit_syscall_exit(AUDITSC_RESULT(regs->ax),
-						regs->ax);
-		/* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
-		 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
-		 * not used, entry.S will call us only on syscall exit, not
-		 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
-		 * calling send_sigtrap() on syscall entry.
-		 *
-		 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
-		 * is_singlestep is false, despite his name, so we will still do
-		 * the correct thing.
-		 */
-		else if (is_singlestep)
-			goto out;
-	}
-
-	if (!(current->ptrace & PT_PTRACED))
-		goto out;
-
-	/* If a process stops on the 1st tracepoint with SYSCALL_TRACE
-	 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
-	 * here. We have to check this and return */
-	if (is_sysemu && entryexit)
-		return 0;
-
-	/* Fake a debug trap */
-	if (is_singlestep)
-		send_sigtrap(current, regs, 0);
-
- 	if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
-		goto out;
-
-	/* the 0x80 provides a way for the tracing parent to distinguish
-	   between a syscall stop and SIGTRAP delivery */
-	/* Note that the debugger could change the result of test_thread_flag!*/
-	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
 
-	/*
-	 * this isn't the same as continuing with a signal, but it will do
-	 * for normal use.  strace only continues with a signal if the
-	 * stopping signal is not SIGTRAP.  -brl
-	 */
-	if (current->exit_code) {
-		send_sig(current->exit_code, current, 1);
-		current->exit_code = 0;
-	}
-	ret = is_sysemu;
-out:
-	if (unlikely(current->audit_context) && !entryexit)
-		audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
-				    regs->bx, regs->cx, regs->dx, regs->si);
-	if (ret == 0)
-		return 0;
-
-	regs->orig_ax = -1; /* force skip of syscall restarting */
-	if (unlikely(current->audit_context))
-		audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
-	return 1;
-}
-
-#else  /* CONFIG_X86_64 */
+#ifdef CONFIG_X86_32
+# define IS_IA32	1
+#elif defined CONFIG_IA32_EMULATION
+# define IS_IA32	test_thread_flag(TIF_IA32)
+#else
+# define IS_IA32	0
+#endif
 
-static void syscall_trace(struct pt_regs *regs)
+/*
+ * We must return the syscall number to actually look up in the table.
+ * This can be -1L to skip running any syscall at all.
+ */
+asmregparm long syscall_trace_enter(struct pt_regs *regs)
 {
+	long ret = 0;
 
-#if 0
-	printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
-	       current->comm,
-	       regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
-	       current_thread_info()->flags, current->ptrace);
-#endif
-
-	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
-				? 0x80 : 0));
 	/*
-	 * this isn't the same as continuing with a signal, but it will do
-	 * for normal use.  strace only continues with a signal if the
-	 * stopping signal is not SIGTRAP.  -brl
+	 * If we stepped into a sysenter/syscall insn, it trapped in
+	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
+	 * If user-mode had set TF itself, then it's still clear from
+	 * do_debug() and we need to set it again to restore the user
+	 * state.  If we entered on the slow path, TF was already set.
 	 */
-	if (current->exit_code) {
-		send_sig(current->exit_code, current, 1);
-		current->exit_code = 0;
-	}
-}
+	if (test_thread_flag(TIF_SINGLESTEP))
+		regs->flags |= X86_EFLAGS_TF;
 
-asmlinkage void syscall_trace_enter(struct pt_regs *regs)
-{
 	/* do the secure computing check first */
 	secure_computing(regs->orig_ax);
 
-	if (test_thread_flag(TIF_SYSCALL_TRACE)
-	    && (current->ptrace & PT_PTRACED))
-		syscall_trace(regs);
+	if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
+		ret = -1L;
+
+	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
+	    tracehook_report_syscall_entry(regs))
+		ret = -1L;
 
 	if (unlikely(current->audit_context)) {
-		if (test_thread_flag(TIF_IA32)) {
+		if (IS_IA32)
 			audit_syscall_entry(AUDIT_ARCH_I386,
 					    regs->orig_ax,
 					    regs->bx, regs->cx,
 					    regs->dx, regs->si);
-		} else {
+#ifdef CONFIG_X86_64
+		else
 			audit_syscall_entry(AUDIT_ARCH_X86_64,
 					    regs->orig_ax,
 					    regs->di, regs->si,
 					    regs->dx, regs->r10);
-		}
+#endif
 	}
+
+	return ret ?: regs->orig_ax;
 }
 
-asmlinkage void syscall_trace_leave(struct pt_regs *regs)
+asmregparm void syscall_trace_leave(struct pt_regs *regs)
 {
 	if (unlikely(current->audit_context))
 		audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
 
-	if ((test_thread_flag(TIF_SYSCALL_TRACE)
-	     || test_thread_flag(TIF_SINGLESTEP))
-	    && (current->ptrace & PT_PTRACED))
-		syscall_trace(regs);
-}
+	if (test_thread_flag(TIF_SYSCALL_TRACE))
+		tracehook_report_syscall_exit(regs, 0);
 
-#endif	/* CONFIG_X86_32 */
+	/*
+	 * If TIF_SYSCALL_EMU is set, we only get here because of
+	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
+	 * We already reported this syscall instruction in
+	 * syscall_trace_enter(), so don't do any more now.
+	 */
+	if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
+		return;
+
+	/*
+	 * If we are single-stepping, synthesize a trap to follow the
+	 * system call instruction.
+	 */
+	if (test_thread_flag(TIF_SINGLESTEP) &&
+	    tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
+		send_sigtrap(current, regs, 0, TRAP_BRKPT);
+}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 05fbe9a0325a..4f9c55f3a7c0 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
 	return dst->version;
 }
 
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
+{
+	u64 pv_tsc_khz = 1000000ULL << 32;
+
+	do_div(pv_tsc_khz, src->tsc_to_system_mul);
+	if (src->tsc_shift < 0)
+		pv_tsc_khz <<= -src->tsc_shift;
+	else
+		pv_tsc_khz >>= src->tsc_shift;
+	return pv_tsc_khz;
+}
+
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 {
 	struct pvclock_shadow_time shadow;
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index d13858818100..67465ed89310 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -35,9 +35,6 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
 	if (!(word & (1 << 13))) {
 		dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
 			"disabling irq balancing and affinity\n");
-#ifdef CONFIG_IRQBALANCE
-		irqbalance_disable("");
-#endif
 		noirqdebug_setup("");
 #ifdef CONFIG_PROC_FS
 		no_irq_affinity = 1;
@@ -354,9 +351,27 @@ static void ati_force_hpet_resume(void)
 	printk(KERN_DEBUG "Force enabled HPET at resume\n");
 }
 
+static u32 ati_ixp4x0_rev(struct pci_dev *dev)
+{
+	u32 d;
+	u8  b;
+
+	pci_read_config_byte(dev, 0xac, &b);
+	b &= ~(1<<5);
+	pci_write_config_byte(dev, 0xac, b);
+	pci_read_config_dword(dev, 0x70, &d);
+	d |= 1<<8;
+	pci_write_config_dword(dev, 0x70, d);
+	pci_read_config_dword(dev, 0x8, &d);
+	d &= 0xff;
+	dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d);
+	return d;
+}
+
 static void ati_force_enable_hpet(struct pci_dev *dev)
 {
-	u32 uninitialized_var(val);
+	u32 d, val;
+	u8  b;
 
 	if (hpet_address || force_hpet_address)
 		return;
@@ -366,14 +381,33 @@ static void ati_force_enable_hpet(struct pci_dev *dev)
 		return;
 	}
 
+	d = ati_ixp4x0_rev(dev);
+	if (d  < 0x82)
+		return;
+
+	/* base address */
 	pci_write_config_dword(dev, 0x14, 0xfed00000);
 	pci_read_config_dword(dev, 0x14, &val);
+
+	/* enable interrupt */
+	outb(0x72, 0xcd6); b = inb(0xcd7);
+	b |= 0x1;
+	outb(0x72, 0xcd6); outb(b, 0xcd7);
+	outb(0x72, 0xcd6); b = inb(0xcd7);
+	if (!(b & 0x1))
+		return;
+	pci_read_config_dword(dev, 0x64, &d);
+	d |= (1<<10);
+	pci_write_config_dword(dev, 0x64, d);
+	pci_read_config_dword(dev, 0x64, &d);
+	if (!(d & (1<<10)))
+		return;
+
 	force_hpet_address = val;
 	force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
 	dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
 		   force_hpet_address);
 	cached_dev = dev;
-	return;
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
 			 ati_force_enable_hpet);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f8a62160e151..f4c93f1cfc19 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off);
 
 static const struct desc_ptr no_idt = {};
 static int reboot_mode;
-enum reboot_type reboot_type = BOOT_KBD;
+/*
+ * Keyboard reset and triple fault may result in INIT, not RESET, which
+ * doesn't work when we're in vmx root mode.  Try ACPI first.
+ */
+enum reboot_type reboot_type = BOOT_ACPI;
 int reboot_force;
 
 #if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
@@ -177,6 +181,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
 		},
 	},
+	{	/* Handle problems with rebooting on Dell T5400's */
+		.callback = set_bios_reboot,
+		.ident = "Dell Precision T5400",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
+		},
+	},
 	{	/* Handle problems with rebooting on HP laptops */
 		.callback = set_bios_reboot,
 		.ident = "HP Compaq Laptop",
@@ -403,10 +415,9 @@ void native_machine_shutdown(void)
 {
 	/* Stop the cpus and apics */
 #ifdef CONFIG_SMP
-	int reboot_cpu_id;
 
 	/* The boot cpu is always logical cpu 0 */
-	reboot_cpu_id = 0;
+	int reboot_cpu_id = 0;
 
 #ifdef CONFIG_X86_32
 	/* See if there has been given a command line override */
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c30fe25d470d..6f50664b2ba5 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -20,11 +20,45 @@
 #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
 #define PAE_PGD_ATTR (_PAGE_PRESENT)
 
+/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
+ * ~ control_page + PAGE_SIZE are used as data storage and stack for
+ * jumping back
+ */
+#define DATA(offset)		(KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
+
+/* Minimal CPU state */
+#define ESP			DATA(0x0)
+#define CR0			DATA(0x4)
+#define CR3			DATA(0x8)
+#define CR4			DATA(0xc)
+
+/* other data */
+#define CP_VA_CONTROL_PAGE	DATA(0x10)
+#define CP_PA_PGD		DATA(0x14)
+#define CP_PA_SWAP_PAGE		DATA(0x18)
+#define CP_PA_BACKUP_PAGES_MAP	DATA(0x1c)
+
 	.text
 	.align PAGE_SIZE
 	.globl relocate_kernel
 relocate_kernel:
-	movl	8(%esp), %ebp /* list of pages */
+	/* Save the CPU context, used for jumping back */
+
+	pushl	%ebx
+	pushl	%esi
+	pushl	%edi
+	pushl	%ebp
+	pushf
+
+	movl	20+8(%esp), %ebp /* list of pages */
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %edi
+	movl	%esp, ESP(%edi)
+	movl	%cr0, %eax
+	movl	%eax, CR0(%edi)
+	movl	%cr3, %eax
+	movl	%eax, CR3(%edi)
+	movl	%cr4, %eax
+	movl	%eax, CR4(%edi)
 
 #ifdef CONFIG_X86_PAE
 	/* map the control page at its virtual address */
@@ -138,15 +172,25 @@ relocate_kernel:
 
 relocate_new_kernel:
 	/* read the arguments and say goodbye to the stack */
-	movl  4(%esp), %ebx /* page_list */
-	movl  8(%esp), %ebp /* list of pages */
-	movl  12(%esp), %edx /* start address */
-	movl  16(%esp), %ecx /* cpu_has_pae */
+	movl  20+4(%esp), %ebx /* page_list */
+	movl  20+8(%esp), %ebp /* list of pages */
+	movl  20+12(%esp), %edx /* start address */
+	movl  20+16(%esp), %ecx /* cpu_has_pae */
+	movl  20+20(%esp), %esi /* preserve_context */
 
 	/* zero out flags, and disable interrupts */
 	pushl $0
 	popfl
 
+	/* save some information for jumping back */
+	movl	PTR(VA_CONTROL_PAGE)(%ebp), %edi
+	movl	%edi, CP_VA_CONTROL_PAGE(%edi)
+	movl	PTR(PA_PGD)(%ebp), %eax
+	movl	%eax, CP_PA_PGD(%edi)
+	movl	PTR(PA_SWAP_PAGE)(%ebp), %eax
+	movl	%eax, CP_PA_SWAP_PAGE(%edi)
+	movl	%ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
+
 	/* get physical address of control page now */
 	/* this is impossible after page table switch */
 	movl	PTR(PA_CONTROL_PAGE)(%ebp), %edi
@@ -197,8 +241,90 @@ identity_mapped:
 	xorl	%eax, %eax
 	movl	%eax, %cr3
 
+	movl	CP_PA_SWAP_PAGE(%edi), %eax
+	pushl	%eax
+	pushl	%ebx
+	call	swap_pages
+	addl	$8, %esp
+
+	/* To be certain of avoiding problems with self-modifying code
+	 * I need to execute a serializing instruction here.
+	 * So I flush the TLB, it's handy, and not processor dependent.
+	 */
+	xorl	%eax, %eax
+	movl	%eax, %cr3
+
+	/* set all of the registers to known values */
+	/* leave %esp alone */
+
+	testl	%esi, %esi
+	jnz 1f
+	xorl	%edi, %edi
+	xorl	%eax, %eax
+	xorl	%ebx, %ebx
+	xorl    %ecx, %ecx
+	xorl    %edx, %edx
+	xorl    %esi, %esi
+	xorl    %ebp, %ebp
+	ret
+1:
+	popl	%edx
+	movl	CP_PA_SWAP_PAGE(%edi), %esp
+	addl	$PAGE_SIZE, %esp
+2:
+	call	*%edx
+
+	/* get the re-entry point of the peer system */
+	movl	0(%esp), %ebp
+	call	1f
+1:
+	popl	%ebx
+	subl	$(1b - relocate_kernel), %ebx
+	movl	CP_VA_CONTROL_PAGE(%ebx), %edi
+	lea	PAGE_SIZE(%ebx), %esp
+	movl	CP_PA_SWAP_PAGE(%ebx), %eax
+	movl	CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
+	pushl	%eax
+	pushl	%edx
+	call	swap_pages
+	addl	$8, %esp
+	movl	CP_PA_PGD(%ebx), %eax
+	movl	%eax, %cr3
+	movl	%cr0, %eax
+	orl	$(1<<31), %eax
+	movl	%eax, %cr0
+	lea	PAGE_SIZE(%edi), %esp
+	movl	%edi, %eax
+	addl	$(virtual_mapped - relocate_kernel), %eax
+	pushl	%eax
+	ret
+
+virtual_mapped:
+	movl	CR4(%edi), %eax
+	movl	%eax, %cr4
+	movl	CR3(%edi), %eax
+	movl	%eax, %cr3
+	movl	CR0(%edi), %eax
+	movl	%eax, %cr0
+	movl	ESP(%edi), %esp
+	movl	%ebp, %eax
+
+	popf
+	popl	%ebp
+	popl	%edi
+	popl	%esi
+	popl	%ebx
+	ret
+
 	/* Do the copies */
-	movl	%ebx, %ecx
+swap_pages:
+	movl	8(%esp), %edx
+	movl	4(%esp), %ecx
+	pushl	%ebp
+	pushl	%ebx
+	pushl	%edi
+	pushl	%esi
+	movl	%ecx, %ebx
 	jmp	1f
 
 0:	/* top, read another word from the indirection page */
@@ -226,27 +352,31 @@ identity_mapped:
 	movl    %ecx,   %esi /* For every source page do a copy */
 	andl    $0xfffff000, %esi
 
+	movl	%edi, %eax
+	movl	%esi, %ebp
+
+	movl	%edx, %edi
 	movl    $1024, %ecx
 	rep ; movsl
-	jmp     0b
-
-3:
 
-	/* To be certain of avoiding problems with self-modifying code
-	 * I need to execute a serializing instruction here.
-	 * So I flush the TLB, it's handy, and not processor dependent.
-	 */
-	xorl	%eax, %eax
-	movl	%eax, %cr3
+	movl	%ebp, %edi
+	movl	%eax, %esi
+	movl	$1024, %ecx
+	rep ; movsl
 
-	/* set all of the registers to known values */
-	/* leave %esp alone */
+	movl	%eax, %edi
+	movl	%edx, %esi
+	movl	$1024, %ecx
+	rep ; movsl
 
-	xorl	%eax, %eax
-	xorl	%ebx, %ebx
-	xorl    %ecx, %ecx
-	xorl    %edx, %edx
-	xorl    %esi, %esi
-	xorl    %edi, %edi
-	xorl    %ebp, %ebp
+	lea	PAGE_SIZE(%ebp), %esi
+	jmp     0b
+3:
+	popl	%esi
+	popl	%edi
+	popl	%ebx
+	popl	%ebp
 	ret
+
+	.globl kexec_control_code_size
+.set kexec_control_code_size, . - relocate_kernel
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 05191bbc68b8..dd6f2b71561b 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -52,7 +52,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
 
 	cmos_minutes = CMOS_READ(RTC_MINUTES);
 	if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
-		BCD_TO_BIN(cmos_minutes);
+		cmos_minutes = bcd2bin(cmos_minutes);
 
 	/*
 	 * since we're only adjusting minutes and seconds,
@@ -69,8 +69,8 @@ int mach_set_rtc_mmss(unsigned long nowtime)
 
 	if (abs(real_minutes - cmos_minutes) < 30) {
 		if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
-			BIN_TO_BCD(real_seconds);
-			BIN_TO_BCD(real_minutes);
+			real_seconds = bin2bcd(real_seconds);
+			real_minutes = bin2bcd(real_minutes);
 		}
 		CMOS_WRITE(real_seconds,RTC_SECONDS);
 		CMOS_WRITE(real_minutes,RTC_MINUTES);
@@ -124,16 +124,16 @@ unsigned long mach_get_cmos_time(void)
 	WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
 
 	if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
-		BCD_TO_BIN(sec);
-		BCD_TO_BIN(min);
-		BCD_TO_BIN(hour);
-		BCD_TO_BIN(day);
-		BCD_TO_BIN(mon);
-		BCD_TO_BIN(year);
+		sec = bcd2bin(sec);
+		min = bcd2bin(min);
+		hour = bcd2bin(hour);
+		day = bcd2bin(day);
+		mon = bcd2bin(mon);
+		year = bcd2bin(year);
 	}
 
 	if (century) {
-		BCD_TO_BIN(century);
+		century = bcd2bin(century);
 		year += century * 100;
 		printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
 	} else
@@ -223,11 +223,25 @@ static struct platform_device rtc_device = {
 static __init int add_rtc_cmos(void)
 {
 #ifdef CONFIG_PNP
-	if (!pnp_platform_devices)
-		platform_device_register(&rtc_device);
-#else
+	static const char *ids[] __initconst =
+	    { "PNP0b00", "PNP0b01", "PNP0b02", };
+	struct pnp_dev *dev;
+	struct pnp_id *id;
+	int i;
+
+	pnp_for_each_dev(dev) {
+		for (id = dev->id; id; id = id->next) {
+			for (i = 0; i < ARRAY_SIZE(ids); i++) {
+				if (compare_pnp_id(id, ids[i]) != 0)
+					return 0;
+			}
+		}
+	}
+#endif
+
 	platform_device_register(&rtc_device);
-#endif /* CONFIG_PNP */
+	dev_info(&rtc_device.dev,
+		 "registered platform RTC device (no PNP device found)\n");
 	return 0;
 }
 device_initcall(add_rtc_cmos);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 531b55b8e81a..0fa6790c1dd3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -57,12 +57,8 @@
 #include <linux/slab.h>
 #include <linux/user.h>
 #include <linux/delay.h>
-#include <linux/highmem.h>
 
 #include <linux/kallsyms.h>
-#include <linux/edd.h>
-#include <linux/iscsi_ibft.h>
-#include <linux/kexec.h>
 #include <linux/cpufreq.h>
 #include <linux/dma-mapping.h>
 #include <linux/ctype.h>
@@ -96,7 +92,7 @@
 #include <asm/smp.h>
 #include <asm/desc.h>
 #include <asm/dma.h>
-#include <asm/gart.h>
+#include <asm/iommu.h>
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
 
@@ -104,7 +100,6 @@
 #include <asm/paravirt.h>
 
 #include <asm/percpu.h>
-#include <asm/sections.h>
 #include <asm/topology.h>
 #include <asm/apicdef.h>
 #ifdef CONFIG_X86_64
@@ -228,6 +223,9 @@ unsigned long saved_video_mode;
 #define RAMDISK_LOAD_FLAG		0x4000
 
 static char __initdata command_line[COMMAND_LINE_SIZE];
+#ifdef CONFIG_CMDLINE_BOOL
+static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
+#endif
 
 #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
 struct edd edd;
@@ -304,7 +302,7 @@ static void __init relocate_initrd(void)
 		if (clen > MAX_MAP_CHUNK-slop)
 			clen = MAX_MAP_CHUNK-slop;
 		mapaddr = ramdisk_image & PAGE_MASK;
-		p = early_ioremap(mapaddr, clen+slop);
+		p = early_memremap(mapaddr, clen+slop);
 		memcpy(q, p+slop, clen);
 		early_iounmap(p, clen+slop);
 		q += clen;
@@ -381,7 +379,7 @@ static void __init parse_setup_data(void)
 		return;
 	pa_data = boot_params.hdr.setup_data;
 	while (pa_data) {
-		data = early_ioremap(pa_data, PAGE_SIZE);
+		data = early_memremap(pa_data, PAGE_SIZE);
 		switch (data->type) {
 		case SETUP_E820_EXT:
 			parse_e820_ext(data, pa_data);
@@ -404,7 +402,7 @@ static void __init e820_reserve_setup_data(void)
 		return;
 	pa_data = boot_params.hdr.setup_data;
 	while (pa_data) {
-		data = early_ioremap(pa_data, sizeof(*data));
+		data = early_memremap(pa_data, sizeof(*data));
 		e820_update_range(pa_data, sizeof(*data)+data->len,
 			 E820_RAM, E820_RESERVED_KERN);
 		found = 1;
@@ -430,7 +428,7 @@ static void __init reserve_early_setup_data(void)
 		return;
 	pa_data = boot_params.hdr.setup_data;
 	while (pa_data) {
-		data = early_ioremap(pa_data, sizeof(*data));
+		data = early_memremap(pa_data, sizeof(*data));
 		sprintf(buf, "setup data %x", data->type);
 		reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
 		pa_data = data->next;
@@ -450,7 +448,7 @@ static void __init reserve_early_setup_data(void)
  * @size: Size of the crashkernel memory to reserve.
  * Returns the base address on success, and -1ULL on failure.
  */
-unsigned long long find_and_reserve_crashkernel(unsigned long long size)
+unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
 {
 	const unsigned long long alignment = 16<<20; 	/* 16M */
 	unsigned long long start = 0LL;
@@ -563,7 +561,13 @@ static void __init reserve_standard_io_resources(void)
 
 }
 
-#ifdef CONFIG_PROC_VMCORE
+/*
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence
+ * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+
+#ifdef CONFIG_CRASH_DUMP
 /* elfcorehdr= specifies the location of elf core header
  * stored by the crashed kernel. This option will be passed
  * by kexec loader to the capture kernel.
@@ -579,6 +583,194 @@ static int __init setup_elfcorehdr(char *arg)
 early_param("elfcorehdr", setup_elfcorehdr);
 #endif
 
+static struct x86_quirks default_x86_quirks __initdata;
+
+struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
+
+/*
+ * Some BIOSes seem to corrupt the low 64k of memory during events
+ * like suspend/resume and unplugging an HDMI cable.  Reserve all
+ * remaining free memory in that area and fill it with a distinct
+ * pattern.
+ */
+#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
+#define MAX_SCAN_AREAS	8
+
+static int __read_mostly memory_corruption_check = -1;
+
+static unsigned __read_mostly corruption_check_size = 64*1024;
+static unsigned __read_mostly corruption_check_period = 60; /* seconds */
+
+static struct e820entry scan_areas[MAX_SCAN_AREAS];
+static int num_scan_areas;
+
+
+static int set_corruption_check(char *arg)
+{
+	char *end;
+
+	memory_corruption_check = simple_strtol(arg, &end, 10);
+
+	return (*end == 0) ? 0 : -EINVAL;
+}
+early_param("memory_corruption_check", set_corruption_check);
+
+static int set_corruption_check_period(char *arg)
+{
+	char *end;
+
+	corruption_check_period = simple_strtoul(arg, &end, 10);
+
+	return (*end == 0) ? 0 : -EINVAL;
+}
+early_param("memory_corruption_check_period", set_corruption_check_period);
+
+static int set_corruption_check_size(char *arg)
+{
+	char *end;
+	unsigned size;
+
+	size = memparse(arg, &end);
+
+	if (*end == '\0')
+		corruption_check_size = size;
+
+	return (size == corruption_check_size) ? 0 : -EINVAL;
+}
+early_param("memory_corruption_check_size", set_corruption_check_size);
+
+
+static void __init setup_bios_corruption_check(void)
+{
+	u64 addr = PAGE_SIZE;	/* assume first page is reserved anyway */
+
+	if (memory_corruption_check == -1) {
+		memory_corruption_check =
+#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
+			1
+#else
+			0
+#endif
+			;
+	}
+
+	if (corruption_check_size == 0)
+		memory_corruption_check = 0;
+
+	if (!memory_corruption_check)
+		return;
+
+	corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
+
+	while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
+		u64 size;
+		addr = find_e820_area_size(addr, &size, PAGE_SIZE);
+
+		if (addr == 0)
+			break;
+
+		if ((addr + size) > corruption_check_size)
+			size = corruption_check_size - addr;
+
+		if (size == 0)
+			break;
+
+		e820_update_range(addr, size, E820_RAM, E820_RESERVED);
+		scan_areas[num_scan_areas].addr = addr;
+		scan_areas[num_scan_areas].size = size;
+		num_scan_areas++;
+
+		/* Assume we've already mapped this early memory */
+		memset(__va(addr), 0, size);
+
+		addr += size;
+	}
+
+	printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
+	       num_scan_areas);
+	update_e820();
+}
+
+static struct timer_list periodic_check_timer;
+
+void check_for_bios_corruption(void)
+{
+	int i;
+	int corruption = 0;
+
+	if (!memory_corruption_check)
+		return;
+
+	for(i = 0; i < num_scan_areas; i++) {
+		unsigned long *addr = __va(scan_areas[i].addr);
+		unsigned long size = scan_areas[i].size;
+
+		for(; size; addr++, size -= sizeof(unsigned long)) {
+			if (!*addr)
+				continue;
+			printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
+			       addr, __pa(addr), *addr);
+			corruption = 1;
+			*addr = 0;
+		}
+	}
+
+	WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
+}
+
+static void periodic_check_for_corruption(unsigned long data)
+{
+	check_for_bios_corruption();
+	mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
+}
+
+void start_periodic_check_for_corruption(void)
+{
+	if (!memory_corruption_check || corruption_check_period == 0)
+		return;
+
+	printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
+	       corruption_check_period);
+
+	init_timer(&periodic_check_timer);
+	periodic_check_timer.function = &periodic_check_for_corruption;
+	periodic_check_for_corruption(0);
+}
+#endif
+
+static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
+{
+	printk(KERN_NOTICE
+		"%s detected: BIOS may corrupt low RAM, working it around.\n",
+		d->ident);
+
+	e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
+	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
+
+	return 0;
+}
+
+/* List of systems that have known low memory corruption BIOS problems */
+static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
+#ifdef CONFIG_X86_RESERVE_LOW_64K
+	{
+		.callback = dmi_low_memory_corruption,
+		.ident = "AMI BIOS",
+		.matches = {
+			DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
+		},
+	},
+	{
+		.callback = dmi_low_memory_corruption,
+		.ident = "Phoenix BIOS",
+		.matches = {
+			DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
+		},
+	},
+#endif
+	{}
+};
+
 /*
  * Determine if we were loaded by an EFI loader.  If so, then we have also been
  * passed the efi memmap, systab, etc., so we should use these data structures
@@ -598,11 +790,11 @@ void __init setup_arch(char **cmdline_p)
 	memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
 	visws_early_detect();
 	pre_setup_arch_hook();
-	early_cpu_init();
 #else
 	printk(KERN_INFO "Command line: %s\n", boot_command_line);
 #endif
 
+	early_cpu_init();
 	early_ioremap_init();
 
 	ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
@@ -666,14 +858,36 @@ void __init setup_arch(char **cmdline_p)
 	bss_resource.start = virt_to_phys(&__bss_start);
 	bss_resource.end = virt_to_phys(&__bss_stop)-1;
 
-#ifdef CONFIG_X86_64
-	early_cpu_init();
+#ifdef CONFIG_CMDLINE_BOOL
+#ifdef CONFIG_CMDLINE_OVERRIDE
+	strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+#else
+	if (builtin_cmdline[0]) {
+		/* append boot loader cmdline to builtin */
+		strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
+		strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
+		strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
+	}
 #endif
+#endif
+
 	strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
 	*cmdline_p = command_line;
 
 	parse_early_param();
 
+#ifdef CONFIG_X86_64
+	check_efer();
+#endif
+
+#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
+	/*
+	 * Must be before kernel pagetables are setup
+	 * or fixmap area is touched.
+	 */
+	vmi_init();
+#endif
+
 	/* after early param, so could get panic from serial */
 	reserve_early_setup_data();
 
@@ -681,7 +895,7 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_X86_LOCAL_APIC
 		disable_apic = 1;
 #endif
-		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+		setup_clear_cpu_cap(X86_FEATURE_APIC);
 	}
 
 #ifdef CONFIG_PCI
@@ -691,6 +905,10 @@ void __init setup_arch(char **cmdline_p)
 
 	finish_e820_parsing();
 
+	dmi_scan_machine();
+
+	dmi_check_system(bad_bios_dmi_table);
+
 #ifdef CONFIG_X86_32
 	probe_roms();
 #endif
@@ -734,7 +952,8 @@ void __init setup_arch(char **cmdline_p)
 #else
 	num_physpages = max_pfn;
 
-	check_efer();
+ 	if (cpu_has_x2apic)
+ 		check_x2apic();
 
 	/* How many end-of-memory variables you have, grandma! */
 	/* need this before calling reserve_initrd */
@@ -746,6 +965,10 @@ void __init setup_arch(char **cmdline_p)
 	high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
 #endif
 
+#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
+	setup_bios_corruption_check();
+#endif
+
 	/* max_pfn_mapped is updated here */
 	max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
 	max_pfn_mapped = max_low_pfn_mapped;
@@ -774,8 +997,6 @@ void __init setup_arch(char **cmdline_p)
 	vsmp_init();
 #endif
 
-	dmi_scan_machine();
-
 	io_delay_init();
 
 	/*
@@ -783,6 +1004,8 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	acpi_boot_table_init();
 
+	early_acpi_boot_init();
+
 #ifdef CONFIG_ACPI_NUMA
 	/*
 	 * Parse SRAT to discover nodes.
@@ -792,10 +1015,6 @@ void __init setup_arch(char **cmdline_p)
 
 	initmem_init(0, max_pfn);
 
-#ifdef CONFIG_X86_64
-	dma32_reserve_bootmem();
-#endif
-
 #ifdef CONFIG_ACPI_SLEEP
 	/*
 	 * Reserve low memory region for sleep support.
@@ -810,21 +1029,25 @@ void __init setup_arch(char **cmdline_p)
 #endif
 	reserve_crashkernel();
 
+#ifdef CONFIG_X86_64
+	/*
+	 * dma32_reserve_bootmem() allocates bootmem which may conflict
+	 * with the crashkernel command line, so do that after
+	 * reserve_crashkernel()
+	 */
+	dma32_reserve_bootmem();
+#endif
+
 	reserve_ibft_region();
 
 #ifdef CONFIG_KVM_CLOCK
 	kvmclock_init();
 #endif
 
-#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
-	/*
-	 * Must be after max_low_pfn is determined, and before kernel
-	 * pagetables are setup.
-	 */
-	vmi_init();
-#endif
-
+	paravirt_pagetable_setup_start(swapper_pg_dir);
 	paging_init();
+	paravirt_pagetable_setup_done(swapper_pg_dir);
+	paravirt_post_allocator_init();
 
 #ifdef CONFIG_X86_64
 	map_vsyscall();
@@ -850,27 +1073,17 @@ void __init setup_arch(char **cmdline_p)
 #endif
 
 	prefill_possible_map();
+
 #ifdef CONFIG_X86_64
 	init_cpu_to_node();
 #endif
 
-#ifdef CONFIG_X86_NUMAQ
-	/*
-	 * need to check online nodes num, call it
-	 * here before time_init/tsc_init
-	 */
-	numaq_tsc_disable();
-#endif
-
 	init_apic_mappings();
 	ioapic_init_mappings();
 
-#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
-	if (def_to_bigsmp)
-		printk(KERN_WARNING "More than 8 CPUs detected and "
-			"CONFIG_X86_PC cannot handle it.\nUse "
-			"CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
-#endif
+	/* need to wait for io_apic is mapped */
+	nr_irqs = probe_nr_irqs();
+
 	kvm_guest_init();
 
 	e820_reserve_resources();
@@ -892,3 +1105,5 @@ void __init setup_arch(char **cmdline_p)
 #endif
 #endif
 }
+
+
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index cac68430d31f..ae0c0d3bb770 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -80,24 +80,6 @@ static void __init setup_per_cpu_maps(void)
 #endif
 }
 
-#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
-cpumask_t *cpumask_of_cpu_map __read_mostly;
-EXPORT_SYMBOL(cpumask_of_cpu_map);
-
-/* requires nr_cpu_ids to be initialized */
-static void __init setup_cpumask_of_cpu(void)
-{
-	int i;
-
-	/* alloc_bootmem zeroes memory */
-	cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
-	for (i = 0; i < nr_cpu_ids; i++)
-		cpu_set(i, cpumask_of_cpu_map[i]);
-}
-#else
-static inline void setup_cpumask_of_cpu(void) { }
-#endif
-
 #ifdef CONFIG_X86_32
 /*
  * Great future not-so-futuristic plan: make i386 and x86_64 do it
@@ -158,35 +140,47 @@ static void __init setup_cpu_pda_map(void)
  */
 void __init setup_per_cpu_areas(void)
 {
-	ssize_t size = PERCPU_ENOUGH_ROOM;
+	ssize_t size, old_size;
 	char *ptr;
 	int cpu;
+	unsigned long align = 1;
 
 	/* Setup cpu_pda map */
 	setup_cpu_pda_map();
 
 	/* Copy section for each CPU (we discard the original) */
-	size = PERCPU_ENOUGH_ROOM;
+	old_size = PERCPU_ENOUGH_ROOM;
+	align = max_t(unsigned long, PAGE_SIZE, align);
+	size = roundup(old_size, align);
 	printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
 			  size);
 
 	for_each_possible_cpu(cpu) {
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-		ptr = alloc_bootmem_pages(size);
+		ptr = __alloc_bootmem(size, align,
+				 __pa(MAX_DMA_ADDRESS));
 #else
 		int node = early_cpu_to_node(cpu);
 		if (!node_online(node) || !NODE_DATA(node)) {
-			ptr = alloc_bootmem_pages(size);
+			ptr = __alloc_bootmem(size, align,
+					 __pa(MAX_DMA_ADDRESS));
 			printk(KERN_INFO
 			       "cpu %d has no node %d or node-local memory\n",
 				cpu, node);
+			if (ptr)
+				printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
+					 cpu, __pa(ptr));
+		}
+		else {
+			ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
+							__pa(MAX_DMA_ADDRESS));
+			if (ptr)
+				printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
+					 cpu, node, __pa(ptr));
 		}
-		else
-			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
 #endif
 		per_cpu_offset(cpu) = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-
 	}
 
 	printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
@@ -197,9 +191,6 @@ void __init setup_per_cpu_areas(void)
 
 	/* Setup node to cpumask map */
 	setup_node_to_cpumask_map();
-
-	/* Setup cpumask_of_cpu map */
-	setup_cpumask_of_cpu();
 }
 
 #endif
@@ -227,8 +218,8 @@ static void __init setup_node_to_cpumask_map(void)
 	/* allocate the map */
 	map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
 
-	Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
-		map, nr_node_ids);
+	pr_debug("Node to cpumask map at %p for %d nodes\n",
+		 map, nr_node_ids);
 
 	/* node_to_cpumask() will now work */
 	node_to_cpumask_map = map;
@@ -248,7 +239,7 @@ void __cpuinit numa_set_node(int cpu, int node)
 		per_cpu(x86_cpu_to_node_map, cpu) = node;
 
 	else
-		Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
+		pr_debug("Setting node for non-present cpu %d\n", cpu);
 }
 
 void __cpuinit numa_clear_node(int cpu)
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
index 72bbb519d2dc..cc673aa55ce4 100644
--- a/arch/x86/kernel/sigframe.h
+++ b/arch/x86/kernel/sigframe.h
@@ -3,9 +3,18 @@ struct sigframe {
 	char __user *pretcode;
 	int sig;
 	struct sigcontext sc;
-	struct _fpstate fpstate;
+	/*
+	 * fpstate is unused. fpstate is moved/allocated after
+	 * retcode[] below. This movement allows to have the FP state and the
+	 * future state extensions (xsave) stay together.
+	 * And at the same time retaining the unused fpstate, prevents changing
+	 * the offset of extramask[] in the sigframe and thus prevent any
+	 * legacy application accessing/modifying it.
+	 */
+	struct _fpstate fpstate_unused;
 	unsigned long extramask[_NSIG_WORDS-1];
 	char retcode[8];
+	/* fp state follows here */
 };
 
 struct rt_sigframe {
@@ -15,13 +24,19 @@ struct rt_sigframe {
 	void __user *puc;
 	struct siginfo info;
 	struct ucontext uc;
-	struct _fpstate fpstate;
 	char retcode[8];
+	/* fp state follows here */
 };
 #else
 struct rt_sigframe {
 	char __user *pretcode;
 	struct ucontext uc;
 	struct siginfo info;
+	/* fp state follows here */
 };
+
+int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+		sigset_t *set, struct pt_regs *regs);
+int ia32_setup_frame(int sig, struct k_sigaction *ka,
+		sigset_t *set, struct pt_regs *regs);
 #endif
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index d92373630963..d6dd057d0f22 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -17,6 +17,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
+#include <linux/tracehook.h>
 #include <linux/elf.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
@@ -26,6 +27,8 @@
 #include <asm/uaccess.h>
 #include <asm/i387.h>
 #include <asm/vdso.h>
+#include <asm/syscall.h>
+#include <asm/syscalls.h>
 
 #include "sigframe.h"
 
@@ -110,6 +113,27 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
 	return do_sigaltstack(uss, uoss, regs->sp);
 }
 
+#define COPY(x)			{		\
+	err |= __get_user(regs->x, &sc->x);	\
+}
+
+#define COPY_SEG(seg)		{			\
+		unsigned short tmp;			\
+		err |= __get_user(tmp, &sc->seg);	\
+		regs->seg = tmp;			\
+}
+
+#define COPY_SEG_STRICT(seg)	{			\
+		unsigned short tmp;			\
+		err |= __get_user(tmp, &sc->seg);	\
+		regs->seg = tmp | 3;			\
+}
+
+#define GET_SEG(seg)		{			\
+		unsigned short tmp;			\
+		err |= __get_user(tmp, &sc->seg);	\
+		loadsegment(seg, tmp);			\
+}
 
 /*
  * Do a signal return; undo the signal stack.
@@ -118,28 +142,13 @@ static int
 restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 		   unsigned long *pax)
 {
+	void __user *buf;
+	unsigned int tmpflags;
 	unsigned int err = 0;
 
 	/* Always make any pending restarted system calls return -EINTR */
 	current_thread_info()->restart_block.fn = do_no_restart_syscall;
 
-#define COPY(x)		err |= __get_user(regs->x, &sc->x)
-
-#define COPY_SEG(seg)							\
-	{ unsigned short tmp;						\
-	  err |= __get_user(tmp, &sc->seg);				\
-	  regs->seg = tmp; }
-
-#define COPY_SEG_STRICT(seg)						\
-	{ unsigned short tmp;						\
-	  err |= __get_user(tmp, &sc->seg);				\
-	  regs->seg = tmp|3; }
-
-#define GET_SEG(seg)							\
-	{ unsigned short tmp;						\
-	  err |= __get_user(tmp, &sc->seg);				\
-	  loadsegment(seg, tmp); }
-
 	GET_SEG(gs);
 	COPY_SEG(fs);
 	COPY_SEG(es);
@@ -149,38 +158,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 	COPY_SEG_STRICT(cs);
 	COPY_SEG_STRICT(ss);
 
-	{
-		unsigned int tmpflags;
-
-		err |= __get_user(tmpflags, &sc->flags);
-		regs->flags = (regs->flags & ~FIX_EFLAGS) |
-						(tmpflags & FIX_EFLAGS);
-		regs->orig_ax = -1;		/* disable syscall checks */
-	}
+	err |= __get_user(tmpflags, &sc->flags);
+	regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+	regs->orig_ax = -1;		/* disable syscall checks */
 
-	{
-		struct _fpstate __user *buf;
-
-		err |= __get_user(buf, &sc->fpstate);
-		if (buf) {
-			if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
-				goto badframe;
-			err |= restore_i387(buf);
-		} else {
-			struct task_struct *me = current;
-
-			if (used_math()) {
-				clear_fpu(me);
-				clear_used_math();
-			}
-		}
-	}
+	err |= __get_user(buf, &sc->fpstate);
+	err |= restore_i387_xstate(buf);
 
 	err |= __get_user(*pax, &sc->ax);
 	return err;
-
-badframe:
-	return 1;
 }
 
 asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
@@ -212,7 +198,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
 
 badframe:
 	if (show_unhandled_signals && printk_ratelimit()) {
-		printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:"
+		printk("%s%s[%d] bad frame in sigreturn frame:"
 			"%p ip:%lx sp:%lx oeax:%lx",
 		    task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
 		    current->comm, task_pid_nr(current), frame, regs->ip,
@@ -226,9 +212,8 @@ badframe:
 	return 0;
 }
 
-asmlinkage int sys_rt_sigreturn(unsigned long __unused)
+static long do_rt_sigreturn(struct pt_regs *regs)
 {
-	struct pt_regs *regs = (struct pt_regs *)&__unused;
 	struct rt_sigframe __user *frame;
 	unsigned long ax;
 	sigset_t set;
@@ -254,15 +239,22 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused)
 	return ax;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	signal_fault(regs, frame, "rt_sigreturn");
 	return 0;
 }
 
+asmlinkage int sys_rt_sigreturn(unsigned long __unused)
+{
+	struct pt_regs *regs = (struct pt_regs *)&__unused;
+
+	return do_rt_sigreturn(regs);
+}
+
 /*
  * Set up a signal frame.
  */
 static int
-setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
+setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
 		 struct pt_regs *regs, unsigned long mask)
 {
 	int tmp, err = 0;
@@ -289,7 +281,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
 	err |= __put_user(regs->sp, &sc->sp_at_signal);
 	err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
 
-	tmp = save_i387(fpstate);
+	tmp = save_i387_xstate(fpstate);
 	if (tmp < 0)
 		err = 1;
 	else
@@ -306,7 +298,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
  * Determine which stack to use..
  */
 static inline void __user *
-get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
+get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
+	     void **fpstate)
 {
 	unsigned long sp;
 
@@ -332,6 +325,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
 			sp = (unsigned long) ka->sa.sa_restorer;
 	}
 
+	if (used_math()) {
+		sp = sp - sig_xstate_size;
+		*fpstate = (struct _fpstate *) sp;
+	}
+
 	sp -= frame_size;
 	/*
 	 * Align the stack pointer according to the i386 ABI,
@@ -343,38 +341,29 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
 }
 
 static int
-setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
-	    struct pt_regs *regs)
+__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
+	      struct pt_regs *regs)
 {
 	struct sigframe __user *frame;
 	void __user *restorer;
 	int err = 0;
-	int usig;
+	void __user *fpstate = NULL;
 
-	frame = get_sigframe(ka, regs, sizeof(*frame));
+	frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
 
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
-		goto give_sigsegv;
+		return -EFAULT;
 
-	usig = current_thread_info()->exec_domain
-		&& current_thread_info()->exec_domain->signal_invmap
-		&& sig < 32
-		? current_thread_info()->exec_domain->signal_invmap[sig]
-		: sig;
+	if (__put_user(sig, &frame->sig))
+		return -EFAULT;
 
-	err = __put_user(usig, &frame->sig);
-	if (err)
-		goto give_sigsegv;
-
-	err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
-	if (err)
-		goto give_sigsegv;
+	if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
+		return -EFAULT;
 
 	if (_NSIG_WORDS > 1) {
-		err = __copy_to_user(&frame->extramask, &set->sig[1],
-				      sizeof(frame->extramask));
-		if (err)
-			goto give_sigsegv;
+		if (__copy_to_user(&frame->extramask, &set->sig[1],
+				   sizeof(frame->extramask)))
+			return -EFAULT;
 	}
 
 	if (current->mm->context.vdso)
@@ -399,7 +388,7 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
 	err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
 
 	if (err)
-		goto give_sigsegv;
+		return -EFAULT;
 
 	/* Set up registers for signal handler */
 	regs->sp = (unsigned long)frame;
@@ -414,50 +403,43 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
 	regs->cs = __USER_CS;
 
 	return 0;
-
-give_sigsegv:
-	force_sigsegv(sig, current);
-	return -EFAULT;
 }
 
-static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-			  sigset_t *set, struct pt_regs *regs)
+static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+			    sigset_t *set, struct pt_regs *regs)
 {
 	struct rt_sigframe __user *frame;
 	void __user *restorer;
 	int err = 0;
-	int usig;
+	void __user *fpstate = NULL;
 
-	frame = get_sigframe(ka, regs, sizeof(*frame));
+	frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
 
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
-		goto give_sigsegv;
-
-	usig = current_thread_info()->exec_domain
-		&& current_thread_info()->exec_domain->signal_invmap
-		&& sig < 32
-		? current_thread_info()->exec_domain->signal_invmap[sig]
-		: sig;
+		return -EFAULT;
 
-	err |= __put_user(usig, &frame->sig);
+	err |= __put_user(sig, &frame->sig);
 	err |= __put_user(&frame->info, &frame->pinfo);
 	err |= __put_user(&frame->uc, &frame->puc);
 	err |= copy_siginfo_to_user(&frame->info, info);
 	if (err)
-		goto give_sigsegv;
+		return -EFAULT;
 
 	/* Create the ucontext.  */
-	err |= __put_user(0, &frame->uc.uc_flags);
+	if (cpu_has_xsave)
+		err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
+	else
+		err |= __put_user(0, &frame->uc.uc_flags);
 	err |= __put_user(0, &frame->uc.uc_link);
 	err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
 	err |= __put_user(sas_ss_flags(regs->sp),
 			  &frame->uc.uc_stack.ss_flags);
 	err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
-	err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
+	err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
 				regs, set->sig[0]);
 	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
 	if (err)
-		goto give_sigsegv;
+		return -EFAULT;
 
 	/* Set up to return from userspace.  */
 	restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -477,12 +459,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
 
 	if (err)
-		goto give_sigsegv;
+		return -EFAULT;
 
 	/* Set up registers for signal handler */
 	regs->sp = (unsigned long)frame;
 	regs->ip = (unsigned long)ka->sa.sa_handler;
-	regs->ax = (unsigned long)usig;
+	regs->ax = (unsigned long)sig;
 	regs->dx = (unsigned long)&frame->info;
 	regs->cx = (unsigned long)&frame->uc;
 
@@ -492,15 +474,48 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	regs->cs = __USER_CS;
 
 	return 0;
-
-give_sigsegv:
-	force_sigsegv(sig, current);
-	return -EFAULT;
 }
 
 /*
  * OK, we're invoking a handler:
  */
+static int signr_convert(int sig)
+{
+	struct thread_info *info = current_thread_info();
+
+	if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
+		return info->exec_domain->signal_invmap[sig];
+	return sig;
+}
+
+#define is_ia32	1
+#define ia32_setup_frame	__setup_frame
+#define ia32_setup_rt_frame	__setup_rt_frame
+
+static int
+setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+	       sigset_t *set, struct pt_regs *regs)
+{
+	int usig = signr_convert(sig);
+	int ret;
+
+	/* Set up the stack frame */
+	if (is_ia32) {
+		if (ka->sa.sa_flags & SA_SIGINFO)
+			ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
+		else
+			ret = ia32_setup_frame(usig, ka, set, regs);
+	} else
+		ret = __setup_rt_frame(sig, ka, info, set, regs);
+
+	if (ret) {
+		force_sigsegv(sig, current);
+		return -EFAULT;
+	}
+
+	return ret;
+}
+
 static int
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	      sigset_t *oldset, struct pt_regs *regs)
@@ -508,9 +523,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	int ret;
 
 	/* Are we from a system call? */
-	if ((long)regs->orig_ax >= 0) {
+	if (syscall_get_nr(current, regs) >= 0) {
 		/* If so, check system call restarting.. */
-		switch (regs->ax) {
+		switch (syscall_get_error(current, regs)) {
 		case -ERESTART_RESTARTBLOCK:
 		case -ERESTARTNOHAND:
 			regs->ax = -EINTR;
@@ -537,15 +552,20 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	    likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
 		regs->flags &= ~X86_EFLAGS_TF;
 
-	/* Set up the stack frame */
-	if (ka->sa.sa_flags & SA_SIGINFO)
-		ret = setup_rt_frame(sig, ka, info, oldset, regs);
-	else
-		ret = setup_frame(sig, ka, oldset, regs);
+	ret = setup_rt_frame(sig, ka, info, oldset, regs);
 
 	if (ret)
 		return ret;
 
+#ifdef CONFIG_X86_64
+	/*
+	 * This has nothing to do with segment registers,
+	 * despite the name.  This magic affects uaccess.h
+	 * macros' behavior.  Reset it to the normal setting.
+	 */
+	set_fs(USER_DS);
+#endif
+
 	/*
 	 * Clear the direction flag as per the ABI for function entry.
 	 */
@@ -558,8 +578,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	 * handler too.
 	 */
 	regs->flags &= ~X86_EFLAGS_TF;
-	if (test_thread_flag(TIF_SINGLESTEP))
-		ptrace_notify(SIGTRAP);
 
 	spin_lock_irq(&current->sighand->siglock);
 	sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
@@ -568,9 +586,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 
+	tracehook_signal_handler(sig, info, ka, regs,
+				 test_thread_flag(TIF_SINGLESTEP));
+
 	return 0;
 }
 
+#define NR_restart_syscall	__NR_restart_syscall
 /*
  * Note that 'init' is a special process: it doesn't get signals it doesn't
  * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -623,9 +645,9 @@ static void do_signal(struct pt_regs *regs)
 	}
 
 	/* Did we come from a system call? */
-	if ((long)regs->orig_ax >= 0) {
+	if (syscall_get_nr(current, regs) >= 0) {
 		/* Restart the system call - no handlers present */
-		switch (regs->ax) {
+		switch (syscall_get_error(current, regs)) {
 		case -ERESTARTNOHAND:
 		case -ERESTARTSYS:
 		case -ERESTARTNOINTR:
@@ -634,7 +656,7 @@ static void do_signal(struct pt_regs *regs)
 			break;
 
 		case -ERESTART_RESTARTBLOCK:
-			regs->ax = __NR_restart_syscall;
+			regs->ax = NR_restart_syscall;
 			regs->ip -= 2;
 			break;
 		}
@@ -657,18 +679,38 @@ static void do_signal(struct pt_regs *regs)
 void
 do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 {
-	/* Pending single-step? */
-	if (thread_info_flags & _TIF_SINGLESTEP) {
-		regs->flags |= X86_EFLAGS_TF;
-		clear_thread_flag(TIF_SINGLESTEP);
-	}
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+	/* notify userspace of pending MCEs */
+	if (thread_info_flags & _TIF_MCE_NOTIFY)
+		mce_notify_user();
+#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
 
 	/* deal with pending signal delivery */
 	if (thread_info_flags & _TIF_SIGPENDING)
 		do_signal(regs);
 
-	if (thread_info_flags & _TIF_HRTICK_RESCHED)
-		hrtick_resched();
+	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+		clear_thread_flag(TIF_NOTIFY_RESUME);
+		tracehook_notify_resume(regs);
+	}
 
+#ifdef CONFIG_X86_32
 	clear_thread_flag(TIF_IRET);
+#endif /* CONFIG_X86_32 */
+}
+
+void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
+{
+	struct task_struct *me = current;
+
+	if (show_unhandled_signals && printk_ratelimit()) {
+		printk(KERN_INFO
+		       "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
+		       me->comm, me->pid, where, frame,
+		       regs->ip, regs->sp, regs->orig_ax);
+		print_vma_addr(" in ", regs->ip);
+		printk(KERN_CONT "\n");
+	}
+
+	force_sig(SIGSEGV, me);
 }
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index e53b267662e7..a5c9627f4db9 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -15,17 +15,21 @@
 #include <linux/errno.h>
 #include <linux/wait.h>
 #include <linux/ptrace.h>
+#include <linux/tracehook.h>
 #include <linux/unistd.h>
 #include <linux/stddef.h>
 #include <linux/personality.h>
 #include <linux/compiler.h>
+#include <linux/uaccess.h>
+
 #include <asm/processor.h>
 #include <asm/ucontext.h>
-#include <asm/uaccess.h>
 #include <asm/i387.h>
 #include <asm/proto.h>
 #include <asm/ia32_unistd.h>
 #include <asm/mce.h>
+#include <asm/syscall.h>
+#include <asm/syscalls.h>
 #include "sigframe.h"
 
 #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
@@ -41,11 +45,6 @@
 # define FIX_EFLAGS	__FIX_EFLAGS
 #endif
 
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-               sigset_t *set, struct pt_regs * regs); 
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
-            sigset_t *set, struct pt_regs * regs); 
-
 asmlinkage long
 sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
 		struct pt_regs *regs)
@@ -53,6 +52,15 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
 	return do_sigaltstack(uss, uoss, regs->sp);
 }
 
+#define COPY(x)			{		\
+	err |= __get_user(regs->x, &sc->x);	\
+}
+
+#define COPY_SEG_STRICT(seg)	{			\
+		unsigned short tmp;			\
+		err |= __get_user(tmp, &sc->seg);	\
+		regs->seg = tmp | 3;			\
+}
 
 /*
  * Do a signal return; undo the signal stack.
@@ -61,13 +69,13 @@ static int
 restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 		   unsigned long *pax)
 {
+	void __user *buf;
+	unsigned int tmpflags;
 	unsigned int err = 0;
 
 	/* Always make any pending restarted system calls return -EINTR */
 	current_thread_info()->restart_block.fn = do_no_restart_syscall;
 
-#define COPY(x)		err |= __get_user(regs->x, &sc->x)
-
 	COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
 	COPY(dx); COPY(cx); COPY(ip);
 	COPY(r8);
@@ -82,48 +90,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 	/* Kernel saves and restores only the CS segment register on signals,
 	 * which is the bare minimum needed to allow mixed 32/64-bit code.
 	 * App's signal handler can save/restore other segments if needed. */
-	{
-		unsigned cs;
-		err |= __get_user(cs, &sc->cs);
-		regs->cs = cs | 3;	/* Force into user mode */
-	}
+	COPY_SEG_STRICT(cs);
 
-	{
-		unsigned int tmpflags;
-		err |= __get_user(tmpflags, &sc->flags);
-		regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
-		regs->orig_ax = -1;		/* disable syscall checks */
-	}
+	err |= __get_user(tmpflags, &sc->flags);
+	regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+	regs->orig_ax = -1;		/* disable syscall checks */
 
-	{
-		struct _fpstate __user * buf;
-		err |= __get_user(buf, &sc->fpstate);
-
-		if (buf) {
-			if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
-				goto badframe;
-			err |= restore_i387(buf);
-		} else {
-			struct task_struct *me = current;
-			if (used_math()) {
-				clear_fpu(me);
-				clear_used_math();
-			}
-		}
-	}
+	err |= __get_user(buf, &sc->fpstate);
+	err |= restore_i387_xstate(buf);
 
 	err |= __get_user(*pax, &sc->ax);
 	return err;
-
-badframe:
-	return 1;
 }
 
-asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
+static long do_rt_sigreturn(struct pt_regs *regs)
 {
 	struct rt_sigframe __user *frame;
-	sigset_t set;
 	unsigned long ax;
+	sigset_t set;
 
 	frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
 	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
@@ -136,7 +120,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
 	current->blocked = set;
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
-	
+
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
 		goto badframe;
 
@@ -146,16 +130,22 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
 	return ax;
 
 badframe:
-	signal_fault(regs,frame,"sigreturn");
+	signal_fault(regs, frame, "rt_sigreturn");
 	return 0;
-}	
+}
+
+asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
+{
+	return do_rt_sigreturn(regs);
+}
 
 /*
  * Set up a signal frame.
  */
 
 static inline int
-setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
+setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
+		unsigned long mask, struct task_struct *me)
 {
 	int err = 0;
 
@@ -207,41 +197,40 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
 			sp = current->sas_ss_sp + current->sas_ss_size;
 	}
 
-	return (void __user *)round_down(sp - size, 16);
+	return (void __user *)round_down(sp - size, 64);
 }
 
-static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-			   sigset_t *set, struct pt_regs * regs)
+static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+			    sigset_t *set, struct pt_regs *regs)
 {
 	struct rt_sigframe __user *frame;
-	struct _fpstate __user *fp = NULL; 
+	void __user *fp = NULL;
 	int err = 0;
 	struct task_struct *me = current;
 
 	if (used_math()) {
-		fp = get_stack(ka, regs, sizeof(struct _fpstate)); 
+		fp = get_stack(ka, regs, sig_xstate_size);
 		frame = (void __user *)round_down(
 			(unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
 
-		if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
-			goto give_sigsegv;
-
-		if (save_i387(fp) < 0) 
-			err |= -1; 
+		if (save_i387_xstate(fp) < 0)
+			return -EFAULT;
 	} else
 		frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
 
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
-		goto give_sigsegv;
+		return -EFAULT;
 
-	if (ka->sa.sa_flags & SA_SIGINFO) { 
-		err |= copy_siginfo_to_user(&frame->info, info);
-		if (err)
-			goto give_sigsegv;
+	if (ka->sa.sa_flags & SA_SIGINFO) {
+		if (copy_siginfo_to_user(&frame->info, info))
+			return -EFAULT;
 	}
-		
+
 	/* Create the ucontext.  */
-	err |= __put_user(0, &frame->uc.uc_flags);
+	if (cpu_has_xsave)
+		err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
+	else
+		err |= __put_user(0, &frame->uc.uc_flags);
 	err |= __put_user(0, &frame->uc.uc_link);
 	err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
 	err |= __put_user(sas_ss_flags(regs->sp),
@@ -249,9 +238,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
 	err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
 	err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
-	if (sizeof(*set) == 16) { 
+	if (sizeof(*set) == 16) {
 		__put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
-		__put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); 
+		__put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
 	} else
 		err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
 
@@ -262,15 +251,15 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 		err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
 	} else {
 		/* could use a vstub here */
-		goto give_sigsegv; 
+		return -EFAULT;
 	}
 
 	if (err)
-		goto give_sigsegv;
+		return -EFAULT;
 
 	/* Set up registers for signal handler */
 	regs->di = sig;
-	/* In case the signal handler was declared without prototypes */ 
+	/* In case the signal handler was declared without prototypes */
 	regs->ax = 0;
 
 	/* This also works for non SA_SIGINFO handlers because they expect the
@@ -286,44 +275,45 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	regs->cs = __USER_CS;
 
 	return 0;
-
-give_sigsegv:
-	force_sigsegv(sig, current);
-	return -EFAULT;
 }
 
 /*
- * Return -1L or the syscall number that @regs is executing.
+ * OK, we're invoking a handler
  */
-static long current_syscall(struct pt_regs *regs)
+static int signr_convert(int sig)
 {
-	/*
-	 * We always sign-extend a -1 value being set here,
-	 * so this is always either -1L or a syscall number.
-	 */
-	return regs->orig_ax;
+	return sig;
 }
 
-/*
- * Return a value that is -EFOO if the system call in @regs->orig_ax
- * returned an error.  This only works for @regs from @current.
- */
-static long current_syscall_ret(struct pt_regs *regs)
-{
 #ifdef CONFIG_IA32_EMULATION
-	if (test_thread_flag(TIF_IA32))
-		/*
-		 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
-		 * and will match correctly in comparisons.
-		 */
-		return (int) regs->ax;
+#define is_ia32	test_thread_flag(TIF_IA32)
+#else
+#define is_ia32	0
 #endif
-	return regs->ax;
-}
 
-/*
- * OK, we're invoking a handler
- */	
+static int
+setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+	       sigset_t *set, struct pt_regs *regs)
+{
+	int usig = signr_convert(sig);
+	int ret;
+
+	/* Set up the stack frame */
+	if (is_ia32) {
+		if (ka->sa.sa_flags & SA_SIGINFO)
+			ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
+		else
+			ret = ia32_setup_frame(usig, ka, set, regs);
+	} else
+		ret = __setup_rt_frame(sig, ka, info, set, regs);
+
+	if (ret) {
+		force_sigsegv(sig, current);
+		return -EFAULT;
+	}
+
+	return ret;
+}
 
 static int
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -332,9 +322,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	int ret;
 
 	/* Are we from a system call? */
-	if (current_syscall(regs) >= 0) {
+	if (syscall_get_nr(current, regs) >= 0) {
 		/* If so, check system call restarting.. */
-		switch (current_syscall_ret(regs)) {
+		switch (syscall_get_error(current, regs)) {
 		case -ERESTART_RESTARTBLOCK:
 		case -ERESTARTNOHAND:
 			regs->ax = -EINTR;
@@ -361,50 +351,48 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	    likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
 		regs->flags &= ~X86_EFLAGS_TF;
 
-#ifdef CONFIG_IA32_EMULATION
-	if (test_thread_flag(TIF_IA32)) {
-		if (ka->sa.sa_flags & SA_SIGINFO)
-			ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
-		else
-			ret = ia32_setup_frame(sig, ka, oldset, regs);
-	} else 
-#endif
 	ret = setup_rt_frame(sig, ka, info, oldset, regs);
 
-	if (ret == 0) {
-		/*
-		 * This has nothing to do with segment registers,
-		 * despite the name.  This magic affects uaccess.h
-		 * macros' behavior.  Reset it to the normal setting.
-		 */
-		set_fs(USER_DS);
+	if (ret)
+		return ret;
 
-		/*
-		 * Clear the direction flag as per the ABI for function entry.
-		 */
-		regs->flags &= ~X86_EFLAGS_DF;
+#ifdef CONFIG_X86_64
+	/*
+	 * This has nothing to do with segment registers,
+	 * despite the name.  This magic affects uaccess.h
+	 * macros' behavior.  Reset it to the normal setting.
+	 */
+	set_fs(USER_DS);
+#endif
 
-		/*
-		 * Clear TF when entering the signal handler, but
-		 * notify any tracer that was single-stepping it.
-		 * The tracer may want to single-step inside the
-		 * handler too.
-		 */
-		regs->flags &= ~X86_EFLAGS_TF;
-		if (test_thread_flag(TIF_SINGLESTEP))
-			ptrace_notify(SIGTRAP);
-
-		spin_lock_irq(&current->sighand->siglock);
-		sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
-		if (!(ka->sa.sa_flags & SA_NODEFER))
-			sigaddset(&current->blocked,sig);
-		recalc_sigpending();
-		spin_unlock_irq(&current->sighand->siglock);
-	}
+	/*
+	 * Clear the direction flag as per the ABI for function entry.
+	 */
+	regs->flags &= ~X86_EFLAGS_DF;
 
-	return ret;
+	/*
+	 * Clear TF when entering the signal handler, but
+	 * notify any tracer that was single-stepping it.
+	 * The tracer may want to single-step inside the
+	 * handler too.
+	 */
+	regs->flags &= ~X86_EFLAGS_TF;
+
+	spin_lock_irq(&current->sighand->siglock);
+	sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
+	if (!(ka->sa.sa_flags & SA_NODEFER))
+		sigaddset(&current->blocked, sig);
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+
+	tracehook_signal_handler(sig, info, ka, regs,
+				 test_thread_flag(TIF_SINGLESTEP));
+
+	return 0;
 }
 
+#define NR_restart_syscall	\
+	test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
 /*
  * Note that 'init' is a special process: it doesn't get signals it doesn't
  * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -434,7 +422,8 @@ static void do_signal(struct pt_regs *regs)
 
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
-		/* Re-enable any watchpoints before delivering the
+		/*
+		 * Re-enable any watchpoints before delivering the
 		 * signal to user space. The processor register will
 		 * have been cleared if the watchpoint triggered
 		 * inside the kernel.
@@ -442,7 +431,7 @@ static void do_signal(struct pt_regs *regs)
 		if (current->thread.debugreg7)
 			set_debugreg(current->thread.debugreg7, 7);
 
-		/* Whee!  Actually deliver the signal.  */
+		/* Whee! Actually deliver the signal.  */
 		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
 			/*
 			 * A signal was successfully delivered; the saved
@@ -456,19 +445,18 @@ static void do_signal(struct pt_regs *regs)
 	}
 
 	/* Did we come from a system call? */
-	if (current_syscall(regs) >= 0) {
+	if (syscall_get_nr(current, regs) >= 0) {
 		/* Restart the system call - no handlers present */
-		switch (current_syscall_ret(regs)) {
+		switch (syscall_get_error(current, regs)) {
 		case -ERESTARTNOHAND:
 		case -ERESTARTSYS:
 		case -ERESTARTNOINTR:
 			regs->ax = regs->orig_ax;
 			regs->ip -= 2;
 			break;
+
 		case -ERESTART_RESTARTBLOCK:
-			regs->ax = test_thread_flag(TIF_IA32) ?
-					__NR_ia32_restart_syscall :
-					__NR_restart_syscall;
+			regs->ax = NR_restart_syscall;
 			regs->ip -= 2;
 			break;
 		}
@@ -484,38 +472,45 @@ static void do_signal(struct pt_regs *regs)
 	}
 }
 
-void do_notify_resume(struct pt_regs *regs, void *unused,
-		      __u32 thread_info_flags)
+/*
+ * notification of userspace execution resumption
+ * - triggered by the TIF_WORK_MASK flags
+ */
+void
+do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 {
-	/* Pending single-step? */
-	if (thread_info_flags & _TIF_SINGLESTEP) {
-		regs->flags |= X86_EFLAGS_TF;
-		clear_thread_flag(TIF_SINGLESTEP);
-	}
-
-#ifdef CONFIG_X86_MCE
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
 	/* notify userspace of pending MCEs */
 	if (thread_info_flags & _TIF_MCE_NOTIFY)
 		mce_notify_user();
-#endif /* CONFIG_X86_MCE */
+#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
 
 	/* deal with pending signal delivery */
 	if (thread_info_flags & _TIF_SIGPENDING)
 		do_signal(regs);
 
-	if (thread_info_flags & _TIF_HRTICK_RESCHED)
-		hrtick_resched();
+	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+		clear_thread_flag(TIF_NOTIFY_RESUME);
+		tracehook_notify_resume(regs);
+	}
+
+#ifdef CONFIG_X86_32
+	clear_thread_flag(TIF_IRET);
+#endif /* CONFIG_X86_32 */
 }
 
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
-{ 
-	struct task_struct *me = current; 
+{
+	struct task_struct *me = current;
+
 	if (show_unhandled_signals && printk_ratelimit()) {
-		printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
-	       me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax);
+		printk(KERN_INFO
+		       "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
+		       me->comm, me->pid, where, frame,
+		       regs->ip, regs->sp, regs->orig_ax);
 		print_vma_addr(" in ", regs->ip);
-		printk("\n");
+		printk(KERN_CONT "\n");
 	}
 
-	force_sig(SIGSEGV, me); 
-} 
+	force_sig(SIGSEGV, me);
+}
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 361b7a4c640c..18f9b19f5f8f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
 struct smp_ops smp_ops = {
 	.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
 	.smp_prepare_cpus = native_smp_prepare_cpus,
-	.cpu_up = native_cpu_up,
 	.smp_cpus_done = native_smp_cpus_done,
 
 	.smp_send_stop = native_smp_send_stop,
 	.smp_send_reschedule = native_smp_send_reschedule,
 
+	.cpu_up = native_cpu_up,
+	.cpu_die = native_cpu_die,
+	.cpu_disable = native_cpu_disable,
+	.play_dead = native_play_dead,
+
 	.send_call_func_ipi = native_send_call_func_ipi,
 	.send_call_func_single_ipi = native_send_call_func_single_ipi,
 };
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 687376ab07e8..7b1093397319 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -52,6 +52,7 @@
 #include <asm/desc.h>
 #include <asm/nmi.h>
 #include <asm/irq.h>
+#include <asm/idle.h>
 #include <asm/smp.h>
 #include <asm/trampoline.h>
 #include <asm/cpu.h>
@@ -88,7 +89,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
 #define get_idle_for_cpu(x)      (per_cpu(idle_thread_array, x))
 #define set_idle_for_cpu(x, p)   (per_cpu(idle_thread_array, x) = (p))
 #else
-struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
+static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
 #define get_idle_for_cpu(x)      (idle_thread_array[(x)])
 #define set_idle_for_cpu(x, p)   (idle_thread_array[(x)] = (p))
 #endif
@@ -123,13 +124,12 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
 
 static atomic_t init_deasserted;
 
-static int boot_cpu_logical_apicid;
 
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
 
 /* Set if we find a B stepping CPU */
-int __cpuinitdata smp_b_stepping;
+static int __cpuinitdata smp_b_stepping;
 
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
 
@@ -165,6 +165,8 @@ static void unmap_cpu_to_node(int cpu)
 #endif
 
 #ifdef CONFIG_X86_32
+static int boot_cpu_logical_apicid;
+
 u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
 					{ [0 ... NR_CPUS-1] = BAD_APICID };
 
@@ -210,13 +212,13 @@ static void __cpuinit smp_callin(void)
 	/*
 	 * (This works even if the APIC is not enabled.)
 	 */
-	phys_id = GET_APIC_ID(read_apic_id());
+	phys_id = read_apic_id();
 	cpuid = smp_processor_id();
 	if (cpu_isset(cpuid, cpu_callin_map)) {
 		panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
 					phys_id, cpuid);
 	}
-	Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
+	pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
 
 	/*
 	 * STARTUP IPIs are fragile beasts as they might sometimes
@@ -251,12 +253,13 @@ static void __cpuinit smp_callin(void)
 	 * boards)
 	 */
 
-	Dprintk("CALLIN, before setup_local_APIC().\n");
+	pr_debug("CALLIN, before setup_local_APIC().\n");
 	smp_callin_clear_local_apic();
 	setup_local_APIC();
 	end_local_APIC_setup();
 	map_cpu_to_logical_apicid();
 
+	notify_cpu_starting(cpuid);
 	/*
 	 * Get our bogomips.
 	 *
@@ -266,7 +269,7 @@ static void __cpuinit smp_callin(void)
 	local_irq_enable();
 	calibrate_delay();
 	local_irq_disable();
-	Dprintk("Stack at about %p\n", &cpuid);
+	pr_debug("Stack at about %p\n", &cpuid);
 
 	/*
 	 * Save our processor parameters
@@ -279,6 +282,8 @@ static void __cpuinit smp_callin(void)
 	cpu_set(cpuid, cpu_callin_map);
 }
 
+static int __cpuinitdata unsafe_smp;
+
 /*
  * Activate a secondary processor.
  */
@@ -326,15 +331,22 @@ static void __cpuinit start_secondary(void *unused)
 	 * for which cpus receive the IPI. Holding this
 	 * lock helps us to not include this cpu in a currently in progress
 	 * smp_call_function().
+	 *
+	 * We need to hold vector_lock so there the set of online cpus
+	 * does not change while we are assigning vectors to cpus.  Holding
+	 * this lock ensures we don't half assign or remove an irq from a cpu.
 	 */
-	ipi_call_lock_irq();
-#ifdef CONFIG_X86_IO_APIC
-	setup_vector_irq(smp_processor_id());
-#endif
+	ipi_call_lock();
+	lock_vector_lock();
+	__setup_vector_irq(smp_processor_id());
 	cpu_set(smp_processor_id(), cpu_online_map);
-	ipi_call_unlock_irq();
+	unlock_vector_lock();
+	ipi_call_unlock();
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 
+	/* enable local interrupts */
+	local_irq_enable();
+
 	setup_secondary_clock();
 
 	wmb();
@@ -387,7 +399,7 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
 				goto valid_k7;
 
 		/* If we get here, not a certified SMP capable AMD system. */
-		add_taint(TAINT_UNSAFE_SMP);
+		unsafe_smp = 1;
 	}
 
 valid_k7:
@@ -404,12 +416,10 @@ static void __cpuinit smp_checks(void)
 	 * Don't taint if we are running SMP kernel on a single non-MP
 	 * approved Athlon
 	 */
-	if (tainted & TAINT_UNSAFE_SMP) {
-		if (num_online_cpus())
-			printk(KERN_INFO "WARNING: This combination of AMD"
-				"processors is not suitable for SMP.\n");
-		else
-			tainted &= ~TAINT_UNSAFE_SMP;
+	if (unsafe_smp && num_online_cpus() > 1) {
+		printk(KERN_INFO "WARNING: This combination of AMD"
+			"processors is not suitable for SMP.\n");
+		add_taint(TAINT_UNSAFE_SMP);
 	}
 }
 
@@ -438,7 +448,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 	cpu_set(cpu, cpu_sibling_setup_map);
 
 	if (smp_num_siblings > 1) {
-		for_each_cpu_mask(i, cpu_sibling_setup_map) {
+		for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
 			if (c->phys_proc_id == cpu_data(i).phys_proc_id &&
 			    c->cpu_core_id == cpu_data(i).cpu_core_id) {
 				cpu_set(i, per_cpu(cpu_sibling_map, cpu));
@@ -461,7 +471,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
 		return;
 	}
 
-	for_each_cpu_mask(i, cpu_sibling_setup_map) {
+	for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
 		if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
 		    per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
 			cpu_set(i, c->llc_shared_map);
@@ -513,7 +523,7 @@ static void impress_friends(void)
 	/*
 	 * Allow the user to impress friends.
 	 */
-	Dprintk("Before bogomips.\n");
+	pr_debug("Before bogomips.\n");
 	for_each_possible_cpu(cpu)
 		if (cpu_isset(cpu, cpu_callout_map))
 			bogosum += cpu_data(cpu).loops_per_jiffy;
@@ -523,7 +533,7 @@ static void impress_friends(void)
 		bogosum/(500000/HZ),
 		(bogosum/(5000/HZ))%100);
 
-	Dprintk("Before bogocount - setting activated=1.\n");
+	pr_debug("Before bogocount - setting activated=1.\n");
 }
 
 static inline void __inquire_remote_apic(int apicid)
@@ -533,10 +543,10 @@ static inline void __inquire_remote_apic(int apicid)
 	int timeout;
 	u32 status;
 
-	printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
+	printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid);
 
 	for (i = 0; i < ARRAY_SIZE(regs); i++) {
-		printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]);
+		printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]);
 
 		/*
 		 * Wait for idle.
@@ -546,8 +556,7 @@ static inline void __inquire_remote_apic(int apicid)
 			printk(KERN_CONT
 			       "a previous APIC delivery may have failed\n");
 
-		apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
-		apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
+		apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
 
 		timeout = 0;
 		do {
@@ -579,29 +588,24 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
 	int maxlvt;
 
 	/* Target chip */
-	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
-
 	/* Boot on the stack */
 	/* Kick the second */
-	apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
+	apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid);
 
-	Dprintk("Waiting for send to finish...\n");
+	pr_debug("Waiting for send to finish...\n");
 	send_status = safe_apic_wait_icr_idle();
 
 	/*
 	 * Give the other CPU some time to accept the IPI.
 	 */
 	udelay(200);
-	/*
-	 * Due to the Pentium erratum 3AP.
-	 */
-	maxlvt = lapic_get_maxlvt();
-	if (maxlvt > 3) {
-		apic_read_around(APIC_SPIV);
-		apic_write(APIC_ESR, 0);
+	if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+		maxlvt = lapic_get_maxlvt();
+		if (maxlvt > 3)			/* Due to the Pentium erratum 3AP.  */
+			apic_write(APIC_ESR, 0);
+		accept_status = (apic_read(APIC_ESR) & 0xEF);
 	}
-	accept_status = (apic_read(APIC_ESR) & 0xEF);
-	Dprintk("NMI sent.\n");
+	pr_debug("NMI sent.\n");
 
 	if (send_status)
 		printk(KERN_ERR "APIC never delivered???\n");
@@ -625,42 +629,40 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 		return send_status;
 	}
 
+	maxlvt = lapic_get_maxlvt();
+
 	/*
 	 * Be paranoid about clearing APIC errors.
 	 */
 	if (APIC_INTEGRATED(apic_version[phys_apicid])) {
-		apic_read_around(APIC_SPIV);
-		apic_write(APIC_ESR, 0);
+		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
+			apic_write(APIC_ESR, 0);
 		apic_read(APIC_ESR);
 	}
 
-	Dprintk("Asserting INIT.\n");
+	pr_debug("Asserting INIT.\n");
 
 	/*
 	 * Turn INIT on target chip
 	 */
-	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
 	/*
 	 * Send IPI
 	 */
-	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
-				| APIC_DM_INIT);
+	apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
+		       phys_apicid);
 
-	Dprintk("Waiting for send to finish...\n");
+	pr_debug("Waiting for send to finish...\n");
 	send_status = safe_apic_wait_icr_idle();
 
 	mdelay(10);
 
-	Dprintk("Deasserting INIT.\n");
+	pr_debug("Deasserting INIT.\n");
 
 	/* Target chip */
-	apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
 	/* Send IPI */
-	apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
+	apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
 
-	Dprintk("Waiting for send to finish...\n");
+	pr_debug("Waiting for send to finish...\n");
 	send_status = safe_apic_wait_icr_idle();
 
 	mb();
@@ -687,55 +689,46 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 	/*
 	 * Run STARTUP IPI loop.
 	 */
-	Dprintk("#startup loops: %d.\n", num_starts);
-
-	maxlvt = lapic_get_maxlvt();
+	pr_debug("#startup loops: %d.\n", num_starts);
 
 	for (j = 1; j <= num_starts; j++) {
-		Dprintk("Sending STARTUP #%d.\n", j);
-		apic_read_around(APIC_SPIV);
-		apic_write(APIC_ESR, 0);
+		pr_debug("Sending STARTUP #%d.\n", j);
+		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
+			apic_write(APIC_ESR, 0);
 		apic_read(APIC_ESR);
-		Dprintk("After apic_write.\n");
+		pr_debug("After apic_write.\n");
 
 		/*
 		 * STARTUP IPI
 		 */
 
 		/* Target chip */
-		apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
 		/* Boot on the stack */
 		/* Kick the second */
-		apic_write_around(APIC_ICR, APIC_DM_STARTUP
-					| (start_eip >> 12));
+		apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
+			       phys_apicid);
 
 		/*
 		 * Give the other CPU some time to accept the IPI.
 		 */
 		udelay(300);
 
-		Dprintk("Startup point 1.\n");
+		pr_debug("Startup point 1.\n");
 
-		Dprintk("Waiting for send to finish...\n");
+		pr_debug("Waiting for send to finish...\n");
 		send_status = safe_apic_wait_icr_idle();
 
 		/*
 		 * Give the other CPU some time to accept the IPI.
 		 */
 		udelay(200);
-		/*
-		 * Due to the Pentium erratum 3AP.
-		 */
-		if (maxlvt > 3) {
-			apic_read_around(APIC_SPIV);
+		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
 			apic_write(APIC_ESR, 0);
-		}
 		accept_status = (apic_read(APIC_ESR) & 0xEF);
 		if (send_status || accept_status)
 			break;
 	}
-	Dprintk("After Startup.\n");
+	pr_debug("After Startup.\n");
 
 	if (send_status)
 		printk(KERN_ERR "APIC never delivered???\n");
@@ -763,12 +756,20 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
 }
 
 #ifdef CONFIG_X86_64
+
+/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
+static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
+{
+	if (!after_bootmem)
+		free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
+}
+
 /*
  * Allocate node local memory for the AP pda.
  *
  * Must be called after the _cpu_pda pointer table is initialized.
  */
-static int __cpuinit get_local_pda(int cpu)
+int __cpuinit get_local_pda(int cpu)
 {
 	struct x8664_pda *oldpda, *newpda;
 	unsigned long size = sizeof(struct x8664_pda);
@@ -791,8 +792,7 @@ static int __cpuinit get_local_pda(int cpu)
 
 	if (oldpda) {
 		memcpy(newpda, oldpda, size);
-		if (!after_bootmem)
-			free_bootmem((unsigned long)oldpda, size);
+		free_bootmem_pda(oldpda);
 	}
 
 	newpda->in_bootmem = 0;
@@ -874,7 +874,7 @@ do_rest:
 	start_ip = setup_trampoline();
 
 	/* So we see what's up   */
-	printk(KERN_INFO "Booting processor %d/%d ip %lx\n",
+	printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n",
 			  cpu, apicid, start_ip);
 
 	/*
@@ -886,16 +886,18 @@ do_rest:
 
 	if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
 
-		Dprintk("Setting warm reset code and vector.\n");
+		pr_debug("Setting warm reset code and vector.\n");
 
 		store_NMI_vector(&nmi_high, &nmi_low);
 
 		smpboot_setup_warm_reset_vector(start_ip);
 		/*
 		 * Be paranoid about clearing APIC errors.
-	 	*/
-		apic_write(APIC_ESR, 0);
-		apic_read(APIC_ESR);
+		*/
+		if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
+			apic_write(APIC_ESR, 0);
+			apic_read(APIC_ESR);
+		}
 	}
 
 	/*
@@ -907,9 +909,9 @@ do_rest:
 		/*
 		 * allow APs to start initializing.
 		 */
-		Dprintk("Before Callout %d.\n", cpu);
+		pr_debug("Before Callout %d.\n", cpu);
 		cpu_set(cpu, cpu_callout_map);
-		Dprintk("After Callout %d.\n", cpu);
+		pr_debug("After Callout %d.\n", cpu);
 
 		/*
 		 * Wait 5s total for a response
@@ -922,10 +924,10 @@ do_rest:
 
 		if (cpu_isset(cpu, cpu_callin_map)) {
 			/* number CPUs logically, starting from 1 (BSP is 0) */
-			Dprintk("OK.\n");
+			pr_debug("OK.\n");
 			printk(KERN_INFO "CPU%d: ", cpu);
 			print_cpu_info(&cpu_data(cpu));
-			Dprintk("CPU has booted.\n");
+			pr_debug("CPU has booted.\n");
 		} else {
 			boot_error = 1;
 			if (*((volatile unsigned char *)trampoline_base)
@@ -970,7 +972,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
 
 	WARN_ON(irqs_disabled());
 
-	Dprintk("++++++++++++++++++++=_---CPU UP  %u\n", cpu);
+	pr_debug("++++++++++++++++++++=_---CPU UP  %u\n", cpu);
 
 	if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
 	    !physid_isset(apicid, phys_cpu_present_map)) {
@@ -982,7 +984,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
 	 * Already booted CPU?
 	 */
 	if (cpu_isset(cpu, cpu_callin_map)) {
-		Dprintk("do_boot_cpu %d Already started\n", cpu);
+		pr_debug("do_boot_cpu %d Already started\n", cpu);
 		return -ENOSYS;
 	}
 
@@ -1009,7 +1011,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
 	err = do_boot_cpu(apicid, cpu);
 #endif
 	if (err) {
-		Dprintk("do_boot_cpu failed %d\n", err);
+		pr_debug("do_boot_cpu failed %d\n", err);
 		return -EIO;
 	}
 
@@ -1055,6 +1057,34 @@ static __init void disable_smp(void)
 static int __init smp_sanity_check(unsigned max_cpus)
 {
 	preempt_disable();
+
+#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
+	if (def_to_bigsmp && nr_cpu_ids > 8) {
+		unsigned int cpu;
+		unsigned nr;
+
+		printk(KERN_WARNING
+		       "More than 8 CPUs detected - skipping them.\n"
+		       "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n");
+
+		nr = 0;
+		for_each_present_cpu(cpu) {
+			if (nr >= 8)
+				cpu_clear(cpu, cpu_present_map);
+			nr++;
+		}
+
+		nr = 0;
+		for_each_possible_cpu(cpu) {
+			if (nr >= 8)
+				cpu_clear(cpu, cpu_possible_map);
+			nr++;
+		}
+
+		nr_cpu_ids = 8;
+	}
+#endif
+
 	if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
 		printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
 				    "by the BIOS.\n", hard_smp_processor_id());
@@ -1147,10 +1177,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	 * Setup boot CPU information
 	 */
 	smp_store_cpu_info(0); /* Final full version of the data */
+#ifdef CONFIG_X86_32
 	boot_cpu_logical_apicid = logical_smp_processor_id();
+#endif
 	current_thread_info()->cpu = 0;  /* needed? */
 	set_cpu_sibling_map(0);
 
+#ifdef CONFIG_X86_64
+	enable_IR_x2apic();
+	setup_apic_routing();
+#endif
+
 	if (smp_sanity_check(max_cpus) < 0) {
 		printk(KERN_INFO "SMP disabled\n");
 		disable_smp();
@@ -1158,9 +1195,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	}
 
 	preempt_disable();
-	if (GET_APIC_ID(read_apic_id()) != boot_cpu_physical_apicid) {
+	if (read_apic_id() != boot_cpu_physical_apicid) {
 		panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
-		     GET_APIC_ID(read_apic_id()), boot_cpu_physical_apicid);
+		     read_apic_id(), boot_cpu_physical_apicid);
 		/* Or can we switch back to PIC here? */
 	}
 	preempt_enable();
@@ -1193,6 +1230,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 	printk(KERN_INFO "CPU%d: ", 0);
 	print_cpu_info(&cpu_data(0));
 	setup_boot_clock();
+
+	if (is_uv_system())
+		uv_system_init();
 out:
 	preempt_enable();
 }
@@ -1213,7 +1253,7 @@ void __init native_smp_prepare_boot_cpu(void)
 
 void __init native_smp_cpus_done(unsigned int max_cpus)
 {
-	Dprintk("Boot done.\n");
+	pr_debug("Boot done.\n");
 
 	impress_friends();
 	smp_checks();
@@ -1223,39 +1263,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 	check_nmi_watchdog();
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-
-static void remove_siblinginfo(int cpu)
-{
-	int sibling;
-	struct cpuinfo_x86 *c = &cpu_data(cpu);
-
-	for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) {
-		cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
-		/*/
-		 * last thread sibling in this cpu core going down
-		 */
-		if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
-			cpu_data(sibling).booted_cores--;
-	}
-
-	for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu))
-		cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
-	cpus_clear(per_cpu(cpu_sibling_map, cpu));
-	cpus_clear(per_cpu(cpu_core_map, cpu));
-	c->phys_proc_id = 0;
-	c->cpu_core_id = 0;
-	cpu_clear(cpu, cpu_sibling_setup_map);
-}
-
-static int additional_cpus __initdata = -1;
-
-static __init int setup_additional_cpus(char *s)
-{
-	return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
-}
-early_param("additional_cpus", setup_additional_cpus);
-
 /*
  * cpu_possible_map should be static, it cannot change as cpu's
  * are onlined, or offlined. The reason is per-cpu data-structures
@@ -1275,24 +1282,13 @@ early_param("additional_cpus", setup_additional_cpus);
  */
 __init void prefill_possible_map(void)
 {
-	int i;
-	int possible;
+	int i, possible;
 
 	/* no processor from mptable or madt */
 	if (!num_processors)
 		num_processors = 1;
 
-#ifdef CONFIG_HOTPLUG_CPU
-	if (additional_cpus == -1) {
-		if (disabled_cpus > 0)
-			additional_cpus = disabled_cpus;
-		else
-			additional_cpus = 0;
-	}
-#else
-	additional_cpus = 0;
-#endif
-	possible = num_processors + additional_cpus;
+	possible = num_processors + disabled_cpus;
 	if (possible > NR_CPUS)
 		possible = NR_CPUS;
 
@@ -1305,17 +1301,64 @@ __init void prefill_possible_map(void)
 	nr_cpu_ids = possible;
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void remove_siblinginfo(int cpu)
+{
+	int sibling;
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+	for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
+		cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
+		/*/
+		 * last thread sibling in this cpu core going down
+		 */
+		if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
+			cpu_data(sibling).booted_cores--;
+	}
+
+	for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
+		cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
+	cpus_clear(per_cpu(cpu_sibling_map, cpu));
+	cpus_clear(per_cpu(cpu_core_map, cpu));
+	c->phys_proc_id = 0;
+	c->cpu_core_id = 0;
+	cpu_clear(cpu, cpu_sibling_setup_map);
+}
+
 static void __ref remove_cpu_from_maps(int cpu)
 {
 	cpu_clear(cpu, cpu_online_map);
 	cpu_clear(cpu, cpu_callout_map);
 	cpu_clear(cpu, cpu_callin_map);
 	/* was set by cpu_init() */
-	clear_bit(cpu, (unsigned long *)&cpu_initialized);
+	cpu_clear(cpu, cpu_initialized);
 	numa_remove_cpu(cpu);
 }
 
-int __cpu_disable(void)
+void cpu_disable_common(void)
+{
+	int cpu = smp_processor_id();
+	/*
+	 * HACK:
+	 * Allow any queued timer interrupts to get serviced
+	 * This is only a temporary solution until we cleanup
+	 * fixup_irqs as we do for IA64.
+	 */
+	local_irq_enable();
+	mdelay(1);
+
+	local_irq_disable();
+	remove_siblinginfo(cpu);
+
+	/* It's now safe to remove this processor from the online map */
+	lock_vector_lock();
+	remove_cpu_from_maps(cpu);
+	unlock_vector_lock();
+	fixup_irqs(cpu_online_map);
+}
+
+int native_cpu_disable(void)
 {
 	int cpu = smp_processor_id();
 
@@ -1334,25 +1377,11 @@ int __cpu_disable(void)
 		stop_apic_nmi_watchdog(NULL);
 	clear_local_APIC();
 
-	/*
-	 * HACK:
-	 * Allow any queued timer interrupts to get serviced
-	 * This is only a temporary solution until we cleanup
-	 * fixup_irqs as we do for IA64.
-	 */
-	local_irq_enable();
-	mdelay(1);
-
-	local_irq_disable();
-	remove_siblinginfo(cpu);
-
-	/* It's now safe to remove this processor from the online map */
-	remove_cpu_from_maps(cpu);
-	fixup_irqs(cpu_online_map);
+	cpu_disable_common();
 	return 0;
 }
 
-void __cpu_die(unsigned int cpu)
+void native_cpu_die(unsigned int cpu)
 {
 	/* We don't do anything here: idle task is faking death itself. */
 	unsigned int i;
@@ -1369,28 +1398,45 @@ void __cpu_die(unsigned int cpu)
 	}
 	printk(KERN_ERR "CPU %u didn't die...\n", cpu);
 }
+
+void play_dead_common(void)
+{
+	idle_task_exit();
+	reset_lazy_tlbstate();
+	irq_ctx_exit(raw_smp_processor_id());
+	c1e_remove_cpu(raw_smp_processor_id());
+
+	mb();
+	/* Ack it */
+	__get_cpu_var(cpu_state) = CPU_DEAD;
+
+	/*
+	 * With physical CPU hotplug, we should halt the cpu
+	 */
+	local_irq_disable();
+}
+
+void native_play_dead(void)
+{
+	play_dead_common();
+	wbinvd_halt();
+}
+
 #else /* ... !CONFIG_HOTPLUG_CPU */
-int __cpu_disable(void)
+int native_cpu_disable(void)
 {
 	return -ENOSYS;
 }
 
-void __cpu_die(unsigned int cpu)
+void native_cpu_die(unsigned int cpu)
 {
 	/* We said "no" in __cpu_disable */
 	BUG();
 }
-#endif
 
-/*
- * If the BIOS enumerates physical processors before logical,
- * maxcpus=N at enumeration-time can be used to disable HT.
- */
-static int __init parse_maxcpus(char *arg)
+void native_play_dead(void)
 {
-	extern unsigned int maxcpus;
-
-	maxcpus = simple_strtoul(arg, NULL, 0);
-	return 0;
+	BUG();
 }
-early_param("maxcpus", parse_maxcpus);
+
+#endif
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 99941b37eca0..397e309839dd 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -8,18 +8,21 @@
 DEFINE_PER_CPU(unsigned long, this_cpu_off);
 EXPORT_PER_CPU_SYMBOL(this_cpu_off);
 
-/* Initialize the CPU's GDT.  This is either the boot CPU doing itself
-   (still using the master per-cpu area), or a CPU doing it for a
-   secondary which will soon come up. */
+/*
+ * Initialize the CPU's GDT.  This is either the boot CPU doing itself
+ * (still using the master per-cpu area), or a CPU doing it for a
+ * secondary which will soon come up.
+ */
 __cpuinit void init_gdt(int cpu)
 {
-	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+	struct desc_struct gdt;
 
-	pack_descriptor(&gdt[GDT_ENTRY_PERCPU],
-			__per_cpu_offset[cpu], 0xFFFFF,
+	pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
 			0x2 | DESCTYPE_S, 0x8);
+	gdt.s = 1;
 
-	gdt[GDT_ENTRY_PERCPU].s = 1;
+	write_gdt_entry(get_cpu_gdt_table(cpu),
+			GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
 
 	per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
 	per_cpu(cpu_number, cpu) = cpu;
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
deleted file mode 100644
index 8b137891791f..000000000000
--- a/arch/x86/kernel/smpcommon_32.c
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 92c20fee6781..e8b9863ef8c4 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
 static int enable_single_step(struct task_struct *child)
 {
 	struct pt_regs *regs = task_pt_regs(child);
+	unsigned long oflags;
+
+	/*
+	 * If we stepped into a sysenter/syscall insn, it trapped in
+	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
+	 * If user-mode had set TF itself, then it's still clear from
+	 * do_debug() and we need to set it again to restore the user
+	 * state so we don't wrongly set TIF_FORCED_TF below.
+	 * If enable_single_step() was used last and that is what
+	 * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are
+	 * already set and our bookkeeping is fine.
+	 */
+	if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP)))
+		regs->flags |= X86_EFLAGS_TF;
 
 	/*
 	 * Always set TIF_SINGLESTEP - this guarantees that
@@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child)
 	 */
 	set_tsk_thread_flag(child, TIF_SINGLESTEP);
 
-	/*
-	 * If TF was already set, don't do anything else
-	 */
-	if (regs->flags & X86_EFLAGS_TF)
-		return 0;
+	oflags = regs->flags;
 
 	/* Set TF on the kernel stack.. */
 	regs->flags |= X86_EFLAGS_TF;
@@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child)
 	 * ..but if TF is changed by the instruction we will trace,
 	 * don't mark it as being "us" that set it, so that we
 	 * won't clear it by hand later.
+	 *
+	 * Note that if we don't actually execute the popf because
+	 * of a signal arriving right now or suchlike, we will lose
+	 * track of the fact that it really was "us" that set it.
 	 */
-	if (is_setting_trap_flag(child, regs))
+	if (is_setting_trap_flag(child, regs)) {
+		clear_tsk_thread_flag(child, TIF_FORCED_TF);
 		return 0;
+	}
+
+	/*
+	 * If TF was already set, check whether it was us who set it.
+	 * If not, we should never attempt a block step.
+	 */
+	if (oflags & X86_EFLAGS_TF)
+		return test_tsk_thread_flag(child, TIF_FORCED_TF);
 
 	set_tsk_thread_flag(child, TIF_FORCED_TF);
 
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index d67ce5f044ba..7b987852e876 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -30,7 +30,7 @@
 #include <linux/init.h>
 #include <asm/io.h>
 #include <asm/bios_ebda.h>
-#include <asm/mach-summit/mach_mpparse.h>
+#include <asm/summit/mpparse.h>
 
 static struct rio_table_hdr *rio_table_hdr __initdata;
 static struct scal_detail   *scal_devs[MAX_NUMNODES] __initdata;
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 7066cb855a60..1884a8d12bfa 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -22,6 +22,8 @@
 #include <linux/uaccess.h>
 #include <linux/unistd.h>
 
+#include <asm/syscalls.h>
+
 asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
 			  unsigned long prot, unsigned long flags,
 			  unsigned long fd, unsigned long pgoff)
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 3b360ef33817..6bc211accf08 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -13,15 +13,17 @@
 #include <linux/utsname.h>
 #include <linux/personality.h>
 #include <linux/random.h>
+#include <linux/uaccess.h>
 
-#include <asm/uaccess.h>
 #include <asm/ia32.h>
+#include <asm/syscalls.h>
 
-asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
-	unsigned long fd, unsigned long off)
+asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
+		unsigned long prot, unsigned long flags,
+		unsigned long fd, unsigned long off)
 {
 	long error;
-	struct file * file;
+	struct file *file;
 
 	error = -EINVAL;
 	if (off & ~PAGE_MASK)
@@ -56,9 +58,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
 		   unmapped base down for this case. This can give
 		   conflicts with the heap, but we assume that glibc
 		   malloc knows how to fall back to mmap. Give it 1GB
-		   of playground for now. -AK */ 
-		*begin = 0x40000000; 
-		*end = 0x80000000;		
+		   of playground for now. -AK */
+		*begin = 0x40000000;
+		*end = 0x80000000;
 		if (current->flags & PF_RANDOMIZE) {
 			new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
 			if (new_begin)
@@ -66,9 +68,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
 		}
 	} else {
 		*begin = TASK_UNMAPPED_BASE;
-		*end = TASK_SIZE; 
+		*end = TASK_SIZE;
 	}
-} 
+}
 
 unsigned long
 arch_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -78,11 +80,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	struct vm_area_struct *vma;
 	unsigned long start_addr;
 	unsigned long begin, end;
-	
+
 	if (flags & MAP_FIXED)
 		return addr;
 
-	find_start_end(flags, &begin, &end); 
+	find_start_end(flags, &begin, &end);
 
 	if (len > end)
 		return -ENOMEM;
@@ -96,12 +98,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	}
 	if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
 	    && len <= mm->cached_hole_size) {
-	        mm->cached_hole_size = 0;
+		mm->cached_hole_size = 0;
 		mm->free_area_cache = begin;
 	}
 	addr = mm->free_area_cache;
-	if (addr < begin) 
-		addr = begin; 
+	if (addr < begin)
+		addr = begin;
 	start_addr = addr;
 
 full_search:
@@ -127,7 +129,7 @@ full_search:
 			return addr;
 		}
 		if (addr + mm->cached_hole_size < vma->vm_start)
-		        mm->cached_hole_size = vma->vm_start - addr;
+			mm->cached_hole_size = vma->vm_start - addr;
 
 		addr = vma->vm_end;
 	}
@@ -177,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 		vma = find_vma(mm, addr-len);
 		if (!vma || addr <= vma->vm_start)
 			/* remember the address as a hint for next time */
-			return (mm->free_area_cache = addr-len);
+			return mm->free_area_cache = addr-len;
 	}
 
 	if (mm->mmap_base < len)
@@ -194,7 +196,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 		vma = find_vma(mm, addr);
 		if (!vma || addr+len <= vma->vm_start)
 			/* remember the address as a hint for next time */
-			return (mm->free_area_cache = addr);
+			return mm->free_area_cache = addr;
 
 		/* remember the largest hole we saw so far */
 		if (addr + mm->cached_hole_size < vma->vm_start)
@@ -224,13 +226,13 @@ bottomup:
 }
 
 
-asmlinkage long sys_uname(struct new_utsname __user * name)
+asmlinkage long sys_uname(struct new_utsname __user *name)
 {
 	int err;
 	down_read(&uts_sem);
-	err = copy_to_user(name, utsname(), sizeof (*name));
+	err = copy_to_user(name, utsname(), sizeof(*name));
 	up_read(&uts_sem);
-	if (personality(current->personality) == PER_LINUX32) 
-		err |= copy_to_user(&name->machine, "i686", 5); 		
+	if (personality(current->personality) == PER_LINUX32)
+		err |= copy_to_user(&name->machine, "i686", 5);
 	return err ? -EFAULT : 0;
 }
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 170d43c17487..de87d6008295 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -8,12 +8,12 @@
 #define __NO_STUBS
 
 #define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
-#undef _ASM_X86_64_UNISTD_H_
+#undef _ASM_X86_UNISTD_64_H
 #include <asm/unistd_64.h>
 
 #undef __SYSCALL
 #define __SYSCALL(nr, sym) [nr] = sym,
-#undef _ASM_X86_64_UNISTD_H_
+#undef _ASM_X86_UNISTD_64_H
 
 typedef void (*sys_call_ptr_t)(void);
 
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index adff5562f5fd..d44395ff34c3 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -326,3 +326,9 @@ ENTRY(sys_call_table)
 	.long sys_fallocate
 	.long sys_timerfd_settime	/* 325 */
 	.long sys_timerfd_gettime
+	.long sys_signalfd4
+	.long sys_eventfd2
+	.long sys_epoll_create1
+	.long sys_dup3			/* 330 */
+	.long sys_pipe2
+	.long sys_inotify_init1
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 059ca6ee59b4..77b400f06ea2 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -36,6 +36,7 @@
 #include <asm/arch_hooks.h>
 #include <asm/hpet.h>
 #include <asm/time.h>
+#include <asm/timer.h>
 
 #include "do_timer.h"
 
@@ -46,10 +47,9 @@ unsigned long profile_pc(struct pt_regs *regs)
 	unsigned long pc = instruction_pointer(regs);
 
 #ifdef CONFIG_SMP
-	if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) &&
-	    in_lock_functions(pc)) {
+	if (!user_mode_vm(regs) && in_lock_functions(pc)) {
 #ifdef CONFIG_FRAME_POINTER
-		return *(unsigned long *)(regs->bp + 4);
+		return *(unsigned long *)(regs->bp + sizeof(long));
 #else
 		unsigned long *sp = (unsigned long *)&regs->sp;
 
@@ -94,6 +94,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
 
 	do_timer_interrupt_hook();
 
+#ifdef CONFIG_MCA
 	if (MCA_bus) {
 		/* The PS/2 uses level-triggered interrupts.  You can't
 		turn them off, nor would you want to (any attempt to
@@ -107,6 +108,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
 		u8 irq_v = inb_p( 0x61 );	/* read the current state */
 		outb_p( irq_v|0x80, 0x61 );	/* reset the IRQ */
 	}
+#endif
 
 	return IRQ_HANDLED;
 }
@@ -129,6 +131,7 @@ void __init hpet_time_init(void)
  */
 void __init time_init(void)
 {
+	pre_time_init_hook();
 	tsc_init();
 	late_time_init = choose_time_init();
 }
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index e3d49c553af2..cb19d650c216 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -16,6 +16,7 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/time.h>
+#include <linux/mca.h>
 
 #include <asm/i8253.h>
 #include <asm/hpet.h>
@@ -33,23 +34,34 @@ unsigned long profile_pc(struct pt_regs *regs)
 	/* Assume the lock function has either no stack frame or a copy
 	   of flags from PUSHF
 	   Eflags always has bits 22 and up cleared unlike kernel addresses. */
-	if (!user_mode(regs) && in_lock_functions(pc)) {
+	if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+#ifdef CONFIG_FRAME_POINTER
+		return *(unsigned long *)(regs->bp + sizeof(long));
+#else
 		unsigned long *sp = (unsigned long *)regs->sp;
 		if (sp[0] >> 22)
 			return sp[0];
 		if (sp[1] >> 22)
 			return sp[1];
+#endif
 	}
 	return pc;
 }
 EXPORT_SYMBOL(profile_pc);
 
-static irqreturn_t timer_event_interrupt(int irq, void *dev_id)
+irqreturn_t timer_interrupt(int irq, void *dev_id)
 {
 	add_pda(irq0_irqs, 1);
 
 	global_clock_event->event_handler(global_clock_event);
 
+#ifdef CONFIG_MCA
+	if (MCA_bus) {
+		u8 irq_v = inb_p(0x61);       /* read the current state */
+		outb_p(irq_v|0x80, 0x61);     /* reset the IRQ */
+	}
+#endif
+
 	return IRQ_HANDLED;
 }
 
@@ -100,7 +112,7 @@ unsigned long __init calibrate_cpu(void)
 }
 
 static struct irqaction irq0 = {
-	.handler	= timer_event_interrupt,
+	.handler	= timer_interrupt,
 	.flags		= IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING,
 	.mask		= CPU_MASK_NONE,
 	.name		= "timer"
@@ -111,16 +123,13 @@ void __init hpet_time_init(void)
 	if (!hpet_enable())
 		setup_pit_timer();
 
+	irq0.mask = cpumask_of_cpu(0);
 	setup_irq(0, &irq0);
 }
 
 void __init time_init(void)
 {
 	tsc_init();
-	if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
-		vgetcpu_mode = VGETCPU_RDTSCP;
-	else
-		vgetcpu_mode = VGETCPU_LSL;
 
 	late_time_init = choose_time_init();
 }
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index fec1ecedc9b7..e00534b33534 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -241,3 +241,11 @@ void flush_tlb_all(void)
 	on_each_cpu(do_flush_tlb_all, NULL, 1);
 }
 
+void reset_lazy_tlbstate(void)
+{
+	int cpu = raw_smp_processor_id();
+
+	per_cpu(cpu_tlbstate, cpu).state = 0;
+	per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
+}
+
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index d0fbb7712ab0..04431f34fd16 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -6,7 +6,7 @@
  *	This code is released under the GNU General Public License version 2 or
  *	later.
  */
-#include <linux/mc146818rtc.h>
+#include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 #include <linux/kernel.h>
 
@@ -17,6 +17,7 @@
 #include <asm/genapic.h>
 #include <asm/idle.h>
 #include <asm/tsc.h>
+#include <asm/irq_vectors.h>
 
 #include <mach_apic.h>
 
@@ -783,7 +784,7 @@ static int __init uv_bau_init(void)
 		uv_init_blade(blade, node, cur_cpu);
 		cur_cpu += uv_blade_nr_possible_cpus(blade);
 	}
-	set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
+	alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
 	uv_enable_timeouts();
 
 	return 0;
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index ab6bf375a307..6bb7b8579e70 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -10,6 +10,7 @@
 #include <asm/ldt.h>
 #include <asm/processor.h>
 #include <asm/proto.h>
+#include <asm/syscalls.h>
 
 #include "tls.h"
 
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps.c
index 8a768973c4f0..04d242ab0161 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps.c
@@ -7,13 +7,11 @@
  */
 
 /*
- * 'Traps.c' handles hardware traps and faults after we have saved some
- * state in 'asm.s'.
+ * Handle hardware traps and faults.
  */
 #include <linux/interrupt.h>
 #include <linux/kallsyms.h>
 #include <linux/spinlock.h>
-#include <linux/highmem.h>
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/utsname.h>
@@ -32,6 +30,8 @@
 #include <linux/bug.h>
 #include <linux/nmi.h>
 #include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/io.h>
 
 #ifdef CONFIG_EISA
 #include <linux/ioport.h>
@@ -46,20 +46,31 @@
 #include <linux/edac.h>
 #endif
 
-#include <asm/arch_hooks.h>
 #include <asm/stacktrace.h>
 #include <asm/processor.h>
 #include <asm/debugreg.h>
 #include <asm/atomic.h>
 #include <asm/system.h>
 #include <asm/unwind.h>
+#include <asm/traps.h>
 #include <asm/desc.h>
 #include <asm/i387.h>
+
+#include <mach_traps.h>
+
+#ifdef CONFIG_X86_64
+#include <asm/pgalloc.h>
+#include <asm/proto.h>
+#include <asm/pda.h>
+#else
+#include <asm/processor-flags.h>
+#include <asm/arch_hooks.h>
 #include <asm/nmi.h>
 #include <asm/smp.h>
 #include <asm/io.h>
+#include <asm/traps.h>
 
-#include "mach_traps.h"
+#include "cpu/mcheck/mce.h"
 
 DECLARE_BITMAP(used_vectors, NR_VECTORS);
 EXPORT_SYMBOL_GPL(used_vectors);
@@ -76,431 +87,104 @@ char ignore_fpu_irq;
  */
 gate_desc idt_table[256]
 	__attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
-
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void simd_coprocessor_error(void);
-asmlinkage void alignment_check(void);
-asmlinkage void spurious_interrupt_bug(void);
-asmlinkage void machine_check(void);
-
-int panic_on_unrecovered_nmi;
-int kstack_depth_to_print = 24;
-static unsigned int code_bytes = 64;
-static int ignore_nmis;
-static int die_counter;
-
-void printk_address(unsigned long address, int reliable)
-{
-#ifdef CONFIG_KALLSYMS
-	unsigned long offset = 0;
-	unsigned long symsize;
-	const char *symname;
-	char *modname;
-	char *delim = ":";
-	char namebuf[KSYM_NAME_LEN];
-	char reliab[4] = "";
-
-	symname = kallsyms_lookup(address, &symsize, &offset,
-					&modname, namebuf);
-	if (!symname) {
-		printk(" [<%08lx>]\n", address);
-		return;
-	}
-	if (!reliable)
-		strcpy(reliab, "? ");
-
-	if (!modname)
-		modname = delim = "";
-	printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
-		address, reliab, delim, modname, delim, symname, offset, symsize);
-#else
-	printk(" [<%08lx>]\n", address);
 #endif
-}
-
-static inline int valid_stack_ptr(struct thread_info *tinfo,
-			void *p, unsigned int size)
-{
-	void *t = tinfo;
-	return	p > t && p <= t + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
-	struct stack_frame *next_frame;
-	unsigned long return_address;
-};
 
-static inline unsigned long
-print_context_stack(struct thread_info *tinfo,
-		unsigned long *stack, unsigned long bp,
-		const struct stacktrace_ops *ops, void *data)
-{
-	struct stack_frame *frame = (struct stack_frame *)bp;
-
-	while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
-		unsigned long addr;
-
-		addr = *stack;
-		if (__kernel_text_address(addr)) {
-			if ((unsigned long) stack == bp + 4) {
-				ops->address(data, addr, 1);
-				frame = frame->next_frame;
-				bp = (unsigned long) frame;
-			} else {
-				ops->address(data, addr, bp == 0);
-			}
-		}
-		stack++;
-	}
-	return bp;
-}
-
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *stack, unsigned long bp,
-		const struct stacktrace_ops *ops, void *data)
-{
-	if (!task)
-		task = current;
-
-	if (!stack) {
-		unsigned long dummy;
-		stack = &dummy;
-		if (task != current)
-			stack = (unsigned long *)task->thread.sp;
-	}
-
-#ifdef CONFIG_FRAME_POINTER
-	if (!bp) {
-		if (task == current) {
-			/* Grab bp right from our regs */
-			asm("movl %%ebp, %0" : "=r" (bp) :);
-		} else {
-			/* bp is the last reg pushed by switch_to */
-			bp = *(unsigned long *) task->thread.sp;
-		}
-	}
-#endif
-
-	for (;;) {
-		struct thread_info *context;
-
-		context = (struct thread_info *)
-			((unsigned long)stack & (~(THREAD_SIZE - 1)));
-		bp = print_context_stack(context, stack, bp, ops, data);
-		/*
-		 * Should be after the line below, but somewhere
-		 * in early boot context comes out corrupted and we
-		 * can't reference it:
-		 */
-		if (ops->stack(data, "IRQ") < 0)
-			break;
-		stack = (unsigned long *)context->previous_esp;
-		if (!stack)
-			break;
-		touch_nmi_watchdog();
-	}
-}
-EXPORT_SYMBOL(dump_trace);
-
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-	printk(data);
-	print_symbol(msg, symbol);
-	printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
-	printk("%s%s\n", (char *)data, msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
-	return 0;
-}
-
-/*
- * Print one address/symbol entries per line.
- */
-static void print_trace_address(void *data, unsigned long addr, int reliable)
-{
-	printk("%s [<%08lx>] ", (char *)data, addr);
-	if (!reliable)
-		printk("? ");
-	print_symbol("%s\n", addr);
-	touch_nmi_watchdog();
-}
-
-static const struct stacktrace_ops print_trace_ops = {
-	.warning = print_trace_warning,
-	.warning_symbol = print_trace_warning_symbol,
-	.stack = print_trace_stack,
-	.address = print_trace_address,
-};
+static int ignore_nmis;
 
-static void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		   unsigned long *stack, unsigned long bp, char *log_lvl)
+static inline void conditional_sti(struct pt_regs *regs)
 {
-	dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
-	printk("%s =======================\n", log_lvl);
+	if (regs->flags & X86_EFLAGS_IF)
+		local_irq_enable();
 }
 
-void show_trace(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *stack, unsigned long bp)
+static inline void preempt_conditional_sti(struct pt_regs *regs)
 {
-	show_trace_log_lvl(task, regs, stack, bp, "");
+	inc_preempt_count();
+	if (regs->flags & X86_EFLAGS_IF)
+		local_irq_enable();
 }
 
-static void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
-		   unsigned long *sp, unsigned long bp, char *log_lvl)
+static inline void preempt_conditional_cli(struct pt_regs *regs)
 {
-	unsigned long *stack;
-	int i;
-
-	if (sp == NULL) {
-		if (task)
-			sp = (unsigned long *)task->thread.sp;
-		else
-			sp = (unsigned long *)&sp;
-	}
-
-	stack = sp;
-	for (i = 0; i < kstack_depth_to_print; i++) {
-		if (kstack_end(stack))
-			break;
-		if (i && ((i % 8) == 0))
-			printk("\n%s       ", log_lvl);
-		printk("%08lx ", *stack++);
-	}
-	printk("\n%sCall Trace:\n", log_lvl);
-
-	show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+	if (regs->flags & X86_EFLAGS_IF)
+		local_irq_disable();
+	dec_preempt_count();
 }
 
-void show_stack(struct task_struct *task, unsigned long *sp)
+#ifdef CONFIG_X86_32
+static inline void
+die_if_kernel(const char *str, struct pt_regs *regs, long err)
 {
-	printk("       ");
-	show_stack_log_lvl(task, NULL, sp, 0, "");
+	if (!user_mode_vm(regs))
+		die(str, regs, err);
 }
 
 /*
- * The architecture-independent dump_stack generator
+ * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
+ * invalid offset set (the LAZY one) and the faulting thread has
+ * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS,
+ * we set the offset field correctly and return 1.
  */
-void dump_stack(void)
-{
-	unsigned long bp = 0;
-	unsigned long stack;
-
-#ifdef CONFIG_FRAME_POINTER
-	if (!bp)
-		asm("movl %%ebp, %0" : "=r" (bp):);
-#endif
-
-	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
-		current->pid, current->comm, print_tainted(),
-		init_utsname()->release,
-		(int)strcspn(init_utsname()->version, " "),
-		init_utsname()->version);
-
-	show_trace(current, NULL, &stack, bp);
-}
-
-EXPORT_SYMBOL(dump_stack);
-
-void show_registers(struct pt_regs *regs)
+static int lazy_iobitmap_copy(void)
 {
-	int i;
+	struct thread_struct *thread;
+	struct tss_struct *tss;
+	int cpu;
 
-	print_modules();
-	__show_registers(regs, 0);
+	cpu = get_cpu();
+	tss = &per_cpu(init_tss, cpu);
+	thread = &current->thread;
 
-	printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
-		TASK_COMM_LEN, current->comm, task_pid_nr(current),
-		current_thread_info(), current, task_thread_info(current));
-	/*
-	 * When in-kernel, we also print out the stack and code at the
-	 * time of the fault..
-	 */
-	if (!user_mode_vm(regs)) {
-		unsigned int code_prologue = code_bytes * 43 / 64;
-		unsigned int code_len = code_bytes;
-		unsigned char c;
-		u8 *ip;
-
-		printk("\n" KERN_EMERG "Stack: ");
-		show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
-
-		printk(KERN_EMERG "Code: ");
-
-		ip = (u8 *)regs->ip - code_prologue;
-		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
-			/* try starting at EIP */
-			ip = (u8 *)regs->ip;
-			code_len = code_len - code_prologue + 1;
-		}
-		for (i = 0; i < code_len; i++, ip++) {
-			if (ip < (u8 *)PAGE_OFFSET ||
-					probe_kernel_address(ip, c)) {
-				printk(" Bad EIP value.");
-				break;
-			}
-			if (ip == (u8 *)regs->ip)
-				printk("<%02x> ", c);
-			else
-				printk("%02x ", c);
+	if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
+	    thread->io_bitmap_ptr) {
+		memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
+		       thread->io_bitmap_max);
+		/*
+		 * If the previously set map was extending to higher ports
+		 * than the current one, pad extra space with 0xff (no access).
+		 */
+		if (thread->io_bitmap_max < tss->io_bitmap_max) {
+			memset((char *) tss->io_bitmap +
+				thread->io_bitmap_max, 0xff,
+				tss->io_bitmap_max - thread->io_bitmap_max);
 		}
-	}
-	printk("\n");
-}
-
-int is_valid_bugaddr(unsigned long ip)
-{
-	unsigned short ud2;
-
-	if (ip < PAGE_OFFSET)
-		return 0;
-	if (probe_kernel_address((unsigned short *)ip, ud2))
-		return 0;
-
-	return ud2 == 0x0b0f;
-}
-
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-{
-	unsigned short ss;
-	unsigned long sp;
+		tss->io_bitmap_max = thread->io_bitmap_max;
+		tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
+		tss->io_bitmap_owner = thread;
+		put_cpu();
 
-	printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
-	printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
-	printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	printk("DEBUG_PAGEALLOC");
-#endif
-	printk("\n");
-	if (notify_die(DIE_OOPS, str, regs, err,
-			current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
 		return 1;
-
-	show_registers(regs);
-	/* Executive summary in case the oops scrolled away */
-	sp = (unsigned long) (&regs->sp);
-	savesegment(ss, ss);
-	if (user_mode(regs)) {
-		sp = regs->sp;
-		ss = regs->ss & 0xffff;
 	}
-	printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
-	print_symbol("%s", regs->ip);
-	printk(" SS:ESP %04x:%08lx\n", ss, sp);
-	return 0;
-}
-
-/*
- * This is gone through when something in the kernel has done something bad
- * and is about to be terminated:
- */
-void die(const char *str, struct pt_regs *regs, long err)
-{
-	static struct {
-		raw_spinlock_t lock;
-		u32 lock_owner;
-		int lock_owner_depth;
-	} die = {
-		.lock =			__RAW_SPIN_LOCK_UNLOCKED,
-		.lock_owner =		-1,
-		.lock_owner_depth =	0
-	};
-	unsigned long flags;
-
-	oops_enter();
-
-	if (die.lock_owner != raw_smp_processor_id()) {
-		console_verbose();
-		raw_local_irq_save(flags);
-		__raw_spin_lock(&die.lock);
-		die.lock_owner = smp_processor_id();
-		die.lock_owner_depth = 0;
-		bust_spinlocks(1);
-	} else {
-		raw_local_irq_save(flags);
-	}
-
-	if (++die.lock_owner_depth < 3) {
-		report_bug(regs->ip, regs);
-
-		if (__die(str, regs, err))
-			regs = NULL;
-	} else {
-		printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
-	}
-
-	bust_spinlocks(0);
-	die.lock_owner = -1;
-	add_taint(TAINT_DIE);
-	__raw_spin_unlock(&die.lock);
-	raw_local_irq_restore(flags);
-
-	if (!regs)
-		return;
-
-	if (kexec_should_crash(current))
-		crash_kexec(regs);
-
-	if (in_interrupt())
-		panic("Fatal exception in interrupt");
-
-	if (panic_on_oops)
-		panic("Fatal exception");
-
-	oops_exit();
-	do_exit(SIGSEGV);
-}
+	put_cpu();
 
-static inline void
-die_if_kernel(const char *str, struct pt_regs *regs, long err)
-{
-	if (!user_mode_vm(regs))
-		die(str, regs, err);
+	return 0;
 }
+#endif
 
 static void __kprobes
-do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs,
+do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
 	long error_code, siginfo_t *info)
 {
 	struct task_struct *tsk = current;
 
+#ifdef CONFIG_X86_32
 	if (regs->flags & X86_VM_MASK) {
-		if (vm86)
+		/*
+		 * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
+		 * On nmi (interrupt 2), do_trap should not be called.
+		 */
+		if (trapnr < 6)
 			goto vm86_trap;
 		goto trap_signal;
 	}
+#endif
 
 	if (!user_mode(regs))
 		goto kernel_trap;
 
+#ifdef CONFIG_X86_32
 trap_signal:
+#endif
 	/*
 	 * We want error_code and trap_no set for userspace faults and
 	 * kernelspace faults which result in die(), but not
@@ -513,6 +197,18 @@ trap_signal:
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_no = trapnr;
 
+#ifdef CONFIG_X86_64
+	if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
+	    printk_ratelimit()) {
+		printk(KERN_INFO
+		       "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
+		       tsk->comm, tsk->pid, str,
+		       regs->ip, regs->sp, error_code);
+		print_vma_addr(" in ", regs->ip);
+		printk("\n");
+	}
+#endif
+
 	if (info)
 		force_sig_info(signr, info, tsk);
 	else
@@ -527,120 +223,98 @@ kernel_trap:
 	}
 	return;
 
+#ifdef CONFIG_X86_32
 vm86_trap:
 	if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
 						error_code, trapnr))
 		goto trap_signal;
 	return;
+#endif
 }
 
 #define DO_ERROR(trapnr, signr, str, name)				\
-void do_##name(struct pt_regs *regs, long error_code)			\
-{									\
-	trace_hardirqs_fixup();						\
-	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)	\
-							== NOTIFY_STOP)	\
-		return;							\
-	do_trap(trapnr, signr, str, 0, regs, error_code, NULL);		\
-}
-
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq)	\
-void do_##name(struct pt_regs *regs, long error_code)			\
-{									\
-	siginfo_t info;							\
-	if (irq)							\
-		local_irq_enable();					\
-	info.si_signo = signr;						\
-	info.si_errno = 0;						\
-	info.si_code = sicode;						\
-	info.si_addr = (void __user *)siaddr;				\
-	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)	\
-							== NOTIFY_STOP)	\
-		return;							\
-	do_trap(trapnr, signr, str, 0, regs, error_code, &info);	\
-}
-
-#define DO_VM86_ERROR(trapnr, signr, str, name)				\
-void do_##name(struct pt_regs *regs, long error_code)			\
+dotraplinkage void do_##name(struct pt_regs *regs, long error_code)	\
 {									\
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)	\
 							== NOTIFY_STOP)	\
 		return;							\
-	do_trap(trapnr, signr, str, 1, regs, error_code, NULL);		\
+	conditional_sti(regs);						\
+	do_trap(trapnr, signr, str, regs, error_code, NULL);		\
 }
 
-#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr)	\
-void do_##name(struct pt_regs *regs, long error_code)			\
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr)		\
+dotraplinkage void do_##name(struct pt_regs *regs, long error_code)	\
 {									\
 	siginfo_t info;							\
 	info.si_signo = signr;						\
 	info.si_errno = 0;						\
 	info.si_code = sicode;						\
 	info.si_addr = (void __user *)siaddr;				\
-	trace_hardirqs_fixup();						\
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)	\
 							== NOTIFY_STOP)	\
 		return;							\
-	do_trap(trapnr, signr, str, 1, regs, error_code, &info);	\
+	conditional_sti(regs);						\
+	do_trap(trapnr, signr, str, regs, error_code, &info);		\
 }
 
-DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
-#ifndef CONFIG_KPROBES
-DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
-#endif
-DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
-DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
+DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+DO_ERROR(4, SIGSEGV, "overflow", overflow)
+DO_ERROR(5, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
 DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
 DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
 DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
+#ifdef CONFIG_X86_32
 DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
-DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
-DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
+#endif
+DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
+
+#ifdef CONFIG_X86_64
+/* Runs on IST stack */
+dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
+{
+	if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
+			12, SIGBUS) == NOTIFY_STOP)
+		return;
+	preempt_conditional_sti(regs);
+	do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
+	preempt_conditional_cli(regs);
+}
 
-void __kprobes
+dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
+{
+	static const char str[] = "double fault";
+	struct task_struct *tsk = current;
+
+	/* Return not checked because double check cannot be ignored */
+	notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
+
+	tsk->thread.error_code = error_code;
+	tsk->thread.trap_no = 8;
+
+	/* This is always a kernel trap and never fixable (and thus must
+	   never return). */
+	for (;;)
+		die(str, regs, error_code);
+}
+#endif
+
+dotraplinkage void __kprobes
 do_general_protection(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk;
-	struct thread_struct *thread;
-	struct tss_struct *tss;
-	int cpu;
 
-	cpu = get_cpu();
-	tss = &per_cpu(init_tss, cpu);
-	thread = &current->thread;
-
-	/*
-	 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
-	 * invalid offset set (the LAZY one) and the faulting thread has
-	 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS
-	 * and we set the offset field correctly. Then we let the CPU to
-	 * restart the faulting instruction.
-	 */
-	if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
-	    thread->io_bitmap_ptr) {
-		memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
-		       thread->io_bitmap_max);
-		/*
-		 * If the previously set map was extending to higher ports
-		 * than the current one, pad extra space with 0xff (no access).
-		 */
-		if (thread->io_bitmap_max < tss->io_bitmap_max) {
-			memset((char *) tss->io_bitmap +
-				thread->io_bitmap_max, 0xff,
-				tss->io_bitmap_max - thread->io_bitmap_max);
-		}
-		tss->io_bitmap_max = thread->io_bitmap_max;
-		tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
-		tss->io_bitmap_owner = thread;
-		put_cpu();
+	conditional_sti(regs);
 
+#ifdef CONFIG_X86_32
+	if (lazy_iobitmap_copy()) {
+		/* restart the faulting instruction */
 		return;
 	}
-	put_cpu();
 
 	if (regs->flags & X86_VM_MASK)
 		goto gp_in_vm86;
+#endif
 
 	tsk = current;
 	if (!user_mode(regs))
@@ -662,10 +336,12 @@ do_general_protection(struct pt_regs *regs, long error_code)
 	force_sig(SIGSEGV, tsk);
 	return;
 
+#ifdef CONFIG_X86_32
 gp_in_vm86:
 	local_irq_enable();
 	handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
 	return;
+#endif
 
 gp_in_kernel:
 	if (fixup_exception(regs))
@@ -702,7 +378,8 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
 	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
 
 	/* Clear and disable the memory parity error line. */
-	clear_mem_error(reason);
+	reason = (reason & 0xf) | 4;
+	outb(reason, 0x61);
 }
 
 static notrace __kprobes void
@@ -728,7 +405,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
 static notrace __kprobes void
 unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
 {
-	if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+	if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
+			NOTIFY_STOP)
 		return;
 #ifdef CONFIG_MCA
 	/*
@@ -751,41 +429,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
 	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
 }
 
-static DEFINE_SPINLOCK(nmi_print_lock);
-
-void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
-	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
-		return;
-
-	spin_lock(&nmi_print_lock);
-	/*
-	* We are in trouble anyway, lets at least try
-	* to get a message out:
-	*/
-	bust_spinlocks(1);
-	printk(KERN_EMERG "%s", str);
-	printk(" on CPU%d, ip %08lx, registers:\n",
-		smp_processor_id(), regs->ip);
-	show_registers(regs);
-	if (do_panic)
-		panic("Non maskable interrupt");
-	console_silent();
-	spin_unlock(&nmi_print_lock);
-	bust_spinlocks(0);
-
-	/*
-	 * If we are in kernel we are probably nested up pretty bad
-	 * and might aswell get out now while we still can:
-	 */
-	if (!user_mode_vm(regs)) {
-		current->thread.trap_no = 2;
-		crash_kexec(regs);
-	}
-
-	do_exit(SIGSEGV);
-}
-
 static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 {
 	unsigned char reason = 0;
@@ -824,22 +467,25 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
 		mem_parity_error(reason, regs);
 	if (reason & 0x40)
 		io_check_error(reason, regs);
+#ifdef CONFIG_X86_32
 	/*
 	 * Reassert NMI in case it became active meanwhile
 	 * as it's edge-triggered:
 	 */
 	reassert_nmi();
+#endif
 }
 
-notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
+dotraplinkage notrace __kprobes void
+do_nmi(struct pt_regs *regs, long error_code)
 {
-	int cpu;
-
 	nmi_enter();
 
-	cpu = smp_processor_id();
-
-	++nmi_count(cpu);
+#ifdef CONFIG_X86_32
+	{ int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
+#else
+	add_pda(__nmi_count, 1);
+#endif
 
 	if (!ignore_nmis)
 		default_do_nmi(regs);
@@ -859,21 +505,44 @@ void restart_nmi(void)
 	acpi_nmi_enable();
 }
 
-#ifdef CONFIG_KPROBES
-void __kprobes do_int3(struct pt_regs *regs, long error_code)
+/* May run on IST stack. */
+dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
-	trace_hardirqs_fixup();
-
+#ifdef CONFIG_KPROBES
 	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
 			== NOTIFY_STOP)
 		return;
-	/*
-	 * This is an interrupt gate, because kprobes wants interrupts
-	 * disabled. Normal trap handlers don't.
-	 */
-	restore_interrupts(regs);
+#else
+	if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
+			== NOTIFY_STOP)
+		return;
+#endif
 
-	do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
+	preempt_conditional_sti(regs);
+	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
+	preempt_conditional_cli(regs);
+}
+
+#ifdef CONFIG_X86_64
+/* Help handler running on IST stack to switch back to user stack
+   for scheduling or signal handling. The actual stack switch is done in
+   entry.S */
+asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
+{
+	struct pt_regs *regs = eregs;
+	/* Did already sync */
+	if (eregs == (struct pt_regs *)eregs->sp)
+		;
+	/* Exception from user space */
+	else if (user_mode(eregs))
+		regs = task_pt_regs(current);
+	/* Exception from kernel and interrupts are enabled. Move to
+	   kernel process stack. */
+	else if (eregs->flags & X86_EFLAGS_IF)
+		regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
+	if (eregs != regs)
+		*regs = *eregs;
+	return regs;
 }
 #endif
 
@@ -898,13 +567,14 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code)
  * about restoring all the debug state, and ptrace doesn't have to
  * find every occurrence of the TF bit that could be saved away even
  * by user code)
+ *
+ * May run on IST stack.
  */
-void __kprobes do_debug(struct pt_regs *regs, long error_code)
+dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk = current;
-	unsigned int condition;
-
-	trace_hardirqs_fixup();
+	unsigned long condition;
+	int si_code;
 
 	get_debugreg(condition, 6);
 
@@ -917,9 +587,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
 						SIGTRAP) == NOTIFY_STOP)
 		return;
+
 	/* It's safe to allow irq's after DR6 has been saved */
-	if (regs->flags & X86_EFLAGS_IF)
-		local_irq_enable();
+	preempt_conditional_sti(regs);
 
 	/* Mask out spurious debug traps due to lazy DR7 setting */
 	if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
@@ -927,8 +597,10 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
 			goto clear_dr7;
 	}
 
+#ifdef CONFIG_X86_32
 	if (regs->flags & X86_VM_MASK)
 		goto debug_vm86;
+#endif
 
 	/* Save debug status register where ptrace can see it */
 	tsk->thread.debugreg6 = condition;
@@ -938,17 +610,13 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	 * kernel space (but re-enable TF when returning to user mode).
 	 */
 	if (condition & DR_STEP) {
-		/*
-		 * We already checked v86 mode above, so we can
-		 * check for kernel mode by just checking the CPL
-		 * of CS.
-		 */
 		if (!user_mode(regs))
 			goto clear_TF_reenable;
 	}
 
+	si_code = get_si_code(condition);
 	/* Ok, finally something we can handle */
-	send_sigtrap(tsk, regs, error_code);
+	send_sigtrap(tsk, regs, error_code, si_code);
 
 	/*
 	 * Disable additional traps. They'll be re-enabled when
@@ -956,18 +624,37 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
 	 */
 clear_dr7:
 	set_debugreg(0, 7);
+	preempt_conditional_cli(regs);
 	return;
 
+#ifdef CONFIG_X86_32
 debug_vm86:
 	handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
+	preempt_conditional_cli(regs);
 	return;
+#endif
 
 clear_TF_reenable:
 	set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
 	regs->flags &= ~X86_EFLAGS_TF;
+	preempt_conditional_cli(regs);
 	return;
 }
 
+#ifdef CONFIG_X86_64
+static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
+{
+	if (fixup_exception(regs))
+		return 1;
+
+	notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
+	/* Illegal floating point operation in the kernel */
+	current->thread.trap_no = trapnr;
+	die(str, regs, 0);
+	return 0;
+}
+#endif
+
 /*
  * Note that we play around with the 'TS' bit in an attempt to get
  * the correct behaviour even in the presence of the asynchronous
@@ -1004,7 +691,9 @@ void math_error(void __user *ip)
 	swd = get_fpu_swd(task);
 	switch (swd & ~cwd & 0x3f) {
 	case 0x000: /* No unmasked exception */
+#ifdef CONFIG_X86_32
 		return;
+#endif
 	default: /* Multiple exceptions */
 		break;
 	case 0x001: /* Invalid Op */
@@ -1032,9 +721,18 @@ void math_error(void __user *ip)
 	force_sig_info(SIGFPE, &info, task);
 }
 
-void do_coprocessor_error(struct pt_regs *regs, long error_code)
+dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 {
+	conditional_sti(regs);
+
+#ifdef CONFIG_X86_32
 	ignore_fpu_irq = 1;
+#else
+	if (!user_mode(regs) &&
+	    kernel_math_error(regs, "kernel x87 math error", 16))
+		return;
+#endif
+
 	math_error((void __user *)regs->ip);
 }
 
@@ -1086,8 +784,12 @@ static void simd_math_error(void __user *ip)
 	force_sig_info(SIGFPE, &info, task);
 }
 
-void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
+dotraplinkage void
+do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
+	conditional_sti(regs);
+
+#ifdef CONFIG_X86_32
 	if (cpu_has_xmm) {
 		/* Handle SIMD FPU exceptions on PIII+ processors. */
 		ignore_fpu_irq = 1;
@@ -1106,16 +808,25 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 	current->thread.error_code = error_code;
 	die_if_kernel("cache flush denied", regs, error_code);
 	force_sig(SIGSEGV, current);
+#else
+	if (!user_mode(regs) &&
+			kernel_math_error(regs, "kernel simd math error", 19))
+		return;
+	simd_math_error((void __user *)regs->ip);
+#endif
 }
 
-void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
+dotraplinkage void
+do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 {
+	conditional_sti(regs);
 #if 0
 	/* No need to warn about this any longer. */
 	printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
 #endif
 }
 
+#ifdef CONFIG_X86_32
 unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
 {
 	struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
@@ -1134,6 +845,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
 
 	return new_kesp;
 }
+#else
+asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
+{
+}
+
+asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
+{
+}
+#endif
 
 /*
  * 'math_state_restore()' saves the current math information in the
@@ -1166,14 +886,24 @@ asmlinkage void math_state_restore(void)
 	}
 
 	clts();				/* Allow maths ops (or we recurse) */
+#ifdef CONFIG_X86_32
 	restore_fpu(tsk);
+#else
+	/*
+	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
+	 */
+	if (unlikely(restore_fpu_checking(tsk))) {
+		stts();
+		force_sig(SIGSEGV, tsk);
+		return;
+	}
+#endif
 	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */
 	tsk->fpu_counter++;
 }
 EXPORT_SYMBOL_GPL(math_state_restore);
 
 #ifndef CONFIG_MATH_EMULATION
-
 asmlinkage void math_emulate(long arg)
 {
 	printk(KERN_EMERG
@@ -1182,12 +912,46 @@ asmlinkage void math_emulate(long arg)
 	force_sig(SIGFPE, current);
 	schedule();
 }
-
 #endif /* CONFIG_MATH_EMULATION */
 
+dotraplinkage void __kprobes
+do_device_not_available(struct pt_regs *regs, long error)
+{
+#ifdef CONFIG_X86_32
+	if (read_cr0() & X86_CR0_EM) {
+		conditional_sti(regs);
+		math_emulate(0);
+	} else {
+		math_state_restore(); /* interrupts still off */
+		conditional_sti(regs);
+	}
+#else
+	math_state_restore();
+#endif
+}
+
+#ifdef CONFIG_X86_32
+dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
+{
+	siginfo_t info;
+	local_irq_enable();
+
+	info.si_signo = SIGILL;
+	info.si_errno = 0;
+	info.si_code = ILL_BADSTK;
+	info.si_addr = 0;
+	if (notify_die(DIE_TRAP, "iret exception",
+			regs, error_code, 32, SIGILL) == NOTIFY_STOP)
+		return;
+	do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
+}
+#endif
+
 void __init trap_init(void)
 {
+#ifdef CONFIG_X86_32
 	int i;
+#endif
 
 #ifdef CONFIG_EISA
 	void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@ -1197,29 +961,40 @@ void __init trap_init(void)
 	early_iounmap(p, 4);
 #endif
 
-	set_trap_gate(0, &divide_error);
-	set_intr_gate(1, &debug);
-	set_intr_gate(2, &nmi);
-	set_system_intr_gate(3, &int3); /* int3 can be called from all */
-	set_system_gate(4, &overflow); /* int4 can be called from all */
-	set_trap_gate(5, &bounds);
-	set_trap_gate(6, &invalid_op);
-	set_trap_gate(7, &device_not_available);
+	set_intr_gate(0, &divide_error);
+	set_intr_gate_ist(1, &debug, DEBUG_STACK);
+	set_intr_gate_ist(2, &nmi, NMI_STACK);
+	/* int3 can be called from all */
+	set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
+	/* int4 can be called from all */
+	set_system_intr_gate(4, &overflow);
+	set_intr_gate(5, &bounds);
+	set_intr_gate(6, &invalid_op);
+	set_intr_gate(7, &device_not_available);
+#ifdef CONFIG_X86_32
 	set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
-	set_trap_gate(9, &coprocessor_segment_overrun);
-	set_trap_gate(10, &invalid_TSS);
-	set_trap_gate(11, &segment_not_present);
-	set_trap_gate(12, &stack_segment);
-	set_trap_gate(13, &general_protection);
+#else
+	set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
+#endif
+	set_intr_gate(9, &coprocessor_segment_overrun);
+	set_intr_gate(10, &invalid_TSS);
+	set_intr_gate(11, &segment_not_present);
+	set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
+	set_intr_gate(13, &general_protection);
 	set_intr_gate(14, &page_fault);
-	set_trap_gate(15, &spurious_interrupt_bug);
-	set_trap_gate(16, &coprocessor_error);
-	set_trap_gate(17, &alignment_check);
+	set_intr_gate(15, &spurious_interrupt_bug);
+	set_intr_gate(16, &coprocessor_error);
+	set_intr_gate(17, &alignment_check);
 #ifdef CONFIG_X86_MCE
-	set_trap_gate(18, &machine_check);
+	set_intr_gate_ist(18, &machine_check, MCE_STACK);
 #endif
-	set_trap_gate(19, &simd_coprocessor_error);
+	set_intr_gate(19, &simd_coprocessor_error);
 
+#ifdef CONFIG_IA32_EMULATION
+	set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+#endif
+
+#ifdef CONFIG_X86_32
 	if (cpu_has_fxsr) {
 		printk(KERN_INFO "Enabling fast FPU save and restore... ");
 		set_in_cr4(X86_CR4_OSFXSR);
@@ -1232,37 +1007,20 @@ void __init trap_init(void)
 		printk("done.\n");
 	}
 
-	set_system_gate(SYSCALL_VECTOR, &system_call);
+	set_system_trap_gate(SYSCALL_VECTOR, &system_call);
 
 	/* Reserve all the builtin and the syscall vector: */
 	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
 		set_bit(i, used_vectors);
 
 	set_bit(SYSCALL_VECTOR, used_vectors);
-
-	init_thread_xstate();
+#endif
 	/*
 	 * Should be a barrier for any external CPU state:
 	 */
 	cpu_init();
 
+#ifdef CONFIG_X86_32
 	trap_init_hook();
+#endif
 }
-
-static int __init kstack_setup(char *s)
-{
-	kstack_depth_to_print = simple_strtoul(s, NULL, 0);
-
-	return 1;
-}
-__setup("kstack=", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
-	code_bytes = simple_strtoul(s, NULL, 0);
-	if (code_bytes > 8192)
-		code_bytes = 8192;
-
-	return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
deleted file mode 100644
index 2696a6837782..000000000000
--- a/arch/x86/kernel/traps_64.c
+++ /dev/null
@@ -1,1217 +0,0 @@
-/*
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *  Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
- *
- *  Pentium III FXSR, SSE support
- *	Gareth Hughes <gareth@valinux.com>, May 2000
- */
-
-/*
- * 'Traps.c' handles hardware traps and faults after we have saved some
- * state in 'entry.S'.
- */
-#include <linux/moduleparam.h>
-#include <linux/interrupt.h>
-#include <linux/kallsyms.h>
-#include <linux/spinlock.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/utsname.h>
-#include <linux/kdebug.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/unwind.h>
-#include <linux/delay.h>
-#include <linux/errno.h>
-#include <linux/kexec.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/init.h>
-#include <linux/bug.h>
-#include <linux/nmi.h>
-#include <linux/mm.h>
-
-#if defined(CONFIG_EDAC)
-#include <linux/edac.h>
-#endif
-
-#include <asm/stacktrace.h>
-#include <asm/processor.h>
-#include <asm/debugreg.h>
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/unwind.h>
-#include <asm/desc.h>
-#include <asm/i387.h>
-#include <asm/nmi.h>
-#include <asm/smp.h>
-#include <asm/io.h>
-#include <asm/pgalloc.h>
-#include <asm/proto.h>
-#include <asm/pda.h>
-
-#include <mach_traps.h>
-
-asmlinkage void divide_error(void);
-asmlinkage void debug(void);
-asmlinkage void nmi(void);
-asmlinkage void int3(void);
-asmlinkage void overflow(void);
-asmlinkage void bounds(void);
-asmlinkage void invalid_op(void);
-asmlinkage void device_not_available(void);
-asmlinkage void double_fault(void);
-asmlinkage void coprocessor_segment_overrun(void);
-asmlinkage void invalid_TSS(void);
-asmlinkage void segment_not_present(void);
-asmlinkage void stack_segment(void);
-asmlinkage void general_protection(void);
-asmlinkage void page_fault(void);
-asmlinkage void coprocessor_error(void);
-asmlinkage void simd_coprocessor_error(void);
-asmlinkage void alignment_check(void);
-asmlinkage void spurious_interrupt_bug(void);
-asmlinkage void machine_check(void);
-
-int panic_on_unrecovered_nmi;
-int kstack_depth_to_print = 12;
-static unsigned int code_bytes = 64;
-static int ignore_nmis;
-static int die_counter;
-
-static inline void conditional_sti(struct pt_regs *regs)
-{
-	if (regs->flags & X86_EFLAGS_IF)
-		local_irq_enable();
-}
-
-static inline void preempt_conditional_sti(struct pt_regs *regs)
-{
-	inc_preempt_count();
-	if (regs->flags & X86_EFLAGS_IF)
-		local_irq_enable();
-}
-
-static inline void preempt_conditional_cli(struct pt_regs *regs)
-{
-	if (regs->flags & X86_EFLAGS_IF)
-		local_irq_disable();
-	/* Make sure to not schedule here because we could be running
-	   on an exception stack. */
-	dec_preempt_count();
-}
-
-void printk_address(unsigned long address, int reliable)
-{
-	printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
-}
-
-static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
-					unsigned *usedp, char **idp)
-{
-	static char ids[][8] = {
-		[DEBUG_STACK - 1] = "#DB",
-		[NMI_STACK - 1] = "NMI",
-		[DOUBLEFAULT_STACK - 1] = "#DF",
-		[STACKFAULT_STACK - 1] = "#SS",
-		[MCE_STACK - 1] = "#MC",
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
-		[N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
-#endif
-	};
-	unsigned k;
-
-	/*
-	 * Iterate over all exception stacks, and figure out whether
-	 * 'stack' is in one of them:
-	 */
-	for (k = 0; k < N_EXCEPTION_STACKS; k++) {
-		unsigned long end = per_cpu(orig_ist, cpu).ist[k];
-		/*
-		 * Is 'stack' above this exception frame's end?
-		 * If yes then skip to the next frame.
-		 */
-		if (stack >= end)
-			continue;
-		/*
-		 * Is 'stack' above this exception frame's start address?
-		 * If yes then we found the right frame.
-		 */
-		if (stack >= end - EXCEPTION_STKSZ) {
-			/*
-			 * Make sure we only iterate through an exception
-			 * stack once. If it comes up for the second time
-			 * then there's something wrong going on - just
-			 * break out and return NULL:
-			 */
-			if (*usedp & (1U << k))
-				break;
-			*usedp |= 1U << k;
-			*idp = ids[k];
-			return (unsigned long *)end;
-		}
-		/*
-		 * If this is a debug stack, and if it has a larger size than
-		 * the usual exception stacks, then 'stack' might still
-		 * be within the lower portion of the debug stack:
-		 */
-#if DEBUG_STKSZ > EXCEPTION_STKSZ
-		if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
-			unsigned j = N_EXCEPTION_STACKS - 1;
-
-			/*
-			 * Black magic. A large debug stack is composed of
-			 * multiple exception stack entries, which we
-			 * iterate through now. Dont look:
-			 */
-			do {
-				++j;
-				end -= EXCEPTION_STKSZ;
-				ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
-			} while (stack < end - EXCEPTION_STKSZ);
-			if (*usedp & (1U << j))
-				break;
-			*usedp |= 1U << j;
-			*idp = ids[j];
-			return (unsigned long *)end;
-		}
-#endif
-	}
-	return NULL;
-}
-
-/*
- * x86-64 can have up to three kernel stacks: 
- * process stack
- * interrupt stack
- * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
- */
-
-static inline int valid_stack_ptr(struct thread_info *tinfo,
-			void *p, unsigned int size, void *end)
-{
-	void *t = tinfo;
-	if (end) {
-		if (p < end && p >= (end-THREAD_SIZE))
-			return 1;
-		else
-			return 0;
-	}
-	return p > t && p < t + THREAD_SIZE - size;
-}
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
-	struct stack_frame *next_frame;
-	unsigned long return_address;
-};
-
-static inline unsigned long
-print_context_stack(struct thread_info *tinfo,
-		unsigned long *stack, unsigned long bp,
-		const struct stacktrace_ops *ops, void *data,
-		unsigned long *end)
-{
-	struct stack_frame *frame = (struct stack_frame *)bp;
-
-	while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
-		unsigned long addr;
-
-		addr = *stack;
-		if (__kernel_text_address(addr)) {
-			if ((unsigned long) stack == bp + 8) {
-				ops->address(data, addr, 1);
-				frame = frame->next_frame;
-				bp = (unsigned long) frame;
-			} else {
-				ops->address(data, addr, bp == 0);
-			}
-		}
-		stack++;
-	}
-	return bp;
-}
-
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *stack, unsigned long bp,
-		const struct stacktrace_ops *ops, void *data)
-{
-	const unsigned cpu = get_cpu();
-	unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
-	unsigned used = 0;
-	struct thread_info *tinfo;
-
-	if (!task)
-		task = current;
-
-	if (!stack) {
-		unsigned long dummy;
-		stack = &dummy;
-		if (task && task != current)
-			stack = (unsigned long *)task->thread.sp;
-	}
-
-#ifdef CONFIG_FRAME_POINTER
-	if (!bp) {
-		if (task == current) {
-			/* Grab bp right from our regs */
-			asm("movq %%rbp, %0" : "=r" (bp) :);
-		} else {
-			/* bp is the last reg pushed by switch_to */
-			bp = *(unsigned long *) task->thread.sp;
-		}
-	}
-#endif
-
-	/*
-	 * Print function call entries in all stacks, starting at the
-	 * current stack address. If the stacks consist of nested
-	 * exceptions
-	 */
-	tinfo = task_thread_info(task);
-	for (;;) {
-		char *id;
-		unsigned long *estack_end;
-		estack_end = in_exception_stack(cpu, (unsigned long)stack,
-						&used, &id);
-
-		if (estack_end) {
-			if (ops->stack(data, id) < 0)
-				break;
-
-			bp = print_context_stack(tinfo, stack, bp, ops,
-							data, estack_end);
-			ops->stack(data, "<EOE>");
-			/*
-			 * We link to the next stack via the
-			 * second-to-last pointer (index -2 to end) in the
-			 * exception stack:
-			 */
-			stack = (unsigned long *) estack_end[-2];
-			continue;
-		}
-		if (irqstack_end) {
-			unsigned long *irqstack;
-			irqstack = irqstack_end -
-				(IRQSTACKSIZE - 64) / sizeof(*irqstack);
-
-			if (stack >= irqstack && stack < irqstack_end) {
-				if (ops->stack(data, "IRQ") < 0)
-					break;
-				bp = print_context_stack(tinfo, stack, bp,
-						ops, data, irqstack_end);
-				/*
-				 * We link to the next stack (which would be
-				 * the process stack normally) the last
-				 * pointer (index -1 to end) in the IRQ stack:
-				 */
-				stack = (unsigned long *) (irqstack_end[-1]);
-				irqstack_end = NULL;
-				ops->stack(data, "EOI");
-				continue;
-			}
-		}
-		break;
-	}
-
-	/*
-	 * This handles the process stack:
-	 */
-	bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
-	put_cpu();
-}
-EXPORT_SYMBOL(dump_trace);
-
-static void
-print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
-{
-	print_symbol(msg, symbol);
-	printk("\n");
-}
-
-static void print_trace_warning(void *data, char *msg)
-{
-	printk("%s\n", msg);
-}
-
-static int print_trace_stack(void *data, char *name)
-{
-	printk(" <%s> ", name);
-	return 0;
-}
-
-static void print_trace_address(void *data, unsigned long addr, int reliable)
-{
-	touch_nmi_watchdog();
-	printk_address(addr, reliable);
-}
-
-static const struct stacktrace_ops print_trace_ops = {
-	.warning = print_trace_warning,
-	.warning_symbol = print_trace_warning_symbol,
-	.stack = print_trace_stack,
-	.address = print_trace_address,
-};
-
-void show_trace(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *stack, unsigned long bp)
-{
-	printk("\nCall Trace:\n");
-	dump_trace(task, regs, stack, bp, &print_trace_ops, NULL);
-	printk("\n");
-}
-
-static void
-_show_stack(struct task_struct *task, struct pt_regs *regs,
-		unsigned long *sp, unsigned long bp)
-{
-	unsigned long *stack;
-	int i;
-	const int cpu = smp_processor_id();
-	unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
-	unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
-
-	// debugging aid: "show_stack(NULL, NULL);" prints the
-	// back trace for this cpu.
-
-	if (sp == NULL) {
-		if (task)
-			sp = (unsigned long *)task->thread.sp;
-		else
-			sp = (unsigned long *)&sp;
-	}
-
-	stack = sp;
-	for (i = 0; i < kstack_depth_to_print; i++) {
-		if (stack >= irqstack && stack <= irqstack_end) {
-			if (stack == irqstack_end) {
-				stack = (unsigned long *) (irqstack_end[-1]);
-				printk(" <EOI> ");
-			}
-		} else {
-		if (((long) stack & (THREAD_SIZE-1)) == 0)
-			break;
-		}
-		if (i && ((i % 4) == 0))
-			printk("\n");
-		printk(" %016lx", *stack++);
-		touch_nmi_watchdog();
-	}
-	show_trace(task, regs, sp, bp);
-}
-
-void show_stack(struct task_struct *task, unsigned long *sp)
-{
-	_show_stack(task, NULL, sp, 0);
-}
-
-/*
- * The architecture-independent dump_stack generator
- */
-void dump_stack(void)
-{
-	unsigned long bp = 0;
-	unsigned long stack;
-
-#ifdef CONFIG_FRAME_POINTER
-	if (!bp)
-		asm("movq %%rbp, %0" : "=r" (bp):);
-#endif
-
-	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
-		current->pid, current->comm, print_tainted(),
-		init_utsname()->release,
-		(int)strcspn(init_utsname()->version, " "),
-		init_utsname()->version);
-	show_trace(NULL, NULL, &stack, bp);
-}
-
-EXPORT_SYMBOL(dump_stack);
-
-void show_registers(struct pt_regs *regs)
-{
-	int i;
-	unsigned long sp;
-	const int cpu = smp_processor_id();
-	struct task_struct *cur = cpu_pda(cpu)->pcurrent;
-
-	sp = regs->sp;
-	printk("CPU %d ", cpu);
-	__show_regs(regs);
-	printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
-		cur->comm, cur->pid, task_thread_info(cur), cur);
-
-	/*
-	 * When in-kernel, we also print out the stack and code at the
-	 * time of the fault..
-	 */
-	if (!user_mode(regs)) {
-		unsigned int code_prologue = code_bytes * 43 / 64;
-		unsigned int code_len = code_bytes;
-		unsigned char c;
-		u8 *ip;
-
-		printk("Stack: ");
-		_show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
-		printk("\n");
-
-		printk(KERN_EMERG "Code: ");
-
-		ip = (u8 *)regs->ip - code_prologue;
-		if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
-			/* try starting at RIP */
-			ip = (u8 *)regs->ip;
-			code_len = code_len - code_prologue + 1;
-		}
-		for (i = 0; i < code_len; i++, ip++) {
-			if (ip < (u8 *)PAGE_OFFSET ||
-					probe_kernel_address(ip, c)) {
-				printk(" Bad RIP value.");
-				break;
-			}
-			if (ip == (u8 *)regs->ip)
-				printk("<%02x> ", c);
-			else
-				printk("%02x ", c);
-		}
-	}
-	printk("\n");
-}
-
-int is_valid_bugaddr(unsigned long ip)
-{
-	unsigned short ud2;
-
-	if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
-		return 0;
-
-	return ud2 == 0x0b0f;
-}
-
-static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
-static int die_owner = -1;
-static unsigned int die_nest_count;
-
-unsigned __kprobes long oops_begin(void)
-{
-	int cpu;
-	unsigned long flags;
-
-	oops_enter();
-
-	/* racy, but better than risking deadlock. */
-	raw_local_irq_save(flags);
-	cpu = smp_processor_id();
-	if (!__raw_spin_trylock(&die_lock)) {
-		if (cpu == die_owner) 
-			/* nested oops. should stop eventually */;
-		else
-			__raw_spin_lock(&die_lock);
-	}
-	die_nest_count++;
-	die_owner = cpu;
-	console_verbose();
-	bust_spinlocks(1);
-	return flags;
-}
-
-void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
-{ 
-	die_owner = -1;
-	bust_spinlocks(0);
-	die_nest_count--;
-	if (!die_nest_count)
-		/* Nest count reaches zero, release the lock. */
-		__raw_spin_unlock(&die_lock);
-	raw_local_irq_restore(flags);
-	if (!regs) {
-		oops_exit();
-		return;
-	}
-	if (panic_on_oops)
-		panic("Fatal exception");
-	oops_exit();
-	do_exit(signr);
-}
-
-int __kprobes __die(const char *str, struct pt_regs *regs, long err)
-{
-	printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
-#ifdef CONFIG_PREEMPT
-	printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
-	printk("SMP ");
-#endif
-#ifdef CONFIG_DEBUG_PAGEALLOC
-	printk("DEBUG_PAGEALLOC");
-#endif
-	printk("\n");
-	if (notify_die(DIE_OOPS, str, regs, err,
-			current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
-		return 1;
-
-	show_registers(regs);
-	add_taint(TAINT_DIE);
-	/* Executive summary in case the oops scrolled away */
-	printk(KERN_ALERT "RIP ");
-	printk_address(regs->ip, 1);
-	printk(" RSP <%016lx>\n", regs->sp);
-	if (kexec_should_crash(current))
-		crash_kexec(regs);
-	return 0;
-}
-
-void die(const char *str, struct pt_regs *regs, long err)
-{
-	unsigned long flags = oops_begin();
-
-	if (!user_mode(regs))
-		report_bug(regs->ip, regs);
-
-	if (__die(str, regs, err))
-		regs = NULL;
-	oops_end(flags, regs, SIGSEGV);
-}
-
-notrace __kprobes void
-die_nmi(char *str, struct pt_regs *regs, int do_panic)
-{
-	unsigned long flags;
-
-	if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
-		return;
-
-	flags = oops_begin();
-	/*
-	 * We are in trouble anyway, lets at least try
-	 * to get a message out.
-	 */
-	printk(KERN_EMERG "%s", str);
-	printk(" on CPU%d, ip %08lx, registers:\n",
-		smp_processor_id(), regs->ip);
-	show_registers(regs);
-	if (kexec_should_crash(current))
-		crash_kexec(regs);
-	if (do_panic || panic_on_oops)
-		panic("Non maskable interrupt");
-	oops_end(flags, NULL, SIGBUS);
-	nmi_exit();
-	local_irq_enable();
-	do_exit(SIGBUS);
-}
-
-static void __kprobes
-do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
-	long error_code, siginfo_t *info)
-{
-	struct task_struct *tsk = current;
-
-	if (!user_mode(regs))
-		goto kernel_trap;
-
-	/*
-	 * We want error_code and trap_no set for userspace faults and
-	 * kernelspace faults which result in die(), but not
-	 * kernelspace faults which are fixed up.  die() gives the
-	 * process no chance to handle the signal and notice the
-	 * kernel fault information, so that won't result in polluting
-	 * the information about previously queued, but not yet
-	 * delivered, faults.  See also do_general_protection below.
-	 */
-	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = trapnr;
-
-	if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
-	    printk_ratelimit()) {
-		printk(KERN_INFO
-		       "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
-		       tsk->comm, tsk->pid, str,
-		       regs->ip, regs->sp, error_code);
-		print_vma_addr(" in ", regs->ip);
-		printk("\n");
-	}
-
-	if (info)
-		force_sig_info(signr, info, tsk);
-	else
-		force_sig(signr, tsk);
-	return;
-
-kernel_trap:
-	if (!fixup_exception(regs)) {
-		tsk->thread.error_code = error_code;
-		tsk->thread.trap_no = trapnr;
-		die(str, regs, error_code);
-	}
-	return;
-}
-
-#define DO_ERROR(trapnr, signr, str, name) \
-asmlinkage void do_##name(struct pt_regs * regs, long error_code)	\
-{									\
-	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)	\
-							== NOTIFY_STOP)	\
-		return;							\
-	conditional_sti(regs);						\
-	do_trap(trapnr, signr, str, regs, error_code, NULL);		\
-}
-
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr)		\
-asmlinkage void do_##name(struct pt_regs * regs, long error_code)	\
-{									\
-	siginfo_t info;							\
-	info.si_signo = signr;						\
-	info.si_errno = 0;						\
-	info.si_code = sicode;						\
-	info.si_addr = (void __user *)siaddr;				\
-	trace_hardirqs_fixup();						\
-	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)	\
-							== NOTIFY_STOP)	\
-		return;							\
-	conditional_sti(regs);						\
-	do_trap(trapnr, signr, str, regs, error_code, &info);		\
-}
-
-DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
-DO_ERROR(4, SIGSEGV, "overflow", overflow)
-DO_ERROR(5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
-DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
-DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
-DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
-DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
-
-/* Runs on IST stack */
-asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
-{
-	if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
-			12, SIGBUS) == NOTIFY_STOP)
-		return;
-	preempt_conditional_sti(regs);
-	do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
-	preempt_conditional_cli(regs);
-}
-
-asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
-{
-	static const char str[] = "double fault";
-	struct task_struct *tsk = current;
-
-	/* Return not checked because double check cannot be ignored */
-	notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
-
-	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = 8;
-
-	/* This is always a kernel trap and never fixable (and thus must
-	   never return). */
-	for (;;)
-		die(str, regs, error_code);
-}
-
-asmlinkage void __kprobes
-do_general_protection(struct pt_regs *regs, long error_code)
-{
-	struct task_struct *tsk;
-
-	conditional_sti(regs);
-
-	tsk = current;
-	if (!user_mode(regs))
-		goto gp_in_kernel;
-
-	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = 13;
-
-	if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
-			printk_ratelimit()) {
-		printk(KERN_INFO
-			"%s[%d] general protection ip:%lx sp:%lx error:%lx",
-			tsk->comm, tsk->pid,
-			regs->ip, regs->sp, error_code);
-		print_vma_addr(" in ", regs->ip);
-		printk("\n");
-	}
-
-	force_sig(SIGSEGV, tsk);
-	return;
-
-gp_in_kernel:
-	if (fixup_exception(regs))
-		return;
-
-	tsk->thread.error_code = error_code;
-	tsk->thread.trap_no = 13;
-	if (notify_die(DIE_GPF, "general protection fault", regs,
-				error_code, 13, SIGSEGV) == NOTIFY_STOP)
-		return;
-	die("general protection fault", regs, error_code);
-}
-
-static notrace __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs *regs)
-{
-	printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
-		reason);
-	printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
-
-#if defined(CONFIG_EDAC)
-	if (edac_handler_set()) {
-		edac_atomic_assert_error();
-		return;
-	}
-#endif
-
-	if (panic_on_unrecovered_nmi)
-		panic("NMI: Not continuing");
-
-	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
-
-	/* Clear and disable the memory parity error line. */
-	reason = (reason & 0xf) | 4;
-	outb(reason, 0x61);
-}
-
-static notrace __kprobes void
-io_check_error(unsigned char reason, struct pt_regs *regs)
-{
-	printk("NMI: IOCK error (debug interrupt?)\n");
-	show_registers(regs);
-
-	/* Re-enable the IOCK line, wait for a few seconds */
-	reason = (reason & 0xf) | 8;
-	outb(reason, 0x61);
-	mdelay(2000);
-	reason &= ~8;
-	outb(reason, 0x61);
-}
-
-static notrace __kprobes void
-unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
-{
-	if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
-		return;
-	printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
-		reason);
-	printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
-
-	if (panic_on_unrecovered_nmi)
-		panic("NMI: Not continuing");
-
-	printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
-}
-
-/* Runs on IST stack. This code must keep interrupts off all the time.
-   Nested NMIs are prevented by the CPU. */
-asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
-{
-	unsigned char reason = 0;
-	int cpu;
-
-	cpu = smp_processor_id();
-
-	/* Only the BSP gets external NMIs from the system. */
-	if (!cpu)
-		reason = get_nmi_reason();
-
-	if (!(reason & 0xc0)) {
-		if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
-								== NOTIFY_STOP)
-			return;
-		/*
-		 * Ok, so this is none of the documented NMI sources,
-		 * so it must be the NMI watchdog.
-		 */
-		if (nmi_watchdog_tick(regs, reason))
-			return;
-		if (!do_nmi_callback(regs, cpu))
-			unknown_nmi_error(reason, regs);
-
-		return;
-	}
-	if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
-		return;
-
-	/* AK: following checks seem to be broken on modern chipsets. FIXME */
-	if (reason & 0x80)
-		mem_parity_error(reason, regs);
-	if (reason & 0x40)
-		io_check_error(reason, regs);
-}
-
-asmlinkage notrace __kprobes void
-do_nmi(struct pt_regs *regs, long error_code)
-{
-	nmi_enter();
-
-	add_pda(__nmi_count, 1);
-
-	if (!ignore_nmis)
-		default_do_nmi(regs);
-
-	nmi_exit();
-}
-
-void stop_nmi(void)
-{
-	acpi_nmi_disable();
-	ignore_nmis++;
-}
-
-void restart_nmi(void)
-{
-	ignore_nmis--;
-	acpi_nmi_enable();
-}
-
-/* runs on IST stack. */
-asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
-{
-	trace_hardirqs_fixup();
-
-	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
-			== NOTIFY_STOP)
-		return;
-
-	preempt_conditional_sti(regs);
-	do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
-	preempt_conditional_cli(regs);
-}
-
-/* Help handler running on IST stack to switch back to user stack
-   for scheduling or signal handling. The actual stack switch is done in
-   entry.S */
-asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
-{
-	struct pt_regs *regs = eregs;
-	/* Did already sync */
-	if (eregs == (struct pt_regs *)eregs->sp)
-		;
-	/* Exception from user space */
-	else if (user_mode(eregs))
-		regs = task_pt_regs(current);
-	/* Exception from kernel and interrupts are enabled. Move to
- 	   kernel process stack. */
-	else if (eregs->flags & X86_EFLAGS_IF)
-		regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
-	if (eregs != regs)
-		*regs = *eregs;
-	return regs;
-}
-
-/* runs on IST stack. */
-asmlinkage void __kprobes do_debug(struct pt_regs * regs,
-				   unsigned long error_code)
-{
-	struct task_struct *tsk = current;
-	unsigned long condition;
-	siginfo_t info;
-
-	trace_hardirqs_fixup();
-
-	get_debugreg(condition, 6);
-
-	/*
-	 * The processor cleared BTF, so don't mark that we need it set.
-	 */
-	clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
-	tsk->thread.debugctlmsr = 0;
-
-	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
-						SIGTRAP) == NOTIFY_STOP)
-		return;
-
-	preempt_conditional_sti(regs);
-
-	/* Mask out spurious debug traps due to lazy DR7 setting */
-	if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
-		if (!tsk->thread.debugreg7)
-			goto clear_dr7;
-	}
-
-	tsk->thread.debugreg6 = condition;
-
-	/*
-	 * Single-stepping through TF: make sure we ignore any events in
-	 * kernel space (but re-enable TF when returning to user mode).
-	 */
-	if (condition & DR_STEP) {
-		if (!user_mode(regs))
-			goto clear_TF_reenable;
-	}
-
-	/* Ok, finally something we can handle */
-	tsk->thread.trap_no = 1;
-	tsk->thread.error_code = error_code;
-	info.si_signo = SIGTRAP;
-	info.si_errno = 0;
-	info.si_code = TRAP_BRKPT;
-	info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
-	force_sig_info(SIGTRAP, &info, tsk);
-
-clear_dr7:
-	set_debugreg(0, 7);
-	preempt_conditional_cli(regs);
-	return;
-
-clear_TF_reenable:
-	set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
-	regs->flags &= ~X86_EFLAGS_TF;
-	preempt_conditional_cli(regs);
-	return;
-}
-
-static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
-{
-	if (fixup_exception(regs))
-		return 1;
-
-	notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
-	/* Illegal floating point operation in the kernel */
-	current->thread.trap_no = trapnr;
-	die(str, regs, 0);
-	return 0;
-}
-
-/*
- * Note that we play around with the 'TS' bit in an attempt to get
- * the correct behaviour even in the presence of the asynchronous
- * IRQ13 behaviour
- */
-asmlinkage void do_coprocessor_error(struct pt_regs *regs)
-{
-	void __user *ip = (void __user *)(regs->ip);
-	struct task_struct *task;
-	siginfo_t info;
-	unsigned short cwd, swd;
-
-	conditional_sti(regs);
-	if (!user_mode(regs) &&
-	    kernel_math_error(regs, "kernel x87 math error", 16))
-		return;
-
-	/*
-	 * Save the info for the exception handler and clear the error.
-	 */
-	task = current;
-	save_init_fpu(task);
-	task->thread.trap_no = 16;
-	task->thread.error_code = 0;
-	info.si_signo = SIGFPE;
-	info.si_errno = 0;
-	info.si_code = __SI_FAULT;
-	info.si_addr = ip;
-	/*
-	 * (~cwd & swd) will mask out exceptions that are not set to unmasked
-	 * status.  0x3f is the exception bits in these regs, 0x200 is the
-	 * C1 reg you need in case of a stack fault, 0x040 is the stack
-	 * fault bit.  We should only be taking one exception at a time,
-	 * so if this combination doesn't produce any single exception,
-	 * then we have a bad program that isn't synchronizing its FPU usage
-	 * and it will suffer the consequences since we won't be able to
-	 * fully reproduce the context of the exception
-	 */
-	cwd = get_fpu_cwd(task);
-	swd = get_fpu_swd(task);
-	switch (swd & ~cwd & 0x3f) {
-	case 0x000: /* No unmasked exception */
-	default: /* Multiple exceptions */
-		break;
-	case 0x001: /* Invalid Op */
-		/*
-		 * swd & 0x240 == 0x040: Stack Underflow
-		 * swd & 0x240 == 0x240: Stack Overflow
-		 * User must clear the SF bit (0x40) if set
-		 */
-		info.si_code = FPE_FLTINV;
-		break;
-	case 0x002: /* Denormalize */
-	case 0x010: /* Underflow */
-		info.si_code = FPE_FLTUND;
-		break;
-	case 0x004: /* Zero Divide */
-		info.si_code = FPE_FLTDIV;
-		break;
-	case 0x008: /* Overflow */
-		info.si_code = FPE_FLTOVF;
-		break;
-	case 0x020: /* Precision */
-		info.si_code = FPE_FLTRES;
-		break;
-	}
-	force_sig_info(SIGFPE, &info, task);
-}
-
-asmlinkage void bad_intr(void)
-{
-	printk("bad interrupt"); 
-}
-
-asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
-{
-	void __user *ip = (void __user *)(regs->ip);
-	struct task_struct *task;
-	siginfo_t info;
-	unsigned short mxcsr;
-
-	conditional_sti(regs);
-	if (!user_mode(regs) &&
-        	kernel_math_error(regs, "kernel simd math error", 19))
-		return;
-
-	/*
-	 * Save the info for the exception handler and clear the error.
-	 */
-	task = current;
-	save_init_fpu(task);
-	task->thread.trap_no = 19;
-	task->thread.error_code = 0;
-	info.si_signo = SIGFPE;
-	info.si_errno = 0;
-	info.si_code = __SI_FAULT;
-	info.si_addr = ip;
-	/*
-	 * The SIMD FPU exceptions are handled a little differently, as there
-	 * is only a single status/control register.  Thus, to determine which
-	 * unmasked exception was caught we must mask the exception mask bits
-	 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
-	 */
-	mxcsr = get_fpu_mxcsr(task);
-	switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
-	case 0x000:
-	default:
-		break;
-	case 0x001: /* Invalid Op */
-		info.si_code = FPE_FLTINV;
-		break;
-	case 0x002: /* Denormalize */
-	case 0x010: /* Underflow */
-		info.si_code = FPE_FLTUND;
-		break;
-	case 0x004: /* Zero Divide */
-		info.si_code = FPE_FLTDIV;
-		break;
-	case 0x008: /* Overflow */
-		info.si_code = FPE_FLTOVF;
-		break;
-	case 0x020: /* Precision */
-		info.si_code = FPE_FLTRES;
-		break;
-	}
-	force_sig_info(SIGFPE, &info, task);
-}
-
-asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
-{
-}
-
-asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
-{
-}
-
-asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
-{
-}
-
-/*
- * 'math_state_restore()' saves the current math information in the
- * old math state array, and gets the new ones from the current task
- *
- * Careful.. There are problems with IBM-designed IRQ13 behaviour.
- * Don't touch unless you *really* know how it works.
- */
-asmlinkage void math_state_restore(void)
-{
-	struct task_struct *me = current;
-
-	if (!used_math()) {
-		local_irq_enable();
-		/*
-		 * does a slab alloc which can sleep
-		 */
-		if (init_fpu(me)) {
-			/*
-			 * ran out of memory!
-			 */
-			do_group_exit(SIGKILL);
-			return;
-		}
-		local_irq_disable();
-	}
-
-	clts();				/* Allow maths ops (or we recurse) */
-	restore_fpu_checking(&me->thread.xstate->fxsave);
-	task_thread_info(me)->status |= TS_USEDFPU;
-	me->fpu_counter++;
-}
-EXPORT_SYMBOL_GPL(math_state_restore);
-
-void __init trap_init(void)
-{
-	set_intr_gate(0, &divide_error);
-	set_intr_gate_ist(1, &debug, DEBUG_STACK);
-	set_intr_gate_ist(2, &nmi, NMI_STACK);
- 	set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */
-	set_system_gate(4, &overflow); /* int4 can be called from all */
-	set_intr_gate(5, &bounds);
-	set_intr_gate(6, &invalid_op);
-	set_intr_gate(7, &device_not_available);
-	set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
-	set_intr_gate(9, &coprocessor_segment_overrun);
-	set_intr_gate(10, &invalid_TSS);
-	set_intr_gate(11, &segment_not_present);
-	set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
-	set_intr_gate(13, &general_protection);
-	set_intr_gate(14, &page_fault);
-	set_intr_gate(15, &spurious_interrupt_bug);
-	set_intr_gate(16, &coprocessor_error);
-	set_intr_gate(17, &alignment_check);
-#ifdef CONFIG_X86_MCE
-	set_intr_gate_ist(18, &machine_check, MCE_STACK);
-#endif
-	set_intr_gate(19, &simd_coprocessor_error);
-
-#ifdef CONFIG_IA32_EMULATION
-	set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
-#endif
-	/*
-	 * initialize the per thread extended state:
-	 */
-	init_thread_xstate();
-	/*
-	 * Should be a barrier for any external CPU state:
-	 */
-	cpu_init();
-}
-
-static int __init oops_setup(char *s)
-{
-	if (!s)
-		return -EINVAL;
-	if (!strcmp(s, "panic"))
-		panic_on_oops = 1;
-	return 0;
-}
-early_param("oops", oops_setup);
-
-static int __init kstack_setup(char *s)
-{
-	if (!s)
-		return -EINVAL;
-	kstack_depth_to_print = simple_strtoul(s, NULL, 0);
-	return 0;
-}
-early_param("kstack", kstack_setup);
-
-static int __init code_bytes_setup(char *s)
-{
-	code_bytes = simple_strtoul(s, NULL, 0);
-	if (code_bytes > 8192)
-		code_bytes = 8192;
-
-	return 1;
-}
-__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7603c0553909..161bb850fc47 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
 /*
  * Read TSC and the reference counters. Take care of SMI disturbance
  */
-static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
+static u64 tsc_read_refs(u64 *p, int hpet)
 {
 	u64 t1, t2;
 	int i;
@@ -112,9 +112,9 @@ static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
 	for (i = 0; i < MAX_RETRIES; i++) {
 		t1 = get_cycles();
 		if (hpet)
-			*hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
+			*p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
 		else
-			*pm = acpi_pm_read_early();
+			*p = acpi_pm_read_early();
 		t2 = get_cycles();
 		if ((t2 - t1) < SMI_TRESHOLD)
 			return t2;
@@ -122,80 +122,390 @@ static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
 	return ULLONG_MAX;
 }
 
-/**
- * native_calibrate_tsc - calibrate the tsc on boot
+/*
+ * Calculate the TSC frequency from HPET reference
  */
-unsigned long native_calibrate_tsc(void)
+static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
 {
-	unsigned long flags;
-	u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2;
-	int hpet = is_hpet_enabled();
-	unsigned int tsc_khz_val = 0;
+	u64 tmp;
 
-	local_irq_save(flags);
+	if (hpet2 < hpet1)
+		hpet2 += 0x100000000ULL;
+	hpet2 -= hpet1;
+	tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
+	do_div(tmp, 1000000);
+	do_div(deltatsc, tmp);
+
+	return (unsigned long) deltatsc;
+}
+
+/*
+ * Calculate the TSC frequency from PMTimer reference
+ */
+static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
+{
+	u64 tmp;
 
-	tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL);
+	if (!pm1 && !pm2)
+		return ULONG_MAX;
+
+	if (pm2 < pm1)
+		pm2 += (u64)ACPI_PM_OVRRUN;
+	pm2 -= pm1;
+	tmp = pm2 * 1000000000LL;
+	do_div(tmp, PMTMR_TICKS_PER_SEC);
+	do_div(deltatsc, tmp);
+
+	return (unsigned long) deltatsc;
+}
+
+#define CAL_MS		10
+#define CAL_LATCH	(CLOCK_TICK_RATE / (1000 / CAL_MS))
+#define CAL_PIT_LOOPS	1000
+
+#define CAL2_MS		50
+#define CAL2_LATCH	(CLOCK_TICK_RATE / (1000 / CAL2_MS))
+#define CAL2_PIT_LOOPS	5000
+
+
+/*
+ * Try to calibrate the TSC against the Programmable
+ * Interrupt Timer and return the frequency of the TSC
+ * in kHz.
+ *
+ * Return ULONG_MAX on failure to calibrate.
+ */
+static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
+{
+	u64 tsc, t1, t2, delta;
+	unsigned long tscmin, tscmax;
+	int pitcnt;
 
+	/* Set the Gate high, disable speaker */
 	outb((inb(0x61) & ~0x02) | 0x01, 0x61);
 
+	/*
+	 * Setup CTC channel 2* for mode 0, (interrupt on terminal
+	 * count mode), binary count. Set the latch register to 50ms
+	 * (LSB then MSB) to begin countdown.
+	 */
 	outb(0xb0, 0x43);
-	outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42);
-	outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42);
-	tr1 = get_cycles();
-	while ((inb(0x61) & 0x20) == 0);
-	tr2 = get_cycles();
+	outb(latch & 0xff, 0x42);
+	outb(latch >> 8, 0x42);
+
+	tsc = t1 = t2 = get_cycles();
+
+	pitcnt = 0;
+	tscmax = 0;
+	tscmin = ULONG_MAX;
+	while ((inb(0x61) & 0x20) == 0) {
+		t2 = get_cycles();
+		delta = t2 - tsc;
+		tsc = t2;
+		if ((unsigned long) delta < tscmin)
+			tscmin = (unsigned int) delta;
+		if ((unsigned long) delta > tscmax)
+			tscmax = (unsigned int) delta;
+		pitcnt++;
+	}
+
+	/*
+	 * Sanity checks:
+	 *
+	 * If we were not able to read the PIT more than loopmin
+	 * times, then we have been hit by a massive SMI
+	 *
+	 * If the maximum is 10 times larger than the minimum,
+	 * then we got hit by an SMI as well.
+	 */
+	if (pitcnt < loopmin || tscmax > 10 * tscmin)
+		return ULONG_MAX;
+
+	/* Calculate the PIT value */
+	delta = t2 - t1;
+	do_div(delta, ms);
+	return delta;
+}
 
-	tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL);
+/*
+ * This reads the current MSB of the PIT counter, and
+ * checks if we are running on sufficiently fast and
+ * non-virtualized hardware.
+ *
+ * Our expectations are:
+ *
+ *  - the PIT is running at roughly 1.19MHz
+ *
+ *  - each IO is going to take about 1us on real hardware,
+ *    but we allow it to be much faster (by a factor of 10) or
+ *    _slightly_ slower (ie we allow up to a 2us read+counter
+ *    update - anything else implies a unacceptably slow CPU
+ *    or PIT for the fast calibration to work.
+ *
+ *  - with 256 PIT ticks to read the value, we have 214us to
+ *    see the same MSB (and overhead like doing a single TSC
+ *    read per MSB value etc).
+ *
+ *  - We're doing 2 reads per loop (LSB, MSB), and we expect
+ *    them each to take about a microsecond on real hardware.
+ *    So we expect a count value of around 100. But we'll be
+ *    generous, and accept anything over 50.
+ *
+ *  - if the PIT is stuck, and we see *many* more reads, we
+ *    return early (and the next caller of pit_expect_msb()
+ *    then consider it a failure when they don't see the
+ *    next expected value).
+ *
+ * These expectations mean that we know that we have seen the
+ * transition from one expected value to another with a fairly
+ * high accuracy, and we didn't miss any events. We can thus
+ * use the TSC value at the transitions to calculate a pretty
+ * good value for the TSC frequencty.
+ */
+static inline int pit_expect_msb(unsigned char val)
+{
+	int count = 0;
 
+	for (count = 0; count < 50000; count++) {
+		/* Ignore LSB */
+		inb(0x42);
+		if (inb(0x42) != val)
+			break;
+	}
+	return count > 50;
+}
+
+/*
+ * How many MSB values do we want to see? We aim for a
+ * 15ms calibration, which assuming a 2us counter read
+ * error should give us roughly 150 ppm precision for
+ * the calibration.
+ */
+#define QUICK_PIT_MS 15
+#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
+
+static unsigned long quick_pit_calibrate(void)
+{
+	/* Set the Gate high, disable speaker */
+	outb((inb(0x61) & ~0x02) | 0x01, 0x61);
+
+	/*
+	 * Counter 2, mode 0 (one-shot), binary count
+	 *
+	 * NOTE! Mode 2 decrements by two (and then the
+	 * output is flipped each time, giving the same
+	 * final output frequency as a decrement-by-one),
+	 * so mode 0 is much better when looking at the
+	 * individual counts.
+	 */
+	outb(0xb0, 0x43);
+
+	/* Start at 0xffff */
+	outb(0xff, 0x42);
+	outb(0xff, 0x42);
+
+	if (pit_expect_msb(0xff)) {
+		int i;
+		u64 t1, t2, delta;
+		unsigned char expect = 0xfe;
+
+		t1 = get_cycles();
+		for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
+			if (!pit_expect_msb(expect))
+				goto failed;
+		}
+		t2 = get_cycles();
+
+		/*
+		 * Make sure we can rely on the second TSC timestamp:
+		 */
+		if (!pit_expect_msb(expect))
+			goto failed;
+
+		/*
+		 * Ok, if we get here, then we've seen the
+		 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
+		 * times, and each MSB had many hits, so we never
+		 * had any sudden jumps.
+		 *
+		 * As a result, we can depend on there not being
+		 * any odd delays anywhere, and the TSC reads are
+		 * reliable.
+		 *
+		 * kHz = ticks / time-in-seconds / 1000;
+		 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
+		 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
+		 */
+		delta = (t2 - t1)*PIT_TICK_RATE;
+		do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
+		printk("Fast TSC calibration using PIT\n");
+		return delta;
+	}
+failed:
+	return 0;
+}
+
+/**
+ * native_calibrate_tsc - calibrate the tsc on boot
+ */
+unsigned long native_calibrate_tsc(void)
+{
+	u64 tsc1, tsc2, delta, ref1, ref2;
+	unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
+	unsigned long flags, latch, ms, fast_calibrate;
+	int hpet = is_hpet_enabled(), i, loopmin;
+
+	local_irq_save(flags);
+	fast_calibrate = quick_pit_calibrate();
 	local_irq_restore(flags);
+	if (fast_calibrate)
+		return fast_calibrate;
 
 	/*
-	 * Preset the result with the raw and inaccurate PIT
-	 * calibration value
+	 * Run 5 calibration loops to get the lowest frequency value
+	 * (the best estimate). We use two different calibration modes
+	 * here:
+	 *
+	 * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
+	 * load a timeout of 50ms. We read the time right after we
+	 * started the timer and wait until the PIT count down reaches
+	 * zero. In each wait loop iteration we read the TSC and check
+	 * the delta to the previous read. We keep track of the min
+	 * and max values of that delta. The delta is mostly defined
+	 * by the IO time of the PIT access, so we can detect when a
+	 * SMI/SMM disturbance happend between the two reads. If the
+	 * maximum time is significantly larger than the minimum time,
+	 * then we discard the result and have another try.
+	 *
+	 * 2) Reference counter. If available we use the HPET or the
+	 * PMTIMER as a reference to check the sanity of that value.
+	 * We use separate TSC readouts and check inside of the
+	 * reference read for a SMI/SMM disturbance. We dicard
+	 * disturbed values here as well. We do that around the PIT
+	 * calibration delay loop as we have to wait for a certain
+	 * amount of time anyway.
 	 */
-	delta = (tr2 - tr1);
-	do_div(delta, 50);
-	tsc_khz_val = delta;
-
-	/* hpet or pmtimer available ? */
-	if (!hpet && !pm1 && !pm2) {
-		printk(KERN_INFO "TSC calibrated against PIT\n");
-		goto out;
+
+	/* Preset PIT loop values */
+	latch = CAL_LATCH;
+	ms = CAL_MS;
+	loopmin = CAL_PIT_LOOPS;
+
+	for (i = 0; i < 3; i++) {
+		unsigned long tsc_pit_khz;
+
+		/*
+		 * Read the start value and the reference count of
+		 * hpet/pmtimer when available. Then do the PIT
+		 * calibration, which will take at least 50ms, and
+		 * read the end value.
+		 */
+		local_irq_save(flags);
+		tsc1 = tsc_read_refs(&ref1, hpet);
+		tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
+		tsc2 = tsc_read_refs(&ref2, hpet);
+		local_irq_restore(flags);
+
+		/* Pick the lowest PIT TSC calibration so far */
+		tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
+
+		/* hpet or pmtimer available ? */
+		if (!hpet && !ref1 && !ref2)
+			continue;
+
+		/* Check, whether the sampling was disturbed by an SMI */
+		if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
+			continue;
+
+		tsc2 = (tsc2 - tsc1) * 1000000LL;
+		if (hpet)
+			tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
+		else
+			tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
+
+		tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
+
+		/* Check the reference deviation */
+		delta = ((u64) tsc_pit_min) * 100;
+		do_div(delta, tsc_ref_min);
+
+		/*
+		 * If both calibration results are inside a 10% window
+		 * then we can be sure, that the calibration
+		 * succeeded. We break out of the loop right away. We
+		 * use the reference value, as it is more precise.
+		 */
+		if (delta >= 90 && delta <= 110) {
+			printk(KERN_INFO
+			       "TSC: PIT calibration matches %s. %d loops\n",
+			       hpet ? "HPET" : "PMTIMER", i + 1);
+			return tsc_ref_min;
+		}
+
+		/*
+		 * Check whether PIT failed more than once. This
+		 * happens in virtualized environments. We need to
+		 * give the virtual PC a slightly longer timeframe for
+		 * the HPET/PMTIMER to make the result precise.
+		 */
+		if (i == 1 && tsc_pit_min == ULONG_MAX) {
+			latch = CAL2_LATCH;
+			ms = CAL2_MS;
+			loopmin = CAL2_PIT_LOOPS;
+		}
 	}
 
-	/* Check, whether the sampling was disturbed by an SMI */
-	if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) {
-		printk(KERN_WARNING "TSC calibration disturbed by SMI, "
-				"using PIT calibration result\n");
-		goto out;
+	/*
+	 * Now check the results.
+	 */
+	if (tsc_pit_min == ULONG_MAX) {
+		/* PIT gave no useful value */
+		printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
+
+		/* We don't have an alternative source, disable TSC */
+		if (!hpet && !ref1 && !ref2) {
+			printk("TSC: No reference (HPET/PMTIMER) available\n");
+			return 0;
+		}
+
+		/* The alternative source failed as well, disable TSC */
+		if (tsc_ref_min == ULONG_MAX) {
+			printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
+			       "failed.\n");
+			return 0;
+		}
+
+		/* Use the alternative source */
+		printk(KERN_INFO "TSC: using %s reference calibration\n",
+		       hpet ? "HPET" : "PMTIMER");
+
+		return tsc_ref_min;
 	}
 
-	tsc2 = (tsc2 - tsc1) * 1000000LL;
-
-	if (hpet) {
-		printk(KERN_INFO "TSC calibrated against HPET\n");
-		if (hpet2 < hpet1)
-			hpet2 += 0x100000000ULL;
-		hpet2 -= hpet1;
-		tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
-		do_div(tsc1, 1000000);
-	} else {
-		printk(KERN_INFO "TSC calibrated against PM_TIMER\n");
-		if (pm2 < pm1)
-			pm2 += (u64)ACPI_PM_OVRRUN;
-		pm2 -= pm1;
-		tsc1 = pm2 * 1000000000LL;
-		do_div(tsc1, PMTMR_TICKS_PER_SEC);
+	/* We don't have an alternative source, use the PIT calibration value */
+	if (!hpet && !ref1 && !ref2) {
+		printk(KERN_INFO "TSC: Using PIT calibration value\n");
+		return tsc_pit_min;
 	}
 
-	do_div(tsc2, tsc1);
-	tsc_khz_val = tsc2;
+	/* The alternative source failed, use the PIT calibration value */
+	if (tsc_ref_min == ULONG_MAX) {
+		printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
+		       "Using PIT calibration\n");
+		return tsc_pit_min;
+	}
 
-out:
-	return tsc_khz_val;
+	/*
+	 * The calibration values differ too much. In doubt, we use
+	 * the PIT value as we know that there are PMTIMERs around
+	 * running at double speed. At least we let the user know:
+	 */
+	printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
+	       hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
+	printk(KERN_INFO "TSC: Using PIT calibration value\n");
+	return tsc_pit_min;
 }
 
-
 #ifdef CONFIG_X86_32
 /* Only called from the Powernow K7 cpu freq driver */
 int recalibrate_cpu_khz(void)
@@ -314,7 +624,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
 			mark_tsc_unstable("cpufreq changes");
 	}
 
-	set_cyc2ns_scale(tsc_khz_ref, freq->cpu);
+	set_cyc2ns_scale(tsc_khz, freq->cpu);
 
 	return 0;
 }
@@ -325,6 +635,10 @@ static struct notifier_block time_cpufreq_notifier_block = {
 
 static int __init cpufreq_tsc(void)
 {
+	if (!cpu_has_tsc)
+		return 0;
+	if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+		return 0;
 	cpufreq_register_notifier(&time_cpufreq_notifier_block,
 				CPUFREQ_TRANSITION_NOTIFIER);
 	return 0;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 0577825cf89b..9ffb01c31c40 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -88,11 +88,9 @@ static __cpuinit void check_tsc_warp(void)
 			__raw_spin_unlock(&sync_lock);
 		}
 	}
-	if (!(now-start)) {
-		printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
+	WARN(!(now-start),
+		"Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
 			now-start, end-start);
-		WARN_ON(1);
-	}
 }
 
 /*
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
new file mode 100644
index 000000000000..aeef529917e4
--- /dev/null
+++ b/arch/x86/kernel/uv_irq.c
@@ -0,0 +1,79 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV IRQ functions
+ *
+ * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/irq.h>
+
+#include <asm/apic.h>
+#include <asm/uv/uv_irq.h>
+
+static void uv_noop(unsigned int irq)
+{
+}
+
+static unsigned int uv_noop_ret(unsigned int irq)
+{
+	return 0;
+}
+
+static void uv_ack_apic(unsigned int irq)
+{
+	ack_APIC_irq();
+}
+
+struct irq_chip uv_irq_chip = {
+	.name		= "UV-CORE",
+	.startup	= uv_noop_ret,
+	.shutdown	= uv_noop,
+	.enable		= uv_noop,
+	.disable	= uv_noop,
+	.ack		= uv_noop,
+	.mask		= uv_noop,
+	.unmask		= uv_noop,
+	.eoi		= uv_ack_apic,
+	.end		= uv_noop,
+};
+
+/*
+ * Set up a mapping of an available irq and vector, and enable the specified
+ * MMR that defines the MSI that is to be sent to the specified CPU when an
+ * interrupt is raised.
+ */
+int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
+		 unsigned long mmr_offset)
+{
+	int irq;
+	int ret;
+
+	irq = create_irq();
+	if (irq <= 0)
+		return -EBUSY;
+
+	ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset);
+	if (ret != irq)
+		destroy_irq(irq);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(uv_setup_irq);
+
+/*
+ * Tear down a mapping of an irq and vector, and disable the specified MMR that
+ * defined the MSI that was to be sent to the specified CPU when an interrupt
+ * was raised.
+ *
+ * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
+ */
+void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset)
+{
+	arch_disable_uv_irq(mmr_blade, mmr_offset);
+	destroy_irq(irq);
+}
+EXPORT_SYMBOL_GPL(uv_teardown_irq);
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
new file mode 100644
index 000000000000..67f9b9dbf800
--- /dev/null
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -0,0 +1,72 @@
+/*
+ * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ *  Copyright (c) 2008 Silicon Graphics, Inc.  All Rights Reserved.
+ *  Copyright (c) Russ Anderson
+ */
+
+#include <linux/sysdev.h>
+#include <asm/uv/bios.h>
+
+struct kobject *sgi_uv_kobj;
+
+static ssize_t partition_id_show(struct kobject *kobj,
+			struct kobj_attribute *attr, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id);
+}
+
+static ssize_t coherence_id_show(struct kobject *kobj,
+			struct kobj_attribute *attr, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id());
+}
+
+static struct kobj_attribute partition_id_attr =
+	__ATTR(partition_id, S_IRUGO, partition_id_show, NULL);
+
+static struct kobj_attribute coherence_id_attr =
+	__ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL);
+
+
+static int __init sgi_uv_sysfs_init(void)
+{
+	unsigned long ret;
+
+	if (!sgi_uv_kobj)
+		sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
+	if (!sgi_uv_kobj) {
+		printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n");
+		return -EINVAL;
+	}
+
+	ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
+	if (ret) {
+		printk(KERN_WARNING "sysfs_create_file partition_id failed \n");
+		return ret;
+	}
+
+	ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
+	if (ret) {
+		printk(KERN_WARNING "sysfs_create_file coherence_id failed \n");
+		return ret;
+	}
+
+	return 0;
+}
+
+device_initcall(sgi_uv_sysfs_init);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index e94bdb6add1d..0c9667f0752a 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -25,45 +25,31 @@
 #include <asm/visws/cobalt.h>
 #include <asm/visws/piix4.h>
 #include <asm/arch_hooks.h>
+#include <asm/io_apic.h>
 #include <asm/fixmap.h>
 #include <asm/reboot.h>
 #include <asm/setup.h>
 #include <asm/e820.h>
-#include <asm/smp.h>
 #include <asm/io.h>
 
 #include <mach_ipi.h>
 
 #include "mach_apic.h"
 
-#include <linux/init.h>
-#include <linux/smp.h>
-
 #include <linux/kernel_stat.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
 
-#include <asm/io.h>
-#include <asm/apic.h>
 #include <asm/i8259.h>
 #include <asm/irq_vectors.h>
-#include <asm/visws/cobalt.h>
 #include <asm/visws/lithium.h>
-#include <asm/visws/piix4.h>
 
 #include <linux/sched.h>
 #include <linux/kernel.h>
-#include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/pci_ids.h>
 
 extern int no_broadcast;
 
-#include <asm/io.h>
 #include <asm/apic.h>
-#include <asm/arch_hooks.h>
-#include <asm/visws/cobalt.h>
-#include <asm/visws/lithium.h>
 
 char visws_board_type	= -1;
 char visws_board_rev	= -1;
@@ -73,7 +59,7 @@ int is_visws_box(void)
 	return visws_board_type >= 0;
 }
 
-static int __init visws_time_init_quirk(void)
+static int __init visws_time_init(void)
 {
 	printk(KERN_INFO "Starting Cobalt Timer system clock\n");
 
@@ -93,7 +79,7 @@ static int __init visws_time_init_quirk(void)
 	return 0;
 }
 
-static int __init visws_pre_intr_init_quirk(void)
+static int __init visws_pre_intr_init(void)
 {
 	init_VISWS_APIC_irqs();
 
@@ -114,7 +100,7 @@ EXPORT_SYMBOL(sgivwfb_mem_size);
 
 long long mem_size __initdata = 0;
 
-static char * __init visws_memory_setup_quirk(void)
+static char * __init visws_memory_setup(void)
 {
 	long long gfx_mem_size = 8 * MB;
 
@@ -176,7 +162,7 @@ static void visws_machine_power_off(void)
 	outl(PIIX_SPECIAL_STOP, 0xCFC);
 }
 
-static int __init visws_get_smp_config_quirk(unsigned int early)
+static int __init visws_get_smp_config(unsigned int early)
 {
 	/*
 	 * Prevent MP-table parsing by the generic code:
@@ -184,15 +170,13 @@ static int __init visws_get_smp_config_quirk(unsigned int early)
 	return 1;
 }
 
-extern unsigned int __cpuinitdata maxcpus;
-
 /*
  * The Visual Workstation is Intel MP compliant in the hardware
  * sense, but it doesn't have a BIOS(-configuration table).
  * No problem for Linux.
  */
 
-static void __init MP_processor_info (struct mpc_config_processor *m)
+static void __init MP_processor_info(struct mpc_config_processor *m)
 {
 	int ver, logical_apicid;
 	physid_mask_t apic_cpus;
@@ -232,7 +216,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
 	apic_version[m->mpc_apicid] = ver;
 }
 
-int __init visws_find_smp_config_quirk(unsigned int reserve)
+static int __init visws_find_smp_config(unsigned int reserve)
 {
 	struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
 	unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -244,8 +228,8 @@ int __init visws_find_smp_config_quirk(unsigned int reserve)
 		ncpus = CO_CPU_MAX;
 	}
 
-	if (ncpus > maxcpus)
-		ncpus = maxcpus;
+	if (ncpus > setup_max_cpus)
+		ncpus = setup_max_cpus;
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	smp_found_config = 1;
@@ -258,7 +242,17 @@ int __init visws_find_smp_config_quirk(unsigned int reserve)
 	return 1;
 }
 
-extern int visws_trap_init_quirk(void);
+static int visws_trap_init(void);
+
+static struct x86_quirks visws_x86_quirks __initdata = {
+	.arch_time_init		= visws_time_init,
+	.arch_pre_intr_init	= visws_pre_intr_init,
+	.arch_memory_setup	= visws_memory_setup,
+	.arch_intr_init		= NULL,
+	.arch_trap_init		= visws_trap_init,
+	.mach_get_smp_config	= visws_get_smp_config,
+	.mach_find_smp_config	= visws_find_smp_config,
+};
 
 void __init visws_early_detect(void)
 {
@@ -272,16 +266,10 @@ void __init visws_early_detect(void)
 
 	/*
 	 * Install special quirks for timer, interrupt and memory setup:
-	 */
-	arch_time_init_quirk		= visws_time_init_quirk;
-	arch_pre_intr_init_quirk	= visws_pre_intr_init_quirk;
-	arch_memory_setup_quirk		= visws_memory_setup_quirk;
-
-	/*
 	 * Fall back to generic behavior for traps:
+	 * Override generic MP-table parsing:
 	 */
-	arch_intr_init_quirk		= NULL;
-	arch_trap_init_quirk		= visws_trap_init_quirk;
+	x86_quirks = &visws_x86_quirks;
 
 	/*
 	 * Install reboot quirks:
@@ -294,12 +282,6 @@ void __init visws_early_detect(void)
 	 */
 	no_broadcast = 0;
 
-	/*
-	 * Override generic MP-table parsing:
-	 */
-	mach_get_smp_config_quirk	= visws_get_smp_config_quirk;
-	mach_find_smp_config_quirk	= visws_find_smp_config_quirk;
-
 #ifdef CONFIG_X86_IO_APIC
 	/*
 	 * Turn off IO-APIC detection and initialization:
@@ -426,7 +408,7 @@ static __init void cobalt_init(void)
 		co_apic_read(CO_APIC_ID));
 }
 
-int __init visws_trap_init_quirk(void)
+static int __init visws_trap_init(void)
 {
 	lithium_init();
 	cobalt_init();
@@ -502,10 +484,11 @@ static void disable_cobalt_irq(unsigned int irq)
 static unsigned int startup_cobalt_irq(unsigned int irq)
 {
 	unsigned long flags;
+	struct irq_desc *desc = irq_to_desc(irq);
 
 	spin_lock_irqsave(&cobalt_lock, flags);
-	if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
-		irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
+	if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
+		desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
 	enable_cobalt_irq(irq);
 	spin_unlock_irqrestore(&cobalt_lock, flags);
 	return 0;
@@ -524,9 +507,10 @@ static void ack_cobalt_irq(unsigned int irq)
 static void end_cobalt_irq(unsigned int irq)
 {
 	unsigned long flags;
+	struct irq_desc *desc = irq_to_desc(irq);
 
 	spin_lock_irqsave(&cobalt_lock, flags);
-	if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS)))
+	if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
 		enable_cobalt_irq(irq);
 	spin_unlock_irqrestore(&cobalt_lock, flags);
 }
@@ -644,12 +628,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
 
 	spin_unlock_irqrestore(&i8259A_lock, flags);
 
-	desc = irq_desc + realirq;
+	desc = irq_to_desc(realirq);
 
 	/*
 	 * handle this 'virtual interrupt' as a Cobalt one now.
 	 */
-	kstat_cpu(smp_processor_id()).irqs[realirq]++;
+	kstat_incr_irqs_this_cpu(realirq, desc);
 
 	if (likely(desc->action != NULL))
 		handle_IRQ_event(realirq, desc->action);
@@ -680,27 +664,29 @@ void init_VISWS_APIC_irqs(void)
 	int i;
 
 	for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
-		irq_desc[i].status = IRQ_DISABLED;
-		irq_desc[i].action = 0;
-		irq_desc[i].depth = 1;
+		struct irq_desc *desc = irq_to_desc(i);
+
+		desc->status = IRQ_DISABLED;
+		desc->action = 0;
+		desc->depth = 1;
 
 		if (i == 0) {
-			irq_desc[i].chip = &cobalt_irq_type;
+			desc->chip = &cobalt_irq_type;
 		}
 		else if (i == CO_IRQ_IDE0) {
-			irq_desc[i].chip = &cobalt_irq_type;
+			desc->chip = &cobalt_irq_type;
 		}
 		else if (i == CO_IRQ_IDE1) {
-			irq_desc[i].chip = &cobalt_irq_type;
+			desc->chip = &cobalt_irq_type;
 		}
 		else if (i == CO_IRQ_8259) {
-			irq_desc[i].chip = &piix4_master_irq_type;
+			desc->chip = &piix4_master_irq_type;
 		}
 		else if (i < CO_IRQ_APIC0) {
-			irq_desc[i].chip = &piix4_virtual_irq_type;
+			desc->chip = &piix4_virtual_irq_type;
 		}
 		else if (IS_CO_APIC(i)) {
-			irq_desc[i].chip = &cobalt_irq_type;
+			desc->chip = &cobalt_irq_type;
 		}
 	}
 
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 38f566fa27d2..4eeb5cf9720d 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -46,6 +46,7 @@
 #include <asm/io.h>
 #include <asm/tlbflush.h>
 #include <asm/irq.h>
+#include <asm/syscalls.h>
 
 /*
  * Known problems:
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index b15346092b7b..8b6c393ab9fd 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -37,6 +37,7 @@
 #include <asm/timer.h>
 #include <asm/vmi_time.h>
 #include <asm/kmap_types.h>
+#include <asm/setup.h>
 
 /* Convenient for calling VMI functions indirectly in the ROM */
 typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
@@ -234,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
 				const void *desc)
 {
 	u32 *ldt_entry = (u32 *)desc;
-	vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
+	vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
 }
 
 static void vmi_load_sp0(struct tss_struct *tss,
@@ -392,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
 }
 #endif
 
-static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
 {
 	vmi_set_page_type(pfn, VMI_PAGE_L1);
 	vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
 }
 
-static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
 {
  	/*
 	 * This call comes in very early, before mem_map is setup.
@@ -409,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
 	vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
 }
 
-static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
+static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
 {
  	vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
 	vmi_check_page_type(clonepfn, VMI_PAGE_L2);
 	vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
 }
 
-static void vmi_release_pte(u32 pfn)
+static void vmi_release_pte(unsigned long pfn)
 {
 	vmi_ops.release_page(pfn, VMI_PAGE_L1);
 	vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
 }
 
-static void vmi_release_pmd(u32 pfn)
+static void vmi_release_pmd(unsigned long pfn)
 {
 	vmi_ops.release_page(pfn, VMI_PAGE_L2);
 	vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
@@ -683,7 +684,7 @@ void vmi_bringup(void)
 {
  	/* We must establish the lowmem mapping for MMU ops to work */
 	if (vmi_ops.set_linear_mapping)
-		vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0);
+		vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
 }
 
 /*
@@ -904,9 +905,8 @@ static inline int __init activate_vmi(void)
 #endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
-	para_fill(pv_apic_ops.apic_read, APICRead);
-	para_fill(pv_apic_ops.apic_write, APICWrite);
-	para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
+       para_fill(apic_ops->read, APICRead);
+       para_fill(apic_ops->write, APICWrite);
 #endif
 
 	/*
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 6953859fe289..254ee07f8635 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -235,11 +235,14 @@ static void __devinit vmi_time_init_clockevent(void)
 
 void __init vmi_time_init(void)
 {
+	unsigned int cpu;
 	/* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
 	outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
 
 	vmi_time_init_clockevent();
 	setup_irq(0, &vmi_clock_action);
+	for_each_possible_cpu(cpu)
+		per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index cdb2363697d2..a9b8560adbc2 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -140,10 +140,10 @@ SECTIONS
 	*(.con_initcall.init)
   	__con_initcall_end = .;
   }
-  .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) {
-	__x86cpuvendor_start = .;
-	*(.x86cpuvendor.init)
-	__x86cpuvendor_end = .;
+  .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+	__x86_cpu_dev_start = .;
+	*(.x86_cpu_dev.init)
+	__x86_cpu_dev_end = .;
   }
   SECURITY_INIT
   . = ALIGN(4);
@@ -180,6 +180,7 @@ SECTIONS
   . = ALIGN(PAGE_SIZE);
   .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
 	__per_cpu_start = .;
+	*(.data.percpu.page_aligned)
 	*(.data.percpu)
 	*(.data.percpu.shared_aligned)
 	__per_cpu_end = .;
@@ -209,3 +210,11 @@ SECTIONS
 
   DWARF_DEBUG
 }
+
+#ifdef CONFIG_KEXEC
+/* Link time checks */
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+       "kexec control code size is too big")
+#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 63e5c1a22e88..46e05447405b 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -168,12 +168,11 @@ SECTIONS
 	*(.con_initcall.init)
   }
   __con_initcall_end = .;
-  . = ALIGN(16);
-  __x86cpuvendor_start = .;
-  .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) {
-	*(.x86cpuvendor.init)
+  __x86_cpu_dev_start = .;
+  .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+	*(.x86_cpu_dev.init)
   }
-  __x86cpuvendor_end = .;
+  __x86_cpu_dev_end = .;
   SECURITY_INIT
 
   . = ALIGN(8);
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 0c029e8959c7..7766d36983fc 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -61,7 +61,7 @@ static void vsmp_irq_enable(void)
 	native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
 }
 
-static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf,
+static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
 				  unsigned long addr, unsigned len)
 {
 	switch (type) {
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
new file mode 100644
index 000000000000..b13acb75e822
--- /dev/null
+++ b/arch/x86/kernel/xsave.c
@@ -0,0 +1,345 @@
+/*
+ * xsave/xrstor support.
+ *
+ * Author: Suresh Siddha <suresh.b.siddha@intel.com>
+ */
+#include <linux/bootmem.h>
+#include <linux/compat.h>
+#include <asm/i387.h>
+#ifdef CONFIG_IA32_EMULATION
+#include <asm/sigcontext32.h>
+#endif
+#include <asm/xcr.h>
+
+/*
+ * Supported feature mask by the CPU and the kernel.
+ */
+u64 pcntxt_mask;
+
+struct _fpx_sw_bytes fx_sw_reserved;
+#ifdef CONFIG_IA32_EMULATION
+struct _fpx_sw_bytes fx_sw_reserved_ia32;
+#endif
+
+/*
+ * Check for the presence of extended state information in the
+ * user fpstate pointer in the sigcontext.
+ */
+int check_for_xstate(struct i387_fxsave_struct __user *buf,
+		     void __user *fpstate,
+		     struct _fpx_sw_bytes *fx_sw_user)
+{
+	int min_xstate_size = sizeof(struct i387_fxsave_struct) +
+			      sizeof(struct xsave_hdr_struct);
+	unsigned int magic2;
+	int err;
+
+	err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
+			       sizeof(struct _fpx_sw_bytes));
+
+	if (err)
+		return err;
+
+	/*
+	 * First Magic check failed.
+	 */
+	if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
+		return -1;
+
+	/*
+	 * Check for error scenarios.
+	 */
+	if (fx_sw_user->xstate_size < min_xstate_size ||
+	    fx_sw_user->xstate_size > xstate_size ||
+	    fx_sw_user->xstate_size > fx_sw_user->extended_size)
+		return -1;
+
+	err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
+					    fx_sw_user->extended_size -
+					    FP_XSTATE_MAGIC2_SIZE));
+	/*
+	 * Check for the presence of second magic word at the end of memory
+	 * layout. This detects the case where the user just copied the legacy
+	 * fpstate layout with out copying the extended state information
+	 * in the memory layout.
+	 */
+	if (err || magic2 != FP_XSTATE_MAGIC2)
+		return -1;
+
+	return 0;
+}
+
+#ifdef CONFIG_X86_64
+/*
+ * Signal frame handlers.
+ */
+
+int save_i387_xstate(void __user *buf)
+{
+	struct task_struct *tsk = current;
+	int err = 0;
+
+	if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size))
+		return -EACCES;
+
+	BUG_ON(sig_xstate_size < xstate_size);
+
+	if ((unsigned long)buf % 64)
+		printk("save_i387_xstate: bad fpstate %p\n", buf);
+
+	if (!used_math())
+		return 0;
+	clear_used_math(); /* trigger finit */
+	if (task_thread_info(tsk)->status & TS_USEDFPU) {
+		/*
+	 	 * Start with clearing the user buffer. This will present a
+	 	 * clean context for the bytes not touched by the fxsave/xsave.
+		 */
+		err = __clear_user(buf, sig_xstate_size);
+		if (err)
+			return err;
+
+		if (task_thread_info(tsk)->status & TS_XSAVE)
+			err = xsave_user(buf);
+		else
+			err = fxsave_user(buf);
+
+		if (err)
+			return err;
+		task_thread_info(tsk)->status &= ~TS_USEDFPU;
+		stts();
+	} else {
+		if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
+				   xstate_size))
+			return -1;
+	}
+
+	if (task_thread_info(tsk)->status & TS_XSAVE) {
+		struct _fpstate __user *fx = buf;
+		struct _xstate __user *x = buf;
+		u64 xstate_bv;
+
+		err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved,
+				     sizeof(struct _fpx_sw_bytes));
+
+		err |= __put_user(FP_XSTATE_MAGIC2,
+				  (__u32 __user *) (buf + sig_xstate_size
+						    - FP_XSTATE_MAGIC2_SIZE));
+
+		/*
+		 * Read the xstate_bv which we copied (directly from the cpu or
+		 * from the state in task struct) to the user buffers and
+		 * set the FP/SSE bits.
+		 */
+		err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv);
+
+		/*
+		 * For legacy compatible, we always set FP/SSE bits in the bit
+		 * vector while saving the state to the user context. This will
+		 * enable us capturing any changes(during sigreturn) to
+		 * the FP/SSE bits by the legacy applications which don't touch
+		 * xstate_bv in the xsave header.
+		 *
+		 * xsave aware apps can change the xstate_bv in the xsave
+		 * header as well as change any contents in the memory layout.
+		 * xrestore as part of sigreturn will capture all the changes.
+		 */
+		xstate_bv |= XSTATE_FPSSE;
+
+		err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv);
+
+		if (err)
+			return err;
+	}
+
+	return 1;
+}
+
+/*
+ * Restore the extended state if present. Otherwise, restore the FP/SSE
+ * state.
+ */
+int restore_user_xstate(void __user *buf)
+{
+	struct _fpx_sw_bytes fx_sw_user;
+	u64 mask;
+	int err;
+
+	if (((unsigned long)buf % 64) ||
+	     check_for_xstate(buf, buf, &fx_sw_user))
+		goto fx_only;
+
+	mask = fx_sw_user.xstate_bv;
+
+	/*
+	 * restore the state passed by the user.
+	 */
+	err = xrestore_user(buf, mask);
+	if (err)
+		return err;
+
+	/*
+	 * init the state skipped by the user.
+	 */
+	mask = pcntxt_mask & ~mask;
+
+	xrstor_state(init_xstate_buf, mask);
+
+	return 0;
+
+fx_only:
+	/*
+	 * couldn't find the extended state information in the
+	 * memory layout. Restore just the FP/SSE and init all
+	 * the other extended state.
+	 */
+	xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE);
+	return fxrstor_checking((__force struct i387_fxsave_struct *)buf);
+}
+
+/*
+ * This restores directly out of user space. Exceptions are handled.
+ */
+int restore_i387_xstate(void __user *buf)
+{
+	struct task_struct *tsk = current;
+	int err = 0;
+
+	if (!buf) {
+		if (used_math())
+			goto clear;
+		return 0;
+	} else
+		if (!access_ok(VERIFY_READ, buf, sig_xstate_size))
+			return -EACCES;
+
+	if (!used_math()) {
+		err = init_fpu(tsk);
+		if (err)
+			return err;
+	}
+
+	if (!(task_thread_info(current)->status & TS_USEDFPU)) {
+		clts();
+		task_thread_info(current)->status |= TS_USEDFPU;
+	}
+	if (task_thread_info(tsk)->status & TS_XSAVE)
+		err = restore_user_xstate(buf);
+	else
+		err = fxrstor_checking((__force struct i387_fxsave_struct *)
+				       buf);
+	if (unlikely(err)) {
+		/*
+		 * Encountered an error while doing the restore from the
+		 * user buffer, clear the fpu state.
+		 */
+clear:
+		clear_fpu(tsk);
+		clear_used_math();
+	}
+	return err;
+}
+#endif
+
+/*
+ * Prepare the SW reserved portion of the fxsave memory layout, indicating
+ * the presence of the extended state information in the memory layout
+ * pointed by the fpstate pointer in the sigcontext.
+ * This will be saved when ever the FP and extended state context is
+ * saved on the user stack during the signal handler delivery to the user.
+ */
+static void prepare_fx_sw_frame(void)
+{
+	int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) +
+			     FP_XSTATE_MAGIC2_SIZE;
+
+	sig_xstate_size = sizeof(struct _fpstate) + size_extended;
+
+#ifdef CONFIG_IA32_EMULATION
+	sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended;
+#endif
+
+	memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved));
+
+	fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
+	fx_sw_reserved.extended_size = sig_xstate_size;
+	fx_sw_reserved.xstate_bv = pcntxt_mask;
+	fx_sw_reserved.xstate_size = xstate_size;
+#ifdef CONFIG_IA32_EMULATION
+	memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved,
+	       sizeof(struct _fpx_sw_bytes));
+	fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size;
+#endif
+}
+
+/*
+ * Represents init state for the supported extended state.
+ */
+struct xsave_struct *init_xstate_buf;
+
+#ifdef CONFIG_X86_64
+unsigned int sig_xstate_size = sizeof(struct _fpstate);
+#endif
+
+/*
+ * Enable the extended processor state save/restore feature
+ */
+void __cpuinit xsave_init(void)
+{
+	if (!cpu_has_xsave)
+		return;
+
+	set_in_cr4(X86_CR4_OSXSAVE);
+
+	/*
+	 * Enable all the features that the HW is capable of
+	 * and the Linux kernel is aware of.
+	 */
+	xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
+}
+
+/*
+ * setup the xstate image representing the init state
+ */
+static void __init setup_xstate_init(void)
+{
+	init_xstate_buf = alloc_bootmem(xstate_size);
+	init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
+}
+
+/*
+ * Enable and initialize the xsave feature.
+ */
+void __init xsave_cntxt_init(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+	pcntxt_mask = eax + ((u64)edx << 32);
+
+	if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
+		printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n",
+		       pcntxt_mask);
+		BUG();
+	}
+
+	/*
+	 * for now OS knows only about FP/SSE
+	 */
+	pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
+	xsave_init();
+
+	/*
+	 * Recompute the context size for enabled features
+	 */
+	cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+	xstate_size = ebx;
+
+	prepare_fx_sw_frame();
+
+	setup_xstate_init();
+
+	printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, "
+	       "cntxt size 0x%x\n",
+	       pcntxt_mask, xstate_size);
+}
author	Ingo Molnar <mingo@elte.hu>	2008-10-28 16:26:12 +0100
committer	Ingo Molnar <mingo@elte.hu>	2008-10-28 16:26:12 +0100
commit	7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch)
tree	e730a4565e0318140d2fbd2f0415d18a339d7336 /arch/x86/kernel
parent	x86, pci: introduce config option for pci reroute quirks (was: [PATCH 0/3] Bo... (diff)
parent	Linux 2.6.28-rc2 (diff)
download	linux-7a9787e1eba95a166265e6a260cf30af04ef0a99.tar.xz linux-7a9787e1eba95a166265e6a260cf30af04ef0a99.zip