From 3dc68d9b58ae644cee8e218e3dcde0dceb5c47a3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH] x86-64: revert x86_64-mm-add-genapic_force This is obsoleted by new Ingo genapic patches. Cc: Suresh Siddha Cc: Andi Kleen Cc: "Li, Shaohua" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/kernel/genapic.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 0b3603adf56d..7312ddb84fb4 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -33,7 +33,7 @@ extern struct genapic apic_flat; extern struct genapic apic_physflat; struct genapic *genapic = &apic_flat; -struct genapic *genapic_force; + /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. @@ -46,13 +46,6 @@ void __init clustered_apic_check(void) u8 cluster_cnt[NUM_APIC_CLUSTERS]; int max_apic = 0; - /* genapic selection can be forced because of certain quirks. - */ - if (genapic_force) { - genapic = genapic_force; - goto print; - } - #if defined(CONFIG_ACPI) /* * Some x86_64 machines use physical APIC mode regardless of how many -- cgit v1.2.3 From a86f34b49f32b238d16b2e3bf6c9a5391a3f683f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH] x86: revert x86_64-mm-fix-the-irqbalance-quirk-for-e7320-e7520-e7525 Obsoleted by Ingo's genapic stuff. Cc: Ingo Molnar Cc: Suresh Siddha Cc: Andi Kleen Cc: "Li, Shaohua" Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/acpi/earlyquirk.c | 21 ----------------- arch/i386/kernel/quirks.c | 46 +++++++++----------------------------- arch/i386/kernel/smpboot.c | 6 ----- arch/x86_64/kernel/early-quirks.c | 13 ----------- arch/x86_64/kernel/smpboot.c | 8 ------- include/asm-i386/genapic.h | 2 +- include/asm-i386/irq.h | 2 -- include/asm-x86_64/proto.h | 1 - 8 files changed, 12 insertions(+), 87 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/kernel/acpi/earlyquirk.c b/arch/i386/kernel/acpi/earlyquirk.c index 8f7efd38254d..23f78efc577d 100644 --- a/arch/i386/kernel/acpi/earlyquirk.c +++ b/arch/i386/kernel/acpi/earlyquirk.c @@ -10,7 +10,6 @@ #include #include #include -#include #ifdef CONFIG_ACPI @@ -48,24 +47,6 @@ static int __init check_bridge(int vendor, int device) return 0; } -static void check_intel(void) -{ - u16 vendor, device; - - vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID); - - if (vendor != PCI_VENDOR_ID_INTEL) - return; - - device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID); -#ifdef CONFIG_SMP - if (device == PCI_DEVICE_ID_INTEL_E7320_MCH || - device == PCI_DEVICE_ID_INTEL_E7520_MCH || - device == PCI_DEVICE_ID_INTEL_E7525_MCH) - quirk_intel_irqbalance(); -#endif -} - void __init check_acpi_pci(void) { int num, slot, func; @@ -77,8 +58,6 @@ void __init check_acpi_pci(void) if (!early_pci_allowed()) return; - check_intel(); - /* Poor man's PCI discovery */ for (num = 0; num < 32; num++) { for (slot = 0; slot < 32; slot++) { diff --git a/arch/i386/kernel/quirks.c b/arch/i386/kernel/quirks.c index a01320a7b636..9f6ab1789bb0 100644 --- a/arch/i386/kernel/quirks.c +++ b/arch/i386/kernel/quirks.c @@ -3,23 +3,10 @@ */ #include #include -#include -#include -#include #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) -static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) -{ -#ifdef CONFIG_X86_64 - if (genapic != &apic_flat) - panic("APIC mode must be flat on this system\n"); -#elif defined(CONFIG_X86_GENERICARCH) - if (genapic != &apic_default) - panic("APIC mode must be default(flat) on this system. Use apic=default\n"); -#endif -} -void __init quirk_intel_irqbalance(void) +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) { u8 config, rev; u32 word; @@ -29,18 +16,18 @@ void __init quirk_intel_irqbalance(void) * based platforms. * Disable SW irqbalance/affinity on those platforms. */ - rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION); + pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); if (rev > 0x9) return; printk(KERN_INFO "Intel E7520/7320/7525 detected."); - /* enable access to config space */ - config = read_pci_config_byte(0, 0, 0, 0xf4); - write_pci_config_byte(0, 0, 0, 0xf4, config|0x2); + /* enable access to config space*/ + pci_read_config_byte(dev, 0xf4, &config); + pci_write_config_byte(dev, 0xf4, config|0x2); /* read xTPR register */ - word = read_pci_config_16(0, 0, 0x40, 0x4c); + raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); if (!(word & (1 << 13))) { printk(KERN_INFO "Disabling irq balancing and affinity\n"); @@ -50,25 +37,14 @@ void __init quirk_intel_irqbalance(void) noirqdebug_setup(""); #ifdef CONFIG_PROC_FS no_irq_affinity = 1; -#endif -#ifdef CONFIG_HOTPLUG_CPU - printk(KERN_INFO "Disabling cpu hotplug control\n"); - enable_cpu_hotplug = 0; -#endif -#ifdef CONFIG_X86_64 - /* force the genapic selection to flat mode so that - * interrupts can be redirected to more than one CPU. - */ - genapic_force = &apic_flat; #endif } - /* put back the original value for config space */ + /* put back the original value for config space*/ if (!(config & 0x2)) - write_pci_config_byte(0, 0, 0, 0xf4, config); + pci_write_config_byte(dev, 0xf4, config); } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance); - +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); #endif diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 4ff55e675576..7b14e88b555f 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include @@ -1319,11 +1318,6 @@ int __cpuinit __cpu_up(unsigned int cpu) touch_nmi_watchdog(); } -#ifdef CONFIG_X86_GENERICARCH - if (num_online_cpus() > 8 && genapic == &apic_default) - panic("Default flat APIC routing can't be used with > 8 cpus\n"); -#endif - return 0; } diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c index fede55a53995..990d9c218a5d 100644 --- a/arch/x86_64/kernel/early-quirks.c +++ b/arch/x86_64/kernel/early-quirks.c @@ -71,18 +71,6 @@ static void __init ati_bugs(void) } } -static void intel_bugs(void) -{ - u16 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID); - -#ifdef CONFIG_SMP - if (device == PCI_DEVICE_ID_INTEL_E7320_MCH || - device == PCI_DEVICE_ID_INTEL_E7520_MCH || - device == PCI_DEVICE_ID_INTEL_E7525_MCH) - quirk_intel_irqbalance(); -#endif -} - struct chipset { u16 vendor; void (*f)(void); @@ -92,7 +80,6 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, { PCI_VENDOR_ID_VIA, via_bugs }, { PCI_VENDOR_ID_ATI, ati_bugs }, - { PCI_VENDOR_ID_INTEL, intel_bugs}, {} }; diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index cd4643a37022..14724be48bec 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -60,7 +60,6 @@ #include #include #include -#include /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -965,13 +964,6 @@ int __cpuinit __cpu_up(unsigned int cpu) while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); - - if (num_online_cpus() > 8 && genapic == &apic_flat) { - printk(KERN_WARNING - "flat APIC routing can't be used with > 8 cpus\n"); - BUG(); - } - err = 0; return err; diff --git a/include/asm-i386/genapic.h b/include/asm-i386/genapic.h index fd2be593b06e..8ffbb0f07457 100644 --- a/include/asm-i386/genapic.h +++ b/include/asm-i386/genapic.h @@ -122,6 +122,6 @@ struct genapic { APICFUNC(phys_pkg_id) \ } -extern struct genapic *genapic, apic_default; +extern struct genapic *genapic; #endif diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h index 11761cdaae19..9e15ce0006eb 100644 --- a/include/asm-i386/irq.h +++ b/include/asm-i386/irq.h @@ -37,8 +37,6 @@ static __inline__ int irq_canonicalize(int irq) extern int irqbalance_disable(char *str); #endif -extern void quirk_intel_irqbalance(void); - #ifdef CONFIG_HOTPLUG_CPU extern void fixup_irqs(cpumask_t map); #endif diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index b6e65a699f2a..6688cf9eb27b 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -82,7 +82,6 @@ extern void syscall32_cpu_init(void); extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end); extern void early_quirks(void); -extern void quirk_intel_irqbalance(void); extern void check_efer(void); extern int unhandled_signal(struct task_struct *tsk, int sig); -- cgit v1.2.3 From f18d397e6aa5cde638d164b1d519c3ee903f4867 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH] x86-64: optimize & fix APIC mode setup Fix a couple of inconsistencies/problems I found while reviewing the x86_64 genapic code (when I was chasing mysterious eth0 timeouts that would only trigger if CPU_HOTPLUG is enabled): - AMD systems defaulted to the slower flat-physical mode instead of the flat-logical mode. The only restriction on AMD systems is that they should not use clustered APIC mode. - removed the CPU hotplug hacks, switching the default for small systems back from phys-flat to logical-flat. The switching to logical flat mode on small systems fixed sporadic ethernet driver timeouts i was getting on a dual-core Athlon64 system: NETDEV WATCHDOG: eth0: transmit timed out eth0: Transmit timeout, status 0c 0005 c07f media 80. eth0: Tx queue start entry 32 dirty entry 28. eth0: Tx descriptor 0 is 0008a04a. (queue head) eth0: Tx descriptor 1 is 0008a04a. eth0: Tx descriptor 2 is 0008a04a. eth0: Tx descriptor 3 is 0008a04a. eth0: link up, 100Mbps, full-duplex, lpa 0xC5E1 - The use of '<= 8' was a bug by itself (the valid APIC ids for logical flat mode go from 0 to 7, not 0 to 8). The new logic is to use logical flat mode on both AMD and Intel systems, and to only switch to physical mode when logical mode cannot be used. If CPU hotplug is racy wrt. APIC shutdown then CPU hotplug needs fixing, not the whole IRQ system be made inconsistent and slowed down. - minor cleanups: simplified some code constructs build & booted on a couple of AMD and Intel SMP systems. Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen Cc: Suresh Siddha Cc: Andi Kleen Cc: "Li, Shaohua" Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- arch/x86_64/kernel/genapic.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 7312ddb84fb4..2f2b8fc6e2f3 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -32,21 +32,20 @@ extern struct genapic apic_cluster; extern struct genapic apic_flat; extern struct genapic apic_physflat; -struct genapic *genapic = &apic_flat; - +struct genapic __read_mostly *genapic = &apic_flat; /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. */ void __init clustered_apic_check(void) { - long i; + int i; u8 clusters, max_cluster; u8 id; u8 cluster_cnt[NUM_APIC_CLUSTERS]; int max_apic = 0; -#if defined(CONFIG_ACPI) +#ifdef CONFIG_ACPI /* * Some x86_64 machines use physical APIC mode regardless of how many * procs/clusters are present (x86_64 ES7000 is an example). @@ -68,20 +67,17 @@ void __init clustered_apic_check(void) cluster_cnt[APIC_CLUSTERID(id)]++; } - /* Don't use clustered mode on AMD platforms. */ + /* + * Don't use clustered mode on AMD platforms, default + * to flat logical mode. + */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - genapic = &apic_physflat; -#ifndef CONFIG_HOTPLUG_CPU - /* In the CPU hotplug case we cannot use broadcast mode - because that opens a race when a CPU is removed. - Stay at physflat mode in this case. - It is bad to do this unconditionally though. Once - we have ACPI platform support for CPU hotplug - we should detect hotplug capablity from ACPI tables and - only do this when really needed. -AK */ - if (max_apic <= 8) - genapic = &apic_flat; -#endif + /* + * Switch to physical flat mode if more than 8 APICs + * (In the case of 8 CPUs APIC ID goes from 0 to 7): + */ + if (max_apic >= 8) + genapic = &apic_physflat; goto print; } @@ -103,14 +99,9 @@ void __init clustered_apic_check(void) * (We don't use lowest priority delivery + HW APIC IRQ steering, so * can ignore the clustered logical case and go straight to physical.) */ - if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) { -#ifdef CONFIG_HOTPLUG_CPU - /* Don't use APIC shortcuts in CPU hotplug to avoid races */ - genapic = &apic_physflat; -#else + if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) genapic = &apic_flat; -#endif - } else + else genapic = &apic_cluster; print: -- cgit v1.2.3 From 07c7c4744400f93a7c52b32159c31d823e1747a5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH] x86-64: always use physical delivery mode on > 8 CPUs Remove clustered APIC mode. There's little point in the use of clustered APIC mode, broadcasting is limited to within the cluster only, and chipsets have bugs in this area as well. So default to physical APIC mode when the CPU count is large, and default to logical APIC mode when the CPU count is 8 or smaller. (this patch only removes the use of genapic_cluster and cleans up the resulting genapic.c file - removal of all remaining traces of clustered mode will be done by another patch.) Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen Cc: Suresh Siddha Cc: Andi Kleen Cc: "Li, Shaohua" Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- arch/x86_64/kernel/genapic.c | 71 ++++++++++---------------------------------- include/asm-x86_64/genapic.h | 4 ++- 2 files changed, 18 insertions(+), 57 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 2f2b8fc6e2f3..025f26ebb8d7 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -11,26 +11,24 @@ #include #include #include +#include #include #include #include -#include #include #include -#if defined(CONFIG_ACPI) +#ifdef CONFIG_ACPI #include #endif /* which logical CPU number maps to which CPU (physical APIC ID) */ -u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly + = { [0 ... NR_CPUS-1] = BAD_APICID }; EXPORT_SYMBOL(x86_cpu_to_apicid); -u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -extern struct genapic apic_cluster; -extern struct genapic apic_flat; -extern struct genapic apic_physflat; +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; struct genapic __read_mostly *genapic = &apic_flat; @@ -39,76 +37,37 @@ struct genapic __read_mostly *genapic = &apic_flat; */ void __init clustered_apic_check(void) { - int i; - u8 clusters, max_cluster; + unsigned int i, max_apic = 0; u8 id; - u8 cluster_cnt[NUM_APIC_CLUSTERS]; - int max_apic = 0; #ifdef CONFIG_ACPI /* - * Some x86_64 machines use physical APIC mode regardless of how many - * procs/clusters are present (x86_64 ES7000 is an example). + * Quirk: some x86_64 machines can only use physical APIC mode + * regardless of how many processors are present (x86_64 ES7000 + * is an example). */ - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID) - if (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) { - genapic = &apic_cluster; - goto print; - } + if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) + genapic = &apic_physflat; #endif - memset(cluster_cnt, 0, sizeof(cluster_cnt)); for (i = 0; i < NR_CPUS; i++) { id = bios_cpu_apicid[i]; if (id == BAD_APICID) continue; if (id > max_apic) max_apic = id; - cluster_cnt[APIC_CLUSTERID(id)]++; } - /* - * Don't use clustered mode on AMD platforms, default - * to flat logical mode. - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - /* - * Switch to physical flat mode if more than 8 APICs - * (In the case of 8 CPUs APIC ID goes from 0 to 7): - */ - if (max_apic >= 8) - genapic = &apic_physflat; - goto print; - } - - clusters = 0; - max_cluster = 0; - - for (i = 0; i < NUM_APIC_CLUSTERS; i++) { - if (cluster_cnt[i] > 0) { - ++clusters; - if (cluster_cnt[i] > max_cluster) - max_cluster = cluster_cnt[i]; - } - } - - /* - * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode, - * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical - * else physical mode. - * (We don't use lowest priority delivery + HW APIC IRQ steering, so - * can ignore the clustered logical case and go straight to physical.) - */ - if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) + if (max_apic < 8) genapic = &apic_flat; else - genapic = &apic_cluster; + genapic = &apic_physflat; -print: printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); } -/* Same for both flat and clustered. */ +/* Same for both flat and physical. */ void send_IPI_self(int vector) { diff --git a/include/asm-x86_64/genapic.h b/include/asm-x86_64/genapic.h index a0e9a4b93484..d7e516ccbaa4 100644 --- a/include/asm-x86_64/genapic.h +++ b/include/asm-x86_64/genapic.h @@ -29,7 +29,9 @@ struct genapic { unsigned int (*phys_pkg_id)(int index_msb); }; - extern struct genapic *genapic; +extern struct genapic apic_flat; +extern struct genapic apic_physflat; + #endif -- cgit v1.2.3 From 424df390101f9dfe015f0633aaec696c159b37a8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH] x86-64: remove clustered APIC mode Remove now unused clustered APIC mode code. Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen Cc: Suresh Siddha Cc: Andi Kleen Cc: "Li, Shaohua" Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- arch/x86_64/kernel/Makefile | 3 +- arch/x86_64/kernel/genapic_cluster.c | 137 ----------------------------------- 2 files changed, 1 insertion(+), 139 deletions(-) delete mode 100644 arch/x86_64/kernel/genapic_cluster.c (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index bb47e86f3d02..6879b4f01e88 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -21,8 +21,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o tsc_sync.o obj-y += apic.o nmi.o -obj-y += io_apic.o mpparse.o \ - genapic.o genapic_cluster.o genapic_flat.o +obj-y += io_apic.o mpparse.o genapic.o genapic_flat.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_PM) += suspend.o diff --git a/arch/x86_64/kernel/genapic_cluster.c b/arch/x86_64/kernel/genapic_cluster.c deleted file mode 100644 index 73d76308b955..000000000000 --- a/arch/x86_64/kernel/genapic_cluster.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright 2004 James Cleverdon, IBM. - * Subject to the GNU Public License, v.2 - * - * Clustered APIC subarch code. Up to 255 CPUs, physical delivery. - * (A more realistic maximum is around 230 CPUs.) - * - * Hacked for x86-64 by James Cleverdon from i386 architecture code by - * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and - * James Cleverdon. - */ -#include -#include -#include -#include -#include -#include -#include -#include - - -/* - * Set up the logical destination ID. - * - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ -static void cluster_init_apic_ldr(void) -{ - unsigned long val, id; - long i, count; - u8 lid; - u8 my_id = hard_smp_processor_id(); - u8 my_cluster = APIC_CLUSTER(my_id); - - /* Create logical APIC IDs by counting CPUs already in cluster. */ - for (count = 0, i = NR_CPUS; --i >= 0; ) { - lid = x86_cpu_to_log_apicid[i]; - if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) - ++count; - } - /* - * We only have a 4 wide bitmap in cluster mode. There's no way - * to get above 60 CPUs and still give each one it's own bit. - * But, we're using physical IRQ delivery, so we don't care. - * Use bit 3 for the 4th through Nth CPU in each cluster. - */ - if (count >= XAPIC_DEST_CPUS_SHIFT) - count = 3; - id = my_cluster | (1UL << count); - x86_cpu_to_log_apicid[smp_processor_id()] = id; - apic_write(APIC_DFR, APIC_DFR_CLUSTER); - val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; - val |= SET_APIC_LOGICAL_ID(id); - apic_write(APIC_LDR, val); -} - -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - -static cpumask_t cluster_target_cpus(void) -{ - return cpumask_of_cpu(0); -} - -static cpumask_t cluster_vector_allocation_domain(int cpu) -{ - cpumask_t domain = CPU_MASK_NONE; - cpu_set(cpu, domain); - return domain; -} - -static void cluster_send_IPI_mask(cpumask_t mask, int vector) -{ - send_IPI_mask_sequence(mask, vector); -} - -static void cluster_send_IPI_allbutself(int vector) -{ - cpumask_t mask = cpu_online_map; - - cpu_clear(smp_processor_id(), mask); - - if (!cpus_empty(mask)) - cluster_send_IPI_mask(mask, vector); -} - -static void cluster_send_IPI_all(int vector) -{ - cluster_send_IPI_mask(cpu_online_map, vector); -} - -static int cluster_apic_id_registered(void) -{ - return 1; -} - -static unsigned int cluster_cpu_mask_to_apicid(cpumask_t cpumask) -{ - int cpu; - - /* - * We're using fixed IRQ delivery, can only return one phys APIC ID. - * May as well be the first. - */ - cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) - return x86_cpu_to_apicid[cpu]; - else - return BAD_APICID; -} - -/* cpuid returns the value latched in the HW at reset, not the APIC ID - * register's value. For any box whose BIOS changes APIC IDs, like - * clustered APIC systems, we must use hard_smp_processor_id. - * - * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID. - */ -static unsigned int phys_pkg_id(int index_msb) -{ - return hard_smp_processor_id() >> index_msb; -} - -struct genapic apic_cluster = { - .name = "clustered", - .int_delivery_mode = dest_Fixed, - .int_dest_mode = (APIC_DEST_PHYSICAL != 0), - .target_cpus = cluster_target_cpus, - .vector_allocation_domain = cluster_vector_allocation_domain, - .apic_id_registered = cluster_apic_id_registered, - .init_apic_ldr = cluster_init_apic_ldr, - .send_IPI_all = cluster_send_IPI_all, - .send_IPI_allbutself = cluster_send_IPI_allbutself, - .send_IPI_mask = cluster_send_IPI_mask, - .cpu_mask_to_apicid = cluster_cpu_mask_to_apicid, - .phys_pkg_id = phys_pkg_id, -}; -- cgit v1.2.3 From 3c43f03908de98fa8f7a9e8fc9411ebf4c2de298 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH] x86: default to physical mode on hotplug CPU kernels Default to physical mode on hotplug CPU kernels. Furher simplify and clean up the APIC initialization code. Signed-off-by: Ingo Molnar Signed-off-by: Andi Kleen Cc: Suresh Siddha Cc: Andi Kleen Cc: "Li, Shaohua" Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton --- arch/i386/kernel/acpi/boot.c | 2 +- arch/i386/kernel/mpparse.c | 2 +- arch/x86_64/kernel/genapic.c | 16 +++------------- arch/x86_64/kernel/mpparse.c | 2 +- include/asm-i386/genapic.h | 4 ++-- include/asm-i386/mach-bigsmp/mach_apic.h | 2 +- include/asm-i386/mach-default/mach_apic.h | 2 +- include/asm-i386/mach-es7000/mach_apic.h | 2 +- include/asm-i386/mach-generic/mach_apic.h | 2 +- include/asm-i386/mach-numaq/mach_apic.h | 2 +- include/asm-i386/mach-summit/mach_apic.h | 2 +- include/asm-i386/mach-visws/mach_apic.h | 2 +- include/asm-x86_64/apic.h | 2 +- 13 files changed, 16 insertions(+), 26 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 9ea5b8ecc7e1..280898b045b2 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -874,7 +874,7 @@ static void __init acpi_process_madt(void) acpi_ioapic = 1; smp_found_config = 1; - clustered_apic_check(); + setup_apic_routing(); } } if (error == -EINVAL) { diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 4f5983c98669..0952eccd8f28 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -477,7 +477,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) } ++mpc_record; } - clustered_apic_check(); + setup_apic_routing(); if (!num_processors) printk(KERN_ERR "SMP mptable: no processors registered!\n"); return num_processors; diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 025f26ebb8d7..c08650a427e2 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -35,11 +35,8 @@ struct genapic __read_mostly *genapic = &apic_flat; /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. */ -void __init clustered_apic_check(void) +void __init setup_apic_routing(void) { - unsigned int i, max_apic = 0; - u8 id; - #ifdef CONFIG_ACPI /* * Quirk: some x86_64 machines can only use physical APIC mode @@ -49,17 +46,10 @@ void __init clustered_apic_check(void) if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) genapic = &apic_physflat; + else #endif - for (i = 0; i < NR_CPUS; i++) { - id = bios_cpu_apicid[i]; - if (id == BAD_APICID) - continue; - if (id > max_apic) - max_apic = id; - } - - if (max_apic < 8) + if (cpus_weight(cpu_possible_map) <= 8) genapic = &apic_flat; else genapic = &apic_physflat; diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 455aa0b932f0..d0dc4891599b 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -300,7 +300,7 @@ static int __init smp_read_mpc(struct mp_config_table *mpc) } } } - clustered_apic_check(); + setup_apic_routing(); if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; diff --git a/include/asm-i386/genapic.h b/include/asm-i386/genapic.h index 8ffbb0f07457..33e3ffe1766c 100644 --- a/include/asm-i386/genapic.h +++ b/include/asm-i386/genapic.h @@ -36,7 +36,7 @@ struct genapic { void (*init_apic_ldr)(void); physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map); - void (*clustered_apic_check)(void); + void (*setup_apic_routing)(void); int (*multi_timer_check)(int apic, int irq); int (*apicid_to_node)(int logical_apicid); int (*cpu_to_logical_apicid)(int cpu); @@ -99,7 +99,7 @@ struct genapic { APICFUNC(check_apicid_present) \ APICFUNC(init_apic_ldr) \ APICFUNC(ioapic_phys_id_map) \ - APICFUNC(clustered_apic_check) \ + APICFUNC(setup_apic_routing) \ APICFUNC(multi_timer_check) \ APICFUNC(apicid_to_node) \ APICFUNC(cpu_to_logical_apicid) \ diff --git a/include/asm-i386/mach-bigsmp/mach_apic.h b/include/asm-i386/mach-bigsmp/mach_apic.h index 18b19a773440..ebd319f838ab 100644 --- a/include/asm-i386/mach-bigsmp/mach_apic.h +++ b/include/asm-i386/mach-bigsmp/mach_apic.h @@ -71,7 +71,7 @@ static inline void init_apic_ldr(void) apic_write_around(APIC_LDR, val); } -static inline void clustered_apic_check(void) +static inline void setup_apic_routing(void) { printk("Enabling APIC mode: %s. Using %d I/O APICs\n", "Physflat", nr_ioapics); diff --git a/include/asm-i386/mach-default/mach_apic.h b/include/asm-i386/mach-default/mach_apic.h index 3ef6292db780..6db1c3babe9a 100644 --- a/include/asm-i386/mach-default/mach_apic.h +++ b/include/asm-i386/mach-default/mach_apic.h @@ -54,7 +54,7 @@ static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) return phys_map; } -static inline void clustered_apic_check(void) +static inline void setup_apic_routing(void) { printk("Enabling APIC mode: %s. Using %d I/O APICs\n", "Flat", nr_ioapics); diff --git a/include/asm-i386/mach-es7000/mach_apic.h b/include/asm-i386/mach-es7000/mach_apic.h index 26333685a7fb..8e8b3949173a 100644 --- a/include/asm-i386/mach-es7000/mach_apic.h +++ b/include/asm-i386/mach-es7000/mach_apic.h @@ -81,7 +81,7 @@ static inline void enable_apic_mode(void) } extern int apic_version [MAX_APICS]; -static inline void clustered_apic_check(void) +static inline void setup_apic_routing(void) { int apic = bios_cpu_apicid[smp_processor_id()]; printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n", diff --git a/include/asm-i386/mach-generic/mach_apic.h b/include/asm-i386/mach-generic/mach_apic.h index d9dc039da94a..a236e7021528 100644 --- a/include/asm-i386/mach-generic/mach_apic.h +++ b/include/asm-i386/mach-generic/mach_apic.h @@ -13,7 +13,7 @@ #define apic_id_registered (genapic->apic_id_registered) #define init_apic_ldr (genapic->init_apic_ldr) #define ioapic_phys_id_map (genapic->ioapic_phys_id_map) -#define clustered_apic_check (genapic->clustered_apic_check) +#define setup_apic_routing (genapic->setup_apic_routing) #define multi_timer_check (genapic->multi_timer_check) #define apicid_to_node (genapic->apicid_to_node) #define cpu_to_logical_apicid (genapic->cpu_to_logical_apicid) diff --git a/include/asm-i386/mach-numaq/mach_apic.h b/include/asm-i386/mach-numaq/mach_apic.h index 9d158095da82..5e5e7dd2692e 100644 --- a/include/asm-i386/mach-numaq/mach_apic.h +++ b/include/asm-i386/mach-numaq/mach_apic.h @@ -34,7 +34,7 @@ static inline void init_apic_ldr(void) /* Already done in NUMA-Q firmware */ } -static inline void clustered_apic_check(void) +static inline void setup_apic_routing(void) { printk("Enabling APIC mode: %s. Using %d I/O APICs\n", "NUMA-Q", nr_ioapics); diff --git a/include/asm-i386/mach-summit/mach_apic.h b/include/asm-i386/mach-summit/mach_apic.h index 43e5bd8f4a19..732f776aab8e 100644 --- a/include/asm-i386/mach-summit/mach_apic.h +++ b/include/asm-i386/mach-summit/mach_apic.h @@ -80,7 +80,7 @@ static inline int apic_id_registered(void) return 1; } -static inline void clustered_apic_check(void) +static inline void setup_apic_routing(void) { printk("Enabling APIC mode: Summit. Using %d I/O APICs\n", nr_ioapics); diff --git a/include/asm-i386/mach-visws/mach_apic.h b/include/asm-i386/mach-visws/mach_apic.h index 18afe6b6fc4d..efac6f0d139f 100644 --- a/include/asm-i386/mach-visws/mach_apic.h +++ b/include/asm-i386/mach-visws/mach_apic.h @@ -47,7 +47,7 @@ static inline void summit_check(char *oem, char *productid) { } -static inline void clustered_apic_check(void) +static inline void setup_apic_routing(void) { } diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h index 7cfb39cbd918..2f3b013595ab 100644 --- a/include/asm-x86_64/apic.h +++ b/include/asm-x86_64/apic.h @@ -83,7 +83,7 @@ extern void setup_secondary_APIC_clock (void); extern int APIC_init_uniprocessor (void); extern void disable_APIC_timer(void); extern void enable_APIC_timer(void); -extern void clustered_apic_check(void); +extern void setup_apic_routing(void); extern void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector, unsigned char msg_type, unsigned char mask); -- cgit v1.2.3 From 00f1ea696702163b7411d2316264525996c66ed3 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:04 +0200 Subject: [PATCH] x86: adjust inclusion of asm/fixmap.h Move inclusion of asm/fixmap.h to where it is really used rather than where it may have been used long ago (requires a few other adjustments to includes due to previous implicit dependencies). Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/kernel/genapic.c | 1 + arch/x86_64/kernel/genapic_flat.c | 1 + include/asm-i386/hpet.h | 2 -- include/asm-i386/kexec.h | 5 ----- include/asm-i386/pgalloc.h | 1 - include/asm-i386/smp.h | 7 ++----- include/asm-x86_64/ipi.h | 4 +--- include/asm-x86_64/pgalloc.h | 1 - include/asm-x86_64/pgtable.h | 1 - include/asm-x86_64/smp.h | 3 +-- 10 files changed, 6 insertions(+), 20 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index c08650a427e2..47496a40e84f 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -18,6 +18,7 @@ #include #include +#include #ifdef CONFIG_ACPI #include diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 7c01db8fa9d1..9e0a552f0e4a 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -16,6 +16,7 @@ #include #include #include +#include static cpumask_t flat_target_cpus(void) { diff --git a/include/asm-i386/hpet.h b/include/asm-i386/hpet.h index fc03cf9de5c4..dddeedf504b7 100644 --- a/include/asm-i386/hpet.h +++ b/include/asm-i386/hpet.h @@ -28,8 +28,6 @@ #include -#include - /* * Documentation on HPET can be found at: * http://www.intel.com/ial/home/sp/pcmmspec.htm diff --git a/include/asm-i386/kexec.h b/include/asm-i386/kexec.h index 4dfc9f5ed031..c5b4ab95bdc2 100644 --- a/include/asm-i386/kexec.h +++ b/include/asm-i386/kexec.h @@ -21,7 +21,6 @@ #ifndef __ASSEMBLY__ -#include #include #include @@ -29,10 +28,6 @@ * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. * I.e. Maximum page that is mapped directly into kernel memory, * and kmap is not required. - * - * Someone correct me if FIXADDR_START - PAGEOFFSET is not the correct - * calculation for the amount of memory directly mappable into the - * kernel memory space. */ /* Maximum physical address we can use pages from */ diff --git a/include/asm-i386/pgalloc.h b/include/asm-i386/pgalloc.h index c8dc2d0141a7..47430175b75f 100644 --- a/include/asm-i386/pgalloc.h +++ b/include/asm-i386/pgalloc.h @@ -1,7 +1,6 @@ #ifndef _I386_PGALLOC_H #define _I386_PGALLOC_H -#include #include #include /* for struct page */ diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h index 6bf0033a301c..9cab1531c613 100644 --- a/include/asm-i386/smp.h +++ b/include/asm-i386/smp.h @@ -11,16 +11,13 @@ #include #endif -#ifdef CONFIG_X86_LOCAL_APIC -#ifndef __ASSEMBLY__ -#include +#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__) #include #include +#include #ifdef CONFIG_X86_IO_APIC #include #endif -#include -#endif #endif #define BAD_APICID 0xFFu diff --git a/include/asm-x86_64/ipi.h b/include/asm-x86_64/ipi.h index 2a5c162b7d92..ffa6f1517f1a 100644 --- a/include/asm-x86_64/ipi.h +++ b/include/asm-x86_64/ipi.h @@ -18,10 +18,8 @@ * Subject to the GNU Public License, v.2 */ -#include #include -#include -#include +#include /* * the following functions deal with sending IPIs between CPUs. diff --git a/include/asm-x86_64/pgalloc.h b/include/asm-x86_64/pgalloc.h index 4e28b6060a5e..31d497171969 100644 --- a/include/asm-x86_64/pgalloc.h +++ b/include/asm-x86_64/pgalloc.h @@ -1,7 +1,6 @@ #ifndef _X86_64_PGALLOC_H #define _X86_64_PGALLOC_H -#include #include #include #include diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 730bd6028416..5957361782fe 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -6,7 +6,6 @@ * the x86-64 page table tree. */ #include -#include #include #include #include diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index de592a408c07..f4236d7789aa 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -10,10 +10,9 @@ #include extern int disable_apic; -#include #include -#include #include +#include #include #ifdef CONFIG_SMP -- cgit v1.2.3 From 9964cf7d776600724ef5f1b33303ceadc588b8ba Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH] x86: consolidate smp_send_stop() Synchronize i386's smp_send_stop() with x86-64's in only try-locking the call lock to prevent deadlocks when called from panic(). In both version, disable interrupts before clearing the CPU off the online map to eliminate races with IRQ handlers inspecting this map. Also in both versions, save/restore interrupts rather than disabling/ enabling them. On x86-64, eliminate one function used here by folding it into its single caller, convert to static, and rename for consistency with i386 (lkcd may like this). Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/smp.c | 68 +++++++++++++++++++++++++++--------------------- arch/x86_64/kernel/smp.c | 28 +++++++------------- include/asm-x86_64/smp.h | 1 - 3 files changed, 48 insertions(+), 49 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 0e8977871b1f..0cd459baad68 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -515,35 +515,14 @@ void unlock_ipi_call_lock(void) static struct call_data_struct *call_data; -/** - * smp_call_function(): Run a function on all other CPUs. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: currently unused. - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. Does not return until - * remote CPUs are nearly ready to execute <> or are or have executed. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - */ -int smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait) +static void __smp_call_function(void (*func) (void *info), void *info, + int nonatomic, int wait) { struct call_data_struct data; - int cpus; - - /* Holding any lock stops cpus from going down. */ - spin_lock(&call_lock); - cpus = num_online_cpus() - 1; - if (!cpus) { - spin_unlock(&call_lock); - return 0; - } + int cpus = num_online_cpus() - 1; - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); + if (!cpus) + return; data.func = func; data.info = info; @@ -565,6 +544,30 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, if (wait) while (atomic_read(&data.finished) != cpus) cpu_relax(); +} + +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: currently unused. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. Does not return until + * remote CPUs are nearly ready to execute <> or are or have executed. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +int smp_call_function (void (*func) (void *info), void *info, int nonatomic, + int wait) +{ + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + /* Holding any lock stops cpus from going down. */ + spin_lock(&call_lock); + __smp_call_function(func, info, nonatomic, wait); spin_unlock(&call_lock); return 0; @@ -573,11 +576,11 @@ EXPORT_SYMBOL(smp_call_function); static void stop_this_cpu (void * dummy) { + local_irq_disable(); /* * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_disable(); disable_local_APIC(); if (cpu_data[smp_processor_id()].hlt_works_ok) for(;;) halt(); @@ -590,11 +593,16 @@ static void stop_this_cpu (void * dummy) void smp_send_stop(void) { - smp_call_function(stop_this_cpu, NULL, 1, 0); + /* Don't deadlock on the call lock in panic */ + int nolock = !spin_trylock(&call_lock); + unsigned long flags; - local_irq_disable(); + local_irq_save(flags); + __smp_call_function(stop_this_cpu, NULL, 0, 0); + if (!nolock) + spin_unlock(&call_lock); disable_local_APIC(); - local_irq_enable(); + local_irq_restore(flags); } /* diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index af1ec4d23cf8..bd1d123947ce 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -452,42 +452,34 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, } EXPORT_SYMBOL(smp_call_function); -void smp_stop_cpu(void) +static void stop_this_cpu(void *dummy) { - unsigned long flags; + local_irq_disable(); /* * Remove this CPU: */ cpu_clear(smp_processor_id(), cpu_online_map); - local_irq_save(flags); disable_local_APIC(); - local_irq_restore(flags); -} - -static void smp_really_stop_cpu(void *dummy) -{ - smp_stop_cpu(); for (;;) halt(); } void smp_send_stop(void) { - int nolock = 0; + int nolock; + unsigned long flags; + if (reboot_force) return; + /* Don't deadlock on the call lock in panic */ - if (!spin_trylock(&call_lock)) { - /* ignore locking because we have panicked anyways */ - nolock = 1; - } - __smp_call_function(smp_really_stop_cpu, NULL, 0, 0); + nolock = !spin_trylock(&call_lock); + local_irq_save(flags); + __smp_call_function(stop_this_cpu, NULL, 0, 0); if (!nolock) spin_unlock(&call_lock); - - local_irq_disable(); disable_local_APIC(); - local_irq_enable(); + local_irq_restore(flags); } /* diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index f4236d7789aa..d5704421456b 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -37,7 +37,6 @@ extern void lock_ipi_call_lock(void); extern void unlock_ipi_call_lock(void); extern int smp_num_siblings; extern void smp_send_reschedule(int cpu); -void smp_stop_cpu(void); extern cpumask_t cpu_sibling_map[NR_CPUS]; extern cpumask_t cpu_core_map[NR_CPUS]; -- cgit v1.2.3 From 3755090722e5a9c44780e5461ed9e49ab542bc73 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH] x86-64: a few missing entry.S annotations Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/kernel/entry.S | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index ed4350ced3d0..fa984b53e7e6 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -701,6 +701,7 @@ END(spurious_interrupt) CFI_ADJUST_CFA_OFFSET 8 pushq %rax /* push real oldrax to the rdi slot */ CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rax,0 leaq \sym(%rip),%rax jmp error_entry CFI_ENDPROC @@ -710,6 +711,7 @@ END(spurious_interrupt) XCPT_FRAME pushq %rax CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET rax,0 leaq \sym(%rip),%rax jmp error_entry CFI_ENDPROC @@ -817,6 +819,7 @@ paranoid_schedule\trace: */ KPROBE_ENTRY(error_entry) _frame RDI + CFI_REL_OFFSET rax,0 /* rdi slot contains rax, oldrax contains error code */ cld subq $14*8,%rsp @@ -824,6 +827,7 @@ KPROBE_ENTRY(error_entry) movq %rsi,13*8(%rsp) CFI_REL_OFFSET rsi,RSI movq 14*8(%rsp),%rsi /* load rax from rdi slot */ + CFI_REGISTER rax,rsi movq %rdx,12*8(%rsp) CFI_REL_OFFSET rdx,RDX movq %rcx,11*8(%rsp) @@ -857,6 +861,7 @@ error_swapgs: swapgs error_sti: movq %rdi,RDI(%rsp) + CFI_REL_OFFSET rdi,RDI movq %rsp,%rdi movq ORIG_RAX(%rsp),%rsi /* get error code */ movq $-1,ORIG_RAX(%rsp) -- cgit v1.2.3 From d18951834216eae82e2f9112416111b4f55f1849 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH] x86: Fix i386 and x86_64 fault information pollution a userspace fault or a kernelspace fault which will result in the immediate death of the process. They should not be filled in as a result of a kernelspace fault which can be fixed up. Otherwise, if the process is handling SIGSEGV and examining the fault information, this can result in the kernel space fault trashing the previously stored fault information if it arrives between the userspace fault happening and the SIGSEGV being delivered to the process. Signed-off-by: Jeff Dike Signed-off-by: Andi Kleen Acked-by: Jan Beulich -- arch/i386/kernel/traps.c | 24 ++++++++++++++++++------ arch/x86_64/kernel/traps.c | 30 +++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 13 deletions(-) --- arch/i386/kernel/traps.c | 24 ++++++++++++++++++------ arch/x86_64/kernel/traps.c | 30 +++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 13 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index af0d3f70a817..58dfecc8e36c 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -476,8 +476,6 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, siginfo_t *info) { struct task_struct *tsk = current; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; if (regs->eflags & VM_MASK) { if (vm86) @@ -489,6 +487,18 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, goto kernel_trap; trap_signal: { + /* + * We want error_code and trap_no set for userspace faults and + * kernelspace faults which result in die(), but not + * kernelspace faults which are fixed up. die() gives the + * process no chance to handle the signal and notice the + * kernel fault information, so that won't result in polluting + * the information about previously queued, but not yet + * delivered, faults. See also do_general_protection below. + */ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + if (info) force_sig_info(signr, info, tsk); else @@ -497,8 +507,11 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86, } kernel_trap: { - if (!fixup_exception(regs)) + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; die(str, regs, error_code); + } return; } @@ -603,9 +616,6 @@ fastcall void __kprobes do_general_protection(struct pt_regs * regs, } put_cpu(); - current->thread.error_code = error_code; - current->thread.trap_no = 13; - if (regs->eflags & VM_MASK) goto gp_in_vm86; @@ -624,6 +634,8 @@ gp_in_vm86: gp_in_kernel: if (!fixup_exception(regs)) { + current->thread.error_code = error_code; + current->thread.trap_no = 13; if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 13, SIGSEGV) == NOTIFY_STOP) return; diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 09d2e8a10a49..cceef5bd7302 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -581,10 +581,20 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, { struct task_struct *tsk = current; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - if (user_mode(regs)) { + /* + * We want error_code and trap_no set for userspace + * faults and kernelspace faults which result in + * die(), but not kernelspace faults which are fixed + * up. die() gives the process no chance to handle + * the signal and notice the kernel fault information, + * so that won't result in polluting the information + * about previously queued, but not yet delivered, + * faults. See also do_general_protection below. + */ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + if (exception_trace && unhandled_signal(tsk, signr)) printk(KERN_INFO "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", @@ -605,8 +615,11 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, fixup = search_exception_tables(regs->rip); if (fixup) regs->rip = fixup->fixup; - else + else { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; die(str, regs, error_code); + } return; } } @@ -682,10 +695,10 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, conditional_sti(regs); - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (user_mode(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) printk(KERN_INFO "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", @@ -704,6 +717,9 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, regs->rip = fixup->fixup; return; } + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 13, SIGSEGV) == NOTIFY_STOP) return; -- cgit v1.2.3 From 803d80f65038f77c4681a0d7708e9d693e68aaa8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH] x86-64: Some cleanup in time.c Move prototypes into header files Remove unneeded includes. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/time.c | 5 +---- include/asm-x86_64/proto.h | 2 ++ 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 75d73a9aa9ff..811b8f987b50 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -39,13 +39,10 @@ #include #include #include -#include #include #include #include - -extern void i8254_timer_resume(void); -extern int using_apic_timer; +#include static char *timename = NULL; diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 6688cf9eb27b..f64949fae61a 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -122,6 +122,8 @@ extern void smp_local_timer_interrupt(void); long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); +void i8254_timer_resume(void); + #define round_up(x,y) (((x) + (y) - 1) & ~((y)-1)) #define round_down(x,y) ((x) & ~((y)-1)) -- cgit v1.2.3 From 405e494d91bac85cc992f55ad434b0f325e399a5 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 2 May 2007 19:27:05 +0200 Subject: [PATCH] x86-64: x86_64 make NMI use PERFCTR1 for architectural perfmon (take 2) Hello, This patch against 2.6.20-git14 makes the NMI watchdog use PERFSEL1/PERFCTR1 instead of PERFSEL0/PERFCTR0 on processors supporting Intel architectural perfmon, such as Intel Core 2. Although all PMU events can work on both counters, the Precise Event-Based Sampling (PEBS) requires that the event be in PERFCTR0 to work correctly (see section 18.14.4.1 in the IA32 SDM Vol 3b). This versions has 3 chunks compared to previous where we had missed on check. Changelog: - make the x86-64 NMI watchdog use PERFSEL1/PERFCTR1 instead of PERFSEL0/PERFCTR0 on processors supporting the Intel architectural perfmon (e.g. Core 2 Duo). This allows PEBS to work when the NMI watchdog is active. signed-off-by: stephane eranian Signed-off-by: Andi Kleen --- arch/x86_64/kernel/nmi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index dfab9f167366..010d3d9bd56a 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -348,7 +348,7 @@ int __init check_nmi_watchdog (void) struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); nmi_hz = 1; - if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) + if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) nmi_hz = adjust_for_32bit_ctr(nmi_hz); } @@ -713,8 +713,8 @@ static int setup_intel_arch_watchdog(void) (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) goto fail; - perfctr_msr = MSR_ARCH_PERFMON_PERFCTR0; - evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL0; + perfctr_msr = MSR_ARCH_PERFMON_PERFCTR1; + evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL1; if (!__reserve_perfctr_nmi(-1, perfctr_msr)) goto fail; @@ -958,7 +958,7 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) /* start the cycle over again */ wrmsrl(wd->perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); - } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { + } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) { /* * ArchPerfom/Core Duo needs to re-unmask * the apic vector -- cgit v1.2.3 From 6b37f5a20c0e5c334c010a587058354215433e92 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH] x86-64: fix cpu MHz reporting on constant_tsc cpus This patch fixes the reporting of cpu_mhz in /proc/cpuinfo on CPUs with a constant TSC rate and a kernel with disabled cpufreq. Signed-off-by: Mark Langsdorf Signed-off-by: Joerg Roedel Signed-off-by: Andi Kleen arch/x86_64/kernel/apic.c | 2 - arch/x86_64/kernel/time.c | 58 +++++++++++++++++++++++++++++++++++++++--- arch/x86_64/kernel/tsc.c | 12 +++++--- arch/x86_64/kernel/tsc_sync.c | 2 - include/asm-x86_64/proto.h | 1 5 files changed, 65 insertions(+), 10 deletions(-) --- arch/x86_64/kernel/apic.c | 2 +- arch/x86_64/kernel/time.c | 58 ++++++++++++++++++++++++++++++++++++++++--- arch/x86_64/kernel/tsc.c | 12 +++++---- arch/x86_64/kernel/tsc_sync.c | 2 +- include/asm-x86_64/proto.h | 1 + 5 files changed, 65 insertions(+), 10 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index bd3e45d47c37..3421f21b6c70 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -843,7 +843,7 @@ static int __init calibrate_APIC_clock(void) } while ((tsc - tsc_start) < TICK_COUNT && (apic - apic_start) < TICK_COUNT); - result = (apic_start - apic) * 1000L * cpu_khz / + result = (apic_start - apic) * 1000L * tsc_khz / (tsc - tsc_start); } printk("result %d\n", result); diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 811b8f987b50..5f862e216a42 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -43,6 +43,7 @@ #include #include #include +#include static char *timename = NULL; @@ -249,6 +250,51 @@ static unsigned long get_cmos_time(void) return mktime(year, mon, day, hour, min, sec); } +/* calibrate_cpu is used on systems with fixed rate TSCs to determine + * processor frequency */ +#define TICK_COUNT 100000000 +static unsigned int __init tsc_calibrate_cpu_khz(void) +{ + int tsc_start, tsc_now; + int i, no_ctr_free; + unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; + unsigned long flags; + + for (i = 0; i < 4; i++) + if (avail_to_resrv_perfctr_nmi_bit(i)) + break; + no_ctr_free = (i == 4); + if (no_ctr_free) { + i = 3; + rdmsrl(MSR_K7_EVNTSEL3, evntsel3); + wrmsrl(MSR_K7_EVNTSEL3, 0); + rdmsrl(MSR_K7_PERFCTR3, pmc3); + } else { + reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); + reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + local_irq_save(flags); + /* start meauring cycles, incrementing from 0 */ + wrmsrl(MSR_K7_PERFCTR0 + i, 0); + wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); + rdtscl(tsc_start); + do { + rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); + tsc_now = get_cycles_sync(); + } while ((tsc_now - tsc_start) < TICK_COUNT); + + local_irq_restore(flags); + if (no_ctr_free) { + wrmsrl(MSR_K7_EVNTSEL3, 0); + wrmsrl(MSR_K7_PERFCTR3, pmc3); + wrmsrl(MSR_K7_EVNTSEL3, evntsel3); + } else { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + + return pmc_now * tsc_khz / (tsc_now - tsc_start); +} /* * pit_calibrate_tsc() uses the speaker output (channel 2) of @@ -336,14 +382,20 @@ void __init time_init(void) if (hpet_use_timer) { /* set tick_nsec to use the proper rate for HPET */ tick_nsec = TICK_NSEC_HPET; - cpu_khz = hpet_calibrate_tsc(); + tsc_khz = hpet_calibrate_tsc(); timename = "HPET"; } else { pit_init(); - cpu_khz = pit_calibrate_tsc(); + tsc_khz = pit_calibrate_tsc(); timename = "PIT"; } + cpu_khz = tsc_khz; + if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && + boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 == 16) + cpu_khz = tsc_calibrate_cpu_khz(); + if (unsynchronized_tsc()) mark_tsc_unstable(); @@ -352,7 +404,7 @@ void __init time_init(void) else vgetcpu_mode = VGETCPU_LSL; - set_cyc2ns_scale(cpu_khz); + set_cyc2ns_scale(tsc_khz); printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); init_tsc_clocksource(); diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c index 1a0edbbffaa0..5c84992c676d 100644 --- a/arch/x86_64/kernel/tsc.c +++ b/arch/x86_64/kernel/tsc.c @@ -13,6 +13,8 @@ static int notsc __initdata = 0; unsigned int cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); +unsigned int tsc_khz; +EXPORT_SYMBOL(tsc_khz); static unsigned int cyc2ns_scale __read_mostly; @@ -77,7 +79,7 @@ static void handle_cpufreq_delayed_get(struct work_struct *v) static unsigned int ref_freq = 0; static unsigned long loops_per_jiffy_ref = 0; -static unsigned long cpu_khz_ref = 0; +static unsigned long tsc_khz_ref = 0; static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) @@ -99,7 +101,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, if (!ref_freq) { ref_freq = freq->old; loops_per_jiffy_ref = *lpj; - cpu_khz_ref = cpu_khz; + tsc_khz_ref = tsc_khz; } if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || @@ -107,12 +109,12 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); + tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) mark_tsc_unstable(); } - set_cyc2ns_scale(cpu_khz_ref); + set_cyc2ns_scale(tsc_khz_ref); return 0; } @@ -213,7 +215,7 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable); void __init init_tsc_clocksource(void) { if (!notsc) { - clocksource_tsc.mult = clocksource_khz2mult(cpu_khz, + clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, clocksource_tsc.shift); if (check_tsc_unstable()) clocksource_tsc.rating = 0; diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c index 014f0db45dfa..72d444dede9b 100644 --- a/arch/x86_64/kernel/tsc_sync.c +++ b/arch/x86_64/kernel/tsc_sync.c @@ -50,7 +50,7 @@ static __cpuinit void check_tsc_warp(void) /* * The measurement runs for 20 msecs: */ - end = start + cpu_khz * 20ULL; + end = start + tsc_khz * 20ULL; now = start; for (i = 0; ; i++) { diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index f64949fae61a..78427021d94a 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -92,6 +92,7 @@ extern unsigned long table_start, table_end; extern int exception_trace; extern unsigned cpu_khz; +extern unsigned tsc_khz; extern void no_iommu_init(void); extern int force_iommu, no_iommu; -- cgit v1.2.3 From e65845045588806fa5c8df8a4f4253516515a5e3 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH] x86-64: dma_ops as const The dma_ops structure can be const since it never changes after boot. Signed-off-by: Stephen Hemminger Signed-off-by: Andi Kleen --- arch/x86_64/kernel/pci-calgary.c | 2 +- arch/x86_64/kernel/pci-gart.c | 2 +- arch/x86_64/kernel/pci-nommu.c | 2 +- arch/x86_64/kernel/pci-swiotlb.c | 2 +- arch/x86_64/mm/init.c | 2 +- include/asm-x86_64/dma-mapping.h | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 04480c3b68f5..5bd20b542c1e 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -507,7 +507,7 @@ error: return ret; } -static struct dma_mapping_ops calgary_dma_ops = { +static const struct dma_mapping_ops calgary_dma_ops = { .alloc_coherent = calgary_alloc_coherent, .map_single = calgary_map_single, .unmap_single = calgary_unmap_single, diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 0bae862e9a55..0a762e10f2be 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -556,7 +556,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) extern int agp_amd64_init(void); -static struct dma_mapping_ops gart_dma_ops = { +static const struct dma_mapping_ops gart_dma_ops = { .mapping_error = NULL, .map_single = gart_map_single, .map_simple = gart_map_simple, diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index df09ab05a1bd..6dade0c867cc 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -79,7 +79,7 @@ void nommu_unmap_sg(struct device *dev, struct scatterlist *sg, { } -struct dma_mapping_ops nommu_dma_ops = { +const struct dma_mapping_ops nommu_dma_ops = { .map_single = nommu_map_single, .unmap_single = nommu_unmap_single, .map_sg = nommu_map_sg, diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c index eb18be5a6569..4b4569abc60c 100644 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ b/arch/x86_64/kernel/pci-swiotlb.c @@ -12,7 +12,7 @@ int swiotlb __read_mostly; EXPORT_SYMBOL(swiotlb); -struct dma_mapping_ops swiotlb_dma_ops = { +const struct dma_mapping_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, .alloc_coherent = swiotlb_alloc_coherent, .free_coherent = swiotlb_free_coherent, diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index ec31534eb104..5ca61731f550 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -46,7 +46,7 @@ #define Dprintk(x...) #endif -struct dma_mapping_ops* dma_ops; +const struct dma_mapping_ops* dma_ops; EXPORT_SYMBOL(dma_ops); static unsigned long dma_reserve __initdata; diff --git a/include/asm-x86_64/dma-mapping.h b/include/asm-x86_64/dma-mapping.h index d2af227f06d0..6897e2a436e5 100644 --- a/include/asm-x86_64/dma-mapping.h +++ b/include/asm-x86_64/dma-mapping.h @@ -52,7 +52,7 @@ struct dma_mapping_ops { }; extern dma_addr_t bad_dma_address; -extern struct dma_mapping_ops* dma_ops; +extern const struct dma_mapping_ops* dma_ops; extern int iommu_merge; static inline int dma_mapping_error(dma_addr_t dma_addr) -- cgit v1.2.3 From dafe41ee3a9389c08c91cdfd8670295f20f89e04 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH] x86-64: Kill temp boot pmds Early in the boot process we need the ability to set up temporary mappings, before our normal mechanisms are initialized. Currently this is used to map pages that are part of the page tables we are building and pages during the dmi scan. The core problem is that we are using the user portion of the page tables to implement this. Which means that while this mechanism is active we cannot catch NULL pointer dereferences and we deviate from the normal ways of handling things. In this patch I modify early_ioremap to map pages into the kernel portion of address space, roughly where we will later put modules, and I make the discovery of which addresses we can use dynamic which removes all kinds of static limits and remove the dependencies on implementation details between different parts of the code. Now alloc_low_page() and unmap_low_page() use early_iomap() and early_iounmap() to allocate/map and unmap a page. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 3 -- arch/x86_64/mm/init.c | 100 +++++++++++++++++++++------------------------- 2 files changed, 45 insertions(+), 58 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 598a4d0351fc..118c6088198a 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -288,9 +288,6 @@ NEXT_PAGE(level2_ident_pgt) .quad i << 21 | 0x083 i = i + 1 .endr - /* Temporary mappings for the super early allocator in arch/x86_64/mm/init.c */ - .globl temp_boot_pmds -temp_boot_pmds: .fill 492,8,0 NEXT_PAGE(level2_kernel_pgt) diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 5ca61731f550..4ab3d40aac90 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -167,23 +167,9 @@ __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot) unsigned long __initdata table_start, table_end; -extern pmd_t temp_boot_pmds[]; - -static struct temp_map { - pmd_t *pmd; - void *address; - int allocated; -} temp_mappings[] __initdata = { - { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) }, - { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) }, - {} -}; - -static __meminit void *alloc_low_page(int *index, unsigned long *phys) +static __meminit void *alloc_low_page(unsigned long *phys) { - struct temp_map *ti; - int i; - unsigned long pfn = table_end++, paddr; + unsigned long pfn = table_end++; void *adr; if (after_bootmem) { @@ -194,57 +180,63 @@ static __meminit void *alloc_low_page(int *index, unsigned long *phys) if (pfn >= end_pfn) panic("alloc_low_page: ran out of memory"); - for (i = 0; temp_mappings[i].allocated; i++) { - if (!temp_mappings[i].pmd) - panic("alloc_low_page: ran out of temp mappings"); - } - ti = &temp_mappings[i]; - paddr = (pfn << PAGE_SHIFT) & PMD_MASK; - set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE)); - ti->allocated = 1; - __flush_tlb(); - adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK); + + adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); memset(adr, 0, PAGE_SIZE); - *index = i; - *phys = pfn * PAGE_SIZE; - return adr; -} + *phys = pfn * PAGE_SIZE; + return adr; +} -static __meminit void unmap_low_page(int i) +static __meminit void unmap_low_page(void *adr) { - struct temp_map *ti; if (after_bootmem) return; - ti = &temp_mappings[i]; - set_pmd(ti->pmd, __pmd(0)); - ti->allocated = 0; + early_iounmap(adr, PAGE_SIZE); } /* Must run before zap_low_mappings */ __init void *early_ioremap(unsigned long addr, unsigned long size) { - unsigned long map = round_down(addr, LARGE_PAGE_SIZE); - - /* actually usually some more */ - if (size >= LARGE_PAGE_SIZE) { - return NULL; + unsigned long vaddr; + pmd_t *pmd, *last_pmd; + int i, pmds; + + pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; + vaddr = __START_KERNEL_map; + pmd = level2_kernel_pgt; + last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; + for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { + for (i = 0; i < pmds; i++) { + if (pmd_present(pmd[i])) + goto next; + } + vaddr += addr & ~PMD_MASK; + addr &= PMD_MASK; + for (i = 0; i < pmds; i++, addr += PMD_SIZE) + set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); + __flush_tlb(); + return (void *)vaddr; + next: + ; } - set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); - map += LARGE_PAGE_SIZE; - set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); - __flush_tlb(); - return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1)); + printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); + return NULL; } /* To avoid virtual aliases later */ __init void early_iounmap(void *addr, unsigned long size) { - if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address) - printk("early_iounmap: bad address %p\n", addr); - set_pmd(temp_mappings[0].pmd, __pmd(0)); - set_pmd(temp_mappings[1].pmd, __pmd(0)); + unsigned long vaddr; + pmd_t *pmd; + int i, pmds; + + vaddr = (unsigned long)addr; + pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; + pmd = level2_kernel_pgt + pmd_index(vaddr); + for (i = 0; i < pmds; i++) + pmd_clear(pmd + i); __flush_tlb(); } @@ -289,7 +281,6 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) { - int map; unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; @@ -307,12 +298,12 @@ static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigne continue; } - pmd = alloc_low_page(&map, &pmd_phys); + pmd = alloc_low_page(&pmd_phys); spin_lock(&init_mm.page_table_lock); set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE)); phys_pmd_init(pmd, addr, end); spin_unlock(&init_mm.page_table_lock); - unmap_low_page(map); + unmap_low_page(pmd); } __flush_tlb(); } @@ -364,7 +355,6 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) end = (unsigned long)__va(end); for (; start < end; start = next) { - int map; unsigned long pud_phys; pgd_t *pgd = pgd_offset_k(start); pud_t *pud; @@ -372,7 +362,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) if (after_bootmem) pud = pud_offset(pgd, start & PGDIR_MASK); else - pud = alloc_low_page(&map, &pud_phys); + pud = alloc_low_page(&pud_phys); next = start + PGDIR_SIZE; if (next > end) @@ -380,7 +370,7 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) phys_pud_init(pud, __pa(start), __pa(next)); if (!after_bootmem) set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys)); - unmap_low_page(map); + unmap_low_page(pud); } if (!after_bootmem) -- cgit v1.2.3 From 67dcbb6bc6537aea92a2466bfc75f015b00e465e Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH] x86-64: Clean up the early boot page table - Merge physmem_pgt and ident_pgt, removing physmem_pgt. The merge is broken as soon as mm/init.c:init_memory_mapping is run. - As physmem_pgt is gone don't export it in pgtable.h. - Use defines from pgtable.h for page permissions. - Fix the physical memory identity mapping so it is at the correct address. - Remove the physical memory mapping from wakeup_level4_pgt it is at the wrong address so we can't possibly be usinging it. - Simply NEXT_PAGE the work to calculate the phys_ alias of the labels was very cool. Unfortuantely it was a brittle special purpose hack that makes maitenance more difficult. Instead just use label - __START_KERNEL_map like we do everywhere else in assembly. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 61 ++++++++++++++++++++------------------------ include/asm-x86_64/pgtable.h | 1 - 2 files changed, 28 insertions(+), 34 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 118c6088198a..23b45e79a27c 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -260,52 +261,48 @@ ljumpvector: ENTRY(stext) ENTRY(_stext) - $page = 0 #define NEXT_PAGE(name) \ - $page = $page + 1; \ - .org $page * 0x1000; \ - phys_/**/name = $page * 0x1000 + __PHYSICAL_START; \ + .balign PAGE_SIZE; \ ENTRY(name) +/* Automate the creation of 1 to 1 mapping pmd entries */ +#define PMDS(START, PERM, COUNT) \ + i = 0 ; \ + .rept (COUNT) ; \ + .quad (START) + (i << 21) + (PERM) ; \ + i = i + 1 ; \ + .endr + NEXT_PAGE(init_level4_pgt) /* This gets initialized in x86_64_start_kernel */ .fill 512,8,0 NEXT_PAGE(level3_ident_pgt) - .quad phys_level2_ident_pgt | 0x007 + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .fill 511,8,0 NEXT_PAGE(level3_kernel_pgt) .fill 510,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ - .quad phys_level2_kernel_pgt | 0x007 + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE .fill 1,8,0 NEXT_PAGE(level2_ident_pgt) - /* 40MB for bootup. */ - i = 0 - .rept 20 - .quad i << 21 | 0x083 - i = i + 1 - .endr - .fill 492,8,0 + /* Since I easily can, map the first 1G. + * Don't set NX because code runs from these pages. + */ + PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) NEXT_PAGE(level2_kernel_pgt) /* 40MB kernel mapping. The kernel code cannot be bigger than that. When you change this change KERNEL_TEXT_SIZE in page.h too. */ /* (2^48-(2*1024*1024*1024)-((2^39)*511)-((2^30)*510)) = 0 */ - i = 0 - .rept 20 - .quad i << 21 | 0x183 - i = i + 1 - .endr + PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC|_PAGE_GLOBAL, + KERNEL_TEXT_SIZE/PMD_SIZE) /* Module mapping starts here */ - .fill 492,8,0 - -NEXT_PAGE(level3_physmem_pgt) - .quad phys_level2_kernel_pgt | 0x007 /* so that __va works even before pagetable_init */ - .fill 511,8,0 + .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 +#undef PMDS #undef NEXT_PAGE .data @@ -313,12 +310,10 @@ NEXT_PAGE(level3_physmem_pgt) #ifdef CONFIG_ACPI_SLEEP .align PAGE_SIZE ENTRY(wakeup_level4_pgt) - .quad phys_level3_ident_pgt | 0x007 - .fill 255,8,0 - .quad phys_level3_physmem_pgt | 0x007 - .fill 254,8,0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 510,8,0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad phys_level3_kernel_pgt | 0x007 + .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE #endif #ifndef CONFIG_HOTPLUG_CPU @@ -332,12 +327,12 @@ ENTRY(wakeup_level4_pgt) */ .align PAGE_SIZE ENTRY(boot_level4_pgt) - .quad phys_level3_ident_pgt | 0x007 - .fill 255,8,0 - .quad phys_level3_physmem_pgt | 0x007 - .fill 254,8,0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 257,8,0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 252,8,0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad phys_level3_kernel_pgt | 0x007 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE .data diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index c514deb658a3..5a5d43b3ef5d 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -14,7 +14,6 @@ #include extern pud_t level3_kernel_pgt[512]; -extern pud_t level3_physmem_pgt[512]; extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pgd_t init_level4_pgt[]; -- cgit v1.2.3 From 93fd755e47d7b00fa5470199cfb14cbd9ded0284 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:06 +0200 Subject: [PATCH] x86-64: Fix early printk to use standard ISA mapping Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/early_printk.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index 47b6d90349da..b7aa9abe1c6a 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -11,11 +11,10 @@ #ifdef __i386__ #include -#define VGABASE (__ISA_IO_base + 0xb8000) #else #include -#define VGABASE ((void __iomem *)0xffffffff800b8000UL) #endif +#define VGABASE (__ISA_IO_base + 0xb8000) static int max_ypos = 25, max_xpos = 80; static int current_ypos = 25, current_xpos = 0; -- cgit v1.2.3 From 278c0eb7f96586c02b2bfaa8e250d951919a2e6a Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86-64: modify copy_bootdata to use virtual addresses Use virtual addresses instead of physical addresses in copy bootdata. In addition fix the implementation of the old bootloader convention. Everything is at real_mode_data always. It is just that sometimes real_mode_data was relocated by setup.S to not sit at 0x90000. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head64.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 5f197b0a330a..5c529c1e3d6c 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -29,25 +29,24 @@ static void __init clear_bss(void) } #define NEW_CL_POINTER 0x228 /* Relative to real mode data */ -#define OLD_CL_MAGIC_ADDR 0x90020 +#define OLD_CL_MAGIC_ADDR 0x20 #define OLD_CL_MAGIC 0xA33F -#define OLD_CL_BASE_ADDR 0x90000 -#define OLD_CL_OFFSET 0x90022 +#define OLD_CL_OFFSET 0x22 static void __init copy_bootdata(char *real_mode_data) { - int new_data; + unsigned long new_data; char * command_line; memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE); - new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); + new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER); if (!new_data) { - if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { + if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) { return; } - new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; + new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET); } - command_line = (char *) ((u64)(new_data)); + command_line = __va(new_data); memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } @@ -74,7 +73,7 @@ void __init x86_64_start_kernel(char * real_mode_data) cpu_pda(i) = &boot_cpu_pda[i]; pda_init(0); - copy_bootdata(real_mode_data); + copy_bootdata(__va(real_mode_data)); #ifdef CONFIG_SMP cpu_set(0, cpu_online_map); #endif -- cgit v1.2.3 From 30f472895401fbe8e64f861a2569bc9acb098741 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86-64: cleanup segments Move __KERNEL32_CS up into the unused gdt entry. __KERNEL32_CS is used when entering the kernel so putting it first is useful when trying to keep boot gdt sizes to a minimum. Set the accessed bit on all gdt entries. We don't care so there is no need for the cpu to burn the extra cycles, and it potentially allows the pages to be immutable. Plus it is confusing when debugging and your gdt entries mysteriously change. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 12 ++++++------ include/asm-x86_64/segment.h | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 23b45e79a27c..2b2e2c51e532 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -362,13 +362,13 @@ gdt: ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00cf9b000000ffff /* __KERNEL32_CS */ + .quad 0x00af9b000000ffff /* __KERNEL_CS */ + .quad 0x00cf93000000ffff /* __KERNEL_DS */ + .quad 0x00cffb000000ffff /* __USER32_CS */ + .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */ + .quad 0x00affb000000ffff /* __USER_CS */ .quad 0x0 /* unused */ - .quad 0x00af9a000000ffff /* __KERNEL_CS */ - .quad 0x00cf92000000ffff /* __KERNEL_DS */ - .quad 0x00cffa000000ffff /* __USER32_CS */ - .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ - .quad 0x00affa000000ffff /* __USER_CS */ - .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ .quad 0,0 /* TSS */ .quad 0,0 /* LDT */ .quad 0,0,0 /* three TLS descriptors */ diff --git a/include/asm-x86_64/segment.h b/include/asm-x86_64/segment.h index 334ddcdd8f92..adf2bf1e187c 100644 --- a/include/asm-x86_64/segment.h +++ b/include/asm-x86_64/segment.h @@ -6,7 +6,7 @@ #define __KERNEL_CS 0x10 #define __KERNEL_DS 0x18 -#define __KERNEL32_CS 0x38 +#define __KERNEL32_CS 0x08 /* * we cannot use the same code segment descriptor for user and kernel -- cgit v1.2.3 From 3c321bceb4a626639ab43a5a24d884930e511826 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86-64: Add EFER to the register set saved by save_processor_state EFER varies like %cr4 depending on the cpu capabilities, and which cpu capabilities we want to make use of. So save/restore it make certain we have the same EFER value when we are done. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/suspend.c | 3 ++- include/asm-x86_64/suspend.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index 91f7e678bae7..fe865ea4df52 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -33,7 +33,6 @@ void __save_processor_state(struct saved_context *ctxt) asm volatile ("str %0" : "=m" (ctxt->tr)); /* XMM0..XMM15 should be handled by kernel_fpu_begin(). */ - /* EFER should be constant for kernel version, no need to handle it. */ /* * segment registers */ @@ -50,6 +49,7 @@ void __save_processor_state(struct saved_context *ctxt) /* * control registers */ + rdmsrl(MSR_EFER, ctxt->efer); asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0)); asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); @@ -75,6 +75,7 @@ void __restore_processor_state(struct saved_context *ctxt) /* * control registers */ + wrmsrl(MSR_EFER, ctxt->efer); asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8)); asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); diff --git a/include/asm-x86_64/suspend.h b/include/asm-x86_64/suspend.h index bc7f81715e5e..a42306c220d6 100644 --- a/include/asm-x86_64/suspend.h +++ b/include/asm-x86_64/suspend.h @@ -17,6 +17,7 @@ struct saved_context { u16 ds, es, fs, gs, ss; unsigned long gs_base, gs_kernel_base, fs_base; unsigned long cr0, cr2, cr3, cr4, cr8; + unsigned long efer; u16 gdt_pad; u16 gdt_limit; unsigned long gdt_base; -- cgit v1.2.3 From 90b1c2085ea1955641e60a4f0acd63fdc271cd0b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86-64: 64bit PIC SMP trampoline This modifies the SMP trampoline and all of the associated code so it can jump to a 64bit kernel loaded at an arbitrary address. The dependencies on having an idenetity mapped page in the kernel page tables for SMP bootup have all been removed. In addition the trampoline has been modified to verify that long mode is supported. Asking if long mode is implemented is down right silly but we have traditionally had some of these checks, and they can't hurt anything. So when the totally ludicrous happens we just might handle it correctly. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 1 + arch/x86_64/kernel/setup.c | 9 +-- arch/x86_64/kernel/trampoline.S | 168 ++++++++++++++++++++++++++++++++++++---- 3 files changed, 156 insertions(+), 22 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 2b2e2c51e532..562d62fbd69f 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -101,6 +101,7 @@ startup_32: .org 0x100 .globl startup_64 startup_64: +ENTRY(secondary_startup_64) /* We come here either from startup_32 * or directly from a 64bit bootloader. * Since we may have come directly from a bootloader we diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 3d98b696881d..4b114ee31ebc 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -329,15 +329,8 @@ void __init setup_arch(char **cmdline_p) #endif #ifdef CONFIG_SMP - /* - * But first pinch a few for the stack/trampoline stuff - * FIXME: Don't need the extra page at 4K, but need to fix - * trampoline before removing it. (see the GDT stuff) - */ - reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE); - /* Reserve SMP trampoline */ - reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE); + reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE); #endif #ifdef CONFIG_ACPI_SLEEP diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S index c79b99a9e2f6..13eee63c7bb5 100644 --- a/arch/x86_64/kernel/trampoline.S +++ b/arch/x86_64/kernel/trampoline.S @@ -3,6 +3,7 @@ * Trampoline.S Derived from Setup.S by Linus Torvalds * * 4 Jan 1997 Michael Chastain: changed to gnu as. + * 15 Sept 2005 Eric Biederman: 64bit PIC support * * Entry: CS:IP point to the start of our code, we are * in real mode with no stack, but the rest of the @@ -17,15 +18,20 @@ * and IP is zero. Thus, data addresses need to be absolute * (no relocation) and are taken with regard to r_base. * + * With the addition of trampoline_level4_pgt this code can + * now enter a 64bit kernel that lives at arbitrary 64bit + * physical addresses. + * * If you work on this file, check the object module with objdump * --full-contents --reloc to make sure there are no relocation - * entries. For the GDT entry we do hand relocation in smpboot.c - * because of 64bit linker limitations. + * entries. */ #include -#include +#include #include +#include +#include .data @@ -33,15 +39,31 @@ ENTRY(trampoline_data) r_base = . + cli # We should be safe anyway wbinvd mov %cs, %ax # Code and data in the same place mov %ax, %ds + mov %ax, %es + mov %ax, %ss - cli # We should be safe anyway movl $0xA5A5A5A5, trampoline_data - r_base # write marker for master knows we're running + # Setup stack + movw $(trampoline_stack_end - r_base), %sp + + call verify_cpu # Verify the cpu supports long mode + + mov %cs, %ax + movzx %ax, %esi # Find the 32bit trampoline location + shll $4, %esi + + # Fixup the vectors + addl %esi, startup_32_vector - r_base + addl %esi, startup_64_vector - r_base + addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer + /* * GDT tables in non default location kernel can be beyond 16MB and * lgdt will not be able to load the address as in real mode default @@ -49,23 +71,141 @@ r_base = . * to 32 bit. */ - lidtl idt_48 - r_base # load idt with 0, 0 - lgdtl gdt_48 - r_base # load gdt with whatever is appropriate + lidtl tidt - r_base # load idt with 0, 0 + lgdtl tgdt - r_base # load gdt with whatever is appropriate xor %ax, %ax inc %ax # protected mode (PE) bit lmsw %ax # into protected mode - # flaush prefetch and jump to startup_32 in arch/x86_64/kernel/head.S - ljmpl $__KERNEL32_CS, $(startup_32-__START_KERNEL_map) + + # flush prefetch and jump to startup_32 + ljmpl *(startup_32_vector - r_base) + + .code32 + .balign 4 +startup_32: + movl $__KERNEL_DS, %eax # Initialize the %ds segment register + movl %eax, %ds + + xorl %eax, %eax + btsl $5, %eax # Enable PAE mode + movl %eax, %cr4 + + # Setup trampoline 4 level pagetables + leal (trampoline_level4_pgt - r_base)(%esi), %eax + movl %eax, %cr3 + + movl $MSR_EFER, %ecx + movl $(1 << _EFER_LME), %eax # Enable Long Mode + xorl %edx, %edx + wrmsr + + xorl %eax, %eax + btsl $31, %eax # Enable paging and in turn activate Long Mode + btsl $0, %eax # Enable protected mode + movl %eax, %cr0 + + /* + * At this point we're in long mode but in 32bit compatibility mode + * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn + * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use + * the new gdt/idt that has __KERNEL_CS with CS.L = 1. + */ + ljmp *(startup_64_vector - r_base)(%esi) + + .code64 + .balign 4 +startup_64: + # Now jump into the kernel using virtual addresses + movq $secondary_startup_64, %rax + jmp *%rax + + .code16 +verify_cpu: + pushl $0 # Kill any dangerous flags + popfl + + /* minimum CPUID flags for x86-64 */ + /* see http://www.x86-64.org/lists/discuss/msg02971.html */ +#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\ + (1<<13)|(1<<15)|(1<<24)|(1<<25)|(1<<26)) +#define REQUIRED_MASK2 (1<<29) + + pushfl # check for cpuid + popl %eax + movl %eax, %ebx + xorl $0x200000,%eax + pushl %eax + popfl + pushfl + popl %eax + pushl %ebx + popfl + cmpl %eax, %ebx + jz no_longmode + + xorl %eax, %eax # See if cpuid 1 is implemented + cpuid + cmpl $0x1, %eax + jb no_longmode + + movl $0x01, %eax # Does the cpu have what it takes? + cpuid + andl $REQUIRED_MASK1, %edx + xorl $REQUIRED_MASK1, %edx + jnz no_longmode + + movl $0x80000000, %eax # See if extended cpuid is implemented + cpuid + cmpl $0x80000001, %eax + jb no_longmode + + movl $0x80000001, %eax # Does the cpu have what it takes? + cpuid + andl $REQUIRED_MASK2, %edx + xorl $REQUIRED_MASK2, %edx + jnz no_longmode + + ret # The cpu supports long mode + +no_longmode: + hlt + jmp no_longmode + # Careful these need to be in the same 64K segment as the above; -idt_48: +tidt: .word 0 # idt limit = 0 .word 0, 0 # idt base = 0L -gdt_48: - .short GDT_ENTRIES*8 - 1 # gdt limit - .long cpu_gdt_table-__START_KERNEL_map + # Duplicate the global descriptor table + # so the kernel can live anywhere + .balign 4 +tgdt: + .short tgdt_end - tgdt # gdt limit + .long tgdt - r_base + .short 0 + .quad 0x00cf9b000000ffff # __KERNEL32_CS + .quad 0x00af9b000000ffff # __KERNEL_CS + .quad 0x00cf93000000ffff # __KERNEL_DS +tgdt_end: + + .balign 4 +startup_32_vector: + .long startup_32 - r_base + .word __KERNEL32_CS, 0 + + .balign 4 +startup_64_vector: + .long startup_64 - r_base + .word __KERNEL_CS, 0 + +trampoline_stack: + .org 0x1000 +trampoline_stack_end: +ENTRY(trampoline_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 510,8,0 + .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE -.globl trampoline_end -trampoline_end: +ENTRY(trampoline_end) -- cgit v1.2.3 From 7c17e70613d2c70f9d5d0b5d37126fd5e8e9b728 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86-64: Get rid of dead code in suspend resume o Get rid of dead code in wakeup.S o We never restore from saved_gdt, saved_idt, saved_ltd, saved_tss, saved_cr3, saved_cr4, saved_cr0, real_save_gdt, saved_efer, saved_efer2. Get rid of of associated code. o Get rid of bogus_magic, bogus_31_magic and bogus_magic2. No longer being used. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/acpi/wakeup.S | 57 +--------------------------------------- 1 file changed, 1 insertion(+), 56 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index 185faa911db5..6ece70e91a32 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -258,8 +258,6 @@ gdt_48a: .word 0, 0 # gdt base (filled in later) -real_save_gdt: .word 0 - .quad 0 real_magic: .quad 0 video_mode: .quad 0 video_flags: .quad 0 @@ -272,10 +270,6 @@ bogus_32_magic: movb $0xb3,%al ; outb %al,$0x80 jmp bogus_32_magic -bogus_31_magic: - movb $0xb1,%al ; outb %al,$0x80 - jmp bogus_31_magic - bogus_cpu: movb $0xbc,%al ; outb %al,$0x80 jmp bogus_cpu @@ -346,16 +340,6 @@ check_vesaa: _setbada: jmp setbada - .code64 -bogus_magic: - movw $0x0e00 + 'B', %ds:(0xb8018) - jmp bogus_magic - -bogus_magic2: - movw $0x0e00 + '2', %ds:(0xb8018) - jmp bogus_magic2 - - wakeup_stack_begin: # Stack grows down .org 0xff0 @@ -373,28 +357,11 @@ ENTRY(wakeup_end) # # Returned address is location of code in low memory (past data and stack) # + .code64 ENTRY(acpi_copy_wakeup_routine) pushq %rax - pushq %rcx pushq %rdx - sgdt saved_gdt - sidt saved_idt - sldt saved_ldt - str saved_tss - - movq %cr3, %rdx - movq %rdx, saved_cr3 - movq %cr4, %rdx - movq %rdx, saved_cr4 - movq %cr0, %rdx - movq %rdx, saved_cr0 - sgdt real_save_gdt - wakeup_start (,%rdi) - movl $MSR_EFER, %ecx - rdmsr - movl %eax, saved_efer - movl %edx, saved_efer2 - movl saved_video_mode, %edx movl %edx, video_mode - wakeup_start (,%rdi) movl acpi_video_flags, %edx @@ -407,17 +374,8 @@ ENTRY(acpi_copy_wakeup_routine) cmpl $0x9abcdef0, %eax jne bogus_32_magic - # make sure %cr4 is set correctly (features, etc) - movl saved_cr4 - __START_KERNEL_map, %eax - movq %rax, %cr4 - - movl saved_cr0 - __START_KERNEL_map, %eax - movq %rax, %cr0 - jmp 1f # Flush pipelines -1: # restore the regs we used popq %rdx - popq %rcx popq %rax ENTRY(do_suspend_lowlevel_s4bios) ret @@ -512,16 +470,3 @@ ENTRY(saved_eip) .quad 0 ENTRY(saved_esp) .quad 0 ENTRY(saved_magic) .quad 0 - -ALIGN -# saved registers -saved_gdt: .quad 0,0 -saved_idt: .quad 0,0 -saved_ldt: .quad 0 -saved_tss: .quad 0 - -saved_cr0: .quad 0 -saved_cr3: .quad 0 -saved_cr4: .quad 0 -saved_efer: .quad 0 -saved_efer2: .quad 0 -- cgit v1.2.3 From 7db681d7e4038ad205b5face5cf7f7815633e1b5 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86-64: wakeup.S rename registers to reflect right names o Use appropriate names for 64bit regsiters. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/acpi/wakeup.S | 36 ++++++++++++++++++------------------ include/asm-x86_64/suspend.h | 12 ++++++------ 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index 6ece70e91a32..17dbeff64eef 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -211,16 +211,16 @@ wakeup_long64: movw %ax, %es movw %ax, %fs movw %ax, %gs - movq saved_esp, %rsp + movq saved_rsp, %rsp movw $0x0e00 + 'x', %ds:(0xb8018) - movq saved_ebx, %rbx - movq saved_edi, %rdi - movq saved_esi, %rsi - movq saved_ebp, %rbp + movq saved_rbx, %rbx + movq saved_rdi, %rdi + movq saved_rsi, %rsi + movq saved_rbp, %rbp movw $0x0e00 + '!', %ds:(0xb801a) - movq saved_eip, %rax + movq saved_rip, %rax jmp *%rax .code32 @@ -408,13 +408,13 @@ do_suspend_lowlevel: movq %r15, saved_context_r15(%rip) pushfq ; popq saved_context_eflags(%rip) - movq $.L97, saved_eip(%rip) + movq $.L97, saved_rip(%rip) - movq %rsp,saved_esp - movq %rbp,saved_ebp - movq %rbx,saved_ebx - movq %rdi,saved_edi - movq %rsi,saved_esi + movq %rsp,saved_rsp + movq %rbp,saved_rbp + movq %rbx,saved_rbx + movq %rdi,saved_rdi + movq %rsi,saved_rsi addq $8, %rsp movl $3, %edi @@ -461,12 +461,12 @@ do_suspend_lowlevel: .data ALIGN -ENTRY(saved_ebp) .quad 0 -ENTRY(saved_esi) .quad 0 -ENTRY(saved_edi) .quad 0 -ENTRY(saved_ebx) .quad 0 +ENTRY(saved_rbp) .quad 0 +ENTRY(saved_rsi) .quad 0 +ENTRY(saved_rdi) .quad 0 +ENTRY(saved_rbx) .quad 0 -ENTRY(saved_eip) .quad 0 -ENTRY(saved_esp) .quad 0 +ENTRY(saved_rip) .quad 0 +ENTRY(saved_rsp) .quad 0 ENTRY(saved_magic) .quad 0 diff --git a/include/asm-x86_64/suspend.h b/include/asm-x86_64/suspend.h index a42306c220d6..9c3f8de90d2d 100644 --- a/include/asm-x86_64/suspend.h +++ b/include/asm-x86_64/suspend.h @@ -45,12 +45,12 @@ extern unsigned long saved_context_eflags; extern void fix_processor_context(void); #ifdef CONFIG_ACPI_SLEEP -extern unsigned long saved_eip; -extern unsigned long saved_esp; -extern unsigned long saved_ebp; -extern unsigned long saved_ebx; -extern unsigned long saved_esi; -extern unsigned long saved_edi; +extern unsigned long saved_rip; +extern unsigned long saved_rsp; +extern unsigned long saved_rbp; +extern unsigned long saved_rbx; +extern unsigned long saved_rsi; +extern unsigned long saved_rdi; /* routines for saving/restoring kernel state */ extern int acpi_save_state_mem(void); -- cgit v1.2.3 From 275f55170ec2b5d777b070cb8ab9e5d58e65a2a8 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86-64: wakeup.S misc cleanups o Various cleanups. One of the main purpose of cleanups is that make wakeup.S as close as possible to trampoline.S. o Following are the changes - Indentations for comments. - Changed the gdt table to compact form and to resemble the one in trampoline.S - Take the jump to 32bit from real mode using ljmpl. Makes code more readable. - After enabling long mode, directly take a long jump for 64bit mode. No need to take an extra jump to "reach_comaptibility_mode" - Stack is not used after real mode. So don't load stack in 32 bit mode. - No need to enable PGE here. - No need to do extra EFER read, anyway we trash the read contents. - No need to enable system call (EFER_SCE). Anyway it will be enabled when original EFER is restored. - No need to set MP, ET, NE, WP, AM bits in cr0. Very soon we will reload the original cr0 while restroing the processor state. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/acpi/wakeup.S | 112 ++++++++++++++------------------------- 1 file changed, 40 insertions(+), 72 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index 17dbeff64eef..bd4c6f1a6f32 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -30,11 +30,12 @@ wakeup_code: cld # setup data segment movw %cs, %ax - movw %ax, %ds # Make ds:0 point to wakeup_start + movw %ax, %ds # Make ds:0 point to wakeup_start movw %ax, %ss - mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board + # Private stack is needed for ASUS board + mov $(wakeup_stack - wakeup_code), %sp - pushl $0 # Kill any dangerous flags + pushl $0 # Kill any dangerous flags popfl movl real_magic - wakeup_code, %eax @@ -45,7 +46,7 @@ wakeup_code: jz 1f lcall $0xc000,$3 movw %cs, %ax - movw %ax, %ds # Bios might have played with that + movw %ax, %ds # Bios might have played with that movw %ax, %ss 1: @@ -75,9 +76,12 @@ wakeup_code: jmp 1f 1: - .byte 0x66, 0xea # prefix + jmpi-opcode - .long wakeup_32 - __START_KERNEL_map - .word __KERNEL_CS + ljmpl *(wakeup_32_vector - wakeup_code) + + .balign 4 +wakeup_32_vector: + .long wakeup_32 - __START_KERNEL_map + .word __KERNEL32_CS, 0 .code32 wakeup_32: @@ -96,65 +100,50 @@ wakeup_32: jnc bogus_cpu movl %edx,%edi - movw $__KERNEL_DS, %ax - movw %ax, %ds - movw %ax, %es - movw %ax, %fs - movw %ax, %gs + movl $__KERNEL_DS, %eax + movl %eax, %ds - movw $__KERNEL_DS, %ax - movw %ax, %ss - - mov $(wakeup_stack - __START_KERNEL_map), %esp movl saved_magic - __START_KERNEL_map, %eax cmpl $0x9abcdef0, %eax jne bogus_32_magic + movw $0x0e00 + 'i', %ds:(0xb8012) + movb $0xa8, %al ; outb %al, $0x80; + /* * Prepare for entering 64bits mode */ - /* Enable PAE mode and PGE */ + /* Enable PAE */ xorl %eax, %eax btsl $5, %eax - btsl $7, %eax movl %eax, %cr4 /* Setup early boot stage 4 level pagetables */ movl $(wakeup_level4_pgt - __START_KERNEL_map), %eax movl %eax, %cr3 - /* Setup EFER (Extended Feature Enable Register) */ - movl $MSR_EFER, %ecx - rdmsr - /* Fool rdmsr and reset %eax to avoid dependences */ - xorl %eax, %eax /* Enable Long Mode */ + xorl %eax, %eax btsl $_EFER_LME, %eax - /* Enable System Call */ - btsl $_EFER_SCE, %eax - /* No Execute supported? */ + /* No Execute supported? */ btl $20,%edi jnc 1f btsl $_EFER_NX, %eax -1: /* Make changes effective */ +1: movl $MSR_EFER, %ecx + xorl %edx, %edx wrmsr - wbinvd xorl %eax, %eax btsl $31, %eax /* Enable paging and in turn activate Long Mode */ btsl $0, %eax /* Enable protected mode */ - btsl $1, %eax /* Enable MP */ - btsl $4, %eax /* Enable ET */ - btsl $5, %eax /* Enable NE */ - btsl $16, %eax /* Enable WP */ - btsl $18, %eax /* Enable AM */ /* Make changes effective */ movl %eax, %cr0 + /* At this point: CR4.PAE must be 1 CS.L must be 0 @@ -162,11 +151,6 @@ wakeup_32: Next instruction must be a branch This must be on identity-mapped page */ - jmp reach_compatibility_mode -reach_compatibility_mode: - movw $0x0e00 + 'i', %ds:(0xb8012) - movb $0xa8, %al ; outb %al, $0x80; - /* * At this point we're in long mode but in 32bit compatibility mode * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn @@ -174,24 +158,19 @@ reach_compatibility_mode: * the new gdt/idt that has __KERNEL_CS with CS.L = 1. */ - movw $0x0e00 + 'n', %ds:(0xb8014) - movb $0xa9, %al ; outb %al, $0x80 - - /* Load new GDT with the 64bit segment using 32bit descriptor */ - movl $(pGDT32 - __START_KERNEL_map), %eax - lgdt (%eax) - - movl $(wakeup_jumpvector - __START_KERNEL_map), %eax /* Finally jump in 64bit mode */ - ljmp *(%eax) + ljmp *(wakeup_long64_vector - __START_KERNEL_map) -wakeup_jumpvector: - .long wakeup_long64 - __START_KERNEL_map - .word __KERNEL_CS + .balign 4 +wakeup_long64_vector: + .long wakeup_long64 - __START_KERNEL_map + .word __KERNEL_CS, 0 .code64 - /* Hooray, we are in Long 64-bit mode (but still running in low memory) */ + /* Hooray, we are in Long 64-bit mode (but still running in + * low memory) + */ wakeup_long64: /* * We must switch to a new descriptor in kernel space for the GDT @@ -201,6 +180,9 @@ wakeup_long64: */ lgdt cpu_gdt_descr - __START_KERNEL_map + movw $0x0e00 + 'n', %ds:(0xb8014) + movb $0xa9, %al ; outb %al, $0x80 + movw $0x0e00 + 'u', %ds:(0xb8016) nop @@ -227,33 +209,19 @@ wakeup_long64: .align 64 gdta: + /* Its good to keep gdt in sync with one in trampoline.S */ .word 0, 0, 0, 0 # dummy - - .word 0, 0, 0, 0 # unused - - .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) - .word 0 # base address = 0 - .word 0x9B00 # code read/exec. ??? Why I need 0x9B00 (as opposed to 0x9A00 in order for this to work?) - .word 0x00CF # granularity = 4096, 386 - # (+5th nibble of limit) - - .word 0xFFFF # 4Gb - (0x100000*0x1000 = 4Gb) - .word 0 # base address = 0 - .word 0x9200 # data read/write - .word 0x00CF # granularity = 4096, 386 - # (+5th nibble of limit) -# this is 64bit descriptor for code - .word 0xFFFF - .word 0 - .word 0x9A00 # code read/exec - .word 0x00AF # as above, but it is long mode and with D=0 + /* ??? Why I need the accessed bit set in order for this to work? */ + .quad 0x00cf9b000000ffff # __KERNEL32_CS + .quad 0x00af9b000000ffff # __KERNEL_CS + .quad 0x00cf93000000ffff # __KERNEL_DS idt_48a: .word 0 # idt limit = 0 .word 0, 0 # idt base = 0L gdt_48a: - .word 0x8000 # gdt limit=2048, + .word 0x800 # gdt limit=2048, # 256 GDT entries .word 0, 0 # gdt base (filled in later) @@ -263,7 +231,7 @@ video_mode: .quad 0 video_flags: .quad 0 bogus_real_magic: - movb $0xba,%al ; outb %al,$0x80 + movb $0xba,%al ; outb %al,$0x80 jmp bogus_real_magic bogus_32_magic: -- cgit v1.2.3 From d8e1baf10d62c06fc52e89137357e54da3d92672 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86-64: 64bit ACPI wakeup trampoline o Moved wakeup_level4_pgt into the wakeup routine so we can run the kernel above 4G. o Now we first go to 64bit mode and continue to run from trampoline and then then start accessing kernel symbols and restore processor context. This enables us to resume even in relocatable kernel context when kernel might not be loaded at physical addr it has been compiled for. o Removed the need for modifying any existing kernel page table. o Increased the size of the wakeup routine to 8K. This is required as wake page tables are on trampoline itself and they got to be at 4K boundary, hence one page is not sufficient. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/acpi/sleep.c | 24 +++------------- arch/x86_64/kernel/acpi/wakeup.S | 59 +++++++++++++++++++++++++--------------- arch/x86_64/kernel/head.S | 9 ------ 3 files changed, 41 insertions(+), 51 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c index e1548fbe95ae..195b7034a148 100644 --- a/arch/x86_64/kernel/acpi/sleep.c +++ b/arch/x86_64/kernel/acpi/sleep.c @@ -60,19 +60,6 @@ extern char wakeup_start, wakeup_end; extern unsigned long acpi_copy_wakeup_routine(unsigned long); -static pgd_t low_ptr; - -static void init_low_mapping(void) -{ - pgd_t *slot0 = pgd_offset(current->mm, 0UL); - low_ptr = *slot0; - /* FIXME: We're playing with the current task's page tables here, which - * is potentially dangerous on SMP systems. - */ - set_pgd(slot0, *pgd_offset(current->mm, PAGE_OFFSET)); - local_flush_tlb(); -} - /** * acpi_save_state_mem - save kernel state * @@ -81,8 +68,6 @@ static void init_low_mapping(void) */ int acpi_save_state_mem(void) { - init_low_mapping(); - memcpy((void *)acpi_wakeup_address, &wakeup_start, &wakeup_end - &wakeup_start); acpi_copy_wakeup_routine(acpi_wakeup_address); @@ -95,8 +80,6 @@ int acpi_save_state_mem(void) */ void acpi_restore_state_mem(void) { - set_pgd(pgd_offset(current->mm, 0UL), low_ptr); - local_flush_tlb(); } /** @@ -109,10 +92,11 @@ void acpi_restore_state_mem(void) */ void __init acpi_reserve_bootmem(void) { - acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE); - if ((&wakeup_end - &wakeup_start) > PAGE_SIZE) + acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2); + if ((&wakeup_end - &wakeup_start) > (PAGE_SIZE*2)) printk(KERN_CRIT - "ACPI: Wakeup code way too big, will crash on attempt to suspend\n"); + "ACPI: Wakeup code way too big, will crash on attempt" + " to suspend\n"); } static int __init acpi_sleep_setup(char *str) diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index bd4c6f1a6f32..766cfbcac1db 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -1,6 +1,7 @@ .text #include #include +#include #include #include @@ -62,12 +63,15 @@ wakeup_code: movb $0xa2, %al ; outb %al, $0x80 - lidt %ds:idt_48a - wakeup_code - xorl %eax, %eax - movw %ds, %ax # (Convert %ds:gdt to a linear ptr) - shll $4, %eax - addl $(gdta - wakeup_code), %eax - movl %eax, gdt_48a +2 - wakeup_code + mov %ds, %ax # Find 32bit wakeup_code addr + movzx %ax, %esi # (Convert %ds:gdt to a liner ptr) + shll $4, %esi + # Fix up the vectors + addl %esi, wakeup_32_vector - wakeup_code + addl %esi, wakeup_long64_vector - wakeup_code + addl %esi, gdt_48a + 2 - wakeup_code # Fixup the gdt pointer + + lidtl %ds:idt_48a - wakeup_code lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is # appropriate @@ -80,7 +84,7 @@ wakeup_code: .balign 4 wakeup_32_vector: - .long wakeup_32 - __START_KERNEL_map + .long wakeup_32 - wakeup_code .word __KERNEL32_CS, 0 .code32 @@ -103,10 +107,6 @@ wakeup_32: movl $__KERNEL_DS, %eax movl %eax, %ds - movl saved_magic - __START_KERNEL_map, %eax - cmpl $0x9abcdef0, %eax - jne bogus_32_magic - movw $0x0e00 + 'i', %ds:(0xb8012) movb $0xa8, %al ; outb %al, $0x80; @@ -120,7 +120,7 @@ wakeup_32: movl %eax, %cr4 /* Setup early boot stage 4 level pagetables */ - movl $(wakeup_level4_pgt - __START_KERNEL_map), %eax + leal (wakeup_level4_pgt - wakeup_code)(%esi), %eax movl %eax, %cr3 /* Enable Long Mode */ @@ -159,11 +159,11 @@ wakeup_32: */ /* Finally jump in 64bit mode */ - ljmp *(wakeup_long64_vector - __START_KERNEL_map) + ljmp *(wakeup_long64_vector - wakeup_code)(%esi) .balign 4 wakeup_long64_vector: - .long wakeup_long64 - __START_KERNEL_map + .long wakeup_long64 - wakeup_code .word __KERNEL_CS, 0 .code64 @@ -178,11 +178,16 @@ wakeup_long64: * addresses where we're currently running on. We have to do that here * because in 32bit we couldn't load a 64bit linear address. */ - lgdt cpu_gdt_descr - __START_KERNEL_map + lgdt cpu_gdt_descr movw $0x0e00 + 'n', %ds:(0xb8014) movb $0xa9, %al ; outb %al, $0x80 + movq saved_magic, %rax + movq $0x123456789abcdef0, %rdx + cmpq %rdx, %rax + jne bogus_64_magic + movw $0x0e00 + 'u', %ds:(0xb8016) nop @@ -223,20 +228,21 @@ idt_48a: gdt_48a: .word 0x800 # gdt limit=2048, # 256 GDT entries - .word 0, 0 # gdt base (filled in later) - + .long gdta - wakeup_code # gdt base (relocated in later) real_magic: .quad 0 video_mode: .quad 0 video_flags: .quad 0 +.code16 bogus_real_magic: movb $0xba,%al ; outb %al,$0x80 jmp bogus_real_magic -bogus_32_magic: +.code64 +bogus_64_magic: movb $0xb3,%al ; outb %al,$0x80 - jmp bogus_32_magic + jmp bogus_64_magic bogus_cpu: movb $0xbc,%al ; outb %al,$0x80 @@ -263,6 +269,7 @@ bogus_cpu: #define VIDEO_FIRST_V7 0x0900 # Setting of user mode (AX=mode ID) => CF=success +.code16 mode_seta: movw %ax, %bx #if 0 @@ -313,6 +320,13 @@ wakeup_stack_begin: # Stack grows down .org 0xff0 wakeup_stack: # Just below end of page +.org 0x1000 +ENTRY(wakeup_level4_pgt) + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 510,8,0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE + ENTRY(wakeup_end) ## @@ -338,9 +352,10 @@ ENTRY(acpi_copy_wakeup_routine) movq $0x123456789abcdef0, %rdx movq %rdx, saved_magic - movl saved_magic - __START_KERNEL_map, %eax - cmpl $0x9abcdef0, %eax - jne bogus_32_magic + movq saved_magic, %rax + movq $0x123456789abcdef0, %rdx + cmpq %rdx, %rax + jne bogus_64_magic # restore the regs we used popq %rdx diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 562d62fbd69f..926aa2197aaa 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -308,15 +308,6 @@ NEXT_PAGE(level2_kernel_pgt) .data -#ifdef CONFIG_ACPI_SLEEP - .align PAGE_SIZE -ENTRY(wakeup_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 510,8,0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE -#endif - #ifndef CONFIG_HOTPLUG_CPU __INITDATA #endif -- cgit v1.2.3 From bdb96a6614cfaba24e23dd9de4040c068c3af19b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86-64: Modify discover_ebda to use virtual addresses Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/setup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 4b114ee31ebc..65e2bc551a2b 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -205,10 +205,10 @@ static void discover_ebda(void) * there is a real-mode segmented pointer pointing to the * 4K EBDA area at 0x40E */ - ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER; + ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); ebda_addr <<= 4; - ebda_size = *(unsigned short *)(unsigned long)ebda_addr; + ebda_size = *(unsigned short *)__va(ebda_addr); /* Round EBDA up to pages */ if (ebda_size == 0) -- cgit v1.2.3 From cfd243d4af7c7f8f52f5cb99d3932d9074b039ff Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86-64: Remove the identity mapping as early as possible With the rewrite of the SMP trampoline and the early page allocator there is nothing that needs identity mapped pages, once we start executing C code. So add zap_identity_mappings into head64.c and remove zap_low_mappings() from much later in the code. The functions are subtly different thus the name change. This also kills boot_level4_pgt which was from an earlier attempt to move the identity mappings as early as possible, and is now no longer needed. Essentially I have replaced boot_level4_pgt with trampoline_level4_pgt in trampoline.S Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 39 ++++++++++++++------------------------- arch/x86_64/kernel/head64.c | 17 +++++++++++------ arch/x86_64/kernel/setup.c | 2 -- arch/x86_64/kernel/setup64.c | 1 - arch/x86_64/mm/init.c | 24 ------------------------ include/asm-x86_64/pgtable.h | 1 - include/asm-x86_64/proto.h | 2 -- 7 files changed, 25 insertions(+), 61 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 926aa2197aaa..c211e52f1333 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -71,7 +71,7 @@ startup_32: movl %eax, %cr4 /* Setup early boot stage 4 level pagetables */ - movl $(boot_level4_pgt - __START_KERNEL_map), %eax + movl $(init_level4_pgt - __START_KERNEL_map), %eax movl %eax, %cr3 /* Setup EFER (Extended Feature Enable Register) */ @@ -115,7 +115,7 @@ ENTRY(secondary_startup_64) movq %rax, %cr4 /* Setup early boot stage 4 level pagetables. */ - movq $(boot_level4_pgt - __START_KERNEL_map), %rax + movq $(init_level4_pgt - __START_KERNEL_map), %rax movq %rax, %cr3 /* Check if nx is implemented */ @@ -274,9 +274,19 @@ ENTRY(name) i = i + 1 ; \ .endr + /* + * This default setting generates an ident mapping at address 0x100000 + * and a mapping for the kernel that precisely maps virtual address + * 0xffffffff80000000 to physical address 0x000000. (always using + * 2Mbyte large pages provided by PAE mode) + */ NEXT_PAGE(init_level4_pgt) - /* This gets initialized in x86_64_start_kernel */ - .fill 512,8,0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 257,8,0 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE + .fill 252,8,0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE @@ -307,27 +317,6 @@ NEXT_PAGE(level2_kernel_pgt) #undef NEXT_PAGE .data - -#ifndef CONFIG_HOTPLUG_CPU - __INITDATA -#endif - /* - * This default setting generates an ident mapping at address 0x100000 - * and a mapping for the kernel that precisely maps virtual address - * 0xffffffff80000000 to physical address 0x000000. (always using - * 2Mbyte large pages provided by PAE mode) - */ - .align PAGE_SIZE -ENTRY(boot_level4_pgt) - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 257,8,0 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 252,8,0 - /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE - - .data - .align 16 .globl cpu_gdt_descr cpu_gdt_descr: diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 5c529c1e3d6c..6c34bdd22e26 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -18,8 +18,16 @@ #include #include #include +#include #include +static void __init zap_identity_mappings(void) +{ + pgd_t *pgd = pgd_offset_k(0UL); + pgd_clear(pgd); + __flush_tlb(); +} + /* Don't add a printk in there. printk relies on the PDA which is not initialized yet. */ static void __init clear_bss(void) @@ -57,18 +65,15 @@ void __init x86_64_start_kernel(char * real_mode_data) /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); + /* Make NULL pointers segfault */ + zap_identity_mappings(); + for (i = 0; i < IDT_ENTRIES; i++) set_intr_gate(i, early_idt_handler); asm volatile("lidt %0" :: "m" (idt_descr)); early_printk("Kernel alive\n"); - /* - * switch to init_level4_pgt from boot_level4_pgt - */ - memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t)); - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); - for (i = 0; i < NR_CPUS; i++) cpu_pda(i) = &boot_cpu_pda[i]; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 65e2bc551a2b..0e2b8df0ea64 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -274,8 +274,6 @@ void __init setup_arch(char **cmdline_p) dmi_scan_machine(); - zap_low_mappings(0); - #ifdef CONFIG_ACPI /* * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 6a70b55f719d..53064a9a365f 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -201,7 +201,6 @@ void __cpuinit cpu_init (void) /* CPU 0 is initialised in head64.c */ if (cpu != 0) { pda_init(cpu); - zap_low_mappings(cpu); } else estacks = boot_exception_stacks; diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 4ab3d40aac90..b0a607892183 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -378,21 +378,6 @@ void __meminit init_memory_mapping(unsigned long start, unsigned long end) __flush_tlb_all(); } -void __cpuinit zap_low_mappings(int cpu) -{ - if (cpu == 0) { - pgd_t *pgd = pgd_offset_k(0UL); - pgd_clear(pgd); - } else { - /* - * For AP's, zap the low identity mappings by changing the cr3 - * to init_level4_pgt and doing local flush tlb all - */ - asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt))); - } - __flush_tlb_all(); -} - #ifndef CONFIG_NUMA void __init paging_init(void) { @@ -569,15 +554,6 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10); - -#ifdef CONFIG_SMP - /* - * Sync boot_level4_pgt mappings with the init_level4_pgt - * except for the low identity mappings which are already zapped - * in init_level4_pgt. This sync-up is essential for AP's bringup - */ - memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t)); -#endif } void free_init_pages(char *what, unsigned long begin, unsigned long end) diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 5a5d43b3ef5d..703f0243f27a 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -17,7 +17,6 @@ extern pud_t level3_kernel_pgt[512]; extern pud_t level3_ident_pgt[512]; extern pmd_t level2_kernel_pgt[512]; extern pgd_t init_level4_pgt[]; -extern pgd_t boot_level4_pgt[]; extern unsigned long __supported_pte_mask; #define swapper_pg_dir init_level4_pgt diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 78427021d94a..3f8f285138d2 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -11,8 +11,6 @@ struct pt_regs; extern void start_kernel(void); extern void pda_init(int); -extern void zap_low_mappings(int cpu); - extern void early_idt_handler(void); extern void mcheck_init(struct cpuinfo_x86 *c); -- cgit v1.2.3 From 49c3df6aaa6a51071fc135273d1a2515d019099f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86: Move swsusp __pa() dependent code to arch portion o __pa() should be used only on kernel linearly mapped virtual addresses and not on kernel text and data addresses. o Hibernation code needs to determine the physical address associated with kernel symbol to mark a section boundary which contains pages which don't have to be saved and restored during hibernate/resume operation. o Move this piece of code in arch dependent section. So that architectures which don't have kernel text/data mapped into kernel linearly mapped region can come up with their own ways of determining physical addresses associated with a kernel text. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/power/suspend.c | 14 ++++++++++++++ arch/powerpc/kernel/Makefile | 1 + arch/powerpc/kernel/suspend.c | 24 ++++++++++++++++++++++++ arch/x86_64/kernel/suspend.c | 14 ++++++++++++++ kernel/power/power.h | 5 ++--- kernel/power/snapshot.c | 11 ----------- 6 files changed, 55 insertions(+), 14 deletions(-) create mode 100644 arch/powerpc/kernel/suspend.c (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/power/suspend.c b/arch/i386/power/suspend.c index db5e98d2eb73..a0020b913f31 100644 --- a/arch/i386/power/suspend.c +++ b/arch/i386/power/suspend.c @@ -16,6 +16,9 @@ /* Defined in arch/i386/power/swsusp.S */ extern int restore_image(void); +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + /* Pointer to the temporary resume page tables */ pgd_t *resume_pg_dir; @@ -156,3 +159,14 @@ int swsusp_arch_resume(void) restore_image(); return 0; } + +/* + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index e0fa80eca366..aa693d0f151a 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_6xx) += idle_6xx.o l2cr_6xx.o cpu_setup_6xx.o obj-$(CONFIG_TAU) += tau_6xx.o obj32-$(CONFIG_SOFTWARE_SUSPEND) += swsusp_32.o +obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o obj32-$(CONFIG_MODULES) += module_32.o ifeq ($(CONFIG_PPC_MERGE),y) diff --git a/arch/powerpc/kernel/suspend.c b/arch/powerpc/kernel/suspend.c new file mode 100644 index 000000000000..8cee57107541 --- /dev/null +++ b/arch/powerpc/kernel/suspend.c @@ -0,0 +1,24 @@ +/* + * Suspend support specific for power. + * + * Distribute under GPLv2 + * + * Copyright (c) 2002 Pavel Machek + * Copyright (c) 2001 Patrick Mochel + */ + +#include + +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + +/* + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index fe865ea4df52..4ca523d58a5b 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -13,6 +13,9 @@ #include #include +/* References to section boundaries */ +extern const void __nosave_begin, __nosave_end; + struct saved_context saved_context; unsigned long saved_context_eax, saved_context_ebx, saved_context_ecx, saved_context_edx; @@ -220,4 +223,15 @@ int swsusp_arch_resume(void) restore_image(); return 0; } + +/* + * pfn_is_nosave - check if given pfn is in the 'nosave' section + */ + +int pfn_is_nosave(unsigned long pfn) +{ + unsigned long nosave_begin_pfn = __pa_symbol(&__nosave_begin) >> PAGE_SHIFT; + unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; + return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); +} #endif /* CONFIG_SOFTWARE_SUSPEND */ diff --git a/kernel/power/power.h b/kernel/power/power.h index eb461b816bf4..1c6eef8df4ad 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -23,6 +23,8 @@ static inline int pm_suspend_disk(void) } #endif +extern int pfn_is_nosave(unsigned long); + extern struct mutex pm_mutex; #define power_attr(_name) \ @@ -37,9 +39,6 @@ static struct subsys_attribute _name##_attr = { \ extern struct subsystem power_subsys; -/* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; - /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; extern int in_suspend; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index fc53ad068128..704c25a3ffec 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -650,17 +650,6 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } static inline unsigned int count_highmem_pages(void) { return 0; } #endif /* CONFIG_HIGHMEM */ -/** - * pfn_is_nosave - check if given pfn is in the 'nosave' section - */ - -static inline int pfn_is_nosave(unsigned long pfn) -{ - unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; - unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; - return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); -} - /** * saveable - Determine whether a non-highmem page should be included in * the suspend image. -- cgit v1.2.3 From 0dbf7028c0c1f266c9631139450a1502d3cd457e Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86: __pa and __pa_symbol address space separation Currently __pa_symbol is for use with symbols in the kernel address map and __pa is for use with pointers into the physical memory map. But the code is implemented so you can usually interchange the two. __pa which is much more common can be implemented much more cheaply if it is it doesn't have to worry about any other kernel address spaces. This is especially true with a relocatable kernel as __pa_symbol needs to peform an extra variable read to resolve the address. There is a third macro that is added for the vsyscall data __pa_vsymbol for finding the physical addesses of vsyscall pages. Most of this patch is simply sorting through the references to __pa or __pa_symbol and using the proper one. A little of it is continuing to use a physical address when we have it instead of recalculating it several times. swapper_pgd is now NULL. leave_mm now uses init_mm.pgd and init_mm.pgd is initialized at boot (instead of compile time) to the physmem virtual mapping of init_level4_pgd. The physical address changed. Except for the for EMPTY_ZERO page all of the remaining references to __pa_symbol appear to be during kernel initialization. So this should reduce the cost of __pa in the common case, even on a relocated kernel. As this is technically a semantic change we need to be on the lookout for anything I missed. But it works for me (tm). Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/i386/kernel/alternative.c | 4 ++-- arch/i386/mm/init.c | 15 ++++++++------- arch/x86_64/kernel/machine_kexec.c | 14 +++++++------- arch/x86_64/kernel/setup.c | 9 +++++---- arch/x86_64/kernel/smp.c | 2 +- arch/x86_64/kernel/vsyscall.c | 9 +++++++-- arch/x86_64/mm/init.c | 21 +++++++++++---------- arch/x86_64/mm/pageattr.c | 16 ++++++++-------- include/asm-x86_64/page.h | 6 ++---- include/asm-x86_64/pgtable.h | 4 ++-- 10 files changed, 53 insertions(+), 47 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/kernel/alternative.c b/arch/i386/kernel/alternative.c index 426f59b0106b..a27c8d347364 100644 --- a/arch/i386/kernel/alternative.c +++ b/arch/i386/kernel/alternative.c @@ -402,8 +402,8 @@ void __init alternative_instructions(void) _text, _etext); } free_init_pages("SMP alternatives", - (unsigned long)__smp_alt_begin, - (unsigned long)__smp_alt_end); + __pa_symbol(&__smp_alt_begin), + __pa_symbol(&__smp_alt_end)); } else { alternatives_smp_save(__smp_alt_instructions, __smp_alt_instructions_end); diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index ae436882af7a..23be1b0aafa4 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -774,10 +774,11 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) unsigned long addr; for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); + struct page *page = pfn_to_page(addr >> PAGE_SHIFT); + ClearPageReserved(page); + init_page_count(page); + memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE); + __free_page(page); totalram_pages++; } printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); @@ -786,14 +787,14 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) void free_initmem(void) { free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); + __pa_symbol(&__init_begin), + __pa_symbol(&__init_end)); } #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_init_pages("initrd memory", start, end); + free_init_pages("initrd memory", __pa(start), __pa(end)); } #endif diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c index 0497e3bd5bff..a8bb33c1a8f2 100644 --- a/arch/x86_64/kernel/machine_kexec.c +++ b/arch/x86_64/kernel/machine_kexec.c @@ -191,19 +191,19 @@ NORET_TYPE void machine_kexec(struct kimage *image) page_list[PA_CONTROL_PAGE] = __pa(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; - page_list[PA_PGD] = __pa(kexec_pgd); + page_list[PA_PGD] = __pa_symbol(&kexec_pgd); page_list[VA_PGD] = (unsigned long)kexec_pgd; - page_list[PA_PUD_0] = __pa(kexec_pud0); + page_list[PA_PUD_0] = __pa_symbol(&kexec_pud0); page_list[VA_PUD_0] = (unsigned long)kexec_pud0; - page_list[PA_PMD_0] = __pa(kexec_pmd0); + page_list[PA_PMD_0] = __pa_symbol(&kexec_pmd0); page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; - page_list[PA_PTE_0] = __pa(kexec_pte0); + page_list[PA_PTE_0] = __pa_symbol(&kexec_pte0); page_list[VA_PTE_0] = (unsigned long)kexec_pte0; - page_list[PA_PUD_1] = __pa(kexec_pud1); + page_list[PA_PUD_1] = __pa_symbol(&kexec_pud1); page_list[VA_PUD_1] = (unsigned long)kexec_pud1; - page_list[PA_PMD_1] = __pa(kexec_pmd1); + page_list[PA_PMD_1] = __pa_symbol(&kexec_pmd1); page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; - page_list[PA_PTE_1] = __pa(kexec_pte1); + page_list[PA_PTE_1] = __pa_symbol(&kexec_pte1); page_list[VA_PTE_1] = (unsigned long)kexec_pte1; page_list[PA_TABLE_PAGE] = diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 0e2b8df0ea64..b9bdfc1b54f8 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -243,11 +243,12 @@ void __init setup_arch(char **cmdline_p) init_mm.end_code = (unsigned long) &_etext; init_mm.end_data = (unsigned long) &_edata; init_mm.brk = (unsigned long) &_end; + init_mm.pgd = __va(__pa_symbol(&init_level4_pgt)); - code_resource.start = virt_to_phys(&_text); - code_resource.end = virt_to_phys(&_etext)-1; - data_resource.start = virt_to_phys(&_etext); - data_resource.end = virt_to_phys(&_edata)-1; + code_resource.start = __pa_symbol(&_text); + code_resource.end = __pa_symbol(&_etext)-1; + data_resource.start = __pa_symbol(&_etext); + data_resource.end = __pa_symbol(&_edata)-1; early_identify_cpu(&boot_cpu_data); diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index bd1d123947ce..22abae4e9f39 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -76,7 +76,7 @@ static inline void leave_mm(int cpu) if (read_pda(mmu_state) == TLBSTATE_OK) BUG(); cpu_clear(cpu, read_pda(active_mm)->cpu_vm_mask); - load_cr3(swapper_pg_dir); + load_cr3(init_mm.pgd); } /* diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index b43c698cf7d3..d14cbb3e0ebe 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -45,6 +45,11 @@ #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","rcx","memory" +#define __pa_vsymbol(x) \ + ({unsigned long v; \ + extern char __vsyscall_0; \ + asm("" : "=r" (v) : "0" (x)); \ + ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); }) struct vsyscall_gtod_data_t { seqlock_t lock; @@ -224,10 +229,10 @@ static int vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, return ret; /* gcc has some trouble with __va(__pa()), so just do it this way. */ - map1 = ioremap(__pa_symbol(&vsysc1), 2); + map1 = ioremap(__pa_vsymbol(&vsysc1), 2); if (!map1) return -ENOMEM; - map2 = ioremap(__pa_symbol(&vsysc2), 2); + map2 = ioremap(__pa_vsymbol(&vsysc2), 2); if (!map2) { ret = -ENOMEM; goto out; diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index b0a607892183..69e22d3c9238 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -565,11 +565,11 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(virt_to_page(addr)); - init_page_count(virt_to_page(addr)); - memset((void *)(addr & ~(PAGE_SIZE-1)), - POISON_FREE_INITMEM, PAGE_SIZE); - free_page(addr); + struct page *page = pfn_to_page(addr >> PAGE_SHIFT); + ClearPageReserved(page); + init_page_count(page); + memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE); + __free_page(page); totalram_pages++; } } @@ -579,17 +579,18 @@ void free_initmem(void) memset(__initdata_begin, POISON_FREE_INITDATA, __initdata_end - __initdata_begin); free_init_pages("unused kernel memory", - (unsigned long)(&__init_begin), - (unsigned long)(&__init_end)); + __pa_symbol(&__init_begin), + __pa_symbol(&__init_end)); } #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void) { - unsigned long addr = (unsigned long)__start_rodata; + unsigned long addr = (unsigned long)__va(__pa_symbol(&__start_rodata)); + unsigned long end = (unsigned long)__va(__pa_symbol(&__end_rodata)); - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) + for (; addr < end; addr += PAGE_SIZE) change_page_attr_addr(addr, 1, PAGE_KERNEL_RO); printk ("Write protecting the kernel read-only data: %luk\n", @@ -608,7 +609,7 @@ void mark_rodata_ro(void) #ifdef CONFIG_BLK_DEV_INITRD void free_initrd_mem(unsigned long start, unsigned long end) { - free_init_pages("initrd memory", start, end); + free_init_pages("initrd memory", __pa(start), __pa(end)); } #endif diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 081409aa3452..76ee90a5abe0 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -51,7 +51,6 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, SetPagePrivate(base); page_private(base) = 0; - address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { @@ -101,13 +100,12 @@ static inline void save_page(struct page *fpage) * No more special protections in this 2/4MB area - revert to a * large page again. */ -static void revert_page(unsigned long address, pgprot_t ref_prot) +static void revert_page(unsigned long address, unsigned long pfn, pgprot_t ref_prot) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t large_pte; - unsigned long pfn; pgd = pgd_offset_k(address); BUG_ON(pgd_none(*pgd)); @@ -115,7 +113,6 @@ static void revert_page(unsigned long address, pgprot_t ref_prot) BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, address); BUG_ON(pmd_val(*pmd) & _PAGE_PSE); - pfn = (__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT; large_pte = pfn_pte(pfn, ref_prot); large_pte = pte_mkhuge(large_pte); set_pte((pte_t *)pmd, large_pte); @@ -141,7 +138,8 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, */ struct page *split; ref_prot2 = pte_pgprot(pte_clrhuge(*kpte)); - split = split_large_page(address, prot, ref_prot2); + split = split_large_page(pfn << PAGE_SHIFT, prot, + ref_prot2); if (!split) return -ENOMEM; set_pte(kpte, mk_pte(split, ref_prot2)); @@ -160,7 +158,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, if (page_private(kpte_page) == 0) { save_page(kpte_page); - revert_page(address, ref_prot); + revert_page(address, pfn, ref_prot); } return 0; } @@ -180,6 +178,7 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, */ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) { + unsigned long phys_base_pfn = __pa_symbol(__START_KERNEL_map) >> PAGE_SHIFT; int err = 0; int i; @@ -192,10 +191,11 @@ int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) break; /* Handle kernel mapping too which aliases part of the * lowmem */ - if (__pa(address) < KERNEL_TEXT_SIZE) { + if ((pfn >= phys_base_pfn) && + ((pfn - phys_base_pfn) < (KERNEL_TEXT_SIZE >> PAGE_SHIFT))) { unsigned long addr2; pgprot_t prot2; - addr2 = __START_KERNEL_map + __pa(address); + addr2 = __START_KERNEL_map + ((pfn - phys_base_pfn) << PAGE_SHIFT); /* Make sure the kernel mappings stay executable */ prot2 = pte_pgprot(pte_mkexec(pfn_pte(0, prot))); err = __change_page_attr(addr2, pfn, prot2, diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index d554b94485df..4974433bbf34 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -102,17 +102,15 @@ typedef struct { unsigned long pgprot; } pgprot_t; /* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol. Otherwise you risk miscompilation. */ -#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET) +#define __pa(x) ((unsigned long)(x) - PAGE_OFFSET) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ #define __pa_symbol(x) \ ({unsigned long v; \ asm("" : "=r" (v) : "0" (x)); \ - __pa(v); }) + (v - __START_KERNEL_map); }) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) -#define __boot_va(x) __va(x) -#define __boot_pa(x) __pa(x) #ifdef CONFIG_FLATMEM #define pfn_valid(pfn) ((pfn) < end_pfn) #endif diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h index 703f0243f27a..c1865e38c7b7 100644 --- a/include/asm-x86_64/pgtable.h +++ b/include/asm-x86_64/pgtable.h @@ -19,7 +19,7 @@ extern pmd_t level2_kernel_pgt[512]; extern pgd_t init_level4_pgt[]; extern unsigned long __supported_pte_mask; -#define swapper_pg_dir init_level4_pgt +#define swapper_pg_dir ((pgd_t *)NULL) extern void paging_init(void); extern void clear_kernel_mapping(unsigned long addr, unsigned long size); @@ -29,7 +29,7 @@ extern void clear_kernel_mapping(unsigned long addr, unsigned long size); * for zero-mapped memory areas etc.. */ extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) +#define ZERO_PAGE(vaddr) (pfn_to_page(__pa_symbol(&empty_zero_page) >> PAGE_SHIFT)) #endif /* !__ASSEMBLY__ */ -- cgit v1.2.3 From 1ab60e0f72f71ec54831e525a3e1154f1c092408 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:07 +0200 Subject: [PATCH] x86-64: Relocatable Kernel Support This patch modifies the x86_64 kernel so that it can be loaded and run at any 2M aligned address, below 512G. The technique used is to compile the decompressor with -fPIC and modify it so the decompressor is fully relocatable. For the main kernel the page tables are modified so the kernel remains at the same virtual address. In addition a variable phys_base is kept that holds the physical address the kernel is loaded at. __pa_symbol is modified to add that when we take the address of a kernel symbol. When loaded with a normal bootloader the decompressor will decompress the kernel to 2M and it will run there. This both ensures the relocation code is always working, and makes it easier to use 2M pages for the kernel and the cpu. AK: changed to not make RELOCATABLE default in Kconfig Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/Kconfig | 49 ++++- arch/x86_64/boot/compressed/Makefile | 12 +- arch/x86_64/boot/compressed/head.S | 322 +++++++++++++++++++++++--------- arch/x86_64/boot/compressed/misc.c | 247 ++++++++++++------------ arch/x86_64/boot/compressed/vmlinux.lds | 44 +++++ arch/x86_64/boot/compressed/vmlinux.scr | 9 +- arch/x86_64/kernel/head.S | 233 +++++++++++++---------- arch/x86_64/kernel/suspend_asm.S | 7 +- include/asm-x86_64/page.h | 6 +- 9 files changed, 597 insertions(+), 332 deletions(-) create mode 100644 arch/x86_64/boot/compressed/vmlinux.lds (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index b3dbf11eb82c..715632026073 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -565,23 +565,56 @@ config CRASH_DUMP PHYSICAL_START. For more details see Documentation/kdump/kdump.txt +config RELOCATABLE + bool "Build a relocatable kernel(EXPERIMENTAL)" + depends on EXPERIMENTAL + help + Builds a relocatable kernel. This enables loading and running + a kernel binary from a different physical address than it has + been compiled for. + + One use is for the kexec on panic case where the recovery kernel + must live at a different physical address than the primary + kernel. + + Note: If CONFIG_RELOCATABLE=y, then kernel run from the address + it has been loaded at and compile time physical address + (CONFIG_PHYSICAL_START) is ignored. + config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) - default "0x1000000" if CRASH_DUMP default "0x200000" help - This gives the physical address where the kernel is loaded. Normally - for regular kernels this value is 0x200000 (2MB). But in the case - of kexec on panic the fail safe kernel needs to run at a different - address than the panic-ed kernel. This option is used to set the load - address for kernels used to capture crash dump on being kexec'ed - after panic. The default value for crash dump kernels is - 0x1000000 (16MB). This can also be set based on the "X" value as + This gives the physical address where the kernel is loaded. It + should be aligned to 2MB boundary. + + If kernel is a not relocatable (CONFIG_RELOCATABLE=n) then + bzImage will decompress itself to above physical address and + run from there. Otherwise, bzImage will run from the address where + it has been loaded by the boot loader and will ignore above physical + address. + + In normal kdump cases one does not have to set/change this option + as now bzImage can be compiled as a completely relocatable image + (CONFIG_RELOCATABLE=y) and be used to load and run from a different + address. This option is mainly useful for the folks who don't want + to use a bzImage for capturing the crash dump and want to use a + vmlinux instead. + + So if you are using bzImage for capturing the crash dump, leave + the value here unchanged to 0x200000 and set CONFIG_RELOCATABLE=y. + Otherwise if you plan to use vmlinux for capturing the crash dump + change this value to start of the reserved region (Typically 16MB + 0x1000000). In other words, it can be set based on the "X" value as specified in the "crashkernel=YM@XM" command line boot parameter passed to the panic-ed kernel. Typically this parameter is set as crashkernel=64M@16M. Please take a look at Documentation/kdump/kdump.txt for more details about crash dumps. + Usage of bzImage for capturing the crash dump is advantageous as + one does not have to build two kernels. Same kernel can be used + as production kernel and capture kernel. + Don't change this unless you know what you are doing. config SECCOMP diff --git a/arch/x86_64/boot/compressed/Makefile b/arch/x86_64/boot/compressed/Makefile index e70fa6e1da08..705a3e33d7e1 100644 --- a/arch/x86_64/boot/compressed/Makefile +++ b/arch/x86_64/boot/compressed/Makefile @@ -8,16 +8,14 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz head.o misc.o piggy.o EXTRA_AFLAGS := -traditional -AFLAGS := $(subst -m64,-m32,$(AFLAGS)) # cannot use EXTRA_CFLAGS because base CFLAGS contains -mkernel which conflicts with # -m32 -CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing -LDFLAGS := -m elf_i386 +CFLAGS := -m64 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing -fPIC -mcmodel=small -fno-builtin +LDFLAGS := -m elf_x86_64 -LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup_32 -m elf_i386 - -$(obj)/vmlinux: $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE +LDFLAGS_vmlinux := -T +$(obj)/vmlinux: $(src)/vmlinux.lds $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o FORCE $(call if_changed,ld) @: @@ -27,7 +25,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE $(call if_changed,gzip) -LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T +LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE $(call if_changed,ld) diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S index 6f55565e4d42..c353a9266ea4 100644 --- a/arch/x86_64/boot/compressed/head.S +++ b/arch/x86_64/boot/compressed/head.S @@ -26,116 +26,262 @@ #include #include +#include #include +#include +.section ".text.head" .code32 .globl startup_32 - + startup_32: cld cli - movl $(__KERNEL_DS),%eax - movl %eax,%ds - movl %eax,%es - movl %eax,%fs - movl %eax,%gs - - lss stack_start,%esp - xorl %eax,%eax -1: incl %eax # check that A20 really IS enabled - movl %eax,0x000000 # loop forever if it isn't - cmpl %eax,0x100000 - je 1b + movl $(__KERNEL_DS), %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + +/* Calculate the delta between where we were compiled to run + * at and where we were actually loaded at. This can only be done + * with a short local call on x86. Nothing else will tell us what + * address we are running at. The reserved chunk of the real-mode + * data at 0x34-0x3f are used as the stack for this calculation. + * Only 4 bytes are needed. + */ + leal 0x40(%esi), %esp + call 1f +1: popl %ebp + subl $1b, %ebp + +/* Compute the delta between where we were compiled to run at + * and where the code will actually run at. + */ +/* %ebp contains the address we are loaded at by the boot loader and %ebx + * contains the address where we should move the kernel image temporarily + * for safe in-place decompression. + */ + +#ifdef CONFIG_RELOCATABLE + movl %ebp, %ebx + addl $(LARGE_PAGE_SIZE -1), %ebx + andl $LARGE_PAGE_MASK, %ebx +#else + movl $CONFIG_PHYSICAL_START, %ebx +#endif + + /* Replace the compressed data size with the uncompressed size */ + subl input_len(%ebp), %ebx + movl output_len(%ebp), %eax + addl %eax, %ebx + /* Add 8 bytes for every 32K input block */ + shrl $12, %eax + addl %eax, %ebx + /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ + addl $(32768 + 18 + 4095), %ebx + andl $~4095, %ebx /* - * Initialize eflags. Some BIOS's leave bits like NT set. This would - * confuse the debugger if this code is traced. - * XXX - best to initialize before switching to protected mode. + * Prepare for entering 64 bit mode */ - pushl $0 - popfl + + /* Load new GDT with the 64bit segments using 32bit descriptor */ + leal gdt(%ebp), %eax + movl %eax, gdt+2(%ebp) + lgdt gdt(%ebp) + + /* Enable PAE mode */ + xorl %eax, %eax + orl $(1 << 5), %eax + movl %eax, %cr4 + + /* + * Build early 4G boot pagetable + */ + /* Initialize Page tables to 0*/ + leal pgtable(%ebx), %edi + xorl %eax, %eax + movl $((4096*6)/4), %ecx + rep stosl + + /* Build Level 4 */ + leal pgtable + 0(%ebx), %edi + leal 0x1007 (%edi), %eax + movl %eax, 0(%edi) + + /* Build Level 3 */ + leal pgtable + 0x1000(%ebx), %edi + leal 0x1007(%edi), %eax + movl $4, %ecx +1: movl %eax, 0x00(%edi) + addl $0x00001000, %eax + addl $8, %edi + decl %ecx + jnz 1b + + /* Build Level 2 */ + leal pgtable + 0x2000(%ebx), %edi + movl $0x00000183, %eax + movl $2048, %ecx +1: movl %eax, 0(%edi) + addl $0x00200000, %eax + addl $8, %edi + decl %ecx + jnz 1b + + /* Enable the boot page tables */ + leal pgtable(%ebx), %eax + movl %eax, %cr3 + + /* Enable Long mode in EFER (Extended Feature Enable Register) */ + movl $MSR_EFER, %ecx + rdmsr + btsl $_EFER_LME, %eax + wrmsr + + /* Setup for the jump to 64bit mode + * + * When the jump is performend we will be in long mode but + * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 + * (and in turn EFER.LMA = 1). To jump into 64bit mode we use + * the new gdt/idt that has __KERNEL_CS with CS.L = 1. + * We place all of the values on our mini stack so lret can + * used to perform that far jump. + */ + pushl $__KERNEL_CS + leal startup_64(%ebp), %eax + pushl %eax + + /* Enter paged protected Mode, activating Long Mode */ + movl $0x80000001, %eax /* Enable Paging and Protected mode */ + movl %eax, %cr0 + + /* Jump from 32bit compatibility mode into 64bit mode. */ + lret + + /* Be careful here startup_64 needs to be at a predictable + * address so I can export it in an ELF header. Bootloaders + * should look at the ELF header to find this address, as + * it may change in the future. + */ + .code64 + .org 0x100 +ENTRY(startup_64) + /* We come here either from startup_32 or directly from a + * 64bit bootloader. If we come here from a bootloader we depend on + * an identity mapped page table being provied that maps our + * entire text+data+bss and hopefully all of memory. + */ + + /* Setup data segments. */ + xorl %eax, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + + /* Compute the decompressed kernel start address. It is where + * we were loaded at aligned to a 2M boundary. %rbp contains the + * decompressed kernel start address. + * + * If it is a relocatable kernel then decompress and run the kernel + * from load address aligned to 2MB addr, otherwise decompress and + * run the kernel from CONFIG_PHYSICAL_START + */ + + /* Start with the delta to where the kernel will run at. */ +#ifdef CONFIG_RELOCATABLE + leaq startup_32(%rip) /* - $startup_32 */, %rbp + addq $(LARGE_PAGE_SIZE - 1), %rbp + andq $LARGE_PAGE_MASK, %rbp + movq %rbp, %rbx +#else + movq $CONFIG_PHYSICAL_START, %rbp + movq %rbp, %rbx +#endif + + /* Replace the compressed data size with the uncompressed size */ + movl input_len(%rip), %eax + subq %rax, %rbx + movl output_len(%rip), %eax + addq %rax, %rbx + /* Add 8 bytes for every 32K input block */ + shrq $12, %rax + addq %rax, %rbx + /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ + addq $(32768 + 18 + 4095), %rbx + andq $~4095, %rbx + +/* Copy the compressed kernel to the end of our buffer + * where decompression in place becomes safe. + */ + leaq _end(%rip), %r8 + leaq _end(%rbx), %r9 + movq $_end /* - $startup_32 */, %rcx +1: subq $8, %r8 + subq $8, %r9 + movq 0(%r8), %rax + movq %rax, 0(%r9) + subq $8, %rcx + jnz 1b + +/* + * Jump to the relocated address. + */ + leaq relocated(%rbx), %rax + jmp *%rax + +.section ".text" +relocated: + /* * Clear BSS */ - xorl %eax,%eax - movl $_edata,%edi - movl $_end,%ecx - subl %edi,%ecx + xorq %rax, %rax + leaq _edata(%rbx), %rdi + leaq _end(%rbx), %rcx + subq %rdi, %rcx cld rep stosb + + /* Setup the stack */ + leaq user_stack_end(%rip), %rsp + + /* zero EFLAGS after setting rsp */ + pushq $0 + popfq + /* * Do the decompression, and jump to the new kernel.. */ - subl $16,%esp # place for structure on the stack - movl %esp,%eax - pushl %esi # real mode pointer as second arg - pushl %eax # address of structure as first arg - call decompress_kernel - orl %eax,%eax - jnz 3f - addl $8,%esp - xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $__PHYSICAL_START + pushq %rsi # Save the real mode argument + movq %rsi, %rdi # real mode address + leaq _heap(%rip), %rsi # _heap + leaq input_data(%rip), %rdx # input_data + movl input_len(%rip), %eax + movq %rax, %rcx # input_len + movq %rbp, %r8 # output + call decompress_kernel + popq %rsi -/* - * We come here, if we were loaded high. - * We need to move the move-in-place routine down to 0x1000 - * and then start it with the buffer addresses in registers, - * which we got from the stack. - */ -3: - movl %esi,%ebx - movl $move_routine_start,%esi - movl $0x1000,%edi - movl $move_routine_end,%ecx - subl %esi,%ecx - addl $3,%ecx - shrl $2,%ecx - cld - rep - movsl - - popl %esi # discard the address - addl $4,%esp # real mode pointer - popl %esi # low_buffer_start - popl %ecx # lcount - popl %edx # high_buffer_start - popl %eax # hcount - movl $__PHYSICAL_START,%edi - cli # make sure we don't get interrupted - ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine /* - * Routine (template) for moving the decompressed kernel in place, - * if we were high loaded. This _must_ PIC-code ! + * Jump to the decompressed kernel. */ -move_routine_start: - movl %ecx,%ebp - shrl $2,%ecx - rep - movsl - movl %ebp,%ecx - andl $3,%ecx - rep - movsb - movl %edx,%esi - movl %eax,%ecx # NOTE: rep movsb won't move if %ecx == 0 - addl $3,%ecx - shrl $2,%ecx - rep - movsl - movl %ebx,%esi # Restore setup pointer - xorl %ebx,%ebx - ljmp $(__KERNEL_CS), $__PHYSICAL_START -move_routine_end: + jmp *%rbp - -/* Stack for uncompression */ - .align 32 -user_stack: + .data +gdt: + .word gdt_end - gdt + .long gdt + .word 0 + .quad 0x0000000000000000 /* NULL descriptor */ + .quad 0x00af9a000000ffff /* __KERNEL_CS */ + .quad 0x00cf92000000ffff /* __KERNEL_DS */ +gdt_end: + .bss +/* Stack for uncompression */ + .balign 4 +user_stack: .fill 4096,4,0 -stack_start: - .long user_stack+4096 - .word __KERNEL_DS - +user_stack_end: diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index 3755b2e394d0..fee54dbf1749 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -9,10 +9,95 @@ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ +#define _LINUX_STRING_H_ 1 +#define __LINUX_BITMAP_H 1 + +#include #include #include #include +/* WARNING!! + * This code is compiled with -fPIC and it is relocated dynamically + * at run time, but no relocation processing is performed. + * This means that it is not safe to place pointers in static structures. + */ + +/* + * Getting to provable safe in place decompression is hard. + * Worst case behaviours need to be analized. + * Background information: + * + * The file layout is: + * magic[2] + * method[1] + * flags[1] + * timestamp[4] + * extraflags[1] + * os[1] + * compressed data blocks[N] + * crc[4] orig_len[4] + * + * resulting in 18 bytes of non compressed data overhead. + * + * Files divided into blocks + * 1 bit (last block flag) + * 2 bits (block type) + * + * 1 block occurs every 32K -1 bytes or when there 50% compression has been achieved. + * The smallest block type encoding is always used. + * + * stored: + * 32 bits length in bytes. + * + * fixed: + * magic fixed tree. + * symbols. + * + * dynamic: + * dynamic tree encoding. + * symbols. + * + * + * The buffer for decompression in place is the length of the + * uncompressed data, plus a small amount extra to keep the algorithm safe. + * The compressed data is placed at the end of the buffer. The output + * pointer is placed at the start of the buffer and the input pointer + * is placed where the compressed data starts. Problems will occur + * when the output pointer overruns the input pointer. + * + * The output pointer can only overrun the input pointer if the input + * pointer is moving faster than the output pointer. A condition only + * triggered by data whose compressed form is larger than the uncompressed + * form. + * + * The worst case at the block level is a growth of the compressed data + * of 5 bytes per 32767 bytes. + * + * The worst case internal to a compressed block is very hard to figure. + * The worst case can at least be boundined by having one bit that represents + * 32764 bytes and then all of the rest of the bytes representing the very + * very last byte. + * + * All of which is enough to compute an amount of extra data that is required + * to be safe. To avoid problems at the block level allocating 5 extra bytes + * per 32767 bytes of data is sufficient. To avoind problems internal to a block + * adding an extra 32767 bytes (the worst case uncompressed block size) is + * sufficient, to ensure that in the worst case the decompressed data for + * block will stop the byte before the compressed data for a block begins. + * To avoid problems with the compressed data's meta information an extra 18 + * bytes are needed. Leading to the formula: + * + * extra_bytes = (uncompressed_size >> 12) + 32768 + 18 + decompressor_size. + * + * Adding 8 bytes per 32K is a bit excessive but much easier to calculate. + * Adding 32768 instead of 32767 just makes for round numbers. + * Adding the decompressor_size is necessary as it musht live after all + * of the data as well. Last I measured the decompressor is about 14K. + * 10K of actuall data and 4K of bss. + * + */ + /* * gzip declarations */ @@ -28,15 +113,20 @@ typedef unsigned char uch; typedef unsigned short ush; typedef unsigned long ulg; -#define WSIZE 0x8000 /* Window size must be at least 32k, */ - /* and a power of two */ +#define WSIZE 0x80000000 /* Window size must be at least 32k, + * and a power of two + * We don't actually have a window just + * a huge output buffer so I report + * a 2G windows size, as that should + * always be larger than our output buffer. + */ -static uch *inbuf; /* input buffer */ -static uch window[WSIZE]; /* Sliding window buffer */ +static uch *inbuf; /* input buffer */ +static uch *window; /* Sliding window buffer, (and final output buffer) */ -static unsigned insize = 0; /* valid bytes in inbuf */ -static unsigned inptr = 0; /* index of next byte to be processed in inbuf */ -static unsigned outcnt = 0; /* bytes in output buffer */ +static unsigned insize; /* valid bytes in inbuf */ +static unsigned inptr; /* index of next byte to be processed in inbuf */ +static unsigned outcnt; /* bytes in output buffer */ /* gzip flag byte */ #define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */ @@ -87,8 +177,6 @@ extern unsigned char input_data[]; extern int input_len; static long bytes_out = 0; -static uch *output_data; -static unsigned long output_ptr = 0; static void *malloc(int size); static void free(void *where); @@ -98,17 +186,10 @@ static void *memcpy(void *dest, const void *src, unsigned n); static void putstr(const char *); -extern int end; -static long free_mem_ptr = (long)&end; +static long free_mem_ptr; static long free_mem_end_ptr; -#define INPLACE_MOVE_ROUTINE 0x1000 -#define LOW_BUFFER_START 0x2000 -#define LOW_BUFFER_MAX 0x90000 -#define HEAP_SIZE 0x3000 -static unsigned int low_buffer_end, low_buffer_size; -static int high_loaded =0; -static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; +#define HEAP_SIZE 0x6000 static char *vidmem = (char *)0xb8000; static int vidport; @@ -218,58 +299,31 @@ static void* memcpy(void* dest, const void* src, unsigned n) */ static int fill_inbuf(void) { - if (insize != 0) { - error("ran out of input data"); - } - - inbuf = input_data; - insize = input_len; - inptr = 1; - return inbuf[0]; + error("ran out of input data"); + return 0; } /* =========================================================================== * Write the output window window[0..outcnt-1] and update crc and bytes_out. * (Used for the decompressed data only.) */ -static void flush_window_low(void) -{ - ulg c = crc; /* temporary variable */ - unsigned n; - uch *in, *out, ch; - - in = window; - out = &output_data[output_ptr]; - for (n = 0; n < outcnt; n++) { - ch = *out++ = *in++; - c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); - } - crc = c; - bytes_out += (ulg)outcnt; - output_ptr += (ulg)outcnt; - outcnt = 0; -} - -static void flush_window_high(void) -{ - ulg c = crc; /* temporary variable */ - unsigned n; - uch *in, ch; - in = window; - for (n = 0; n < outcnt; n++) { - ch = *output_data++ = *in++; - if ((ulg)output_data == low_buffer_end) output_data=high_buffer_start; - c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); - } - crc = c; - bytes_out += (ulg)outcnt; - outcnt = 0; -} - static void flush_window(void) { - if (high_loaded) flush_window_high(); - else flush_window_low(); + /* With my window equal to my output buffer + * I only need to compute the crc here. + */ + ulg c = crc; /* temporary variable */ + unsigned n; + uch *in, ch; + + in = window; + for (n = 0; n < outcnt; n++) { + ch = *in++; + c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8); + } + crc = c; + bytes_out += (ulg)outcnt; + outcnt = 0; } static void error(char *x) @@ -281,57 +335,8 @@ static void error(char *x) while(1); /* Halt */ } -static void setup_normal_output_buffer(void) -{ -#ifdef STANDARD_MEMORY_BIOS_CALL - if (RM_EXT_MEM_K < 1024) error("Less than 2MB of memory"); -#else - if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < 1024) error("Less than 2MB of memory"); -#endif - output_data = (unsigned char *)__PHYSICAL_START; /* Normally Points to 1M */ - free_mem_end_ptr = (long)real_mode; -} - -struct moveparams { - uch *low_buffer_start; int lcount; - uch *high_buffer_start; int hcount; -}; - -static void setup_output_buffer_if_we_run_high(struct moveparams *mv) -{ - high_buffer_start = (uch *)(((ulg)&end) + HEAP_SIZE); -#ifdef STANDARD_MEMORY_BIOS_CALL - if (RM_EXT_MEM_K < (3*1024)) error("Less than 4MB of memory"); -#else - if ((RM_ALT_MEM_K > RM_EXT_MEM_K ? RM_ALT_MEM_K : RM_EXT_MEM_K) < (3*1024)) error("Less than 4MB of memory"); -#endif - mv->low_buffer_start = output_data = (unsigned char *)LOW_BUFFER_START; - low_buffer_end = ((unsigned int)real_mode > LOW_BUFFER_MAX - ? LOW_BUFFER_MAX : (unsigned int)real_mode) & ~0xfff; - low_buffer_size = low_buffer_end - LOW_BUFFER_START; - high_loaded = 1; - free_mem_end_ptr = (long)high_buffer_start; - if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) { - high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size); - mv->hcount = 0; /* say: we need not to move high_buffer */ - } - else mv->hcount = -1; - mv->high_buffer_start = high_buffer_start; -} - -static void close_output_buffer_if_we_run_high(struct moveparams *mv) -{ - if (bytes_out > low_buffer_size) { - mv->lcount = low_buffer_size; - if (mv->hcount) - mv->hcount = bytes_out - low_buffer_size; - } else { - mv->lcount = bytes_out; - mv->hcount = 0; - } -} - -int decompress_kernel(struct moveparams *mv, void *rmode) +asmlinkage void decompress_kernel(void *rmode, unsigned long heap, + uch *input_data, unsigned long input_len, uch *output) { real_mode = rmode; @@ -346,13 +351,21 @@ int decompress_kernel(struct moveparams *mv, void *rmode) lines = RM_SCREEN_INFO.orig_video_lines; cols = RM_SCREEN_INFO.orig_video_cols; - if (free_mem_ptr < 0x100000) setup_normal_output_buffer(); - else setup_output_buffer_if_we_run_high(mv); + window = output; /* Output buffer (Normally at 1M) */ + free_mem_ptr = heap; /* Heap */ + free_mem_end_ptr = heap + HEAP_SIZE; + inbuf = input_data; /* Input buffer */ + insize = input_len; + inptr = 0; + + if ((ulg)output & 0x1fffffUL) + error("Destination address not 2M aligned"); + if ((ulg)output >= 0xffffffffffUL) + error("Destination address too large"); makecrc(); putstr(".\nDecompressing Linux..."); gunzip(); putstr("done.\nBooting the kernel.\n"); - if (high_loaded) close_output_buffer_if_we_run_high(mv); - return high_loaded; + return; } diff --git a/arch/x86_64/boot/compressed/vmlinux.lds b/arch/x86_64/boot/compressed/vmlinux.lds new file mode 100644 index 000000000000..94c13e557fb4 --- /dev/null +++ b/arch/x86_64/boot/compressed/vmlinux.lds @@ -0,0 +1,44 @@ +OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") +OUTPUT_ARCH(i386:x86-64) +ENTRY(startup_64) +SECTIONS +{ + /* Be careful parts of head.S assume startup_32 is at + * address 0. + */ + . = 0; + .text : { + _head = . ; + *(.text.head) + _ehead = . ; + *(.text.compressed) + _text = .; /* Text */ + *(.text) + *(.text.*) + _etext = . ; + } + .rodata : { + _rodata = . ; + *(.rodata) /* read-only data */ + *(.rodata.*) + _erodata = . ; + } + .data : { + _data = . ; + *(.data) + *(.data.*) + _edata = . ; + } + .bss : { + _bss = . ; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(8); + _end = . ; + . = ALIGN(4096); + pgtable = . ; + . = . + 4096 * 6; + _heap = .; + } +} diff --git a/arch/x86_64/boot/compressed/vmlinux.scr b/arch/x86_64/boot/compressed/vmlinux.scr index 1ed9d791f863..bd1429ce193e 100644 --- a/arch/x86_64/boot/compressed/vmlinux.scr +++ b/arch/x86_64/boot/compressed/vmlinux.scr @@ -1,9 +1,10 @@ SECTIONS { - .data : { + .text.compressed : { input_len = .; - LONG(input_data_end - input_data) input_data = .; - *(.data) - input_data_end = .; + LONG(input_data_end - input_data) input_data = .; + *(.data) + output_len = . - 4; + input_data_end = .; } } diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index c211e52f1333..36aa98a6d15c 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -5,6 +5,7 @@ * Copyright (C) 2000 Pavel Machek * Copyright (C) 2000 Karsten Keil * Copyright (C) 2001,2002 Andi Kleen + * Copyright (C) 2005 Eric Biederman */ @@ -17,95 +18,127 @@ #include #include #include - + /* we are not able to switch in one step to the final KERNEL ADRESS SPACE - * because we need identity-mapped pages on setup so define __START_KERNEL to - * 0x100000 for this stage - * + * because we need identity-mapped pages. + * */ .text .section .bootstrap.text - .code32 - .globl startup_32 -/* %bx: 1 if coming from smp trampoline on secondary cpu */ -startup_32: - + .code64 + .globl startup_64 +startup_64: + /* - * At this point the CPU runs in 32bit protected mode (CS.D = 1) with - * paging disabled and the point of this file is to switch to 64bit - * long mode with a kernel mapping for kerneland to jump into the - * kernel virtual addresses. - * There is no stack until we set one up. + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, + * and someone has loaded an identity mapped page table + * for us. These identity mapped page tables map all of the + * kernel pages and possibly all of memory. + * + * %esi holds a physical pointer to real_mode_data. + * + * We come here either directly from a 64bit bootloader, or from + * arch/x86_64/boot/compressed/head.S. + * + * We only come here initially at boot nothing else comes here. + * + * Since we may be loaded at an address different from what we were + * compiled to run at we first fixup the physical addresses in our page + * tables and then reload them. */ - /* Initialize the %ds segment register */ - movl $__KERNEL_DS,%eax - movl %eax,%ds - - /* Load new GDT with the 64bit segments using 32bit descriptor */ - lgdt pGDT32 - __START_KERNEL_map - - /* If the CPU doesn't support CPUID this will double fault. - * Unfortunately it is hard to check for CPUID without a stack. + /* Compute the delta between the address I am compiled to run at and the + * address I am actually running at. */ - - /* Check if extended functions are implemented */ - movl $0x80000000, %eax - cpuid - cmpl $0x80000000, %eax - jbe no_long_mode - /* Check if long mode is implemented */ - mov $0x80000001, %eax - cpuid - btl $29, %edx - jnc no_long_mode - - /* - * Prepare for entering 64bits mode + leaq _text(%rip), %rbp + subq $_text - __START_KERNEL_map, %rbp + + /* Is the address not 2M aligned? */ + movq %rbp, %rax + andl $~LARGE_PAGE_MASK, %eax + testl %eax, %eax + jnz bad_address + + /* Is the address too large? */ + leaq _text(%rip), %rdx + movq $PGDIR_SIZE, %rax + cmpq %rax, %rdx + jae bad_address + + /* Fixup the physical addresses in the page table */ + addq %rbp, init_level4_pgt + 0(%rip) + addq %rbp, init_level4_pgt + (258*8)(%rip) + addq %rbp, init_level4_pgt + (511*8)(%rip) + + addq %rbp, level3_ident_pgt + 0(%rip) + addq %rbp, level3_kernel_pgt + (510*8)(%rip) + + /* Add an Identity mapping if I am above 1G */ + leaq _text(%rip), %rdi + andq $LARGE_PAGE_MASK, %rdi + + movq %rdi, %rax + shrq $PUD_SHIFT, %rax + andq $(PTRS_PER_PUD - 1), %rax + jz ident_complete + + leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx + leaq level3_ident_pgt(%rip), %rbx + movq %rdx, 0(%rbx, %rax, 8) + + movq %rdi, %rax + shrq $PMD_SHIFT, %rax + andq $(PTRS_PER_PMD - 1), %rax + leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx + leaq level2_spare_pgt(%rip), %rbx + movq %rdx, 0(%rbx, %rax, 8) +ident_complete: + + /* Fixup the kernel text+data virtual addresses + */ + leaq level2_kernel_pgt(%rip), %rdi + leaq 4096(%rdi), %r8 + /* See if it is a valid page table entry */ +1: testq $1, 0(%rdi) + jz 2f + addq %rbp, 0(%rdi) + /* Go to the next page */ +2: addq $8, %rdi + cmp %r8, %rdi + jne 1b + + /* Fixup phys_base */ + addq %rbp, phys_base(%rip) - /* Enable PAE mode */ - xorl %eax, %eax - btsl $5, %eax - movl %eax, %cr4 - - /* Setup early boot stage 4 level pagetables */ - movl $(init_level4_pgt - __START_KERNEL_map), %eax - movl %eax, %cr3 - - /* Setup EFER (Extended Feature Enable Register) */ - movl $MSR_EFER, %ecx - rdmsr - - /* Enable Long Mode */ - btsl $_EFER_LME, %eax - - /* Make changes effective */ - wrmsr +#ifdef CONFIG_SMP + addq %rbp, trampoline_level4_pgt + 0(%rip) + addq %rbp, trampoline_level4_pgt + (511*8)(%rip) +#endif +#ifdef CONFIG_ACPI_SLEEP + addq %rbp, wakeup_level4_pgt + 0(%rip) + addq %rbp, wakeup_level4_pgt + (511*8)(%rip) +#endif - xorl %eax, %eax - btsl $31, %eax /* Enable paging and in turn activate Long Mode */ - btsl $0, %eax /* Enable protected mode */ - /* Make changes effective */ - movl %eax, %cr0 - /* - * At this point we're in long mode but in 32bit compatibility mode - * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn - * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we use - * the new gdt/idt that has __KERNEL_CS with CS.L = 1. + /* Due to ENTRY(), sometimes the empty space gets filled with + * zeros. Better take a jmp than relying on empty space being + * filled with 0x90 (nop) */ - ljmp $__KERNEL_CS, $(startup_64 - __START_KERNEL_map) - - .code64 - .org 0x100 - .globl startup_64 -startup_64: + jmp secondary_startup_64 ENTRY(secondary_startup_64) - /* We come here either from startup_32 - * or directly from a 64bit bootloader. - * Since we may have come directly from a bootloader we - * reload the page tables here. + /* + * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1, + * and someone has loaded a mapped page table. + * + * %esi holds a physical pointer to real_mode_data. + * + * We come here either from startup_64 (using physical addresses) + * or from trampoline.S (using virtual addresses). + * + * Using virtual addresses from trampoline.S removes the need + * to have any identity mapped pages in the kernel page table + * after the boot processor executes this code. */ /* Enable PAE mode and PGE */ @@ -116,8 +149,14 @@ ENTRY(secondary_startup_64) /* Setup early boot stage 4 level pagetables. */ movq $(init_level4_pgt - __START_KERNEL_map), %rax + addq phys_base(%rip), %rax movq %rax, %cr3 + /* Ensure I am executing from virtual addresses */ + movq $1f, %rax + jmp *%rax +1: + /* Check if nx is implemented */ movl $0x80000001, %eax cpuid @@ -126,17 +165,11 @@ ENTRY(secondary_startup_64) /* Setup EFER (Extended Feature Enable Register) */ movl $MSR_EFER, %ecx rdmsr - - /* Enable System Call */ - btsl $_EFER_SCE, %eax - - /* No Execute supported? */ - btl $20,%edi + btsl $_EFER_SCE, %eax /* Enable System Call */ + btl $20,%edi /* No Execute supported? */ jnc 1f btsl $_EFER_NX, %eax -1: - /* Make changes effective */ - wrmsr +1: wrmsr /* Make changes effective */ /* Setup cr0 */ #define CR0_PM 1 /* protected mode */ @@ -163,7 +196,7 @@ ENTRY(secondary_startup_64) * addresses where we're currently running on. We have to do that here * because in 32bit we couldn't load a 64bit linear address. */ - lgdt cpu_gdt_descr + lgdt cpu_gdt_descr(%rip) /* set up data segments. actually 0 would do too */ movl $__KERNEL_DS,%eax @@ -214,6 +247,9 @@ initial_code: init_rsp: .quad init_thread_union+THREAD_SIZE-8 +bad_address: + jmp bad_address + ENTRY(early_idt_handler) cmpl $2,early_recursion_flag(%rip) jz 1f @@ -242,23 +278,7 @@ early_idt_msg: early_idt_ripmsg: .asciz "RIP %s\n" -.code32 -ENTRY(no_long_mode) - /* This isn't an x86-64 CPU so hang */ -1: - jmp 1b - -.org 0xf00 - .globl pGDT32 -pGDT32: - .word gdt_end-cpu_gdt_table-1 - .long cpu_gdt_table-__START_KERNEL_map - -.org 0xf10 -ljumpvector: - .long startup_64-__START_KERNEL_map - .word __KERNEL_CS - +.balign PAGE_SIZE ENTRY(stext) ENTRY(_stext) @@ -303,7 +323,7 @@ NEXT_PAGE(level2_ident_pgt) * Don't set NX because code runs from these pages. */ PMDS(0x0000000000000000, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) - + NEXT_PAGE(level2_kernel_pgt) /* 40MB kernel mapping. The kernel code cannot be bigger than that. When you change this change KERNEL_TEXT_SIZE in page.h too. */ @@ -313,6 +333,9 @@ NEXT_PAGE(level2_kernel_pgt) /* Module mapping starts here */ .fill (PTRS_PER_PMD - (KERNEL_TEXT_SIZE/PMD_SIZE)),8,0 +NEXT_PAGE(level2_spare_pgt) + .fill 512,8,0 + #undef PMDS #undef NEXT_PAGE @@ -330,6 +353,10 @@ gdt: .endr #endif +ENTRY(phys_base) + /* This must match the first entry in level2_kernel_pgt */ + .quad 0x0000000000000000 + /* We need valid kernel segments for data and code in long mode too * IRET will check the segment types kkeil 2000/10/28 * Also sysret mandates a special GDT layout diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S index bfbe00763c68..16d183f67bc1 100644 --- a/arch/x86_64/kernel/suspend_asm.S +++ b/arch/x86_64/kernel/suspend_asm.S @@ -71,9 +71,10 @@ loop: jmp loop done: /* go back to the original page tables */ - leaq init_level4_pgt(%rip), %rax - subq $__START_KERNEL_map, %rax - movq %rax, %cr3 + movq $(init_level4_pgt - __START_KERNEL_map), %rax + addq phys_base(%rip), %rax + movq %rax, %cr3 + /* Flush TLB, including "global" things (vmalloc) */ movq mmu_cr4_features(%rip), %rax movq %rax, %rdx diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index 4974433bbf34..40a24d0df090 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -61,6 +61,8 @@ typedef struct { unsigned long pgd; } pgd_t; typedef struct { unsigned long pgprot; } pgprot_t; +extern unsigned long phys_base; + #define pte_val(x) ((x).pte) #define pmd_val(x) ((x).pmd) #define pud_val(x) ((x).pud) @@ -101,14 +103,14 @@ typedef struct { unsigned long pgprot; } pgprot_t; #define PAGE_OFFSET __PAGE_OFFSET /* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol. - Otherwise you risk miscompilation. */ + Otherwise you risk miscompilation. */ #define __pa(x) ((unsigned long)(x) - PAGE_OFFSET) /* __pa_symbol should be used for C visible symbols. This seems to be the official gcc blessed way to do such arithmetic. */ #define __pa_symbol(x) \ ({unsigned long v; \ asm("" : "=r" (v) : "0" (x)); \ - (v - __START_KERNEL_map); }) + ((v - __START_KERNEL_map) + phys_base); }) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #ifdef CONFIG_FLATMEM -- cgit v1.2.3 From 6a50a664ca0cfd2a487525f10cec3ff4d570b5e8 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH] x86-64: build-time checking o X86_64 kernel should run from 2MB aligned address for two reasons. - Performance. - For relocatable kernels, page tables are updated based on difference between compile time address and load time physical address. This difference should be multiple of 2MB as kernel text and data is mapped using 2MB pages and PMD should be pointing to a 2MB aligned address. Life is simpler if both compile time and load time kernel addresses are 2MB aligned. o Flag the error at compile time if one is trying to build a kernel which does not meet alignment restrictions. Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen Cc: "Eric W. Biederman" Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/boot/compressed/misc.c | 2 +- arch/x86_64/kernel/head64.c | 7 +++++++ include/asm-x86_64/page.h | 1 + 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c index fee54dbf1749..fed1167159c3 100644 --- a/arch/x86_64/boot/compressed/misc.c +++ b/arch/x86_64/boot/compressed/misc.c @@ -358,7 +358,7 @@ asmlinkage void decompress_kernel(void *rmode, unsigned long heap, insize = input_len; inptr = 0; - if ((ulg)output & 0x1fffffUL) + if ((ulg)output & (__KERNEL_ALIGN - 1)) error("Destination address not 2M aligned"); if ((ulg)output >= 0xffffffffffUL) error("Destination address too large"); diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 6c34bdd22e26..213d90e04755 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -62,6 +62,13 @@ void __init x86_64_start_kernel(char * real_mode_data) { int i; + /* + * Make sure kernel is aligned to 2MB address. Catching it at compile + * time is better. Change your config file and compile the kernel + * for a 2MB aligned address (CONFIG_PHYSICAL_START) + */ + BUILD_BUG_ON(CONFIG_PHYSICAL_START & (__KERNEL_ALIGN - 1)); + /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h index 40a24d0df090..b17fc16ec2eb 100644 --- a/include/asm-x86_64/page.h +++ b/include/asm-x86_64/page.h @@ -78,6 +78,7 @@ extern unsigned long phys_base; #endif /* !__ASSEMBLY__ */ #define __PHYSICAL_START CONFIG_PHYSICAL_START +#define __KERNEL_ALIGN 0x200000 #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) #define __START_KERNEL_map 0xffffffff80000000 #define __PAGE_OFFSET 0xffff810000000000 -- cgit v1.2.3 From a4831e08b7f3ed51623c9eb25e8c945b76b3eda3 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH] x86-64: Move cpu verification code to common file o This patch moves the code to verify long mode and SSE to a common file. This code is now shared by trampoline.S, wakeup.S, boot/setup.S and boot/compressed/head.S o So far we used to do very limited check in trampoline.S, wakeup.S and in 32bit entry point. Now all the entry paths are forced to do the exhaustive check, including SSE because verify_cpu is shared. o I am keeping this patch as last in the x86 relocatable series because previous patches have got quite some amount of testing done and don't want to distrub that. So that if there is problem introduced by this patch, at least it can be easily isolated. Signed-off-by: Eric W. Biederman Signed-off-by: Vivek Goyal Signed-off-by: Andi Kleen --- arch/x86_64/boot/compressed/head.S | 19 ++++++- arch/x86_64/boot/setup.S | 65 ++-------------------- arch/x86_64/kernel/acpi/wakeup.S | 30 +++++----- arch/x86_64/kernel/trampoline.S | 51 +---------------- arch/x86_64/kernel/verify_cpu.S | 110 +++++++++++++++++++++++++++++++++++++ 5 files changed, 152 insertions(+), 123 deletions(-) create mode 100644 arch/x86_64/kernel/verify_cpu.S (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S index c353a9266ea4..f9d5692a0106 100644 --- a/arch/x86_64/boot/compressed/head.S +++ b/arch/x86_64/boot/compressed/head.S @@ -54,6 +54,15 @@ startup_32: 1: popl %ebp subl $1b, %ebp +/* setup a stack and make sure cpu supports long mode. */ + movl $user_stack_end, %eax + addl %ebp, %eax + movl %eax, %esp + + call verify_cpu + testl %eax, %eax + jnz no_longmode + /* Compute the delta between where we were compiled to run at * and where the code will actually run at. */ @@ -159,13 +168,21 @@ startup_32: /* Jump from 32bit compatibility mode into 64bit mode. */ lret +no_longmode: + /* This isn't an x86-64 CPU so hang */ +1: + hlt + jmp 1b + +#include "../../kernel/verify_cpu.S" + /* Be careful here startup_64 needs to be at a predictable * address so I can export it in an ELF header. Bootloaders * should look at the ELF header to find this address, as * it may change in the future. */ .code64 - .org 0x100 + .org 0x200 ENTRY(startup_64) /* We come here either from startup_32 or directly from a * 64bit bootloader. If we come here from a bootloader we depend on diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index deb3573c7aec..816d04faa2be 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -299,64 +299,10 @@ loader_ok: movw %cs,%ax movw %ax,%ds - /* minimum CPUID flags for x86-64 */ - /* see http://www.x86-64.org/lists/discuss/msg02971.html */ -#define SSE_MASK ((1<<25)|(1<<26)) -#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\ - (1<<13)|(1<<15)|(1<<24)) -#define REQUIRED_MASK2 (1<<29) - - pushfl /* standard way to check for cpuid */ - popl %eax - movl %eax,%ebx - xorl $0x200000,%eax - pushl %eax - popfl - pushfl - popl %eax - cmpl %eax,%ebx - jz no_longmode /* cpu has no cpuid */ - movl $0x0,%eax - cpuid - cmpl $0x1,%eax - jb no_longmode /* no cpuid 1 */ - xor %di,%di - cmpl $0x68747541,%ebx /* AuthenticAMD */ - jnz noamd - cmpl $0x69746e65,%edx - jnz noamd - cmpl $0x444d4163,%ecx - jnz noamd - mov $1,%di /* cpu is from AMD */ -noamd: - movl $0x1,%eax - cpuid - andl $REQUIRED_MASK1,%edx - xorl $REQUIRED_MASK1,%edx - jnz no_longmode - movl $0x80000000,%eax - cpuid - cmpl $0x80000001,%eax - jb no_longmode /* no extended cpuid */ - movl $0x80000001,%eax - cpuid - andl $REQUIRED_MASK2,%edx - xorl $REQUIRED_MASK2,%edx - jnz no_longmode -sse_test: - movl $1,%eax - cpuid - andl $SSE_MASK,%edx - cmpl $SSE_MASK,%edx - je sse_ok - test %di,%di - jz no_longmode /* only try to force SSE on AMD */ - movl $0xc0010015,%ecx /* HWCR */ - rdmsr - btr $15,%eax /* enable SSE */ - wrmsr - xor %di,%di /* don't loop */ - jmp sse_test /* try again */ + call verify_cpu + testl %eax,%eax + jz sse_ok + no_longmode: call beep lea long_mode_panic,%si @@ -366,7 +312,8 @@ no_longmode_loop: long_mode_panic: .string "Your CPU does not support long mode. Use a 32bit distribution." .byte 0 - + +#include "../kernel/verify_cpu.S" sse_ok: popw %ds diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index 766cfbcac1db..8550a6ffa275 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -43,6 +43,11 @@ wakeup_code: cmpl $0x12345678, %eax jne bogus_real_magic + call verify_cpu # Verify the cpu supports long + # mode + testl %eax, %eax + jnz no_longmode + testl $1, video_flags - wakeup_code jz 1f lcall $0xc000,$3 @@ -92,18 +97,6 @@ wakeup_32: # Running in this code, but at low address; paging is not yet turned on. movb $0xa5, %al ; outb %al, $0x80 - /* Check if extended functions are implemented */ - movl $0x80000000, %eax - cpuid - cmpl $0x80000000, %eax - jbe bogus_cpu - wbinvd - mov $0x80000001, %eax - cpuid - btl $29, %edx - jnc bogus_cpu - movl %edx,%edi - movl $__KERNEL_DS, %eax movl %eax, %ds @@ -123,6 +116,11 @@ wakeup_32: leal (wakeup_level4_pgt - wakeup_code)(%esi), %eax movl %eax, %cr3 + /* Check if nx is implemented */ + movl $0x80000001, %eax + cpuid + movl %edx,%edi + /* Enable Long Mode */ xorl %eax, %eax btsl $_EFER_LME, %eax @@ -244,10 +242,12 @@ bogus_64_magic: movb $0xb3,%al ; outb %al,$0x80 jmp bogus_64_magic -bogus_cpu: - movb $0xbc,%al ; outb %al,$0x80 - jmp bogus_cpu +.code16 +no_longmode: + movb $0xbc,%al ; outb %al,$0x80 + jmp no_longmode +#include "../verify_cpu.S" /* This code uses an extended set of video mode numbers. These include: * Aliases for standard modes diff --git a/arch/x86_64/kernel/trampoline.S b/arch/x86_64/kernel/trampoline.S index 13eee63c7bb5..e7e2764c461b 100644 --- a/arch/x86_64/kernel/trampoline.S +++ b/arch/x86_64/kernel/trampoline.S @@ -54,6 +54,8 @@ r_base = . movw $(trampoline_stack_end - r_base), %sp call verify_cpu # Verify the cpu supports long mode + testl %eax, %eax # Check for return code + jnz no_longmode mov %cs, %ax movzx %ax, %esi # Find the 32bit trampoline location @@ -121,57 +123,10 @@ startup_64: jmp *%rax .code16 -verify_cpu: - pushl $0 # Kill any dangerous flags - popfl - - /* minimum CPUID flags for x86-64 */ - /* see http://www.x86-64.org/lists/discuss/msg02971.html */ -#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\ - (1<<13)|(1<<15)|(1<<24)|(1<<25)|(1<<26)) -#define REQUIRED_MASK2 (1<<29) - - pushfl # check for cpuid - popl %eax - movl %eax, %ebx - xorl $0x200000,%eax - pushl %eax - popfl - pushfl - popl %eax - pushl %ebx - popfl - cmpl %eax, %ebx - jz no_longmode - - xorl %eax, %eax # See if cpuid 1 is implemented - cpuid - cmpl $0x1, %eax - jb no_longmode - - movl $0x01, %eax # Does the cpu have what it takes? - cpuid - andl $REQUIRED_MASK1, %edx - xorl $REQUIRED_MASK1, %edx - jnz no_longmode - - movl $0x80000000, %eax # See if extended cpuid is implemented - cpuid - cmpl $0x80000001, %eax - jb no_longmode - - movl $0x80000001, %eax # Does the cpu have what it takes? - cpuid - andl $REQUIRED_MASK2, %edx - xorl $REQUIRED_MASK2, %edx - jnz no_longmode - - ret # The cpu supports long mode - no_longmode: hlt jmp no_longmode - +#include "verify_cpu.S" # Careful these need to be in the same 64K segment as the above; tidt: diff --git a/arch/x86_64/kernel/verify_cpu.S b/arch/x86_64/kernel/verify_cpu.S new file mode 100644 index 000000000000..72edabd2ef9a --- /dev/null +++ b/arch/x86_64/kernel/verify_cpu.S @@ -0,0 +1,110 @@ +/* + * + * verify_cpu.S - Code for cpu long mode and SSE verification. This + * code has been borrowed from boot/setup.S and was introduced by + * Andi Kleen. + * + * Copyright (c) 2007 Andi Kleen (ak@suse.de) + * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) + * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + * + * This is a common code for verification whether CPU supports + * long mode and SSE or not. It is not called directly instead this + * file is included at various places and compiled in that context. + * Following are the current usage. + * + * This file is included by both 16bit and 32bit code. + * + * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) + * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit) + * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit) + * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit) + * + * verify_cpu, returns the status of cpu check in register %eax. + * 0: Success 1: Failure + * + * The caller needs to check for the error code and take the action + * appropriately. Either display a message or halt. + */ + +verify_cpu: + + pushfl # Save caller passed flags + pushl $0 # Kill any dangerous flags + popfl + + /* minimum CPUID flags for x86-64 */ + /* see http://www.x86-64.org/lists/discuss/msg02971.html */ +#define SSE_MASK ((1<<25)|(1<<26)) +#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\ + (1<<13)|(1<<15)|(1<<24)) +#define REQUIRED_MASK2 (1<<29) + pushfl # standard way to check for cpuid + popl %eax + movl %eax,%ebx + xorl $0x200000,%eax + pushl %eax + popfl + pushfl + popl %eax + cmpl %eax,%ebx + jz verify_cpu_no_longmode # cpu has no cpuid + + movl $0x0,%eax # See if cpuid 1 is implemented + cpuid + cmpl $0x1,%eax + jb verify_cpu_no_longmode # no cpuid 1 + + xor %di,%di + cmpl $0x68747541,%ebx # AuthenticAMD + jnz verify_cpu_noamd + cmpl $0x69746e65,%edx + jnz verify_cpu_noamd + cmpl $0x444d4163,%ecx + jnz verify_cpu_noamd + mov $1,%di # cpu is from AMD + +verify_cpu_noamd: + movl $0x1,%eax # Does the cpu have what it takes + cpuid + andl $REQUIRED_MASK1,%edx + xorl $REQUIRED_MASK1,%edx + jnz verify_cpu_no_longmode + + movl $0x80000000,%eax # See if extended cpuid is implemented + cpuid + cmpl $0x80000001,%eax + jb verify_cpu_no_longmode # no extended cpuid + + movl $0x80000001,%eax # Does the cpu have what it takes + cpuid + andl $REQUIRED_MASK2,%edx + xorl $REQUIRED_MASK2,%edx + jnz verify_cpu_no_longmode + +verify_cpu_sse_test: + movl $1,%eax + cpuid + andl $SSE_MASK,%edx + cmpl $SSE_MASK,%edx + je verify_cpu_sse_ok + test %di,%di + jz verify_cpu_no_longmode # only try to force SSE on AMD + movl $0xc0010015,%ecx # HWCR + rdmsr + btr $15,%eax # enable SSE + wrmsr + xor %di,%di # don't loop + jmp verify_cpu_sse_test # try again + +verify_cpu_no_longmode: + popfl # Restore caller passed flags + movl $1,%eax + ret +verify_cpu_sse_ok: + popfl # Restore caller passed flags + xorl %eax, %eax + ret -- cgit v1.2.3 From d039c688c6b3e7217381d2804bc0ca3171913fec Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH] x86-64: Minor white space cleanup in traps.c Signed-off-by: Andi Kleen --- arch/x86_64/kernel/traps.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index cceef5bd7302..d76fc32d4599 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -426,8 +426,7 @@ void show_registers(struct pt_regs *regs) const int cpu = smp_processor_id(); struct task_struct *cur = cpu_pda(cpu)->pcurrent; - rsp = regs->rsp; - + rsp = regs->rsp; printk("CPU %d ", cpu); __show_regs(regs); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", @@ -438,7 +437,6 @@ void show_registers(struct pt_regs *regs) * time of the fault.. */ if (in_kernel) { - printk("Stack: "); _show_stack(NULL, regs, (unsigned long*)rsp); -- cgit v1.2.3 From 5a90cf205c922707ffed2d8f87cefd942e96b0ba Mon Sep 17 00:00:00 2001 From: john stultz Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH] x86: Log reason why TSC was marked unstable Change mark_tsc_unstable() so it takes a string argument, which holds the reason the TSC was marked unstable. This is then displayed the first time mark_tsc_unstable is called. This should help us better debug why the TSC was marked unstable on certain systems and allow us to make sure we're not being overly paranoid when throwing out this troublesome clocksource. Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/cyrix.c | 2 +- arch/i386/kernel/tsc.c | 5 +++-- arch/x86_64/kernel/time.c | 2 +- arch/x86_64/kernel/tsc.c | 5 +++-- arch/x86_64/kernel/tsc_sync.c | 2 +- drivers/acpi/processor_idle.c | 4 ++-- include/asm-i386/mach-summit/mach_mpparse.h | 4 ++-- include/asm-i386/tsc.h | 2 +- include/asm-x86_64/timex.h | 2 +- 9 files changed, 15 insertions(+), 13 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c index f0badfdd4e45..e77f8e1cf7aa 100644 --- a/arch/i386/kernel/cpu/cyrix.c +++ b/arch/i386/kernel/cpu/cyrix.c @@ -279,7 +279,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) */ if (vendor == PCI_VENDOR_ID_CYRIX && (device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520)) - mark_tsc_unstable(); + mark_tsc_unstable("cyrix 5510/5520 detected"); } #endif c->x86_cache_size=16; /* Yep 16K integrated cache thats it */ diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index 6cb8f5336732..755209dc93e1 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -233,7 +233,7 @@ time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) * TSC based sched_clock turns * to junk w/ cpufreq */ - mark_tsc_unstable(); + mark_tsc_unstable("cpufreq changes"); } } } @@ -281,11 +281,12 @@ static struct clocksource clocksource_tsc = { CLOCK_SOURCE_MUST_VERIFY, }; -void mark_tsc_unstable(void) +void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; tsc_enabled = 0; + printk("Marking TSC unstable due to: %s.\n", reason); /* Can be called before registration */ if (clocksource_tsc.mult) clocksource_change_rating(&clocksource_tsc, 0); diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 5f862e216a42..91c9066a380d 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -397,7 +397,7 @@ void __init time_init(void) cpu_khz = tsc_calibrate_cpu_khz(); if (unsynchronized_tsc()) - mark_tsc_unstable(); + mark_tsc_unstable("TSCs unsynchronized"); if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) vgetcpu_mode = VGETCPU_RDTSCP; diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c index 5c84992c676d..48f9a8e6aa91 100644 --- a/arch/x86_64/kernel/tsc.c +++ b/arch/x86_64/kernel/tsc.c @@ -111,7 +111,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - mark_tsc_unstable(); + mark_tsc_unstable("cpufreq changes"); } set_cyc2ns_scale(tsc_khz_ref); @@ -199,10 +199,11 @@ static struct clocksource clocksource_tsc = { .vread = vread_tsc, }; -void mark_tsc_unstable(void) +void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; + printk("Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) clocksource_change_rating(&clocksource_tsc, 0); diff --git a/arch/x86_64/kernel/tsc_sync.c b/arch/x86_64/kernel/tsc_sync.c index 72d444dede9b..355f5f506c81 100644 --- a/arch/x86_64/kernel/tsc_sync.c +++ b/arch/x86_64/kernel/tsc_sync.c @@ -138,7 +138,7 @@ void __cpuinit check_tsc_sync_source(int cpu) printk("\n"); printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," " turning off TSC clock.\n", max_warp); - mark_tsc_unstable(); + mark_tsc_unstable("check_tsc_sync_source failed"); nr_warps = 0; max_warp = 0; last_tsc = 0; diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index ae0654cd11ea..ee5759bef945 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -475,7 +475,7 @@ static void acpi_processor_idle(void) #ifdef CONFIG_GENERIC_TIME /* TSC halts in C2, so notify users */ - mark_tsc_unstable(); + mark_tsc_unstable("possible TSC halt in C2"); #endif /* Re-enable interrupts */ local_irq_enable(); @@ -517,7 +517,7 @@ static void acpi_processor_idle(void) #ifdef CONFIG_GENERIC_TIME /* TSC halts in C3, so notify users */ - mark_tsc_unstable(); + mark_tsc_unstable("TSC halts in C3"); #endif /* Re-enable interrupts */ local_irq_enable(); diff --git a/include/asm-i386/mach-summit/mach_mpparse.h b/include/asm-i386/mach-summit/mach_mpparse.h index 94268399170d..c2520539d934 100644 --- a/include/asm-i386/mach-summit/mach_mpparse.h +++ b/include/asm-i386/mach-summit/mach_mpparse.h @@ -30,7 +30,7 @@ static inline int mps_oem_check(struct mp_config_table *mpc, char *oem, (!strncmp(productid, "VIGIL SMP", 9) || !strncmp(productid, "EXA", 3) || !strncmp(productid, "RUTHLESS SMP", 12))){ - mark_tsc_unstable(); + mark_tsc_unstable("Summit based system"); use_cyclone = 1; /*enable cyclone-timer*/ setup_summit(); return 1; @@ -44,7 +44,7 @@ static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (!strncmp(oem_id, "IBM", 3) && (!strncmp(oem_table_id, "SERVIGIL", 8) || !strncmp(oem_table_id, "EXA", 3))){ - mark_tsc_unstable(); + mark_tsc_unstable("Summit based system"); use_cyclone = 1; /*enable cyclone-timer*/ setup_summit(); return 1; diff --git a/include/asm-i386/tsc.h b/include/asm-i386/tsc.h index 84016ff481b9..346976632e15 100644 --- a/include/asm-i386/tsc.h +++ b/include/asm-i386/tsc.h @@ -53,7 +53,7 @@ static __always_inline cycles_t get_cycles_sync(void) } extern void tsc_init(void); -extern void mark_tsc_unstable(void); +extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern void init_tsc_clocksource(void); diff --git a/include/asm-x86_64/timex.h b/include/asm-x86_64/timex.h index 8c6808a3fba4..f6527e1b6c1c 100644 --- a/include/asm-x86_64/timex.h +++ b/include/asm-x86_64/timex.h @@ -27,6 +27,6 @@ extern int read_current_timer(unsigned long *timer_value); #define NS_SCALE 10 /* 2^10, carefully chosen */ #define US_SCALE 32 /* 2^32, arbitralrily chosen */ -extern void mark_tsc_unstable(void); +extern void mark_tsc_unstable(char *msg); extern void set_cyc2ns_scale(unsigned long khz); #endif -- cgit v1.2.3 From f0e13ae76a607eab9c387544ebca550f2196a876 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 2 May 2007 19:27:08 +0200 Subject: [PATCH] x86-64: remove extra smp_processor_id calling Cc: "Eric W. Biederman" Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/kernel/io_apic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index c6a5bc7e8118..318d9055cd97 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -1407,8 +1407,7 @@ static void irq_complete_move(unsigned int irq) vector = ~get_irq_regs()->orig_rax; me = smp_processor_id(); - if ((vector == cfg->vector) && - cpu_isset(smp_processor_id(), cfg->domain)) { + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { cpumask_t cleanup_mask; cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); -- cgit v1.2.3 From 786142fab86cfddd3e7797e9ccf8a8a3bcaf0456 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH] x86-64: make simnow_init() static Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/kernel/early_printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index b7aa9abe1c6a..92213d2b7c11 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -175,7 +175,7 @@ static noinline long simnow(long cmd, long a, long b, long c) return ret; } -void __init simnow_init(char *str) +static void __init simnow_init(char *str) { char *fn = "klog"; if (*str == '=') -- cgit v1.2.3 From d824395c5994adbf7efe377cc67f732133270554 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH] x86: remove constant_tsc reporting from /proc/cpuinfo' power flags remove the reporting of the constant_tsc flag from the "power management" field in /proc/cpuinfo. The NULL value there was replaced by "" because the former would result in a printout of [8] if the flag is set. Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/i386/kernel/cpu/proc.c | 3 +-- arch/x86_64/kernel/setup.c | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index 47e3ebbfb28d..89d91e6cc972 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -72,8 +72,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) "stc", "100mhzsteps", "hwpstate", - NULL, - NULL, /* constant_tsc - moved to flags */ + "", /* constant_tsc - moved to flags */ /* nothing */ }; struct cpuinfo_x86 *c = v; diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index b9bdfc1b54f8..0a1d539149df 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -979,9 +979,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) "stc", "100mhzsteps", "hwpstate", - NULL, /* tsc invariant mapped to constant_tsc */ - NULL, - /* nothing */ /* constant_tsc - moved to flags */ + "", /* tsc invariant mapped to constant_tsc */ + /* nothing */ }; -- cgit v1.2.3 From 6fb14755a676282a4e6caa05a08c92db8e45cfff Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH] x86: tighten kernel image page access rights On x86-64, kernel memory freed after init can be entirely unmapped instead of just getting 'poisoned' by overwriting with a debug pattern. On i386 and x86-64 (under CONFIG_DEBUG_RODATA), kernel text and bug table can also be write-protected. Compared to the first version, this one prevents re-creating deleted mappings in the kernel image range on x86-64, if those got removed previously. This, together with the original changes, prevents temporarily having inconsistent mappings when cacheability attributes are being changed on such pages (e.g. from AGP code). While on i386 such duplicate mappings don't exist, the same change is done there, too, both for consistency and because checking pte_present() before using various other pte_XXX functions is a requirement anyway. At once, i386 code gets adjusted to use pte_huge() instead of open coding this. AK: split out cpa() changes Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen --- arch/i386/kernel/vmlinux.lds.S | 4 ++-- arch/i386/mm/init.c | 25 +++++++++++++++++++------ arch/x86_64/kernel/head.S | 1 - arch/x86_64/kernel/vmlinux.lds.S | 5 +++-- arch/x86_64/mm/init.c | 25 ++++++++++++++++--------- include/linux/poison.h | 3 --- 6 files changed, 40 insertions(+), 23 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 6f38f818380b..f4ec72231835 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -61,8 +61,6 @@ SECTIONS __stop___ex_table = .; } - RODATA - BUG_TABLE . = ALIGN(4); @@ -72,6 +70,8 @@ SECTIONS __tracedata_end = .; } + RODATA + /* writeable */ . = ALIGN(4096); .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 23be1b0aafa4..bd5ef3718504 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -751,13 +752,25 @@ static int noinline do_test_wp_bit(void) void mark_rodata_ro(void) { - unsigned long addr = (unsigned long)__start_rodata; + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; - for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) - change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO); +#ifdef CONFIG_HOTPLUG_CPU + /* It must still be possible to apply SMP alternatives. */ + if (num_possible_cpus() <= 1) +#endif + { + change_page_attr(virt_to_page(start), + size >> PAGE_SHIFT, PAGE_KERNEL_RX); + printk("Write protecting the kernel text: %luk\n", size >> 10); + } - printk("Write protecting the kernel read-only data: %uk\n", - (__end_rodata - __start_rodata) >> 10); + start += size; + size = (unsigned long)__end_rodata - start; + change_page_attr(virt_to_page(start), + size >> PAGE_SHIFT, PAGE_KERNEL_RO); + printk("Write protecting the kernel read-only data: %luk\n", + size >> 10); /* * change_page_attr() requires a global_flush_tlb() call after it. @@ -781,7 +794,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) __free_page(page); totalram_pages++; } - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); } void free_initmem(void) diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index 36aa98a6d15c..fd9fdfdd143e 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -280,7 +280,6 @@ early_idt_ripmsg: .balign PAGE_SIZE ENTRY(stext) -ENTRY(_stext) #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 5176ecf006ee..3bdeb88d28f4 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -29,6 +29,7 @@ SECTIONS .text : AT(ADDR(.text) - LOAD_OFFSET) { /* First the code that has to be first for bootstrapping */ *(.bootstrap.text) + _stext = .; /* Then all the functions that are "hot" in profiles, to group them onto the same hugetlb entry */ #include "functionlist" @@ -50,10 +51,10 @@ SECTIONS __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } __stop___ex_table = .; - RODATA - BUG_TABLE + RODATA + . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 69e22d3c9238..e3134bc9a4fc 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -563,21 +564,23 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end) if (begin >= end) return; - printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); + printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); for (addr = begin; addr < end; addr += PAGE_SIZE) { struct page *page = pfn_to_page(addr >> PAGE_SHIFT); ClearPageReserved(page); init_page_count(page); memset(page_address(page), POISON_FREE_INITMEM, PAGE_SIZE); + if (addr >= __START_KERNEL_map) + change_page_attr_addr(addr, 1, __pgprot(0)); __free_page(page); totalram_pages++; } + if (addr > __START_KERNEL_map) + global_flush_tlb(); } void free_initmem(void) { - memset(__initdata_begin, POISON_FREE_INITDATA, - __initdata_end - __initdata_begin); free_init_pages("unused kernel memory", __pa_symbol(&__init_begin), __pa_symbol(&__init_end)); @@ -587,14 +590,18 @@ void free_initmem(void) void mark_rodata_ro(void) { - unsigned long addr = (unsigned long)__va(__pa_symbol(&__start_rodata)); - unsigned long end = (unsigned long)__va(__pa_symbol(&__end_rodata)); + unsigned long start = PFN_ALIGN(__va(__pa_symbol(&_stext))), size; - for (; addr < end; addr += PAGE_SIZE) - change_page_attr_addr(addr, 1, PAGE_KERNEL_RO); +#ifdef CONFIG_HOTPLUG_CPU + /* It must still be possible to apply SMP alternatives. */ + if (num_possible_cpus() > 1) + start = PFN_ALIGN(__va(__pa_symbol(&_etext))); +#endif + size = (unsigned long)__va(__pa_symbol(&__end_rodata)) - start; + change_page_attr_addr(start, size >> PAGE_SHIFT, PAGE_KERNEL_RO); - printk ("Write protecting the kernel read-only data: %luk\n", - (__end_rodata - __start_rodata) >> 10); + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + size >> 10); /* * change_page_attr_addr() requires a global_flush_tlb() call after it. diff --git a/include/linux/poison.h b/include/linux/poison.h index 3e628f990fdf..89580b764959 100644 --- a/include/linux/poison.h +++ b/include/linux/poison.h @@ -26,9 +26,6 @@ /********** arch/$ARCH/mm/init.c **********/ #define POISON_FREE_INITMEM 0xcc -/********** arch/x86_64/mm/init.c **********/ -#define POISON_FREE_INITDATA 0xba - /********** arch/ia64/hp/common/sba_iommu.c **********/ /* * arch/ia64/hp/common/sba_iommu.c uses a 16-byte poison string with a -- cgit v1.2.3 From b8716890f35dd567a3f8e4dd69c428a8b3f47814 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH] x86-64: Remove unused stext symbol suggested by Jan Beulich Signed-off-by: Andi Kleen --- arch/x86_64/kernel/head.S | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index fd9fdfdd143e..1fab487dee86 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -279,7 +279,6 @@ early_idt_ripmsg: .asciz "RIP %s\n" .balign PAGE_SIZE -ENTRY(stext) #define NEXT_PAGE(name) \ .balign PAGE_SIZE; \ -- cgit v1.2.3 From ca906e42312781c38b7a9625109fc65b937ca56c Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 2 May 2007 19:27:10 +0200 Subject: [PATCH] x86: sys_ioperm() prototype cleanup - there's no reason for duplicating the prototype from include/linux/syscalls.h in include/asm-x86_64/unistd.h - every file should #include the headers containing the prototypes for it's global functions Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen --- arch/i386/kernel/ioport.c | 1 + arch/x86_64/kernel/ioport.c | 1 + include/asm-x86_64/unistd.h | 1 - 3 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c index 498e8bc197d5..1b4530e6cd82 100644 --- a/arch/i386/kernel/ioport.c +++ b/arch/i386/kernel/ioport.c @@ -16,6 +16,7 @@ #include #include #include +#include /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c index 745b1f0f494e..387d347b0e07 100644 --- a/arch/x86_64/kernel/ioport.c +++ b/arch/x86_64/kernel/ioport.c @@ -16,6 +16,7 @@ #include #include #include +#include /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index c5f596e71faa..576b29732d3c 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -655,7 +655,6 @@ __SYSCALL(__NR_move_pages, sys_move_pages) #include asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs); -asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on); struct sigaction; asmlinkage long sys_rt_sigaction(int sig, const struct sigaction __user *act, -- cgit v1.2.3 From bd8559c38ee5be40ce2c57a80fd3c3e978cca267 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH] x86: remove UNEXPECTED_IO_APIC() Many years ago, UNEXPECTED_IO_APIC() contained printk()'s (but nothing more). Now that it's completely empty for years, we can as well remove it. Signed-off-by: Adrian Bunk Signed-off-by: Andi Kleen --- arch/i386/kernel/io_apic.c | 30 ------------------------------ arch/x86_64/kernel/io_apic.c | 28 ---------------------------- 2 files changed, 58 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index b3ab8ffebd27..8191583032d0 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -1403,10 +1403,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in enable_8259A_irq(0); } -static inline void UNEXPECTED_IO_APIC(void) -{ -} - void __init print_IO_APIC(void) { int apic, i; @@ -1446,34 +1442,12 @@ void __init print_IO_APIC(void) printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - if (reg_00.bits.ID >= get_physical_broadcast()) - UNEXPECTED_IO_APIC(); - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ - (reg_01.bits.entries != 0x2E) && - (reg_01.bits.entries != 0x3F) - ) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ - ) - UNEXPECTED_IO_APIC(); - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) - UNEXPECTED_IO_APIC(); /* * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, @@ -1483,8 +1457,6 @@ void __init print_IO_APIC(void) if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) - UNEXPECTED_IO_APIC(); } /* @@ -1496,8 +1468,6 @@ void __init print_IO_APIC(void) reg_03.raw != reg_01.raw) { printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); - if (reg_03.bits.__reserved_1) - UNEXPECTED_IO_APIC(); } printk(KERN_DEBUG ".... IRQ redirection table:\n"); diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 318d9055cd97..dd14ffa565b1 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -907,10 +907,6 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in enable_8259A_irq(0); } -void __init UNEXPECTED_IO_APIC(void) -{ -} - void __apicdebuginit print_IO_APIC(void) { int apic, i; @@ -946,40 +942,16 @@ void __apicdebuginit print_IO_APIC(void) printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ - (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ - (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ - (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ - (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ - (reg_01.bits.entries != 0x2E) && - (reg_01.bits.entries != 0x3F) && - (reg_01.bits.entries != 0x03) - ) - UNEXPECTED_IO_APIC(); printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ - (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */ - (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ - (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ - (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ - (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ - ) - UNEXPECTED_IO_APIC(); - if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) - UNEXPECTED_IO_APIC(); if (reg_01.bits.version >= 0x10) { printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) - UNEXPECTED_IO_APIC(); } printk(KERN_DEBUG ".... IRQ redirection table:\n"); -- cgit v1.2.3 From 272a3713bb9e302e0455c894c41180a482d2c8a3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH] x86-64: fix vtime() vsyscall There is a tiny probability that the return value from vtime(time_t *t) is Signed-off-by: Andi Kleen different than the value stored in *t Using a temporary variable solves the problem and gives a faster code. 17: 48 85 ff test %rdi,%rdi 1a: 48 8b 05 00 00 00 00 mov 0(%rip),%rax # __vsyscall_gtod_data.wall_time_tv.tv_sec 21: 74 03 je 26 23: 48 89 07 mov %rax,(%rdi) 26: c9 leaveq 27: c3 retq Signed-off-by: Eric Dumazet --- arch/x86_64/kernel/vsyscall.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index d14cbb3e0ebe..ba330f870679 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -156,11 +156,13 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { + time_t result; if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); - else if (t) - *t = __vsyscall_gtod_data.wall_time_tv.tv_sec; - return __vsyscall_gtod_data.wall_time_tv.tv_sec; + result = __vsyscall_gtod_data.wall_time_tv.tv_sec; + if (t) + *t = result; + return result; } /* Fast way to get current CPU and node. -- cgit v1.2.3 From c8118c6c07f2edfd697aaa0b93e08c3b65a5a675 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH] x86-64: vsyscall_gtod_data diet and vgettimeofday() fix Current vsyscall_gtod_data is large (3 or 4 cache lines dirtied at timer interrupt). We can shrink it to exactly 64 bytes (1 cache line on AMD64) Instead of copying a whole struct clocksource, we copy only needed fields. I deleted an unused field : offset_base This patch fixes one oddity in vgettimeofday(): It can returns a timeval with tv_usec = 1000000. Maybe not a bug, but why not doing the right thing ? Signed-off-by: Eric Dumazet Signed-off-by: Andi Kleen --- arch/x86_64/kernel/vsyscall.c | 53 +++++++++++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 17 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index ba330f870679..dc32cef96195 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -51,13 +51,28 @@ asm("" : "=r" (v) : "0" (x)); \ ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); }) +/* + * vsyscall_gtod_data contains data that is : + * - readonly from vsyscalls + * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64) + * Try to keep this structure as small as possible to avoid cache line ping pongs + */ struct vsyscall_gtod_data_t { - seqlock_t lock; - int sysctl_enabled; - struct timeval wall_time_tv; + seqlock_t lock; + + /* open coded 'struct timespec' */ + time_t wall_time_sec; + u32 wall_time_nsec; + + int sysctl_enabled; struct timezone sys_tz; - cycle_t offset_base; - struct clocksource clock; + struct { /* extract of a clocksource struct */ + cycle_t (*vread)(void); + cycle_t cycle_last; + cycle_t mask; + u32 mult; + u32 shift; + } clock; }; int __vgetcpu_mode __section_vgetcpu_mode; @@ -73,9 +88,13 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); /* copy vsyscall data */ - vsyscall_gtod_data.clock = *clock; - vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec; - vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000; + vsyscall_gtod_data.clock.vread = clock->vread; + vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; + vsyscall_gtod_data.clock.mask = clock->mask; + vsyscall_gtod_data.clock.mult = clock->mult; + vsyscall_gtod_data.clock.shift = clock->shift; + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.sys_tz = sys_tz; write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } @@ -110,7 +129,8 @@ static __always_inline long time_syscall(long *t) static __always_inline void do_vgettimeofday(struct timeval * tv) { cycle_t now, base, mask, cycle_delta; - unsigned long seq, mult, shift, nsec_delta; + unsigned seq; + unsigned long mult, shift, nsec; cycle_t (*vread)(void); do { seq = read_seqbegin(&__vsyscall_gtod_data.lock); @@ -126,21 +146,20 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) mult = __vsyscall_gtod_data.clock.mult; shift = __vsyscall_gtod_data.clock.shift; - *tv = __vsyscall_gtod_data.wall_time_tv; - + tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; + nsec = __vsyscall_gtod_data.wall_time_nsec; } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); /* calculate interval: */ cycle_delta = (now - base) & mask; /* convert to nsecs: */ - nsec_delta = (cycle_delta * mult) >> shift; + nsec += (cycle_delta * mult) >> shift; - /* convert to usecs and add to timespec: */ - tv->tv_usec += nsec_delta / NSEC_PER_USEC; - while (tv->tv_usec > USEC_PER_SEC) { + while (nsec >= NSEC_PER_SEC) { tv->tv_sec += 1; - tv->tv_usec -= USEC_PER_SEC; + nsec -= NSEC_PER_SEC; } + tv->tv_usec = nsec / NSEC_PER_USEC; } int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) @@ -159,7 +178,7 @@ time_t __vsyscall(1) vtime(time_t *t) time_t result; if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); - result = __vsyscall_gtod_data.wall_time_tv.tv_sec; + result = __vsyscall_gtod_data.wall_time_sec; if (t) *t = result; return result; -- cgit v1.2.3 From 141f9cfe0a948c8fe26e5669364ee62c03ea42e8 Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH] x86-64: Fix "Section mismatch" compile warning Fix "Section mismatch" warnings in arch/x86_64/kernel/time.c Signed-off-by: Bernhard Walle Signed-off-by: Andi Kleen --- arch/x86_64/kernel/time.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 91c9066a380d..0652e173813b 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -328,7 +328,7 @@ static unsigned int __init pit_calibrate_tsc(void) #define PIT_MODE 0x43 #define PIT_CH0 0x40 -static void __init __pit_init(int val, u8 mode) +static void __pit_init(int val, u8 mode) { unsigned long flags; @@ -344,12 +344,12 @@ void __init pit_init(void) __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */ } -void __init pit_stop_interrupt(void) +void pit_stop_interrupt(void) { __pit_init(0, 0x30); /* mode 0 */ } -void __init stop_timer_interrupt(void) +void stop_timer_interrupt(void) { char *name; if (hpet_address) { -- cgit v1.2.3 From b92e9fac400d4ae5bc7a75c568e9844ec53ea329 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 2 May 2007 19:27:11 +0200 Subject: [PATCH] x86: fix amd64-agp aperture validation Under CONFIG_DISCONTIGMEM, assuming that a !pfn_valid() implies all subsequent pfn-s are also invalid is wrong. Thus replace this by explicitly checking against the E820 map. AK: make e820 on x86-64 not initdata Signed-off-by: Jan Beulich Signed-off-by: Andi Kleen Acked-by: Mark Langsdorf --- arch/i386/kernel/e820.c | 20 ++++++++++++++++++++ arch/x86_64/kernel/e820.c | 5 +++-- drivers/char/agp/amd64-agp.c | 13 ++++--------- include/asm-i386/e820.h | 1 + 4 files changed, 28 insertions(+), 11 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/kernel/e820.c b/arch/i386/kernel/e820.c index 31f4670ef740..829beec9247e 100644 --- a/arch/i386/kernel/e820.c +++ b/arch/i386/kernel/e820.c @@ -825,6 +825,26 @@ void __init limit_regions(unsigned long long size) print_memory_map("limit_regions endfunc"); } +/* + * This function checks if any part of the range is mapped + * with type. + */ +int +e820_any_mapped(u64 start, u64 end, unsigned type) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + const struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(e820_any_mapped); + /* * This function checks if the entire range is mapped with type. * diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index a490fabfcf47..be8965427a93 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -25,7 +25,7 @@ #include #include -struct e820map e820 __initdata; +struct e820map e820; /* * PFN of last memory page. @@ -98,7 +98,7 @@ static inline int bad_addr(unsigned long *addrp, unsigned long size) * This function checks if any part of the range is mapped * with type. */ -int __meminit +int e820_any_mapped(unsigned long start, unsigned long end, unsigned type) { int i; @@ -112,6 +112,7 @@ e820_any_mapped(unsigned long start, unsigned long end, unsigned type) } return 0; } +EXPORT_SYMBOL_GPL(e820_any_mapped); /* * This function checks if the entire range is mapped with type. diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c index 485720486d60..c9f0f250d78f 100644 --- a/drivers/char/agp/amd64-agp.c +++ b/drivers/char/agp/amd64-agp.c @@ -14,6 +14,7 @@ #include #include #include /* PAGE_SIZE */ +#include #include #include "agp.h" @@ -259,7 +260,6 @@ static const struct agp_bridge_driver amd_8151_driver = { /* Some basic sanity checks for the aperture. */ static int __devinit aperture_valid(u64 aper, u32 size) { - u32 pfn, c; if (aper == 0) { printk(KERN_ERR PFX "No aperture\n"); return 0; @@ -272,14 +272,9 @@ static int __devinit aperture_valid(u64 aper, u32 size) printk(KERN_ERR PFX "Aperture out of bounds\n"); return 0; } - pfn = aper >> PAGE_SHIFT; - for (c = 0; c < size/PAGE_SIZE; c++) { - if (!pfn_valid(pfn + c)) - break; - if (!PageReserved(pfn_to_page(pfn + c))) { - printk(KERN_ERR PFX "Aperture pointing to RAM\n"); - return 0; - } + if (e820_any_mapped(aper, aper + size, E820_RAM)) { + printk(KERN_ERR PFX "Aperture pointing to RAM\n"); + return 0; } /* Request the Aperture. This catches cases when someone else diff --git a/include/asm-i386/e820.h b/include/asm-i386/e820.h index c5b8fc6109d6..096a2a8eb1da 100644 --- a/include/asm-i386/e820.h +++ b/include/asm-i386/e820.h @@ -38,6 +38,7 @@ extern struct e820map e820; extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type); +extern int e820_any_mapped(u64 start, u64 end, unsigned type); extern void find_max_pfn(void); extern void register_bootmem_low_pages(unsigned long max_low_pfn); extern void e820_register_memory(void); -- cgit v1.2.3 From c169859d6dfc7471ef9f2dbd720936e17906a084 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH] x86-64: Clean up asm-x86_64/bugs.h Most of asm-x86_64/bugs.h is code which should be in a C file, so put it there. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Linus Torvalds --- arch/x86_64/kernel/Makefile | 2 +- arch/x86_64/kernel/bugs.c | 28 ++++++++++++++++++++++++++++ include/asm-x86_64/alternative.h | 1 + include/asm-x86_64/bugs.h | 30 ++++-------------------------- 4 files changed, 34 insertions(+), 27 deletions(-) create mode 100644 arch/x86_64/kernel/bugs.c (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 6879b4f01e88..a613e13d0e75 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o + pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o diff --git a/arch/x86_64/kernel/bugs.c b/arch/x86_64/kernel/bugs.c new file mode 100644 index 000000000000..131e541e3f7b --- /dev/null +++ b/arch/x86_64/kernel/bugs.c @@ -0,0 +1,28 @@ +/* + * arch/x86_64/kernel/bugs.c + * + * Copyright (C) 1994 Linus Torvalds + * Copyright (C) 2000 SuSE + * + * This is included by init/main.c to check for architecture-dependent bugs. + * + * Needs: + * void check_bugs(void); + */ + +#include +#include +#include +#include +#include +#include + +void __init check_bugs(void) +{ + identify_cpu(&boot_cpu_data); +#if !defined(CONFIG_SMP) + printk("CPU: "); + print_cpu_info(&boot_cpu_data); +#endif + alternative_instructions(); +} diff --git a/include/asm-x86_64/alternative.h b/include/asm-x86_64/alternative.h index a6657b4f3e0e..67ebea3cc481 100644 --- a/include/asm-x86_64/alternative.h +++ b/include/asm-x86_64/alternative.h @@ -16,6 +16,7 @@ struct alt_instr { u8 pad[5]; }; +extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); struct module; diff --git a/include/asm-x86_64/bugs.h b/include/asm-x86_64/bugs.h index d86c5dd689fa..b33dc04d8f42 100644 --- a/include/asm-x86_64/bugs.h +++ b/include/asm-x86_64/bugs.h @@ -1,28 +1,6 @@ -/* - * include/asm-x86_64/bugs.h - * - * Copyright (C) 1994 Linus Torvalds - * Copyright (C) 2000 SuSE - * - * This is included by init/main.c to check for architecture-dependent bugs. - * - * Needs: - * void check_bugs(void); - */ +#ifndef _ASM_X86_64_BUGS_H +#define _ASM_X86_64_BUGS_H -#include -#include -#include -#include +void check_bugs(void); -extern void alternative_instructions(void); - -static void __init check_bugs(void) -{ - identify_cpu(&boot_cpu_data); -#if !defined(CONFIG_SMP) - printk("CPU: "); - print_cpu_info(&boot_cpu_data); -#endif - alternative_instructions(); -} +#endif /* _ASM_X86_64_BUGS_H */ -- cgit v1.2.3 From f039b754714a422959027cb18bb33760eb8153f0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH] x86: Don't use MWAIT on AMD Family 10 It doesn't put the CPU into deeper sleep states, so it's better to use the standard idle loop to save power. But allow to reenable it anyways for benchmarking. I also removed the obsolete idle=halt on i386 Cc: andreas.herrmann@amd.com Signed-off-by: Andi Kleen --- Documentation/kernel-parameters.txt | 11 +++++++++-- arch/i386/kernel/cpu/amd.c | 5 +++++ arch/i386/kernel/process.c | 17 ++++++++--------- arch/x86_64/kernel/process.c | 12 +++++++----- arch/x86_64/kernel/setup.c | 6 ++++++ include/asm-i386/processor.h | 2 ++ include/asm-x86_64/proto.h | 2 ++ 7 files changed, 39 insertions(+), 16 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4287696f18dd..94ce0d20253d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -695,8 +695,15 @@ and is between 256 and 4096 characters. It is defined in the file idebus= [HW] (E)IDE subsystem - VLB/PCI bus speed See Documentation/ide.txt. - idle= [HW] - Format: idle=poll or idle=halt + idle= [X86] + Format: idle=poll or idle=mwait + Poll forces a polling idle loop that can slightly improves the performance + of waking up a idle CPU, but will use a lot of power and make the system + run hot. Not recommended. + idle=mwait. On systems which support MONITOR/MWAIT but the kernel chose + to not use it because it doesn't save as much power as a normal idle + loop use the MONITOR/MWAIT idle loop anyways. Performance should be the same + as idle=poll. ignore_loglevel [KNL] Ignore loglevel setting - this will print /all/ diff --git a/arch/i386/kernel/cpu/amd.c b/arch/i386/kernel/cpu/amd.c index 2d47db482972..197cda62caa3 100644 --- a/arch/i386/kernel/cpu/amd.c +++ b/arch/i386/kernel/cpu/amd.c @@ -53,6 +53,8 @@ static __cpuinit int amd_apic_timer_broken(void) return 0; } +int force_mwait __cpuinitdata; + static void __cpuinit init_amd(struct cpuinfo_x86 *c) { u32 l, h; @@ -275,6 +277,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (amd_apic_timer_broken()) set_bit(X86_FEATURE_LAPIC_TIMER_BROKEN, c->x86_capability); + + if (c->x86 == 0x10 && !force_mwait) + clear_bit(X86_FEATURE_MWAIT, c->x86_capability); } static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size) diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 393a67d5d943..7e8e129b3d7d 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -272,25 +272,24 @@ void __devinit select_idle_routine(const struct cpuinfo_x86 *c) } } -static int __init idle_setup (char *str) +static int __init idle_setup(char *str) { - if (!strncmp(str, "poll", 4)) { + if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; #ifdef CONFIG_X86_SMP if (smp_num_siblings > 1) printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); #endif - } else if (!strncmp(str, "halt", 4)) { - printk("using halt in idle threads.\n"); - pm_idle = default_idle; - } + } else if (!strcmp(str, "mwait")) + force_mwait = 1; + else + return -1; boot_option_idle_override = 1; - return 1; + return 0; } - -__setup("idle=", idle_setup); +early_param("idle", idle_setup); void show_regs(struct pt_regs * regs) { diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index d8d5ccc245c8..4f21765078b7 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -288,16 +288,18 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) static int __init idle_setup (char *str) { - if (!strncmp(str, "poll", 4)) { + if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; - } + } else if (!strcmp(str, "mwait")) + force_mwait = 1; + else + return -1; boot_option_idle_override = 1; - return 1; + return 0; } - -__setup("idle=", idle_setup); +early_param("idle", idle_setup); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs * regs) diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index 0a1d539149df..db30b5bcef61 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -79,6 +79,8 @@ int bootloader_type; unsigned long saved_video_mode; +int force_mwait __cpuinitdata; + /* * Early DMI memory */ @@ -604,6 +606,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* RDTSC can be speculated around */ clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + + /* Family 10 doesn't support C states in MWAIT so don't use it */ + if (c->x86 == 0x10 && !force_mwait) + clear_bit(X86_FEATURE_MWAIT, &c->x86_capability); } static void __cpuinit detect_ht(struct cpuinfo_x86 *c) diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h index 9d895cc2f312..882d3f8fbbac 100644 --- a/include/asm-i386/processor.h +++ b/include/asm-i386/processor.h @@ -779,4 +779,6 @@ extern int sysenter_setup(void); extern void cpu_set_gdt(int); extern void cpu_init(void); +extern int force_mwait; + #endif /* __ASM_I386_PROCESSOR_H */ diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 3f8f285138d2..98063bcb3b33 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -119,6 +119,8 @@ extern int gsi_irq_sharing(int gsi); extern void smp_local_timer_interrupt(void); +extern int force_mwait; + long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); void i8254_timer_resume(void); -- cgit v1.2.3 From b6e3590f8145c77b8fcef3247e2412335221412f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:12 +0200 Subject: [PATCH] x86: Allow percpu variables to be page-aligned Let's allow page-alignment in general for per-cpu data (wanted by Xen, and Ingo suggested KVM as well). Because larger alignments can use more room, we increase the max per-cpu memory to 64k rather than 32k: it's getting a little tight. Signed-off-by: Rusty Russell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen Acked-by: Ingo Molnar Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/alpha/kernel/vmlinux.lds.S | 2 +- arch/arm/kernel/vmlinux.lds.S | 2 +- arch/cris/arch-v32/vmlinux.lds.S | 1 + arch/frv/kernel/vmlinux.lds.S | 1 + arch/i386/kernel/vmlinux.lds.S | 2 +- arch/m32r/kernel/vmlinux.lds.S | 2 +- arch/mips/kernel/vmlinux.lds.S | 2 +- arch/parisc/kernel/vmlinux.lds.S | 2 +- arch/powerpc/kernel/setup_64.c | 4 ++-- arch/powerpc/kernel/vmlinux.lds.S | 6 +----- arch/ppc/kernel/vmlinux.lds.S | 2 +- arch/s390/kernel/vmlinux.lds.S | 2 +- arch/sh/kernel/vmlinux.lds.S | 2 +- arch/sh64/kernel/vmlinux.lds.S | 2 +- arch/sparc/kernel/vmlinux.lds.S | 2 +- arch/sparc64/kernel/smp.c | 6 +++--- arch/x86_64/kernel/setup64.c | 4 ++-- arch/x86_64/kernel/vmlinux.lds.S | 2 +- arch/xtensa/kernel/vmlinux.lds.S | 2 +- init/main.c | 8 ++------ kernel/module.c | 8 ++++---- 21 files changed, 29 insertions(+), 35 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index 4cc44bd33d33..cf1e6fc6c686 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -69,7 +69,7 @@ SECTIONS . = ALIGN(8); SECURITY_INIT - . = ALIGN(64); + . = ALIGN(8192); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index ddbdad48f5b2..d1a6a597ed9a 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -59,7 +59,7 @@ SECTIONS usr/built-in.o(.init.ramfs) __initramfs_end = .; #endif - . = ALIGN(64); + . = ALIGN(4096); __per_cpu_start = .; *(.data.percpu) __per_cpu_end = .; diff --git a/arch/cris/arch-v32/vmlinux.lds.S b/arch/cris/arch-v32/vmlinux.lds.S index e124fcd766d5..dfa25e1542b9 100644 --- a/arch/cris/arch-v32/vmlinux.lds.S +++ b/arch/cris/arch-v32/vmlinux.lds.S @@ -91,6 +91,7 @@ SECTIONS } SECURITY_INIT + . = ALIGN (8192); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index 97910e016825..28eae9735ad6 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -57,6 +57,7 @@ SECTIONS __alt_instructions_end = .; .altinstr_replacement : { *(.altinstr_replacement) } + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index f4ec72231835..97fe6eac47c9 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -194,7 +194,7 @@ SECTIONS __initramfs_end = .; } #endif - . = ALIGN(L1_CACHE_BYTES); + . = ALIGN(4096); .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { __per_cpu_start = .; *(.data.percpu) diff --git a/arch/m32r/kernel/vmlinux.lds.S b/arch/m32r/kernel/vmlinux.lds.S index 439cc257cd1d..6c73bca3f478 100644 --- a/arch/m32r/kernel/vmlinux.lds.S +++ b/arch/m32r/kernel/vmlinux.lds.S @@ -110,7 +110,7 @@ SECTIONS __initramfs_end = .; #endif - . = ALIGN(32); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index c76b793310c2..043f637e3d10 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -119,7 +119,7 @@ SECTIONS .init.ramfs : { *(.init.ramfs) } __initramfs_end = .; #endif - . = ALIGN(32); + . = ALIGN(_PAGE_SIZE); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 2a8253358c6c..c74585990598 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -181,7 +181,7 @@ SECTIONS .init.ramfs : { *(.init.ramfs) } __initramfs_end = .; #endif - . = ALIGN(32); + . = ALIGN(ASM_PAGE_SIZE); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 22083ce3cc30..6018178708a5 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -582,14 +582,14 @@ void __init setup_per_cpu_areas(void) char *ptr; /* Copy section for each CPU (we discard the original) */ - size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); + size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE); #ifdef CONFIG_MODULES if (size < PERCPU_ENOUGH_ROOM) size = PERCPU_ENOUGH_ROOM; #endif for_each_possible_cpu(i) { - ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); + ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); if (!ptr) panic("Cannot allocate cpu data for CPU %d\n", i); diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 7eefeb4a30e7..132067313147 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -139,11 +139,7 @@ SECTIONS __initramfs_end = .; } #endif -#ifdef CONFIG_PPC32 - . = ALIGN(32); -#else - . = ALIGN(128); -#endif + . = ALIGN(PAGE_SIZE); .data.percpu : { __per_cpu_start = .; *(.data.percpu) diff --git a/arch/ppc/kernel/vmlinux.lds.S b/arch/ppc/kernel/vmlinux.lds.S index a0625562a44b..44cd128fb719 100644 --- a/arch/ppc/kernel/vmlinux.lds.S +++ b/arch/ppc/kernel/vmlinux.lds.S @@ -130,7 +130,7 @@ SECTIONS __ftr_fixup : { *(__ftr_fixup) } __stop___ftr_fixup = .; - . = ALIGN(32); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 418f6426a949..e9d3432aba60 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -107,7 +107,7 @@ SECTIONS . = ALIGN(2); __initramfs_end = .; #endif - . = ALIGN(256); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 78a6c09875b2..2f606d0ce1f6 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -54,7 +54,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); .data.page_aligned : { *(.data.page_aligned) } - . = ALIGN(L1_CACHE_BYTES); + . = ALIGN(PAGE_SIZE); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/sh64/kernel/vmlinux.lds.S b/arch/sh64/kernel/vmlinux.lds.S index a59c5e998131..4f9616f39830 100644 --- a/arch/sh64/kernel/vmlinux.lds.S +++ b/arch/sh64/kernel/vmlinux.lds.S @@ -85,7 +85,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); .data.page_aligned : C_PHYS(.data.page_aligned) { *(.data.page_aligned) } - . = ALIGN(L1_CACHE_BYTES); + . = ALIGN(PAGE_SIZE); __per_cpu_start = .; .data.percpu : C_PHYS(.data.percpu) { *(.data.percpu) } __per_cpu_end = . ; diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index e5c24e0521de..f0bb6e60e620 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -65,7 +65,7 @@ SECTIONS __initramfs_end = .; #endif - . = ALIGN(32); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index d4f0a70f4845..1fac215252e4 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -1343,11 +1343,11 @@ void __init setup_per_cpu_areas(void) /* Copy section for each CPU (we discard the original) */ goal = PERCPU_ENOUGH_ROOM; - __per_cpu_shift = 0; - for (size = 1UL; size < goal; size <<= 1UL) + __per_cpu_shift = PAGE_SHIFT; + for (size = PAGE_SIZE; size < goal; size <<= 1UL) __per_cpu_shift++; - ptr = alloc_bootmem(size * NR_CPUS); + ptr = alloc_bootmem_pages(size * NR_CPUS); __per_cpu_base = ptr - __per_cpu_start; diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index 53064a9a365f..64379a80d763 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -103,9 +103,9 @@ void __init setup_per_cpu_areas(void) if (!NODE_DATA(cpu_to_node(i))) { printk("cpu with no node %d, num_online_nodes %d\n", i, num_online_nodes()); - ptr = alloc_bootmem(size); + ptr = alloc_bootmem_pages(size); } else { - ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); + ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); } if (!ptr) panic("Cannot allocate cpu data for CPU %d\n", i); diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 3bdeb88d28f4..7ef0b8820cd2 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -195,7 +195,7 @@ SECTIONS __initramfs_end = .; #endif - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } __per_cpu_end = .; diff --git a/arch/xtensa/kernel/vmlinux.lds.S b/arch/xtensa/kernel/vmlinux.lds.S index ab6370054cee..4fbd66a52a88 100644 --- a/arch/xtensa/kernel/vmlinux.lds.S +++ b/arch/xtensa/kernel/vmlinux.lds.S @@ -198,7 +198,7 @@ SECTIONS __ftr_fixup : { *(__ftr_fixup) } __stop___ftr_fixup = .; - . = ALIGN(32); + . = ALIGN(4096); __per_cpu_start = .; .data.percpu : { *(.data.percpu) } __per_cpu_end = .; diff --git a/init/main.c b/init/main.c index a92989e7836a..80f09f31bfab 100644 --- a/init/main.c +++ b/init/main.c @@ -369,12 +369,8 @@ static void __init setup_per_cpu_areas(void) unsigned long nr_possible_cpus = num_possible_cpus(); /* Copy section for each CPU (we discard the original) */ - size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES); -#ifdef CONFIG_MODULES - if (size < PERCPU_ENOUGH_ROOM) - size = PERCPU_ENOUGH_ROOM; -#endif - ptr = alloc_bootmem(size * nr_possible_cpus); + size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE); + ptr = alloc_bootmem_pages(size * nr_possible_cpus); for_each_possible_cpu(i) { __per_cpu_offset[i] = ptr - __per_cpu_start; diff --git a/kernel/module.c b/kernel/module.c index cf49ca25fcce..4dc4a257545c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -346,10 +346,10 @@ static void *percpu_modalloc(unsigned long size, unsigned long align, unsigned int i; void *ptr; - if (align > SMP_CACHE_BYTES) { - printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n", - name, align, SMP_CACHE_BYTES); - align = SMP_CACHE_BYTES; + if (align > PAGE_SIZE) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", + name, align, PAGE_SIZE); + align = PAGE_SIZE; } ptr = __per_cpu_start; -- cgit v1.2.3 From 1c3d99c11c47c8a1a9ed6a46555dbf6520683c52 Mon Sep 17 00:00:00 2001 From: Glauber de Oliveira Costa Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH] x86-64: Fix x86_64 compilation with DEBUG_SIG on Setting the DEBUG_SIG flag breaks compilation due to a wrong struct access. Aditionally, it raises two warnings. This is one patch to fix them all. Signed-off-by: Glauber de Oliveira Costa Signed-off-by: Andi Kleen --- arch/x86_64/kernel/signal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 49ec324cd141..c819625f3316 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -141,7 +141,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) goto badframe; #ifdef DEBUG_SIG - printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs.rip,regs.rsp,frame,eax); + printk("%d sigreturn rip:%lx rsp:%lx frame:%p rax:%lx\n",current->pid,regs->rip,regs->rsp,frame,eax); #endif if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->rsp) == -EFAULT) @@ -301,7 +301,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, if (test_thread_flag(TIF_SINGLESTEP)) ptrace_notify(SIGTRAP); #ifdef DEBUG_SIG - printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n", + printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n", current->comm, current->pid, frame, regs->rip, frame->pretcode); #endif @@ -463,7 +463,7 @@ void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { #ifdef DEBUG_SIG - printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%lx pending:%lx\n", + printk("do_notify_resume flags:%x rip:%lx rsp:%lx caller:%p pending:%x\n", thread_info_flags, regs->rip, regs->rsp, __builtin_return_address(0),signal_pending(current)); #endif -- cgit v1.2.3 From 82d1bb725e128c97b362a4b33fcbfff08fdaaa5a Mon Sep 17 00:00:00 2001 From: James Puthukattukaran Date: Wed, 2 May 2007 19:27:13 +0200 Subject: [PATCH] x86-64: x86-64 system crashes when no memory populating Node 0 I have a 4 socket AMD Operton system. The 2.6.18 kernel I have crashes when there is no memory in node0. AK: changed call to _nopanic Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/kernel/aperture.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index b487396c4c5b..a52af5820592 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -51,7 +51,6 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size) static u32 __init allocate_aperture(void) { - pg_data_t *nd0 = NODE_DATA(0); u32 aper_size; void *p; @@ -65,12 +64,12 @@ static u32 __init allocate_aperture(void) * Unfortunately we cannot move it up because that would make the * IOMMU useless. */ - p = __alloc_bootmem_node(nd0, aper_size, aper_size, 0); + p = __alloc_bootmem_nopanic(aper_size, aper_size, 0); if (!p || __pa(p)+aper_size > 0xffffffff) { printk("Cannot allocate aperture memory hole (%p,%uK)\n", p, aper_size>>10); if (p) - free_bootmem_node(nd0, __pa(p), aper_size); + free_bootmem(__pa(p), aper_size); return 0; } printk("Mapping aperture over %d KB of RAM @ %lx\n", -- cgit v1.2.3 From 57decbda6a2a7c400b2a3b3b12e52ccbdc977118 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 2 May 2007 19:27:16 +0200 Subject: [PATCH] x86: update for i386 and x86-64 check_bugs Remove spurious comments, headers and keywords from x86-64 bugs.[ch]. Use identify_boot_cpu() AK: merged with other patch Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Andi Kleen --- arch/x86_64/kernel/bugs.c | 9 +-------- include/asm-i386/bugs.h | 2 +- 2 files changed, 2 insertions(+), 9 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/bugs.c b/arch/x86_64/kernel/bugs.c index 131e541e3f7b..12b585b5345d 100644 --- a/arch/x86_64/kernel/bugs.c +++ b/arch/x86_64/kernel/bugs.c @@ -3,19 +3,12 @@ * * Copyright (C) 1994 Linus Torvalds * Copyright (C) 2000 SuSE - * - * This is included by init/main.c to check for architecture-dependent bugs. - * - * Needs: - * void check_bugs(void); */ #include +#include #include #include -#include -#include -#include void __init check_bugs(void) { diff --git a/include/asm-i386/bugs.h b/include/asm-i386/bugs.h index df539b390448..d28979ff73be 100644 --- a/include/asm-i386/bugs.h +++ b/include/asm-i386/bugs.h @@ -7,6 +7,6 @@ #ifndef _ASM_I386_BUG_H #define _ASM_I386_BUG_H -extern void __init check_bugs(void); +void check_bugs(void); #endif /* _ASM_I386_BUG_H */ -- cgit v1.2.3 From 2b1f6278d77c1f2f669346fc2bb48012b5e9495a Mon Sep 17 00:00:00 2001 From: Bernhard Kaindl Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH] x86: Save the MTRRs of the BSP before booting an AP Applied fix by Andew Morton: http://lkml.org/lkml/2007/4/8/88 - Fix `make headers_check'. AMD and Intel x86 CPU manuals state that it is the responsibility of system software to initialize and maintain MTRR consistency across all processors in Multi-Processing Environments. Quote from page 188 of the AMD64 System Programming manual (Volume 2): 7.6.5 MTRRs in Multi-Processing Environments "In multi-processing environments, the MTRRs located in all processors must characterize memory in the same way. Generally, this means that identical values are written to the MTRRs used by the processors." (short omission here) "Failure to do so may result in coherency violations or loss of atomicity. Processor implementations do not check the MTRR settings in other processors to ensure consistency. It is the responsibility of system software to initialize and maintain MTRR consistency across all processors." Current Linux MTRR code already implements the above in the case that the BIOS does not properly initialize MTRRs on the secondary processors, but the case where the fixed-range MTRRs of the boot processor are changed after Linux started to boot, before the initialsation of a secondary processor, is not handled yet. In this case, secondary processors are currently initialized by Linux with MTRRs which the boot processor had very early, when mtrr_bp_init() did run, but not with the MTRRs which the boot processor uses at the time when that secondary processors is actually booted, causing differing MTRR contents on the secondary processors. Such situation happens on Acer Ferrari 1000 and 5000 notebooks where the BIOS enables and sets AMD-specific IORR bits in the fixed-range MTRRs of the boot processor when it transitions the system into ACPI mode. The SMI handler of the BIOS does this in SMM, entered while Linux ACPI code runs acpi_enable(). Other occasions where the SMI handler of the BIOS may change bits in the MTRRs could occur as well. To initialize newly booted secodary processors with the fixed-range MTRRs which the boot processor uses at that time, this patch saves the fixed-range MTRRs of the boot processor before new secondary processors are started. When the secondary processors run their Linux initialisation code, their fixed-range MTRRs will be updated with the saved fixed-range MTRRs. If CONFIG_MTRR is not set, we define mtrr_save_state as an empty statement because there is nothing to do. Possible TODOs: *) CPU-hotplugging outside of SMP suspend/resume is not yet tested with this patch. *) If, even in this case, an AP never runs i386/do_boot_cpu or x86_64/cpu_up, then the calls to mtrr_save_state() could be replaced by calls to mtrr_save_fixed_ranges(NULL) and mtrr_save_state() would not be needed. That would need either verification of the CPU-hotplug code or at least a test on a >2 CPU machine. *) The MTRRs of other running processors are not yet checked at this time but it might be interesting to syncronize the MTTRs of all processors before booting. That would be an incremental patch, but of rather low priority since there is no machine known so far which would require this. AK: moved prototypes on x86-64 around to fix warnings Signed-off-by: Bernhard Kaindl Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen Cc: Andi Kleen Cc: Dave Jones --- arch/i386/kernel/cpu/mtrr/main.c | 11 +++++++++++ arch/i386/kernel/smpboot.c | 7 +++++++ arch/x86_64/kernel/smpboot.c | 6 ++++++ include/asm-i386/mtrr.h | 2 ++ include/asm-x86_64/mtrr.h | 2 ++ 5 files changed, 28 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index 0acfb6a5a220..02a2f39e5e0a 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -729,6 +729,17 @@ void mtrr_ap_init(void) local_irq_restore(flags); } +/** + * Save current fixed-range MTRR state of the BSP + */ +void mtrr_save_state(void) +{ + if (smp_processor_id() == 0) + mtrr_save_fixed_ranges(NULL); + else + smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1); +} + static int __init mtrr_init_finialize(void) { if (!mtrr_if) diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 7c1dbef399cd..f14d93351a82 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -58,6 +58,7 @@ #include #include #include +#include /* Set if we find a B stepping CPU */ static int __devinitdata smp_b_stepping; @@ -814,6 +815,12 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) unsigned long start_eip; unsigned short nmi_high = 0, nmi_low = 0; + /* + * Save current MTRR state in case it was changed since early boot + * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: + */ + mtrr_save_state(); + /* * We can't use kernel_thread since we must avoid to * reschedule the child. diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 14724be48bec..ddc392bee243 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -944,6 +944,12 @@ int __cpuinit __cpu_up(unsigned int cpu) return -ENOSYS; } + /* + * Save current MTRR state in case it was changed since early boot + * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync: + */ + mtrr_save_state(); + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Boot it! */ err = do_boot_cpu(cpu, apicid); diff --git a/include/asm-i386/mtrr.h b/include/asm-i386/mtrr.h index 02a41b99bd7e..7e9c7ccbdcfe 100644 --- a/include/asm-i386/mtrr.h +++ b/include/asm-i386/mtrr.h @@ -70,6 +70,7 @@ struct mtrr_gentry /* The following functions are for use by other drivers */ # ifdef CONFIG_MTRR extern void mtrr_save_fixed_ranges(void *); +extern void mtrr_save_state(void); extern int mtrr_add (unsigned long base, unsigned long size, unsigned int type, char increment); extern int mtrr_add_page (unsigned long base, unsigned long size, @@ -81,6 +82,7 @@ extern void mtrr_ap_init(void); extern void mtrr_bp_init(void); # else #define mtrr_save_fixed_ranges(arg) do {} while (0) +#define mtrr_save_state() do {} while (0) static __inline__ int mtrr_add (unsigned long base, unsigned long size, unsigned int type, char increment) { diff --git a/include/asm-x86_64/mtrr.h b/include/asm-x86_64/mtrr.h index 1b326cbb9305..b557c486bef8 100644 --- a/include/asm-x86_64/mtrr.h +++ b/include/asm-x86_64/mtrr.h @@ -139,10 +139,12 @@ struct mtrr_gentry32 extern void mtrr_ap_init(void); extern void mtrr_bp_init(void); extern void mtrr_save_fixed_ranges(void *); +extern void mtrr_save_state(void); #else #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) #define mtrr_save_fixed_ranges(arg) do {} while (0) +#define mtrr_save_state() do {} while (0) #endif #endif /* __KERNEL__ */ -- cgit v1.2.3 From 3ebad5905609476a4ff1151a66b21d9794009961 Mon Sep 17 00:00:00 2001 From: Bernhard Kaindl Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH] x86: Save and restore the fixed-range MTRRs of the BSP when suspending Note: This patch didn'nt need an update since it's initial post. Some BIOSes may modify fixed-range MTRRs in SMM, e.g. when they transition the system into ACPI mode, which is entered thru an SMI, triggered by Linux in acpi_enable(). SMIs which cause that Linux is interrupted and BIOS code is executed (which may change e.g. fixed-range MTRRs) in SMM may be raised by an embedded system controller which is often found in notebooks also at other occasions. If we would not update our copy of the fixed-range MTRRs before suspending to RAM or to disk, restore_processor_state() would set the fixed-range MTRRs of the BSP using old backup values which may be outdated and this could cause the system to fail later during resume. This patch ensures that our copy of the fixed-range MTRRs is updated when saving the boot processor state on suspend to disk and suspend to RAM. In combination with other patches this allows to fix s2ram and s2disk on the Acer Ferrari 1000 notebook and at least s2disk on the Acer Ferrari 5000 notebook. Signed-off-by: Bernhard Kaindl Signed-off-by: Andi Kleen Cc: Andrew Morton Cc: Andi Kleen Cc: Dave Jones --- arch/i386/power/cpu.c | 1 + arch/x86_64/kernel/suspend.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c index 2c15500f8713..998fd3ec0d68 100644 --- a/arch/i386/power/cpu.c +++ b/arch/i386/power/cpu.c @@ -21,6 +21,7 @@ unsigned long saved_context_eflags; void __save_processor_state(struct saved_context *ctxt) { + mtrr_save_fixed_ranges(NULL); kernel_fpu_begin(); /* diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index 4ca523d58a5b..6a5a98f2a75c 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -12,6 +12,7 @@ #include #include #include +#include /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; @@ -48,6 +49,7 @@ void __save_processor_state(struct saved_context *ctxt) rdmsrl(MSR_FS_BASE, ctxt->fs_base); rdmsrl(MSR_GS_BASE, ctxt->gs_base); rdmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + mtrr_save_fixed_ranges(NULL); /* * control registers -- cgit v1.2.3 From 8339e9fba33aa3205f541478c413982c0ac5a37f Mon Sep 17 00:00:00 2001 From: Fernando Luis VazquezCao Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH] x86-64: safe_apic_wait_icr_idle - x86_64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit apic_wait_icr_idle looks like this: static __inline__ void apic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) cpu_relax(); } The busy loop in this function would not be problematic if the corresponding status bit in the ICR were always updated, but that does not seem to be the case under certain crash scenarios. Kdump uses an IPI to stop the other CPUs in the event of a crash, but when any of the other CPUs are locked-up inside the NMI handler the CPU that sends the IPI will end up looping forever in the ICR check, effectively hard-locking the whole system. Quoting from Intel's "MultiProcessor Specification" (Version 1.4), B-3: "A local APIC unit indicates successful dispatch of an IPI by resetting the Delivery Status bit in the Interrupt Command Register (ICR). The operating system polls the delivery status bit after sending an INIT or STARTUP IPI until the command has been dispatched. A period of 20 microseconds should be sufficient for IPI dispatch to complete under normal operating conditions. If the IPI is not successfully dispatched, the operating system can abort the command. Alternatively, the operating system can retry the IPI by writing the lower 32-bit double word of the ICR. This “time-out” mechanism can be implemented through an external interrupt, if interrupts are enabled on the processor, or through execution of an instruction or time-stamp counter spin loop." Intel's documentation suggests the implementation of a time-out mechanism, which, by the way, is already being open-coded in some parts of the kernel that tinker with ICR. Create a apic_wait_icr_idle replacement that implements the time-out mechanism and that can be used to solve the aforementioned problem. AK: moved both functions out of line AK: Added improved loop from Keith Owens Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 22 ++++++++++++++++++++++ include/asm-x86_64/apic.h | 8 +++----- 2 files changed, 25 insertions(+), 5 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 3421f21b6c70..943ec4d1bd8e 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -68,6 +68,28 @@ int using_apic_timer __read_mostly = 0; static void apic_pm_activate(void); +void apic_wait_icr_idle(void) +{ + while (apic_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +unsigned int safe_apic_wait_icr_idle(void) +{ + unsigned int send_status; + int timeout; + + timeout = 0; + do { + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + if (!send_status) + break; + udelay(100); + } while (timeout++ < 1000); + + return send_status; +} + void enable_NMI_through_LVT0 (void * dummy) { unsigned int v; diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h index 2f3b013595ab..45e9fca1febc 100644 --- a/include/asm-x86_64/apic.h +++ b/include/asm-x86_64/apic.h @@ -2,6 +2,7 @@ #define __ASM_APIC_H #include +#include #include #include #include @@ -47,11 +48,8 @@ static __inline unsigned int apic_read(unsigned long reg) return *((volatile unsigned int *)(APIC_BASE+reg)); } -static __inline__ void apic_wait_icr_idle(void) -{ - while (apic_read( APIC_ICR ) & APIC_ICR_BUSY) - cpu_relax(); -} +extern void apic_wait_icr_idle(void); +extern unsigned int safe_apic_wait_icr_idle(void); static inline void ack_APIC_irq(void) { -- cgit v1.2.3 From ea8c733b98b73289421ed8f8fc15cfbce3adc3bc Mon Sep 17 00:00:00 2001 From: Fernando Luis VazquezCao Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH] x86-64: use safe_apic_wait_icr_idle in smpboot.c - x86_64 The functionality provided by the new safe_apic_wait_icr_idle is being open-coded all over "kernel/smpboot.c". Use safe_apic_wait_icr_idle instead to consolidate code and ease maintenance. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Andi Kleen --- arch/x86_64/kernel/smpboot.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index ddc392bee243..f2a234b390e8 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -429,8 +429,8 @@ static void inquire_remote_apic(int apicid) */ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip) { - unsigned long send_status = 0, accept_status = 0; - int maxlvt, timeout, num_starts, j; + unsigned long send_status, accept_status = 0; + int maxlvt, num_starts, j; Dprintk("Asserting INIT.\n"); @@ -446,12 +446,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); mdelay(10); @@ -464,12 +459,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); mb(); atomic_set(&init_deasserted, 1); @@ -508,12 +498,7 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta Dprintk("Startup point 1.\n"); Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); + send_status = safe_apic_wait_icr_idle(); /* * Give the other CPU some time to accept the IPI. -- cgit v1.2.3 From 3144c332fa2ee4c4b9804aae5fe31098f04bffd9 Mon Sep 17 00:00:00 2001 From: Fernando Luis VazquezCao Date: Wed, 2 May 2007 19:27:17 +0200 Subject: [PATCH] x86-64: use safe_apic_wait_icr_idle in smpboot.c - x86_64 inquire_remote_apic is used for APIC debugging, so use safe_apic_wait_icr_idle instead of apic_wait_icr_idle to avoid possible lockups when APIC delivery fails. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Andi Kleen --- arch/x86_64/kernel/smpboot.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index f2a234b390e8..082f478fb777 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -391,7 +391,8 @@ static void inquire_remote_apic(int apicid) { unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; char *names[] = { "ID", "VERSION", "SPIV" }; - int timeout, status; + int timeout; + unsigned int status; printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); @@ -401,7 +402,9 @@ static void inquire_remote_apic(int apicid) /* * Wait for idle. */ - apic_wait_icr_idle(); + status = safe_apic_wait_icr_idle(); + if (status) + printk("a previous APIC delivery may have failed\n"); apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); -- cgit v1.2.3 From 9062d888aa448318e38792b6879a795dd10adda4 Mon Sep 17 00:00:00 2001 From: "Fernando Luis [** ISO-8859-1 charset **] VzquezCao" Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH] x86-64: __send_IPI_dest_field - x86_64 Implement __send_IPI_dest_field which can be used to send IPIs when the "destination shorthand" field of the ICR is set to 00 (destination field). Use it whenever possible. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Andi Kleen --- arch/x86_64/kernel/genapic_flat.c | 23 +---------------- include/asm-x86_64/ipi.h | 54 +++++++++++++++++++++++---------------- 2 files changed, 33 insertions(+), 44 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 9e0a552f0e4a..01d3939e3a9c 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -61,31 +61,10 @@ static void flat_init_apic_ldr(void) static void flat_send_IPI_mask(cpumask_t cpumask, int vector) { unsigned long mask = cpus_addr(cpumask)[0]; - unsigned long cfg; unsigned long flags; local_irq_save(flags); - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(mask); - apic_write(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector, APIC_DEST_LOGICAL); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write(APIC_ICR, cfg); + __send_IPI_dest_field(mask, vector, APIC_DEST_LOGICAL); local_irq_restore(flags); } diff --git a/include/asm-x86_64/ipi.h b/include/asm-x86_64/ipi.h index ffa6f1517f1a..26961e671948 100644 --- a/include/asm-x86_64/ipi.h +++ b/include/asm-x86_64/ipi.h @@ -74,10 +74,39 @@ static inline void __send_IPI_shortcut(unsigned int shortcut, int vector, unsign apic_write(APIC_ICR, cfg); } +/* + * This is used to send an IPI with no shorthand notation (the destination is + * specified in bits 56 to 63 of the ICR). + */ +static inline void __send_IPI_dest_field(unsigned int mask, int vector, unsigned int dest) +{ + unsigned long cfg; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + /* + * prepare target chip field + */ + cfg = __prepare_ICR2(mask); + apic_write(APIC_ICR2, cfg); + + /* + * program the ICR + */ + cfg = __prepare_ICR(0, vector, dest); + + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write(APIC_ICR, cfg); +} static inline void send_IPI_mask_sequence(cpumask_t mask, int vector) { - unsigned long cfg, flags; + unsigned long flags; unsigned long query_cpu; /* @@ -86,28 +115,9 @@ static inline void send_IPI_mask_sequence(cpumask_t mask, int vector) * - mbligh */ local_irq_save(flags); - for_each_cpu_mask(query_cpu, mask) { - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - /* - * prepare target chip field - */ - cfg = __prepare_ICR2(x86_cpu_to_apicid[query_cpu]); - apic_write(APIC_ICR2, cfg); - - /* - * program the ICR - */ - cfg = __prepare_ICR(0, vector, APIC_DEST_PHYSICAL); - - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write(APIC_ICR, cfg); + __send_IPI_dest_field(x86_cpu_to_apicid[query_cpu], + vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } -- cgit v1.2.3 From 141a892f57972b01891df7036f567a70459c19ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH] x86-64: move __vgetcpu_mode & __jiffies to the vsyscall_2 zone We apparently hit the 1024 limit of vsyscall_0 zone when some debugging options are set, or if __vsyscall_gtod_data is 64 bytes larger. In order to save 128 bytes from the vsyscall_0 zone, we move __vgetcpu_mode & __jiffies to vsyscall_2 zone where they really belong, since they are used only from vgetcpu() (which is in this vsyscall_2 area). After patch is applied, new layout is : ffffffffff600000 T vgettimeofday ffffffffff60004e t vsysc2 ffffffffff600140 t vread_hpet ffffffffff600150 t vread_tsc ffffffffff600180 D __vsyscall_gtod_data ffffffffff600400 T vtime ffffffffff600413 t vsysc1 ffffffffff600800 T vgetcpu ffffffffff600870 D __vgetcpu_mode ffffffffff600880 D __jiffies ffffffffff600c00 T venosys_1 Signed-off-by: Eric Dumazet Signed-off-by: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/x86_64/kernel/vmlinux.lds.S | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 7ef0b8820cd2..32e427a95ffd 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -95,6 +95,12 @@ SECTIONS { *(.vsyscall_gtod_data) } vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) + { *(.vsyscall_1) } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) + { *(.vsyscall_2) } + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } vgetcpu_mode = VVIRT(.vgetcpu_mode); @@ -102,10 +108,6 @@ SECTIONS .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } jiffies = VVIRT(.jiffies); - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) - { *(.vsyscall_1) } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) - { *(.vsyscall_2) } .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } -- cgit v1.2.3 From 57a4f91ae5571edd7c0428285d8df16bb8bf5f40 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:18 +0200 Subject: [PATCH] x86-64: Auto compute __NR_syscall_max at compile time No need to maintain it anymore Signed-off-by: Andi Kleen --- arch/x86_64/kernel/asm-offsets.c | 10 ++++++++++ arch/x86_64/kernel/syscall.c | 1 + include/asm-x86_64/unistd.h | 2 -- 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c index 96687e2beb2c..778953bc636c 100644 --- a/arch/x86_64/kernel/asm-offsets.c +++ b/arch/x86_64/kernel/asm-offsets.c @@ -21,6 +21,14 @@ #define BLANK() asm volatile("\n->" : : ) +#define __NO_STUBS 1 +#undef __SYSCALL +#undef _ASM_X86_64_UNISTD_H_ +#define __SYSCALL(nr, sym) [nr] = 1, +static char syscalls[] = { +#include +}; + int main(void) { #define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry)) @@ -71,5 +79,7 @@ int main(void) DEFINE(TSS_ist, offsetof(struct tss_struct, ist)); BLANK(); DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx)); + BLANK(); + DEFINE(__NR_syscall_max, sizeof(syscalls) - 1); return 0; } diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c index 213fd6ab789d..63d592c276cc 100644 --- a/arch/x86_64/kernel/syscall.c +++ b/arch/x86_64/kernel/syscall.c @@ -3,6 +3,7 @@ #include #include #include +#include #define __NO_STUBS diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 576b29732d3c..26e23e01c54a 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -620,8 +620,6 @@ __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages 279 __SYSCALL(__NR_move_pages, sys_move_pages) -#define __NR_syscall_max __NR_move_pages - #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_OLD_STAT -- cgit v1.2.3 From 425001fea782dd072cfec5694b93778eb59a1fd3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 May 2007 19:27:19 +0200 Subject: [PATCH] x86-64: unexport cpu_llc_id WARNING: arch/x86_64/kernel/built-in.o - Section mismatch: reference to .init.data:cpu_llc_id from __ksymtab between '__ksymtab_cpu_llc_id' (at offset 0x4a0) and '__ksymtab_smp_num_siblings' It is strange to export a __cpuinitdata symbols to modules, and no module appears to use it anyway. Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Andi Kleen --- arch/x86_64/kernel/smpboot.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 082f478fb777..4d9dacfae575 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -67,7 +67,6 @@ EXPORT_SYMBOL(smp_num_siblings); /* Last level cache ID of each logical CPU */ u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID}; -EXPORT_SYMBOL(cpu_llc_id); /* Bitmask of currently online CPUs */ cpumask_t cpu_online_map __read_mostly; -- cgit v1.2.3 From 8a336b0a4b6dfacc8cc5fd617ba1e1904077de2d Mon Sep 17 00:00:00 2001 From: Tim Hockin Date: Wed, 2 May 2007 19:27:19 +0200 Subject: [PATCH] x86-64: Dynamically adjust machine check interval Background: We've found that MCEs (specifically DRAM SBEs) tend to come in bunches, especially when we are trying really hard to stress the system out. The current MCE poller uses a static interval which does not care whether it has or has not found MCEs recently. Description: This patch makes the MCE poller adjust the polling interval dynamically. If we find an MCE, poll 2x faster (down to 10 ms). When we stop finding MCEs, poll 2x slower (up to check_interval seconds). The check_interval tunable becomes the max polling interval. The "Machine check events logged" printk() is rate limited to the check_interval, which should be identical behavior to the old functionality. Result: If you start to take a lot of correctable errors (not exceptions), you log them faster and more accurately (less chance of overflowing the MCA registers). If you don't take a lot of errors, you will see no change. Alternatives: I considered simply reducing the polling interval to 10 ms immediately and keeping it there as long as we continue to find errors. This felt a bit heavy handed, but does perform significantly better for the default check_interval of 5 minutes (we're using a few seconds when testing for DRAM errors). I could be convinced to go with this, if anyone felt it was not too aggressive. Testing: I used an error-injecting DIMM to create lots of correctable DRAM errors and verified that the polling interval accelerates. The printk() only happens once per check_interval seconds. Patch: This patch is against 2.6.21-rc7. Signed-Off-By: Tim Hockin Signed-off-by: Andi Kleen --- Documentation/x86_64/machinecheck | 7 ++++++- arch/x86_64/kernel/mce.c | 32 ++++++++++++++++++++++++-------- 2 files changed, 30 insertions(+), 9 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/Documentation/x86_64/machinecheck b/Documentation/x86_64/machinecheck index 068a6d9904b9..feaeaf6f6e4d 100644 --- a/Documentation/x86_64/machinecheck +++ b/Documentation/x86_64/machinecheck @@ -36,7 +36,12 @@ between all CPUs. check_interval How often to poll for corrected machine check errors, in seconds - (Note output is hexademical). Default 5 minutes. + (Note output is hexademical). Default 5 minutes. When the poller + finds MCEs it triggers an exponential speedup (poll more often) on + the polling interval. When the poller stops finding MCEs, it + triggers an exponential backoff (poll less often) on the polling + interval. The check_interval variable is both the initial and + maximum polling interval. tolerant Tolerance level. When a machine check exception occurs for a non diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index 8011a8e1c7d4..fa2672682477 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -323,10 +323,13 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status) #endif /* CONFIG_X86_MCE_INTEL */ /* - * Periodic polling timer for "silent" machine check errors. + * Periodic polling timer for "silent" machine check errors. If the + * poller finds an MCE, poll 2x faster. When the poller finds no more + * errors, poll 2x slower (up to check_interval seconds). */ static int check_interval = 5 * 60; /* 5 minutes */ +static int next_interval; /* in jiffies */ static void mcheck_timer(struct work_struct *work); static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); @@ -339,7 +342,6 @@ static void mcheck_check_cpu(void *info) static void mcheck_timer(struct work_struct *work) { on_each_cpu(mcheck_check_cpu, NULL, 1, 1); - schedule_delayed_work(&mcheck_work, check_interval * HZ); /* * It's ok to read stale data here for notify_user and @@ -349,17 +351,30 @@ static void mcheck_timer(struct work_struct *work) * writes. */ if (notify_user && console_logged) { + static unsigned long last_print; + unsigned long now = jiffies; + + /* if we logged an MCE, reduce the polling interval */ + next_interval = max(next_interval/2, HZ/100); notify_user = 0; clear_bit(0, &console_logged); - printk(KERN_INFO "Machine check events logged\n"); + if (time_after_eq(now, last_print + (check_interval*HZ))) { + last_print = now; + printk(KERN_INFO "Machine check events logged\n"); + } + } else { + next_interval = min(next_interval*2, check_interval*HZ); } + + schedule_delayed_work(&mcheck_work, next_interval); } static __init int periodic_mcheck_init(void) { - if (check_interval) - schedule_delayed_work(&mcheck_work, check_interval*HZ); + next_interval = check_interval * HZ; + if (next_interval) + schedule_delayed_work(&mcheck_work, next_interval); return 0; } __initcall(periodic_mcheck_init); @@ -597,12 +612,13 @@ static int mce_resume(struct sys_device *dev) /* Reinit MCEs after user configuration changes */ static void mce_restart(void) { - if (check_interval) + if (next_interval) cancel_delayed_work(&mcheck_work); /* Timer race is harmless here */ on_each_cpu(mce_init, NULL, 1, 1); - if (check_interval) - schedule_delayed_work(&mcheck_work, check_interval*HZ); + next_interval = check_interval * HZ; + if (next_interval) + schedule_delayed_work(&mcheck_work, next_interval); } static struct sysdev_class mce_sysclass = { -- cgit v1.2.3 From 05cb007dac9a50148daf87d0b9469e0cd05fd5e7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH] x86-64: Use the 32bit wd_ops for 64bit too. This mainly removes a lot of code, replacing it with calls into the new 32bit perfctr-watchdog.c Signed-off-by: Andi Kleen --- arch/x86_64/kernel/Makefile | 4 +- arch/x86_64/kernel/nmi.c | 678 +++----------------------------------------- include/asm-x86_64/nmi.h | 9 + 3 files changed, 45 insertions(+), 646 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index a613e13d0e75..4d94c51803d8 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -8,7 +8,8 @@ obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o + pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o bugs.o \ + perfctr-watchdog.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o @@ -57,3 +58,4 @@ i8237-y += ../../i386/kernel/i8237.o msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o alternative-y += ../../i386/kernel/alternative.o pcspeaker-y += ../../i386/kernel/pcspeaker.o +perfctr-watchdog-y += ../../i386/kernel/cpu/perfctr-watchdog.o diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 010d3d9bd56a..6cd2b30e2ffc 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -27,28 +27,11 @@ #include #include #include -#include int unknown_nmi_panic; int nmi_watchdog_enabled; int panic_on_unrecovered_nmi; -/* perfctr_nmi_owner tracks the ownership of the perfctr registers: - * evtsel_nmi_owner tracks the ownership of the event selection - * - different performance counters/ event selection may be reserved for - * different subsystems this reservation system just tries to coordinate - * things a little - */ - -/* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's - * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) - */ -#define NMI_MAX_COUNTER_BITS 66 -#define NMI_MAX_COUNTER_LONGS BITS_TO_LONGS(NMI_MAX_COUNTER_BITS) - -static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner[NMI_MAX_COUNTER_LONGS]); -static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[NMI_MAX_COUNTER_LONGS]); - static cpumask_t backtrace_mask = CPU_MASK_NONE; /* nmi_active: @@ -63,191 +46,11 @@ int panic_on_timeout; unsigned int nmi_watchdog = NMI_DEFAULT; static unsigned int nmi_hz = HZ; -struct nmi_watchdog_ctlblk { - int enabled; - u64 check_bit; - unsigned int cccr_msr; - unsigned int perfctr_msr; /* the MSR to reset in NMI handler */ - unsigned int evntsel_msr; /* the MSR to select the events to handle */ -}; -static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk); +static DEFINE_PER_CPU(short, wd_enabled); /* local prototypes */ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); -/* converts an msr to an appropriate reservation bit */ -static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) -{ - /* returns the bit offset of the performance counter register */ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return (msr - MSR_K7_PERFCTR0); - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_PERFCTR0); - else - return (msr - MSR_P4_BPU_PERFCTR0); - } - return 0; -} - -/* converts an msr to an appropriate reservation bit */ -static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr) -{ - /* returns the bit offset of the event selection register */ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return (msr - MSR_K7_EVNTSEL0); - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return (msr - MSR_ARCH_PERFMON_EVENTSEL0); - else - return (msr - MSR_P4_BSU_ESCR0); - } - return 0; -} - -/* checks for a bit availability (hack for oprofile) */ -int avail_to_resrv_perfctr_nmi_bit(unsigned int counter) -{ - int cpu; - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - for_each_possible_cpu (cpu) { - if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu))) - return 0; - } - return 1; -} - -/* checks the an msr for availability */ -int avail_to_resrv_perfctr_nmi(unsigned int msr) -{ - unsigned int counter; - int cpu; - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - for_each_possible_cpu (cpu) { - if (test_bit(counter, &per_cpu(perfctr_nmi_owner, cpu))) - return 0; - } - return 1; -} - -static int __reserve_perfctr_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - if (!test_and_set_bit(counter, &per_cpu(perfctr_nmi_owner, cpu))) - return 1; - return 0; -} - -static void __release_perfctr_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - clear_bit(counter, &per_cpu(perfctr_nmi_owner, cpu)); -} - -int reserve_perfctr_nmi(unsigned int msr) -{ - int cpu, i; - for_each_possible_cpu (cpu) { - if (!__reserve_perfctr_nmi(cpu, msr)) { - for_each_possible_cpu (i) { - if (i >= cpu) - break; - __release_perfctr_nmi(i, msr); - } - return 0; - } - } - return 1; -} - -void release_perfctr_nmi(unsigned int msr) -{ - int cpu; - for_each_possible_cpu (cpu) - __release_perfctr_nmi(cpu, msr); -} - -int __reserve_evntsel_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_evntsel_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - if (!test_and_set_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0])) - return 1; - return 0; -} - -static void __release_evntsel_nmi(int cpu, unsigned int msr) -{ - unsigned int counter; - if (cpu < 0) - cpu = smp_processor_id(); - - counter = nmi_evntsel_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - clear_bit(counter, &per_cpu(evntsel_nmi_owner, cpu)[0]); -} - -int reserve_evntsel_nmi(unsigned int msr) -{ - int cpu, i; - for_each_possible_cpu (cpu) { - if (!__reserve_evntsel_nmi(cpu, msr)) { - for_each_possible_cpu (i) { - if (i >= cpu) - break; - __release_evntsel_nmi(i, msr); - } - return 0; - } - } - return 1; -} - -void release_evntsel_nmi(unsigned int msr) -{ - int cpu; - for_each_possible_cpu (cpu) { - __release_evntsel_nmi(cpu, msr); - } -} - -static __cpuinit inline int nmi_known_cpu(void) -{ - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - return boot_cpu_data.x86 == 15 || boot_cpu_data.x86 == 16; - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return 1; - else - return (boot_cpu_data.x86 == 15); - } - return 0; -} - /* Run after command line and cpu_init init, but before all other checks */ void nmi_watchdog_default(void) { @@ -277,23 +80,6 @@ static __init void nmi_cpu_busy(void *data) } #endif -static unsigned int adjust_for_32bit_ctr(unsigned int hz) -{ - unsigned int retval = hz; - - /* - * On Intel CPUs with ARCH_PERFMON only 32 bits in the counter - * are writable, with higher bits sign extending from bit 31. - * So, we can only program the counter with 31 bit values and - * 32nd bit should be 1, for 33.. to be 1. - * Find the appropriate nmi_hz - */ - if ((((u64)cpu_khz * 1000) / retval) > 0x7fffffffULL) { - retval = ((u64)cpu_khz * 1000) / 0x7fffffffUL + 1; - } - return retval; -} - int __init check_nmi_watchdog (void) { int *counts; @@ -322,14 +108,14 @@ int __init check_nmi_watchdog (void) mdelay((20*1000)/nmi_hz); // wait 20 ticks for_each_online_cpu(cpu) { - if (!per_cpu(nmi_watchdog_ctlblk, cpu).enabled) + if (!per_cpu(wd_enabled, cpu)) continue; if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) { printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", cpu, counts[cpu], cpu_pda(cpu)->__nmi_count); - per_cpu(nmi_watchdog_ctlblk, cpu).enabled = 0; + per_cpu(wd_enabled, cpu) = 0; atomic_dec(&nmi_active); } } @@ -344,13 +130,8 @@ int __init check_nmi_watchdog (void) /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ - if (nmi_watchdog == NMI_LOCAL_APIC) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - nmi_hz = 1; - if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - } + if (nmi_watchdog == NMI_LOCAL_APIC) + nmi_hz = lapic_adjust_nmi_hz(1); kfree(counts); return 0; @@ -379,57 +160,6 @@ int __init setup_nmi_watchdog(char *str) __setup("nmi_watchdog=", setup_nmi_watchdog); -static void disable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - if (atomic_read(&nmi_active) <= 0) - return; - - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -static void enable_lapic_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_LOCAL_APIC); - - /* are we already enabled */ - if (atomic_read(&nmi_active) != 0) - return; - - /* are we lapic aware */ - if (nmi_known_cpu() <= 0) - return; - - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); - touch_nmi_watchdog(); -} - -void disable_timer_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_IO_APIC); - - if (atomic_read(&nmi_active) <= 0) - return; - - disable_irq(0); - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); - - BUG_ON(atomic_read(&nmi_active) != 0); -} - -void enable_timer_nmi_watchdog(void) -{ - BUG_ON(nmi_watchdog != NMI_IO_APIC); - - if (atomic_read(&nmi_active) == 0) { - touch_nmi_watchdog(); - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); - enable_irq(0); - } -} static void __acpi_nmi_disable(void *__unused) { @@ -515,275 +245,9 @@ late_initcall(init_lapic_nmi_sysfs); #endif /* CONFIG_PM */ -/* - * Activate the NMI watchdog via the local APIC. - * Original code written by Keith Owens. - */ - -/* Note that these events don't tick when the CPU idles. This means - the frequency varies with CPU load. */ - -#define K7_EVNTSEL_ENABLE (1 << 22) -#define K7_EVNTSEL_INT (1 << 20) -#define K7_EVNTSEL_OS (1 << 17) -#define K7_EVNTSEL_USR (1 << 16) -#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76 -#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING - -static int setup_k7_watchdog(void) -{ - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - perfctr_msr = MSR_K7_PERFCTR0; - evntsel_msr = MSR_K7_EVNTSEL0; - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - /* Simulator may not support it */ - if (checking_wrmsrl(evntsel_msr, 0UL)) - goto fail2; - wrmsrl(perfctr_msr, 0UL); - - evntsel = K7_EVNTSEL_INT - | K7_EVNTSEL_OS - | K7_EVNTSEL_USR - | K7_NMI_EVENT; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused - wd->check_bit = 1ULL<<63; - return 1; -fail2: - __release_evntsel_nmi(-1, evntsel_msr); -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_k7_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); -} - -/* Note that these events don't tick when the CPU idles. This means - the frequency varies with CPU load. */ - -#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7) -#define P4_ESCR_EVENT_SELECT(N) ((N)<<25) -#define P4_ESCR_OS (1<<3) -#define P4_ESCR_USR (1<<2) -#define P4_CCCR_OVF_PMI0 (1<<26) -#define P4_CCCR_OVF_PMI1 (1<<27) -#define P4_CCCR_THRESHOLD(N) ((N)<<20) -#define P4_CCCR_COMPLEMENT (1<<19) -#define P4_CCCR_COMPARE (1<<18) -#define P4_CCCR_REQUIRED (3<<16) -#define P4_CCCR_ESCR_SELECT(N) ((N)<<13) -#define P4_CCCR_ENABLE (1<<12) -#define P4_CCCR_OVF (1<<31) -/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter - CRU_ESCR0 (with any non-null event selector) through a complemented - max threshold. [IA32-Vol3, Section 14.9.9] */ - -static int setup_p4_watchdog(void) -{ - unsigned int perfctr_msr, evntsel_msr, cccr_msr; - unsigned int evntsel, cccr_val; - unsigned int misc_enable, dummy; - unsigned int ht_num; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy); - if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL)) - return 0; - -#ifdef CONFIG_SMP - /* detect which hyperthread we are on */ - if (smp_num_siblings == 2) { - unsigned int ebx, apicid; - - ebx = cpuid_ebx(1); - apicid = (ebx >> 24) & 0xff; - ht_num = apicid & 1; - } else -#endif - ht_num = 0; - - /* performance counters are shared resources - * assign each hyperthread its own set - * (re-use the ESCR0 register, seems safe - * and keeps the cccr_val the same) - */ - if (!ht_num) { - /* logical cpu 0 */ - perfctr_msr = MSR_P4_IQ_PERFCTR0; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR0; - cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); - } else { - /* logical cpu 1 */ - perfctr_msr = MSR_P4_IQ_PERFCTR1; - evntsel_msr = MSR_P4_CRU_ESCR0; - cccr_msr = MSR_P4_IQ_CCCR1; - cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); - } - - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - evntsel = P4_ESCR_EVENT_SELECT(0x3F) - | P4_ESCR_OS - | P4_ESCR_USR; - - cccr_val |= P4_CCCR_THRESHOLD(15) - | P4_CCCR_COMPLEMENT - | P4_CCCR_COMPARE - | P4_CCCR_REQUIRED; - - wrmsr(evntsel_msr, evntsel, 0); - wrmsr(cccr_msr, cccr_val, 0); - wrmsrl(perfctr_msr, -((u64)cpu_khz * 1000 / nmi_hz)); - apic_write(APIC_LVTPC, APIC_DM_NMI); - cccr_val |= P4_CCCR_ENABLE; - wrmsr(cccr_msr, cccr_val, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = cccr_msr; - wd->check_bit = 1ULL<<39; - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_p4_watchdog(void) -{ - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - wrmsr(wd->cccr_msr, 0, 0); - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); -} - -#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL -#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK - -static int setup_intel_arch_watchdog(void) -{ - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int perfctr_msr, evntsel_msr; - unsigned int evntsel; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. - */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - goto fail; - - perfctr_msr = MSR_ARCH_PERFMON_PERFCTR1; - evntsel_msr = MSR_ARCH_PERFMON_EVENTSEL1; - - if (!__reserve_perfctr_nmi(-1, perfctr_msr)) - goto fail; - - if (!__reserve_evntsel_nmi(-1, evntsel_msr)) - goto fail1; - - wrmsrl(perfctr_msr, 0UL); - - evntsel = ARCH_PERFMON_EVENTSEL_INT - | ARCH_PERFMON_EVENTSEL_OS - | ARCH_PERFMON_EVENTSEL_USR - | ARCH_PERFMON_NMI_EVENT_SEL - | ARCH_PERFMON_NMI_EVENT_UMASK; - - /* setup the timer */ - wrmsr(evntsel_msr, evntsel, 0); - - nmi_hz = adjust_for_32bit_ctr(nmi_hz); - wrmsr(perfctr_msr, (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0); - - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); - - wd->perfctr_msr = perfctr_msr; - wd->evntsel_msr = evntsel_msr; - wd->cccr_msr = 0; //unused - wd->check_bit = 1ULL << (eax.split.bit_width - 1); - return 1; -fail1: - __release_perfctr_nmi(-1, perfctr_msr); -fail: - return 0; -} - -static void stop_intel_arch_watchdog(void) -{ - unsigned int ebx; - union cpuid10_eax eax; - unsigned int unused; - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* - * Check whether the Architectural PerfMon supports - * Unhalted Core Cycles Event or not. - * NOTE: Corresponding bit = 0 in ebx indicates event present. - */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); - if ((eax.split.mask_length < (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) || - (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT)) - return; - - wrmsr(wd->evntsel_msr, 0, 0); - - __release_evntsel_nmi(-1, wd->evntsel_msr); - __release_perfctr_nmi(-1, wd->perfctr_msr); -} - void setup_apic_nmi_watchdog(void *unused) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - - /* only support LOCAL and IO APICs for now */ - if ((nmi_watchdog != NMI_LOCAL_APIC) && - (nmi_watchdog != NMI_IO_APIC)) - return; - - if (wd->enabled == 1) + if (__get_cpu_var(wd_enabled) == 1) return; /* cheap hack to support suspend/resume */ @@ -791,62 +255,31 @@ void setup_apic_nmi_watchdog(void *unused) if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) return; - if (nmi_watchdog == NMI_LOCAL_APIC) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) - return; - if (!setup_k7_watchdog()) - return; - break; - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - if (!setup_intel_arch_watchdog()) - return; - break; - } - if (!setup_p4_watchdog()) - return; - break; - default: + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + __get_cpu_var(wd_enabled) = 1; + if (lapic_watchdog_init(nmi_hz) < 0) { + __get_cpu_var(wd_enabled) = 0; return; } + /* FALL THROUGH */ + case NMI_IO_APIC: + __get_cpu_var(wd_enabled) = 1; + atomic_inc(&nmi_active); } - wd->enabled = 1; - atomic_inc(&nmi_active); } void stop_apic_nmi_watchdog(void *unused) { - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - /* only support LOCAL and IO APICs for now */ if ((nmi_watchdog != NMI_LOCAL_APIC) && (nmi_watchdog != NMI_IO_APIC)) return; - - if (wd->enabled == 0) + if (__get_cpu_var(wd_enabled) == 0) return; - - if (nmi_watchdog == NMI_LOCAL_APIC) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - if (strstr(boot_cpu_data.x86_model_id, "Screwdriver")) - return; - stop_k7_watchdog(); - break; - case X86_VENDOR_INTEL: - if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - stop_intel_arch_watchdog(); - break; - } - stop_p4_watchdog(); - break; - default: - return; - } - } - wd->enabled = 0; + if (nmi_watchdog == NMI_LOCAL_APIC) + lapic_watchdog_stop(); + __get_cpu_var(wd_enabled) = 0; atomic_dec(&nmi_active); } @@ -885,9 +318,7 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) int sum; int touched = 0; int cpu = smp_processor_id(); - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - u64 dummy; - int rc=0; + int rc = 0; /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) @@ -934,55 +365,20 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) } /* see if the nmi watchdog went off */ - if (wd->enabled) { - if (nmi_watchdog == NMI_LOCAL_APIC) { - rdmsrl(wd->perfctr_msr, dummy); - if (dummy & wd->check_bit){ - /* this wasn't a watchdog timer interrupt */ - goto done; - } - - /* only Intel uses the cccr msr */ - if (wd->cccr_msr != 0) { - /* - * P4 quirks: - * - An overflown perfctr will assert its interrupt - * until the OVF flag in its CCCR is cleared. - * - LVTPC is masked on interrupt and must be - * unmasked by the LVTPC handler. - */ - rdmsrl(wd->cccr_msr, dummy); - dummy &= ~P4_CCCR_OVF; - wrmsrl(wd->cccr_msr, dummy); - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* start the cycle over again */ - wrmsrl(wd->perfctr_msr, - -((u64)cpu_khz * 1000 / nmi_hz)); - } else if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1) { - /* - * ArchPerfom/Core Duo needs to re-unmask - * the apic vector - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); - /* ARCH_PERFMON has 32 bit counter writes */ - wrmsr(wd->perfctr_msr, - (u32)(-((u64)cpu_khz * 1000 / nmi_hz)), 0); - } else { - /* start the cycle over again */ - wrmsrl(wd->perfctr_msr, - -((u64)cpu_khz * 1000 / nmi_hz)); - } - rc = 1; - } else if (nmi_watchdog == NMI_IO_APIC) { - /* don't know how to accurately check for this. - * just assume it was a watchdog timer interrupt - * This matches the old behaviour. - */ - rc = 1; - } else - printk(KERN_WARNING "Unknown enabled NMI hardware?!\n"); + if (!__get_cpu_var(wd_enabled)) + return rc; + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + rc |= lapic_wd_event(nmi_hz); + break; + case NMI_IO_APIC: + /* don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. + */ + rc = 1; + break; } -done: return rc; } @@ -1067,12 +463,4 @@ void __trigger_all_cpu_backtrace(void) EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); -EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); -EXPORT_SYMBOL(reserve_perfctr_nmi); -EXPORT_SYMBOL(release_perfctr_nmi); -EXPORT_SYMBOL(reserve_evntsel_nmi); -EXPORT_SYMBOL(release_evntsel_nmi); -EXPORT_SYMBOL(disable_timer_nmi_watchdog); -EXPORT_SYMBOL(enable_timer_nmi_watchdog); EXPORT_SYMBOL(touch_nmi_watchdog); diff --git a/include/asm-x86_64/nmi.h b/include/asm-x86_64/nmi.h index 72375e7d32a8..d0a7f53b1497 100644 --- a/include/asm-x86_64/nmi.h +++ b/include/asm-x86_64/nmi.h @@ -80,4 +80,13 @@ extern int unknown_nmi_panic; void __trigger_all_cpu_backtrace(void); #define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() + +void lapic_watchdog_stop(void); +int lapic_watchdog_init(unsigned nmi_hz); +int lapic_wd_event(unsigned nmi_hz); +unsigned lapic_adjust_nmi_hz(unsigned hz); +int lapic_watchdog_ok(void); +void disable_lapic_nmi_watchdog(void); +void enable_lapic_nmi_watchdog(void); + #endif /* ASM_NMI_H */ -- cgit v1.2.3 From 4637a74cf2ac3a3696d385c8624d84de789d1bbe Mon Sep 17 00:00:00 2001 From: "David P. Reed" Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH] x86-64: Avoid overflows during apic timer calibration - Use 64bit TSC calculations to avoid handling overflow - Use 32bit unsigned arithmetic for the APIC timer. This way overflows are handled correctly. - Fix exit check of loop to account for apic timer counting down Signed-off-by: dpreed@reed.com Signed-off-by: Andi Kleen --- arch/x86_64/kernel/apic.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 943ec4d1bd8e..d198f7d82e5a 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -839,14 +839,15 @@ static void setup_APIC_timer(unsigned int clocks) static int __init calibrate_APIC_clock(void) { - int apic, apic_start, tsc, tsc_start; + unsigned apic, apic_start; + unsigned long tsc, tsc_start; int result; /* * Put whatever arbitrary (but long enough) timeout * value into the APIC clock, we just want to get the * counter running for calibration. */ - __setup_APIC_LVTT(1000000000); + __setup_APIC_LVTT(4000000000); apic_start = apic_read(APIC_TMCCT); #ifdef CONFIG_X86_PM_TIMER @@ -857,13 +858,13 @@ static int __init calibrate_APIC_clock(void) } else #endif { - rdtscl(tsc_start); + rdtscll(tsc_start); do { apic = apic_read(APIC_TMCCT); - rdtscl(tsc); + rdtscll(tsc); } while ((tsc - tsc_start) < TICK_COUNT && - (apic - apic_start) < TICK_COUNT); + (apic_start - apic) < TICK_COUNT); result = (apic_start - apic) * 1000L * tsc_khz / (tsc - tsc_start); -- cgit v1.2.3 From 72b1b1d0133d7eb4040697f1052bf92123fb051b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:20 +0200 Subject: [PATCH] x86-64: Use symbolic CPU features in early CPUID check Dead to magic numbers! Generated code is the same. Signed-off-by: Andi Kleen --- arch/x86_64/kernel/verify_cpu.S | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/verify_cpu.S b/arch/x86_64/kernel/verify_cpu.S index 72edabd2ef9a..e035f5948199 100644 --- a/arch/x86_64/kernel/verify_cpu.S +++ b/arch/x86_64/kernel/verify_cpu.S @@ -30,18 +30,27 @@ * appropriately. Either display a message or halt. */ -verify_cpu: +#include +verify_cpu: pushfl # Save caller passed flags pushl $0 # Kill any dangerous flags popfl - /* minimum CPUID flags for x86-64 */ - /* see http://www.x86-64.org/lists/discuss/msg02971.html */ -#define SSE_MASK ((1<<25)|(1<<26)) -#define REQUIRED_MASK1 ((1<<0)|(1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<8)|\ - (1<<13)|(1<<15)|(1<<24)) -#define REQUIRED_MASK2 (1<<29) + /* minimum CPUID flags for x86-64 as defined by AMD */ +#define M(x) (1<<(x)) +#define M2(a,b) M(a)|M(b) +#define M4(a,b,c,d) M(a)|M(b)|M(c)|M(d) + +#define SSE_MASK \ + (M2(X86_FEATURE_XMM,X86_FEATURE_XMM2)) +#define REQUIRED_MASK1 \ + (M4(X86_FEATURE_FPU,X86_FEATURE_PSE,X86_FEATURE_TSC,X86_FEATURE_MSR)|\ + M4(X86_FEATURE_PAE,X86_FEATURE_CX8,X86_FEATURE_PGE,X86_FEATURE_CMOV)|\ + M(X86_FEATURE_FXSR)) +#define REQUIRED_MASK2 \ + (M(X86_FEATURE_LM - 32)) + pushfl # standard way to check for cpuid popl %eax movl %eax,%ebx -- cgit v1.2.3 From 2136220d00d84e5dd923f23552f75b1864a76f21 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:21 +0200 Subject: [PATCH] x86-64: Remove CONFIG_REORDER The option never worked well and functionlist wasn't well maintained. Also it made the build very slow on many binutils version. So just remove it. Cc: arjan@linux.intel.com Signed-off-by: Andi Kleen --- arch/x86_64/Kconfig | 8 - arch/x86_64/Makefile | 1 - arch/x86_64/kernel/functionlist | 1284 -------------------------------------- arch/x86_64/kernel/vmlinux.lds.S | 3 - 4 files changed, 1296 deletions(-) delete mode 100644 arch/x86_64/kernel/functionlist (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 715632026073..1cad418373c3 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -660,14 +660,6 @@ config CC_STACKPROTECTOR_ALL source kernel/Kconfig.hz -config REORDER - bool "Function reordering" - default n - help - This option enables the toolchain to reorder functions for a more - optimal TLB usage. If you have pretty much any version of binutils, - this can increase your kernel build time by roughly one minute. - config K8_NB def_bool y depends on AGP_AMD64 || IOMMU || (PCI && NUMA) diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 803cfcc9b346..29617ae3926d 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -40,7 +40,6 @@ cflags-y += -m64 cflags-y += -mno-red-zone cflags-y += -mcmodel=kernel cflags-y += -pipe -cflags-kernel-$(CONFIG_REORDER) += -ffunction-sections cflags-y += -Wno-sign-compare cflags-y += -fno-asynchronous-unwind-tables ifneq ($(CONFIG_DEBUG_INFO),y) diff --git a/arch/x86_64/kernel/functionlist b/arch/x86_64/kernel/functionlist deleted file mode 100644 index 7ae18ec12454..000000000000 --- a/arch/x86_64/kernel/functionlist +++ /dev/null @@ -1,1284 +0,0 @@ -*(.text.flush_thread) -*(.text.check_poison_obj) -*(.text.copy_page) -*(.text.__set_personality) -*(.text.gart_map_sg) -*(.text.kmem_cache_free) -*(.text.find_get_page) -*(.text._raw_spin_lock) -*(.text.ide_outb) -*(.text.unmap_vmas) -*(.text.copy_page_range) -*(.text.kprobe_handler) -*(.text.__handle_mm_fault) -*(.text.__d_lookup) -*(.text.copy_user_generic) -*(.text.__link_path_walk) -*(.text.get_page_from_freelist) -*(.text.kmem_cache_alloc) -*(.text.drive_cmd_intr) -*(.text.ia32_setup_sigcontext) -*(.text.huge_pte_offset) -*(.text.do_page_fault) -*(.text.page_remove_rmap) -*(.text.release_pages) -*(.text.ide_end_request) -*(.text.__mutex_lock_slowpath) -*(.text.__find_get_block) -*(.text.kfree) -*(.text.vfs_read) -*(.text._raw_spin_unlock) -*(.text.free_hot_cold_page) -*(.text.fget_light) -*(.text.schedule) -*(.text.memcmp) -*(.text.touch_atime) -*(.text.__might_sleep) -*(.text.__down_read_trylock) -*(.text.arch_pick_mmap_layout) -*(.text.find_vma) -*(.text.__make_request) -*(.text.do_generic_mapping_read) -*(.text.mutex_lock_interruptible) -*(.text.__generic_file_aio_read) -*(.text._atomic_dec_and_lock) -*(.text.__wake_up_bit) -*(.text.add_to_page_cache) -*(.text.cache_alloc_debugcheck_after) -*(.text.vm_normal_page) -*(.text.mutex_debug_check_no_locks_freed) -*(.text.net_rx_action) -*(.text.__find_first_zero_bit) -*(.text.put_page) -*(.text._raw_read_lock) -*(.text.__delay) -*(.text.dnotify_parent) -*(.text.do_path_lookup) -*(.text.do_sync_read) -*(.text.do_lookup) -*(.text.bit_waitqueue) -*(.text.file_read_actor) -*(.text.strncpy_from_user) -*(.text.__pagevec_lru_add_active) -*(.text.fget) -*(.text.dput) -*(.text.__strnlen_user) -*(.text.inotify_inode_queue_event) -*(.text.rw_verify_area) -*(.text.ide_intr) -*(.text.inotify_dentry_parent_queue_event) -*(.text.permission) -*(.text.memscan) -*(.text.hpet_rtc_interrupt) -*(.text.do_mmap_pgoff) -*(.text.current_fs_time) -*(.text.vfs_getattr) -*(.text.kmem_flagcheck) -*(.text.mark_page_accessed) -*(.text.free_pages_and_swap_cache) -*(.text.generic_fillattr) -*(.text.__block_prepare_write) -*(.text.__set_page_dirty_nobuffers) -*(.text.link_path_walk) -*(.text.find_get_pages_tag) -*(.text.ide_do_request) -*(.text.__alloc_pages) -*(.text.generic_permission) -*(.text.mod_page_state_offset) -*(.text.free_pgd_range) -*(.text.generic_file_buffered_write) -*(.text.number) -*(.text.ide_do_rw_disk) -*(.text.__brelse) -*(.text.__mod_page_state_offset) -*(.text.rotate_reclaimable_page) -*(.text.find_vma_prepare) -*(.text.find_vma_prev) -*(.text.lru_cache_add_active) -*(.text.__kmalloc_track_caller) -*(.text.smp_invalidate_interrupt) -*(.text.handle_IRQ_event) -*(.text.__find_get_block_slow) -*(.text.do_wp_page) -*(.text.do_select) -*(.text.set_user_nice) -*(.text.sys_read) -*(.text.do_munmap) -*(.text.csum_partial) -*(.text.__do_softirq) -*(.text.may_open) -*(.text.getname) -*(.text.get_empty_filp) -*(.text.__fput) -*(.text.remove_mapping) -*(.text.filp_ctor) -*(.text.poison_obj) -*(.text.unmap_region) -*(.text.test_set_page_writeback) -*(.text.__do_page_cache_readahead) -*(.text.sock_def_readable) -*(.text.ide_outl) -*(.text.shrink_zone) -*(.text.rb_insert_color) -*(.text.get_request) -*(.text.sys_pread64) -*(.text.spin_bug) -*(.text.ide_outsl) -*(.text.mask_and_ack_8259A) -*(.text.filemap_nopage) -*(.text.page_add_file_rmap) -*(.text.find_lock_page) -*(.text.tcp_poll) -*(.text.__mark_inode_dirty) -*(.text.file_ra_state_init) -*(.text.generic_file_llseek) -*(.text.__pagevec_lru_add) -*(.text.page_cache_readahead) -*(.text.n_tty_receive_buf) -*(.text.zonelist_policy) -*(.text.vma_adjust) -*(.text.test_clear_page_dirty) -*(.text.sync_buffer) -*(.text.do_exit) -*(.text.__bitmap_weight) -*(.text.alloc_pages_current) -*(.text.get_unused_fd) -*(.text.zone_watermark_ok) -*(.text.cpuset_update_task_memory_state) -*(.text.__bitmap_empty) -*(.text.sys_munmap) -*(.text.__inode_dir_notify) -*(.text.__generic_file_aio_write_nolock) -*(.text.__pte_alloc) -*(.text.sys_select) -*(.text.vm_acct_memory) -*(.text.vfs_write) -*(.text.__lru_add_drain) -*(.text.prio_tree_insert) -*(.text.generic_file_aio_read) -*(.text.vma_merge) -*(.text.block_write_full_page) -*(.text.__page_set_anon_rmap) -*(.text.apic_timer_interrupt) -*(.text.release_console_sem) -*(.text.sys_write) -*(.text.sys_brk) -*(.text.dup_mm) -*(.text.read_current_timer) -*(.text.ll_rw_block) -*(.text.blk_rq_map_sg) -*(.text.dbg_userword) -*(.text.__block_commit_write) -*(.text.cache_grow) -*(.text.copy_strings) -*(.text.release_task) -*(.text.do_sync_write) -*(.text.unlock_page) -*(.text.load_elf_binary) -*(.text.__follow_mount) -*(.text.__getblk) -*(.text.do_sys_open) -*(.text.current_kernel_time) -*(.text.call_rcu) -*(.text.write_chan) -*(.text.vsnprintf) -*(.text.dummy_inode_setsecurity) -*(.text.submit_bh) -*(.text.poll_freewait) -*(.text.bio_alloc_bioset) -*(.text.skb_clone) -*(.text.page_waitqueue) -*(.text.__mutex_lock_interruptible_slowpath) -*(.text.get_index) -*(.text.csum_partial_copy_generic) -*(.text.bad_range) -*(.text.remove_vma) -*(.text.cp_new_stat) -*(.text.alloc_arraycache) -*(.text.test_clear_page_writeback) -*(.text.strsep) -*(.text.open_namei) -*(.text._raw_read_unlock) -*(.text.get_vma_policy) -*(.text.__down_write_trylock) -*(.text.find_get_pages) -*(.text.tcp_rcv_established) -*(.text.generic_make_request) -*(.text.__block_write_full_page) -*(.text.cfq_set_request) -*(.text.sys_inotify_init) -*(.text.split_vma) -*(.text.__mod_timer) -*(.text.get_options) -*(.text.vma_link) -*(.text.mpage_writepages) -*(.text.truncate_complete_page) -*(.text.tcp_recvmsg) -*(.text.sigprocmask) -*(.text.filemap_populate) -*(.text.sys_close) -*(.text.inotify_dev_queue_event) -*(.text.do_task_stat) -*(.text.__dentry_open) -*(.text.unlink_file_vma) -*(.text.__pollwait) -*(.text.packet_rcv_spkt) -*(.text.drop_buffers) -*(.text.free_pgtables) -*(.text.generic_file_direct_write) -*(.text.copy_process) -*(.text.netif_receive_skb) -*(.text.dnotify_flush) -*(.text.print_bad_pte) -*(.text.anon_vma_unlink) -*(.text.sys_mprotect) -*(.text.sync_sb_inodes) -*(.text.find_inode_fast) -*(.text.dummy_inode_readlink) -*(.text.putname) -*(.text.init_smp_flush) -*(.text.dbg_redzone2) -*(.text.sk_run_filter) -*(.text.may_expand_vm) -*(.text.generic_file_aio_write) -*(.text.find_next_zero_bit) -*(.text.file_kill) -*(.text.audit_getname) -*(.text.arch_unmap_area_topdown) -*(.text.alloc_page_vma) -*(.text.tcp_transmit_skb) -*(.text.rb_next) -*(.text.dbg_redzone1) -*(.text.generic_file_mmap) -*(.text.vfs_fstat) -*(.text.sys_time) -*(.text.page_lock_anon_vma) -*(.text.get_unmapped_area) -*(.text.remote_llseek) -*(.text.__up_read) -*(.text.fd_install) -*(.text.eventpoll_init_file) -*(.text.dma_alloc_coherent) -*(.text.create_empty_buffers) -*(.text.__mutex_unlock_slowpath) -*(.text.dup_fd) -*(.text.d_alloc) -*(.text.tty_ldisc_try) -*(.text.sys_stime) -*(.text.__rb_rotate_right) -*(.text.d_validate) -*(.text.rb_erase) -*(.text.path_release) -*(.text.memmove) -*(.text.invalidate_complete_page) -*(.text.clear_inode) -*(.text.cache_estimate) -*(.text.alloc_buffer_head) -*(.text.smp_call_function_interrupt) -*(.text.flush_tlb_others) -*(.text.file_move) -*(.text.balance_dirty_pages_ratelimited) -*(.text.vma_prio_tree_add) -*(.text.timespec_trunc) -*(.text.mempool_alloc) -*(.text.iget_locked) -*(.text.d_alloc_root) -*(.text.cpuset_populate_dir) -*(.text.anon_vma_prepare) -*(.text.sys_newstat) -*(.text.alloc_page_interleave) -*(.text.__path_lookup_intent_open) -*(.text.__pagevec_free) -*(.text.inode_init_once) -*(.text.free_vfsmnt) -*(.text.__user_walk_fd) -*(.text.cfq_idle_slice_timer) -*(.text.sys_mmap) -*(.text.sys_llseek) -*(.text.prio_tree_remove) -*(.text.filp_close) -*(.text.file_permission) -*(.text.vma_prio_tree_remove) -*(.text.tcp_ack) -*(.text.nameidata_to_filp) -*(.text.sys_lseek) -*(.text.percpu_counter_mod) -*(.text.igrab) -*(.text.__bread) -*(.text.alloc_inode) -*(.text.filldir) -*(.text.__rb_rotate_left) -*(.text.irq_affinity_write_proc) -*(.text.init_request_from_bio) -*(.text.find_or_create_page) -*(.text.tty_poll) -*(.text.tcp_sendmsg) -*(.text.ide_wait_stat) -*(.text.free_buffer_head) -*(.text.flush_signal_handlers) -*(.text.tcp_v4_rcv) -*(.text.nr_blockdev_pages) -*(.text.locks_remove_flock) -*(.text.__iowrite32_copy) -*(.text.do_filp_open) -*(.text.try_to_release_page) -*(.text.page_add_new_anon_rmap) -*(.text.kmem_cache_size) -*(.text.eth_type_trans) -*(.text.try_to_free_buffers) -*(.text.schedule_tail) -*(.text.proc_lookup) -*(.text.no_llseek) -*(.text.kfree_skbmem) -*(.text.do_wait) -*(.text.do_mpage_readpage) -*(.text.vfs_stat_fd) -*(.text.tty_write) -*(.text.705) -*(.text.sync_page) -*(.text.__remove_shared_vm_struct) -*(.text.__kfree_skb) -*(.text.sock_poll) -*(.text.get_request_wait) -*(.text.do_sigaction) -*(.text.do_brk) -*(.text.tcp_event_data_recv) -*(.text.read_chan) -*(.text.pipe_writev) -*(.text.__emul_lookup_dentry) -*(.text.rtc_get_rtc_time) -*(.text.print_objinfo) -*(.text.file_update_time) -*(.text.do_signal) -*(.text.disable_8259A_irq) -*(.text.blk_queue_bounce) -*(.text.__anon_vma_link) -*(.text.__vma_link) -*(.text.vfs_rename) -*(.text.sys_newlstat) -*(.text.sys_newfstat) -*(.text.sys_mknod) -*(.text.__show_regs) -*(.text.iput) -*(.text.get_signal_to_deliver) -*(.text.flush_tlb_page) -*(.text.debug_mutex_wake_waiter) -*(.text.copy_thread) -*(.text.clear_page_dirty_for_io) -*(.text.buffer_io_error) -*(.text.vfs_permission) -*(.text.truncate_inode_pages_range) -*(.text.sys_recvfrom) -*(.text.remove_suid) -*(.text.mark_buffer_dirty) -*(.text.local_bh_enable) -*(.text.get_zeroed_page) -*(.text.get_vmalloc_info) -*(.text.flush_old_exec) -*(.text.dummy_inode_permission) -*(.text.__bio_add_page) -*(.text.prio_tree_replace) -*(.text.notify_change) -*(.text.mntput_no_expire) -*(.text.fput) -*(.text.__end_that_request_first) -*(.text.wake_up_bit) -*(.text.unuse_mm) -*(.text.shrink_icache_memory) -*(.text.sched_balance_self) -*(.text.__pmd_alloc) -*(.text.pipe_poll) -*(.text.normal_poll) -*(.text.__free_pages) -*(.text.follow_mount) -*(.text.cdrom_start_packet_command) -*(.text.blk_recount_segments) -*(.text.bio_put) -*(.text.__alloc_skb) -*(.text.__wake_up) -*(.text.vm_stat_account) -*(.text.sys_fcntl) -*(.text.sys_fadvise64) -*(.text._raw_write_unlock) -*(.text.__pud_alloc) -*(.text.alloc_page_buffers) -*(.text.vfs_llseek) -*(.text.sockfd_lookup) -*(.text._raw_write_lock) -*(.text.put_compound_page) -*(.text.prune_dcache) -*(.text.pipe_readv) -*(.text.mempool_free) -*(.text.make_ahead_window) -*(.text.lru_add_drain) -*(.text.constant_test_bit) -*(.text.__clear_user) -*(.text.arch_unmap_area) -*(.text.anon_vma_link) -*(.text.sys_chroot) -*(.text.setup_arg_pages) -*(.text.radix_tree_preload) -*(.text.init_rwsem) -*(.text.generic_osync_inode) -*(.text.generic_delete_inode) -*(.text.do_sys_poll) -*(.text.dev_queue_xmit) -*(.text.default_llseek) -*(.text.__writeback_single_inode) -*(.text.vfs_ioctl) -*(.text.__up_write) -*(.text.unix_poll) -*(.text.sys_rt_sigprocmask) -*(.text.sock_recvmsg) -*(.text.recalc_bh_state) -*(.text.__put_unused_fd) -*(.text.process_backlog) -*(.text.locks_remove_posix) -*(.text.lease_modify) -*(.text.expand_files) -*(.text.end_buffer_read_nobh) -*(.text.d_splice_alias) -*(.text.debug_mutex_init_waiter) -*(.text.copy_from_user) -*(.text.cap_vm_enough_memory) -*(.text.show_vfsmnt) -*(.text.release_sock) -*(.text.pfifo_fast_enqueue) -*(.text.half_md4_transform) -*(.text.fs_may_remount_ro) -*(.text.do_fork) -*(.text.copy_hugetlb_page_range) -*(.text.cache_free_debugcheck) -*(.text.__tcp_select_window) -*(.text.task_handoff_register) -*(.text.sys_open) -*(.text.strlcpy) -*(.text.skb_copy_datagram_iovec) -*(.text.set_up_list3s) -*(.text.release_open_intent) -*(.text.qdisc_restart) -*(.text.n_tty_chars_in_buffer) -*(.text.inode_change_ok) -*(.text.__downgrade_write) -*(.text.debug_mutex_unlock) -*(.text.add_timer_randomness) -*(.text.sock_common_recvmsg) -*(.text.set_bh_page) -*(.text.printk_lock) -*(.text.path_release_on_umount) -*(.text.ip_output) -*(.text.ide_build_dmatable) -*(.text.__get_user_8) -*(.text.end_buffer_read_sync) -*(.text.__d_path) -*(.text.d_move) -*(.text.del_timer) -*(.text.constant_test_bit) -*(.text.blockable_page_cache_readahead) -*(.text.tty_read) -*(.text.sys_readlink) -*(.text.sys_faccessat) -*(.text.read_swap_cache_async) -*(.text.pty_write_room) -*(.text.page_address_in_vma) -*(.text.kthread) -*(.text.cfq_exit_io_context) -*(.text.__tcp_push_pending_frames) -*(.text.sys_pipe) -*(.text.submit_bio) -*(.text.pid_revalidate) -*(.text.page_referenced_file) -*(.text.lock_sock) -*(.text.get_page_state_node) -*(.text.generic_block_bmap) -*(.text.do_setitimer) -*(.text.dev_queue_xmit_nit) -*(.text.copy_from_read_buf) -*(.text.__const_udelay) -*(.text.console_conditional_schedule) -*(.text.wake_up_new_task) -*(.text.wait_for_completion_interruptible) -*(.text.tcp_rcv_rtt_update) -*(.text.sys_mlockall) -*(.text.set_fs_altroot) -*(.text.schedule_timeout) -*(.text.nr_free_pagecache_pages) -*(.text.nf_iterate) -*(.text.mapping_tagged) -*(.text.ip_queue_xmit) -*(.text.ip_local_deliver) -*(.text.follow_page) -*(.text.elf_map) -*(.text.dummy_file_permission) -*(.text.dispose_list) -*(.text.dentry_open) -*(.text.dentry_iput) -*(.text.bio_alloc) -*(.text.wait_on_page_bit) -*(.text.vfs_readdir) -*(.text.vfs_lstat) -*(.text.seq_escape) -*(.text.__posix_lock_file) -*(.text.mm_release) -*(.text.kref_put) -*(.text.ip_rcv) -*(.text.__iget) -*(.text.free_pages) -*(.text.find_mergeable_anon_vma) -*(.text.find_extend_vma) -*(.text.dummy_inode_listsecurity) -*(.text.bio_add_page) -*(.text.__vm_enough_memory) -*(.text.vfs_stat) -*(.text.tty_paranoia_check) -*(.text.tcp_read_sock) -*(.text.tcp_data_queue) -*(.text.sys_uname) -*(.text.sys_renameat) -*(.text.__strncpy_from_user) -*(.text.__mutex_init) -*(.text.__lookup_hash) -*(.text.kref_get) -*(.text.ip_route_input) -*(.text.__insert_inode_hash) -*(.text.do_sock_write) -*(.text.blk_done_softirq) -*(.text.__wake_up_sync) -*(.text.__vma_link_rb) -*(.text.tty_ioctl) -*(.text.tracesys) -*(.text.sys_getdents) -*(.text.sys_dup) -*(.text.stub_execve) -*(.text.sha_transform) -*(.text.radix_tree_tag_clear) -*(.text.put_unused_fd) -*(.text.put_files_struct) -*(.text.mpage_readpages) -*(.text.may_delete) -*(.text.kmem_cache_create) -*(.text.ip_mc_output) -*(.text.interleave_nodes) -*(.text.groups_search) -*(.text.generic_drop_inode) -*(.text.generic_commit_write) -*(.text.fcntl_setlk) -*(.text.exit_mmap) -*(.text.end_page_writeback) -*(.text.__d_rehash) -*(.text.debug_mutex_free_waiter) -*(.text.csum_ipv6_magic) -*(.text.count) -*(.text.cleanup_rbuf) -*(.text.check_spinlock_acquired_node) -*(.text.can_vma_merge_after) -*(.text.bio_endio) -*(.text.alloc_pidmap) -*(.text.write_ldt) -*(.text.vmtruncate_range) -*(.text.vfs_create) -*(.text.__user_walk) -*(.text.update_send_head) -*(.text.unmap_underlying_metadata) -*(.text.tty_ldisc_deref) -*(.text.tcp_setsockopt) -*(.text.tcp_send_ack) -*(.text.sys_pause) -*(.text.sys_gettimeofday) -*(.text.sync_dirty_buffer) -*(.text.strncmp) -*(.text.release_posix_timer) -*(.text.proc_file_read) -*(.text.prepare_to_wait) -*(.text.locks_mandatory_locked) -*(.text.interruptible_sleep_on_timeout) -*(.text.inode_sub_bytes) -*(.text.in_group_p) -*(.text.hrtimer_try_to_cancel) -*(.text.filldir64) -*(.text.fasync_helper) -*(.text.dummy_sb_pivotroot) -*(.text.d_lookup) -*(.text.d_instantiate) -*(.text.__d_find_alias) -*(.text.cpu_idle_wait) -*(.text.cond_resched_lock) -*(.text.chown_common) -*(.text.blk_congestion_wait) -*(.text.activate_page) -*(.text.unlock_buffer) -*(.text.tty_wakeup) -*(.text.tcp_v4_do_rcv) -*(.text.tcp_current_mss) -*(.text.sys_openat) -*(.text.sys_fchdir) -*(.text.strnlen_user) -*(.text.strnlen) -*(.text.strchr) -*(.text.sock_common_getsockopt) -*(.text.skb_checksum) -*(.text.remove_wait_queue) -*(.text.rb_replace_node) -*(.text.radix_tree_node_ctor) -*(.text.pty_chars_in_buffer) -*(.text.profile_hit) -*(.text.prio_tree_left) -*(.text.pgd_clear_bad) -*(.text.pfifo_fast_dequeue) -*(.text.page_referenced) -*(.text.open_exec) -*(.text.mmput) -*(.text.mm_init) -*(.text.__ide_dma_off_quietly) -*(.text.ide_dma_intr) -*(.text.hrtimer_start) -*(.text.get_io_context) -*(.text.__get_free_pages) -*(.text.find_first_zero_bit) -*(.text.file_free_rcu) -*(.text.dummy_socket_sendmsg) -*(.text.do_unlinkat) -*(.text.do_arch_prctl) -*(.text.destroy_inode) -*(.text.can_vma_merge_before) -*(.text.block_sync_page) -*(.text.block_prepare_write) -*(.text.bio_init) -*(.text.arch_ptrace) -*(.text.wake_up_inode) -*(.text.wait_on_retry_sync_kiocb) -*(.text.vma_prio_tree_next) -*(.text.tcp_rcv_space_adjust) -*(.text.__tcp_ack_snd_check) -*(.text.sys_utime) -*(.text.sys_recvmsg) -*(.text.sys_mremap) -*(.text.sys_bdflush) -*(.text.sleep_on) -*(.text.set_page_dirty_lock) -*(.text.seq_path) -*(.text.schedule_timeout_interruptible) -*(.text.sched_fork) -*(.text.rt_run_flush) -*(.text.profile_munmap) -*(.text.prepare_binprm) -*(.text.__pagevec_release_nonlru) -*(.text.m_show) -*(.text.lookup_mnt) -*(.text.__lookup_mnt) -*(.text.lock_timer_base) -*(.text.is_subdir) -*(.text.invalidate_bh_lru) -*(.text.init_buffer_head) -*(.text.ifind_fast) -*(.text.ide_dma_start) -*(.text.__get_page_state) -*(.text.flock_to_posix_lock) -*(.text.__find_symbol) -*(.text.do_futex) -*(.text.do_execve) -*(.text.dirty_writeback_centisecs_handler) -*(.text.dev_watchdog) -*(.text.can_share_swap_page) -*(.text.blkdev_put) -*(.text.bio_get_nr_vecs) -*(.text.xfrm_compile_policy) -*(.text.vma_prio_tree_insert) -*(.text.vfs_lstat_fd) -*(.text.__user_path_lookup_open) -*(.text.thread_return) -*(.text.tcp_send_delayed_ack) -*(.text.sock_def_error_report) -*(.text.shrink_slab) -*(.text.serial_out) -*(.text.seq_read) -*(.text.secure_ip_id) -*(.text.search_binary_handler) -*(.text.proc_pid_unhash) -*(.text.pagevec_lookup) -*(.text.new_inode) -*(.text.memcpy_toiovec) -*(.text.locks_free_lock) -*(.text.__lock_page) -*(.text.__lock_buffer) -*(.text.load_module) -*(.text.is_bad_inode) -*(.text.invalidate_inode_buffers) -*(.text.insert_vm_struct) -*(.text.inode_setattr) -*(.text.inode_add_bytes) -*(.text.ide_read_24) -*(.text.ide_get_error_location) -*(.text.ide_do_drive_cmd) -*(.text.get_locked_pte) -*(.text.get_filesystem_list) -*(.text.generic_file_open) -*(.text.follow_down) -*(.text.find_next_bit) -*(.text.__find_first_bit) -*(.text.exit_mm) -*(.text.exec_keys) -*(.text.end_buffer_write_sync) -*(.text.end_bio_bh_io_sync) -*(.text.dummy_socket_shutdown) -*(.text.d_rehash) -*(.text.d_path) -*(.text.do_ioctl) -*(.text.dget_locked) -*(.text.copy_thread_group_keys) -*(.text.cdrom_end_request) -*(.text.cap_bprm_apply_creds) -*(.text.blk_rq_bio_prep) -*(.text.__bitmap_intersects) -*(.text.bio_phys_segments) -*(.text.bio_free) -*(.text.arch_get_unmapped_area_topdown) -*(.text.writeback_in_progress) -*(.text.vfs_follow_link) -*(.text.tcp_rcv_state_process) -*(.text.tcp_check_space) -*(.text.sys_stat) -*(.text.sys_rt_sigreturn) -*(.text.sys_rt_sigaction) -*(.text.sys_remap_file_pages) -*(.text.sys_pwrite64) -*(.text.sys_fchownat) -*(.text.sys_fchmodat) -*(.text.strncat) -*(.text.strlcat) -*(.text.strcmp) -*(.text.steal_locks) -*(.text.sock_create) -*(.text.sk_stream_rfree) -*(.text.sk_stream_mem_schedule) -*(.text.skip_atoi) -*(.text.sk_alloc) -*(.text.show_stat) -*(.text.set_fs_pwd) -*(.text.set_binfmt) -*(.text.pty_unthrottle) -*(.text.proc_symlink) -*(.text.pipe_release) -*(.text.pageout) -*(.text.n_tty_write_wakeup) -*(.text.n_tty_ioctl) -*(.text.nr_free_zone_pages) -*(.text.migration_thread) -*(.text.mempool_free_slab) -*(.text.meminfo_read_proc) -*(.text.max_sane_readahead) -*(.text.lru_cache_add) -*(.text.kill_fasync) -*(.text.kernel_read) -*(.text.invalidate_mapping_pages) -*(.text.inode_has_buffers) -*(.text.init_once) -*(.text.inet_sendmsg) -*(.text.idedisk_issue_flush) -*(.text.generic_file_write) -*(.text.free_more_memory) -*(.text.__free_fdtable) -*(.text.filp_dtor) -*(.text.exit_sem) -*(.text.exit_itimers) -*(.text.error_interrupt) -*(.text.end_buffer_async_write) -*(.text.eligible_child) -*(.text.elf_map) -*(.text.dump_task_regs) -*(.text.dummy_task_setscheduler) -*(.text.dummy_socket_accept) -*(.text.dummy_file_free_security) -*(.text.__down_read) -*(.text.do_sock_read) -*(.text.do_sigaltstack) -*(.text.do_mremap) -*(.text.current_io_context) -*(.text.cpu_swap_callback) -*(.text.copy_vma) -*(.text.cap_bprm_set_security) -*(.text.blk_insert_request) -*(.text.bio_map_kern_endio) -*(.text.bio_hw_segments) -*(.text.bictcp_cong_avoid) -*(.text.add_interrupt_randomness) -*(.text.wait_for_completion) -*(.text.version_read_proc) -*(.text.unix_write_space) -*(.text.tty_ldisc_ref_wait) -*(.text.tty_ldisc_put) -*(.text.try_to_wake_up) -*(.text.tcp_v4_tw_remember_stamp) -*(.text.tcp_try_undo_dsack) -*(.text.tcp_may_send_now) -*(.text.sys_waitid) -*(.text.sys_sched_getparam) -*(.text.sys_getppid) -*(.text.sys_getcwd) -*(.text.sys_dup2) -*(.text.sys_chmod) -*(.text.sys_chdir) -*(.text.sprintf) -*(.text.sock_wfree) -*(.text.sock_aio_write) -*(.text.skb_drop_fraglist) -*(.text.skb_dequeue) -*(.text.set_close_on_exec) -*(.text.set_brk) -*(.text.seq_puts) -*(.text.SELECT_DRIVE) -*(.text.sched_exec) -*(.text.return_EIO) -*(.text.remove_from_page_cache) -*(.text.rcu_start_batch) -*(.text.__put_task_struct) -*(.text.proc_pid_readdir) -*(.text.proc_get_inode) -*(.text.prepare_to_wait_exclusive) -*(.text.pipe_wait) -*(.text.pipe_new) -*(.text.pdflush_operation) -*(.text.__pagevec_release) -*(.text.pagevec_lookup_tag) -*(.text.packet_rcv) -*(.text.n_tty_set_room) -*(.text.nr_free_pages) -*(.text.__net_timestamp) -*(.text.mpage_end_io_read) -*(.text.mod_timer) -*(.text.__memcpy) -*(.text.mb_cache_shrink_fn) -*(.text.lock_rename) -*(.text.kstrdup) -*(.text.is_ignored) -*(.text.int_very_careful) -*(.text.inotify_inode_is_dead) -*(.text.inotify_get_cookie) -*(.text.inode_get_bytes) -*(.text.init_timer) -*(.text.init_dev) -*(.text.inet_getname) -*(.text.ide_map_sg) -*(.text.__ide_dma_end) -*(.text.hrtimer_get_remaining) -*(.text.get_task_mm) -*(.text.get_random_int) -*(.text.free_pipe_info) -*(.text.filemap_write_and_wait_range) -*(.text.exit_thread) -*(.text.enter_idle) -*(.text.end_that_request_first) -*(.text.end_8259A_irq) -*(.text.dummy_file_alloc_security) -*(.text.do_group_exit) -*(.text.debug_mutex_init) -*(.text.cpuset_exit) -*(.text.cpu_idle) -*(.text.copy_semundo) -*(.text.copy_files) -*(.text.chrdev_open) -*(.text.cdrom_transfer_packet_command) -*(.text.cdrom_mode_sense) -*(.text.blk_phys_contig_segment) -*(.text.blk_get_queue) -*(.text.bio_split) -*(.text.audit_alloc) -*(.text.anon_pipe_buf_release) -*(.text.add_wait_queue_exclusive) -*(.text.add_wait_queue) -*(.text.acct_process) -*(.text.account) -*(.text.zeromap_page_range) -*(.text.yield) -*(.text.writeback_acquire) -*(.text.worker_thread) -*(.text.wait_on_page_writeback_range) -*(.text.__wait_on_buffer) -*(.text.vscnprintf) -*(.text.vmalloc_to_pfn) -*(.text.vgacon_save_screen) -*(.text.vfs_unlink) -*(.text.vfs_rmdir) -*(.text.unregister_md_personality) -*(.text.unlock_new_inode) -*(.text.unix_stream_sendmsg) -*(.text.unix_stream_recvmsg) -*(.text.unhash_process) -*(.text.udp_v4_lookup_longway) -*(.text.tty_ldisc_flush) -*(.text.tty_ldisc_enable) -*(.text.tty_hung_up_p) -*(.text.tty_buffer_free_all) -*(.text.tso_fragment) -*(.text.try_to_del_timer_sync) -*(.text.tcp_v4_err) -*(.text.tcp_unhash) -*(.text.tcp_seq_next) -*(.text.tcp_select_initial_window) -*(.text.tcp_sacktag_write_queue) -*(.text.tcp_cwnd_validate) -*(.text.sys_vhangup) -*(.text.sys_uselib) -*(.text.sys_symlink) -*(.text.sys_signal) -*(.text.sys_poll) -*(.text.sys_mount) -*(.text.sys_kill) -*(.text.sys_ioctl) -*(.text.sys_inotify_add_watch) -*(.text.sys_getuid) -*(.text.sys_getrlimit) -*(.text.sys_getitimer) -*(.text.sys_getgroups) -*(.text.sys_ftruncate) -*(.text.sysfs_lookup) -*(.text.sys_exit_group) -*(.text.stub_fork) -*(.text.sscanf) -*(.text.sock_map_fd) -*(.text.sock_get_timestamp) -*(.text.__sock_create) -*(.text.smp_call_function_single) -*(.text.sk_stop_timer) -*(.text.skb_copy_and_csum_datagram) -*(.text.__skb_checksum_complete) -*(.text.single_next) -*(.text.sigqueue_alloc) -*(.text.shrink_dcache_parent) -*(.text.select_idle_routine) -*(.text.run_workqueue) -*(.text.run_local_timers) -*(.text.remove_inode_hash) -*(.text.remove_dquot_ref) -*(.text.register_binfmt) -*(.text.read_cache_pages) -*(.text.rb_last) -*(.text.pty_open) -*(.text.proc_root_readdir) -*(.text.proc_pid_flush) -*(.text.proc_pident_lookup) -*(.text.proc_fill_super) -*(.text.proc_exe_link) -*(.text.posix_locks_deadlock) -*(.text.pipe_iov_copy_from_user) -*(.text.opost) -*(.text.nf_register_hook) -*(.text.netif_rx_ni) -*(.text.m_start) -*(.text.mpage_writepage) -*(.text.mm_alloc) -*(.text.memory_open) -*(.text.mark_buffer_async_write) -*(.text.lru_add_drain_all) -*(.text.locks_init_lock) -*(.text.locks_delete_lock) -*(.text.lock_hrtimer_base) -*(.text.load_script) -*(.text.__kill_fasync) -*(.text.ip_mc_sf_allow) -*(.text.__ioremap) -*(.text.int_with_check) -*(.text.int_sqrt) -*(.text.install_thread_keyring) -*(.text.init_page_buffers) -*(.text.inet_sock_destruct) -*(.text.idle_notifier_register) -*(.text.ide_execute_command) -*(.text.ide_end_drive_cmd) -*(.text.__ide_dma_host_on) -*(.text.hrtimer_run_queues) -*(.text.hpet_mask_rtc_irq_bit) -*(.text.__get_zone_counts) -*(.text.get_zone_counts) -*(.text.get_write_access) -*(.text.get_fs_struct) -*(.text.get_dirty_limits) -*(.text.generic_readlink) -*(.text.free_hot_page) -*(.text.finish_wait) -*(.text.find_inode) -*(.text.find_first_bit) -*(.text.__filemap_fdatawrite_range) -*(.text.__filemap_copy_from_user_iovec) -*(.text.exit_aio) -*(.text.elv_set_request) -*(.text.elv_former_request) -*(.text.dup_namespace) -*(.text.dupfd) -*(.text.dummy_socket_getsockopt) -*(.text.dummy_sb_post_mountroot) -*(.text.dummy_quotactl) -*(.text.dummy_inode_rename) -*(.text.__do_SAK) -*(.text.do_pipe) -*(.text.do_fsync) -*(.text.d_instantiate_unique) -*(.text.d_find_alias) -*(.text.deny_write_access) -*(.text.dentry_unhash) -*(.text.d_delete) -*(.text.datagram_poll) -*(.text.cpuset_fork) -*(.text.cpuid_read) -*(.text.copy_namespace) -*(.text.cond_resched) -*(.text.check_version) -*(.text.__change_page_attr) -*(.text.cfq_slab_kill) -*(.text.cfq_completed_request) -*(.text.cdrom_pc_intr) -*(.text.cdrom_decode_status) -*(.text.cap_capset_check) -*(.text.blk_put_request) -*(.text.bio_fs_destructor) -*(.text.bictcp_min_cwnd) -*(.text.alloc_chrdev_region) -*(.text.add_element) -*(.text.acct_update_integrals) -*(.text.write_boundary_block) -*(.text.writeback_release) -*(.text.writeback_inodes) -*(.text.wake_up_state) -*(.text.__wake_up_locked) -*(.text.wake_futex) -*(.text.wait_task_inactive) -*(.text.__wait_on_freeing_inode) -*(.text.wait_noreap_copyout) -*(.text.vmstat_start) -*(.text.vgacon_do_font_op) -*(.text.vfs_readv) -*(.text.vfs_quota_sync) -*(.text.update_queue) -*(.text.unshare_files) -*(.text.unmap_vm_area) -*(.text.unix_socketpair) -*(.text.unix_release_sock) -*(.text.unix_detach_fds) -*(.text.unix_create1) -*(.text.unix_bind) -*(.text.udp_sendmsg) -*(.text.udp_rcv) -*(.text.udp_queue_rcv_skb) -*(.text.uart_write) -*(.text.uart_startup) -*(.text.uart_open) -*(.text.tty_vhangup) -*(.text.tty_termios_baud_rate) -*(.text.tty_release) -*(.text.tty_ldisc_ref) -*(.text.throttle_vm_writeout) -*(.text.058) -*(.text.tcp_xmit_probe_skb) -*(.text.tcp_v4_send_check) -*(.text.tcp_v4_destroy_sock) -*(.text.tcp_sync_mss) -*(.text.tcp_snd_test) -*(.text.tcp_slow_start) -*(.text.tcp_send_fin) -*(.text.tcp_rtt_estimator) -*(.text.tcp_parse_options) -*(.text.tcp_ioctl) -*(.text.tcp_init_tso_segs) -*(.text.tcp_init_cwnd) -*(.text.tcp_getsockopt) -*(.text.tcp_fin) -*(.text.tcp_connect) -*(.text.tcp_cong_avoid) -*(.text.__tcp_checksum_complete_user) -*(.text.task_dumpable) -*(.text.sys_wait4) -*(.text.sys_utimes) -*(.text.sys_symlinkat) -*(.text.sys_socketpair) -*(.text.sys_rmdir) -*(.text.sys_readahead) -*(.text.sys_nanosleep) -*(.text.sys_linkat) -*(.text.sys_fstat) -*(.text.sysfs_readdir) -*(.text.sys_execve) -*(.text.sysenter_tracesys) -*(.text.sys_chown) -*(.text.stub_clone) -*(.text.strrchr) -*(.text.strncpy) -*(.text.stopmachine_set_state) -*(.text.sock_sendmsg) -*(.text.sock_release) -*(.text.sock_fasync) -*(.text.sock_close) -*(.text.sk_stream_write_space) -*(.text.sk_reset_timer) -*(.text.skb_split) -*(.text.skb_recv_datagram) -*(.text.skb_queue_tail) -*(.text.sk_attach_filter) -*(.text.si_swapinfo) -*(.text.simple_strtoll) -*(.text.set_termios) -*(.text.set_task_comm) -*(.text.set_shrinker) -*(.text.set_normalized_timespec) -*(.text.set_brk) -*(.text.serial_in) -*(.text.seq_printf) -*(.text.secure_dccp_sequence_number) -*(.text.rwlock_bug) -*(.text.rt_hash_code) -*(.text.__rta_fill) -*(.text.__request_resource) -*(.text.relocate_new_kernel) -*(.text.release_thread) -*(.text.release_mem) -*(.text.rb_prev) -*(.text.rb_first) -*(.text.random_poll) -*(.text.__put_super_and_need_restart) -*(.text.pty_write) -*(.text.ptrace_stop) -*(.text.proc_self_readlink) -*(.text.proc_root_lookup) -*(.text.proc_root_link) -*(.text.proc_pid_make_inode) -*(.text.proc_pid_attr_write) -*(.text.proc_lookupfd) -*(.text.proc_delete_inode) -*(.text.posix_same_owner) -*(.text.posix_block_lock) -*(.text.poll_initwait) -*(.text.pipe_write) -*(.text.pipe_read_fasync) -*(.text.pipe_ioctl) -*(.text.pdflush) -*(.text.pci_user_read_config_dword) -*(.text.page_readlink) -*(.text.null_lseek) -*(.text.nf_hook_slow) -*(.text.netlink_sock_destruct) -*(.text.netlink_broadcast) -*(.text.neigh_resolve_output) -*(.text.name_to_int) -*(.text.mwait_idle) -*(.text.mutex_trylock) -*(.text.mutex_debug_check_no_locks_held) -*(.text.m_stop) -*(.text.mpage_end_io_write) -*(.text.mpage_alloc) -*(.text.move_page_tables) -*(.text.mounts_open) -*(.text.__memset) -*(.text.memcpy_fromiovec) -*(.text.make_8259A_irq) -*(.text.lookup_user_key_possessed) -*(.text.lookup_create) -*(.text.locks_insert_lock) -*(.text.locks_alloc_lock) -*(.text.kthread_should_stop) -*(.text.kswapd) -*(.text.kobject_uevent) -*(.text.kobject_get_path) -*(.text.kobject_get) -*(.text.klist_children_put) -*(.text.__ip_route_output_key) -*(.text.ip_flush_pending_frames) -*(.text.ip_compute_csum) -*(.text.ip_append_data) -*(.text.ioc_set_batching) -*(.text.invalidate_inode_pages) -*(.text.__invalidate_device) -*(.text.install_arg_page) -*(.text.in_sched_functions) -*(.text.inotify_unmount_inodes) -*(.text.init_once) -*(.text.init_cdrom_command) -*(.text.inet_stream_connect) -*(.text.inet_sk_rebuild_header) -*(.text.inet_csk_addr2sockaddr) -*(.text.inet_create) -*(.text.ifind) -*(.text.ide_setup_dma) -*(.text.ide_outsw) -*(.text.ide_fixstring) -*(.text.ide_dma_setup) -*(.text.ide_cdrom_packet) -*(.text.ide_cd_put) -*(.text.ide_build_sglist) -*(.text.i8259A_shutdown) -*(.text.hung_up_tty_ioctl) -*(.text.hrtimer_nanosleep) -*(.text.hrtimer_init) -*(.text.hrtimer_cancel) -*(.text.hash_futex) -*(.text.group_send_sig_info) -*(.text.grab_cache_page_nowait) -*(.text.get_wchan) -*(.text.get_stack) -*(.text.get_page_state) -*(.text.getnstimeofday) -*(.text.get_node) -*(.text.get_kprobe) -*(.text.generic_unplug_device) -*(.text.free_task) -*(.text.frag_show) -*(.text.find_next_zero_string) -*(.text.filp_open) -*(.text.fillonedir) -*(.text.exit_io_context) -*(.text.exit_idle) -*(.text.exact_lock) -*(.text.eth_header) -*(.text.dummy_unregister_security) -*(.text.dummy_socket_post_create) -*(.text.dummy_socket_listen) -*(.text.dummy_quota_on) -*(.text.dummy_inode_follow_link) -*(.text.dummy_file_receive) -*(.text.dummy_file_mprotect) -*(.text.dummy_file_lock) -*(.text.dummy_file_ioctl) -*(.text.dummy_bprm_post_apply_creds) -*(.text.do_writepages) -*(.text.__down_interruptible) -*(.text.do_notify_resume) -*(.text.do_acct_process) -*(.text.del_timer_sync) -*(.text.default_rebuild_header) -*(.text.d_callback) -*(.text.dcache_readdir) -*(.text.ctrl_dumpfamily) -*(.text.cpuset_rmdir) -*(.text.copy_strings_kernel) -*(.text.con_write_room) -*(.text.complete_all) -*(.text.collect_sigign_sigcatch) -*(.text.clear_user) -*(.text.check_unthrottle) -*(.text.cdrom_release) -*(.text.cdrom_newpc_intr) -*(.text.cdrom_ioctl) -*(.text.cdrom_check_status) -*(.text.cdev_put) -*(.text.cdev_add) -*(.text.cap_ptrace) -*(.text.cap_bprm_secureexec) -*(.text.cache_alloc_refill) -*(.text.bmap) -*(.text.blk_run_queue) -*(.text.blk_queue_dma_alignment) -*(.text.blk_ordered_req_seq) -*(.text.blk_backing_dev_unplug) -*(.text.__bitmap_subset) -*(.text.__bitmap_and) -*(.text.bio_unmap_user) -*(.text.__bforget) -*(.text.bd_forget) -*(.text.bad_pipe_w) -*(.text.bad_get_user) -*(.text.audit_free) -*(.text.anon_vma_ctor) -*(.text.anon_pipe_buf_map) -*(.text.alloc_sock_iocb) -*(.text.alloc_fdset) -*(.text.aio_kick_handler) -*(.text.__add_entropy_words) -*(.text.add_disk_randomness) diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 32e427a95ffd..88cfa50b424d 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -30,9 +30,6 @@ SECTIONS /* First the code that has to be first for bootstrapping */ *(.bootstrap.text) _stext = .; - /* Then all the functions that are "hot" in profiles, to group them - onto the same hugetlb entry */ - #include "functionlist" /* Then the rest */ *(.text) SCHED_TEXT -- cgit v1.2.3 From f19cccf366a07e05703c90038704a3a5ffcb0607 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 2 May 2007 19:27:21 +0200 Subject: [PATCH] x86-64: Fix allnoconfig error in genapic_flat.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix: In file included from include2/asm/apic.h:5, from include2/asm/smp.h:15, from linux/arch/x86_64/kernel/genapic_flat.c:18: linux/include/linux/pm.h: In function ‘call_platform_enable_wakeup’: linux/include/linux/pm.h:331: error: ‘EIO’ undeclared (first use in this function) linux/include/linux/pm.h:331: error: (Each undeclared identifier is reported only once linux/include/linux/pm.h:331: error: for each function it appears in.) Signed-off-by: Andi Kleen --- arch/x86_64/kernel/genapic_flat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86_64/kernel') diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c index 01d3939e3a9c..ecb01eefdd27 100644 --- a/arch/x86_64/kernel/genapic_flat.c +++ b/arch/x86_64/kernel/genapic_flat.c @@ -8,6 +8,7 @@ * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and * James Cleverdon. */ +#include #include #include #include -- cgit v1.2.3