Diffstat (limited to 'arch/i386/kernel')
64 files changed, 2397 insertions, 981 deletions
diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 51ecd512603d..4cc83b322b36 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups.o +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_X86_NUMAQ) += numaq.o obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o obj-$(CONFIG_KPROBES) += kprobes.o diff --git a/arch/i386/kernel/acpi/Makefile b/arch/i386/kernel/acpi/Makefile index ee75cb286cfe..5e291a20c03d 100644 --- a/arch/i386/kernel/acpi/Makefile +++ b/arch/i386/kernel/acpi/Makefile @@ -2,3 +2,7 @@ obj-$(CONFIG_ACPI_BOOT) := boot.o obj-$(CONFIG_X86_IO_APIC) += earlyquirk.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup.o +ifneq ($(CONFIG_ACPI_PROCESSOR),) +obj-y += cstate.o +endif + diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c index 848bb97af7ca..b7808a89d945 100644 --- a/arch/i386/kernel/acpi/boot.c +++ b/arch/i386/kernel/acpi/boot.c @@ -29,6 +29,7 @@ #include <linux/efi.h> #include <linux/irq.h> #include <linux/module.h> +#include <linux/dmi.h> #include <asm/pgtable.h> #include <asm/io_apic.h> @@ -158,9 +159,15 @@ char *__acpi_map_table(unsigned long phys, unsigned long size) #endif #ifdef CONFIG_PCI_MMCONFIG -static int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size) +/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ +struct acpi_table_mcfg_config *pci_mmcfg_config; +int pci_mmcfg_config_num; + +int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size) { struct acpi_table_mcfg *mcfg; + unsigned long i; + int config_size; if (!phys_addr || !size) return -EINVAL; @@ -171,18 +178,38 @@ static int __init acpi_parse_mcfg(unsigned long phys_addr, unsigned long size) return -ENODEV; } - if (mcfg->base_reserved) { - printk(KERN_ERR PREFIX "MMCONFIG not in low 4GB of memory\n"); + /* how many config structures do we have */ + pci_mmcfg_config_num = 0; + i = size - sizeof(struct acpi_table_mcfg); + while (i >= sizeof(struct acpi_table_mcfg_config)) { + ++pci_mmcfg_config_num; + i -= sizeof(struct acpi_table_mcfg_config); + }; + if (pci_mmcfg_config_num == 0) { + printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); return -ENODEV; } - pci_mmcfg_base_addr = mcfg->base_address; + config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); + pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); + if (!pci_mmcfg_config) { + printk(KERN_WARNING PREFIX + "No memory for MCFG config tables\n"); + return -ENOMEM; + } + + memcpy(pci_mmcfg_config, &mcfg->config, config_size); + for (i = 0; i < pci_mmcfg_config_num; ++i) { + if (mcfg->config[i].base_reserved) { + printk(KERN_ERR PREFIX + "MMCONFIG not in low 4GB of memory\n"); + return -ENODEV; + } + } return 0; } -#else -#define acpi_parse_mcfg NULL -#endif /* !CONFIG_PCI_MMCONFIG */ +#endif /* CONFIG_PCI_MMCONFIG */ #ifdef CONFIG_X86_LOCAL_APIC static int __init @@ -506,6 +533,22 @@ acpi_unmap_lsapic(int cpu) EXPORT_SYMBOL(acpi_unmap_lsapic); #endif /* CONFIG_ACPI_HOTPLUG_CPU */ +int +acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base) +{ + /* TBD */ + return -EINVAL; +} +EXPORT_SYMBOL(acpi_register_ioapic); + +int +acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base) +{ + /* TBD */ + return -EINVAL; +} +EXPORT_SYMBOL(acpi_unregister_ioapic); + static unsigned long __init acpi_scan_rsdp ( unsigned long start, @@ 
-815,6 +858,219 @@ acpi_process_madt(void) return; } +extern int acpi_force; + +#ifdef __i386__ + +#ifdef CONFIG_ACPI_PCI +static int __init disable_acpi_irq(struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n", + d->ident); + acpi_noirq_set(); + } + return 0; +} + +static int __init disable_acpi_pci(struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n", + d->ident); + acpi_disable_pci(); + } + return 0; +} +#endif + +static int __init dmi_disable_acpi(struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: acpi off\n",d->ident); + disable_acpi(); + } else { + printk(KERN_NOTICE + "Warning: DMI blacklist says broken, but acpi forced\n"); + } + return 0; +} + +/* + * Limit ACPI to CPU enumeration for HT + */ +static int __init force_acpi_ht(struct dmi_system_id *d) +{ + if (!acpi_force) { + printk(KERN_NOTICE "%s detected: force use of acpi=ht\n", d->ident); + disable_acpi(); + acpi_ht = 1; + } else { + printk(KERN_NOTICE + "Warning: acpi=force overrules DMI blacklist: acpi=ht\n"); + } + return 0; +} + +/* + * If your system is blacklisted here, but you find that acpi=force + * works for you, please contact acpi-devel@sourceforge.net + */ +static struct dmi_system_id __initdata acpi_dmi_table[] = { + /* + * Boxes that need ACPI disabled + */ + { + .callback = dmi_disable_acpi, + .ident = "IBM Thinkpad", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "2629H1G"), + }, + }, + + /* + * Boxes that need acpi=ht + */ + { + .callback = force_acpi_ht, + .ident = "FSC Primergy T850", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), + DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "DELL GX240", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_BOARD_NAME, "OptiPlex GX240"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "HP VISUALIZE NT Workstation", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "Compaq Workstation W8000", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), + DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "ASUS P4B266", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P4B266"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "ASUS P2B-DS", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "ASUS CUR-DLS", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "ABIT i440BX-W83977", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"), + DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "IBM Bladecenter", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "IBM eServer xSeries 360", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "IBM 
eserver xSeries 330", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"), + }, + }, + { + .callback = force_acpi_ht, + .ident = "IBM eserver xSeries 440", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"), + }, + }, + +#ifdef CONFIG_ACPI_PCI + /* + * Boxes that need ACPI PCI IRQ routing disabled + */ + { + .callback = disable_acpi_irq, + .ident = "ASUS A7V", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"), + DMI_MATCH(DMI_BOARD_NAME, "<A7V>"), + /* newer BIOS, Revision 1011, does work */ + DMI_MATCH(DMI_BIOS_VERSION, "ASUS A7V ACPI BIOS Revision 1007"), + }, + }, + + /* + * Boxes that need ACPI PCI IRQ routing and PCI scan disabled + */ + { /* _BBN 0 bug */ + .callback = disable_acpi_pci, + .ident = "ASUS PR-DLS", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "PR-DLS"), + DMI_MATCH(DMI_BIOS_VERSION, "ASUS PR-DLS ACPI BIOS Revision 1010"), + DMI_MATCH(DMI_BIOS_DATE, "03/21/2003") + }, + }, + { + .callback = disable_acpi_pci, + .ident = "Acer TravelMate 36x Laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), + }, + }, +#endif + { } +}; + +#endif /* __i386__ */ + /* * acpi_boot_table_init() and acpi_boot_init() * called from setup_arch(), always. @@ -843,6 +1099,10 @@ acpi_boot_table_init(void) { int error; +#ifdef __i386__ + dmi_check_system(acpi_dmi_table); +#endif + /* * If acpi_disabled, bail out * One exception: acpi=ht continues far enough to enumerate LAPICs @@ -870,8 +1130,6 @@ acpi_boot_table_init(void) */ error = acpi_blacklisted(); if (error) { - extern int acpi_force; - if (acpi_force) { printk(KERN_WARNING PREFIX "acpi=force override\n"); } else { @@ -907,7 +1165,6 @@ int __init acpi_boot_init(void) acpi_process_madt(); acpi_table_parse(ACPI_HPET, acpi_parse_hpet); - acpi_table_parse(ACPI_MCFG, acpi_parse_mcfg); return 0; } diff --git a/arch/i386/kernel/acpi/cstate.c b/arch/i386/kernel/acpi/cstate.c new file mode 100644 index 000000000000..4c3036ba65df --- /dev/null +++ b/arch/i386/kernel/acpi/cstate.c @@ -0,0 +1,103 @@ +/* + * arch/i386/kernel/acpi/cstate.c + * + * Copyright (C) 2005 Intel Corporation + * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> + * - Added _PDC for SMP C-states on Intel CPUs + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/acpi.h> + +#include <acpi/processor.h> +#include <asm/acpi.h> + +static void acpi_processor_power_init_intel_pdc(struct acpi_processor_power + *pow) +{ + struct acpi_object_list *obj_list; + union acpi_object *obj; + u32 *buf; + + /* allocate and initialize pdc. It will be used later. 
*/ + obj_list = kmalloc(sizeof(struct acpi_object_list), GFP_KERNEL); + if (!obj_list) { + printk(KERN_ERR "Memory allocation error\n"); + return; + } + + obj = kmalloc(sizeof(union acpi_object), GFP_KERNEL); + if (!obj) { + printk(KERN_ERR "Memory allocation error\n"); + kfree(obj_list); + return; + } + + buf = kmalloc(12, GFP_KERNEL); + if (!buf) { + printk(KERN_ERR "Memory allocation error\n"); + kfree(obj); + kfree(obj_list); + return; + } + + buf[0] = ACPI_PDC_REVISION_ID; + buf[1] = 1; + buf[2] = ACPI_PDC_C_CAPABILITY_SMP; + + obj->type = ACPI_TYPE_BUFFER; + obj->buffer.length = 12; + obj->buffer.pointer = (u8 *) buf; + obj_list->count = 1; + obj_list->pointer = obj; + pow->pdc = obj_list; + + return; +} + +/* Initialize _PDC data based on the CPU vendor */ +void acpi_processor_power_init_pdc(struct acpi_processor_power *pow, + unsigned int cpu) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + + pow->pdc = NULL; + if (c->x86_vendor == X86_VENDOR_INTEL) + acpi_processor_power_init_intel_pdc(pow); + + return; +} + +EXPORT_SYMBOL(acpi_processor_power_init_pdc); + +/* + * Initialize bm_flags based on the CPU cache properties + * On SMP it depends on cache configuration + * - When cache is not shared among all CPUs, we flush cache + * before entering C3. + * - When cache is shared among all CPUs, we use bm_check + * mechanism as in UP case + * + * This routine is called only after all the CPUs are online + */ +void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, + unsigned int cpu) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + + flags->bm_check = 0; + if (num_online_cpus() == 1) + flags->bm_check = 1; + else if (c->x86_vendor == X86_VENDOR_INTEL) { + /* + * Today all CPUs that support C3 share cache. + * TBD: This needs to look at cache shared map, once + * multi-core detection patch makes to the base. + */ + flags->bm_check = 1; + } +} + +EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/i386/kernel/acpi/sleep.c b/arch/i386/kernel/acpi/sleep.c index 28bb0514bb6e..c1af93032ff3 100644 --- a/arch/i386/kernel/acpi/sleep.c +++ b/arch/i386/kernel/acpi/sleep.c @@ -7,6 +7,7 @@ #include <linux/acpi.h> #include <linux/bootmem.h> +#include <linux/dmi.h> #include <asm/smp.h> #include <asm/tlbflush.h> @@ -91,3 +92,29 @@ static int __init acpi_sleep_setup(char *str) __setup("acpi_sleep=", acpi_sleep_setup); + + +static __init int reset_videomode_after_s3(struct dmi_system_id *d) +{ + acpi_video_flags |= 2; + return 0; +} + +static __initdata struct dmi_system_id acpisleep_dmi_table[] = { + { /* Reset video mode after returning from ACPI S3 sleep */ + .callback = reset_videomode_after_s3, + .ident = "Toshiba Satellite 4030cdt", + .matches = { + DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"), + }, + }, + { } +}; + +static int __init acpisleep_dmi_init(void) +{ + dmi_check_system(acpisleep_dmi_table); + return 0; +} + +core_initcall(acpisleep_dmi_init); diff --git a/arch/i386/kernel/acpi/wakeup.S b/arch/i386/kernel/acpi/wakeup.S index 39d32484f6f5..44d886c745ec 100644 --- a/arch/i386/kernel/acpi/wakeup.S +++ b/arch/i386/kernel/acpi/wakeup.S @@ -74,8 +74,9 @@ wakeup_code: movw %ax,%fs movw $0x0e00 + 'i', %fs:(0x12) - # need a gdt - lgdt real_save_gdt - wakeup_code + # need a gdt -- use lgdtl to force 32-bit operands, in case + # the GDT is located past 16 megabytes. 
+ lgdtl real_save_gdt - wakeup_code movl real_save_cr0 - wakeup_code, %eax movl %eax, %cr0 diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index d509836b70c3..bd1dbf3bd223 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -26,6 +26,7 @@ #include <linux/mc146818rtc.h> #include <linux/kernel_stat.h> #include <linux/sysdev.h> +#include <linux/cpu.h> #include <asm/atomic.h> #include <asm/smp.h> @@ -34,12 +35,18 @@ #include <asm/desc.h> #include <asm/arch_hooks.h> #include <asm/hpet.h> +#include <asm/i8253.h> #include <mach_apic.h> #include "io_ports.h" /* + * Knob to control our willingness to enable the local APIC. + */ +int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ + +/* * Debug level */ int apic_verbosity; @@ -205,7 +212,7 @@ void __init connect_bsp_APIC(void) enable_apic_mode(); } -void disconnect_bsp_APIC(void) +void disconnect_bsp_APIC(int virt_wire_setup) { if (pic_mode) { /* @@ -219,6 +226,42 @@ void disconnect_bsp_APIC(void) outb(0x70, 0x22); outb(0x00, 0x23); } + else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* For LVT0 make it edge triggered, active high, external and enabled */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write_around(APIC_LVT0, value); + } + else { + /* Disable LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + } + + /* For LVT1 make it edge triggered, active high, nmi and enabled */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } } void disable_local_APIC(void) @@ -363,7 +406,7 @@ void __init init_bsp_APIC(void) apic_write_around(APIC_LVT1, value); } -void __init setup_local_APIC (void) +void __devinit setup_local_APIC(void) { unsigned long oldvalue, value, ver, maxlvt; @@ -634,7 +677,7 @@ static struct sys_device device_lapic = { .cls = &lapic_sysclass, }; -static void __init apic_pm_activate(void) +static void __devinit apic_pm_activate(void) { apic_pm_state.active = 1; } @@ -665,26 +708,6 @@ static void apic_pm_activate(void) { } * Original code written by Keir Fraser. */ -/* - * Knob to control our willingness to enable the local APIC. - */ -int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ - -static int __init lapic_disable(char *str) -{ - enable_local_apic = -1; - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - return 0; -} -__setup("nolapic", lapic_disable); - -static int __init lapic_enable(char *str) -{ - enable_local_apic = 1; - return 0; -} -__setup("lapic", lapic_enable); - static int __init apic_set_verbosity(char *str) { if (strcmp("debug", str) == 0) @@ -855,9 +878,8 @@ fake_ioapic_page: * but we do not accept timer interrupts yet. We only allow the BP * to calibrate. 
*/ -static unsigned int __init get_8254_timer_count(void) +static unsigned int __devinit get_8254_timer_count(void) { - extern spinlock_t i8253_lock; unsigned long flags; unsigned int count; @@ -874,7 +896,7 @@ static unsigned int __init get_8254_timer_count(void) } /* next tick in 8254 can be caught by catching timer wraparound */ -static void __init wait_8254_wraparound(void) +static void __devinit wait_8254_wraparound(void) { unsigned int curr_count, prev_count; @@ -894,7 +916,7 @@ static void __init wait_8254_wraparound(void) * Default initialization for 8254 timers. If we use other timers like HPET, * we override this later */ -void (*wait_timer_tick)(void) __initdata = wait_8254_wraparound; +void (*wait_timer_tick)(void) __devinitdata = wait_8254_wraparound; /* * This function sets up the local APIC timer, with a timeout of @@ -930,7 +952,7 @@ static void __setup_APIC_LVTT(unsigned int clocks) apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); } -static void __init setup_APIC_timer(unsigned int clocks) +static void __devinit setup_APIC_timer(unsigned int clocks) { unsigned long flags; @@ -1043,12 +1065,12 @@ void __init setup_boot_APIC_clock(void) local_irq_enable(); } -void __init setup_secondary_APIC_clock(void) +void __devinit setup_secondary_APIC_clock(void) { setup_APIC_timer(calibration_result); } -void __init disable_APIC_timer(void) +void __devinit disable_APIC_timer(void) { if (using_apic_timer) { unsigned long v; @@ -1133,7 +1155,7 @@ inline void smp_local_timer_interrupt(struct pt_regs * regs) } #ifdef CONFIG_SMP - update_process_times(user_mode(regs)); + update_process_times(user_mode_vm(regs)); #endif } diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c index 0ff65abcd56c..d7811c4e8b50 100644 --- a/arch/i386/kernel/apm.c +++ b/arch/i386/kernel/apm.c @@ -228,10 +228,10 @@ #include <asm/system.h> #include <asm/uaccess.h> #include <asm/desc.h> +#include <asm/i8253.h> #include "io_ports.h" -extern spinlock_t i8253_lock; extern unsigned long get_cmos_time(void); extern void machine_real_restart(unsigned char *, int); @@ -346,10 +346,10 @@ extern int (*console_blank_hook)(int); struct apm_user { int magic; struct apm_user * next; - int suser: 1; - int writer: 1; - int reader: 1; - int suspend_wait: 1; + unsigned int suser: 1; + unsigned int writer: 1; + unsigned int reader: 1; + unsigned int suspend_wait: 1; int suspend_result; int suspends_pending; int standbys_pending; @@ -911,14 +911,7 @@ static void apm_power_off(void) 0xcd, 0x15 /* int $0x15 */ }; - /* - * This may be called on an SMP machine. 
- */ -#ifdef CONFIG_SMP /* Some bioses don't like being called from CPU != 0 */ - set_cpus_allowed(current, cpumask_of_cpu(0)); - BUG_ON(smp_processor_id() != 0); -#endif if (apm_info.realmode_power_off) { (void)apm_save_cpus(); @@ -1168,8 +1161,7 @@ static void get_time_diff(void) static void reinit_timer(void) { #ifdef INIT_TIMER_AFTER_SUSPEND - unsigned long flags; - extern spinlock_t i8253_lock; + unsigned long flags; spin_lock_irqsave(&i8253_lock, flags); /* set the clock to 100 Hz */ diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c index d199e525680a..4553ffd94b1f 100644 --- a/arch/i386/kernel/cpu/common.c +++ b/arch/i386/kernel/cpu/common.c @@ -24,9 +24,9 @@ EXPORT_PER_CPU_SYMBOL(cpu_gdt_table); DEFINE_PER_CPU(unsigned char, cpu_16bit_stack[CPU_16BIT_STACK_SIZE]); EXPORT_PER_CPU_SYMBOL(cpu_16bit_stack); -static int cachesize_override __initdata = -1; -static int disable_x86_fxsr __initdata = 0; -static int disable_x86_serial_nr __initdata = 1; +static int cachesize_override __devinitdata = -1; +static int disable_x86_fxsr __devinitdata = 0; +static int disable_x86_serial_nr __devinitdata = 1; struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {}; @@ -59,7 +59,7 @@ static int __init cachesize_setup(char *str) } __setup("cachesize=", cachesize_setup); -int __init get_model_name(struct cpuinfo_x86 *c) +int __devinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; char *p, *q; @@ -89,7 +89,7 @@ int __init get_model_name(struct cpuinfo_x86 *c) } -void __init display_cacheinfo(struct cpuinfo_x86 *c) +void __devinit display_cacheinfo(struct cpuinfo_x86 *c) { unsigned int n, dummy, ecx, edx, l2size; @@ -130,7 +130,7 @@ void __init display_cacheinfo(struct cpuinfo_x86 *c) /* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */ /* Look up CPU names by table lookup. */ -static char __init *table_lookup_model(struct cpuinfo_x86 *c) +static char __devinit *table_lookup_model(struct cpuinfo_x86 *c) { struct cpu_model_info *info; @@ -151,7 +151,7 @@ static char __init *table_lookup_model(struct cpuinfo_x86 *c) } -void __init get_cpu_vendor(struct cpuinfo_x86 *c, int early) +void __devinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) { char *v = c->x86_vendor_id; int i; @@ -202,7 +202,7 @@ static inline int flag_is_changeable_p(u32 flag) /* Probe for the CPUID instruction */ -static int __init have_cpuid_p(void) +static int __devinit have_cpuid_p(void) { return flag_is_changeable_p(X86_EFLAGS_ID); } @@ -249,7 +249,7 @@ static void __init early_cpu_detect(void) #endif } -void __init generic_identify(struct cpuinfo_x86 * c) +void __devinit generic_identify(struct cpuinfo_x86 * c) { u32 tfms, xlvl; int junk; @@ -296,7 +296,7 @@ void __init generic_identify(struct cpuinfo_x86 * c) } } -static void __init squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +static void __devinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) { /* Disable processor serial number */ @@ -324,7 +324,7 @@ __setup("serialnumber", x86_serial_nr_setup); /* * This does the hard work of actually picking apart the CPU stuff... 
*/ -void __init identify_cpu(struct cpuinfo_x86 *c) +void __devinit identify_cpu(struct cpuinfo_x86 *c) { int i; @@ -432,10 +432,18 @@ void __init identify_cpu(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_MCE mcheck_init(c); #endif + if (c == &boot_cpu_data) + sysenter_setup(); + enable_sep_cpu(); + + if (c == &boot_cpu_data) + mtrr_bp_init(); + else + mtrr_ap_init(); } #ifdef CONFIG_X86_HT -void __init detect_ht(struct cpuinfo_x86 *c) +void __devinit detect_ht(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; int index_msb, tmp; @@ -490,7 +498,7 @@ void __init detect_ht(struct cpuinfo_x86 *c) } #endif -void __init print_cpu_info(struct cpuinfo_x86 *c) +void __devinit print_cpu_info(struct cpuinfo_x86 *c) { char *vendor = NULL; @@ -513,7 +521,7 @@ void __init print_cpu_info(struct cpuinfo_x86 *c) printk("\n"); } -cpumask_t cpu_initialized __initdata = CPU_MASK_NONE; +cpumask_t cpu_initialized __devinitdata = CPU_MASK_NONE; /* This is hacky. :) * We're emulating future behavior. @@ -560,7 +568,7 @@ void __init early_cpu_init(void) * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. */ -void __init cpu_init (void) +void __devinit cpu_init(void) { int cpu = smp_processor_id(); struct tss_struct * t = &per_cpu(init_tss, cpu); @@ -635,7 +643,7 @@ void __init cpu_init (void) /* Clear all 6 debug registers: */ -#define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) ); +#define CD(register) set_debugreg(0, register) CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7); @@ -648,3 +656,15 @@ void __init cpu_init (void) clear_used_math(); mxcsr_feature_mask_init(); } + +#ifdef CONFIG_HOTPLUG_CPU +void __devinit cpu_uninit(void) +{ + int cpu = raw_smp_processor_id(); + cpu_clear(cpu, cpu_initialized); + + /* lazy TLB state */ + per_cpu(cpu_tlbstate, cpu).state = 0; + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; +} +#endif diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c index 963e17aa205d..60a9e54dd20e 100644 --- a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -442,6 +442,13 @@ acpi_cpufreq_cpu_init ( (u32) data->acpi_data.states[i].transition_latency); cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu); + + /* + * the first call to ->target() should result in us actually + * writing something to the appropriate registers. 
+ */ + data->resume = 1; + return (result); err_freqfree: diff --git a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c index 1a49adb1f4a6..e86ea486c311 100644 --- a/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/i386/kernel/cpu/cpufreq/gx-suspmod.c @@ -190,7 +190,7 @@ static __init struct pci_dev *gx_detect_chipset(void) /* detect which companion chip is used */ while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { - if ((pci_match_device (gx_chipset_tbl, gx_pci)) != NULL) { + if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) { return gx_pci; } } diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k7.c b/arch/i386/kernel/cpu/cpufreq/powernow-k7.c index 5c530064eb74..73a5dc5b26b8 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k7.c @@ -648,9 +648,7 @@ static int powernow_cpu_exit (struct cpufreq_policy *policy) { } #endif - if (powernow_table) - kfree(powernow_table); - + kfree(powernow_table); return 0; } diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c index 10cc096c0ade..ab6e0611303d 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c @@ -1,5 +1,5 @@ /* - * (c) 2003, 2004 Advanced Micro Devices, Inc. + * (c) 2003, 2004, 2005 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html @@ -44,7 +44,7 @@ #define PFX "powernow-k8: " #define BFX PFX "BIOS error: " -#define VERSION "version 1.40.2" +#define VERSION "version 1.50.3" #include "powernow-k8.h" /* serialize freq changes */ @@ -110,14 +110,13 @@ static int query_current_values_with_pending_wait(struct powernow_k8_data *data) u32 lo, hi; u32 i = 0; - lo = MSR_S_LO_CHANGE_PENDING; - while (lo & MSR_S_LO_CHANGE_PENDING) { + do { if (i++ > 0x1000000) { printk(KERN_ERR PFX "detected change pending stuck\n"); return 1; } rdmsr(MSR_FIDVID_STATUS, lo, hi); - } + } while (lo & MSR_S_LO_CHANGE_PENDING); data->currvid = hi & MSR_S_HI_CURRENT_VID; data->currfid = lo & MSR_S_LO_CURRENT_FID; @@ -232,7 +231,7 @@ static int write_new_vid(struct powernow_k8_data *data, u32 vid) /* * Reduce the vid by the max of step or reqvid. * Decreasing vid codes represent increasing voltages: - * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of 0x1f is off. + * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off. 
*/ static int decrease_vid_code_by_step(struct powernow_k8_data *data, u32 reqvid, u32 step) { @@ -467,7 +466,7 @@ static int check_supported_cpu(unsigned int cpu) eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE); if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) || ((eax & CPUID_XFAM) != CPUID_XFAM_K8) || - ((eax & CPUID_XMOD) > CPUID_XMOD_REV_E)) { + ((eax & CPUID_XMOD) > CPUID_XMOD_REV_F)) { printk(KERN_INFO PFX "Processor cpuid %x not supported\n", eax); goto out; } @@ -696,6 +695,7 @@ static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK; data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK; + data->exttype = (data->acpi_data.states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK); data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK; @@ -735,8 +735,16 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) } for (i = 0; i < data->acpi_data.state_count; i++) { - u32 fid = data->acpi_data.states[i].control & FID_MASK; - u32 vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK; + u32 fid; + u32 vid; + + if (data->exttype) { + fid = data->acpi_data.states[i].status & FID_MASK; + vid = (data->acpi_data.states[i].status >> VID_SHIFT) & VID_MASK; + } else { + fid = data->acpi_data.states[i].control & FID_MASK; + vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK; + } dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); @@ -753,7 +761,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) } /* verify voltage is OK - BIOSs are using "off" to indicate invalid */ - if (vid == 0x1f) { + if (vid == VID_OFF) { dprintk("invalid vid %u, ignoring\n", vid); powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; continue; @@ -930,15 +938,6 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi down(&fidvid_sem); - for_each_cpu_mask(i, cpu_core_map[pol->cpu]) { - /* make sure the sibling is initialized */ - if (!powernow_data[i]) { - ret = 0; - up(&fidvid_sem); - goto err_out; - } - } - powernow_k8_acpi_pst_values(data, newstate); if (transition_frequency(data, newstate)) { @@ -978,7 +977,7 @@ static int __init powernowk8_cpu_init(struct cpufreq_policy *pol) { struct powernow_k8_data *data; cpumask_t oldmask = CPU_MASK_ALL; - int rc; + int rc, i; if (!check_supported_cpu(pol->cpu)) return -ENODEV; @@ -1064,7 +1063,9 @@ static int __init powernowk8_cpu_init(struct cpufreq_policy *pol) printk("cpu_init done, current fid 0x%x, vid 0x%x\n", data->currfid, data->currvid); - powernow_data[pol->cpu] = data; + for_each_cpu_mask(i, cpu_core_map[pol->cpu]) { + powernow_data[i] = data; + } return 0; diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.h b/arch/i386/kernel/cpu/cpufreq/powernow-k8.h index 9ed5bf221cb7..b1e85bb36396 100644 --- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.h @@ -1,5 +1,5 @@ /* - * (c) 2003, 2004 Advanced Micro Devices, Inc. + * (c) 2003, 2004, 2005 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. 
See "COPYING" or * http://www.gnu.org/licenses/gpl.html @@ -19,6 +19,7 @@ struct powernow_k8_data { u32 vidmvs; /* usable value calculated from mvs */ u32 vstable; /* voltage stabilization time, units 20 us */ u32 plllock; /* pll lock time, units 1 us */ + u32 exttype; /* extended interface = 1 */ /* keep track of the current fid / vid */ u32 currvid, currfid; @@ -41,7 +42,7 @@ struct powernow_k8_data { #define CPUID_XFAM 0x0ff00000 /* extended family */ #define CPUID_XFAM_K8 0 #define CPUID_XMOD 0x000f0000 /* extended model */ -#define CPUID_XMOD_REV_E 0x00020000 +#define CPUID_XMOD_REV_F 0x00040000 #define CPUID_USE_XFAM_XMOD 0x00000f00 #define CPUID_GET_MAX_CAPABILITIES 0x80000000 #define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 @@ -57,25 +58,26 @@ struct powernow_k8_data { /* Field definitions within the FID VID Low Control MSR : */ #define MSR_C_LO_INIT_FID_VID 0x00010000 -#define MSR_C_LO_NEW_VID 0x00001f00 -#define MSR_C_LO_NEW_FID 0x0000002f +#define MSR_C_LO_NEW_VID 0x00003f00 +#define MSR_C_LO_NEW_FID 0x0000003f #define MSR_C_LO_VID_SHIFT 8 /* Field definitions within the FID VID High Control MSR : */ -#define MSR_C_HI_STP_GNT_TO 0x000fffff +#define MSR_C_HI_STP_GNT_TO 0x000fffff /* Field definitions within the FID VID Low Status MSR : */ -#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */ -#define MSR_S_LO_MAX_RAMP_VID 0x1f000000 +#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */ +#define MSR_S_LO_MAX_RAMP_VID 0x3f000000 #define MSR_S_LO_MAX_FID 0x003f0000 #define MSR_S_LO_START_FID 0x00003f00 #define MSR_S_LO_CURRENT_FID 0x0000003f /* Field definitions within the FID VID High Status MSR : */ -#define MSR_S_HI_MAX_WORKING_VID 0x001f0000 -#define MSR_S_HI_START_VID 0x00001f00 -#define MSR_S_HI_CURRENT_VID 0x0000001f -#define MSR_C_HI_STP_GNT_BENIGN 0x00000001 +#define MSR_S_HI_MIN_WORKING_VID 0x3f000000 +#define MSR_S_HI_MAX_WORKING_VID 0x003f0000 +#define MSR_S_HI_START_VID 0x00003f00 +#define MSR_S_HI_CURRENT_VID 0x0000003f +#define MSR_C_HI_STP_GNT_BENIGN 0x00000001 /* * There are restrictions frequencies have to follow: @@ -99,13 +101,15 @@ struct powernow_k8_data { #define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */ #define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */ -#define LEAST_VID 0x1e /* Lowest (numerically highest) useful vid value */ +#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */ #define MIN_FREQ 800 /* Min and max freqs, per spec */ #define MAX_FREQ 5000 #define INVALID_FID_MASK 0xffffffc1 /* not a valid fid if these bits are set */ -#define INVALID_VID_MASK 0xffffffe0 /* not a valid vid if these bits are set */ +#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */ + +#define VID_OFF 0x3f #define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */ @@ -121,12 +125,14 @@ struct powernow_k8_data { #define IRT_SHIFT 30 #define RVO_SHIFT 28 +#define EXT_TYPE_SHIFT 27 #define PLL_L_SHIFT 20 #define MVS_SHIFT 18 #define VST_SHIFT 11 #define VID_SHIFT 6 #define IRT_MASK 3 #define RVO_MASK 3 +#define EXT_TYPE_MASK 1 #define PLL_L_MASK 0x7f #define MVS_MASK 3 #define VST_MASK 0x7f diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c index 7dcbf70fc16f..327a55d4d1c6 100644 --- a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c @@ -375,7 +375,7 @@ static int centrino_cpu_init_acpi(struct 
cpufreq_policy *policy) arg0.buffer.pointer = (u8 *) arg0_buf; arg0_buf[0] = ACPI_PDC_REVISION_ID; arg0_buf[1] = 1; - arg0_buf[2] = ACPI_PDC_EST_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_MSR; + arg0_buf[2] = ACPI_PDC_EST_CAPABILITY_SMP_MSR; p.pdc = &arg_list; diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 121aa2176e69..a2c33c1a46c5 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -25,10 +25,10 @@ extern int trap_init_f00f_bug(void); /* * Alignment at which movsl is preferred for bulk memory copies. */ -struct movsl_mask movsl_mask; +struct movsl_mask movsl_mask __read_mostly; #endif -void __init early_intel_workaround(struct cpuinfo_x86 *c) +void __devinit early_intel_workaround(struct cpuinfo_x86 *c) { if (c->x86_vendor != X86_VENDOR_INTEL) return; @@ -43,7 +43,7 @@ void __init early_intel_workaround(struct cpuinfo_x86 *c) * This is called before we do cpu ident work */ -int __init ppro_with_ram_bug(void) +int __devinit ppro_with_ram_bug(void) { /* Uses data from early_cpu_detect now */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && @@ -61,7 +61,7 @@ int __init ppro_with_ram_bug(void) * P4 Xeon errata 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. */ -static void __init Intel_errata_workarounds(struct cpuinfo_x86 *c) +static void __devinit Intel_errata_workarounds(struct cpuinfo_x86 *c) { unsigned long lo, hi; @@ -80,7 +80,7 @@ static void __init Intel_errata_workarounds(struct cpuinfo_x86 *c) /* * find out the number of processor cores on the die */ -static int __init num_cpu_cores(struct cpuinfo_x86 *c) +static int __devinit num_cpu_cores(struct cpuinfo_x86 *c) { unsigned int eax; @@ -98,7 +98,7 @@ static int __init num_cpu_cores(struct cpuinfo_x86 *c) return 1; } -static void __init init_intel(struct cpuinfo_x86 *c) +static void __devinit init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; char *p = NULL; @@ -204,7 +204,7 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 * c, unsigned int size) return size; } -static struct cpu_dev intel_cpu_dev __initdata = { +static struct cpu_dev intel_cpu_dev __devinitdata = { .c_vendor = "Intel", .c_ident = { "GenuineIntel" }, .c_models = { diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c index a710dc4eb20e..6c55b50cf048 100644 --- a/arch/i386/kernel/cpu/intel_cacheinfo.c +++ b/arch/i386/kernel/cpu/intel_cacheinfo.c @@ -28,7 +28,7 @@ struct _cache_table }; /* all the cache descriptor types we care about (no TLB or trace cache entries) */ -static struct _cache_table cache_table[] __initdata = +static struct _cache_table cache_table[] __devinitdata = { { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ @@ -128,7 +128,7 @@ static int __devinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_le cpuid_count(4, index, &eax, &ebx, &ecx, &edx); cache_eax.full = eax; if (cache_eax.split.type == CACHE_TYPE_NULL) - return -1; + return -EIO; /* better error ? 
*/ this_leaf->eax.full = eax; this_leaf->ebx.full = ebx; @@ -160,7 +160,7 @@ static int __init find_num_cache_leaves(void) return retval; } -unsigned int __init init_intel_cacheinfo(struct cpuinfo_x86 *c) +unsigned int __devinit init_intel_cacheinfo(struct cpuinfo_x86 *c) { unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */ unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ @@ -334,6 +334,7 @@ static int __devinit detect_cache_attributes(unsigned int cpu) struct _cpuid4_info *this_leaf; unsigned long j; int retval; + cpumask_t oldmask; if (num_cache_leaves == 0) return -ENOENT; @@ -345,19 +346,26 @@ static int __devinit detect_cache_attributes(unsigned int cpu) memset(cpuid4_info[cpu], 0, sizeof(struct _cpuid4_info) * num_cache_leaves); + oldmask = current->cpus_allowed; + retval = set_cpus_allowed(current, cpumask_of_cpu(cpu)); + if (retval) + goto out; + /* Do cpuid and store the results */ + retval = 0; for (j = 0; j < num_cache_leaves; j++) { this_leaf = CPUID4_INFO_IDX(cpu, j); retval = cpuid4_cache_lookup(j, this_leaf); if (unlikely(retval < 0)) - goto err_out; + break; cache_shared_cpu_map_setup(cpu, j); } - return 0; + set_cpus_allowed(current, oldmask); -err_out: - free_cache_attributes(cpu); - return -ENOMEM; +out: + if (retval) + free_cache_attributes(cpu); + return retval; } #ifdef CONFIG_SYSFS diff --git a/arch/i386/kernel/cpu/mcheck/k7.c b/arch/i386/kernel/cpu/mcheck/k7.c index 8df52e86c4d2..c4abe7657397 100644 --- a/arch/i386/kernel/cpu/mcheck/k7.c +++ b/arch/i386/kernel/cpu/mcheck/k7.c @@ -69,7 +69,7 @@ static fastcall void k7_machine_check(struct pt_regs * regs, long error_code) /* AMD K7 machine check is Intel like */ -void __init amd_mcheck_init(struct cpuinfo_x86 *c) +void __devinit amd_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; int i; diff --git a/arch/i386/kernel/cpu/mcheck/mce.c b/arch/i386/kernel/cpu/mcheck/mce.c index bf6d1aefafc0..2cf25d2ba0f1 100644 --- a/arch/i386/kernel/cpu/mcheck/mce.c +++ b/arch/i386/kernel/cpu/mcheck/mce.c @@ -16,7 +16,7 @@ #include "mce.h" -int mce_disabled __initdata = 0; +int mce_disabled __devinitdata = 0; int nr_mce_banks; EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */ @@ -31,7 +31,7 @@ static fastcall void unexpected_machine_check(struct pt_regs * regs, long error_ void fastcall (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; /* This has to be run for each processor */ -void __init mcheck_init(struct cpuinfo_x86 *c) +void __devinit mcheck_init(struct cpuinfo_x86 *c) { if (mce_disabled==1) return; diff --git a/arch/i386/kernel/cpu/mcheck/p4.c b/arch/i386/kernel/cpu/mcheck/p4.c index 8b16ceb929b4..0abccb6fdf9e 100644 --- a/arch/i386/kernel/cpu/mcheck/p4.c +++ b/arch/i386/kernel/cpu/mcheck/p4.c @@ -78,7 +78,7 @@ fastcall void smp_thermal_interrupt(struct pt_regs *regs) } /* P4/Xeon Thermal regulation detect and init */ -static void __init intel_init_thermal(struct cpuinfo_x86 *c) +static void __devinit intel_init_thermal(struct cpuinfo_x86 *c) { u32 l, h; unsigned int cpu = smp_processor_id(); @@ -232,7 +232,7 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) } -void __init intel_p4_mcheck_init(struct cpuinfo_x86 *c) +void __devinit intel_p4_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; int i; diff --git a/arch/i386/kernel/cpu/mcheck/p5.c b/arch/i386/kernel/cpu/mcheck/p5.c index c45a1b485c80..ec0614cd2925 100644 --- a/arch/i386/kernel/cpu/mcheck/p5.c +++ b/arch/i386/kernel/cpu/mcheck/p5.c @@ -29,7 +29,7 @@ static 
fastcall void pentium_machine_check(struct pt_regs * regs, long error_cod } /* Set up machine check reporting for processors with Intel style MCE */ -void __init intel_p5_mcheck_init(struct cpuinfo_x86 *c) +void __devinit intel_p5_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; diff --git a/arch/i386/kernel/cpu/mcheck/p6.c b/arch/i386/kernel/cpu/mcheck/p6.c index 46640f8c2494..f01b73f947e1 100644 --- a/arch/i386/kernel/cpu/mcheck/p6.c +++ b/arch/i386/kernel/cpu/mcheck/p6.c @@ -80,7 +80,7 @@ static fastcall void intel_machine_check(struct pt_regs * regs, long error_code) } /* Set up machine check reporting for processors with Intel style MCE */ -void __init intel_p6_mcheck_init(struct cpuinfo_x86 *c) +void __devinit intel_p6_mcheck_init(struct cpuinfo_x86 *c) { u32 l, h; int i; diff --git a/arch/i386/kernel/cpu/mcheck/winchip.c b/arch/i386/kernel/cpu/mcheck/winchip.c index 753fa7acb984..7bae68fa168f 100644 --- a/arch/i386/kernel/cpu/mcheck/winchip.c +++ b/arch/i386/kernel/cpu/mcheck/winchip.c @@ -23,7 +23,7 @@ static fastcall void winchip_machine_check(struct pt_regs * regs, long error_cod } /* Set up machine check reporting on the Winchip C6 series */ -void __init winchip_mcheck_init(struct cpuinfo_x86 *c) +void __devinit winchip_mcheck_init(struct cpuinfo_x86 *c) { u32 lo, hi; machine_check_vector = winchip_machine_check; diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c index f468a979e9aa..169ac8e0db68 100644 --- a/arch/i386/kernel/cpu/mtrr/generic.c +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -67,14 +67,6 @@ void __init get_mtrr_state(void) mtrr_state.enabled = (lo & 0xc00) >> 10; } -/* Free resources associated with a struct mtrr_state */ -void __init finalize_mtrr_state(void) -{ - if (mtrr_state.var_ranges) - kfree(mtrr_state.var_ranges); - mtrr_state.var_ranges = NULL; -} - /* Some BIOS's are fucked and don't set all MTRRs the same! */ void __init mtrr_state_warn(void) { @@ -335,6 +327,9 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, */ { unsigned long flags; + struct mtrr_var_range *vr; + + vr = &mtrr_state.var_ranges[reg]; local_irq_save(flags); prepare_set(); @@ -343,11 +338,15 @@ static void generic_set_mtrr(unsigned int reg, unsigned long base, /* The invalid bit is kept in the mask, so we simply clear the relevant mask register to disable a range. 
*/ mtrr_wrmsr(MTRRphysMask_MSR(reg), 0, 0); + memset(vr, 0, sizeof(struct mtrr_var_range)); } else { - mtrr_wrmsr(MTRRphysBase_MSR(reg), base << PAGE_SHIFT | type, - (base & size_and_mask) >> (32 - PAGE_SHIFT)); - mtrr_wrmsr(MTRRphysMask_MSR(reg), -size << PAGE_SHIFT | 0x800, - (-size & size_and_mask) >> (32 - PAGE_SHIFT)); + vr->base_lo = base << PAGE_SHIFT | type; + vr->base_hi = (base & size_and_mask) >> (32 - PAGE_SHIFT); + vr->mask_lo = -size << PAGE_SHIFT | 0x800; + vr->mask_hi = (-size & size_and_mask) >> (32 - PAGE_SHIFT); + + mtrr_wrmsr(MTRRphysBase_MSR(reg), vr->base_lo, vr->base_hi); + mtrr_wrmsr(MTRRphysMask_MSR(reg), vr->mask_lo, vr->mask_hi); } post_set(); diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c index e1c2042b9b7e..764cac64e211 100644 --- a/arch/i386/kernel/cpu/mtrr/main.c +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -332,6 +332,8 @@ int mtrr_add_page(unsigned long base, unsigned long size, error = -EINVAL; + /* No CPU hotplug when we change MTRR entries */ + lock_cpu_hotplug(); /* Search for existing MTRR */ down(&main_lock); for (i = 0; i < num_var_ranges; ++i) { @@ -372,9 +374,23 @@ int mtrr_add_page(unsigned long base, unsigned long size, error = i; out: up(&main_lock); + unlock_cpu_hotplug(); return error; } +static int mtrr_check(unsigned long base, unsigned long size) +{ + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + printk(KERN_WARNING + "mtrr: size and base must be multiples of 4 kiB\n"); + printk(KERN_DEBUG + "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + dump_stack(); + return -1; + } + return 0; +} + /** * mtrr_add - Add a memory type region * @base: Physical base address of region @@ -415,11 +431,8 @@ int mtrr_add(unsigned long base, unsigned long size, unsigned int type, char increment) { - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - printk(KERN_WARNING "mtrr: size and base must be multiples of 4 kiB\n"); - printk(KERN_DEBUG "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + if (mtrr_check(base, size)) return -EINVAL; - } return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, increment); } @@ -451,6 +464,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) return -ENXIO; max = num_var_ranges; + /* No CPU hotplug when we change MTRR entries */ + lock_cpu_hotplug(); down(&main_lock); if (reg < 0) { /* Search for existing MTRR */ @@ -491,6 +506,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) error = reg; out: up(&main_lock); + unlock_cpu_hotplug(); return error; } /** @@ -511,11 +527,8 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) int mtrr_del(int reg, unsigned long base, unsigned long size) { - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { - printk(KERN_INFO "mtrr: size and base must be multiples of 4 kiB\n"); - printk(KERN_DEBUG "mtrr: size: 0x%lx base: 0x%lx\n", size, base); + if (mtrr_check(base, size)) return -EINVAL; - } return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); } @@ -537,21 +550,9 @@ static void __init init_ifs(void) centaur_init_mtrr(); } -static void __init init_other_cpus(void) -{ - if (use_intel()) - get_mtrr_state(); - - /* bring up the other processors */ - set_mtrr(~0U,0,0,0); - - if (use_intel()) { - finalize_mtrr_state(); - mtrr_state_warn(); - } -} - - +/* The suspend/resume methods are only for CPU without MTRR. 
CPU using generic + * MTRR driver doesn't require this + */ struct mtrr_value { mtrr_type ltype; unsigned long lbase; @@ -604,13 +605,13 @@ static struct sysdev_driver mtrr_sysdev_driver = { /** - * mtrr_init - initialize mtrrs on the boot CPU + * mtrr_bp_init - initialize mtrrs on the boot CPU * * This needs to be called early; before any of the other CPUs are * initialized (i.e. before smp_init()). * */ -static int __init mtrr_init(void) +void __init mtrr_bp_init(void) { init_ifs(); @@ -667,12 +668,48 @@ static int __init mtrr_init(void) if (mtrr_if) { set_num_var_ranges(); init_table(); - init_other_cpus(); - - return sysdev_driver_register(&cpu_sysdev_class, - &mtrr_sysdev_driver); + if (use_intel()) + get_mtrr_state(); } - return -ENXIO; } -subsys_initcall(mtrr_init); +void mtrr_ap_init(void) +{ + unsigned long flags; + + if (!mtrr_if || !use_intel()) + return; + /* + * Ideally we should hold main_lock here to avoid mtrr entries changed, + * but this routine will be called in cpu boot time, holding the lock + * breaks it. This routine is called in two cases: 1.very earily time + * of software resume, when there absolutely isn't mtrr entry changes; + * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to + * prevent mtrr entry changes + */ + local_irq_save(flags); + + mtrr_if->set_all(); + + local_irq_restore(flags); +} + +static int __init mtrr_init_finialize(void) +{ + if (!mtrr_if) + return 0; + if (use_intel()) + mtrr_state_warn(); + else { + /* The CPUs haven't MTRR and seemes not support SMP. They have + * specific drivers, we use a tricky method to support + * suspend/resume for them. + * TBD: is there any system with such CPU which supports + * suspend/resume? if no, we should remove the code. + */ + sysdev_driver_register(&cpu_sysdev_class, + &mtrr_sysdev_driver); + } + return 0; +} +subsys_initcall(mtrr_init_finialize); diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h index de1351245599..99c9f2682041 100644 --- a/arch/i386/kernel/cpu/mtrr/mtrr.h +++ b/arch/i386/kernel/cpu/mtrr/mtrr.h @@ -91,7 +91,6 @@ extern struct mtrr_ops * mtrr_if; extern unsigned int num_var_ranges; -void finalize_mtrr_state(void); void mtrr_state_warn(void); char *mtrr_attrib_to_str(int x); void mtrr_wrmsr(unsigned, unsigned, unsigned); diff --git a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c index 7323c19f354e..8bd77d948a84 100644 --- a/arch/i386/kernel/cpu/proc.c +++ b/arch/i386/kernel/cpu/proc.c @@ -86,7 +86,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "stepping\t: unknown\n"); if ( cpu_has(c, X86_FEATURE_TSC) ) { - seq_printf(m, "cpu MHz\t\t: %lu.%03lu\n", + seq_printf(m, "cpu MHz\t\t: %u.%03u\n", cpu_khz / 1000, (cpu_khz % 1000)); } diff --git a/arch/i386/kernel/cpu/transmeta.c b/arch/i386/kernel/cpu/transmeta.c index f57e5ee94943..fc426380366b 100644 --- a/arch/i386/kernel/cpu/transmeta.c +++ b/arch/i386/kernel/cpu/transmeta.c @@ -76,6 +76,12 @@ static void __init init_transmeta(struct cpuinfo_x86 *c) #define USER686 (X86_FEATURE_TSC|X86_FEATURE_CX8|X86_FEATURE_CMOV) if ( c->x86 == 5 && (c->x86_capability[0] & USER686) == USER686 ) c->x86 = 6; + +#ifdef CONFIG_SYSCTL + /* randomize_va_space slows us down enormously; + it probably triggers retranslation of x86->native bytecode */ + randomize_va_space = 0; +#endif } static void transmeta_identify(struct cpuinfo_x86 * c) diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c new file mode 100644 index 000000000000..e5fab12f7926 --- /dev/null +++ 
b/arch/i386/kernel/crash.c @@ -0,0 +1,223 @@ +/* + * Architecture specific (i386) functions for kexec based crash dumps. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * + * Copyright (C) IBM Corporation, 2004. All rights reserved. + * + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/irq.h> +#include <linux/reboot.h> +#include <linux/kexec.h> +#include <linux/irq.h> +#include <linux/delay.h> +#include <linux/elf.h> +#include <linux/elfcore.h> + +#include <asm/processor.h> +#include <asm/hardirq.h> +#include <asm/nmi.h> +#include <asm/hw_irq.h> +#include <asm/apic.h> +#include <mach_ipi.h> + + +note_buf_t crash_notes[NR_CPUS]; +/* This keeps a track of which one is crashing cpu. */ +static int crashing_cpu; + +static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, + size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, ¬e, sizeof(note)); + buf += (sizeof(note) +3)/4; + memcpy(buf, name, note.n_namesz); + buf += (note.n_namesz + 3)/4; + memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, ¬e, sizeof(note)); +} + +static void crash_save_this_cpu(struct pt_regs *regs, int cpu) +{ + struct elf_prstatus prstatus; + u32 *buf; + + if ((cpu < 0) || (cpu >= NR_CPUS)) + return; + + /* Using ELF notes here is opportunistic. + * I need a well defined structure format + * for the data I pass, and I need tags + * on the data to indicate what information I have + * squirrelled away. ELF notes happen to provide + * all of that that no need to invent something new. + */ + buf = &crash_notes[cpu][0]; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.pr_pid = current->pid; + elf_core_copy_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, + sizeof(prstatus)); + final_note(buf); +} + +static void crash_get_current_regs(struct pt_regs *regs) +{ + __asm__ __volatile__("movl %%ebx,%0" : "=m"(regs->ebx)); + __asm__ __volatile__("movl %%ecx,%0" : "=m"(regs->ecx)); + __asm__ __volatile__("movl %%edx,%0" : "=m"(regs->edx)); + __asm__ __volatile__("movl %%esi,%0" : "=m"(regs->esi)); + __asm__ __volatile__("movl %%edi,%0" : "=m"(regs->edi)); + __asm__ __volatile__("movl %%ebp,%0" : "=m"(regs->ebp)); + __asm__ __volatile__("movl %%eax,%0" : "=m"(regs->eax)); + __asm__ __volatile__("movl %%esp,%0" : "=m"(regs->esp)); + __asm__ __volatile__("movw %%ss, %%ax;" :"=a"(regs->xss)); + __asm__ __volatile__("movw %%cs, %%ax;" :"=a"(regs->xcs)); + __asm__ __volatile__("movw %%ds, %%ax;" :"=a"(regs->xds)); + __asm__ __volatile__("movw %%es, %%ax;" :"=a"(regs->xes)); + __asm__ __volatile__("pushfl; popl %0" :"=m"(regs->eflags)); + + regs->eip = (unsigned long)current_text_addr(); +} + +/* CPU does not save ss and esp on stack if execution is already + * running in kernel mode at the time of NMI occurrence. This code + * fixes it. 
+ */ +static void crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs) +{ + memcpy(newregs, oldregs, sizeof(*newregs)); + newregs->esp = (unsigned long)&(oldregs->esp); + __asm__ __volatile__("xorl %eax, %eax;"); + __asm__ __volatile__ ("movw %%ss, %%ax;" :"=a"(newregs->xss)); +} + +/* We may have saved_regs from where the error came from + * or it is NULL if via a direct panic(). + */ +static void crash_save_self(struct pt_regs *saved_regs) +{ + struct pt_regs regs; + int cpu; + + cpu = smp_processor_id(); + if (saved_regs) + crash_setup_regs(®s, saved_regs); + else + crash_get_current_regs(®s); + crash_save_this_cpu(®s, cpu); +} + +#ifdef CONFIG_SMP +static atomic_t waiting_for_crash_ipi; + +static int crash_nmi_callback(struct pt_regs *regs, int cpu) +{ + struct pt_regs fixed_regs; + + /* Don't do anything if this handler is invoked on crashing cpu. + * Otherwise, system will completely hang. Crashing cpu can get + * an NMI if system was initially booted with nmi_watchdog parameter. + */ + if (cpu == crashing_cpu) + return 1; + local_irq_disable(); + + if (!user_mode(regs)) { + crash_setup_regs(&fixed_regs, regs); + regs = &fixed_regs; + } + crash_save_this_cpu(regs, cpu); + disable_local_APIC(); + atomic_dec(&waiting_for_crash_ipi); + /* Assume hlt works */ + __asm__("hlt"); + for(;;); + + return 1; +} + +/* + * By using the NMI code instead of a vector we just sneak thru the + * word generator coming out with just what we want. AND it does + * not matter if clustered_apic_mode is set or not. + */ +static void smp_send_nmi_allbutself(void) +{ + send_IPI_allbutself(APIC_DM_NMI); +} + +static void nmi_shootdown_cpus(void) +{ + unsigned long msecs; + + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); + /* Would it be better to replace the trap vector here? */ + set_nmi_callback(crash_nmi_callback); + /* Ensure the new callback function is set before sending + * out the NMI + */ + wmb(); + + smp_send_nmi_allbutself(); + + msecs = 1000; /* Wait at most a second for the other cpus to stop */ + while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { + mdelay(1); + msecs--; + } + + /* Leave the nmi callback set */ + disable_local_APIC(); +} +#else +static void nmi_shootdown_cpus(void) +{ + /* There are no cpus to shootdown */ +} +#endif + +void machine_crash_shutdown(struct pt_regs *regs) +{ + /* This function is only called after the system + * has paniced or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. + * + * In practice this means shooting down the other cpus in + * an SMP system. + */ + /* The kernel is broken so disable interrupts */ + local_irq_disable(); + + /* Make a note of crashing cpu. 
Will be used in NMI callback.*/ + crashing_cpu = smp_processor_id(); + nmi_shootdown_cpus(); + lapic_shutdown(); +#if defined(CONFIG_X86_IO_APIC) + disable_IO_APIC(); +#endif + crash_save_self(regs); +} diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c index 6ed7e28f306c..a3cdf894302b 100644 --- a/arch/i386/kernel/dmi_scan.c +++ b/arch/i386/kernel/dmi_scan.c @@ -1,22 +1,15 @@ #include <linux/types.h> -#include <linux/kernel.h> #include <linux/string.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/slab.h> -#include <linux/acpi.h> -#include <asm/io.h> -#include <linux/pm.h> -#include <asm/system.h> #include <linux/dmi.h> #include <linux/bootmem.h> -struct dmi_header -{ - u8 type; - u8 length; - u16 handle; +struct dmi_header { + u8 type; + u8 length; + u16 handle; }; #undef DMI_DEBUG @@ -29,15 +22,13 @@ struct dmi_header static char * __init dmi_string(struct dmi_header *dm, u8 s) { - u8 *bp=(u8 *)dm; - bp+=dm->length; - if(!s) + u8 *bp = ((u8 *) dm) + dm->length; + + if (!s) return ""; s--; - while(s>0 && *bp) - { - bp+=strlen(bp); - bp++; + while (s > 0 && *bp) { + bp += strlen(bp) + 1; s--; } return bp; @@ -47,16 +38,14 @@ static char * __init dmi_string(struct dmi_header *dm, u8 s) * We have to be cautious here. We have seen BIOSes with DMI pointers * pointing to completely the wrong place for example */ - -static int __init dmi_table(u32 base, int len, int num, void (*decode)(struct dmi_header *)) +static int __init dmi_table(u32 base, int len, int num, + void (*decode)(struct dmi_header *)) { - u8 *buf; - struct dmi_header *dm; - u8 *data; - int i=0; + u8 *buf, *data; + int i = 0; buf = bt_ioremap(base, len); - if(buf==NULL) + if (buf == NULL) return -1; data = buf; @@ -65,36 +54,34 @@ static int __init dmi_table(u32 base, int len, int num, void (*decode)(struct dm * Stop when we see all the items the table claimed to have * OR we run off the end of the table (also happens) */ - - while(i<num && data-buf+sizeof(struct dmi_header)<=len) - { - dm=(struct dmi_header *)data; + while ((i < num) && (data - buf + sizeof(struct dmi_header)) <= len) { + struct dmi_header *dm = (struct dmi_header *)data; /* * We want to know the total length (formated area and strings) * before decoding to make sure we won't run off the table in * dmi_decode or dmi_string */ - data+=dm->length; - while(data-buf<len-1 && (data[0] || data[1])) + data += dm->length; + while ((data - buf < len - 1) && (data[0] || data[1])) data++; - if(data-buf<len-1) + if (data - buf < len - 1) decode(dm); - data+=2; + data += 2; i++; } bt_iounmap(buf, len); return 0; } - -inline static int __init dmi_checksum(u8 *buf) +static int __init dmi_checksum(u8 *buf) { - u8 sum=0; + u8 sum = 0; int a; - for(a=0; a<15; a++) - sum+=buf[a]; - return (sum==0); + for (a = 0; a < 15; a++) + sum += buf[a]; + + return sum == 0; } static int __init dmi_iterate(void (*decode)(struct dmi_header *)) @@ -110,28 +97,30 @@ static int __init dmi_iterate(void (*decode)(struct dmi_header *)) p = ioremap(0xF0000, 0x10000); if (p == NULL) return -1; + for (q = p; q < p + 0x10000; q += 16) { memcpy_fromio(buf, q, 15); - if(memcmp(buf, "_DMI_", 5)==0 && dmi_checksum(buf)) - { - u16 num=buf[13]<<8|buf[12]; - u16 len=buf[7]<<8|buf[6]; - u32 base=buf[11]<<24|buf[10]<<16|buf[9]<<8|buf[8]; + if ((memcmp(buf, "_DMI_", 5) == 0) && dmi_checksum(buf)) { + u16 num = (buf[13] << 8) | buf[12]; + u16 len = (buf[7] << 8) | buf[6]; + u32 base = (buf[11] << 24) | (buf[10] << 16) | + (buf[9] << 8) | buf[8]; /* * DMI version 0.0 means 
that the real version is taken from * the SMBIOS version, which we don't know at this point. */ - if(buf[14]!=0) + if (buf[14] != 0) printk(KERN_INFO "DMI %d.%d present.\n", - buf[14]>>4, buf[14]&0x0F); + buf[14] >> 4, buf[14] & 0xF); else printk(KERN_INFO "DMI present.\n"); + dmi_printk((KERN_INFO "%d structures occupying %d bytes.\n", num, len)); - dmi_printk((KERN_INFO "DMI table at 0x%08X.\n", - base)); - if(dmi_table(base,len, num, decode)==0) + dmi_printk((KERN_INFO "DMI table at 0x%08X.\n", base)); + + if (dmi_table(base,len, num, decode) == 0) return 0; } } @@ -143,16 +132,17 @@ static char *dmi_ident[DMI_STRING_MAX]; /* * Save a DMI string */ - static void __init dmi_save_ident(struct dmi_header *dm, int slot, int string) { char *d = (char*)dm; char *p = dmi_string(dm, d[string]); - if(p==NULL || *p == 0) + + if (p == NULL || *p == 0) return; if (dmi_ident[slot]) return; - dmi_ident[slot] = alloc_bootmem(strlen(p)+1); + + dmi_ident[slot] = alloc_bootmem(strlen(p) + 1); if(dmi_ident[slot]) strcpy(dmi_ident[slot], p); else @@ -160,281 +150,47 @@ static void __init dmi_save_ident(struct dmi_header *dm, int slot, int string) } /* - * Ugly compatibility crap. - */ -#define dmi_blacklist dmi_system_id -#define NO_MATCH { DMI_NONE, NULL} -#define MATCH DMI_MATCH - -/* - * Toshiba keyboard likes to repeat keys when they are not repeated. - */ - -static __init int broken_toshiba_keyboard(struct dmi_blacklist *d) -{ - printk(KERN_WARNING "Toshiba with broken keyboard detected. If your keyboard sometimes generates 3 keypresses instead of one, see http://davyd.ucc.asn.au/projects/toshiba/README\n"); - return 0; -} - - -#ifdef CONFIG_ACPI_SLEEP -static __init int reset_videomode_after_s3(struct dmi_blacklist *d) -{ - /* See acpi_wakeup.S */ - extern long acpi_video_flags; - acpi_video_flags |= 2; - return 0; -} -#endif - - -#ifdef CONFIG_ACPI_BOOT -extern int acpi_force; - -static __init __attribute__((unused)) int dmi_disable_acpi(struct dmi_blacklist *d) -{ - if (!acpi_force) { - printk(KERN_NOTICE "%s detected: acpi off\n",d->ident); - disable_acpi(); - } else { - printk(KERN_NOTICE - "Warning: DMI blacklist says broken, but acpi forced\n"); - } - return 0; -} - -/* - * Limit ACPI to CPU enumeration for HT - */ -static __init __attribute__((unused)) int force_acpi_ht(struct dmi_blacklist *d) -{ - if (!acpi_force) { - printk(KERN_NOTICE "%s detected: force use of acpi=ht\n", d->ident); - disable_acpi(); - acpi_ht = 1; - } else { - printk(KERN_NOTICE - "Warning: acpi=force overrules DMI blacklist: acpi=ht\n"); - } - return 0; -} -#endif - -#ifdef CONFIG_ACPI_PCI -static __init int disable_acpi_irq(struct dmi_blacklist *d) -{ - if (!acpi_force) { - printk(KERN_NOTICE "%s detected: force use of acpi=noirq\n", - d->ident); - acpi_noirq_set(); - } - return 0; -} -static __init int disable_acpi_pci(struct dmi_blacklist *d) -{ - if (!acpi_force) { - printk(KERN_NOTICE "%s detected: force use of pci=noacpi\n", - d->ident); - acpi_disable_pci(); - } - return 0; -} -#endif - -/* - * Process the DMI blacklists - */ - - -/* - * This will be expanded over time to force things like the APM - * interrupt mask settings according to the laptop - */ - -static __initdata struct dmi_blacklist dmi_blacklist[]={ - - { broken_toshiba_keyboard, "Toshiba Satellite 4030cdt", { /* Keyboard generates spurious repeats */ - MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"), - NO_MATCH, NO_MATCH, NO_MATCH - } }, -#ifdef CONFIG_ACPI_SLEEP - { reset_videomode_after_s3, "Toshiba Satellite 4030cdt", { /* Reset video mode after 
returning from ACPI S3 sleep */ - MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"), - NO_MATCH, NO_MATCH, NO_MATCH - } }, -#endif - -#ifdef CONFIG_ACPI_BOOT - /* - * If your system is blacklisted here, but you find that acpi=force - * works for you, please contact acpi-devel@sourceforge.net - */ - - /* - * Boxes that need ACPI disabled - */ - - { dmi_disable_acpi, "IBM Thinkpad", { - MATCH(DMI_BOARD_VENDOR, "IBM"), - MATCH(DMI_BOARD_NAME, "2629H1G"), - NO_MATCH, NO_MATCH }}, - - /* - * Boxes that need acpi=ht - */ - - { force_acpi_ht, "FSC Primergy T850", { - MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), - MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "DELL GX240", { - MATCH(DMI_BOARD_VENDOR, "Dell Computer Corporation"), - MATCH(DMI_BOARD_NAME, "OptiPlex GX240"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "HP VISUALIZE NT Workstation", { - MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"), - MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "Compaq Workstation W8000", { - MATCH(DMI_SYS_VENDOR, "Compaq"), - MATCH(DMI_PRODUCT_NAME, "Workstation W8000"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "ASUS P4B266", { - MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - MATCH(DMI_BOARD_NAME, "P4B266"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "ASUS P2B-DS", { - MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - MATCH(DMI_BOARD_NAME, "P2B-DS"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "ASUS CUR-DLS", { - MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - MATCH(DMI_BOARD_NAME, "CUR-DLS"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "ABIT i440BX-W83977", { - MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"), - MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "IBM Bladecenter", { - MATCH(DMI_BOARD_VENDOR, "IBM"), - MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "IBM eServer xSeries 360", { - MATCH(DMI_BOARD_VENDOR, "IBM"), - MATCH(DMI_BOARD_NAME, "eServer xSeries 360"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "IBM eserver xSeries 330", { - MATCH(DMI_BOARD_VENDOR, "IBM"), - MATCH(DMI_BOARD_NAME, "eserver xSeries 330"), - NO_MATCH, NO_MATCH }}, - - { force_acpi_ht, "IBM eserver xSeries 440", { - MATCH(DMI_BOARD_VENDOR, "IBM"), - MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"), - NO_MATCH, NO_MATCH }}, - -#endif // CONFIG_ACPI_BOOT - -#ifdef CONFIG_ACPI_PCI - /* - * Boxes that need ACPI PCI IRQ routing disabled - */ - - { disable_acpi_irq, "ASUS A7V", { - MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC"), - MATCH(DMI_BOARD_NAME, "<A7V>"), - /* newer BIOS, Revision 1011, does work */ - MATCH(DMI_BIOS_VERSION, "ASUS A7V ACPI BIOS Revision 1007"), - NO_MATCH }}, - - /* - * Boxes that need ACPI PCI IRQ routing and PCI scan disabled - */ - { disable_acpi_pci, "ASUS PR-DLS", { /* _BBN 0 bug */ - MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - MATCH(DMI_BOARD_NAME, "PR-DLS"), - MATCH(DMI_BIOS_VERSION, "ASUS PR-DLS ACPI BIOS Revision 1010"), - MATCH(DMI_BIOS_DATE, "03/21/2003") }}, - - { disable_acpi_pci, "Acer TravelMate 36x Laptop", { - MATCH(DMI_SYS_VENDOR, "Acer"), - MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), - NO_MATCH, NO_MATCH - } }, - -#endif - - { NULL, } -}; - -/* * Process a DMI table entry. Right now all we care about are the BIOS * and machine entries. For 2.5 we should pull the smbus controller info * out of here. 
*/ - static void __init dmi_decode(struct dmi_header *dm) { -#ifdef DMI_DEBUG - u8 *data = (u8 *)dm; -#endif + u8 *data __attribute__((__unused__)) = (u8 *)dm; - switch(dm->type) - { - case 0: - dmi_printk(("BIOS Vendor: %s\n", - dmi_string(dm, data[4]))); - dmi_save_ident(dm, DMI_BIOS_VENDOR, 4); - dmi_printk(("BIOS Version: %s\n", - dmi_string(dm, data[5]))); - dmi_save_ident(dm, DMI_BIOS_VERSION, 5); - dmi_printk(("BIOS Release: %s\n", - dmi_string(dm, data[8]))); - dmi_save_ident(dm, DMI_BIOS_DATE, 8); - break; - case 1: - dmi_printk(("System Vendor: %s\n", - dmi_string(dm, data[4]))); - dmi_save_ident(dm, DMI_SYS_VENDOR, 4); - dmi_printk(("Product Name: %s\n", - dmi_string(dm, data[5]))); - dmi_save_ident(dm, DMI_PRODUCT_NAME, 5); - dmi_printk(("Version: %s\n", - dmi_string(dm, data[6]))); - dmi_save_ident(dm, DMI_PRODUCT_VERSION, 6); - dmi_printk(("Serial Number: %s\n", - dmi_string(dm, data[7]))); - break; - case 2: - dmi_printk(("Board Vendor: %s\n", - dmi_string(dm, data[4]))); - dmi_save_ident(dm, DMI_BOARD_VENDOR, 4); - dmi_printk(("Board Name: %s\n", - dmi_string(dm, data[5]))); - dmi_save_ident(dm, DMI_BOARD_NAME, 5); - dmi_printk(("Board Version: %s\n", - dmi_string(dm, data[6]))); - dmi_save_ident(dm, DMI_BOARD_VERSION, 6); - break; + switch(dm->type) { + case 0: + dmi_printk(("BIOS Vendor: %s\n", dmi_string(dm, data[4]))); + dmi_save_ident(dm, DMI_BIOS_VENDOR, 4); + dmi_printk(("BIOS Version: %s\n", dmi_string(dm, data[5]))); + dmi_save_ident(dm, DMI_BIOS_VERSION, 5); + dmi_printk(("BIOS Release: %s\n", dmi_string(dm, data[8]))); + dmi_save_ident(dm, DMI_BIOS_DATE, 8); + break; + case 1: + dmi_printk(("System Vendor: %s\n", dmi_string(dm, data[4]))); + dmi_save_ident(dm, DMI_SYS_VENDOR, 4); + dmi_printk(("Product Name: %s\n", dmi_string(dm, data[5]))); + dmi_save_ident(dm, DMI_PRODUCT_NAME, 5); + dmi_printk(("Version: %s\n", dmi_string(dm, data[6]))); + dmi_save_ident(dm, DMI_PRODUCT_VERSION, 6); + dmi_printk(("Serial Number: %s\n", dmi_string(dm, data[7]))); + dmi_save_ident(dm, DMI_PRODUCT_SERIAL, 7); + break; + case 2: + dmi_printk(("Board Vendor: %s\n", dmi_string(dm, data[4]))); + dmi_save_ident(dm, DMI_BOARD_VENDOR, 4); + dmi_printk(("Board Name: %s\n", dmi_string(dm, data[5]))); + dmi_save_ident(dm, DMI_BOARD_NAME, 5); + dmi_printk(("Board Version: %s\n", dmi_string(dm, data[6]))); + dmi_save_ident(dm, DMI_BOARD_VERSION, 6); + break; } } void __init dmi_scan_machine(void) { - int err = dmi_iterate(dmi_decode); - if(err == 0) - dmi_check_system(dmi_blacklist); - else + if (dmi_iterate(dmi_decode)) printk(KERN_INFO "DMI not present.\n"); } @@ -470,7 +226,6 @@ fail: d++; return count; } - EXPORT_SYMBOL(dmi_check_system); /** @@ -480,8 +235,8 @@ EXPORT_SYMBOL(dmi_check_system); * Returns one DMI data value, can be used to perform * complex DMI data checks. 
*/ -char * dmi_get_system_info(int field) +char *dmi_get_system_info(int field) { return dmi_ident[field]; } - +EXPORT_SYMBOL(dmi_get_system_info); diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index f732f427b418..385883ea8c19 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -30,6 +30,7 @@ #include <linux/ioport.h> #include <linux/module.h> #include <linux/efi.h> +#include <linux/kexec.h> #include <asm/setup.h> #include <asm/io.h> @@ -598,6 +599,9 @@ efi_initialize_iomem_resources(struct resource *code_resource, if (md->type == EFI_CONVENTIONAL_MEMORY) { request_resource(res, code_resource); request_resource(res, data_resource); +#ifdef CONFIG_KEXEC + request_resource(res, &crashk_res); +#endif } } } diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S index e966fc8c44c4..4477bb107098 100644 --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -299,7 +299,6 @@ is386: movl $2,%ecx # set MP movl %eax,%cr0 call check_x87 - incb ready lgdt cpu_gdt_descr lidt idt_descr ljmp $(__KERNEL_CS),$1f @@ -316,8 +315,9 @@ is386: movl $2,%ecx # set MP lldt %ax cld # gcc2 wants the direction flag cleared at all times #ifdef CONFIG_SMP - movb ready, %cl - cmpb $1,%cl + movb ready, %cl + movb $1, ready + cmpb $0,%cl je 1f # the first CPU calls start_kernel # all other CPUs call initialize_secondary call initialize_secondary diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c index 903190a4b3ff..180f070d03cb 100644 --- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -1,97 +1,17 @@ #include <linux/config.h> #include <linux/module.h> -#include <linux/smp.h> -#include <linux/user.h> -#include <linux/elfcore.h> -#include <linux/mca.h> -#include <linux/sched.h> -#include <linux/in6.h> -#include <linux/interrupt.h> -#include <linux/smp_lock.h> -#include <linux/pm.h> -#include <linux/pci.h> -#include <linux/apm_bios.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/tty.h> -#include <linux/highmem.h> -#include <linux/time.h> - -#include <asm/semaphore.h> -#include <asm/processor.h> -#include <asm/i387.h> -#include <asm/uaccess.h> #include <asm/checksum.h> -#include <asm/io.h> -#include <asm/delay.h> -#include <asm/irq.h> -#include <asm/mmx.h> #include <asm/desc.h> -#include <asm/pgtable.h> -#include <asm/tlbflush.h> -#include <asm/nmi.h> -#include <asm/ist.h> -#include <asm/kdebug.h> - -extern void dump_thread(struct pt_regs *, struct user *); -extern spinlock_t rtc_lock; /* This is definitely a GPL-only symbol */ EXPORT_SYMBOL_GPL(cpu_gdt_table); -#if defined(CONFIG_APM_MODULE) -extern void machine_real_restart(unsigned char *, int); -EXPORT_SYMBOL(machine_real_restart); -extern void default_idle(void); -EXPORT_SYMBOL(default_idle); -#endif - -#ifdef CONFIG_SMP -extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); -extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); -#endif - -#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) -extern struct drive_info_struct drive_info; -EXPORT_SYMBOL(drive_info); -#endif - -extern unsigned long cpu_khz; -extern unsigned long get_cmos_time(void); - -/* platform dependent support */ -EXPORT_SYMBOL(boot_cpu_data); -#ifdef CONFIG_DISCONTIGMEM -EXPORT_SYMBOL(node_data); -EXPORT_SYMBOL(physnode_map); -#endif -#ifdef CONFIG_X86_NUMAQ -EXPORT_SYMBOL(xquad_portio); -#endif -EXPORT_SYMBOL(dump_thread); -EXPORT_SYMBOL(dump_fpu); -EXPORT_SYMBOL_GPL(kernel_fpu_begin); 
-EXPORT_SYMBOL(__ioremap); -EXPORT_SYMBOL(ioremap_nocache); -EXPORT_SYMBOL(iounmap); -EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(pm_idle); -EXPORT_SYMBOL(pm_power_off); -EXPORT_SYMBOL(get_cmos_time); -EXPORT_SYMBOL(cpu_khz); -EXPORT_SYMBOL(apm_info); - EXPORT_SYMBOL(__down_failed); EXPORT_SYMBOL(__down_failed_interruptible); EXPORT_SYMBOL(__down_failed_trylock); EXPORT_SYMBOL(__up_wakeup); /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); -/* Delay loops */ -EXPORT_SYMBOL(__ndelay); -EXPORT_SYMBOL(__udelay); -EXPORT_SYMBOL(__delay); -EXPORT_SYMBOL(__const_udelay); EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); @@ -105,87 +25,11 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strstr); -EXPORT_SYMBOL(strncpy_from_user); -EXPORT_SYMBOL(__strncpy_from_user); -EXPORT_SYMBOL(clear_user); -EXPORT_SYMBOL(__clear_user); -EXPORT_SYMBOL(__copy_from_user_ll); -EXPORT_SYMBOL(__copy_to_user_ll); -EXPORT_SYMBOL(strnlen_user); - -EXPORT_SYMBOL(dma_alloc_coherent); -EXPORT_SYMBOL(dma_free_coherent); - -#ifdef CONFIG_PCI -EXPORT_SYMBOL(pci_mem_start); -#endif - -#ifdef CONFIG_PCI_BIOS -EXPORT_SYMBOL(pcibios_set_irq_routing); -EXPORT_SYMBOL(pcibios_get_irq_routing_table); -#endif - -#ifdef CONFIG_X86_USE_3DNOW -EXPORT_SYMBOL(_mmx_memcpy); -EXPORT_SYMBOL(mmx_clear_page); -EXPORT_SYMBOL(mmx_copy_page); -#endif - -#ifdef CONFIG_X86_HT -EXPORT_SYMBOL(smp_num_siblings); -EXPORT_SYMBOL(cpu_sibling_map); -#endif - #ifdef CONFIG_SMP -EXPORT_SYMBOL(cpu_data); -EXPORT_SYMBOL(cpu_online_map); -EXPORT_SYMBOL(cpu_callout_map); +extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); +extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); EXPORT_SYMBOL(__write_lock_failed); EXPORT_SYMBOL(__read_lock_failed); - -/* Global SMP stuff */ -EXPORT_SYMBOL(smp_call_function); - -/* TLB flushing */ -EXPORT_SYMBOL(flush_tlb_page); -#endif - -#ifdef CONFIG_X86_IO_APIC -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); -#endif - -#ifdef CONFIG_MCA -EXPORT_SYMBOL(machine_id); -#endif - -#ifdef CONFIG_VT -EXPORT_SYMBOL(screen_info); -#endif - -EXPORT_SYMBOL(get_wchan); - -EXPORT_SYMBOL(rtc_lock); - -EXPORT_SYMBOL_GPL(set_nmi_callback); -EXPORT_SYMBOL_GPL(unset_nmi_callback); - -EXPORT_SYMBOL(register_die_notifier); -#ifdef CONFIG_HAVE_DEC_LOCK -EXPORT_SYMBOL(_atomic_dec_and_lock); -#endif - -EXPORT_SYMBOL(__PAGE_KERNEL); - -#ifdef CONFIG_HIGHMEM -EXPORT_SYMBOL(kmap); -EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); -#endif - -#if defined(CONFIG_X86_SPEEDSTEP_SMI) || defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) -EXPORT_SYMBOL(ist_info); #endif EXPORT_SYMBOL(csum_partial); diff --git a/arch/i386/kernel/i387.c b/arch/i386/kernel/i387.c index c55e037f08f7..d75524758daf 100644 --- a/arch/i386/kernel/i387.c +++ b/arch/i386/kernel/i387.c @@ -10,6 +10,7 @@ #include <linux/config.h> #include <linux/sched.h> +#include <linux/module.h> #include <asm/processor.h> #include <asm/i387.h> #include <asm/math_emu.h> @@ -79,17 +80,7 @@ void kernel_fpu_begin(void) } clts(); } - -void restore_fpu( struct task_struct *tsk ) -{ - if ( cpu_has_fxsr ) { - asm volatile( "fxrstor %0" - : : "m" (tsk->thread.i387.fxsave) ); - } else { - asm volatile( "frstor %0" - : : "m" (tsk->thread.i387.fsave) ); - } -} +EXPORT_SYMBOL_GPL(kernel_fpu_begin); /* * FPU tag word conversions. 
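The i387.c hunk above drops the unused restore_fpu() and exports kernel_fpu_begin() GPL-only next to its definition. For orientation, this is a minimal sketch of the conventional calling pattern for in-kernel FPU/SSE use; kernel_fpu_end() is assumed from <asm/i387.h> and is not part of this hunk:

#include <asm/i387.h>

/* Sketch only: kernel_fpu_begin() disables preemption and saves the
 * current task's FPU state if it was live; kernel_fpu_end() sets CR0.TS
 * again and re-enables preemption, so no sleeping in between. */
static void fpu_usage_sketch(void)
{
	kernel_fpu_begin();
	/* ... MMX/SSE instructions may be used here ... */
	kernel_fpu_end();
}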
@@ -526,6 +517,7 @@ int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) return fpvalid; } +EXPORT_SYMBOL(dump_fpu); int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) { diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c index 2c4813b47e57..178f4e9bac9d 100644 --- a/arch/i386/kernel/i8259.c +++ b/arch/i386/kernel/i8259.c @@ -268,10 +268,22 @@ static int i8259A_suspend(struct sys_device *dev, pm_message_t state) return 0; } +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-1 */ + return 0; +} + static struct sysdev_class i8259_sysdev_class = { set_kset_name("i8259"), .suspend = i8259A_suspend, .resume = i8259A_resume, + .shutdown = i8259A_shutdown, }; static struct sys_device device_i8259A = { diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c index 7a324e8b86f9..6578f40bd501 100644 --- a/arch/i386/kernel/io_apic.c +++ b/arch/i386/kernel/io_apic.c @@ -31,12 +31,13 @@ #include <linux/mc146818rtc.h> #include <linux/compiler.h> #include <linux/acpi.h> - +#include <linux/module.h> #include <linux/sysdev.h> #include <asm/io.h> #include <asm/smp.h> #include <asm/desc.h> #include <asm/timer.h> +#include <asm/i8259.h> #include <mach_apic.h> @@ -573,12 +574,14 @@ static int balanced_irq(void *unused) for ( ; ; ) { set_current_state(TASK_INTERRUPTIBLE); time_remaining = schedule_timeout(time_remaining); - try_to_freeze(PF_FREEZE); + try_to_freeze(); if (time_after(jiffies, prev_balance_time+balanced_irq_interval)) { + preempt_disable(); do_irq_balance(); prev_balance_time = jiffies; time_remaining = balanced_irq_interval; + preempt_enable(); } } return 0; @@ -630,10 +633,8 @@ static int __init balanced_irq_init(void) printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); failed: for (i = 0; i < NR_CPUS; i++) { - if(irq_cpu_data[i].irq_delta) - kfree(irq_cpu_data[i].irq_delta); - if(irq_cpu_data[i].last_irq) - kfree(irq_cpu_data[i].last_irq); + kfree(irq_cpu_data[i].irq_delta); + kfree(irq_cpu_data[i].last_irq); } return 0; } @@ -812,6 +813,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) } return best_guess; } +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); /* * This function currently is only a helper for the i386 smp boot process where @@ -1565,7 +1567,6 @@ void print_all_local_APICs (void) void /*__init*/ print_PIC(void) { - extern spinlock_t i8259A_lock; unsigned int v; unsigned long flags; @@ -1633,12 +1634,43 @@ static void __init enable_IO_APIC(void) */ void disable_IO_APIC(void) { + int pin; /* * Clear the IO-APIC before rebooting: */ clear_IO_APIC(); - disconnect_bsp_APIC(); + /* + * If the i82559 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrups can be delivered. 
+ */ + pin = find_isa_irq_pin(0, mp_ExtINT); + if (pin != -1) { + struct IO_APIC_route_entry entry; + unsigned long flags; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = 7; /* ExtInt */ + entry.vector = 0; + entry.dest.physical.physical_dest = 0; + + + /* + * Add it to the IO-APIC irq-routing table: + */ + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1)); + io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0)); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + disconnect_bsp_APIC(pin != -1); } /* @@ -1659,6 +1691,12 @@ static void __init setup_ioapic_ids_from_mpc(void) unsigned long flags; /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && boot_cpu_data.x86 < 15)) + return; + /* * This is broken; anything with a real cpu count has to * circumvent this idiocy regardless. */ @@ -1684,10 +1722,6 @@ static void __init setup_ioapic_ids_from_mpc(void) mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; } - /* Don't check I/O APIC IDs for some xAPIC systems. They have - * no meaning without the serial APIC bus. */ - if (NO_IOAPIC_CHECK) - continue; /* * Sanity check, is the ID really free? Every APIC in a * system must have a unique ID or we get lots of nice diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 73945a3c53c4..ce66dcc26d90 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -15,6 +15,9 @@ #include <linux/seq_file.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/delay.h> DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_maxaligned_in_smp; EXPORT_PER_CPU_SYMBOL(irq_stat); @@ -153,6 +156,11 @@ void irq_ctx_init(int cpu) cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); } +void irq_ctx_exit(int cpu) +{ + hardirq_ctx[cpu] = NULL; +} + extern asmlinkage void __do_softirq(void); asmlinkage void do_softirq(void) @@ -210,9 +218,8 @@ int show_interrupts(struct seq_file *p, void *v) if (i == 0) { seq_printf(p, " "); - for (j=0; j<NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "CPU%d ",j); + for_each_cpu(j) + seq_printf(p, "CPU%d ",j); seq_putc(p, '\n'); } @@ -225,9 +232,8 @@ int show_interrupts(struct seq_file *p, void *v) #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); + for_each_cpu(j) + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif seq_printf(p, " %14s", irq_desc[i].handler->typename); seq_printf(p, " %s", action->name); @@ -240,16 +246,14 @@ skip: spin_unlock_irqrestore(&irq_desc[i].lock, flags); } else if (i == NR_IRQS) { seq_printf(p, "NMI: "); - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", nmi_count(j)); + for_each_cpu(j) + seq_printf(p, "%10u ", nmi_count(j)); seq_putc(p, '\n'); #ifdef CONFIG_X86_LOCAL_APIC seq_printf(p, "LOC: "); - for (j = 0; j < NR_CPUS; j++) - if (cpu_online(j)) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).apic_timer_irqs); + for_each_cpu(j) + seq_printf(p, "%10u ", + per_cpu(irq_stat,j).apic_timer_irqs); seq_putc(p, '\n'); #endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); @@ -259,3 +263,45 @@ skip: } return 0; } + +#ifdef CONFIG_HOTPLUG_CPU 
+#include <mach_apic.h> + +void fixup_irqs(cpumask_t map) +{ + unsigned int irq; + static int warned; + + for (irq = 0; irq < NR_IRQS; irq++) { + cpumask_t mask; + if (irq == 2) + continue; + + cpus_and(mask, irq_affinity[irq], map); + if (any_online_cpu(mask) == NR_CPUS) { + printk("Breaking affinity for irq %i\n", irq); + mask = map; + } + if (irq_desc[irq].handler->set_affinity) + irq_desc[irq].handler->set_affinity(irq, mask); + else if (irq_desc[irq].action && !(warned++)) + printk("Cannot set affinity for irq %i\n", irq); + } + +#if 0 + barrier(); + /* Ingo Molnar says: "after the IO-APIC masks have been redirected + [note the nop - the interrupt-enable boundary on x86 is two + instructions from sti] - to flush out pending hardirqs and + IPIs. After this point nothing is supposed to reach this CPU." */ + __asm__ __volatile__("sti; nop; cli"); + barrier(); +#else + /* That doesn't seem sufficient. Give it 1ms. */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +#endif +} +#endif + diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index 59ff9b455069..a6d8c45961d3 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -23,6 +23,9 @@ * Rusty Russell). * 2004-July Suparna Bhattacharya <suparna@in.ibm.com> added jumper probes * interface to access function arguments. + * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston + * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi + * <prasanna@in.ibm.com> added function-return probes. */ #include <linux/config.h> @@ -30,15 +33,14 @@ #include <linux/ptrace.h> #include <linux/spinlock.h> #include <linux/preempt.h> +#include <asm/cacheflush.h> #include <asm/kdebug.h> #include <asm/desc.h> -/* kprobe_status settings */ -#define KPROBE_HIT_ACTIVE 0x00000001 -#define KPROBE_HIT_SS 0x00000002 - static struct kprobe *current_kprobe; static unsigned long kprobe_status, kprobe_old_eflags, kprobe_saved_eflags; +static struct kprobe *kprobe_prev; +static unsigned long kprobe_status_prev, kprobe_old_eflags_prev, kprobe_saved_eflags_prev; static struct pt_regs jprobe_saved_regs; static long *jprobe_saved_esp; /* copy of the kernel stack at the probe fire time */ @@ -68,16 +70,50 @@ int arch_prepare_kprobe(struct kprobe *p) void arch_copy_kprobe(struct kprobe *p) { memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + p->opcode = *p->addr; } -void arch_remove_kprobe(struct kprobe *p) +void arch_arm_kprobe(struct kprobe *p) { + *p->addr = BREAKPOINT_INSTRUCTION; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); } -static inline void disarm_kprobe(struct kprobe *p, struct pt_regs *regs) +void arch_disarm_kprobe(struct kprobe *p) { *p->addr = p->opcode; - regs->eip = (unsigned long)p->addr; + flush_icache_range((unsigned long) p->addr, + (unsigned long) p->addr + sizeof(kprobe_opcode_t)); +} + +void arch_remove_kprobe(struct kprobe *p) +{ +} + +static inline void save_previous_kprobe(void) +{ + kprobe_prev = current_kprobe; + kprobe_status_prev = kprobe_status; + kprobe_old_eflags_prev = kprobe_old_eflags; + kprobe_saved_eflags_prev = kprobe_saved_eflags; +} + +static inline void restore_previous_kprobe(void) +{ + current_kprobe = kprobe_prev; + kprobe_status = kprobe_status_prev; + kprobe_old_eflags = kprobe_old_eflags_prev; + kprobe_saved_eflags = kprobe_saved_eflags_prev; +} + +static inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs) +{ + current_kprobe = p; + kprobe_saved_eflags = kprobe_old_eflags + = (regs->eflags & 
(TF_MASK | IF_MASK)); + if (is_IF_modifier(p->opcode)) + kprobe_saved_eflags &= ~IF_MASK; } static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) @@ -91,6 +127,25 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->eip = (unsigned long)&p->ainsn.insn; } +void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) +{ + unsigned long *sara = (unsigned long *)®s->esp; + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *) *sara; + + /* Replace the return addr with trampoline addr */ + *sara = (unsigned long) &kretprobe_trampoline; + + add_rp_inst(ri); + } else { + rp->nmissed++; + } +} + /* * Interrupts are disabled on entry as trap3 is an interrupt gate and they * remain disabled thorough out this function. @@ -127,8 +182,18 @@ static int kprobe_handler(struct pt_regs *regs) unlock_kprobes(); goto no_kprobe; } - disarm_kprobe(p, regs); - ret = 1; + /* We have reentered the kprobe_handler(), since + * another probe was hit while within the handler. + * We here save the original kprobes variables and + * just single step on the instruction of the new probe + * without calling any user handlers. + */ + save_previous_kprobe(); + set_current_kprobe(p, regs); + p->nmissed++; + prepare_singlestep(p, regs); + kprobe_status = KPROBE_REENTER; + return 1; } else { p = current_kprobe; if (p->break_handler && p->break_handler(p, regs)) { @@ -163,11 +228,7 @@ static int kprobe_handler(struct pt_regs *regs) } kprobe_status = KPROBE_HIT_ACTIVE; - current_kprobe = p; - kprobe_saved_eflags = kprobe_old_eflags - = (regs->eflags & (TF_MASK | IF_MASK)); - if (is_IF_modifier(p->opcode)) - kprobe_saved_eflags &= ~IF_MASK; + set_current_kprobe(p, regs); if (p->pre_handler && p->pre_handler(p, regs)) /* handler has already set things up, so skip ss setup */ @@ -184,6 +245,78 @@ no_kprobe: } /* + * For function-return probes, init_kprobes() establishes a probepoint + * here. When a retprobed function returns, this probe is hit and + * trampoline_probe_handler() runs, calling the kretprobe's handler. + */ + void kretprobe_trampoline_holder(void) + { + asm volatile ( ".global kretprobe_trampoline\n" + "kretprobe_trampoline: \n" + "nop\n"); + } + +/* + * Called when we hit the probe point at kretprobe_trampoline + */ +int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head; + struct hlist_node *node, *tmp; + unsigned long orig_ret_address = 0; + unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; + + head = kretprobe_inst_table_head(current); + + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more then one return + * return probe was registered for a target function. 
+ * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler) + ri->rp->handler(ri, regs); + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address)); + regs->eip = orig_ret_address; + + unlock_kprobes(); + preempt_enable_no_resched(); + + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we have handled unlocking + * and re-enabling preemption. + */ + return 1; +} + +/* * Called after single-stepping. p->addr is the address of the * instruction whose first byte has been replaced by the "int 3" * instruction. To avoid the SMP problems that can occur when we @@ -263,13 +396,21 @@ static inline int post_kprobe_handler(struct pt_regs *regs) if (!kprobe_running()) return 0; - if (current_kprobe->post_handler) + if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) { + kprobe_status = KPROBE_HIT_SSDONE; current_kprobe->post_handler(current_kprobe, regs, 0); + } resume_execution(current_kprobe, regs); regs->eflags |= kprobe_saved_eflags; + /*Restore back the original saved kprobes variables and continue. */ + if (kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(); + goto out; + } unlock_kprobes(); +out: preempt_enable_no_resched(); /* @@ -390,3 +531,13 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) } return 0; } + +static struct kprobe trampoline_p = { + .addr = (kprobe_opcode_t *) &kretprobe_trampoline, + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init_kprobes(void) +{ + return register_kprobe(&trampoline_p); +} diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c new file mode 100644 index 000000000000..cb699a2aa1f8 --- /dev/null +++ b/arch/i386/kernel/machine_kexec.c @@ -0,0 +1,226 @@ +/* + * machine_kexec.c - handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. 
+ */ + +#include <linux/mm.h> +#include <linux/kexec.h> +#include <linux/delay.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/io.h> +#include <asm/apic.h> +#include <asm/cpufeature.h> +#include <asm/desc.h> + +static inline unsigned long read_cr3(void) +{ + unsigned long cr3; + asm volatile("movl %%cr3,%0": "=r"(cr3)); + return cr3; +} + +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) + +#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) +#define L2_ATTR (_PAGE_PRESENT) + +#define LEVEL0_SIZE (1UL << 12UL) + +#ifndef CONFIG_X86_PAE +#define LEVEL1_SIZE (1UL << 22UL) +static u32 pgtable_level1[1024] PAGE_ALIGNED; + +static void identity_map_page(unsigned long address) +{ + unsigned long level1_index, level2_index; + u32 *pgtable_level2; + + /* Find the current page table */ + pgtable_level2 = __va(read_cr3()); + + /* Find the indexes of the physical address to identity map */ + level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; + level2_index = address / LEVEL1_SIZE; + + /* Identity map the page table entry */ + pgtable_level1[level1_index] = address | L0_ATTR; + pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; + + /* Flush the tlb so the new mapping takes effect. + * Global tlb entries are not flushed but that is not an issue. + */ + load_cr3(pgtable_level2); +} + +#else +#define LEVEL1_SIZE (1UL << 21UL) +#define LEVEL2_SIZE (1UL << 30UL) +static u64 pgtable_level1[512] PAGE_ALIGNED; +static u64 pgtable_level2[512] PAGE_ALIGNED; + +static void identity_map_page(unsigned long address) +{ + unsigned long level1_index, level2_index, level3_index; + u64 *pgtable_level3; + + /* Find the current page table */ + pgtable_level3 = __va(read_cr3()); + + /* Find the indexes of the physical address to identity map */ + level1_index = (address % LEVEL1_SIZE)/LEVEL0_SIZE; + level2_index = (address % LEVEL2_SIZE)/LEVEL1_SIZE; + level3_index = address / LEVEL2_SIZE; + + /* Identity map the page table entry */ + pgtable_level1[level1_index] = address | L0_ATTR; + pgtable_level2[level2_index] = __pa(pgtable_level1) | L1_ATTR; + set_64bit(&pgtable_level3[level3_index], + __pa(pgtable_level2) | L2_ATTR); + + /* Flush the tlb so the new mapping takes effect. + * Global tlb entries are not flushed but that is not an issue. 
+ */ + load_cr3(pgtable_level3); +} +#endif + +static void set_idt(void *newidt, __u16 limit) +{ + struct Xgt_desc_struct curidt; + + /* ia32 supports unaliged loads & stores */ + curidt.size = limit; + curidt.address = (unsigned long)newidt; + + __asm__ __volatile__ ( + "lidtl %0\n" + : : "m" (curidt) + ); +}; + + +static void set_gdt(void *newgdt, __u16 limit) +{ + struct Xgt_desc_struct curgdt; + + /* ia32 supports unaligned loads & stores */ + curgdt.size = limit; + curgdt.address = (unsigned long)newgdt; + + __asm__ __volatile__ ( + "lgdtl %0\n" + : : "m" (curgdt) + ); +}; + +static void load_segments(void) +{ +#define __STR(X) #X +#define STR(X) __STR(X) + + __asm__ __volatile__ ( + "\tljmp $"STR(__KERNEL_CS)",$1f\n" + "\t1:\n" + "\tmovl $"STR(__KERNEL_DS)",%eax\n" + "\tmovl %eax,%ds\n" + "\tmovl %eax,%es\n" + "\tmovl %eax,%fs\n" + "\tmovl %eax,%gs\n" + "\tmovl %eax,%ss\n" + ); +#undef STR +#undef __STR +} + +typedef asmlinkage NORET_TYPE void (*relocate_new_kernel_t)( + unsigned long indirection_page, + unsigned long reboot_code_buffer, + unsigned long start_address, + unsigned int has_pae) ATTRIB_NORET; + +const extern unsigned char relocate_new_kernel[]; +extern void relocate_new_kernel_end(void); +const extern unsigned int relocate_new_kernel_size; + +/* + * A architecture hook called to validate the + * proposed image and prepare the control pages + * as needed. The pages for KEXEC_CONTROL_CODE_SIZE + * have been allocated, but the segments have yet + * been copied into the kernel. + * + * Do what every setup is needed on image and the + * reboot code buffer to allow us to avoid allocations + * later. + * + * Currently nothing. + */ +int machine_kexec_prepare(struct kimage *image) +{ + return 0; +} + +/* + * Undo anything leftover by machine_kexec_prepare + * when an image is freed. + */ +void machine_kexec_cleanup(struct kimage *image) +{ +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +NORET_TYPE void machine_kexec(struct kimage *image) +{ + unsigned long page_list; + unsigned long reboot_code_buffer; + + relocate_new_kernel_t rnk; + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + /* Compute some offsets */ + reboot_code_buffer = page_to_pfn(image->control_code_page) + << PAGE_SHIFT; + page_list = image->head; + + /* Set up an identity mapping for the reboot_code_buffer */ + identity_map_page(reboot_code_buffer); + + /* copy it out */ + memcpy((void *)reboot_code_buffer, relocate_new_kernel, + relocate_new_kernel_size); + + /* The segment registers are funny things, they are + * automatically loaded from a table, in memory wherever you + * set them to a specific selector, but this table is never + * accessed again you set the segment to a different selector. + * + * The more common model is are caches where the behide + * the scenes work is done, but is also dropped at arbitrary + * times. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. 
+ */ + set_gdt(phys_to_virt(0),0); + set_idt(phys_to_virt(0),0); + + /* now call it */ + rnk = (relocate_new_kernel_t) reboot_code_buffer; + (*rnk)(page_list, reboot_code_buffer, image->start, cpu_has_pae); +} diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c index 1347ab4939e7..ce838abb27d8 100644 --- a/arch/i386/kernel/mpparse.c +++ b/arch/i386/kernel/mpparse.c @@ -67,7 +67,6 @@ unsigned long mp_lapic_addr; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; -unsigned int boot_cpu_logical_apicid = -1U; /* Internal processor count */ static unsigned int __initdata num_processors; @@ -180,7 +179,6 @@ static void __init MP_processor_info (struct mpc_config_processor *m) if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { Dprintk(" Bootup CPU\n"); boot_cpu_physical_apicid = m->mpc_apicid; - boot_cpu_logical_apicid = apicid; } if (num_processors >= NR_CPUS) { @@ -914,7 +912,10 @@ void __init mp_register_ioapic ( mp_ioapics[idx].mpc_apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 < 15)) + mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id); + else + mp_ioapics[idx].mpc_apicid = id; mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); /* @@ -1055,11 +1056,20 @@ void __init mp_config_acpi_legacy_irqs (void) } } +#define MAX_GSI_NUM 4096 + int mp_register_gsi (u32 gsi, int edge_level, int active_high_low) { int ioapic = -1; int ioapic_pin = 0; int idx, bit = 0; + static int pci_irq = 16; + /* + * Mapping between Global System Interrups, which + * represent all possible interrupts, and IRQs + * assigned to actual devices. + */ + static int gsi_to_irq[MAX_GSI_NUM]; #ifdef CONFIG_ACPI_BUS /* Don't set up the ACPI SCI because it's already set up */ @@ -1094,11 +1104,34 @@ int mp_register_gsi (u32 gsi, int edge_level, int active_high_low) if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); - return gsi; + return gsi_to_irq[gsi]; } mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit); + if (edge_level) { + /* + * For PCI devices assign IRQs in order, avoiding gaps + * due to unused I/O APIC pins. + */ + int irq = gsi; + if (gsi < MAX_GSI_NUM) { + if (gsi > 15) + gsi = pci_irq++; +#ifdef CONFIG_ACPI_BUS + /* + * Don't assign IRQ used by ACPI SCI + */ + if (gsi == acpi_fadt.sci_int) + gsi = pci_irq++; +#endif + gsi_to_irq[irq] = gsi; + } else { + printk(KERN_ERR "GSI %u is too high\n", gsi); + return gsi; + } + } + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, edge_level == ACPI_EDGE_SENSITIVE ? 0 : 1, active_high_low == ACPI_ACTIVE_HIGH ? 
0 : 1); diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c index 2c0ee9c2d020..da6c46d667cb 100644 --- a/arch/i386/kernel/nmi.c +++ b/arch/i386/kernel/nmi.c @@ -28,8 +28,7 @@ #include <linux/sysctl.h> #include <asm/smp.h> -#include <asm/mtrr.h> -#include <asm/mpspec.h> +#include <asm/div64.h> #include <asm/nmi.h> #include "mach_traps.h" @@ -324,6 +323,16 @@ static void clear_msr_range(unsigned int base, unsigned int n) wrmsr(base+i, 0, 0); } +static inline void write_watchdog_counter(const char *descr) +{ + u64 count = (u64)cpu_khz * 1000; + + do_div(count, nmi_hz); + if(descr) + Dprintk("setting %s to -0x%08Lx\n", descr, count); + wrmsrl(nmi_perfctr_msr, 0 - count); +} + static void setup_k7_watchdog(void) { unsigned int evntsel; @@ -339,8 +348,7 @@ static void setup_k7_watchdog(void) | K7_NMI_EVENT; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); - Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000)); - wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1); + write_watchdog_counter("K7_PERFCTR0"); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= K7_EVNTSEL_ENABLE; wrmsr(MSR_K7_EVNTSEL0, evntsel, 0); @@ -361,8 +369,7 @@ static void setup_p6_watchdog(void) | P6_NMI_EVENT; wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); - Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000)); - wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), 0); + write_watchdog_counter("P6_PERFCTR0"); apic_write(APIC_LVTPC, APIC_DM_NMI); evntsel |= P6_EVNTSEL0_ENABLE; wrmsr(MSR_P6_EVNTSEL0, evntsel, 0); @@ -402,8 +409,7 @@ static int setup_p4_watchdog(void) wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0); wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0); - Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000)); - wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1); + write_watchdog_counter("P4_IQ_COUNTER0"); apic_write(APIC_LVTPC, APIC_DM_NMI); wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0); return 1; @@ -518,7 +524,7 @@ void nmi_watchdog_tick (struct pt_regs * regs) * other P6 variant */ apic_write(APIC_LVTPC, APIC_DM_NMI); } - wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1); + write_watchdog_counter(NULL); } } diff --git a/arch/i386/kernel/numaq.c b/arch/i386/kernel/numaq.c index e51edf0a6564..5f5b075f860a 100644 --- a/arch/i386/kernel/numaq.c +++ b/arch/i386/kernel/numaq.c @@ -31,6 +31,7 @@ #include <linux/nodemask.h> #include <asm/numaq.h> #include <asm/topology.h> +#include <asm/processor.h> #define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) @@ -77,3 +78,11 @@ int __init get_memcfg_numaq(void) smp_dump_qct(); return 1; } + +static int __init numaq_dsc_disable(void) +{ + printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); + tsc_disable = 1; + return 0; +} +core_initcall(numaq_dsc_disable); diff --git a/arch/i386/kernel/pci-dma.c b/arch/i386/kernel/pci-dma.c index 4de2e03c7b45..1e51427cc9eb 100644 --- a/arch/i386/kernel/pci-dma.c +++ b/arch/i386/kernel/pci-dma.c @@ -11,6 +11,7 @@ #include <linux/mm.h> #include <linux/string.h> #include <linux/pci.h> +#include <linux/module.h> #include <asm/io.h> struct dma_coherent_mem { @@ -54,6 +55,7 @@ void *dma_alloc_coherent(struct device *dev, size_t size, } return ret; } +EXPORT_SYMBOL(dma_alloc_coherent); void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle) @@ -68,6 +70,7 @@ void dma_free_coherent(struct device *dev, size_t size, } else free_pages((unsigned long)vaddr, order); } +EXPORT_SYMBOL(dma_free_coherent); int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, dma_addr_t device_addr, 
size_t size, int flags) diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 96e3ea6b17c7..e3f362e8af5b 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -13,6 +13,7 @@ #include <stdarg.h> +#include <linux/cpu.h> #include <linux/errno.h> #include <linux/sched.h> #include <linux/fs.h> @@ -37,6 +38,7 @@ #include <linux/kallsyms.h> #include <linux/ptrace.h> #include <linux/random.h> +#include <linux/kprobes.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -54,6 +56,9 @@ #include <linux/irq.h> #include <linux/err.h> +#include <asm/tlbflush.h> +#include <asm/cpu.h> + asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); static int hlt_counter; @@ -73,6 +78,7 @@ unsigned long thread_saved_pc(struct task_struct *tsk) * Powermanagement idle function, if any.. */ void (*pm_idle)(void); +EXPORT_SYMBOL(pm_idle); static DEFINE_PER_CPU(unsigned int, cpu_idle_state); void disable_hlt(void) @@ -105,6 +111,9 @@ void default_idle(void) cpu_relax(); } } +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(default_idle); +#endif /* * On SMP it's slightly faster (but much more power-consuming!) @@ -138,14 +147,42 @@ static void poll_idle (void) } } +#ifdef CONFIG_HOTPLUG_CPU +#include <asm/nmi.h> +/* We don't actually take CPU down, just spin without interrupts. */ +static inline void play_dead(void) +{ + /* This must be done before dead CPU ack */ + cpu_exit_clear(); + wbinvd(); + mb(); + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + /* + * With physical CPU hotplug, we should halt the cpu + */ + local_irq_disable(); + while (1) + __asm__ __volatile__("hlt":::"memory"); +} +#else +static inline void play_dead(void) +{ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a * low exit latency (ie sit in a loop waiting for * somebody to say that they'd like to reschedule) */ -void cpu_idle (void) +void cpu_idle(void) { + int cpu = raw_smp_processor_id(); + /* endless idle loop with no priority at all */ while (1) { while (!need_resched()) { @@ -160,6 +197,9 @@ void cpu_idle (void) if (!idle) idle = default_idle; + if (cpu_is_offline(cpu)) + play_dead(); + __get_cpu_var(irq_stat).idle_timestamp = jiffies; idle(); } @@ -218,7 +258,7 @@ static void mwait_idle(void) } } -void __init select_idle_routine(const struct cpuinfo_x86 *c) +void __devinit select_idle_routine(const struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_MWAIT)) { printk("monitor/mwait feature present.\n"); @@ -262,7 +302,7 @@ void show_regs(struct pt_regs * regs) printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); print_symbol("EIP is at %s\n", regs->eip); - if (regs->xcs & 3) + if (user_mode(regs)) printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); printk(" EFLAGS: %08lx %s (%s)\n", regs->eflags, print_tainted(), system_utsname.release); @@ -325,6 +365,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) /* Ok, create the new process.. */ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); } +EXPORT_SYMBOL(kernel_thread); /* * Free current thread data structures etc.. @@ -334,6 +375,13 @@ void exit_thread(void) struct task_struct *tsk = current; struct thread_struct *t = &tsk->thread; + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. 
+ */ + kprobe_flush_task(tsk); + /* The process may have allocated an io port bitmap... nuke it. */ if (unlikely(NULL != t->io_bitmap_ptr)) { int cpu = get_cpu(); @@ -357,6 +405,13 @@ void flush_thread(void) { struct task_struct *tsk = current; + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(tsk); + memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); /* @@ -508,6 +563,7 @@ void dump_thread(struct pt_regs * regs, struct user * dump) dump->u_fpvalid = dump_fpu (regs, &dump->i387); } +EXPORT_SYMBOL(dump_thread); /* * Capture the user space registers if the task is not running (in user space) @@ -561,6 +617,33 @@ handle_io_bitmap(struct thread_struct *next, struct tss_struct *tss) } /* + * This function selects if the context switch from prev to next + * has to tweak the TSC disable bit in the cr4. + */ +static inline void disable_tsc(struct task_struct *prev_p, + struct task_struct *next_p) +{ + struct thread_info *prev, *next; + + /* + * gcc should eliminate the ->thread_info dereference if + * has_secure_computing returns 0 at compile time (SECCOMP=n). + */ + prev = prev_p->thread_info; + next = next_p->thread_info; + + if (has_secure_computing(prev) || has_secure_computing(next)) { + /* slow path here */ + if (has_secure_computing(prev) && + !has_secure_computing(next)) { + write_cr4(read_cr4() & ~X86_CR4_TSD); + } else if (!has_secure_computing(prev) && + has_secure_computing(next)) + write_cr4(read_cr4() | X86_CR4_TSD); + } +} + +/* * switch_to(x,yn) should switch tasks from x to y. * * We fsave/fwait so that an exception goes off at the right time @@ -617,28 +700,34 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas /* * Restore %fs and %gs if needed. + * + * Glibc normally makes %fs be zero, and %gs is one of + * the TLS segments. */ - if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) { + if (unlikely(prev->fs | next->fs)) loadsegment(fs, next->fs); + + if (prev->gs | next->gs) loadsegment(gs, next->gs); - } /* * Now maybe reload the debug registers */ if (unlikely(next->debugreg[7])) { - loaddebug(next, 0); - loaddebug(next, 1); - loaddebug(next, 2); - loaddebug(next, 3); + set_debugreg(next->debugreg[0], 0); + set_debugreg(next->debugreg[1], 1); + set_debugreg(next->debugreg[2], 2); + set_debugreg(next->debugreg[3], 3); /* no 4 and 5 */ - loaddebug(next, 6); - loaddebug(next, 7); + set_debugreg(next->debugreg[6], 6); + set_debugreg(next->debugreg[7], 7); } if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) handle_io_bitmap(next, tss); + disable_tsc(prev_p, next_p); + return prev_p; } @@ -731,6 +820,7 @@ unsigned long get_wchan(struct task_struct *p) } while (count++ < 16); return 0; } +EXPORT_SYMBOL(get_wchan); /* * sys_alloc_thread_area: get a yet unused TLS descriptor index. 
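The disable_tsc() helper added above flips CR4.TSD on context switch so that a seccomp task cannot read the time-stamp counter from user mode. As a hypothetical illustration (not part of this patch), the user-space probe below prints a cycle count normally; while CR4.TSD is set, the rdtsc at CPL 3 raises a general-protection fault and the process gets a SIGSEGV instead:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical user-space probe of the seccomp TSC restriction. */
static inline uint64_t rdtsc(void)
{
	uint32_t lo, hi;

	__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	printf("tsc = %llu\n", (unsigned long long)rdtsc());
	return 0;
}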
@@ -827,6 +917,8 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) return -EINVAL; + memset(&info, 0, sizeof(info)); + desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; info.entry_number = idx; diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c index e34f651fa13c..0da59b42843c 100644 --- a/arch/i386/kernel/ptrace.c +++ b/arch/i386/kernel/ptrace.c @@ -668,7 +668,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) info.si_code = TRAP_BRKPT; /* User-mode eip? */ - info.si_addr = user_mode(regs) ? (void __user *) regs->eip : NULL; + info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL; /* Send us the fakey SIGTRAP */ force_sig_info(SIGTRAP, &info, tsk); diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c index 6dc27eb70ee7..c71fef31dc47 100644 --- a/arch/i386/kernel/reboot.c +++ b/arch/i386/kernel/reboot.c @@ -2,6 +2,7 @@ * linux/arch/i386/kernel/reboot.c */ +#include <linux/config.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/delay.h> @@ -19,12 +20,12 @@ * Power off function, if any */ void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); static int reboot_mode; static int reboot_thru_bios; #ifdef CONFIG_SMP -int reboot_smp = 0; static int reboot_cpu = -1; /* shamelessly grabbed from lib/vsprintf.c for readability */ #define is_digit(c) ((c) >= '0' && (c) <= '9') @@ -47,7 +48,6 @@ static int __init reboot_setup(char *str) break; #ifdef CONFIG_SMP case 's': /* "smp" reboot by executing reset on BSP or other CPU*/ - reboot_smp = 1; if (is_digit(*(str+1))) { reboot_cpu = (int) (*(str+1) - '0'); if (is_digit(*(str+2))) @@ -86,33 +86,9 @@ static int __init set_bios_reboot(struct dmi_system_id *d) return 0; } -/* - * Some machines require the "reboot=s" commandline option, this quirk makes that automatic. - */ -static int __init set_smp_reboot(struct dmi_system_id *d) -{ -#ifdef CONFIG_SMP - if (!reboot_smp) { - reboot_smp = 1; - printk(KERN_INFO "%s series board detected. Selecting SMP-method for reboots.\n", d->ident); - } -#endif - return 0; -} - -/* - * Some machines require the "reboot=b,s" commandline option, this quirk makes that automatic. - */ -static int __init set_smp_bios_reboot(struct dmi_system_id *d) -{ - set_smp_reboot(d); - set_bios_reboot(d); - return 0; -} - static struct dmi_system_id __initdata reboot_dmi_table[] = { { /* Handle problems with rebooting on Dell 1300's */ - .callback = set_smp_bios_reboot, + .callback = set_bios_reboot, .ident = "Dell PowerEdge 1300", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), @@ -295,42 +271,36 @@ void machine_real_restart(unsigned char *code, int length) : : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100))); } +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(machine_real_restart); +#endif -void machine_restart(char * __unused) +void machine_shutdown(void) { #ifdef CONFIG_SMP - int cpuid; - - cpuid = GET_APIC_ID(apic_read(APIC_ID)); - - if (reboot_smp) { - - /* check to see if reboot_cpu is valid - if its not, default to the BSP */ - if ((reboot_cpu == -1) || - (reboot_cpu > (NR_CPUS -1)) || - !physid_isset(cpuid, phys_cpu_present_map)) - reboot_cpu = boot_cpu_physical_apicid; - - reboot_smp = 0; /* use this as a flag to only go through this once*/ - /* re-run this function on the other CPUs - it will fall though this section since we have - cleared reboot_smp, and do the reboot if it is the - correct CPU, otherwise it halts. 
*/ - if (reboot_cpu != cpuid) - smp_call_function((void *)machine_restart , NULL, 1, 0); + int reboot_cpu_id; + + /* The boot cpu is always logical cpu 0 */ + reboot_cpu_id = 0; + + /* See if there has been given a command line override */ + if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && + cpu_isset(reboot_cpu, cpu_online_map)) { + reboot_cpu_id = reboot_cpu; } - /* if reboot_cpu is still -1, then we want a tradional reboot, - and if we are not running on the reboot_cpu,, halt */ - if ((reboot_cpu != -1) && (cpuid != reboot_cpu)) { - for (;;) - __asm__ __volatile__ ("hlt"); + /* Make certain the cpu I'm rebooting on is online */ + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { + reboot_cpu_id = smp_processor_id(); } - /* - * Stop all CPUs and turn off local APICs and the IO-APIC, so - * other OSs see a clean IRQ state. + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); + + /* O.K. Now that I'm on the appropriate processor, stop + * all of the others, and disable their local APICs. */ + smp_send_stop(); #endif /* CONFIG_SMP */ @@ -339,7 +309,10 @@ void machine_restart(char * __unused) #ifdef CONFIG_X86_IO_APIC disable_IO_APIC(); #endif +} +void machine_emergency_restart(void) +{ if (!reboot_thru_bios) { if (efi_enabled) { efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); @@ -362,23 +335,22 @@ void machine_restart(char * __unused) machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); } -EXPORT_SYMBOL(machine_restart); +void machine_restart(char * __unused) +{ + machine_shutdown(); + machine_emergency_restart(); +} void machine_halt(void) { } -EXPORT_SYMBOL(machine_halt); - void machine_power_off(void) { - lapic_shutdown(); + machine_shutdown(); - if (efi_enabled) - efi.reset_system(EFI_RESET_SHUTDOWN, EFI_SUCCESS, 0, NULL); if (pm_power_off) pm_power_off(); } -EXPORT_SYMBOL(machine_power_off); diff --git a/arch/i386/kernel/relocate_kernel.S b/arch/i386/kernel/relocate_kernel.S new file mode 100644 index 000000000000..d312616effa1 --- /dev/null +++ b/arch/i386/kernel/relocate_kernel.S @@ -0,0 +1,120 @@ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/linkage.h> + + /* + * Must be relocatable PIC code callable as a C function, that once + * it starts can not use the previous processes stack. + */ + .globl relocate_new_kernel +relocate_new_kernel: + /* read the arguments and say goodbye to the stack */ + movl 4(%esp), %ebx /* page_list */ + movl 8(%esp), %ebp /* reboot_code_buffer */ + movl 12(%esp), %edx /* start address */ + movl 16(%esp), %ecx /* cpu_has_pae */ + + /* zero out flags, and disable interrupts */ + pushl $0 + popfl + + /* set a new stack at the bottom of our page... */ + lea 4096(%ebp), %esp + + /* store the parameters back on the stack */ + pushl %edx /* store the start address */ + + /* Set cr0 to a known state: + * 31 0 == Paging disabled + * 18 0 == Alignment check disabled + * 16 0 == Write protect disabled + * 3 0 == No task switch + * 2 0 == Don't do FP software emulation. 
+ * 0 1 == Proctected mode enabled + */ + movl %cr0, %eax + andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax + orl $(1<<0), %eax + movl %eax, %cr0 + + /* clear cr4 if applicable */ + testl %ecx, %ecx + jz 1f + /* Set cr4 to a known state: + * Setting everything to zero seems safe. + */ + movl %cr4, %eax + andl $0, %eax + movl %eax, %cr4 + + jmp 1f +1: + + /* Flush the TLB (needed?) */ + xorl %eax, %eax + movl %eax, %cr3 + + /* Do the copies */ + movl %ebx, %ecx + jmp 1f + +0: /* top, read another word from the indirection page */ + movl (%ebx), %ecx + addl $4, %ebx +1: + testl $0x1, %ecx /* is it a destination page */ + jz 2f + movl %ecx, %edi + andl $0xfffff000, %edi + jmp 0b +2: + testl $0x2, %ecx /* is it an indirection page */ + jz 2f + movl %ecx, %ebx + andl $0xfffff000, %ebx + jmp 0b +2: + testl $0x4, %ecx /* is it the done indicator */ + jz 2f + jmp 3f +2: + testl $0x8, %ecx /* is it the source indicator */ + jz 0b /* Ignore it otherwise */ + movl %ecx, %esi /* For every source page do a copy */ + andl $0xfffff000, %esi + + movl $1024, %ecx + rep ; movsl + jmp 0b + +3: + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB, it's handy, and not processor dependent. + */ + xorl %eax, %eax + movl %eax, %cr3 + + /* set all of the registers to known values */ + /* leave %esp alone */ + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %edi, %edi + xorl %ebp, %ebp + ret +relocate_new_kernel_end: + + .globl relocate_new_kernel_size +relocate_new_kernel_size: + .long relocate_new_kernel_end - relocate_new_kernel diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c index 2bfbddebdbf8..af4de58cab54 100644 --- a/arch/i386/kernel/setup.c +++ b/arch/i386/kernel/setup.c @@ -23,8 +23,10 @@ * This file handles the architecture-dependent parts of initialization */ +#include <linux/config.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/mmzone.h> #include <linux/tty.h> #include <linux/ioport.h> #include <linux/acpi.h> @@ -41,7 +43,12 @@ #include <linux/init.h> #include <linux/edd.h> #include <linux/nodemask.h> +#include <linux/kexec.h> +#include <linux/crash_dump.h> + #include <video/edid.h> + +#include <asm/apic.h> #include <asm/e820.h> #include <asm/mpspec.h> #include <asm/setup.h> @@ -53,12 +60,15 @@ #include "setup_arch_pre.h" #include <bios_ebda.h> +/* Forward Declaration. */ +void __init find_max_pfn(void); + /* This value is set up by the early boot code to point to the value immediately after the boot time page tables. It contains a *physical* address, and must not be in the .bss segment! */ unsigned long init_pg_tables_end __initdata = ~0UL; -int disable_pse __initdata = 0; +int disable_pse __devinitdata = 0; /* * Machine setup.. 
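In C, the tagged page-list walk in relocate_new_kernel above amounts to roughly the loop below. Each 32-bit entry is a page-aligned address with a tag in its low bits: 0x1 selects the next destination page, 0x2 redirects the walk to another indirection page, 0x4 terminates it, and 0x8 names a source page to copy into the current destination; untagged entries are skipped. This is only a sketch for orientation -- the TAG_* names, PAGE_MASK helper and pointer types are illustrative and not taken from the patch.

    #include <string.h>

    #define PAGE_SIZE        4096UL
    #define PAGE_MASK        (~(PAGE_SIZE - 1))
    #define TAG_DESTINATION  0x1UL   /* set the page the next copies land in */
    #define TAG_INDIRECTION  0x2UL   /* continue reading entries from this page */
    #define TAG_DONE         0x4UL   /* end of the list */
    #define TAG_SOURCE       0x8UL   /* copy one page from here to the destination */

    /* Walk a tagged page list: 'head' plays the role of the page_list argument,
     * which normally carries the indirection tag and points at the first page
     * of entries.  Entries with no recognised tag are ignored, as in the
     * assembly ("Ignore it otherwise"). */
    static void walk_page_list(unsigned long head)
    {
            unsigned long *entry = NULL;
            unsigned char *dest = NULL;
            unsigned long e = head;

            for (;;) {
                    if (e & TAG_DESTINATION) {
                            dest = (unsigned char *)(e & PAGE_MASK);
                    } else if (e & TAG_INDIRECTION) {
                            entry = (unsigned long *)(e & PAGE_MASK);
                    } else if (e & TAG_DONE) {
                            return;
                    } else if (e & TAG_SOURCE) {
                            /* 1024 dwords == one 4096-byte page, like "rep movsl" */
                            memcpy(dest, (void *)(e & PAGE_MASK), PAGE_SIZE);
                            dest += PAGE_SIZE;
                    }
                    e = *entry++;
            }
    }

In the assembly, %edi, %ebx and %esi correspond to dest, entry and the source page, and the destination pointer advances by one page after every copy exactly as dest does here.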
@@ -73,6 +83,7 @@ EXPORT_SYMBOL(efi_enabled); struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; /* common cpu data for all cpus */ struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 }; +EXPORT_SYMBOL(boot_cpu_data); unsigned long mmu_cr4_features; @@ -90,12 +101,18 @@ extern acpi_interrupt_flags acpi_sci_flags; /* for MCA, but anyone else can use it if they want */ unsigned int machine_id; +#ifdef CONFIG_MCA +EXPORT_SYMBOL(machine_id); +#endif unsigned int machine_submodel_id; unsigned int BIOS_revision; unsigned int mca_pentium_flag; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0x10000000; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif /* Boot loader ID as an integer, for the benefit of proc_dointvec */ int bootloader_type; @@ -107,14 +124,26 @@ static unsigned int highmem_pages = -1; * Setup options */ struct drive_info_struct { char dummy[32]; } drive_info; +#if defined(CONFIG_BLK_DEV_IDE) || defined(CONFIG_BLK_DEV_HD) || \ + defined(CONFIG_BLK_DEV_IDE_MODULE) || defined(CONFIG_BLK_DEV_HD_MODULE) +EXPORT_SYMBOL(drive_info); +#endif struct screen_info screen_info; +#ifdef CONFIG_VT +EXPORT_SYMBOL(screen_info); +#endif struct apm_info apm_info; +EXPORT_SYMBOL(apm_info); struct sys_desc_table_struct { unsigned short length; unsigned char table[0]; }; struct edid_info edid_info; struct ist_info ist_info; +#if defined(CONFIG_X86_SPEEDSTEP_SMI) || \ + defined(CONFIG_X86_SPEEDSTEP_SMI_MODULE) +EXPORT_SYMBOL(ist_info); +#endif struct e820map e820; extern void early_cpu_init(void); @@ -711,6 +740,15 @@ static void __init parse_cmdline_early (char ** cmdline_p) if (to != command_line) to--; if (!memcmp(from+7, "exactmap", 8)) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. + */ + find_max_pfn(); + saved_max_pfn = max_pfn; +#endif from += 8+7; e820.nr_map = 0; userdef = 1; @@ -814,6 +852,44 @@ static void __init parse_cmdline_early (char ** cmdline_p) #endif /* CONFIG_X86_LOCAL_APIC */ #endif /* CONFIG_ACPI_BOOT */ +#ifdef CONFIG_X86_LOCAL_APIC + /* enable local APIC */ + else if (!memcmp(from, "lapic", 5)) + lapic_enable(); + + /* disable local APIC */ + else if (!memcmp(from, "nolapic", 6)) + lapic_disable(); +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_KEXEC + /* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never set's it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic. + */ + else if (!memcmp(from, "crashkernel=", 12)) { + unsigned long size, base; + size = memparse(from+12, &from); + if (*from == '@') { + base = memparse(from+1, &from); + /* FIXME: Do I want a sanity check + * to validate the memory range? + */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + } +#endif +#ifdef CONFIG_CRASH_DUMP + /* elfcorehdr= specifies the location of elf core header + * stored by the crashed kernel. + */ + else if (!memcmp(from, "elfcorehdr=", 11)) + elfcorehdr_addr = memparse(from+11, &from); +#endif + /* * highmem=size forces highmem to be exactly 'size' bytes. * This works even on boxes that have no highmem otherwise. 
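The crashkernel= handling above relies on memparse(), which accepts a size with an optional K/M/G suffix. The self-contained sketch below shows how a typical crashkernel=64M@16M string is split into a size and a base address; parse_mem is a simplified userspace stand-in for memparse, and the example string and output are illustrative only, not taken from this patch.

    #include <stdio.h>
    #include <stdlib.h>

    /* Rough stand-in for the kernel's memparse(): parse a number with an
     * optional K/M/G suffix and advance the cursor past what was consumed. */
    static unsigned long parse_mem(const char *s, const char **retptr)
    {
            char *end;
            unsigned long val = strtoul(s, &end, 0);

            switch (*end) {
            case 'g': case 'G': val <<= 10;  /* fall through */
            case 'm': case 'M': val <<= 10;  /* fall through */
            case 'k': case 'K': val <<= 10; end++;
            }
            *retptr = end;
            return val;
    }

    int main(void)
    {
            const char *opt = "crashkernel=64M@16M";   /* example option string */
            const char *p = opt + 12;                  /* skip "crashkernel=" */
            unsigned long size, base;

            size = parse_mem(p, &p);
            if (*p == '@') {
                    base = parse_mem(p + 1, &p);
                    /* the kernel records this range as crashk_res.start/.end */
                    printf("reserve %lu bytes at %#lx\n", size, base);
            }
            return 0;
    }

The resulting crashk_res range is what the later hunks reserve via reserve_bootmem() in setup_bootmem_allocator() and register via request_resource() in legacy_init_iomem_resources(), keeping the crash-kernel region out of normal allocation and DMA use.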
@@ -1022,7 +1098,7 @@ static void __init reserve_ebda_region(void) reserve_bootmem(addr, PAGE_SIZE); } -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NEED_MULTIPLE_NODES void __init setup_bootmem_allocator(void); static unsigned long __init setup_memory(void) { @@ -1072,9 +1148,9 @@ void __init zone_sizes_init(void) free_area_init(zones_size); } #else -extern unsigned long setup_memory(void); +extern unsigned long __init setup_memory(void); extern void zone_sizes_init(void); -#endif /* !CONFIG_DISCONTIGMEM */ +#endif /* !CONFIG_NEED_MULTIPLE_NODES */ void __init setup_bootmem_allocator(void) { @@ -1092,8 +1168,8 @@ void __init setup_bootmem_allocator(void) * the (very unlikely) case of us accidentally initializing the * bootmem allocator with an invalid RAM area. */ - reserve_bootmem(HIGH_MEMORY, (PFN_PHYS(min_low_pfn) + - bootmap_size + PAGE_SIZE-1) - (HIGH_MEMORY)); + reserve_bootmem(__PHYSICAL_START, (PFN_PHYS(min_low_pfn) + + bootmap_size + PAGE_SIZE-1) - (__PHYSICAL_START)); /* * reserve physical page 0 - it's a special BIOS page on many boxes, @@ -1149,6 +1225,11 @@ void __init setup_bootmem_allocator(void) } } #endif +#ifdef CONFIG_KEXEC + if (crashk_res.start != crashk_res.end) + reserve_bootmem(crashk_res.start, + crashk_res.end - crashk_res.start + 1); +#endif } /* @@ -1202,6 +1283,9 @@ legacy_init_iomem_resources(struct resource *code_resource, struct resource *dat */ request_resource(res, code_resource); request_resource(res, data_resource); +#ifdef CONFIG_KEXEC + request_resource(res, &crashk_res); +#endif } } } @@ -1330,7 +1414,7 @@ static struct nop { This runs before SMP is initialized to avoid SMP problems with self modifying code. This implies that assymetric systems where APs have less capabilities than the boot processor are not handled. - In this case boot with "noreplacement". */ + Tough. Make sure you disable such features by hand. 
*/ void apply_alternatives(void *start, void *end) { struct alt_instr *a; @@ -1358,24 +1442,12 @@ void apply_alternatives(void *start, void *end) } } -static int no_replacement __initdata = 0; - void __init alternative_instructions(void) { extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; - if (no_replacement) - return; apply_alternatives(__alt_instructions, __alt_instructions_end); } -static int __init noreplacement_setup(char *s) -{ - no_replacement = 1; - return 0; -} - -__setup("noreplacement", noreplacement_setup); - static char * __init machine_specific_memory_setup(void); #ifdef CONFIG_MCA @@ -1475,6 +1547,7 @@ void __init setup_arch(char **cmdline_p) #endif paging_init(); remapped_pgdat_init(); + sparse_init(); zone_sizes_init(); /* diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index ea46d028af08..89ef7adc63a4 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -346,8 +346,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size) extern void __user __kernel_sigreturn; extern void __user __kernel_rt_sigreturn; -static void setup_frame(int sig, struct k_sigaction *ka, - sigset_t *set, struct pt_regs * regs) +static int setup_frame(int sig, struct k_sigaction *ka, + sigset_t *set, struct pt_regs * regs) { void __user *restorer; struct sigframe __user *frame; @@ -429,13 +429,14 @@ static void setup_frame(int sig, struct k_sigaction *ka, current->comm, current->pid, frame, regs->eip, frame->pretcode); #endif - return; + return 1; give_sigsegv: force_sigsegv(sig, current); + return 0; } -static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs * regs) { void __user *restorer; @@ -522,20 +523,23 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, current->comm, current->pid, frame, regs->eip, frame->pretcode); #endif - return; + return 1; give_sigsegv: force_sigsegv(sig, current); + return 0; } /* * OK, we're invoking a handler */ -static void +static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, sigset_t *oldset, struct pt_regs * regs) { + int ret; + /* Are we from a system call? */ if (regs->orig_eax >= 0) { /* If so, check system call restarting.. */ @@ -569,17 +573,19 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, /* Set up the stack frame */ if (ka->sa.sa_flags & SA_SIGINFO) - setup_rt_frame(sig, ka, info, oldset, regs); + ret = setup_rt_frame(sig, ka, info, oldset, regs); else - setup_frame(sig, ka, oldset, regs); + ret = setup_frame(sig, ka, oldset, regs); - if (!(ka->sa.sa_flags & SA_NODEFER)) { + if (ret && !(ka->sa.sa_flags & SA_NODEFER)) { spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); sigaddset(¤t->blocked,sig); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); } + + return ret; } /* @@ -599,13 +605,11 @@ int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset) * kernel mode. Just return without doing anything * if so. */ - if ((regs->xcs & 3) != 3) + if (!user_mode(regs)) return 1; - if (current->flags & PF_FREEZE) { - refrigerator(0); + if (try_to_freeze()) goto no_signal; - } if (!oldset) oldset = ¤t->blocked; @@ -618,12 +622,11 @@ int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset) * inside the kernel. 
*/ if (unlikely(current->thread.debugreg[7])) { - loaddebug(¤t->thread, 7); + set_debugreg(current->thread.debugreg[7], 7); } /* Whee! Actually deliver the signal. */ - handle_signal(signr, &info, &ka, oldset, regs); - return 1; + return handle_signal(signr, &info, &ka, oldset, regs); } no_signal: diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 6223c33ac91c..cec4bde67161 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -19,6 +19,8 @@ #include <linux/mc146818rtc.h> #include <linux/cache.h> #include <linux/interrupt.h> +#include <linux/cpu.h> +#include <linux/module.h> #include <asm/mtrr.h> #include <asm/tlbflush.h> @@ -163,7 +165,7 @@ void send_IPI_mask_bitmask(cpumask_t cpumask, int vector) unsigned long flags; local_irq_save(flags); - + WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]); /* * Wait for idle. */ @@ -345,21 +347,21 @@ out: static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, unsigned long va) { - cpumask_t tmp; /* * A couple of (to be removed) sanity checks: * - * - we do not send IPIs to not-yet booted CPUs. * - current CPU must not be in mask * - mask must exist :) */ BUG_ON(cpus_empty(cpumask)); - - cpus_and(tmp, cpumask, cpu_online_map); - BUG_ON(!cpus_equal(cpumask, tmp)); BUG_ON(cpu_isset(smp_processor_id(), cpumask)); BUG_ON(!mm); + /* If a CPU which we ran on has gone down, OK. */ + cpus_and(cpumask, cpumask, cpu_online_map); + if (cpus_empty(cpumask)) + return; + /* * i'm not happy about this global shared spinlock in the * MM hot path, but we'll see how contended it is. @@ -452,6 +454,7 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_page); static void do_flush_tlb_all(void* info) { @@ -474,6 +477,7 @@ void flush_tlb_all(void) */ void smp_send_reschedule(int cpu) { + WARN_ON(cpu_is_offline(cpu)); send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } @@ -491,6 +495,16 @@ struct call_data_struct { int wait; }; +void lock_ipi_call_lock(void) +{ + spin_lock_irq(&call_lock); +} + +void unlock_ipi_call_lock(void) +{ + spin_unlock_irq(&call_lock); +} + static struct call_data_struct * call_data; /* @@ -514,10 +528,15 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, */ { struct call_data_struct data; - int cpus = num_online_cpus()-1; + int cpus; - if (!cpus) + /* Holding any lock stops cpus from going down. 
*/ + spin_lock(&call_lock); + cpus = num_online_cpus() - 1; + if (!cpus) { + spin_unlock(&call_lock); return 0; + } /* Can deadlock when called with interrupts disabled */ WARN_ON(irqs_disabled()); @@ -529,7 +548,6 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, if (wait) atomic_set(&data.finished, 0); - spin_lock(&call_lock); call_data = &data; mb(); @@ -547,6 +565,7 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, return 0; } +EXPORT_SYMBOL(smp_call_function); static void stop_this_cpu (void * dummy) { diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index bc1bb6919e6a..8ac8e9fd5614 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -44,6 +44,9 @@ #include <linux/smp_lock.h> #include <linux/irq.h> #include <linux/bootmem.h> +#include <linux/notifier.h> +#include <linux/cpu.h> +#include <linux/percpu.h> #include <linux/delay.h> #include <linux/mc146818rtc.h> @@ -56,26 +59,48 @@ #include <smpboot_hooks.h> /* Set if we find a B stepping CPU */ -static int __initdata smp_b_stepping; +static int __devinitdata smp_b_stepping; /* Number of siblings per CPU package */ int smp_num_siblings = 1; -int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */ +#ifdef CONFIG_X86_HT +EXPORT_SYMBOL(smp_num_siblings); +#endif + +/* Package ID of each logical CPU */ +int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; EXPORT_SYMBOL(phys_proc_id); -int cpu_core_id[NR_CPUS]; /* Core ID of each logical CPU */ + +/* Core ID of each logical CPU */ +int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; EXPORT_SYMBOL(cpu_core_id); +cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(cpu_sibling_map); + +cpumask_t cpu_core_map[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(cpu_core_map); + /* bitmap of online cpus */ -cpumask_t cpu_online_map; +cpumask_t cpu_online_map __read_mostly; +EXPORT_SYMBOL(cpu_online_map); cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; +EXPORT_SYMBOL(cpu_callout_map); static cpumask_t smp_commenced_mask; +/* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there + * is no way to resync one AP against BP. TBD: for prescott and above, we + * should use IA64's algorithm + */ +static int __devinitdata tsc_sync_disabled; + /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +EXPORT_SYMBOL(cpu_data); -u8 x86_cpu_to_apicid[NR_CPUS] = +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0xff }; EXPORT_SYMBOL(x86_cpu_to_apicid); @@ -90,13 +115,16 @@ static int trampoline_exec; static void map_cpu_to_logical_apicid(void); +/* State of each CPU. */ +DEFINE_PER_CPU(int, cpu_state) = { 0 }; + /* * Currently trivial. Write the real->protected mode * bootstrap into the page concerned. The caller * has made sure it's suitably aligned. 
*/ -static unsigned long __init setup_trampoline(void) +static unsigned long __devinit setup_trampoline(void) { memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data); return virt_to_phys(trampoline_base); @@ -126,7 +154,7 @@ void __init smp_alloc_memory(void) * a given CPU */ -static void __init smp_store_cpu_info(int id) +static void __devinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = cpu_data + id; @@ -199,7 +227,7 @@ static void __init synchronize_tsc_bp (void) unsigned long long t0; unsigned long long sum, avg; long long delta; - unsigned long one_usec; + unsigned int one_usec; int buggy = 0; printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus()); @@ -320,7 +348,7 @@ extern void calibrate_delay(void); static atomic_t init_deasserted; -static void __init smp_callin(void) +static void __devinit smp_callin(void) { int cpuid, phys_id; unsigned long timeout; @@ -405,16 +433,48 @@ static void __init smp_callin(void) /* * Synchronize the TSC with the BP */ - if (cpu_has_tsc && cpu_khz) + if (cpu_has_tsc && cpu_khz && !tsc_sync_disabled) synchronize_tsc_ap(); } static int cpucount; +static inline void +set_cpu_sibling_map(int cpu) +{ + int i; + + if (smp_num_siblings > 1) { + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_isset(i, cpu_callout_map)) + continue; + if (cpu_core_id[cpu] == cpu_core_id[i]) { + cpu_set(i, cpu_sibling_map[cpu]); + cpu_set(cpu, cpu_sibling_map[i]); + } + } + } else { + cpu_set(cpu, cpu_sibling_map[cpu]); + } + + if (current_cpu_data.x86_num_cores > 1) { + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_isset(i, cpu_callout_map)) + continue; + if (phys_proc_id[cpu] == phys_proc_id[i]) { + cpu_set(i, cpu_core_map[cpu]); + cpu_set(cpu, cpu_core_map[i]); + } + } + } else { + cpu_core_map[cpu] = cpu_sibling_map[cpu]; + } +} + /* * Activate a secondary processor. */ -static void __init start_secondary(void *unused) +static void __devinit start_secondary(void *unused) { /* * Dont put anything before smp_callin(), SMP @@ -437,7 +497,23 @@ static void __init start_secondary(void *unused) * the local TLBs too. */ local_flush_tlb(); + + /* This must be done before setting cpu_online_map */ + set_cpu_sibling_map(raw_smp_processor_id()); + wmb(); + + /* + * We need to hold call_lock, so there is no inconsistency + * between the time smp_call_function() determines number of + * IPI receipients, and the time when the determination is made + * for which cpus receive the IPI. Holding this + * lock helps us to not include this cpu in a currently in progress + * smp_call_function(). + */ + lock_ipi_call_lock(); cpu_set(smp_processor_id(), cpu_online_map); + unlock_ipi_call_lock(); + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; /* We can take interrupts now: we're officially "up". */ local_irq_enable(); @@ -452,7 +528,7 @@ static void __init start_secondary(void *unused) * from the task structure * This function must not return. */ -void __init initialize_secondary(void) +void __devinit initialize_secondary(void) { /* * We don't actually need to load the full TSS, @@ -474,10 +550,10 @@ extern struct { #ifdef CONFIG_NUMA /* which logical CPUs are on which nodes */ -cpumask_t node_2_cpu_mask[MAX_NUMNODES] = +cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly = { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; /* which node each logical CPU is on */ -int cpu_2_node[NR_CPUS] = { [0 ... NR_CPUS-1] = 0 }; +int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... 
NR_CPUS-1] = 0 }; EXPORT_SYMBOL(cpu_2_node); /* set up a mapping between cpu and node. */ @@ -505,7 +581,7 @@ static inline void unmap_cpu_to_node(int cpu) #endif /* CONFIG_NUMA */ -u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; static void map_cpu_to_logical_apicid(void) { @@ -566,7 +642,7 @@ static inline void __inquire_remote_apic(int apicid) * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this * won't ... remember to clear down the APIC, etc later. */ -static int __init +static int __devinit wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) { unsigned long send_status = 0, accept_status = 0; @@ -612,7 +688,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) #endif /* WAKE_SECONDARY_VIA_NMI */ #ifdef WAKE_SECONDARY_VIA_INIT -static int __init +static int __devinit wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) { unsigned long send_status = 0, accept_status = 0; @@ -747,8 +823,43 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) #endif /* WAKE_SECONDARY_VIA_INIT */ extern cpumask_t cpu_initialized; +static inline int alloc_cpu_id(void) +{ + cpumask_t tmp_map; + int cpu; + cpus_complement(tmp_map, cpu_present_map); + cpu = first_cpu(tmp_map); + if (cpu >= NR_CPUS) + return -ENODEV; + return cpu; +} + +#ifdef CONFIG_HOTPLUG_CPU +static struct task_struct * __devinitdata cpu_idle_tasks[NR_CPUS]; +static inline struct task_struct * alloc_idle_task(int cpu) +{ + struct task_struct *idle; + + if ((idle = cpu_idle_tasks[cpu]) != NULL) { + /* initialize thread_struct. we really want to avoid destroy + * idle tread + */ + idle->thread.esp = (unsigned long)(((struct pt_regs *) + (THREAD_SIZE + (unsigned long) idle->thread_info)) - 1); + init_idle(idle, cpu); + return idle; + } + idle = fork_idle(cpu); -static int __init do_boot_cpu(int apicid) + if (!IS_ERR(idle)) + cpu_idle_tasks[cpu] = idle; + return idle; +} +#else +#define alloc_idle_task(cpu) fork_idle(cpu) +#endif + +static int __devinit do_boot_cpu(int apicid, int cpu) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. @@ -757,16 +868,17 @@ static int __init do_boot_cpu(int apicid) { struct task_struct *idle; unsigned long boot_error; - int timeout, cpu; + int timeout; unsigned long start_eip; unsigned short nmi_high = 0, nmi_low = 0; - cpu = ++cpucount; + ++cpucount; + /* * We can't use kernel_thread since we must avoid to * reschedule the child. */ - idle = fork_idle(cpu); + idle = alloc_idle_task(cpu); if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); idle->thread.eip = (unsigned long) start_secondary; @@ -833,13 +945,16 @@ static int __init do_boot_cpu(int apicid) inquire_remote_apic(apicid); } } - x86_cpu_to_apicid[cpu] = apicid; + if (boot_error) { /* Try to put things back the way they were before ... 
*/ unmap_cpu_to_logical_apicid(cpu); cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ cpucount--; + } else { + x86_cpu_to_apicid[cpu] = apicid; + cpu_set(cpu, cpu_present_map); } /* mark "stuck" area as not stuck */ @@ -848,6 +963,75 @@ static int __init do_boot_cpu(int apicid) return boot_error; } +#ifdef CONFIG_HOTPLUG_CPU +void cpu_exit_clear(void) +{ + int cpu = raw_smp_processor_id(); + + idle_task_exit(); + + cpucount --; + cpu_uninit(); + irq_ctx_exit(cpu); + + cpu_clear(cpu, cpu_callout_map); + cpu_clear(cpu, cpu_callin_map); + cpu_clear(cpu, cpu_present_map); + + cpu_clear(cpu, smp_commenced_mask); + unmap_cpu_to_logical_apicid(cpu); +} + +struct warm_boot_cpu_info { + struct completion *complete; + int apicid; + int cpu; +}; + +static void __devinit do_warm_boot_cpu(void *p) +{ + struct warm_boot_cpu_info *info = p; + do_boot_cpu(info->apicid, info->cpu); + complete(info->complete); +} + +int __devinit smp_prepare_cpu(int cpu) +{ + DECLARE_COMPLETION(done); + struct warm_boot_cpu_info info; + struct work_struct task; + int apicid, ret; + + lock_cpu_hotplug(); + apicid = x86_cpu_to_apicid[cpu]; + if (apicid == BAD_APICID) { + ret = -ENODEV; + goto exit; + } + + info.complete = &done; + info.apicid = apicid; + info.cpu = cpu; + INIT_WORK(&task, do_warm_boot_cpu, &info); + + tsc_sync_disabled = 1; + + /* init low mem mapping */ + memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, + sizeof(swapper_pg_dir[0]) * KERNEL_PGD_PTRS); + flush_tlb_all(); + schedule_work(&task); + wait_for_completion(&done); + + tsc_sync_disabled = 0; + zap_low_mappings(); + ret = 0; +exit: + unlock_cpu_hotplug(); + return ret; +} +#endif + static void smp_tune_scheduling (void) { unsigned long cachesize; /* kB */ @@ -885,10 +1069,9 @@ static void smp_tune_scheduling (void) static int boot_cpu_logical_apicid; /* Where the IO area was mapped on multiquad, always 0 otherwise */ void *xquad_portio; - -cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; -cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; -EXPORT_SYMBOL(cpu_core_map); +#ifdef CONFIG_X86_NUMAQ +EXPORT_SYMBOL(xquad_portio); +#endif static void __init smp_boot_cpus(unsigned int max_cpus) { @@ -1001,7 +1184,7 @@ static void __init smp_boot_cpus(unsigned int max_cpus) if (max_cpus <= cpucount+1) continue; - if (do_boot_cpu(apicid)) + if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu)) printk("CPU #%d not responding - cannot use it.\n", apicid); else @@ -1053,44 +1236,8 @@ static void __init smp_boot_cpus(unsigned int max_cpus) cpus_clear(cpu_core_map[cpu]); } - for (cpu = 0; cpu < NR_CPUS; cpu++) { - struct cpuinfo_x86 *c = cpu_data + cpu; - int siblings = 0; - int i; - if (!cpu_isset(cpu, cpu_callout_map)) - continue; - - if (smp_num_siblings > 1) { - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - if (cpu_core_id[cpu] == cpu_core_id[i]) { - siblings++; - cpu_set(i, cpu_sibling_map[cpu]); - } - } - } else { - siblings++; - cpu_set(cpu, cpu_sibling_map[cpu]); - } - - if (siblings != smp_num_siblings) { - printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings); - smp_num_siblings = siblings; - } - - if (c->x86_num_cores > 1) { - for (i = 0; i < NR_CPUS; i++) { - if (!cpu_isset(i, cpu_callout_map)) - continue; - if (phys_proc_id[cpu] == phys_proc_id[i]) { - cpu_set(i, cpu_core_map[cpu]); - } - } - } else { - cpu_core_map[cpu] = cpu_sibling_map[cpu]; - } - } + 
cpu_set(0, cpu_sibling_map[0]); + cpu_set(0, cpu_core_map[0]); smpboot_setup_io_apic(); @@ -1107,6 +1254,9 @@ static void __init smp_boot_cpus(unsigned int max_cpus) who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ void __init smp_prepare_cpus(unsigned int max_cpus) { + smp_commenced_mask = cpumask_of_cpu(0); + cpu_callin_map = cpumask_of_cpu(0); + mb(); smp_boot_cpus(max_cpus); } @@ -1114,23 +1264,98 @@ void __devinit smp_prepare_boot_cpu(void) { cpu_set(smp_processor_id(), cpu_online_map); cpu_set(smp_processor_id(), cpu_callout_map); + cpu_set(smp_processor_id(), cpu_present_map); + per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; } -int __devinit __cpu_up(unsigned int cpu) +#ifdef CONFIG_HOTPLUG_CPU +static void +remove_siblinginfo(int cpu) { - /* This only works at boot for x86. See "rewrite" above. */ - if (cpu_isset(cpu, smp_commenced_mask)) { - local_irq_enable(); - return -ENOSYS; + int sibling; + + for_each_cpu_mask(sibling, cpu_sibling_map[cpu]) + cpu_clear(cpu, cpu_sibling_map[sibling]); + for_each_cpu_mask(sibling, cpu_core_map[cpu]) + cpu_clear(cpu, cpu_core_map[sibling]); + cpus_clear(cpu_sibling_map[cpu]); + cpus_clear(cpu_core_map[cpu]); + phys_proc_id[cpu] = BAD_APICID; + cpu_core_id[cpu] = BAD_APICID; +} + +int __cpu_disable(void) +{ + cpumask_t map = cpu_online_map; + int cpu = smp_processor_id(); + + /* + * Perhaps use cpufreq to drop frequency, but that could go + * into generic code. + * + * We won't take down the boot processor on i386 due to some + * interrupts only being able to be serviced by the BSP. + * Especially so if we're not using an IOAPIC -zwane + */ + if (cpu == 0) + return -EBUSY; + + /* We enable the timer again on the exit path of the death loop */ + disable_APIC_timer(); + /* Allow any queued timer interrupts to get serviced */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); + + remove_siblinginfo(cpu); + + cpu_clear(cpu, map); + fixup_irqs(map); + /* It's now safe to remove this processor from the online map */ + cpu_clear(cpu, cpu_online_map); + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + /* We don't do anything here: idle task is faking death itself. */ + unsigned int i; + + for (i = 0; i < 10; i++) { + /* They ack this in play_dead by setting CPU_DEAD */ + if (per_cpu(cpu_state, cpu) == CPU_DEAD) { + printk ("CPU %d is now offline\n", cpu); + return; + } + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ/10); } + printk(KERN_ERR "CPU %u didn't die...\n", cpu); +} +#else /* ... !CONFIG_HOTPLUG_CPU */ +int __cpu_disable(void) +{ + return -ENOSYS; +} +void __cpu_die(unsigned int cpu) +{ + /* We said "no" in __cpu_disable */ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +int __devinit __cpu_up(unsigned int cpu) +{ /* In case one didn't come up */ if (!cpu_isset(cpu, cpu_callin_map)) { + printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu); local_irq_enable(); return -EIO; } local_irq_enable(); + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; /* Unleash the CPU! 
*/ cpu_set(cpu, smp_commenced_mask); while (!cpu_isset(cpu, cpu_online_map)) @@ -1144,10 +1369,12 @@ void __init smp_cpus_done(unsigned int max_cpus) setup_ioapic_dest(); #endif zap_low_mappings(); +#ifndef CONFIG_HOTPLUG_CPU /* * Disable executability of the SMP trampoline: */ set_kernel_exec((unsigned long)trampoline_base, trampoline_exec); +#endif } void __init smp_intr_init(void) diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index d408afaf6495..9b21a31d4f4e 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -251,7 +251,7 @@ ENTRY(sys_call_table) .long sys_io_submit .long sys_io_cancel .long sys_fadvise64 /* 250 */ - .long sys_set_zone_reclaim + .long sys_ni_syscall .long sys_exit_group .long sys_lookup_dcookie .long sys_epoll_create @@ -283,9 +283,14 @@ ENTRY(sys_call_table) .long sys_mq_timedreceive /* 280 */ .long sys_mq_notify .long sys_mq_getsetattr - .long sys_ni_syscall /* reserved for kexec */ + .long sys_kexec_load .long sys_waitid .long sys_ni_syscall /* 285 */ /* available */ .long sys_add_key .long sys_request_key .long sys_keyctl + .long sys_ioprio_set + .long sys_ioprio_get /* 290 */ + .long sys_inotify_init + .long sys_inotify_add_watch + .long sys_inotify_rm_watch diff --git a/arch/i386/kernel/sysenter.c b/arch/i386/kernel/sysenter.c index 960d8bd137d0..0bada1870bdf 100644 --- a/arch/i386/kernel/sysenter.c +++ b/arch/i386/kernel/sysenter.c @@ -21,11 +21,16 @@ extern asmlinkage void sysenter_entry(void); -void enable_sep_cpu(void *info) +void enable_sep_cpu(void) { int cpu = get_cpu(); struct tss_struct *tss = &per_cpu(init_tss, cpu); + if (!boot_cpu_has(X86_FEATURE_SEP)) { + put_cpu(); + return; + } + tss->ss1 = __KERNEL_CS; tss->esp1 = sizeof(struct tss_struct) + (unsigned long) tss; wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); @@ -41,7 +46,7 @@ void enable_sep_cpu(void *info) extern const char vsyscall_int80_start, vsyscall_int80_end; extern const char vsyscall_sysenter_start, vsyscall_sysenter_end; -static int __init sysenter_setup(void) +int __init sysenter_setup(void) { void *page = (void *)get_zeroed_page(GFP_ATOMIC); @@ -58,8 +63,5 @@ static int __init sysenter_setup(void) &vsyscall_sysenter_start, &vsyscall_sysenter_end - &vsyscall_sysenter_start); - on_each_cpu(enable_sep_cpu, NULL, 1, 1); return 0; } - -__initcall(sysenter_setup); diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index a0dcb7c87c30..0ee9dee8af06 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -68,7 +68,8 @@ #include "io_ports.h" -extern spinlock_t i8259A_lock; +#include <asm/i8259.h> + int pit_latch_buggy; /* extern */ #include "do_timer.h" @@ -77,16 +78,20 @@ u64 jiffies_64 = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); -unsigned long cpu_khz; /* Detected as we calibrate the TSC */ +unsigned int cpu_khz; /* Detected as we calibrate the TSC */ +EXPORT_SYMBOL(cpu_khz); extern unsigned long wall_jiffies; DEFINE_SPINLOCK(rtc_lock); +EXPORT_SYMBOL(rtc_lock); + +#include <asm/i8253.h> DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -struct timer_opts *cur_timer = &timer_none; +struct timer_opts *cur_timer __read_mostly = &timer_none; /* * This is a special lock that is owned by the CPU and holds the index @@ -324,6 +329,8 @@ unsigned long get_cmos_time(void) return retval; } +EXPORT_SYMBOL(get_cmos_time); + static void sync_cmos_clock(unsigned long dummy); static struct timer_list sync_cmos_timer = diff --git a/arch/i386/kernel/time_hpet.c b/arch/i386/kernel/time_hpet.c index 
10a0cbb88e75..658c0629ba6a 100644 --- a/arch/i386/kernel/time_hpet.c +++ b/arch/i386/kernel/time_hpet.c @@ -50,7 +50,7 @@ static void hpet_writel(unsigned long d, unsigned long a) * comparator value and continue. Next tick can be caught by checking * for a change in the comparator value. Used in apic.c. */ -static void __init wait_hpet_tick(void) +static void __devinit wait_hpet_tick(void) { unsigned int start_cmp_val, end_cmp_val; diff --git a/arch/i386/kernel/timers/common.c b/arch/i386/kernel/timers/common.c index 8e201219f525..8163fe0cf1f0 100644 --- a/arch/i386/kernel/timers/common.c +++ b/arch/i386/kernel/timers/common.c @@ -86,7 +86,7 @@ bad_ctc: #define CALIBRATE_CNT_HPET (5 * hpet_tick) #define CALIBRATE_TIME_HPET (5 * KERNEL_TICK_USEC) -unsigned long __init calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr) +unsigned long __devinit calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr) { unsigned long tsc_startlow, tsc_starthigh; unsigned long tsc_endlow, tsc_endhigh; @@ -139,6 +139,15 @@ bad_calibration: } #endif + +unsigned long read_timer_tsc(void) +{ + unsigned long retval; + rdtscl(retval); + return retval; +} + + /* calculate cpu_khz */ void init_cpu_khz(void) { @@ -154,7 +163,8 @@ void init_cpu_khz(void) :"=a" (cpu_khz), "=d" (edx) :"r" (tsc_quotient), "0" (eax), "1" (edx)); - printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); + printk("Detected %u.%03u MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); } } } diff --git a/arch/i386/kernel/timers/timer.c b/arch/i386/kernel/timers/timer.c index a3d6a288088b..7e39ed8e33f8 100644 --- a/arch/i386/kernel/timers/timer.c +++ b/arch/i386/kernel/timers/timer.c @@ -64,3 +64,12 @@ struct timer_opts* __init select_timer(void) panic("select_timer: Cannot find a suitable timer\n"); return NULL; } + +int read_current_timer(unsigned long *timer_val) +{ + if (cur_timer->read_timer) { + *timer_val = cur_timer->read_timer(); + return 0; + } + return -1; +} diff --git a/arch/i386/kernel/timers/timer_cyclone.c b/arch/i386/kernel/timers/timer_cyclone.c index f6f1206a11bb..13892a65c941 100644 --- a/arch/i386/kernel/timers/timer_cyclone.c +++ b/arch/i386/kernel/timers/timer_cyclone.c @@ -17,9 +17,9 @@ #include <asm/io.h> #include <asm/pgtable.h> #include <asm/fixmap.h> -#include "io_ports.h" +#include <asm/i8253.h> -extern spinlock_t i8253_lock; +#include "io_ports.h" /* Number of usecs that the last interrupt was delayed */ static int delay_at_last_interrupt; diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c index f778f471a09a..ef8dac5dd33b 100644 --- a/arch/i386/kernel/timers/timer_hpet.c +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -18,7 +18,7 @@ #include "mach_timer.h" #include <asm/hpet.h> -static unsigned long hpet_usec_quotient; /* convert hpet clks to usec */ +static unsigned long __read_mostly hpet_usec_quotient; /* convert hpet clks to usec */ static unsigned long tsc_hpet_quotient; /* convert tsc to hpet clks */ static unsigned long hpet_last; /* hpet counter value at last tick*/ static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ @@ -158,7 +158,7 @@ static int __init init_hpet(char* override) { unsigned long eax=0, edx=1000; ASM_DIV64_REG(cpu_khz, edx, tsc_quotient, eax, edx); - printk("Detected %lu.%03lu MHz processor.\n", + printk("Detected %u.%03u MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); } set_cyc2ns_scale(cpu_khz/1000); @@ -180,12 +180,13 @@ static int __init init_hpet(char* override) 
/************************************************************/ /* tsc timer_opts struct */ -static struct timer_opts timer_hpet = { +static struct timer_opts timer_hpet __read_mostly = { .name = "hpet", .mark_offset = mark_offset_hpet, .get_offset = get_offset_hpet, .monotonic_clock = monotonic_clock_hpet, .delay = delay_hpet, + .read_timer = read_timer_tsc, }; struct init_timer_opts __initdata timer_hpet_init = { diff --git a/arch/i386/kernel/timers/timer_pit.c b/arch/i386/kernel/timers/timer_pit.c index 967d5453cd0e..06de036a820c 100644 --- a/arch/i386/kernel/timers/timer_pit.c +++ b/arch/i386/kernel/timers/timer_pit.c @@ -15,9 +15,8 @@ #include <asm/smp.h> #include <asm/io.h> #include <asm/arch_hooks.h> +#include <asm/i8253.h> -extern spinlock_t i8259A_lock; -extern spinlock_t i8253_lock; #include "do_timer.h" #include "io_ports.h" @@ -166,7 +165,6 @@ struct init_timer_opts __initdata timer_pit_init = { void setup_pit_timer(void) { - extern spinlock_t i8253_lock; unsigned long flags; spin_lock_irqsave(&i8253_lock, flags); diff --git a/arch/i386/kernel/timers/timer_pm.c b/arch/i386/kernel/timers/timer_pm.c index d77f22030fe6..4ef20e663498 100644 --- a/arch/i386/kernel/timers/timer_pm.c +++ b/arch/i386/kernel/timers/timer_pm.c @@ -246,6 +246,7 @@ static struct timer_opts timer_pmtmr = { .get_offset = get_offset_pmtmr, .monotonic_clock = monotonic_clock_pmtmr, .delay = delay_pmtmr, + .read_timer = read_timer_tsc, }; struct init_timer_opts __initdata timer_pmtmr_init = { diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c index 180444d87824..8f4e4d5bc560 100644 --- a/arch/i386/kernel/timers/timer_tsc.c +++ b/arch/i386/kernel/timers/timer_tsc.c @@ -24,6 +24,7 @@ #include "mach_timer.h" #include <asm/hpet.h> +#include <asm/i8253.h> #ifdef CONFIG_HPET_TIMER static unsigned long hpet_usec_quotient; @@ -33,9 +34,7 @@ static struct timer_opts timer_tsc; static inline void cpufreq_delayed_get(void); -int tsc_disable __initdata = 0; - -extern spinlock_t i8253_lock; +int tsc_disable __devinitdata = 0; static int use_tsc; /* Number of usecs that the last interrupt was delayed */ @@ -256,7 +255,7 @@ static unsigned long loops_per_jiffy_ref = 0; #ifndef CONFIG_SMP static unsigned long fast_gettimeoffset_ref = 0; -static unsigned long cpu_khz_ref = 0; +static unsigned int cpu_khz_ref = 0; #endif static int @@ -323,7 +322,7 @@ static inline void cpufreq_delayed_get(void) { return; } int recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP - unsigned long cpu_khz_old = cpu_khz; + unsigned int cpu_khz_old = cpu_khz; if (cpu_has_tsc) { init_cpu_khz(); @@ -534,7 +533,8 @@ static int __init init_tsc(char* override) :"=a" (cpu_khz), "=d" (edx) :"r" (tsc_quotient), "0" (eax), "1" (edx)); - printk("Detected %lu.%03lu MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); + printk("Detected %u.%03u MHz processor.\n", + cpu_khz / 1000, cpu_khz % 1000); } set_cyc2ns_scale(cpu_khz/1000); return 0; @@ -572,6 +572,7 @@ static struct timer_opts timer_tsc = { .get_offset = get_offset_tsc, .monotonic_clock = monotonic_clock_tsc, .delay = delay_tsc, + .read_timer = read_timer_tsc, }; struct init_timer_opts __initdata timer_tsc_init = { diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 83c579e82a81..a61f33d06ea3 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -27,6 +27,7 @@ #include <linux/ptrace.h> #include <linux/utsname.h> #include <linux/kprobes.h> +#include <linux/kexec.h> #ifdef CONFIG_EISA #include <linux/ioport.h> @@ -104,6 +105,7 @@ int 
register_die_notifier(struct notifier_block *nb) spin_unlock_irqrestore(&die_notifier_lock, flags); return err; } +EXPORT_SYMBOL(register_die_notifier); static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) { @@ -209,7 +211,7 @@ void show_registers(struct pt_regs *regs) esp = (unsigned long) (®s->esp); ss = __KERNEL_DS; - if (regs->xcs & 3) { + if (user_mode(regs)) { in_kernel = 0; esp = regs->esp; ss = regs->xss & 0xffff; @@ -233,22 +235,22 @@ void show_registers(struct pt_regs *regs) * time of the fault.. */ if (in_kernel) { - u8 *eip; + u8 __user *eip; printk("\nStack: "); show_stack(NULL, (unsigned long*)esp); printk("Code: "); - eip = (u8 *)regs->eip - 43; + eip = (u8 __user *)regs->eip - 43; for (i = 0; i < 64; i++, eip++) { unsigned char c; - if (eip < (u8 *)PAGE_OFFSET || __get_user(c, eip)) { + if (eip < (u8 __user *)PAGE_OFFSET || __get_user(c, eip)) { printk(" Bad EIP value."); break; } - if (eip == (u8 *)regs->eip) + if (eip == (u8 __user *)regs->eip) printk("<%02x> ", c); else printk("%02x ", c); @@ -265,20 +267,20 @@ static void handle_BUG(struct pt_regs *regs) char c; unsigned long eip; - if (regs->xcs & 3) + if (user_mode(regs)) goto no_bug; /* Not in kernel */ eip = regs->eip; if (eip < PAGE_OFFSET) goto no_bug; - if (__get_user(ud2, (unsigned short *)eip)) + if (__get_user(ud2, (unsigned short __user *)eip)) goto no_bug; if (ud2 != 0x0b0f) goto no_bug; - if (__get_user(line, (unsigned short *)(eip + 2))) + if (__get_user(line, (unsigned short __user *)(eip + 2))) goto bug; - if (__get_user(file, (char **)(eip + 4)) || + if (__get_user(file, (char * __user *)(eip + 4)) || (unsigned long)file < PAGE_OFFSET || __get_user(c, file)) file = "<bad filename>"; @@ -293,6 +295,9 @@ bug: printk("Kernel BUG\n"); } +/* This is gone through when something in the kernel + * has done something bad and is about to be terminated. 
+*/ void die(const char * str, struct pt_regs * regs, long err) { static struct { @@ -340,6 +345,10 @@ void die(const char * str, struct pt_regs * regs, long err) bust_spinlocks(0); die.lock_owner = -1; spin_unlock_irq(&die.lock); + + if (kexec_should_crash(current)) + crash_kexec(regs); + if (in_interrupt()) panic("Fatal exception in interrupt"); @@ -353,26 +362,27 @@ void die(const char * str, struct pt_regs * regs, long err) static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err) { - if (!(regs->eflags & VM_MASK) && !(3 & regs->xcs)) + if (!user_mode_vm(regs)) die(str, regs, err); } static void do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs * regs, long error_code, siginfo_t *info) { + struct task_struct *tsk = current; + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + if (regs->eflags & VM_MASK) { if (vm86) goto vm86_trap; goto trap_signal; } - if (!(regs->xcs & 3)) + if (!user_mode(regs)) goto kernel_trap; trap_signal: { - struct task_struct *tsk = current; - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; if (info) force_sig_info(signr, info, tsk); else @@ -485,10 +495,13 @@ fastcall void do_general_protection(struct pt_regs * regs, long error_code) } put_cpu(); + current->thread.error_code = error_code; + current->thread.trap_no = 13; + if (regs->eflags & VM_MASK) goto gp_in_vm86; - if (!(regs->xcs & 3)) + if (!user_mode(regs)) goto gp_in_kernel; current->thread.error_code = error_code; @@ -569,6 +582,15 @@ void die_nmi (struct pt_regs *regs, const char *msg) console_silent(); spin_unlock(&nmi_print_lock); bust_spinlocks(0); + + /* If we are in kernel we are probably nested up pretty bad + * and might aswell get out now while we still can. + */ + if (!user_mode(regs)) { + current->thread.trap_no = 2; + crash_kexec(regs); + } + do_exit(SIGSEGV); } @@ -624,6 +646,14 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code) nmi_enter(); cpu = smp_processor_id(); + +#ifdef CONFIG_HOTPLUG_CPU + if (!cpu_online(cpu)) { + nmi_exit(); + return; + } +#endif + ++nmi_count(cpu); if (!nmi_callback(regs, cpu)) @@ -636,11 +666,13 @@ void set_nmi_callback(nmi_callback_t callback) { nmi_callback = callback; } +EXPORT_SYMBOL_GPL(set_nmi_callback); void unset_nmi_callback(void) { nmi_callback = dummy_nmi_callback; } +EXPORT_SYMBOL_GPL(unset_nmi_callback); #ifdef CONFIG_KPROBES fastcall void do_int3(struct pt_regs *regs, long error_code) @@ -682,7 +714,7 @@ fastcall void do_debug(struct pt_regs * regs, long error_code) unsigned int condition; struct task_struct *tsk = current; - __asm__ __volatile__("movl %%db6,%0" : "=r" (condition)); + get_debugreg(condition, 6); if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) @@ -713,7 +745,7 @@ fastcall void do_debug(struct pt_regs * regs, long error_code) * check for kernel mode by just checking the CPL * of CS. */ - if ((regs->xcs & 3) == 0) + if (!user_mode(regs)) goto clear_TF_reenable; } @@ -724,9 +756,7 @@ fastcall void do_debug(struct pt_regs * regs, long error_code) * the signal is delivered. 
*/ clear_dr7: - __asm__("movl %0,%%db7" - : /* no output */ - : "r" (0)); + set_debugreg(0, 7); return; debug_vm86: @@ -871,9 +901,9 @@ fastcall void do_simd_coprocessor_error(struct pt_regs * regs, error_code); return; } - die_if_kernel("cache flush denied", regs, error_code); current->thread.trap_no = 19; current->thread.error_code = error_code; + die_if_kernel("cache flush denied", regs, error_code); force_sig(SIGSEGV, current); } } diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index e0512cc8bea7..761972f8cb6c 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -2,20 +2,23 @@ * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>; */ +#define LOAD_OFFSET __PAGE_OFFSET + #include <asm-generic/vmlinux.lds.h> #include <asm/thread_info.h> #include <asm/page.h> OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) -ENTRY(startup_32) +ENTRY(phys_startup_32) jiffies = jiffies_64; SECTIONS { - . = __PAGE_OFFSET + 0x100000; + . = __KERNEL_START; + phys_startup_32 = startup_32 - LOAD_OFFSET; /* read-only */ _text = .; /* Text and read-only data */ - .text : { + .text : AT(ADDR(.text) - LOAD_OFFSET) { *(.text) SCHED_TEXT LOCK_TEXT @@ -27,49 +30,58 @@ SECTIONS . = ALIGN(16); /* Exception table */ __start___ex_table = .; - __ex_table : { *(__ex_table) } + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } __stop___ex_table = .; RODATA /* writeable */ - .data : { /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ *(.data) CONSTRUCTORS } . = ALIGN(4096); __nosave_begin = .; - .data_nosave : { *(.data.nosave) } + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) } . = ALIGN(4096); __nosave_end = .; . = ALIGN(4096); - .data.page_aligned : { *(.data.idt) } + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.idt) + } . = ALIGN(32); - .data.cacheline_aligned : { *(.data.cacheline_aligned) } + .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + *(.data.cacheline_aligned) + } + /* rarely changed data like cpu maps */ + . = ALIGN(32); + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } _edata = .; /* End of data section */ . = ALIGN(THREAD_SIZE); /* init_task */ - .data.init_task : { *(.data.init_task) } + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + *(.data.init_task) + } /* will be freed after init */ . = ALIGN(4096); /* Init code and data */ __init_begin = .; - .init.text : { + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; *(.init.text) _einittext = .; } - .init.data : { *(.init.data) } + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) } . = ALIGN(16); __setup_start = .; - .init.setup : { *(.init.setup) } + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) } __setup_end = .; __initcall_start = .; - .initcall.init : { + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { *(.initcall1.init) *(.initcall2.init) *(.initcall3.init) @@ -80,33 +92,41 @@ SECTIONS } __initcall_end = .; __con_initcall_start = .; - .con_initcall.init : { *(.con_initcall.init) } + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + *(.con_initcall.init) + } __con_initcall_end = .; SECURITY_INIT . 
= ALIGN(4); __alt_instructions = .; - .altinstructions : { *(.altinstructions) } + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + *(.altinstructions) + } __alt_instructions_end = .; - .altinstr_replacement : { *(.altinstr_replacement) } + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } /* .exit.text is discard at runtime, not link time, to deal with references from .altinstructions and .eh_frame */ - .exit.text : { *(.exit.text) } - .exit.data : { *(.exit.data) } + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } . = ALIGN(4096); __initramfs_start = .; - .init.ramfs : { *(.init.ramfs) } + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) } __initramfs_end = .; . = ALIGN(32); __per_cpu_start = .; - .data.percpu : { *(.data.percpu) } + .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } __per_cpu_end = .; . = ALIGN(4096); __init_end = .; /* freed after init ends here */ __bss_start = .; /* BSS */ - .bss : { + .bss.page_aligned : AT(ADDR(.bss.page_aligned) - LOAD_OFFSET) { *(.bss.page_aligned) + } + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { *(.bss) } . = ALIGN(4); |