From 9a163ed8e0552fdcffe405d2ea7134819a81456e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 11 Oct 2007 11:17:01 +0200 Subject: i386: move kernel Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/.gitignore | 1 + arch/x86/kernel/Makefile | 5 + arch/x86/kernel/Makefile_32 | 88 + arch/x86/kernel/alternative.c | 450 +++++ arch/x86/kernel/apic_32.c | 1566 +++++++++++++++++ arch/x86/kernel/apm_32.c | 2403 ++++++++++++++++++++++++++ arch/x86/kernel/asm-offsets.c | 5 + arch/x86/kernel/asm-offsets_32.c | 147 ++ arch/x86/kernel/bootflag.c | 98 ++ arch/x86/kernel/cpuid.c | 242 +++ arch/x86/kernel/crash_32.c | 137 ++ arch/x86/kernel/crash_dump_32.c | 74 + arch/x86/kernel/doublefault_32.c | 70 + arch/x86/kernel/e820_32.c | 944 ++++++++++ arch/x86/kernel/early_printk.c | 2 + arch/x86/kernel/efi_32.c | 712 ++++++++ arch/x86/kernel/efi_stub_32.S | 122 ++ arch/x86/kernel/entry_32.S | 1112 ++++++++++++ arch/x86/kernel/geode_32.c | 155 ++ arch/x86/kernel/head_32.S | 578 +++++++ arch/x86/kernel/hpet_32.c | 553 ++++++ arch/x86/kernel/i386_ksyms_32.c | 30 + arch/x86/kernel/i387_32.c | 546 ++++++ arch/x86/kernel/i8237.c | 72 + arch/x86/kernel/i8253_32.c | 206 +++ arch/x86/kernel/i8259_32.c | 420 +++++ arch/x86/kernel/init_task_32.c | 46 + arch/x86/kernel/io_apic_32.c | 2847 +++++++++++++++++++++++++++++++ arch/x86/kernel/ioport_32.c | 153 ++ arch/x86/kernel/irq_32.c | 343 ++++ arch/x86/kernel/kprobes_32.c | 751 ++++++++ arch/x86/kernel/ldt_32.c | 250 +++ arch/x86/kernel/machine_kexec_32.c | 171 ++ arch/x86/kernel/mca_32.c | 470 +++++ arch/x86/kernel/microcode.c | 850 +++++++++ arch/x86/kernel/module_32.c | 152 ++ arch/x86/kernel/mpparse_32.c | 1132 ++++++++++++ arch/x86/kernel/msr.c | 224 +++ arch/x86/kernel/nmi_32.c | 468 +++++ arch/x86/kernel/numaq_32.c | 89 + arch/x86/kernel/paravirt_32.c | 392 +++++ arch/x86/kernel/pci-dma_32.c | 177 ++ arch/x86/kernel/pcspeaker.c | 20 + arch/x86/kernel/process_32.c | 951 +++++++++++ arch/x86/kernel/ptrace_32.c | 723 ++++++++ arch/x86/kernel/quirks.c | 49 + arch/x86/kernel/reboot_32.c | 413 +++++ arch/x86/kernel/reboot_fixups_32.c | 68 + arch/x86/kernel/relocate_kernel_32.S | 252 +++ arch/x86/kernel/scx200_32.c | 131 ++ arch/x86/kernel/setup_32.c | 653 +++++++ arch/x86/kernel/sigframe_32.h | 21 + arch/x86/kernel/signal_32.c | 667 ++++++++ arch/x86/kernel/smp_32.c | 707 ++++++++ arch/x86/kernel/smpboot_32.c | 1322 ++++++++++++++ arch/x86/kernel/smpcommon_32.c | 81 + arch/x86/kernel/srat_32.c | 360 ++++ arch/x86/kernel/summit_32.c | 180 ++ arch/x86/kernel/sys_i386_32.c | 265 +++ arch/x86/kernel/syscall_table_32.S | 326 ++++ arch/x86/kernel/sysenter_32.c | 348 ++++ arch/x86/kernel/time_32.c | 236 +++ arch/x86/kernel/topology.c | 77 + arch/x86/kernel/trampoline_32.S | 85 + arch/x86/kernel/traps_32.c | 1250 ++++++++++++++ arch/x86/kernel/tsc_32.c | 413 +++++ arch/x86/kernel/tsc_sync.c | 1 + arch/x86/kernel/vm86_32.c | 843 +++++++++ arch/x86/kernel/vmi_32.c | 981 +++++++++++ arch/x86/kernel/vmiclock_32.c | 320 ++++ arch/x86/kernel/vmlinux.lds.S | 5 + arch/x86/kernel/vmlinux_32.lds.S | 213 +++ arch/x86/kernel/vsyscall-int80_32.S | 53 + arch/x86/kernel/vsyscall-note_32.S | 45 + arch/x86/kernel/vsyscall-sigreturn_32.S | 143 ++ arch/x86/kernel/vsyscall-sysenter_32.S | 122 ++ arch/x86/kernel/vsyscall_32.S | 15 + arch/x86/kernel/vsyscall_32.lds.S | 67 + arch/x86/mach-generic/Makefile | 2 +- arch/x86/mach-voyager/Makefile | 2 +- 80 files changed, 31661 insertions(+), 2 deletions(-) create mode 100644 arch/x86/kernel/.gitignore create mode 100644 arch/x86/kernel/Makefile create mode 100644 arch/x86/kernel/Makefile_32 create mode 100644 arch/x86/kernel/alternative.c create mode 100644 arch/x86/kernel/apic_32.c create mode 100644 arch/x86/kernel/apm_32.c create mode 100644 arch/x86/kernel/asm-offsets.c create mode 100644 arch/x86/kernel/asm-offsets_32.c create mode 100644 arch/x86/kernel/bootflag.c create mode 100644 arch/x86/kernel/cpuid.c create mode 100644 arch/x86/kernel/crash_32.c create mode 100644 arch/x86/kernel/crash_dump_32.c create mode 100644 arch/x86/kernel/doublefault_32.c create mode 100644 arch/x86/kernel/e820_32.c create mode 100644 arch/x86/kernel/early_printk.c create mode 100644 arch/x86/kernel/efi_32.c create mode 100644 arch/x86/kernel/efi_stub_32.S create mode 100644 arch/x86/kernel/entry_32.S create mode 100644 arch/x86/kernel/geode_32.c create mode 100644 arch/x86/kernel/head_32.S create mode 100644 arch/x86/kernel/hpet_32.c create mode 100644 arch/x86/kernel/i386_ksyms_32.c create mode 100644 arch/x86/kernel/i387_32.c create mode 100644 arch/x86/kernel/i8237.c create mode 100644 arch/x86/kernel/i8253_32.c create mode 100644 arch/x86/kernel/i8259_32.c create mode 100644 arch/x86/kernel/init_task_32.c create mode 100644 arch/x86/kernel/io_apic_32.c create mode 100644 arch/x86/kernel/ioport_32.c create mode 100644 arch/x86/kernel/irq_32.c create mode 100644 arch/x86/kernel/kprobes_32.c create mode 100644 arch/x86/kernel/ldt_32.c create mode 100644 arch/x86/kernel/machine_kexec_32.c create mode 100644 arch/x86/kernel/mca_32.c create mode 100644 arch/x86/kernel/microcode.c create mode 100644 arch/x86/kernel/module_32.c create mode 100644 arch/x86/kernel/mpparse_32.c create mode 100644 arch/x86/kernel/msr.c create mode 100644 arch/x86/kernel/nmi_32.c create mode 100644 arch/x86/kernel/numaq_32.c create mode 100644 arch/x86/kernel/paravirt_32.c create mode 100644 arch/x86/kernel/pci-dma_32.c create mode 100644 arch/x86/kernel/pcspeaker.c create mode 100644 arch/x86/kernel/process_32.c create mode 100644 arch/x86/kernel/ptrace_32.c create mode 100644 arch/x86/kernel/quirks.c create mode 100644 arch/x86/kernel/reboot_32.c create mode 100644 arch/x86/kernel/reboot_fixups_32.c create mode 100644 arch/x86/kernel/relocate_kernel_32.S create mode 100644 arch/x86/kernel/scx200_32.c create mode 100644 arch/x86/kernel/setup_32.c create mode 100644 arch/x86/kernel/sigframe_32.h create mode 100644 arch/x86/kernel/signal_32.c create mode 100644 arch/x86/kernel/smp_32.c create mode 100644 arch/x86/kernel/smpboot_32.c create mode 100644 arch/x86/kernel/smpcommon_32.c create mode 100644 arch/x86/kernel/srat_32.c create mode 100644 arch/x86/kernel/summit_32.c create mode 100644 arch/x86/kernel/sys_i386_32.c create mode 100644 arch/x86/kernel/syscall_table_32.S create mode 100644 arch/x86/kernel/sysenter_32.c create mode 100644 arch/x86/kernel/time_32.c create mode 100644 arch/x86/kernel/topology.c create mode 100644 arch/x86/kernel/trampoline_32.S create mode 100644 arch/x86/kernel/traps_32.c create mode 100644 arch/x86/kernel/tsc_32.c create mode 100644 arch/x86/kernel/tsc_sync.c create mode 100644 arch/x86/kernel/vm86_32.c create mode 100644 arch/x86/kernel/vmi_32.c create mode 100644 arch/x86/kernel/vmiclock_32.c create mode 100644 arch/x86/kernel/vmlinux.lds.S create mode 100644 arch/x86/kernel/vmlinux_32.lds.S create mode 100644 arch/x86/kernel/vsyscall-int80_32.S create mode 100644 arch/x86/kernel/vsyscall-note_32.S create mode 100644 arch/x86/kernel/vsyscall-sigreturn_32.S create mode 100644 arch/x86/kernel/vsyscall-sysenter_32.S create mode 100644 arch/x86/kernel/vsyscall_32.S create mode 100644 arch/x86/kernel/vsyscall_32.lds.S (limited to 'arch/x86') diff --git a/arch/x86/kernel/.gitignore b/arch/x86/kernel/.gitignore new file mode 100644 index 000000000000..40836ad9079c --- /dev/null +++ b/arch/x86/kernel/.gitignore @@ -0,0 +1 @@ +vsyscall.lds diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile new file mode 100644 index 000000000000..577d08f4b8bb --- /dev/null +++ b/arch/x86/kernel/Makefile @@ -0,0 +1,5 @@ +ifeq ($(CONFIG_X86_32),y) +include ${srctree}/arch/x86/kernel/Makefile_32 +else +include ${srctree}/arch/x86_64/kernel/Makefile_64 +endif diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32 new file mode 100644 index 000000000000..5096f486d389 --- /dev/null +++ b/arch/x86/kernel/Makefile_32 @@ -0,0 +1,88 @@ +# +# Makefile for the linux kernel. +# + +extra-y := head_32.o init_task_32.o vmlinux.lds + +obj-y := process_32.o signal_32.o entry_32.o traps_32.o irq_32.o \ + ptrace_32.o time_32.o ioport_32.o ldt_32.o setup_32.o i8259_32.o sys_i386_32.o \ + pci-dma_32.o i386_ksyms_32.o i387_32.o bootflag.o e820_32.o\ + quirks.o i8237.o topology.o alternative.o i8253_32.o tsc_32.o + +obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-y += ../../x86/kernel/cpu/ +obj-y += ../../x86/kernel/acpi/ +obj-$(CONFIG_X86_BIOS_REBOOT) += reboot_32.o +obj-$(CONFIG_MCA) += mca_32.o +obj-$(CONFIG_X86_MSR) += msr.o +obj-$(CONFIG_X86_CPUID) += cpuid.o +obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_APM) += apm_32.o +obj-$(CONFIG_X86_SMP) += smp_32.o smpboot_32.o tsc_sync.o +obj-$(CONFIG_SMP) += smpcommon_32.o +obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_32.o +obj-$(CONFIG_X86_MPPARSE) += mpparse_32.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic_32.o nmi_32.o +obj-$(CONFIG_X86_IO_APIC) += io_apic_32.o +obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o +obj-$(CONFIG_KEXEC) += machine_kexec_32.o relocate_kernel_32.o crash_32.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump_32.o +obj-$(CONFIG_X86_NUMAQ) += numaq_32.o +obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o +obj-$(CONFIG_KPROBES) += kprobes_32.o +obj-$(CONFIG_MODULES) += module_32.o +obj-y += sysenter_32.o vsyscall_32.o +obj-$(CONFIG_ACPI_SRAT) += srat_32.o +obj-$(CONFIG_EFI) += efi_32.o efi_stub_32.o +obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o +obj-$(CONFIG_VM86) += vm86_32.o +obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_HPET_TIMER) += hpet_32.o +obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_MGEODE_LX) += geode_32.o + +obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o +obj-$(CONFIG_PARAVIRT) += paravirt_32.o +obj-y += pcspeaker.o + +obj-$(CONFIG_SCx200) += scx200_32.o + +# vsyscall_32.o contains the vsyscall DSO images as __initdata. +# We must build both images before we can assemble it. +# Note: kbuild does not track this dependency due to usage of .incbin +$(obj)/vsyscall_32.o: $(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so +targets += $(foreach F,int80 sysenter,vsyscall-$F.o vsyscall-$F.so) +targets += vsyscall-note_32.o vsyscall_32.lds + +# The DSO images are built using a special linker script. +quiet_cmd_syscall = SYSCALL $@ + cmd_syscall = $(CC) -m elf_i386 -nostdlib $(SYSCFLAGS_$(@F)) \ + -Wl,-T,$(filter-out FORCE,$^) -o $@ + +export CPPFLAGS_vsyscall_32.lds += -P -C -U$(ARCH) + +vsyscall-flags = -shared -s -Wl,-soname=linux-gate.so.1 \ + $(call ld-option, -Wl$(comma)--hash-style=sysv) +SYSCFLAGS_vsyscall-sysenter_32.so = $(vsyscall-flags) +SYSCFLAGS_vsyscall-int80_32.so = $(vsyscall-flags) + +$(obj)/vsyscall-int80_32.so $(obj)/vsyscall-sysenter_32.so: \ +$(obj)/vsyscall-%.so: $(src)/vsyscall_32.lds \ + $(obj)/vsyscall-%.o $(obj)/vsyscall-note_32.o FORCE + $(call if_changed,syscall) + +# We also create a special relocatable object that should mirror the symbol +# table and layout of the linked DSO. With ld -R we can then refer to +# these symbols in the kernel code rather than hand-coded addresses. +extra-y += vsyscall-syms.o +$(obj)/built-in.o: $(obj)/vsyscall-syms.o +$(obj)/built-in.o: ld_flags += -R $(obj)/vsyscall-syms.o + +SYSCFLAGS_vsyscall-syms.o = -r +$(obj)/vsyscall-syms.o: $(src)/vsyscall_32.lds \ + $(obj)/vsyscall-sysenter_32.o $(obj)/vsyscall-note_32.o FORCE + $(call if_changed,syscall) + +k8-y += ../../x86_64/kernel/k8.o +stacktrace-y += ../../x86_64/kernel/stacktrace.o + diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c new file mode 100644 index 000000000000..bd72d94e713e --- /dev/null +++ b/arch/x86/kernel/alternative.c @@ -0,0 +1,450 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_PATCH_LEN (255-1) + +#ifdef CONFIG_HOTPLUG_CPU +static int smp_alt_once; + +static int __init bootonly(char *str) +{ + smp_alt_once = 1; + return 1; +} +__setup("smp-alt-boot", bootonly); +#else +#define smp_alt_once 1 +#endif + +static int debug_alternative; + +static int __init debug_alt(char *str) +{ + debug_alternative = 1; + return 1; +} +__setup("debug-alternative", debug_alt); + +static int noreplace_smp; + +static int __init setup_noreplace_smp(char *str) +{ + noreplace_smp = 1; + return 1; +} +__setup("noreplace-smp", setup_noreplace_smp); + +#ifdef CONFIG_PARAVIRT +static int noreplace_paravirt = 0; + +static int __init setup_noreplace_paravirt(char *str) +{ + noreplace_paravirt = 1; + return 1; +} +__setup("noreplace-paravirt", setup_noreplace_paravirt); +#endif + +#define DPRINTK(fmt, args...) if (debug_alternative) \ + printk(KERN_DEBUG fmt, args) + +#ifdef GENERIC_NOP1 +/* Use inline assembly to define this because the nops are defined + as inline assembly strings in the include files and we cannot + get them easily into strings. */ +asm("\t.data\nintelnops: " + GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 + GENERIC_NOP7 GENERIC_NOP8); +extern unsigned char intelnops[]; +static unsigned char *intel_nops[ASM_NOP_MAX+1] = { + NULL, + intelnops, + intelnops + 1, + intelnops + 1 + 2, + intelnops + 1 + 2 + 3, + intelnops + 1 + 2 + 3 + 4, + intelnops + 1 + 2 + 3 + 4 + 5, + intelnops + 1 + 2 + 3 + 4 + 5 + 6, + intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +}; +#endif + +#ifdef K8_NOP1 +asm("\t.data\nk8nops: " + K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 + K8_NOP7 K8_NOP8); +extern unsigned char k8nops[]; +static unsigned char *k8_nops[ASM_NOP_MAX+1] = { + NULL, + k8nops, + k8nops + 1, + k8nops + 1 + 2, + k8nops + 1 + 2 + 3, + k8nops + 1 + 2 + 3 + 4, + k8nops + 1 + 2 + 3 + 4 + 5, + k8nops + 1 + 2 + 3 + 4 + 5 + 6, + k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +}; +#endif + +#ifdef K7_NOP1 +asm("\t.data\nk7nops: " + K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 + K7_NOP7 K7_NOP8); +extern unsigned char k7nops[]; +static unsigned char *k7_nops[ASM_NOP_MAX+1] = { + NULL, + k7nops, + k7nops + 1, + k7nops + 1 + 2, + k7nops + 1 + 2 + 3, + k7nops + 1 + 2 + 3 + 4, + k7nops + 1 + 2 + 3 + 4 + 5, + k7nops + 1 + 2 + 3 + 4 + 5 + 6, + k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, +}; +#endif + +#ifdef CONFIG_X86_64 + +extern char __vsyscall_0; +static inline unsigned char** find_nop_table(void) +{ + return k8_nops; +} + +#else /* CONFIG_X86_64 */ + +static struct nop { + int cpuid; + unsigned char **noptable; +} noptypes[] = { + { X86_FEATURE_K8, k8_nops }, + { X86_FEATURE_K7, k7_nops }, + { -1, NULL } +}; + +static unsigned char** find_nop_table(void) +{ + unsigned char **noptable = intel_nops; + int i; + + for (i = 0; noptypes[i].cpuid >= 0; i++) { + if (boot_cpu_has(noptypes[i].cpuid)) { + noptable = noptypes[i].noptable; + break; + } + } + return noptable; +} + +#endif /* CONFIG_X86_64 */ + +/* Use this to add nops to a buffer, then text_poke the whole buffer. */ +static void add_nops(void *insns, unsigned int len) +{ + unsigned char **noptable = find_nop_table(); + + while (len > 0) { + unsigned int noplen = len; + if (noplen > ASM_NOP_MAX) + noplen = ASM_NOP_MAX; + memcpy(insns, noptable[noplen], noplen); + insns += noplen; + len -= noplen; + } +} + +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; +extern u8 *__smp_locks[], *__smp_locks_end[]; + +/* Replace instructions with better alternatives for this CPU type. + This runs before SMP is initialized to avoid SMP problems with + self modifying code. This implies that assymetric systems where + APs have less capabilities than the boot processor are not handled. + Tough. Make sure you disable such features by hand. */ + +void apply_alternatives(struct alt_instr *start, struct alt_instr *end) +{ + struct alt_instr *a; + char insnbuf[MAX_PATCH_LEN]; + + DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end); + for (a = start; a < end; a++) { + u8 *instr = a->instr; + BUG_ON(a->replacementlen > a->instrlen); + BUG_ON(a->instrlen > sizeof(insnbuf)); + if (!boot_cpu_has(a->cpuid)) + continue; +#ifdef CONFIG_X86_64 + /* vsyscall code is not mapped yet. resolve it manually. */ + if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) { + instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0)); + DPRINTK("%s: vsyscall fixup: %p => %p\n", + __FUNCTION__, a->instr, instr); + } +#endif + memcpy(insnbuf, a->replacement, a->replacementlen); + add_nops(insnbuf + a->replacementlen, + a->instrlen - a->replacementlen); + text_poke(instr, insnbuf, a->instrlen); + } +} + +#ifdef CONFIG_SMP + +static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) +{ + u8 **ptr; + + for (ptr = start; ptr < end; ptr++) { + if (*ptr < text) + continue; + if (*ptr > text_end) + continue; + text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */ + }; +} + +static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) +{ + u8 **ptr; + char insn[1]; + + if (noreplace_smp) + return; + + add_nops(insn, 1); + for (ptr = start; ptr < end; ptr++) { + if (*ptr < text) + continue; + if (*ptr > text_end) + continue; + text_poke(*ptr, insn, 1); + }; +} + +struct smp_alt_module { + /* what is this ??? */ + struct module *mod; + char *name; + + /* ptrs to lock prefixes */ + u8 **locks; + u8 **locks_end; + + /* .text segment, needed to avoid patching init code ;) */ + u8 *text; + u8 *text_end; + + struct list_head next; +}; +static LIST_HEAD(smp_alt_modules); +static DEFINE_SPINLOCK(smp_alt); + +void alternatives_smp_module_add(struct module *mod, char *name, + void *locks, void *locks_end, + void *text, void *text_end) +{ + struct smp_alt_module *smp; + unsigned long flags; + + if (noreplace_smp) + return; + + if (smp_alt_once) { + if (boot_cpu_has(X86_FEATURE_UP)) + alternatives_smp_unlock(locks, locks_end, + text, text_end); + return; + } + + smp = kzalloc(sizeof(*smp), GFP_KERNEL); + if (NULL == smp) + return; /* we'll run the (safe but slow) SMP code then ... */ + + smp->mod = mod; + smp->name = name; + smp->locks = locks; + smp->locks_end = locks_end; + smp->text = text; + smp->text_end = text_end; + DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", + __FUNCTION__, smp->locks, smp->locks_end, + smp->text, smp->text_end, smp->name); + + spin_lock_irqsave(&smp_alt, flags); + list_add_tail(&smp->next, &smp_alt_modules); + if (boot_cpu_has(X86_FEATURE_UP)) + alternatives_smp_unlock(smp->locks, smp->locks_end, + smp->text, smp->text_end); + spin_unlock_irqrestore(&smp_alt, flags); +} + +void alternatives_smp_module_del(struct module *mod) +{ + struct smp_alt_module *item; + unsigned long flags; + + if (smp_alt_once || noreplace_smp) + return; + + spin_lock_irqsave(&smp_alt, flags); + list_for_each_entry(item, &smp_alt_modules, next) { + if (mod != item->mod) + continue; + list_del(&item->next); + spin_unlock_irqrestore(&smp_alt, flags); + DPRINTK("%s: %s\n", __FUNCTION__, item->name); + kfree(item); + return; + } + spin_unlock_irqrestore(&smp_alt, flags); +} + +void alternatives_smp_switch(int smp) +{ + struct smp_alt_module *mod; + unsigned long flags; + +#ifdef CONFIG_LOCKDEP + /* + * A not yet fixed binutils section handling bug prevents + * alternatives-replacement from working reliably, so turn + * it off: + */ + printk("lockdep: not fixing up alternatives.\n"); + return; +#endif + + if (noreplace_smp || smp_alt_once) + return; + BUG_ON(!smp && (num_online_cpus() > 1)); + + spin_lock_irqsave(&smp_alt, flags); + if (smp) { + printk(KERN_INFO "SMP alternatives: switching to SMP code\n"); + clear_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); + clear_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); + list_for_each_entry(mod, &smp_alt_modules, next) + alternatives_smp_lock(mod->locks, mod->locks_end, + mod->text, mod->text_end); + } else { + printk(KERN_INFO "SMP alternatives: switching to UP code\n"); + set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); + set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); + list_for_each_entry(mod, &smp_alt_modules, next) + alternatives_smp_unlock(mod->locks, mod->locks_end, + mod->text, mod->text_end); + } + spin_unlock_irqrestore(&smp_alt, flags); +} + +#endif + +#ifdef CONFIG_PARAVIRT +void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) +{ + struct paravirt_patch_site *p; + char insnbuf[MAX_PATCH_LEN]; + + if (noreplace_paravirt) + return; + + for (p = start; p < end; p++) { + unsigned int used; + + BUG_ON(p->len > MAX_PATCH_LEN); + /* prep the buffer with the original instructions */ + memcpy(insnbuf, p->instr, p->len); + used = paravirt_ops.patch(p->instrtype, p->clobbers, insnbuf, + (unsigned long)p->instr, p->len); + + BUG_ON(used > p->len); + + /* Pad the rest with nops */ + add_nops(insnbuf + used, p->len - used); + text_poke(p->instr, insnbuf, p->len); + } +} +extern struct paravirt_patch_site __start_parainstructions[], + __stop_parainstructions[]; +#endif /* CONFIG_PARAVIRT */ + +void __init alternative_instructions(void) +{ + unsigned long flags; + + /* The patching is not fully atomic, so try to avoid local interruptions + that might execute the to be patched code. + Other CPUs are not running. */ + stop_nmi(); +#ifdef CONFIG_X86_MCE + stop_mce(); +#endif + + local_irq_save(flags); + apply_alternatives(__alt_instructions, __alt_instructions_end); + + /* switch to patch-once-at-boottime-only mode and free the + * tables in case we know the number of CPUs will never ever + * change */ +#ifdef CONFIG_HOTPLUG_CPU + if (num_possible_cpus() < 2) + smp_alt_once = 1; +#endif + +#ifdef CONFIG_SMP + if (smp_alt_once) { + if (1 == num_possible_cpus()) { + printk(KERN_INFO "SMP alternatives: switching to UP code\n"); + set_bit(X86_FEATURE_UP, boot_cpu_data.x86_capability); + set_bit(X86_FEATURE_UP, cpu_data[0].x86_capability); + alternatives_smp_unlock(__smp_locks, __smp_locks_end, + _text, _etext); + } + free_init_pages("SMP alternatives", + (unsigned long)__smp_locks, + (unsigned long)__smp_locks_end); + } else { + alternatives_smp_module_add(NULL, "core kernel", + __smp_locks, __smp_locks_end, + _text, _etext); + alternatives_smp_switch(0); + } +#endif + apply_paravirt(__parainstructions, __parainstructions_end); + local_irq_restore(flags); + + restart_nmi(); +#ifdef CONFIG_X86_MCE + restart_mce(); +#endif +} + +/* + * Warning: + * When you use this code to patch more than one byte of an instruction + * you need to make sure that other CPUs cannot execute this code in parallel. + * Also no thread must be currently preempted in the middle of these instructions. + * And on the local CPU you need to be protected again NMI or MCE handlers + * seeing an inconsistent instruction while you patch. + */ +void __kprobes text_poke(void *addr, unsigned char *opcode, int len) +{ + memcpy(addr, opcode, len); + sync_core(); + /* Could also do a CLFLUSH here to speed up CPU recovery; but + that causes hangs on some VIA CPUs. */ +} diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c new file mode 100644 index 000000000000..3d67ae18d762 --- /dev/null +++ b/arch/x86/kernel/apic_32.c @@ -0,0 +1,1566 @@ +/* + * Local APIC handling, local APIC timers + * + * (c) 1999, 2000 Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively. + * Maciej W. Rozycki : Various updates and fixes. + * Mikael Pettersson : Power Management for UP-APIC. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "io_ports.h" + +/* + * Sanity check + */ +#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F +# error SPURIOUS_APIC_VECTOR definition error +#endif + +/* + * Knob to control our willingness to enable the local APIC. + * + * -1=force-disable, +1=force-enable + */ +static int enable_local_apic __initdata = 0; + +/* Local APIC timer verification ok */ +static int local_apic_timer_verify_ok; +/* Disable local APIC timer from the kernel commandline or via dmi quirk + or using CPU MSR check */ +int local_apic_timer_disabled; +/* Local APIC timer works in C2 */ +int local_apic_timer_c2_ok; +EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); + +/* + * Debug level, exported for io_apic.c + */ +int apic_verbosity; + +static unsigned int calibration_result; + +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt); +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt); +static void lapic_timer_broadcast(cpumask_t mask); +static void apic_pm_activate(void); + +/* + * The local apic timer can be used for any function which is CPU local. + */ +static struct clock_event_device lapic_clockevent = { + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT + | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, +}; +static DEFINE_PER_CPU(struct clock_event_device, lapic_events); + +/* Local APIC was disabled by the BIOS and enabled by the kernel */ +static int enabled_via_apicbase; + +/* + * Get the LAPIC version + */ +static inline int lapic_get_version(void) +{ + return GET_APIC_VERSION(apic_read(APIC_LVR)); +} + +/* + * Check, if the APIC is integrated or a seperate chip + */ +static inline int lapic_is_integrated(void) +{ + return APIC_INTEGRATED(lapic_get_version()); +} + +/* + * Check, whether this is a modern or a first generation APIC + */ +static int modern_apic(void) +{ + /* AMD systems use old APIC versions, so check the CPU */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 0xf) + return 1; + return lapic_get_version() >= 0x14; +} + +void apic_wait_icr_idle(void) +{ + while (apic_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +unsigned long safe_apic_wait_icr_idle(void) +{ + unsigned long send_status; + int timeout; + + timeout = 0; + do { + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + if (!send_status) + break; + udelay(100); + } while (timeout++ < 1000); + + return send_status; +} + +/** + * enable_NMI_through_LVT0 - enable NMI through local vector table 0 + */ +void enable_NMI_through_LVT0 (void * dummy) +{ + unsigned int v = APIC_DM_NMI; + + /* Level triggered for 82489DX */ + if (!lapic_is_integrated()) + v |= APIC_LVT_LEVEL_TRIGGER; + apic_write_around(APIC_LVT0, v); +} + +/** + * get_physical_broadcast - Get number of physical broadcast IDs + */ +int get_physical_broadcast(void) +{ + return modern_apic() ? 0xff : 0xf; +} + +/** + * lapic_get_maxlvt - get the maximum number of local vector table entries + */ +int lapic_get_maxlvt(void) +{ + unsigned int v = apic_read(APIC_LVR); + + /* 82489DXs do not report # of LVT entries. */ + return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; +} + +/* + * Local APIC timer + */ + +/* Clock divisor is set to 16 */ +#define APIC_DIVISOR 16 + +/* + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. + */ +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) +{ + unsigned int lvtt_value, tmp_value; + + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (!lapic_is_integrated()) + lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + + if (!irqen) + lvtt_value |= APIC_LVT_MASKED; + + apic_write_around(APIC_LVTT, lvtt_value); + + /* + * Divide PICLK by 16 + */ + tmp_value = apic_read(APIC_TDCR); + apic_write_around(APIC_TDCR, (tmp_value + & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) + | APIC_TDR_DIV_16); + + if (!oneshot) + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); +} + +/* + * Program the next event, relative to now + */ +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + apic_write_around(APIC_TMICT, delta); + return 0; +} + +/* + * Setup the lapic timer in periodic or oneshot mode + */ +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long flags; + unsigned int v; + + /* Lapic used for broadcast ? */ + if (!local_apic_timer_verify_ok) + return; + + local_irq_save(flags); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + __setup_APIC_LVTT(calibration_result, + mode != CLOCK_EVT_MODE_PERIODIC, 1); + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + v = apic_read(APIC_LVTT); + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write_around(APIC_LVTT, v); + break; + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ + break; + } + + local_irq_restore(flags); +} + +/* + * Local APIC timer broadcast function + */ +static void lapic_timer_broadcast(cpumask_t mask) +{ +#ifdef CONFIG_SMP + send_IPI_mask(mask, LOCAL_TIMER_VECTOR); +#endif +} + +/* + * Setup the local APIC timer for this CPU. Copy the initilized values + * of the boot CPU and register the clock event in the framework. + */ +static void __devinit setup_APIC_timer(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); + + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + levt->cpumask = cpumask_of_cpu(smp_processor_id()); + + clockevents_register_device(levt); +} + +/* + * In this functions we calibrate APIC bus clocks to the external timer. + * + * We want to do the calibration only once since we want to have local timer + * irqs syncron. CPUs connected by the same APIC bus have the very same bus + * frequency. + * + * This was previously done by reading the PIT/HPET and waiting for a wrap + * around to find out, that a tick has elapsed. I have a box, where the PIT + * readout is broken, so it never gets out of the wait loop again. This was + * also reported by others. + * + * Monitoring the jiffies value is inaccurate and the clockevents + * infrastructure allows us to do a simple substitution of the interrupt + * handler. + * + * The calibration routine also uses the pm_timer when possible, as the PIT + * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes + * back to normal later in the boot process). + */ + +#define LAPIC_CAL_LOOPS (HZ/10) + +static __initdata int lapic_cal_loops = -1; +static __initdata long lapic_cal_t1, lapic_cal_t2; +static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; +static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; +static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; + +/* + * Temporary interrupt handler. + */ +static void __init lapic_cal_handler(struct clock_event_device *dev) +{ + unsigned long long tsc = 0; + long tapic = apic_read(APIC_TMCCT); + unsigned long pm = acpi_pm_read_early(); + + if (cpu_has_tsc) + rdtscll(tsc); + + switch (lapic_cal_loops++) { + case 0: + lapic_cal_t1 = tapic; + lapic_cal_tsc1 = tsc; + lapic_cal_pm1 = pm; + lapic_cal_j1 = jiffies; + break; + + case LAPIC_CAL_LOOPS: + lapic_cal_t2 = tapic; + lapic_cal_tsc2 = tsc; + if (pm < lapic_cal_pm1) + pm += ACPI_PM_OVRRUN; + lapic_cal_pm2 = pm; + lapic_cal_j2 = jiffies; + break; + } +} + +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); + const long pm_100ms = PMTMR_TICKS_PER_SEC/10; + const long pm_thresh = pm_100ms/100; + void (*real_handler)(struct clock_event_device *dev); + unsigned long deltaj; + long delta, deltapm; + int pm_referenced = 0; + + /* + * The local apic timer can be disabled via the kernel + * commandline or from the CPU detection code. Register the lapic + * timer as a dummy clock event source on SMP systems, so the + * broadcast mechanism is used. On UP systems simply ignore it. + */ + if (local_apic_timer_disabled) { + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; + } + + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + + local_irq_disable(); + + /* Replace the global interrupt handler */ + real_handler = global_clock_event->event_handler; + global_clock_event->event_handler = lapic_cal_handler; + + /* + * Setup the APIC counter to 1e9. There is no way the lapic + * can underflow in the 100ms detection time frame + */ + __setup_APIC_LVTT(1000000000, 0, 0); + + /* Let the interrupts run */ + local_irq_enable(); + + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); + + local_irq_disable(); + + /* Restore the real event handler */ + global_clock_event->event_handler = real_handler; + + /* Build delta t1-t2 as apic timer counts down */ + delta = lapic_cal_t1 - lapic_cal_t2; + apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); + + /* Check, if the PM timer is available */ + deltapm = lapic_cal_pm2 - lapic_cal_pm1; + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + if (deltapm) { + unsigned long mult; + u64 res; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + } else { + res = (((u64) deltapm) * mult) >> 22; + do_div(res, 1000000); + printk(KERN_WARNING "APIC calibration not consistent " + "with PM Timer: %ldms instead of 100ms\n", + (long)res); + /* Correct the lapic counter value */ + res = (((u64) delta ) * pm_100ms); + do_div(res, deltapm); + printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long) res, delta); + delta = (long) res; + } + pm_referenced = 1; + } + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + + apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); + apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", + calibration_result); + + if (cpu_has_tsc) { + delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); + apic_printk(APIC_VERBOSE, "..... CPU clock speed is " + "%ld.%04ld MHz.\n", + (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), + (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + } + + apic_printk(APIC_VERBOSE, "..... host bus clock speed is " + "%u.%04u MHz.\n", + calibration_result / (1000000 / HZ), + calibration_result % (1000000 / HZ)); + + local_apic_timer_verify_ok = 1; + + /* We trust the pm timer based calibration */ + if (!pm_referenced) { + apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); + + /* + * Setup the apic timer manually + */ + levt->event_handler = lapic_cal_handler; + lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); + lapic_cal_loops = -1; + + /* Let the interrupts run */ + local_irq_enable(); + + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); + + local_irq_disable(); + + /* Stop the lapic timer */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); + + local_irq_enable(); + + /* Jiffies delta */ + deltaj = lapic_cal_j2 - lapic_cal_j1; + apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + + /* Check, if the jiffies result is consistent */ + if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) + apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); + else + local_apic_timer_verify_ok = 0; + } else + local_irq_enable(); + + if (!local_apic_timer_verify_ok) { + printk(KERN_WARNING + "APIC timer disabled due to verification failure.\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() == 1) + return; + } else { + /* + * If nmi_watchdog is set to IO_APIC, we need the + * PIT/HPET going. Otherwise register lapic as a dummy + * device. + */ + if (nmi_watchdog != NMI_IO_APIC) + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=1!\n"); + } + + /* Setup the lapic or request the broadcast */ + setup_APIC_timer(); +} + +void __devinit setup_secondary_APIC_clock(void) +{ + setup_APIC_timer(); +} + +/* + * The guts of the apic timer interrupt + */ +static void local_apic_timer_interrupt(void) +{ + int cpu = smp_processor_id(); + struct clock_event_device *evt = &per_cpu(lapic_events, cpu); + + /* + * Normally we should not be here till LAPIC has been initialized but + * in some cases like kdump, its possible that there is a pending LAPIC + * timer interrupt from previous kernel's context and is delivered in + * new kernel the moment interrupts are enabled. + * + * Interrupts are enabled early and LAPIC is setup much later, hence + * its possible that when we get here evt->event_handler is NULL. + * Check for event_handler being NULL and discard the interrupt as + * spurious. + */ + if (!evt->event_handler) { + printk(KERN_WARNING + "Spurious LAPIC timer interrupt on cpu %d\n", cpu); + /* Switch it off */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); + return; + } + + per_cpu(irq_stat, cpu).apic_timer_irqs++; + + evt->event_handler(evt); +} + +/* + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] + */ + +void fastcall smp_apic_timer_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + */ + ack_APIC_irq(); + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ + irq_enter(); + local_apic_timer_interrupt(); + irq_exit(); + + set_irq_regs(old_regs); +} + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +/* + * Local APIC start and shutdown + */ + +/** + * clear_local_APIC - shutdown the local APIC + * + * This is called, when a CPU is disabled and before rebooting, so the state of + * the local APIC has no dangling leftovers. Also used to cleanout any BIOS + * leftovers during boot. + */ +void clear_local_APIC(void) +{ + int maxlvt = lapic_get_maxlvt(); + unsigned long v; + + /* + * Masking an LVT entry can trigger a local APIC error + * if the vector is zero. Mask LVTERR first to prevent this. + */ + if (maxlvt >= 3) { + v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ + apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); + } + /* + * Careful: we have to set masks only first to deassert + * any level-triggered sources. + */ + v = apic_read(APIC_LVTT); + apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT1); + apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); + if (maxlvt >= 4) { + v = apic_read(APIC_LVTPC); + apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); + } + + /* lets not touch this if we didn't frob it */ +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) { + v = apic_read(APIC_LVTTHMR); + apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); + } +#endif + /* + * Clean APIC state for other OSs: + */ + apic_write_around(APIC_LVTT, APIC_LVT_MASKED); + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + apic_write_around(APIC_LVT1, APIC_LVT_MASKED); + if (maxlvt >= 3) + apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); + if (maxlvt >= 4) + apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); + +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) + apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); +#endif + /* Integrated APIC (!82489DX) ? */ + if (lapic_is_integrated()) { + if (maxlvt > 3) + /* Clear ESR due to Pentium errata 3AP and 11AP */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } +} + +/** + * disable_local_APIC - clear and disable the local APIC + */ +void disable_local_APIC(void) +{ + unsigned long value; + + clear_local_APIC(); + + /* + * Disable APIC (implies clearing of registers + * for 82489DX!). + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_SPIV_APIC_ENABLED; + apic_write_around(APIC_SPIV, value); + + /* + * When LAPIC was disabled by the BIOS and enabled by the kernel, + * restore the disabled state. + */ + if (enabled_via_apicbase) { + unsigned int l, h; + + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_ENABLE; + wrmsr(MSR_IA32_APICBASE, l, h); + } +} + +/* + * If Linux enabled the LAPIC against the BIOS default disable it down before + * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and + * not power-off. Additionally clear all LVT entries before disable_local_APIC + * for the case where Linux didn't enable the LAPIC. + */ +void lapic_shutdown(void) +{ + unsigned long flags; + + if (!cpu_has_apic) + return; + + local_irq_save(flags); + clear_local_APIC(); + + if (enabled_via_apicbase) + disable_local_APIC(); + + local_irq_restore(flags); +} + +/* + * This is to verify that we're looking at a real local APIC. + * Check these against your board if the CPUs aren't getting + * started for no apparent reason. + */ +int __init verify_local_APIC(void) +{ + unsigned int reg0, reg1; + + /* + * The version register is read-only in a real APIC. + */ + reg0 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); + apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); + reg1 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); + + /* + * The two version reads above should print the same + * numbers. If the second one is different, then we + * poke at a non-APIC. + */ + if (reg1 != reg0) + return 0; + + /* + * Check if the version looks reasonably. + */ + reg1 = GET_APIC_VERSION(reg0); + if (reg1 == 0x00 || reg1 == 0xff) + return 0; + reg1 = lapic_get_maxlvt(); + if (reg1 < 0x02 || reg1 == 0xff) + return 0; + + /* + * The ID register is read/write in a real APIC. + */ + reg0 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); + + /* + * The next two are just to see if we have sane values. + * They're only really relevant if we're in Virtual Wire + * compatibility mode, but most boxes are anymore. + */ + reg0 = apic_read(APIC_LVT0); + apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); + reg1 = apic_read(APIC_LVT1); + apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); + + return 1; +} + +/** + * sync_Arb_IDs - synchronize APIC bus arbitration IDs + */ +void __init sync_Arb_IDs(void) +{ + /* + * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not + * needed on AMD. + */ + if (modern_apic()) + return; + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); + apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG + | APIC_DM_INIT); +} + +/* + * An initial setup of the virtual wire mode. + */ +void __init init_bsp_APIC(void) +{ + unsigned long value; + + /* + * Don't do the setup now if we have a SMP BIOS as the + * through-I/O-APIC virtual wire mode might be active. + */ + if (smp_found_config || !cpu_has_apic) + return; + + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + + /* + * Enable APIC. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + + /* This bit is reserved on P4/Xeon and should be cleared */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + (boot_cpu_data.x86 == 15)) + value &= ~APIC_SPIV_FOCUS_DISABLED; + else + value |= APIC_SPIV_FOCUS_DISABLED; + value |= SPURIOUS_APIC_VECTOR; + apic_write_around(APIC_SPIV, value); + + /* + * Set up the virtual wire mode. + */ + apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + value = APIC_DM_NMI; + if (!lapic_is_integrated()) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + apic_write_around(APIC_LVT1, value); +} + +/** + * setup_local_APIC - setup the local APIC + */ +void __devinit setup_local_APIC(void) +{ + unsigned long oldvalue, value, maxlvt, integrated; + int i, j; + + /* Pound the ESR really hard over the head with a big hammer - mbligh */ + if (esr_disable) { + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + } + + integrated = lapic_is_integrated(); + + /* + * Double-check whether this APIC is really registered. + */ + if (!apic_id_registered()) + BUG(); + + /* + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ + init_apic_ldr(); + + /* + * Set Task Priority to 'accept all'. We never change this + * later on. + */ + value = apic_read(APIC_TASKPRI); + value &= ~APIC_TPRI_MASK; + apic_write_around(APIC_TASKPRI, value); + + /* + * After a crash, we no longer service the interrupts and a pending + * interrupt from previous kernel might still have ISR bit set. + * + * Most probably by now CPU has serviced that pending interrupt and + * it might not have done the ack_APIC_irq() because it thought, + * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it + * does not clear the ISR bit and cpu thinks it has already serivced + * the interrupt. Hence a vector might get locked. It was noticed + * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. + */ + for (i = APIC_ISR_NR - 1; i >= 0; i--) { + value = apic_read(APIC_ISR + i*0x10); + for (j = 31; j >= 0; j--) { + if (value & (1< 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + oldvalue = apic_read(APIC_ESR); + + /* enables sending errors */ + value = ERROR_APIC_VECTOR; + apic_write_around(APIC_LVTERR, value); + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, "ESR value before enabling " + "vector: 0x%08lx after: 0x%08lx\n", + oldvalue, value); + } else { + if (esr_disable) + /* + * Something untraceble is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh + */ + printk(KERN_INFO "Leaving ESR disabled.\n"); + else + printk(KERN_INFO "No ESR for 82489DX.\n"); + } + + /* Disable the local apic timer */ + value = apic_read(APIC_LVTT); + value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write_around(APIC_LVTT, value); + + setup_apic_nmi_watchdog(NULL); + apic_pm_activate(); +} + +/* + * Detect and initialize APIC + */ +static int __init detect_init_APIC (void) +{ + u32 h, l, features; + + /* Disabled by kernel option? */ + if (enable_local_apic < 0) + return -1; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || + (boot_cpu_data.x86 == 15)) + break; + goto no_apic; + case X86_VENDOR_INTEL: + if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || + (boot_cpu_data.x86 == 5 && cpu_has_apic)) + break; + goto no_apic; + default: + goto no_apic; + } + + if (!cpu_has_apic) { + /* + * Over-ride BIOS and try to enable the local APIC only if + * "lapic" specified. + */ + if (enable_local_apic <= 0) { + printk(KERN_INFO "Local APIC disabled by BIOS -- " + "you can enable it with \"lapic\"\n"); + return -1; + } + /* + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. + */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (!(l & MSR_IA32_APICBASE_ENABLE)) { + printk(KERN_INFO + "Local APIC disabled by BIOS -- reenabling.\n"); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; + wrmsr(MSR_IA32_APICBASE, l, h); + enabled_via_apicbase = 1; + } + } + /* + * The APIC feature bit should now be enabled + * in `cpuid' + */ + features = cpuid_edx(1); + if (!(features & (1 << X86_FEATURE_APIC))) { + printk(KERN_WARNING "Could not enable APIC!\n"); + return -1; + } + set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* The BIOS may have set up the APIC at some other address */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (l & MSR_IA32_APICBASE_ENABLE) + mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + + if (nmi_watchdog != NMI_NONE && nmi_watchdog != NMI_DISABLED) + nmi_watchdog = NMI_LOCAL_APIC; + + printk(KERN_INFO "Found and enabled local APIC!\n"); + + apic_pm_activate(); + + return 0; + +no_apic: + printk(KERN_INFO "No local APIC present or hardware disabled\n"); + return -1; +} + +/** + * init_apic_mappings - initialize APIC mappings + */ +void __init init_apic_mappings(void) +{ + unsigned long apic_phys; + + /* + * If no local APIC can be found then set up a fake all + * zeroes page to simulate the local APIC and another + * one for the IO-APIC. + */ + if (!smp_found_config && detect_init_APIC()) { + apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + apic_phys = __pa(apic_phys); + } else + apic_phys = mp_lapic_addr; + + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE, + apic_phys); + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); + +#ifdef CONFIG_X86_IO_APIC + { + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + int i; + + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mp_ioapics[i].mpc_apicaddr; + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } + } else { +fake_ioapic_page: + ioapic_phys = (unsigned long) + alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx), ioapic_phys); + idx++; + } + } +#endif +} + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int __init APIC_init_uniprocessor (void) +{ + if (enable_local_apic < 0) + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + + if (!smp_found_config && !cpu_has_apic) + return -1; + + /* + * Complain if the BIOS pretends there is one. + */ + if (!cpu_has_apic && + APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + return -1; + } + + verify_local_APIC(); + + connect_bsp_APIC(); + + /* + * Hack: In case of kdump, after a crash, kernel might be booting + * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid + * might be zero if read from MP tables. Get it from LAPIC. + */ +#ifdef CONFIG_CRASH_DUMP + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); +#endif + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); + + setup_local_APIC(); + +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config) + if (!skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +#endif + setup_boot_clock(); + + return 0; +} + +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + enable_local_apic = 1; + return 0; +} +early_param("lapic", parse_lapic); + +static int __init parse_nolapic(char *arg) +{ + enable_local_apic = -1; + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + return 0; +} +early_param("nolapic", parse_nolapic); + +static int __init parse_disable_lapic_timer(char *arg) +{ + local_apic_timer_disabled = 1; + return 0; +} +early_param("nolapic_timer", parse_disable_lapic_timer); + +static int __init parse_lapic_timer_c2_ok(char *arg) +{ + local_apic_timer_c2_ok = 1; + return 0; +} +early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); + +static int __init apic_set_verbosity(char *str) +{ + if (strcmp("debug", str) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", str) == 0) + apic_verbosity = APIC_VERBOSE; + return 1; +} + +__setup("apic=", apic_set_verbosity); + + +/* + * Local APIC interrupts + */ + +/* + * This interrupt should _never_ happen with our APIC/SMP architecture + */ +void smp_spurious_interrupt(struct pt_regs *regs) +{ + unsigned long v; + + irq_enter(); + /* + * Check if this really is a spurious interrupt and ACK it + * if it is a vectored one. Just in case... + * Spurious interrupts should not be ACKed. + */ + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) + ack_APIC_irq(); + + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " + "should never happen.\n", smp_processor_id()); + irq_exit(); +} + +/* + * This interrupt should never happen with our APIC/SMP architecture + */ +void smp_error_interrupt(struct pt_regs *regs) +{ + unsigned long v, v1; + + irq_enter(); + /* First tickle the hardware, only then report what went on. -- REW */ + v = apic_read(APIC_ESR); + apic_write(APIC_ESR, 0); + v1 = apic_read(APIC_ESR); + ack_APIC_irq(); + atomic_inc(&irq_err_count); + + /* Here is what the APIC error bits mean: + 0: Send CS error + 1: Receive CS error + 2: Send accept error + 3: Receive accept error + 4: Reserved + 5: Send illegal vector + 6: Received illegal vector + 7: Illegal register address + */ + printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", + smp_processor_id(), v , v1); + irq_exit(); +} + +/* + * Initialize APIC interrupts + */ +void __init apic_intr_init(void) +{ +#ifdef CONFIG_SMP + smp_intr_init(); +#endif + /* self generated IPI for local APIC timer */ + set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* IPI vectors for APIC spurious and error interrupts */ + set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + /* thermal monitor LVT interrupt */ +#ifdef CONFIG_X86_MCE_P4THERMAL + set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif +} + +/** + * connect_bsp_APIC - attach the APIC to the interrupt system + */ +void __init connect_bsp_APIC(void) +{ + if (pic_mode) { + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + /* + * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's + * local APIC to INT and NMI lines. + */ + apic_printk(APIC_VERBOSE, "leaving PIC mode, " + "enabling APIC mode.\n"); + outb(0x70, 0x22); + outb(0x01, 0x23); + } + enable_apic_mode(); +} + +/** + * disconnect_bsp_APIC - detach the APIC from the interrupt system + * @virt_wire_setup: indicates, whether virtual wire mode is selected + * + * Virtual wire mode is necessary to deliver legacy interrupts even when the + * APIC is disabled. + */ +void disconnect_bsp_APIC(int virt_wire_setup) +{ + if (pic_mode) { + /* + * Put the board back into PIC mode (has an effect only on + * certain older boards). Note that APIC interrupts, including + * IPIs, won't work beyond this point! The only exception are + * INIT IPIs. + */ + apic_printk(APIC_VERBOSE, "disabling APIC mode, " + "entering PIC mode.\n"); + outb(0x70, 0x22); + outb(0x00, 0x23); + } else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* + * For LVT0 make it edge triggered, active high, + * external and enabled + */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write_around(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + } + + /* + * For LVT1 make it edge triggered, active high, nmi and + * enabled + */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } +} + +/* + * Power management + */ +#ifdef CONFIG_PM + +static struct { + int active; + /* r/w apic fields */ + unsigned int apic_id; + unsigned int apic_taskpri; + unsigned int apic_ldr; + unsigned int apic_dfr; + unsigned int apic_spiv; + unsigned int apic_lvtt; + unsigned int apic_lvtpc; + unsigned int apic_lvt0; + unsigned int apic_lvt1; + unsigned int apic_lvterr; + unsigned int apic_tmict; + unsigned int apic_tdcr; + unsigned int apic_thmr; +} apic_pm_state; + +static int lapic_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = lapic_get_maxlvt(); + + apic_pm_state.apic_id = apic_read(APIC_ID); + apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); + apic_pm_state.apic_ldr = apic_read(APIC_LDR); + apic_pm_state.apic_dfr = apic_read(APIC_DFR); + apic_pm_state.apic_spiv = apic_read(APIC_SPIV); + apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); + if (maxlvt >= 4) + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); + apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); + apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); + apic_pm_state.apic_tmict = apic_read(APIC_TMICT); + apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); +#endif + + local_irq_save(flags); + disable_local_APIC(); + local_irq_restore(flags); + return 0; +} + +static int lapic_resume(struct sys_device *dev) +{ + unsigned int l, h; + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = lapic_get_maxlvt(); + + local_irq_save(flags); + + /* + * Make sure the APICBASE points to the right address + * + * FIXME! This will be wrong if we ever support suspend on + * SMP! We'll need to do this as part of the CPU restore! + */ + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); + apic_write(APIC_ID, apic_pm_state.apic_id); + apic_write(APIC_DFR, apic_pm_state.apic_dfr); + apic_write(APIC_LDR, apic_pm_state.apic_ldr); + apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); + apic_write(APIC_SPIV, apic_pm_state.apic_spiv); + apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); + apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); +#endif + if (maxlvt >= 4) + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); + apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); + apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); + apic_write(APIC_TMICT, apic_pm_state.apic_tmict); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + local_irq_restore(flags); + return 0; +} + +/* + * This device has no shutdown method - fully functioning local APICs + * are needed on every CPU up until machine_halt/restart/poweroff. + */ + +static struct sysdev_class lapic_sysclass = { + set_kset_name("lapic"), + .resume = lapic_resume, + .suspend = lapic_suspend, +}; + +static struct sys_device device_lapic = { + .id = 0, + .cls = &lapic_sysclass, +}; + +static void __devinit apic_pm_activate(void) +{ + apic_pm_state.active = 1; +} + +static int __init init_lapic_sysfs(void) +{ + int error; + + if (!cpu_has_apic) + return 0; + /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + + error = sysdev_class_register(&lapic_sysclass); + if (!error) + error = sysdev_register(&device_lapic); + return error; +} +device_initcall(init_lapic_sysfs); + +#else /* CONFIG_PM */ + +static void apic_pm_activate(void) { } + +#endif /* CONFIG_PM */ diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c new file mode 100644 index 000000000000..f02a8aca826b --- /dev/null +++ b/arch/x86/kernel/apm_32.c @@ -0,0 +1,2403 @@ +/* -*- linux-c -*- + * APM BIOS driver for Linux + * Copyright 1994-2001 Stephen Rothwell (sfr@canb.auug.org.au) + * + * Initial development of this driver was funded by NEC Australia P/L + * and NEC Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2, or (at your option) any + * later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * October 1995, Rik Faith (faith@cs.unc.edu): + * Minor enhancements and updates (to the patch set) for 1.3.x + * Documentation + * January 1996, Rik Faith (faith@cs.unc.edu): + * Make /proc/apm easy to format (bump driver version) + * March 1996, Rik Faith (faith@cs.unc.edu): + * Prohibit APM BIOS calls unless apm_enabled. + * (Thanks to Ulrich Windl ) + * April 1996, Stephen Rothwell (sfr@canb.auug.org.au) + * Version 1.0 and 1.1 + * May 1996, Version 1.2 + * Feb 1998, Version 1.3 + * Feb 1998, Version 1.4 + * Aug 1998, Version 1.5 + * Sep 1998, Version 1.6 + * Nov 1998, Version 1.7 + * Jan 1999, Version 1.8 + * Jan 1999, Version 1.9 + * Oct 1999, Version 1.10 + * Nov 1999, Version 1.11 + * Jan 2000, Version 1.12 + * Feb 2000, Version 1.13 + * Nov 2000, Version 1.14 + * Oct 2001, Version 1.15 + * Jan 2002, Version 1.16 + * Oct 2002, Version 1.16ac + * + * History: + * 0.6b: first version in official kernel, Linux 1.3.46 + * 0.7: changed /proc/apm format, Linux 1.3.58 + * 0.8: fixed gcc 2.7.[12] compilation problems, Linux 1.3.59 + * 0.9: only call bios if bios is present, Linux 1.3.72 + * 1.0: use fixed device number, consolidate /proc/apm into this file, + * Linux 1.3.85 + * 1.1: support user-space standby and suspend, power off after system + * halted, Linux 1.3.98 + * 1.2: When resetting RTC after resume, take care so that the time + * is only incorrect by 30-60mS (vs. 1S previously) (Gabor J. Toth + * ); improve interaction between + * screen-blanking and gpm (Stephen Rothwell); Linux 1.99.4 + * 1.2a:Simple change to stop mysterious bug reports with SMP also added + * levels to the printk calls. APM is not defined for SMP machines. + * The new replacment for it is, but Linux doesn't yet support this. + * Alan Cox Linux 2.1.55 + * 1.3: Set up a valid data descriptor 0x40 for buggy BIOS's + * 1.4: Upgraded to support APM 1.2. Integrated ThinkPad suspend patch by + * Dean Gaudet . + * C. Scott Ananian Linux 2.1.87 + * 1.5: Fix segment register reloading (in case of bad segments saved + * across BIOS call). + * Stephen Rothwell + * 1.6: Cope with complier/assembler differences. + * Only try to turn off the first display device. + * Fix OOPS at power off with no APM BIOS by Jan Echternach + * + * Stephen Rothwell + * 1.7: Modify driver's cached copy of the disabled/disengaged flags + * to reflect current state of APM BIOS. + * Chris Rankin + * Reset interrupt 0 timer to 100Hz after suspend + * Chad Miller + * Add CONFIG_APM_IGNORE_SUSPEND_BOUNCE + * Richard Gooch + * Allow boot time disabling of APM + * Make boot messages far less verbose by default + * Make asm safer + * Stephen Rothwell + * 1.8: Add CONFIG_APM_RTC_IS_GMT + * Richard Gooch + * change APM_NOINTS to CONFIG_APM_ALLOW_INTS + * remove dependency on CONFIG_PROC_FS + * Stephen Rothwell + * 1.9: Fix small typo. + * Try to cope with BIOS's that need to have all display + * devices blanked and not just the first one. + * Ross Paterson + * Fix segment limit setting it has always been wrong as + * the segments needed to have byte granularity. + * Mark a few things __init. + * Add hack to allow power off of SMP systems by popular request. + * Use CONFIG_SMP instead of __SMP__ + * Ignore BOUNCES for three seconds. + * Stephen Rothwell + * 1.10: Fix for Thinkpad return code. + * Merge 2.2 and 2.3 drivers. + * Remove APM dependencies in arch/i386/kernel/process.c + * Remove APM dependencies in drivers/char/sysrq.c + * Reset time across standby. + * Allow more inititialisation on SMP. + * Remove CONFIG_APM_POWER_OFF and make it boot time + * configurable (default on). + * Make debug only a boot time parameter (remove APM_DEBUG). + * Try to blank all devices on any error. + * 1.11: Remove APM dependencies in drivers/char/console.c + * Check nr_running to detect if we are idle (from + * Borislav Deianov ) + * Fix for bioses that don't zero the top part of the + * entrypoint offset (Mario Sitta ) + * (reported by Panos Katsaloulis ). + * Real mode power off patch (Walter Hofmann + * ). + * 1.12: Remove CONFIG_SMP as the compiler will optimize + * the code away anyway (smp_num_cpus == 1 in UP) + * noted by Artur Skawina . + * Make power off under SMP work again. + * Fix thinko with initial engaging of BIOS. + * Make sure power off only happens on CPU 0 + * (Paul "Rusty" Russell ). + * Do error notification to user mode if BIOS calls fail. + * Move entrypoint offset fix to ...boot/setup.S + * where it belongs (Cosmos ). + * Remove smp-power-off. SMP users must now specify + * "apm=power-off" on the kernel command line. Suggested + * by Jim Avera , modified by Alan Cox + * . + * Register the /proc/apm entry even on SMP so that + * scripts that check for it before doing power off + * work (Jim Avera ). + * 1.13: Changes for new pm_ interfaces (Andy Henroid + * ). + * Modularize the code. + * Fix the Thinkpad (again) :-( (CONFIG_APM_IGNORE_MULTIPLE_SUSPENDS + * is now the way life works). + * Fix thinko in suspend() (wrong return). + * Notify drivers on critical suspend. + * Make kapmd absorb more idle time (Pavel Machek + * modified by sfr). + * Disable interrupts while we are suspended (Andy Henroid + * fixed by sfr). + * Make power off work on SMP again (Tony Hoyle + * and ) modified by sfr. + * Remove CONFIG_APM_SUSPEND_BOUNCE. The bounce ignore + * interval is now configurable. + * 1.14: Make connection version persist across module unload/load. + * Enable and engage power management earlier. + * Disengage power management on module unload. + * Changed to use the sysrq-register hack for registering the + * power off function called by magic sysrq based upon discussions + * in irc://irc.openprojects.net/#kernelnewbies + * (Crutcher Dunnavant ). + * Make CONFIG_APM_REAL_MODE_POWER_OFF run time configurable. + * (Arjan van de Ven ) modified by sfr. + * Work around byte swap bug in one of the Vaio's BIOS's + * (Marc Boucher ). + * Exposed the disable flag to dmi so that we can handle known + * broken APM (Alan Cox ). + * 1.14ac: If the BIOS says "I slowed the CPU down" then don't spin + * calling it - instead idle. (Alan Cox ) + * If an APM idle fails log it and idle sensibly + * 1.15: Don't queue events to clients who open the device O_WRONLY. + * Don't expect replies from clients who open the device O_RDONLY. + * (Idea from Thomas Hood) + * Minor waitqueue cleanups. (John Fremlin ) + * 1.16: Fix idle calling. (Andreas Steinmetz et al.) + * Notify listeners of standby or suspend events before notifying + * drivers. Return EBUSY to ioctl() if suspend is rejected. + * (Russell King and Thomas Hood) + * Ignore first resume after we generate our own resume event + * after a suspend (Thomas Hood) + * Daemonize now gets rid of our controlling terminal (sfr). + * CONFIG_APM_CPU_IDLE now just affects the default value of + * idle_threshold (sfr). + * Change name of kernel apm daemon (as it no longer idles) (sfr). + * 1.16ac: Fix up SMP support somewhat. You can now force SMP on and we + * make _all_ APM calls on the CPU#0. Fix unsafe sign bug. + * TODO: determine if its "boot CPU" or "CPU0" we want to lock to. + * + * APM 1.1 Reference: + * + * Intel Corporation, Microsoft Corporation. Advanced Power Management + * (APM) BIOS Interface Specification, Revision 1.1, September 1993. + * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01. + * + * [This document is available free from Intel by calling 800.628.8686 (fax + * 916.356.6100) or 800.548.4725; or via anonymous ftp from + * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also + * available from Microsoft by calling 206.882.8080.] + * + * APM 1.2 Reference: + * Intel Corporation, Microsoft Corporation. Advanced Power Management + * (APM) BIOS Interface Specification, Revision 1.2, February 1996. + * + * [This document is available from Microsoft at: + * http://www.microsoft.com/whdc/archive/amp_12.mspx] + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "io_ports.h" + +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) +extern int (*console_blank_hook)(int); +#endif + +/* + * The apm_bios device is one of the misc char devices. + * This is its minor number. + */ +#define APM_MINOR_DEV 134 + +/* + * See Documentation/Config.help for the configuration options. + * + * Various options can be changed at boot time as follows: + * (We allow underscores for compatibility with the modules code) + * apm=on/off enable/disable APM + * [no-]allow[-_]ints allow interrupts during BIOS calls + * [no-]broken[-_]psr BIOS has a broken GetPowerStatus call + * [no-]realmode[-_]power[-_]off switch to real mode before + * powering off + * [no-]debug log some debugging messages + * [no-]power[-_]off power off on shutdown + * [no-]smp Use apm even on an SMP box + * bounce[-_]interval= number of ticks to ignore suspend + * bounces + * idle[-_]threshold= System idle percentage above which to + * make APM BIOS idle calls. Set it to + * 100 to disable. + * idle[-_]period= Period (in 1/100s of a second) over + * which the idle percentage is + * calculated. + */ + +/* KNOWN PROBLEM MACHINES: + * + * U: TI 4000M TravelMate: BIOS is *NOT* APM compliant + * [Confirmed by TI representative] + * ?: ACER 486DX4/75: uses dseg 0040, in violation of APM specification + * [Confirmed by BIOS disassembly] + * [This may work now ...] + * P: Toshiba 1950S: battery life information only gets updated after resume + * P: Midwest Micro Soundbook Elite DX2/66 monochrome: screen blanking + * broken in BIOS [Reported by Garst R. Reese ] + * ?: AcerNote-950: oops on reading /proc/apm - workaround is a WIP + * Neale Banks December 2000 + * + * Legend: U = unusable with APM patches + * P = partially usable with APM patches + */ + +/* + * Define as 1 to make the driver always call the APM BIOS busy + * routine even if the clock was not reported as slowed by the + * idle routine. Otherwise, define as 0. + */ +#define ALWAYS_CALL_BUSY 1 + +/* + * Define to make the APM BIOS calls zero all data segment registers (so + * that an incorrect BIOS implementation will cause a kernel panic if it + * tries to write to arbitrary memory). + */ +#define APM_ZERO_SEGS + +#include "apm.h" + +/* + * Define to re-initialize the interrupt 0 timer to 100 Hz after a suspend. + * This patched by Chad Miller , original code by + * David Chen + */ +#undef INIT_TIMER_AFTER_SUSPEND + +#ifdef INIT_TIMER_AFTER_SUSPEND +#include +#include +#include +#endif + +/* + * Need to poll the APM BIOS every second + */ +#define APM_CHECK_TIMEOUT (HZ) + +/* + * Ignore suspend events for this amount of time after a resume + */ +#define DEFAULT_BOUNCE_INTERVAL (3 * HZ) + +/* + * Maximum number of events stored + */ +#define APM_MAX_EVENTS 20 + +/* + * The per-file APM data + */ +struct apm_user { + int magic; + struct apm_user * next; + unsigned int suser: 1; + unsigned int writer: 1; + unsigned int reader: 1; + unsigned int suspend_wait: 1; + int suspend_result; + int suspends_pending; + int standbys_pending; + int suspends_read; + int standbys_read; + int event_head; + int event_tail; + apm_event_t events[APM_MAX_EVENTS]; +}; + +/* + * The magic number in apm_user + */ +#define APM_BIOS_MAGIC 0x4101 + +/* + * idle percentage above which bios idle calls are done + */ +#ifdef CONFIG_APM_CPU_IDLE +#define DEFAULT_IDLE_THRESHOLD 95 +#else +#define DEFAULT_IDLE_THRESHOLD 100 +#endif +#define DEFAULT_IDLE_PERIOD (100 / 3) + +/* + * Local variables + */ +static struct { + unsigned long offset; + unsigned short segment; +} apm_bios_entry; +static int clock_slowed; +static int idle_threshold __read_mostly = DEFAULT_IDLE_THRESHOLD; +static int idle_period __read_mostly = DEFAULT_IDLE_PERIOD; +static int set_pm_idle; +static int suspends_pending; +static int standbys_pending; +static int ignore_sys_suspend; +static int ignore_normal_resume; +static int bounce_interval __read_mostly = DEFAULT_BOUNCE_INTERVAL; + +static int debug __read_mostly; +static int smp __read_mostly; +static int apm_disabled = -1; +#ifdef CONFIG_SMP +static int power_off; +#else +static int power_off = 1; +#endif +#ifdef CONFIG_APM_REAL_MODE_POWER_OFF +static int realmode_power_off = 1; +#else +static int realmode_power_off; +#endif +#ifdef CONFIG_APM_ALLOW_INTS +static int allow_ints = 1; +#else +static int allow_ints; +#endif +static int broken_psr; + +static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue); +static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); +static struct apm_user * user_list; +static DEFINE_SPINLOCK(user_list_lock); +static const struct desc_struct bad_bios_desc = { 0, 0x00409200 }; + +static const char driver_version[] = "1.16ac"; /* no spaces */ + +static struct task_struct *kapmd_task; + +/* + * APM event names taken from the APM 1.2 specification. These are + * the message codes that the BIOS uses to tell us about events + */ +static const char * const apm_event_name[] = { + "system standby", + "system suspend", + "normal resume", + "critical resume", + "low battery", + "power status change", + "update time", + "critical suspend", + "user standby", + "user suspend", + "system standby resume", + "capabilities change" +}; +#define NR_APM_EVENT_NAME ARRAY_SIZE(apm_event_name) + +typedef struct lookup_t { + int key; + char * msg; +} lookup_t; + +/* + * The BIOS returns a set of standard error codes in AX when the + * carry flag is set. + */ + +static const lookup_t error_table[] = { +/* N/A { APM_SUCCESS, "Operation succeeded" }, */ + { APM_DISABLED, "Power management disabled" }, + { APM_CONNECTED, "Real mode interface already connected" }, + { APM_NOT_CONNECTED, "Interface not connected" }, + { APM_16_CONNECTED, "16 bit interface already connected" }, +/* N/A { APM_16_UNSUPPORTED, "16 bit interface not supported" }, */ + { APM_32_CONNECTED, "32 bit interface already connected" }, + { APM_32_UNSUPPORTED, "32 bit interface not supported" }, + { APM_BAD_DEVICE, "Unrecognized device ID" }, + { APM_BAD_PARAM, "Parameter out of range" }, + { APM_NOT_ENGAGED, "Interface not engaged" }, + { APM_BAD_FUNCTION, "Function not supported" }, + { APM_RESUME_DISABLED, "Resume timer disabled" }, + { APM_BAD_STATE, "Unable to enter requested state" }, +/* N/A { APM_NO_EVENTS, "No events pending" }, */ + { APM_NO_ERROR, "BIOS did not set a return code" }, + { APM_NOT_PRESENT, "No APM present" } +}; +#define ERROR_COUNT ARRAY_SIZE(error_table) + +/** + * apm_error - display an APM error + * @str: information string + * @err: APM BIOS return code + * + * Write a meaningful log entry to the kernel log in the event of + * an APM error. + */ + +static void apm_error(char *str, int err) +{ + int i; + + for (i = 0; i < ERROR_COUNT; i++) + if (error_table[i].key == err) break; + if (i < ERROR_COUNT) + printk(KERN_NOTICE "apm: %s: %s\n", str, error_table[i].msg); + else + printk(KERN_NOTICE "apm: %s: unknown error code %#2.2x\n", + str, err); +} + +/* + * Lock APM functionality to physical CPU 0 + */ + +#ifdef CONFIG_SMP + +static cpumask_t apm_save_cpus(void) +{ + cpumask_t x = current->cpus_allowed; + /* Some bioses don't like being called from CPU != 0 */ + set_cpus_allowed(current, cpumask_of_cpu(0)); + BUG_ON(smp_processor_id() != 0); + return x; +} + +static inline void apm_restore_cpus(cpumask_t mask) +{ + set_cpus_allowed(current, mask); +} + +#else + +/* + * No CPU lockdown needed on a uniprocessor + */ + +#define apm_save_cpus() (current->cpus_allowed) +#define apm_restore_cpus(x) (void)(x) + +#endif + +/* + * These are the actual BIOS calls. Depending on APM_ZERO_SEGS and + * apm_info.allow_ints, we are being really paranoid here! Not only + * are interrupts disabled, but all the segment registers (except SS) + * are saved and zeroed this means that if the BIOS tries to reference + * any data without explicitly loading the segment registers, the kernel + * will fault immediately rather than have some unforeseen circumstances + * for the rest of the kernel. And it will be very obvious! :-) Doing + * this depends on CS referring to the same physical memory as DS so that + * DS can be zeroed before the call. Unfortunately, we can't do anything + * about the stack segment/pointer. Also, we tell the compiler that + * everything could change. + * + * Also, we KNOW that for the non error case of apm_bios_call, there + * is no useful data returned in the low order 8 bits of eax. + */ + +static inline unsigned long __apm_irq_save(void) +{ + unsigned long flags; + local_save_flags(flags); + if (apm_info.allow_ints) { + if (irqs_disabled_flags(flags)) + local_irq_enable(); + } else + local_irq_disable(); + + return flags; +} + +#define apm_irq_save(flags) \ + do { flags = __apm_irq_save(); } while (0) + +static inline void apm_irq_restore(unsigned long flags) +{ + if (irqs_disabled_flags(flags)) + local_irq_disable(); + else if (irqs_disabled()) + local_irq_enable(); +} + +#ifdef APM_ZERO_SEGS +# define APM_DECL_SEGS \ + unsigned int saved_fs; unsigned int saved_gs; +# define APM_DO_SAVE_SEGS \ + savesegment(fs, saved_fs); savesegment(gs, saved_gs) +# define APM_DO_RESTORE_SEGS \ + loadsegment(fs, saved_fs); loadsegment(gs, saved_gs) +#else +# define APM_DECL_SEGS +# define APM_DO_SAVE_SEGS +# define APM_DO_RESTORE_SEGS +#endif + +/** + * apm_bios_call - Make an APM BIOS 32bit call + * @func: APM function to execute + * @ebx_in: EBX register for call entry + * @ecx_in: ECX register for call entry + * @eax: EAX register return + * @ebx: EBX register return + * @ecx: ECX register return + * @edx: EDX register return + * @esi: ESI register return + * + * Make an APM call using the 32bit protected mode interface. The + * caller is responsible for knowing if APM BIOS is configured and + * enabled. This call can disable interrupts for a long period of + * time on some laptops. The return value is in AH and the carry + * flag is loaded into AL. If there is an error, then the error + * code is returned in AH (bits 8-15 of eax) and this function + * returns non-zero. + */ + +static u8 apm_bios_call(u32 func, u32 ebx_in, u32 ecx_in, + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, u32 *esi) +{ + APM_DECL_SEGS + unsigned long flags; + cpumask_t cpus; + int cpu; + struct desc_struct save_desc_40; + struct desc_struct *gdt; + + cpus = apm_save_cpus(); + + cpu = get_cpu(); + gdt = get_cpu_gdt_table(cpu); + save_desc_40 = gdt[0x40 / 8]; + gdt[0x40 / 8] = bad_bios_desc; + + apm_irq_save(flags); + APM_DO_SAVE_SEGS; + apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi); + APM_DO_RESTORE_SEGS; + apm_irq_restore(flags); + gdt[0x40 / 8] = save_desc_40; + put_cpu(); + apm_restore_cpus(cpus); + + return *eax & 0xff; +} + +/** + * apm_bios_call_simple - make a simple APM BIOS 32bit call + * @func: APM function to invoke + * @ebx_in: EBX register value for BIOS call + * @ecx_in: ECX register value for BIOS call + * @eax: EAX register on return from the BIOS call + * + * Make a BIOS call that returns one value only, or just status. + * If there is an error, then the error code is returned in AH + * (bits 8-15 of eax) and this function returns non-zero. This is + * used for simpler BIOS operations. This call may hold interrupts + * off for a long time on some laptops. + */ + +static u8 apm_bios_call_simple(u32 func, u32 ebx_in, u32 ecx_in, u32 *eax) +{ + u8 error; + APM_DECL_SEGS + unsigned long flags; + cpumask_t cpus; + int cpu; + struct desc_struct save_desc_40; + struct desc_struct *gdt; + + cpus = apm_save_cpus(); + + cpu = get_cpu(); + gdt = get_cpu_gdt_table(cpu); + save_desc_40 = gdt[0x40 / 8]; + gdt[0x40 / 8] = bad_bios_desc; + + apm_irq_save(flags); + APM_DO_SAVE_SEGS; + error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax); + APM_DO_RESTORE_SEGS; + apm_irq_restore(flags); + gdt[0x40 / 8] = save_desc_40; + put_cpu(); + apm_restore_cpus(cpus); + return error; +} + +/** + * apm_driver_version - APM driver version + * @val: loaded with the APM version on return + * + * Retrieve the APM version supported by the BIOS. This is only + * supported for APM 1.1 or higher. An error indicates APM 1.0 is + * probably present. + * + * On entry val should point to a value indicating the APM driver + * version with the high byte being the major and the low byte the + * minor number both in BCD + * + * On return it will hold the BIOS revision supported in the + * same format. + */ + +static int apm_driver_version(u_short *val) +{ + u32 eax; + + if (apm_bios_call_simple(APM_FUNC_VERSION, 0, *val, &eax)) + return (eax >> 8) & 0xff; + *val = eax; + return APM_SUCCESS; +} + +/** + * apm_get_event - get an APM event from the BIOS + * @event: pointer to the event + * @info: point to the event information + * + * The APM BIOS provides a polled information for event + * reporting. The BIOS expects to be polled at least every second + * when events are pending. When a message is found the caller should + * poll until no more messages are present. However, this causes + * problems on some laptops where a suspend event notification is + * not cleared until it is acknowledged. + * + * Additional information is returned in the info pointer, providing + * that APM 1.2 is in use. If no messges are pending the value 0x80 + * is returned (No power management events pending). + */ + +static int apm_get_event(apm_event_t *event, apm_eventinfo_t *info) +{ + u32 eax; + u32 ebx; + u32 ecx; + u32 dummy; + + if (apm_bios_call(APM_FUNC_GET_EVENT, 0, 0, &eax, &ebx, &ecx, + &dummy, &dummy)) + return (eax >> 8) & 0xff; + *event = ebx; + if (apm_info.connection_version < 0x0102) + *info = ~0; /* indicate info not valid */ + else + *info = ecx; + return APM_SUCCESS; +} + +/** + * set_power_state - set the power management state + * @what: which items to transition + * @state: state to transition to + * + * Request an APM change of state for one or more system devices. The + * processor state must be transitioned last of all. what holds the + * class of device in the upper byte and the device number (0xFF for + * all) for the object to be transitioned. + * + * The state holds the state to transition to, which may in fact + * be an acceptance of a BIOS requested state change. + */ + +static int set_power_state(u_short what, u_short state) +{ + u32 eax; + + if (apm_bios_call_simple(APM_FUNC_SET_STATE, what, state, &eax)) + return (eax >> 8) & 0xff; + return APM_SUCCESS; +} + +/** + * set_system_power_state - set system wide power state + * @state: which state to enter + * + * Transition the entire system into a new APM power state. + */ + +static int set_system_power_state(u_short state) +{ + return set_power_state(APM_DEVICE_ALL, state); +} + +/** + * apm_do_idle - perform power saving + * + * This function notifies the BIOS that the processor is (in the view + * of the OS) idle. It returns -1 in the event that the BIOS refuses + * to handle the idle request. On a success the function returns 1 + * if the BIOS did clock slowing or 0 otherwise. + */ + +static int apm_do_idle(void) +{ + u32 eax; + u8 ret = 0; + int idled = 0; + int polling; + + polling = !!(current_thread_info()->status & TS_POLLING); + if (polling) { + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we + * test NEED_RESCHED: + */ + smp_mb(); + } + if (!need_resched()) { + idled = 1; + ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax); + } + if (polling) + current_thread_info()->status |= TS_POLLING; + + if (!idled) + return 0; + + if (ret) { + static unsigned long t; + + /* This always fails on some SMP boards running UP kernels. + * Only report the failure the first 5 times. + */ + if (++t < 5) + { + printk(KERN_DEBUG "apm_do_idle failed (%d)\n", + (eax >> 8) & 0xff); + t = jiffies; + } + return -1; + } + clock_slowed = (apm_info.bios.flags & APM_IDLE_SLOWS_CLOCK) != 0; + return clock_slowed; +} + +/** + * apm_do_busy - inform the BIOS the CPU is busy + * + * Request that the BIOS brings the CPU back to full performance. + */ + +static void apm_do_busy(void) +{ + u32 dummy; + + if (clock_slowed || ALWAYS_CALL_BUSY) { + (void) apm_bios_call_simple(APM_FUNC_BUSY, 0, 0, &dummy); + clock_slowed = 0; + } +} + +/* + * If no process has really been interested in + * the CPU for some time, we want to call BIOS + * power management - we probably want + * to conserve power. + */ +#define IDLE_CALC_LIMIT (HZ * 100) +#define IDLE_LEAKY_MAX 16 + +static void (*original_pm_idle)(void) __read_mostly; + +/** + * apm_cpu_idle - cpu idling for APM capable Linux + * + * This is the idling function the kernel executes when APM is available. It + * tries to do BIOS powermanagement based on the average system idle time. + * Furthermore it calls the system default idle routine. + */ + +static void apm_cpu_idle(void) +{ + static int use_apm_idle; /* = 0 */ + static unsigned int last_jiffies; /* = 0 */ + static unsigned int last_stime; /* = 0 */ + + int apm_idle_done = 0; + unsigned int jiffies_since_last_check = jiffies - last_jiffies; + unsigned int bucket; + +recalc: + if (jiffies_since_last_check > IDLE_CALC_LIMIT) { + use_apm_idle = 0; + last_jiffies = jiffies; + last_stime = current->stime; + } else if (jiffies_since_last_check > idle_period) { + unsigned int idle_percentage; + + idle_percentage = current->stime - last_stime; + idle_percentage *= 100; + idle_percentage /= jiffies_since_last_check; + use_apm_idle = (idle_percentage > idle_threshold); + if (apm_info.forbid_idle) + use_apm_idle = 0; + last_jiffies = jiffies; + last_stime = current->stime; + } + + bucket = IDLE_LEAKY_MAX; + + while (!need_resched()) { + if (use_apm_idle) { + unsigned int t; + + t = jiffies; + switch (apm_do_idle()) { + case 0: apm_idle_done = 1; + if (t != jiffies) { + if (bucket) { + bucket = IDLE_LEAKY_MAX; + continue; + } + } else if (bucket) { + bucket--; + continue; + } + break; + case 1: apm_idle_done = 1; + break; + default: /* BIOS refused */ + break; + } + } + if (original_pm_idle) + original_pm_idle(); + else + default_idle(); + jiffies_since_last_check = jiffies - last_jiffies; + if (jiffies_since_last_check > idle_period) + goto recalc; + } + + if (apm_idle_done) + apm_do_busy(); +} + +/** + * apm_power_off - ask the BIOS to power off + * + * Handle the power off sequence. This is the one piece of code we + * will execute even on SMP machines. In order to deal with BIOS + * bugs we support real mode APM BIOS power off calls. We also make + * the SMP call on CPU0 as some systems will only honour this call + * on their first cpu. + */ + +static void apm_power_off(void) +{ + unsigned char po_bios_call[] = { + 0xb8, 0x00, 0x10, /* movw $0x1000,ax */ + 0x8e, 0xd0, /* movw ax,ss */ + 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */ + 0xb8, 0x07, 0x53, /* movw $0x5307,ax */ + 0xbb, 0x01, 0x00, /* movw $0x0001,bx */ + 0xb9, 0x03, 0x00, /* movw $0x0003,cx */ + 0xcd, 0x15 /* int $0x15 */ + }; + + /* Some bioses don't like being called from CPU != 0 */ + if (apm_info.realmode_power_off) + { + (void)apm_save_cpus(); + machine_real_restart(po_bios_call, sizeof(po_bios_call)); + } + else + (void) set_system_power_state(APM_STATE_OFF); +} + +#ifdef CONFIG_APM_DO_ENABLE + +/** + * apm_enable_power_management - enable BIOS APM power management + * @enable: enable yes/no + * + * Enable or disable the APM BIOS power services. + */ + +static int apm_enable_power_management(int enable) +{ + u32 eax; + + if ((enable == 0) && (apm_info.bios.flags & APM_BIOS_DISENGAGED)) + return APM_NOT_ENGAGED; + if (apm_bios_call_simple(APM_FUNC_ENABLE_PM, APM_DEVICE_BALL, + enable, &eax)) + return (eax >> 8) & 0xff; + if (enable) + apm_info.bios.flags &= ~APM_BIOS_DISABLED; + else + apm_info.bios.flags |= APM_BIOS_DISABLED; + return APM_SUCCESS; +} +#endif + +/** + * apm_get_power_status - get current power state + * @status: returned status + * @bat: battery info + * @life: estimated life + * + * Obtain the current power status from the APM BIOS. We return a + * status which gives the rough battery status, and current power + * source. The bat value returned give an estimate as a percentage + * of life and a status value for the battery. The estimated life + * if reported is a lifetime in secodnds/minutes at current powwer + * consumption. + */ + +static int apm_get_power_status(u_short *status, u_short *bat, u_short *life) +{ + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 dummy; + + if (apm_info.get_power_status_broken) + return APM_32_UNSUPPORTED; + if (apm_bios_call(APM_FUNC_GET_STATUS, APM_DEVICE_ALL, 0, + &eax, &ebx, &ecx, &edx, &dummy)) + return (eax >> 8) & 0xff; + *status = ebx; + *bat = ecx; + if (apm_info.get_power_status_swabinminutes) { + *life = swab16((u16)edx); + *life |= 0x8000; + } else + *life = edx; + return APM_SUCCESS; +} + +#if 0 +static int apm_get_battery_status(u_short which, u_short *status, + u_short *bat, u_short *life, u_short *nbat) +{ + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 esi; + + if (apm_info.connection_version < 0x0102) { + /* pretend we only have one battery. */ + if (which != 1) + return APM_BAD_DEVICE; + *nbat = 1; + return apm_get_power_status(status, bat, life); + } + + if (apm_bios_call(APM_FUNC_GET_STATUS, (0x8000 | (which)), 0, &eax, + &ebx, &ecx, &edx, &esi)) + return (eax >> 8) & 0xff; + *status = ebx; + *bat = ecx; + *life = edx; + *nbat = esi; + return APM_SUCCESS; +} +#endif + +/** + * apm_engage_power_management - enable PM on a device + * @device: identity of device + * @enable: on/off + * + * Activate or deactive power management on either a specific device + * or the entire system (%APM_DEVICE_ALL). + */ + +static int apm_engage_power_management(u_short device, int enable) +{ + u32 eax; + + if ((enable == 0) && (device == APM_DEVICE_ALL) + && (apm_info.bios.flags & APM_BIOS_DISABLED)) + return APM_DISABLED; + if (apm_bios_call_simple(APM_FUNC_ENGAGE_PM, device, enable, &eax)) + return (eax >> 8) & 0xff; + if (device == APM_DEVICE_ALL) { + if (enable) + apm_info.bios.flags &= ~APM_BIOS_DISENGAGED; + else + apm_info.bios.flags |= APM_BIOS_DISENGAGED; + } + return APM_SUCCESS; +} + +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) + +/** + * apm_console_blank - blank the display + * @blank: on/off + * + * Attempt to blank the console, firstly by blanking just video device + * zero, and if that fails (some BIOSes don't support it) then it blanks + * all video devices. Typically the BIOS will do laptop backlight and + * monitor powerdown for us. + */ + +static int apm_console_blank(int blank) +{ + int error = APM_NOT_ENGAGED; /* silence gcc */ + int i; + u_short state; + static const u_short dev[3] = { 0x100, 0x1FF, 0x101 }; + + state = blank ? APM_STATE_STANDBY : APM_STATE_READY; + + for (i = 0; i < ARRAY_SIZE(dev); i++) { + error = set_power_state(dev[i], state); + + if ((error == APM_SUCCESS) || (error == APM_NO_ERROR)) + return 1; + + if (error == APM_NOT_ENGAGED) + break; + } + + if (error == APM_NOT_ENGAGED) { + static int tried; + int eng_error; + if (tried++ == 0) { + eng_error = apm_engage_power_management(APM_DEVICE_ALL, 1); + if (eng_error) { + apm_error("set display", error); + apm_error("engage interface", eng_error); + return 0; + } else + return apm_console_blank(blank); + } + } + apm_error("set display", error); + return 0; +} +#endif + +static int queue_empty(struct apm_user *as) +{ + return as->event_head == as->event_tail; +} + +static apm_event_t get_queued_event(struct apm_user *as) +{ + if (++as->event_tail >= APM_MAX_EVENTS) + as->event_tail = 0; + return as->events[as->event_tail]; +} + +static void queue_event(apm_event_t event, struct apm_user *sender) +{ + struct apm_user * as; + + spin_lock(&user_list_lock); + if (user_list == NULL) + goto out; + for (as = user_list; as != NULL; as = as->next) { + if ((as == sender) || (!as->reader)) + continue; + if (++as->event_head >= APM_MAX_EVENTS) + as->event_head = 0; + + if (as->event_head == as->event_tail) { + static int notified; + + if (notified++ == 0) + printk(KERN_ERR "apm: an event queue overflowed\n"); + if (++as->event_tail >= APM_MAX_EVENTS) + as->event_tail = 0; + } + as->events[as->event_head] = event; + if ((!as->suser) || (!as->writer)) + continue; + switch (event) { + case APM_SYS_SUSPEND: + case APM_USER_SUSPEND: + as->suspends_pending++; + suspends_pending++; + break; + + case APM_SYS_STANDBY: + case APM_USER_STANDBY: + as->standbys_pending++; + standbys_pending++; + break; + } + } + wake_up_interruptible(&apm_waitqueue); +out: + spin_unlock(&user_list_lock); +} + +static void reinit_timer(void) +{ +#ifdef INIT_TIMER_AFTER_SUSPEND + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + /* set the clock to HZ */ + outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ + udelay(10); + outb_p(LATCH & 0xff, PIT_CH0); /* LSB */ + udelay(10); + outb(LATCH >> 8, PIT_CH0); /* MSB */ + udelay(10); + spin_unlock_irqrestore(&i8253_lock, flags); +#endif +} + +static int suspend(int vetoable) +{ + int err; + struct apm_user *as; + + if (pm_send_all(PM_SUSPEND, (void *)3)) { + /* Vetoed */ + if (vetoable) { + if (apm_info.connection_version > 0x100) + set_system_power_state(APM_STATE_REJECT); + err = -EBUSY; + ignore_sys_suspend = 0; + printk(KERN_WARNING "apm: suspend was vetoed.\n"); + goto out; + } + printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n"); + } + + device_suspend(PMSG_SUSPEND); + local_irq_disable(); + device_power_down(PMSG_SUSPEND); + + local_irq_enable(); + + save_processor_state(); + err = set_system_power_state(APM_STATE_SUSPEND); + ignore_normal_resume = 1; + restore_processor_state(); + + local_irq_disable(); + reinit_timer(); + + if (err == APM_NO_ERROR) + err = APM_SUCCESS; + if (err != APM_SUCCESS) + apm_error("suspend", err); + err = (err == APM_SUCCESS) ? 0 : -EIO; + device_power_up(); + local_irq_enable(); + device_resume(); + pm_send_all(PM_RESUME, (void *)0); + queue_event(APM_NORMAL_RESUME, NULL); + out: + spin_lock(&user_list_lock); + for (as = user_list; as != NULL; as = as->next) { + as->suspend_wait = 0; + as->suspend_result = err; + } + spin_unlock(&user_list_lock); + wake_up_interruptible(&apm_suspend_waitqueue); + return err; +} + +static void standby(void) +{ + int err; + + local_irq_disable(); + device_power_down(PMSG_SUSPEND); + local_irq_enable(); + + err = set_system_power_state(APM_STATE_STANDBY); + if ((err != APM_SUCCESS) && (err != APM_NO_ERROR)) + apm_error("standby", err); + + local_irq_disable(); + device_power_up(); + local_irq_enable(); +} + +static apm_event_t get_event(void) +{ + int error; + apm_event_t event = APM_NO_EVENTS; /* silence gcc */ + apm_eventinfo_t info; + + static int notified; + + /* we don't use the eventinfo */ + error = apm_get_event(&event, &info); + if (error == APM_SUCCESS) + return event; + + if ((error != APM_NO_EVENTS) && (notified++ == 0)) + apm_error("get_event", error); + + return 0; +} + +static void check_events(void) +{ + apm_event_t event; + static unsigned long last_resume; + static int ignore_bounce; + + while ((event = get_event()) != 0) { + if (debug) { + if (event <= NR_APM_EVENT_NAME) + printk(KERN_DEBUG "apm: received %s notify\n", + apm_event_name[event - 1]); + else + printk(KERN_DEBUG "apm: received unknown " + "event 0x%02x\n", event); + } + if (ignore_bounce + && ((jiffies - last_resume) > bounce_interval)) + ignore_bounce = 0; + + switch (event) { + case APM_SYS_STANDBY: + case APM_USER_STANDBY: + queue_event(event, NULL); + if (standbys_pending <= 0) + standby(); + break; + + case APM_USER_SUSPEND: +#ifdef CONFIG_APM_IGNORE_USER_SUSPEND + if (apm_info.connection_version > 0x100) + set_system_power_state(APM_STATE_REJECT); + break; +#endif + case APM_SYS_SUSPEND: + if (ignore_bounce) { + if (apm_info.connection_version > 0x100) + set_system_power_state(APM_STATE_REJECT); + break; + } + /* + * If we are already processing a SUSPEND, + * then further SUSPEND events from the BIOS + * will be ignored. We also return here to + * cope with the fact that the Thinkpads keep + * sending a SUSPEND event until something else + * happens! + */ + if (ignore_sys_suspend) + return; + ignore_sys_suspend = 1; + queue_event(event, NULL); + if (suspends_pending <= 0) + (void) suspend(1); + break; + + case APM_NORMAL_RESUME: + case APM_CRITICAL_RESUME: + case APM_STANDBY_RESUME: + ignore_sys_suspend = 0; + last_resume = jiffies; + ignore_bounce = 1; + if ((event != APM_NORMAL_RESUME) + || (ignore_normal_resume == 0)) { + device_resume(); + pm_send_all(PM_RESUME, (void *)0); + queue_event(event, NULL); + } + ignore_normal_resume = 0; + break; + + case APM_CAPABILITY_CHANGE: + case APM_LOW_BATTERY: + case APM_POWER_STATUS_CHANGE: + queue_event(event, NULL); + /* If needed, notify drivers here */ + break; + + case APM_UPDATE_TIME: + break; + + case APM_CRITICAL_SUSPEND: + /* + * We are not allowed to reject a critical suspend. + */ + (void) suspend(0); + break; + } + } +} + +static void apm_event_handler(void) +{ + static int pending_count = 4; + int err; + + if ((standbys_pending > 0) || (suspends_pending > 0)) { + if ((apm_info.connection_version > 0x100) && + (pending_count-- <= 0)) { + pending_count = 4; + if (debug) + printk(KERN_DEBUG "apm: setting state busy\n"); + err = set_system_power_state(APM_STATE_BUSY); + if (err) + apm_error("busy", err); + } + } else + pending_count = 4; + check_events(); +} + +/* + * This is the APM thread main loop. + */ + +static void apm_mainloop(void) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&apm_waitqueue, &wait); + set_current_state(TASK_INTERRUPTIBLE); + for (;;) { + schedule_timeout(APM_CHECK_TIMEOUT); + if (kthread_should_stop()) + break; + /* + * Ok, check all events, check for idle (and mark us sleeping + * so as not to count towards the load average).. + */ + set_current_state(TASK_INTERRUPTIBLE); + apm_event_handler(); + } + remove_wait_queue(&apm_waitqueue, &wait); +} + +static int check_apm_user(struct apm_user *as, const char *func) +{ + if ((as == NULL) || (as->magic != APM_BIOS_MAGIC)) { + printk(KERN_ERR "apm: %s passed bad filp\n", func); + return 1; + } + return 0; +} + +static ssize_t do_read(struct file *fp, char __user *buf, size_t count, loff_t *ppos) +{ + struct apm_user * as; + int i; + apm_event_t event; + + as = fp->private_data; + if (check_apm_user(as, "read")) + return -EIO; + if ((int)count < sizeof(apm_event_t)) + return -EINVAL; + if ((queue_empty(as)) && (fp->f_flags & O_NONBLOCK)) + return -EAGAIN; + wait_event_interruptible(apm_waitqueue, !queue_empty(as)); + i = count; + while ((i >= sizeof(event)) && !queue_empty(as)) { + event = get_queued_event(as); + if (copy_to_user(buf, &event, sizeof(event))) { + if (i < count) + break; + return -EFAULT; + } + switch (event) { + case APM_SYS_SUSPEND: + case APM_USER_SUSPEND: + as->suspends_read++; + break; + + case APM_SYS_STANDBY: + case APM_USER_STANDBY: + as->standbys_read++; + break; + } + buf += sizeof(event); + i -= sizeof(event); + } + if (i < count) + return count - i; + if (signal_pending(current)) + return -ERESTARTSYS; + return 0; +} + +static unsigned int do_poll(struct file *fp, poll_table * wait) +{ + struct apm_user * as; + + as = fp->private_data; + if (check_apm_user(as, "poll")) + return 0; + poll_wait(fp, &apm_waitqueue, wait); + if (!queue_empty(as)) + return POLLIN | POLLRDNORM; + return 0; +} + +static int do_ioctl(struct inode * inode, struct file *filp, + u_int cmd, u_long arg) +{ + struct apm_user * as; + + as = filp->private_data; + if (check_apm_user(as, "ioctl")) + return -EIO; + if ((!as->suser) || (!as->writer)) + return -EPERM; + switch (cmd) { + case APM_IOC_STANDBY: + if (as->standbys_read > 0) { + as->standbys_read--; + as->standbys_pending--; + standbys_pending--; + } else + queue_event(APM_USER_STANDBY, as); + if (standbys_pending <= 0) + standby(); + break; + case APM_IOC_SUSPEND: + if (as->suspends_read > 0) { + as->suspends_read--; + as->suspends_pending--; + suspends_pending--; + } else + queue_event(APM_USER_SUSPEND, as); + if (suspends_pending <= 0) { + return suspend(1); + } else { + as->suspend_wait = 1; + wait_event_interruptible(apm_suspend_waitqueue, + as->suspend_wait == 0); + return as->suspend_result; + } + break; + default: + return -EINVAL; + } + return 0; +} + +static int do_release(struct inode * inode, struct file * filp) +{ + struct apm_user * as; + + as = filp->private_data; + if (check_apm_user(as, "release")) + return 0; + filp->private_data = NULL; + if (as->standbys_pending > 0) { + standbys_pending -= as->standbys_pending; + if (standbys_pending <= 0) + standby(); + } + if (as->suspends_pending > 0) { + suspends_pending -= as->suspends_pending; + if (suspends_pending <= 0) + (void) suspend(1); + } + spin_lock(&user_list_lock); + if (user_list == as) + user_list = as->next; + else { + struct apm_user * as1; + + for (as1 = user_list; + (as1 != NULL) && (as1->next != as); + as1 = as1->next) + ; + if (as1 == NULL) + printk(KERN_ERR "apm: filp not in user list\n"); + else + as1->next = as->next; + } + spin_unlock(&user_list_lock); + kfree(as); + return 0; +} + +static int do_open(struct inode * inode, struct file * filp) +{ + struct apm_user * as; + + as = kmalloc(sizeof(*as), GFP_KERNEL); + if (as == NULL) { + printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", + sizeof(*as)); + return -ENOMEM; + } + as->magic = APM_BIOS_MAGIC; + as->event_tail = as->event_head = 0; + as->suspends_pending = as->standbys_pending = 0; + as->suspends_read = as->standbys_read = 0; + /* + * XXX - this is a tiny bit broken, when we consider BSD + * process accounting. If the device is opened by root, we + * instantly flag that we used superuser privs. Who knows, + * we might close the device immediately without doing a + * privileged operation -- cevans + */ + as->suser = capable(CAP_SYS_ADMIN); + as->writer = (filp->f_mode & FMODE_WRITE) == FMODE_WRITE; + as->reader = (filp->f_mode & FMODE_READ) == FMODE_READ; + spin_lock(&user_list_lock); + as->next = user_list; + user_list = as; + spin_unlock(&user_list_lock); + filp->private_data = as; + return 0; +} + +static int proc_apm_show(struct seq_file *m, void *v) +{ + unsigned short bx; + unsigned short cx; + unsigned short dx; + int error; + unsigned short ac_line_status = 0xff; + unsigned short battery_status = 0xff; + unsigned short battery_flag = 0xff; + int percentage = -1; + int time_units = -1; + char *units = "?"; + + if ((num_online_cpus() == 1) && + !(error = apm_get_power_status(&bx, &cx, &dx))) { + ac_line_status = (bx >> 8) & 0xff; + battery_status = bx & 0xff; + if ((cx & 0xff) != 0xff) + percentage = cx & 0xff; + + if (apm_info.connection_version > 0x100) { + battery_flag = (cx >> 8) & 0xff; + if (dx != 0xffff) { + units = (dx & 0x8000) ? "min" : "sec"; + time_units = dx & 0x7fff; + } + } + } + /* Arguments, with symbols from linux/apm_bios.h. Information is + from the Get Power Status (0x0a) call unless otherwise noted. + + 0) Linux driver version (this will change if format changes) + 1) APM BIOS Version. Usually 1.0, 1.1 or 1.2. + 2) APM flags from APM Installation Check (0x00): + bit 0: APM_16_BIT_SUPPORT + bit 1: APM_32_BIT_SUPPORT + bit 2: APM_IDLE_SLOWS_CLOCK + bit 3: APM_BIOS_DISABLED + bit 4: APM_BIOS_DISENGAGED + 3) AC line status + 0x00: Off-line + 0x01: On-line + 0x02: On backup power (BIOS >= 1.1 only) + 0xff: Unknown + 4) Battery status + 0x00: High + 0x01: Low + 0x02: Critical + 0x03: Charging + 0x04: Selected battery not present (BIOS >= 1.2 only) + 0xff: Unknown + 5) Battery flag + bit 0: High + bit 1: Low + bit 2: Critical + bit 3: Charging + bit 7: No system battery + 0xff: Unknown + 6) Remaining battery life (percentage of charge): + 0-100: valid + -1: Unknown + 7) Remaining battery life (time units): + Number of remaining minutes or seconds + -1: Unknown + 8) min = minutes; sec = seconds */ + + seq_printf(m, "%s %d.%d 0x%02x 0x%02x 0x%02x 0x%02x %d%% %d %s\n", + driver_version, + (apm_info.bios.version >> 8) & 0xff, + apm_info.bios.version & 0xff, + apm_info.bios.flags, + ac_line_status, + battery_status, + battery_flag, + percentage, + time_units, + units); + return 0; +} + +static int proc_apm_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_apm_show, NULL); +} + +static const struct file_operations apm_file_ops = { + .owner = THIS_MODULE, + .open = proc_apm_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int apm(void *unused) +{ + unsigned short bx; + unsigned short cx; + unsigned short dx; + int error; + char * power_stat; + char * bat_stat; + +#ifdef CONFIG_SMP + /* 2002/08/01 - WT + * This is to avoid random crashes at boot time during initialization + * on SMP systems in case of "apm=power-off" mode. Seen on ASUS A7M266D. + * Some bioses don't like being called from CPU != 0. + * Method suggested by Ingo Molnar. + */ + set_cpus_allowed(current, cpumask_of_cpu(0)); + BUG_ON(smp_processor_id() != 0); +#endif + + if (apm_info.connection_version == 0) { + apm_info.connection_version = apm_info.bios.version; + if (apm_info.connection_version > 0x100) { + /* + * We only support BIOSs up to version 1.2 + */ + if (apm_info.connection_version > 0x0102) + apm_info.connection_version = 0x0102; + error = apm_driver_version(&apm_info.connection_version); + if (error != APM_SUCCESS) { + apm_error("driver version", error); + /* Fall back to an APM 1.0 connection. */ + apm_info.connection_version = 0x100; + } + } + } + + if (debug) + printk(KERN_INFO "apm: Connection version %d.%d\n", + (apm_info.connection_version >> 8) & 0xff, + apm_info.connection_version & 0xff); + +#ifdef CONFIG_APM_DO_ENABLE + if (apm_info.bios.flags & APM_BIOS_DISABLED) { + /* + * This call causes my NEC UltraLite Versa 33/C to hang if it + * is booted with PM disabled but not in the docking station. + * Unfortunate ... + */ + error = apm_enable_power_management(1); + if (error) { + apm_error("enable power management", error); + return -1; + } + } +#endif + + if ((apm_info.bios.flags & APM_BIOS_DISENGAGED) + && (apm_info.connection_version > 0x0100)) { + error = apm_engage_power_management(APM_DEVICE_ALL, 1); + if (error) { + apm_error("engage power management", error); + return -1; + } + } + + if (debug && (num_online_cpus() == 1 || smp )) { + error = apm_get_power_status(&bx, &cx, &dx); + if (error) + printk(KERN_INFO "apm: power status not available\n"); + else { + switch ((bx >> 8) & 0xff) { + case 0: power_stat = "off line"; break; + case 1: power_stat = "on line"; break; + case 2: power_stat = "on backup power"; break; + default: power_stat = "unknown"; break; + } + switch (bx & 0xff) { + case 0: bat_stat = "high"; break; + case 1: bat_stat = "low"; break; + case 2: bat_stat = "critical"; break; + case 3: bat_stat = "charging"; break; + default: bat_stat = "unknown"; break; + } + printk(KERN_INFO + "apm: AC %s, battery status %s, battery life ", + power_stat, bat_stat); + if ((cx & 0xff) == 0xff) + printk("unknown\n"); + else + printk("%d%%\n", cx & 0xff); + if (apm_info.connection_version > 0x100) { + printk(KERN_INFO + "apm: battery flag 0x%02x, battery life ", + (cx >> 8) & 0xff); + if (dx == 0xffff) + printk("unknown\n"); + else + printk("%d %s\n", dx & 0x7fff, + (dx & 0x8000) ? + "minutes" : "seconds"); + } + } + } + + /* Install our power off handler.. */ + if (power_off) + pm_power_off = apm_power_off; + + if (num_online_cpus() == 1 || smp) { +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) + console_blank_hook = apm_console_blank; +#endif + apm_mainloop(); +#if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) + console_blank_hook = NULL; +#endif + } + + return 0; +} + +#ifndef MODULE +static int __init apm_setup(char *str) +{ + int invert; + + while ((str != NULL) && (*str != '\0')) { + if (strncmp(str, "off", 3) == 0) + apm_disabled = 1; + if (strncmp(str, "on", 2) == 0) + apm_disabled = 0; + if ((strncmp(str, "bounce-interval=", 16) == 0) || + (strncmp(str, "bounce_interval=", 16) == 0)) + bounce_interval = simple_strtol(str + 16, NULL, 0); + if ((strncmp(str, "idle-threshold=", 15) == 0) || + (strncmp(str, "idle_threshold=", 15) == 0)) + idle_threshold = simple_strtol(str + 15, NULL, 0); + if ((strncmp(str, "idle-period=", 12) == 0) || + (strncmp(str, "idle_period=", 12) == 0)) + idle_period = simple_strtol(str + 12, NULL, 0); + invert = (strncmp(str, "no-", 3) == 0) || + (strncmp(str, "no_", 3) == 0); + if (invert) + str += 3; + if (strncmp(str, "debug", 5) == 0) + debug = !invert; + if ((strncmp(str, "power-off", 9) == 0) || + (strncmp(str, "power_off", 9) == 0)) + power_off = !invert; + if (strncmp(str, "smp", 3) == 0) + { + smp = !invert; + idle_threshold = 100; + } + if ((strncmp(str, "allow-ints", 10) == 0) || + (strncmp(str, "allow_ints", 10) == 0)) + apm_info.allow_ints = !invert; + if ((strncmp(str, "broken-psr", 10) == 0) || + (strncmp(str, "broken_psr", 10) == 0)) + apm_info.get_power_status_broken = !invert; + if ((strncmp(str, "realmode-power-off", 18) == 0) || + (strncmp(str, "realmode_power_off", 18) == 0)) + apm_info.realmode_power_off = !invert; + str = strchr(str, ','); + if (str != NULL) + str += strspn(str, ", \t"); + } + return 1; +} + +__setup("apm=", apm_setup); +#endif + +static const struct file_operations apm_bios_fops = { + .owner = THIS_MODULE, + .read = do_read, + .poll = do_poll, + .ioctl = do_ioctl, + .open = do_open, + .release = do_release, +}; + +static struct miscdevice apm_device = { + APM_MINOR_DEV, + "apm_bios", + &apm_bios_fops +}; + + +/* Simple "print if true" callback */ +static int __init print_if_true(struct dmi_system_id *d) +{ + printk("%s\n", d->ident); + return 0; +} + +/* + * Some Bioses enable the PS/2 mouse (touchpad) at resume, even if it was + * disabled before the suspend. Linux used to get terribly confused by that. + */ +static int __init broken_ps2_resume(struct dmi_system_id *d) +{ + printk(KERN_INFO "%s machine detected. Mousepad Resume Bug workaround hopefully not needed.\n", d->ident); + return 0; +} + +/* Some bioses have a broken protected mode poweroff and need to use realmode */ +static int __init set_realmode_power_off(struct dmi_system_id *d) +{ + if (apm_info.realmode_power_off == 0) { + apm_info.realmode_power_off = 1; + printk(KERN_INFO "%s bios detected. Using realmode poweroff only.\n", d->ident); + } + return 0; +} + +/* Some laptops require interrupts to be enabled during APM calls */ +static int __init set_apm_ints(struct dmi_system_id *d) +{ + if (apm_info.allow_ints == 0) { + apm_info.allow_ints = 1; + printk(KERN_INFO "%s machine detected. Enabling interrupts during APM calls.\n", d->ident); + } + return 0; +} + +/* Some APM bioses corrupt memory or just plain do not work */ +static int __init apm_is_horked(struct dmi_system_id *d) +{ + if (apm_info.disabled == 0) { + apm_info.disabled = 1; + printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); + } + return 0; +} + +static int __init apm_is_horked_d850md(struct dmi_system_id *d) +{ + if (apm_info.disabled == 0) { + apm_info.disabled = 1; + printk(KERN_INFO "%s machine detected. Disabling APM.\n", d->ident); + printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); + printk(KERN_INFO "download from support.intel.com \n"); + } + return 0; +} + +/* Some APM bioses hang on APM idle calls */ +static int __init apm_likes_to_melt(struct dmi_system_id *d) +{ + if (apm_info.forbid_idle == 0) { + apm_info.forbid_idle = 1; + printk(KERN_INFO "%s machine detected. Disabling APM idle calls.\n", d->ident); + } + return 0; +} + +/* + * Check for clue free BIOS implementations who use + * the following QA technique + * + * [ Write BIOS Code ]<------ + * | ^ + * < Does it Compile >----N-- + * |Y ^ + * < Does it Boot Win98 >-N-- + * |Y + * [Ship It] + * + * Phoenix A04 08/24/2000 is known bad (Dell Inspiron 5000e) + * Phoenix A07 09/29/2000 is known good (Dell Inspiron 5000) + */ +static int __init broken_apm_power(struct dmi_system_id *d) +{ + apm_info.get_power_status_broken = 1; + printk(KERN_WARNING "BIOS strings suggest APM bugs, disabling power status reporting.\n"); + return 0; +} + +/* + * This bios swaps the APM minute reporting bytes over (Many sony laptops + * have this problem). + */ +static int __init swab_apm_power_in_minutes(struct dmi_system_id *d) +{ + apm_info.get_power_status_swabinminutes = 1; + printk(KERN_WARNING "BIOS strings suggest APM reports battery life in minutes and wrong byte order.\n"); + return 0; +} + +static struct dmi_system_id __initdata apm_dmi_table[] = { + { + print_if_true, + KERN_WARNING "IBM T23 - BIOS 1.03b+ and controller firmware 1.02+ may be needed for Linux APM.", + { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_BIOS_VERSION, "1AET38WW (1.01b)"), }, + }, + { /* Handle problems with APM on the C600 */ + broken_ps2_resume, "Dell Latitude C600", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell"), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C600"), }, + }, + { /* Allow interrupts during suspend on Dell Latitude laptops*/ + set_apm_ints, "Dell Latitude", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude C510"), } + }, + { /* APM crashes */ + apm_is_horked, "Dell Inspiron 2500", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), + DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, + }, + { /* Allow interrupts during suspend on Dell Inspiron laptops*/ + set_apm_ints, "Dell Inspiron", { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 4000"), }, + }, + { /* Handle problems with APM on Inspiron 5000e */ + broken_apm_power, "Dell Inspiron 5000e", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A04"), + DMI_MATCH(DMI_BIOS_DATE, "08/24/2000"), }, + }, + { /* Handle problems with APM on Inspiron 2500 */ + broken_apm_power, "Dell Inspiron 2500", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "A12"), + DMI_MATCH(DMI_BIOS_DATE, "02/04/2002"), }, + }, + { /* APM crashes */ + apm_is_horked, "Dell Dimension 4100", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), + DMI_MATCH(DMI_BIOS_VENDOR,"Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, + }, + { /* Allow interrupts during suspend on Compaq Laptops*/ + set_apm_ints, "Compaq 12XL125", + { DMI_MATCH(DMI_SYS_VENDOR, "Compaq"), + DMI_MATCH(DMI_PRODUCT_NAME, "Compaq PC"), + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION,"4.06"), }, + }, + { /* Allow interrupts during APM or the clock goes slow */ + set_apm_ints, "ASUSTeK", + { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "L8400K series Notebook PC"), }, + }, + { /* APM blows on shutdown */ + apm_is_horked, "ABIT KX7-333[R]", + { DMI_MATCH(DMI_BOARD_VENDOR, "ABIT"), + DMI_MATCH(DMI_BOARD_NAME, "VT8367-8233A (KX7-333[R])"), }, + }, + { /* APM crashes */ + apm_is_horked, "Trigem Delhi3", + { DMI_MATCH(DMI_SYS_VENDOR, "TriGem Computer, Inc"), + DMI_MATCH(DMI_PRODUCT_NAME, "Delhi3"), }, + }, + { /* APM crashes */ + apm_is_horked, "Fujitsu-Siemens", + { DMI_MATCH(DMI_BIOS_VENDOR, "hoenix/FUJITSU SIEMENS"), + DMI_MATCH(DMI_BIOS_VERSION, "Version1.01"), }, + }, + { /* APM crashes */ + apm_is_horked_d850md, "Intel D850MD", + { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "MV85010A.86A.0016.P07.0201251536"), }, + }, + { /* APM crashes */ + apm_is_horked, "Intel D810EMO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "MO81010A.86A.0008.P04.0004170800"), }, + }, + { /* APM crashes */ + apm_is_horked, "Dell XPS-Z", + { DMI_MATCH(DMI_BIOS_VENDOR, "Intel Corp."), + DMI_MATCH(DMI_BIOS_VERSION, "A11"), + DMI_MATCH(DMI_PRODUCT_NAME, "XPS-Z"), }, + }, + { /* APM crashes */ + apm_is_horked, "Sharp PC-PJ/AX", + { DMI_MATCH(DMI_SYS_VENDOR, "SHARP"), + DMI_MATCH(DMI_PRODUCT_NAME, "PC-PJ/AX"), + DMI_MATCH(DMI_BIOS_VENDOR,"SystemSoft"), + DMI_MATCH(DMI_BIOS_VERSION,"Version R2.08"), }, + }, + { /* APM crashes */ + apm_is_horked, "Dell Inspiron 2500", + { DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 2500"), + DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION,"A11"), }, + }, + { /* APM idle hangs */ + apm_likes_to_melt, "Jabil AMD", + { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), + DMI_MATCH(DMI_BIOS_VERSION, "0AASNP06"), }, + }, + { /* APM idle hangs */ + apm_likes_to_melt, "AMI Bios", + { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), + DMI_MATCH(DMI_BIOS_VERSION, "0AASNP05"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-N505X(DE) */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0206H"), + DMI_MATCH(DMI_BIOS_DATE, "08/23/99"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-N505VX */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "W2K06H0"), + DMI_MATCH(DMI_BIOS_DATE, "02/03/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-XG29 */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0117A0"), + DMI_MATCH(DMI_BIOS_DATE, "04/25/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z600NE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0121Z1"), + DMI_MATCH(DMI_BIOS_DATE, "05/11/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z600NE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "WME01Z1"), + DMI_MATCH(DMI_BIOS_DATE, "08/11/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z600LEK(DE) */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0206Z3"), + DMI_MATCH(DMI_BIOS_DATE, "12/25/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z505LS */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0203D0"), + DMI_MATCH(DMI_BIOS_DATE, "05/12/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z505LS */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0203Z3"), + DMI_MATCH(DMI_BIOS_DATE, "08/25/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-Z505LS (with updated BIOS) */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0209Z3"), + DMI_MATCH(DMI_BIOS_DATE, "05/12/01"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-F104K */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0204K2"), + DMI_MATCH(DMI_BIOS_DATE, "08/28/00"), }, + }, + + { /* Handle problems with APM on Sony Vaio PCG-C1VN/C1VE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0208P1"), + DMI_MATCH(DMI_BIOS_DATE, "11/09/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-C1VE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "R0204P1"), + DMI_MATCH(DMI_BIOS_DATE, "09/12/00"), }, + }, + { /* Handle problems with APM on Sony Vaio PCG-C1VE */ + swab_apm_power_in_minutes, "Sony VAIO", + { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies LTD"), + DMI_MATCH(DMI_BIOS_VERSION, "WXPO1Z3"), + DMI_MATCH(DMI_BIOS_DATE, "10/26/01"), }, + }, + { /* broken PM poweroff bios */ + set_realmode_power_off, "Award Software v4.60 PGMA", + { DMI_MATCH(DMI_BIOS_VENDOR, "Award Software International, Inc."), + DMI_MATCH(DMI_BIOS_VERSION, "4.60 PGMA"), + DMI_MATCH(DMI_BIOS_DATE, "134526184"), }, + }, + + /* Generic per vendor APM settings */ + + { /* Allow interrupts during suspend on IBM laptops */ + set_apm_ints, "IBM", + { DMI_MATCH(DMI_SYS_VENDOR, "IBM"), }, + }, + + { } +}; + +/* + * Just start the APM thread. We do NOT want to do APM BIOS + * calls from anything but the APM thread, if for no other reason + * than the fact that we don't trust the APM BIOS. This way, + * most common APM BIOS problems that lead to protection errors + * etc will have at least some level of being contained... + * + * In short, if something bad happens, at least we have a choice + * of just killing the apm thread.. + */ +static int __init apm_init(void) +{ + struct proc_dir_entry *apm_proc; + struct desc_struct *gdt; + int err; + + dmi_check_system(apm_dmi_table); + + if (apm_info.bios.version == 0 || paravirt_enabled()) { + printk(KERN_INFO "apm: BIOS not found.\n"); + return -ENODEV; + } + printk(KERN_INFO + "apm: BIOS version %d.%d Flags 0x%02x (Driver version %s)\n", + ((apm_info.bios.version >> 8) & 0xff), + (apm_info.bios.version & 0xff), + apm_info.bios.flags, + driver_version); + if ((apm_info.bios.flags & APM_32_BIT_SUPPORT) == 0) { + printk(KERN_INFO "apm: no 32 bit BIOS support\n"); + return -ENODEV; + } + + if (allow_ints) + apm_info.allow_ints = 1; + if (broken_psr) + apm_info.get_power_status_broken = 1; + if (realmode_power_off) + apm_info.realmode_power_off = 1; + /* User can override, but default is to trust DMI */ + if (apm_disabled != -1) + apm_info.disabled = apm_disabled; + + /* + * Fix for the Compaq Contura 3/25c which reports BIOS version 0.1 + * but is reportedly a 1.0 BIOS. + */ + if (apm_info.bios.version == 0x001) + apm_info.bios.version = 0x100; + + /* BIOS < 1.2 doesn't set cseg_16_len */ + if (apm_info.bios.version < 0x102) + apm_info.bios.cseg_16_len = 0; /* 64k */ + + if (debug) { + printk(KERN_INFO "apm: entry %x:%x cseg16 %x dseg %x", + apm_info.bios.cseg, apm_info.bios.offset, + apm_info.bios.cseg_16, apm_info.bios.dseg); + if (apm_info.bios.version > 0x100) + printk(" cseg len %x, dseg len %x", + apm_info.bios.cseg_len, + apm_info.bios.dseg_len); + if (apm_info.bios.version > 0x101) + printk(" cseg16 len %x", apm_info.bios.cseg_16_len); + printk("\n"); + } + + if (apm_info.disabled) { + printk(KERN_NOTICE "apm: disabled on user request.\n"); + return -ENODEV; + } + if ((num_online_cpus() > 1) && !power_off && !smp) { + printk(KERN_NOTICE "apm: disabled - APM is not SMP safe.\n"); + apm_info.disabled = 1; + return -ENODEV; + } + if (PM_IS_ACTIVE()) { + printk(KERN_NOTICE "apm: overridden by ACPI.\n"); + apm_info.disabled = 1; + return -ENODEV; + } +#ifdef CONFIG_PM_LEGACY + pm_active = 1; +#endif + + /* + * Set up a segment that references the real mode segment 0x40 + * that extends up to the end of page zero (that we have reserved). + * This is for buggy BIOS's that refer to (real mode) segment 0x40 + * even though they are called in protected mode. + */ + set_base(bad_bios_desc, __va((unsigned long)0x40 << 4)); + _set_limit((char *)&bad_bios_desc, 4095 - (0x40 << 4)); + + /* + * Set up the long jump entry point to the APM BIOS, which is called + * from inline assembly. + */ + apm_bios_entry.offset = apm_info.bios.offset; + apm_bios_entry.segment = APM_CS; + + /* + * The APM 1.1 BIOS is supposed to provide limit information that it + * recognizes. Many machines do this correctly, but many others do + * not restrict themselves to their claimed limit. When this happens, + * they will cause a segmentation violation in the kernel at boot time. + * Most BIOS's, however, will respect a 64k limit, so we use that. + * + * Note we only set APM segments on CPU zero, since we pin the APM + * code to that CPU. + */ + gdt = get_cpu_gdt_table(0); + set_base(gdt[APM_CS >> 3], + __va((unsigned long)apm_info.bios.cseg << 4)); + set_base(gdt[APM_CS_16 >> 3], + __va((unsigned long)apm_info.bios.cseg_16 << 4)); + set_base(gdt[APM_DS >> 3], + __va((unsigned long)apm_info.bios.dseg << 4)); + + apm_proc = create_proc_entry("apm", 0, NULL); + if (apm_proc) + apm_proc->proc_fops = &apm_file_ops; + + kapmd_task = kthread_create(apm, NULL, "kapmd"); + if (IS_ERR(kapmd_task)) { + printk(KERN_ERR "apm: disabled - Unable to start kernel " + "thread.\n"); + err = PTR_ERR(kapmd_task); + kapmd_task = NULL; + remove_proc_entry("apm", NULL); + return err; + } + wake_up_process(kapmd_task); + + if (num_online_cpus() > 1 && !smp ) { + printk(KERN_NOTICE + "apm: disabled - APM is not SMP safe (power off active).\n"); + return 0; + } + + /* + * Note we don't actually care if the misc_device cannot be registered. + * this driver can do its job without it, even if userspace can't + * control it. just log the error + */ + if (misc_register(&apm_device)) + printk(KERN_WARNING "apm: Could not register misc device.\n"); + + if (HZ != 100) + idle_period = (idle_period * HZ) / 100; + if (idle_threshold < 100) { + original_pm_idle = pm_idle; + pm_idle = apm_cpu_idle; + set_pm_idle = 1; + } + + return 0; +} + +static void __exit apm_exit(void) +{ + int error; + + if (set_pm_idle) { + pm_idle = original_pm_idle; + /* + * We are about to unload the current idle thread pm callback + * (pm_idle), Wait for all processors to update cached/local + * copies of pm_idle before proceeding. + */ + cpu_idle_wait(); + } + if (((apm_info.bios.flags & APM_BIOS_DISENGAGED) == 0) + && (apm_info.connection_version > 0x0100)) { + error = apm_engage_power_management(APM_DEVICE_ALL, 0); + if (error) + apm_error("disengage power management", error); + } + misc_deregister(&apm_device); + remove_proc_entry("apm", NULL); + if (power_off) + pm_power_off = NULL; + if (kapmd_task) { + kthread_stop(kapmd_task); + kapmd_task = NULL; + } +#ifdef CONFIG_PM_LEGACY + pm_active = 0; +#endif +} + +module_init(apm_init); +module_exit(apm_exit); + +MODULE_AUTHOR("Stephen Rothwell"); +MODULE_DESCRIPTION("Advanced Power Management"); +MODULE_LICENSE("GPL"); +module_param(debug, bool, 0644); +MODULE_PARM_DESC(debug, "Enable debug mode"); +module_param(power_off, bool, 0444); +MODULE_PARM_DESC(power_off, "Enable power off"); +module_param(bounce_interval, int, 0444); +MODULE_PARM_DESC(bounce_interval, + "Set the number of ticks to ignore suspend bounces"); +module_param(allow_ints, bool, 0444); +MODULE_PARM_DESC(allow_ints, "Allow interrupts during BIOS calls"); +module_param(broken_psr, bool, 0444); +MODULE_PARM_DESC(broken_psr, "BIOS has a broken GetPowerStatus call"); +module_param(realmode_power_off, bool, 0444); +MODULE_PARM_DESC(realmode_power_off, + "Switch to real mode before powering off"); +module_param(idle_threshold, int, 0444); +MODULE_PARM_DESC(idle_threshold, + "System idle percentage above which to make APM BIOS idle calls"); +module_param(idle_period, int, 0444); +MODULE_PARM_DESC(idle_period, + "Period (in sec/100) over which to caculate the idle percentage"); +module_param(smp, bool, 0444); +MODULE_PARM_DESC(smp, + "Set this to enable APM use on an SMP platform. Use with caution on older systems"); +MODULE_ALIAS_MISCDEV(APM_MINOR_DEV); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c new file mode 100644 index 000000000000..cfa82c899f47 --- /dev/null +++ b/arch/x86/kernel/asm-offsets.c @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "asm-offsets_32.c" +#else +# include "asm-offsets_64.c" +#endif diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c new file mode 100644 index 000000000000..8029742c0fc1 --- /dev/null +++ b/arch/x86/kernel/asm-offsets_32.c @@ -0,0 +1,147 @@ +/* + * Generate definitions needed by assembly language modules. + * This code generates raw asm output which is post-processed + * to extract and format the required data. + */ + +#include +#include +#include +#include +#include +#include +#include "sigframe_32.h" +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_LGUEST_GUEST +#include +#include "../../../drivers/lguest/lg.h" +#endif + +#define DEFINE(sym, val) \ + asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + +#define BLANK() asm volatile("\n->" : : ) + +#define OFFSET(sym, str, mem) \ + DEFINE(sym, offsetof(struct str, mem)); + +/* workaround for a warning with -Wmissing-prototypes */ +void foo(void); + +void foo(void) +{ + OFFSET(SIGCONTEXT_eax, sigcontext, eax); + OFFSET(SIGCONTEXT_ebx, sigcontext, ebx); + OFFSET(SIGCONTEXT_ecx, sigcontext, ecx); + OFFSET(SIGCONTEXT_edx, sigcontext, edx); + OFFSET(SIGCONTEXT_esi, sigcontext, esi); + OFFSET(SIGCONTEXT_edi, sigcontext, edi); + OFFSET(SIGCONTEXT_ebp, sigcontext, ebp); + OFFSET(SIGCONTEXT_esp, sigcontext, esp); + OFFSET(SIGCONTEXT_eip, sigcontext, eip); + BLANK(); + + OFFSET(CPUINFO_x86, cpuinfo_x86, x86); + OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); + OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); + OFFSET(CPUINFO_x86_mask, cpuinfo_x86, x86_mask); + OFFSET(CPUINFO_hard_math, cpuinfo_x86, hard_math); + OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level); + OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability); + OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); + BLANK(); + + OFFSET(TI_task, thread_info, task); + OFFSET(TI_exec_domain, thread_info, exec_domain); + OFFSET(TI_flags, thread_info, flags); + OFFSET(TI_status, thread_info, status); + OFFSET(TI_preempt_count, thread_info, preempt_count); + OFFSET(TI_addr_limit, thread_info, addr_limit); + OFFSET(TI_restart_block, thread_info, restart_block); + OFFSET(TI_sysenter_return, thread_info, sysenter_return); + OFFSET(TI_cpu, thread_info, cpu); + BLANK(); + + OFFSET(GDS_size, Xgt_desc_struct, size); + OFFSET(GDS_address, Xgt_desc_struct, address); + OFFSET(GDS_pad, Xgt_desc_struct, pad); + BLANK(); + + OFFSET(PT_EBX, pt_regs, ebx); + OFFSET(PT_ECX, pt_regs, ecx); + OFFSET(PT_EDX, pt_regs, edx); + OFFSET(PT_ESI, pt_regs, esi); + OFFSET(PT_EDI, pt_regs, edi); + OFFSET(PT_EBP, pt_regs, ebp); + OFFSET(PT_EAX, pt_regs, eax); + OFFSET(PT_DS, pt_regs, xds); + OFFSET(PT_ES, pt_regs, xes); + OFFSET(PT_FS, pt_regs, xfs); + OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); + OFFSET(PT_EIP, pt_regs, eip); + OFFSET(PT_CS, pt_regs, xcs); + OFFSET(PT_EFLAGS, pt_regs, eflags); + OFFSET(PT_OLDESP, pt_regs, esp); + OFFSET(PT_OLDSS, pt_regs, xss); + BLANK(); + + OFFSET(EXEC_DOMAIN_handler, exec_domain, handler); + OFFSET(RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); + BLANK(); + + OFFSET(pbe_address, pbe, address); + OFFSET(pbe_orig_address, pbe, orig_address); + OFFSET(pbe_next, pbe, next); + + /* Offset from the sysenter stack to tss.esp0 */ + DEFINE(TSS_sysenter_esp0, offsetof(struct tss_struct, x86_tss.esp0) - + sizeof(struct tss_struct)); + + DEFINE(PAGE_SIZE_asm, PAGE_SIZE); + DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT); + DEFINE(PTRS_PER_PTE, PTRS_PER_PTE); + DEFINE(PTRS_PER_PMD, PTRS_PER_PMD); + DEFINE(PTRS_PER_PGD, PTRS_PER_PGD); + + DEFINE(VDSO_PRELINK_asm, VDSO_PRELINK); + + OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); + +#ifdef CONFIG_PARAVIRT + BLANK(); + OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled); + OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable); + OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable); + OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit); + OFFSET(PARAVIRT_iret, paravirt_ops, iret); + OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0); +#endif + +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#endif + +#ifdef CONFIG_LGUEST_GUEST + BLANK(); + OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); + OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc); + OFFSET(LGUEST_PAGES_host_idt_desc, lguest_pages, state.host_idt_desc); + OFFSET(LGUEST_PAGES_host_cr3, lguest_pages, state.host_cr3); + OFFSET(LGUEST_PAGES_host_sp, lguest_pages, state.host_sp); + OFFSET(LGUEST_PAGES_guest_gdt_desc, lguest_pages,state.guest_gdt_desc); + OFFSET(LGUEST_PAGES_guest_idt_desc, lguest_pages,state.guest_idt_desc); + OFFSET(LGUEST_PAGES_guest_gdt, lguest_pages, state.guest_gdt); + OFFSET(LGUEST_PAGES_regs_trapnum, lguest_pages, regs.trapnum); + OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); + OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); +#endif +} diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c new file mode 100644 index 000000000000..0b9860530a6b --- /dev/null +++ b/arch/x86/kernel/bootflag.c @@ -0,0 +1,98 @@ +/* + * Implement 'Simple Boot Flag Specification 2.0' + */ + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +#define SBF_RESERVED (0x78) +#define SBF_PNPOS (1<<0) +#define SBF_BOOTING (1<<1) +#define SBF_DIAG (1<<2) +#define SBF_PARITY (1<<7) + + +int sbf_port __initdata = -1; /* set via acpi_boot_init() */ + + +static int __init parity(u8 v) +{ + int x = 0; + int i; + + for(i=0;i<8;i++) + { + x^=(v&1); + v>>=1; + } + return x; +} + +static void __init sbf_write(u8 v) +{ + unsigned long flags; + if(sbf_port != -1) + { + v &= ~SBF_PARITY; + if(!parity(v)) + v|=SBF_PARITY; + + printk(KERN_INFO "Simple Boot Flag at 0x%x set to 0x%x\n", sbf_port, v); + + spin_lock_irqsave(&rtc_lock, flags); + CMOS_WRITE(v, sbf_port); + spin_unlock_irqrestore(&rtc_lock, flags); + } +} + +static u8 __init sbf_read(void) +{ + u8 v; + unsigned long flags; + if(sbf_port == -1) + return 0; + spin_lock_irqsave(&rtc_lock, flags); + v = CMOS_READ(sbf_port); + spin_unlock_irqrestore(&rtc_lock, flags); + return v; +} + +static int __init sbf_value_valid(u8 v) +{ + if(v&SBF_RESERVED) /* Reserved bits */ + return 0; + if(!parity(v)) + return 0; + return 1; +} + +static int __init sbf_init(void) +{ + u8 v; + if(sbf_port == -1) + return 0; + v = sbf_read(); + if(!sbf_value_valid(v)) + printk(KERN_WARNING "Simple Boot Flag value 0x%x read from CMOS RAM was invalid\n",v); + + v &= ~SBF_RESERVED; + v &= ~SBF_BOOTING; + v &= ~SBF_DIAG; +#if defined(CONFIG_ISAPNP) + v |= SBF_PNPOS; +#endif + sbf_write(v); + return 0; +} + +module_init(sbf_init); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c new file mode 100644 index 000000000000..5c2faa10e9fa --- /dev/null +++ b/arch/x86/kernel/cpuid.c @@ -0,0 +1,242 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2000 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * cpuid.c + * + * x86 CPUID access device + * + * This device is accessed by lseek() to the appropriate CPUID level + * and then read in chunks of 16 bytes. A larger size means multiple + * reads of consecutive levels. + * + * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on + * an SMP box will direct the access to CPU %d. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static struct class *cpuid_class; + +#ifdef CONFIG_SMP + +struct cpuid_command { + u32 reg; + u32 *data; +}; + +static void cpuid_smp_cpuid(void *cmd_block) +{ + struct cpuid_command *cmd = (struct cpuid_command *)cmd_block; + + cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2], + &cmd->data[3]); +} + +static inline void do_cpuid(int cpu, u32 reg, u32 * data) +{ + struct cpuid_command cmd; + + preempt_disable(); + if (cpu == smp_processor_id()) { + cpuid(reg, &data[0], &data[1], &data[2], &data[3]); + } else { + cmd.reg = reg; + cmd.data = data; + + smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); + } + preempt_enable(); +} +#else /* ! CONFIG_SMP */ + +static inline void do_cpuid(int cpu, u32 reg, u32 * data) +{ + cpuid(reg, &data[0], &data[1], &data[2], &data[3]); +} + +#endif /* ! CONFIG_SMP */ + +static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) +{ + loff_t ret; + + lock_kernel(); + + switch (orig) { + case 0: + file->f_pos = offset; + ret = file->f_pos; + break; + case 1: + file->f_pos += offset; + ret = file->f_pos; + break; + default: + ret = -EINVAL; + } + + unlock_kernel(); + return ret; +} + +static ssize_t cpuid_read(struct file *file, char __user *buf, + size_t count, loff_t * ppos) +{ + char __user *tmp = buf; + u32 data[4]; + u32 reg = *ppos; + int cpu = iminor(file->f_path.dentry->d_inode); + + if (count % 16) + return -EINVAL; /* Invalid chunk size */ + + for (; count; count -= 16) { + do_cpuid(cpu, reg, data); + if (copy_to_user(tmp, &data, 16)) + return -EFAULT; + tmp += 16; + *ppos = reg++; + } + + return tmp - buf; +} + +static int cpuid_open(struct inode *inode, struct file *file) +{ + unsigned int cpu = iminor(file->f_path.dentry->d_inode); + struct cpuinfo_x86 *c = &(cpu_data)[cpu]; + + if (cpu >= NR_CPUS || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + if (c->cpuid_level < 0) + return -EIO; /* CPUID not supported */ + + return 0; +} + +/* + * File operations we support + */ +static const struct file_operations cpuid_fops = { + .owner = THIS_MODULE, + .llseek = cpuid_seek, + .read = cpuid_read, + .open = cpuid_open, +}; + +static int cpuid_device_create(int i) +{ + int err = 0; + struct device *dev; + + dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, i), "cpu%d",i); + if (IS_ERR(dev)) + err = PTR_ERR(dev); + return err; +} + +static int cpuid_class_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + cpuid_device_create(cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata cpuid_class_cpu_notifier = +{ + .notifier_call = cpuid_class_cpu_callback, +}; + +static int __init cpuid_init(void) +{ + int i, err = 0; + i = 0; + + if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) { + printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", + CPUID_MAJOR); + err = -EBUSY; + goto out; + } + cpuid_class = class_create(THIS_MODULE, "cpuid"); + if (IS_ERR(cpuid_class)) { + err = PTR_ERR(cpuid_class); + goto out_chrdev; + } + for_each_online_cpu(i) { + err = cpuid_device_create(i); + if (err != 0) + goto out_class; + } + register_hotcpu_notifier(&cpuid_class_cpu_notifier); + + err = 0; + goto out; + +out_class: + i = 0; + for_each_online_cpu(i) { + device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, i)); + } + class_destroy(cpuid_class); +out_chrdev: + unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); +out: + return err; +} + +static void __exit cpuid_exit(void) +{ + int cpu = 0; + + for_each_online_cpu(cpu) + device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + class_destroy(cpuid_class); + unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); +} + +module_init(cpuid_init); +module_exit(cpuid_exit); + +MODULE_AUTHOR("H. Peter Anvin "); +MODULE_DESCRIPTION("x86 generic CPUID driver"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/crash_32.c b/arch/x86/kernel/crash_32.c new file mode 100644 index 000000000000..53589d1b1a05 --- /dev/null +++ b/arch/x86/kernel/crash_32.c @@ -0,0 +1,137 @@ +/* + * Architecture specific (i386) functions for kexec based crash dumps. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * + * Copyright (C) IBM Corporation, 2004. All rights reserved. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + + +/* This keeps a track of which one is crashing cpu. */ +static int crashing_cpu; + +#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) +static atomic_t waiting_for_crash_ipi; + +static int crash_nmi_callback(struct notifier_block *self, + unsigned long val, void *data) +{ + struct pt_regs *regs; + struct pt_regs fixed_regs; + int cpu; + + if (val != DIE_NMI_IPI) + return NOTIFY_OK; + + regs = ((struct die_args *)data)->regs; + cpu = raw_smp_processor_id(); + + /* Don't do anything if this handler is invoked on crashing cpu. + * Otherwise, system will completely hang. Crashing cpu can get + * an NMI if system was initially booted with nmi_watchdog parameter. + */ + if (cpu == crashing_cpu) + return NOTIFY_STOP; + local_irq_disable(); + + if (!user_mode_vm(regs)) { + crash_fixup_ss_esp(&fixed_regs, regs); + regs = &fixed_regs; + } + crash_save_cpu(regs, cpu); + disable_local_APIC(); + atomic_dec(&waiting_for_crash_ipi); + /* Assume hlt works */ + halt(); + for (;;) + cpu_relax(); + + return 1; +} + +static void smp_send_nmi_allbutself(void) +{ + cpumask_t mask = cpu_online_map; + cpu_clear(safe_smp_processor_id(), mask); + if (!cpus_empty(mask)) + send_IPI_mask(mask, NMI_VECTOR); +} + +static struct notifier_block crash_nmi_nb = { + .notifier_call = crash_nmi_callback, +}; + +static void nmi_shootdown_cpus(void) +{ + unsigned long msecs; + + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); + /* Would it be better to replace the trap vector here? */ + if (register_die_notifier(&crash_nmi_nb)) + return; /* return what? */ + /* Ensure the new callback function is set before sending + * out the NMI + */ + wmb(); + + smp_send_nmi_allbutself(); + + msecs = 1000; /* Wait at most a second for the other cpus to stop */ + while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) { + mdelay(1); + msecs--; + } + + /* Leave the nmi callback set */ + disable_local_APIC(); +} +#else +static void nmi_shootdown_cpus(void) +{ + /* There are no cpus to shootdown */ +} +#endif + +void machine_crash_shutdown(struct pt_regs *regs) +{ + /* This function is only called after the system + * has panicked or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. + * + * In practice this means shooting down the other cpus in + * an SMP system. + */ + /* The kernel is broken so disable interrupts */ + local_irq_disable(); + + /* Make a note of crashing cpu. Will be used in NMI callback.*/ + crashing_cpu = safe_smp_processor_id(); + nmi_shootdown_cpus(); + lapic_shutdown(); +#if defined(CONFIG_X86_IO_APIC) + disable_IO_APIC(); +#endif + crash_save_cpu(regs, safe_smp_processor_id()); +} diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c new file mode 100644 index 000000000000..3f532df488bc --- /dev/null +++ b/arch/x86/kernel/crash_dump_32.c @@ -0,0 +1,74 @@ +/* + * kernel/crash_dump.c - Memory preserving reboot related code. + * + * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) + * Copyright (C) IBM Corporation, 2004. All rights reserved + */ + +#include +#include +#include + +#include + +static void *kdump_buf_page; + +/** + * copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * + * Copy a page from "oldmem". For this page, there is no pte mapped + * in the current kernel. We stitch up a pte, similar to kmap_atomic. + * + * Calling copy_to_user() in atomic context is not desirable. Hence first + * copying the data to a pre-allocated kernel page and then copying to user + * space in non-atomic context. + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, int userbuf) +{ + void *vaddr; + + if (!csize) + return 0; + + vaddr = kmap_atomic_pfn(pfn, KM_PTE0); + + if (!userbuf) { + memcpy(buf, (vaddr + offset), csize); + kunmap_atomic(vaddr, KM_PTE0); + } else { + if (!kdump_buf_page) { + printk(KERN_WARNING "Kdump: Kdump buffer page not" + " allocated\n"); + return -EFAULT; + } + copy_page(kdump_buf_page, vaddr); + kunmap_atomic(vaddr, KM_PTE0); + if (copy_to_user(buf, (kdump_buf_page + offset), csize)) + return -EFAULT; + } + + return csize; +} + +static int __init kdump_buf_page_init(void) +{ + int ret = 0; + + kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!kdump_buf_page) { + printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer" + " page\n"); + ret = -ENOMEM; + } + + return ret; +} +arch_initcall(kdump_buf_page_init); diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c new file mode 100644 index 000000000000..40978af630e7 --- /dev/null +++ b/arch/x86/kernel/doublefault_32.c @@ -0,0 +1,70 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define DOUBLEFAULT_STACKSIZE (1024) +static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; +#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) + +#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM) + +static void doublefault_fn(void) +{ + struct Xgt_desc_struct gdt_desc = {0, 0}; + unsigned long gdt, tss; + + store_gdt(&gdt_desc); + gdt = gdt_desc.address; + + printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size); + + if (ptr_ok(gdt)) { + gdt += GDT_ENTRY_TSS << 3; + tss = *(u16 *)(gdt+2); + tss += *(u8 *)(gdt+4) << 16; + tss += *(u8 *)(gdt+7) << 24; + printk(KERN_EMERG "double fault, tss at %08lx\n", tss); + + if (ptr_ok(tss)) { + struct i386_hw_tss *t = (struct i386_hw_tss *)tss; + + printk(KERN_EMERG "eip = %08lx, esp = %08lx\n", t->eip, t->esp); + + printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", + t->eax, t->ebx, t->ecx, t->edx); + printk(KERN_EMERG "esi = %08lx, edi = %08lx\n", + t->esi, t->edi); + } + } + + for (;;) + cpu_relax(); +} + +struct tss_struct doublefault_tss __cacheline_aligned = { + .x86_tss = { + .esp0 = STACK_START, + .ss0 = __KERNEL_DS, + .ldt = 0, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, + + .eip = (unsigned long) doublefault_fn, + /* 0x2 bit is always set */ + .eflags = X86_EFLAGS_SF | 0x2, + .esp = STACK_START, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + .fs = __KERNEL_PERCPU, + + .__cr3 = __pa(swapper_pg_dir) + } +}; diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c new file mode 100644 index 000000000000..3c86b979a40a --- /dev/null +++ b/arch/x86/kernel/e820_32.c @@ -0,0 +1,944 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef CONFIG_EFI +int efi_enabled = 0; +EXPORT_SYMBOL(efi_enabled); +#endif + +struct e820map e820; +struct change_member { + struct e820entry *pbios; /* pointer to original bios entry */ + unsigned long long addr; /* address for this change point */ +}; +static struct change_member change_point_list[2*E820MAX] __initdata; +static struct change_member *change_point[2*E820MAX] __initdata; +static struct e820entry *overlap_list[E820MAX] __initdata; +static struct e820entry new_bios[E820MAX] __initdata; +/* For PCI or other memory-mapped resources */ +unsigned long pci_mem_start = 0x10000000; +#ifdef CONFIG_PCI +EXPORT_SYMBOL(pci_mem_start); +#endif +extern int user_defined_memmap; +struct resource data_resource = { + .name = "Kernel data", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +struct resource code_resource = { + .name = "Kernel code", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource video_ram_resource = { + .name = "Video RAM area", + .start = 0xa0000, + .end = 0xbffff, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +static struct resource standard_io_resources[] = { { + .name = "dma1", + .start = 0x0000, + .end = 0x001f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic1", + .start = 0x0020, + .end = 0x0021, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer0", + .start = 0x0040, + .end = 0x0043, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "timer1", + .start = 0x0050, + .end = 0x0053, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "keyboard", + .start = 0x0060, + .end = 0x006f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma page reg", + .start = 0x0080, + .end = 0x008f, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "pic2", + .start = 0x00a0, + .end = 0x00a1, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "dma2", + .start = 0x00c0, + .end = 0x00df, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +}, { + .name = "fpu", + .start = 0x00f0, + .end = 0x00ff, + .flags = IORESOURCE_BUSY | IORESOURCE_IO +} }; + +#define ROMSIGNATURE 0xaa55 + +static int __init romsignature(const unsigned char *rom) +{ + const unsigned short * const ptr = (const unsigned short *)rom; + unsigned short sig; + + return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; +} + +static int __init romchecksum(const unsigned char *rom, unsigned long length) +{ + unsigned char sum, c; + + for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) + sum += c; + return !length && !sum; +} + +static void __init probe_roms(void) +{ + const unsigned char *rom; + unsigned long start, length, upper; + unsigned char c; + int i; + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + if (probe_kernel_address(rom + 2, c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) */ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + if (probe_kernel_address(rom + 2, c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + +/* + * Request address space for all standard RAM and ROM resources + * and also for regions reported as reserved by the e820. + */ +static void __init +legacy_init_iomem_resources(struct resource *code_resource, struct resource *data_resource) +{ + int i; + + probe_roms(); + for (i = 0; i < e820.nr_map; i++) { + struct resource *res; +#ifndef CONFIG_RESOURCES_64BIT + if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL) + continue; +#endif + res = kzalloc(sizeof(struct resource), GFP_ATOMIC); + switch (e820.map[i].type) { + case E820_RAM: res->name = "System RAM"; break; + case E820_ACPI: res->name = "ACPI Tables"; break; + case E820_NVS: res->name = "ACPI Non-volatile Storage"; break; + default: res->name = "reserved"; + } + res->start = e820.map[i].addr; + res->end = res->start + e820.map[i].size - 1; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + if (request_resource(&iomem_resource, res)) { + kfree(res); + continue; + } + if (e820.map[i].type == E820_RAM) { + /* + * We don't know which RAM region contains kernel data, + * so we try it repeatedly and let the resource manager + * test it. + */ + request_resource(res, code_resource); + request_resource(res, data_resource); +#ifdef CONFIG_KEXEC + request_resource(res, &crashk_res); +#endif + } + } +} + +/* + * Request address space for all standard resources + * + * This is called just before pcibios_init(), which is also a + * subsys_initcall, but is linked in later (in arch/i386/pci/common.c). + */ +static int __init request_standard_resources(void) +{ + int i; + + printk("Setting up standard PCI resources\n"); + if (efi_enabled) + efi_initialize_iomem_resources(&code_resource, &data_resource); + else + legacy_init_iomem_resources(&code_resource, &data_resource); + + /* EFI systems may still have VGA */ + request_resource(&iomem_resource, &video_ram_resource); + + /* request I/O space for devices used on all i[345]86 PCs */ + for (i = 0; i < ARRAY_SIZE(standard_io_resources); i++) + request_resource(&ioport_resource, &standard_io_resources[i]); + return 0; +} + +subsys_initcall(request_standard_resources); + +#if defined(CONFIG_PM) && defined(CONFIG_HIBERNATION) +/** + * e820_mark_nosave_regions - Find the ranges of physical addresses that do not + * correspond to e820 RAM areas and mark the corresponding pages as nosave for + * hibernation. + * + * This function requires the e820 map to be sorted and without any + * overlapping entries and assumes the first e820 area to be RAM. + */ +void __init e820_mark_nosave_regions(void) +{ + int i; + unsigned long pfn; + + pfn = PFN_DOWN(e820.map[0].addr + e820.map[0].size); + for (i = 1; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (pfn < PFN_UP(ei->addr)) + register_nosave_region(pfn, PFN_UP(ei->addr)); + + pfn = PFN_DOWN(ei->addr + ei->size); + if (ei->type != E820_RAM) + register_nosave_region(PFN_UP(ei->addr), pfn); + + if (pfn >= max_low_pfn) + break; + } +} +#endif + +void __init add_memory_region(unsigned long long start, + unsigned long long size, int type) +{ + int x; + + if (!efi_enabled) { + x = e820.nr_map; + + if (x == E820MAX) { + printk(KERN_ERR "Ooops! Too many entries in the memory map!\n"); + return; + } + + e820.map[x].addr = start; + e820.map[x].size = size; + e820.map[x].type = type; + e820.nr_map++; + } +} /* add_memory_region */ + +/* + * Sanitize the BIOS e820 map. + * + * Some e820 responses include overlapping entries. The following + * replaces the original e820 map with a new one, removing overlaps. + * + */ +int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map) +{ + struct change_member *change_tmp; + unsigned long current_type, last_type; + unsigned long long last_addr; + int chgidx, still_changing; + int overlap_entries; + int new_bios_entry; + int old_nr, new_nr, chg_nr; + int i; + + /* + Visually we're performing the following (1,2,3,4 = memory types)... + + Sample memory map (w/overlaps): + ____22__________________ + ______________________4_ + ____1111________________ + _44_____________________ + 11111111________________ + ____________________33__ + ___________44___________ + __________33333_________ + ______________22________ + ___________________2222_ + _________111111111______ + _____________________11_ + _________________4______ + + Sanitized equivalent (no overlap): + 1_______________________ + _44_____________________ + ___1____________________ + ____22__________________ + ______11________________ + _________1______________ + __________3_____________ + ___________44___________ + _____________33_________ + _______________2________ + ________________1_______ + _________________4______ + ___________________2____ + ____________________33__ + ______________________4_ + */ + /* if there's only one memory region, don't bother */ + if (*pnr_map < 2) { + return -1; + } + + old_nr = *pnr_map; + + /* bail out if we find any unreasonable addresses in bios map */ + for (i=0; iaddr = biosmap[i].addr; + change_point[chgidx++]->pbios = &biosmap[i]; + change_point[chgidx]->addr = biosmap[i].addr + biosmap[i].size; + change_point[chgidx++]->pbios = &biosmap[i]; + } + } + chg_nr = chgidx; /* true number of change-points */ + + /* sort change-point list by memory addresses (low -> high) */ + still_changing = 1; + while (still_changing) { + still_changing = 0; + for (i=1; i < chg_nr; i++) { + /* if > , swap */ + /* or, if current= & last=, swap */ + if ((change_point[i]->addr < change_point[i-1]->addr) || + ((change_point[i]->addr == change_point[i-1]->addr) && + (change_point[i]->addr == change_point[i]->pbios->addr) && + (change_point[i-1]->addr != change_point[i-1]->pbios->addr)) + ) + { + change_tmp = change_point[i]; + change_point[i] = change_point[i-1]; + change_point[i-1] = change_tmp; + still_changing=1; + } + } + } + + /* create a new bios memory map, removing overlaps */ + overlap_entries=0; /* number of entries in the overlap table */ + new_bios_entry=0; /* index for creating new bios map entries */ + last_type = 0; /* start with undefined memory type */ + last_addr = 0; /* start with 0 as last starting address */ + /* loop through change-points, determining affect on the new bios map */ + for (chgidx=0; chgidx < chg_nr; chgidx++) + { + /* keep track of all overlapping bios entries */ + if (change_point[chgidx]->addr == change_point[chgidx]->pbios->addr) + { + /* add map entry to overlap list (> 1 entry implies an overlap) */ + overlap_list[overlap_entries++]=change_point[chgidx]->pbios; + } + else + { + /* remove entry from list (order independent, so swap with last) */ + for (i=0; ipbios) + overlap_list[i] = overlap_list[overlap_entries-1]; + } + overlap_entries--; + } + /* if there are overlapping entries, decide which "type" to use */ + /* (larger value takes precedence -- 1=usable, 2,3,4,4+=unusable) */ + current_type = 0; + for (i=0; itype > current_type) + current_type = overlap_list[i]->type; + /* continue building up new bios map based on this information */ + if (current_type != last_type) { + if (last_type != 0) { + new_bios[new_bios_entry].size = + change_point[chgidx]->addr - last_addr; + /* move forward only if the new size was non-zero */ + if (new_bios[new_bios_entry].size != 0) + if (++new_bios_entry >= E820MAX) + break; /* no more space left for new bios entries */ + } + if (current_type != 0) { + new_bios[new_bios_entry].addr = change_point[chgidx]->addr; + new_bios[new_bios_entry].type = current_type; + last_addr=change_point[chgidx]->addr; + } + last_type = current_type; + } + } + new_nr = new_bios_entry; /* retain count for new bios entries */ + + /* copy new bios mapping into original location */ + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + *pnr_map = new_nr; + + return 0; +} + +/* + * Copy the BIOS e820 map into a safe place. + * + * Sanity-check it while we're at it.. + * + * If we're lucky and live on a modern system, the setup code + * will have given us a memory map that we can use to properly + * set up memory. If we aren't, we'll fake a memory map. + * + * We check to see that the memory map contains at least 2 elements + * before we'll use it, because the detection code in setup.S may + * not be perfect and most every PC known to man has two memory + * regions: one from 0 to 640k, and one from 1mb up. (The IBM + * thinkpad 560x, for example, does not cooperate with the memory + * detection code.) + */ +int __init copy_e820_map(struct e820entry * biosmap, int nr_map) +{ + /* Only one memory region (or negative)? Ignore it */ + if (nr_map < 2) + return -1; + + do { + unsigned long long start = biosmap->addr; + unsigned long long size = biosmap->size; + unsigned long long end = start + size; + unsigned long type = biosmap->type; + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) + return -1; + + /* + * Some BIOSes claim RAM in the 640k - 1M region. + * Not right. Fix it up. + */ + if (type == E820_RAM) { + if (start < 0x100000ULL && end > 0xA0000ULL) { + if (start < 0xA0000ULL) + add_memory_region(start, 0xA0000ULL-start, type); + if (end <= 0x100000ULL) + continue; + start = 0x100000ULL; + size = end - start; + } + } + add_memory_region(start, size, type); + } while (biosmap++,--nr_map); + return 0; +} + +/* + * Callback for efi_memory_walk. + */ +static int __init +efi_find_max_pfn(unsigned long start, unsigned long end, void *arg) +{ + unsigned long *max_pfn = arg, pfn; + + if (start < end) { + pfn = PFN_UP(end -1); + if (pfn > *max_pfn) + *max_pfn = pfn; + } + return 0; +} + +static int __init +efi_memory_present_wrapper(unsigned long start, unsigned long end, void *arg) +{ + memory_present(0, PFN_UP(start), PFN_DOWN(end)); + return 0; +} + +/* + * Find the highest page frame number we have available + */ +void __init find_max_pfn(void) +{ + int i; + + max_pfn = 0; + if (efi_enabled) { + efi_memmap_walk(efi_find_max_pfn, &max_pfn); + efi_memmap_walk(efi_memory_present_wrapper, NULL); + return; + } + + for (i = 0; i < e820.nr_map; i++) { + unsigned long start, end; + /* RAM? */ + if (e820.map[i].type != E820_RAM) + continue; + start = PFN_UP(e820.map[i].addr); + end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + if (start >= end) + continue; + if (end > max_pfn) + max_pfn = end; + memory_present(0, start, end); + } +} + +/* + * Free all available memory for boot time allocation. Used + * as a callback function by efi_memory_walk() + */ + +static int __init +free_available_memory(unsigned long start, unsigned long end, void *arg) +{ + /* check max_low_pfn */ + if (start >= (max_low_pfn << PAGE_SHIFT)) + return 0; + if (end >= (max_low_pfn << PAGE_SHIFT)) + end = max_low_pfn << PAGE_SHIFT; + if (start < end) + free_bootmem(start, end - start); + + return 0; +} +/* + * Register fully available low RAM pages with the bootmem allocator. + */ +void __init register_bootmem_low_pages(unsigned long max_low_pfn) +{ + int i; + + if (efi_enabled) { + efi_memmap_walk(free_available_memory, NULL); + return; + } + for (i = 0; i < e820.nr_map; i++) { + unsigned long curr_pfn, last_pfn, size; + /* + * Reserve usable low memory + */ + if (e820.map[i].type != E820_RAM) + continue; + /* + * We are rounding up the start address of usable memory: + */ + curr_pfn = PFN_UP(e820.map[i].addr); + if (curr_pfn >= max_low_pfn) + continue; + /* + * ... and at the end of the usable range downwards: + */ + last_pfn = PFN_DOWN(e820.map[i].addr + e820.map[i].size); + + if (last_pfn > max_low_pfn) + last_pfn = max_low_pfn; + + /* + * .. finally, did all the rounding and playing + * around just make the area go away? + */ + if (last_pfn <= curr_pfn) + continue; + + size = last_pfn - curr_pfn; + free_bootmem(PFN_PHYS(curr_pfn), PFN_PHYS(size)); + } +} + +void __init e820_register_memory(void) +{ + unsigned long gapstart, gapsize, round; + unsigned long long last; + int i; + + /* + * Search for the bigest gap in the low 32 bits of the e820 + * memory space. + */ + last = 0x100000000ull; + gapstart = 0x10000000; + gapsize = 0x400000; + i = e820.nr_map; + while (--i >= 0) { + unsigned long long start = e820.map[i].addr; + unsigned long long end = start + e820.map[i].size; + + /* + * Since "last" is at most 4GB, we know we'll + * fit in 32 bits if this condition is true + */ + if (last > end) { + unsigned long gap = last - end; + + if (gap > gapsize) { + gapsize = gap; + gapstart = end; + } + } + if (start < last) + last = start; + } + + /* + * See how much we want to round up: start off with + * rounding to the next 1MB area. + */ + round = 0x100000; + while ((gapsize >> 4) > round) + round += round; + /* Fun with two's complement */ + pci_mem_start = (gapstart + round) & -round; + + printk("Allocating PCI resources starting at %08lx (gap: %08lx:%08lx)\n", + pci_mem_start, gapstart, gapsize); +} + +void __init print_memory_map(char *who) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + printk(" %s: %016Lx - %016Lx ", who, + e820.map[i].addr, + e820.map[i].addr + e820.map[i].size); + switch (e820.map[i].type) { + case E820_RAM: printk("(usable)\n"); + break; + case E820_RESERVED: + printk("(reserved)\n"); + break; + case E820_ACPI: + printk("(ACPI data)\n"); + break; + case E820_NVS: + printk("(ACPI NVS)\n"); + break; + default: printk("type %u\n", e820.map[i].type); + break; + } + } +} + +static __init __always_inline void efi_limit_regions(unsigned long long size) +{ + unsigned long long current_addr = 0; + efi_memory_desc_t *md, *next_md; + void *p, *p1; + int i, j; + + j = 0; + p1 = memmap.map; + for (p = p1, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { + md = p; + next_md = p1; + current_addr = md->phys_addr + + PFN_PHYS(md->num_pages); + if (is_available_memory(md)) { + if (md->phys_addr >= size) continue; + memcpy(next_md, md, memmap.desc_size); + if (current_addr >= size) { + next_md->num_pages -= + PFN_UP(current_addr-size); + } + p1 += memmap.desc_size; + next_md = p1; + j++; + } else if ((md->attribute & EFI_MEMORY_RUNTIME) == + EFI_MEMORY_RUNTIME) { + /* In order to make runtime services + * available we have to include runtime + * memory regions in memory map */ + memcpy(next_md, md, memmap.desc_size); + p1 += memmap.desc_size; + next_md = p1; + j++; + } + } + memmap.nr_map = j; + memmap.map_end = memmap.map + + (memmap.nr_map * memmap.desc_size); +} + +void __init limit_regions(unsigned long long size) +{ + unsigned long long current_addr; + int i; + + print_memory_map("limit_regions start"); + if (efi_enabled) { + efi_limit_regions(size); + return; + } + for (i = 0; i < e820.nr_map; i++) { + current_addr = e820.map[i].addr + e820.map[i].size; + if (current_addr < size) + continue; + + if (e820.map[i].type != E820_RAM) + continue; + + if (e820.map[i].addr >= size) { + /* + * This region starts past the end of the + * requested size, skip it completely. + */ + e820.nr_map = i; + } else { + e820.nr_map = i + 1; + e820.map[i].size -= current_addr - size; + } + print_memory_map("limit_regions endfor"); + return; + } + print_memory_map("limit_regions endfunc"); +} + +/* + * This function checks if any part of the range is mapped + * with type. + */ +int +e820_any_mapped(u64 start, u64 end, unsigned type) +{ + int i; + for (i = 0; i < e820.nr_map; i++) { + const struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(e820_any_mapped); + + /* + * This function checks if the entire range is mapped with type. + * + * Note: this function only works correct if the e820 table is sorted and + * not-overlapping, which is the case + */ +int __init +e820_all_mapped(unsigned long s, unsigned long e, unsigned type) +{ + u64 start = s; + u64 end = e; + int i; + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + if (type && ei->type != type) + continue; + /* is the region (part) in overlap with the current region ?*/ + if (ei->addr >= end || ei->addr + ei->size <= start) + continue; + /* if the region is at the beginning of we move + * start to the end of the region since it's ok until there + */ + if (ei->addr <= start) + start = ei->addr + ei->size; + /* if start is now at or beyond end, we're done, full + * coverage */ + if (start >= end) + return 1; /* we're done */ + } + return 0; +} + +static int __init parse_memmap(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp(arg, "exactmap") == 0) { +#ifdef CONFIG_CRASH_DUMP + /* If we are doing a crash dump, we + * still need to know the real mem + * size before original memory map is + * reset. + */ + find_max_pfn(); + saved_max_pfn = max_pfn; +#endif + e820.nr_map = 0; + user_defined_memmap = 1; + } else { + /* If the user specifies memory size, we + * limit the BIOS-provided memory map to + * that size. exactmap can be used to specify + * the exact map. mem=number can be used to + * trim the existing memory map. + */ + unsigned long long start_at, mem_size; + + mem_size = memparse(arg, &arg); + if (*arg == '@') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RAM); + } else if (*arg == '#') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_ACPI); + } else if (*arg == '$') { + start_at = memparse(arg+1, &arg); + add_memory_region(start_at, mem_size, E820_RESERVED); + } else { + limit_regions(mem_size); + user_defined_memmap = 1; + } + } + return 0; +} +early_param("memmap", parse_memmap); diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c new file mode 100644 index 000000000000..92f812ba275c --- /dev/null +++ b/arch/x86/kernel/early_printk.c @@ -0,0 +1,2 @@ + +#include "../../x86_64/kernel/early_printk.c" diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c new file mode 100644 index 000000000000..2452c6fbe992 --- /dev/null +++ b/arch/x86/kernel/efi_32.c @@ -0,0 +1,712 @@ +/* + * Extensible Firmware Interface + * + * Based on Extensible Firmware Interface Specification version 1.0 + * + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond + * Copyright (C) 1999-2002 Hewlett-Packard Co. + * David Mosberger-Tang + * Stephane Eranian + * + * All EFI Runtime Services are not implemented yet as EFI only + * supports physical mode addressing on SoftSDV. This is to be fixed + * in a future version. --drummond 1999-07-20 + * + * Implemented EFI runtime services and virtual mode calls. --davidm + * + * Goutham Rao: + * Skip non-WB memory and ignore empty memory ranges. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define EFI_DEBUG 0 +#define PFX "EFI: " + +extern efi_status_t asmlinkage efi_call_phys(void *, ...); + +struct efi efi; +EXPORT_SYMBOL(efi); +static struct efi efi_phys; +struct efi_memory_map memmap; + +/* + * We require an early boot_ioremap mapping mechanism initially + */ +extern void * boot_ioremap(unsigned long, unsigned long); + +/* + * To make EFI call EFI runtime service in physical addressing mode we need + * prelog/epilog before/after the invocation to disable interrupt, to + * claim EFI runtime service handler exclusively and to duplicate a memory in + * low memory space say 0 - 3G. + */ + +static unsigned long efi_rt_eflags; +static DEFINE_SPINLOCK(efi_rt_lock); +static pgd_t efi_bak_pg_dir_pointer[2]; + +static void efi_call_phys_prelog(void) __acquires(efi_rt_lock) +{ + unsigned long cr4; + unsigned long temp; + struct Xgt_desc_struct gdt_descr; + + spin_lock(&efi_rt_lock); + local_irq_save(efi_rt_eflags); + + /* + * If I don't have PSE, I should just duplicate two entries in page + * directory. If I have PSE, I just need to duplicate one entry in + * page directory. + */ + cr4 = read_cr4(); + + if (cr4 & X86_CR4_PSE) { + efi_bak_pg_dir_pointer[0].pgd = + swapper_pg_dir[pgd_index(0)].pgd; + swapper_pg_dir[0].pgd = + swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd; + } else { + efi_bak_pg_dir_pointer[0].pgd = + swapper_pg_dir[pgd_index(0)].pgd; + efi_bak_pg_dir_pointer[1].pgd = + swapper_pg_dir[pgd_index(0x400000)].pgd; + swapper_pg_dir[pgd_index(0)].pgd = + swapper_pg_dir[pgd_index(PAGE_OFFSET)].pgd; + temp = PAGE_OFFSET + 0x400000; + swapper_pg_dir[pgd_index(0x400000)].pgd = + swapper_pg_dir[pgd_index(temp)].pgd; + } + + /* + * After the lock is released, the original page table is restored. + */ + local_flush_tlb(); + + gdt_descr.address = __pa(get_cpu_gdt_table(0)); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} + +static void efi_call_phys_epilog(void) __releases(efi_rt_lock) +{ + unsigned long cr4; + struct Xgt_desc_struct gdt_descr; + + gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); + + cr4 = read_cr4(); + + if (cr4 & X86_CR4_PSE) { + swapper_pg_dir[pgd_index(0)].pgd = + efi_bak_pg_dir_pointer[0].pgd; + } else { + swapper_pg_dir[pgd_index(0)].pgd = + efi_bak_pg_dir_pointer[0].pgd; + swapper_pg_dir[pgd_index(0x400000)].pgd = + efi_bak_pg_dir_pointer[1].pgd; + } + + /* + * After the lock is released, the original page table is restored. + */ + local_flush_tlb(); + + local_irq_restore(efi_rt_eflags); + spin_unlock(&efi_rt_lock); +} + +static efi_status_t +phys_efi_set_virtual_address_map(unsigned long memory_map_size, + unsigned long descriptor_size, + u32 descriptor_version, + efi_memory_desc_t *virtual_map) +{ + efi_status_t status; + + efi_call_phys_prelog(); + status = efi_call_phys(efi_phys.set_virtual_address_map, + memory_map_size, descriptor_size, + descriptor_version, virtual_map); + efi_call_phys_epilog(); + return status; +} + +static efi_status_t +phys_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) +{ + efi_status_t status; + + efi_call_phys_prelog(); + status = efi_call_phys(efi_phys.get_time, tm, tc); + efi_call_phys_epilog(); + return status; +} + +inline int efi_set_rtc_mmss(unsigned long nowtime) +{ + int real_seconds, real_minutes; + efi_status_t status; + efi_time_t eft; + efi_time_cap_t cap; + + spin_lock(&efi_rt_lock); + status = efi.get_time(&eft, &cap); + spin_unlock(&efi_rt_lock); + if (status != EFI_SUCCESS) + panic("Ooops, efitime: can't read time!\n"); + real_seconds = nowtime % 60; + real_minutes = nowtime / 60; + + if (((abs(real_minutes - eft.minute) + 15)/30) & 1) + real_minutes += 30; + real_minutes %= 60; + + eft.minute = real_minutes; + eft.second = real_seconds; + + if (status != EFI_SUCCESS) { + printk("Ooops: efitime: can't read time!\n"); + return -1; + } + return 0; +} +/* + * This is used during kernel init before runtime + * services have been remapped and also during suspend, therefore, + * we'll need to call both in physical and virtual modes. + */ +inline unsigned long efi_get_time(void) +{ + efi_status_t status; + efi_time_t eft; + efi_time_cap_t cap; + + if (efi.get_time) { + /* if we are in virtual mode use remapped function */ + status = efi.get_time(&eft, &cap); + } else { + /* we are in physical mode */ + status = phys_efi_get_time(&eft, &cap); + } + + if (status != EFI_SUCCESS) + printk("Oops: efitime: can't read time status: 0x%lx\n",status); + + return mktime(eft.year, eft.month, eft.day, eft.hour, + eft.minute, eft.second); +} + +int is_available_memory(efi_memory_desc_t * md) +{ + if (!(md->attribute & EFI_MEMORY_WB)) + return 0; + + switch (md->type) { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + return 1; + } + return 0; +} + +/* + * We need to map the EFI memory map again after paging_init(). + */ +void __init efi_map_memmap(void) +{ + memmap.map = NULL; + + memmap.map = bt_ioremap((unsigned long) memmap.phys_map, + (memmap.nr_map * memmap.desc_size)); + if (memmap.map == NULL) + printk(KERN_ERR PFX "Could not remap the EFI memmap!\n"); + + memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); +} + +#if EFI_DEBUG +static void __init print_efi_memmap(void) +{ + efi_memory_desc_t *md; + void *p; + int i; + + for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { + md = p; + printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, " + "range=[0x%016llx-0x%016llx) (%lluMB)\n", + i, md->type, md->attribute, md->phys_addr, + md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), + (md->num_pages >> (20 - EFI_PAGE_SHIFT))); + } +} +#endif /* EFI_DEBUG */ + +/* + * Walks the EFI memory map and calls CALLBACK once for each EFI + * memory descriptor that has memory that is available for kernel use. + */ +void efi_memmap_walk(efi_freemem_callback_t callback, void *arg) +{ + int prev_valid = 0; + struct range { + unsigned long start; + unsigned long end; + } uninitialized_var(prev), curr; + efi_memory_desc_t *md; + unsigned long start, end; + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + + if ((md->num_pages == 0) || (!is_available_memory(md))) + continue; + + curr.start = md->phys_addr; + curr.end = curr.start + (md->num_pages << EFI_PAGE_SHIFT); + + if (!prev_valid) { + prev = curr; + prev_valid = 1; + } else { + if (curr.start < prev.start) + printk(KERN_INFO PFX "Unordered memory map\n"); + if (prev.end == curr.start) + prev.end = curr.end; + else { + start = + (unsigned long) (PAGE_ALIGN(prev.start)); + end = (unsigned long) (prev.end & PAGE_MASK); + if ((end > start) + && (*callback) (start, end, arg) < 0) + return; + prev = curr; + } + } + } + if (prev_valid) { + start = (unsigned long) PAGE_ALIGN(prev.start); + end = (unsigned long) (prev.end & PAGE_MASK); + if (end > start) + (*callback) (start, end, arg); + } +} + +void __init efi_init(void) +{ + efi_config_table_t *config_tables; + efi_runtime_services_t *runtime; + efi_char16_t *c16; + char vendor[100] = "unknown"; + unsigned long num_config_tables; + int i = 0; + + memset(&efi, 0, sizeof(efi) ); + memset(&efi_phys, 0, sizeof(efi_phys)); + + efi_phys.systab = EFI_SYSTAB; + memmap.phys_map = EFI_MEMMAP; + memmap.nr_map = EFI_MEMMAP_SIZE/EFI_MEMDESC_SIZE; + memmap.desc_version = EFI_MEMDESC_VERSION; + memmap.desc_size = EFI_MEMDESC_SIZE; + + efi.systab = (efi_system_table_t *) + boot_ioremap((unsigned long) efi_phys.systab, + sizeof(efi_system_table_t)); + /* + * Verify the EFI Table + */ + if (efi.systab == NULL) + printk(KERN_ERR PFX "Woah! Couldn't map the EFI system table.\n"); + if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + printk(KERN_ERR PFX "Woah! EFI system table signature incorrect\n"); + if ((efi.systab->hdr.revision >> 16) == 0) + printk(KERN_ERR PFX "Warning: EFI system table version " + "%d.%02d, expected 1.00 or greater\n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff); + + /* + * Grab some details from the system table + */ + num_config_tables = efi.systab->nr_tables; + config_tables = (efi_config_table_t *)efi.systab->tables; + runtime = efi.systab->runtime; + + /* + * Show what we know for posterity + */ + c16 = (efi_char16_t *) boot_ioremap(efi.systab->fw_vendor, 2); + if (c16) { + for (i = 0; i < (sizeof(vendor) - 1) && *c16; ++i) + vendor[i] = *c16++; + vendor[i] = '\0'; + } else + printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); + + printk(KERN_INFO PFX "EFI v%u.%.02u by %s \n", + efi.systab->hdr.revision >> 16, + efi.systab->hdr.revision & 0xffff, vendor); + + /* + * Let's see what config tables the firmware passed to us. + */ + config_tables = (efi_config_table_t *) + boot_ioremap((unsigned long) config_tables, + num_config_tables * sizeof(efi_config_table_t)); + + if (config_tables == NULL) + printk(KERN_ERR PFX "Could not map EFI Configuration Table!\n"); + + efi.mps = EFI_INVALID_TABLE_ADDR; + efi.acpi = EFI_INVALID_TABLE_ADDR; + efi.acpi20 = EFI_INVALID_TABLE_ADDR; + efi.smbios = EFI_INVALID_TABLE_ADDR; + efi.sal_systab = EFI_INVALID_TABLE_ADDR; + efi.boot_info = EFI_INVALID_TABLE_ADDR; + efi.hcdp = EFI_INVALID_TABLE_ADDR; + efi.uga = EFI_INVALID_TABLE_ADDR; + + for (i = 0; i < num_config_tables; i++) { + if (efi_guidcmp(config_tables[i].guid, MPS_TABLE_GUID) == 0) { + efi.mps = config_tables[i].table; + printk(KERN_INFO " MPS=0x%lx ", config_tables[i].table); + } else + if (efi_guidcmp(config_tables[i].guid, ACPI_20_TABLE_GUID) == 0) { + efi.acpi20 = config_tables[i].table; + printk(KERN_INFO " ACPI 2.0=0x%lx ", config_tables[i].table); + } else + if (efi_guidcmp(config_tables[i].guid, ACPI_TABLE_GUID) == 0) { + efi.acpi = config_tables[i].table; + printk(KERN_INFO " ACPI=0x%lx ", config_tables[i].table); + } else + if (efi_guidcmp(config_tables[i].guid, SMBIOS_TABLE_GUID) == 0) { + efi.smbios = config_tables[i].table; + printk(KERN_INFO " SMBIOS=0x%lx ", config_tables[i].table); + } else + if (efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID) == 0) { + efi.hcdp = config_tables[i].table; + printk(KERN_INFO " HCDP=0x%lx ", config_tables[i].table); + } else + if (efi_guidcmp(config_tables[i].guid, UGA_IO_PROTOCOL_GUID) == 0) { + efi.uga = config_tables[i].table; + printk(KERN_INFO " UGA=0x%lx ", config_tables[i].table); + } + } + printk("\n"); + + /* + * Check out the runtime services table. We need to map + * the runtime services table so that we can grab the physical + * address of several of the EFI runtime functions, needed to + * set the firmware into virtual mode. + */ + + runtime = (efi_runtime_services_t *) boot_ioremap((unsigned long) + runtime, + sizeof(efi_runtime_services_t)); + if (runtime != NULL) { + /* + * We will only need *early* access to the following + * two EFI runtime services before set_virtual_address_map + * is invoked. + */ + efi_phys.get_time = (efi_get_time_t *) runtime->get_time; + efi_phys.set_virtual_address_map = + (efi_set_virtual_address_map_t *) + runtime->set_virtual_address_map; + } else + printk(KERN_ERR PFX "Could not map the runtime service table!\n"); + + /* Map the EFI memory map for use until paging_init() */ + memmap.map = boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE); + if (memmap.map == NULL) + printk(KERN_ERR PFX "Could not map the EFI memory map!\n"); + + memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); + +#if EFI_DEBUG + print_efi_memmap(); +#endif +} + +static inline void __init check_range_for_systab(efi_memory_desc_t *md) +{ + if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) && + ((unsigned long)efi_phys.systab < md->phys_addr + + ((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) { + unsigned long addr; + + addr = md->virt_addr - md->phys_addr + + (unsigned long)efi_phys.systab; + efi.systab = (efi_system_table_t *)addr; + } +} + +/* + * Wrap all the virtual calls in a way that forces the parameters on the stack. + */ + +#define efi_call_virt(f, args...) \ + ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args) + +static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) +{ + return efi_call_virt(get_time, tm, tc); +} + +static efi_status_t virt_efi_set_time (efi_time_t *tm) +{ + return efi_call_virt(set_time, tm); +} + +static efi_status_t virt_efi_get_wakeup_time (efi_bool_t *enabled, + efi_bool_t *pending, + efi_time_t *tm) +{ + return efi_call_virt(get_wakeup_time, enabled, pending, tm); +} + +static efi_status_t virt_efi_set_wakeup_time (efi_bool_t enabled, + efi_time_t *tm) +{ + return efi_call_virt(set_wakeup_time, enabled, tm); +} + +static efi_status_t virt_efi_get_variable (efi_char16_t *name, + efi_guid_t *vendor, u32 *attr, + unsigned long *data_size, void *data) +{ + return efi_call_virt(get_variable, name, vendor, attr, data_size, data); +} + +static efi_status_t virt_efi_get_next_variable (unsigned long *name_size, + efi_char16_t *name, + efi_guid_t *vendor) +{ + return efi_call_virt(get_next_variable, name_size, name, vendor); +} + +static efi_status_t virt_efi_set_variable (efi_char16_t *name, + efi_guid_t *vendor, + unsigned long attr, + unsigned long data_size, void *data) +{ + return efi_call_virt(set_variable, name, vendor, attr, data_size, data); +} + +static efi_status_t virt_efi_get_next_high_mono_count (u32 *count) +{ + return efi_call_virt(get_next_high_mono_count, count); +} + +static void virt_efi_reset_system (int reset_type, efi_status_t status, + unsigned long data_size, + efi_char16_t *data) +{ + efi_call_virt(reset_system, reset_type, status, data_size, data); +} + +/* + * This function will switch the EFI runtime services to virtual mode. + * Essentially, look through the EFI memmap and map every region that + * has the runtime attribute bit set in its memory descriptor and update + * that memory descriptor with the virtual address obtained from ioremap(). + * This enables the runtime services to be called without having to + * thunk back into physical mode for every invocation. + */ + +void __init efi_enter_virtual_mode(void) +{ + efi_memory_desc_t *md; + efi_status_t status; + void *p; + + efi.systab = NULL; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + + md->virt_addr = (unsigned long)ioremap(md->phys_addr, + md->num_pages << EFI_PAGE_SHIFT); + if (!(unsigned long)md->virt_addr) { + printk(KERN_ERR PFX "ioremap of 0x%lX failed\n", + (unsigned long)md->phys_addr); + } + /* update the virtual address of the EFI system table */ + check_range_for_systab(md); + } + + BUG_ON(!efi.systab); + + status = phys_efi_set_virtual_address_map( + memmap.desc_size * memmap.nr_map, + memmap.desc_size, + memmap.desc_version, + memmap.phys_map); + + if (status != EFI_SUCCESS) { + printk (KERN_ALERT "You are screwed! " + "Unable to switch EFI into virtual mode " + "(status=%lx)\n", status); + panic("EFI call to SetVirtualAddressMap() failed!"); + } + + /* + * Now that EFI is in virtual mode, update the function + * pointers in the runtime service table to the new virtual addresses. + */ + + efi.get_time = virt_efi_get_time; + efi.set_time = virt_efi_set_time; + efi.get_wakeup_time = virt_efi_get_wakeup_time; + efi.set_wakeup_time = virt_efi_set_wakeup_time; + efi.get_variable = virt_efi_get_variable; + efi.get_next_variable = virt_efi_get_next_variable; + efi.set_variable = virt_efi_set_variable; + efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; + efi.reset_system = virt_efi_reset_system; +} + +void __init +efi_initialize_iomem_resources(struct resource *code_resource, + struct resource *data_resource) +{ + struct resource *res; + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + + if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) > + 0x100000000ULL) + continue; + res = kzalloc(sizeof(struct resource), GFP_ATOMIC); + switch (md->type) { + case EFI_RESERVED_TYPE: + res->name = "Reserved Memory"; + break; + case EFI_LOADER_CODE: + res->name = "Loader Code"; + break; + case EFI_LOADER_DATA: + res->name = "Loader Data"; + break; + case EFI_BOOT_SERVICES_DATA: + res->name = "BootServices Data"; + break; + case EFI_BOOT_SERVICES_CODE: + res->name = "BootServices Code"; + break; + case EFI_RUNTIME_SERVICES_CODE: + res->name = "Runtime Service Code"; + break; + case EFI_RUNTIME_SERVICES_DATA: + res->name = "Runtime Service Data"; + break; + case EFI_CONVENTIONAL_MEMORY: + res->name = "Conventional Memory"; + break; + case EFI_UNUSABLE_MEMORY: + res->name = "Unusable Memory"; + break; + case EFI_ACPI_RECLAIM_MEMORY: + res->name = "ACPI Reclaim"; + break; + case EFI_ACPI_MEMORY_NVS: + res->name = "ACPI NVS"; + break; + case EFI_MEMORY_MAPPED_IO: + res->name = "Memory Mapped IO"; + break; + case EFI_MEMORY_MAPPED_IO_PORT_SPACE: + res->name = "Memory Mapped IO Port Space"; + break; + default: + res->name = "Reserved"; + break; + } + res->start = md->phys_addr; + res->end = res->start + ((md->num_pages << EFI_PAGE_SHIFT) - 1); + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + if (request_resource(&iomem_resource, res) < 0) + printk(KERN_ERR PFX "Failed to allocate res %s : " + "0x%llx-0x%llx\n", res->name, + (unsigned long long)res->start, + (unsigned long long)res->end); + /* + * We don't know which region contains kernel data so we try + * it repeatedly and let the resource manager test it. + */ + if (md->type == EFI_CONVENTIONAL_MEMORY) { + request_resource(res, code_resource); + request_resource(res, data_resource); +#ifdef CONFIG_KEXEC + request_resource(res, &crashk_res); +#endif + } + } +} + +/* + * Convenience functions to obtain memory types and attributes + */ + +u32 efi_mem_type(unsigned long phys_addr) +{ + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if ((md->phys_addr <= phys_addr) && (phys_addr < + (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) )) + return md->type; + } + return 0; +} + +u64 efi_mem_attributes(unsigned long phys_addr) +{ + efi_memory_desc_t *md; + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if ((md->phys_addr <= phys_addr) && (phys_addr < + (md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) )) + return md->attribute; + } + return 0; +} diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/kernel/efi_stub_32.S new file mode 100644 index 000000000000..ef00bb77d7e4 --- /dev/null +++ b/arch/x86/kernel/efi_stub_32.S @@ -0,0 +1,122 @@ +/* + * EFI call stub for IA32. + * + * This stub allows us to make EFI calls in physical mode with interrupts + * turned off. + */ + +#include +#include + +/* + * efi_call_phys(void *, ...) is a function with variable parameters. + * All the callers of this function assure that all the parameters are 4-bytes. + */ + +/* + * In gcc calling convention, EBX, ESP, EBP, ESI and EDI are all callee save. + * So we'd better save all of them at the beginning of this function and restore + * at the end no matter how many we use, because we can not assure EFI runtime + * service functions will comply with gcc calling convention, too. + */ + +.text +ENTRY(efi_call_phys) + /* + * 0. The function can only be called in Linux kernel. So CS has been + * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found + * the values of these registers are the same. And, the corresponding + * GDT entries are identical. So I will do nothing about segment reg + * and GDT, but change GDT base register in prelog and epilog. + */ + + /* + * 1. Now I am running with EIP = + PAGE_OFFSET. + * But to make it smoothly switch from virtual mode to flat mode. + * The mapping of lower virtual memory has been created in prelog and + * epilog. + */ + movl $1f, %edx + subl $__PAGE_OFFSET, %edx + jmp *%edx +1: + + /* + * 2. Now on the top of stack is the return + * address in the caller of efi_call_phys(), then parameter 1, + * parameter 2, ..., param n. To make things easy, we save the return + * address of efi_call_phys in a global variable. + */ + popl %edx + movl %edx, saved_return_addr + /* get the function pointer into ECX*/ + popl %ecx + movl %ecx, efi_rt_function_ptr + movl $2f, %edx + subl $__PAGE_OFFSET, %edx + pushl %edx + + /* + * 3. Clear PG bit in %CR0. + */ + movl %cr0, %edx + andl $0x7fffffff, %edx + movl %edx, %cr0 + jmp 1f +1: + + /* + * 4. Adjust stack pointer. + */ + subl $__PAGE_OFFSET, %esp + + /* + * 5. Call the physical function. + */ + jmp *%ecx + +2: + /* + * 6. After EFI runtime service returns, control will return to + * following instruction. We'd better readjust stack pointer first. + */ + addl $__PAGE_OFFSET, %esp + + /* + * 7. Restore PG bit + */ + movl %cr0, %edx + orl $0x80000000, %edx + movl %edx, %cr0 + jmp 1f +1: + /* + * 8. Now restore the virtual mode from flat mode by + * adding EIP with PAGE_OFFSET. + */ + movl $1f, %edx + jmp *%edx +1: + + /* + * 9. Balance the stack. And because EAX contain the return value, + * we'd better not clobber it. + */ + leal efi_rt_function_ptr, %edx + movl (%edx), %ecx + pushl %ecx + + /* + * 10. Push the saved return address onto the stack and return. + */ + leal saved_return_addr, %edx + movl (%edx), %ecx + pushl %ecx + ret +.previous + +.data +saved_return_addr: + .long 0 +efi_rt_function_ptr: + .long 0 diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S new file mode 100644 index 000000000000..290b7bc82da3 --- /dev/null +++ b/arch/x86/kernel/entry_32.S @@ -0,0 +1,1112 @@ +/* + * linux/arch/i386/entry.S + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * entry.S contains the system-call and fault low-level handling routines. + * This also contains the timer-interrupt handler, as well as all interrupts + * and faults that can result in a task-switch. + * + * NOTE: This code handles signal-recognition, which happens every time + * after a timer-interrupt and after each system call. + * + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * + * Stack layout in 'syscall_exit': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, + * ptrace.c and ptrace.h + * + * 0(%esp) - %ebx + * 4(%esp) - %ecx + * 8(%esp) - %edx + * C(%esp) - %esi + * 10(%esp) - %edi + * 14(%esp) - %ebp + * 18(%esp) - %eax + * 1C(%esp) - %ds + * 20(%esp) - %es + * 24(%esp) - %fs + * 28(%esp) - orig_eax + * 2C(%esp) - %eip + * 30(%esp) - %cs + * 34(%esp) - %eflags + * 38(%esp) - %oldesp + * 3C(%esp) - %oldss + * + * "current" is in register %ebx during any slow entries. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "irq_vectors.h" + +/* + * We use macros for low-level operations which need to be overridden + * for paravirtualization. The following will never clobber any registers: + * INTERRUPT_RETURN (aka. "iret") + * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax") + * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit"). + * + * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must + * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY). + * Allowing a register to be clobbered can shrink the paravirt replacement + * enough to patch inline, increasing performance. + */ + +#define nr_syscalls ((syscall_table_size)/4) + +CF_MASK = 0x00000001 +TF_MASK = 0x00000100 +IF_MASK = 0x00000200 +DF_MASK = 0x00000400 +NT_MASK = 0x00004000 +VM_MASK = 0x00020000 + +#ifdef CONFIG_PREEMPT +#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF +#else +#define preempt_stop(clobbers) +#define resume_kernel restore_nocheck +#endif + +.macro TRACE_IRQS_IRET +#ifdef CONFIG_TRACE_IRQFLAGS + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off? + jz 1f + TRACE_IRQS_ON +1: +#endif +.endm + +#ifdef CONFIG_VM86 +#define resume_userspace_sig check_userspace +#else +#define resume_userspace_sig resume_userspace +#endif + +#define SAVE_ALL \ + cld; \ + pushl %fs; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET fs, 0;*/\ + pushl %es; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET es, 0;*/\ + pushl %ds; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET ds, 0;*/\ + pushl %eax; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET eax, 0;\ + pushl %ebp; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ebp, 0;\ + pushl %edi; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET edi, 0;\ + pushl %esi; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET esi, 0;\ + pushl %edx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET edx, 0;\ + pushl %ecx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ecx, 0;\ + pushl %ebx; \ + CFI_ADJUST_CFA_OFFSET 4;\ + CFI_REL_OFFSET ebx, 0;\ + movl $(__USER_DS), %edx; \ + movl %edx, %ds; \ + movl %edx, %es; \ + movl $(__KERNEL_PERCPU), %edx; \ + movl %edx, %fs + +#define RESTORE_INT_REGS \ + popl %ebx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ebx;\ + popl %ecx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ecx;\ + popl %edx; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE edx;\ + popl %esi; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE esi;\ + popl %edi; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE edi;\ + popl %ebp; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE ebp;\ + popl %eax; \ + CFI_ADJUST_CFA_OFFSET -4;\ + CFI_RESTORE eax + +#define RESTORE_REGS \ + RESTORE_INT_REGS; \ +1: popl %ds; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE ds;*/\ +2: popl %es; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE es;*/\ +3: popl %fs; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE fs;*/\ +.pushsection .fixup,"ax"; \ +4: movl $0,(%esp); \ + jmp 1b; \ +5: movl $0,(%esp); \ + jmp 2b; \ +6: movl $0,(%esp); \ + jmp 3b; \ +.section __ex_table,"a";\ + .align 4; \ + .long 1b,4b; \ + .long 2b,5b; \ + .long 3b,6b; \ +.popsection + +#define RING0_INT_FRAME \ + CFI_STARTPROC simple;\ + CFI_SIGNAL_FRAME;\ + CFI_DEF_CFA esp, 3*4;\ + /*CFI_OFFSET cs, -2*4;*/\ + CFI_OFFSET eip, -3*4 + +#define RING0_EC_FRAME \ + CFI_STARTPROC simple;\ + CFI_SIGNAL_FRAME;\ + CFI_DEF_CFA esp, 4*4;\ + /*CFI_OFFSET cs, -2*4;*/\ + CFI_OFFSET eip, -3*4 + +#define RING0_PTREGS_FRAME \ + CFI_STARTPROC simple;\ + CFI_SIGNAL_FRAME;\ + CFI_DEF_CFA esp, PT_OLDESP-PT_EBX;\ + /*CFI_OFFSET cs, PT_CS-PT_OLDESP;*/\ + CFI_OFFSET eip, PT_EIP-PT_OLDESP;\ + /*CFI_OFFSET es, PT_ES-PT_OLDESP;*/\ + /*CFI_OFFSET ds, PT_DS-PT_OLDESP;*/\ + CFI_OFFSET eax, PT_EAX-PT_OLDESP;\ + CFI_OFFSET ebp, PT_EBP-PT_OLDESP;\ + CFI_OFFSET edi, PT_EDI-PT_OLDESP;\ + CFI_OFFSET esi, PT_ESI-PT_OLDESP;\ + CFI_OFFSET edx, PT_EDX-PT_OLDESP;\ + CFI_OFFSET ecx, PT_ECX-PT_OLDESP;\ + CFI_OFFSET ebx, PT_EBX-PT_OLDESP + +ENTRY(ret_from_fork) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + pushl $0x0202 # Reset kernel eflags + CFI_ADJUST_CFA_OFFSET 4 + popfl + CFI_ADJUST_CFA_OFFSET -4 + jmp syscall_exit + CFI_ENDPROC +END(ret_from_fork) + +/* + * Return to user mode is not as complex as all this looks, + * but we want the default path for a system call return to + * go as quickly as possible which is why some of this is + * less clear than it otherwise should be. + */ + + # userspace resumption stub bypassing syscall exit tracing + ALIGN + RING0_PTREGS_FRAME +ret_from_exception: + preempt_stop(CLBR_ANY) +ret_from_intr: + GET_THREAD_INFO(%ebp) +check_userspace: + movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS + movb PT_CS(%esp), %al + andl $(VM_MASK | SEGMENT_RPL_MASK), %eax + cmpl $USER_RPL, %eax + jb resume_kernel # not returning to v8086 or userspace + +ENTRY(resume_userspace) + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done on + # int/exception return? + jne work_pending + jmp restore_all +END(ret_from_exception) + +#ifdef CONFIG_PREEMPT +ENTRY(resume_kernel) + DISABLE_INTERRUPTS(CLBR_ANY) + cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? + jnz restore_nocheck +need_resched: + movl TI_flags(%ebp), %ecx # need_resched set ? + testb $_TIF_NEED_RESCHED, %cl + jz restore_all + testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ? + jz restore_all + call preempt_schedule_irq + jmp need_resched +END(resume_kernel) +#endif + CFI_ENDPROC + +/* SYSENTER_RETURN points to after the "sysenter" instruction in + the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ + + # sysenter call handler stub +ENTRY(sysenter_entry) + CFI_STARTPROC simple + CFI_SIGNAL_FRAME + CFI_DEF_CFA esp, 0 + CFI_REGISTER esp, ebp + movl TSS_sysenter_esp0(%esp),%esp +sysenter_past_esp: + /* + * No need to follow this irqs on/off section: the syscall + * disabled irqs and here we enable it straight after entry: + */ + ENABLE_INTERRUPTS(CLBR_NONE) + pushl $(__USER_DS) + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ss, 0*/ + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esp, 0 + pushfl + CFI_ADJUST_CFA_OFFSET 4 + pushl $(__USER_CS) + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET cs, 0*/ + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words + * pushed above; +8 corresponds to copy_thread's esp0 setting. + */ + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eip, 0 + +/* + * Load the potential sixth argument from user stack. + * Careful about security. + */ + cmpl $__PAGE_OFFSET-3,%ebp + jae syscall_fault +1: movl (%ebp),%ebp +.section __ex_table,"a" + .align 4 + .long 1b,syscall_fault +.previous + + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + GET_THREAD_INFO(%ebp) + + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys + call *sys_call_table(,%eax,4) + movl %eax,PT_EAX(%esp) + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx + jne syscall_exit_work +/* if something modifies registers it must also disable sysexit */ + movl PT_EIP(%esp), %edx + movl PT_OLDESP(%esp), %ecx + xorl %ebp,%ebp + TRACE_IRQS_ON +1: mov PT_FS(%esp), %fs + ENABLE_INTERRUPTS_SYSEXIT + CFI_ENDPROC +.pushsection .fixup,"ax" +2: movl $0,PT_FS(%esp) + jmp 1b +.section __ex_table,"a" + .align 4 + .long 1b,2b +.popsection +ENDPROC(sysenter_entry) + + # system call handler stub +ENTRY(system_call) + RING0_INT_FRAME # can't unwind into user space anyway + pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + GET_THREAD_INFO(%ebp) + # system call tracing in operation / emulation + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + cmpl $(nr_syscalls), %eax + jae syscall_badsys +syscall_call: + call *sys_call_table(,%eax,4) + movl %eax,PT_EAX(%esp) # store the return value +syscall_exit: + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit + jz no_singlestep + orl $_TIF_SINGLESTEP,TI_flags(%ebp) +no_singlestep: + movl TI_flags(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx # current->work + jne syscall_exit_work + +restore_all: + movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS + # Warning: PT_OLDSS(%esp) contains the wrong/random values if we + # are returning to the kernel. + # See comments in process.c:copy_thread() for details. + movb PT_OLDSS(%esp), %ah + movb PT_CS(%esp), %al + andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax + cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax + CFI_REMEMBER_STATE + je ldt_ss # returning to user-space with LDT SS +restore_nocheck: + TRACE_IRQS_IRET +restore_nocheck_notrace: + RESTORE_REGS + addl $4, %esp # skip orig_eax/error_code + CFI_ADJUST_CFA_OFFSET -4 +1: INTERRUPT_RETURN +.section .fixup,"ax" +iret_exc: + pushl $0 # no error code + pushl $do_iret_error + jmp error_code +.previous +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous + + CFI_RESTORE_STATE +ldt_ss: + larl PT_OLDSS(%esp), %eax + jnz restore_nocheck + testl $0x00400000, %eax # returning to 32bit stack? + jnz restore_nocheck # allright, normal return + +#ifdef CONFIG_PARAVIRT + /* + * The kernel can't run on a non-flat stack if paravirt mode + * is active. Rather than try to fixup the high bits of + * ESP, bypass this code entirely. This may break DOSemu + * and/or Wine support in a paravirt VM, although the option + * is still available to implement the setting of the high + * 16-bits in the INTERRUPT_RETURN paravirt-op. + */ + cmpl $0, paravirt_ops+PARAVIRT_enabled + jne restore_nocheck +#endif + + /* If returning to userspace with 16bit stack, + * try to fix the higher word of ESP, as the CPU + * won't restore it. + * This is an "official" bug of all the x86-compatible + * CPUs, which we can try to work around to make + * dosemu and wine happy. */ + movl PT_OLDESP(%esp), %eax + movl %esp, %edx + call patch_espfix_desc + pushl $__ESPFIX_SS + CFI_ADJUST_CFA_OFFSET 4 + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + DISABLE_INTERRUPTS(CLBR_EAX) + TRACE_IRQS_OFF + lss (%esp), %esp + CFI_ADJUST_CFA_OFFSET -8 + jmp restore_nocheck + CFI_ENDPROC +ENDPROC(system_call) + + # perform work that needs to be done immediately before resumption + ALIGN + RING0_PTREGS_FRAME # can't unwind into user space anyway +work_pending: + testb $_TIF_NEED_RESCHED, %cl + jz work_notifysig +work_resched: + call schedule + DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + andl $_TIF_WORK_MASK, %ecx # is there any work to be done other + # than syscall tracing? + jz restore_all + testb $_TIF_NEED_RESCHED, %cl + jnz work_resched + +work_notifysig: # deal with pending signals and + # notify-resume requests +#ifdef CONFIG_VM86 + testl $VM_MASK, PT_EFLAGS(%esp) + movl %esp, %eax + jne work_notifysig_v86 # returning to kernel-space or + # vm86-space + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace_sig + + ALIGN +work_notifysig_v86: + pushl %ecx # save ti_flags for do_notify_resume + CFI_ADJUST_CFA_OFFSET 4 + call save_v86_state # %eax contains pt_regs pointer + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + movl %eax, %esp +#else + movl %esp, %eax +#endif + xorl %edx, %edx + call do_notify_resume + jmp resume_userspace_sig +END(work_pending) + + # perform syscall exit tracing + ALIGN +syscall_trace_entry: + movl $-ENOSYS,PT_EAX(%esp) + movl %esp, %eax + xorl %edx,%edx + call do_syscall_trace + cmpl $0, %eax + jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, + # so must skip actual syscall + movl PT_ORIG_EAX(%esp), %eax + cmpl $(nr_syscalls), %eax + jnae syscall_call + jmp syscall_exit +END(syscall_trace_entry) + + # perform syscall exit tracing + ALIGN +syscall_exit_work: + testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl + jz work_pending + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call + # schedule() instead + movl %esp, %eax + movl $1, %edx + call do_syscall_trace + jmp resume_userspace +END(syscall_exit_work) + CFI_ENDPROC + + RING0_INT_FRAME # can't unwind into user space anyway +syscall_fault: + pushl %eax # save orig_eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + GET_THREAD_INFO(%ebp) + movl $-EFAULT,PT_EAX(%esp) + jmp resume_userspace +END(syscall_fault) + +syscall_badsys: + movl $-ENOSYS,PT_EAX(%esp) + jmp resume_userspace +END(syscall_badsys) + CFI_ENDPROC + +#define FIXUP_ESPFIX_STACK \ + /* since we are on a wrong stack, we cant make it a C code :( */ \ + PER_CPU(gdt_page, %ebx); \ + GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ + addl %esp, %eax; \ + pushl $__KERNEL_DS; \ + CFI_ADJUST_CFA_OFFSET 4; \ + pushl %eax; \ + CFI_ADJUST_CFA_OFFSET 4; \ + lss (%esp), %esp; \ + CFI_ADJUST_CFA_OFFSET -8; +#define UNWIND_ESPFIX_STACK \ + movl %ss, %eax; \ + /* see if on espfix stack */ \ + cmpw $__ESPFIX_SS, %ax; \ + jne 27f; \ + movl $__KERNEL_DS, %eax; \ + movl %eax, %ds; \ + movl %eax, %es; \ + /* switch to normal stack */ \ + FIXUP_ESPFIX_STACK; \ +27:; + +/* + * Build the entry stubs and pointer table with + * some assembler magic. + */ +.data +ENTRY(interrupt) +.text + +ENTRY(irq_entries_start) + RING0_INT_FRAME +vector=0 +.rept NR_IRQS + ALIGN + .if vector + CFI_ADJUST_CFA_OFFSET -4 + .endif +1: pushl $~(vector) + CFI_ADJUST_CFA_OFFSET 4 + jmp common_interrupt + .previous + .long 1b + .text +vector=vector+1 +.endr +END(irq_entries_start) + +.previous +END(interrupt) +.previous + +/* + * the CPU automatically disables interrupts when executing an IRQ vector, + * so IRQ-flags tracing has to follow that: + */ + ALIGN +common_interrupt: + SAVE_ALL + TRACE_IRQS_OFF + movl %esp,%eax + call do_IRQ + jmp ret_from_intr +ENDPROC(common_interrupt) + CFI_ENDPROC + +#define BUILD_INTERRUPT(name, nr) \ +ENTRY(name) \ + RING0_INT_FRAME; \ + pushl $~(nr); \ + CFI_ADJUST_CFA_OFFSET 4; \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ + movl %esp,%eax; \ + call smp_##name; \ + jmp ret_from_intr; \ + CFI_ENDPROC; \ +ENDPROC(name) + +/* The include is where all of the SMP etc. interrupts come from */ +#include "entry_arch.h" + +KPROBE_ENTRY(page_fault) + RING0_EC_FRAME + pushl $do_page_fault + CFI_ADJUST_CFA_OFFSET 4 + ALIGN +error_code: + /* the function address is in %fs's slot on the stack */ + pushl %es + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET es, 0*/ + pushl %ds + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET ds, 0*/ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eax, 0 + pushl %ebp + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebp, 0 + pushl %edi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edi, 0 + pushl %esi + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET esi, 0 + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ebx, 0 + cld + pushl %fs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET fs, 0*/ + movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + popl %ecx + CFI_ADJUST_CFA_OFFSET -4 + /*CFI_REGISTER es, ecx*/ + movl PT_FS(%esp), %edi # get the function address + movl PT_ORIG_EAX(%esp), %edx # get the error code + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + mov %ecx, PT_FS(%esp) + /*CFI_REL_OFFSET fs, ES*/ + movl $(__USER_DS), %ecx + movl %ecx, %ds + movl %ecx, %es + movl %esp,%eax # pt_regs pointer + call *%edi + jmp ret_from_exception + CFI_ENDPROC +KPROBE_END(page_fault) + +ENTRY(coprocessor_error) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_coprocessor_error + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(coprocessor_error) + +ENTRY(simd_coprocessor_error) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_simd_coprocessor_error + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(simd_coprocessor_error) + +ENTRY(device_not_available) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + GET_CR0_INTO_EAX + testl $0x4, %eax # EM (math emulation bit) + jne device_not_available_emulate + preempt_stop(CLBR_ANY) + call math_state_restore + jmp ret_from_exception +device_not_available_emulate: + pushl $0 # temporary storage for ORIG_EIP + CFI_ADJUST_CFA_OFFSET 4 + call math_emulate + addl $4, %esp + CFI_ADJUST_CFA_OFFSET -4 + jmp ret_from_exception + CFI_ENDPROC +END(device_not_available) + +/* + * Debug traps and NMI can happen at the one SYSENTER instruction + * that sets up the real kernel stack. Check here, since we can't + * allow the wrong stack to be used. + * + * "TSS_sysenter_esp0+12" is because the NMI/debug handler will have + * already pushed 3 words if it hits on the sysenter instruction: + * eflags, cs and eip. + * + * We just load the right stack, and push the three (known) values + * by hand onto the new stack - while updating the return eip past + * the instruction that would have done it for sysenter. + */ +#define FIX_STACK(offset, ok, label) \ + cmpw $__KERNEL_CS,4(%esp); \ + jne ok; \ +label: \ + movl TSS_sysenter_esp0+offset(%esp),%esp; \ + CFI_DEF_CFA esp, 0; \ + CFI_UNDEFINED eip; \ + pushfl; \ + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $__KERNEL_CS; \ + CFI_ADJUST_CFA_OFFSET 4; \ + pushl $sysenter_past_esp; \ + CFI_ADJUST_CFA_OFFSET 4; \ + CFI_REL_OFFSET eip, 0 + +KPROBE_ENTRY(debug) + RING0_INT_FRAME + cmpl $sysenter_entry,(%esp) + jne debug_stack_correct + FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn) +debug_stack_correct: + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # error code 0 + movl %esp,%eax # pt_regs pointer + call do_debug + jmp ret_from_exception + CFI_ENDPROC +KPROBE_END(debug) + +/* + * NMI is doubly nasty. It can happen _while_ we're handling + * a debug fault, and the debug fault hasn't yet been able to + * clear up the stack. So we first check whether we got an + * NMI on the sysenter entry path, but after that we need to + * check whether we got an NMI on the debug path where the debug + * fault happened on the sysenter path. + */ +KPROBE_ENTRY(nmi) + RING0_INT_FRAME + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %ss, %eax + cmpw $__ESPFIX_SS, %ax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + je nmi_espfix_stack + cmpl $sysenter_entry,(%esp) + je nmi_stack_fixup + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl %esp,%eax + /* Do not access memory above the end of our stack page, + * it might not exist. + */ + andl $(THREAD_SIZE-1),%eax + cmpl $(THREAD_SIZE-20),%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + jae nmi_stack_correct + cmpl $sysenter_entry,12(%esp) + je nmi_debug_stack_check +nmi_stack_correct: + /* We have a RING0_INT_FRAME here */ + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_nmi + jmp restore_nocheck_notrace + CFI_ENDPROC + +nmi_stack_fixup: + RING0_INT_FRAME + FIX_STACK(12,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_debug_stack_check: + /* We have a RING0_INT_FRAME here */ + cmpw $__KERNEL_CS,16(%esp) + jne nmi_stack_correct + cmpl $debug,(%esp) + jb nmi_stack_correct + cmpl $debug_esp_fix_insn,(%esp) + ja nmi_stack_correct + FIX_STACK(24,nmi_stack_correct, 1) + jmp nmi_stack_correct + +nmi_espfix_stack: + /* We have a RING0_INT_FRAME here. + * + * create the pointer to lss back + */ + pushl %ss + CFI_ADJUST_CFA_OFFSET 4 + pushl %esp + CFI_ADJUST_CFA_OFFSET 4 + addw $4, (%esp) + /* copy the iret frame of 12 bytes */ + .rept 3 + pushl 16(%esp) + CFI_ADJUST_CFA_OFFSET 4 + .endr + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + FIXUP_ESPFIX_STACK # %eax == %esp + xorl %edx,%edx # zero error code + call do_nmi + RESTORE_REGS + lss 12+4(%esp), %esp # back to espfix stack + CFI_ADJUST_CFA_OFFSET -24 +1: INTERRUPT_RETURN + CFI_ENDPROC +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous +KPROBE_END(nmi) + +#ifdef CONFIG_PARAVIRT +ENTRY(native_iret) +1: iret +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous +END(native_iret) + +ENTRY(native_irq_enable_sysexit) + sti + sysexit +END(native_irq_enable_sysexit) +#endif + +KPROBE_ENTRY(int3) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + xorl %edx,%edx # zero error code + movl %esp,%eax # pt_regs pointer + call do_int3 + jmp ret_from_exception + CFI_ENDPROC +KPROBE_END(int3) + +ENTRY(overflow) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_overflow + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(overflow) + +ENTRY(bounds) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_bounds + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(bounds) + +ENTRY(invalid_op) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_invalid_op + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(invalid_op) + +ENTRY(coprocessor_segment_overrun) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_coprocessor_segment_overrun + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(coprocessor_segment_overrun) + +ENTRY(invalid_TSS) + RING0_EC_FRAME + pushl $do_invalid_TSS + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(invalid_TSS) + +ENTRY(segment_not_present) + RING0_EC_FRAME + pushl $do_segment_not_present + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(segment_not_present) + +ENTRY(stack_segment) + RING0_EC_FRAME + pushl $do_stack_segment + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(stack_segment) + +KPROBE_ENTRY(general_protection) + RING0_EC_FRAME + pushl $do_general_protection + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +KPROBE_END(general_protection) + +ENTRY(alignment_check) + RING0_EC_FRAME + pushl $do_alignment_check + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(alignment_check) + +ENTRY(divide_error) + RING0_INT_FRAME + pushl $0 # no error code + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_divide_error + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(divide_error) + +#ifdef CONFIG_X86_MCE +ENTRY(machine_check) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl machine_check_vector + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(machine_check) +#endif + +ENTRY(spurious_interrupt_bug) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + pushl $do_spurious_interrupt_bug + CFI_ADJUST_CFA_OFFSET 4 + jmp error_code + CFI_ENDPROC +END(spurious_interrupt_bug) + +ENTRY(kernel_thread_helper) + pushl $0 # fake return address for unwinder + CFI_STARTPROC + movl %edx,%eax + push %edx + CFI_ADJUST_CFA_OFFSET 4 + call *%ebx + push %eax + CFI_ADJUST_CFA_OFFSET 4 + call do_exit + CFI_ENDPROC +ENDPROC(kernel_thread_helper) + +#ifdef CONFIG_XEN +ENTRY(xen_hypervisor_callback) + CFI_STARTPROC + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + TRACE_IRQS_OFF + + /* Check to see if we got the event in the critical + region in xen_iret_direct, after we've reenabled + events and checked for pending events. This simulates + iret instruction's behaviour where it delivers a + pending interrupt when enabling interrupts. */ + movl PT_EIP(%esp),%eax + cmpl $xen_iret_start_crit,%eax + jb 1f + cmpl $xen_iret_end_crit,%eax + jae 1f + + call xen_iret_crit_fixup + +1: mov %esp, %eax + call xen_evtchn_do_upcall + jmp ret_from_intr + CFI_ENDPROC +ENDPROC(xen_hypervisor_callback) + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. +ENTRY(xen_failsafe_callback) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + testl %eax,%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + lea 16(%esp),%esp + CFI_ADJUST_CFA_OFFSET -16 + jz 5f + addl $16,%esp + jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) +5: pushl $0 # EAX == 0 => Category 1 (Bad segment) + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + jmp ret_from_exception + CFI_ENDPROC + +.section .fixup,"ax" +6: xorl %eax,%eax + movl %eax,4(%esp) + jmp 1b +7: xorl %eax,%eax + movl %eax,8(%esp) + jmp 2b +8: xorl %eax,%eax + movl %eax,12(%esp) + jmp 3b +9: xorl %eax,%eax + movl %eax,16(%esp) + jmp 4b +.previous +.section __ex_table,"a" + .align 4 + .long 1b,6b + .long 2b,7b + .long 3b,8b + .long 4b,9b +.previous +ENDPROC(xen_failsafe_callback) + +#endif /* CONFIG_XEN */ + +.section .rodata,"a" +#include "syscall_table_32.S" + +syscall_table_size=(.-sys_call_table) diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c new file mode 100644 index 000000000000..41e8aec4c61d --- /dev/null +++ b/arch/x86/kernel/geode_32.c @@ -0,0 +1,155 @@ +/* + * AMD Geode southbridge support code + * Copyright (C) 2006, Advanced Micro Devices, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +static struct { + char *name; + u32 msr; + int size; + u32 base; +} lbars[] = { + { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 }, + { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 }, + { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 }, + { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 } +}; + +static void __init init_lbars(void) +{ + u32 lo, hi; + int i; + + for (i = 0; i < ARRAY_SIZE(lbars); i++) { + rdmsr(lbars[i].msr, lo, hi); + if (hi & 0x01) + lbars[i].base = lo & 0x0000ffff; + + if (lbars[i].base == 0) + printk(KERN_ERR "geode: Couldn't initialize '%s'\n", + lbars[i].name); + } +} + +int geode_get_dev_base(unsigned int dev) +{ + BUG_ON(dev >= ARRAY_SIZE(lbars)); + return lbars[dev].base; +} +EXPORT_SYMBOL_GPL(geode_get_dev_base); + +/* === GPIO API === */ + +void geode_gpio_set(unsigned int gpio, unsigned int reg) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + + if (!base) + return; + + if (gpio < 16) + outl(1 << gpio, base + reg); + else + outl(1 << (gpio - 16), base + 0x80 + reg); +} +EXPORT_SYMBOL_GPL(geode_gpio_set); + +void geode_gpio_clear(unsigned int gpio, unsigned int reg) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + + if (!base) + return; + + if (gpio < 16) + outl(1 << (gpio + 16), base + reg); + else + outl(1 << gpio, base + 0x80 + reg); +} +EXPORT_SYMBOL_GPL(geode_gpio_clear); + +int geode_gpio_isset(unsigned int gpio, unsigned int reg) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + + if (!base) + return 0; + + if (gpio < 16) + return (inl(base + reg) & (1 << gpio)) ? 1 : 0; + else + return (inl(base + 0x80 + reg) & (1 << (gpio - 16))) ? 1 : 0; +} +EXPORT_SYMBOL_GPL(geode_gpio_isset); + +void geode_gpio_set_irq(unsigned int group, unsigned int irq) +{ + u32 lo, hi; + + if (group > 7 || irq > 15) + return; + + rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi); + + lo &= ~(0xF << (group * 4)); + lo |= (irq & 0xF) << (group * 4); + + wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi); +} +EXPORT_SYMBOL_GPL(geode_gpio_set_irq); + +void geode_gpio_setup_event(unsigned int gpio, int pair, int pme) +{ + u32 base = geode_get_dev_base(GEODE_DEV_GPIO); + u32 offset, shift, val; + + if (gpio >= 24) + offset = GPIO_MAP_W; + else if (gpio >= 16) + offset = GPIO_MAP_Z; + else if (gpio >= 8) + offset = GPIO_MAP_Y; + else + offset = GPIO_MAP_X; + + shift = (gpio % 8) * 4; + + val = inl(base + offset); + + /* Clear whatever was there before */ + val &= ~(0xF << shift); + + /* And set the new value */ + + val |= ((pair & 7) << shift); + + /* Set the PME bit if this is a PME event */ + + if (pme) + val |= (1 << (shift + 3)); + + outl(val, base + offset); +} +EXPORT_SYMBOL_GPL(geode_gpio_setup_event); + +static int __init geode_southbridge_init(void) +{ + if (!is_geode()) + return -ENODEV; + + init_lbars(); + return 0; +} + +postcore_initcall(geode_southbridge_init); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S new file mode 100644 index 000000000000..9150ca9b5f80 --- /dev/null +++ b/arch/x86/kernel/head_32.S @@ -0,0 +1,578 @@ +/* + * linux/arch/i386/kernel/head.S -- the 32-bit startup code. + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * Enhanced CPU detection and feature setting code by Mike Jagdis + * and Martin Mares, November 1997. + */ + +.text +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * References to members of the new_cpu_data structure. + */ + +#define X86 new_cpu_data+CPUINFO_x86 +#define X86_VENDOR new_cpu_data+CPUINFO_x86_vendor +#define X86_MODEL new_cpu_data+CPUINFO_x86_model +#define X86_MASK new_cpu_data+CPUINFO_x86_mask +#define X86_HARD_MATH new_cpu_data+CPUINFO_hard_math +#define X86_CPUID new_cpu_data+CPUINFO_cpuid_level +#define X86_CAPABILITY new_cpu_data+CPUINFO_x86_capability +#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id + +/* + * This is how much memory *in addition to the memory covered up to + * and including _end* we need mapped initially. + * We need: + * - one bit for each possible page, but only in low memory, which means + * 2^32/4096/8 = 128K worst case (4G/4G split.) + * - enough space to map all low memory, which means + * (2^32/4096) / 1024 pages (worst case, non PAE) + * (2^32/4096) / 512 + 4 pages (worst case for PAE) + * - a few pages for allocator use before the kernel pagetable has + * been set up + * + * Modulo rounding, each megabyte assigned here requires a kilobyte of + * memory, which is currently unreclaimed. + * + * This should be a multiple of a page. + */ +LOW_PAGES = 1<<(32-PAGE_SHIFT_asm) + +#if PTRS_PER_PMD > 1 +PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD +#else +PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD) +#endif +BOOTBITMAP_SIZE = LOW_PAGES / 8 +ALLOCATOR_SLOP = 4 + +INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm + +/* + * 32-bit kernel entrypoint; only used by the boot CPU. On entry, + * %esi points to the real-mode code as a 32-bit pointer. + * CS and DS must be 4 GB flat segments, but we don't depend on + * any particular GDT layout, because we load our own as soon as we + * can. + */ +.section .text.head,"ax",@progbits +ENTRY(startup_32) + +/* + * Set segments to known values. + */ + cld + lgdt boot_gdt_descr - __PAGE_OFFSET + movl $(__BOOT_DS),%eax + movl %eax,%ds + movl %eax,%es + movl %eax,%fs + movl %eax,%gs + +/* + * Clear BSS first so that there are no surprises... + * No need to cld as DF is already clear from cld above... + */ + xorl %eax,%eax + movl $__bss_start - __PAGE_OFFSET,%edi + movl $__bss_stop - __PAGE_OFFSET,%ecx + subl %edi,%ecx + shrl $2,%ecx + rep ; stosl +/* + * Copy bootup parameters out of the way. + * Note: %esi still has the pointer to the real-mode data. + * With the kexec as boot loader, parameter segment might be loaded beyond + * kernel image and might not even be addressable by early boot page tables. + * (kexec on panic case). Hence copy out the parameters before initializing + * page tables. + */ + movl $(boot_params - __PAGE_OFFSET),%edi + movl $(PARAM_SIZE/4),%ecx + cld + rep + movsl + movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi + andl %esi,%esi + jnz 2f # New command line protocol + cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR + jne 1f + movzwl OLD_CL_OFFSET,%esi + addl $(OLD_CL_BASE_ADDR),%esi +2: + movl $(boot_command_line - __PAGE_OFFSET),%edi + movl $(COMMAND_LINE_SIZE/4),%ecx + rep + movsl +1: + +/* + * Initialize page tables. This creates a PDE and a set of page + * tables, which are located immediately beyond _end. The variable + * init_pg_tables_end is set up to point to the first "safe" location. + * Mappings are created both at virtual address 0 (identity mapping) + * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END. + * + * Warning: don't use %esi or the stack in this code. However, %esp + * can be used as a GPR if you really need it... + */ +page_pde_offset = (__PAGE_OFFSET >> 20); + + movl $(pg0 - __PAGE_OFFSET), %edi + movl $(swapper_pg_dir - __PAGE_OFFSET), %edx + movl $0x007, %eax /* 0x007 = PRESENT+RW+USER */ +10: + leal 0x007(%edi),%ecx /* Create PDE entry */ + movl %ecx,(%edx) /* Store identity PDE entry */ + movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ + addl $4,%edx + movl $1024, %ecx +11: + stosl + addl $0x1000,%eax + loop 11b + /* End condition: we must map up to and including INIT_MAP_BEYOND_END */ + /* bytes beyond the end of our own page tables; the +0x007 is the attribute bits */ + leal (INIT_MAP_BEYOND_END+0x007)(%edi),%ebp + cmpl %ebp,%eax + jb 10b + movl %edi,(init_pg_tables_end - __PAGE_OFFSET) + + xorl %ebx,%ebx /* This is the boot CPU (BSP) */ + jmp 3f +/* + * Non-boot CPU entry point; entered from trampoline.S + * We can't lgdt here, because lgdt itself uses a data segment, but + * we know the trampoline has already loaded the boot_gdt for us. + * + * If cpu hotplug is not supported then this code can go in init section + * which will be freed later + */ + +#ifndef CONFIG_HOTPLUG_CPU +.section .init.text,"ax",@progbits +#endif + + /* Do an early initialization of the fixmap area */ + movl $(swapper_pg_dir - __PAGE_OFFSET), %edx + movl $(swapper_pg_pmd - __PAGE_OFFSET), %eax + addl $0x007, %eax /* 0x007 = PRESENT+RW+USER */ + movl %eax, 4092(%edx) + +#ifdef CONFIG_SMP +ENTRY(startup_32_smp) + cld + movl $(__BOOT_DS),%eax + movl %eax,%ds + movl %eax,%es + movl %eax,%fs + movl %eax,%gs + +/* + * New page tables may be in 4Mbyte page mode and may + * be using the global pages. + * + * NOTE! If we are on a 486 we may have no cr4 at all! + * So we do not try to touch it unless we really have + * some bits in it to set. This won't work if the BSP + * implements cr4 but this AP does not -- very unlikely + * but be warned! The same applies to the pse feature + * if not equally supported. --macro + * + * NOTE! We have to correct for the fact that we're + * not yet offset PAGE_OFFSET.. + */ +#define cr4_bits mmu_cr4_features-__PAGE_OFFSET + movl cr4_bits,%edx + andl %edx,%edx + jz 6f + movl %cr4,%eax # Turn on paging options (PSE,PAE,..) + orl %edx,%eax + movl %eax,%cr4 + + btl $5, %eax # check if PAE is enabled + jnc 6f + + /* Check if extended functions are implemented */ + movl $0x80000000, %eax + cpuid + cmpl $0x80000000, %eax + jbe 6f + mov $0x80000001, %eax + cpuid + /* Execute Disable bit supported? */ + btl $20, %edx + jnc 6f + + /* Setup EFER (Extended Feature Enable Register) */ + movl $0xc0000080, %ecx + rdmsr + + btsl $11, %eax + /* Make changes effective */ + wrmsr + +6: + /* This is a secondary processor (AP) */ + xorl %ebx,%ebx + incl %ebx + +#endif /* CONFIG_SMP */ +3: + +/* + * Enable paging + */ + movl $swapper_pg_dir-__PAGE_OFFSET,%eax + movl %eax,%cr3 /* set the page table pointer.. */ + movl %cr0,%eax + orl $0x80000000,%eax + movl %eax,%cr0 /* ..and set paging (PG) bit */ + ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ +1: + /* Set up the stack pointer */ + lss stack_start,%esp + +/* + * Initialize eflags. Some BIOS's leave bits like NT set. This would + * confuse the debugger if this code is traced. + * XXX - best to initialize before switching to protected mode. + */ + pushl $0 + popfl + +#ifdef CONFIG_SMP + andl %ebx,%ebx + jz 1f /* Initial CPU cleans BSS */ + jmp checkCPUtype +1: +#endif /* CONFIG_SMP */ + +/* + * start system 32-bit setup. We need to re-do some of the things done + * in 16-bit mode for the "real" operations. + */ + call setup_idt + +checkCPUtype: + + movl $-1,X86_CPUID # -1 for no CPUID initially + +/* check if it is 486 or 386. */ +/* + * XXX - this does a lot of unnecessary setup. Alignment checks don't + * apply at our cpl of 0 and the stack ought to be aligned already, and + * we don't need to preserve eflags. + */ + + movb $3,X86 # at least 386 + pushfl # push EFLAGS + popl %eax # get EFLAGS + movl %eax,%ecx # save original EFLAGS + xorl $0x240000,%eax # flip AC and ID bits in EFLAGS + pushl %eax # copy to EFLAGS + popfl # set EFLAGS + pushfl # get new EFLAGS + popl %eax # put it in eax + xorl %ecx,%eax # change in flags + pushl %ecx # restore original EFLAGS + popfl + testl $0x40000,%eax # check if AC bit changed + je is386 + + movb $4,X86 # at least 486 + testl $0x200000,%eax # check if ID bit changed + je is486 + + /* get vendor info */ + xorl %eax,%eax # call CPUID with 0 -> return vendor ID + cpuid + movl %eax,X86_CPUID # save CPUID level + movl %ebx,X86_VENDOR_ID # lo 4 chars + movl %edx,X86_VENDOR_ID+4 # next 4 chars + movl %ecx,X86_VENDOR_ID+8 # last 4 chars + + orl %eax,%eax # do we have processor info as well? + je is486 + + movl $1,%eax # Use the CPUID instruction to get CPU type + cpuid + movb %al,%cl # save reg for future use + andb $0x0f,%ah # mask processor family + movb %ah,X86 + andb $0xf0,%al # mask model + shrb $4,%al + movb %al,X86_MODEL + andb $0x0f,%cl # mask mask revision + movb %cl,X86_MASK + movl %edx,X86_CAPABILITY + +is486: movl $0x50022,%ecx # set AM, WP, NE and MP + jmp 2f + +is386: movl $2,%ecx # set MP +2: movl %cr0,%eax + andl $0x80000011,%eax # Save PG,PE,ET + orl %ecx,%eax + movl %eax,%cr0 + + call check_x87 + lgdt early_gdt_descr + lidt idt_descr + ljmp $(__KERNEL_CS),$1f +1: movl $(__KERNEL_DS),%eax # reload all the segment registers + movl %eax,%ss # after changing gdt. + movl %eax,%fs # gets reset once there's real percpu + + movl $(__USER_DS),%eax # DS/ES contains default USER segment + movl %eax,%ds + movl %eax,%es + + xorl %eax,%eax # Clear GS and LDT + movl %eax,%gs + lldt %ax + + cld # gcc2 wants the direction flag cleared at all times + pushl $0 # fake return address for unwinder +#ifdef CONFIG_SMP + movb ready, %cl + movb $1, ready + cmpb $0,%cl # the first CPU calls start_kernel + je 1f + movl $(__KERNEL_PERCPU), %eax + movl %eax,%fs # set this cpu's percpu + jmp initialize_secondary # all other CPUs call initialize_secondary +1: +#endif /* CONFIG_SMP */ + jmp start_kernel + +/* + * We depend on ET to be correct. This checks for 287/387. + */ +check_x87: + movb $0,X86_HARD_MATH + clts + fninit + fstsw %ax + cmpb $0,%al + je 1f + movl %cr0,%eax /* no coprocessor: have to set bits */ + xorl $4,%eax /* set EM */ + movl %eax,%cr0 + ret + ALIGN +1: movb $1,X86_HARD_MATH + .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ + ret + +/* + * setup_idt + * + * sets up a idt with 256 entries pointing to + * ignore_int, interrupt gates. It doesn't actually load + * idt - that can be done only after paging has been enabled + * and the kernel moved to PAGE_OFFSET. Interrupts + * are enabled elsewhere, when we can be relatively + * sure everything is ok. + * + * Warning: %esi is live across this function. + */ +setup_idt: + lea ignore_int,%edx + movl $(__KERNEL_CS << 16),%eax + movw %dx,%ax /* selector = 0x0010 = cs */ + movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ + + lea idt_table,%edi + mov $256,%ecx +rp_sidt: + movl %eax,(%edi) + movl %edx,4(%edi) + addl $8,%edi + dec %ecx + jne rp_sidt + +.macro set_early_handler handler,trapno + lea \handler,%edx + movl $(__KERNEL_CS << 16),%eax + movw %dx,%ax + movw $0x8E00,%dx /* interrupt gate - dpl=0, present */ + lea idt_table,%edi + movl %eax,8*\trapno(%edi) + movl %edx,8*\trapno+4(%edi) +.endm + + set_early_handler handler=early_divide_err,trapno=0 + set_early_handler handler=early_illegal_opcode,trapno=6 + set_early_handler handler=early_protection_fault,trapno=13 + set_early_handler handler=early_page_fault,trapno=14 + + ret + +early_divide_err: + xor %edx,%edx + pushl $0 /* fake errcode */ + jmp early_fault + +early_illegal_opcode: + movl $6,%edx + pushl $0 /* fake errcode */ + jmp early_fault + +early_protection_fault: + movl $13,%edx + jmp early_fault + +early_page_fault: + movl $14,%edx + jmp early_fault + +early_fault: + cld +#ifdef CONFIG_PRINTK + movl $(__KERNEL_DS),%eax + movl %eax,%ds + movl %eax,%es + cmpl $2,early_recursion_flag + je hlt_loop + incl early_recursion_flag + movl %cr2,%eax + pushl %eax + pushl %edx /* trapno */ + pushl $fault_msg +#ifdef CONFIG_EARLY_PRINTK + call early_printk +#else + call printk +#endif +#endif +hlt_loop: + hlt + jmp hlt_loop + +/* This is the default interrupt "handler" :-) */ + ALIGN +ignore_int: + cld +#ifdef CONFIG_PRINTK + pushl %eax + pushl %ecx + pushl %edx + pushl %es + pushl %ds + movl $(__KERNEL_DS),%eax + movl %eax,%ds + movl %eax,%es + cmpl $2,early_recursion_flag + je hlt_loop + incl early_recursion_flag + pushl 16(%esp) + pushl 24(%esp) + pushl 32(%esp) + pushl 40(%esp) + pushl $int_msg +#ifdef CONFIG_EARLY_PRINTK + call early_printk +#else + call printk +#endif + addl $(5*4),%esp + popl %ds + popl %es + popl %edx + popl %ecx + popl %eax +#endif + iret + +.section .text +/* + * Real beginning of normal "text" segment + */ +ENTRY(stext) +ENTRY(_stext) + +/* + * BSS section + */ +.section ".bss.page_aligned","wa" + .align PAGE_SIZE_asm +ENTRY(swapper_pg_dir) + .fill 1024,4,0 +ENTRY(swapper_pg_pmd) + .fill 1024,4,0 +ENTRY(empty_zero_page) + .fill 4096,1,0 + +/* + * This starts the data section. + */ +.data +ENTRY(stack_start) + .long init_thread_union+THREAD_SIZE + .long __BOOT_DS + +ready: .byte 0 + +early_recursion_flag: + .long 0 + +int_msg: + .asciz "Unknown interrupt or fault at EIP %p %p %p\n" + +fault_msg: + .ascii "Int %d: CR2 %p err %p EIP %p CS %p flags %p\n" + .asciz "Stack: %p %p %p %p %p %p %p %p\n" + +#include "../../x86/xen/xen-head.S" + +/* + * The IDT and GDT 'descriptors' are a strange 48-bit object + * only used by the lidt and lgdt instructions. They are not + * like usual segment descriptors - they consist of a 16-bit + * segment size, and 32-bit linear address value: + */ + +.globl boot_gdt_descr +.globl idt_descr + + ALIGN +# early boot GDT descriptor (must use 1:1 address mapping) + .word 0 # 32 bit align gdt_desc.address +boot_gdt_descr: + .word __BOOT_DS+7 + .long boot_gdt - __PAGE_OFFSET + + .word 0 # 32-bit align idt_desc.address +idt_descr: + .word IDT_ENTRIES*8-1 # idt contains 256 entries + .long idt_table + +# boot GDT descriptor (later on used by CPU#0): + .word 0 # 32 bit align gdt_desc.address +ENTRY(early_gdt_descr) + .word GDT_ENTRIES*8-1 + .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ + +/* + * The boot_gdt must mirror the equivalent in setup.S and is + * used only for booting. + */ + .align L1_CACHE_BYTES +ENTRY(boot_gdt) + .fill GDT_ENTRY_BOOT_CS,8,0 + .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ + .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ diff --git a/arch/x86/kernel/hpet_32.c b/arch/x86/kernel/hpet_32.c new file mode 100644 index 000000000000..533d4932bc79 --- /dev/null +++ b/arch/x86/kernel/hpet_32.c @@ -0,0 +1,553 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +extern struct clock_event_device *global_clock_event; + +#define HPET_MASK CLOCKSOURCE_MASK(32) +#define HPET_SHIFT 22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000 + +/* + * HPET address is set in acpi/boot.c, when an ACPI entry exists + */ +unsigned long hpet_address; +static void __iomem * hpet_virt_address; + +static inline unsigned long hpet_readl(unsigned long a) +{ + return readl(hpet_virt_address + a); +} + +static inline void hpet_writel(unsigned long d, unsigned long a) +{ + writel(d, hpet_virt_address + a); +} + +/* + * HPET command line enable / disable + */ +static int boot_hpet_disable; + +static int __init hpet_setup(char* str) +{ + if (str) { + if (!strncmp("disable", str, 7)) + boot_hpet_disable = 1; + } + return 1; +} +__setup("hpet=", hpet_setup); + +static inline int is_hpet_capable(void) +{ + return (!boot_hpet_disable && hpet_address); +} + +/* + * HPET timer interrupt enable / disable + */ +static int hpet_legacy_int_enabled; + +/** + * is_hpet_enabled - check whether the hpet timer interrupt is enabled + */ +int is_hpet_enabled(void) +{ + return is_hpet_capable() && hpet_legacy_int_enabled; +} + +/* + * When the hpet driver (/dev/hpet) is enabled, we need to reserve + * timer 0 and timer 1 in case of RTC emulation. + */ +#ifdef CONFIG_HPET +static void hpet_reserve_platform_timers(unsigned long id) +{ + struct hpet __iomem *hpet = hpet_virt_address; + struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; + unsigned int nrtimers, i; + struct hpet_data hd; + + nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; + + memset(&hd, 0, sizeof (hd)); + hd.hd_phys_address = hpet_address; + hd.hd_address = hpet_virt_address; + hd.hd_nirqs = nrtimers; + hd.hd_flags = HPET_DATA_PLATFORM; + hpet_reserve_timer(&hd, 0); + +#ifdef CONFIG_HPET_EMULATE_RTC + hpet_reserve_timer(&hd, 1); +#endif + + hd.hd_irq[0] = HPET_LEGACY_8254; + hd.hd_irq[1] = HPET_LEGACY_RTC; + + for (i = 2; i < nrtimers; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; + + hpet_alloc(&hd); + +} +#else +static void hpet_reserve_platform_timers(unsigned long id) { } +#endif + +/* + * Common hpet info + */ +static unsigned long hpet_period; + +static void hpet_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt); +static int hpet_next_event(unsigned long delta, + struct clock_event_device *evt); + +/* + * The hpet clock event device + */ +static struct clock_event_device hpet_clockevent = { + .name = "hpet", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = hpet_set_mode, + .set_next_event = hpet_next_event, + .shift = 32, + .irq = 0, +}; + +static void hpet_start_counter(void) +{ + unsigned long cfg = hpet_readl(HPET_CFG); + + cfg &= ~HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); + hpet_writel(0, HPET_COUNTER); + hpet_writel(0, HPET_COUNTER + 4); + cfg |= HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); +} + +static void hpet_enable_int(void) +{ + unsigned long cfg = hpet_readl(HPET_CFG); + + cfg |= HPET_CFG_LEGACY; + hpet_writel(cfg, HPET_CFG); + hpet_legacy_int_enabled = 1; +} + +static void hpet_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long cfg, cmp, now; + uint64_t delta; + + switch(mode) { + case CLOCK_EVT_MODE_PERIODIC: + delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult; + delta >>= hpet_clockevent.shift; + now = hpet_readl(HPET_COUNTER); + cmp = now + (unsigned long) delta; + cfg = hpet_readl(HPET_T0_CFG); + cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | + HPET_TN_SETVAL | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T0_CFG); + /* + * The first write after writing TN_SETVAL to the + * config register sets the counter value, the second + * write sets the period. + */ + hpet_writel(cmp, HPET_T0_CMP); + udelay(1); + hpet_writel((unsigned long) delta, HPET_T0_CMP); + break; + + case CLOCK_EVT_MODE_ONESHOT: + cfg = hpet_readl(HPET_T0_CFG); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T0_CFG); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + cfg = hpet_readl(HPET_T0_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T0_CFG); + break; + + case CLOCK_EVT_MODE_RESUME: + hpet_enable_int(); + break; + } +} + +static int hpet_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + unsigned long cnt; + + cnt = hpet_readl(HPET_COUNTER); + cnt += delta; + hpet_writel(cnt, HPET_T0_CMP); + + return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0; +} + +/* + * Clock source related code + */ +static cycle_t read_hpet(void) +{ + return (cycle_t)hpet_readl(HPET_COUNTER); +} + +static struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = HPET_MASK, + .shift = HPET_SHIFT, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .resume = hpet_start_counter, +}; + +/* + * Try to setup the HPET timer + */ +int __init hpet_enable(void) +{ + unsigned long id; + uint64_t hpet_freq; + u64 tmp, start, now; + cycle_t t1; + + if (!is_hpet_capable()) + return 0; + + hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + + /* + * Read the period and check for a sane value: + */ + hpet_period = hpet_readl(HPET_PERIOD); + if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) + goto out_nohpet; + + /* + * The period is a femto seconds value. We need to calculate the + * scaled math multiplication factor for nanosecond to hpet tick + * conversion. + */ + hpet_freq = 1000000000000000ULL; + do_div(hpet_freq, hpet_period); + hpet_clockevent.mult = div_sc((unsigned long) hpet_freq, + NSEC_PER_SEC, 32); + /* Calculate the min / max delta */ + hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, + &hpet_clockevent); + hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, + &hpet_clockevent); + + /* + * Read the HPET ID register to retrieve the IRQ routing + * information and the number of channels + */ + id = hpet_readl(HPET_ID); + +#ifdef CONFIG_HPET_EMULATE_RTC + /* + * The legacy routing mode needs at least two channels, tick timer + * and the rtc emulation channel. + */ + if (!(id & HPET_ID_NUMBER)) + goto out_nohpet; +#endif + + /* Start the counter */ + hpet_start_counter(); + + /* Verify whether hpet counter works */ + t1 = read_hpet(); + rdtscll(start); + + /* + * We don't know the TSC frequency yet, but waiting for + * 200000 TSC cycles is safe: + * 4 GHz == 50us + * 1 GHz == 200us + */ + do { + rep_nop(); + rdtscll(now); + } while ((now - start) < 200000UL); + + if (t1 == read_hpet()) { + printk(KERN_WARNING + "HPET counter not counting. HPET disabled\n"); + goto out_nohpet; + } + + /* Initialize and register HPET clocksource + * + * hpet period is in femto seconds per cycle + * so we need to convert this to ns/cyc units + * aproximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift = mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + clocksource_register(&clocksource_hpet); + + if (id & HPET_ID_LEGSUP) { + hpet_enable_int(); + hpet_reserve_platform_timers(id); + /* + * Start hpet with the boot cpu mask and make it + * global after the IO_APIC has been initialized. + */ + hpet_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); + clockevents_register_device(&hpet_clockevent); + global_clock_event = &hpet_clockevent; + return 1; + } + return 0; + +out_nohpet: + iounmap(hpet_virt_address); + hpet_virt_address = NULL; + boot_hpet_disable = 1; + return 0; +} + + +#ifdef CONFIG_HPET_EMULATE_RTC + +/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET + * is enabled, we support RTC interrupt functionality in software. + * RTC has 3 kinds of interrupts: + * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock + * is updated + * 2) Alarm Interrupt - generate an interrupt at a specific time of day + * 3) Periodic Interrupt - generate periodic interrupt, with frequencies + * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) + * (1) and (2) above are implemented using polling at a frequency of + * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt + * overhead. (DEFAULT_RTC_INT_FREQ) + * For (3), we use interrupts at 64Hz or user specified periodic + * frequency, whichever is higher. + */ +#include +#include + +#define DEFAULT_RTC_INT_FREQ 64 +#define DEFAULT_RTC_SHIFT 6 +#define RTC_NUM_INTS 1 + +static unsigned long hpet_rtc_flags; +static unsigned long hpet_prev_update_sec; +static struct rtc_time hpet_alarm_time; +static unsigned long hpet_pie_count; +static unsigned long hpet_t1_cmp; +static unsigned long hpet_default_delta; +static unsigned long hpet_pie_delta; +static unsigned long hpet_pie_limit; + +/* + * Timer 1 for RTC emulation. We use one shot mode, as periodic mode + * is not supported by all HPET implementations for timer 1. + * + * hpet_rtc_timer_init() is called when the rtc is initialized. + */ +int hpet_rtc_timer_init(void) +{ + unsigned long cfg, cnt, delta, flags; + + if (!is_hpet_enabled()) + return 0; + + if (!hpet_default_delta) { + uint64_t clc; + + clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; + clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; + hpet_default_delta = (unsigned long) clc; + } + + if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) + delta = hpet_default_delta; + else + delta = hpet_pie_delta; + + local_irq_save(flags); + + cnt = delta + hpet_readl(HPET_COUNTER); + hpet_writel(cnt, HPET_T1_CMP); + hpet_t1_cmp = cnt; + + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T1_CFG); + + local_irq_restore(flags); + + return 1; +} + +/* + * The functions below are called from rtc driver. + * Return 0 if HPET is not being used. + * Otherwise do the necessary changes and return 1. + */ +int hpet_mask_rtc_irq_bit(unsigned long bit_mask) +{ + if (!is_hpet_enabled()) + return 0; + + hpet_rtc_flags &= ~bit_mask; + return 1; +} + +int hpet_set_rtc_irq_bit(unsigned long bit_mask) +{ + unsigned long oldbits = hpet_rtc_flags; + + if (!is_hpet_enabled()) + return 0; + + hpet_rtc_flags |= bit_mask; + + if (!oldbits) + hpet_rtc_timer_init(); + + return 1; +} + +int hpet_set_alarm_time(unsigned char hrs, unsigned char min, + unsigned char sec) +{ + if (!is_hpet_enabled()) + return 0; + + hpet_alarm_time.tm_hour = hrs; + hpet_alarm_time.tm_min = min; + hpet_alarm_time.tm_sec = sec; + + return 1; +} + +int hpet_set_periodic_freq(unsigned long freq) +{ + uint64_t clc; + + if (!is_hpet_enabled()) + return 0; + + if (freq <= DEFAULT_RTC_INT_FREQ) + hpet_pie_limit = DEFAULT_RTC_INT_FREQ / freq; + else { + clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; + do_div(clc, freq); + clc >>= hpet_clockevent.shift; + hpet_pie_delta = (unsigned long) clc; + } + return 1; +} + +int hpet_rtc_dropped_irq(void) +{ + return is_hpet_enabled(); +} + +static void hpet_rtc_timer_reinit(void) +{ + unsigned long cfg, delta; + int lost_ints = -1; + + if (unlikely(!hpet_rtc_flags)) { + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T1_CFG); + return; + } + + if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) + delta = hpet_default_delta; + else + delta = hpet_pie_delta; + + /* + * Increment the comparator value until we are ahead of the + * current count. + */ + do { + hpet_t1_cmp += delta; + hpet_writel(hpet_t1_cmp, HPET_T1_CMP); + lost_ints++; + } while ((long)(hpet_readl(HPET_COUNTER) - hpet_t1_cmp) > 0); + + if (lost_ints) { + if (hpet_rtc_flags & RTC_PIE) + hpet_pie_count += lost_ints; + if (printk_ratelimit()) + printk(KERN_WARNING "rtc: lost %d interrupts\n", + lost_ints); + } +} + +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) +{ + struct rtc_time curr_time; + unsigned long rtc_int_flag = 0; + + hpet_rtc_timer_reinit(); + + if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) + rtc_get_rtc_time(&curr_time); + + if (hpet_rtc_flags & RTC_UIE && + curr_time.tm_sec != hpet_prev_update_sec) { + rtc_int_flag = RTC_UF; + hpet_prev_update_sec = curr_time.tm_sec; + } + + if (hpet_rtc_flags & RTC_PIE && + ++hpet_pie_count >= hpet_pie_limit) { + rtc_int_flag |= RTC_PF; + hpet_pie_count = 0; + } + + if (hpet_rtc_flags & RTC_PIE && + (curr_time.tm_sec == hpet_alarm_time.tm_sec) && + (curr_time.tm_min == hpet_alarm_time.tm_min) && + (curr_time.tm_hour == hpet_alarm_time.tm_hour)) + rtc_int_flag |= RTC_AF; + + if (rtc_int_flag) { + rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); + rtc_interrupt(rtc_int_flag, dev_id); + } + return IRQ_HANDLED; +} +#endif diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c new file mode 100644 index 000000000000..e3d4b73bfdb0 --- /dev/null +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -0,0 +1,30 @@ +#include +#include +#include + +EXPORT_SYMBOL(__down_failed); +EXPORT_SYMBOL(__down_failed_interruptible); +EXPORT_SYMBOL(__down_failed_trylock); +EXPORT_SYMBOL(__up_wakeup); +/* Networking helper routines. */ +EXPORT_SYMBOL(csum_partial_copy_generic); + +EXPORT_SYMBOL(__get_user_1); +EXPORT_SYMBOL(__get_user_2); +EXPORT_SYMBOL(__get_user_4); + +EXPORT_SYMBOL(__put_user_1); +EXPORT_SYMBOL(__put_user_2); +EXPORT_SYMBOL(__put_user_4); +EXPORT_SYMBOL(__put_user_8); + +EXPORT_SYMBOL(strstr); + +#ifdef CONFIG_SMP +extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); +extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); +EXPORT_SYMBOL(__write_lock_failed); +EXPORT_SYMBOL(__read_lock_failed); +#endif + +EXPORT_SYMBOL(csum_partial); diff --git a/arch/x86/kernel/i387_32.c b/arch/x86/kernel/i387_32.c new file mode 100644 index 000000000000..665847281ed2 --- /dev/null +++ b/arch/x86/kernel/i387_32.c @@ -0,0 +1,546 @@ +/* + * linux/arch/i386/kernel/i387.c + * + * Copyright (C) 1994 Linus Torvalds + * + * Pentium III FXSR, SSE support + * General FPU state handling cleanups + * Gareth Hughes , May 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_MATH_EMULATION +#define HAVE_HWFP (boot_cpu_data.hard_math) +#else +#define HAVE_HWFP 1 +#endif + +static unsigned long mxcsr_feature_mask __read_mostly = 0xffffffff; + +void mxcsr_feature_mask_init(void) +{ + unsigned long mask = 0; + clts(); + if (cpu_has_fxsr) { + memset(¤t->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); + asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave)); + mask = current->thread.i387.fxsave.mxcsr_mask; + if (mask == 0) mask = 0x0000ffbf; + } + mxcsr_feature_mask &= mask; + stts(); +} + +/* + * The _current_ task is using the FPU for the first time + * so initialize it and set the mxcsr to its default + * value at reset if we support XMM instructions and then + * remeber the current task has used the FPU. + */ +void init_fpu(struct task_struct *tsk) +{ + if (cpu_has_fxsr) { + memset(&tsk->thread.i387.fxsave, 0, sizeof(struct i387_fxsave_struct)); + tsk->thread.i387.fxsave.cwd = 0x37f; + if (cpu_has_xmm) + tsk->thread.i387.fxsave.mxcsr = 0x1f80; + } else { + memset(&tsk->thread.i387.fsave, 0, sizeof(struct i387_fsave_struct)); + tsk->thread.i387.fsave.cwd = 0xffff037fu; + tsk->thread.i387.fsave.swd = 0xffff0000u; + tsk->thread.i387.fsave.twd = 0xffffffffu; + tsk->thread.i387.fsave.fos = 0xffff0000u; + } + /* only the device not available exception or ptrace can call init_fpu */ + set_stopped_child_used_math(tsk); +} + +/* + * FPU lazy state save handling. + */ + +void kernel_fpu_begin(void) +{ + struct thread_info *thread = current_thread_info(); + + preempt_disable(); + if (thread->status & TS_USEDFPU) { + __save_init_fpu(thread->task); + return; + } + clts(); +} +EXPORT_SYMBOL_GPL(kernel_fpu_begin); + +/* + * FPU tag word conversions. + */ + +static inline unsigned short twd_i387_to_fxsr( unsigned short twd ) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. */ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + return tmp; +} + +static inline unsigned long twd_fxsr_to_i387( struct i387_fxsave_struct *fxsave ) +{ + struct _fpxreg *st = NULL; + unsigned long tos = (fxsave->swd >> 11) & 7; + unsigned long twd = (unsigned long) fxsave->twd; + unsigned long tag; + unsigned long ret = 0xffff0000u; + int i; + +#define FPREG_ADDR(f, n) ((void *)&(f)->st_space + (n) * 16); + + for ( i = 0 ; i < 8 ; i++ ) { + if ( twd & 0x1 ) { + st = FPREG_ADDR( fxsave, (i - tos) & 7 ); + + switch ( st->exponent & 0x7fff ) { + case 0x7fff: + tag = 2; /* Special */ + break; + case 0x0000: + if ( !st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3] ) { + tag = 1; /* Zero */ + } else { + tag = 2; /* Special */ + } + break; + default: + if ( st->significand[3] & 0x8000 ) { + tag = 0; /* Valid */ + } else { + tag = 2; /* Special */ + } + break; + } + } else { + tag = 3; /* Empty */ + } + ret |= (tag << (2 * i)); + twd = twd >> 1; + } + return ret; +} + +/* + * FPU state interaction. + */ + +unsigned short get_fpu_cwd( struct task_struct *tsk ) +{ + if ( cpu_has_fxsr ) { + return tsk->thread.i387.fxsave.cwd; + } else { + return (unsigned short)tsk->thread.i387.fsave.cwd; + } +} + +unsigned short get_fpu_swd( struct task_struct *tsk ) +{ + if ( cpu_has_fxsr ) { + return tsk->thread.i387.fxsave.swd; + } else { + return (unsigned short)tsk->thread.i387.fsave.swd; + } +} + +#if 0 +unsigned short get_fpu_twd( struct task_struct *tsk ) +{ + if ( cpu_has_fxsr ) { + return tsk->thread.i387.fxsave.twd; + } else { + return (unsigned short)tsk->thread.i387.fsave.twd; + } +} +#endif /* 0 */ + +unsigned short get_fpu_mxcsr( struct task_struct *tsk ) +{ + if ( cpu_has_xmm ) { + return tsk->thread.i387.fxsave.mxcsr; + } else { + return 0x1f80; + } +} + +#if 0 + +void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd ) +{ + if ( cpu_has_fxsr ) { + tsk->thread.i387.fxsave.cwd = cwd; + } else { + tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000u); + } +} + +void set_fpu_swd( struct task_struct *tsk, unsigned short swd ) +{ + if ( cpu_has_fxsr ) { + tsk->thread.i387.fxsave.swd = swd; + } else { + tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000u); + } +} + +void set_fpu_twd( struct task_struct *tsk, unsigned short twd ) +{ + if ( cpu_has_fxsr ) { + tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd); + } else { + tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000u); + } +} + +#endif /* 0 */ + +/* + * FXSR floating point environment conversions. + */ + +static int convert_fxsr_to_user( struct _fpstate __user *buf, + struct i387_fxsave_struct *fxsave ) +{ + unsigned long env[7]; + struct _fpreg __user *to; + struct _fpxreg *from; + int i; + + env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul; + env[1] = (unsigned long)fxsave->swd | 0xffff0000ul; + env[2] = twd_fxsr_to_i387(fxsave); + env[3] = fxsave->fip; + env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16); + env[5] = fxsave->foo; + env[6] = fxsave->fos; + + if ( __copy_to_user( buf, env, 7 * sizeof(unsigned long) ) ) + return 1; + + to = &buf->_st[0]; + from = (struct _fpxreg *) &fxsave->st_space[0]; + for ( i = 0 ; i < 8 ; i++, to++, from++ ) { + unsigned long __user *t = (unsigned long __user *)to; + unsigned long *f = (unsigned long *)from; + + if (__put_user(*f, t) || + __put_user(*(f + 1), t + 1) || + __put_user(from->exponent, &to->exponent)) + return 1; + } + return 0; +} + +static int convert_fxsr_from_user( struct i387_fxsave_struct *fxsave, + struct _fpstate __user *buf ) +{ + unsigned long env[7]; + struct _fpxreg *to; + struct _fpreg __user *from; + int i; + + if ( __copy_from_user( env, buf, 7 * sizeof(long) ) ) + return 1; + + fxsave->cwd = (unsigned short)(env[0] & 0xffff); + fxsave->swd = (unsigned short)(env[1] & 0xffff); + fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff)); + fxsave->fip = env[3]; + fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16); + fxsave->fcs = (env[4] & 0xffff); + fxsave->foo = env[5]; + fxsave->fos = env[6]; + + to = (struct _fpxreg *) &fxsave->st_space[0]; + from = &buf->_st[0]; + for ( i = 0 ; i < 8 ; i++, to++, from++ ) { + unsigned long *t = (unsigned long *)to; + unsigned long __user *f = (unsigned long __user *)from; + + if (__get_user(*t, f) || + __get_user(*(t + 1), f + 1) || + __get_user(to->exponent, &from->exponent)) + return 1; + } + return 0; +} + +/* + * Signal frame handlers. + */ + +static inline int save_i387_fsave( struct _fpstate __user *buf ) +{ + struct task_struct *tsk = current; + + unlazy_fpu( tsk ); + tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd; + if ( __copy_to_user( buf, &tsk->thread.i387.fsave, + sizeof(struct i387_fsave_struct) ) ) + return -1; + return 1; +} + +static int save_i387_fxsave( struct _fpstate __user *buf ) +{ + struct task_struct *tsk = current; + int err = 0; + + unlazy_fpu( tsk ); + + if ( convert_fxsr_to_user( buf, &tsk->thread.i387.fxsave ) ) + return -1; + + err |= __put_user( tsk->thread.i387.fxsave.swd, &buf->status ); + err |= __put_user( X86_FXSR_MAGIC, &buf->magic ); + if ( err ) + return -1; + + if ( __copy_to_user( &buf->_fxsr_env[0], &tsk->thread.i387.fxsave, + sizeof(struct i387_fxsave_struct) ) ) + return -1; + return 1; +} + +int save_i387( struct _fpstate __user *buf ) +{ + if ( !used_math() ) + return 0; + + /* This will cause a "finit" to be triggered by the next + * attempted FPU operation by the 'current' process. + */ + clear_used_math(); + + if ( HAVE_HWFP ) { + if ( cpu_has_fxsr ) { + return save_i387_fxsave( buf ); + } else { + return save_i387_fsave( buf ); + } + } else { + return save_i387_soft( ¤t->thread.i387.soft, buf ); + } +} + +static inline int restore_i387_fsave( struct _fpstate __user *buf ) +{ + struct task_struct *tsk = current; + clear_fpu( tsk ); + return __copy_from_user( &tsk->thread.i387.fsave, buf, + sizeof(struct i387_fsave_struct) ); +} + +static int restore_i387_fxsave( struct _fpstate __user *buf ) +{ + int err; + struct task_struct *tsk = current; + clear_fpu( tsk ); + err = __copy_from_user( &tsk->thread.i387.fxsave, &buf->_fxsr_env[0], + sizeof(struct i387_fxsave_struct) ); + /* mxcsr reserved bits must be masked to zero for security reasons */ + tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; + return err ? 1 : convert_fxsr_from_user( &tsk->thread.i387.fxsave, buf ); +} + +int restore_i387( struct _fpstate __user *buf ) +{ + int err; + + if ( HAVE_HWFP ) { + if ( cpu_has_fxsr ) { + err = restore_i387_fxsave( buf ); + } else { + err = restore_i387_fsave( buf ); + } + } else { + err = restore_i387_soft( ¤t->thread.i387.soft, buf ); + } + set_used_math(); + return err; +} + +/* + * ptrace request handlers. + */ + +static inline int get_fpregs_fsave( struct user_i387_struct __user *buf, + struct task_struct *tsk ) +{ + return __copy_to_user( buf, &tsk->thread.i387.fsave, + sizeof(struct user_i387_struct) ); +} + +static inline int get_fpregs_fxsave( struct user_i387_struct __user *buf, + struct task_struct *tsk ) +{ + return convert_fxsr_to_user( (struct _fpstate __user *)buf, + &tsk->thread.i387.fxsave ); +} + +int get_fpregs( struct user_i387_struct __user *buf, struct task_struct *tsk ) +{ + if ( HAVE_HWFP ) { + if ( cpu_has_fxsr ) { + return get_fpregs_fxsave( buf, tsk ); + } else { + return get_fpregs_fsave( buf, tsk ); + } + } else { + return save_i387_soft( &tsk->thread.i387.soft, + (struct _fpstate __user *)buf ); + } +} + +static inline int set_fpregs_fsave( struct task_struct *tsk, + struct user_i387_struct __user *buf ) +{ + return __copy_from_user( &tsk->thread.i387.fsave, buf, + sizeof(struct user_i387_struct) ); +} + +static inline int set_fpregs_fxsave( struct task_struct *tsk, + struct user_i387_struct __user *buf ) +{ + return convert_fxsr_from_user( &tsk->thread.i387.fxsave, + (struct _fpstate __user *)buf ); +} + +int set_fpregs( struct task_struct *tsk, struct user_i387_struct __user *buf ) +{ + if ( HAVE_HWFP ) { + if ( cpu_has_fxsr ) { + return set_fpregs_fxsave( tsk, buf ); + } else { + return set_fpregs_fsave( tsk, buf ); + } + } else { + return restore_i387_soft( &tsk->thread.i387.soft, + (struct _fpstate __user *)buf ); + } +} + +int get_fpxregs( struct user_fxsr_struct __user *buf, struct task_struct *tsk ) +{ + if ( cpu_has_fxsr ) { + if (__copy_to_user( buf, &tsk->thread.i387.fxsave, + sizeof(struct user_fxsr_struct) )) + return -EFAULT; + return 0; + } else { + return -EIO; + } +} + +int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct __user *buf ) +{ + int ret = 0; + + if ( cpu_has_fxsr ) { + if (__copy_from_user( &tsk->thread.i387.fxsave, buf, + sizeof(struct user_fxsr_struct) )) + ret = -EFAULT; + /* mxcsr reserved bits must be masked to zero for security reasons */ + tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask; + } else { + ret = -EIO; + } + return ret; +} + +/* + * FPU state for core dumps. + */ + +static inline void copy_fpu_fsave( struct task_struct *tsk, + struct user_i387_struct *fpu ) +{ + memcpy( fpu, &tsk->thread.i387.fsave, + sizeof(struct user_i387_struct) ); +} + +static inline void copy_fpu_fxsave( struct task_struct *tsk, + struct user_i387_struct *fpu ) +{ + unsigned short *to; + unsigned short *from; + int i; + + memcpy( fpu, &tsk->thread.i387.fxsave, 7 * sizeof(long) ); + + to = (unsigned short *)&fpu->st_space[0]; + from = (unsigned short *)&tsk->thread.i387.fxsave.st_space[0]; + for ( i = 0 ; i < 8 ; i++, to += 5, from += 8 ) { + memcpy( to, from, 5 * sizeof(unsigned short) ); + } +} + +int dump_fpu( struct pt_regs *regs, struct user_i387_struct *fpu ) +{ + int fpvalid; + struct task_struct *tsk = current; + + fpvalid = !!used_math(); + if ( fpvalid ) { + unlazy_fpu( tsk ); + if ( cpu_has_fxsr ) { + copy_fpu_fxsave( tsk, fpu ); + } else { + copy_fpu_fsave( tsk, fpu ); + } + } + + return fpvalid; +} +EXPORT_SYMBOL(dump_fpu); + +int dump_task_fpu(struct task_struct *tsk, struct user_i387_struct *fpu) +{ + int fpvalid = !!tsk_used_math(tsk); + + if (fpvalid) { + if (tsk == current) + unlazy_fpu(tsk); + if (cpu_has_fxsr) + copy_fpu_fxsave(tsk, fpu); + else + copy_fpu_fsave(tsk, fpu); + } + return fpvalid; +} + +int dump_task_extended_fpu(struct task_struct *tsk, struct user_fxsr_struct *fpu) +{ + int fpvalid = tsk_used_math(tsk) && cpu_has_fxsr; + + if (fpvalid) { + if (tsk == current) + unlazy_fpu(tsk); + memcpy(fpu, &tsk->thread.i387.fxsave, sizeof(*fpu)); + } + return fpvalid; +} diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c new file mode 100644 index 000000000000..6f508e8d7c57 --- /dev/null +++ b/arch/x86/kernel/i8237.c @@ -0,0 +1,72 @@ +/* + * i8237.c: 8237A DMA controller suspend functions. + * + * Written by Pierre Ossman, 2005. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + */ + +#include +#include + +#include + +/* + * This module just handles suspend/resume issues with the + * 8237A DMA controller (used for ISA and LPC). + * Allocation is handled in kernel/dma.c and normal usage is + * in asm/dma.h. + */ + +static int i8237A_resume(struct sys_device *dev) +{ + unsigned long flags; + int i; + + flags = claim_dma_lock(); + + dma_outb(DMA1_RESET_REG, 0); + dma_outb(DMA2_RESET_REG, 0); + + for (i = 0;i < 8;i++) { + set_dma_addr(i, 0x000000); + /* DMA count is a bit weird so this is not 0 */ + set_dma_count(i, 1); + } + + /* Enable cascade DMA or channel 0-3 won't work */ + enable_dma(4); + + release_dma_lock(flags); + + return 0; +} + +static int i8237A_suspend(struct sys_device *dev, pm_message_t state) +{ + return 0; +} + +static struct sysdev_class i8237_sysdev_class = { + set_kset_name("i8237"), + .suspend = i8237A_suspend, + .resume = i8237A_resume, +}; + +static struct sys_device device_i8237A = { + .id = 0, + .cls = &i8237_sysdev_class, +}; + +static int __init i8237A_init_sysfs(void) +{ + int error = sysdev_class_register(&i8237_sysdev_class); + if (!error) + error = sysdev_register(&device_i8237A); + return error; +} + +device_initcall(i8237A_init_sysfs); diff --git a/arch/x86/kernel/i8253_32.c b/arch/x86/kernel/i8253_32.c new file mode 100644 index 000000000000..6d839f2f1b1a --- /dev/null +++ b/arch/x86/kernel/i8253_32.c @@ -0,0 +1,206 @@ +/* + * i8253.c 8253/PIT functions + * + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +DEFINE_SPINLOCK(i8253_lock); +EXPORT_SYMBOL(i8253_lock); + +/* + * HPET replaces the PIT, when enabled. So we need to know, which of + * the two timers is used + */ +struct clock_event_device *global_clock_event; + +/* + * Initialize the PIT timer. + * + * This is also called after resume to bring the PIT into operation again. + */ +static void init_pit_timer(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + + switch(mode) { + case CLOCK_EVT_MODE_PERIODIC: + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + break; + + case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_UNUSED: + if (evt->mode == CLOCK_EVT_MODE_PERIODIC || + evt->mode == CLOCK_EVT_MODE_ONESHOT) { + outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); + outb_p(0, PIT_CH0); + } + break; + + case CLOCK_EVT_MODE_ONESHOT: + /* One shot setup */ + outb_p(0x38, PIT_MODE); + break; + + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ + break; + } + spin_unlock_irqrestore(&i8253_lock, flags); +} + +/* + * Program the next event in oneshot mode + * + * Delta is given in PIT ticks + */ +static int pit_next_event(unsigned long delta, struct clock_event_device *evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + outb_p(delta & 0xff , PIT_CH0); /* LSB */ + outb(delta >> 8 , PIT_CH0); /* MSB */ + spin_unlock_irqrestore(&i8253_lock, flags); + + return 0; +} + +/* + * On UP the PIT can serve all of the possible timer functions. On SMP systems + * it can be solely used for the global tick. + * + * The profiling and update capabilites are switched off once the local apic is + * registered. This mechanism replaces the previous #ifdef LOCAL_APIC - + * !using_apic_timer decisions in do_timer_interrupt_hook() + */ +struct clock_event_device pit_clockevent = { + .name = "pit", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = init_pit_timer, + .set_next_event = pit_next_event, + .shift = 32, + .irq = 0, +}; + +/* + * Initialize the conversion factor and the min/max deltas of the clock event + * structure and register the clock event source with the framework. + */ +void __init setup_pit_timer(void) +{ + /* + * Start pit with the boot cpu mask and make it global after the + * IO_APIC has been initialized. + */ + pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); + pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32); + pit_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFF, &pit_clockevent); + pit_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &pit_clockevent); + clockevents_register_device(&pit_clockevent); + global_clock_event = &pit_clockevent; +} + +/* + * Since the PIT overflows every tick, its not very useful + * to just read by itself. So use jiffies to emulate a free + * running counter: + */ +static cycle_t pit_read(void) +{ + unsigned long flags; + int count; + u32 jifs; + static int old_count; + static u32 old_jifs; + + spin_lock_irqsave(&i8253_lock, flags); + /* + * Although our caller may have the read side of xtime_lock, + * this is now a seqlock, and we are cheating in this routine + * by having side effects on state that we cannot undo if + * there is a collision on the seqlock and our caller has to + * retry. (Namely, old_jifs and old_count.) So we must treat + * jiffies as volatile despite the lock. We read jiffies + * before latching the timer count to guarantee that although + * the jiffies value might be older than the count (that is, + * the counter may underflow between the last point where + * jiffies was incremented and the point where we latch the + * count), it cannot be newer. + */ + jifs = jiffies; + outb_p(0x00, PIT_MODE); /* latch the count ASAP */ + count = inb_p(PIT_CH0); /* read the latched count */ + count |= inb_p(PIT_CH0) << 8; + + /* VIA686a test code... reset the latch if count > max + 1 */ + if (count > LATCH) { + outb_p(0x34, PIT_MODE); + outb_p(LATCH & 0xff, PIT_CH0); + outb(LATCH >> 8, PIT_CH0); + count = LATCH - 1; + } + + /* + * It's possible for count to appear to go the wrong way for a + * couple of reasons: + * + * 1. The timer counter underflows, but we haven't handled the + * resulting interrupt and incremented jiffies yet. + * 2. Hardware problem with the timer, not giving us continuous time, + * the counter does small "jumps" upwards on some Pentium systems, + * (see c't 95/10 page 335 for Neptun bug.) + * + * Previous attempts to handle these cases intelligently were + * buggy, so we just do the simple thing now. + */ + if (count > old_count && jifs == old_jifs) { + count = old_count; + } + old_count = count; + old_jifs = jifs; + + spin_unlock_irqrestore(&i8253_lock, flags); + + count = (LATCH - 1) - count; + + return (cycle_t)(jifs * LATCH) + count; +} + +static struct clocksource clocksource_pit = { + .name = "pit", + .rating = 110, + .read = pit_read, + .mask = CLOCKSOURCE_MASK(32), + .mult = 0, + .shift = 20, +}; + +static int __init init_pit_clocksource(void) +{ + if (num_possible_cpus() > 1) /* PIT does not scale! */ + return 0; + + clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); + return clocksource_register(&clocksource_pit); +} +arch_initcall(init_pit_clocksource); diff --git a/arch/x86/kernel/i8259_32.c b/arch/x86/kernel/i8259_32.c new file mode 100644 index 000000000000..0499cbe9871a --- /dev/null +++ b/arch/x86/kernel/i8259_32.c @@ -0,0 +1,420 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes. + * plus some generic x86 specific things if generic specifics makes + * any sense at all. + * this file should become arch/i386/kernel/irq.c when the old irq.c + * moves to arch independent land + */ + +static int i8259A_auto_eoi; +DEFINE_SPINLOCK(i8259A_lock); +static void mask_and_ack_8259A(unsigned int); + +static struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .mask = disable_8259A_irq, + .disable = disable_8259A_irq, + .unmask = enable_8259A_irq, + .mask_ack = mask_and_ack_8259A, +}; + +/* + * 8259A PIC functions to handle ISA devices: + */ + +/* + * This contains the irq mask for both 8259A irq controllers, + */ +unsigned int cached_irq_mask = 0xffff; + +/* + * Not all IRQs can be routed through the IO-APIC, eg. on certain (older) + * boards the timer interrupt is not really connected to any IO-APIC pin, + * it's fed to the master 8259A's IR0 line only. + * + * Any '1' bit in this mask means the IRQ is routed through the IO-APIC. + * this 'mixed mode' IRQ handling costs nothing because it's only used + * at IRQ setup time. + */ +unsigned long io_apic_irqs; + +void disable_8259A_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +void enable_8259A_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_slave_mask, PIC_SLAVE_IMR); + else + outb(cached_master_mask, PIC_MASTER_IMR); + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +int i8259A_irq_pending(unsigned int irq) +{ + unsigned int mask = 1<> 8); + spin_unlock_irqrestore(&i8259A_lock, flags); + + return ret; +} + +void make_8259A_irq(unsigned int irq) +{ + disable_irq_nosync(irq); + io_apic_irqs &= ~(1<> 8); + outb(0x0A,PIC_SLAVE_CMD); /* back to the IRR register */ + return value; +} + +/* + * Careful! The 8259A is a fragile beast, it pretty + * much _has_ to be done exactly like this (mask it + * first, _then_ send the EOI, and the order of EOI + * to the two 8259s is important! + */ +static void mask_and_ack_8259A(unsigned int irq) +{ + unsigned int irqmask = 1 << irq; + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + /* + * Lightweight spurious IRQ detection. We do not want + * to overdo spurious IRQ handling - it's usually a sign + * of hardware problems, so we only do the checks we can + * do without slowing down good hardware unnecessarily. + * + * Note that IRQ7 and IRQ15 (the two spurious IRQs + * usually resulting from the 8259A-1|2 PICs) occur + * even if the IRQ is masked in the 8259A. Thus we + * can check spurious 8259A IRQs without doing the + * quite slow i8259A_irq_real() call for every IRQ. + * This does not cover 100% of spurious interrupts, + * but should be enough to warn the user that there + * is something bad going on ... + */ + if (cached_irq_mask & irqmask) + goto spurious_8259A_irq; + cached_irq_mask |= irqmask; + +handle_real_irq: + if (irq & 8) { + inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ + outb(cached_slave_mask, PIC_SLAVE_IMR); + outb(0x60+(irq&7),PIC_SLAVE_CMD);/* 'Specific EOI' to slave */ + outb(0x60+PIC_CASCADE_IR,PIC_MASTER_CMD); /* 'Specific EOI' to master-IRQ2 */ + } else { + inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) */ + outb(cached_master_mask, PIC_MASTER_IMR); + outb(0x60+irq,PIC_MASTER_CMD); /* 'Specific EOI to master */ + } + spin_unlock_irqrestore(&i8259A_lock, flags); + return; + +spurious_8259A_irq: + /* + * this is the slow path - should happen rarely. + */ + if (i8259A_irq_real(irq)) + /* + * oops, the IRQ _is_ in service according to the + * 8259A - not spurious, go handle it. + */ + goto handle_real_irq; + + { + static int spurious_irq_mask; + /* + * At this point we can be sure the IRQ is spurious, + * lets ACK and report it. [once per IRQ] + */ + if (!(spurious_irq_mask & irqmask)) { + printk(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); + spurious_irq_mask |= irqmask; + } + atomic_inc(&irq_err_count); + /* + * Theoretically we do not have to handle this IRQ, + * but in Linux this does not cause problems and is + * simpler for us. + */ + goto handle_real_irq; + } +} + +static char irq_trigger[2]; +/** + * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ + */ +static void restore_ELCR(char *trigger) +{ + outb(trigger[0], 0x4d0); + outb(trigger[1], 0x4d1); +} + +static void save_ELCR(char *trigger) +{ + /* IRQ 0,1,2,8,13 are marked as reserved */ + trigger[0] = inb(0x4d0) & 0xF8; + trigger[1] = inb(0x4d1) & 0xDE; +} + +static int i8259A_resume(struct sys_device *dev) +{ + init_8259A(i8259A_auto_eoi); + restore_ELCR(irq_trigger); + return 0; +} + +static int i8259A_suspend(struct sys_device *dev, pm_message_t state) +{ + save_ELCR(irq_trigger); + return 0; +} + +static int i8259A_shutdown(struct sys_device *dev) +{ + /* Put the i8259A into a quiescent state that + * the kernel initialization code can get it + * out of. + */ + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ + return 0; +} + +static struct sysdev_class i8259_sysdev_class = { + set_kset_name("i8259"), + .suspend = i8259A_suspend, + .resume = i8259A_resume, + .shutdown = i8259A_shutdown, +}; + +static struct sys_device device_i8259A = { + .id = 0, + .cls = &i8259_sysdev_class, +}; + +static int __init i8259A_init_sysfs(void) +{ + int error = sysdev_class_register(&i8259_sysdev_class); + if (!error) + error = sysdev_register(&device_i8259A); + return error; +} + +device_initcall(i8259A_init_sysfs); + +void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + /* + * outb_p - this has to work on a wide range of PC hardware. + */ + outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ + outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ + outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ + if (auto_eoi) /* master does Auto EOI */ + outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); + else /* master expects normal EOI */ + outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + + outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ + outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ + outb_p(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ + outb_p(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ + if (auto_eoi) + /* + * In AEOI mode we just have to mask the interrupt + * when acking. + */ + i8259A_chip.mask_ack = disable_8259A_irq; + else + i8259A_chip.mask_ack = mask_and_ack_8259A; + + udelay(100); /* wait for 8259A to initialize */ + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +/* + * Note that on a 486, we don't want to do a SIGFPE on an irq13 + * as the irq is unreliable, and exception 16 works correctly + * (ie as explained in the intel literature). On a 386, you + * can't use exception 16 due to bad IBM design, so we have to + * rely on the less exact irq13. + * + * Careful.. Not only is IRQ13 unreliable, but it is also + * leads to races. IBM designers who came up with it should + * be shot. + */ + + +static irqreturn_t math_error_irq(int cpl, void *dev_id) +{ + extern void math_error(void __user *); + outb(0,0xF0); + if (ignore_fpu_irq || !boot_cpu_data.hard_math) + return IRQ_NONE; + math_error((void __user *)get_irq_regs()->eip); + return IRQ_HANDLED; +} + +/* + * New motherboards sometimes make IRQ 13 be a PCI interrupt, + * so allow interrupt sharing. + */ +static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL }; + +void __init init_ISA_irqs (void) +{ + int i; + +#ifdef CONFIG_X86_LOCAL_APIC + init_bsp_APIC(); +#endif + init_8259A(0); + + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; + irq_desc[i].depth = 1; + + if (i < 16) { + /* + * 16 old-style INTA-cycle interrupts: + */ + set_irq_chip_and_handler_name(i, &i8259A_chip, + handle_level_irq, "XT"); + } else { + /* + * 'high' PCI IRQs filled in on demand + */ + irq_desc[i].chip = &no_irq_chip; + } + } +} + +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + /* all the set up before the call gates are initialised */ + pre_intr_init_hook(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (i >= NR_IRQS) + break; + if (vector != SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); + } + + /* setup after call gates are initialised (usually add in + * the architecture specific gates) + */ + intr_init_hook(); + + /* + * External FPU? Set up irq13 if so, for + * original braindamaged IBM FERR coupling. + */ + if (boot_cpu_data.hard_math && !cpu_has_fpu) + setup_irq(FPU_IRQ, &fpu_irq); + + irq_ctx_init(smp_processor_id()); +} diff --git a/arch/x86/kernel/init_task_32.c b/arch/x86/kernel/init_task_32.c new file mode 100644 index 000000000000..d26fc063a760 --- /dev/null +++ b/arch/x86/kernel/init_task_32.c @@ -0,0 +1,46 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static struct fs_struct init_fs = INIT_FS; +static struct files_struct init_files = INIT_FILES; +static struct signal_struct init_signals = INIT_SIGNALS(init_signals); +static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); +struct mm_struct init_mm = INIT_MM(init_mm); + +EXPORT_SYMBOL(init_mm); + +/* + * Initial thread structure. + * + * We need to make sure that this is THREAD_SIZE aligned due to the + * way process stacks are handled. This is done by having a special + * "init_task" linker map entry.. + */ +union thread_union init_thread_union + __attribute__((__section__(".data.init_task"))) = + { INIT_THREAD_INFO(init_task) }; + +/* + * Initial task structure. + * + * All other task structs will be allocated on slabs in fork.c + */ +struct task_struct init_task = INIT_TASK(init_task); + +EXPORT_SYMBOL(init_task); + +/* + * per-CPU TSS segments. Threads are completely 'soft' on Linux, + * no more per-task TSS's. + */ +DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; + diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c new file mode 100644 index 000000000000..e2f4a1c68547 --- /dev/null +++ b/arch/x86/kernel/io_apic_32.c @@ -0,0 +1,2847 @@ +/* + * Intel IO-APIC support for multi-Pentium hosts. + * + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku and + * Hidemi Kishimoto , + * further tested and cleaned up by Zach Brown + * and Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "io_ports.h" + +int (*ioapic_renumber_irq)(int ioapic, int irq); +atomic_t irq_mis_count; + +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; + +static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock); + +int timer_over_8254 __initdata = 1; + +/* + * Is the SiS APIC rmw bug present ? + * -1 = don't know, 0 = no, 1 = yes + */ +int sis_apic_bug = -1; + +/* + * # of IRQ routing registers + */ +int nr_ioapic_registers[MAX_IO_APICS]; + +static int disable_timer_pin_1 __initdata; + +/* + * Rough estimation of how many shared IRQs there are, can + * be changed anytime. + */ +#define MAX_PLUS_SHARED_IRQS NR_IRQS +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. + */ + +static struct irq_pin_list { + int apic, pin, next; +} irq_2_pin[PIN_MAP_SIZE]; + +struct io_apic { + unsigned int index; + unsigned int unused[3]; + unsigned int data; +}; + +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) +{ + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) + + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); +} + +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); +} + +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. + * + * Older SiS APIC requires we rewrite the index register + */ +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +{ + volatile struct io_apic __iomem *io_apic = io_apic_base(apic); + if (sis_apic_bug) + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; +} + +/* + * When we write a new IO APIC routing entry, we need to write the high + * word first! If the mask bit in the low word is clear, we will enable + * the interrupt, and we need to make sure the entry is fully populated + * before that happens. + */ +static void +__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + union entry_union eu; + eu.entry = e; + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, e); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * When we mask an IO APIC routing entry, we need to write the low + * word first, in order to set the mask bit before we change the + * high bits! + */ +static void ioapic_mask_entry(int apic, int pin) +{ + unsigned long flags; + union entry_union eu = { .entry.mask = 1 }; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. + */ +static void add_pin_to_irq(unsigned int irq, int apic, int pin) +{ + static int first_free_entry = NR_IRQS; + struct irq_pin_list *entry = irq_2_pin + irq; + + while (entry->next) + entry = irq_2_pin + entry->next; + + if (entry->pin != -1) { + entry->next = first_free_entry; + entry = irq_2_pin + entry->next; + if (++first_free_entry >= PIN_MAP_SIZE) + panic("io_apic.c: whoops"); + } + entry->apic = apic; + entry->pin = pin; +} + +/* + * Reroute an IRQ to a different pin. + */ +static void __init replace_pin_at_irq(unsigned int irq, + int oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_pin_list *entry = irq_2_pin + irq; + + while (1) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + } + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) +{ + struct irq_pin_list *entry = irq_2_pin + irq; + unsigned int pin, reg; + + for (;;) { + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg &= ~disable; + reg |= enable; + io_apic_modify(entry->apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + +/* mask = 1 */ +static void __mask_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0x00010000, 0); +} + +/* mask = 0 */ +static void __unmask_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0, 0x00010000); +} + +/* mask = 1, trigger = 0 */ +static void __mask_and_edge_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); +} + +/* mask = 0, trigger = 1 */ +static void __unmask_and_level_IO_APIC_irq (unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); +} + +static void mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + entry = ioapic_read_entry(apic, pin); + if (entry.delivery_mode == dest_SMI) + return; + + /* + * Disable it in the IO-APIC irq-routing table: + */ + ioapic_mask_entry(apic, pin); +} + +static void clear_IO_APIC (void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + clear_IO_APIC_pin(apic, pin); +} + +#ifdef CONFIG_SMP +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) +{ + unsigned long flags; + int pin; + struct irq_pin_list *entry = irq_2_pin + irq; + unsigned int apicid_value; + cpumask_t tmp; + + cpus_and(tmp, cpumask, cpu_online_map); + if (cpus_empty(tmp)) + tmp = TARGET_CPUS; + + cpus_and(cpumask, tmp, CPU_MASK_ALL); + + apicid_value = cpu_mask_to_apicid(cpumask); + /* Prepare to do the io_apic_write */ + apicid_value = apicid_value << 24; + spin_lock_irqsave(&ioapic_lock, flags); + for (;;) { + pin = entry->pin; + if (pin == -1) + break; + io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + irq_desc[irq].affinity = cpumask; + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#if defined(CONFIG_IRQBALANCE) +# include /* kernel_thread() */ +# include /* kstat */ +# include /* kmalloc() */ +# include /* time_after() */ + +#define IRQBALANCE_CHECK_ARCH -999 +#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) +#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) +#define BALANCED_IRQ_MORE_DELTA (HZ/10) +#define BALANCED_IRQ_LESS_DELTA (HZ) + +static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; +static int physical_balance __read_mostly; +static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; + +static struct irq_cpu_info { + unsigned long * last_irq; + unsigned long * irq_delta; + unsigned long irq; +} irq_cpu_data[NR_CPUS]; + +#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) +#define LAST_CPU_IRQ(cpu,irq) (irq_cpu_data[cpu].last_irq[irq]) +#define IRQ_DELTA(cpu,irq) (irq_cpu_data[cpu].irq_delta[irq]) + +#define IDLE_ENOUGH(cpu,now) \ + (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) + +#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) + +#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i])) + +static cpumask_t balance_irq_affinity[NR_IRQS] = { + [0 ... NR_IRQS-1] = CPU_MASK_ALL +}; + +void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) +{ + balance_irq_affinity[irq] = mask; +} + +static unsigned long move(int curr_cpu, cpumask_t allowed_mask, + unsigned long now, int direction) +{ + int search_idle = 1; + int cpu = curr_cpu; + + goto inside; + + do { + if (unlikely(cpu == curr_cpu)) + search_idle = 0; +inside: + if (direction == 1) { + cpu++; + if (cpu >= NR_CPUS) + cpu = 0; + } else { + cpu--; + if (cpu == -1) + cpu = NR_CPUS-1; + } + } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu,allowed_mask) || + (search_idle && !IDLE_ENOUGH(cpu,now))); + + return cpu; +} + +static inline void balance_irq(int cpu, int irq) +{ + unsigned long now = jiffies; + cpumask_t allowed_mask; + unsigned int new_cpu; + + if (irqbalance_disabled) + return; + + cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); + new_cpu = move(cpu, allowed_mask, now, 1); + if (cpu != new_cpu) { + set_pending_irq(irq, cpumask_of_cpu(new_cpu)); + } +} + +static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) +{ + int i, j; + + for_each_online_cpu(i) { + for (j = 0; j < NR_IRQS; j++) { + if (!irq_desc[j].action) + continue; + /* Is it a significant load ? */ + if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i),j) < + useful_load_threshold) + continue; + balance_irq(i, j); + } + } + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); + return; +} + +static void do_irq_balance(void) +{ + int i, j; + unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); + unsigned long move_this_load = 0; + int max_loaded = 0, min_loaded = 0; + int load; + unsigned long useful_load_threshold = balanced_irq_interval + 10; + int selected_irq; + int tmp_loaded, first_attempt = 1; + unsigned long tmp_cpu_irq; + unsigned long imbalance = 0; + cpumask_t allowed_mask, target_cpu_mask, tmp; + + for_each_possible_cpu(i) { + int package_index; + CPU_IRQ(i) = 0; + if (!cpu_online(i)) + continue; + package_index = CPU_TO_PACKAGEINDEX(i); + for (j = 0; j < NR_IRQS; j++) { + unsigned long value_now, delta; + /* Is this an active IRQ or balancing disabled ? */ + if (!irq_desc[j].action || irq_balancing_disabled(j)) + continue; + if ( package_index == i ) + IRQ_DELTA(package_index,j) = 0; + /* Determine the total count per processor per IRQ */ + value_now = (unsigned long) kstat_cpu(i).irqs[j]; + + /* Determine the activity per processor per IRQ */ + delta = value_now - LAST_CPU_IRQ(i,j); + + /* Update last_cpu_irq[][] for the next time */ + LAST_CPU_IRQ(i,j) = value_now; + + /* Ignore IRQs whose rate is less than the clock */ + if (delta < useful_load_threshold) + continue; + /* update the load for the processor or package total */ + IRQ_DELTA(package_index,j) += delta; + + /* Keep track of the higher numbered sibling as well */ + if (i != package_index) + CPU_IRQ(i) += delta; + /* + * We have sibling A and sibling B in the package + * + * cpu_irq[A] = load for cpu A + load for cpu B + * cpu_irq[B] = load for cpu B + */ + CPU_IRQ(package_index) += delta; + } + } + /* Find the least loaded processor package */ + for_each_online_cpu(i) { + if (i != CPU_TO_PACKAGEINDEX(i)) + continue; + if (min_cpu_irq > CPU_IRQ(i)) { + min_cpu_irq = CPU_IRQ(i); + min_loaded = i; + } + } + max_cpu_irq = ULONG_MAX; + +tryanothercpu: + /* Look for heaviest loaded processor. + * We may come back to get the next heaviest loaded processor. + * Skip processors with trivial loads. + */ + tmp_cpu_irq = 0; + tmp_loaded = -1; + for_each_online_cpu(i) { + if (i != CPU_TO_PACKAGEINDEX(i)) + continue; + if (max_cpu_irq <= CPU_IRQ(i)) + continue; + if (tmp_cpu_irq < CPU_IRQ(i)) { + tmp_cpu_irq = CPU_IRQ(i); + tmp_loaded = i; + } + } + + if (tmp_loaded == -1) { + /* In the case of small number of heavy interrupt sources, + * loading some of the cpus too much. We use Ingo's original + * approach to rotate them around. + */ + if (!first_attempt && imbalance >= useful_load_threshold) { + rotate_irqs_among_cpus(useful_load_threshold); + return; + } + goto not_worth_the_effort; + } + + first_attempt = 0; /* heaviest search */ + max_cpu_irq = tmp_cpu_irq; /* load */ + max_loaded = tmp_loaded; /* processor */ + imbalance = (max_cpu_irq - min_cpu_irq) / 2; + + /* if imbalance is less than approx 10% of max load, then + * observe diminishing returns action. - quit + */ + if (imbalance < (max_cpu_irq >> 3)) + goto not_worth_the_effort; + +tryanotherirq: + /* if we select an IRQ to move that can't go where we want, then + * see if there is another one to try. + */ + move_this_load = 0; + selected_irq = -1; + for (j = 0; j < NR_IRQS; j++) { + /* Is this an active IRQ? */ + if (!irq_desc[j].action) + continue; + if (imbalance <= IRQ_DELTA(max_loaded,j)) + continue; + /* Try to find the IRQ that is closest to the imbalance + * without going over. + */ + if (move_this_load < IRQ_DELTA(max_loaded,j)) { + move_this_load = IRQ_DELTA(max_loaded,j); + selected_irq = j; + } + } + if (selected_irq == -1) { + goto tryanothercpu; + } + + imbalance = move_this_load; + + /* For physical_balance case, we accumlated both load + * values in the one of the siblings cpu_irq[], + * to use the same code for physical and logical processors + * as much as possible. + * + * NOTE: the cpu_irq[] array holds the sum of the load for + * sibling A and sibling B in the slot for the lowest numbered + * sibling (A), _AND_ the load for sibling B in the slot for + * the higher numbered sibling. + * + * We seek the least loaded sibling by making the comparison + * (A+B)/2 vs B + */ + load = CPU_IRQ(min_loaded) >> 1; + for_each_cpu_mask(j, cpu_sibling_map[min_loaded]) { + if (load > CPU_IRQ(j)) { + /* This won't change cpu_sibling_map[min_loaded] */ + load = CPU_IRQ(j); + min_loaded = j; + } + } + + cpus_and(allowed_mask, + cpu_online_map, + balance_irq_affinity[selected_irq]); + target_cpu_mask = cpumask_of_cpu(min_loaded); + cpus_and(tmp, target_cpu_mask, allowed_mask); + + if (!cpus_empty(tmp)) { + /* mark for change destination */ + set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); + + /* Since we made a change, come back sooner to + * check for more variation. + */ + balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, + balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); + return; + } + goto tryanotherirq; + +not_worth_the_effort: + /* + * if we did not find an IRQ to move, then adjust the time interval + * upward + */ + balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, + balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); + return; +} + +static int balanced_irq(void *unused) +{ + int i; + unsigned long prev_balance_time = jiffies; + long time_remaining = balanced_irq_interval; + + /* push everything to CPU 0 to give us a starting point. */ + for (i = 0 ; i < NR_IRQS ; i++) { + irq_desc[i].pending_mask = cpumask_of_cpu(0); + set_pending_irq(i, cpumask_of_cpu(0)); + } + + set_freezable(); + for ( ; ; ) { + time_remaining = schedule_timeout_interruptible(time_remaining); + try_to_freeze(); + if (time_after(jiffies, + prev_balance_time+balanced_irq_interval)) { + preempt_disable(); + do_irq_balance(); + prev_balance_time = jiffies; + time_remaining = balanced_irq_interval; + preempt_enable(); + } + } + return 0; +} + +static int __init balanced_irq_init(void) +{ + int i; + struct cpuinfo_x86 *c; + cpumask_t tmp; + + cpus_shift_right(tmp, cpu_online_map, 2); + c = &boot_cpu_data; + /* When not overwritten by the command line ask subarchitecture. */ + if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) + irqbalance_disabled = NO_BALANCE_IRQ; + if (irqbalance_disabled) + return 0; + + /* disable irqbalance completely if there is only one processor online */ + if (num_online_cpus() < 2) { + irqbalance_disabled = 1; + return 0; + } + /* + * Enable physical balance only if more than 1 physical processor + * is present + */ + if (smp_num_siblings > 1 && !cpus_empty(tmp)) + physical_balance = 1; + + for_each_online_cpu(i) { + irq_cpu_data[i].irq_delta = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); + irq_cpu_data[i].last_irq = kmalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); + if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { + printk(KERN_ERR "balanced_irq_init: out of memory"); + goto failed; + } + memset(irq_cpu_data[i].irq_delta,0,sizeof(unsigned long) * NR_IRQS); + memset(irq_cpu_data[i].last_irq,0,sizeof(unsigned long) * NR_IRQS); + } + + printk(KERN_INFO "Starting balanced_irq\n"); + if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) + return 0; + printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); +failed: + for_each_possible_cpu(i) { + kfree(irq_cpu_data[i].irq_delta); + irq_cpu_data[i].irq_delta = NULL; + kfree(irq_cpu_data[i].last_irq); + irq_cpu_data[i].last_irq = NULL; + } + return 0; +} + +int __devinit irqbalance_disable(char *str) +{ + irqbalance_disabled = 1; + return 1; +} + +__setup("noirqbalance", irqbalance_disable); + +late_initcall(balanced_irq_init); +#endif /* CONFIG_IRQBALANCE */ +#endif /* CONFIG_SMP */ + +#ifndef CONFIG_SMP +void fastcall send_IPI_self(int vector) +{ + unsigned int cfg; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write_around(APIC_ICR, cfg); +} +#endif /* !CONFIG_SMP */ + + +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; +int skip_ioapic_setup; + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + pirqs_enabled = 1; + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); + +/* + * Find the IRQ entry number of a certain pin. + */ +static int find_irq_entry(int apic, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_irqtype == type && + (mp_irqs[i].mpc_dstapic == mp_ioapics[apic].mpc_apicid || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) && + mp_irqs[i].mpc_dstirq == pin) + return i; + + return -1; +} + +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int __init find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || + mp_bus_id_to_type[lbus] == MP_BUS_EISA || + mp_bus_id_to_type[lbus] == MP_BUS_MCA + ) && + (mp_irqs[i].mpc_irqtype == type) && + (mp_irqs[i].mpc_srcbusirq == irq)) + + return mp_irqs[i].mpc_dstirq; + } + return -1; +} + +static int __init find_isa_irq_apic(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA || + mp_bus_id_to_type[lbus] == MP_BUS_EISA || + mp_bus_id_to_type[lbus] == MP_BUS_MCA + ) && + (mp_irqs[i].mpc_irqtype == type) && + (mp_irqs[i].mpc_srcbusirq == irq)) + break; + } + if (i < mp_irq_entries) { + int apic; + for(apic = 0; apic < nr_ioapics; apic++) { + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic) + return apic; + } + } + + return -1; +} + +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +static int pin_2_irq(int idx, int apic, int pin); + +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " + "slot:%d, pin:%d.\n", bus, slot, pin); + if (mp_bus_id_to_pci_bus[bus] == -1) { + printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mpc_srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].mpc_apicid == mp_irqs[i].mpc_dstapic || + mp_irqs[i].mpc_dstapic == MP_APIC_ALL) + break; + + if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) && + !mp_irqs[i].mpc_irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i,apic,mp_irqs[i].mpc_dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].mpc_srcbusirq & 3)) + return irq; + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_guess < 0) + best_guess = irq; + } + } + return best_guess; +} +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be TARGET_CPUS + */ +#ifdef CONFIG_SMP +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + set_ioapic_affinity_irq(irq, TARGET_CPUS); + } + + } +} +#endif + +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < 16) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq)) +#define default_EISA_polarity(idx) (0) + +/* ISA interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_ISA_trigger(idx) (0) +#define default_ISA_polarity(idx) (0) + +/* PCI interrupts are always polarity one level triggered, + * when listed as conforming in the MP table. */ + +#define default_PCI_trigger(idx) (1) +#define default_PCI_polarity(idx) (1) + +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. */ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) (0) + +static int __init MPBIOS_polarity(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int polarity; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].mpc_irqflag & 3) + { + case 0: /* conforms, ie. bus-type dependent polarity */ + { + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + { + polarity = default_ISA_polarity(idx); + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + polarity = default_EISA_polarity(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + polarity = default_PCI_polarity(idx); + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + polarity = default_MCA_polarity(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + break; + } + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + return polarity; +} + +static int MPBIOS_trigger(int idx) +{ + int bus = mp_irqs[idx].mpc_srcbus; + int trigger; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch ((mp_irqs[idx].mpc_irqflag>>2) & 3) + { + case 0: /* conforms, ie. bus-type dependent */ + { + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + { + trigger = default_ISA_trigger(idx); + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + trigger = default_PCI_trigger(idx); + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } + break; + } + case 1: /* edge */ + { + trigger = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + case 3: /* level */ + { + trigger = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 0; + break; + } + } + return trigger; +} + +static inline int irq_polarity(int idx) +{ + return MPBIOS_polarity(idx); +} + +static inline int irq_trigger(int idx) +{ + return MPBIOS_trigger(idx); +} + +static int pin_2_irq(int idx, int apic, int pin) +{ + int irq, i; + int bus = mp_irqs[idx].mpc_srcbus; + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].mpc_dstirq != pin) + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + + switch (mp_bus_id_to_type[bus]) + { + case MP_BUS_ISA: /* ISA pin */ + case MP_BUS_EISA: + case MP_BUS_MCA: + { + irq = mp_irqs[idx].mpc_srcbusirq; + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; + + /* + * For MPS mode, so far only needed by ES7000 platform + */ + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); + + break; + } + default: + { + printk(KERN_ERR "unknown bus type %d.\n",bus); + irq = 0; + break; + } + } + + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. + */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } + return irq; +} + +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic,pin,mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} + +/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ +static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; + +static int __assign_irq_vector(int irq) +{ + static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; + int vector, offset, i; + + BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); + + if (irq_vector[irq] > 0) + return irq_vector[irq]; + + vector = current_vector; + offset = current_offset; +next: + vector += 8; + if (vector >= FIRST_SYSTEM_VECTOR) { + offset = (offset + 1) % 8; + vector = FIRST_DEVICE_VECTOR + offset; + } + if (vector == current_vector) + return -ENOSPC; + if (vector == SYSCALL_VECTOR) + goto next; + for (i = 0; i < NR_IRQ_VECTORS; i++) + if (irq_vector[i] == vector) + goto next; + + current_vector = vector; + current_offset = offset; + irq_vector[irq] = vector; + + return vector; +} + +static int assign_irq_vector(int irq) +{ + unsigned long flags; + int vector; + + spin_lock_irqsave(&vector_lock, flags); + vector = __assign_irq_vector(irq); + spin_unlock_irqrestore(&vector_lock, flags); + + return vector; +} +static struct irq_chip ioapic_chip; + +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +static void ioapic_register_intr(int irq, int vector, unsigned long trigger) +{ + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) { + irq_desc[irq].status |= IRQ_LEVEL; + set_irq_chip_and_handler_name(irq, &ioapic_chip, + handle_fasteoi_irq, "fasteoi"); + } else { + irq_desc[irq].status &= ~IRQ_LEVEL; + set_irq_chip_and_handler_name(irq, &ioapic_chip, + handle_edge_irq, "edge"); + } + set_intr_gate(vector, interrupt[irq]); +} + +static void __init setup_IO_APIC_irqs(void) +{ + struct IO_APIC_route_entry entry; + int apic, pin, idx, irq, first_notcon = 1, vector; + unsigned long flags; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + + /* + * add it to the IO-APIC irq-routing table: + */ + memset(&entry,0,sizeof(entry)); + + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* enable IRQ */ + entry.dest.logical.logical_dest = + cpu_mask_to_apicid(TARGET_CPUS); + + idx = find_irq_entry(apic,pin,mp_INT); + if (idx == -1) { + if (first_notcon) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + " IO-APIC (apicid-pin) %d-%d", + mp_ioapics[apic].mpc_apicid, + pin); + first_notcon = 0; + } else + apic_printk(APIC_VERBOSE, ", %d-%d", + mp_ioapics[apic].mpc_apicid, pin); + continue; + } + + entry.trigger = irq_trigger(idx); + entry.polarity = irq_polarity(idx); + + if (irq_trigger(idx)) { + entry.trigger = 1; + entry.mask = 1; + } + + irq = pin_2_irq(idx, apic, pin); + /* + * skip adding the timer int on secondary nodes, which causes + * a small but painful rift in the time-space continuum + */ + if (multi_timer_check(apic, irq)) + continue; + else + add_pin_to_irq(irq, apic, pin); + + if (!apic && !IO_APIC_IRQ(irq)) + continue; + + if (IO_APIC_IRQ(irq)) { + vector = assign_irq_vector(irq); + entry.vector = vector; + ioapic_register_intr(irq, vector, IOAPIC_AUTO); + + if (!apic && (irq < 16)) + disable_8259A_irq(irq); + } + spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, entry); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + } + + if (!first_notcon) + apic_printk(APIC_VERBOSE, " not connected.\n"); +} + +/* + * Set up the 8259A-master output pin: + */ +static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector) +{ + struct IO_APIC_route_entry entry; + + memset(&entry,0,sizeof(entry)); + + disable_8259A_irq(0); + + /* mask LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + + /* + * We use logical delivery to get the timer IRQ + * to the first CPU. + */ + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* unmask IRQ now */ + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.delivery_mode = INT_DELIVERY_MODE; + entry.polarity = 0; + entry.trigger = 0; + entry.vector = vector; + + /* + * The timer IRQ doesn't have to know that behind the + * scene we have a 8259A-master in AEOI mode ... + */ + irq_desc[0].chip = &ioapic_chip; + set_irq_handler(0, handle_edge_irq); + + /* + * Add it to the IO-APIC irq-routing table: + */ + ioapic_write_entry(apic, pin, entry); + + enable_8259A_irq(0); +} + +void __init print_IO_APIC(void) +{ + int apic, i; + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for (i = 0; i < nr_ioapics; i++) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + reg_01.raw = io_apic_read(apic, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(apic, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); + + printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); + + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + } + + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + } + + printk(KERN_DEBUG ".... IRQ redirection table:\n"); + + printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" + " Stat Dest Deli Vect: \n"); + + for (i = 0; i <= reg_01.bits.entries; i++) { + struct IO_APIC_route_entry entry; + + entry = ioapic_read_entry(apic, i); + + printk(KERN_DEBUG " %02x %03X %02X ", + i, + entry.dest.logical.logical_dest, + entry.dest.physical.physical_dest + ); + + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector + ); + } + } + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for (i = 0; i < NR_IRQS; i++) { + struct irq_pin_list *entry = irq_2_pin + i; + if (entry->pin < 0) + continue; + printk(KERN_DEBUG "IRQ%d ", i); + for (;;) { + printk("-> %d:%d", entry->apic, entry->pin); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + printk("\n"); + } + + printk(KERN_INFO ".................................... done.\n"); + + return; +} + +#if 0 + +static void print_APIC_bitfield (int base) +{ + unsigned int v; + int i, j; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); + for (i = 0; i < 8; i++) { + v = apic_read(base + i*0x10); + for (j = 0; j < 32; j++) { + if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } + + v = apic_read(APIC_ICR); + printk(KERN_DEBUG "... APIC ICR: %08x\n", v); + v = apic_read(APIC_ICR2); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); + + v = apic_read(APIC_LVTT); + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + printk("\n"); +} + +void print_all_local_APICs (void) +{ + on_each_cpu(print_local_APIC, NULL, 1, 1); +} + +void /*__init*/ print_PIC(void) +{ + unsigned int v; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "\nprinting PIC contents\n"); + + spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); + + outb(0x0b,0xa0); + outb(0x0b,0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); + + spin_unlock_irqrestore(&i8259A_lock, flags); + + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); +} + +#endif /* 0 */ + +static void __init enable_IO_APIC(void) +{ + union IO_APIC_reg_01 reg_01; + int i8259_apic, i8259_pin; + int i, apic; + unsigned long flags; + + for (i = 0; i < PIN_MAP_SIZE; i++) { + irq_2_pin[i].pin = -1; + irq_2_pin[i].next = 0; + } + if (!pirqs_enabled) + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (apic = 0; apic < nr_ioapics; apic++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(apic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } + for(apic = 0; apic < nr_ioapics; apic++) { + int pin; + /* See if any of the pins is in ExtINT mode */ + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + entry = ioapic_read_entry(apic, pin); + + + /* If the interrupt line is enabled and in ExtInt mode + * I have found the pin where the i8259 is connected. + */ + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + ioapic_i8259.apic = apic; + ioapic_i8259.pin = pin; + goto found_i8259; + } + } + } + found_i8259: + /* Look to see what if the MP table has reported the ExtINT */ + /* If we could not find the appropriate pin by looking at the ioapic + * the i8259 probably is not connected the ioapic but give the + * mptable a chance anyway. + */ + i8259_pin = find_isa_irq_pin(0, mp_ExtINT); + i8259_apic = find_isa_irq_apic(0, mp_ExtINT); + /* Trust the MP table if nothing is setup in the hardware */ + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); + ioapic_i8259.pin = i8259_pin; + ioapic_i8259.apic = i8259_apic; + } + /* Complain if the MP table and the hardware disagree */ + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) + { + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); + } + + /* + * Do not trust the IO-APIC being empty at bootup + */ + clear_IO_APIC(); +} + +/* + * Not an __init, needed by the reboot code + */ +void disable_IO_APIC(void) +{ + /* + * Clear the IO-APIC before rebooting: + */ + clear_IO_APIC(); + + /* + * If the i8259 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrupts can be delivered. + */ + if (ioapic_i8259.pin != -1) { + struct IO_APIC_route_entry entry; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = dest_ExtINT; /* ExtInt */ + entry.vector = 0; + entry.dest.physical.physical_dest = + GET_APIC_ID(apic_read(APIC_ID)); + + /* + * Add it to the IO-APIC irq-routing table: + */ + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); + } + disconnect_bsp_APIC(ioapic_i8259.pin != -1); +} + +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 + */ + +#ifndef CONFIG_X86_NUMAQ +static void __init setup_ioapic_ids_from_mpc(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int apic; + int i; + unsigned char old_id; + unsigned long flags; + + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for (apic = 0; apic < nr_ioapics; apic++) { + + /* Read the register 0 value */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mp_ioapics[apic].mpc_apicid; + + if (mp_ioapics[apic].mpc_apicid >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + apic, mp_ioapics[apic].mpc_apicid); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + reg_00.bits.ID); + mp_ioapics[apic].mpc_apicid = reg_00.bits.ID; + } + + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(phys_id_present_map, + mp_ioapics[apic].mpc_apicid)) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + apic, mp_ioapics[apic].mpc_apicid); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + mp_ioapics[apic].mpc_apicid = i; + } else { + physid_mask_t tmp; + tmp = apicid_to_cpu_present(mp_ioapics[apic].mpc_apicid); + apic_printk(APIC_VERBOSE, "Setting %d in the " + "phys_id_present_map\n", + mp_ioapics[apic].mpc_apicid); + physids_or(phys_id_present_map, phys_id_present_map, tmp); + } + + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mp_ioapics[apic].mpc_apicid) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mpc_dstapic == old_id) + mp_irqs[i].mpc_dstapic + = mp_ioapics[apic].mpc_apicid; + + /* + * Read the right value from the MPC table and + * write it into the ID register. + */ + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mp_ioapics[apic].mpc_apicid); + + reg_00.bits.ID = mp_ioapics[apic].mpc_apicid; + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0, reg_00.raw); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* + * Sanity check + */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} +#else +static void __init setup_ioapic_ids_from_mpc(void) { } +#endif + +int no_timer_check __initdata; + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + +/* + * There is a nasty bug in some older SMP boards, their mptable lies + * about the timer IRQ. We do the following to work around the situation: + * + * - timer IRQ defaults to IO-APIC IRQ + * - if this function detects that timer IRQs are defunct, then we fall + * back to ISA timer IRQs + */ +static int __init timer_irq_works(void) +{ + unsigned long t1 = jiffies; + + if (no_timer_check) + return 1; + + local_irq_enable(); + /* Let ten ticks pass... */ + mdelay((10 * 1000) / HZ); + + /* + * Expect a few ticks at least, to be sure some possible + * glue logic does not lock up after one or two first + * ticks in a non-ExtINT mode. Also the local APIC + * might have cached one ExtINT interrupt. Finally, at + * least one tick may be lost due to delays. + */ + if (jiffies - t1 > 4) + return 1; + + return 0; +} + +/* + * In the SMP+IOAPIC case it might happen that there are an unspecified + * number of pending IRQ events unhandled. These cases are very rare, + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much + * better to do it this way as thus we do not have to be aware of + * 'pending' interrupts in the IRQ path, except at this point. + */ +/* + * Edge triggered needs to resend any interrupt + * that was delayed but this is now handled in the device + * independent code. + */ + +/* + * Startup quirk: + * + * Starting up a edge-triggered IO-APIC interrupt is + * nasty - we need to make sure that we get the edge. + * If it is already asserted for some reason, we need + * return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake + * an edge even if it isn't on the 8259A... + * + * (We do this for level-triggered IRQs too - it cannot hurt.) + */ +static unsigned int startup_ioapic_irq(unsigned int irq) +{ + int was_pending = 0; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + if (irq < 16) { + disable_8259A_irq(irq); + if (i8259A_irq_pending(irq)) + was_pending = 1; + } + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return was_pending; +} + +static void ack_ioapic_irq(unsigned int irq) +{ + move_native_irq(irq); + ack_APIC_irq(); +} + +static void ack_ioapic_quirk_irq(unsigned int irq) +{ + unsigned long v; + int i; + + move_native_irq(irq); +/* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ + i = irq_vector[irq]; + + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + + ack_APIC_irq(); + + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(irq); + __unmask_and_level_IO_APIC_irq(irq); + spin_unlock(&ioapic_lock); + } +} + +static int ioapic_retrigger_irq(unsigned int irq) +{ + send_IPI_self(irq_vector[irq]); + + return 1; +} + +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_ioapic_irq, + .eoi = ack_ioapic_quirk_irq, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; + + +static inline void init_IO_APIC_traps(void) +{ + int irq; + + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + for (irq = 0; irq < NR_IRQS ; irq++) { + int tmp = irq; + if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) { + /* + * Hmm.. We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. + */ + if (irq < 16) + make_8259A_irq(irq); + else + /* Strange. Oh, well.. */ + irq_desc[irq].chip = &no_irq_chip; + } + } +} + +/* + * The local APIC irq-chip implementation: + */ + +static void ack_apic(unsigned int irq) +{ + ack_APIC_irq(); +} + +static void mask_lapic_irq (unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); +} + +static void unmask_lapic_irq (unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); +} + +static struct irq_chip lapic_chip __read_mostly = { + .name = "local-APIC-edge", + .mask = mask_lapic_irq, + .unmask = unmask_lapic_irq, + .eoi = ack_apic, +}; + +static void setup_nmi (void) +{ + /* + * Dirty trick to enable the NMI watchdog ... + * We put the 8259A master into AEOI mode and + * unmask on all local APICs LVT0 as NMI. + * + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') + * is from Maciej W. Rozycki - so we do not have to EOI from + * the NMI handler or the timer interrupt. + */ + apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); + + on_each_cpu(enable_NMI_through_LVT0, NULL, 1, 1); + + apic_printk(APIC_VERBOSE, " done.\n"); +} + +/* + * This looks a bit hackish but it's about the only one way of sending + * a few INTA cycles to 8259As and any associated glue logic. ICR does + * not support the ExtINT mode, unfortunately. We need to send these + * cycles as some i82489DX-based boards have glue logic that keeps the + * 8259A interrupt line asserted until INTA. --macro + */ +static inline void unlock_ExtINT_logic(void) +{ + int apic, pin, i; + struct IO_APIC_route_entry entry0, entry1; + unsigned char save_control, save_freq_select; + + pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) { + WARN_ON_ONCE(1); + return; + } + apic = find_isa_irq_apic(8, mp_INT); + if (apic == -1) { + WARN_ON_ONCE(1); + return; + } + + entry0 = ioapic_read_entry(apic, pin); + clear_IO_APIC_pin(apic, pin); + + memset(&entry1, 0, sizeof(entry1)); + + entry1.dest_mode = 0; /* physical delivery */ + entry1.mask = 0; /* unmask IRQ now */ + entry1.dest.physical.physical_dest = hard_smp_processor_id(); + entry1.delivery_mode = dest_ExtINT; + entry1.polarity = entry0.polarity; + entry1.trigger = 0; + entry1.vector = 0; + + ioapic_write_entry(apic, pin, entry1); + + save_control = CMOS_READ(RTC_CONTROL); + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, + RTC_FREQ_SELECT); + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); + + i = 100; + while (i-- > 0) { + mdelay(10); + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) + i -= 10; + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + clear_IO_APIC_pin(apic, pin); + + ioapic_write_entry(apic, pin, entry0); +} + +int timer_uses_ioapic_pin_0; + +/* + * This code may look a bit paranoid, but it's supposed to cooperate with + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board. + */ +static inline void __init check_timer(void) +{ + int apic1, pin1, apic2, pin2; + int vector; + + /* + * get/set the timer IRQ vector: + */ + disable_8259A_irq(0); + vector = assign_irq_vector(0); + set_intr_gate(vector, interrupt[0]); + + /* + * Subtle, code in do_timer_interrupt() expects an AEOI + * mode for the 8259A whenever interrupts are routed + * through I/O APICs. Also IRQ0 has to be enabled in + * the 8259A which implies the virtual wire has to be + * disabled in the local APIC. + */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + init_8259A(1); + timer_ack = 1; + if (timer_over_8254 > 0) + enable_8259A_irq(0); + + pin1 = find_isa_irq_pin(0, mp_INT); + apic1 = find_isa_irq_apic(0, mp_INT); + pin2 = ioapic_i8259.pin; + apic2 = ioapic_i8259.apic; + + if (pin1 == 0) + timer_uses_ioapic_pin_0 = 1; + + printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", + vector, apic1, pin1, apic2, pin2); + + if (pin1 != -1) { + /* + * Ok, does IRQ0 through the IOAPIC work? + */ + unmask_IO_APIC_irq(0); + if (timer_irq_works()) { + if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); + setup_nmi(); + enable_8259A_irq(0); + } + if (disable_timer_pin_1 > 0) + clear_IO_APIC_pin(0, pin1); + return; + } + clear_IO_APIC_pin(apic1, pin1); + printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to " + "IO-APIC\n"); + } + + printk(KERN_INFO "...trying to set up timer (IRQ0) through the 8259A ... "); + if (pin2 != -1) { + printk("\n..... (found pin %d) ...", pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + setup_ExtINT_IRQ0_pin(apic2, pin2, vector); + if (timer_irq_works()) { + printk("works.\n"); + if (pin1 != -1) + replace_pin_at_irq(0, apic1, pin1, apic2, pin2); + else + add_pin_to_irq(0, apic2, pin2); + if (nmi_watchdog == NMI_IO_APIC) { + setup_nmi(); + } + return; + } + /* + * Cleanup, just in case ... + */ + clear_IO_APIC_pin(apic2, pin2); + } + printk(" failed.\n"); + + if (nmi_watchdog == NMI_IO_APIC) { + printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + nmi_watchdog = 0; + } + + printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + + disable_8259A_irq(0); + set_irq_chip_and_handler_name(0, &lapic_chip, handle_fasteoi_irq, + "fasteoi"); + apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ + enable_8259A_irq(0); + + if (timer_irq_works()) { + printk(" works.\n"); + return; + } + apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); + printk(" failed.\n"); + + printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + + timer_ack = 0; + init_8259A(0); + make_8259A_irq(0); + apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + printk(" works.\n"); + return; + } + printk(" failed :(.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option"); +} + +/* + * + * IRQ's that are handled by the PIC in the MPS IOAPIC case. + * - IRQ2 is the cascade IRQ, and cannot be a io-apic IRQ. + * Linux doesn't really care, as it's not actually used + * for any interrupt handling anyway. + */ +#define PIC_IRQS (1 << PIC_CASCADE_IR) + +void __init setup_IO_APIC(void) +{ + enable_IO_APIC(); + + if (acpi_ioapic) + io_apic_irqs = ~0; /* all IRQs go through IOAPIC */ + else + io_apic_irqs = ~PIC_IRQS; + + printk("ENABLING IO-APIC IRQs\n"); + + /* + * Set up IO-APIC IRQ routing. + */ + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); + sync_Arb_IDs(); + setup_IO_APIC_irqs(); + init_IO_APIC_traps(); + check_timer(); + if (!acpi_ioapic) + print_IO_APIC(); +} + +static int __init setup_disable_8254_timer(char *s) +{ + timer_over_8254 = -1; + return 1; +} +static int __init setup_enable_8254_timer(char *s) +{ + timer_over_8254 = 2; + return 1; +} + +__setup("disable_8254_timer", setup_disable_8254_timer); +__setup("enable_8254_timer", setup_enable_8254_timer); + +/* + * Called after all the initialization is done. If we didnt find any + * APIC bugs then we can allow the modify fast path + */ + +static int __init io_apic_bug_finalize(void) +{ + if(sis_apic_bug == -1) + sis_apic_bug = 0; + return 0; +} + +late_initcall(io_apic_bug_finalize); + +struct sysfs_ioapic_data { + struct sys_device dev; + struct IO_APIC_route_entry entry[0]; +}; +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; + +static int ioapic_suspend(struct sys_device *dev, pm_message_t state) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) + entry[i] = ioapic_read_entry(dev->id, i); + + return 0; +} + +static int ioapic_resume(struct sys_device *dev) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + union IO_APIC_reg_00 reg_00; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(dev->id, 0); + if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) { + reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid; + io_apic_write(dev->id, 0, reg_00.raw); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++) + ioapic_write_entry(dev->id, i, entry[i]); + + return 0; +} + +static struct sysdev_class ioapic_sysdev_class = { + set_kset_name("ioapic"), + .suspend = ioapic_suspend, + .resume = ioapic_resume, +}; + +static int __init ioapic_init_sysfs(void) +{ + struct sys_device * dev; + int i, size, error = 0; + + error = sysdev_class_register(&ioapic_sysdev_class); + if (error) + return error; + + for (i = 0; i < nr_ioapics; i++ ) { + size = sizeof(struct sys_device) + nr_ioapic_registers[i] + * sizeof(struct IO_APIC_route_entry); + mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL); + if (!mp_ioapic_data[i]) { + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + memset(mp_ioapic_data[i], 0, size); + dev = &mp_ioapic_data[i]->dev; + dev->id = i; + dev->cls = &ioapic_sysdev_class; + error = sysdev_register(dev); + if (error) { + kfree(mp_ioapic_data[i]); + mp_ioapic_data[i] = NULL; + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + } + + return 0; +} + +device_initcall(ioapic_init_sysfs); + +/* + * Dynamic irq allocate and deallocation + */ +int create_irq(void) +{ + /* Allocate an unused irq */ + int irq, new, vector = 0; + unsigned long flags; + + irq = -ENOSPC; + spin_lock_irqsave(&vector_lock, flags); + for (new = (NR_IRQS - 1); new >= 0; new--) { + if (platform_legacy_irq(new)) + continue; + if (irq_vector[new] != 0) + continue; + vector = __assign_irq_vector(new); + if (likely(vector > 0)) + irq = new; + break; + } + spin_unlock_irqrestore(&vector_lock, flags); + + if (irq >= 0) { + set_intr_gate(vector, interrupt[irq]); + dynamic_irq_init(irq); + } + return irq; +} + +void destroy_irq(unsigned int irq) +{ + unsigned long flags; + + dynamic_irq_cleanup(irq); + + spin_lock_irqsave(&vector_lock, flags); + irq_vector[irq] = 0; + spin_unlock_irqrestore(&vector_lock, flags); +} + +/* + * MSI mesage composition + */ +#ifdef CONFIG_PCI_MSI +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +{ + int vector; + unsigned dest; + + vector = assign_irq_vector(irq); + if (vector >= 0) { + dest = cpu_mask_to_apicid(TARGET_CPUS); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((INT_DEST_MODE == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(vector); + } + return vector; +} + +#ifdef CONFIG_SMP +static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + int vector; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + tmp = TARGET_CPUS; + + vector = assign_irq_vector(irq); + if (vector < 0) + return; + + dest = cpu_mask_to_apicid(mask); + + read_msi_msg(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + write_msi_msg(irq, &msg); + irq_desc[irq].affinity = mask; +} +#endif /* CONFIG_SMP */ + +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. + */ +static struct irq_chip msi_chip = { + .name = "PCI-MSI", + .unmask = unmask_msi_irq, + .mask = mask_msi_irq, + .ack = ack_ioapic_irq, +#ifdef CONFIG_SMP + .set_affinity = set_msi_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +{ + struct msi_msg msg; + int irq, ret; + irq = create_irq(); + if (irq < 0) + return irq; + + ret = msi_compose_msg(dev, irq, &msg); + if (ret < 0) { + destroy_irq(irq); + return ret; + } + + set_irq_msi(irq, desc); + write_msi_msg(irq, &msg); + + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, + "edge"); + + return 0; +} + +void arch_teardown_msi_irq(unsigned int irq) +{ + destroy_irq(irq); +} + +#endif /* CONFIG_PCI_MSI */ + +/* + * Hypertransport interrupt support + */ +#ifdef CONFIG_HT_IRQ + +#ifdef CONFIG_SMP + +static void target_ht_irq(unsigned int irq, unsigned int dest) +{ + struct ht_irq_msg msg; + fetch_ht_irq_msg(irq, &msg); + + msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + + msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); + + write_ht_irq_msg(irq, &msg); +} + +static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) +{ + unsigned int dest; + cpumask_t tmp; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + tmp = TARGET_CPUS; + + cpus_and(mask, tmp, CPU_MASK_ALL); + + dest = cpu_mask_to_apicid(mask); + + target_ht_irq(irq, dest); + irq_desc[irq].affinity = mask; +} +#endif + +static struct irq_chip ht_irq_chip = { + .name = "PCI-HT", + .mask = mask_ht_irq, + .unmask = unmask_ht_irq, + .ack = ack_ioapic_irq, +#ifdef CONFIG_SMP + .set_affinity = set_ht_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +{ + int vector; + + vector = assign_irq_vector(irq); + if (vector >= 0) { + struct ht_irq_msg msg; + unsigned dest; + cpumask_t tmp; + + cpus_clear(tmp); + cpu_set(vector >> 8, tmp); + dest = cpu_mask_to_apicid(tmp); + + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + + msg.address_lo = + HT_IRQ_LOW_BASE | + HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_VECTOR(vector) | + ((INT_DEST_MODE == 0) ? + HT_IRQ_LOW_DM_PHYSICAL : + HT_IRQ_LOW_DM_LOGICAL) | + HT_IRQ_LOW_RQEOI_EDGE | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + HT_IRQ_LOW_MT_FIXED : + HT_IRQ_LOW_MT_ARBITRATED) | + HT_IRQ_LOW_IRQ_MASKED; + + write_ht_irq_msg(irq, &msg); + + set_irq_chip_and_handler_name(irq, &ht_irq_chip, + handle_edge_irq, "edge"); + } + return vector; +} +#endif /* CONFIG_HT_IRQ */ + +/* -------------------------------------------------------------------------- + ACPI-based IOAPIC Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI + +int __init io_apic_get_unique_id (int ioapic, int apic_id) +{ + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; + unsigned long flags; + int i = 0; + + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. + */ + + if (physids_empty(apic_id_map)) + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= get_physical_broadcast()) { + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(apic_id_map, apic_id)) { + + for (i = 0; i < get_physical_broadcast(); i++) { + if (!check_apicid_used(apic_id_map, i)) + break; + } + + if (i == get_physical_broadcast()) + panic("Max apic_id exceeded!\n"); + + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + tmp = apicid_to_cpu_present(apic_id); + physids_or(apic_id_map, apic_id_map, tmp); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) { + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + return -1; + } + } + + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + + return apic_id; +} + + +int __init io_apic_get_version (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} + + +int __init io_apic_get_redir_entries (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.entries; +} + + +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low) +{ + struct IO_APIC_route_entry entry; + unsigned long flags; + + if (!IO_APIC_IRQ(irq)) { + printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + /* + * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. + * Note that we mask (disable) IRQs now -- these get enabled when the + * corresponding device driver registers for this IRQ. + */ + + memset(&entry,0,sizeof(entry)); + + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.trigger = edge_level; + entry.polarity = active_high_low; + entry.mask = 1; + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= 16) + add_pin_to_irq(irq, ioapic, pin); + + entry.vector = assign_irq_vector(irq); + + apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " + "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, + mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq, + edge_level, active_high_low); + + ioapic_register_intr(irq, entry.vector, edge_level); + + if (!ioapic && (irq < 16)) + disable_8259A_irq(irq); + + spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(ioapic, pin, entry); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return 0; +} + +#endif /* CONFIG_ACPI */ + +static int __init parse_disable_timer_pin_1(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", parse_disable_timer_pin_1); + +static int __init parse_enable_timer_pin_1(char *arg) +{ + disable_timer_pin_1 = -1; + return 0; +} +early_param("enable_timer_pin_1", parse_enable_timer_pin_1); + +static int __init parse_noapic(char *arg) +{ + /* disable IO-APIC */ + disable_ioapic_setup(); + return 0; +} +early_param("noapic", parse_noapic); diff --git a/arch/x86/kernel/ioport_32.c b/arch/x86/kernel/ioport_32.c new file mode 100644 index 000000000000..3d310a946d76 --- /dev/null +++ b/arch/x86/kernel/ioport_32.c @@ -0,0 +1,153 @@ +/* + * linux/arch/i386/kernel/ioport.c + * + * This contains the io-permission bitmap code - written by obz, with changes + * by Linus. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ +static void set_bitmap(unsigned long *bitmap, unsigned int base, unsigned int extent, int new_value) +{ + unsigned long mask; + unsigned long *bitmap_base = bitmap + (base / BITS_PER_LONG); + unsigned int low_index = base & (BITS_PER_LONG-1); + int length = low_index + extent; + + if (low_index != 0) { + mask = (~0UL << low_index); + if (length < BITS_PER_LONG) + mask &= ~(~0UL << length); + if (new_value) + *bitmap_base++ |= mask; + else + *bitmap_base++ &= ~mask; + length -= BITS_PER_LONG; + } + + mask = (new_value ? ~0UL : 0UL); + while (length >= BITS_PER_LONG) { + *bitmap_base++ = mask; + length -= BITS_PER_LONG; + } + + if (length > 0) { + mask = ~(~0UL << length); + if (new_value) + *bitmap_base++ |= mask; + else + *bitmap_base++ &= ~mask; + } +} + + +/* + * this changes the io permissions bitmap in the current task. + */ +asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) +{ + unsigned long i, max_long, bytes, bytes_updated; + struct thread_struct * t = ¤t->thread; + struct tss_struct * tss; + unsigned long *bitmap; + + if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) + return -EINVAL; + if (turn_on && !capable(CAP_SYS_RAWIO)) + return -EPERM; + + /* + * If it's the first ioperm() call in this thread's lifetime, set the + * IO bitmap up. ioperm() is much less timing critical than clone(), + * this is why we delay this operation until now: + */ + if (!t->io_bitmap_ptr) { + bitmap = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!bitmap) + return -ENOMEM; + + memset(bitmap, 0xff, IO_BITMAP_BYTES); + t->io_bitmap_ptr = bitmap; + set_thread_flag(TIF_IO_BITMAP); + } + + /* + * do it in the per-thread copy and in the TSS ... + * + * Disable preemption via get_cpu() - we must not switch away + * because the ->io_bitmap_max value must match the bitmap + * contents: + */ + tss = &per_cpu(init_tss, get_cpu()); + + set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); + + /* + * Search for a (possibly new) maximum. This is simple and stupid, + * to keep it obviously correct: + */ + max_long = 0; + for (i = 0; i < IO_BITMAP_LONGS; i++) + if (t->io_bitmap_ptr[i] != ~0UL) + max_long = i; + + bytes = (max_long + 1) * sizeof(long); + bytes_updated = max(bytes, t->io_bitmap_max); + + t->io_bitmap_max = bytes; + + /* + * Sets the lazy trigger so that the next I/O operation will + * reload the correct bitmap. + * Reset the owner so that a process switch will not set + * tss->io_bitmap_base to IO_BITMAP_OFFSET. + */ + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; + tss->io_bitmap_owner = NULL; + + put_cpu(); + + return 0; +} + +/* + * sys_iopl has to be used when you want to access the IO ports + * beyond the 0x3ff range: to get the full 65536 ports bitmapped + * you'd need 8kB of bitmaps/process, which is a bit excessive. + * + * Here we just change the eflags value on the stack: we allow + * only the super-user to do it. This depends on the stack-layout + * on system-call entry - see also fork() and the signal handling + * code. + */ + +asmlinkage long sys_iopl(unsigned long unused) +{ + volatile struct pt_regs * regs = (struct pt_regs *) &unused; + unsigned int level = regs->ebx; + unsigned int old = (regs->eflags >> 12) & 3; + struct thread_struct *t = ¤t->thread; + + if (level > 3) + return -EINVAL; + /* Trying to gain more privileges? */ + if (level > old) { + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + } + t->iopl = level << 12; + regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl; + set_iopl_mask(t->iopl); + return 0; +} diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c new file mode 100644 index 000000000000..dd2b97fc00b2 --- /dev/null +++ b/arch/x86/kernel/irq_32.c @@ -0,0 +1,343 @@ +/* + * linux/arch/i386/kernel/irq.c + * + * Copyright (C) 1992, 1998 Linus Torvalds, Ingo Molnar + * + * This file contains the lowest level x86-specific interrupt + * entry, irq-stacks and irq statistics code. All the remaining + * irq logic is done by the generic kernel/irq/ code and + * by the x86-specific irq controller code. (e.g. i8259.c and + * io_apic.c.) + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); +EXPORT_PER_CPU_SYMBOL(irq_stat); + +DEFINE_PER_CPU(struct pt_regs *, irq_regs); +EXPORT_PER_CPU_SYMBOL(irq_regs); + +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But only ack when the APIC is enabled -AK + */ + if (cpu_has_apic) + ack_APIC_irq(); +#endif +} + +#ifdef CONFIG_4KSTACKS +/* + * per-CPU IRQ handling contexts (thread information and stack) + */ +union irq_ctx { + struct thread_info tinfo; + u32 stack[THREAD_SIZE/sizeof(u32)]; +}; + +static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; +static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; +#endif + +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ +fastcall unsigned int do_IRQ(struct pt_regs *regs) +{ + struct pt_regs *old_regs; + /* high bit used in ret_from_ code */ + int irq = ~regs->orig_eax; + struct irq_desc *desc = irq_desc + irq; +#ifdef CONFIG_4KSTACKS + union irq_ctx *curctx, *irqctx; + u32 *isp; +#endif + + if (unlikely((unsigned)irq >= NR_IRQS)) { + printk(KERN_EMERG "%s: cannot handle IRQ %d\n", + __FUNCTION__, irq); + BUG(); + } + + old_regs = set_irq_regs(regs); + irq_enter(); +#ifdef CONFIG_DEBUG_STACKOVERFLOW + /* Debugging check for stack overflow: is there less than 1KB free? */ + { + long esp; + + __asm__ __volatile__("andl %%esp,%0" : + "=r" (esp) : "0" (THREAD_SIZE - 1)); + if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { + printk("do_IRQ: stack overflow: %ld\n", + esp - sizeof(struct thread_info)); + dump_stack(); + } + } +#endif + +#ifdef CONFIG_4KSTACKS + + curctx = (union irq_ctx *) current_thread_info(); + irqctx = hardirq_ctx[smp_processor_id()]; + + /* + * this is where we switch to the IRQ stack. However, if we are + * already using the IRQ stack (because we interrupted a hardirq + * handler) we can't do that and just have to keep using the + * current stack (which is the irq stack already after all) + */ + if (curctx != irqctx) { + int arg1, arg2, ebx; + + /* build the stack frame on the IRQ stack */ + isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); + irqctx->tinfo.task = curctx->tinfo.task; + irqctx->tinfo.previous_esp = current_stack_pointer; + + /* + * Copy the softirq bits in preempt_count so that the + * softirq checks work in the hardirq context. + */ + irqctx->tinfo.preempt_count = + (irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) | + (curctx->tinfo.preempt_count & SOFTIRQ_MASK); + + asm volatile( + " xchgl %%ebx,%%esp \n" + " call *%%edi \n" + " movl %%ebx,%%esp \n" + : "=a" (arg1), "=d" (arg2), "=b" (ebx) + : "0" (irq), "1" (desc), "2" (isp), + "D" (desc->handle_irq) + : "memory", "cc" + ); + } else +#endif + desc->handle_irq(irq, desc); + + irq_exit(); + set_irq_regs(old_regs); + return 1; +} + +#ifdef CONFIG_4KSTACKS + +static char softirq_stack[NR_CPUS * THREAD_SIZE] + __attribute__((__section__(".bss.page_aligned"))); + +static char hardirq_stack[NR_CPUS * THREAD_SIZE] + __attribute__((__section__(".bss.page_aligned"))); + +/* + * allocate per-cpu stacks for hardirq and for softirq processing + */ +void irq_ctx_init(int cpu) +{ + union irq_ctx *irqctx; + + if (hardirq_ctx[cpu]) + return; + + irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE]; + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + + hardirq_ctx[cpu] = irqctx; + + irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE]; + irqctx->tinfo.task = NULL; + irqctx->tinfo.exec_domain = NULL; + irqctx->tinfo.cpu = cpu; + irqctx->tinfo.preempt_count = 0; + irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); + + softirq_ctx[cpu] = irqctx; + + printk("CPU %u irqstacks, hard=%p soft=%p\n", + cpu,hardirq_ctx[cpu],softirq_ctx[cpu]); +} + +void irq_ctx_exit(int cpu) +{ + hardirq_ctx[cpu] = NULL; +} + +extern asmlinkage void __do_softirq(void); + +asmlinkage void do_softirq(void) +{ + unsigned long flags; + struct thread_info *curctx; + union irq_ctx *irqctx; + u32 *isp; + + if (in_interrupt()) + return; + + local_irq_save(flags); + + if (local_softirq_pending()) { + curctx = current_thread_info(); + irqctx = softirq_ctx[smp_processor_id()]; + irqctx->tinfo.task = curctx->task; + irqctx->tinfo.previous_esp = current_stack_pointer; + + /* build the stack frame on the softirq stack */ + isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); + + asm volatile( + " xchgl %%ebx,%%esp \n" + " call __do_softirq \n" + " movl %%ebx,%%esp \n" + : "=b"(isp) + : "0"(isp) + : "memory", "cc", "edx", "ecx", "eax" + ); + /* + * Shouldnt happen, we returned above if in_interrupt(): + */ + WARN_ON_ONCE(softirq_count()); + } + + local_irq_restore(flags); +} + +EXPORT_SYMBOL(do_softirq); +#endif + +/* + * Interrupt statistics: + */ + +atomic_t irq_err_count; + +/* + * /proc/interrupts printing: + */ + +int show_interrupts(struct seq_file *p, void *v) +{ + int i = *(loff_t *) v, j; + struct irqaction * action; + unsigned long flags; + + if (i == 0) { + seq_printf(p, " "); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d",j); + seq_putc(p, '\n'); + } + + if (i < NR_IRQS) { + spin_lock_irqsave(&irq_desc[i].lock, flags); + action = irq_desc[i].action; + if (!action) + goto skip; + seq_printf(p, "%3d: ",i); +#ifndef CONFIG_SMP + seq_printf(p, "%10u ", kstat_irqs(i)); +#else + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); +#endif + seq_printf(p, " %8s", irq_desc[i].chip->name); + seq_printf(p, "-%-8s", irq_desc[i].name); + seq_printf(p, " %s", action->name); + + for (action=action->next; action; action = action->next) + seq_printf(p, ", %s", action->name); + + seq_putc(p, '\n'); +skip: + spin_unlock_irqrestore(&irq_desc[i].lock, flags); + } else if (i == NR_IRQS) { + seq_printf(p, "NMI: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", nmi_count(j)); + seq_putc(p, '\n'); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "LOC: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + per_cpu(irq_stat,j).apic_timer_irqs); + seq_putc(p, '\n'); +#endif + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); +#endif + } + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +#include + +void fixup_irqs(cpumask_t map) +{ + unsigned int irq; + static int warned; + + for (irq = 0; irq < NR_IRQS; irq++) { + cpumask_t mask; + if (irq == 2) + continue; + + cpus_and(mask, irq_desc[irq].affinity, map); + if (any_online_cpu(mask) == NR_CPUS) { + printk("Breaking affinity for irq %i\n", irq); + mask = map; + } + if (irq_desc[irq].chip->set_affinity) + irq_desc[irq].chip->set_affinity(irq, mask); + else if (irq_desc[irq].action && !(warned++)) + printk("Cannot set affinity for irq %i\n", irq); + } + +#if 0 + barrier(); + /* Ingo Molnar says: "after the IO-APIC masks have been redirected + [note the nop - the interrupt-enable boundary on x86 is two + instructions from sti] - to flush out pending hardirqs and + IPIs. After this point nothing is supposed to reach this CPU." */ + __asm__ __volatile__("sti; nop; cli"); + barrier(); +#else + /* That doesn't seem sufficient. Give it 1ms. */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); +#endif +} +#endif + diff --git a/arch/x86/kernel/kprobes_32.c b/arch/x86/kernel/kprobes_32.c new file mode 100644 index 000000000000..448a50b1324c --- /dev/null +++ b/arch/x86/kernel/kprobes_32.c @@ -0,0 +1,751 @@ +/* + * Kernel Probes (KProbes) + * arch/i386/kernel/kprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S Kernel + * Probes initial implementation ( includes contributions from + * Rusty Russell). + * 2004-July Suparna Bhattacharya added jumper probes + * interface to access function arguments. + * 2005-May Hien Nguyen , Jim Keniston + * and Prasanna S Panchamukhi + * added function-return probes. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +void jprobe_return_end(void); + +DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; +DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); + +/* insert a jmp code */ +static __always_inline void set_jmp_op(void *from, void *to) +{ + struct __arch_jmp_op { + char op; + long raddr; + } __attribute__((packed)) *jop; + jop = (struct __arch_jmp_op *)from; + jop->raddr = (long)(to) - ((long)(from) + 5); + jop->op = RELATIVEJUMP_INSTRUCTION; +} + +/* + * returns non-zero if opcodes can be boosted. + */ +static __always_inline int can_boost(kprobe_opcode_t *opcodes) +{ +#define W(row,b0,b1,b2,b3,b4,b5,b6,b7,b8,b9,ba,bb,bc,bd,be,bf) \ + (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ + (b4##UL << 0x4)|(b5##UL << 0x5)|(b6##UL << 0x6)|(b7##UL << 0x7) | \ + (b8##UL << 0x8)|(b9##UL << 0x9)|(ba##UL << 0xa)|(bb##UL << 0xb) | \ + (bc##UL << 0xc)|(bd##UL << 0xd)|(be##UL << 0xe)|(bf##UL << 0xf)) \ + << (row % 32)) + /* + * Undefined/reserved opcodes, conditional jump, Opcode Extension + * Groups, and some special opcodes can not be boost. + */ + static const unsigned long twobyte_is_boostable[256 / 32] = { + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + /* ------------------------------- */ + W(0x00, 0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0)| /* 00 */ + W(0x10, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 10 */ + W(0x20, 1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0)| /* 20 */ + W(0x30, 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 30 */ + W(0x40, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)| /* 40 */ + W(0x50, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0), /* 50 */ + W(0x60, 1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1)| /* 60 */ + W(0x70, 0,0,0,0,1,1,1,1,0,0,0,0,0,0,1,1), /* 70 */ + W(0x80, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)| /* 80 */ + W(0x90, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1), /* 90 */ + W(0xa0, 1,1,0,1,1,1,0,0,1,1,0,1,1,1,0,1)| /* a0 */ + W(0xb0, 1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1), /* b0 */ + W(0xc0, 1,1,0,0,0,0,0,0,1,1,1,1,1,1,1,1)| /* c0 */ + W(0xd0, 0,1,1,1,0,1,0,0,1,1,0,1,1,1,0,1), /* d0 */ + W(0xe0, 0,1,1,0,0,1,0,0,1,1,0,1,1,1,0,1)| /* e0 */ + W(0xf0, 0,1,1,1,0,1,0,0,1,1,1,0,1,1,1,0) /* f0 */ + /* ------------------------------- */ + /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ + }; +#undef W + kprobe_opcode_t opcode; + kprobe_opcode_t *orig_opcodes = opcodes; +retry: + if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) + return 0; + opcode = *(opcodes++); + + /* 2nd-byte opcode */ + if (opcode == 0x0f) { + if (opcodes - orig_opcodes > MAX_INSN_SIZE - 1) + return 0; + return test_bit(*opcodes, twobyte_is_boostable); + } + + switch (opcode & 0xf0) { + case 0x60: + if (0x63 < opcode && opcode < 0x67) + goto retry; /* prefixes */ + /* can't boost Address-size override and bound */ + return (opcode != 0x62 && opcode != 0x67); + case 0x70: + return 0; /* can't boost conditional jump */ + case 0xc0: + /* can't boost software-interruptions */ + return (0xc1 < opcode && opcode < 0xcc) || opcode == 0xcf; + case 0xd0: + /* can boost AA* and XLAT */ + return (opcode == 0xd4 || opcode == 0xd5 || opcode == 0xd7); + case 0xe0: + /* can boost in/out and absolute jmps */ + return ((opcode & 0x04) || opcode == 0xea); + case 0xf0: + if ((opcode & 0x0c) == 0 && opcode != 0xf1) + goto retry; /* lock/rep(ne) prefix */ + /* clear and set flags can be boost */ + return (opcode == 0xf5 || (0xf7 < opcode && opcode < 0xfe)); + default: + if (opcode == 0x26 || opcode == 0x36 || opcode == 0x3e) + goto retry; /* prefixes */ + /* can't boost CS override and call */ + return (opcode != 0x2e && opcode != 0x9a); + } +} + +/* + * returns non-zero if opcode modifies the interrupt flag. + */ +static int __kprobes is_IF_modifier(kprobe_opcode_t opcode) +{ + switch (opcode) { + case 0xfa: /* cli */ + case 0xfb: /* sti */ + case 0xcf: /* iret/iretd */ + case 0x9d: /* popf/popfd */ + return 1; + } + return 0; +} + +int __kprobes arch_prepare_kprobe(struct kprobe *p) +{ + /* insn: must be on special executable page on i386. */ + p->ainsn.insn = get_insn_slot(); + if (!p->ainsn.insn) + return -ENOMEM; + + memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + p->opcode = *p->addr; + if (can_boost(p->addr)) { + p->ainsn.boostable = 0; + } else { + p->ainsn.boostable = -1; + } + return 0; +} + +void __kprobes arch_arm_kprobe(struct kprobe *p) +{ + text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); +} + +void __kprobes arch_disarm_kprobe(struct kprobe *p) +{ + text_poke(p->addr, &p->opcode, 1); +} + +void __kprobes arch_remove_kprobe(struct kprobe *p) +{ + mutex_lock(&kprobe_mutex); + free_insn_slot(p->ainsn.insn, (p->ainsn.boostable == 1)); + mutex_unlock(&kprobe_mutex); +} + +static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + kcb->prev_kprobe.kp = kprobe_running(); + kcb->prev_kprobe.status = kcb->kprobe_status; + kcb->prev_kprobe.old_eflags = kcb->kprobe_old_eflags; + kcb->prev_kprobe.saved_eflags = kcb->kprobe_saved_eflags; +} + +static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; + kcb->kprobe_status = kcb->prev_kprobe.status; + kcb->kprobe_old_eflags = kcb->prev_kprobe.old_eflags; + kcb->kprobe_saved_eflags = kcb->prev_kprobe.saved_eflags; +} + +static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, + struct kprobe_ctlblk *kcb) +{ + __get_cpu_var(current_kprobe) = p; + kcb->kprobe_saved_eflags = kcb->kprobe_old_eflags + = (regs->eflags & (TF_MASK | IF_MASK)); + if (is_IF_modifier(p->opcode)) + kcb->kprobe_saved_eflags &= ~IF_MASK; +} + +static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) +{ + regs->eflags |= TF_MASK; + regs->eflags &= ~IF_MASK; + /*single step inline if the instruction is an int3*/ + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->eip = (unsigned long)p->addr; + else + regs->eip = (unsigned long)p->ainsn.insn; +} + +/* Called with kretprobe_lock held */ +void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + unsigned long *sara = (unsigned long *)®s->esp; + + ri->ret_addr = (kprobe_opcode_t *) *sara; + + /* Replace the return addr with trampoline addr */ + *sara = (unsigned long) &kretprobe_trampoline; +} + +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate and they + * remain disabled thorough out this function. + */ +static int __kprobes kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *p; + int ret = 0; + kprobe_opcode_t *addr; + struct kprobe_ctlblk *kcb; + + addr = (kprobe_opcode_t *)(regs->eip - sizeof(kprobe_opcode_t)); + + /* + * We don't want to be preempted for the entire + * duration of kprobe processing + */ + preempt_disable(); + kcb = get_kprobe_ctlblk(); + + /* Check we're not actually recursing */ + if (kprobe_running()) { + p = get_kprobe(addr); + if (p) { + if (kcb->kprobe_status == KPROBE_HIT_SS && + *p->ainsn.insn == BREAKPOINT_INSTRUCTION) { + regs->eflags &= ~TF_MASK; + regs->eflags |= kcb->kprobe_saved_eflags; + goto no_kprobe; + } + /* We have reentered the kprobe_handler(), since + * another probe was hit while within the handler. + * We here save the original kprobes variables and + * just single step on the instruction of the new probe + * without calling any user handlers. + */ + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kprobes_inc_nmissed_count(p); + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_REENTER; + return 1; + } else { + if (*addr != BREAKPOINT_INSTRUCTION) { + /* The breakpoint instruction was removed by + * another cpu right after we hit, no further + * handling of this interrupt is appropriate + */ + regs->eip -= sizeof(kprobe_opcode_t); + ret = 1; + goto no_kprobe; + } + p = __get_cpu_var(current_kprobe); + if (p->break_handler && p->break_handler(p, regs)) { + goto ss_probe; + } + } + goto no_kprobe; + } + + p = get_kprobe(addr); + if (!p) { + if (*addr != BREAKPOINT_INSTRUCTION) { + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + * Back up over the (now missing) int3 and run + * the original instruction. + */ + regs->eip -= sizeof(kprobe_opcode_t); + ret = 1; + } + /* Not one of ours: let kernel handle it */ + goto no_kprobe; + } + + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + + if (p->pre_handler && p->pre_handler(p, regs)) + /* handler has already set things up, so skip ss setup */ + return 1; + +ss_probe: +#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PM) + if (p->ainsn.boostable == 1 && !p->post_handler){ + /* Boost up -- we can execute copied instructions directly */ + reset_current_kprobe(); + regs->eip = (unsigned long)p->ainsn.insn; + preempt_enable_no_resched(); + return 1; + } +#endif + prepare_singlestep(p, regs); + kcb->kprobe_status = KPROBE_HIT_SS; + return 1; + +no_kprobe: + preempt_enable_no_resched(); + return ret; +} + +/* + * For function-return probes, init_kprobes() establishes a probepoint + * here. When a retprobed function returns, this probe is hit and + * trampoline_probe_handler() runs, calling the kretprobe's handler. + */ + void __kprobes kretprobe_trampoline_holder(void) + { + asm volatile ( ".global kretprobe_trampoline\n" + "kretprobe_trampoline: \n" + " pushf\n" + /* skip cs, eip, orig_eax */ + " subl $12, %esp\n" + " pushl %fs\n" + " pushl %ds\n" + " pushl %es\n" + " pushl %eax\n" + " pushl %ebp\n" + " pushl %edi\n" + " pushl %esi\n" + " pushl %edx\n" + " pushl %ecx\n" + " pushl %ebx\n" + " movl %esp, %eax\n" + " call trampoline_handler\n" + /* move eflags to cs */ + " movl 52(%esp), %edx\n" + " movl %edx, 48(%esp)\n" + /* save true return address on eflags */ + " movl %eax, 52(%esp)\n" + " popl %ebx\n" + " popl %ecx\n" + " popl %edx\n" + " popl %esi\n" + " popl %edi\n" + " popl %ebp\n" + " popl %eax\n" + /* skip eip, orig_eax, es, ds, fs */ + " addl $20, %esp\n" + " popf\n" + " ret\n"); +} + +/* + * Called from kretprobe_trampoline + */ +fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) +{ + struct kretprobe_instance *ri = NULL; + struct hlist_head *head, empty_rp; + struct hlist_node *node, *tmp; + unsigned long flags, orig_ret_address = 0; + unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; + + INIT_HLIST_HEAD(&empty_rp); + spin_lock_irqsave(&kretprobe_lock, flags); + head = kretprobe_inst_table_head(current); + /* fixup registers */ + regs->xcs = __KERNEL_CS | get_kernel_rpl(); + regs->eip = trampoline_address; + regs->orig_eax = 0xffffffff; + + /* + * It is possible to have multiple instances associated with a given + * task either because an multiple functions in the call path + * have a return probe installed on them, and/or more then one return + * return probe was registered for a target function. + * + * We can handle this because: + * - instances are always inserted at the head of the list + * - when multiple return probes are registered for the same + * function, the first instance's ret_addr will point to the + * real return address, and all the rest will point to + * kretprobe_trampoline + */ + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + if (ri->rp && ri->rp->handler){ + __get_cpu_var(current_kprobe) = &ri->rp->kp; + get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; + ri->rp->handler(ri, regs); + __get_cpu_var(current_kprobe) = NULL; + } + + orig_ret_address = (unsigned long)ri->ret_addr; + recycle_rp_inst(ri, &empty_rp); + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_assert(ri, orig_ret_address, trampoline_address); + spin_unlock_irqrestore(&kretprobe_lock, flags); + + hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } + return (void*)orig_ret_address; +} + +/* + * Called after single-stepping. p->addr is the address of the + * instruction whose first byte has been replaced by the "int 3" + * instruction. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. The address of this + * copy is p->ainsn.insn. + * + * This function prepares to return from the post-single-step + * interrupt. We have to fix up the stack as follows: + * + * 0) Except in the case of absolute or indirect jump or call instructions, + * the new eip is relative to the copied instruction. We need to make + * it relative to the original instruction. + * + * 1) If the single-stepped instruction was pushfl, then the TF and IF + * flags are set in the just-pushed eflags, and may need to be cleared. + * + * 2) If the single-stepped instruction was a call, the return address + * that is atop the stack is the address following the copied instruction. + * We need to make it the address following the original instruction. + * + * This function also checks instruction size for preparing direct execution. + */ +static void __kprobes resume_execution(struct kprobe *p, + struct pt_regs *regs, struct kprobe_ctlblk *kcb) +{ + unsigned long *tos = (unsigned long *)®s->esp; + unsigned long copy_eip = (unsigned long)p->ainsn.insn; + unsigned long orig_eip = (unsigned long)p->addr; + + regs->eflags &= ~TF_MASK; + switch (p->ainsn.insn[0]) { + case 0x9c: /* pushfl */ + *tos &= ~(TF_MASK | IF_MASK); + *tos |= kcb->kprobe_old_eflags; + break; + case 0xc2: /* iret/ret/lret */ + case 0xc3: + case 0xca: + case 0xcb: + case 0xcf: + case 0xea: /* jmp absolute -- eip is correct */ + /* eip is already adjusted, no more changes required */ + p->ainsn.boostable = 1; + goto no_change; + case 0xe8: /* call relative - Fix return addr */ + *tos = orig_eip + (*tos - copy_eip); + break; + case 0x9a: /* call absolute -- same as call absolute, indirect */ + *tos = orig_eip + (*tos - copy_eip); + goto no_change; + case 0xff: + if ((p->ainsn.insn[1] & 0x30) == 0x10) { + /* + * call absolute, indirect + * Fix return addr; eip is correct. + * But this is not boostable + */ + *tos = orig_eip + (*tos - copy_eip); + goto no_change; + } else if (((p->ainsn.insn[1] & 0x31) == 0x20) || /* jmp near, absolute indirect */ + ((p->ainsn.insn[1] & 0x31) == 0x21)) { /* jmp far, absolute indirect */ + /* eip is correct. And this is boostable */ + p->ainsn.boostable = 1; + goto no_change; + } + default: + break; + } + + if (p->ainsn.boostable == 0) { + if ((regs->eip > copy_eip) && + (regs->eip - copy_eip) + 5 < MAX_INSN_SIZE) { + /* + * These instructions can be executed directly if it + * jumps back to correct address. + */ + set_jmp_op((void *)regs->eip, + (void *)orig_eip + (regs->eip - copy_eip)); + p->ainsn.boostable = 1; + } else { + p->ainsn.boostable = -1; + } + } + + regs->eip = orig_eip + (regs->eip - copy_eip); + +no_change: + return; +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate and they + * remain disabled thoroughout this function. + */ +static int __kprobes post_kprobe_handler(struct pt_regs *regs) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + if (!cur) + return 0; + + if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { + kcb->kprobe_status = KPROBE_HIT_SSDONE; + cur->post_handler(cur, regs, 0); + } + + resume_execution(cur, regs, kcb); + regs->eflags |= kcb->kprobe_saved_eflags; + + /*Restore back the original saved kprobes variables and continue. */ + if (kcb->kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(kcb); + goto out; + } + reset_current_kprobe(); +out: + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, eflags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->eflags & TF_MASK) + return 0; + + return 1; +} + +static int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) +{ + struct kprobe *cur = kprobe_running(); + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + switch(kcb->kprobe_status) { + case KPROBE_HIT_SS: + case KPROBE_REENTER: + /* + * We are here because the instruction being single + * stepped caused a page fault. We reset the current + * kprobe and the eip points back to the probe address + * and allow the page fault handler to continue as a + * normal page fault. + */ + regs->eip = (unsigned long)cur->addr; + regs->eflags |= kcb->kprobe_old_eflags; + if (kcb->kprobe_status == KPROBE_REENTER) + restore_previous_kprobe(kcb); + else + reset_current_kprobe(); + preempt_enable_no_resched(); + break; + case KPROBE_HIT_ACTIVE: + case KPROBE_HIT_SSDONE: + /* + * We increment the nmissed count for accounting, + * we can also use npre/npostfault count for accouting + * these specific fault cases. + */ + kprobes_inc_nmissed_count(cur); + + /* + * We come here because instructions in the pre/post + * handler caused the page_fault, this could happen + * if handler tries to access user space by + * copy_from_user(), get_user() etc. Let the + * user-specified handler try to fix it first. + */ + if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr)) + return 1; + + /* + * In case the user-specified fault handler returned + * zero, try to fix up. + */ + if (fixup_exception(regs)) + return 1; + + /* + * fixup_exception() could not handle it, + * Let do_page_fault() fix it. + */ + break; + default: + break; + } + return 0; +} + +/* + * Wrapper routine to for handling exceptions. + */ +int __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct die_args *args = (struct die_args *)data; + int ret = NOTIFY_DONE; + + if (args->regs && user_mode_vm(args->regs)) + return ret; + + switch (val) { + case DIE_INT3: + if (kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_DEBUG: + if (post_kprobe_handler(args->regs)) + ret = NOTIFY_STOP; + break; + case DIE_GPF: + case DIE_PAGE_FAULT: + /* kprobe_running() needs smp_processor_id() */ + preempt_disable(); + if (kprobe_running() && + kprobe_fault_handler(args->regs, args->trapnr)) + ret = NOTIFY_STOP; + preempt_enable(); + break; + default: + break; + } + return ret; +} + +int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + unsigned long addr; + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + kcb->jprobe_saved_regs = *regs; + kcb->jprobe_saved_esp = ®s->esp; + addr = (unsigned long)(kcb->jprobe_saved_esp); + + /* + * TBD: As Linus pointed out, gcc assumes that the callee + * owns the argument space and could overwrite it, e.g. + * tailcall optimization. So, to be absolutely safe + * we also save and restore enough stack bytes to cover + * the argument area. + */ + memcpy(kcb->jprobes_stack, (kprobe_opcode_t *)addr, + MIN_STACK_SIZE(addr)); + regs->eflags &= ~IF_MASK; + regs->eip = (unsigned long)(jp->entry); + return 1; +} + +void __kprobes jprobe_return(void) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + + asm volatile (" xchgl %%ebx,%%esp \n" + " int3 \n" + " .globl jprobe_return_end \n" + " jprobe_return_end: \n" + " nop \n"::"b" + (kcb->jprobe_saved_esp):"memory"); +} + +int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + u8 *addr = (u8 *) (regs->eip - 1); + unsigned long stack_addr = (unsigned long)(kcb->jprobe_saved_esp); + struct jprobe *jp = container_of(p, struct jprobe, kp); + + if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { + if (®s->esp != kcb->jprobe_saved_esp) { + struct pt_regs *saved_regs = + container_of(kcb->jprobe_saved_esp, + struct pt_regs, esp); + printk("current esp %p does not match saved esp %p\n", + ®s->esp, kcb->jprobe_saved_esp); + printk("Saved registers for jprobe %p\n", jp); + show_registers(saved_regs); + printk("Current registers\n"); + show_registers(regs); + BUG(); + } + *regs = kcb->jprobe_saved_regs; + memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, + MIN_STACK_SIZE(stack_addr)); + preempt_enable_no_resched(); + return 1; + } + return 0; +} + +int __kprobes arch_trampoline_kprobe(struct kprobe *p) +{ + return 0; +} + +int __init arch_init_kprobes(void) +{ + return 0; +} diff --git a/arch/x86/kernel/ldt_32.c b/arch/x86/kernel/ldt_32.c new file mode 100644 index 000000000000..e0b2d17f4f10 --- /dev/null +++ b/arch/x86/kernel/ldt_32.c @@ -0,0 +1,250 @@ +/* + * linux/arch/i386/kernel/ldt.c + * + * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds + * Copyright (C) 1999 Ingo Molnar + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ +static void flush_ldt(void *null) +{ + if (current->active_mm) + load_LDT(¤t->active_mm->context); +} +#endif + +static int alloc_ldt(mm_context_t *pc, int mincount, int reload) +{ + void *oldldt; + void *newldt; + int oldsize; + + if (mincount <= pc->size) + return 0; + oldsize = pc->size; + mincount = (mincount+511)&(~511); + if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) + newldt = vmalloc(mincount*LDT_ENTRY_SIZE); + else + newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); + + if (!newldt) + return -ENOMEM; + + if (oldsize) + memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); + oldldt = pc->ldt; + memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); + pc->ldt = newldt; + wmb(); + pc->size = mincount; + wmb(); + + if (reload) { +#ifdef CONFIG_SMP + cpumask_t mask; + preempt_disable(); + load_LDT(pc); + mask = cpumask_of_cpu(smp_processor_id()); + if (!cpus_equal(current->mm->cpu_vm_mask, mask)) + smp_call_function(flush_ldt, NULL, 1, 1); + preempt_enable(); +#else + load_LDT(pc); +#endif + } + if (oldsize) { + if (oldsize*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(oldldt); + else + kfree(oldldt); + } + return 0; +} + +static inline int copy_ldt(mm_context_t *new, mm_context_t *old) +{ + int err = alloc_ldt(new, old->size, 0); + if (err < 0) + return err; + memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + return 0; +} + +/* + * we do not have to muck with descriptors here, that is + * done in switch_mm() as needed. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm) +{ + struct mm_struct * old_mm; + int retval = 0; + + init_MUTEX(&mm->context.sem); + mm->context.size = 0; + old_mm = current->mm; + if (old_mm && old_mm->context.size > 0) { + down(&old_mm->context.sem); + retval = copy_ldt(&mm->context, &old_mm->context); + up(&old_mm->context.sem); + } + return retval; +} + +/* + * No need to lock the MM as we are the last user + */ +void destroy_context(struct mm_struct *mm) +{ + if (mm->context.size) { + if (mm == current->active_mm) + clear_LDT(); + if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(mm->context.ldt); + else + kfree(mm->context.ldt); + mm->context.size = 0; + } +} + +static int read_ldt(void __user * ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + struct mm_struct * mm = current->mm; + + if (!mm->context.size) + return 0; + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; + + down(&mm->context.sem); + size = mm->context.size*LDT_ENTRY_SIZE; + if (size > bytecount) + size = bytecount; + + err = 0; + if (copy_to_user(ptr, mm->context.ldt, size)) + err = -EFAULT; + up(&mm->context.sem); + if (err < 0) + goto error_return; + if (size != bytecount) { + /* zero-fill the rest */ + if (clear_user(ptr+size, bytecount-size) != 0) { + err = -EFAULT; + goto error_return; + } + } + return bytecount; +error_return: + return err; +} + +static int read_default_ldt(void __user * ptr, unsigned long bytecount) +{ + int err; + unsigned long size; + + err = 0; + size = 5*sizeof(struct desc_struct); + if (size > bytecount) + size = bytecount; + + err = size; + if (clear_user(ptr, size)) + err = -EFAULT; + + return err; +} + +static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode) +{ + struct mm_struct * mm = current->mm; + __u32 entry_1, entry_2; + int error; + struct user_desc ldt_info; + + error = -EINVAL; + if (bytecount != sizeof(ldt_info)) + goto out; + error = -EFAULT; + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) + goto out; + + error = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (oldmode) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + down(&mm->context.sem); + if (ldt_info.entry_number >= mm->context.size) { + error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); + if (error < 0) + goto out_unlock; + } + + /* Allow LDTs to be cleared by the user. */ + if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { + if (oldmode || LDT_empty(&ldt_info)) { + entry_1 = 0; + entry_2 = 0; + goto install; + } + } + + entry_1 = LDT_entry_a(&ldt_info); + entry_2 = LDT_entry_b(&ldt_info); + if (oldmode) + entry_2 &= ~(1 << 20); + + /* Install the new entry ... */ +install: + write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2); + error = 0; + +out_unlock: + up(&mm->context.sem); +out: + return error; +} + +asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + ret = write_ldt(ptr, bytecount, 1); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + case 0x11: + ret = write_ldt(ptr, bytecount, 0); + break; + } + return ret; +} diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c new file mode 100644 index 000000000000..91966bafb3dc --- /dev/null +++ b/arch/x86/kernel/machine_kexec_32.c @@ -0,0 +1,171 @@ +/* + * machine_kexec.c - handle transition of Linux booting another kernel + * Copyright (C) 2002-2005 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) +static u32 kexec_pgd[1024] PAGE_ALIGNED; +#ifdef CONFIG_X86_PAE +static u32 kexec_pmd0[1024] PAGE_ALIGNED; +static u32 kexec_pmd1[1024] PAGE_ALIGNED; +#endif +static u32 kexec_pte0[1024] PAGE_ALIGNED; +static u32 kexec_pte1[1024] PAGE_ALIGNED; + +static void set_idt(void *newidt, __u16 limit) +{ + struct Xgt_desc_struct curidt; + + /* ia32 supports unaliged loads & stores */ + curidt.size = limit; + curidt.address = (unsigned long)newidt; + + load_idt(&curidt); +}; + + +static void set_gdt(void *newgdt, __u16 limit) +{ + struct Xgt_desc_struct curgdt; + + /* ia32 supports unaligned loads & stores */ + curgdt.size = limit; + curgdt.address = (unsigned long)newgdt; + + load_gdt(&curgdt); +}; + +static void load_segments(void) +{ +#define __STR(X) #X +#define STR(X) __STR(X) + + __asm__ __volatile__ ( + "\tljmp $"STR(__KERNEL_CS)",$1f\n" + "\t1:\n" + "\tmovl $"STR(__KERNEL_DS)",%%eax\n" + "\tmovl %%eax,%%ds\n" + "\tmovl %%eax,%%es\n" + "\tmovl %%eax,%%fs\n" + "\tmovl %%eax,%%gs\n" + "\tmovl %%eax,%%ss\n" + ::: "eax", "memory"); +#undef STR +#undef __STR +} + +/* + * A architecture hook called to validate the + * proposed image and prepare the control pages + * as needed. The pages for KEXEC_CONTROL_CODE_SIZE + * have been allocated, but the segments have yet + * been copied into the kernel. + * + * Do what every setup is needed on image and the + * reboot code buffer to allow us to avoid allocations + * later. + * + * Currently nothing. + */ +int machine_kexec_prepare(struct kimage *image) +{ + return 0; +} + +/* + * Undo anything leftover by machine_kexec_prepare + * when an image is freed. + */ +void machine_kexec_cleanup(struct kimage *image) +{ +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +NORET_TYPE void machine_kexec(struct kimage *image) +{ + unsigned long page_list[PAGES_NR]; + void *control_page; + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + control_page = page_address(image->control_code_page); + memcpy(control_page, relocate_kernel, PAGE_SIZE); + + page_list[PA_CONTROL_PAGE] = __pa(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; + page_list[PA_PGD] = __pa(kexec_pgd); + page_list[VA_PGD] = (unsigned long)kexec_pgd; +#ifdef CONFIG_X86_PAE + page_list[PA_PMD_0] = __pa(kexec_pmd0); + page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; + page_list[PA_PMD_1] = __pa(kexec_pmd1); + page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; +#endif + page_list[PA_PTE_0] = __pa(kexec_pte0); + page_list[VA_PTE_0] = (unsigned long)kexec_pte0; + page_list[PA_PTE_1] = __pa(kexec_pte1); + page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + + /* The segment registers are funny things, they have both a + * visible and an invisible part. Whenever the visible part is + * set to a specific selector, the invisible part is loaded + * with from a table in memory. At no other time is the + * descriptor table in memory accessed. + * + * I take advantage of this here by force loading the + * segments, before I zap the gdt with an invalid value. + */ + load_segments(); + /* The gdt & idt are now invalid. + * If you want to load them you must set up your own idt & gdt. + */ + set_gdt(phys_to_virt(0),0); + set_idt(phys_to_virt(0),0); + + /* now call it */ + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, + image->start, cpu_has_pae); +} + +/* crashkernel=size@addr specifies the location to reserve for + * a crash kernel. By reserving this memory we guarantee + * that linux never sets it up as a DMA target. + * Useful for holding code to do something appropriate + * after a kernel panic. + */ +static int __init parse_crashkernel(char *arg) +{ + unsigned long size, base; + size = memparse(arg, &arg); + if (*arg == '@') { + base = memparse(arg+1, &arg); + /* FIXME: Do I want a sanity check + * to validate the memory range? + */ + crashk_res.start = base; + crashk_res.end = base + size - 1; + } + return 0; +} +early_param("crashkernel", parse_crashkernel); diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c new file mode 100644 index 000000000000..b83672b89527 --- /dev/null +++ b/arch/x86/kernel/mca_32.c @@ -0,0 +1,470 @@ +/* + * linux/arch/i386/kernel/mca.c + * Written by Martin Kolinek, February 1996 + * + * Changes: + * + * Chris Beauregard July 28th, 1996 + * - Fixed up integrated SCSI detection + * + * Chris Beauregard August 3rd, 1996 + * - Made mca_info local + * - Made integrated registers accessible through standard function calls + * - Added name field + * - More sanity checking + * + * Chris Beauregard August 9th, 1996 + * - Rewrote /proc/mca + * + * Chris Beauregard January 7th, 1997 + * - Added basic NMI-processing + * - Added more information to mca_info structure + * + * David Weinehall October 12th, 1998 + * - Made a lot of cleaning up in the source + * - Added use of save_flags / restore_flags + * - Added the 'driver_loaded' flag in MCA_adapter + * - Added an alternative implemention of ZP Gu's mca_find_unused_adapter + * + * David Weinehall March 24th, 1999 + * - Fixed the output of 'Driver Installed' in /proc/mca/pos + * - Made the Integrated Video & SCSI show up even if they have id 0000 + * + * Alexander Viro November 9th, 1999 + * - Switched to regular procfs methods + * + * Alfred Arnold & David Weinehall August 23rd, 2000 + * - Added support for Planar POS-registers + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned char which_scsi = 0; + +int MCA_bus = 0; +EXPORT_SYMBOL(MCA_bus); + +/* + * Motherboard register spinlock. Untested on SMP at the moment, but + * are there any MCA SMP boxes? + * + * Yes - Alan + */ +static DEFINE_SPINLOCK(mca_lock); + +/* Build the status info for the adapter */ + +static void mca_configure_adapter_status(struct mca_device *mca_dev) { + mca_dev->status = MCA_ADAPTER_NONE; + + mca_dev->pos_id = mca_dev->pos[0] + + (mca_dev->pos[1] << 8); + + if(!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) { + + /* id = 0x0000 usually indicates hardware failure, + * however, ZP Gu (zpg@castle.net> reports that his 9556 + * has 0x0000 as id and everything still works. There + * also seem to be an adapter with id = 0x0000; the + * NCR Parallel Bus Memory Card. Until this is confirmed, + * however, this code will stay. + */ + + mca_dev->status = MCA_ADAPTER_ERROR; + + return; + } else if(mca_dev->pos_id != 0xffff) { + + /* 0xffff usually indicates that there's no adapter, + * however, some integrated adapters may have 0xffff as + * their id and still be valid. Examples are on-board + * VGA of the 55sx, the integrated SCSI of the 56 & 57, + * and possibly also the 95 ULTIMEDIA. + */ + + mca_dev->status = MCA_ADAPTER_NORMAL; + } + + if((mca_dev->pos_id == 0xffff || + mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) { + int j; + + for(j = 2; j < 8; j++) { + if(mca_dev->pos[j] != 0xff) { + mca_dev->status = MCA_ADAPTER_NORMAL; + break; + } + } + } + + if(!(mca_dev->pos[2] & MCA_ENABLED)) { + + /* enabled bit is in POS 2 */ + + mca_dev->status = MCA_ADAPTER_DISABLED; + } +} /* mca_configure_adapter_status */ + +/*--------------------------------------------------------------------*/ + +static struct resource mca_standard_resources[] = { + { .start = 0x60, .end = 0x60, .name = "system control port B (MCA)" }, + { .start = 0x90, .end = 0x90, .name = "arbitration (MCA)" }, + { .start = 0x91, .end = 0x91, .name = "card Select Feedback (MCA)" }, + { .start = 0x92, .end = 0x92, .name = "system Control port A (MCA)" }, + { .start = 0x94, .end = 0x94, .name = "system board setup (MCA)" }, + { .start = 0x96, .end = 0x97, .name = "POS (MCA)" }, + { .start = 0x100, .end = 0x107, .name = "POS (MCA)" } +}; + +#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources) + +/** + * mca_read_and_store_pos - read the POS registers into a memory buffer + * @pos: a char pointer to 8 bytes, contains the POS register value on + * successful return + * + * Returns 1 if a card actually exists (i.e. the pos isn't + * all 0xff) or 0 otherwise + */ +static int mca_read_and_store_pos(unsigned char *pos) { + int j; + int found = 0; + + for(j=0; j<8; j++) { + if((pos[j] = inb_p(MCA_POS_REG(j))) != 0xff) { + /* 0xff all across means no device. 0x00 means + * something's broken, but a device is + * probably there. However, if you get 0x00 + * from a motherboard register it won't matter + * what we find. For the record, on the + * 57SLC, the integrated SCSI adapter has + * 0xffff for the adapter ID, but nonzero for + * other registers. */ + + found = 1; + } + } + return found; +} + +static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg) +{ + unsigned char byte; + unsigned long flags; + + if(reg < 0 || reg >= 8) + return 0; + + spin_lock_irqsave(&mca_lock, flags); + if(mca_dev->pos_register) { + /* Disable adapter setup, enable motherboard setup */ + + outb_p(0, MCA_ADAPTER_SETUP_REG); + outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); + + byte = inb_p(MCA_POS_REG(reg)); + outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); + } else { + + /* Make sure motherboard setup is off */ + + outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); + + /* Read the appropriate register */ + + outb_p(0x8|(mca_dev->slot & 0xf), MCA_ADAPTER_SETUP_REG); + byte = inb_p(MCA_POS_REG(reg)); + outb_p(0, MCA_ADAPTER_SETUP_REG); + } + spin_unlock_irqrestore(&mca_lock, flags); + + mca_dev->pos[reg] = byte; + + return byte; +} + +static void mca_pc_write_pos(struct mca_device *mca_dev, int reg, + unsigned char byte) +{ + unsigned long flags; + + if(reg < 0 || reg >= 8) + return; + + spin_lock_irqsave(&mca_lock, flags); + + /* Make sure motherboard setup is off */ + + outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); + + /* Read in the appropriate register */ + + outb_p(0x8|(mca_dev->slot&0xf), MCA_ADAPTER_SETUP_REG); + outb_p(byte, MCA_POS_REG(reg)); + outb_p(0, MCA_ADAPTER_SETUP_REG); + + spin_unlock_irqrestore(&mca_lock, flags); + + /* Update the global register list, while we have the byte */ + + mca_dev->pos[reg] = byte; + +} + +/* for the primary MCA bus, we have identity transforms */ +static int mca_dummy_transform_irq(struct mca_device * mca_dev, int irq) +{ + return irq; +} + +static int mca_dummy_transform_ioport(struct mca_device * mca_dev, int port) +{ + return port; +} + +static void *mca_dummy_transform_memory(struct mca_device * mca_dev, void *mem) +{ + return mem; +} + + +static int __init mca_init(void) +{ + unsigned int i, j; + struct mca_device *mca_dev; + unsigned char pos[8]; + short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00}; + struct mca_bus *bus; + + /* WARNING: Be careful when making changes here. Putting an adapter + * and the motherboard simultaneously into setup mode may result in + * damage to chips (according to The Indispensible PC Hardware Book + * by Hans-Peter Messmer). Also, we disable system interrupts (so + * that we are not disturbed in the middle of this). + */ + + /* Make sure the MCA bus is present */ + + if (mca_system_init()) { + printk(KERN_ERR "MCA bus system initialisation failed\n"); + return -ENODEV; + } + + if (!MCA_bus) + return -ENODEV; + + printk(KERN_INFO "Micro Channel bus detected.\n"); + + /* All MCA systems have at least a primary bus */ + bus = mca_attach_bus(MCA_PRIMARY_BUS); + if (!bus) + goto out_nomem; + bus->default_dma_mask = 0xffffffffLL; + bus->f.mca_write_pos = mca_pc_write_pos; + bus->f.mca_read_pos = mca_pc_read_pos; + bus->f.mca_transform_irq = mca_dummy_transform_irq; + bus->f.mca_transform_ioport = mca_dummy_transform_ioport; + bus->f.mca_transform_memory = mca_dummy_transform_memory; + + /* get the motherboard device */ + mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL); + if(unlikely(!mca_dev)) + goto out_nomem; + + /* + * We do not expect many MCA interrupts during initialization, + * but let us be safe: + */ + spin_lock_irq(&mca_lock); + + /* Make sure adapter setup is off */ + + outb_p(0, MCA_ADAPTER_SETUP_REG); + + /* Read motherboard POS registers */ + + mca_dev->pos_register = 0x7f; + outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); + mca_dev->name[0] = 0; + mca_read_and_store_pos(mca_dev->pos); + mca_configure_adapter_status(mca_dev); + /* fake POS and slot for a motherboard */ + mca_dev->pos_id = MCA_MOTHERBOARD_POS; + mca_dev->slot = MCA_MOTHERBOARD; + mca_register_device(MCA_PRIMARY_BUS, mca_dev); + + mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); + if(unlikely(!mca_dev)) + goto out_unlock_nomem; + + /* Put motherboard into video setup mode, read integrated video + * POS registers, and turn motherboard setup off. + */ + + mca_dev->pos_register = 0xdf; + outb_p(mca_dev->pos_register, MCA_MOTHERBOARD_SETUP_REG); + mca_dev->name[0] = 0; + mca_read_and_store_pos(mca_dev->pos); + mca_configure_adapter_status(mca_dev); + /* fake POS and slot for the integrated video */ + mca_dev->pos_id = MCA_INTEGVIDEO_POS; + mca_dev->slot = MCA_INTEGVIDEO; + mca_register_device(MCA_PRIMARY_BUS, mca_dev); + + /* Put motherboard into scsi setup mode, read integrated scsi + * POS registers, and turn motherboard setup off. + * + * It seems there are two possible SCSI registers. Martin says that + * for the 56,57, 0xf7 is the one, but fails on the 76. + * Alfredo (apena@vnet.ibm.com) says + * 0xfd works on his machine. We'll try both of them. I figure it's + * a good bet that only one could be valid at a time. This could + * screw up though if one is used for something else on the other + * machine. + */ + + for(i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) { + outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG); + if(mca_read_and_store_pos(pos)) + break; + } + if(which_scsi) { + /* found a scsi card */ + mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC); + if(unlikely(!mca_dev)) + goto out_unlock_nomem; + + for(j = 0; j < 8; j++) + mca_dev->pos[j] = pos[j]; + + mca_configure_adapter_status(mca_dev); + /* fake POS and slot for integrated SCSI controller */ + mca_dev->pos_id = MCA_INTEGSCSI_POS; + mca_dev->slot = MCA_INTEGSCSI; + mca_dev->pos_register = which_scsi; + mca_register_device(MCA_PRIMARY_BUS, mca_dev); + } + + /* Turn off motherboard setup */ + + outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG); + + /* Now loop over MCA slots: put each adapter into setup mode, and + * read its POS registers. Then put adapter setup off. + */ + + for(i=0; ipos[j]=pos[j]; + + mca_dev->driver_loaded = 0; + mca_dev->slot = i; + mca_dev->pos_register = 0; + mca_configure_adapter_status(mca_dev); + mca_register_device(MCA_PRIMARY_BUS, mca_dev); + } + outb_p(0, MCA_ADAPTER_SETUP_REG); + + /* Enable interrupts and return memory start */ + spin_unlock_irq(&mca_lock); + + for (i = 0; i < MCA_STANDARD_RESOURCES; i++) + request_resource(&ioport_resource, mca_standard_resources + i); + + mca_do_proc_init(); + + return 0; + + out_unlock_nomem: + spin_unlock_irq(&mca_lock); + out_nomem: + printk(KERN_EMERG "Failed memory allocation in MCA setup!\n"); + return -ENOMEM; +} + +subsys_initcall(mca_init); + +/*--------------------------------------------------------------------*/ + +static __kprobes void +mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag) +{ + int slot = mca_dev->slot; + + if(slot == MCA_INTEGSCSI) { + printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n", + mca_dev->name); + } else if(slot == MCA_INTEGVIDEO) { + printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n", + mca_dev->name); + } else if(slot == MCA_MOTHERBOARD) { + printk(KERN_CRIT "NMI: caused by motherboard (%s)\n", + mca_dev->name); + } + + /* More info available in POS 6 and 7? */ + + if(check_flag) { + unsigned char pos6, pos7; + + pos6 = mca_device_read_pos(mca_dev, 6); + pos7 = mca_device_read_pos(mca_dev, 7); + + printk(KERN_CRIT "NMI: POS 6 = 0x%x, POS 7 = 0x%x\n", pos6, pos7); + } + +} /* mca_handle_nmi_slot */ + +/*--------------------------------------------------------------------*/ + +static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data) +{ + struct mca_device *mca_dev = to_mca_device(dev); + unsigned char pos5; + + pos5 = mca_device_read_pos(mca_dev, 5); + + if(!(pos5 & 0x80)) { + /* Bit 7 of POS 5 is reset when this adapter has a hardware + * error. Bit 7 it reset if there's error information + * available in POS 6 and 7. + */ + mca_handle_nmi_device(mca_dev, !(pos5 & 0x40)); + return 1; + } + return 0; +} + +void __kprobes mca_handle_nmi(void) +{ + /* First try - scan the various adapters and see if a specific + * adapter was responsible for the error. + */ + bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback); + + mca_nmi_hook(); +} /* mca_handle_nmi */ diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c new file mode 100644 index 000000000000..09cf78110358 --- /dev/null +++ b/arch/x86/kernel/microcode.c @@ -0,0 +1,850 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian + * 2006 Shaohua Li + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.10 of Volume III, Intel Pentium 4 Manual, + * Order Number 245472 or free download from: + * + * http://developer.intel.com/design/pentium4/manuals/245472.htm + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1.0 16 Feb 2000, Tigran Aivazian + * Initial release. + * 1.01 18 Feb 2000, Tigran Aivazian + * Added read() support + cleanups. + * 1.02 21 Feb 2000, Tigran Aivazian + * Added 'device trimming' support. open(O_WRONLY) zeroes + * and frees the saved copy of applied microcode. + * 1.03 29 Feb 2000, Tigran Aivazian + * Made to use devfs (/dev/cpu/microcode) + cleanups. + * 1.04 06 Jun 2000, Simon Trimmer + * Added misc device support (now uses both devfs and misc). + * Added MICROCODE_IOCFREE ioctl to clear memory. + * 1.05 09 Jun 2000, Simon Trimmer + * Messages for error cases (non Intel & no suitable microcode). + * 1.06 03 Aug 2000, Tigran Aivazian + * Removed ->release(). Removed exclusive open and status bitmap. + * Added microcode_rwsem to serialize read()/write()/ioctl(). + * Removed global kernel lock usage. + * 1.07 07 Sep 2000, Tigran Aivazian + * Write 0 to 0x8B msr and then cpuid before reading revision, + * so that it works even if there were no update done by the + * BIOS. Otherwise, reading from 0x8B gives junk (which happened + * to be 0 on my machine which is why it worked even when I + * disabled update by the BIOS) + * Thanks to Eric W. Biederman for the fix. + * 1.08 11 Dec 2000, Richard Schaal and + * Tigran Aivazian + * Intel Pentium 4 processor support and bugfixes. + * 1.09 30 Oct 2001, Tigran Aivazian + * Bugfix for HT (Hyper-Threading) enabled processors + * whereby processor resources are shared by all logical processors + * in a single CPU package. + * 1.10 28 Feb 2002 Asit K Mallick and + * Tigran Aivazian , + * Serialize updates as required on HT processors due to speculative + * nature of implementation. + * 1.11 22 Mar 2002 Tigran Aivazian + * Fix the panic when writing zero-length microcode chunk. + * 1.12 29 Sep 2003 Nitin Kamble , + * Jun Nakajima + * Support for the microcode updates in the new format. + * 1.13 10 Oct 2003 Tigran Aivazian + * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + * because we no longer hold a copy of applied microcode + * in kernel memory. + * 1.14 25 Jun 2004 Tigran Aivazian + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. + */ + +//#define DEBUG /* pr_debug */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian "); +MODULE_LICENSE("GPL"); + +#define MICROCODE_VERSION "1.14a" + +#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ +#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ +#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */ +#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */ +#define DWSIZE (sizeof (u32)) +#define get_totalsize(mc) \ + (((microcode_t *)mc)->hdr.totalsize ? \ + ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE) +#define get_datasize(mc) \ + (((microcode_t *)mc)->hdr.datasize ? \ + ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) + +#define sigmatch(s1, s2, p1, p2) \ + (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) + +#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) + +/* serialize access to the physical write to MSR 0x79 */ +static DEFINE_SPINLOCK(microcode_update_lock); + +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +static DEFINE_MUTEX(microcode_mutex); + +static struct ucode_cpu_info { + int valid; + unsigned int sig; + unsigned int pf; + unsigned int rev; + microcode_t *mc; +} ucode_cpu_info[NR_CPUS]; + +static void collect_cpu_info(int cpu_num) +{ + struct cpuinfo_x86 *c = cpu_data + cpu_num; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + unsigned int val[2]; + + /* We should bind the task to the CPU */ + BUG_ON(raw_smp_processor_id() != cpu_num); + uci->pf = uci->rev = 0; + uci->mc = NULL; + uci->valid = 1; + + if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || + cpu_has(c, X86_FEATURE_IA64)) { + printk(KERN_ERR "microcode: CPU%d not a capable Intel " + "processor\n", cpu_num); + uci->valid = 0; + return; + } + + uci->sig = cpuid_eax(0x00000001); + + if ((c->x86_model >= 5) || (c->x86 > 6)) { + /* get processor flags from MSR 0x17 */ + rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + uci->pf = 1 << ((val[1] >> 18) & 7); + } + + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + /* see notes above for revision 1.07. Apparent chip bug */ + sync_core(); + /* get the current revision from MSR 0x8B */ + rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev); + pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", + uci->sig, uci->pf, uci->rev); +} + +static inline int microcode_update_match(int cpu_num, + microcode_header_t *mc_header, int sig, int pf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + + if (!sigmatch(sig, uci->sig, pf, uci->pf) + || mc_header->rev <= uci->rev) + return 0; + return 1; +} + +static int microcode_sanity_check(void *mc) +{ + microcode_header_t *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + struct extended_signature *ext_sig; + unsigned long total_size, data_size, ext_table_size; + int sum, orig_sum, ext_sigcount = 0, i; + + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); + if (data_size + MC_HEADER_SIZE > total_size) { + printk(KERN_ERR "microcode: error! " + "Bad data size in microcode data file\n"); + return -EINVAL; + } + + if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { + printk(KERN_ERR "microcode: error! " + "Unknown microcode update format\n"); + return -EINVAL; + } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + if ((ext_table_size < EXT_HEADER_SIZE) + || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + printk(KERN_ERR "microcode: error! " + "Small exttable size in microcode data file\n"); + return -EINVAL; + } + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + printk(KERN_ERR "microcode: error! " + "Bad exttable size in microcode data file\n"); + return -EFAULT; + } + ext_sigcount = ext_header->count; + } + + /* check extended table checksum */ + if (ext_table_size) { + int ext_table_sum = 0; + int *ext_tablep = (int *)ext_header; + + i = ext_table_size / DWSIZE; + while (i--) + ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { + printk(KERN_WARNING "microcode: aborting, " + "bad extended signature table checksum\n"); + return -EINVAL; + } + } + + /* calculate the checksum */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / DWSIZE; + while (i--) + orig_sum += ((int *)mc)[i]; + if (orig_sum) { + printk(KERN_ERR "microcode: aborting, bad checksum\n"); + return -EINVAL; + } + if (!ext_table_size) + return 0; + /* check extended signature checksum */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (struct extended_signature *)((void *)ext_header + + EXT_HEADER_SIZE + EXT_SIGNATURE_SIZE * i); + sum = orig_sum + - (mc_header->sig + mc_header->pf + mc_header->cksum) + + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + printk(KERN_ERR "microcode: aborting, bad checksum\n"); + return -EINVAL; + } + } + return 0; +} + +/* + * return 0 - no update found + * return 1 - found update + * return < 0 - error + */ +static int get_maching_microcode(void *mc, int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + microcode_header_t *mc_header = mc; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + void *new_mc; + + if (microcode_update_match(cpu, mc_header, + mc_header->sig, mc_header->pf)) + goto find; + + if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) + return 0; + + ext_header = (struct extended_sigtable *)(mc + + get_datasize(mc_header) + MC_HEADER_SIZE); + ext_sigcount = ext_header->count; + ext_sig = (struct extended_signature *)((void *)ext_header + + EXT_HEADER_SIZE); + for (i = 0; i < ext_sigcount; i++) { + if (microcode_update_match(cpu, mc_header, + ext_sig->sig, ext_sig->pf)) + goto find; + ext_sig++; + } + return 0; +find: + pr_debug("microcode: CPU %d found a matching microcode update with" + " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev); + new_mc = vmalloc(total_size); + if (!new_mc) { + printk(KERN_ERR "microcode: error! Can not allocate memory\n"); + return -ENOMEM; + } + + /* free previous update file */ + vfree(uci->mc); + + memcpy(new_mc, mc, total_size); + uci->mc = new_mc; + return 1; +} + +static void apply_microcode(int cpu) +{ + unsigned long flags; + unsigned int val[2]; + int cpu_num = raw_smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + + /* We should bind the task to the CPU */ + BUG_ON(cpu_num != cpu); + + if (uci->mc == NULL) + return; + + /* serialize access to the physical write to MSR 0x79 */ + spin_lock_irqsave(µcode_update_lock, flags); + + /* write microcode via MSR 0x79 */ + wrmsr(MSR_IA32_UCODE_WRITE, + (unsigned long) uci->mc->bits, + (unsigned long) uci->mc->bits >> 16 >> 16); + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* see notes above for revision 1.07. Apparent chip bug */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + + spin_unlock_irqrestore(µcode_update_lock, flags); + if (val[1] != uci->mc->hdr.rev) { + printk(KERN_ERR "microcode: CPU%d updated from revision " + "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]); + return; + } + pr_debug("microcode: CPU%d updated from revision " + "0x%x to 0x%x, date = %08x \n", + cpu_num, uci->rev, val[1], uci->mc->hdr.date); + uci->rev = val[1]; +} + +#ifdef CONFIG_MICROCODE_OLD_INTERFACE +static void __user *user_buffer; /* user area microcode data buffer */ +static unsigned int user_buffer_size; /* it's size */ + +static long get_next_ucode(void **mc, long offset) +{ + microcode_header_t mc_header; + unsigned long total_size; + + /* No more data */ + if (offset >= user_buffer_size) + return 0; + if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) { + printk(KERN_ERR "microcode: error! Can not read user data\n"); + return -EFAULT; + } + total_size = get_totalsize(&mc_header); + if (offset + total_size > user_buffer_size) { + printk(KERN_ERR "microcode: error! Bad total size in microcode " + "data file\n"); + return -EINVAL; + } + *mc = vmalloc(total_size); + if (!*mc) + return -ENOMEM; + if (copy_from_user(*mc, user_buffer + offset, total_size)) { + printk(KERN_ERR "microcode: error! Can not read user data\n"); + vfree(*mc); + return -EFAULT; + } + return offset + total_size; +} + +static int do_microcode_update (void) +{ + long cursor = 0; + int error = 0; + void *new_mc = NULL; + int cpu; + cpumask_t old; + + old = current->cpus_allowed; + + while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) { + error = microcode_sanity_check(new_mc); + if (error) + goto out; + /* + * It's possible the data file has multiple matching ucode, + * lets keep searching till the latest version + */ + for_each_online_cpu(cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!uci->valid) + continue; + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + error = get_maching_microcode(new_mc, cpu); + if (error < 0) + goto out; + if (error == 1) + apply_microcode(cpu); + } + vfree(new_mc); + } +out: + if (cursor > 0) + vfree(new_mc); + if (cursor < 0) + error = cursor; + set_cpus_allowed(current, old); + return error; +} + +static int microcode_open (struct inode *unused1, struct file *unused2) +{ + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; +} + +static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) +{ + ssize_t ret; + + if ((len >> PAGE_SHIFT) > num_physpages) { + printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); + return -EINVAL; + } + + lock_cpu_hotplug(); + mutex_lock(µcode_mutex); + + user_buffer = (void __user *) buf; + user_buffer_size = (int) len; + + ret = do_microcode_update(); + if (!ret) + ret = (ssize_t)len; + + mutex_unlock(µcode_mutex); + unlock_cpu_hotplug(); + + return ret; +} + +static const struct file_operations microcode_fops = { + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, +}; + +static struct miscdevice microcode_dev = { + .minor = MICROCODE_MINOR, + .name = "microcode", + .fops = µcode_fops, +}; + +static int __init microcode_dev_init (void) +{ + int error; + + error = misc_register(µcode_dev); + if (error) { + printk(KERN_ERR + "microcode: can't misc_register on minor=%d\n", + MICROCODE_MINOR); + return error; + } + + return 0; +} + +static void microcode_dev_exit (void) +{ + misc_deregister(µcode_dev); +} + +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +#else +#define microcode_dev_init() 0 +#define microcode_dev_exit() do { } while(0) +#endif + +static long get_next_ucode_from_buffer(void **mc, void *buf, + unsigned long size, long offset) +{ + microcode_header_t *mc_header; + unsigned long total_size; + + /* No more data */ + if (offset >= size) + return 0; + mc_header = (microcode_header_t *)(buf + offset); + total_size = get_totalsize(mc_header); + + if (offset + total_size > size) { + printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); + return -EINVAL; + } + + *mc = vmalloc(total_size); + if (!*mc) { + printk(KERN_ERR "microcode: error! Can not allocate memory\n"); + return -ENOMEM; + } + memcpy(*mc, buf + offset, total_size); + return offset + total_size; +} + +/* fake device for request_firmware */ +static struct platform_device *microcode_pdev; + +static int cpu_request_microcode(int cpu) +{ + char name[30]; + struct cpuinfo_x86 *c = cpu_data + cpu; + const struct firmware *firmware; + void *buf; + unsigned long size; + long offset = 0; + int error; + void *mc; + + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + sprintf(name,"intel-ucode/%02x-%02x-%02x", + c->x86, c->x86_model, c->x86_mask); + error = request_firmware(&firmware, name, µcode_pdev->dev); + if (error) { + pr_debug("ucode data file %s load failed\n", name); + return error; + } + buf = (void *)firmware->data; + size = firmware->size; + while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) + > 0) { + error = microcode_sanity_check(mc); + if (error) + break; + error = get_maching_microcode(mc, cpu); + if (error < 0) + break; + /* + * It's possible the data file has multiple matching ucode, + * lets keep searching till the latest version + */ + if (error == 1) { + apply_microcode(cpu); + error = 0; + } + vfree(mc); + } + if (offset > 0) + vfree(mc); + if (offset < 0) + error = offset; + release_firmware(firmware); + + return error; +} + +static int apply_microcode_check_cpu(int cpu) +{ + struct cpuinfo_x86 *c = cpu_data + cpu; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + cpumask_t old; + unsigned int val[2]; + int err = 0; + + /* Check if the microcode is available */ + if (!uci->mc) + return 0; + + old = current->cpus_allowed; + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + + /* Check if the microcode we have in memory matches the CPU */ + if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || + cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001)) + err = -EINVAL; + + if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) { + /* get processor flags from MSR 0x17 */ + rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + if (uci->pf != (1 << ((val[1] >> 18) & 7))) + err = -EINVAL; + } + + if (!err) { + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + /* see notes above for revision 1.07. Apparent chip bug */ + sync_core(); + /* get the current revision from MSR 0x8B */ + rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + if (uci->rev != val[1]) + err = -EINVAL; + } + + if (!err) + apply_microcode(cpu); + else + printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:" + " sig=0x%x, pf=0x%x, rev=0x%x\n", + cpu, uci->sig, uci->pf, uci->rev); + + set_cpus_allowed(current, old); + return err; +} + +static void microcode_init_cpu(int cpu, int resume) +{ + cpumask_t old; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + old = current->cpus_allowed; + + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + mutex_lock(µcode_mutex); + collect_cpu_info(cpu); + if (uci->valid && system_state == SYSTEM_RUNNING && !resume) + cpu_request_microcode(cpu); + mutex_unlock(µcode_mutex); + set_cpus_allowed(current, old); +} + +static void microcode_fini_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + mutex_lock(µcode_mutex); + uci->valid = 0; + vfree(uci->mc); + uci->mc = NULL; + mutex_unlock(µcode_mutex); +} + +static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + char *end; + unsigned long val = simple_strtoul(buf, &end, 0); + int err = 0; + int cpu = dev->id; + + if (end == buf) + return -EINVAL; + if (val == 1) { + cpumask_t old; + + old = current->cpus_allowed; + + lock_cpu_hotplug(); + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + + mutex_lock(µcode_mutex); + if (uci->valid) + err = cpu_request_microcode(cpu); + mutex_unlock(µcode_mutex); + unlock_cpu_hotplug(); + set_cpus_allowed(current, old); + } + if (err) + return err; + return sz; +} + +static ssize_t version_show(struct sys_device *dev, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->rev); +} + +static ssize_t pf_show(struct sys_device *dev, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->pf); +} + +static SYSDEV_ATTR(reload, 0200, NULL, reload_store); +static SYSDEV_ATTR(version, 0400, version_show, NULL); +static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); + +static struct attribute *mc_default_attrs[] = { + &attr_reload.attr, + &attr_version.attr, + &attr_processor_flags.attr, + NULL +}; + +static struct attribute_group mc_attr_group = { + .attrs = mc_default_attrs, + .name = "microcode", +}; + +static int __mc_sysdev_add(struct sys_device *sys_dev, int resume) +{ + int err, cpu = sys_dev->id; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("Microcode:CPU %d added\n", cpu); + memset(uci, 0, sizeof(*uci)); + + err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); + if (err) + return err; + + microcode_init_cpu(cpu, resume); + + return 0; +} + +static int mc_sysdev_add(struct sys_device *sys_dev) +{ + return __mc_sysdev_add(sys_dev, 0); +} + +static int mc_sysdev_remove(struct sys_device *sys_dev) +{ + int cpu = sys_dev->id; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("Microcode:CPU %d removed\n", cpu); + microcode_fini_cpu(cpu); + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + return 0; +} + +static int mc_sysdev_resume(struct sys_device *dev) +{ + int cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + pr_debug("Microcode:CPU %d resumed\n", cpu); + /* only CPU 0 will apply ucode here */ + apply_microcode(0); + return 0; +} + +static struct sysdev_driver mc_sysdev_driver = { + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, + .resume = mc_sysdev_resume, +}; + +static __cpuinit int +mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct sys_device *sys_dev; + + sys_dev = get_cpu_sysdev(cpu); + switch (action) { + case CPU_UP_CANCELED_FROZEN: + /* The CPU refused to come up during a system resume */ + microcode_fini_cpu(cpu); + break; + case CPU_ONLINE: + case CPU_DOWN_FAILED: + mc_sysdev_add(sys_dev); + break; + case CPU_ONLINE_FROZEN: + /* System-wide resume is in progress, try to apply microcode */ + if (apply_microcode_check_cpu(cpu)) { + /* The application of microcode failed */ + microcode_fini_cpu(cpu); + __mc_sysdev_add(sys_dev, 1); + break; + } + case CPU_DOWN_FAILED_FROZEN: + if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) + printk(KERN_ERR "Microcode: Failed to create the sysfs " + "group for CPU%d\n", cpu); + break; + case CPU_DOWN_PREPARE: + mc_sysdev_remove(sys_dev); + break; + case CPU_DOWN_PREPARE_FROZEN: + /* Suspend is in progress, only remove the interface */ + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata mc_cpu_notifier = { + .notifier_call = mc_cpu_callback, +}; + +static int __init microcode_init (void) +{ + int error; + + error = microcode_dev_init(); + if (error) + return error; + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) { + microcode_dev_exit(); + return PTR_ERR(microcode_pdev); + } + + lock_cpu_hotplug(); + error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + unlock_cpu_hotplug(); + if (error) { + microcode_dev_exit(); + platform_device_unregister(microcode_pdev); + return error; + } + + register_hotcpu_notifier(&mc_cpu_notifier); + + printk(KERN_INFO + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); + return 0; +} + +static void __exit microcode_exit (void) +{ + microcode_dev_exit(); + + unregister_hotcpu_notifier(&mc_cpu_notifier); + + lock_cpu_hotplug(); + sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + unlock_cpu_hotplug(); + + platform_device_unregister(microcode_pdev); +} + +module_init(microcode_init) +module_exit(microcode_exit) diff --git a/arch/x86/kernel/module_32.c b/arch/x86/kernel/module_32.c new file mode 100644 index 000000000000..3db0a5442eb1 --- /dev/null +++ b/arch/x86/kernel/module_32.c @@ -0,0 +1,152 @@ +/* Kernel module help for i386. + Copyright (C) 2001 Rusty Russell. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt...) +#endif + +void *module_alloc(unsigned long size) +{ + if (size == 0) + return NULL; + return vmalloc_exec(size); +} + + +/* Free memory returned from module_alloc */ +void module_free(struct module *mod, void *module_region) +{ + vfree(module_region); + /* FIXME: If module_region == mod->init_region, trim exception + table entries. */ +} + +/* We don't need anything special. */ +int module_frob_arch_sections(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + char *secstrings, + struct module *mod) +{ + return 0; +} + +int apply_relocate(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + unsigned int i; + Elf32_Rel *rel = (void *)sechdrs[relsec].sh_addr; + Elf32_Sym *sym; + uint32_t *location; + + DEBUGP("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + /* This is where to make the change */ + location = (void *)sechdrs[sechdrs[relsec].sh_info].sh_addr + + rel[i].r_offset; + /* This is the symbol it is referring to. Note that all + undefined symbols have been resolved. */ + sym = (Elf32_Sym *)sechdrs[symindex].sh_addr + + ELF32_R_SYM(rel[i].r_info); + + switch (ELF32_R_TYPE(rel[i].r_info)) { + case R_386_32: + /* We add the value into the location given */ + *location += sym->st_value; + break; + case R_386_PC32: + /* Add the value, subtract its postition */ + *location += sym->st_value - (uint32_t)location; + break; + default: + printk(KERN_ERR "module %s: Unknown relocation: %u\n", + me->name, ELF32_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; +} + +int apply_relocate_add(Elf32_Shdr *sechdrs, + const char *strtab, + unsigned int symindex, + unsigned int relsec, + struct module *me) +{ + printk(KERN_ERR "module %s: ADD RELOCATION unsupported\n", + me->name); + return -ENOEXEC; +} + +int module_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + *para = NULL; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".text", secstrings + s->sh_name)) + text = s; + if (!strcmp(".altinstructions", secstrings + s->sh_name)) + alt = s; + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + locks= s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; + } + + if (alt) { + /* patch .altinstructions */ + void *aseg = (void *)alt->sh_addr; + apply_alternatives(aseg, aseg + alt->sh_size); + } + if (locks && text) { + void *lseg = (void *)locks->sh_addr; + void *tseg = (void *)text->sh_addr; + alternatives_smp_module_add(me, me->name, + lseg, lseg + locks->sh_size, + tseg, tseg + text->sh_size); + } + + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } + + return module_bug_finalize(hdr, sechdrs, me); +} + +void module_arch_cleanup(struct module *mod) +{ + alternatives_smp_module_del(mod); + module_bug_cleanup(mod); +} diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse_32.c new file mode 100644 index 000000000000..13abb4ebfb79 --- /dev/null +++ b/arch/x86/kernel/mpparse_32.c @@ -0,0 +1,1132 @@ +/* + * Intel Multiprocessor Specification 1.1 and 1.4 + * compliant MP-table parsing routines. + * + * (c) 1995 Alan Cox, Building #3 + * (c) 1998, 1999, 2000 Ingo Molnar + * + * Fixes + * Erich Boleyn : MP v1.4 and additional changes. + * Alan Cox : Added EBDA scanning + * Ingo Molnar : various cleanups and rewrites + * Maciej W. Rozycki: Bits for default MP configurations + * Paul Diefenbaugh: Added full ACPI support + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* Have we found an MP table */ +int smp_found_config; +unsigned int __cpuinitdata maxcpus = NR_CPUS; + +/* + * Various Linux-internal data structures created from the + * MP-table. + */ +int apic_version [MAX_APICS]; +int mp_bus_id_to_type [MAX_MP_BUSSES]; +int mp_bus_id_to_node [MAX_MP_BUSSES]; +int mp_bus_id_to_local [MAX_MP_BUSSES]; +int quad_local_to_mp_bus_id [NR_CPUS/4][4]; +int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 }; +static int mp_current_pci_id; + +/* I/O APIC entries */ +struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS]; + +/* # of MP IRQ source entries */ +struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* MP IRQ source entries */ +int mp_irq_entries; + +int nr_ioapics; + +int pic_mode; +unsigned long mp_lapic_addr; + +unsigned int def_to_bigsmp = 0; + +/* Processor that is doing the boot up */ +unsigned int boot_cpu_physical_apicid = -1U; +/* Internal processor count */ +unsigned int __cpuinitdata num_processors; + +/* Bitmask of physically existing CPUs */ +physid_mask_t phys_cpu_present_map; + +u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + +/* + * Intel MP BIOS table parsing routines: + */ + + +/* + * Checksum an MP configuration block. + */ + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +/* + * Have to match translation table entries to main table entries by counter + * hence the mpc_record variable .... can't see a less disgusting way of + * doing this .... + */ + +static int mpc_record; +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata; + +static void __cpuinit MP_processor_info (struct mpc_config_processor *m) +{ + int ver, apicid; + physid_mask_t phys_cpu; + + if (!(m->mpc_cpuflag & CPU_ENABLED)) + return; + + apicid = mpc_apic_id(m, translation_table[mpc_record]); + + if (m->mpc_featureflag&(1<<0)) + Dprintk(" Floating point unit present.\n"); + if (m->mpc_featureflag&(1<<7)) + Dprintk(" Machine Exception supported.\n"); + if (m->mpc_featureflag&(1<<8)) + Dprintk(" 64 bit compare & exchange supported.\n"); + if (m->mpc_featureflag&(1<<9)) + Dprintk(" Internal APIC present.\n"); + if (m->mpc_featureflag&(1<<11)) + Dprintk(" SEP present.\n"); + if (m->mpc_featureflag&(1<<12)) + Dprintk(" MTRR present.\n"); + if (m->mpc_featureflag&(1<<13)) + Dprintk(" PGE present.\n"); + if (m->mpc_featureflag&(1<<14)) + Dprintk(" MCA present.\n"); + if (m->mpc_featureflag&(1<<15)) + Dprintk(" CMOV present.\n"); + if (m->mpc_featureflag&(1<<16)) + Dprintk(" PAT present.\n"); + if (m->mpc_featureflag&(1<<17)) + Dprintk(" PSE present.\n"); + if (m->mpc_featureflag&(1<<18)) + Dprintk(" PSN present.\n"); + if (m->mpc_featureflag&(1<<19)) + Dprintk(" Cache Line Flush Instruction present.\n"); + /* 20 Reserved */ + if (m->mpc_featureflag&(1<<21)) + Dprintk(" Debug Trace and EMON Store present.\n"); + if (m->mpc_featureflag&(1<<22)) + Dprintk(" ACPI Thermal Throttle Registers present.\n"); + if (m->mpc_featureflag&(1<<23)) + Dprintk(" MMX present.\n"); + if (m->mpc_featureflag&(1<<24)) + Dprintk(" FXSR present.\n"); + if (m->mpc_featureflag&(1<<25)) + Dprintk(" XMM present.\n"); + if (m->mpc_featureflag&(1<<26)) + Dprintk(" Willamette New Instructions present.\n"); + if (m->mpc_featureflag&(1<<27)) + Dprintk(" Self Snoop present.\n"); + if (m->mpc_featureflag&(1<<28)) + Dprintk(" HT present.\n"); + if (m->mpc_featureflag&(1<<29)) + Dprintk(" Thermal Monitor present.\n"); + /* 30, 31 Reserved */ + + + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { + Dprintk(" Bootup CPU\n"); + boot_cpu_physical_apicid = m->mpc_apicid; + } + + ver = m->mpc_apicver; + + /* + * Validate version + */ + if (ver == 0x0) { + printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " + "fixing up to 0x10. (tell your hw vendor)\n", + m->mpc_apicid); + ver = 0x10; + } + apic_version[m->mpc_apicid] = ver; + + phys_cpu = apicid_to_cpu_present(apicid); + physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu); + + if (num_processors >= NR_CPUS) { + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." + " Processor ignored.\n", NR_CPUS); + return; + } + + if (num_processors >= maxcpus) { + printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." + " Processor ignored.\n", maxcpus); + return; + } + + cpu_set(num_processors, cpu_possible_map); + num_processors++; + + /* + * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y + * but we need to work other dependencies like SMP_SUSPEND etc + * before this can be done without some confusion. + * if (CPU_HOTPLUG_ENABLED || num_processors > 8) + * - Ashok Raj + */ + if (num_processors > 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(ver)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } + bios_cpu_apicid[num_processors - 1] = m->mpc_apicid; +} + +static void __init MP_bus_info (struct mpc_config_bus *m) +{ + char str[7]; + + memcpy(str, m->mpc_bustype, 6); + str[6] = 0; + + mpc_oem_bus_info(m, str, translation_table[mpc_record]); + +#if MAX_MP_BUSSES < 256 + if (m->mpc_busid >= MAX_MP_BUSSES) { + printk(KERN_WARNING "MP table busid value (%d) for bustype %s " + " is too large, max. supported is %d\n", + m->mpc_busid, str, MAX_MP_BUSSES - 1); + return; + } +#endif + + if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; + } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA; + } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) { + mpc_oem_pci_bus(m, translation_table[mpc_record]); + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; + mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id; + mp_current_pci_id++; + } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) { + mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA; + } else { + printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); + } +} + +static void __init MP_ioapic_info (struct mpc_config_ioapic *m) +{ + if (!(m->mpc_flags & MPC_APIC_USABLE)) + return; + + printk(KERN_INFO "I/O APIC #%d Version %d at 0x%lX.\n", + m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr); + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n", + MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!.\n"); + } + if (!m->mpc_apicaddr) { + printk(KERN_ERR "WARNING: bogus zero I/O APIC address" + " found in MP table, skipping!\n"); + return; + } + mp_ioapics[nr_ioapics] = *m; + nr_ioapics++; +} + +static void __init MP_intsrc_info (struct mpc_config_intsrc *m) +{ + mp_irqs [mp_irq_entries] = *m; + Dprintk("Int: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC INT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, + m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq); + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!!\n"); +} + +static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m) +{ + Dprintk("Lint: type %d, pol %d, trig %d, bus %d," + " IRQ %02x, APIC ID %x, APIC LINT %02x\n", + m->mpc_irqtype, m->mpc_irqflag & 3, + (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid, + m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); +} + +#ifdef CONFIG_X86_NUMAQ +static void __init MP_translation_info (struct mpc_config_translation *m) +{ + printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local); + + if (mpc_record >= MAX_MPC_ENTRY) + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); + else + translation_table[mpc_record] = m; /* stash this for later */ + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) + node_set_online(m->trans_quad); +} + +/* + * Read/parse the MPC oem tables + */ + +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \ + unsigned short oemsize) +{ + int count = sizeof (*oemtable); /* the header size */ + unsigned char *oemptr = ((unsigned char *)oemtable)+count; + + mpc_record = 0; + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); + if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4)) + { + printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", + oemtable->oem_signature[0], + oemtable->oem_signature[1], + oemtable->oem_signature[2], + oemtable->oem_signature[3]); + return; + } + if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length)) + { + printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); + return; + } + while (count < oemtable->oem_length) { + switch (*oemptr) { + case MP_TRANSLATION: + { + struct mpc_config_translation *m= + (struct mpc_config_translation *)oemptr; + MP_translation_info(m); + oemptr += sizeof(*m); + count += sizeof(*m); + ++mpc_record; + break; + } + default: + { + printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr); + return; + } + } + } +} + +static inline void mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid) +{ + if (strncmp(oem, "IBM NUMA", 8)) + printk("Warning! May not be a NUMA-Q system!\n"); + if (mpc->mpc_oemptr) + smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr, + mpc->mpc_oemsize); +} +#endif /* CONFIG_X86_NUMAQ */ + +/* + * Read/parse the MPC + */ + +static int __init smp_read_mpc(struct mp_config_table *mpc) +{ + char str[16]; + char oem[10]; + int count=sizeof(*mpc); + unsigned char *mpt=((unsigned char *)mpc)+count; + + if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) { + printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n", + *(u32 *)mpc->mpc_signature); + return 0; + } + if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) { + printk(KERN_ERR "SMP mptable: checksum error!\n"); + return 0; + } + if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) { + printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n", + mpc->mpc_spec); + return 0; + } + if (!mpc->mpc_lapic) { + printk(KERN_ERR "SMP mptable: null local APIC address!\n"); + return 0; + } + memcpy(oem,mpc->mpc_oem,8); + oem[8]=0; + printk(KERN_INFO "OEM ID: %s ",oem); + + memcpy(str,mpc->mpc_productid,12); + str[12]=0; + printk("Product ID: %s ",str); + + mps_oem_check(mpc, oem, str); + + printk("APIC at: 0x%lX\n",mpc->mpc_lapic); + + /* + * Save the local APIC address (it might be non-default) -- but only + * if we're not using ACPI. + */ + if (!acpi_lapic) + mp_lapic_addr = mpc->mpc_lapic; + + /* + * Now process the configuration blocks. + */ + mpc_record = 0; + while (count < mpc->mpc_length) { + switch(*mpt) { + case MP_PROCESSOR: + { + struct mpc_config_processor *m= + (struct mpc_config_processor *)mpt; + /* ACPI may have already provided this data */ + if (!acpi_lapic) + MP_processor_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_BUS: + { + struct mpc_config_bus *m= + (struct mpc_config_bus *)mpt; + MP_bus_info(m); + mpt += sizeof(*m); + count += sizeof(*m); + break; + } + case MP_IOAPIC: + { + struct mpc_config_ioapic *m= + (struct mpc_config_ioapic *)mpt; + MP_ioapic_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_INTSRC: + { + struct mpc_config_intsrc *m= + (struct mpc_config_intsrc *)mpt; + + MP_intsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + case MP_LINTSRC: + { + struct mpc_config_lintsrc *m= + (struct mpc_config_lintsrc *)mpt; + MP_lintsrc_info(m); + mpt+=sizeof(*m); + count+=sizeof(*m); + break; + } + default: + { + count = mpc->mpc_length; + break; + } + } + ++mpc_record; + } + setup_apic_routing(); + if (!num_processors) + printk(KERN_ERR "SMP mptable: no processors registered!\n"); + return num_processors; +} + +static int __init ELCR_trigger(unsigned int irq) +{ + unsigned int port; + + port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; +} + +static void __init construct_default_ioirq_mptable(int mpc_default_type) +{ + struct mpc_config_intsrc intsrc; + int i; + int ELCR_fallback = 0; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* conforming */ + intsrc.mpc_srcbus = 0; + intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid; + + intsrc.mpc_irqtype = mp_INT; + + /* + * If true, we have an ISA/PCI system with no IRQ entries + * in the MP table. To prevent the PCI interrupts from being set up + * incorrectly, we try to use the ELCR. The sanity check to see if + * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can + * never be level sensitive, so we simply see if the ELCR agrees. + * If it does, we assume it's valid. + */ + if (mpc_default_type == 5) { + printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n"); + + if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13)) + printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n"); + else { + printk(KERN_INFO "Using ELCR to identify PCI interrupts\n"); + ELCR_fallback = 1; + } + } + + for (i = 0; i < 16; i++) { + switch (mpc_default_type) { + case 2: + if (i == 0 || i == 13) + continue; /* IRQ0 & IRQ13 not connected */ + /* fall through */ + default: + if (i == 2) + continue; /* IRQ2 is never connected */ + } + + if (ELCR_fallback) { + /* + * If the ELCR indicates a level-sensitive interrupt, we + * copy that information over to the MP table in the + * irqflag field (level sensitive, active high polarity). + */ + if (ELCR_trigger(i)) + intsrc.mpc_irqflag = 13; + else + intsrc.mpc_irqflag = 0; + } + + intsrc.mpc_srcbusirq = i; + intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ + MP_intsrc_info(&intsrc); + } + + intsrc.mpc_irqtype = mp_ExtINT; + intsrc.mpc_srcbusirq = 0; + intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */ + MP_intsrc_info(&intsrc); +} + +static inline void __init construct_default_ISA_mptable(int mpc_default_type) +{ + struct mpc_config_processor processor; + struct mpc_config_bus bus; + struct mpc_config_ioapic ioapic; + struct mpc_config_lintsrc lintsrc; + int linttypes[2] = { mp_ExtINT, mp_NMI }; + int i; + + /* + * local APIC has default address + */ + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* + * 2 CPUs, numbered 0 & 1. + */ + processor.mpc_type = MP_PROCESSOR; + /* Either an integrated APIC or a discrete 82489DX. */ + processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + processor.mpc_cpuflag = CPU_ENABLED; + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | + boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + for (i = 0; i < 2; i++) { + processor.mpc_apicid = i; + MP_processor_info(&processor); + } + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + switch (mpc_default_type) { + default: + printk("???\n"); + printk(KERN_ERR "Unknown standard configuration %d\n", + mpc_default_type); + /* fall through */ + case 1: + case 5: + memcpy(bus.mpc_bustype, "ISA ", 6); + break; + case 2: + case 6: + case 3: + memcpy(bus.mpc_bustype, "EISA ", 6); + break; + case 4: + case 7: + memcpy(bus.mpc_bustype, "MCA ", 6); + } + MP_bus_info(&bus); + if (mpc_default_type > 4) { + bus.mpc_busid = 1; + memcpy(bus.mpc_bustype, "PCI ", 6); + MP_bus_info(&bus); + } + + ioapic.mpc_type = MP_IOAPIC; + ioapic.mpc_apicid = 2; + ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01; + ioapic.mpc_flags = MPC_APIC_USABLE; + ioapic.mpc_apicaddr = 0xFEC00000; + MP_ioapic_info(&ioapic); + + /* + * We set up most of the low 16 IO-APIC pins according to MPS rules. + */ + construct_default_ioirq_mptable(mpc_default_type); + + lintsrc.mpc_type = MP_LINTSRC; + lintsrc.mpc_irqflag = 0; /* conforming */ + lintsrc.mpc_srcbusid = 0; + lintsrc.mpc_srcbusirq = 0; + lintsrc.mpc_destapic = MP_APIC_ALL; + for (i = 0; i < 2; i++) { + lintsrc.mpc_irqtype = linttypes[i]; + lintsrc.mpc_destapiclint = i; + MP_lintsrc_info(&lintsrc); + } +} + +static struct intel_mp_floating *mpf_found; + +/* + * Scan the memory blocks for an SMP configuration block. + */ +void __init get_smp_config (void) +{ + struct intel_mp_floating *mpf = mpf_found; + + /* + * ACPI supports both logical (e.g. Hyper-Threading) and physical + * processors, where MPS only supports physical. + */ + if (acpi_lapic && acpi_ioapic) { + printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n"); + return; + } + else if (acpi_lapic) + printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n"); + + printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification); + if (mpf->mpf_feature2 & (1<<7)) { + printk(KERN_INFO " IMCR and PIC compatibility mode.\n"); + pic_mode = 1; + } else { + printk(KERN_INFO " Virtual Wire compatibility mode.\n"); + pic_mode = 0; + } + + /* + * Now see if we need to read further. + */ + if (mpf->mpf_feature1 != 0) { + + printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1); + construct_default_ISA_mptable(mpf->mpf_feature1); + + } else if (mpf->mpf_physptr) { + + /* + * Read the physical hardware table. Anything here will + * override the defaults. + */ + if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) { + smp_found_config = 0; + printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); + printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); + return; + } + /* + * If there are no explicit MP IRQ entries, then we are + * broken. We set up most of the low 16 IO-APIC pins to + * ISA defaults and hope it will work. + */ + if (!mp_irq_entries) { + struct mpc_config_bus bus; + + printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n"); + + bus.mpc_type = MP_BUS; + bus.mpc_busid = 0; + memcpy(bus.mpc_bustype, "ISA ", 6); + MP_bus_info(&bus); + + construct_default_ioirq_mptable(0); + } + + } else + BUG(); + + printk(KERN_INFO "Processors: %d\n", num_processors); + /* + * Only use the first configuration found. + */ +} + +static int __init smp_scan_config (unsigned long base, unsigned long length) +{ + unsigned long *bp = phys_to_virt(base); + struct intel_mp_floating *mpf; + + Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length); + if (sizeof(*mpf) != 16) + printk("Error: MPF size\n"); + + while (length > 0) { + mpf = (struct intel_mp_floating *)bp; + if ((*bp == SMP_MAGIC_IDENT) && + (mpf->mpf_length == 1) && + !mpf_checksum((unsigned char *)bp, 16) && + ((mpf->mpf_specification == 1) + || (mpf->mpf_specification == 4)) ) { + + smp_found_config = 1; + printk(KERN_INFO "found SMP MP-table at %08lx\n", + virt_to_phys(mpf)); + reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE); + if (mpf->mpf_physptr) { + /* + * We cannot access to MPC table to compute + * table size yet, as only few megabytes from + * the bottom is mapped now. + * PC-9800's MPC table places on the very last + * of physical memory; so that simply reserving + * PAGE_SIZE from mpg->mpf_physptr yields BUG() + * in reserve_bootmem. + */ + unsigned long size = PAGE_SIZE; + unsigned long end = max_low_pfn * PAGE_SIZE; + if (mpf->mpf_physptr + size > end) + size = end - mpf->mpf_physptr; + reserve_bootmem(mpf->mpf_physptr, size); + } + + mpf_found = mpf; + return 1; + } + bp += 4; + length -= 16; + } + return 0; +} + +void __init find_smp_config (void) +{ + unsigned int address; + + /* + * FIXME: Linux assumes you have 640K of base ram.. + * this continues the error... + * + * 1) Scan the bottom 1K for a signature + * 2) Scan the top 1K of base RAM + * 3) Scan the 64K of bios + */ + if (smp_scan_config(0x0,0x400) || + smp_scan_config(639*0x400,0x400) || + smp_scan_config(0xF0000,0x10000)) + return; + /* + * If it is an SMP machine we should know now, unless the + * configuration is in an EISA/MCA bus machine with an + * extended bios data area. + * + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E, calculate and scan it here. + * + * NOTE! There are Linux loaders that will corrupt the EBDA + * area, and as such this kind of SMP config may be less + * trustworthy, simply because the SMP table may have been + * stomped on during early boot. These loaders are buggy and + * should be fixed. + * + * MP1.4 SPEC states to only scan first 1K of 4K EBDA. + */ + + address = get_bios_ebda(); + if (address) + smp_scan_config(address, 0x400); +} + +int es7000_plat; + +/* -------------------------------------------------------------------------- + ACPI-based MP Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI + +void __init mp_register_lapic_address(u64 address) +{ + mp_lapic_addr = (unsigned long) address; + + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); + + Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid); +} + +void __cpuinit mp_register_lapic (u8 id, u8 enabled) +{ + struct mpc_config_processor processor; + int boot_cpu = 0; + + if (MAX_APICS - id <= 0) { + printk(KERN_WARNING "Processor #%d invalid (max %d)\n", + id, MAX_APICS); + return; + } + + if (id == boot_cpu_physical_apicid) + boot_cpu = 1; + + processor.mpc_type = MP_PROCESSOR; + processor.mpc_apicid = id; + processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR)); + processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0); + processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0); + processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) | + (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask; + processor.mpc_featureflag = boot_cpu_data.x86_capability[0]; + processor.mpc_reserved[0] = 0; + processor.mpc_reserved[1] = 0; + + MP_processor_info(&processor); +} + +#ifdef CONFIG_X86_IO_APIC + +#define MP_ISA_BUS 0 +#define MP_MAX_IOAPIC_PIN 127 + +static struct mp_ioapic_routing { + int apic_id; + int gsi_base; + int gsi_end; + u32 pin_programmed[4]; +} mp_ioapic_routing[MAX_IO_APICS]; + +static int mp_find_ioapic (int gsi) +{ + int i = 0; + + /* Find the IOAPIC that manages this GSI. */ + for (i = 0; i < nr_ioapics; i++) { + if ((gsi >= mp_ioapic_routing[i].gsi_base) + && (gsi <= mp_ioapic_routing[i].gsi_end)) + return i; + } + + printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi); + + return -1; +} + +void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) +{ + int idx = 0; + int tmpid; + + if (nr_ioapics >= MAX_IO_APICS) { + printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " + "(found %d)\n", MAX_IO_APICS, nr_ioapics); + panic("Recompile kernel with bigger MAX_IO_APICS!\n"); + } + if (!address) { + printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" + " found in MADT table, skipping!\n"); + return; + } + + idx = nr_ioapics++; + + mp_ioapics[idx].mpc_type = MP_IOAPIC; + mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; + mp_ioapics[idx].mpc_apicaddr = address; + + set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + tmpid = io_apic_get_unique_id(idx, id); + else + tmpid = id; + if (tmpid == -1) { + nr_ioapics--; + return; + } + mp_ioapics[idx].mpc_apicid = tmpid; + mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx); + + /* + * Build basic GSI lookup table to facilitate gsi->io_apic lookups + * and to prevent reprogramming of IOAPIC pins (PCI GSIs). + */ + mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid; + mp_ioapic_routing[idx].gsi_base = gsi_base; + mp_ioapic_routing[idx].gsi_end = gsi_base + + io_apic_get_redir_entries(idx); + + printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%lx, " + "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid, + mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr, + mp_ioapic_routing[idx].gsi_base, + mp_ioapic_routing[idx].gsi_end); +} + +void __init +mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) +{ + struct mpc_config_intsrc intsrc; + int ioapic = -1; + int pin = -1; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + return; + pin = gsi - mp_ioapic_routing[ioapic].gsi_base; + + /* + * TBD: This check is for faulty timer entries, where the override + * erroneously sets the trigger to level, resulting in a HUGE + * increase of timer interrupts! + */ + if ((bus_irq == 0) && (trigger == 3)) + trigger = 1; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_irqflag = (trigger << 2) | polarity; + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_srcbusirq = bus_irq; /* IRQ */ + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ + intsrc.mpc_dstirq = pin; /* INTIN# */ + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", + intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); +} + +void __init mp_config_acpi_legacy_irqs (void) +{ + struct mpc_config_intsrc intsrc; + int i = 0; + int ioapic = -1; + + /* + * Fabricate the legacy ISA bus (bus #31). + */ + mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; + Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + + /* + * Older generations of ES7000 have no legacy identity mappings + */ + if (es7000_plat == 1) + return; + + /* + * Locate the IOAPIC that manages the ISA IRQs (0-15). + */ + ioapic = mp_find_ioapic(0); + if (ioapic < 0) + return; + + intsrc.mpc_type = MP_INTSRC; + intsrc.mpc_irqflag = 0; /* Conforming */ + intsrc.mpc_srcbus = MP_ISA_BUS; + intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; + + /* + * Use the default configuration for the IRQs 0-15. Unless + * overriden by (MADT) interrupt source override entries. + */ + for (i = 0; i < 16; i++) { + int idx; + + for (idx = 0; idx < mp_irq_entries; idx++) { + struct mpc_config_intsrc *irq = mp_irqs + idx; + + /* Do we already have a mapping for this ISA IRQ? */ + if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i) + break; + + /* Do we already have a mapping for this IOAPIC pin */ + if ((irq->mpc_dstapic == intsrc.mpc_dstapic) && + (irq->mpc_dstirq == i)) + break; + } + + if (idx != mp_irq_entries) { + printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i); + continue; /* IRQ already used */ + } + + intsrc.mpc_irqtype = mp_INT; + intsrc.mpc_srcbusirq = i; /* Identity mapped */ + intsrc.mpc_dstirq = i; + + Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " + "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, + (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, + intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, + intsrc.mpc_dstirq); + + mp_irqs[mp_irq_entries] = intsrc; + if (++mp_irq_entries == MAX_IRQ_SOURCES) + panic("Max # of irq sources exceeded!\n"); + } +} + +#define MAX_GSI_NUM 4096 + +int mp_register_gsi(u32 gsi, int triggering, int polarity) +{ + int ioapic = -1; + int ioapic_pin = 0; + int idx, bit = 0; + static int pci_irq = 16; + /* + * Mapping between Global System Interrups, which + * represent all possible interrupts, and IRQs + * assigned to actual devices. + */ + static int gsi_to_irq[MAX_GSI_NUM]; + + /* Don't set up the ACPI SCI because it's already set up */ + if (acpi_gbl_FADT.sci_interrupt == gsi) + return gsi; + + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi); + return gsi; + } + + ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base; + + if (ioapic_renumber_irq) + gsi = ioapic_renumber_irq(ioapic, gsi); + + /* + * Avoid pin reprogramming. PRTs typically include entries + * with redundant pin->gsi mappings (but unique PCI devices); + * we only program the IOAPIC on the first. + */ + bit = ioapic_pin % 32; + idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); + if (idx > 3) { + printk(KERN_ERR "Invalid reference to IOAPIC pin " + "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, + ioapic_pin); + return gsi; + } + if ((1< 15), but + * avoid a problem where the 8254 timer (IRQ0) is setup + * via an override (so it's not on pin 0 of the ioapic), + * and at the same time, the pin 0 interrupt is a PCI + * type. The gsi > 15 test could cause these two pins + * to be shared as IRQ0, and they are not shareable. + * So test for this condition, and if necessary, avoid + * the pin collision. + */ + if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0)) + gsi = pci_irq++; + /* + * Don't assign IRQ used by ACPI SCI + */ + if (gsi == acpi_gbl_FADT.sci_interrupt) + gsi = pci_irq++; + gsi_to_irq[irq] = gsi; + } else { + printk(KERN_ERR "GSI %u is too high\n", gsi); + return gsi; + } + } + + io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, + triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, + polarity == ACPI_ACTIVE_HIGH ? 0 : 1); + return gsi; +} + +#endif /* CONFIG_X86_IO_APIC */ +#endif /* CONFIG_ACPI */ diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c new file mode 100644 index 000000000000..0c1069b8d638 --- /dev/null +++ b/arch/x86/kernel/msr.c @@ -0,0 +1,224 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2000 H. Peter Anvin - All Rights Reserved + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, Inc., 675 Mass Ave, Cambridge MA 02139, + * USA; either version 2 of the License, or (at your option) any later + * version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * msr.c + * + * x86 MSR access device + * + * This device is accessed by lseek() to the appropriate register number + * and then read/write in chunks of 8 bytes. A larger size means multiple + * reads or writes of the same register. + * + * This driver uses /dev/cpu/%d/msr where %d is the minor number, and on + * an SMP box will direct the access to CPU %d. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static struct class *msr_class; + +static loff_t msr_seek(struct file *file, loff_t offset, int orig) +{ + loff_t ret = -EINVAL; + + lock_kernel(); + switch (orig) { + case 0: + file->f_pos = offset; + ret = file->f_pos; + break; + case 1: + file->f_pos += offset; + ret = file->f_pos; + } + unlock_kernel(); + return ret; +} + +static ssize_t msr_read(struct file *file, char __user * buf, + size_t count, loff_t * ppos) +{ + u32 __user *tmp = (u32 __user *) buf; + u32 data[2]; + u32 reg = *ppos; + int cpu = iminor(file->f_path.dentry->d_inode); + int err; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (; count; count -= 8) { + err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); + if (err) + return -EIO; + if (copy_to_user(tmp, &data, 8)) + return -EFAULT; + tmp += 2; + } + + return ((char __user *)tmp) - buf; +} + +static ssize_t msr_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + const u32 __user *tmp = (const u32 __user *)buf; + u32 data[2]; + u32 reg = *ppos; + int cpu = iminor(file->f_path.dentry->d_inode); + int err; + + if (count % 8) + return -EINVAL; /* Invalid chunk size */ + + for (; count; count -= 8) { + if (copy_from_user(&data, tmp, 8)) + return -EFAULT; + err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); + if (err) + return -EIO; + tmp += 2; + } + + return ((char __user *)tmp) - buf; +} + +static int msr_open(struct inode *inode, struct file *file) +{ + unsigned int cpu = iminor(file->f_path.dentry->d_inode); + struct cpuinfo_x86 *c = &(cpu_data)[cpu]; + + if (cpu >= NR_CPUS || !cpu_online(cpu)) + return -ENXIO; /* No such CPU */ + if (!cpu_has(c, X86_FEATURE_MSR)) + return -EIO; /* MSR not supported */ + + return 0; +} + +/* + * File operations we support + */ +static const struct file_operations msr_fops = { + .owner = THIS_MODULE, + .llseek = msr_seek, + .read = msr_read, + .write = msr_write, + .open = msr_open, +}; + +static int msr_device_create(int i) +{ + int err = 0; + struct device *dev; + + dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, i), "msr%d",i); + if (IS_ERR(dev)) + err = PTR_ERR(dev); + return err; +} + +static int msr_class_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + msr_device_create(cpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata msr_class_cpu_notifier = +{ + .notifier_call = msr_class_cpu_callback, +}; + +static int __init msr_init(void) +{ + int i, err = 0; + i = 0; + + if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { + printk(KERN_ERR "msr: unable to get major %d for msr\n", + MSR_MAJOR); + err = -EBUSY; + goto out; + } + msr_class = class_create(THIS_MODULE, "msr"); + if (IS_ERR(msr_class)) { + err = PTR_ERR(msr_class); + goto out_chrdev; + } + for_each_online_cpu(i) { + err = msr_device_create(i); + if (err != 0) + goto out_class; + } + register_hotcpu_notifier(&msr_class_cpu_notifier); + + err = 0; + goto out; + +out_class: + i = 0; + for_each_online_cpu(i) + device_destroy(msr_class, MKDEV(MSR_MAJOR, i)); + class_destroy(msr_class); +out_chrdev: + unregister_chrdev(MSR_MAJOR, "cpu/msr"); +out: + return err; +} + +static void __exit msr_exit(void) +{ + int cpu = 0; + for_each_online_cpu(cpu) + device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); + class_destroy(msr_class); + unregister_chrdev(MSR_MAJOR, "cpu/msr"); + unregister_hotcpu_notifier(&msr_class_cpu_notifier); +} + +module_init(msr_init); +module_exit(msr_exit) + +MODULE_AUTHOR("H. Peter Anvin "); +MODULE_DESCRIPTION("x86 generic MSR driver"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c new file mode 100644 index 000000000000..c7227e2180f8 --- /dev/null +++ b/arch/x86/kernel/nmi_32.c @@ -0,0 +1,468 @@ +/* + * linux/arch/i386/nmi.c + * + * NMI watchdog support on APIC systems + * + * Started by Ingo Molnar + * + * Fixes: + * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog. + * Mikael Pettersson : Power Management for local APIC NMI watchdog. + * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. Disable/enable API. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "mach_traps.h" + +int unknown_nmi_panic; +int nmi_watchdog_enabled; + +static cpumask_t backtrace_mask = CPU_MASK_NONE; + +/* nmi_active: + * >0: the lapic NMI watchdog is active, but can be disabled + * <0: the lapic NMI watchdog has not been set up, and cannot + * be enabled + * 0: the lapic NMI watchdog is disabled, but can be enabled + */ +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ + +unsigned int nmi_watchdog = NMI_DEFAULT; +static unsigned int nmi_hz = HZ; + +static DEFINE_PER_CPU(short, wd_enabled); + +/* local prototypes */ +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu); + +static int endflag __initdata = 0; + +#ifdef CONFIG_SMP +/* The performance counters used by NMI_LOCAL_APIC don't trigger when + * the CPU is idle. To make sure the NMI watchdog really ticks on all + * CPUs during the test make them busy. + */ +static __init void nmi_cpu_busy(void *data) +{ + local_irq_enable_in_hardirq(); + /* Intentionally don't use cpu_relax here. This is + to make sure that the performance counter really ticks, + even if there is a simulator or similar that catches the + pause instruction. On a real HT machine this is fine because + all other CPUs are busy with "useless" delay loops and don't + care if they get somewhat less cycles. */ + while (endflag == 0) + mb(); +} +#endif + +static int __init check_nmi_watchdog(void) +{ + unsigned int *prev_nmi_count; + int cpu; + + if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) + return 0; + + if (!atomic_read(&nmi_active)) + return 0; + + prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL); + if (!prev_nmi_count) + return -1; + + printk(KERN_INFO "Testing NMI watchdog ... "); + + if (nmi_watchdog == NMI_LOCAL_APIC) + smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); + + for_each_possible_cpu(cpu) + prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; + local_irq_enable(); + mdelay((20*1000)/nmi_hz); // wait 20 ticks + + for_each_possible_cpu(cpu) { +#ifdef CONFIG_SMP + /* Check cpu_callin_map here because that is set + after the timer is started. */ + if (!cpu_isset(cpu, cpu_callin_map)) + continue; +#endif + if (!per_cpu(wd_enabled, cpu)) + continue; + if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { + printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n", + cpu, + prev_nmi_count[cpu], + nmi_count(cpu)); + per_cpu(wd_enabled, cpu) = 0; + atomic_dec(&nmi_active); + } + } + endflag = 1; + if (!atomic_read(&nmi_active)) { + kfree(prev_nmi_count); + atomic_set(&nmi_active, -1); + return -1; + } + printk("OK.\n"); + + /* now that we know it works we can reduce NMI frequency to + something more reasonable; makes a difference in some configs */ + if (nmi_watchdog == NMI_LOCAL_APIC) + nmi_hz = lapic_adjust_nmi_hz(1); + + kfree(prev_nmi_count); + return 0; +} +/* This needs to happen later in boot so counters are working */ +late_initcall(check_nmi_watchdog); + +static int __init setup_nmi_watchdog(char *str) +{ + int nmi; + + get_option(&str, &nmi); + + if ((nmi >= NMI_INVALID) || (nmi < NMI_NONE)) + return 0; + + nmi_watchdog = nmi; + return 1; +} + +__setup("nmi_watchdog=", setup_nmi_watchdog); + + +/* Suspend/resume support */ + +#ifdef CONFIG_PM + +static int nmi_pm_active; /* nmi_active before suspend */ + +static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state) +{ + /* only CPU0 goes here, other CPUs should be offline */ + nmi_pm_active = atomic_read(&nmi_active); + stop_apic_nmi_watchdog(NULL); + BUG_ON(atomic_read(&nmi_active) != 0); + return 0; +} + +static int lapic_nmi_resume(struct sys_device *dev) +{ + /* only CPU0 goes here, other CPUs should be offline */ + if (nmi_pm_active > 0) { + setup_apic_nmi_watchdog(NULL); + touch_nmi_watchdog(); + } + return 0; +} + + +static struct sysdev_class nmi_sysclass = { + set_kset_name("lapic_nmi"), + .resume = lapic_nmi_resume, + .suspend = lapic_nmi_suspend, +}; + +static struct sys_device device_lapic_nmi = { + .id = 0, + .cls = &nmi_sysclass, +}; + +static int __init init_lapic_nmi_sysfs(void) +{ + int error; + + /* should really be a BUG_ON but b/c this is an + * init call, it just doesn't work. -dcz + */ + if (nmi_watchdog != NMI_LOCAL_APIC) + return 0; + + if (atomic_read(&nmi_active) < 0) + return 0; + + error = sysdev_class_register(&nmi_sysclass); + if (!error) + error = sysdev_register(&device_lapic_nmi); + return error; +} +/* must come after the local APIC's device_initcall() */ +late_initcall(init_lapic_nmi_sysfs); + +#endif /* CONFIG_PM */ + +static void __acpi_nmi_enable(void *__unused) +{ + apic_write_around(APIC_LVT0, APIC_DM_NMI); +} + +/* + * Enable timer based NMIs on all CPUs: + */ +void acpi_nmi_enable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); +} + +static void __acpi_nmi_disable(void *__unused) +{ + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); +} + +/* + * Disable timer based NMIs on all CPUs: + */ +void acpi_nmi_disable(void) +{ + if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) + on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); +} + +void setup_apic_nmi_watchdog (void *unused) +{ + if (__get_cpu_var(wd_enabled)) + return; + + /* cheap hack to support suspend/resume */ + /* if cpu0 is not active neither should the other cpus */ + if ((smp_processor_id() != 0) && (atomic_read(&nmi_active) <= 0)) + return; + + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + __get_cpu_var(wd_enabled) = 1; /* enable it before to avoid race with handler */ + if (lapic_watchdog_init(nmi_hz) < 0) { + __get_cpu_var(wd_enabled) = 0; + return; + } + /* FALL THROUGH */ + case NMI_IO_APIC: + __get_cpu_var(wd_enabled) = 1; + atomic_inc(&nmi_active); + } +} + +void stop_apic_nmi_watchdog(void *unused) +{ + /* only support LOCAL and IO APICs for now */ + if ((nmi_watchdog != NMI_LOCAL_APIC) && + (nmi_watchdog != NMI_IO_APIC)) + return; + if (__get_cpu_var(wd_enabled) == 0) + return; + if (nmi_watchdog == NMI_LOCAL_APIC) + lapic_watchdog_stop(); + __get_cpu_var(wd_enabled) = 0; + atomic_dec(&nmi_active); +} + +/* + * the best way to detect whether a CPU has a 'hard lockup' problem + * is to check it's local APIC timer IRQ counts. If they are not + * changing then that CPU has some problem. + * + * as these watchdog NMI IRQs are generated on every CPU, we only + * have to check the current processor. + * + * since NMIs don't listen to _any_ locks, we have to be extremely + * careful not to rely on unsafe variables. The printk might lock + * up though, so we have to break up any console locks first ... + * [when there will be more tty-related locks, break them up + * here too!] + */ + +static unsigned int + last_irq_sums [NR_CPUS], + alert_counter [NR_CPUS]; + +void touch_nmi_watchdog(void) +{ + if (nmi_watchdog > 0) { + unsigned cpu; + + /* + * Just reset the alert counters, (other CPUs might be + * spinning on locks we hold): + */ + for_each_present_cpu(cpu) { + if (alert_counter[cpu]) + alert_counter[cpu] = 0; + } + } + + /* + * Tickle the softlockup detector too: + */ + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +extern void die_nmi(struct pt_regs *, const char *msg); + +__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +{ + + /* + * Since current_thread_info()-> is always on the stack, and we + * always switch the stack NMI-atomically, it's safe to use + * smp_processor_id(). + */ + unsigned int sum; + int touched = 0; + int cpu = smp_processor_id(); + int rc=0; + + /* check for other users first */ + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) { + rc = 1; + touched = 1; + } + + if (cpu_isset(cpu, backtrace_mask)) { + static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + + spin_lock(&lock); + printk("NMI backtrace for cpu %d\n", cpu); + dump_stack(); + spin_unlock(&lock); + cpu_clear(cpu, backtrace_mask); + } + + /* + * Take the local apic timer and PIT/HPET into account. We don't + * know which one is active, when we have highres/dyntick on + */ + sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_cpu(cpu).irqs[0]; + + /* if the none of the timers isn't firing, this cpu isn't doing much */ + if (!touched && last_irq_sums[cpu] == sum) { + /* + * Ayiee, looks like this CPU is stuck ... + * wait a few IRQs (5 seconds) before doing the oops ... + */ + alert_counter[cpu]++; + if (alert_counter[cpu] == 5*nmi_hz) + /* + * die_nmi will return ONLY if NOTIFY_STOP happens.. + */ + die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP"); + } else { + last_irq_sums[cpu] = sum; + alert_counter[cpu] = 0; + } + /* see if the nmi watchdog went off */ + if (!__get_cpu_var(wd_enabled)) + return rc; + switch (nmi_watchdog) { + case NMI_LOCAL_APIC: + rc |= lapic_wd_event(nmi_hz); + break; + case NMI_IO_APIC: + /* don't know how to accurately check for this. + * just assume it was a watchdog timer interrupt + * This matches the old behaviour. + */ + rc = 1; + break; + } + return rc; +} + +int do_nmi_callback(struct pt_regs * regs, int cpu) +{ +#ifdef CONFIG_SYSCTL + if (unknown_nmi_panic) + return unknown_nmi_panic_callback(regs, cpu); +#endif + return 0; +} + +#ifdef CONFIG_SYSCTL + +static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) +{ + unsigned char reason = get_nmi_reason(); + char buf[64]; + + sprintf(buf, "NMI received for unknown reason %02x\n", reason); + die_nmi(regs, buf); + return 0; +} + +/* + * proc handler for /proc/sys/kernel/nmi + */ +int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int old_state; + + nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; + old_state = nmi_watchdog_enabled; + proc_dointvec(table, write, file, buffer, length, ppos); + if (!!old_state == !!nmi_watchdog_enabled) + return 0; + + if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { + printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); + return -EIO; + } + + if (nmi_watchdog == NMI_DEFAULT) { + if (lapic_watchdog_ok()) + nmi_watchdog = NMI_LOCAL_APIC; + else + nmi_watchdog = NMI_IO_APIC; + } + + if (nmi_watchdog == NMI_LOCAL_APIC) { + if (nmi_watchdog_enabled) + enable_lapic_nmi_watchdog(); + else + disable_lapic_nmi_watchdog(); + } else { + printk( KERN_WARNING + "NMI watchdog doesn't know what hardware to touch\n"); + return -EIO; + } + return 0; +} + +#endif + +void __trigger_all_cpu_backtrace(void) +{ + int i; + + backtrace_mask = cpu_online_map; + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpus_empty(backtrace_mask)) + break; + mdelay(1); + } +} + +EXPORT_SYMBOL(nmi_active); +EXPORT_SYMBOL(nmi_watchdog); diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c new file mode 100644 index 000000000000..9000d82c6dc0 --- /dev/null +++ b/arch/x86/kernel/numaq_32.c @@ -0,0 +1,89 @@ +/* + * Written by: Patricia Gaughen, IBM Corporation + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) + +/* + * Function: smp_dump_qct() + * + * Description: gets memory layout from the quad config table. This + * function also updates node_online_map with the nodes (quads) present. + */ +static void __init smp_dump_qct(void) +{ + int node; + struct eachquadmem *eq; + struct sys_cfg_data *scd = + (struct sys_cfg_data *)__va(SYS_CFG_DATA_PRIV_ADDR); + + nodes_clear(node_online_map); + for_each_node(node) { + if (scd->quads_present31_0 & (1 << node)) { + node_set_online(node); + eq = &scd->eq[node]; + /* Convert to pages */ + node_start_pfn[node] = MB_TO_PAGES( + eq->hi_shrd_mem_start - eq->priv_mem_size); + node_end_pfn[node] = MB_TO_PAGES( + eq->hi_shrd_mem_start + eq->hi_shrd_mem_size); + + memory_present(node, + node_start_pfn[node], node_end_pfn[node]); + node_remap_size[node] = node_memmap_size_bytes(node, + node_start_pfn[node], + node_end_pfn[node]); + } + } +} + +/* + * Unlike Summit, we don't really care to let the NUMA-Q + * fall back to flat mode. Don't compile for NUMA-Q + * unless you really need it! + */ +int __init get_memcfg_numaq(void) +{ + smp_dump_qct(); + return 1; +} + +static int __init numaq_tsc_disable(void) +{ + if (num_online_nodes() > 1) { + printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); + tsc_disable = 1; + } + return 0; +} +arch_initcall(numaq_tsc_disable); diff --git a/arch/x86/kernel/paravirt_32.c b/arch/x86/kernel/paravirt_32.c new file mode 100644 index 000000000000..739cfb207dd7 --- /dev/null +++ b/arch/x86/kernel/paravirt_32.c @@ -0,0 +1,392 @@ +/* Paravirtualization interfaces + Copyright (C) 2006 Rusty Russell IBM Corporation + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* nop stub */ +void _paravirt_nop(void) +{ +} + +static void __init default_banner(void) +{ + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + paravirt_ops.name); +} + +char *memory_setup(void) +{ + return paravirt_ops.memory_setup(); +} + +/* Simple instruction patching code. */ +#define DEF_NATIVE(name, code) \ + extern const char start_##name[], end_##name[]; \ + asm("start_" #name ": " code "; end_" #name ":") + +DEF_NATIVE(irq_disable, "cli"); +DEF_NATIVE(irq_enable, "sti"); +DEF_NATIVE(restore_fl, "push %eax; popf"); +DEF_NATIVE(save_fl, "pushf; pop %eax"); +DEF_NATIVE(iret, "iret"); +DEF_NATIVE(irq_enable_sysexit, "sti; sysexit"); +DEF_NATIVE(read_cr2, "mov %cr2, %eax"); +DEF_NATIVE(write_cr3, "mov %eax, %cr3"); +DEF_NATIVE(read_cr3, "mov %cr3, %eax"); +DEF_NATIVE(clts, "clts"); +DEF_NATIVE(read_tsc, "rdtsc"); + +DEF_NATIVE(ud2a, "ud2a"); + +static unsigned native_patch(u8 type, u16 clobbers, void *ibuf, + unsigned long addr, unsigned len) +{ + const unsigned char *start, *end; + unsigned ret; + + switch(type) { +#define SITE(x) case PARAVIRT_PATCH(x): start = start_##x; end = end_##x; goto patch_site + SITE(irq_disable); + SITE(irq_enable); + SITE(restore_fl); + SITE(save_fl); + SITE(iret); + SITE(irq_enable_sysexit); + SITE(read_cr2); + SITE(read_cr3); + SITE(write_cr3); + SITE(clts); + SITE(read_tsc); +#undef SITE + + patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; + + case PARAVIRT_PATCH(make_pgd): + case PARAVIRT_PATCH(make_pte): + case PARAVIRT_PATCH(pgd_val): + case PARAVIRT_PATCH(pte_val): +#ifdef CONFIG_X86_PAE + case PARAVIRT_PATCH(make_pmd): + case PARAVIRT_PATCH(pmd_val): +#endif + /* These functions end up returning exactly what + they're passed, in the same registers. */ + ret = paravirt_patch_nop(); + break; + + default: + ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); + break; + } + + return ret; +} + +unsigned paravirt_patch_nop(void) +{ + return 0; +} + +unsigned paravirt_patch_ignore(unsigned len) +{ + return len; +} + +struct branch { + unsigned char opcode; + u32 delta; +} __attribute__((packed)); + +unsigned paravirt_patch_call(void *insnbuf, + const void *target, u16 tgt_clobbers, + unsigned long addr, u16 site_clobbers, + unsigned len) +{ + struct branch *b = insnbuf; + unsigned long delta = (unsigned long)target - (addr+5); + + if (tgt_clobbers & ~site_clobbers) + return len; /* target would clobber too much for this site */ + if (len < 5) + return len; /* call too long for patch site */ + + b->opcode = 0xe8; /* call */ + b->delta = delta; + BUILD_BUG_ON(sizeof(*b) != 5); + + return 5; +} + +unsigned paravirt_patch_jmp(const void *target, void *insnbuf, + unsigned long addr, unsigned len) +{ + struct branch *b = insnbuf; + unsigned long delta = (unsigned long)target - (addr+5); + + if (len < 5) + return len; /* call too long for patch site */ + + b->opcode = 0xe9; /* jmp */ + b->delta = delta; + + return 5; +} + +unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, + unsigned long addr, unsigned len) +{ + void *opfunc = *((void **)¶virt_ops + type); + unsigned ret; + + if (opfunc == NULL) + /* If there's no function, patch it with a ud2a (BUG) */ + ret = paravirt_patch_insns(insnbuf, len, start_ud2a, end_ud2a); + else if (opfunc == paravirt_nop) + /* If the operation is a nop, then nop the callsite */ + ret = paravirt_patch_nop(); + else if (type == PARAVIRT_PATCH(iret) || + type == PARAVIRT_PATCH(irq_enable_sysexit)) + /* If operation requires a jmp, then jmp */ + ret = paravirt_patch_jmp(opfunc, insnbuf, addr, len); + else + /* Otherwise call the function; assume target could + clobber any caller-save reg */ + ret = paravirt_patch_call(insnbuf, opfunc, CLBR_ANY, + addr, clobbers, len); + + return ret; +} + +unsigned paravirt_patch_insns(void *insnbuf, unsigned len, + const char *start, const char *end) +{ + unsigned insn_len = end - start; + + if (insn_len > len || start == NULL) + insn_len = len; + else + memcpy(insnbuf, start, insn_len); + + return insn_len; +} + +void init_IRQ(void) +{ + paravirt_ops.init_IRQ(); +} + +static void native_flush_tlb(void) +{ + __native_flush_tlb(); +} + +/* + * Global pages have to be flushed a bit differently. Not a real + * performance problem because this does not happen often. + */ +static void native_flush_tlb_global(void) +{ + __native_flush_tlb_global(); +} + +static void native_flush_tlb_single(unsigned long addr) +{ + __native_flush_tlb_single(addr); +} + +/* These are in entry.S */ +extern void native_iret(void); +extern void native_irq_enable_sysexit(void); + +static int __init print_banner(void) +{ + paravirt_ops.banner(); + return 0; +} +core_initcall(print_banner); + +static struct resource reserve_ioports = { + .start = 0, + .end = IO_SPACE_LIMIT, + .name = "paravirt-ioport", + .flags = IORESOURCE_IO | IORESOURCE_BUSY, +}; + +static struct resource reserve_iomem = { + .start = 0, + .end = -1, + .name = "paravirt-iomem", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + +/* + * Reserve the whole legacy IO space to prevent any legacy drivers + * from wasting time probing for their hardware. This is a fairly + * brute-force approach to disabling all non-virtual drivers. + * + * Note that this must be called very early to have any effect. + */ +int paravirt_disable_iospace(void) +{ + int ret; + + ret = request_resource(&ioport_resource, &reserve_ioports); + if (ret == 0) { + ret = request_resource(&iomem_resource, &reserve_iomem); + if (ret) + release_resource(&reserve_ioports); + } + + return ret; +} + +struct paravirt_ops paravirt_ops = { + .name = "bare hardware", + .paravirt_enabled = 0, + .kernel_rpl = 0, + .shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */ + + .patch = native_patch, + .banner = default_banner, + .arch_setup = paravirt_nop, + .memory_setup = machine_specific_memory_setup, + .get_wallclock = native_get_wallclock, + .set_wallclock = native_set_wallclock, + .time_init = hpet_time_init, + .init_IRQ = native_init_IRQ, + + .cpuid = native_cpuid, + .get_debugreg = native_get_debugreg, + .set_debugreg = native_set_debugreg, + .clts = native_clts, + .read_cr0 = native_read_cr0, + .write_cr0 = native_write_cr0, + .read_cr2 = native_read_cr2, + .write_cr2 = native_write_cr2, + .read_cr3 = native_read_cr3, + .write_cr3 = native_write_cr3, + .read_cr4 = native_read_cr4, + .read_cr4_safe = native_read_cr4_safe, + .write_cr4 = native_write_cr4, + .save_fl = native_save_fl, + .restore_fl = native_restore_fl, + .irq_disable = native_irq_disable, + .irq_enable = native_irq_enable, + .safe_halt = native_safe_halt, + .halt = native_halt, + .wbinvd = native_wbinvd, + .read_msr = native_read_msr_safe, + .write_msr = native_write_msr_safe, + .read_tsc = native_read_tsc, + .read_pmc = native_read_pmc, + .sched_clock = native_sched_clock, + .get_cpu_khz = native_calculate_cpu_khz, + .load_tr_desc = native_load_tr_desc, + .set_ldt = native_set_ldt, + .load_gdt = native_load_gdt, + .load_idt = native_load_idt, + .store_gdt = native_store_gdt, + .store_idt = native_store_idt, + .store_tr = native_store_tr, + .load_tls = native_load_tls, + .write_ldt_entry = write_dt_entry, + .write_gdt_entry = write_dt_entry, + .write_idt_entry = write_dt_entry, + .load_esp0 = native_load_esp0, + + .set_iopl_mask = native_set_iopl_mask, + .io_delay = native_io_delay, + +#ifdef CONFIG_X86_LOCAL_APIC + .apic_write = native_apic_write, + .apic_write_atomic = native_apic_write_atomic, + .apic_read = native_apic_read, + .setup_boot_clock = setup_boot_APIC_clock, + .setup_secondary_clock = setup_secondary_APIC_clock, + .startup_ipi_hook = paravirt_nop, +#endif + .set_lazy_mode = paravirt_nop, + + .pagetable_setup_start = native_pagetable_setup_start, + .pagetable_setup_done = native_pagetable_setup_done, + + .flush_tlb_user = native_flush_tlb, + .flush_tlb_kernel = native_flush_tlb_global, + .flush_tlb_single = native_flush_tlb_single, + .flush_tlb_others = native_flush_tlb_others, + + .alloc_pt = paravirt_nop, + .alloc_pd = paravirt_nop, + .alloc_pd_clone = paravirt_nop, + .release_pt = paravirt_nop, + .release_pd = paravirt_nop, + + .set_pte = native_set_pte, + .set_pte_at = native_set_pte_at, + .set_pmd = native_set_pmd, + .pte_update = paravirt_nop, + .pte_update_defer = paravirt_nop, + +#ifdef CONFIG_HIGHPTE + .kmap_atomic_pte = kmap_atomic, +#endif + +#ifdef CONFIG_X86_PAE + .set_pte_atomic = native_set_pte_atomic, + .set_pte_present = native_set_pte_present, + .set_pud = native_set_pud, + .pte_clear = native_pte_clear, + .pmd_clear = native_pmd_clear, + + .pmd_val = native_pmd_val, + .make_pmd = native_make_pmd, +#endif + + .pte_val = native_pte_val, + .pgd_val = native_pgd_val, + + .make_pte = native_make_pte, + .make_pgd = native_make_pgd, + + .irq_enable_sysexit = native_irq_enable_sysexit, + .iret = native_iret, + + .dup_mmap = paravirt_nop, + .exit_mmap = paravirt_nop, + .activate_mm = paravirt_nop, +}; + +EXPORT_SYMBOL(paravirt_ops); diff --git a/arch/x86/kernel/pci-dma_32.c b/arch/x86/kernel/pci-dma_32.c new file mode 100644 index 000000000000..048f09b62553 --- /dev/null +++ b/arch/x86/kernel/pci-dma_32.c @@ -0,0 +1,177 @@ +/* + * Dynamic DMA mapping support. + * + * On i386 there is no hardware dynamic DMA address translation, + * so consistent alloc/free are merely page allocation/freeing. + * The rest of the dynamic DMA mapping interface is implemented + * in asm/pci.h. + */ + +#include +#include +#include +#include +#include +#include +#include + +struct dma_coherent_mem { + void *virt_base; + u32 device_base; + int size; + int flags; + unsigned long *bitmap; +}; + +void *dma_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t gfp) +{ + void *ret; + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + int order = get_order(size); + /* ignore region specifiers */ + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM); + + if (mem) { + int page = bitmap_find_free_region(mem->bitmap, mem->size, + order); + if (page >= 0) { + *dma_handle = mem->device_base + (page << PAGE_SHIFT); + ret = mem->virt_base + (page << PAGE_SHIFT); + memset(ret, 0, size); + return ret; + } + if (mem->flags & DMA_MEMORY_EXCLUSIVE) + return NULL; + } + + if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff)) + gfp |= GFP_DMA; + + ret = (void *)__get_free_pages(gfp, order); + + if (ret != NULL) { + memset(ret, 0, size); + *dma_handle = virt_to_phys(ret); + } + return ret; +} +EXPORT_SYMBOL(dma_alloc_coherent); + +void dma_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; + int order = get_order(size); + + if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) { + int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; + + bitmap_release_region(mem->bitmap, page, order); + } else + free_pages((unsigned long)vaddr, order); +} +EXPORT_SYMBOL(dma_free_coherent); + +int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, + dma_addr_t device_addr, size_t size, int flags) +{ + void __iomem *mem_base = NULL; + int pages = size >> PAGE_SHIFT; + int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); + + if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) + goto out; + if (!size) + goto out; + if (dev->dma_mem) + goto out; + + /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ + + mem_base = ioremap(bus_addr, size); + if (!mem_base) + goto out; + + dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); + if (!dev->dma_mem) + goto out; + dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + if (!dev->dma_mem->bitmap) + goto free1_out; + + dev->dma_mem->virt_base = mem_base; + dev->dma_mem->device_base = device_addr; + dev->dma_mem->size = pages; + dev->dma_mem->flags = flags; + + if (flags & DMA_MEMORY_MAP) + return DMA_MEMORY_MAP; + + return DMA_MEMORY_IO; + + free1_out: + kfree(dev->dma_mem); + out: + if (mem_base) + iounmap(mem_base); + return 0; +} +EXPORT_SYMBOL(dma_declare_coherent_memory); + +void dma_release_declared_memory(struct device *dev) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + + if(!mem) + return; + dev->dma_mem = NULL; + iounmap(mem->virt_base); + kfree(mem->bitmap); + kfree(mem); +} +EXPORT_SYMBOL(dma_release_declared_memory); + +void *dma_mark_declared_memory_occupied(struct device *dev, + dma_addr_t device_addr, size_t size) +{ + struct dma_coherent_mem *mem = dev->dma_mem; + int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT; + int pos, err; + + if (!mem) + return ERR_PTR(-EINVAL); + + pos = (device_addr - mem->device_base) >> PAGE_SHIFT; + err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); + if (err != 0) + return ERR_PTR(err); + return mem->virt_base + (pos << PAGE_SHIFT); +} +EXPORT_SYMBOL(dma_mark_declared_memory_occupied); + +#ifdef CONFIG_PCI +/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ + +int forbid_dac; +EXPORT_SYMBOL(forbid_dac); + +static __devinit void via_no_dac(struct pci_dev *dev) +{ + if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { + printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n"); + forbid_dac = 1; + } +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); + +static int check_iommu(char *s) +{ + if (!strcmp(s, "usedac")) { + forbid_dac = -1; + return 1; + } + return 0; +} +__setup("iommu=", check_iommu); +#endif diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c new file mode 100644 index 000000000000..bc1f2d3ea277 --- /dev/null +++ b/arch/x86/kernel/pcspeaker.c @@ -0,0 +1,20 @@ +#include +#include +#include + +static __init int add_pcspkr(void) +{ + struct platform_device *pd; + int ret; + + pd = platform_device_alloc("pcspkr", -1); + if (!pd) + return -ENOMEM; + + ret = platform_device_add(pd); + if (ret) + platform_device_put(pd); + + return ret; +} +device_initcall(add_pcspkr); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c new file mode 100644 index 000000000000..84664710b784 --- /dev/null +++ b/arch/x86/kernel/process_32.c @@ -0,0 +1,951 @@ +/* + * linux/arch/i386/kernel/process.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +/* + * This file handles the architecture-dependent parts of process handling.. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_MATH_EMULATION +#include +#endif + +#include + +#include +#include + +asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); + +static int hlt_counter; + +unsigned long boot_option_idle_override = 0; +EXPORT_SYMBOL(boot_option_idle_override); + +DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; +EXPORT_PER_CPU_SYMBOL(current_task); + +DEFINE_PER_CPU(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); + +/* + * Return saved PC of a blocked thread. + */ +unsigned long thread_saved_pc(struct task_struct *tsk) +{ + return ((unsigned long *)tsk->thread.esp)[3]; +} + +/* + * Powermanagement idle function, if any.. + */ +void (*pm_idle)(void); +EXPORT_SYMBOL(pm_idle); +static DEFINE_PER_CPU(unsigned int, cpu_idle_state); + +void disable_hlt(void) +{ + hlt_counter++; +} + +EXPORT_SYMBOL(disable_hlt); + +void enable_hlt(void) +{ + hlt_counter--; +} + +EXPORT_SYMBOL(enable_hlt); + +/* + * We use this if we don't have any better + * idle routine.. + */ +void default_idle(void) +{ + if (!hlt_counter && boot_cpu_data.hlt_works_ok) { + current_thread_info()->status &= ~TS_POLLING; + /* + * TS_POLLING-cleared state must be visible before we + * test NEED_RESCHED: + */ + smp_mb(); + + local_irq_disable(); + if (!need_resched()) + safe_halt(); /* enables interrupts racelessly */ + else + local_irq_enable(); + current_thread_info()->status |= TS_POLLING; + } else { + /* loop is done by the caller */ + cpu_relax(); + } +} +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(default_idle); +#endif + +/* + * On SMP it's slightly faster (but much more power-consuming!) + * to poll the ->work.need_resched flag instead of waiting for the + * cross-CPU IPI to arrive. Use this option with caution. + */ +static void poll_idle (void) +{ + cpu_relax(); +} + +#ifdef CONFIG_HOTPLUG_CPU +#include +/* We don't actually take CPU down, just spin without interrupts. */ +static inline void play_dead(void) +{ + /* This must be done before dead CPU ack */ + cpu_exit_clear(); + wbinvd(); + mb(); + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + /* + * With physical CPU hotplug, we should halt the cpu + */ + local_irq_disable(); + while (1) + halt(); +} +#else +static inline void play_dead(void) +{ + BUG(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +/* + * The idle thread. There's no useful work to be + * done, so just try to conserve power and have a + * low exit latency (ie sit in a loop waiting for + * somebody to say that they'd like to reschedule) + */ +void cpu_idle(void) +{ + int cpu = smp_processor_id(); + + current_thread_info()->status |= TS_POLLING; + + /* endless idle loop with no priority at all */ + while (1) { + tick_nohz_stop_sched_tick(); + while (!need_resched()) { + void (*idle)(void); + + if (__get_cpu_var(cpu_idle_state)) + __get_cpu_var(cpu_idle_state) = 0; + + check_pgt_cache(); + rmb(); + idle = pm_idle; + + if (!idle) + idle = default_idle; + + if (cpu_is_offline(cpu)) + play_dead(); + + __get_cpu_var(irq_stat).idle_timestamp = jiffies; + idle(); + } + tick_nohz_restart_sched_tick(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } +} + +void cpu_idle_wait(void) +{ + unsigned int cpu, this_cpu = get_cpu(); + cpumask_t map, tmp = current->cpus_allowed; + + set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); + put_cpu(); + + cpus_clear(map); + for_each_online_cpu(cpu) { + per_cpu(cpu_idle_state, cpu) = 1; + cpu_set(cpu, map); + } + + __get_cpu_var(cpu_idle_state) = 0; + + wmb(); + do { + ssleep(1); + for_each_online_cpu(cpu) { + if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu)) + cpu_clear(cpu, map); + } + cpus_and(map, map, cpu_online_map); + } while (!cpus_empty(map)); + + set_cpus_allowed(current, tmp); +} +EXPORT_SYMBOL_GPL(cpu_idle_wait); + +/* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. + * We execute MONITOR against need_resched and enter optimized wait state + * through MWAIT. Whenever someone changes need_resched, we would be woken + * up from MWAIT (without an IPI). + * + * New with Core Duo processors, MWAIT can take some hints based on CPU + * capability. + */ +void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) +{ + if (!need_resched()) { + __monitor((void *)¤t_thread_info()->flags, 0, 0); + smp_mb(); + if (!need_resched()) + __mwait(eax, ecx); + } +} + +/* Default MONITOR/MWAIT with no hints, used for default C1 state */ +static void mwait_idle(void) +{ + local_irq_enable(); + mwait_idle_with_hints(0, 0); +} + +void __devinit select_idle_routine(const struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_MWAIT)) { + printk("monitor/mwait feature present.\n"); + /* + * Skip, if setup has overridden idle. + * One CPU supports mwait => All CPUs supports mwait + */ + if (!pm_idle) { + printk("using mwait in idle threads.\n"); + pm_idle = mwait_idle; + } + } +} + +static int __init idle_setup(char *str) +{ + if (!strcmp(str, "poll")) { + printk("using polling idle threads.\n"); + pm_idle = poll_idle; +#ifdef CONFIG_X86_SMP + if (smp_num_siblings > 1) + printk("WARNING: polling idle and HT enabled, performance may degrade.\n"); +#endif + } else if (!strcmp(str, "mwait")) + force_mwait = 1; + else + return -1; + + boot_option_idle_override = 1; + return 0; +} +early_param("idle", idle_setup); + +void show_regs(struct pt_regs * regs) +{ + unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; + unsigned long d0, d1, d2, d3, d6, d7; + + printk("\n"); + printk("Pid: %d, comm: %20s\n", current->pid, current->comm); + printk("EIP: %04x:[<%08lx>] CPU: %d\n",0xffff & regs->xcs,regs->eip, smp_processor_id()); + print_symbol("EIP is at %s\n", regs->eip); + + if (user_mode_vm(regs)) + printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); + printk(" EFLAGS: %08lx %s (%s %.*s)\n", + regs->eflags, print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + regs->eax,regs->ebx,regs->ecx,regs->edx); + printk("ESI: %08lx EDI: %08lx EBP: %08lx", + regs->esi, regs->edi, regs->ebp); + printk(" DS: %04x ES: %04x FS: %04x\n", + 0xffff & regs->xds,0xffff & regs->xes, 0xffff & regs->xfs); + + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + cr4 = read_cr4_safe(); + printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); + + get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + get_debugreg(d3, 3); + printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + d0, d1, d2, d3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + printk("DR6: %08lx DR7: %08lx\n", d6, d7); + + show_trace(NULL, regs, ®s->esp); +} + +/* + * This gets run with %ebx containing the + * function to call, and %edx containing + * the "args". + */ +extern void kernel_thread_helper(void); + +/* + * Create a kernel thread + */ +int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) +{ + struct pt_regs regs; + + memset(®s, 0, sizeof(regs)); + + regs.ebx = (unsigned long) fn; + regs.edx = (unsigned long) arg; + + regs.xds = __USER_DS; + regs.xes = __USER_DS; + regs.xfs = __KERNEL_PERCPU; + regs.orig_eax = -1; + regs.eip = (unsigned long) kernel_thread_helper; + regs.xcs = __KERNEL_CS | get_kernel_rpl(); + regs.eflags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2; + + /* Ok, create the new process.. */ + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); +} +EXPORT_SYMBOL(kernel_thread); + +/* + * Free current thread data structures etc.. + */ +void exit_thread(void) +{ + /* The process may have allocated an io port bitmap... nuke it. */ + if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { + struct task_struct *tsk = current; + struct thread_struct *t = &tsk->thread; + int cpu = get_cpu(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + + kfree(t->io_bitmap_ptr); + t->io_bitmap_ptr = NULL; + clear_thread_flag(TIF_IO_BITMAP); + /* + * Careful, clear this in the TSS too: + */ + memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); + t->io_bitmap_max = 0; + tss->io_bitmap_owner = NULL; + tss->io_bitmap_max = 0; + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; + put_cpu(); + } +} + +void flush_thread(void) +{ + struct task_struct *tsk = current; + + memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); + memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); + clear_tsk_thread_flag(tsk, TIF_DEBUG); + /* + * Forget coprocessor state.. + */ + clear_fpu(tsk); + clear_used_math(); +} + +void release_thread(struct task_struct *dead_task) +{ + BUG_ON(dead_task->mm); + release_vm86_irqs(dead_task); +} + +/* + * This gets called before we allocate a new thread and copy + * the current task into it. + */ +void prepare_to_copy(struct task_struct *tsk) +{ + unlazy_fpu(tsk); +} + +int copy_thread(int nr, unsigned long clone_flags, unsigned long esp, + unsigned long unused, + struct task_struct * p, struct pt_regs * regs) +{ + struct pt_regs * childregs; + struct task_struct *tsk; + int err; + + childregs = task_pt_regs(p); + *childregs = *regs; + childregs->eax = 0; + childregs->esp = esp; + + p->thread.esp = (unsigned long) childregs; + p->thread.esp0 = (unsigned long) (childregs+1); + + p->thread.eip = (unsigned long) ret_from_fork; + + savesegment(gs,p->thread.gs); + + tsk = current; + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { + p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, + IO_BITMAP_BYTES, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) { + p->thread.io_bitmap_max = 0; + return -ENOMEM; + } + set_tsk_thread_flag(p, TIF_IO_BITMAP); + } + + /* + * Set a new TLS for the child thread? + */ + if (clone_flags & CLONE_SETTLS) { + struct desc_struct *desc; + struct user_desc info; + int idx; + + err = -EFAULT; + if (copy_from_user(&info, (void __user *)childregs->esi, sizeof(info))) + goto out; + err = -EINVAL; + if (LDT_empty(&info)) + goto out; + + idx = info.entry_number; + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + goto out; + + desc = p->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + desc->a = LDT_entry_a(&info); + desc->b = LDT_entry_b(&info); + } + + err = 0; + out: + if (err && p->thread.io_bitmap_ptr) { + kfree(p->thread.io_bitmap_ptr); + p->thread.io_bitmap_max = 0; + } + return err; +} + +/* + * fill in the user structure for a core dump.. + */ +void dump_thread(struct pt_regs * regs, struct user * dump) +{ + int i; + +/* changed the size calculations - should hopefully work better. lbt */ + dump->magic = CMAGIC; + dump->start_code = 0; + dump->start_stack = regs->esp & ~(PAGE_SIZE - 1); + dump->u_tsize = ((unsigned long) current->mm->end_code) >> PAGE_SHIFT; + dump->u_dsize = ((unsigned long) (current->mm->brk + (PAGE_SIZE-1))) >> PAGE_SHIFT; + dump->u_dsize -= dump->u_tsize; + dump->u_ssize = 0; + for (i = 0; i < 8; i++) + dump->u_debugreg[i] = current->thread.debugreg[i]; + + if (dump->start_stack < TASK_SIZE) + dump->u_ssize = ((unsigned long) (TASK_SIZE - dump->start_stack)) >> PAGE_SHIFT; + + dump->regs.ebx = regs->ebx; + dump->regs.ecx = regs->ecx; + dump->regs.edx = regs->edx; + dump->regs.esi = regs->esi; + dump->regs.edi = regs->edi; + dump->regs.ebp = regs->ebp; + dump->regs.eax = regs->eax; + dump->regs.ds = regs->xds; + dump->regs.es = regs->xes; + dump->regs.fs = regs->xfs; + savesegment(gs,dump->regs.gs); + dump->regs.orig_eax = regs->orig_eax; + dump->regs.eip = regs->eip; + dump->regs.cs = regs->xcs; + dump->regs.eflags = regs->eflags; + dump->regs.esp = regs->esp; + dump->regs.ss = regs->xss; + + dump->u_fpvalid = dump_fpu (regs, &dump->i387); +} +EXPORT_SYMBOL(dump_thread); + +/* + * Capture the user space registers if the task is not running (in user space) + */ +int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs) +{ + struct pt_regs ptregs = *task_pt_regs(tsk); + ptregs.xcs &= 0xffff; + ptregs.xds &= 0xffff; + ptregs.xes &= 0xffff; + ptregs.xss &= 0xffff; + + elf_core_copy_regs(regs, &ptregs); + + return 1; +} + +#ifdef CONFIG_SECCOMP +void hard_disable_TSC(void) +{ + write_cr4(read_cr4() | X86_CR4_TSD); +} +void disable_TSC(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. + */ + hard_disable_TSC(); + preempt_enable(); +} +void hard_enable_TSC(void) +{ + write_cr4(read_cr4() & ~X86_CR4_TSD); +} +#endif /* CONFIG_SECCOMP */ + +static noinline void +__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *next; + + next = &next_p->thread; + + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { + set_debugreg(next->debugreg[0], 0); + set_debugreg(next->debugreg[1], 1); + set_debugreg(next->debugreg[2], 2); + set_debugreg(next->debugreg[3], 3); + /* no 4 and 5 */ + set_debugreg(next->debugreg[6], 6); + set_debugreg(next->debugreg[7], 7); + } + +#ifdef CONFIG_SECCOMP + if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ + test_tsk_thread_flag(next_p, TIF_NOTSC)) { + /* prev and next are different */ + if (test_tsk_thread_flag(next_p, TIF_NOTSC)) + hard_disable_TSC(); + else + hard_enable_TSC(); + } +#endif + + if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { + /* + * Disable the bitmap via an invalid offset. We still cache + * the previous bitmap owner and the IO bitmap contents: + */ + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; + return; + } + + if (likely(next == tss->io_bitmap_owner)) { + /* + * Previous owner of the bitmap (hence the bitmap content) + * matches the next task, we dont have to do anything but + * to set a valid offset in the TSS: + */ + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; + return; + } + /* + * Lazy TSS's I/O bitmap copy. We set an invalid offset here + * and we let the task to get a GPF in case an I/O instruction + * is performed. The handler of the GPF will verify that the + * faulting task has a valid I/O bitmap and, it true, does the + * real copy and restart the instruction. This will save us + * redundant copies when the currently switched task does not + * perform any I/O during its timeslice. + */ + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; +} + +/* + * switch_to(x,yn) should switch tasks from x to y. + * + * We fsave/fwait so that an exception goes off at the right time + * (as a call from the fsave or fwait in effect) rather than to + * the wrong process. Lazy FP saving no longer makes any sense + * with modern CPU's, and this simplifies a lot of things (SMP + * and UP become the same). + * + * NOTE! We used to use the x86 hardware context switching. The + * reason for not using it any more becomes apparent when you + * try to recover gracefully from saved state that is no longer + * valid (stale segment register values in particular). With the + * hardware task-switch, there is no way to fix up bad state in + * a reasonable manner. + * + * The fact that Intel documents the hardware task-switching to + * be slow is a fairly red herring - this code is not noticeably + * faster. However, there _is_ some room for improvement here, + * so the performance issues may eventually be a valid point. + * More important, however, is the fact that this allows us much + * more flexibility. + * + * The return value (in %eax) will be the "prev" task after + * the task-switch, and shows up in ret_from_fork in entry.S, + * for example. + */ +struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) +{ + struct thread_struct *prev = &prev_p->thread, + *next = &next_p->thread; + int cpu = smp_processor_id(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + + /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ + + __unlazy_fpu(prev_p); + + + /* we're going to use this soon, after a few expensive things */ + if (next_p->fpu_counter > 5) + prefetch(&next->i387.fxsave); + + /* + * Reload esp0. + */ + load_esp0(tss, next); + + /* + * Save away %gs. No need to save %fs, as it was saved on the + * stack on entry. No need to save %es and %ds, as those are + * always kernel segments while inside the kernel. Doing this + * before setting the new TLS descriptors avoids the situation + * where we temporarily have non-reloadable segments in %fs + * and %gs. This could be an issue if the NMI handler ever + * used %fs or %gs (it does not today), or if the kernel is + * running inside of a hypervisor layer. + */ + savesegment(gs, prev->gs); + + /* + * Load the per-thread Thread-Local Storage descriptor. + */ + load_TLS(next, cpu); + + /* + * Restore IOPL if needed. In normal use, the flags restore + * in the switch assembly will handle this. But if the kernel + * is running virtualized at a non-zero CPL, the popf will + * not restore flags, so it must be done in a separate step. + */ + if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl)) + set_iopl_mask(next->iopl); + + /* + * Now maybe handle debug registers and/or IO bitmaps + */ + if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || + task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) + __switch_to_xtra(prev_p, next_p, tss); + + /* + * Leave lazy mode, flushing any hypercalls made here. + * This must be done before restoring TLS segments so + * the GDT and LDT are properly updated, and must be + * done before math_state_restore, so the TS bit is up + * to date. + */ + arch_leave_lazy_cpu_mode(); + + /* If the task has used fpu the last 5 timeslices, just do a full + * restore of the math state immediately to avoid the trap; the + * chances of needing FPU soon are obviously high now + */ + if (next_p->fpu_counter > 5) + math_state_restore(); + + /* + * Restore %gs if needed (which is common) + */ + if (prev->gs | next->gs) + loadsegment(gs, next->gs); + + x86_write_percpu(current_task, next_p); + + return prev_p; +} + +asmlinkage int sys_fork(struct pt_regs regs) +{ + return do_fork(SIGCHLD, regs.esp, ®s, 0, NULL, NULL); +} + +asmlinkage int sys_clone(struct pt_regs regs) +{ + unsigned long clone_flags; + unsigned long newsp; + int __user *parent_tidptr, *child_tidptr; + + clone_flags = regs.ebx; + newsp = regs.ecx; + parent_tidptr = (int __user *)regs.edx; + child_tidptr = (int __user *)regs.edi; + if (!newsp) + newsp = regs.esp; + return do_fork(clone_flags, newsp, ®s, 0, parent_tidptr, child_tidptr); +} + +/* + * This is trivial, and on the face of it looks like it + * could equally well be done in user mode. + * + * Not so, for quite unobvious reasons - register pressure. + * In user mode vfork() cannot have a stack frame, and if + * done by calling the "clone()" system call directly, you + * do not have enough call-clobbered registers to hold all + * the information you need. + */ +asmlinkage int sys_vfork(struct pt_regs regs) +{ + return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, ®s, 0, NULL, NULL); +} + +/* + * sys_execve() executes a new program. + */ +asmlinkage int sys_execve(struct pt_regs regs) +{ + int error; + char * filename; + + filename = getname((char __user *) regs.ebx); + error = PTR_ERR(filename); + if (IS_ERR(filename)) + goto out; + error = do_execve(filename, + (char __user * __user *) regs.ecx, + (char __user * __user *) regs.edx, + ®s); + if (error == 0) { + task_lock(current); + current->ptrace &= ~PT_DTRACE; + task_unlock(current); + /* Make sure we don't return using sysenter.. */ + set_thread_flag(TIF_IRET); + } + putname(filename); +out: + return error; +} + +#define top_esp (THREAD_SIZE - sizeof(unsigned long)) +#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) + +unsigned long get_wchan(struct task_struct *p) +{ + unsigned long ebp, esp, eip; + unsigned long stack_page; + int count = 0; + if (!p || p == current || p->state == TASK_RUNNING) + return 0; + stack_page = (unsigned long)task_stack_page(p); + esp = p->thread.esp; + if (!stack_page || esp < stack_page || esp > top_esp+stack_page) + return 0; + /* include/asm-i386/system.h:switch_to() pushes ebp last. */ + ebp = *(unsigned long *) esp; + do { + if (ebp < stack_page || ebp > top_ebp+stack_page) + return 0; + eip = *(unsigned long *) (ebp+4); + if (!in_sched_functions(eip)) + return eip; + ebp = *(unsigned long *) ebp; + } while (count++ < 16); + return 0; +} + +/* + * sys_alloc_thread_area: get a yet unused TLS descriptor index. + */ +static int get_free_idx(void) +{ + struct thread_struct *t = ¤t->thread; + int idx; + + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) + if (desc_empty(t->tls_array + idx)) + return idx + GDT_ENTRY_TLS_MIN; + return -ESRCH; +} + +/* + * Set a given TLS descriptor: + */ +asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) +{ + struct thread_struct *t = ¤t->thread; + struct user_desc info; + struct desc_struct *desc; + int cpu, idx; + + if (copy_from_user(&info, u_info, sizeof(info))) + return -EFAULT; + idx = info.entry_number; + + /* + * index -1 means the kernel should try to find and + * allocate an empty descriptor: + */ + if (idx == -1) { + idx = get_free_idx(); + if (idx < 0) + return idx; + if (put_user(idx, &u_info->entry_number)) + return -EFAULT; + } + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = t->tls_array + idx - GDT_ENTRY_TLS_MIN; + + /* + * We must not get preempted while modifying the TLS. + */ + cpu = get_cpu(); + + if (LDT_empty(&info)) { + desc->a = 0; + desc->b = 0; + } else { + desc->a = LDT_entry_a(&info); + desc->b = LDT_entry_b(&info); + } + load_TLS(t, cpu); + + put_cpu(); + + return 0; +} + +/* + * Get the current Thread-Local Storage area: + */ + +#define GET_BASE(desc) ( \ + (((desc)->a >> 16) & 0x0000ffff) | \ + (((desc)->b << 16) & 0x00ff0000) | \ + ( (desc)->b & 0xff000000) ) + +#define GET_LIMIT(desc) ( \ + ((desc)->a & 0x0ffff) | \ + ((desc)->b & 0xf0000) ) + +#define GET_32BIT(desc) (((desc)->b >> 22) & 1) +#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) +#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) +#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) +#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) +#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) + +asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) +{ + struct user_desc info; + struct desc_struct *desc; + int idx; + + if (get_user(idx, &u_info->entry_number)) + return -EFAULT; + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + memset(&info, 0, sizeof(info)); + + desc = current->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + + info.entry_number = idx; + info.base_addr = GET_BASE(desc); + info.limit = GET_LIMIT(desc); + info.seg_32bit = GET_32BIT(desc); + info.contents = GET_CONTENTS(desc); + info.read_exec_only = !GET_WRITABLE(desc); + info.limit_in_pages = GET_LIMIT_PAGES(desc); + info.seg_not_present = !GET_PRESENT(desc); + info.useable = GET_USEABLE(desc); + + if (copy_to_user(u_info, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} diff --git a/arch/x86/kernel/ptrace_32.c b/arch/x86/kernel/ptrace_32.c new file mode 100644 index 000000000000..7c1b92522e95 --- /dev/null +++ b/arch/x86/kernel/ptrace_32.c @@ -0,0 +1,723 @@ +/* ptrace.c */ +/* By Ross Biro 1/23/92 */ +/* + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * does not yet catch signals sent when the child dies. + * in exit.c or in signal.c. + */ + +/* + * Determines which flags the user has access to [1 = access, 0 = no access]. + * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), NT(14), IOPL(12-13), IF(9). + * Also masks reserved bits (31-22, 15, 5, 3, 1). + */ +#define FLAG_MASK 0x00050dd5 + +/* set's the trap flag. */ +#define TRAP_FLAG 0x100 + +/* + * Offset of eflags on child stack.. + */ +#define EFL_OFFSET offsetof(struct pt_regs, eflags) + +static inline struct pt_regs *get_child_regs(struct task_struct *task) +{ + void *stack_top = (void *)task->thread.esp0; + return stack_top - sizeof(struct pt_regs); +} + +/* + * This routine will get a word off of the processes privileged stack. + * the offset is bytes into the pt_regs structure on the stack. + * This routine assumes that all the privileged stacks are in our + * data space. + */ +static inline int get_stack_long(struct task_struct *task, int offset) +{ + unsigned char *stack; + + stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs); + stack += offset; + return (*((int *)stack)); +} + +/* + * This routine will put a word on the processes privileged stack. + * the offset is bytes into the pt_regs structure on the stack. + * This routine assumes that all the privileged stacks are in our + * data space. + */ +static inline int put_stack_long(struct task_struct *task, int offset, + unsigned long data) +{ + unsigned char * stack; + + stack = (unsigned char *)task->thread.esp0 - sizeof(struct pt_regs); + stack += offset; + *(unsigned long *) stack = data; + return 0; +} + +static int putreg(struct task_struct *child, + unsigned long regno, unsigned long value) +{ + switch (regno >> 2) { + case GS: + if (value && (value & 3) != 3) + return -EIO; + child->thread.gs = value; + return 0; + case DS: + case ES: + case FS: + if (value && (value & 3) != 3) + return -EIO; + value &= 0xffff; + break; + case SS: + case CS: + if ((value & 3) != 3) + return -EIO; + value &= 0xffff; + break; + case EFL: + value &= FLAG_MASK; + value |= get_stack_long(child, EFL_OFFSET) & ~FLAG_MASK; + break; + } + if (regno > FS*4) + regno -= 1*4; + put_stack_long(child, regno, value); + return 0; +} + +static unsigned long getreg(struct task_struct *child, + unsigned long regno) +{ + unsigned long retval = ~0UL; + + switch (regno >> 2) { + case GS: + retval = child->thread.gs; + break; + case DS: + case ES: + case FS: + case SS: + case CS: + retval = 0xffff; + /* fall through */ + default: + if (regno > FS*4) + regno -= 1*4; + retval &= get_stack_long(child, regno); + } + return retval; +} + +#define LDT_SEGMENT 4 + +static unsigned long convert_eip_to_linear(struct task_struct *child, struct pt_regs *regs) +{ + unsigned long addr, seg; + + addr = regs->eip; + seg = regs->xcs & 0xffff; + if (regs->eflags & VM_MASK) { + addr = (addr & 0xffff) + (seg << 4); + return addr; + } + + /* + * We'll assume that the code segments in the GDT + * are all zero-based. That is largely true: the + * TLS segments are used for data, and the PNPBIOS + * and APM bios ones we just ignore here. + */ + if (seg & LDT_SEGMENT) { + u32 *desc; + unsigned long base; + + seg &= ~7UL; + + down(&child->mm->context.sem); + if (unlikely((seg >> 3) >= child->mm->context.size)) + addr = -1L; /* bogus selector, access would fault */ + else { + desc = child->mm->context.ldt + seg; + base = ((desc[0] >> 16) | + ((desc[1] & 0xff) << 16) | + (desc[1] & 0xff000000)); + + /* 16-bit code segment? */ + if (!((desc[1] >> 22) & 1)) + addr &= 0xffff; + addr += base; + } + up(&child->mm->context.sem); + } + return addr; +} + +static inline int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) +{ + int i, copied; + unsigned char opcode[15]; + unsigned long addr = convert_eip_to_linear(child, regs); + + copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0); + for (i = 0; i < copied; i++) { + switch (opcode[i]) { + /* popf and iret */ + case 0x9d: case 0xcf: + return 1; + /* opcode and address size prefixes */ + case 0x66: case 0x67: + continue; + /* irrelevant prefixes (segment overrides and repeats) */ + case 0x26: case 0x2e: + case 0x36: case 0x3e: + case 0x64: case 0x65: + case 0xf0: case 0xf2: case 0xf3: + continue; + + /* + * pushf: NOTE! We should probably not let + * the user see the TF bit being set. But + * it's more pain than it's worth to avoid + * it, and a debugger could emulate this + * all in user space if it _really_ cares. + */ + case 0x9c: + default: + return 0; + } + } + return 0; +} + +static void set_singlestep(struct task_struct *child) +{ + struct pt_regs *regs = get_child_regs(child); + + /* + * Always set TIF_SINGLESTEP - this guarantees that + * we single-step system calls etc.. This will also + * cause us to set TF when returning to user mode. + */ + set_tsk_thread_flag(child, TIF_SINGLESTEP); + + /* + * If TF was already set, don't do anything else + */ + if (regs->eflags & TRAP_FLAG) + return; + + /* Set TF on the kernel stack.. */ + regs->eflags |= TRAP_FLAG; + + /* + * ..but if TF is changed by the instruction we will trace, + * don't mark it as being "us" that set it, so that we + * won't clear it by hand later. + */ + if (is_setting_trap_flag(child, regs)) + return; + + child->ptrace |= PT_DTRACE; +} + +static void clear_singlestep(struct task_struct *child) +{ + /* Always clear TIF_SINGLESTEP... */ + clear_tsk_thread_flag(child, TIF_SINGLESTEP); + + /* But touch TF only if it was set by us.. */ + if (child->ptrace & PT_DTRACE) { + struct pt_regs *regs = get_child_regs(child); + regs->eflags &= ~TRAP_FLAG; + child->ptrace &= ~PT_DTRACE; + } +} + +/* + * Called by kernel/ptrace.c when detaching.. + * + * Make sure the single step bit is not set. + */ +void ptrace_disable(struct task_struct *child) +{ + clear_singlestep(child); + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); +} + +/* + * Perform get_thread_area on behalf of the traced child. + */ +static int +ptrace_get_thread_area(struct task_struct *child, + int idx, struct user_desc __user *user_desc) +{ + struct user_desc info; + struct desc_struct *desc; + +/* + * Get the current Thread-Local Storage area: + */ + +#define GET_BASE(desc) ( \ + (((desc)->a >> 16) & 0x0000ffff) | \ + (((desc)->b << 16) & 0x00ff0000) | \ + ( (desc)->b & 0xff000000) ) + +#define GET_LIMIT(desc) ( \ + ((desc)->a & 0x0ffff) | \ + ((desc)->b & 0xf0000) ) + +#define GET_32BIT(desc) (((desc)->b >> 22) & 1) +#define GET_CONTENTS(desc) (((desc)->b >> 10) & 3) +#define GET_WRITABLE(desc) (((desc)->b >> 9) & 1) +#define GET_LIMIT_PAGES(desc) (((desc)->b >> 23) & 1) +#define GET_PRESENT(desc) (((desc)->b >> 15) & 1) +#define GET_USEABLE(desc) (((desc)->b >> 20) & 1) + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + + info.entry_number = idx; + info.base_addr = GET_BASE(desc); + info.limit = GET_LIMIT(desc); + info.seg_32bit = GET_32BIT(desc); + info.contents = GET_CONTENTS(desc); + info.read_exec_only = !GET_WRITABLE(desc); + info.limit_in_pages = GET_LIMIT_PAGES(desc); + info.seg_not_present = !GET_PRESENT(desc); + info.useable = GET_USEABLE(desc); + + if (copy_to_user(user_desc, &info, sizeof(info))) + return -EFAULT; + + return 0; +} + +/* + * Perform set_thread_area on behalf of the traced child. + */ +static int +ptrace_set_thread_area(struct task_struct *child, + int idx, struct user_desc __user *user_desc) +{ + struct user_desc info; + struct desc_struct *desc; + + if (copy_from_user(&info, user_desc, sizeof(info))) + return -EFAULT; + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + desc = child->thread.tls_array + idx - GDT_ENTRY_TLS_MIN; + if (LDT_empty(&info)) { + desc->a = 0; + desc->b = 0; + } else { + desc->a = LDT_entry_a(&info); + desc->b = LDT_entry_b(&info); + } + + return 0; +} + +long arch_ptrace(struct task_struct *child, long request, long addr, long data) +{ + struct user * dummy = NULL; + int i, ret; + unsigned long __user *datap = (unsigned long __user *)data; + + switch (request) { + /* when I and D space are separate, these will need to be fixed. */ + case PTRACE_PEEKTEXT: /* read word at location addr. */ + case PTRACE_PEEKDATA: + ret = generic_ptrace_peekdata(child, addr, data); + break; + + /* read the word at location addr in the USER area. */ + case PTRACE_PEEKUSR: { + unsigned long tmp; + + ret = -EIO; + if ((addr & 3) || addr < 0 || + addr > sizeof(struct user) - 3) + break; + + tmp = 0; /* Default return condition */ + if(addr < FRAME_SIZE*sizeof(long)) + tmp = getreg(child, addr); + if(addr >= (long) &dummy->u_debugreg[0] && + addr <= (long) &dummy->u_debugreg[7]){ + addr -= (long) &dummy->u_debugreg[0]; + addr = addr >> 2; + tmp = child->thread.debugreg[addr]; + } + ret = put_user(tmp, datap); + break; + } + + /* when I and D space are separate, this will have to be fixed. */ + case PTRACE_POKETEXT: /* write the word at location addr. */ + case PTRACE_POKEDATA: + ret = generic_ptrace_pokedata(child, addr, data); + break; + + case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ + ret = -EIO; + if ((addr & 3) || addr < 0 || + addr > sizeof(struct user) - 3) + break; + + if (addr < FRAME_SIZE*sizeof(long)) { + ret = putreg(child, addr, data); + break; + } + /* We need to be very careful here. We implicitly + want to modify a portion of the task_struct, and we + have to be selective about what portions we allow someone + to modify. */ + + ret = -EIO; + if(addr >= (long) &dummy->u_debugreg[0] && + addr <= (long) &dummy->u_debugreg[7]){ + + if(addr == (long) &dummy->u_debugreg[4]) break; + if(addr == (long) &dummy->u_debugreg[5]) break; + if(addr < (long) &dummy->u_debugreg[4] && + ((unsigned long) data) >= TASK_SIZE-3) break; + + /* Sanity-check data. Take one half-byte at once with + * check = (val >> (16 + 4*i)) & 0xf. It contains the + * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits + * 2 and 3 are LENi. Given a list of invalid values, + * we do mask |= 1 << invalid_value, so that + * (mask >> check) & 1 is a correct test for invalid + * values. + * + * R/Wi contains the type of the breakpoint / + * watchpoint, LENi contains the length of the watched + * data in the watchpoint case. + * + * The invalid values are: + * - LENi == 0x10 (undefined), so mask |= 0x0f00. + * - R/Wi == 0x10 (break on I/O reads or writes), so + * mask |= 0x4444. + * - R/Wi == 0x00 && LENi != 0x00, so we have mask |= + * 0x1110. + * + * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54. + * + * See the Intel Manual "System Programming Guide", + * 15.2.4 + * + * Note that LENi == 0x10 is defined on x86_64 in long + * mode (i.e. even for 32-bit userspace software, but + * 64-bit kernel), so the x86_64 mask value is 0x5454. + * See the AMD manual no. 24593 (AMD64 System + * Programming)*/ + + if(addr == (long) &dummy->u_debugreg[7]) { + data &= ~DR_CONTROL_RESERVED; + for(i=0; i<4; i++) + if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1) + goto out_tsk; + if (data) + set_tsk_thread_flag(child, TIF_DEBUG); + else + clear_tsk_thread_flag(child, TIF_DEBUG); + } + addr -= (long) &dummy->u_debugreg; + addr = addr >> 2; + child->thread.debugreg[addr] = data; + ret = 0; + } + break; + + case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */ + case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ + case PTRACE_CONT: /* restart after signal. */ + ret = -EIO; + if (!valid_signal(data)) + break; + if (request == PTRACE_SYSEMU) { + set_tsk_thread_flag(child, TIF_SYSCALL_EMU); + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + } else if (request == PTRACE_SYSCALL) { + set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); + } else { + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + } + child->exit_code = data; + /* make sure the single step bit is not set. */ + clear_singlestep(child); + wake_up_process(child); + ret = 0; + break; + +/* + * make the child exit. Best I can do is send it a sigkill. + * perhaps it should be put in the status that it wants to + * exit. + */ + case PTRACE_KILL: + ret = 0; + if (child->exit_state == EXIT_ZOMBIE) /* already dead */ + break; + child->exit_code = SIGKILL; + /* make sure the single step bit is not set. */ + clear_singlestep(child); + wake_up_process(child); + break; + + case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */ + case PTRACE_SINGLESTEP: /* set the trap flag. */ + ret = -EIO; + if (!valid_signal(data)) + break; + + if (request == PTRACE_SYSEMU_SINGLESTEP) + set_tsk_thread_flag(child, TIF_SYSCALL_EMU); + else + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); + + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + set_singlestep(child); + child->exit_code = data; + /* give it a chance to run. */ + wake_up_process(child); + ret = 0; + break; + + case PTRACE_DETACH: + /* detach a process that was attached. */ + ret = ptrace_detach(child, data); + break; + + case PTRACE_GETREGS: { /* Get all gp regs from the child. */ + if (!access_ok(VERIFY_WRITE, datap, FRAME_SIZE*sizeof(long))) { + ret = -EIO; + break; + } + for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) { + __put_user(getreg(child, i), datap); + datap++; + } + ret = 0; + break; + } + + case PTRACE_SETREGS: { /* Set all gp regs in the child. */ + unsigned long tmp; + if (!access_ok(VERIFY_READ, datap, FRAME_SIZE*sizeof(long))) { + ret = -EIO; + break; + } + for ( i = 0; i < FRAME_SIZE*sizeof(long); i += sizeof(long) ) { + __get_user(tmp, datap); + putreg(child, i, tmp); + datap++; + } + ret = 0; + break; + } + + case PTRACE_GETFPREGS: { /* Get the child FPU state. */ + if (!access_ok(VERIFY_WRITE, datap, + sizeof(struct user_i387_struct))) { + ret = -EIO; + break; + } + ret = 0; + if (!tsk_used_math(child)) + init_fpu(child); + get_fpregs((struct user_i387_struct __user *)data, child); + break; + } + + case PTRACE_SETFPREGS: { /* Set the child FPU state. */ + if (!access_ok(VERIFY_READ, datap, + sizeof(struct user_i387_struct))) { + ret = -EIO; + break; + } + set_stopped_child_used_math(child); + set_fpregs(child, (struct user_i387_struct __user *)data); + ret = 0; + break; + } + + case PTRACE_GETFPXREGS: { /* Get the child extended FPU state. */ + if (!access_ok(VERIFY_WRITE, datap, + sizeof(struct user_fxsr_struct))) { + ret = -EIO; + break; + } + if (!tsk_used_math(child)) + init_fpu(child); + ret = get_fpxregs((struct user_fxsr_struct __user *)data, child); + break; + } + + case PTRACE_SETFPXREGS: { /* Set the child extended FPU state. */ + if (!access_ok(VERIFY_READ, datap, + sizeof(struct user_fxsr_struct))) { + ret = -EIO; + break; + } + set_stopped_child_used_math(child); + ret = set_fpxregs(child, (struct user_fxsr_struct __user *)data); + break; + } + + case PTRACE_GET_THREAD_AREA: + ret = ptrace_get_thread_area(child, addr, + (struct user_desc __user *) data); + break; + + case PTRACE_SET_THREAD_AREA: + ret = ptrace_set_thread_area(child, addr, + (struct user_desc __user *) data); + break; + + default: + ret = ptrace_request(child, request, addr, data); + break; + } + out_tsk: + return ret; +} + +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) +{ + struct siginfo info; + + tsk->thread.trap_no = 1; + tsk->thread.error_code = error_code; + + memset(&info, 0, sizeof(info)); + info.si_signo = SIGTRAP; + info.si_code = TRAP_BRKPT; + + /* User-mode eip? */ + info.si_addr = user_mode_vm(regs) ? (void __user *) regs->eip : NULL; + + /* Send us the fakey SIGTRAP */ + force_sig_info(SIGTRAP, &info, tsk); +} + +/* notification of system call entry/exit + * - triggered by current->work.syscall_trace + */ +__attribute__((regparm(3))) +int do_syscall_trace(struct pt_regs *regs, int entryexit) +{ + int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); + /* + * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall + * interception + */ + int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP); + int ret = 0; + + /* do the secure computing check first */ + if (!entryexit) + secure_computing(regs->orig_eax); + + if (unlikely(current->audit_context)) { + if (entryexit) + audit_syscall_exit(AUDITSC_RESULT(regs->eax), + regs->eax); + /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only + * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is + * not used, entry.S will call us only on syscall exit, not + * entry; so when TIF_SYSCALL_AUDIT is used we must avoid + * calling send_sigtrap() on syscall entry. + * + * Note that when PTRACE_SYSEMU_SINGLESTEP is used, + * is_singlestep is false, despite his name, so we will still do + * the correct thing. + */ + else if (is_singlestep) + goto out; + } + + if (!(current->ptrace & PT_PTRACED)) + goto out; + + /* If a process stops on the 1st tracepoint with SYSCALL_TRACE + * and then is resumed with SYSEMU_SINGLESTEP, it will come in + * here. We have to check this and return */ + if (is_sysemu && entryexit) + return 0; + + /* Fake a debug trap */ + if (is_singlestep) + send_sigtrap(current, regs, 0); + + if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) + goto out; + + /* the 0x80 provides a way for the tracing parent to distinguish + between a syscall stop and SIGTRAP delivery */ + /* Note that the debugger could change the result of test_thread_flag!*/ + ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0)); + + /* + * this isn't the same as continuing with a signal, but it will do + * for normal use. strace only continues with a signal if the + * stopping signal is not SIGTRAP. -brl + */ + if (current->exit_code) { + send_sig(current->exit_code, current, 1); + current->exit_code = 0; + } + ret = is_sysemu; +out: + if (unlikely(current->audit_context) && !entryexit) + audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_eax, + regs->ebx, regs->ecx, regs->edx, regs->esi); + if (ret == 0) + return 0; + + regs->orig_eax = -1; /* force skip of syscall restarting */ + if (unlikely(current->audit_context)) + audit_syscall_exit(AUDITSC_RESULT(regs->eax), regs->eax); + return 1; +} diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c new file mode 100644 index 000000000000..6722469c2633 --- /dev/null +++ b/arch/x86/kernel/quirks.c @@ -0,0 +1,49 @@ +/* + * This file contains work-arounds for x86 and x86_64 platform bugs. + */ +#include +#include + +#if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) + +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) +{ + u8 config, rev; + u32 word; + + /* BIOS may enable hardware IRQ balancing for + * E7520/E7320/E7525(revision ID 0x9 and below) + * based platforms. + * Disable SW irqbalance/affinity on those platforms. + */ + pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); + if (rev > 0x9) + return; + + /* enable access to config space*/ + pci_read_config_byte(dev, 0xf4, &config); + pci_write_config_byte(dev, 0xf4, config|0x2); + + /* read xTPR register */ + raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); + + if (!(word & (1 << 13))) { + printk(KERN_INFO "Intel E7520/7320/7525 detected. " + "Disabling irq balancing and affinity\n"); +#ifdef CONFIG_IRQBALANCE + irqbalance_disable(""); +#endif + noirqdebug_setup(""); +#ifdef CONFIG_PROC_FS + no_irq_affinity = 1; +#endif + } + + /* put back the original value for config space*/ + if (!(config & 0x2)) + pci_write_config_byte(dev, 0xf4, config); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); +#endif diff --git a/arch/x86/kernel/reboot_32.c b/arch/x86/kernel/reboot_32.c new file mode 100644 index 000000000000..0d796248866c --- /dev/null +++ b/arch/x86/kernel/reboot_32.c @@ -0,0 +1,413 @@ +/* + * linux/arch/i386/kernel/reboot.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mach_reboot.h" +#include +#include + +/* + * Power off function, if any + */ +void (*pm_power_off)(void); +EXPORT_SYMBOL(pm_power_off); + +static int reboot_mode; +static int reboot_thru_bios; + +#ifdef CONFIG_SMP +static int reboot_cpu = -1; +#endif +static int __init reboot_setup(char *str) +{ + while(1) { + switch (*str) { + case 'w': /* "warm" reboot (no memory testing etc) */ + reboot_mode = 0x1234; + break; + case 'c': /* "cold" reboot (with memory testing etc) */ + reboot_mode = 0x0; + break; + case 'b': /* "bios" reboot by jumping through the BIOS */ + reboot_thru_bios = 1; + break; + case 'h': /* "hard" reboot by toggling RESET and/or crashing the CPU */ + reboot_thru_bios = 0; + break; +#ifdef CONFIG_SMP + case 's': /* "smp" reboot by executing reset on BSP or other CPU*/ + if (isdigit(*(str+1))) { + reboot_cpu = (int) (*(str+1) - '0'); + if (isdigit(*(str+2))) + reboot_cpu = reboot_cpu*10 + (int)(*(str+2) - '0'); + } + /* we will leave sorting out the final value + when we are ready to reboot, since we might not + have set up boot_cpu_id or smp_num_cpu */ + break; +#endif + } + if((str = strchr(str,',')) != NULL) + str++; + else + break; + } + return 1; +} + +__setup("reboot=", reboot_setup); + +/* + * Reboot options and system auto-detection code provided by + * Dell Inc. so their systems "just work". :-) + */ + +/* + * Some machines require the "reboot=b" commandline option, this quirk makes that automatic. + */ +static int __init set_bios_reboot(struct dmi_system_id *d) +{ + if (!reboot_thru_bios) { + reboot_thru_bios = 1; + printk(KERN_INFO "%s series board detected. Selecting BIOS-method for reboots.\n", d->ident); + } + return 0; +} + +static struct dmi_system_id __initdata reboot_dmi_table[] = { + { /* Handle problems with rebooting on Dell E520's */ + .callback = set_bios_reboot, + .ident = "Dell E520", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Dell DM061"), + }, + }, + { /* Handle problems with rebooting on Dell 1300's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 1300", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 1300/"), + }, + }, + { /* Handle problems with rebooting on Dell 300's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 300", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 300/"), + }, + }, + { /* Handle problems with rebooting on Dell Optiplex 745's SFF*/ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 745", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 745"), + DMI_MATCH(DMI_BOARD_NAME, "0WF810"), + }, + }, + { /* Handle problems with rebooting on Dell 2400's */ + .callback = set_bios_reboot, + .ident = "Dell PowerEdge 2400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Computer Corporation"), + DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), + }, + }, + { /* Handle problems with rebooting on HP laptops */ + .callback = set_bios_reboot, + .ident = "HP Compaq Laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq"), + }, + }, + { } +}; + +static int __init reboot_init(void) +{ + dmi_check_system(reboot_dmi_table); + return 0; +} + +core_initcall(reboot_init); + +/* The following code and data reboots the machine by switching to real + mode and jumping to the BIOS reset entry point, as if the CPU has + really been reset. The previous version asked the keyboard + controller to pulse the CPU reset line, which is more thorough, but + doesn't work with at least one type of 486 motherboard. It is easy + to stop this code working; hence the copious comments. */ + +static unsigned long long +real_mode_gdt_entries [3] = +{ + 0x0000000000000000ULL, /* Null descriptor */ + 0x00009a000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */ + 0x000092000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */ +}; + +static struct Xgt_desc_struct +real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries }, +real_mode_idt = { 0x3ff, 0 }, +no_idt = { 0, 0 }; + + +/* This is 16-bit protected mode code to disable paging and the cache, + switch to real mode and jump to the BIOS reset code. + + The instruction that switches to real mode by writing to CR0 must be + followed immediately by a far jump instruction, which set CS to a + valid value for real mode, and flushes the prefetch queue to avoid + running instructions that have already been decoded in protected + mode. + + Clears all the flags except ET, especially PG (paging), PE + (protected-mode enable) and TS (task switch for coprocessor state + save). Flushes the TLB after paging has been disabled. Sets CD and + NW, to disable the cache on a 486, and invalidates the cache. This + is more like the state of a 486 after reset. I don't know if + something else should be done for other chips. + + More could be done here to set up the registers as if a CPU reset had + occurred; hopefully real BIOSs don't assume much. */ + +static unsigned char real_mode_switch [] = +{ + 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ + 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */ + 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */ + 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */ + 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */ + 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */ + 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */ + 0x74, 0x02, /* jz f */ + 0x0f, 0x09, /* wbinvd */ + 0x24, 0x10, /* f: andb $0x10,al */ + 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */ +}; +static unsigned char jump_to_bios [] = +{ + 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */ +}; + +/* + * Switch to real mode and then execute the code + * specified by the code and length parameters. + * We assume that length will aways be less that 100! + */ +void machine_real_restart(unsigned char *code, int length) +{ + local_irq_disable(); + + /* Write zero to CMOS register number 0x0f, which the BIOS POST + routine will recognize as telling it to do a proper reboot. (Well + that's what this book in front of me says -- it may only apply to + the Phoenix BIOS though, it's not clear). At the same time, + disable NMIs by setting the top bit in the CMOS address register, + as we're about to do peculiar things to the CPU. I'm not sure if + `outb_p' is needed instead of just `outb'. Use it to be on the + safe side. (Yes, CMOS_WRITE does outb_p's. - Paul G.) + */ + + spin_lock(&rtc_lock); + CMOS_WRITE(0x00, 0x8f); + spin_unlock(&rtc_lock); + + /* Remap the kernel at virtual address zero, as well as offset zero + from the kernel segment. This assumes the kernel segment starts at + virtual address PAGE_OFFSET. */ + + memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, + sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS); + + /* + * Use `swapper_pg_dir' as our page directory. + */ + load_cr3(swapper_pg_dir); + + /* Write 0x1234 to absolute memory location 0x472. The BIOS reads + this on booting to tell it to "Bypass memory test (also warm + boot)". This seems like a fairly standard thing that gets set by + REBOOT.COM programs, and the previous reset routine did this + too. */ + + *((unsigned short *)0x472) = reboot_mode; + + /* For the switch to real mode, copy some code to low memory. It has + to be in the first 64k because it is running in 16-bit mode, and it + has to have the same physical and virtual address, because it turns + off paging. Copy it near the end of the first page, out of the way + of BIOS variables. */ + + memcpy ((void *) (0x1000 - sizeof (real_mode_switch) - 100), + real_mode_switch, sizeof (real_mode_switch)); + memcpy ((void *) (0x1000 - 100), code, length); + + /* Set up the IDT for real mode. */ + + load_idt(&real_mode_idt); + + /* Set up a GDT from which we can load segment descriptors for real + mode. The GDT is not used in real mode; it is just needed here to + prepare the descriptors. */ + + load_gdt(&real_mode_gdt); + + /* Load the data segment registers, and thus the descriptors ready for + real mode. The base address of each segment is 0x100, 16 times the + selector value being loaded here. This is so that the segment + registers don't have to be reloaded after switching to real mode: + the values are consistent for real mode operation already. */ + + __asm__ __volatile__ ("movl $0x0010,%%eax\n" + "\tmovl %%eax,%%ds\n" + "\tmovl %%eax,%%es\n" + "\tmovl %%eax,%%fs\n" + "\tmovl %%eax,%%gs\n" + "\tmovl %%eax,%%ss" : : : "eax"); + + /* Jump to the 16-bit code that we copied earlier. It disables paging + and the cache, switches to real mode, and jumps to the BIOS reset + entry point. */ + + __asm__ __volatile__ ("ljmp $0x0008,%0" + : + : "i" ((void *) (0x1000 - sizeof (real_mode_switch) - 100))); +} +#ifdef CONFIG_APM_MODULE +EXPORT_SYMBOL(machine_real_restart); +#endif + +static void native_machine_shutdown(void) +{ +#ifdef CONFIG_SMP + int reboot_cpu_id; + + /* The boot cpu is always logical cpu 0 */ + reboot_cpu_id = 0; + + /* See if there has been given a command line override */ + if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && + cpu_isset(reboot_cpu, cpu_online_map)) { + reboot_cpu_id = reboot_cpu; + } + + /* Make certain the cpu I'm rebooting on is online */ + if (!cpu_isset(reboot_cpu_id, cpu_online_map)) { + reboot_cpu_id = smp_processor_id(); + } + + /* Make certain I only run on the appropriate processor */ + set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id)); + + /* O.K. Now that I'm on the appropriate processor, stop + * all of the others, and disable their local APICs. + */ + + smp_send_stop(); +#endif /* CONFIG_SMP */ + + lapic_shutdown(); + +#ifdef CONFIG_X86_IO_APIC + disable_IO_APIC(); +#endif +} + +void __attribute__((weak)) mach_reboot_fixups(void) +{ +} + +static void native_machine_emergency_restart(void) +{ + if (!reboot_thru_bios) { + if (efi_enabled) { + efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); + load_idt(&no_idt); + __asm__ __volatile__("int3"); + } + /* rebooting needs to touch the page at absolute addr 0 */ + *((unsigned short *)__va(0x472)) = reboot_mode; + for (;;) { + mach_reboot_fixups(); /* for board specific fixups */ + mach_reboot(); + /* That didn't work - force a triple fault.. */ + load_idt(&no_idt); + __asm__ __volatile__("int3"); + } + } + if (efi_enabled) + efi.reset_system(EFI_RESET_WARM, EFI_SUCCESS, 0, NULL); + + machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); +} + +static void native_machine_restart(char * __unused) +{ + machine_shutdown(); + machine_emergency_restart(); +} + +static void native_machine_halt(void) +{ +} + +static void native_machine_power_off(void) +{ + if (pm_power_off) { + machine_shutdown(); + pm_power_off(); + } +} + + +struct machine_ops machine_ops = { + .power_off = native_machine_power_off, + .shutdown = native_machine_shutdown, + .emergency_restart = native_machine_emergency_restart, + .restart = native_machine_restart, + .halt = native_machine_halt, +}; + +void machine_power_off(void) +{ + machine_ops.power_off(); +} + +void machine_shutdown(void) +{ + machine_ops.shutdown(); +} + +void machine_emergency_restart(void) +{ + machine_ops.emergency_restart(); +} + +void machine_restart(char *cmd) +{ + machine_ops.restart(cmd); +} + +void machine_halt(void) +{ + machine_ops.halt(); +} diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c new file mode 100644 index 000000000000..03e1cce58f49 --- /dev/null +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -0,0 +1,68 @@ +/* + * linux/arch/i386/kernel/reboot_fixups.c + * + * This is a good place to put board specific reboot fixups. + * + * List of supported fixups: + * geode-gx1/cs5530a - Jaya Kumar + * geode-gx/lx/cs5536 - Andres Salomon + * + */ + +#include +#include +#include +#include + +static void cs5530a_warm_reset(struct pci_dev *dev) +{ + /* writing 1 to the reset control register, 0x44 causes the + cs5530a to perform a system warm reset */ + pci_write_config_byte(dev, 0x44, 0x1); + udelay(50); /* shouldn't get here but be safe and spin-a-while */ + return; +} + +static void cs5536_warm_reset(struct pci_dev *dev) +{ + /* + * 6.6.2.12 Soft Reset (DIVIL_SOFT_RESET) + * writing 1 to the LSB of this MSR causes a hard reset. + */ + wrmsrl(0x51400017, 1ULL); + udelay(50); /* shouldn't get here but be safe and spin a while */ +} + +struct device_fixup { + unsigned int vendor; + unsigned int device; + void (*reboot_fixup)(struct pci_dev *); +}; + +static struct device_fixup fixups_table[] = { +{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, +{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, +}; + +/* + * we see if any fixup is available for our current hardware. if there + * is a fixup, we call it and we expect to never return from it. if we + * do return, we keep looking and then eventually fall back to the + * standard mach_reboot on return. + */ +void mach_reboot_fixups(void) +{ + struct device_fixup *cur; + struct pci_dev *dev; + int i; + + for (i=0; i < ARRAY_SIZE(fixups_table); i++) { + cur = &(fixups_table[i]); + dev = pci_get_device(cur->vendor, cur->device, NULL); + if (!dev) + continue; + + cur->reboot_fixup(dev); + } +} + diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S new file mode 100644 index 000000000000..f151d6fae462 --- /dev/null +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -0,0 +1,252 @@ +/* + * relocate_kernel.S - put the kernel image in place to boot + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include + +/* + * Must be relocatable PIC code callable as a C function + */ + +#define PTR(x) (x << 2) +#define PAGE_ALIGNED (1 << PAGE_SHIFT) +#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ +#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */ + + .text + .align PAGE_ALIGNED + .globl relocate_kernel +relocate_kernel: + movl 8(%esp), %ebp /* list of pages */ + +#ifdef CONFIG_X86_PAE + /* map the control page at its virtual address */ + + movl PTR(VA_PGD)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0xc0000000, %eax + shrl $27, %eax + addl %edi, %eax + + movl PTR(PA_PMD_0)(%ebp), %edx + orl $PAE_PGD_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PMD_0)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0x3fe00000, %eax + shrl $18, %eax + addl %edi, %eax + + movl PTR(PA_PTE_0)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PTE_0)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0x001ff000, %eax + shrl $9, %eax + addl %edi, %eax + + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + /* identity map the control page at its physical address */ + + movl PTR(VA_PGD)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0xc0000000, %eax + shrl $27, %eax + addl %edi, %eax + + movl PTR(PA_PMD_1)(%ebp), %edx + orl $PAE_PGD_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PMD_1)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0x3fe00000, %eax + shrl $18, %eax + addl %edi, %eax + + movl PTR(PA_PTE_1)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PTE_1)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0x001ff000, %eax + shrl $9, %eax + addl %edi, %eax + + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) +#else + /* map the control page at its virtual address */ + + movl PTR(VA_PGD)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0xffc00000, %eax + shrl $20, %eax + addl %edi, %eax + + movl PTR(PA_PTE_0)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PTE_0)(%ebp), %edi + movl PTR(VA_CONTROL_PAGE)(%ebp), %eax + andl $0x003ff000, %eax + shrl $10, %eax + addl %edi, %eax + + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + /* identity map the control page at its physical address */ + + movl PTR(VA_PGD)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0xffc00000, %eax + shrl $20, %eax + addl %edi, %eax + + movl PTR(PA_PTE_1)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) + + movl PTR(VA_PTE_1)(%ebp), %edi + movl PTR(PA_CONTROL_PAGE)(%ebp), %eax + andl $0x003ff000, %eax + shrl $10, %eax + addl %edi, %eax + + movl PTR(PA_CONTROL_PAGE)(%ebp), %edx + orl $PAGE_ATTR, %edx + movl %edx, (%eax) +#endif + +relocate_new_kernel: + /* read the arguments and say goodbye to the stack */ + movl 4(%esp), %ebx /* page_list */ + movl 8(%esp), %ebp /* list of pages */ + movl 12(%esp), %edx /* start address */ + movl 16(%esp), %ecx /* cpu_has_pae */ + + /* zero out flags, and disable interrupts */ + pushl $0 + popfl + + /* get physical address of control page now */ + /* this is impossible after page table switch */ + movl PTR(PA_CONTROL_PAGE)(%ebp), %edi + + /* switch to new set of page tables */ + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, %cr3 + + /* setup a new stack at the end of the physical control page */ + lea 4096(%edi), %esp + + /* jump to identity mapped page */ + movl %edi, %eax + addl $(identity_mapped - relocate_kernel), %eax + pushl %eax + ret + +identity_mapped: + /* store the start address on the stack */ + pushl %edx + + /* Set cr0 to a known state: + * 31 0 == Paging disabled + * 18 0 == Alignment check disabled + * 16 0 == Write protect disabled + * 3 0 == No task switch + * 2 0 == Don't do FP software emulation. + * 0 1 == Proctected mode enabled + */ + movl %cr0, %eax + andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax + orl $(1<<0), %eax + movl %eax, %cr0 + + /* clear cr4 if applicable */ + testl %ecx, %ecx + jz 1f + /* Set cr4 to a known state: + * Setting everything to zero seems safe. + */ + movl %cr4, %eax + andl $0, %eax + movl %eax, %cr4 + + jmp 1f +1: + + /* Flush the TLB (needed?) */ + xorl %eax, %eax + movl %eax, %cr3 + + /* Do the copies */ + movl %ebx, %ecx + jmp 1f + +0: /* top, read another word from the indirection page */ + movl (%ebx), %ecx + addl $4, %ebx +1: + testl $0x1, %ecx /* is it a destination page */ + jz 2f + movl %ecx, %edi + andl $0xfffff000, %edi + jmp 0b +2: + testl $0x2, %ecx /* is it an indirection page */ + jz 2f + movl %ecx, %ebx + andl $0xfffff000, %ebx + jmp 0b +2: + testl $0x4, %ecx /* is it the done indicator */ + jz 2f + jmp 3f +2: + testl $0x8, %ecx /* is it the source indicator */ + jz 0b /* Ignore it otherwise */ + movl %ecx, %esi /* For every source page do a copy */ + andl $0xfffff000, %esi + + movl $1024, %ecx + rep ; movsl + jmp 0b + +3: + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB, it's handy, and not processor dependent. + */ + xorl %eax, %eax + movl %eax, %cr3 + + /* set all of the registers to known values */ + /* leave %esp alone */ + + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %edi, %edi + xorl %ebp, %ebp + ret diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/kernel/scx200_32.c new file mode 100644 index 000000000000..c7d3df23f589 --- /dev/null +++ b/arch/x86/kernel/scx200_32.c @@ -0,0 +1,131 @@ +/* linux/arch/i386/kernel/scx200.c + + Copyright (c) 2001,2002 Christer Weinigel + + National Semiconductor SCx200 support. */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +/* Verify that the configuration block really is there */ +#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base)) + +#define NAME "scx200" + +MODULE_AUTHOR("Christer Weinigel "); +MODULE_DESCRIPTION("NatSemi SCx200 Driver"); +MODULE_LICENSE("GPL"); + +unsigned scx200_gpio_base = 0; +long scx200_gpio_shadow[2]; + +unsigned scx200_cb_base = 0; + +static struct pci_device_id scx200_tbl[] = { + { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_BRIDGE) }, + { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE) }, + { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SCx200_XBUS) }, + { PCI_DEVICE(PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_XBUS) }, + { }, +}; +MODULE_DEVICE_TABLE(pci,scx200_tbl); + +static int __devinit scx200_probe(struct pci_dev *, const struct pci_device_id *); + +static struct pci_driver scx200_pci_driver = { + .name = "scx200", + .id_table = scx200_tbl, + .probe = scx200_probe, +}; + +static DEFINE_MUTEX(scx200_gpio_config_lock); + +static void __devinit scx200_init_shadow(void) +{ + int bank; + + /* read the current values driven on the GPIO signals */ + for (bank = 0; bank < 2; ++bank) + scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank); +} + +static int __devinit scx200_probe(struct pci_dev *pdev, const struct pci_device_id *ent) +{ + unsigned base; + + if (pdev->device == PCI_DEVICE_ID_NS_SCx200_BRIDGE || + pdev->device == PCI_DEVICE_ID_NS_SC1100_BRIDGE) { + base = pci_resource_start(pdev, 0); + printk(KERN_INFO NAME ": GPIO base 0x%x\n", base); + + if (request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO") == 0) { + printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n"); + return -EBUSY; + } + + scx200_gpio_base = base; + scx200_init_shadow(); + + } else { + /* find the base of the Configuration Block */ + if (scx200_cb_probe(SCx200_CB_BASE_FIXED)) { + scx200_cb_base = SCx200_CB_BASE_FIXED; + } else { + pci_read_config_dword(pdev, SCx200_CBA_SCRATCH, &base); + if (scx200_cb_probe(base)) { + scx200_cb_base = base; + } else { + printk(KERN_WARNING NAME ": Configuration Block not found\n"); + return -ENODEV; + } + } + printk(KERN_INFO NAME ": Configuration Block base 0x%x\n", scx200_cb_base); + } + + return 0; +} + +u32 scx200_gpio_configure(unsigned index, u32 mask, u32 bits) +{ + u32 config, new_config; + + mutex_lock(&scx200_gpio_config_lock); + + outl(index, scx200_gpio_base + 0x20); + config = inl(scx200_gpio_base + 0x24); + + new_config = (config & mask) | bits; + outl(new_config, scx200_gpio_base + 0x24); + + mutex_unlock(&scx200_gpio_config_lock); + + return config; +} + +static int __init scx200_init(void) +{ + printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n"); + + return pci_register_driver(&scx200_pci_driver); +} + +static void __exit scx200_cleanup(void) +{ + pci_unregister_driver(&scx200_pci_driver); + release_region(scx200_gpio_base, SCx200_GPIO_SIZE); +} + +module_init(scx200_init); +module_exit(scx200_cleanup); + +EXPORT_SYMBOL(scx200_gpio_base); +EXPORT_SYMBOL(scx200_gpio_shadow); +EXPORT_SYMBOL(scx200_gpio_configure); +EXPORT_SYMBOL(scx200_cb_base); diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c new file mode 100644 index 000000000000..d474cd639bcb --- /dev/null +++ b/arch/x86/kernel/setup_32.c @@ -0,0 +1,653 @@ +/* + * linux/arch/i386/kernel/setup.c + * + * Copyright (C) 1995 Linus Torvalds + * + * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 + * + * Memory region support + * David Parsons , July-August 1999 + * + * Added E820 sanitization routine (removes overlapping memory regions); + * Brian Moyle , February 2001 + * + * Moved CPU detection code to cpu/${cpu}.c + * Patrick Mochel , March 2002 + * + * Provisions for empty E820 memory regions (reported by certain BIOSes). + * Alex Achenbach , December 2002. + * + */ + +/* + * This file handles the architecture-dependent parts of initialization + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include