summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig123
-rw-r--r--arch/x86/Kconfig.cpu13
-rw-r--r--arch/x86/Kconfig.debug50
-rw-r--r--arch/x86/Makefile7
-rw-r--r--arch/x86/boot/.gitignore5
-rw-r--r--arch/x86/boot/Makefile16
-rw-r--r--arch/x86/boot/a20.c2
-rw-r--r--arch/x86/boot/apm.c2
-rw-r--r--arch/x86/boot/bitops.h2
-rw-r--r--arch/x86/boot/boot.h7
-rw-r--r--arch/x86/boot/cmdline.c2
-rw-r--r--arch/x86/boot/compressed/Makefile2
-rw-r--r--arch/x86/boot/compressed/head_32.S15
-rw-r--r--arch/x86/boot/compressed/head_64.S30
-rw-r--r--arch/x86/boot/compressed/misc.c210
-rw-r--r--arch/x86/boot/compressed/vmlinux_64.lds4
-rw-r--r--arch/x86/boot/copy.S2
-rw-r--r--arch/x86/boot/cpucheck.c22
-rw-r--r--arch/x86/boot/edd.c2
-rw-r--r--arch/x86/boot/header.S10
-rw-r--r--arch/x86/boot/install.sh2
-rw-r--r--arch/x86/boot/main.c2
-rw-r--r--arch/x86/boot/mca.c2
-rw-r--r--arch/x86/boot/memory.c2
-rw-r--r--arch/x86/boot/pm.c4
-rw-r--r--arch/x86/boot/pmjump.S2
-rw-r--r--arch/x86/boot/printf.c2
-rw-r--r--arch/x86/boot/string.c2
-rw-r--r--arch/x86/boot/tools/build.c88
-rw-r--r--arch/x86/boot/tty.c2
-rw-r--r--arch/x86/boot/version.c2
-rw-r--r--arch/x86/boot/video-bios.c8
-rw-r--r--arch/x86/boot/video-mode.c173
-rw-r--r--arch/x86/boot/video-vesa.c10
-rw-r--r--arch/x86/boot/video-vga.c14
-rw-r--r--arch/x86/boot/video.c159
-rw-r--r--arch/x86/boot/video.h2
-rw-r--r--arch/x86/boot/voyager.c2
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/crypto/aes-i586-asm_32.S3
-rw-r--r--arch/x86/ia32/ia32_signal.c12
-rw-r--r--arch/x86/ia32/ia32entry.S14
-rw-r--r--arch/x86/ia32/sys_ia32.c32
-rw-r--r--arch/x86/kernel/Makefile31
-rw-r--r--arch/x86/kernel/acpi/Makefile9
-rw-r--r--arch/x86/kernel/acpi/boot.c71
-rw-r--r--arch/x86/kernel/acpi/cstate.c6
-rw-r--r--arch/x86/kernel/acpi/processor.c2
-rw-r--r--arch/x86/kernel/acpi/realmode/.gitignore3
-rw-r--r--arch/x86/kernel/acpi/realmode/Makefile57
-rw-r--r--arch/x86/kernel/acpi/realmode/copy.S1
-rw-r--r--arch/x86/kernel/acpi/realmode/video-bios.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/video-mode.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/video-vesa.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/video-vga.c1
-rw-r--r--arch/x86/kernel/acpi/realmode/wakemain.c81
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S113
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.h36
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.lds.S61
-rw-r--r--arch/x86/kernel/acpi/sleep.c73
-rw-r--r--arch/x86/kernel/acpi/sleep.h16
-rw-r--r--arch/x86/kernel/acpi/sleep_32.c40
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S247
-rw-r--r--arch/x86/kernel/acpi/wakeup_64.S313
-rw-r--r--arch/x86/kernel/acpi/wakeup_rm.S10
-rw-r--r--arch/x86/kernel/alternative.c114
-rw-r--r--arch/x86/kernel/aperture_64.c6
-rw-r--r--arch/x86/kernel/apic_32.c227
-rw-r--r--arch/x86/kernel/apic_64.c147
-rw-r--r--arch/x86/kernel/apm_32.c24
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/bugs_64.c14
-rw-r--r--arch/x86/kernel/cpu/Makefile5
-rw-r--r--arch/x86/kernel/cpu/amd.c125
-rw-r--r--arch/x86/kernel/cpu/centaur.c490
-rw-r--r--arch/x86/kernel/cpu/common.c181
-rw-r--r--arch/x86/kernel/cpu/cpu.h26
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c38
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c32
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c13
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c20
-rw-r--r--arch/x86/kernel/cpu/cyrix.c136
-rw-r--r--arch/x86/kernel/cpu/feature_names.c2
-rw-r--r--arch/x86/kernel/cpu/intel.c106
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c92
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c50
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c18
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c46
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c21
-rw-r--r--arch/x86/kernel/cpu/mcheck/p5.c16
-rw-r--r--arch/x86/kernel/cpu/mcheck/p6.c48
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/winchip.c8
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c139
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c7
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/state.c14
-rw-r--r--arch/x86/kernel/cpu/nexgen.c60
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c14
-rw-r--r--arch/x86/kernel/cpu/proc.c171
-rw-r--r--arch/x86/kernel/cpu/transmeta.c30
-rw-r--r--arch/x86/kernel/cpu/umc.c19
-rw-r--r--arch/x86/kernel/cpuid.c4
-rw-r--r--arch/x86/kernel/crash.c7
-rw-r--r--arch/x86/kernel/ds.c8
-rw-r--r--arch/x86/kernel/e820_32.c31
-rw-r--r--arch/x86/kernel/e820_64.c191
-rw-r--r--arch/x86/kernel/early_printk.c24
-rw-r--r--arch/x86/kernel/efi.c18
-rw-r--r--arch/x86/kernel/efi_64.c12
-rw-r--r--arch/x86/kernel/entry_32.S52
-rw-r--r--arch/x86/kernel/entry_64.S8
-rw-r--r--arch/x86/kernel/genapic_64.c49
-rw-r--r--arch/x86/kernel/genapic_flat_64.c7
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c250
-rw-r--r--arch/x86/kernel/head32.c14
-rw-r--r--arch/x86/kernel/head64.c101
-rw-r--r--arch/x86/kernel/head_32.S3
-rw-r--r--arch/x86/kernel/head_64.S28
-rw-r--r--arch/x86/kernel/hpet.c2
-rw-r--r--arch/x86/kernel/i386_ksyms_32.c5
-rw-r--r--arch/x86/kernel/i387.c200
-rw-r--r--arch/x86/kernel/i8253.c6
-rw-r--r--arch/x86/kernel/io_apic_32.c167
-rw-r--r--arch/x86/kernel/io_apic_64.c67
-rw-r--r--arch/x86/kernel/ipi.c178
-rw-r--r--arch/x86/kernel/irq_32.c4
-rw-r--r--arch/x86/kernel/kdebugfs.c163
-rw-r--r--arch/x86/kernel/kgdb.c567
-rw-r--r--arch/x86/kernel/kprobes.c14
-rw-r--r--arch/x86/kernel/kvm.c248
-rw-r--r--arch/x86/kernel/kvmclock.c187
-rw-r--r--arch/x86/kernel/mca_32.c96
-rw-r--r--arch/x86/kernel/mfgpt_32.c11
-rw-r--r--arch/x86/kernel/microcode.c32
-rw-r--r--arch/x86/kernel/mpparse.c (renamed from arch/x86/kernel/mpparse_32.c)867
-rw-r--r--arch/x86/kernel/mpparse_64.c867
-rw-r--r--arch/x86/kernel/msr.c8
-rw-r--r--arch/x86/kernel/nmi_32.c17
-rw-r--r--arch/x86/kernel/nmi_64.c8
-rw-r--r--arch/x86/kernel/paravirt.c30
-rw-r--r--arch/x86/kernel/pci-calgary_64.c7
-rw-r--r--arch/x86/kernel/pci-dma.c (renamed from arch/x86/kernel/pci-dma_64.c)546
-rw-r--r--arch/x86/kernel/pci-dma_32.c177
-rw-r--r--arch/x86/kernel/pci-gart_64.c15
-rw-r--r--arch/x86/kernel/pci-nommu.c (renamed from arch/x86/kernel/pci-nommu_64.c)34
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c9
-rw-r--r--arch/x86/kernel/process.c161
-rw-r--r--arch/x86/kernel/process_32.c222
-rw-r--r--arch/x86/kernel/process_64.c249
-rw-r--r--arch/x86/kernel/ptrace.c96
-rw-r--r--arch/x86/kernel/reboot.c24
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S30
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S40
-rw-r--r--arch/x86/kernel/rtc.c33
-rw-r--r--arch/x86/kernel/setup.c137
-rw-r--r--arch/x86/kernel/setup64.c101
-rw-r--r--arch/x86/kernel/setup_32.c110
-rw-r--r--arch/x86/kernel/setup_64.c271
-rw-r--r--arch/x86/kernel/sigframe.h (renamed from arch/x86/kernel/sigframe_32.h)14
-rw-r--r--arch/x86/kernel/signal_32.c334
-rw-r--r--arch/x86/kernel/signal_64.c138
-rw-r--r--arch/x86/kernel/smp.c343
-rw-r--r--arch/x86/kernel/smp_32.c712
-rw-r--r--arch/x86/kernel/smpboot.c (renamed from arch/x86/kernel/smpboot_32.c)1364
-rw-r--r--arch/x86/kernel/smpboot_64.c1108
-rw-r--r--arch/x86/kernel/smpcommon.c83
-rw-r--r--arch/x86/kernel/smpcommon_32.c81
-rw-r--r--arch/x86/kernel/srat_32.c10
-rw-r--r--arch/x86/kernel/step.c2
-rw-r--r--arch/x86/kernel/summit_32.c62
-rw-r--r--arch/x86/kernel/syscall_64.c13
-rw-r--r--arch/x86/kernel/test_nx.c2
-rw-r--r--arch/x86/kernel/tlb_32.c243
-rw-r--r--arch/x86/kernel/tlb_64.c (renamed from arch/x86/kernel/smp_64.c)272
-rw-r--r--arch/x86/kernel/tls.c4
-rw-r--r--arch/x86/kernel/trampoline.c18
-rw-r--r--arch/x86/kernel/trampoline_32.S2
-rw-r--r--arch/x86/kernel/trampoline_64.S5
-rw-r--r--arch/x86/kernel/traps_32.c647
-rw-r--r--arch/x86/kernel/traps_64.c47
-rw-r--r--arch/x86/kernel/tsc_32.c39
-rw-r--r--arch/x86/kernel/tsc_64.c26
-rw-r--r--arch/x86/kernel/vm86_32.c213
-rw-r--r--arch/x86/kernel/vmi_32.c22
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S5
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S17
-rw-r--r--arch/x86/kernel/vsmp_64.c131
-rw-r--r--arch/x86/kernel/vsyscall_64.c2
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c18
-rw-r--r--arch/x86/kvm/Kconfig13
-rw-r--r--arch/x86/kvm/Makefile6
-rw-r--r--arch/x86/kvm/i8254.c611
-rw-r--r--arch/x86/kvm/i8254.h63
-rw-r--r--arch/x86/kvm/irq.c18
-rw-r--r--arch/x86/kvm/irq.h3
-rw-r--r--arch/x86/kvm/kvm_svm.h2
-rw-r--r--arch/x86/kvm/lapic.c35
-rw-r--r--arch/x86/kvm/mmu.c672
-rw-r--r--arch/x86/kvm/mmu.h6
-rw-r--r--arch/x86/kvm/paging_tmpl.h86
-rw-r--r--arch/x86/kvm/segment_descriptor.h29
-rw-r--r--arch/x86/kvm/svm.c352
-rw-r--r--arch/x86/kvm/svm.h3
-rw-r--r--arch/x86/kvm/tss.h59
-rw-r--r--arch/x86/kvm/vmx.c278
-rw-r--r--arch/x86/kvm/vmx.h10
-rw-r--r--arch/x86/kvm/x86.c897
-rw-r--r--arch/x86/kvm/x86_emulate.c285
-rw-r--r--arch/x86/lguest/boot.c2
-rw-r--r--arch/x86/lib/Makefile3
-rw-r--r--arch/x86/lib/bitops_32.c70
-rw-r--r--arch/x86/lib/bitops_64.c175
-rw-r--r--arch/x86/lib/memcpy_32.c2
-rw-r--r--arch/x86/lib/memmove_64.c8
-rw-r--r--arch/x86/lib/mmx_32.c197
-rw-r--r--arch/x86/lib/semaphore_32.S83
-rw-r--r--arch/x86/lib/string_32.c60
-rw-r--r--arch/x86/lib/strstr_32.c4
-rw-r--r--arch/x86/lib/thunk_64.S5
-rw-r--r--arch/x86/lib/usercopy_32.c122
-rw-r--r--arch/x86/mach-generic/bigsmp.c30
-rw-r--r--arch/x86/mach-generic/default.c8
-rw-r--r--arch/x86/mach-generic/probe.c37
-rw-r--r--arch/x86/mach-generic/summit.c8
-rw-r--r--arch/x86/mach-rdc321x/Makefile2
-rw-r--r--arch/x86/mach-rdc321x/wdt.c275
-rw-r--r--arch/x86/mach-visws/mpparse.c15
-rw-r--r--arch/x86/mach-visws/visws_apic.c2
-rw-r--r--arch/x86/mach-voyager/voyager_basic.c2
-rw-r--r--arch/x86/mach-voyager/voyager_cat.c2
-rw-r--r--arch/x86/mach-voyager/voyager_smp.c43
-rw-r--r--arch/x86/mach-voyager/voyager_thread.c2
-rw-r--r--arch/x86/math-emu/fpu_entry.c5
-rw-r--r--arch/x86/math-emu/fpu_system.h26
-rw-r--r--arch/x86/math-emu/reg_ld_str.c17
-rw-r--r--arch/x86/mm/Makefile16
-rw-r--r--arch/x86/mm/Makefile_329
-rw-r--r--arch/x86/mm/Makefile_649
-rw-r--r--arch/x86/mm/discontig_32.c8
-rw-r--r--arch/x86/mm/dump_pagetables.c354
-rw-r--r--arch/x86/mm/fault.c6
-rw-r--r--arch/x86/mm/init_32.c105
-rw-r--r--arch/x86/mm/init_64.c299
-rw-r--r--arch/x86/mm/ioremap.c191
-rw-r--r--arch/x86/mm/k8topology_64.c2
-rw-r--r--arch/x86/mm/numa_64.c67
-rw-r--r--arch/x86/mm/pageattr.c111
-rw-r--r--arch/x86/mm/pat.c596
-rw-r--r--arch/x86/mm/pgtable.c276
-rw-r--r--arch/x86/mm/pgtable_32.c208
-rw-r--r--arch/x86/mm/srat_64.c41
-rw-r--r--arch/x86/oprofile/init.c13
-rw-r--r--arch/x86/oprofile/nmi_int.c49
-rw-r--r--arch/x86/oprofile/nmi_timer_int.c6
-rw-r--r--arch/x86/oprofile/op_model_athlon.c46
-rw-r--r--arch/x86/oprofile/op_model_ppro.c52
-rw-r--r--arch/x86/pci/acpi.c28
-rw-r--r--arch/x86/pci/common.c63
-rw-r--r--arch/x86/pci/i386.c111
-rw-r--r--arch/x86/pci/irq.c12
-rw-r--r--arch/x86/pci/numa.c32
-rw-r--r--arch/x86/pci/pcbios.c72
-rw-r--r--arch/x86/pci/pci.h4
-rw-r--r--arch/x86/power/cpu_32.c41
-rw-r--r--arch/x86/vdso/Makefile5
-rw-r--r--arch/x86/vdso/vdso.S10
-rw-r--r--arch/x86/vdso/vdso32-setup.c11
-rw-r--r--arch/x86/video/fbdev.c1
-rw-r--r--arch/x86/xen/Kconfig2
-rw-r--r--arch/x86/xen/Makefile4
-rw-r--r--arch/x86/xen/enlighten.c61
-rw-r--r--arch/x86/xen/events.c591
-rw-r--r--arch/x86/xen/features.c29
-rw-r--r--arch/x86/xen/grant-table.c91
-rw-r--r--arch/x86/xen/mmu.c147
-rw-r--r--arch/x86/xen/multicalls.c4
-rw-r--r--arch/x86/xen/setup.c21
-rw-r--r--arch/x86/xen/smp.c28
-rw-r--r--arch/x86/xen/xen-asm.S53
-rw-r--r--arch/x86/xen/xen-ops.h10
283 files changed, 14529 insertions, 11498 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6c70fed0f9a0..a12dbb2b93f3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,6 +23,7 @@ config X86
select HAVE_KPROBES
select HAVE_KRETPROBES
select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
+ select HAVE_ARCH_KGDB if !X86_VOYAGER
config GENERIC_LOCKBREAK
@@ -53,9 +54,6 @@ config STACKTRACE_SUPPORT
config HAVE_LATENCYTOP_SUPPORT
def_bool y
-config SEMAPHORE_SLEEPERS
- def_bool y
-
config FAST_CMPXCHG_LOCAL
bool
default y
@@ -88,9 +86,6 @@ config GENERIC_GPIO
config ARCH_MAY_HAVE_PC_FDC
def_bool y
-config DMI
- def_bool y
-
config RWSEM_GENERIC_SPINLOCK
def_bool !X86_XADD
@@ -116,8 +111,14 @@ config GENERIC_TIME_VSYSCALL
config ARCH_HAS_CPU_RELAX
def_bool y
+config ARCH_HAS_CACHE_LINE_SIZE
+ def_bool y
+
config HAVE_SETUP_PER_CPU_AREA
- def_bool X86_64
+ def_bool X86_64 || (X86_SMP && !X86_VOYAGER)
+
+config HAVE_CPUMASK_OF_CPU_MAP
+ def_bool X86_64_SMP
config ARCH_HIBERNATION_POSSIBLE
def_bool y
@@ -141,6 +142,9 @@ config AUDIT_ARCH
config ARCH_SUPPORTS_AOUT
def_bool y
+config ARCH_SUPPORTS_OPTIMIZED_INLINING
+ def_bool y
+
# Use the generic interrupt handling code in kernel/irq/:
config GENERIC_HARDIRQS
bool
@@ -171,7 +175,7 @@ config X86_64_SMP
config X86_HT
bool
depends on SMP
- depends on (X86_32 && !(X86_VISWS || X86_VOYAGER)) || (X86_64 && !MK8)
+ depends on (X86_32 && !(X86_VISWS || X86_VOYAGER)) || X86_64
default y
config X86_BIOS_REBOOT
@@ -181,7 +185,7 @@ config X86_BIOS_REBOOT
config X86_TRAMPOLINE
bool
- depends on X86_SMP || (X86_VOYAGER && SMP)
+ depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP)
default y
config KTIME_SCALAR
@@ -241,8 +245,7 @@ config X86_ELAN
config X86_VOYAGER
bool "Voyager (NCR)"
- depends on X86_32
- select SMP if !BROKEN
+ depends on X86_32 && (SMP || BROKEN)
help
Voyager is an MCA-based 32-way capable SMP architecture proprietary
to NCR Corp. Machine classes 345x/35xx/4100/51xx are Voyager-based.
@@ -254,9 +257,8 @@ config X86_VOYAGER
config X86_NUMAQ
bool "NUMAQ (IBM/Sequent)"
- select SMP
+ depends on SMP && X86_32
select NUMA
- depends on X86_32
help
This option is used for getting Linux to run on a (IBM/Sequent) NUMA
multiquad box. This changes the way that processors are bootstrapped,
@@ -327,8 +329,9 @@ config X86_RDC321X
config X86_VSMP
bool "Support for ScaleMP vSMP"
- depends on X86_64 && PCI
- help
+ select PARAVIRT
+ depends on X86_64
+ help
Support for ScaleMP vSMP systems. Say 'Y' here if this kernel is
supposed to run on these EM64T-based machines. Only choose this option
if you have one of these machines.
@@ -370,6 +373,25 @@ config VMI
at the moment), by linking the kernel to a GPL-ed ROM module
provided by the hypervisor.
+config KVM_CLOCK
+ bool "KVM paravirtualized clock"
+ select PARAVIRT
+ depends on !(X86_VISWS || X86_VOYAGER)
+ help
+ Turning on this option will allow you to run a paravirtualized clock
+ when running over the KVM hypervisor. Instead of relying on a PIT
+ (or probably other) emulation by the underlying device model, the host
+ provides the guest with timing infrastructure such as time of day, and
+ system time
+
+config KVM_GUEST
+ bool "KVM Guest support"
+ select PARAVIRT
+ depends on !(X86_VISWS || X86_VOYAGER)
+ help
+ This option enables various optimizations for running under the KVM
+ hypervisor.
+
source "arch/x86/lguest/Kconfig"
config PARAVIRT
@@ -383,6 +405,35 @@ config PARAVIRT
endif
+config MEMTEST_BOOTPARAM
+ bool "Memtest boot parameter"
+ depends on X86_64
+ default y
+ help
+ This option adds a kernel parameter 'memtest', which allows memtest
+ to be disabled at boot. If this option is selected, memtest
+ functionality can be disabled with memtest=0 on the kernel
+ command line. The purpose of this option is to allow a single
+ kernel image to be distributed with memtest built in, but not
+ necessarily enabled.
+
+ If you are unsure how to answer this question, answer Y.
+
+config MEMTEST_BOOTPARAM_VALUE
+ int "Memtest boot parameter default value (0-4)"
+ depends on MEMTEST_BOOTPARAM
+ range 0 4
+ default 0
+ help
+ This option sets the default value for the kernel parameter
+ 'memtest', which allows memtest to be disabled at boot. If this
+ option is set to 0 (zero), the memtest kernel parameter will
+ default to 0, disabling memtest at bootup. If this option is
+ set to 4, the memtest kernel parameter will default to 4,
+ enabling memtest at bootup, and use that as pattern number.
+
+ If you are unsure how to answer this question, answer 0.
+
config ACPI_SRAT
def_bool y
depends on X86_32 && ACPI && NUMA && (X86_SUMMIT || X86_GENERICARCH)
@@ -431,6 +482,15 @@ config HPET_EMULATE_RTC
# Mark as embedded because too many people got it wrong.
# The code disables itself when not needed.
+config DMI
+ default y
+ bool "Enable DMI scanning" if EMBEDDED
+ help
+ Enabled scanning of DMI to identify machine quirks. Say Y
+ here unless you have verified that your setup is not
+ affected by entries in the DMI blacklist. Required by PNP
+ BIOS code.
+
config GART_IOMMU
bool "GART IOMMU support" if EMBEDDED
default y
@@ -507,7 +567,7 @@ config NR_CPUS
config SCHED_SMT
bool "SMT (Hyperthreading) scheduler support"
- depends on (X86_64 && SMP) || (X86_32 && X86_HT)
+ depends on X86_HT
help
SMT scheduler support improves the CPU scheduler's decision making
when dealing with Intel Pentium 4 chips with HyperThreading at a
@@ -517,7 +577,7 @@ config SCHED_SMT
config SCHED_MC
def_bool y
prompt "Multi-core scheduler support"
- depends on (X86_64 && SMP) || (X86_32 && X86_HT)
+ depends on X86_HT
help
Multi-core scheduler support improves the CPU scheduler's decision
making when dealing with multi-core CPU chips at a cost of slightly
@@ -877,6 +937,15 @@ config X86_64_ACPI_NUMA
help
Enable ACPI SRAT based node topology detection.
+# Some NUMA nodes have memory ranges that span
+# other nodes. Even though a pfn is valid and
+# between a node's start and end pfns, it may not
+# reside on that node. See memmap_init_zone()
+# for details.
+config NODES_SPAN_OTHER_NODES
+ def_bool y
+ depends on X86_64_ACPI_NUMA
+
config NUMA_EMU
bool "NUMA emulation"
depends on X86_64 && NUMA
@@ -886,7 +955,7 @@ config NUMA_EMU
number of nodes. This is only useful for debugging.
config NODES_SHIFT
- int
+ int "Max num nodes shift(1-15)"
range 1 15 if X86_64
default "6" if X86_64
default "4" if X86_NUMAQ
@@ -1010,6 +1079,21 @@ config MTRR
See <file:Documentation/mtrr.txt> for more information.
+config X86_PAT
+ bool
+ prompt "x86 PAT support"
+ depends on MTRR
+ help
+ Use PAT attributes to setup page level cache control.
+
+ PATs are the modern equivalents of MTRRs and are much more
+ flexible than MTRRs.
+
+ Say N here if you see bootup problems (boot crash, boot hang,
+ spontaneous reboots) or a non-working video driver.
+
+ If unsure, say Y.
+
config EFI
def_bool n
prompt "EFI runtime service support"
@@ -1078,6 +1162,7 @@ source kernel/Kconfig.hz
config KEXEC
bool "kexec system call"
+ depends on X86_64 || X86_BIOS_REBOOT
help
kexec is a system call that implements the ability to shutdown your
current kernel, and to start another kernel. It is like a reboot
@@ -1379,7 +1464,7 @@ endmenu
menu "Bus options (PCI etc.)"
config PCI
- bool "PCI support" if !X86_VISWS
+ bool "PCI support" if !X86_VISWS && !X86_VSMP
depends on !X86_VOYAGER
default y
select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC)
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 9304bfba7d45..7ef18b01f0bc 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -21,8 +21,8 @@ config M386
Here are the settings recommended for greatest speed:
- "386" for the AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI
- 486DLC/DLC2, UMC 486SX-S and NexGen Nx586. Only "386" kernels
- will run on a 386 class machine.
+ 486DLC/DLC2, and UMC 486SX-S. Only "386" kernels will run on a 386
+ class machine.
- "486" for the AMD/Cyrix/IBM/Intel 486DX/DX2/DX4 or
SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S.
- "586" for generic Pentium CPUs lacking the TSC
@@ -278,6 +278,11 @@ config GENERIC_CPU
endchoice
+config X86_CPU
+ def_bool y
+ select GENERIC_FIND_FIRST_BIT
+ select GENERIC_FIND_NEXT_BIT
+
config X86_GENERIC
bool "Generic x86 support"
depends on X86_32
@@ -388,7 +393,7 @@ config X86_OOSTORE
#
config X86_P6_NOP
def_bool y
- depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4)
+ depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4 || MPSC)
config X86_TSC
def_bool y
@@ -398,7 +403,7 @@ config X86_TSC
# generates cmov.
config X86_CMOV
def_bool y
- depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7)
+ depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64)
config X86_MINIMUM_CPU_FAMILY
int
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 702eb39901ca..5b1979a45a1e 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,6 +5,17 @@ config TRACE_IRQFLAGS_SUPPORT
source "lib/Kconfig.debug"
+config NONPROMISC_DEVMEM
+ bool "Disable promiscuous /dev/mem"
+ help
+ The /dev/mem file by default only allows userspace access to PCI
+ space and the BIOS code and data regions. This is sufficient for
+ dosemu and X and all common users of /dev/mem. With this config
+ option, you allow userspace access to all of memory, including
+ kernel and userspace memory. Accidental access to this is
+ obviously disasterous, but specific access can be used by people
+ debugging the kernel.
+
config EARLY_PRINTK
bool "Early printk" if EMBEDDED
default y
@@ -54,6 +65,18 @@ config DEBUG_PER_CPU_MAPS
Say N if unsure.
+config X86_PTDUMP
+ bool "Export kernel pagetable layout to userspace via debugfs"
+ depends on DEBUG_KERNEL
+ select DEBUG_FS
+ help
+ Say Y here if you want to show the kernel pagetable layout in a
+ debugfs file. This information is only useful for kernel developers
+ who are working in architecture specific areas of the kernel.
+ It is probably not a good idea to enable this feature in a production
+ kernel.
+ If in doubt, say "N"
+
config DEBUG_RODATA
bool "Write protect kernel read-only data structures"
default y
@@ -64,6 +87,18 @@ config DEBUG_RODATA
data. This is recommended so that we can catch kernel bugs sooner.
If in doubt, say "Y".
+config DIRECT_GBPAGES
+ bool "Enable gbpages-mapped kernel pagetables"
+ depends on DEBUG_KERNEL && EXPERIMENTAL && X86_64
+ help
+ Enable gigabyte pages support (if the CPU supports it). This can
+ improve the kernel's performance a tiny bit by reducing TLB
+ pressure.
+
+ This is experimental code.
+
+ If in doubt, say "N".
+
config DEBUG_RODATA_TEST
bool "Testcase for the DEBUG_RODATA feature"
depends on DEBUG_RODATA
@@ -82,8 +117,8 @@ config DEBUG_NX_TEST
config 4KSTACKS
bool "Use 4Kb for kernel stacks instead of 8Kb"
- depends on DEBUG_KERNEL
depends on X86_32
+ default y
help
If you say Y here the kernel will use a 4Kb stacksize for the
kernel stack attached to each process/thread. This facilitates
@@ -222,3 +257,16 @@ config CPA_DEBUG
Do change_page_attr() self-tests every 30 seconds.
endmenu
+
+config OPTIMIZE_INLINING
+ bool "Allow gcc to uninline functions marked 'inline'"
+ default y
+ help
+ This option determines if the kernel forces gcc to inline the functions
+ developers have marked 'inline'. Doing so takes away freedom from gcc to
+ do what it thinks is best, which is desirable for the gcc 3.x series of
+ compilers. The gcc 4.x series have a rewritten inlining algorithm and
+ disabling this option will generate a smaller kernel there. Hopefully
+ this algorithm is so good that allowing gcc4 to make the decision can
+ become the default in the future, until then this option is there to
+ test gcc for this.
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index f1e739a43d41..3cff3c894cf3 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -151,7 +151,6 @@ mflags-y += -Iinclude/asm-x86/mach-default
# 64 bit does not support subarch support - clear sub arch variables
fcore-$(CONFIG_X86_64) :=
mcore-$(CONFIG_X86_64) :=
-mflags-$(CONFIG_X86_64) :=
KBUILD_CFLAGS += $(mflags-y)
KBUILD_AFLAGS += $(mflags-y)
@@ -159,9 +158,9 @@ KBUILD_AFLAGS += $(mflags-y)
###
# Kernel objects
-head-y := arch/x86/kernel/head_$(BITS).o
-head-$(CONFIG_X86_64) += arch/x86/kernel/head64.o
-head-y += arch/x86/kernel/init_task.o
+head-y := arch/x86/kernel/head_$(BITS).o
+head-y += arch/x86/kernel/head$(BITS).o
+head-y += arch/x86/kernel/init_task.o
libs-y += arch/x86/lib/
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
index b1bdc4c6f9f2..172cf8a98bdd 100644
--- a/arch/x86/boot/.gitignore
+++ b/arch/x86/boot/.gitignore
@@ -1,7 +1,8 @@
bootsect
bzImage
+cpustr.h
+mkcpustr
+offsets.h
setup
setup.bin
setup.elf
-cpustr.h
-mkcpustr
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index f88458e83ef0..7ee102f9c4f8 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -30,7 +30,7 @@ subdir- := compressed
setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
setup-y += header.o main.o mca.o memory.o pm.o pmjump.o
-setup-y += printf.o string.o tty.o video.o version.o
+setup-y += printf.o string.o tty.o video.o video-mode.o version.o
setup-$(CONFIG_X86_APM_BOOT) += apm.o
setup-$(CONFIG_X86_VOYAGER) += voyager.o
@@ -94,6 +94,20 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
+sed-offsets := -e 's/^00*/0/' \
+ -e 's/^\([0-9a-fA-F]*\) . \(input_data\|input_data_end\)$$/\#define \2 0x\1/p'
+
+quiet_cmd_offsets = OFFSETS $@
+ cmd_offsets = $(NM) $< | sed -n $(sed-offsets) > $@
+
+$(obj)/offsets.h: $(obj)/compressed/vmlinux FORCE
+ $(call if_changed,offsets)
+
+targets += offsets.h
+
+AFLAGS_header.o += -I$(obj)
+$(obj)/header.o: $(obj)/offsets.h
+
LDFLAGS_setup.elf := -T
$(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
$(call if_changed,ld)
diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index 31348d054fca..90943f83e84d 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/a20.c
- *
* Enable A20 gate (return -1 on failure)
*/
diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c
index c117c7fb859c..7aa6033001f9 100644
--- a/arch/x86/boot/apm.c
+++ b/arch/x86/boot/apm.c
@@ -12,8 +12,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/apm.c
- *
* Get APM BIOS information
*/
diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h
index 8dcc8dc7db88..878e4b9940d9 100644
--- a/arch/x86/boot/bitops.h
+++ b/arch/x86/boot/bitops.h
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/bitops.h
- *
* Very simple bitops for the boot code.
*/
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 7822a4983da2..a34b9982c7cb 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/boot.h
- *
* Header file for the real-mode kernel code
*/
@@ -286,6 +284,11 @@ int getchar_timeout(void);
/* video.c */
void set_video(void);
+/* video-mode.c */
+int set_mode(u16 mode);
+int mode_defined(u16 mode);
+void probe_cards(int unsafe);
+
/* video-vesa.c */
void vesa_store_edid(void);
diff --git a/arch/x86/boot/cmdline.c b/arch/x86/boot/cmdline.c
index 680408a0f463..a1d35634bce0 100644
--- a/arch/x86/boot/cmdline.c
+++ b/arch/x86/boot/cmdline.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/cmdline.c
- *
* Simple command-line parser for early boot.
*/
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index d2b9f3bb87c0..92fdd35bd93e 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -22,7 +22,7 @@ $(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $
$(call if_changed,ld)
@:
-OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S
+OBJCOPYFLAGS_vmlinux.bin := -R .comment -S
$(obj)/vmlinux.bin: vmlinux FORCE
$(call if_changed,objcopy)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 036e635f18a3..ba7736cf2ec7 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -130,7 +130,7 @@ relocated:
/*
* Setup the stack for the decompressor
*/
- leal stack_end(%ebx), %esp
+ leal boot_stack_end(%ebx), %esp
/*
* Do the decompression, and jump to the new kernel..
@@ -142,8 +142,8 @@ relocated:
pushl %eax # input_len
leal input_data(%ebx), %eax
pushl %eax # input_data
- leal _end(%ebx), %eax
- pushl %eax # end of the image as third argument
+ leal boot_heap(%ebx), %eax
+ pushl %eax # heap area as third argument
pushl %esi # real mode pointer as second arg
call decompress_kernel
addl $20, %esp
@@ -181,7 +181,10 @@ relocated:
jmp *%ebp
.bss
+/* Stack and heap for uncompression */
.balign 4
-stack:
- .fill 4096, 1, 0
-stack_end:
+boot_heap:
+ .fill BOOT_HEAP_SIZE, 1, 0
+boot_stack:
+ .fill BOOT_STACK_SIZE, 1, 0
+boot_stack_end:
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index e8657b98c902..d8819efac81d 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -28,6 +28,7 @@
#include <asm/segment.h>
#include <asm/pgtable.h>
#include <asm/page.h>
+#include <asm/boot.h>
#include <asm/msr.h>
#include <asm/asm-offsets.h>
@@ -62,7 +63,7 @@ startup_32:
subl $1b, %ebp
/* setup a stack and make sure cpu supports long mode. */
- movl $user_stack_end, %eax
+ movl $boot_stack_end, %eax
addl %ebp, %eax
movl %eax, %esp
@@ -243,9 +244,9 @@ ENTRY(startup_64)
/* Copy the compressed kernel to the end of our buffer
* where decompression in place becomes safe.
*/
- leaq _end(%rip), %r8
- leaq _end(%rbx), %r9
- movq $_end /* - $startup_32 */, %rcx
+ leaq _end_before_pgt(%rip), %r8
+ leaq _end_before_pgt(%rbx), %r9
+ movq $_end_before_pgt /* - $startup_32 */, %rcx
1: subq $8, %r8
subq $8, %r9
movq 0(%r8), %rax
@@ -267,14 +268,14 @@ relocated:
*/
xorq %rax, %rax
leaq _edata(%rbx), %rdi
- leaq _end(%rbx), %rcx
+ leaq _end_before_pgt(%rbx), %rcx
subq %rdi, %rcx
cld
rep
stosb
/* Setup the stack */
- leaq user_stack_end(%rip), %rsp
+ leaq boot_stack_end(%rip), %rsp
/* zero EFLAGS after setting rsp */
pushq $0
@@ -285,7 +286,7 @@ relocated:
*/
pushq %rsi # Save the real mode argument
movq %rsi, %rdi # real mode address
- leaq _heap(%rip), %rsi # _heap
+ leaq boot_heap(%rip), %rsi # malloc area for uncompression
leaq input_data(%rip), %rdx # input_data
movl input_len(%rip), %eax
movq %rax, %rcx # input_len
@@ -310,9 +311,12 @@ gdt:
.quad 0x0080890000000000 /* TS descriptor */
.quad 0x0000000000000000 /* TS continued */
gdt_end:
- .bss
-/* Stack for uncompression */
- .balign 4
-user_stack:
- .fill 4096,4,0
-user_stack_end:
+
+.bss
+/* Stack and heap for uncompression */
+.balign 4
+boot_heap:
+ .fill BOOT_HEAP_SIZE, 1, 0
+boot_stack:
+ .fill BOOT_STACK_SIZE, 1, 0
+boot_stack_end:
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 8182e32c1b42..90456cee47c3 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -15,6 +15,10 @@
* we just keep it from happening
*/
#undef CONFIG_PARAVIRT
+#ifdef CONFIG_X86_32
+#define _ASM_DESC_H_ 1
+#endif
+
#ifdef CONFIG_X86_64
#define _LINUX_STRING_H_ 1
#define __LINUX_BITMAP_H 1
@@ -22,6 +26,7 @@
#include <linux/linkage.h>
#include <linux/screen_info.h>
+#include <linux/elf.h>
#include <asm/io.h>
#include <asm/page.h>
#include <asm/boot.h>
@@ -53,8 +58,8 @@
* 1 bit (last block flag)
* 2 bits (block type)
*
- * 1 block occurs every 32K -1 bytes or when there 50% compression has been achieved.
- * The smallest block type encoding is always used.
+ * 1 block occurs every 32K -1 bytes or when there 50% compression
+ * has been achieved. The smallest block type encoding is always used.
*
* stored:
* 32 bits length in bytes.
@@ -90,9 +95,9 @@
*
* All of which is enough to compute an amount of extra data that is required
* to be safe. To avoid problems at the block level allocating 5 extra bytes
- * per 32767 bytes of data is sufficient. To avoind problems internal to a block
- * adding an extra 32767 bytes (the worst case uncompressed block size) is
- * sufficient, to ensure that in the worst case the decompressed data for
+ * per 32767 bytes of data is sufficient. To avoind problems internal to a
+ * block adding an extra 32767 bytes (the worst case uncompressed block size)
+ * is sufficient, to ensure that in the worst case the decompressed data for
* block will stop the byte before the compressed data for a block begins.
* To avoid problems with the compressed data's meta information an extra 18
* bytes are needed. Leading to the formula:
@@ -111,58 +116,66 @@
* gzip declarations
*/
-#define OF(args) args
-#define STATIC static
+#define OF(args) args
+#define STATIC static
#undef memset
#undef memcpy
-#define memzero(s, n) memset ((s), 0, (n))
+#define memzero(s, n) memset((s), 0, (n))
-typedef unsigned char uch;
-typedef unsigned short ush;
-typedef unsigned long ulg;
+typedef unsigned char uch;
+typedef unsigned short ush;
+typedef unsigned long ulg;
-#define WSIZE 0x80000000 /* Window size must be at least 32k,
- * and a power of two
- * We don't actually have a window just
- * a huge output buffer so I report
- * a 2G windows size, as that should
- * always be larger than our output buffer.
- */
+/*
+ * Window size must be at least 32k, and a power of two.
+ * We don't actually have a window just a huge output buffer,
+ * so we report a 2G window size, as that should always be
+ * larger than our output buffer:
+ */
+#define WSIZE 0x80000000
-static uch *inbuf; /* input buffer */
-static uch *window; /* Sliding window buffer, (and final output buffer) */
+/* Input buffer: */
+static unsigned char *inbuf;
-static unsigned insize; /* valid bytes in inbuf */
-static unsigned inptr; /* index of next byte to be processed in inbuf */
-static unsigned outcnt; /* bytes in output buffer */
+/* Sliding window buffer (and final output buffer): */
+static unsigned char *window;
+
+/* Valid bytes in inbuf: */
+static unsigned insize;
+
+/* Index of next byte to be processed in inbuf: */
+static unsigned inptr;
+
+/* Bytes in output buffer: */
+static unsigned outcnt;
/* gzip flag byte */
-#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
-#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gzip file */
-#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
-#define ORIG_NAME 0x08 /* bit 3 set: original file name present */
-#define COMMENT 0x10 /* bit 4 set: file comment present */
-#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */
-#define RESERVED 0xC0 /* bit 6,7: reserved */
-
-#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf())
-
+#define ASCII_FLAG 0x01 /* bit 0 set: file probably ASCII text */
+#define CONTINUATION 0x02 /* bit 1 set: continuation of multi-part gz file */
+#define EXTRA_FIELD 0x04 /* bit 2 set: extra field present */
+#define ORIG_NAM 0x08 /* bit 3 set: original file name present */
+#define COMMENT 0x10 /* bit 4 set: file comment present */
+#define ENCRYPTED 0x20 /* bit 5 set: file is encrypted */
+#define RESERVED 0xC0 /* bit 6, 7: reserved */
+
+#define get_byte() (inptr < insize ? inbuf[inptr++] : fill_inbuf())
+
/* Diagnostic functions */
#ifdef DEBUG
-# define Assert(cond,msg) {if(!(cond)) error(msg);}
-# define Trace(x) fprintf x
-# define Tracev(x) {if (verbose) fprintf x ;}
-# define Tracevv(x) {if (verbose>1) fprintf x ;}
-# define Tracec(c,x) {if (verbose && (c)) fprintf x ;}
-# define Tracecv(c,x) {if (verbose>1 && (c)) fprintf x ;}
+# define Assert(cond, msg) do { if (!(cond)) error(msg); } while (0)
+# define Trace(x) do { fprintf x; } while (0)
+# define Tracev(x) do { if (verbose) fprintf x ; } while (0)
+# define Tracevv(x) do { if (verbose > 1) fprintf x ; } while (0)
+# define Tracec(c, x) do { if (verbose && (c)) fprintf x ; } while (0)
+# define Tracecv(c, x) do { if (verbose > 1 && (c)) fprintf x ; } while (0)
#else
-# define Assert(cond,msg)
+# define Assert(cond, msg)
# define Trace(x)
# define Tracev(x)
# define Tracevv(x)
-# define Tracec(c,x)
-# define Tracecv(c,x)
+# define Tracec(c, x)
+# define Tracecv(c, x)
#endif
static int fill_inbuf(void);
@@ -170,7 +183,7 @@ static void flush_window(void);
static void error(char *m);
static void gzip_mark(void **);
static void gzip_release(void **);
-
+
/*
* This is set up by the setup-routine at boot-time
*/
@@ -185,7 +198,7 @@ static unsigned char *real_mode; /* Pointer to real-mode data */
extern unsigned char input_data[];
extern int input_len;
-static long bytes_out = 0;
+static long bytes_out;
static void *malloc(int size);
static void free(void *where);
@@ -204,13 +217,7 @@ static void putstr(const char *);
static memptr free_mem_ptr;
static memptr free_mem_end_ptr;
-#ifdef CONFIG_X86_64
-#define HEAP_SIZE 0x7000
-#else
-#define HEAP_SIZE 0x4000
-#endif
-
-static char *vidmem = (char *)0xb8000;
+static char *vidmem;
static int vidport;
static int lines, cols;
@@ -224,8 +231,10 @@ static void *malloc(int size)
{
void *p;
- if (size <0) error("Malloc error");
- if (free_mem_ptr <= 0) error("Memory error");
+ if (size < 0)
+ error("Malloc error");
+ if (free_mem_ptr <= 0)
+ error("Memory error");
free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */
@@ -251,19 +260,19 @@ static void gzip_release(void **ptr)
{
free_mem_ptr = (memptr) *ptr;
}
-
+
static void scroll(void)
{
int i;
- memcpy ( vidmem, vidmem + cols * 2, ( lines - 1 ) * cols * 2 );
- for ( i = ( lines - 1 ) * cols * 2; i < lines * cols * 2; i += 2 )
+ memcpy(vidmem, vidmem + cols * 2, (lines - 1) * cols * 2);
+ for (i = (lines - 1) * cols * 2; i < lines * cols * 2; i += 2)
vidmem[i] = ' ';
}
static void putstr(const char *s)
{
- int x,y,pos;
+ int x, y, pos;
char c;
#ifdef CONFIG_X86_32
@@ -274,18 +283,18 @@ static void putstr(const char *s)
x = RM_SCREEN_INFO.orig_x;
y = RM_SCREEN_INFO.orig_y;
- while ( ( c = *s++ ) != '\0' ) {
- if ( c == '\n' ) {
+ while ((c = *s++) != '\0') {
+ if (c == '\n') {
x = 0;
- if ( ++y >= lines ) {
+ if (++y >= lines) {
scroll();
y--;
}
} else {
vidmem [(x + cols * y) * 2] = c;
- if ( ++x >= cols ) {
+ if (++x >= cols) {
x = 0;
- if ( ++y >= lines ) {
+ if (++y >= lines) {
scroll();
y--;
}
@@ -303,22 +312,22 @@ static void putstr(const char *s)
outb(0xff & (pos >> 1), vidport+1);
}
-static void* memset(void* s, int c, unsigned n)
+static void *memset(void *s, int c, unsigned n)
{
int i;
char *ss = s;
- for (i=0;i<n;i++) ss[i] = c;
+ for (i = 0; i < n; i++) ss[i] = c;
return s;
}
-static void* memcpy(void* dest, const void* src, unsigned n)
+static void *memcpy(void *dest, const void *src, unsigned n)
{
int i;
const char *s = src;
char *d = dest;
- for (i=0;i<n;i++) d[i] = s[i];
+ for (i = 0; i < n; i++) d[i] = s[i];
return dest;
}
@@ -341,9 +350,9 @@ static void flush_window(void)
/* With my window equal to my output buffer
* I only need to compute the crc here.
*/
- ulg c = crc; /* temporary variable */
+ unsigned long c = crc; /* temporary variable */
unsigned n;
- uch *in, ch;
+ unsigned char *in, ch;
in = window;
for (n = 0; n < outcnt; n++) {
@@ -351,7 +360,7 @@ static void flush_window(void)
c = crc_32_tab[((int)c ^ ch) & 0xff] ^ (c >> 8);
}
crc = c;
- bytes_out += (ulg)outcnt;
+ bytes_out += (unsigned long)outcnt;
outcnt = 0;
}
@@ -365,9 +374,59 @@ static void error(char *x)
asm("hlt");
}
+static void parse_elf(void *output)
+{
+#ifdef CONFIG_X86_64
+ Elf64_Ehdr ehdr;
+ Elf64_Phdr *phdrs, *phdr;
+#else
+ Elf32_Ehdr ehdr;
+ Elf32_Phdr *phdrs, *phdr;
+#endif
+ void *dest;
+ int i;
+
+ memcpy(&ehdr, output, sizeof(ehdr));
+ if (ehdr.e_ident[EI_MAG0] != ELFMAG0 ||
+ ehdr.e_ident[EI_MAG1] != ELFMAG1 ||
+ ehdr.e_ident[EI_MAG2] != ELFMAG2 ||
+ ehdr.e_ident[EI_MAG3] != ELFMAG3) {
+ error("Kernel is not a valid ELF file");
+ return;
+ }
+
+ putstr("Parsing ELF... ");
+
+ phdrs = malloc(sizeof(*phdrs) * ehdr.e_phnum);
+ if (!phdrs)
+ error("Failed to allocate space for phdrs");
+
+ memcpy(phdrs, output + ehdr.e_phoff, sizeof(*phdrs) * ehdr.e_phnum);
+
+ for (i = 0; i < ehdr.e_phnum; i++) {
+ phdr = &phdrs[i];
+
+ switch (phdr->p_type) {
+ case PT_LOAD:
+#ifdef CONFIG_RELOCATABLE
+ dest = output;
+ dest += (phdr->p_paddr - LOAD_PHYSICAL_ADDR);
+#else
+ dest = (void *)(phdr->p_paddr);
+#endif
+ memcpy(dest,
+ output + phdr->p_offset,
+ phdr->p_filesz);
+ break;
+ default: /* Ignore other PT_* */ break;
+ }
+ }
+}
+
asmlinkage void decompress_kernel(void *rmode, memptr heap,
- uch *input_data, unsigned long input_len,
- uch *output)
+ unsigned char *input_data,
+ unsigned long input_len,
+ unsigned char *output)
{
real_mode = rmode;
@@ -384,18 +443,18 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
window = output; /* Output buffer (Normally at 1M) */
free_mem_ptr = heap; /* Heap */
- free_mem_end_ptr = heap + HEAP_SIZE;
+ free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
inbuf = input_data; /* Input buffer */
insize = input_len;
inptr = 0;
#ifdef CONFIG_X86_64
- if ((ulg)output & (__KERNEL_ALIGN - 1))
+ if ((unsigned long)output & (__KERNEL_ALIGN - 1))
error("Destination address not 2M aligned");
- if ((ulg)output >= 0xffffffffffUL)
+ if ((unsigned long)output >= 0xffffffffffUL)
error("Destination address too large");
#else
- if ((u32)output & (CONFIG_PHYSICAL_ALIGN -1))
+ if ((u32)output & (CONFIG_PHYSICAL_ALIGN - 1))
error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
error("Destination address too large");
@@ -408,6 +467,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
makecrc();
putstr("\nDecompressing Linux... ");
gunzip();
+ parse_elf(output);
putstr("done.\nBooting the kernel.\n");
return;
}
diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux_64.lds
index 7e5c7209f6cc..bef1ac891bce 100644
--- a/arch/x86/boot/compressed/vmlinux_64.lds
+++ b/arch/x86/boot/compressed/vmlinux_64.lds
@@ -39,10 +39,10 @@ SECTIONS
*(.bss.*)
*(COMMON)
. = ALIGN(8);
- _end = . ;
+ _end_before_pgt = . ;
. = ALIGN(4096);
pgtable = . ;
. = . + 4096 * 6;
- _heap = .;
+ _ebss = .;
}
}
diff --git a/arch/x86/boot/copy.S b/arch/x86/boot/copy.S
index ef127e56a3cf..ef50c84e8b4b 100644
--- a/arch/x86/boot/copy.S
+++ b/arch/x86/boot/copy.S
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/copy.S
- *
* Memory copy routines
*/
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 769065bd23d7..7804389ee005 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/cpucheck.c
- *
* Check for obligatory CPU features and abort if the features are not
* present. This code should be compilable as 16-, 32- or 64-bit
* code, so be very careful with types and inline assembly.
@@ -56,27 +54,27 @@ static const u32 req_flags[NCAPINTS] =
REQUIRED_MASK7,
};
-#define A32(a,b,c,d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
+#define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
static int is_amd(void)
{
- return cpu_vendor[0] == A32('A','u','t','h') &&
- cpu_vendor[1] == A32('e','n','t','i') &&
- cpu_vendor[2] == A32('c','A','M','D');
+ return cpu_vendor[0] == A32('A', 'u', 't', 'h') &&
+ cpu_vendor[1] == A32('e', 'n', 't', 'i') &&
+ cpu_vendor[2] == A32('c', 'A', 'M', 'D');
}
static int is_centaur(void)
{
- return cpu_vendor[0] == A32('C','e','n','t') &&
- cpu_vendor[1] == A32('a','u','r','H') &&
- cpu_vendor[2] == A32('a','u','l','s');
+ return cpu_vendor[0] == A32('C', 'e', 'n', 't') &&
+ cpu_vendor[1] == A32('a', 'u', 'r', 'H') &&
+ cpu_vendor[2] == A32('a', 'u', 'l', 's');
}
static int is_transmeta(void)
{
- return cpu_vendor[0] == A32('G','e','n','u') &&
- cpu_vendor[1] == A32('i','n','e','T') &&
- cpu_vendor[2] == A32('M','x','8','6');
+ return cpu_vendor[0] == A32('G', 'e', 'n', 'u') &&
+ cpu_vendor[1] == A32('i', 'n', 'e', 'T') &&
+ cpu_vendor[2] == A32('M', 'x', '8', '6');
}
static int has_fpu(void)
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index 8721dc46a0b6..d84a48ece785 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/edd.c
- *
* Get EDD BIOS disk information
*/
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 64ad9016585a..af86e431acfa 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -22,6 +22,7 @@
#include <asm/page.h>
#include <asm/setup.h>
#include "boot.h"
+#include "offsets.h"
SETUPSECTS = 4 /* default nr of setup-sectors */
BOOTSEG = 0x07C0 /* original address of boot-sector */
@@ -119,7 +120,7 @@ _start:
# Part 2 of the header, from the old setup.S
.ascii "HdrS" # header signature
- .word 0x0207 # header version number (>= 0x0105)
+ .word 0x0209 # header version number (>= 0x0105)
# or else old loadlin-1.5 will fail)
.globl realmode_swtch
realmode_swtch: .word 0, 0 # default_switch, SETUPSEG
@@ -223,6 +224,13 @@ hardware_subarch: .long 0 # subarchitecture, added with 2.07
hardware_subarch_data: .quad 0
+payload_offset: .long input_data
+payload_length: .long input_data_end-input_data
+
+setup_data: .quad 0 # 64-bit physical pointer to
+ # single linked list of
+ # struct setup_data
+
# End of setup header #####################################################
.section ".inittext", "ax"
diff --git a/arch/x86/boot/install.sh b/arch/x86/boot/install.sh
index 88d77761d01b..8d60ee15dfd9 100644
--- a/arch/x86/boot/install.sh
+++ b/arch/x86/boot/install.sh
@@ -1,7 +1,5 @@
#!/bin/sh
#
-# arch/i386/boot/install.sh
-#
# This file is subject to the terms and conditions of the GNU General Public
# License. See the file "COPYING" in the main directory of this archive
# for more details.
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 7828da5cfd07..77569a4a3be1 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/main.c
- *
* Main module for the real-mode kernel code
*/
diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c
index 68222f2d4b67..911eaae5d696 100644
--- a/arch/x86/boot/mca.c
+++ b/arch/x86/boot/mca.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/mca.c
- *
* Get the MCA system description table
*/
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index e77d89f9e8aa..acad32eb4290 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/memory.c
- *
* Memory detection code
*/
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 1a0f936c160b..328956fdb59e 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/pm.c
- *
* Prepare the machine for transition to protected mode.
*/
@@ -100,7 +98,7 @@ static void reset_coprocessor(void)
/*
* Set up the GDT
*/
-#define GDT_ENTRY(flags,base,limit) \
+#define GDT_ENTRY(flags, base, limit) \
(((u64)(base & 0xff000000) << 32) | \
((u64)flags << 40) | \
((u64)(limit & 0x00ff0000) << 32) | \
diff --git a/arch/x86/boot/pmjump.S b/arch/x86/boot/pmjump.S
index f5402d51f7c3..ab049d40a884 100644
--- a/arch/x86/boot/pmjump.S
+++ b/arch/x86/boot/pmjump.S
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/pmjump.S
- *
* The actual transition into protected mode
*/
diff --git a/arch/x86/boot/printf.c b/arch/x86/boot/printf.c
index 7e7e890699be..c1d00c0274c4 100644
--- a/arch/x86/boot/printf.c
+++ b/arch/x86/boot/printf.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/printf.c
- *
* Oh, it's a waste of space, but oh-so-yummy for debugging. This
* version of printf() does not include 64-bit support. "Live with
* it."
diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 481a22097781..f94b7a0c2abf 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/string.c
- *
* Very basic string functions
*/
diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c
index b4248740ff0d..44dc1923c0e3 100644
--- a/arch/x86/boot/tools/build.c
+++ b/arch/x86/boot/tools/build.c
@@ -50,6 +50,75 @@ typedef unsigned long u32;
u8 buf[SETUP_SECT_MAX*512];
int is_big_kernel;
+/*----------------------------------------------------------------------*/
+
+static const u32 crctab32[] = {
+ 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419,
+ 0x706af48f, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4,
+ 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07,
+ 0x90bf1d91, 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
+ 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x136c9856,
+ 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
+ 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4,
+ 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
+ 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3,
+ 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0x26d930ac, 0x51de003a,
+ 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599,
+ 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
+ 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190,
+ 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f,
+ 0x9fbfe4a5, 0xe8b8d433, 0x7807c9a2, 0x0f00f934, 0x9609a88e,
+ 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
+ 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed,
+ 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
+ 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3,
+ 0xfbd44c65, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
+ 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a,
+ 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5,
+ 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa, 0xbe0b1010,
+ 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
+ 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17,
+ 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6,
+ 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615,
+ 0x73dc1683, 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
+ 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xf00f9344,
+ 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
+ 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a,
+ 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
+ 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1,
+ 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0xd80d2bda, 0xaf0a1b4c,
+ 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef,
+ 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
+ 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe,
+ 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31,
+ 0x2cd99e8b, 0x5bdeae1d, 0x9b64c2b0, 0xec63f226, 0x756aa39c,
+ 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
+ 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b,
+ 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
+ 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1,
+ 0x18b74777, 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
+ 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0xa00ae278,
+ 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7,
+ 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc, 0x40df0b66,
+ 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
+ 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605,
+ 0xcdd70693, 0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8,
+ 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b,
+ 0x2d02ef8d
+};
+
+static u32 partial_crc32_one(u8 c, u32 crc)
+{
+ return crctab32[(crc ^ c) & 0xff] ^ (crc >> 8);
+}
+
+static u32 partial_crc32(const u8 *s, int len, u32 crc)
+{
+ while (len--)
+ crc = partial_crc32_one(*s++, crc);
+ return crc;
+}
+
static void die(const char * str, ...)
{
va_list args;
@@ -74,6 +143,7 @@ int main(int argc, char ** argv)
FILE *file;
int fd;
void *kernel;
+ u32 crc = 0xffffffffUL;
if (argc > 2 && !strcmp(argv[1], "-b"))
{
@@ -144,7 +214,8 @@ int main(int argc, char ** argv)
kernel = mmap(NULL, sz, PROT_READ, MAP_SHARED, fd, 0);
if (kernel == MAP_FAILED)
die("Unable to mmap '%s': %m", argv[2]);
- sys_size = (sz + 15) / 16;
+ /* Number of 16-byte paragraphs, including space for a 4-byte CRC */
+ sys_size = (sz + 15 + 4) / 16;
if (!is_big_kernel && sys_size > DEF_SYSSIZE)
die("System is too big. Try using bzImage or modules.");
@@ -155,12 +226,27 @@ int main(int argc, char ** argv)
buf[0x1f6] = sys_size >> 16;
buf[0x1f7] = sys_size >> 24;
+ crc = partial_crc32(buf, i, crc);
if (fwrite(buf, 1, i, stdout) != i)
die("Writing setup failed");
/* Copy the kernel code */
+ crc = partial_crc32(kernel, sz, crc);
if (fwrite(kernel, 1, sz, stdout) != sz)
die("Writing kernel failed");
+
+ /* Add padding leaving 4 bytes for the checksum */
+ while (sz++ < (sys_size*16) - 4) {
+ crc = partial_crc32_one('\0', crc);
+ if (fwrite("\0", 1, 1, stdout) != 1)
+ die("Writing padding failed");
+ }
+
+ /* Write the CRC */
+ fprintf(stderr, "CRC %lx\n", crc);
+ if (fwrite(&crc, 1, 4, stdout) != 4)
+ die("Writing CRC failed");
+
close(fd);
/* Everything is OK */
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index f3f14bd26371..0be77b39328a 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/tty.c
- *
* Very simple screen I/O
* XXX: Probably should add very simple serial I/O?
*/
diff --git a/arch/x86/boot/version.c b/arch/x86/boot/version.c
index c61462f7d9a7..2723d9b5ce43 100644
--- a/arch/x86/boot/version.c
+++ b/arch/x86/boot/version.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/version.c
- *
* Kernel version string
*/
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index ff664a117096..49f26aaaebc8 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/video-bios.c
- *
* Standard video BIOS modes
*
* We have two options for this; silent and scanned.
@@ -50,6 +48,7 @@ static int set_bios_mode(u8 mode)
if (new_mode == mode)
return 0; /* Mode change OK */
+#ifndef _WAKEUP
if (new_mode != boot_params.screen_info.orig_video_mode) {
/* Mode setting failed, but we didn't end up where we
started. That's bad. Try to revert to the original
@@ -59,13 +58,18 @@ static int set_bios_mode(u8 mode)
: "+a" (ax)
: : "ebx", "ecx", "edx", "esi", "edi");
}
+#endif
return -1;
}
static int bios_probe(void)
{
u8 mode;
+#ifdef _WAKEUP
+ u8 saved_mode = 0x03;
+#else
u8 saved_mode = boot_params.screen_info.orig_video_mode;
+#endif
u16 crtc;
struct mode_info *mi;
int nmodes = 0;
diff --git a/arch/x86/boot/video-mode.c b/arch/x86/boot/video-mode.c
new file mode 100644
index 000000000000..748e8d06290a
--- /dev/null
+++ b/arch/x86/boot/video-mode.c
@@ -0,0 +1,173 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright 2007-2008 rPath, Inc. - All Rights Reserved
+ *
+ * This file is part of the Linux kernel, and is made available under
+ * the terms of the GNU General Public License version 2.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * arch/i386/boot/video-mode.c
+ *
+ * Set the video mode. This is separated out into a different
+ * file in order to be shared with the ACPI wakeup code.
+ */
+
+#include "boot.h"
+#include "video.h"
+#include "vesa.h"
+
+/*
+ * Common variables
+ */
+int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */
+u16 video_segment;
+int force_x, force_y; /* Don't query the BIOS for cols/rows */
+
+int do_restore; /* Screen contents changed during mode flip */
+int graphic_mode; /* Graphic mode with linear frame buffer */
+
+/* Probe the video drivers and have them generate their mode lists. */
+void probe_cards(int unsafe)
+{
+ struct card_info *card;
+ static u8 probed[2];
+
+ if (probed[unsafe])
+ return;
+
+ probed[unsafe] = 1;
+
+ for (card = video_cards; card < video_cards_end; card++) {
+ if (card->unsafe == unsafe) {
+ if (card->probe)
+ card->nmodes = card->probe();
+ else
+ card->nmodes = 0;
+ }
+ }
+}
+
+/* Test if a mode is defined */
+int mode_defined(u16 mode)
+{
+ struct card_info *card;
+ struct mode_info *mi;
+ int i;
+
+ for (card = video_cards; card < video_cards_end; card++) {
+ mi = card->modes;
+ for (i = 0; i < card->nmodes; i++, mi++) {
+ if (mi->mode == mode)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/* Set mode (without recalc) */
+static int raw_set_mode(u16 mode, u16 *real_mode)
+{
+ int nmode, i;
+ struct card_info *card;
+ struct mode_info *mi;
+
+ /* Drop the recalc bit if set */
+ mode &= ~VIDEO_RECALC;
+
+ /* Scan for mode based on fixed ID, position, or resolution */
+ nmode = 0;
+ for (card = video_cards; card < video_cards_end; card++) {
+ mi = card->modes;
+ for (i = 0; i < card->nmodes; i++, mi++) {
+ int visible = mi->x || mi->y;
+
+ if ((mode == nmode && visible) ||
+ mode == mi->mode ||
+ mode == (mi->y << 8)+mi->x) {
+ *real_mode = mi->mode;
+ return card->set_mode(mi);
+ }
+
+ if (visible)
+ nmode++;
+ }
+ }
+
+ /* Nothing found? Is it an "exceptional" (unprobed) mode? */
+ for (card = video_cards; card < video_cards_end; card++) {
+ if (mode >= card->xmode_first &&
+ mode < card->xmode_first+card->xmode_n) {
+ struct mode_info mix;
+ *real_mode = mix.mode = mode;
+ mix.x = mix.y = 0;
+ return card->set_mode(&mix);
+ }
+ }
+
+ /* Otherwise, failure... */
+ return -1;
+}
+
+/*
+ * Recalculate the vertical video cutoff (hack!)
+ */
+static void vga_recalc_vertical(void)
+{
+ unsigned int font_size, rows;
+ u16 crtc;
+ u8 pt, ov;
+
+ set_fs(0);
+ font_size = rdfs8(0x485); /* BIOS: font size (pixels) */
+ rows = force_y ? force_y : rdfs8(0x484)+1; /* Text rows */
+
+ rows *= font_size; /* Visible scan lines */
+ rows--; /* ... minus one */
+
+ crtc = vga_crtc();
+
+ pt = in_idx(crtc, 0x11);
+ pt &= ~0x80; /* Unlock CR0-7 */
+ out_idx(pt, crtc, 0x11);
+
+ out_idx((u8)rows, crtc, 0x12); /* Lower height register */
+
+ ov = in_idx(crtc, 0x07); /* Overflow register */
+ ov &= 0xbd;
+ ov |= (rows >> (8-1)) & 0x02;
+ ov |= (rows >> (9-6)) & 0x40;
+ out_idx(ov, crtc, 0x07);
+}
+
+/* Set mode (with recalc if specified) */
+int set_mode(u16 mode)
+{
+ int rv;
+ u16 real_mode;
+
+ /* Very special mode numbers... */
+ if (mode == VIDEO_CURRENT_MODE)
+ return 0; /* Nothing to do... */
+ else if (mode == NORMAL_VGA)
+ mode = VIDEO_80x25;
+ else if (mode == EXTENDED_VGA)
+ mode = VIDEO_8POINT;
+
+ rv = raw_set_mode(mode, &real_mode);
+ if (rv)
+ return rv;
+
+ if (mode & VIDEO_RECALC)
+ vga_recalc_vertical();
+
+ /* Save the canonical mode number for the kernel, not
+ an alias, size specification or menu position */
+#ifndef _WAKEUP
+ boot_params.hdr.vid_mode = real_mode;
+#endif
+ return 0;
+}
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 419b5c273374..401ad998ad08 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/video-vesa.c
- *
* VESA text modes
*/
@@ -24,7 +22,11 @@ static struct vesa_mode_info vminfo;
__videocard video_vesa;
+#ifndef _WAKEUP
static void vesa_store_mode_params_graphics(void);
+#else /* _WAKEUP */
+static inline void vesa_store_mode_params_graphics(void) {}
+#endif /* _WAKEUP */
static int vesa_probe(void)
{
@@ -165,6 +167,8 @@ static int vesa_set_mode(struct mode_info *mode)
}
+#ifndef _WAKEUP
+
/* Switch DAC to 8-bit mode */
static void vesa_dac_set_8bits(void)
{
@@ -288,6 +292,8 @@ void vesa_store_edid(void)
#endif /* CONFIG_FIRMWARE_EDID */
}
+#endif /* not _WAKEUP */
+
__videocard video_vesa =
{
.card_name = "VESA",
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 7259387b7d19..40ecb8d7688c 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/video-vga.c
- *
* Common all-VGA modes
*/
@@ -210,6 +208,8 @@ static int vga_set_mode(struct mode_info *mode)
*/
static int vga_probe(void)
{
+ u16 ega_bx;
+
static const char *card_name[] = {
"CGA/MDA/HGC", "EGA", "VGA"
};
@@ -226,12 +226,16 @@ static int vga_probe(void)
u8 vga_flag;
asm(INT10
- : "=b" (boot_params.screen_info.orig_video_ega_bx)
+ : "=b" (ega_bx)
: "a" (0x1200), "b" (0x10) /* Check EGA/VGA */
: "ecx", "edx", "esi", "edi");
+#ifndef _WAKEUP
+ boot_params.screen_info.orig_video_ega_bx = ega_bx;
+#endif
+
/* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */
- if ((u8)boot_params.screen_info.orig_video_ega_bx != 0x10) {
+ if ((u8)ega_bx != 0x10) {
/* EGA/VGA */
asm(INT10
: "=a" (vga_flag)
@@ -240,7 +244,9 @@ static int vga_probe(void)
if (vga_flag == 0x1a) {
adapter = ADAPTER_VGA;
+#ifndef _WAKEUP
boot_params.screen_info.orig_video_isVGA = 1;
+#endif
} else {
adapter = ADAPTER_EGA;
}
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 696d08f3843c..83598b23093a 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/video.c
- *
* Select video mode
*/
@@ -18,21 +16,6 @@
#include "video.h"
#include "vesa.h"
-/*
- * Mode list variables
- */
-static struct card_info cards[]; /* List of cards to probe for */
-
-/*
- * Common variables
- */
-int adapter; /* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */
-u16 video_segment;
-int force_x, force_y; /* Don't query the BIOS for cols/rows */
-
-int do_restore = 0; /* Screen contents changed during mode flip */
-int graphic_mode; /* Graphic mode with linear frame buffer */
-
static void store_cursor_position(void)
{
u16 curpos;
@@ -107,147 +90,6 @@ static void store_mode_params(void)
boot_params.screen_info.orig_video_lines = y;
}
-/* Probe the video drivers and have them generate their mode lists. */
-static void probe_cards(int unsafe)
-{
- struct card_info *card;
- static u8 probed[2];
-
- if (probed[unsafe])
- return;
-
- probed[unsafe] = 1;
-
- for (card = video_cards; card < video_cards_end; card++) {
- if (card->unsafe == unsafe) {
- if (card->probe)
- card->nmodes = card->probe();
- else
- card->nmodes = 0;
- }
- }
-}
-
-/* Test if a mode is defined */
-int mode_defined(u16 mode)
-{
- struct card_info *card;
- struct mode_info *mi;
- int i;
-
- for (card = video_cards; card < video_cards_end; card++) {
- mi = card->modes;
- for (i = 0; i < card->nmodes; i++, mi++) {
- if (mi->mode == mode)
- return 1;
- }
- }
-
- return 0;
-}
-
-/* Set mode (without recalc) */
-static int raw_set_mode(u16 mode, u16 *real_mode)
-{
- int nmode, i;
- struct card_info *card;
- struct mode_info *mi;
-
- /* Drop the recalc bit if set */
- mode &= ~VIDEO_RECALC;
-
- /* Scan for mode based on fixed ID, position, or resolution */
- nmode = 0;
- for (card = video_cards; card < video_cards_end; card++) {
- mi = card->modes;
- for (i = 0; i < card->nmodes; i++, mi++) {
- int visible = mi->x || mi->y;
-
- if ((mode == nmode && visible) ||
- mode == mi->mode ||
- mode == (mi->y << 8)+mi->x) {
- *real_mode = mi->mode;
- return card->set_mode(mi);
- }
-
- if (visible)
- nmode++;
- }
- }
-
- /* Nothing found? Is it an "exceptional" (unprobed) mode? */
- for (card = video_cards; card < video_cards_end; card++) {
- if (mode >= card->xmode_first &&
- mode < card->xmode_first+card->xmode_n) {
- struct mode_info mix;
- *real_mode = mix.mode = mode;
- mix.x = mix.y = 0;
- return card->set_mode(&mix);
- }
- }
-
- /* Otherwise, failure... */
- return -1;
-}
-
-/*
- * Recalculate the vertical video cutoff (hack!)
- */
-static void vga_recalc_vertical(void)
-{
- unsigned int font_size, rows;
- u16 crtc;
- u8 pt, ov;
-
- set_fs(0);
- font_size = rdfs8(0x485); /* BIOS: font size (pixels) */
- rows = force_y ? force_y : rdfs8(0x484)+1; /* Text rows */
-
- rows *= font_size; /* Visible scan lines */
- rows--; /* ... minus one */
-
- crtc = vga_crtc();
-
- pt = in_idx(crtc, 0x11);
- pt &= ~0x80; /* Unlock CR0-7 */
- out_idx(pt, crtc, 0x11);
-
- out_idx((u8)rows, crtc, 0x12); /* Lower height register */
-
- ov = in_idx(crtc, 0x07); /* Overflow register */
- ov &= 0xbd;
- ov |= (rows >> (8-1)) & 0x02;
- ov |= (rows >> (9-6)) & 0x40;
- out_idx(ov, crtc, 0x07);
-}
-
-/* Set mode (with recalc if specified) */
-static int set_mode(u16 mode)
-{
- int rv;
- u16 real_mode;
-
- /* Very special mode numbers... */
- if (mode == VIDEO_CURRENT_MODE)
- return 0; /* Nothing to do... */
- else if (mode == NORMAL_VGA)
- mode = VIDEO_80x25;
- else if (mode == EXTENDED_VGA)
- mode = VIDEO_8POINT;
-
- rv = raw_set_mode(mode, &real_mode);
- if (rv)
- return rv;
-
- if (mode & VIDEO_RECALC)
- vga_recalc_vertical();
-
- /* Save the canonical mode number for the kernel, not
- an alias, size specification or menu position */
- boot_params.hdr.vid_mode = real_mode;
- return 0;
-}
-
static unsigned int get_entry(void)
{
char entry_buf[4];
@@ -486,6 +328,7 @@ void set_video(void)
printf("Undefined video mode number: %x\n", mode);
mode = ASK_VGA;
}
+ boot_params.hdr.vid_mode = mode;
vesa_store_edid();
store_mode_params();
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index d69347f79e8e..ee63f5d14461 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/video.h
- *
* Header file for the real-mode video probing code
*/
diff --git a/arch/x86/boot/voyager.c b/arch/x86/boot/voyager.c
index 6499e3239b41..433909d61e5c 100644
--- a/arch/x86/boot/voyager.c
+++ b/arch/x86/boot/voyager.c
@@ -9,8 +9,6 @@
* ----------------------------------------------------------------------- */
/*
- * arch/i386/boot/voyager.c
- *
* Get the Voyager config information
*/
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 3df340b54e57..ad7ddaaff588 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1421,6 +1421,7 @@ CONFIG_DEBUG_BUGVERBOSE=y
# CONFIG_DEBUG_VM is not set
# CONFIG_DEBUG_LIST is not set
# CONFIG_FRAME_POINTER is not set
+CONFIG_OPTIMIZE_INLINING=y
# CONFIG_RCU_TORTURE_TEST is not set
# CONFIG_LKDTM is not set
# CONFIG_FAULT_INJECTION is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index eef98cb00c62..2d6f5b2809d2 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1346,6 +1346,7 @@ CONFIG_DEBUG_BUGVERBOSE=y
# CONFIG_DEBUG_VM is not set
# CONFIG_DEBUG_LIST is not set
# CONFIG_FRAME_POINTER is not set
+CONFIG_OPTIMIZE_INLINING=y
# CONFIG_RCU_TORTURE_TEST is not set
# CONFIG_LKDTM is not set
# CONFIG_FAULT_INJECTION is not set
diff --git a/arch/x86/crypto/aes-i586-asm_32.S b/arch/x86/crypto/aes-i586-asm_32.S
index 1093bede3e0a..e41b147f4509 100644
--- a/arch/x86/crypto/aes-i586-asm_32.S
+++ b/arch/x86/crypto/aes-i586-asm_32.S
@@ -289,7 +289,6 @@ aes_enc_blk:
pop %ebx
mov %r0,(%ebp)
pop %ebp
- mov $1,%eax
ret
// AES (Rijndael) Decryption Subroutine
@@ -365,6 +364,4 @@ aes_dec_blk:
pop %ebx
mov %r0,(%ebp)
pop %ebp
- mov $1,%eax
ret
-
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 5e7771a3ba2f..bbed3a26ce55 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -468,7 +468,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
restorer = ka->sa.sa_restorer;
} else {
/* Return stub is in 32bit vsyscall page */
- if (current->binfmt->hasvdso)
+ if (current->mm->context.vdso)
restorer = VDSO32_SYMBOL(current->mm->context.vdso,
sigreturn);
else
@@ -499,11 +499,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
regs->cs = __USER32_CS;
regs->ss = __USER32_DS;
- set_fs(USER_DS);
- regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF);
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
-
#if DEBUG_SIG
printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
current->comm, current->pid, frame, regs->ip, frame->pretcode);
@@ -599,11 +594,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
regs->cs = __USER32_CS;
regs->ss = __USER32_DS;
- set_fs(USER_DS);
- regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF);
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
-
#if DEBUG_SIG
printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n",
current->comm, current->pid, frame, regs->ip, frame->pretcode);
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 8022d3c695c0..b5e329da166c 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -162,12 +162,14 @@ sysenter_tracesys:
SAVE_REST
CLEAR_RREGS
movq %r9,R9(%rsp)
- movq $-ENOSYS,RAX(%rsp) /* really needed? */
+ movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
RESTORE_REST
xchgl %ebp,%r9d
+ cmpl $(IA32_NR_syscalls-1),%eax
+ ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
jmp sysenter_do_call
CFI_ENDPROC
ENDPROC(ia32_sysenter_target)
@@ -261,13 +263,15 @@ cstar_tracesys:
SAVE_REST
CLEAR_RREGS
movq %r9,R9(%rsp)
- movq $-ENOSYS,RAX(%rsp) /* really needed? */
+ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
RESTORE_REST
xchgl %ebp,%r9d
movl RSP-ARGOFFSET(%rsp), %r8d
+ cmpl $(IA32_NR_syscalls-1),%eax
+ ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
jmp cstar_do_call
END(ia32_cstar_target)
@@ -325,7 +329,7 @@ ENTRY(ia32_syscall)
jnz ia32_tracesys
ia32_do_syscall:
cmpl $(IA32_NR_syscalls-1),%eax
- ja ia32_badsys
+ ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
IA32_ARG_FIXUP
call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
ia32_sysret:
@@ -335,7 +339,7 @@ ia32_sysret:
ia32_tracesys:
SAVE_REST
CLEAR_RREGS
- movq $-ENOSYS,RAX(%rsp) /* really needed? */
+ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
@@ -426,7 +430,7 @@ ia32_sys_call_table:
.quad sys_setuid16
.quad sys_getuid16
.quad compat_sys_stime /* stime */ /* 25 */
- .quad sys32_ptrace /* ptrace */
+ .quad compat_sys_ptrace /* ptrace */
.quad sys_alarm
.quad sys_fstat /* (old)fstat */
.quad sys_pause
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index abf71d26fc2a..f00afdf61e67 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -26,51 +26,26 @@
#include <linux/file.h>
#include <linux/signal.h>
#include <linux/syscalls.h>
-#include <linux/resource.h>
#include <linux/times.h>
#include <linux/utsname.h>
-#include <linux/smp.h>
#include <linux/smp_lock.h>
-#include <linux/sem.h>
-#include <linux/msg.h>
#include <linux/mm.h>
-#include <linux/shm.h>
-#include <linux/slab.h>
#include <linux/uio.h>
-#include <linux/nfs_fs.h>
-#include <linux/quota.h>
-#include <linux/module.h>
-#include <linux/sunrpc/svc.h>
-#include <linux/nfsd/nfsd.h>
-#include <linux/nfsd/cache.h>
-#include <linux/nfsd/xdr.h>
-#include <linux/nfsd/syscall.h>
#include <linux/poll.h>
#include <linux/personality.h>
#include <linux/stat.h>
-#include <linux/ipc.h>
#include <linux/rwsem.h>
-#include <linux/binfmts.h>
-#include <linux/init.h>
-#include <linux/aio_abi.h>
-#include <linux/aio.h>
#include <linux/compat.h>
#include <linux/vfs.h>
#include <linux/ptrace.h>
#include <linux/highuid.h>
-#include <linux/vmalloc.h>
-#include <linux/fsnotify.h>
#include <linux/sysctl.h>
#include <asm/mman.h>
#include <asm/types.h>
#include <asm/uaccess.h>
-#include <asm/semaphore.h>
#include <asm/atomic.h>
-#include <asm/ldt.h>
-
-#include <net/scm.h>
-#include <net/sock.h>
#include <asm/ia32.h>
+#include <asm/vgtod.h>
#define AA(__x) ((unsigned long)(__x))
@@ -804,11 +779,6 @@ asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
if (IS_ERR(filename))
return error;
error = compat_do_execve(filename, argv, envp, regs);
- if (error == 0) {
- task_lock(current);
- current->ptrace &= ~PT_DTRACE;
- task_unlock(current);
- }
putname(filename);
return error;
}
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4eb5ce841106..fa19c3819540 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -2,8 +2,7 @@
# Makefile for the linux kernel.
#
-extra-y := head_$(BITS).o init_task.o vmlinux.lds
-extra-$(CONFIG_X86_64) += head64.o
+extra-y := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds
CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
@@ -19,16 +18,18 @@ CFLAGS_tsc_64.o := $(nostackp)
obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
obj-y += traps_$(BITS).o irq_$(BITS).o
obj-y += time_$(BITS).o ioport.o ldt.o
-obj-y += setup_$(BITS).o i8259_$(BITS).o
+obj-y += setup_$(BITS).o i8259_$(BITS).o setup.o
obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o setup64.o
-obj-y += pci-dma_$(BITS).o bootflag.o e820_$(BITS).o
-obj-y += quirks.o i8237.o topology.o kdebugfs.o
-obj-y += alternative.o i8253.o
-obj-$(CONFIG_X86_64) += pci-nommu_64.o bugs_64.o
+obj-y += bootflag.o e820_$(BITS).o
+obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
+obj-y += alternative.o i8253.o pci-nommu.o
+obj-$(CONFIG_X86_64) += bugs_64.o
obj-y += tsc_$(BITS).o io_delay.o rtc.o
+obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
+obj-y += process.o
obj-y += i387.o
obj-y += ptrace.o
obj-y += ds.o
@@ -47,11 +48,12 @@ obj-$(CONFIG_MICROCODE) += microcode.o
obj-$(CONFIG_PCI) += early-quirks.o
apm-y := apm_32.o
obj-$(CONFIG_APM) += apm.o
-obj-$(CONFIG_X86_SMP) += smp_$(BITS).o smpboot_$(BITS).o tsc_sync.o
-obj-$(CONFIG_X86_32_SMP) += smpcommon_32.o
-obj-$(CONFIG_X86_64_SMP) += smp_64.o smpboot_64.o tsc_sync.o
+obj-$(CONFIG_X86_SMP) += smp.o
+obj-$(CONFIG_X86_SMP) += smpboot.o tsc_sync.o ipi.o tlb_$(BITS).o
+obj-$(CONFIG_X86_32_SMP) += smpcommon.o
+obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
-obj-$(CONFIG_X86_MPPARSE) += mpparse_$(BITS).o
+obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o
obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
@@ -60,12 +62,13 @@ obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
-obj-$(CONFIG_X86_VSMP) += vsmp_64.o
+obj-y += vsmp_64.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_MODULES) += module_$(BITS).o
obj-$(CONFIG_ACPI_SRAT) += srat_32.o
obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
+obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
@@ -77,6 +80,8 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
+obj-$(CONFIG_KVM_GUEST) += kvm.o
+obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
ifdef CONFIG_INPUT_PCSPKR
@@ -89,7 +94,7 @@ scx200-y += scx200_32.o
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
- obj-y += genapic_64.o genapic_flat_64.o
+ obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o
obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
obj-$(CONFIG_AUDIT) += audit_64.o
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index 19d3d6e9d09b..7335959b6aff 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -1,7 +1,14 @@
+subdir- := realmode
+
obj-$(CONFIG_ACPI) += boot.o
-obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o
+obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o
ifneq ($(CONFIG_ACPI_PROCESSOR),)
obj-y += cstate.o processor.o
endif
+$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin
+
+$(obj)/realmode/wakeup.bin: FORCE
+ $(Q)$(MAKE) $(build)=$(obj)/realmode $@
+
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2cdc9de9371d..977ed5cdeaa3 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -39,6 +39,11 @@
#include <asm/apic.h>
#include <asm/io.h>
#include <asm/mpspec.h>
+#include <asm/smp.h>
+
+#ifdef CONFIG_X86_LOCAL_APIC
+# include <mach_apic.h>
+#endif
static int __initdata acpi_force = 0;
@@ -52,9 +57,7 @@ EXPORT_SYMBOL(acpi_disabled);
#ifdef CONFIG_X86_64
#include <asm/proto.h>
-
-static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 0; }
-
+#include <asm/genapic.h>
#else /* X86 */
@@ -111,7 +114,7 @@ char *__init __acpi_map_table(unsigned long phys_addr, unsigned long size)
if (!phys_addr || !size)
return NULL;
- if (phys_addr+size <= (end_pfn_map << PAGE_SHIFT) + PAGE_SIZE)
+ if (phys_addr+size <= (max_pfn_mapped << PAGE_SHIFT) + PAGE_SIZE)
return __va(phys_addr);
return NULL;
@@ -237,6 +240,16 @@ static int __init acpi_parse_madt(struct acpi_table_header *table)
return 0;
}
+static void __cpuinit acpi_register_lapic(int id, u8 enabled)
+{
+ if (!enabled) {
+ ++disabled_cpus;
+ return;
+ }
+
+ generic_processor_info(id, 0);
+}
+
static int __init
acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
{
@@ -256,8 +269,26 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end)
* to not preallocating memory for all NR_CPUS
* when we use CPU hotplug.
*/
- mp_register_lapic(processor->id, /* APIC ID */
- processor->lapic_flags & ACPI_MADT_ENABLED); /* Enabled? */
+ acpi_register_lapic(processor->id, /* APIC ID */
+ processor->lapic_flags & ACPI_MADT_ENABLED);
+
+ return 0;
+}
+
+static int __init
+acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end)
+{
+ struct acpi_madt_local_sapic *processor = NULL;
+
+ processor = (struct acpi_madt_local_sapic *)header;
+
+ if (BAD_MADT_ENTRY(processor, end))
+ return -EINVAL;
+
+ acpi_table_print_madt_entry(header);
+
+ acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */
+ processor->lapic_flags & ACPI_MADT_ENABLED);
return 0;
}
@@ -300,6 +331,8 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
#ifdef CONFIG_X86_IO_APIC
+struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
+
static int __init
acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
{
@@ -532,7 +565,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
buffer.pointer = NULL;
tmp_map = cpu_present_map;
- mp_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
+ acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED);
/*
* If mp_register_lapic successfully generates a new logical cpu
@@ -664,10 +697,6 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
#define HPET_RESOURCE_NAME_SIZE 9
hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE);
- if (!hpet_res)
- return 0;
-
- memset(hpet_res, 0, sizeof(*hpet_res));
hpet_res->name = (void *)&hpet_res[1];
hpet_res->flags = IORESOURCE_MEM;
snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, "HPET %u",
@@ -732,6 +761,16 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
* Parse LAPIC entries in MADT
* returns 0 on success, < 0 on error
*/
+
+static void __init acpi_register_lapic_address(unsigned long address)
+{
+ mp_lapic_addr = address;
+
+ set_fixmap_nocache(FIX_APIC_BASE, address);
+ if (boot_cpu_physical_apicid == -1U)
+ boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
+}
+
static int __init acpi_parse_madt_lapic_entries(void)
{
int count;
@@ -753,10 +792,14 @@ static int __init acpi_parse_madt_lapic_entries(void)
return count;
}
- mp_register_lapic_address(acpi_lapic_addr);
+ acpi_register_lapic_address(acpi_lapic_addr);
+
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
+ acpi_parse_sapic, MAX_APICS);
- count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC, acpi_parse_lapic,
- MAX_APICS);
+ if (!count)
+ count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
+ acpi_parse_lapic, MAX_APICS);
if (!count) {
printk(KERN_ERR PREFIX "No LAPIC entries present\n");
/* TBD: Cleanup to allow fallback to MPS */
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 8ca3557a6d59..c2502eb9aa83 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -1,6 +1,4 @@
/*
- * arch/i386/kernel/acpi/cstate.c
- *
* Copyright (C) 2005 Intel Corporation
* Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
* - Added _PDC for SMP C-states on Intel CPUs
@@ -93,7 +91,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
/* Make sure we are running on right CPU */
saved_mask = current->cpus_allowed;
- retval = set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
if (retval)
return -1;
@@ -130,7 +128,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
cx->address);
out:
- set_cpus_allowed(current, saved_mask);
+ set_cpus_allowed_ptr(current, &saved_mask);
return retval;
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
index 324eb0cab19c..de2d2e4ebad9 100644
--- a/arch/x86/kernel/acpi/processor.c
+++ b/arch/x86/kernel/acpi/processor.c
@@ -1,6 +1,4 @@
/*
- * arch/i386/kernel/acpi/processor.c
- *
* Copyright (C) 2005 Intel Corporation
* Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
* - Added _PDC for platforms with Intel CPUs
diff --git a/arch/x86/kernel/acpi/realmode/.gitignore b/arch/x86/kernel/acpi/realmode/.gitignore
new file mode 100644
index 000000000000..58f1f48a58f8
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/.gitignore
@@ -0,0 +1,3 @@
+wakeup.bin
+wakeup.elf
+wakeup.lds
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
new file mode 100644
index 000000000000..092900854acc
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -0,0 +1,57 @@
+#
+# arch/x86/kernel/acpi/realmode/Makefile
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License. See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+
+targets := wakeup.bin wakeup.elf
+
+wakeup-y += wakeup.o wakemain.o video-mode.o copy.o
+
+# The link order of the video-*.o modules can matter. In particular,
+# video-vga.o *must* be listed first, followed by video-vesa.o.
+# Hardware-specific drivers should follow in the order they should be
+# probed, and video-bios.o should typically be last.
+wakeup-y += video-vga.o
+wakeup-y += video-vesa.o
+wakeup-y += video-bios.o
+
+targets += $(wakeup-y)
+
+bootsrc := $(src)/../../../boot
+
+# ---------------------------------------------------------------------------
+
+# How to compile the 16-bit code. Note we always compile for -march=i386,
+# that way we can complain to the user if the CPU is insufficient.
+# Compile with _SETUP since this is similar to the boot-time setup code.
+KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D_WAKEUP -D__KERNEL__ \
+ -I$(srctree)/$(bootsrc) \
+ $(cflags-y) \
+ -Wall -Wstrict-prototypes \
+ -march=i386 -mregparm=3 \
+ -include $(srctree)/$(bootsrc)/code16gcc.h \
+ -fno-strict-aliasing -fomit-frame-pointer \
+ $(call cc-option, -ffreestanding) \
+ $(call cc-option, -fno-toplevel-reorder,\
+ $(call cc-option, -fno-unit-at-a-time)) \
+ $(call cc-option, -fno-stack-protector) \
+ $(call cc-option, -mpreferred-stack-boundary=2)
+KBUILD_CFLAGS += $(call cc-option, -m32)
+KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
+
+WAKEUP_OBJS = $(addprefix $(obj)/,$(wakeup-y))
+
+LDFLAGS_wakeup.elf := -T
+
+CPPFLAGS_wakeup.lds += -P -C
+
+$(obj)/wakeup.elf: $(src)/wakeup.lds $(WAKEUP_OBJS) FORCE
+ $(call if_changed,ld)
+
+OBJCOPYFLAGS_wakeup.bin := -O binary
+
+$(obj)/wakeup.bin: $(obj)/wakeup.elf FORCE
+ $(call if_changed,objcopy)
diff --git a/arch/x86/kernel/acpi/realmode/copy.S b/arch/x86/kernel/acpi/realmode/copy.S
new file mode 100644
index 000000000000..dc59ebee69d8
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/copy.S
@@ -0,0 +1 @@
+#include "../../../boot/copy.S"
diff --git a/arch/x86/kernel/acpi/realmode/video-bios.c b/arch/x86/kernel/acpi/realmode/video-bios.c
new file mode 100644
index 000000000000..7deabc144a27
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/video-bios.c
@@ -0,0 +1 @@
+#include "../../../boot/video-bios.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-mode.c b/arch/x86/kernel/acpi/realmode/video-mode.c
new file mode 100644
index 000000000000..328ad209f113
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/video-mode.c
@@ -0,0 +1 @@
+#include "../../../boot/video-mode.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vesa.c b/arch/x86/kernel/acpi/realmode/video-vesa.c
new file mode 100644
index 000000000000..9dbb9672226a
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/video-vesa.c
@@ -0,0 +1 @@
+#include "../../../boot/video-vesa.c"
diff --git a/arch/x86/kernel/acpi/realmode/video-vga.c b/arch/x86/kernel/acpi/realmode/video-vga.c
new file mode 100644
index 000000000000..bcc81255f374
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/video-vga.c
@@ -0,0 +1 @@
+#include "../../../boot/video-vga.c"
diff --git a/arch/x86/kernel/acpi/realmode/wakemain.c b/arch/x86/kernel/acpi/realmode/wakemain.c
new file mode 100644
index 000000000000..883962d9eef2
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/wakemain.c
@@ -0,0 +1,81 @@
+#include "wakeup.h"
+#include "boot.h"
+
+static void udelay(int loops)
+{
+ while (loops--)
+ io_delay(); /* Approximately 1 us */
+}
+
+static void beep(unsigned int hz)
+{
+ u8 enable;
+
+ if (!hz) {
+ enable = 0x00; /* Turn off speaker */
+ } else {
+ u16 div = 1193181/hz;
+
+ outb(0xb6, 0x43); /* Ctr 2, squarewave, load, binary */
+ io_delay();
+ outb(div, 0x42); /* LSB of counter */
+ io_delay();
+ outb(div >> 8, 0x42); /* MSB of counter */
+ io_delay();
+
+ enable = 0x03; /* Turn on speaker */
+ }
+ inb(0x61); /* Dummy read of System Control Port B */
+ io_delay();
+ outb(enable, 0x61); /* Enable timer 2 output to speaker */
+ io_delay();
+}
+
+#define DOT_HZ 880
+#define DASH_HZ 587
+#define US_PER_DOT 125000
+
+/* Okay, this is totally silly, but it's kind of fun. */
+static void send_morse(const char *pattern)
+{
+ char s;
+
+ while ((s = *pattern++)) {
+ switch (s) {
+ case '.':
+ beep(DOT_HZ);
+ udelay(US_PER_DOT);
+ beep(0);
+ udelay(US_PER_DOT);
+ break;
+ case '-':
+ beep(DASH_HZ);
+ udelay(US_PER_DOT * 3);
+ beep(0);
+ udelay(US_PER_DOT);
+ break;
+ default: /* Assume it's a space */
+ udelay(US_PER_DOT * 3);
+ break;
+ }
+ }
+}
+
+void main(void)
+{
+ /* Kill machine if structures are wrong */
+ if (wakeup_header.real_magic != 0x12345678)
+ while (1);
+
+ if (wakeup_header.realmode_flags & 4)
+ send_morse("...-");
+
+ if (wakeup_header.realmode_flags & 1)
+ asm volatile("lcallw $0xc000,$3");
+
+ if (wakeup_header.realmode_flags & 2) {
+ /* Need to call BIOS */
+ probe_cards(0);
+ set_mode(wakeup_header.video_mode);
+ }
+}
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
new file mode 100644
index 000000000000..f9b77fb37e5b
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -0,0 +1,113 @@
+/*
+ * ACPI wakeup real mode startup stub
+ */
+#include <asm/segment.h>
+#include <asm/msr-index.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+
+ .code16
+ .section ".header", "a"
+
+/* This should match the structure in wakeup.h */
+ .globl wakeup_header
+wakeup_header:
+video_mode: .short 0 /* Video mode number */
+pmode_return: .byte 0x66, 0xea /* ljmpl */
+ .long 0 /* offset goes here */
+ .short __KERNEL_CS
+pmode_cr0: .long 0 /* Saved %cr0 */
+pmode_cr3: .long 0 /* Saved %cr3 */
+pmode_cr4: .long 0 /* Saved %cr4 */
+pmode_efer: .quad 0 /* Saved EFER */
+pmode_gdt: .quad 0
+realmode_flags: .long 0
+real_magic: .long 0
+trampoline_segment: .word 0
+signature: .long 0x51ee1111
+
+ .text
+ .globl _start
+ .code16
+wakeup_code:
+_start:
+ cli
+ cld
+
+ /* Set up segments */
+ movw %cs, %ax
+ movw %ax, %ds
+ movw %ax, %es
+ movw %ax, %ss
+
+ movl $wakeup_stack_end, %esp
+
+ /* Clear the EFLAGS */
+ pushl $0
+ popfl
+
+ /* Check header signature... */
+ movl signature, %eax
+ cmpl $0x51ee1111, %eax
+ jne bogus_real_magic
+
+ /* Check we really have everything... */
+ movl end_signature, %eax
+ cmpl $0x65a22c82, %eax
+ jne bogus_real_magic
+
+ /* Call the C code */
+ calll main
+
+ /* Do any other stuff... */
+
+#ifndef CONFIG_64BIT
+ /* This could also be done in C code... */
+ movl pmode_cr3, %eax
+ movl %eax, %cr3
+
+ movl pmode_cr4, %ecx
+ jecxz 1f
+ movl %ecx, %cr4
+1:
+ movl pmode_efer, %eax
+ movl pmode_efer + 4, %edx
+ movl %eax, %ecx
+ orl %edx, %ecx
+ jz 1f
+ movl $0xc0000080, %ecx
+ wrmsr
+1:
+
+ lgdtl pmode_gdt
+
+ /* This really couldn't... */
+ movl pmode_cr0, %eax
+ movl %eax, %cr0
+ jmp pmode_return
+#else
+ pushw $0
+ pushw trampoline_segment
+ pushw $0
+ lret
+#endif
+
+bogus_real_magic:
+1:
+ hlt
+ jmp 1b
+
+ .data
+ .balign 4
+ .globl HEAP, heap_end
+HEAP:
+ .long wakeup_heap
+heap_end:
+ .long wakeup_stack
+
+ .bss
+wakeup_heap:
+ .space 2048
+wakeup_stack:
+ .space 2048
+wakeup_stack_end:
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
new file mode 100644
index 000000000000..ef8166fe8020
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -0,0 +1,36 @@
+/*
+ * Definitions for the wakeup data structure at the head of the
+ * wakeup code.
+ */
+
+#ifndef ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
+#define ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+/* This must match data at wakeup.S */
+struct wakeup_header {
+ u16 video_mode; /* Video mode number */
+ u16 _jmp1; /* ljmpl opcode, 32-bit only */
+ u32 pmode_entry; /* Protected mode resume point, 32-bit only */
+ u16 _jmp2; /* CS value, 32-bit only */
+ u32 pmode_cr0; /* Protected mode cr0 */
+ u32 pmode_cr3; /* Protected mode cr3 */
+ u32 pmode_cr4; /* Protected mode cr4 */
+ u32 pmode_efer_low; /* Protected mode EFER */
+ u32 pmode_efer_high;
+ u64 pmode_gdt;
+ u32 realmode_flags;
+ u32 real_magic;
+ u16 trampoline_segment; /* segment with trampoline code, 64-bit only */
+ u32 signature; /* To check we have correct structure */
+} __attribute__((__packed__));
+
+extern struct wakeup_header wakeup_header;
+#endif
+
+#define HEADER_OFFSET 0x3f00
+#define WAKEUP_SIZE 0x4000
+
+#endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
new file mode 100644
index 000000000000..22fab6c4be15
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -0,0 +1,61 @@
+/*
+ * wakeup.ld
+ *
+ * Linker script for the real-mode wakeup code
+ */
+#undef i386
+#include "wakeup.h"
+
+OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
+OUTPUT_ARCH(i386)
+ENTRY(_start)
+
+SECTIONS
+{
+ . = HEADER_OFFSET;
+ .header : {
+ *(.header)
+ }
+
+ . = 0;
+ .text : {
+ *(.text*)
+ }
+
+ . = ALIGN(16);
+ .rodata : {
+ *(.rodata*)
+ }
+
+ .videocards : {
+ video_cards = .;
+ *(.videocards)
+ video_cards_end = .;
+ }
+
+ . = ALIGN(16);
+ .data : {
+ *(.data*)
+ }
+
+ .signature : {
+ end_signature = .;
+ LONG(0x65a22c82)
+ }
+
+ . = ALIGN(16);
+ .bss : {
+ __bss_start = .;
+ *(.bss)
+ __bss_end = .;
+ }
+
+ . = ALIGN(16);
+ _end = .;
+
+ /DISCARD/ : {
+ *(.note*)
+ }
+
+ . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
+}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 6bc815cd8cb3..afc25ee9964b 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -10,30 +10,72 @@
#include <linux/dmi.h>
#include <linux/cpumask.h>
-#include <asm/smp.h>
+#include "realmode/wakeup.h"
+#include "sleep.h"
-/* address in low memory of the wakeup routine. */
-unsigned long acpi_wakeup_address = 0;
+unsigned long acpi_wakeup_address;
unsigned long acpi_realmode_flags;
-extern char wakeup_start, wakeup_end;
-extern unsigned long acpi_copy_wakeup_routine(unsigned long);
+/* address in low memory of the wakeup routine. */
+static unsigned long acpi_realmode;
+
+#ifdef CONFIG_64BIT
+static char temp_stack[10240];
+#endif
/**
* acpi_save_state_mem - save kernel state
*
* Create an identity mapped page table and copy the wakeup routine to
* low memory.
+ *
+ * Note that this is too late to change acpi_wakeup_address.
*/
int acpi_save_state_mem(void)
{
- if (!acpi_wakeup_address) {
- printk(KERN_ERR "Could not allocate memory during boot, S3 disabled\n");
+ struct wakeup_header *header;
+
+ if (!acpi_realmode) {
+ printk(KERN_ERR "Could not allocate memory during boot, "
+ "S3 disabled\n");
return -ENOMEM;
}
- memcpy((void *)acpi_wakeup_address, &wakeup_start,
- &wakeup_end - &wakeup_start);
- acpi_copy_wakeup_routine(acpi_wakeup_address);
+ memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE);
+
+ header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET);
+ if (header->signature != 0x51ee1111) {
+ printk(KERN_ERR "wakeup header does not match\n");
+ return -EINVAL;
+ }
+
+ header->video_mode = saved_video_mode;
+
+#ifndef CONFIG_64BIT
+ store_gdt((struct desc_ptr *)&header->pmode_gdt);
+
+ header->pmode_efer_low = nx_enabled;
+ if (header->pmode_efer_low & 1) {
+ /* This is strange, why not save efer, always? */
+ rdmsr(MSR_EFER, header->pmode_efer_low,
+ header->pmode_efer_high);
+ }
+#endif /* !CONFIG_64BIT */
+
+ header->pmode_cr0 = read_cr0();
+ header->pmode_cr4 = read_cr4();
+ header->realmode_flags = acpi_realmode_flags;
+ header->real_magic = 0x12345678;
+
+#ifndef CONFIG_64BIT
+ header->pmode_entry = (u32)&wakeup_pmode_return;
+ header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET);
+ saved_magic = 0x12345678;
+#else /* CONFIG_64BIT */
+ header->trampoline_segment = setup_trampoline() >> 4;
+ init_rsp = (unsigned long)temp_stack + 4096;
+ initial_code = (unsigned long)wakeup_long64;
+ saved_magic = 0x123456789abcdef0;
+#endif /* CONFIG_64BIT */
return 0;
}
@@ -56,15 +98,20 @@ void acpi_restore_state_mem(void)
*/
void __init acpi_reserve_bootmem(void)
{
- if ((&wakeup_end - &wakeup_start) > PAGE_SIZE*2) {
+ if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
printk(KERN_ERR
"ACPI: Wakeup code way too big, S3 disabled.\n");
return;
}
- acpi_wakeup_address = (unsigned long)alloc_bootmem_low(PAGE_SIZE*2);
- if (!acpi_wakeup_address)
+ acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE);
+
+ if (!acpi_realmode) {
printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
+ return;
+ }
+
+ acpi_wakeup_address = acpi_realmode;
}
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
new file mode 100644
index 000000000000..adbcbaa6f1df
--- /dev/null
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -0,0 +1,16 @@
+/*
+ * Variables and functions used by the code in sleep.c
+ */
+
+#include <asm/trampoline.h>
+
+extern char wakeup_code_start, wakeup_code_end;
+
+extern unsigned long saved_video_mode;
+extern long saved_magic;
+
+extern int wakeup_pmode_return;
+extern char swsusp_pg_dir[PAGE_SIZE];
+
+extern unsigned long acpi_copy_wakeup_routine(unsigned long);
+extern void wakeup_long64(void);
diff --git a/arch/x86/kernel/acpi/sleep_32.c b/arch/x86/kernel/acpi/sleep_32.c
deleted file mode 100644
index 63fe5525e026..000000000000
--- a/arch/x86/kernel/acpi/sleep_32.c
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * sleep.c - x86-specific ACPI sleep support.
- *
- * Copyright (C) 2001-2003 Patrick Mochel
- * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
- */
-
-#include <linux/acpi.h>
-#include <linux/bootmem.h>
-#include <linux/dmi.h>
-#include <linux/cpumask.h>
-
-#include <asm/smp.h>
-
-/* Ouch, we want to delete this. We already have better version in userspace, in
- s2ram from suspend.sf.net project */
-static __init int reset_videomode_after_s3(const struct dmi_system_id *d)
-{
- acpi_realmode_flags |= 2;
- return 0;
-}
-
-static __initdata struct dmi_system_id acpisleep_dmi_table[] = {
- { /* Reset video mode after returning from ACPI S3 sleep */
- .callback = reset_videomode_after_s3,
- .ident = "Toshiba Satellite 4030cdt",
- .matches = {
- DMI_MATCH(DMI_PRODUCT_NAME, "S4030CDT/4.3"),
- },
- },
- {}
-};
-
-static int __init acpisleep_dmi_init(void)
-{
- dmi_check_system(acpisleep_dmi_table);
- return 0;
-}
-
-core_initcall(acpisleep_dmi_init);
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index f53e3277f8e5..a12e6a9fb659 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -3,178 +3,12 @@
#include <asm/segment.h>
#include <asm/page.h>
-#
-# wakeup_code runs in real mode, and at unknown address (determined at run-time).
-# Therefore it must only use relative jumps/calls.
-#
-# Do we need to deal with A20? It is okay: ACPI specs says A20 must be enabled
-#
-# If physical address of wakeup_code is 0x12345, BIOS should call us with
-# cs = 0x1234, eip = 0x05
-#
-
-#define BEEP \
- inb $97, %al; \
- outb %al, $0x80; \
- movb $3, %al; \
- outb %al, $97; \
- outb %al, $0x80; \
- movb $-74, %al; \
- outb %al, $67; \
- outb %al, $0x80; \
- movb $-119, %al; \
- outb %al, $66; \
- outb %al, $0x80; \
- movb $15, %al; \
- outb %al, $66;
-
-ALIGN
- .align 4096
-ENTRY(wakeup_start)
-wakeup_code:
- wakeup_code_start = .
- .code16
-
- cli
- cld
-
- # setup data segment
- movw %cs, %ax
- movw %ax, %ds # Make ds:0 point to wakeup_start
- movw %ax, %ss
-
- testl $4, realmode_flags - wakeup_code
- jz 1f
- BEEP
-1:
- mov $(wakeup_stack - wakeup_code), %sp # Private stack is needed for ASUS board
-
- pushl $0 # Kill any dangerous flags
- popfl
-
- movl real_magic - wakeup_code, %eax
- cmpl $0x12345678, %eax
- jne bogus_real_magic
-
- testl $1, realmode_flags - wakeup_code
- jz 1f
- lcall $0xc000,$3
- movw %cs, %ax
- movw %ax, %ds # Bios might have played with that
- movw %ax, %ss
-1:
-
- testl $2, realmode_flags - wakeup_code
- jz 1f
- mov video_mode - wakeup_code, %ax
- call mode_set
-1:
-
- # set up page table
- movl $swsusp_pg_dir-__PAGE_OFFSET, %eax
- movl %eax, %cr3
-
- testl $1, real_efer_save_restore - wakeup_code
- jz 4f
- # restore efer setting
- movl real_save_efer_edx - wakeup_code, %edx
- movl real_save_efer_eax - wakeup_code, %eax
- mov $0xc0000080, %ecx
- wrmsr
-4:
- # make sure %cr4 is set correctly (features, etc)
- movl real_save_cr4 - wakeup_code, %eax
- movl %eax, %cr4
-
- # need a gdt -- use lgdtl to force 32-bit operands, in case
- # the GDT is located past 16 megabytes.
- lgdtl real_save_gdt - wakeup_code
-
- movl real_save_cr0 - wakeup_code, %eax
- movl %eax, %cr0
- jmp 1f
-1:
- movl real_magic - wakeup_code, %eax
- cmpl $0x12345678, %eax
- jne bogus_real_magic
-
- testl $8, realmode_flags - wakeup_code
- jz 1f
- BEEP
-1:
- ljmpl $__KERNEL_CS, $wakeup_pmode_return
-
-real_save_gdt: .word 0
- .long 0
-real_save_cr0: .long 0
-real_save_cr3: .long 0
-real_save_cr4: .long 0
-real_magic: .long 0
-video_mode: .long 0
-realmode_flags: .long 0
-real_efer_save_restore: .long 0
-real_save_efer_edx: .long 0
-real_save_efer_eax: .long 0
-
-bogus_real_magic:
- jmp bogus_real_magic
-
-/* This code uses an extended set of video mode numbers. These include:
- * Aliases for standard modes
- * NORMAL_VGA (-1)
- * EXTENDED_VGA (-2)
- * ASK_VGA (-3)
- * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
- * of compatibility when extending the table. These are between 0x00 and 0xff.
- */
-#define VIDEO_FIRST_MENU 0x0000
-
-/* Standard BIOS video modes (BIOS number + 0x0100) */
-#define VIDEO_FIRST_BIOS 0x0100
-
-/* VESA BIOS video modes (VESA number + 0x0200) */
-#define VIDEO_FIRST_VESA 0x0200
-
-/* Video7 special modes (BIOS number + 0x0900) */
-#define VIDEO_FIRST_V7 0x0900
-
-# Setting of user mode (AX=mode ID) => CF=success
-
-# For now, we only handle VESA modes (0x0200..0x03ff). To handle other
-# modes, we should probably compile in the video code from the boot
-# directory.
-mode_set:
- movw %ax, %bx
- subb $VIDEO_FIRST_VESA>>8, %bh
- cmpb $2, %bh
- jb check_vesa
-
-setbad:
- clc
- ret
-
-check_vesa:
- orw $0x4000, %bx # Use linear frame buffer
- movw $0x4f02, %ax # VESA BIOS mode set call
- int $0x10
- cmpw $0x004f, %ax # AL=4f if implemented
- jnz setbad # AH=0 if OK
-
- stc
- ret
+# Copyright 2003, 2008 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
.code32
ALIGN
-.org 0x800
-wakeup_stack_begin: # Stack grows down
-
-.org 0xff0 # Just below end of page
-wakeup_stack:
-ENTRY(wakeup_end)
-
-.org 0x1000
-
+ENTRY(wakeup_pmode_return)
wakeup_pmode_return:
movw $__KERNEL_DS, %ax
movw %ax, %ss
@@ -187,7 +21,7 @@ wakeup_pmode_return:
lgdt saved_gdt
lidt saved_idt
lldt saved_ldt
- ljmp $(__KERNEL_CS),$1f
+ ljmp $(__KERNEL_CS), $1f
1:
movl %cr3, %eax
movl %eax, %cr3
@@ -201,82 +35,41 @@ wakeup_pmode_return:
jne bogus_magic
# jump to place where we left off
- movl saved_eip,%eax
+ movl saved_eip, %eax
jmp *%eax
bogus_magic:
jmp bogus_magic
-##
-# acpi_copy_wakeup_routine
-#
-# Copy the above routine to low memory.
-#
-# Parameters:
-# %eax: place to copy wakeup routine to
-#
-# Returned address is location of code in low memory (past data and stack)
-#
-ENTRY(acpi_copy_wakeup_routine)
- pushl %ebx
+save_registers:
sgdt saved_gdt
sidt saved_idt
sldt saved_ldt
str saved_tss
- movl nx_enabled, %edx
- movl %edx, real_efer_save_restore - wakeup_start (%eax)
- testl $1, real_efer_save_restore - wakeup_start (%eax)
- jz 2f
- # save efer setting
- pushl %eax
- movl %eax, %ebx
- mov $0xc0000080, %ecx
- rdmsr
- movl %edx, real_save_efer_edx - wakeup_start (%ebx)
- movl %eax, real_save_efer_eax - wakeup_start (%ebx)
- popl %eax
-2:
-
- movl %cr3, %edx
- movl %edx, real_save_cr3 - wakeup_start (%eax)
- movl %cr4, %edx
- movl %edx, real_save_cr4 - wakeup_start (%eax)
- movl %cr0, %edx
- movl %edx, real_save_cr0 - wakeup_start (%eax)
- sgdt real_save_gdt - wakeup_start (%eax)
-
- movl saved_videomode, %edx
- movl %edx, video_mode - wakeup_start (%eax)
- movl acpi_realmode_flags, %edx
- movl %edx, realmode_flags - wakeup_start (%eax)
- movl $0x12345678, real_magic - wakeup_start (%eax)
- movl $0x12345678, saved_magic
- popl %ebx
- ret
-
-save_registers:
leal 4(%esp), %eax
movl %eax, saved_context_esp
- movl %ebx, saved_context_ebx
- movl %ebp, saved_context_ebp
- movl %esi, saved_context_esi
- movl %edi, saved_context_edi
- pushfl ; popl saved_context_eflags
-
- movl $ret_point, saved_eip
+ movl %ebx, saved_context_ebx
+ movl %ebp, saved_context_ebp
+ movl %esi, saved_context_esi
+ movl %edi, saved_context_edi
+ pushfl
+ popl saved_context_eflags
+
+ movl $ret_point, saved_eip
ret
restore_registers:
- movl saved_context_ebp, %ebp
- movl saved_context_ebx, %ebx
- movl saved_context_esi, %esi
- movl saved_context_edi, %edi
- pushl saved_context_eflags ; popfl
- ret
+ movl saved_context_ebp, %ebp
+ movl saved_context_ebx, %ebx
+ movl saved_context_esi, %esi
+ movl saved_context_edi, %edi
+ pushl saved_context_eflags
+ popfl
+ ret
ENTRY(do_suspend_lowlevel)
call save_processor_state
diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S
index 2e1b9e0d0767..bcc293423a70 100644
--- a/arch/x86/kernel/acpi/wakeup_64.S
+++ b/arch/x86/kernel/acpi/wakeup_64.S
@@ -7,191 +7,18 @@
#include <asm/asm-offsets.h>
# Copyright 2003 Pavel Machek <pavel@suse.cz>, distribute under GPLv2
-#
-# wakeup_code runs in real mode, and at unknown address (determined at run-time).
-# Therefore it must only use relative jumps/calls.
-#
-# Do we need to deal with A20? It is okay: ACPI specs says A20 must be enabled
-#
-# If physical address of wakeup_code is 0x12345, BIOS should call us with
-# cs = 0x1234, eip = 0x05
-#
-
-#define BEEP \
- inb $97, %al; \
- outb %al, $0x80; \
- movb $3, %al; \
- outb %al, $97; \
- outb %al, $0x80; \
- movb $-74, %al; \
- outb %al, $67; \
- outb %al, $0x80; \
- movb $-119, %al; \
- outb %al, $66; \
- outb %al, $0x80; \
- movb $15, %al; \
- outb %al, $66;
-
-
-ALIGN
- .align 16
-ENTRY(wakeup_start)
-wakeup_code:
- wakeup_code_start = .
- .code16
-
-# Running in *copy* of this code, somewhere in low 1MB.
-
- cli
- cld
- # setup data segment
- movw %cs, %ax
- movw %ax, %ds # Make ds:0 point to wakeup_start
- movw %ax, %ss
-
- # Data segment must be set up before we can see whether to beep.
- testl $4, realmode_flags - wakeup_code
- jz 1f
- BEEP
-1:
-
- # Private stack is needed for ASUS board
- mov $(wakeup_stack - wakeup_code), %sp
-
- pushl $0 # Kill any dangerous flags
- popfl
-
- movl real_magic - wakeup_code, %eax
- cmpl $0x12345678, %eax
- jne bogus_real_magic
-
- testl $1, realmode_flags - wakeup_code
- jz 1f
- lcall $0xc000,$3
- movw %cs, %ax
- movw %ax, %ds # Bios might have played with that
- movw %ax, %ss
-1:
-
- testl $2, realmode_flags - wakeup_code
- jz 1f
- mov video_mode - wakeup_code, %ax
- call mode_set
-1:
-
- mov %ds, %ax # Find 32bit wakeup_code addr
- movzx %ax, %esi # (Convert %ds:gdt to a liner ptr)
- shll $4, %esi
- # Fix up the vectors
- addl %esi, wakeup_32_vector - wakeup_code
- addl %esi, wakeup_long64_vector - wakeup_code
- addl %esi, gdt_48a + 2 - wakeup_code # Fixup the gdt pointer
-
- lidtl %ds:idt_48a - wakeup_code
- lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is
- # appropriate
-
- movl $1, %eax # protected mode (PE) bit
- lmsw %ax # This is it!
- jmp 1f
-1:
-
- ljmpl *(wakeup_32_vector - wakeup_code)
-
- .balign 4
-wakeup_32_vector:
- .long wakeup_32 - wakeup_code
- .word __KERNEL32_CS, 0
-
- .code32
-wakeup_32:
-# Running in this code, but at low address; paging is not yet turned on.
-
- movl $__KERNEL_DS, %eax
- movl %eax, %ds
-
- /*
- * Prepare for entering 64bits mode
- */
-
- /* Enable PAE */
- xorl %eax, %eax
- btsl $5, %eax
- movl %eax, %cr4
-
- /* Setup early boot stage 4 level pagetables */
- leal (wakeup_level4_pgt - wakeup_code)(%esi), %eax
- movl %eax, %cr3
-
- /* Check if nx is implemented */
- movl $0x80000001, %eax
- cpuid
- movl %edx,%edi
-
- /* Enable Long Mode */
- xorl %eax, %eax
- btsl $_EFER_LME, %eax
-
- /* No Execute supported? */
- btl $20,%edi
- jnc 1f
- btsl $_EFER_NX, %eax
-
- /* Make changes effective */
-1: movl $MSR_EFER, %ecx
- xorl %edx, %edx
- wrmsr
-
- xorl %eax, %eax
- btsl $31, %eax /* Enable paging and in turn activate Long Mode */
- btsl $0, %eax /* Enable protected mode */
-
- /* Make changes effective */
- movl %eax, %cr0
-
- /* At this point:
- CR4.PAE must be 1
- CS.L must be 0
- CR3 must point to PML4
- Next instruction must be a branch
- This must be on identity-mapped page
- */
- /*
- * At this point we're in long mode but in 32bit compatibility mode
- * with EFER.LME = 1, CS.L = 0, CS.D = 1 (and in turn
- * EFER.LMA = 1). Now we want to jump in 64bit mode, to do that we load
- * the new gdt/idt that has __KERNEL_CS with CS.L = 1.
- */
-
- /* Finally jump in 64bit mode */
- ljmp *(wakeup_long64_vector - wakeup_code)(%esi)
-
- .balign 4
-wakeup_long64_vector:
- .long wakeup_long64 - wakeup_code
- .word __KERNEL_CS, 0
.code64
-
- /* Hooray, we are in Long 64-bit mode (but still running in
- * low memory)
- */
-wakeup_long64:
/*
- * We must switch to a new descriptor in kernel space for the GDT
- * because soon the kernel won't have access anymore to the userspace
- * addresses where we're currently running on. We have to do that here
- * because in 32bit we couldn't load a 64bit linear address.
+ * Hooray, we are in Long 64-bit mode (but still running in low memory)
*/
- lgdt cpu_gdt_descr
-
- movq saved_magic, %rax
- movq $0x123456789abcdef0, %rdx
- cmpq %rdx, %rax
- jne bogus_64_magic
+ENTRY(wakeup_long64)
+wakeup_long64:
+ movq saved_magic, %rax
+ movq $0x123456789abcdef0, %rdx
+ cmpq %rdx, %rax
+ jne bogus_64_magic
- nop
- nop
movw $__KERNEL_DS, %ax
movw %ax, %ss
movw %ax, %ds
@@ -208,130 +35,8 @@ wakeup_long64:
movq saved_rip, %rax
jmp *%rax
-.code32
-
- .align 64
-gdta:
- /* Its good to keep gdt in sync with one in trampoline.S */
- .word 0, 0, 0, 0 # dummy
- /* ??? Why I need the accessed bit set in order for this to work? */
- .quad 0x00cf9b000000ffff # __KERNEL32_CS
- .quad 0x00af9b000000ffff # __KERNEL_CS
- .quad 0x00cf93000000ffff # __KERNEL_DS
-
-idt_48a:
- .word 0 # idt limit = 0
- .word 0, 0 # idt base = 0L
-
-gdt_48a:
- .word 0x800 # gdt limit=2048,
- # 256 GDT entries
- .long gdta - wakeup_code # gdt base (relocated in later)
-
-real_magic: .quad 0
-video_mode: .quad 0
-realmode_flags: .quad 0
-
-.code16
-bogus_real_magic:
- jmp bogus_real_magic
-
-.code64
bogus_64_magic:
- jmp bogus_64_magic
-
-/* This code uses an extended set of video mode numbers. These include:
- * Aliases for standard modes
- * NORMAL_VGA (-1)
- * EXTENDED_VGA (-2)
- * ASK_VGA (-3)
- * Video modes numbered by menu position -- NOT RECOMMENDED because of lack
- * of compatibility when extending the table. These are between 0x00 and 0xff.
- */
-#define VIDEO_FIRST_MENU 0x0000
-
-/* Standard BIOS video modes (BIOS number + 0x0100) */
-#define VIDEO_FIRST_BIOS 0x0100
-
-/* VESA BIOS video modes (VESA number + 0x0200) */
-#define VIDEO_FIRST_VESA 0x0200
-
-/* Video7 special modes (BIOS number + 0x0900) */
-#define VIDEO_FIRST_V7 0x0900
-
-# Setting of user mode (AX=mode ID) => CF=success
-
-# For now, we only handle VESA modes (0x0200..0x03ff). To handle other
-# modes, we should probably compile in the video code from the boot
-# directory.
-.code16
-mode_set:
- movw %ax, %bx
- subb $VIDEO_FIRST_VESA>>8, %bh
- cmpb $2, %bh
- jb check_vesa
-
-setbad:
- clc
- ret
-
-check_vesa:
- orw $0x4000, %bx # Use linear frame buffer
- movw $0x4f02, %ax # VESA BIOS mode set call
- int $0x10
- cmpw $0x004f, %ax # AL=4f if implemented
- jnz setbad # AH=0 if OK
-
- stc
- ret
-
-wakeup_stack_begin: # Stack grows down
-
-.org 0xff0
-wakeup_stack: # Just below end of page
-
-.org 0x1000
-ENTRY(wakeup_level4_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .fill 510,8,0
- /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad level3_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
-
-ENTRY(wakeup_end)
-
-##
-# acpi_copy_wakeup_routine
-#
-# Copy the above routine to low memory.
-#
-# Parameters:
-# %rdi: place to copy wakeup routine to
-#
-# Returned address is location of code in low memory (past data and stack)
-#
- .code64
-ENTRY(acpi_copy_wakeup_routine)
- pushq %rax
- pushq %rdx
-
- movl saved_video_mode, %edx
- movl %edx, video_mode - wakeup_start (,%rdi)
- movl acpi_realmode_flags, %edx
- movl %edx, realmode_flags - wakeup_start (,%rdi)
- movq $0x12345678, real_magic - wakeup_start (,%rdi)
- movq $0x123456789abcdef0, %rdx
- movq %rdx, saved_magic
-
- movq saved_magic, %rax
- movq $0x123456789abcdef0, %rdx
- cmpq %rdx, %rax
- jne bogus_64_magic
-
- # restore the regs we used
- popq %rdx
- popq %rax
-ENTRY(do_suspend_lowlevel_s4bios)
- ret
+ jmp bogus_64_magic
.align 2
.p2align 4,,15
@@ -414,7 +119,7 @@ do_suspend_lowlevel:
jmp restore_processor_state
.LFE5:
.Lfe5:
- .size do_suspend_lowlevel,.Lfe5-do_suspend_lowlevel
+ .size do_suspend_lowlevel, .Lfe5-do_suspend_lowlevel
.data
ALIGN
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
new file mode 100644
index 000000000000..6ff3b5730575
--- /dev/null
+++ b/arch/x86/kernel/acpi/wakeup_rm.S
@@ -0,0 +1,10 @@
+/*
+ * Wrapper script for the realmode binary as a transport object
+ * before copying to low memory.
+ */
+ .section ".rodata","a"
+ .globl wakeup_code_start, wakeup_code_end
+wakeup_code_start:
+ .incbin "arch/x86/kernel/acpi/realmode/wakeup.bin"
+wakeup_code_end:
+ .size wakeup_code_start, .-wakeup_code_start
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 45d79ea890ae..65c7857a90dd 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -11,6 +11,8 @@
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/vsyscall.h>
+#include <asm/cacheflush.h>
+#include <asm/io.h>
#define MAX_PATCH_LEN (255-1)
@@ -65,7 +67,8 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt);
get them easily into strings. */
asm("\t.section .rodata, \"a\"\nintelnops: "
GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6
- GENERIC_NOP7 GENERIC_NOP8);
+ GENERIC_NOP7 GENERIC_NOP8
+ "\t.previous");
extern const unsigned char intelnops[];
static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = {
NULL,
@@ -83,7 +86,8 @@ static const unsigned char *const intel_nops[ASM_NOP_MAX+1] = {
#ifdef K8_NOP1
asm("\t.section .rodata, \"a\"\nk8nops: "
K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6
- K8_NOP7 K8_NOP8);
+ K8_NOP7 K8_NOP8
+ "\t.previous");
extern const unsigned char k8nops[];
static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = {
NULL,
@@ -101,7 +105,8 @@ static const unsigned char *const k8_nops[ASM_NOP_MAX+1] = {
#ifdef K7_NOP1
asm("\t.section .rodata, \"a\"\nk7nops: "
K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6
- K7_NOP7 K7_NOP8);
+ K7_NOP7 K7_NOP8
+ "\t.previous");
extern const unsigned char k7nops[];
static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = {
NULL,
@@ -119,7 +124,8 @@ static const unsigned char *const k7_nops[ASM_NOP_MAX+1] = {
#ifdef P6_NOP1
asm("\t.section .rodata, \"a\"\np6nops: "
P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6
- P6_NOP7 P6_NOP8);
+ P6_NOP7 P6_NOP8
+ "\t.previous");
extern const unsigned char p6nops[];
static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
NULL,
@@ -173,7 +179,7 @@ static const unsigned char*const * find_nop_table(void)
#endif /* CONFIG_X86_64 */
/* Use this to add nops to a buffer, then text_poke the whole buffer. */
-static void add_nops(void *insns, unsigned int len)
+void add_nops(void *insns, unsigned int len)
{
const unsigned char *const *noptable = find_nop_table();
@@ -186,6 +192,7 @@ static void add_nops(void *insns, unsigned int len)
len -= noplen;
}
}
+EXPORT_SYMBOL_GPL(add_nops);
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern u8 *__smp_locks[], *__smp_locks_end[];
@@ -201,7 +208,7 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
struct alt_instr *a;
char insnbuf[MAX_PATCH_LEN];
- DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
+ DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
for (a = start; a < end; a++) {
u8 *instr = a->instr;
BUG_ON(a->replacementlen > a->instrlen);
@@ -213,13 +220,13 @@ void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
if (instr >= (u8 *)VSYSCALL_START && instr < (u8*)VSYSCALL_END) {
instr = __va(instr - (u8*)VSYSCALL_START + (u8*)__pa_symbol(&__vsyscall_0));
DPRINTK("%s: vsyscall fixup: %p => %p\n",
- __FUNCTION__, a->instr, instr);
+ __func__, a->instr, instr);
}
#endif
memcpy(insnbuf, a->replacement, a->replacementlen);
add_nops(insnbuf + a->replacementlen,
a->instrlen - a->replacementlen);
- text_poke(instr, insnbuf, a->instrlen);
+ text_poke_early(instr, insnbuf, a->instrlen);
}
}
@@ -280,7 +287,6 @@ void alternatives_smp_module_add(struct module *mod, char *name,
void *text, void *text_end)
{
struct smp_alt_module *smp;
- unsigned long flags;
if (noreplace_smp)
return;
@@ -303,42 +309,40 @@ void alternatives_smp_module_add(struct module *mod, char *name,
smp->text = text;
smp->text_end = text_end;
DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
- __FUNCTION__, smp->locks, smp->locks_end,
+ __func__, smp->locks, smp->locks_end,
smp->text, smp->text_end, smp->name);
- spin_lock_irqsave(&smp_alt, flags);
+ spin_lock(&smp_alt);
list_add_tail(&smp->next, &smp_alt_modules);
if (boot_cpu_has(X86_FEATURE_UP))
alternatives_smp_unlock(smp->locks, smp->locks_end,
smp->text, smp->text_end);
- spin_unlock_irqrestore(&smp_alt, flags);
+ spin_unlock(&smp_alt);
}
void alternatives_smp_module_del(struct module *mod)
{
struct smp_alt_module *item;
- unsigned long flags;
if (smp_alt_once || noreplace_smp)
return;
- spin_lock_irqsave(&smp_alt, flags);
+ spin_lock(&smp_alt);
list_for_each_entry(item, &smp_alt_modules, next) {
if (mod != item->mod)
continue;
list_del(&item->next);
- spin_unlock_irqrestore(&smp_alt, flags);
- DPRINTK("%s: %s\n", __FUNCTION__, item->name);
+ spin_unlock(&smp_alt);
+ DPRINTK("%s: %s\n", __func__, item->name);
kfree(item);
return;
}
- spin_unlock_irqrestore(&smp_alt, flags);
+ spin_unlock(&smp_alt);
}
void alternatives_smp_switch(int smp)
{
struct smp_alt_module *mod;
- unsigned long flags;
#ifdef CONFIG_LOCKDEP
/*
@@ -355,7 +359,7 @@ void alternatives_smp_switch(int smp)
return;
BUG_ON(!smp && (num_online_cpus() > 1));
- spin_lock_irqsave(&smp_alt, flags);
+ spin_lock(&smp_alt);
/*
* Avoid unnecessary switches because it forces JIT based VMs to
@@ -379,7 +383,7 @@ void alternatives_smp_switch(int smp)
mod->text, mod->text_end);
}
smp_mode = smp;
- spin_unlock_irqrestore(&smp_alt, flags);
+ spin_unlock(&smp_alt);
}
#endif
@@ -407,7 +411,7 @@ void apply_paravirt(struct paravirt_patch_site *start,
/* Pad the rest with nops */
add_nops(insnbuf + used, p->len - used);
- text_poke(p->instr, insnbuf, p->len);
+ text_poke_early(p->instr, insnbuf, p->len);
}
}
extern struct paravirt_patch_site __start_parainstructions[],
@@ -416,8 +420,6 @@ extern struct paravirt_patch_site __start_parainstructions[],
void __init alternative_instructions(void)
{
- unsigned long flags;
-
/* The patching is not fully atomic, so try to avoid local interruptions
that might execute the to be patched code.
Other CPUs are not running. */
@@ -426,7 +428,6 @@ void __init alternative_instructions(void)
stop_mce();
#endif
- local_irq_save(flags);
apply_alternatives(__alt_instructions, __alt_instructions_end);
/* switch to patch-once-at-boottime-only mode and free the
@@ -458,7 +459,6 @@ void __init alternative_instructions(void)
}
#endif
apply_paravirt(__parainstructions, __parainstructions_end);
- local_irq_restore(flags);
if (smp_alt_once)
free_init_pages("SMP alternatives",
@@ -471,18 +471,70 @@ void __init alternative_instructions(void)
#endif
}
-/*
- * Warning:
+/**
+ * text_poke_early - Update instructions on a live kernel at boot time
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
* When you use this code to patch more than one byte of an instruction
* you need to make sure that other CPUs cannot execute this code in parallel.
- * Also no thread must be currently preempted in the middle of these instructions.
- * And on the local CPU you need to be protected again NMI or MCE handlers
- * seeing an inconsistent instruction while you patch.
+ * Also no thread must be currently preempted in the middle of these
+ * instructions. And on the local CPU you need to be protected again NMI or MCE
+ * handlers seeing an inconsistent instruction while you patch.
*/
-void __kprobes text_poke(void *addr, unsigned char *opcode, int len)
+void *text_poke_early(void *addr, const void *opcode, size_t len)
{
+ unsigned long flags;
+ local_irq_save(flags);
memcpy(addr, opcode, len);
+ local_irq_restore(flags);
+ sync_core();
+ /* Could also do a CLFLUSH here to speed up CPU recovery; but
+ that causes hangs on some VIA CPUs. */
+ return addr;
+}
+
+/**
+ * text_poke - Update instructions on a live kernel
+ * @addr: address to modify
+ * @opcode: source of the copy
+ * @len: length to copy
+ *
+ * Only atomic text poke/set should be allowed when not doing early patching.
+ * It means the size must be writable atomically and the address must be aligned
+ * in a way that permits an atomic write. It also makes sure we fit on a single
+ * page.
+ */
+void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
+{
+ unsigned long flags;
+ char *vaddr;
+ int nr_pages = 2;
+ struct page *pages[2];
+ int i;
+
+ if (!core_kernel_text((unsigned long)addr)) {
+ pages[0] = vmalloc_to_page(addr);
+ pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
+ } else {
+ pages[0] = virt_to_page(addr);
+ WARN_ON(!PageReserved(pages[0]));
+ pages[1] = virt_to_page(addr + PAGE_SIZE);
+ }
+ BUG_ON(!pages[0]);
+ if (!pages[1])
+ nr_pages = 1;
+ vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL);
+ BUG_ON(!vaddr);
+ local_irq_save(flags);
+ memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len);
+ local_irq_restore(flags);
+ vunmap(vaddr);
sync_core();
/* Could also do a CLFLUSH here to speed up CPU recovery; but
that causes hangs on some VIA CPUs. */
+ for (i = 0; i < len; i++)
+ BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]);
+ return addr;
}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 00df126169b4..479926d9e004 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -27,11 +27,11 @@
#include <asm/k8.h>
int gart_iommu_aperture;
-int gart_iommu_aperture_disabled __initdata = 0;
-int gart_iommu_aperture_allowed __initdata = 0;
+int gart_iommu_aperture_disabled __initdata;
+int gart_iommu_aperture_allowed __initdata;
int fallback_aper_order __initdata = 1; /* 64MB */
-int fallback_aper_force __initdata = 0;
+int fallback_aper_force __initdata;
int fix_aperture __initdata = 1;
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c
index 35a568ea8400..4b99b1bdeb6c 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic_32.c
@@ -50,6 +50,11 @@
# error SPURIOUS_APIC_VECTOR definition error
#endif
+unsigned long mp_lapic_addr;
+
+DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
+EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+
/*
* Knob to control our willingness to enable the local APIC.
*
@@ -446,7 +451,8 @@ void __init setup_boot_APIC_clock(void)
}
/* Calculate the scaled math multiplication factor */
- lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32);
+ lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
+ lapic_clockevent.shift);
lapic_clockevent.max_delta_ns =
clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
lapic_clockevent.min_delta_ns =
@@ -621,6 +627,35 @@ int setup_profiling_timer(unsigned int multiplier)
}
/*
+ * Setup extended LVT, AMD specific (K8, family 10h)
+ *
+ * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
+ * MCE interrupts are supported. Thus MCE offset must be set to 0.
+ */
+
+#define APIC_EILVT_LVTOFF_MCE 0
+#define APIC_EILVT_LVTOFF_IBS 1
+
+static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
+{
+ unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
+ unsigned int v = (mask << 16) | (msg_type << 8) | vector;
+ apic_write(reg, v);
+}
+
+u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
+{
+ setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
+ return APIC_EILVT_LVTOFF_MCE;
+}
+
+u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
+{
+ setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
+ return APIC_EILVT_LVTOFF_IBS;
+}
+
+/*
* Local APIC start and shutdown
*/
@@ -868,12 +903,50 @@ void __init init_bsp_APIC(void)
apic_write_around(APIC_LVT1, value);
}
+static void __cpuinit lapic_setup_esr(void)
+{
+ unsigned long oldvalue, value, maxlvt;
+ if (lapic_is_integrated() && !esr_disable) {
+ /* !82489DX */
+ maxlvt = lapic_get_maxlvt();
+ if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
+ apic_write(APIC_ESR, 0);
+ oldvalue = apic_read(APIC_ESR);
+
+ /* enables sending errors */
+ value = ERROR_APIC_VECTOR;
+ apic_write_around(APIC_LVTERR, value);
+ /*
+ * spec says clear errors after enabling vector.
+ */
+ if (maxlvt > 3)
+ apic_write(APIC_ESR, 0);
+ value = apic_read(APIC_ESR);
+ if (value != oldvalue)
+ apic_printk(APIC_VERBOSE, "ESR value before enabling "
+ "vector: 0x%08lx after: 0x%08lx\n",
+ oldvalue, value);
+ } else {
+ if (esr_disable)
+ /*
+ * Something untraceable is creating bad interrupts on
+ * secondary quads ... for the moment, just leave the
+ * ESR disabled - we can't do anything useful with the
+ * errors anyway - mbligh
+ */
+ printk(KERN_INFO "Leaving ESR disabled.\n");
+ else
+ printk(KERN_INFO "No ESR for 82489DX.\n");
+ }
+}
+
+
/**
* setup_local_APIC - setup the local APIC
*/
void __cpuinit setup_local_APIC(void)
{
- unsigned long oldvalue, value, maxlvt, integrated;
+ unsigned long value, integrated;
int i, j;
/* Pound the ESR really hard over the head with a big hammer - mbligh */
@@ -997,40 +1070,13 @@ void __cpuinit setup_local_APIC(void)
if (!integrated) /* 82489DX */
value |= APIC_LVT_LEVEL_TRIGGER;
apic_write_around(APIC_LVT1, value);
+}
- if (integrated && !esr_disable) {
- /* !82489DX */
- maxlvt = lapic_get_maxlvt();
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
- oldvalue = apic_read(APIC_ESR);
-
- /* enables sending errors */
- value = ERROR_APIC_VECTOR;
- apic_write_around(APIC_LVTERR, value);
- /*
- * spec says clear errors after enabling vector.
- */
- if (maxlvt > 3)
- apic_write(APIC_ESR, 0);
- value = apic_read(APIC_ESR);
- if (value != oldvalue)
- apic_printk(APIC_VERBOSE, "ESR value before enabling "
- "vector: 0x%08lx after: 0x%08lx\n",
- oldvalue, value);
- } else {
- if (esr_disable)
- /*
- * Something untraceable is creating bad interrupts on
- * secondary quads ... for the moment, just leave the
- * ESR disabled - we can't do anything useful with the
- * errors anyway - mbligh
- */
- printk(KERN_INFO "Leaving ESR disabled.\n");
- else
- printk(KERN_INFO "No ESR for 82489DX.\n");
- }
+void __cpuinit end_local_APIC_setup(void)
+{
+ unsigned long value;
+ lapic_setup_esr();
/* Disable the local apic timer */
value = apic_read(APIC_LVTT);
value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
@@ -1147,7 +1193,7 @@ void __init init_apic_mappings(void)
* default configuration (or the MP table is broken).
*/
if (boot_cpu_physical_apicid == -1U)
- boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+ boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
#ifdef CONFIG_X86_IO_APIC
{
@@ -1185,6 +1231,9 @@ fake_ioapic_page:
* This initializes the IO-APIC and APIC hardware if this is
* a UP kernel.
*/
+
+int apic_version[MAX_APICS];
+
int __init APIC_init_uniprocessor(void)
{
if (enable_local_apic < 0)
@@ -1214,12 +1263,13 @@ int __init APIC_init_uniprocessor(void)
* might be zero if read from MP tables. Get it from LAPIC.
*/
#ifdef CONFIG_CRASH_DUMP
- boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
+ boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
#endif
phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
setup_local_APIC();
+ end_local_APIC_setup();
#ifdef CONFIG_X86_IO_APIC
if (smp_found_config)
if (!skip_ioapic_setup && nr_ioapics)
@@ -1288,6 +1338,29 @@ void smp_error_interrupt(struct pt_regs *regs)
irq_exit();
}
+#ifdef CONFIG_SMP
+void __init smp_intr_init(void)
+{
+ /*
+ * IRQ0 must be given a fixed assignment and initialized,
+ * because it's used before the IO-APIC is set up.
+ */
+ set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
+
+ /*
+ * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+ * IPI, driven by wakeup.
+ */
+ set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+ /* IPI for invalidation */
+ set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
+
+ /* IPI for generic function call */
+ set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+}
+#endif
+
/*
* Initialize APIC interrupts
*/
@@ -1394,6 +1467,88 @@ void disconnect_bsp_APIC(int virt_wire_setup)
}
}
+unsigned int __cpuinitdata maxcpus = NR_CPUS;
+
+void __cpuinit generic_processor_info(int apicid, int version)
+{
+ int cpu;
+ cpumask_t tmp_map;
+ physid_mask_t phys_cpu;
+
+ /*
+ * Validate version
+ */
+ if (version == 0x0) {
+ printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
+ "fixing up to 0x10. (tell your hw vendor)\n",
+ version);
+ version = 0x10;
+ }
+ apic_version[apicid] = version;
+
+ phys_cpu = apicid_to_cpu_present(apicid);
+ physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
+
+ if (num_processors >= NR_CPUS) {
+ printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
+ " Processor ignored.\n", NR_CPUS);
+ return;
+ }
+
+ if (num_processors >= maxcpus) {
+ printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
+ " Processor ignored.\n", maxcpus);
+ return;
+ }
+
+ num_processors++;
+ cpus_complement(tmp_map, cpu_present_map);
+ cpu = first_cpu(tmp_map);
+
+ if (apicid == boot_cpu_physical_apicid)
+ /*
+ * x86_bios_cpu_apicid is required to have processors listed
+ * in same order as logical cpu numbers. Hence the first
+ * entry is BSP, and so on.
+ */
+ cpu = 0;
+
+ /*
+ * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
+ * but we need to work other dependencies like SMP_SUSPEND etc
+ * before this can be done without some confusion.
+ * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
+ * - Ashok Raj <ashok.raj@intel.com>
+ */
+ if (num_processors > 8) {
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_INTEL:
+ if (!APIC_XAPIC(version)) {
+ def_to_bigsmp = 0;
+ break;
+ }
+ /* If P4 and above fall through */
+ case X86_VENDOR_AMD:
+ def_to_bigsmp = 1;
+ }
+ }
+#ifdef CONFIG_SMP
+ /* are we being called early in kernel startup? */
+ if (x86_cpu_to_apicid_early_ptr) {
+ u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
+ u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
+
+ cpu_to_apicid[cpu] = apicid;
+ bios_cpu_apicid[cpu] = apicid;
+ } else {
+ per_cpu(x86_cpu_to_apicid, cpu) = apicid;
+ per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
+ }
+#endif
+ cpu_set(cpu, cpu_possible_map);
+ cpu_set(cpu, cpu_present_map);
+}
+
/*
* Power management
*/
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
index d8d03e09dea2..5910020c3f24 100644
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -34,13 +34,15 @@
#include <asm/mpspec.h>
#include <asm/hpet.h>
#include <asm/pgalloc.h>
-#include <asm/mach_apic.h>
#include <asm/nmi.h>
#include <asm/idle.h>
#include <asm/proto.h>
#include <asm/timex.h>
#include <asm/apic.h>
+#include <mach_ipi.h>
+#include <mach_apic.h>
+
int disable_apic_timer __cpuinitdata;
static int apic_calibrate_pmtmr __initdata;
int disable_apic;
@@ -83,6 +85,12 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
static unsigned long apic_phys;
+unsigned long mp_lapic_addr;
+
+DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
+EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
+
+unsigned int __cpuinitdata maxcpus = NR_CPUS;
/*
* Get the LAPIC version
*/
@@ -352,7 +360,8 @@ static void __init calibrate_APIC_clock(void)
result / 1000 / 1000, result / 1000 % 1000);
/* Calculate the scaled math multiplication factor */
- lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32);
+ lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC,
+ lapic_clockevent.shift);
lapic_clockevent.max_delta_ns =
clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
lapic_clockevent.min_delta_ns =
@@ -421,7 +430,7 @@ void __init setup_boot_APIC_clock(void)
* set the DUMMY flag again and force the broadcast mode in the
* clockevents layer.
*/
-void __cpuinit check_boot_apic_timer_broadcast(void)
+static void __cpuinit check_boot_apic_timer_broadcast(void)
{
if (!disable_apic_timer ||
(lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY))
@@ -431,7 +440,8 @@ void __cpuinit check_boot_apic_timer_broadcast(void)
lapic_clockevent.features |= CLOCK_EVT_FEAT_DUMMY;
local_irq_enable();
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, &boot_cpu_id);
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
+ &boot_cpu_physical_apicid);
local_irq_disable();
}
@@ -640,10 +650,10 @@ int __init verify_local_APIC(void)
/*
* The ID register is read/write in a real APIC.
*/
- reg0 = apic_read(APIC_ID);
+ reg0 = read_apic_id();
apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
- reg1 = apic_read(APIC_ID);
+ reg1 = read_apic_id();
apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
apic_write(APIC_ID, reg0);
if (reg1 != (reg0 ^ APIC_ID_MASK))
@@ -728,6 +738,7 @@ void __cpuinit setup_local_APIC(void)
unsigned int value;
int i, j;
+ preempt_disable();
value = apic_read(APIC_LVR);
BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
@@ -821,9 +832,10 @@ void __cpuinit setup_local_APIC(void)
else
value = APIC_DM_NMI | APIC_LVT_MASKED;
apic_write(APIC_LVT1, value);
+ preempt_enable();
}
-void __cpuinit lapic_setup_esr(void)
+static void __cpuinit lapic_setup_esr(void)
{
unsigned maxlvt = lapic_get_maxlvt();
@@ -857,10 +869,34 @@ static int __init detect_init_APIC(void)
}
mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
- boot_cpu_id = 0;
+ boot_cpu_physical_apicid = 0;
return 0;
}
+void __init early_init_lapic_mapping(void)
+{
+ unsigned long apic_phys;
+
+ /*
+ * If no local APIC can be found then go out
+ * : it means there is no mpatable and MADT
+ */
+ if (!smp_found_config)
+ return;
+
+ apic_phys = mp_lapic_addr;
+
+ set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+ apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
+ APIC_BASE, apic_phys);
+
+ /*
+ * Fetch the APIC ID of the BSP in case we have a
+ * default configuration (or the MP table is broken).
+ */
+ boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
+}
+
/**
* init_apic_mappings - initialize APIC mappings
*/
@@ -881,16 +917,11 @@ void __init init_apic_mappings(void)
apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
APIC_BASE, apic_phys);
- /* Put local APIC into the resource map. */
- lapic_resource.start = apic_phys;
- lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
- insert_resource(&iomem_resource, &lapic_resource);
-
/*
* Fetch the APIC ID of the BSP in case we have a
* default configuration (or the MP table is broken).
*/
- boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
+ boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
}
/*
@@ -911,8 +942,8 @@ int __init APIC_init_uniprocessor(void)
verify_local_APIC();
- phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
- apic_write(APIC_ID, SET_APIC_ID(boot_cpu_id));
+ phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+ apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
setup_local_APIC();
@@ -1029,6 +1060,52 @@ void disconnect_bsp_APIC(int virt_wire_setup)
apic_write(APIC_LVT1, value);
}
+void __cpuinit generic_processor_info(int apicid, int version)
+{
+ int cpu;
+ cpumask_t tmp_map;
+
+ if (num_processors >= NR_CPUS) {
+ printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
+ " Processor ignored.\n", NR_CPUS);
+ return;
+ }
+
+ if (num_processors >= maxcpus) {
+ printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
+ " Processor ignored.\n", maxcpus);
+ return;
+ }
+
+ num_processors++;
+ cpus_complement(tmp_map, cpu_present_map);
+ cpu = first_cpu(tmp_map);
+
+ physid_set(apicid, phys_cpu_present_map);
+ if (apicid == boot_cpu_physical_apicid) {
+ /*
+ * x86_bios_cpu_apicid is required to have processors listed
+ * in same order as logical cpu numbers. Hence the first
+ * entry is BSP, and so on.
+ */
+ cpu = 0;
+ }
+ /* are we being called early in kernel startup? */
+ if (x86_cpu_to_apicid_early_ptr) {
+ u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
+ u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
+
+ cpu_to_apicid[cpu] = apicid;
+ bios_cpu_apicid[cpu] = apicid;
+ } else {
+ per_cpu(x86_cpu_to_apicid, cpu) = apicid;
+ per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
+ }
+
+ cpu_set(cpu, cpu_possible_map);
+ cpu_set(cpu, cpu_present_map);
+}
+
/*
* Power management
*/
@@ -1065,7 +1142,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
maxlvt = lapic_get_maxlvt();
- apic_pm_state.apic_id = apic_read(APIC_ID);
+ apic_pm_state.apic_id = read_apic_id();
apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
apic_pm_state.apic_ldr = apic_read(APIC_LDR);
apic_pm_state.apic_dfr = apic_read(APIC_DFR);
@@ -1180,9 +1257,19 @@ __cpuinit int apic_is_clustered_box(void)
{
int i, clusters, zeros;
unsigned id;
- u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
+ u16 *bios_cpu_apicid;
DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
+ /*
+ * there is not this kind of box with AMD CPU yet.
+ * Some AMD box with quadcore cpu and 8 sockets apicid
+ * will be [4, 0x23] or [8, 0x27] could be thought to
+ * vsmp box still need checking...
+ */
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
+ return 0;
+
+ bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
for (i = 0; i < NR_CPUS; i++) {
@@ -1219,6 +1306,12 @@ __cpuinit int apic_is_clustered_box(void)
++zeros;
}
+ /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
+ * not guaranteed to be synced between boards
+ */
+ if (is_vsmp_box() && clusters > 1)
+ return 1;
+
/*
* If clusters > 2, then should be multi-chassis.
* May have to revisit this when multi-core + hyperthreaded CPUs come
@@ -1290,3 +1383,21 @@ static __init int setup_apicpmtimer(char *s)
}
__setup("apicpmtimer", setup_apicpmtimer);
+static int __init lapic_insert_resource(void)
+{
+ if (!apic_phys)
+ return -1;
+
+ /* Put local APIC into the resource map. */
+ lapic_resource.start = apic_phys;
+ lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
+ insert_resource(&iomem_resource, &lapic_resource);
+
+ return 0;
+}
+
+/*
+ * need call insert after e820_reserve_resources()
+ * that is using request_resource
+ */
+late_initcall(lapic_insert_resource);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index d4438ef296d8..bf9290e29013 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -904,6 +904,7 @@ recalc:
original_pm_idle();
else
default_idle();
+ local_irq_disable();
jiffies_since_last_check = jiffies - last_jiffies;
if (jiffies_since_last_check > idle_period)
goto recalc;
@@ -911,6 +912,8 @@ recalc:
if (apm_idle_done)
apm_do_busy();
+
+ local_irq_enable();
}
/**
@@ -1189,19 +1192,6 @@ static int suspend(int vetoable)
int err;
struct apm_user *as;
- if (pm_send_all(PM_SUSPEND, (void *)3)) {
- /* Vetoed */
- if (vetoable) {
- if (apm_info.connection_version > 0x100)
- set_system_power_state(APM_STATE_REJECT);
- err = -EBUSY;
- ignore_sys_suspend = 0;
- printk(KERN_WARNING "apm: suspend was vetoed.\n");
- goto out;
- }
- printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n");
- }
-
device_suspend(PMSG_SUSPEND);
local_irq_disable();
device_power_down(PMSG_SUSPEND);
@@ -1224,9 +1214,7 @@ static int suspend(int vetoable)
device_power_up();
local_irq_enable();
device_resume();
- pm_send_all(PM_RESUME, (void *)0);
queue_event(APM_NORMAL_RESUME, NULL);
- out:
spin_lock(&user_list_lock);
for (as = user_list; as != NULL; as = as->next) {
as->suspend_wait = 0;
@@ -1337,7 +1325,6 @@ static void check_events(void)
if ((event != APM_NORMAL_RESUME)
|| (ignore_normal_resume == 0)) {
device_resume();
- pm_send_all(PM_RESUME, (void *)0);
queue_event(event, NULL);
}
ignore_normal_resume = 0;
@@ -2217,7 +2204,6 @@ static struct dmi_system_id __initdata apm_dmi_table[] = {
*/
static int __init apm_init(void)
{
- struct proc_dir_entry *apm_proc;
struct desc_struct *gdt;
int err;
@@ -2322,9 +2308,7 @@ static int __init apm_init(void)
set_base(gdt[APM_DS >> 3],
__va((unsigned long)apm_info.bios.dseg << 4));
- apm_proc = create_proc_entry("apm", 0, NULL);
- if (apm_proc)
- apm_proc->proc_fops = &apm_file_ops;
+ proc_create("apm", 0, NULL, &apm_file_ops);
kapmd_task = kthread_create(apm, NULL, "kapmd");
if (IS_ERR(kapmd_task)) {
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 8ea040124f7d..670c3c311289 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -10,7 +10,7 @@
#include <linux/personality.h>
#include <linux/suspend.h>
#include <asm/ucontext.h>
-#include "sigframe_32.h"
+#include "sigframe.h"
#include <asm/pgtable.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
diff --git a/arch/x86/kernel/bugs_64.c b/arch/x86/kernel/bugs_64.c
index 8f520f93ffd4..9a3ed0649d4e 100644
--- a/arch/x86/kernel/bugs_64.c
+++ b/arch/x86/kernel/bugs_64.c
@@ -9,13 +9,25 @@
#include <asm/bugs.h>
#include <asm/processor.h>
#include <asm/mtrr.h>
+#include <asm/cacheflush.h>
void __init check_bugs(void)
{
- identify_cpu(&boot_cpu_data);
+ identify_boot_cpu();
#if !defined(CONFIG_SMP)
printk("CPU: ");
print_cpu_info(&boot_cpu_data);
#endif
alternative_instructions();
+
+ /*
+ * Make sure the first 2MB area is not mapped by huge pages
+ * There are typically fixed size MTRRs in there and overlapping
+ * MTRRs into large pages causes slow downs.
+ *
+ * Right now we don't do that with gbpages because there seems
+ * very little benefit for that case.
+ */
+ if (!direct_gbpages)
+ set_memory_4k((unsigned long)__va(0), 1);
}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index a0c4d7c5dbd7..a0c6f8190887 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -3,15 +3,14 @@
#
obj-y := intel_cacheinfo.o addon_cpuid_features.o
-obj-y += feature_names.o
+obj-y += proc.o feature_names.o
-obj-$(CONFIG_X86_32) += common.o proc.o bugs.o
+obj-$(CONFIG_X86_32) += common.o bugs.o
obj-$(CONFIG_X86_32) += amd.o
obj-$(CONFIG_X86_32) += cyrix.o
obj-$(CONFIG_X86_32) += centaur.o
obj-$(CONFIG_X86_32) += transmeta.o
obj-$(CONFIG_X86_32) += intel.o
-obj-$(CONFIG_X86_32) += nexgen.o
obj-$(CONFIG_X86_32) += umc.o
obj-$(CONFIG_X86_MCE) += mcheck/
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 693e353999cd..245866828294 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -4,8 +4,8 @@
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/apic.h>
-#include <asm/mach_apic.h>
+#include <mach_apic.h>
#include "cpu.h"
/*
@@ -20,7 +20,7 @@
* the chip setting when fixing the bug but they also tweaked some
* performance at the same time..
*/
-
+
extern void vide(void);
__asm__(".align 4\nvide: ret");
@@ -63,12 +63,12 @@ static __cpuinit int amd_apic_timer_broken(void)
int force_mwait __cpuinitdata;
-void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
+static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
{
if (cpuid_eax(0x80000000) >= 0x80000007) {
c->x86_power = cpuid_edx(0x80000007);
if (c->x86_power & (1<<8))
- set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
}
}
@@ -81,7 +81,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
#ifdef CONFIG_SMP
unsigned long long value;
- /* Disable TLB flush filter by setting HWCR.FFDIS on K8
+ /*
+ * Disable TLB flush filter by setting HWCR.FFDIS on K8
* bit 6 of msr C001_0015
*
* Errata 63 for SH-B3 steppings
@@ -102,15 +103,16 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
* no bus pipeline)
*/
- /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
- clear_bit(0*32+31, c->x86_capability);
-
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
+
r = get_model_name(c);
- switch(c->x86)
- {
- case 4:
+ switch (c->x86) {
+ case 4:
/*
* General Systems BIOSen alias the cpu frequency registers
* of the Elan at 0x000df000. Unfortuantly, one of the Linux
@@ -120,61 +122,60 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
#define CBAR_ENB (0x80000000)
#define CBAR_KEY (0X000000CB)
- if (c->x86_model==9 || c->x86_model == 10) {
+ if (c->x86_model == 9 || c->x86_model == 10) {
if (inl (CBAR) & CBAR_ENB)
outl (0 | CBAR_KEY, CBAR);
}
break;
- case 5:
- if( c->x86_model < 6 )
- {
+ case 5:
+ if (c->x86_model < 6) {
/* Based on AMD doc 20734R - June 2000 */
- if ( c->x86_model == 0 ) {
- clear_bit(X86_FEATURE_APIC, c->x86_capability);
- set_bit(X86_FEATURE_PGE, c->x86_capability);
+ if (c->x86_model == 0) {
+ clear_cpu_cap(c, X86_FEATURE_APIC);
+ set_cpu_cap(c, X86_FEATURE_PGE);
}
break;
}
-
- if ( c->x86_model == 6 && c->x86_mask == 1 ) {
+
+ if (c->x86_model == 6 && c->x86_mask == 1) {
const int K6_BUG_LOOP = 1000000;
int n;
void (*f_vide)(void);
unsigned long d, d2;
-
+
printk(KERN_INFO "AMD K6 stepping B detected - ");
-
+
/*
- * It looks like AMD fixed the 2.6.2 bug and improved indirect
+ * It looks like AMD fixed the 2.6.2 bug and improved indirect
* calls at the same time.
*/
n = K6_BUG_LOOP;
f_vide = vide;
rdtscl(d);
- while (n--)
+ while (n--)
f_vide();
rdtscl(d2);
d = d2-d;
- if (d > 20*K6_BUG_LOOP)
+ if (d > 20*K6_BUG_LOOP)
printk("system stability may be impaired when more than 32 MB are used.\n");
- else
+ else
printk("probably OK (after B9730xxxx).\n");
printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
}
/* K6 with old style WHCR */
if (c->x86_model < 8 ||
- (c->x86_model== 8 && c->x86_mask < 8)) {
+ (c->x86_model == 8 && c->x86_mask < 8)) {
/* We can only write allocate on the low 508Mb */
- if(mbytes>508)
- mbytes=508;
+ if (mbytes > 508)
+ mbytes = 508;
rdmsr(MSR_K6_WHCR, l, h);
- if ((l&0x0000FFFF)==0) {
+ if ((l&0x0000FFFF) == 0) {
unsigned long flags;
- l=(1<<0)|((mbytes/4)<<1);
+ l = (1<<0)|((mbytes/4)<<1);
local_irq_save(flags);
wbinvd();
wrmsr(MSR_K6_WHCR, l, h);
@@ -185,17 +186,17 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
break;
}
- if ((c->x86_model == 8 && c->x86_mask >7) ||
+ if ((c->x86_model == 8 && c->x86_mask > 7) ||
c->x86_model == 9 || c->x86_model == 13) {
/* The more serious chips .. */
- if(mbytes>4092)
- mbytes=4092;
+ if (mbytes > 4092)
+ mbytes = 4092;
rdmsr(MSR_K6_WHCR, l, h);
- if ((l&0xFFFF0000)==0) {
+ if ((l&0xFFFF0000) == 0) {
unsigned long flags;
- l=((mbytes>>2)<<22)|(1<<16);
+ l = ((mbytes>>2)<<22)|(1<<16);
local_irq_save(flags);
wbinvd();
wrmsr(MSR_K6_WHCR, l, h);
@@ -207,7 +208,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
/* Set MTRR capability flag if appropriate */
if (c->x86_model == 13 || c->x86_model == 9 ||
(c->x86_model == 8 && c->x86_mask >= 8))
- set_bit(X86_FEATURE_K6_MTRR, c->x86_capability);
+ set_cpu_cap(c, X86_FEATURE_K6_MTRR);
break;
}
@@ -217,10 +218,11 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
break;
}
break;
- case 6: /* An Athlon/Duron */
-
- /* Bit 15 of Athlon specific MSR 15, needs to be 0
- * to enable SSE on Palomino/Morgan/Barton CPU's.
+ case 6: /* An Athlon/Duron */
+
+ /*
+ * Bit 15 of Athlon specific MSR 15, needs to be 0
+ * to enable SSE on Palomino/Morgan/Barton CPU's.
* If the BIOS didn't enable it already, enable it here.
*/
if (c->x86_model >= 6 && c->x86_model <= 10) {
@@ -229,15 +231,16 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
rdmsr(MSR_K7_HWCR, l, h);
l &= ~0x00008000;
wrmsr(MSR_K7_HWCR, l, h);
- set_bit(X86_FEATURE_XMM, c->x86_capability);
+ set_cpu_cap(c, X86_FEATURE_XMM);
}
}
- /* It's been determined by AMD that Athlons since model 8 stepping 1
+ /*
+ * It's been determined by AMD that Athlons since model 8 stepping 1
* are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
* As per AMD technical note 27212 0.2
*/
- if ((c->x86_model == 8 && c->x86_mask>=1) || (c->x86_model > 8)) {
+ if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
rdmsr(MSR_K7_CLK_CTL, l, h);
if ((l & 0xfff00000) != 0x20000000) {
printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
@@ -253,20 +256,19 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
/* Use K8 tuning for Fam10h and Fam11h */
case 0x10:
case 0x11:
- set_bit(X86_FEATURE_K8, c->x86_capability);
+ set_cpu_cap(c, X86_FEATURE_K8);
break;
case 6:
- set_bit(X86_FEATURE_K7, c->x86_capability);
+ set_cpu_cap(c, X86_FEATURE_K7);
break;
}
if (c->x86 >= 6)
- set_bit(X86_FEATURE_FXSAVE_LEAK, c->x86_capability);
+ set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
display_cacheinfo(c);
- if (cpuid_eax(0x80000000) >= 0x80000008) {
+ if (cpuid_eax(0x80000000) >= 0x80000008)
c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1;
- }
#ifdef CONFIG_X86_HT
/*
@@ -302,20 +304,20 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
/* K6s reports MCEs but don't actually have all the MSRs */
if (c->x86 < 6)
- clear_bit(X86_FEATURE_MCE, c->x86_capability);
+ clear_cpu_cap(c, X86_FEATURE_MCE);
if (cpu_has_xmm2)
- set_bit(X86_FEATURE_MFENCE_RDTSC, c->x86_capability);
+ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
}
-static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned int size)
+static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/* AMD errata T13 (order #21922) */
if ((c->x86 == 6)) {
if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */
size = 64;
if (c->x86_model == 4 &&
- (c->x86_mask==0 || c->x86_mask==1)) /* Tbird rev A1/A2 */
+ (c->x86_mask == 0 || c->x86_mask == 1)) /* Tbird rev A1/A2 */
size = 256;
}
return size;
@@ -323,25 +325,22 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 * c, unsigned in
static struct cpu_dev amd_cpu_dev __cpuinitdata = {
.c_vendor = "AMD",
- .c_ident = { "AuthenticAMD" },
+ .c_ident = { "AuthenticAMD" },
.c_models = {
{ .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
{
[3] = "486 DX/2",
[7] = "486 DX/2-WB",
- [8] = "486 DX/4",
- [9] = "486 DX/4-WB",
+ [8] = "486 DX/4",
+ [9] = "486 DX/4-WB",
[14] = "Am5x86-WT",
- [15] = "Am5x86-WB"
+ [15] = "Am5x86-WB"
}
},
},
+ .c_early_init = early_init_amd,
.c_init = init_amd,
.c_size_cache = amd_size_cache,
};
-int __init amd_init_cpu(void)
-{
- cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev;
- return 0;
-}
+cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index 9681fa15ddf0..e0f45edd6a55 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -1,31 +1,34 @@
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/bitops.h>
+
#include <asm/processor.h>
#include <asm/msr.h>
#include <asm/e820.h>
#include <asm/mtrr.h>
+
#include "cpu.h"
#ifdef CONFIG_X86_OOSTORE
static u32 __cpuinit power2(u32 x)
{
- u32 s=1;
- while(s<=x)
- s<<=1;
- return s>>=1;
+ u32 s = 1;
+
+ while (s <= x)
+ s <<= 1;
+
+ return s >>= 1;
}
/*
- * Set up an actual MCR
+ * Set up an actual MCR
*/
-
static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key)
{
u32 lo, hi;
-
+
hi = base & ~0xFFF;
lo = ~(size-1); /* Size is a power of 2 so this makes a mask */
lo &= ~0xFFF; /* Remove the ctrl value bits */
@@ -35,30 +38,28 @@ static void __cpuinit centaur_mcr_insert(int reg, u32 base, u32 size, int key)
}
/*
- * Figure what we can cover with MCR's
+ * Figure what we can cover with MCR's
*
- * Shortcut: We know you can't put 4Gig of RAM on a winchip
+ * Shortcut: We know you can't put 4Gig of RAM on a winchip
*/
-
-static u32 __cpuinit ramtop(void) /* 16388 */
+static u32 __cpuinit ramtop(void)
{
- int i;
- u32 top = 0;
u32 clip = 0xFFFFFFFFUL;
-
+ u32 top = 0;
+ int i;
+
for (i = 0; i < e820.nr_map; i++) {
unsigned long start, end;
if (e820.map[i].addr > 0xFFFFFFFFUL)
continue;
/*
- * Don't MCR over reserved space. Ignore the ISA hole
- * we frob around that catastrophe already
+ * Don't MCR over reserved space. Ignore the ISA hole
+ * we frob around that catastrophe already
*/
-
- if (e820.map[i].type == E820_RESERVED)
- {
- if(e820.map[i].addr >= 0x100000UL && e820.map[i].addr < clip)
+ if (e820.map[i].type == E820_RESERVED) {
+ if (e820.map[i].addr >= 0x100000UL &&
+ e820.map[i].addr < clip)
clip = e820.map[i].addr;
continue;
}
@@ -69,28 +70,27 @@ static u32 __cpuinit ramtop(void) /* 16388 */
if (end > top)
top = end;
}
- /* Everything below 'top' should be RAM except for the ISA hole.
- Because of the limited MCR's we want to map NV/ACPI into our
- MCR range for gunk in RAM
-
- Clip might cause us to MCR insufficient RAM but that is an
- acceptable failure mode and should only bite obscure boxes with
- a VESA hole at 15Mb
-
- The second case Clip sometimes kicks in is when the EBDA is marked
- as reserved. Again we fail safe with reasonable results
- */
-
- if(top>clip)
- top=clip;
-
+ /*
+ * Everything below 'top' should be RAM except for the ISA hole.
+ * Because of the limited MCR's we want to map NV/ACPI into our
+ * MCR range for gunk in RAM
+ *
+ * Clip might cause us to MCR insufficient RAM but that is an
+ * acceptable failure mode and should only bite obscure boxes with
+ * a VESA hole at 15Mb
+ *
+ * The second case Clip sometimes kicks in is when the EBDA is marked
+ * as reserved. Again we fail safe with reasonable results
+ */
+ if (top > clip)
+ top = clip;
+
return top;
}
/*
- * Compute a set of MCR's to give maximum coverage
+ * Compute a set of MCR's to give maximum coverage
*/
-
static int __cpuinit centaur_mcr_compute(int nr, int key)
{
u32 mem = ramtop();
@@ -99,141 +99,131 @@ static int __cpuinit centaur_mcr_compute(int nr, int key)
u32 top = root;
u32 floor = 0;
int ct = 0;
-
- while(ct<nr)
- {
+
+ while (ct < nr) {
u32 fspace = 0;
+ u32 high;
+ u32 low;
/*
- * Find the largest block we will fill going upwards
+ * Find the largest block we will fill going upwards
*/
-
- u32 high = power2(mem-top);
+ high = power2(mem-top);
/*
- * Find the largest block we will fill going downwards
+ * Find the largest block we will fill going downwards
*/
-
- u32 low = base/2;
+ low = base/2;
/*
- * Don't fill below 1Mb going downwards as there
- * is an ISA hole in the way.
- */
-
- if(base <= 1024*1024)
+ * Don't fill below 1Mb going downwards as there
+ * is an ISA hole in the way.
+ */
+ if (base <= 1024*1024)
low = 0;
-
+
/*
- * See how much space we could cover by filling below
- * the ISA hole
+ * See how much space we could cover by filling below
+ * the ISA hole
*/
-
- if(floor == 0)
+
+ if (floor == 0)
fspace = 512*1024;
- else if(floor ==512*1024)
+ else if (floor == 512*1024)
fspace = 128*1024;
/* And forget ROM space */
-
+
/*
- * Now install the largest coverage we get
+ * Now install the largest coverage we get
*/
-
- if(fspace > high && fspace > low)
- {
+ if (fspace > high && fspace > low) {
centaur_mcr_insert(ct, floor, fspace, key);
floor += fspace;
- }
- else if(high > low)
- {
+ } else if (high > low) {
centaur_mcr_insert(ct, top, high, key);
top += high;
- }
- else if(low > 0)
- {
+ } else if (low > 0) {
base -= low;
centaur_mcr_insert(ct, base, low, key);
- }
- else break;
+ } else
+ break;
ct++;
}
/*
- * We loaded ct values. We now need to set the mask. The caller
- * must do this bit.
+ * We loaded ct values. We now need to set the mask. The caller
+ * must do this bit.
*/
-
return ct;
}
static void __cpuinit centaur_create_optimal_mcr(void)
{
+ int used;
int i;
+
/*
- * Allocate up to 6 mcrs to mark as much of ram as possible
- * as write combining and weak write ordered.
+ * Allocate up to 6 mcrs to mark as much of ram as possible
+ * as write combining and weak write ordered.
*
- * To experiment with: Linux never uses stack operations for
- * mmio spaces so we could globally enable stack operation wc
+ * To experiment with: Linux never uses stack operations for
+ * mmio spaces so we could globally enable stack operation wc
*
- * Load the registers with type 31 - full write combining, all
- * writes weakly ordered.
+ * Load the registers with type 31 - full write combining, all
+ * writes weakly ordered.
*/
- int used = centaur_mcr_compute(6, 31);
+ used = centaur_mcr_compute(6, 31);
/*
- * Wipe unused MCRs
+ * Wipe unused MCRs
*/
-
- for(i=used;i<8;i++)
+ for (i = used; i < 8; i++)
wrmsr(MSR_IDT_MCR0+i, 0, 0);
}
static void __cpuinit winchip2_create_optimal_mcr(void)
{
u32 lo, hi;
+ int used;
int i;
/*
- * Allocate up to 6 mcrs to mark as much of ram as possible
- * as write combining, weak store ordered.
+ * Allocate up to 6 mcrs to mark as much of ram as possible
+ * as write combining, weak store ordered.
*
- * Load the registers with type 25
- * 8 - weak write ordering
- * 16 - weak read ordering
- * 1 - write combining
+ * Load the registers with type 25
+ * 8 - weak write ordering
+ * 16 - weak read ordering
+ * 1 - write combining
*/
+ used = centaur_mcr_compute(6, 25);
- int used = centaur_mcr_compute(6, 25);
-
/*
- * Mark the registers we are using.
+ * Mark the registers we are using.
*/
-
rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- for(i=0;i<used;i++)
- lo|=1<<(9+i);
+ for (i = 0; i < used; i++)
+ lo |= 1<<(9+i);
wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
-
+
/*
- * Wipe unused MCRs
+ * Wipe unused MCRs
*/
-
- for(i=used;i<8;i++)
+
+ for (i = used; i < 8; i++)
wrmsr(MSR_IDT_MCR0+i, 0, 0);
}
/*
- * Handle the MCR key on the Winchip 2.
+ * Handle the MCR key on the Winchip 2.
*/
-
static void __cpuinit winchip2_unprotect_mcr(void)
{
u32 lo, hi;
u32 key;
-
+
rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- lo&=~0x1C0; /* blank bits 8-6 */
+ lo &= ~0x1C0; /* blank bits 8-6 */
key = (lo>>17) & 7;
lo |= key<<6; /* replace with unlock key */
wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
@@ -242,9 +232,9 @@ static void __cpuinit winchip2_unprotect_mcr(void)
static void __cpuinit winchip2_protect_mcr(void)
{
u32 lo, hi;
-
+
rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- lo&=~0x1C0; /* blank bits 8-6 */
+ lo &= ~0x1C0; /* blank bits 8-6 */
wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
}
#endif /* CONFIG_X86_OOSTORE */
@@ -267,17 +257,17 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
/* enable ACE unit, if present and disabled */
if ((tmp & (ACE_PRESENT | ACE_ENABLED)) == ACE_PRESENT) {
- rdmsr (MSR_VIA_FCR, lo, hi);
+ rdmsr(MSR_VIA_FCR, lo, hi);
lo |= ACE_FCR; /* enable ACE unit */
- wrmsr (MSR_VIA_FCR, lo, hi);
+ wrmsr(MSR_VIA_FCR, lo, hi);
printk(KERN_INFO "CPU: Enabled ACE h/w crypto\n");
}
/* enable RNG unit, if present and disabled */
if ((tmp & (RNG_PRESENT | RNG_ENABLED)) == RNG_PRESENT) {
- rdmsr (MSR_VIA_RNG, lo, hi);
+ rdmsr(MSR_VIA_RNG, lo, hi);
lo |= RNG_ENABLE; /* enable RNG unit */
- wrmsr (MSR_VIA_RNG, lo, hi);
+ wrmsr(MSR_VIA_RNG, lo, hi);
printk(KERN_INFO "CPU: Enabled h/w RNG\n");
}
@@ -288,171 +278,183 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
}
/* Cyrix III family needs CX8 & PGE explicitly enabled. */
- if (c->x86_model >=6 && c->x86_model <= 9) {
- rdmsr (MSR_VIA_FCR, lo, hi);
+ if (c->x86_model >= 6 && c->x86_model <= 9) {
+ rdmsr(MSR_VIA_FCR, lo, hi);
lo |= (1<<1 | 1<<7);
- wrmsr (MSR_VIA_FCR, lo, hi);
- set_bit(X86_FEATURE_CX8, c->x86_capability);
+ wrmsr(MSR_VIA_FCR, lo, hi);
+ set_cpu_cap(c, X86_FEATURE_CX8);
}
/* Before Nehemiah, the C3's had 3dNOW! */
- if (c->x86_model >=6 && c->x86_model <9)
- set_bit(X86_FEATURE_3DNOW, c->x86_capability);
+ if (c->x86_model >= 6 && c->x86_model < 9)
+ set_cpu_cap(c, X86_FEATURE_3DNOW);
get_model_name(c);
display_cacheinfo(c);
}
+enum {
+ ECX8 = 1<<1,
+ EIERRINT = 1<<2,
+ DPM = 1<<3,
+ DMCE = 1<<4,
+ DSTPCLK = 1<<5,
+ ELINEAR = 1<<6,
+ DSMC = 1<<7,
+ DTLOCK = 1<<8,
+ EDCTLB = 1<<8,
+ EMMX = 1<<9,
+ DPDC = 1<<11,
+ EBRPRED = 1<<12,
+ DIC = 1<<13,
+ DDC = 1<<14,
+ DNA = 1<<15,
+ ERETSTK = 1<<16,
+ E2MMX = 1<<19,
+ EAMD3D = 1<<20,
+};
+
static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
{
- enum {
- ECX8=1<<1,
- EIERRINT=1<<2,
- DPM=1<<3,
- DMCE=1<<4,
- DSTPCLK=1<<5,
- ELINEAR=1<<6,
- DSMC=1<<7,
- DTLOCK=1<<8,
- EDCTLB=1<<8,
- EMMX=1<<9,
- DPDC=1<<11,
- EBRPRED=1<<12,
- DIC=1<<13,
- DDC=1<<14,
- DNA=1<<15,
- ERETSTK=1<<16,
- E2MMX=1<<19,
- EAMD3D=1<<20,
- };
char *name;
- u32 fcr_set=0;
- u32 fcr_clr=0;
- u32 lo,hi,newlo;
- u32 aa,bb,cc,dd;
+ u32 fcr_set = 0;
+ u32 fcr_clr = 0;
+ u32 lo, hi, newlo;
+ u32 aa, bb, cc, dd;
- /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
- clear_bit(0*32+31, c->x86_capability);
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
switch (c->x86) {
-
- case 5:
- switch(c->x86_model) {
- case 4:
- name="C6";
- fcr_set=ECX8|DSMC|EDCTLB|EMMX|ERETSTK;
- fcr_clr=DPDC;
- printk(KERN_NOTICE "Disabling bugged TSC.\n");
- clear_bit(X86_FEATURE_TSC, c->x86_capability);
+ case 5:
+ switch (c->x86_model) {
+ case 4:
+ name = "C6";
+ fcr_set = ECX8|DSMC|EDCTLB|EMMX|ERETSTK;
+ fcr_clr = DPDC;
+ printk(KERN_NOTICE "Disabling bugged TSC.\n");
+ clear_cpu_cap(c, X86_FEATURE_TSC);
#ifdef CONFIG_X86_OOSTORE
- centaur_create_optimal_mcr();
- /* Enable
- write combining on non-stack, non-string
- write combining on string, all types
- weak write ordering
-
- The C6 original lacks weak read order
-
- Note 0x120 is write only on Winchip 1 */
-
- wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0);
-#endif
+ centaur_create_optimal_mcr();
+ /*
+ * Enable:
+ * write combining on non-stack, non-string
+ * write combining on string, all types
+ * weak write ordering
+ *
+ * The C6 original lacks weak read order
+ *
+ * Note 0x120 is write only on Winchip 1
+ */
+ wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0);
+#endif
+ break;
+ case 8:
+ switch (c->x86_mask) {
+ default:
+ name = "2";
+ break;
+ case 7 ... 9:
+ name = "2A";
break;
- case 8:
- switch(c->x86_mask) {
- default:
- name="2";
- break;
- case 7 ... 9:
- name="2A";
- break;
- case 10 ... 15:
- name="2B";
- break;
- }
- fcr_set=ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|E2MMX|EAMD3D;
- fcr_clr=DPDC;
+ case 10 ... 15:
+ name = "2B";
+ break;
+ }
+ fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
+ E2MMX|EAMD3D;
+ fcr_clr = DPDC;
#ifdef CONFIG_X86_OOSTORE
- winchip2_unprotect_mcr();
- winchip2_create_optimal_mcr();
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- /* Enable
- write combining on non-stack, non-string
- write combining on string, all types
- weak write ordering
- */
- lo|=31;
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
- winchip2_protect_mcr();
+ winchip2_unprotect_mcr();
+ winchip2_create_optimal_mcr();
+ rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
+ /*
+ * Enable:
+ * write combining on non-stack, non-string
+ * write combining on string, all types
+ * weak write ordering
+ */
+ lo |= 31;
+ wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
+ winchip2_protect_mcr();
#endif
- break;
- case 9:
- name="3";
- fcr_set=ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|E2MMX|EAMD3D;
- fcr_clr=DPDC;
+ break;
+ case 9:
+ name = "3";
+ fcr_set = ECX8|DSMC|DTLOCK|EMMX|EBRPRED|ERETSTK|
+ E2MMX|EAMD3D;
+ fcr_clr = DPDC;
#ifdef CONFIG_X86_OOSTORE
- winchip2_unprotect_mcr();
- winchip2_create_optimal_mcr();
- rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
- /* Enable
- write combining on non-stack, non-string
- write combining on string, all types
- weak write ordering
- */
- lo|=31;
- wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
- winchip2_protect_mcr();
+ winchip2_unprotect_mcr();
+ winchip2_create_optimal_mcr();
+ rdmsr(MSR_IDT_MCR_CTRL, lo, hi);
+ /*
+ * Enable:
+ * write combining on non-stack, non-string
+ * write combining on string, all types
+ * weak write ordering
+ */
+ lo |= 31;
+ wrmsr(MSR_IDT_MCR_CTRL, lo, hi);
+ winchip2_protect_mcr();
#endif
- break;
- default:
- name="??";
- }
+ break;
+ default:
+ name = "??";
+ }
- rdmsr(MSR_IDT_FCR1, lo, hi);
- newlo=(lo|fcr_set) & (~fcr_clr);
+ rdmsr(MSR_IDT_FCR1, lo, hi);
+ newlo = (lo|fcr_set) & (~fcr_clr);
- if (newlo!=lo) {
- printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n", lo, newlo );
- wrmsr(MSR_IDT_FCR1, newlo, hi );
- } else {
- printk(KERN_INFO "Centaur FCR is 0x%X\n",lo);
- }
- /* Emulate MTRRs using Centaur's MCR. */
- set_bit(X86_FEATURE_CENTAUR_MCR, c->x86_capability);
- /* Report CX8 */
- set_bit(X86_FEATURE_CX8, c->x86_capability);
- /* Set 3DNow! on Winchip 2 and above. */
- if (c->x86_model >=8)
- set_bit(X86_FEATURE_3DNOW, c->x86_capability);
- /* See if we can find out some more. */
- if ( cpuid_eax(0x80000000) >= 0x80000005 ) {
- /* Yes, we can. */
- cpuid(0x80000005,&aa,&bb,&cc,&dd);
- /* Add L1 data and code cache sizes. */
- c->x86_cache_size = (cc>>24)+(dd>>24);
- }
- sprintf( c->x86_model_id, "WinChip %s", name );
- break;
+ if (newlo != lo) {
+ printk(KERN_INFO "Centaur FCR was 0x%X now 0x%X\n",
+ lo, newlo);
+ wrmsr(MSR_IDT_FCR1, newlo, hi);
+ } else {
+ printk(KERN_INFO "Centaur FCR is 0x%X\n", lo);
+ }
+ /* Emulate MTRRs using Centaur's MCR. */
+ set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
+ /* Report CX8 */
+ set_cpu_cap(c, X86_FEATURE_CX8);
+ /* Set 3DNow! on Winchip 2 and above. */
+ if (c->x86_model >= 8)
+ set_cpu_cap(c, X86_FEATURE_3DNOW);
+ /* See if we can find out some more. */
+ if (cpuid_eax(0x80000000) >= 0x80000005) {
+ /* Yes, we can. */
+ cpuid(0x80000005, &aa, &bb, &cc, &dd);
+ /* Add L1 data and code cache sizes. */
+ c->x86_cache_size = (cc>>24)+(dd>>24);
+ }
+ sprintf(c->x86_model_id, "WinChip %s", name);
+ break;
- case 6:
- init_c3(c);
- break;
+ case 6:
+ init_c3(c);
+ break;
}
}
-static unsigned int __cpuinit centaur_size_cache(struct cpuinfo_x86 * c, unsigned int size)
+static unsigned int __cpuinit
+centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
/* VIA C3 CPUs (670-68F) need further shifting. */
if ((c->x86 == 6) && ((c->x86_model == 7) || (c->x86_model == 8)))
size >>= 8;
- /* VIA also screwed up Nehemiah stepping 1, and made
- it return '65KB' instead of '64KB'
- - Note, it seems this may only be in engineering samples. */
- if ((c->x86==6) && (c->x86_model==9) && (c->x86_mask==1) && (size==65))
- size -=1;
+ /*
+ * There's also an erratum in Nehemiah stepping 1, which
+ * returns '65KB' instead of '64KB'
+ * - Note, it seems this may only be in engineering samples.
+ */
+ if ((c->x86 == 6) && (c->x86_model == 9) &&
+ (c->x86_mask == 1) && (size == 65))
+ size -= 1;
return size;
}
@@ -464,8 +466,4 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
.c_size_cache = centaur_size_cache,
};
-int __init centaur_init_cpu(void)
-{
- cpu_devs[X86_VENDOR_CENTAUR] = &centaur_cpu_dev;
- return 0;
-}
+cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a38aafaefc23..35b4f6a9c8ef 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -5,7 +5,6 @@
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/bootmem.h>
-#include <asm/semaphore.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/msr.h>
@@ -62,9 +61,9 @@ __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
static int cachesize_override __cpuinitdata = -1;
static int disable_x86_serial_nr __cpuinitdata = 1;
-struct cpu_dev * cpu_devs[X86_VENDOR_NUM] = {};
+struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
-static void __cpuinit default_init(struct cpuinfo_x86 * c)
+static void __cpuinit default_init(struct cpuinfo_x86 *c)
{
/* Not much we can do here... */
/* Check if at least it has cpuid */
@@ -81,11 +80,11 @@ static struct cpu_dev __cpuinitdata default_cpu = {
.c_init = default_init,
.c_vendor = "Unknown",
};
-static struct cpu_dev * this_cpu __cpuinitdata = &default_cpu;
+static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
static int __init cachesize_setup(char *str)
{
- get_option (&str, &cachesize_override);
+ get_option(&str, &cachesize_override);
return 1;
}
__setup("cachesize=", cachesize_setup);
@@ -107,12 +106,12 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c)
/* Intel chips right-justify this string for some dumb reason;
undo that brain damage */
p = q = &c->x86_model_id[0];
- while ( *p == ' ' )
+ while (*p == ' ')
p++;
- if ( p != q ) {
- while ( *p )
+ if (p != q) {
+ while (*p)
*q++ = *p++;
- while ( q <= &c->x86_model_id[48] )
+ while (q <= &c->x86_model_id[48])
*q++ = '\0'; /* Zero-pad the rest */
}
@@ -130,7 +129,7 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
cpuid(0x80000005, &dummy, &dummy, &ecx, &edx);
printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
- c->x86_cache_size=(ecx>>24)+(edx>>24);
+ c->x86_cache_size = (ecx>>24)+(edx>>24);
}
if (n < 0x80000006) /* Some chips just has a large L1. */
@@ -138,16 +137,16 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
ecx = cpuid_ecx(0x80000006);
l2size = ecx >> 16;
-
+
/* do processor-specific cache resizing */
if (this_cpu->c_size_cache)
- l2size = this_cpu->c_size_cache(c,l2size);
+ l2size = this_cpu->c_size_cache(c, l2size);
/* Allow user to override all this if necessary. */
if (cachesize_override != -1)
l2size = cachesize_override;
- if ( l2size == 0 )
+ if (l2size == 0)
return; /* Again, no L2 cache is possible */
c->x86_cache_size = l2size;
@@ -156,16 +155,19 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
l2size, ecx & 0xFF);
}
-/* Naming convention should be: <Name> [(<Codename>)] */
-/* This table only is used unless init_<vendor>() below doesn't set it; */
-/* in particular, if CPUID levels 0x80000002..4 are supported, this isn't used */
+/*
+ * Naming convention should be: <Name> [(<Codename>)]
+ * This table only is used unless init_<vendor>() below doesn't set it;
+ * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
+ *
+ */
/* Look up CPU names by table lookup. */
static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
{
struct cpu_model_info *info;
- if ( c->x86_model >= 16 )
+ if (c->x86_model >= 16)
return NULL; /* Range check */
if (!this_cpu)
@@ -190,9 +192,9 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
for (i = 0; i < X86_VENDOR_NUM; i++) {
if (cpu_devs[i]) {
- if (!strcmp(v,cpu_devs[i]->c_ident[0]) ||
- (cpu_devs[i]->c_ident[1] &&
- !strcmp(v,cpu_devs[i]->c_ident[1]))) {
+ if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
+ (cpu_devs[i]->c_ident[1] &&
+ !strcmp(v, cpu_devs[i]->c_ident[1]))) {
c->x86_vendor = i;
if (!early)
this_cpu = cpu_devs[i];
@@ -210,7 +212,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early)
}
-static int __init x86_fxsr_setup(char * s)
+static int __init x86_fxsr_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_FXSR);
setup_clear_cpu_cap(X86_FEATURE_XMM);
@@ -219,7 +221,7 @@ static int __init x86_fxsr_setup(char * s)
__setup("nofxsr", x86_fxsr_setup);
-static int __init x86_sep_setup(char * s)
+static int __init x86_sep_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_SEP);
return 1;
@@ -306,14 +308,30 @@ static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
}
-}
+ clear_cpu_cap(c, X86_FEATURE_PAT);
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (c->x86 >= 0xf && c->x86 <= 0x11)
+ set_cpu_cap(c, X86_FEATURE_PAT);
+ break;
+ case X86_VENDOR_INTEL:
+ if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15))
+ set_cpu_cap(c, X86_FEATURE_PAT);
+ break;
+ }
-/* Do minimum CPU detection early.
- Fields really needed: vendor, cpuid_level, family, model, mask, cache alignment.
- The others are not touched to avoid unwanted side effects.
+}
- WARNING: this function is only called on the BP. Don't add code here
- that is supposed to run on all CPUs. */
+/*
+ * Do minimum CPU detection early.
+ * Fields really needed: vendor, cpuid_level, family, model, mask,
+ * cache alignment.
+ * The others are not touched to avoid unwanted side effects.
+ *
+ * WARNING: this function is only called on the BP. Don't add code here
+ * that is supposed to run on all CPUs.
+ */
static void __init early_cpu_detect(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -328,19 +346,14 @@ static void __init early_cpu_detect(void)
get_cpu_vendor(c, 1);
- switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- early_init_amd(c);
- break;
- case X86_VENDOR_INTEL:
- early_init_intel(c);
- break;
- }
+ if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
+ cpu_devs[c->x86_vendor]->c_early_init)
+ cpu_devs[c->x86_vendor]->c_early_init(c);
early_get_cap(c);
}
-static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
+static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
{
u32 tfms, xlvl;
unsigned int ebx;
@@ -351,13 +364,12 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
(unsigned int *)&c->x86_vendor_id[0],
(unsigned int *)&c->x86_vendor_id[8],
(unsigned int *)&c->x86_vendor_id[4]);
-
+
get_cpu_vendor(c, 0);
/* Initialize the standard set of capabilities */
/* Note that the vendor-specific code below might override */
-
/* Intel-defined flags: level 0x00000001 */
- if ( c->cpuid_level >= 0x00000001 ) {
+ if (c->cpuid_level >= 0x00000001) {
u32 capability, excap;
cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
c->x86_capability[0] = capability;
@@ -369,12 +381,14 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
if (c->x86 >= 0x6)
c->x86_model += ((tfms >> 16) & 0xF) << 4;
c->x86_mask = tfms & 15;
+ c->initial_apicid = (ebx >> 24) & 0xFF;
#ifdef CONFIG_X86_HT
- c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
+ c->apicid = phys_pkg_id(c->initial_apicid, 0);
+ c->phys_proc_id = c->initial_apicid;
#else
- c->apicid = (ebx >> 24) & 0xFF;
+ c->apicid = c->initial_apicid;
#endif
- if (c->x86_capability[0] & (1<<19))
+ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
} else {
/* Have CPUID level 0 only - unheard of */
@@ -383,33 +397,42 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 * c)
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
- if ( (xlvl & 0xffff0000) == 0x80000000 ) {
- if ( xlvl >= 0x80000001 ) {
+ if ((xlvl & 0xffff0000) == 0x80000000) {
+ if (xlvl >= 0x80000001) {
c->x86_capability[1] = cpuid_edx(0x80000001);
c->x86_capability[6] = cpuid_ecx(0x80000001);
}
- if ( xlvl >= 0x80000004 )
+ if (xlvl >= 0x80000004)
get_model_name(c); /* Default name */
}
init_scattered_cpuid_features(c);
}
-#ifdef CONFIG_X86_HT
- c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
-#endif
+ clear_cpu_cap(c, X86_FEATURE_PAT);
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (c->x86 >= 0xf && c->x86 <= 0x11)
+ set_cpu_cap(c, X86_FEATURE_PAT);
+ break;
+ case X86_VENDOR_INTEL:
+ if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15))
+ set_cpu_cap(c, X86_FEATURE_PAT);
+ break;
+ }
}
static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
{
- if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr ) {
+ if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
/* Disable processor serial number */
- unsigned long lo,hi;
- rdmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
+ unsigned long lo, hi;
+ rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
lo |= 0x200000;
- wrmsr(MSR_IA32_BBL_CR_CTL,lo,hi);
+ wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
printk(KERN_NOTICE "CPU serial number disabled.\n");
- clear_bit(X86_FEATURE_PN, c->x86_capability);
+ clear_cpu_cap(c, X86_FEATURE_PN);
/* Disabling the serial number may affect the cpuid level */
c->cpuid_level = cpuid_eax(0);
@@ -444,9 +467,11 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
memset(&c->x86_capability, 0, sizeof c->x86_capability);
if (!have_cpuid_p()) {
- /* First of all, decide if this is a 486 or higher */
- /* It's a 486 if we can modify the AC flag */
- if ( flag_is_changeable_p(X86_EFLAGS_AC) )
+ /*
+ * First of all, decide if this is a 486 or higher
+ * It's a 486 if we can modify the AC flag
+ */
+ if (flag_is_changeable_p(X86_EFLAGS_AC))
c->x86 = 4;
else
c->x86 = 3;
@@ -479,10 +504,10 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
*/
/* If the model name is still unset, do table lookup. */
- if ( !c->x86_model_id[0] ) {
+ if (!c->x86_model_id[0]) {
char *p;
p = table_lookup_model(c);
- if ( p )
+ if (p)
strcpy(c->x86_model_id, p);
else
/* Last resort... */
@@ -496,9 +521,9 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
* common between the CPUs. The first time this routine gets
* executed, c == &boot_cpu_data.
*/
- if ( c != &boot_cpu_data ) {
+ if (c != &boot_cpu_data) {
/* AND the already accumulated flags with these */
- for ( i = 0 ; i < NCAPINTS ; i++ )
+ for (i = 0 ; i < NCAPINTS ; i++)
boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
}
@@ -542,7 +567,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
if (smp_num_siblings == 1) {
printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
- } else if (smp_num_siblings > 1 ) {
+ } else if (smp_num_siblings > 1) {
if (smp_num_siblings > NR_CPUS) {
printk(KERN_WARNING "CPU: Unsupported number of the "
@@ -552,7 +577,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
}
index_msb = get_count_order(smp_num_siblings);
- c->phys_proc_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb);
+ c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
c->phys_proc_id);
@@ -563,7 +588,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
core_bits = get_count_order(c->x86_max_cores);
- c->cpu_core_id = phys_pkg_id((ebx >> 24) & 0xFF, index_msb) &
+ c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
((1 << core_bits) - 1);
if (c->x86_max_cores > 1)
@@ -597,7 +622,7 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
else
printk("%s", c->x86_model_id);
- if (c->x86_mask || c->cpuid_level >= 0)
+ if (c->x86_mask || c->cpuid_level >= 0)
printk(" stepping %02x\n", c->x86_mask);
else
printk("\n");
@@ -616,23 +641,15 @@ __setup("clearcpuid=", setup_disablecpuid);
cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
-/* This is hacky. :)
- * We're emulating future behavior.
- * In the future, the cpu-specific init functions will be called implicitly
- * via the magic of initcalls.
- * They will insert themselves into the cpu_devs structure.
- * Then, when cpu_init() is called, we can just iterate over that array.
- */
void __init early_cpu_init(void)
{
- intel_cpu_init();
- cyrix_init_cpu();
- nsc_init_cpu();
- amd_init_cpu();
- centaur_init_cpu();
- transmeta_init_cpu();
- nexgen_init_cpu();
- umc_init_cpu();
+ struct cpu_vendor_dev *cvdev;
+
+ for (cvdev = __x86cpuvendor_start ;
+ cvdev < __x86cpuvendor_end ;
+ cvdev++)
+ cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
+
early_cpu_detect();
}
@@ -666,7 +683,7 @@ void __cpuinit cpu_init(void)
{
int cpu = smp_processor_id();
struct task_struct *curr = current;
- struct tss_struct * t = &per_cpu(init_tss, cpu);
+ struct tss_struct *t = &per_cpu(init_tss, cpu);
struct thread_struct *thread = &curr->thread;
if (cpu_test_and_set(cpu, cpu_initialized)) {
@@ -692,7 +709,7 @@ void __cpuinit cpu_init(void)
enter_lazy_tlb(&init_mm, curr);
load_sp0(t, thread);
- set_tss_desc(cpu,t);
+ set_tss_desc(cpu, t);
load_TR_desc();
load_LDT(&init_mm.context);
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index e0b38c33d842..783691b2a738 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -14,6 +14,7 @@ struct cpu_dev {
struct cpu_model_info c_models[4];
+ void (*c_early_init)(struct cpuinfo_x86 *c);
void (*c_init)(struct cpuinfo_x86 * c);
void (*c_identify)(struct cpuinfo_x86 * c);
unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
@@ -21,18 +22,17 @@ struct cpu_dev {
extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM];
+struct cpu_vendor_dev {
+ int vendor;
+ struct cpu_dev *cpu_dev;
+};
+
+#define cpu_vendor_dev_register(cpu_vendor_id, cpu_dev) \
+ static struct cpu_vendor_dev __cpu_vendor_dev_##cpu_vendor_id __used \
+ __attribute__((__section__(".x86cpuvendor.init"))) = \
+ { cpu_vendor_id, cpu_dev }
+
+extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[];
+
extern int get_model_name(struct cpuinfo_x86 *c);
extern void display_cacheinfo(struct cpuinfo_x86 *c);
-
-extern void early_init_intel(struct cpuinfo_x86 *c);
-extern void early_init_amd(struct cpuinfo_x86 *c);
-
-/* Specific CPU type init functions */
-int intel_cpu_init(void);
-int amd_init_cpu(void);
-int cyrix_init_cpu(void);
-int nsc_init_cpu(void);
-int centaur_init_cpu(void);
-int transmeta_init_cpu(void);
-int nexgen_init_cpu(void);
-int umc_init_cpu(void);
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index a962dcb9c408..8db8f73503b3 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -192,9 +192,9 @@ static void drv_read(struct drv_cmd *cmd)
cpumask_t saved_mask = current->cpus_allowed;
cmd->val = 0;
- set_cpus_allowed(current, cmd->mask);
+ set_cpus_allowed_ptr(current, &cmd->mask);
do_drv_read(cmd);
- set_cpus_allowed(current, saved_mask);
+ set_cpus_allowed_ptr(current, &saved_mask);
}
static void drv_write(struct drv_cmd *cmd)
@@ -203,30 +203,30 @@ static void drv_write(struct drv_cmd *cmd)
unsigned int i;
for_each_cpu_mask(i, cmd->mask) {
- set_cpus_allowed(current, cpumask_of_cpu(i));
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(i));
do_drv_write(cmd);
}
- set_cpus_allowed(current, saved_mask);
+ set_cpus_allowed_ptr(current, &saved_mask);
return;
}
-static u32 get_cur_val(cpumask_t mask)
+static u32 get_cur_val(const cpumask_t *mask)
{
struct acpi_processor_performance *perf;
struct drv_cmd cmd;
- if (unlikely(cpus_empty(mask)))
+ if (unlikely(cpus_empty(*mask)))
return 0;
- switch (per_cpu(drv_data, first_cpu(mask))->cpu_feature) {
+ switch (per_cpu(drv_data, first_cpu(*mask))->cpu_feature) {
case SYSTEM_INTEL_MSR_CAPABLE:
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
break;
case SYSTEM_IO_CAPABLE:
cmd.type = SYSTEM_IO_CAPABLE;
- perf = per_cpu(drv_data, first_cpu(mask))->acpi_data;
+ perf = per_cpu(drv_data, first_cpu(*mask))->acpi_data;
cmd.addr.io.port = perf->control_register.address;
cmd.addr.io.bit_width = perf->control_register.bit_width;
break;
@@ -234,7 +234,7 @@ static u32 get_cur_val(cpumask_t mask)
return 0;
}
- cmd.mask = mask;
+ cmd.mask = *mask;
drv_read(&cmd);
@@ -271,7 +271,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
unsigned int retval;
saved_mask = current->cpus_allowed;
- set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
if (get_cpu() != cpu) {
/* We were not able to run on requested processor */
put_cpu();
@@ -329,7 +329,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100;
put_cpu();
- set_cpus_allowed(current, saved_mask);
+ set_cpus_allowed_ptr(current, &saved_mask);
dprintk("cpu %d: performance percent %d\n", cpu, perf_percent);
return retval;
@@ -339,6 +339,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
{
struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu);
unsigned int freq;
+ unsigned int cached_freq;
dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
@@ -347,13 +348,22 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
return 0;
}
- freq = extract_freq(get_cur_val(cpumask_of_cpu(cpu)), data);
+ cached_freq = data->freq_table[data->acpi_data->state].frequency;
+ freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data);
+ if (freq != cached_freq) {
+ /*
+ * The dreaded BIOS frequency change behind our back.
+ * Force set the frequency on next target call.
+ */
+ data->resume = 1;
+ }
+
dprintk("cur freq = %u\n", freq);
return freq;
}
-static unsigned int check_freqs(cpumask_t mask, unsigned int freq,
+static unsigned int check_freqs(const cpumask_t *mask, unsigned int freq,
struct acpi_cpufreq_data *data)
{
unsigned int cur_freq;
@@ -449,7 +459,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
drv_write(&cmd);
if (acpi_pstate_strict) {
- if (!check_freqs(cmd.mask, freqs.new, data)) {
+ if (!check_freqs(&cmd.mask, freqs.new, data)) {
dprintk("acpi_cpufreq_target failed (%d)\n",
policy->cpu);
return -EAGAIN;
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 14791ec55cfd..199e4e05e5dc 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -289,8 +289,8 @@ static int __init cpufreq_p4_init(void)
if (c->x86_vendor != X86_VENDOR_INTEL)
return -ENODEV;
- if (!test_bit(X86_FEATURE_ACPI, c->x86_capability) ||
- !test_bit(X86_FEATURE_ACC, c->x86_capability))
+ if (!test_cpu_cap(c, X86_FEATURE_ACPI) ||
+ !test_cpu_cap(c, X86_FEATURE_ACC))
return -ENODEV;
ret = cpufreq_register_driver(&p4clockmod_driver);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index c99d59d8ef2e..46d4034d9f37 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -478,12 +478,12 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvi
static int check_supported_cpu(unsigned int cpu)
{
- cpumask_t oldmask = CPU_MASK_ALL;
+ cpumask_t oldmask;
u32 eax, ebx, ecx, edx;
unsigned int rc = 0;
oldmask = current->cpus_allowed;
- set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
if (smp_processor_id() != cpu) {
printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu);
@@ -528,7 +528,7 @@ static int check_supported_cpu(unsigned int cpu)
rc = 1;
out:
- set_cpus_allowed(current, oldmask);
+ set_cpus_allowed_ptr(current, &oldmask);
return rc;
}
@@ -1015,7 +1015,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
/* Driver entry point to switch to the target frequency */
static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
{
- cpumask_t oldmask = CPU_MASK_ALL;
+ cpumask_t oldmask;
struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
u32 checkfid;
u32 checkvid;
@@ -1030,7 +1030,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
/* only run on specific CPU from here on */
oldmask = current->cpus_allowed;
- set_cpus_allowed(current, cpumask_of_cpu(pol->cpu));
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu));
if (smp_processor_id() != pol->cpu) {
printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1085,7 +1085,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi
ret = 0;
err_out:
- set_cpus_allowed(current, oldmask);
+ set_cpus_allowed_ptr(current, &oldmask);
return ret;
}
@@ -1104,7 +1104,7 @@ static int powernowk8_verify(struct cpufreq_policy *pol)
static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
{
struct powernow_k8_data *data;
- cpumask_t oldmask = CPU_MASK_ALL;
+ cpumask_t oldmask;
int rc;
if (!cpu_online(pol->cpu))
@@ -1145,7 +1145,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
/* only run on specific CPU from here on */
oldmask = current->cpus_allowed;
- set_cpus_allowed(current, cpumask_of_cpu(pol->cpu));
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu));
if (smp_processor_id() != pol->cpu) {
printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1164,7 +1164,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
fidvid_msr_init();
/* run on any CPU again */
- set_cpus_allowed(current, oldmask);
+ set_cpus_allowed_ptr(current, &oldmask);
if (cpu_family == CPU_HW_PSTATE)
pol->cpus = cpumask_of_cpu(pol->cpu);
@@ -1205,7 +1205,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
return 0;
err_out:
- set_cpus_allowed(current, oldmask);
+ set_cpus_allowed_ptr(current, &oldmask);
powernow_k8_cpu_exit_acpi(data);
kfree(data);
@@ -1242,10 +1242,11 @@ static unsigned int powernowk8_get (unsigned int cpu)
if (!data)
return -EINVAL;
- set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
if (smp_processor_id() != cpu) {
- printk(KERN_ERR PFX "limiting to CPU %d failed in powernowk8_get\n", cpu);
- set_cpus_allowed(current, oldmask);
+ printk(KERN_ERR PFX
+ "limiting to CPU %d failed in powernowk8_get\n", cpu);
+ set_cpus_allowed_ptr(current, &oldmask);
return 0;
}
@@ -1253,13 +1254,14 @@ static unsigned int powernowk8_get (unsigned int cpu)
goto out;
if (cpu_family == CPU_HW_PSTATE)
- khz = find_khz_freq_from_pstate(data->powernow_table, data->currpstate);
+ khz = find_khz_freq_from_pstate(data->powernow_table,
+ data->currpstate);
else
khz = find_khz_freq_from_fid(data->currfid);
out:
- set_cpus_allowed(current, oldmask);
+ set_cpus_allowed_ptr(current, &oldmask);
return khz;
}
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 3031f1196192..908dd347c67e 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -315,7 +315,7 @@ static unsigned int get_cur_freq(unsigned int cpu)
cpumask_t saved_mask;
saved_mask = current->cpus_allowed;
- set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
if (smp_processor_id() != cpu)
return 0;
@@ -333,7 +333,7 @@ static unsigned int get_cur_freq(unsigned int cpu)
clock_freq = extract_clock(l, cpu, 1);
}
- set_cpus_allowed(current, saved_mask);
+ set_cpus_allowed_ptr(current, &saved_mask);
return clock_freq;
}
@@ -487,7 +487,7 @@ static int centrino_target (struct cpufreq_policy *policy,
else
cpu_set(j, set_mask);
- set_cpus_allowed(current, set_mask);
+ set_cpus_allowed_ptr(current, &set_mask);
preempt_disable();
if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) {
dprintk("couldn't limit to CPUs in this domain\n");
@@ -555,7 +555,8 @@ static int centrino_target (struct cpufreq_policy *policy,
if (!cpus_empty(covered_cpus)) {
for_each_cpu_mask(j, covered_cpus) {
- set_cpus_allowed(current, cpumask_of_cpu(j));
+ set_cpus_allowed_ptr(current,
+ &cpumask_of_cpu(j));
wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
}
}
@@ -569,12 +570,12 @@ static int centrino_target (struct cpufreq_policy *policy,
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
}
}
- set_cpus_allowed(current, saved_mask);
+ set_cpus_allowed_ptr(current, &saved_mask);
return 0;
migrate_end:
preempt_enable();
- set_cpus_allowed(current, saved_mask);
+ set_cpus_allowed_ptr(current, &saved_mask);
return 0;
}
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 14d68aa301ee..1b50244b1fdf 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -229,22 +229,22 @@ static unsigned int speedstep_detect_chipset (void)
return 0;
}
-static unsigned int _speedstep_get(cpumask_t cpus)
+static unsigned int _speedstep_get(const cpumask_t *cpus)
{
unsigned int speed;
cpumask_t cpus_allowed;
cpus_allowed = current->cpus_allowed;
- set_cpus_allowed(current, cpus);
+ set_cpus_allowed_ptr(current, cpus);
speed = speedstep_get_processor_frequency(speedstep_processor);
- set_cpus_allowed(current, cpus_allowed);
+ set_cpus_allowed_ptr(current, &cpus_allowed);
dprintk("detected %u kHz as current frequency\n", speed);
return speed;
}
static unsigned int speedstep_get(unsigned int cpu)
{
- return _speedstep_get(cpumask_of_cpu(cpu));
+ return _speedstep_get(&cpumask_of_cpu(cpu));
}
/**
@@ -267,7 +267,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate))
return -EINVAL;
- freqs.old = _speedstep_get(policy->cpus);
+ freqs.old = _speedstep_get(&policy->cpus);
freqs.new = speedstep_freqs[newstate].frequency;
freqs.cpu = policy->cpu;
@@ -285,12 +285,12 @@ static int speedstep_target (struct cpufreq_policy *policy,
}
/* switch to physical CPU where state is to be changed */
- set_cpus_allowed(current, policy->cpus);
+ set_cpus_allowed_ptr(current, &policy->cpus);
speedstep_set_state(newstate);
/* allow to be run on all CPUs */
- set_cpus_allowed(current, cpus_allowed);
+ set_cpus_allowed_ptr(current, &cpus_allowed);
for_each_cpu_mask(i, policy->cpus) {
freqs.cpu = i;
@@ -326,7 +326,7 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
#endif
cpus_allowed = current->cpus_allowed;
- set_cpus_allowed(current, policy->cpus);
+ set_cpus_allowed_ptr(current, &policy->cpus);
/* detect low and high frequency and transition latency */
result = speedstep_get_freqs(speedstep_processor,
@@ -334,12 +334,12 @@ static int speedstep_cpu_init(struct cpufreq_policy *policy)
&speedstep_freqs[SPEEDSTEP_HIGH].frequency,
&policy->cpuinfo.transition_latency,
&speedstep_set_state);
- set_cpus_allowed(current, cpus_allowed);
+ set_cpus_allowed_ptr(current, &cpus_allowed);
if (result)
return result;
/* get current speed setting */
- speed = _speedstep_get(policy->cpus);
+ speed = _speedstep_get(&policy->cpus);
if (!speed)
return -EIO;
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 7139b0262703..3fd7a67bb06a 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -19,7 +19,7 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
{
unsigned char ccr2, ccr3;
unsigned long flags;
-
+
/* we test for DEVID by checking whether CCR3 is writable */
local_irq_save(flags);
ccr3 = getCx86(CX86_CCR3);
@@ -37,8 +37,7 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
setCx86(CX86_CCR2, ccr2);
*dir0 = 0xfe;
}
- }
- else {
+ } else {
setCx86(CX86_CCR3, ccr3); /* restore CCR3 */
/* read DIR0 and DIR1 CPU registers */
@@ -86,7 +85,7 @@ static char cyrix_model_mult2[] __cpuinitdata = "12233445";
static void __cpuinit check_cx686_slop(struct cpuinfo_x86 *c)
{
unsigned long flags;
-
+
if (Cx86_dir0_msb == 3) {
unsigned char ccr3, ccr5;
@@ -132,7 +131,7 @@ static void __cpuinit set_cx86_memwb(void)
/* set 'Not Write-through' */
write_cr0(read_cr0() | X86_CR0_NW);
/* CCR2 bit 2: lock NW bit and set WT1 */
- setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 );
+ setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14);
}
static void __cpuinit set_cx86_inc(void)
@@ -148,7 +147,7 @@ static void __cpuinit set_cx86_inc(void)
setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02);
/* PCR0 -- Performance Control */
/* Incrementor Margin 10 */
- setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04);
+ setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04);
setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
}
@@ -167,16 +166,16 @@ static void __cpuinit geode_configure(void)
ccr3 = getCx86(CX86_CCR3);
setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
-
+
/* FPU fast, DTE cache, Mem bypass */
setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38);
setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
-
+
set_cx86_memwb();
- set_cx86_reorder();
+ set_cx86_reorder();
set_cx86_inc();
-
+
local_irq_restore(flags);
}
@@ -187,14 +186,16 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
char *buf = c->x86_model_id;
const char *p = NULL;
- /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
- clear_bit(0*32+31, c->x86_capability);
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0*32+31);
/* Cyrix used bit 24 in extended (AMD) CPUID for Cyrix MMX extensions */
- if ( test_bit(1*32+24, c->x86_capability) ) {
- clear_bit(1*32+24, c->x86_capability);
- set_bit(X86_FEATURE_CXMMX, c->x86_capability);
+ if (test_cpu_cap(c, 1*32+24)) {
+ clear_cpu_cap(c, 1*32+24);
+ set_cpu_cap(c, X86_FEATURE_CXMMX);
}
do_cyrix_devid(&dir0, &dir1);
@@ -213,7 +214,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
* the model, multiplier and stepping. Black magic included,
* to make the silicon step/rev numbers match the printed ones.
*/
-
+
switch (dir0_msn) {
unsigned char tmp;
@@ -241,7 +242,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
} else /* 686 */
p = Cx86_cb+1;
/* Emulate MTRRs using Cyrix's ARRs. */
- set_bit(X86_FEATURE_CYRIX_ARR, c->x86_capability);
+ set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
/* 6x86's contain this bug */
c->coma_bug = 1;
break;
@@ -250,17 +251,18 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
#ifdef CONFIG_PCI
{
u32 vendor, device;
- /* It isn't really a PCI quirk directly, but the cure is the
- same. The MediaGX has deep magic SMM stuff that handles the
- SB emulation. It throws away the fifo on disable_dma() which
- is wrong and ruins the audio.
-
- Bug2: VSA1 has a wrap bug so that using maximum sized DMA
- causes bad things. According to NatSemi VSA2 has another
- bug to do with 'hlt'. I've not seen any boards using VSA2
- and X doesn't seem to support it either so who cares 8).
- VSA1 we work around however.
- */
+ /*
+ * It isn't really a PCI quirk directly, but the cure is the
+ * same. The MediaGX has deep magic SMM stuff that handles the
+ * SB emulation. It throws away the fifo on disable_dma() which
+ * is wrong and ruins the audio.
+ *
+ * Bug2: VSA1 has a wrap bug so that using maximum sized DMA
+ * causes bad things. According to NatSemi VSA2 has another
+ * bug to do with 'hlt'. I've not seen any boards using VSA2
+ * and X doesn't seem to support it either so who cares 8).
+ * VSA1 we work around however.
+ */
printk(KERN_INFO "Working around Cyrix MediaGX virtual DMA bugs.\n");
isa_dma_bridge_buggy = 2;
@@ -273,55 +275,51 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
/*
* The 5510/5520 companion chips have a funky PIT.
- */
+ */
if (vendor == PCI_VENDOR_ID_CYRIX &&
(device == PCI_DEVICE_ID_CYRIX_5510 || device == PCI_DEVICE_ID_CYRIX_5520))
mark_tsc_unstable("cyrix 5510/5520 detected");
}
#endif
- c->x86_cache_size=16; /* Yep 16K integrated cache thats it */
+ c->x86_cache_size = 16; /* Yep 16K integrated cache thats it */
/* GXm supports extended cpuid levels 'ala' AMD */
if (c->cpuid_level == 2) {
/* Enable cxMMX extensions (GX1 Datasheet 54) */
setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1);
-
+
/*
* GXm : 0x30 ... 0x5f GXm datasheet 51
* GXlv: 0x6x GXlv datasheet 54
* ? : 0x7x
* GX1 : 0x8x GX1 datasheet 56
*/
- if((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <=dir1 && dir1 <= 0x8f))
+ if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
geode_configure();
get_model_name(c); /* get CPU marketing name */
return;
- }
- else { /* MediaGX */
+ } else { /* MediaGX */
Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
p = Cx86_cb+2;
c->x86_model = (dir1 & 0x20) ? 1 : 2;
}
break;
- case 5: /* 6x86MX/M II */
- if (dir1 > 7)
- {
+ case 5: /* 6x86MX/M II */
+ if (dir1 > 7) {
dir0_msn++; /* M II */
/* Enable MMX extensions (App note 108) */
setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1);
- }
- else
- {
+ } else {
c->coma_bug = 1; /* 6x86MX, it has the bug. */
}
tmp = (!(dir0_lsn & 7) || dir0_lsn & 1) ? 2 : 0;
Cx86_cb[tmp] = cyrix_model_mult2[dir0_lsn & 7];
p = Cx86_cb+tmp;
- if (((dir1 & 0x0f) > 4) || ((dir1 & 0xf0) == 0x20))
+ if (((dir1 & 0x0f) > 4) || ((dir1 & 0xf0) == 0x20))
(c->x86_model)++;
/* Emulate MTRRs using Cyrix's ARRs. */
- set_bit(X86_FEATURE_CYRIX_ARR, c->x86_capability);
+ set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
break;
case 0xf: /* Cyrix 486 without DEVID registers */
@@ -343,7 +341,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
break;
}
strcpy(buf, Cx86_model[dir0_msn & 7]);
- if (p) strcat(buf, p);
+ if (p)
+ strcat(buf, p);
return;
}
@@ -352,7 +351,8 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
*/
static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
{
- /* There may be GX1 processors in the wild that are branded
+ /*
+ * There may be GX1 processors in the wild that are branded
* NSC and not Cyrix.
*
* This function only handles the GX processor, and kicks every
@@ -377,7 +377,7 @@ static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
* by the fact that they preserve the flags across the division of 5/2.
* PII and PPro exhibit this behavior too, but they have cpuid available.
*/
-
+
/*
* Perform the Cyrix 5/2 test. A Cyrix won't change
* the flags, while other 486 chips will.
@@ -398,27 +398,26 @@ static inline int test_cyrix_52div(void)
return (unsigned char) (test >> 8) == 0x02;
}
-static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c)
+static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
{
/* Detect Cyrix with disabled CPUID */
- if ( c->x86 == 4 && test_cyrix_52div() ) {
+ if (c->x86 == 4 && test_cyrix_52div()) {
unsigned char dir0, dir1;
-
+
strcpy(c->x86_vendor_id, "CyrixInstead");
- c->x86_vendor = X86_VENDOR_CYRIX;
-
- /* Actually enable cpuid on the older cyrix */
-
- /* Retrieve CPU revisions */
-
+ c->x86_vendor = X86_VENDOR_CYRIX;
+
+ /* Actually enable cpuid on the older cyrix */
+
+ /* Retrieve CPU revisions */
+
do_cyrix_devid(&dir0, &dir1);
- dir0>>=4;
-
+ dir0 >>= 4;
+
/* Check it is an affected model */
-
- if (dir0 == 5 || dir0 == 3)
- {
+
+ if (dir0 == 5 || dir0 == 3) {
unsigned char ccr3;
unsigned long flags;
printk(KERN_INFO "Enabling CPUID on Cyrix processor.\n");
@@ -434,26 +433,17 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 * c)
static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
.c_vendor = "Cyrix",
- .c_ident = { "CyrixInstead" },
+ .c_ident = { "CyrixInstead" },
.c_init = init_cyrix,
.c_identify = cyrix_identify,
};
-int __init cyrix_init_cpu(void)
-{
- cpu_devs[X86_VENDOR_CYRIX] = &cyrix_cpu_dev;
- return 0;
-}
+cpu_vendor_dev_register(X86_VENDOR_CYRIX, &cyrix_cpu_dev);
static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
.c_vendor = "NSC",
- .c_ident = { "Geode by NSC" },
+ .c_ident = { "Geode by NSC" },
.c_init = init_nsc,
};
-int __init nsc_init_cpu(void)
-{
- cpu_devs[X86_VENDOR_NSC] = &nsc_cpu_dev;
- return 0;
-}
-
+cpu_vendor_dev_register(X86_VENDOR_NSC, &nsc_cpu_dev);
diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c
index ee975ac6bbcb..e43ad4ad4cba 100644
--- a/arch/x86/kernel/cpu/feature_names.c
+++ b/arch/x86/kernel/cpu/feature_names.c
@@ -4,7 +4,7 @@
* This file must not contain any executable code.
*/
-#include "asm/cpufeature.h"
+#include <asm/cpufeature.h>
/*
* These flag bits must match the definitions in <asm/cpufeature.h>.
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index fae31ce747bd..fe9224c51d37 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -30,7 +30,7 @@
struct movsl_mask movsl_mask __read_mostly;
#endif
-void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
+static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
{
/* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
if (c->x86 == 15 && c->x86_cache_alignment == 64)
@@ -45,7 +45,7 @@ void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
*
* This is called before we do cpu ident work
*/
-
+
int __cpuinit ppro_with_ram_bug(void)
{
/* Uses data from early_cpu_detect now */
@@ -58,7 +58,7 @@ int __cpuinit ppro_with_ram_bug(void)
}
return 0;
}
-
+
/*
* P4 Xeon errata 037 workaround.
@@ -69,7 +69,7 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
unsigned long lo, hi;
if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
- rdmsr (MSR_IA32_MISC_ENABLE, lo, hi);
+ rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
if ((lo & (1<<9)) == 0) {
printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n");
printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n");
@@ -127,10 +127,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
*/
c->f00f_bug = 0;
if (!paravirt_enabled() && c->x86 == 5) {
- static int f00f_workaround_enabled = 0;
+ static int f00f_workaround_enabled;
c->f00f_bug = 1;
- if ( !f00f_workaround_enabled ) {
+ if (!f00f_workaround_enabled) {
trap_init_f00f_bug();
printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
f00f_workaround_enabled = 1;
@@ -139,20 +139,22 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
#endif
l2 = init_intel_cacheinfo(c);
- if (c->cpuid_level > 9 ) {
+ if (c->cpuid_level > 9) {
unsigned eax = cpuid_eax(10);
/* Check for version and the number of counters */
if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
- set_bit(X86_FEATURE_ARCH_PERFMON, c->x86_capability);
+ set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
}
/* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */
if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
- clear_bit(X86_FEATURE_SEP, c->x86_capability);
+ clear_cpu_cap(c, X86_FEATURE_SEP);
- /* Names for the Pentium II/Celeron processors
- detectable only by also checking the cache size.
- Dixon is NOT a Celeron. */
+ /*
+ * Names for the Pentium II/Celeron processors
+ * detectable only by also checking the cache size.
+ * Dixon is NOT a Celeron.
+ */
if (c->x86 == 6) {
switch (c->x86_model) {
case 5:
@@ -163,14 +165,14 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
p = "Mobile Pentium II (Dixon)";
}
break;
-
+
case 6:
if (l2 == 128)
p = "Celeron (Mendocino)";
else if (c->x86_mask == 0 || c->x86_mask == 5)
p = "Celeron-A";
break;
-
+
case 8:
if (l2 == 128)
p = "Celeron (Coppermine)";
@@ -178,9 +180,9 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
}
}
- if ( p )
+ if (p)
strcpy(c->x86_model_id, p);
-
+
c->x86_max_cores = num_cpu_cores(c);
detect_ht(c);
@@ -207,28 +209,29 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
#endif
if (cpu_has_xmm2)
- set_bit(X86_FEATURE_LFENCE_RDTSC, c->x86_capability);
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
if (c->x86 == 15) {
- set_bit(X86_FEATURE_P4, c->x86_capability);
+ set_cpu_cap(c, X86_FEATURE_P4);
}
- if (c->x86 == 6)
- set_bit(X86_FEATURE_P3, c->x86_capability);
+ if (c->x86 == 6)
+ set_cpu_cap(c, X86_FEATURE_P3);
if (cpu_has_ds) {
unsigned int l1;
rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
if (!(l1 & (1<<11)))
- set_bit(X86_FEATURE_BTS, c->x86_capability);
+ set_cpu_cap(c, X86_FEATURE_BTS);
if (!(l1 & (1<<12)))
- set_bit(X86_FEATURE_PEBS, c->x86_capability);
+ set_cpu_cap(c, X86_FEATURE_PEBS);
}
if (cpu_has_bts)
ds_init_intel(c);
}
-static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned int size)
+static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
{
- /* Intel PIII Tualatin. This comes in two flavours.
+ /*
+ * Intel PIII Tualatin. This comes in two flavours.
* One has 256kb of cache, the other 512. We have no way
* to determine which, so we use a boottime override
* for the 512kb model, and assume 256 otherwise.
@@ -240,42 +243,42 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 * c, unsigned
static struct cpu_dev intel_cpu_dev __cpuinitdata = {
.c_vendor = "Intel",
- .c_ident = { "GenuineIntel" },
+ .c_ident = { "GenuineIntel" },
.c_models = {
- { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
- {
- [0] = "486 DX-25/33",
- [1] = "486 DX-50",
- [2] = "486 SX",
- [3] = "486 DX/2",
- [4] = "486 SL",
- [5] = "486 SX/2",
- [7] = "486 DX/2-WB",
- [8] = "486 DX/4",
+ { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
+ {
+ [0] = "486 DX-25/33",
+ [1] = "486 DX-50",
+ [2] = "486 SX",
+ [3] = "486 DX/2",
+ [4] = "486 SL",
+ [5] = "486 SX/2",
+ [7] = "486 DX/2-WB",
+ [8] = "486 DX/4",
[9] = "486 DX/4-WB"
}
},
{ .vendor = X86_VENDOR_INTEL, .family = 5, .model_names =
- {
- [0] = "Pentium 60/66 A-step",
- [1] = "Pentium 60/66",
+ {
+ [0] = "Pentium 60/66 A-step",
+ [1] = "Pentium 60/66",
[2] = "Pentium 75 - 200",
- [3] = "OverDrive PODP5V83",
+ [3] = "OverDrive PODP5V83",
[4] = "Pentium MMX",
- [7] = "Mobile Pentium 75 - 200",
+ [7] = "Mobile Pentium 75 - 200",
[8] = "Mobile Pentium MMX"
}
},
{ .vendor = X86_VENDOR_INTEL, .family = 6, .model_names =
- {
+ {
[0] = "Pentium Pro A-step",
- [1] = "Pentium Pro",
- [3] = "Pentium II (Klamath)",
- [4] = "Pentium II (Deschutes)",
- [5] = "Pentium II (Deschutes)",
+ [1] = "Pentium Pro",
+ [3] = "Pentium II (Klamath)",
+ [4] = "Pentium II (Deschutes)",
+ [5] = "Pentium II (Deschutes)",
[6] = "Mobile Pentium II",
- [7] = "Pentium III (Katmai)",
- [8] = "Pentium III (Coppermine)",
+ [7] = "Pentium III (Katmai)",
+ [8] = "Pentium III (Coppermine)",
[10] = "Pentium III (Cascades)",
[11] = "Pentium III (Tualatin)",
}
@@ -290,15 +293,12 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = {
}
},
},
+ .c_early_init = early_init_intel,
.c_init = init_intel,
.c_size_cache = intel_size_cache,
};
-__init int intel_cpu_init(void)
-{
- cpu_devs[X86_VENDOR_INTEL] = &intel_cpu_dev;
- return 0;
-}
+cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
#ifndef CONFIG_X86_CMPXCHG
unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
@@ -364,5 +364,5 @@ unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
EXPORT_SYMBOL(cmpxchg_486_u64);
#endif
-// arch_initcall(intel_cpu_init);
+/* arch_initcall(intel_cpu_init); */
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 1b889860eb73..26d615dcb149 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -129,7 +129,7 @@ struct _cpuid4_info {
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
unsigned long size;
- cpumask_t shared_cpu_map;
+ cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */
};
unsigned short num_cache_leaves;
@@ -451,8 +451,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
}
/* pointer to _cpuid4_info array (for each cache leaf) */
-static struct _cpuid4_info *cpuid4_info[NR_CPUS];
-#define CPUID4_INFO_IDX(x,y) (&((cpuid4_info[x])[y]))
+static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
+#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y]))
#ifdef CONFIG_SMP
static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
@@ -474,7 +474,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
if (cpu_data(i).apicid >> index_msb ==
c->apicid >> index_msb) {
cpu_set(i, this_leaf->shared_cpu_map);
- if (i != cpu && cpuid4_info[i]) {
+ if (i != cpu && per_cpu(cpuid4_info, i)) {
sibling_leaf = CPUID4_INFO_IDX(i, index);
cpu_set(cpu, sibling_leaf->shared_cpu_map);
}
@@ -505,8 +505,8 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
for (i = 0; i < num_cache_leaves; i++)
cache_remove_shared_cpu_map(cpu, i);
- kfree(cpuid4_info[cpu]);
- cpuid4_info[cpu] = NULL;
+ kfree(per_cpu(cpuid4_info, cpu));
+ per_cpu(cpuid4_info, cpu) = NULL;
}
static int __cpuinit detect_cache_attributes(unsigned int cpu)
@@ -519,13 +519,13 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
if (num_cache_leaves == 0)
return -ENOENT;
- cpuid4_info[cpu] = kzalloc(
+ per_cpu(cpuid4_info, cpu) = kzalloc(
sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
- if (cpuid4_info[cpu] == NULL)
+ if (per_cpu(cpuid4_info, cpu) == NULL)
return -ENOMEM;
oldmask = current->cpus_allowed;
- retval = set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
if (retval)
goto out;
@@ -542,12 +542,12 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
}
cache_shared_cpu_map_setup(cpu, j);
}
- set_cpus_allowed(current, oldmask);
+ set_cpus_allowed_ptr(current, &oldmask);
out:
if (retval) {
- kfree(cpuid4_info[cpu]);
- cpuid4_info[cpu] = NULL;
+ kfree(per_cpu(cpuid4_info, cpu));
+ per_cpu(cpuid4_info, cpu) = NULL;
}
return retval;
@@ -561,7 +561,7 @@ out:
extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
/* pointer to kobject for cpuX/cache */
-static struct kobject * cache_kobject[NR_CPUS];
+static DEFINE_PER_CPU(struct kobject *, cache_kobject);
struct _index_kobject {
struct kobject kobj;
@@ -570,8 +570,8 @@ struct _index_kobject {
};
/* pointer to array of kobjects for cpuX/cache/indexY */
-static struct _index_kobject *index_kobject[NR_CPUS];
-#define INDEX_KOBJECT_PTR(x,y) (&((index_kobject[x])[y]))
+static DEFINE_PER_CPU(struct _index_kobject *, index_kobject);
+#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y]))
#define show_one_plus(file_name, object, val) \
static ssize_t show_##file_name \
@@ -591,11 +591,32 @@ static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf)
return sprintf (buf, "%luK\n", this_leaf->size / 1024);
}
-static ssize_t show_shared_cpu_map(struct _cpuid4_info *this_leaf, char *buf)
+static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
+ int type, char *buf)
{
- char mask_str[NR_CPUS];
- cpumask_scnprintf(mask_str, NR_CPUS, this_leaf->shared_cpu_map);
- return sprintf(buf, "%s\n", mask_str);
+ ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
+ int n = 0;
+
+ if (len > 1) {
+ cpumask_t *mask = &this_leaf->shared_cpu_map;
+
+ n = type?
+ cpulist_scnprintf(buf, len-2, *mask):
+ cpumask_scnprintf(buf, len-2, *mask);
+ buf[n++] = '\n';
+ buf[n] = '\0';
+ }
+ return n;
+}
+
+static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf)
+{
+ return show_shared_cpu_map_func(leaf, 0, buf);
+}
+
+static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf)
+{
+ return show_shared_cpu_map_func(leaf, 1, buf);
}
static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
@@ -633,6 +654,7 @@ define_one_ro(ways_of_associativity);
define_one_ro(number_of_sets);
define_one_ro(size);
define_one_ro(shared_cpu_map);
+define_one_ro(shared_cpu_list);
static struct attribute * default_attrs[] = {
&type.attr,
@@ -643,6 +665,7 @@ static struct attribute * default_attrs[] = {
&number_of_sets.attr,
&size.attr,
&shared_cpu_map.attr,
+ &shared_cpu_list.attr,
NULL
};
@@ -684,10 +707,10 @@ static struct kobj_type ktype_percpu_entry = {
static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu)
{
- kfree(cache_kobject[cpu]);
- kfree(index_kobject[cpu]);
- cache_kobject[cpu] = NULL;
- index_kobject[cpu] = NULL;
+ kfree(per_cpu(cache_kobject, cpu));
+ kfree(per_cpu(index_kobject, cpu));
+ per_cpu(cache_kobject, cpu) = NULL;
+ per_cpu(index_kobject, cpu) = NULL;
free_cache_attributes(cpu);
}
@@ -703,13 +726,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
return err;
/* Allocate all required memory */
- cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (unlikely(cache_kobject[cpu] == NULL))
+ per_cpu(cache_kobject, cpu) =
+ kzalloc(sizeof(struct kobject), GFP_KERNEL);
+ if (unlikely(per_cpu(cache_kobject, cpu) == NULL))
goto err_out;
- index_kobject[cpu] = kzalloc(
+ per_cpu(index_kobject, cpu) = kzalloc(
sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL);
- if (unlikely(index_kobject[cpu] == NULL))
+ if (unlikely(per_cpu(index_kobject, cpu) == NULL))
goto err_out;
return 0;
@@ -733,7 +757,8 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
if (unlikely(retval < 0))
return retval;
- retval = kobject_init_and_add(cache_kobject[cpu], &ktype_percpu_entry,
+ retval = kobject_init_and_add(per_cpu(cache_kobject, cpu),
+ &ktype_percpu_entry,
&sys_dev->kobj, "%s", "cache");
if (retval < 0) {
cpuid4_cache_sysfs_exit(cpu);
@@ -745,13 +770,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
this_object->cpu = cpu;
this_object->index = i;
retval = kobject_init_and_add(&(this_object->kobj),
- &ktype_cache, cache_kobject[cpu],
+ &ktype_cache,
+ per_cpu(cache_kobject, cpu),
"index%1lu", i);
if (unlikely(retval)) {
for (j = 0; j < i; j++) {
kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj));
}
- kobject_put(cache_kobject[cpu]);
+ kobject_put(per_cpu(cache_kobject, cpu));
cpuid4_cache_sysfs_exit(cpu);
break;
}
@@ -760,7 +786,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
if (!retval)
cpu_set(cpu, cache_dev_map);
- kobject_uevent(cache_kobject[cpu], KOBJ_ADD);
+ kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
return retval;
}
@@ -769,7 +795,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
unsigned int cpu = sys_dev->id;
unsigned long i;
- if (cpuid4_info[cpu] == NULL)
+ if (per_cpu(cpuid4_info, cpu) == NULL)
return;
if (!cpu_isset(cpu, cache_dev_map))
return;
@@ -777,7 +803,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
for (i = 0; i < num_cache_leaves; i++)
kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
- kobject_put(cache_kobject[cpu]);
+ kobject_put(per_cpu(cache_kobject, cpu));
cpuid4_cache_sysfs_exit(cpu);
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index a5182dcd94ae..774d87cfd8cd 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -10,20 +10,20 @@
#include <linux/smp.h>
#include <linux/thread_info.h>
-#include <asm/processor.h>
+#include <asm/processor.h>
#include <asm/system.h>
#include <asm/mce.h>
#include "mce.h"
-int mce_disabled = 0;
+int mce_disabled;
int nr_mce_banks;
EXPORT_SYMBOL_GPL(nr_mce_banks); /* non-fatal.o */
/* Handle unconfigured int18 (should never happen) */
-static void unexpected_machine_check(struct pt_regs * regs, long error_code)
-{
+static void unexpected_machine_check(struct pt_regs *regs, long error_code)
+{
printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n", smp_processor_id());
}
@@ -33,30 +33,30 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_mac
/* This has to be run for each processor */
void mcheck_init(struct cpuinfo_x86 *c)
{
- if (mce_disabled==1)
+ if (mce_disabled == 1)
return;
switch (c->x86_vendor) {
- case X86_VENDOR_AMD:
- amd_mcheck_init(c);
- break;
-
- case X86_VENDOR_INTEL:
- if (c->x86==5)
- intel_p5_mcheck_init(c);
- if (c->x86==6)
- intel_p6_mcheck_init(c);
- if (c->x86==15)
- intel_p4_mcheck_init(c);
- break;
-
- case X86_VENDOR_CENTAUR:
- if (c->x86==5)
- winchip_mcheck_init(c);
- break;
-
- default:
- break;
+ case X86_VENDOR_AMD:
+ amd_mcheck_init(c);
+ break;
+
+ case X86_VENDOR_INTEL:
+ if (c->x86 == 5)
+ intel_p5_mcheck_init(c);
+ if (c->x86 == 6)
+ intel_p6_mcheck_init(c);
+ if (c->x86 == 15)
+ intel_p4_mcheck_init(c);
+ break;
+
+ case X86_VENDOR_CENTAUR:
+ if (c->x86 == 5)
+ winchip_mcheck_init(c);
+ break;
+
+ default:
+ break;
}
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 9a699ed03598..e07e8c068ae0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -49,7 +49,7 @@ static int banks;
static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL };
static unsigned long notify_user;
static int rip_msr;
-static int mce_bootlog = 1;
+static int mce_bootlog = -1;
static atomic_t mce_events;
static char trigger[128];
@@ -471,13 +471,15 @@ static void mce_init(void *dummy)
static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
/* This should be disabled by the BIOS, but isn't always */
- if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
- /* disable GART TBL walk error reporting, which trips off
- incorrectly with the IOMMU & 3ware & Cerberus. */
- clear_bit(10, &bank[4]);
- /* Lots of broken BIOS around that don't clear them
- by default and leave crap in there. Don't log. */
- mce_bootlog = 0;
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ if(c->x86 == 15)
+ /* disable GART TBL walk error reporting, which trips off
+ incorrectly with the IOMMU & 3ware & Cerberus. */
+ clear_bit(10, &bank[4]);
+ if(c->x86 <= 17 && mce_bootlog < 0)
+ /* Lots of broken BIOS around that don't clear them
+ by default and leave crap in there. Don't log. */
+ mce_bootlog = 0;
}
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 32671da8184e..7c9a813e1193 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -251,18 +251,18 @@ struct threshold_attr {
ssize_t(*store) (struct threshold_block *, const char *, size_t count);
};
-static cpumask_t affinity_set(unsigned int cpu)
+static void affinity_set(unsigned int cpu, cpumask_t *oldmask,
+ cpumask_t *newmask)
{
- cpumask_t oldmask = current->cpus_allowed;
- cpumask_t newmask = CPU_MASK_NONE;
- cpu_set(cpu, newmask);
- set_cpus_allowed(current, newmask);
- return oldmask;
+ *oldmask = current->cpus_allowed;
+ cpus_clear(*newmask);
+ cpu_set(cpu, *newmask);
+ set_cpus_allowed_ptr(current, newmask);
}
-static void affinity_restore(cpumask_t oldmask)
+static void affinity_restore(const cpumask_t *oldmask)
{
- set_cpus_allowed(current, oldmask);
+ set_cpus_allowed_ptr(current, oldmask);
}
#define SHOW_FIELDS(name) \
@@ -277,15 +277,15 @@ static ssize_t store_interrupt_enable(struct threshold_block *b,
const char *buf, size_t count)
{
char *end;
- cpumask_t oldmask;
+ cpumask_t oldmask, newmask;
unsigned long new = simple_strtoul(buf, &end, 0);
if (end == buf)
return -EINVAL;
b->interrupt_enable = !!new;
- oldmask = affinity_set(b->cpu);
+ affinity_set(b->cpu, &oldmask, &newmask);
threshold_restart_bank(b, 0, 0);
- affinity_restore(oldmask);
+ affinity_restore(&oldmask);
return end - buf;
}
@@ -294,7 +294,7 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
const char *buf, size_t count)
{
char *end;
- cpumask_t oldmask;
+ cpumask_t oldmask, newmask;
u16 old;
unsigned long new = simple_strtoul(buf, &end, 0);
if (end == buf)
@@ -306,9 +306,9 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
old = b->threshold_limit;
b->threshold_limit = new;
- oldmask = affinity_set(b->cpu);
+ affinity_set(b->cpu, &oldmask, &newmask);
threshold_restart_bank(b, 0, old);
- affinity_restore(oldmask);
+ affinity_restore(&oldmask);
return end - buf;
}
@@ -316,10 +316,10 @@ static ssize_t store_threshold_limit(struct threshold_block *b,
static ssize_t show_error_count(struct threshold_block *b, char *buf)
{
u32 high, low;
- cpumask_t oldmask;
- oldmask = affinity_set(b->cpu);
+ cpumask_t oldmask, newmask;
+ affinity_set(b->cpu, &oldmask, &newmask);
rdmsr(b->address, low, high);
- affinity_restore(oldmask);
+ affinity_restore(&oldmask);
return sprintf(buf, "%x\n",
(high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit));
}
@@ -327,10 +327,10 @@ static ssize_t show_error_count(struct threshold_block *b, char *buf)
static ssize_t store_error_count(struct threshold_block *b,
const char *buf, size_t count)
{
- cpumask_t oldmask;
- oldmask = affinity_set(b->cpu);
+ cpumask_t oldmask, newmask;
+ affinity_set(b->cpu, &oldmask, &newmask);
threshold_restart_bank(b, 1, 0);
- affinity_restore(oldmask);
+ affinity_restore(&oldmask);
return 1;
}
@@ -468,7 +468,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
{
int i, err = 0;
struct threshold_bank *b = NULL;
- cpumask_t oldmask = CPU_MASK_NONE;
+ cpumask_t oldmask, newmask;
char name[32];
sprintf(name, "threshold_bank%i", bank);
@@ -519,10 +519,10 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
per_cpu(threshold_banks, cpu)[bank] = b;
- oldmask = affinity_set(cpu);
+ affinity_set(cpu, &oldmask, &newmask);
err = allocate_threshold_blocks(cpu, bank, 0,
MSR_IA32_MC0_MISC + bank * 4);
- affinity_restore(oldmask);
+ affinity_restore(&oldmask);
if (err)
goto out_free;
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index bf39409b3838..00ccb6c14ec2 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -16,7 +16,7 @@
#include <linux/smp.h>
#include <linux/module.h>
-#include <asm/processor.h>
+#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
@@ -26,23 +26,26 @@ static int firstbank;
#define MCE_RATE 15*HZ /* timer rate is 15s */
-static void mce_checkregs (void *info)
+static void mce_checkregs(void *info)
{
u32 low, high;
int i;
- for (i=firstbank; i<nr_mce_banks; i++) {
- rdmsr (MSR_IA32_MC0_STATUS+i*4, low, high);
+ for (i = firstbank; i < nr_mce_banks; i++) {
+ rdmsr(MSR_IA32_MC0_STATUS+i*4, low, high);
if (high & (1<<31)) {
printk(KERN_INFO "MCE: The hardware reports a non "
"fatal, correctable incident occurred on "
"CPU %d.\n",
smp_processor_id());
- printk (KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
+ printk(KERN_INFO "Bank %d: %08x%08x\n", i, high, low);
- /* Scrub the error so we don't pick it up in MCE_RATE seconds time. */
- wrmsr (MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
+ /*
+ * Scrub the error so we don't pick it up in MCE_RATE
+ * seconds time.
+ */
+ wrmsr(MSR_IA32_MC0_STATUS+i*4, 0UL, 0UL);
/* Serialize */
wmb();
@@ -55,10 +58,10 @@ static void mce_work_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(mce_work, mce_work_fn);
static void mce_work_fn(struct work_struct *work)
-{
+{
on_each_cpu(mce_checkregs, NULL, 1, 1);
schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE));
-}
+}
static int __init init_nonfatal_mce_checker(void)
{
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index a18310aaae0c..bfa5817afdda 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -9,20 +9,20 @@
#include <linux/interrupt.h>
#include <linux/smp.h>
-#include <asm/processor.h>
+#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine check handler for Pentium class Intel */
-static void pentium_machine_check(struct pt_regs * regs, long error_code)
+static void pentium_machine_check(struct pt_regs *regs, long error_code)
{
u32 loaddr, hi, lotype;
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
printk(KERN_EMERG "CPU#%d: Machine Check Exception: 0x%8X (type 0x%8X).\n", smp_processor_id(), loaddr, lotype);
- if(lotype&(1<<5))
+ if (lotype&(1<<5))
printk(KERN_EMERG "CPU#%d: Possible thermal failure (CPU on fire ?).\n", smp_processor_id());
add_taint(TAINT_MACHINE_CHECK);
}
@@ -31,13 +31,13 @@ static void pentium_machine_check(struct pt_regs * regs, long error_code)
void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
-
+
/*Check for MCE support */
- if( !cpu_has(c, X86_FEATURE_MCE) )
- return;
+ if (!cpu_has(c, X86_FEATURE_MCE))
+ return;
/* Default P5 to off as its often misconnected */
- if(mce_disabled != -1)
+ if (mce_disabled != -1)
return;
machine_check_vector = pentium_machine_check;
wmb();
@@ -47,7 +47,7 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c)
rdmsr(MSR_IA32_P5_MC_TYPE, l, h);
printk(KERN_INFO "Intel old style machine check architecture supported.\n");
- /* Enable MCE */
+ /* Enable MCE */
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id());
}
diff --git a/arch/x86/kernel/cpu/mcheck/p6.c b/arch/x86/kernel/cpu/mcheck/p6.c
index 74342604d30e..62efc9c2b3af 100644
--- a/arch/x86/kernel/cpu/mcheck/p6.c
+++ b/arch/x86/kernel/cpu/mcheck/p6.c
@@ -9,23 +9,23 @@
#include <linux/interrupt.h>
#include <linux/smp.h>
-#include <asm/processor.h>
+#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine Check Handler For PII/PIII */
-static void intel_machine_check(struct pt_regs * regs, long error_code)
+static void intel_machine_check(struct pt_regs *regs, long error_code)
{
- int recover=1;
+ int recover = 1;
u32 alow, ahigh, high, low;
u32 mcgstl, mcgsth;
int i;
- rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
+ rdmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
if (mcgstl & (1<<0)) /* Recoverable ? */
- recover=0;
+ recover = 0;
printk(KERN_EMERG "CPU %d: Machine Check Exception: %08x%08x\n",
smp_processor_id(), mcgsth, mcgstl);
@@ -55,30 +55,30 @@ static void intel_machine_check(struct pt_regs * regs, long error_code)
}
if (recover & 2)
- panic ("CPU context corrupt");
+ panic("CPU context corrupt");
if (recover & 1)
- panic ("Unable to continue");
+ panic("Unable to continue");
- printk (KERN_EMERG "Attempting to continue.\n");
- /*
- * Do not clear the MSR_IA32_MCi_STATUS if the error is not
+ printk(KERN_EMERG "Attempting to continue.\n");
+ /*
+ * Do not clear the MSR_IA32_MCi_STATUS if the error is not
* recoverable/continuable.This will allow BIOS to look at the MSRs
* for errors if the OS could not log the error.
*/
- for (i=0; i<nr_mce_banks; i++) {
+ for (i = 0; i < nr_mce_banks; i++) {
unsigned int msr;
msr = MSR_IA32_MC0_STATUS+i*4;
- rdmsr (msr,low, high);
+ rdmsr(msr, low, high);
if (high & (1<<31)) {
/* Clear it */
- wrmsr (msr, 0UL, 0UL);
+ wrmsr(msr, 0UL, 0UL);
/* Serialize */
wmb();
add_taint(TAINT_MACHINE_CHECK);
}
}
mcgstl &= ~(1<<2);
- wrmsr (MSR_IA32_MCG_STATUS,mcgstl, mcgsth);
+ wrmsr(MSR_IA32_MCG_STATUS, mcgstl, mcgsth);
}
/* Set up machine check reporting for processors with Intel style MCE */
@@ -86,21 +86,21 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
{
u32 l, h;
int i;
-
+
/* Check for MCE support */
if (!cpu_has(c, X86_FEATURE_MCE))
return;
/* Check for PPro style MCA */
- if (!cpu_has(c, X86_FEATURE_MCA))
+ if (!cpu_has(c, X86_FEATURE_MCA))
return;
/* Ok machine check is available */
machine_check_vector = intel_machine_check;
wmb();
- printk (KERN_INFO "Intel machine check architecture supported.\n");
- rdmsr (MSR_IA32_MCG_CAP, l, h);
+ printk(KERN_INFO "Intel machine check architecture supported.\n");
+ rdmsr(MSR_IA32_MCG_CAP, l, h);
if (l & (1<<8)) /* Control register present ? */
wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff);
nr_mce_banks = l & 0xff;
@@ -110,13 +110,13 @@ void intel_p6_mcheck_init(struct cpuinfo_x86 *c)
* - MC0_CTL should not be written
* - Status registers on all banks should be cleared on reset
*/
- for (i=1; i<nr_mce_banks; i++)
- wrmsr (MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
+ for (i = 1; i < nr_mce_banks; i++)
+ wrmsr(MSR_IA32_MC0_CTL+4*i, 0xffffffff, 0xffffffff);
- for (i=0; i<nr_mce_banks; i++)
- wrmsr (MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
+ for (i = 0; i < nr_mce_banks; i++)
+ wrmsr(MSR_IA32_MC0_STATUS+4*i, 0x0, 0x0);
- set_in_cr4 (X86_CR4_MCE);
- printk (KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
+ set_in_cr4(X86_CR4_MCE);
+ printk(KERN_INFO "Intel machine check reporting enabled on CPU#%d.\n",
smp_processor_id());
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 9b7e01daa1ca..1f4cc48c14c6 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -1,5 +1,4 @@
/*
- * linux/arch/i386/kernel/cpu/mcheck/therm_throt.c
*
* Thermal throttle event support code (such as syslog messaging and rate
* limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 3d428d5afc52..f2be3e190c6b 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -8,14 +8,14 @@
#include <linux/kernel.h>
#include <linux/interrupt.h>
-#include <asm/processor.h>
+#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>
#include "mce.h"
/* Machine check handler for WinChip C6 */
-static void winchip_machine_check(struct pt_regs * regs, long error_code)
+static void winchip_machine_check(struct pt_regs *regs, long error_code)
{
printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK);
@@ -28,8 +28,8 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c)
machine_check_vector = winchip_machine_check;
wmb();
rdmsr(MSR_IDT_FCR1, lo, hi);
- lo|= (1<<2); /* Enable EIERRINT (int 18 MCE) */
- lo&= ~(1<<4); /* Enable MCE */
+ lo |= (1<<2); /* Enable EIERRINT (int 18 MCE) */
+ lo &= ~(1<<4); /* Enable MCE */
wrmsr(MSR_IDT_FCR1, lo, hi);
set_in_cr4(X86_CR4_MCE);
printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n");
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 3e18db4cefee..353efe4f5017 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -11,6 +11,7 @@
#include <asm/cpufeature.h>
#include <asm/processor-flags.h>
#include <asm/tlbflush.h>
+#include <asm/pat.h>
#include "mtrr.h"
struct mtrr_state {
@@ -35,6 +36,8 @@ static struct fixed_range_block fixed_range_blocks[] = {
static unsigned long smp_changes_mask;
static struct mtrr_state mtrr_state = {};
+static int mtrr_state_set;
+static u64 tom2;
#undef MODULE_PARAM_PREFIX
#define MODULE_PARAM_PREFIX "mtrr."
@@ -42,6 +45,111 @@ static struct mtrr_state mtrr_state = {};
static int mtrr_show;
module_param_named(show, mtrr_show, bool, 0);
+/*
+ * Returns the effective MTRR type for the region
+ * Error returns:
+ * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
+ * - 0xFF - when MTRR is not enabled
+ */
+u8 mtrr_type_lookup(u64 start, u64 end)
+{
+ int i;
+ u64 base, mask;
+ u8 prev_match, curr_match;
+
+ if (!mtrr_state_set)
+ return 0xFF;
+
+ if (!mtrr_state.enabled)
+ return 0xFF;
+
+ /* Make end inclusive end, instead of exclusive */
+ end--;
+
+ /* Look in fixed ranges. Just return the type as per start */
+ if (mtrr_state.have_fixed && (start < 0x100000)) {
+ int idx;
+
+ if (start < 0x80000) {
+ idx = 0;
+ idx += (start >> 16);
+ return mtrr_state.fixed_ranges[idx];
+ } else if (start < 0xC0000) {
+ idx = 1 * 8;
+ idx += ((start - 0x80000) >> 14);
+ return mtrr_state.fixed_ranges[idx];
+ } else if (start < 0x1000000) {
+ idx = 3 * 8;
+ idx += ((start - 0xC0000) >> 12);
+ return mtrr_state.fixed_ranges[idx];
+ }
+ }
+
+ /*
+ * Look in variable ranges
+ * Look of multiple ranges matching this address and pick type
+ * as per MTRR precedence
+ */
+ if (!mtrr_state.enabled & 2) {
+ return mtrr_state.def_type;
+ }
+
+ prev_match = 0xFF;
+ for (i = 0; i < num_var_ranges; ++i) {
+ unsigned short start_state, end_state;
+
+ if (!(mtrr_state.var_ranges[i].mask_lo & (1 << 11)))
+ continue;
+
+ base = (((u64)mtrr_state.var_ranges[i].base_hi) << 32) +
+ (mtrr_state.var_ranges[i].base_lo & PAGE_MASK);
+ mask = (((u64)mtrr_state.var_ranges[i].mask_hi) << 32) +
+ (mtrr_state.var_ranges[i].mask_lo & PAGE_MASK);
+
+ start_state = ((start & mask) == (base & mask));
+ end_state = ((end & mask) == (base & mask));
+ if (start_state != end_state)
+ return 0xFE;
+
+ if ((start & mask) != (base & mask)) {
+ continue;
+ }
+
+ curr_match = mtrr_state.var_ranges[i].base_lo & 0xff;
+ if (prev_match == 0xFF) {
+ prev_match = curr_match;
+ continue;
+ }
+
+ if (prev_match == MTRR_TYPE_UNCACHABLE ||
+ curr_match == MTRR_TYPE_UNCACHABLE) {
+ return MTRR_TYPE_UNCACHABLE;
+ }
+
+ if ((prev_match == MTRR_TYPE_WRBACK &&
+ curr_match == MTRR_TYPE_WRTHROUGH) ||
+ (prev_match == MTRR_TYPE_WRTHROUGH &&
+ curr_match == MTRR_TYPE_WRBACK)) {
+ prev_match = MTRR_TYPE_WRTHROUGH;
+ curr_match = MTRR_TYPE_WRTHROUGH;
+ }
+
+ if (prev_match != curr_match) {
+ return MTRR_TYPE_UNCACHABLE;
+ }
+ }
+
+ if (tom2) {
+ if (start >= (1ULL<<32) && (end < tom2))
+ return MTRR_TYPE_WRBACK;
+ }
+
+ if (prev_match != 0xFF)
+ return prev_match;
+
+ return mtrr_state.def_type;
+}
+
/* Get the MSR pair relating to a var range */
static void
get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
@@ -79,12 +187,16 @@ static void print_fixed(unsigned base, unsigned step, const mtrr_type*types)
base, base + step - 1, mtrr_attrib_to_str(*types));
}
+static void prepare_set(void);
+static void post_set(void);
+
/* Grab all of the MTRR state for this CPU into *state */
void __init get_mtrr_state(void)
{
unsigned int i;
struct mtrr_var_range *vrs;
unsigned lo, dummy;
+ unsigned long flags;
vrs = mtrr_state.var_ranges;
@@ -100,6 +212,15 @@ void __init get_mtrr_state(void)
mtrr_state.def_type = (lo & 0xff);
mtrr_state.enabled = (lo & 0xc00) >> 10;
+ if (amd_special_default_mtrr()) {
+ unsigned lo, hi;
+ /* TOP_MEM2 */
+ rdmsr(MSR_K8_TOP_MEM2, lo, hi);
+ tom2 = hi;
+ tom2 <<= 32;
+ tom2 |= lo;
+ tom2 &= 0xffffff8000000ULL;
+ }
if (mtrr_show) {
int high_width;
@@ -130,7 +251,22 @@ void __init get_mtrr_state(void)
else
printk(KERN_INFO "MTRR %u disabled\n", i);
}
+ if (tom2) {
+ printk(KERN_INFO "TOM2: %016llx aka %lldM\n",
+ tom2, tom2>>20);
+ }
}
+ mtrr_state_set = 1;
+
+ /* PAT setup for BP. We need to go through sync steps here */
+ local_irq_save(flags);
+ prepare_set();
+
+ pat_init();
+
+ post_set();
+ local_irq_restore(flags);
+
}
/* Some BIOS's are fucked and don't set all MTRRs the same! */
@@ -397,6 +533,9 @@ static void generic_set_all(void)
/* Actually set the state */
mask = set_mtrr_state();
+ /* also set PAT */
+ pat_init();
+
post_set();
local_irq_restore(flags);
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 91e150acb46c..1960f1985e5e 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -424,11 +424,10 @@ static int __init mtrr_if_init(void)
return -ENODEV;
proc_root_mtrr =
- create_proc_entry("mtrr", S_IWUSR | S_IRUGO, &proc_root);
- if (proc_root_mtrr) {
+ proc_create("mtrr", S_IWUSR | S_IRUGO, &proc_root, &mtrr_fops);
+
+ if (proc_root_mtrr)
proc_root_mtrr->owner = THIS_MODULE;
- proc_root_mtrr->proc_fops = &mtrr_fops;
- }
return 0;
}
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index a6450b3ae759..6a1e278d9323 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -627,7 +627,7 @@ early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
#define Tom2Enabled (1U << 21)
#define Tom2ForceMemTypeWB (1U << 22)
-static __init int amd_special_default_mtrr(void)
+int __init amd_special_default_mtrr(void)
{
u32 l, h;
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 9f8ba923d1c9..7f7e2753685b 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -19,13 +19,15 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
if (use_intel() || is_cpu(CYRIX)) {
/* Save value of CR4 and clear Page Global Enable (bit 7) */
- if ( cpu_has_pge ) {
+ if (cpu_has_pge) {
ctxt->cr4val = read_cr4();
write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
}
- /* Disable and flush caches. Note that wbinvd flushes the TLBs as
- a side-effect */
+ /*
+ * Disable and flush caches. Note that wbinvd flushes the TLBs
+ * as a side-effect
+ */
cr0 = read_cr0() | X86_CR0_CD;
wbinvd();
write_cr0(cr0);
@@ -42,7 +44,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
{
- if (use_intel())
+ if (use_intel())
/* Disable MTRRs, and set the default type to uncached */
mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL,
ctxt->deftype_hi);
@@ -66,12 +68,12 @@ void set_mtrr_done(struct set_mtrr_context *ctxt)
else
/* Cyrix ARRs - everything else was excluded at the top */
setCx86(CX86_CCR3, ctxt->ccr3);
-
+
/* Enable caches */
write_cr0(read_cr0() & 0xbfffffff);
/* Restore value of CR4 */
- if ( cpu_has_pge )
+ if (cpu_has_pge)
write_cr4(ctxt->cr4val);
}
/* Re-enable interrupts locally (if enabled previously) */
diff --git a/arch/x86/kernel/cpu/nexgen.c b/arch/x86/kernel/cpu/nexgen.c
deleted file mode 100644
index 961fbe1a748f..000000000000
--- a/arch/x86/kernel/cpu/nexgen.c
+++ /dev/null
@@ -1,60 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <asm/processor.h>
-
-#include "cpu.h"
-
-/*
- * Detect a NexGen CPU running without BIOS hypercode new enough
- * to have CPUID. (Thanks to Herbert Oppmann)
- */
-
-static int __cpuinit deep_magic_nexgen_probe(void)
-{
- int ret;
-
- __asm__ __volatile__ (
- " movw $0x5555, %%ax\n"
- " xorw %%dx,%%dx\n"
- " movw $2, %%cx\n"
- " divw %%cx\n"
- " movl $0, %%eax\n"
- " jnz 1f\n"
- " movl $1, %%eax\n"
- "1:\n"
- : "=a" (ret) : : "cx", "dx" );
- return ret;
-}
-
-static void __cpuinit init_nexgen(struct cpuinfo_x86 * c)
-{
- c->x86_cache_size = 256; /* A few had 1 MB... */
-}
-
-static void __cpuinit nexgen_identify(struct cpuinfo_x86 * c)
-{
- /* Detect NexGen with old hypercode */
- if ( deep_magic_nexgen_probe() ) {
- strcpy(c->x86_vendor_id, "NexGenDriven");
- }
-}
-
-static struct cpu_dev nexgen_cpu_dev __cpuinitdata = {
- .c_vendor = "Nexgen",
- .c_ident = { "NexGenDriven" },
- .c_models = {
- { .vendor = X86_VENDOR_NEXGEN,
- .family = 5,
- .model_names = { [1] = "Nx586" }
- },
- },
- .c_init = init_nexgen,
- .c_identify = nexgen_identify,
-};
-
-int __init nexgen_init_cpu(void)
-{
- cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev;
- return 0;
-}
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index b943e10ad814..f9ae93adffe5 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -614,16 +614,6 @@ static struct wd_ops intel_arch_wd_ops __read_mostly = {
.evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
};
-static struct wd_ops coreduo_wd_ops = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_intel_arch_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
- .evntsel = MSR_ARCH_PERFMON_EVENTSEL0,
-};
-
static void probe_nmi_watchdog(void)
{
switch (boot_cpu_data.x86_vendor) {
@@ -637,8 +627,8 @@ static void probe_nmi_watchdog(void)
/* Work around Core Duo (Yonah) errata AE49 where perfctr1
doesn't have a working enable bit. */
if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) {
- wd_ops = &coreduo_wd_ops;
- break;
+ intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
+ intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
}
if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
wd_ops = &intel_arch_wd_ops;
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index af11d31dce0a..0d0d9057e7c0 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -1,85 +1,145 @@
#include <linux/smp.h>
#include <linux/timex.h>
#include <linux/string.h>
-#include <asm/semaphore.h>
#include <linux/seq_file.h>
#include <linux/cpufreq.h>
/*
* Get CPU information for use by the procfs.
*/
+#ifdef CONFIG_X86_32
+static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
+ unsigned int cpu)
+{
+#ifdef CONFIG_X86_HT
+ if (c->x86_max_cores * smp_num_siblings > 1) {
+ seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
+ seq_printf(m, "siblings\t: %d\n",
+ cpus_weight(per_cpu(cpu_core_map, cpu)));
+ seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
+ seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
+ seq_printf(m, "apicid\t\t: %d\n", c->apicid);
+ seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
+ }
+#endif
+}
+
+static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
+{
+ /*
+ * We use exception 16 if we have hardware math and we've either seen
+ * it or the CPU claims it is internal
+ */
+ int fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu);
+ seq_printf(m,
+ "fdiv_bug\t: %s\n"
+ "hlt_bug\t\t: %s\n"
+ "f00f_bug\t: %s\n"
+ "coma_bug\t: %s\n"
+ "fpu\t\t: %s\n"
+ "fpu_exception\t: %s\n"
+ "cpuid level\t: %d\n"
+ "wp\t\t: %s\n",
+ c->fdiv_bug ? "yes" : "no",
+ c->hlt_works_ok ? "no" : "yes",
+ c->f00f_bug ? "yes" : "no",
+ c->coma_bug ? "yes" : "no",
+ c->hard_math ? "yes" : "no",
+ fpu_exception ? "yes" : "no",
+ c->cpuid_level,
+ c->wp_works_ok ? "yes" : "no");
+}
+#else
+static void show_cpuinfo_core(struct seq_file *m, struct cpuinfo_x86 *c,
+ unsigned int cpu)
+{
+#ifdef CONFIG_SMP
+ if (c->x86_max_cores * smp_num_siblings > 1) {
+ seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
+ seq_printf(m, "siblings\t: %d\n",
+ cpus_weight(per_cpu(cpu_core_map, cpu)));
+ seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
+ seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
+ seq_printf(m, "apicid\t\t: %d\n", c->apicid);
+ seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid);
+ }
+#endif
+}
+
+static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
+{
+ seq_printf(m,
+ "fpu\t\t: yes\n"
+ "fpu_exception\t: yes\n"
+ "cpuid level\t: %d\n"
+ "wp\t\t: yes\n",
+ c->cpuid_level);
+}
+#endif
+
static int show_cpuinfo(struct seq_file *m, void *v)
{
struct cpuinfo_x86 *c = v;
- int i, n = 0;
- int fpu_exception;
+ unsigned int cpu = 0;
+ int i;
#ifdef CONFIG_SMP
- n = c->cpu_index;
+ cpu = c->cpu_index;
#endif
- seq_printf(m, "processor\t: %d\n"
- "vendor_id\t: %s\n"
- "cpu family\t: %d\n"
- "model\t\t: %d\n"
- "model name\t: %s\n",
- n,
- c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
- c->x86,
- c->x86_model,
- c->x86_model_id[0] ? c->x86_model_id : "unknown");
+ seq_printf(m, "processor\t: %u\n"
+ "vendor_id\t: %s\n"
+ "cpu family\t: %d\n"
+ "model\t\t: %u\n"
+ "model name\t: %s\n",
+ cpu,
+ c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
+ c->x86,
+ c->x86_model,
+ c->x86_model_id[0] ? c->x86_model_id : "unknown");
if (c->x86_mask || c->cpuid_level >= 0)
seq_printf(m, "stepping\t: %d\n", c->x86_mask);
else
seq_printf(m, "stepping\t: unknown\n");
- if ( cpu_has(c, X86_FEATURE_TSC) ) {
- unsigned int freq = cpufreq_quick_get(n);
+ if (cpu_has(c, X86_FEATURE_TSC)) {
+ unsigned int freq = cpufreq_quick_get(cpu);
+
if (!freq)
freq = cpu_khz;
seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
- freq / 1000, (freq % 1000));
+ freq / 1000, (freq % 1000));
}
/* Cache size */
if (c->x86_cache_size >= 0)
seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
-#ifdef CONFIG_X86_HT
- if (c->x86_max_cores * smp_num_siblings > 1) {
- seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
- seq_printf(m, "siblings\t: %d\n",
- cpus_weight(per_cpu(cpu_core_map, n)));
- seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
- seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
- }
-#endif
-
- /* We use exception 16 if we have hardware math and we've either seen it or the CPU claims it is internal */
- fpu_exception = c->hard_math && (ignore_fpu_irq || cpu_has_fpu);
- seq_printf(m, "fdiv_bug\t: %s\n"
- "hlt_bug\t\t: %s\n"
- "f00f_bug\t: %s\n"
- "coma_bug\t: %s\n"
- "fpu\t\t: %s\n"
- "fpu_exception\t: %s\n"
- "cpuid level\t: %d\n"
- "wp\t\t: %s\n"
- "flags\t\t:",
- c->fdiv_bug ? "yes" : "no",
- c->hlt_works_ok ? "no" : "yes",
- c->f00f_bug ? "yes" : "no",
- c->coma_bug ? "yes" : "no",
- c->hard_math ? "yes" : "no",
- fpu_exception ? "yes" : "no",
- c->cpuid_level,
- c->wp_works_ok ? "yes" : "no");
-
- for ( i = 0 ; i < 32*NCAPINTS ; i++ )
- if ( test_bit(i, c->x86_capability) &&
- x86_cap_flags[i] != NULL )
+
+ show_cpuinfo_core(m, c, cpu);
+ show_cpuinfo_misc(m, c);
+
+ seq_printf(m, "flags\t\t:");
+ for (i = 0; i < 32*NCAPINTS; i++)
+ if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
seq_printf(m, " %s", x86_cap_flags[i]);
- for (i = 0; i < 32; i++)
+ seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
+ c->loops_per_jiffy/(500000/HZ),
+ (c->loops_per_jiffy/(5000/HZ)) % 100);
+
+#ifdef CONFIG_X86_64
+ if (c->x86_tlbsize > 0)
+ seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
+#endif
+ seq_printf(m, "clflush size\t: %u\n", c->x86_clflush_size);
+#ifdef CONFIG_X86_64
+ seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
+ seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
+ c->x86_phys_bits, c->x86_virt_bits);
+#endif
+
+ seq_printf(m, "power management:");
+ for (i = 0; i < 32; i++) {
if (c->x86_power & (1 << i)) {
if (i < ARRAY_SIZE(x86_power_flags) &&
x86_power_flags[i])
@@ -89,11 +149,9 @@ static int show_cpuinfo(struct seq_file *m, void *v)
else
seq_printf(m, " [%d]", i);
}
+ }
- seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
- c->loops_per_jiffy/(500000/HZ),
- (c->loops_per_jiffy/(5000/HZ)) % 100);
- seq_printf(m, "clflush size\t: %u\n\n", c->x86_clflush_size);
+ seq_printf(m, "\n\n");
return 0;
}
@@ -106,14 +164,17 @@ static void *c_start(struct seq_file *m, loff_t *pos)
return &cpu_data(*pos);
return NULL;
}
+
static void *c_next(struct seq_file *m, void *v, loff_t *pos)
{
*pos = next_cpu(*pos, cpu_online_map);
return c_start(m, pos);
}
+
static void c_stop(struct seq_file *m, void *v)
{
}
+
const struct seq_operations cpuinfo_op = {
.start = c_start,
.next = c_next,
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index e8b422c1c512..b911a2c61b8f 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -18,8 +18,8 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
/* Print CMS and CPU revision */
max = cpuid_eax(0x80860000);
cpu_rev = 0;
- if ( max >= 0x80860001 ) {
- cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags);
+ if (max >= 0x80860001) {
+ cpuid(0x80860001, &dummy, &cpu_rev, &cpu_freq, &cpu_flags);
if (cpu_rev != 0x02000000) {
printk(KERN_INFO "CPU: Processor revision %u.%u.%u.%u, %u MHz\n",
(cpu_rev >> 24) & 0xff,
@@ -29,7 +29,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
cpu_freq);
}
}
- if ( max >= 0x80860002 ) {
+ if (max >= 0x80860002) {
cpuid(0x80860002, &new_cpu_rev, &cms_rev1, &cms_rev2, &dummy);
if (cpu_rev == 0x02000000) {
printk(KERN_INFO "CPU: Processor revision %08X, %u MHz\n",
@@ -42,7 +42,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
cms_rev1 & 0xff,
cms_rev2);
}
- if ( max >= 0x80860006 ) {
+ if (max >= 0x80860006) {
cpuid(0x80860003,
(void *)&cpu_info[0],
(void *)&cpu_info[4],
@@ -74,23 +74,25 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
wrmsr(0x80860004, cap_mask, uk);
/* All Transmeta CPUs have a constant TSC */
- set_bit(X86_FEATURE_CONSTANT_TSC, c->x86_capability);
-
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+
#ifdef CONFIG_SYSCTL
- /* randomize_va_space slows us down enormously;
- it probably triggers retranslation of x86->native bytecode */
+ /*
+ * randomize_va_space slows us down enormously;
+ * it probably triggers retranslation of x86->native bytecode
+ */
randomize_va_space = 0;
#endif
}
-static void __cpuinit transmeta_identify(struct cpuinfo_x86 * c)
+static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c)
{
u32 xlvl;
/* Transmeta-defined flags: level 0x80860001 */
xlvl = cpuid_eax(0x80860000);
- if ( (xlvl & 0xffff0000) == 0x80860000 ) {
- if ( xlvl >= 0x80860001 )
+ if ((xlvl & 0xffff0000) == 0x80860000) {
+ if (xlvl >= 0x80860001)
c->x86_capability[2] = cpuid_edx(0x80860001);
}
}
@@ -102,8 +104,4 @@ static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
.c_identify = transmeta_identify,
};
-int __init transmeta_init_cpu(void)
-{
- cpu_devs[X86_VENDOR_TRANSMETA] = &transmeta_cpu_dev;
- return 0;
-}
+cpu_vendor_dev_register(X86_VENDOR_TRANSMETA, &transmeta_cpu_dev);
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index a7a4e75bdcd7..b1fc90989d75 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -3,24 +3,23 @@
#include <asm/processor.h>
#include "cpu.h"
-/* UMC chips appear to be only either 386 or 486, so no special init takes place.
+/*
+ * UMC chips appear to be only either 386 or 486,
+ * so no special init takes place.
*/
static struct cpu_dev umc_cpu_dev __cpuinitdata = {
.c_vendor = "UMC",
- .c_ident = { "UMC UMC UMC" },
+ .c_ident = { "UMC UMC UMC" },
.c_models = {
{ .vendor = X86_VENDOR_UMC, .family = 4, .model_names =
- {
- [1] = "U5D",
- [2] = "U5S",
+ {
+ [1] = "U5D",
+ [2] = "U5S",
}
},
},
};
-int __init umc_init_cpu(void)
-{
- cpu_devs[X86_VENDOR_UMC] = &umc_cpu_dev;
- return 0;
-}
+cpu_vendor_dev_register(X86_VENDOR_UMC, &umc_cpu_dev);
+
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 288e7a6598ac..daff52a62248 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -154,12 +154,10 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb,
err = cpuid_device_create(cpu);
break;
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
cpuid_device_destroy(cpu);
break;
- case CPU_UP_CANCELED_FROZEN:
- destroy_suspended_device(cpuid_class, MKDEV(CPUID_MAJOR, cpu));
- break;
}
return err ? NOTIFY_BAD : NOTIFY_OK;
}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 9a5fa0abfcc7..268553817909 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -25,12 +25,9 @@
#include <asm/hpet.h>
#include <linux/kdebug.h>
#include <asm/smp.h>
+#include <asm/reboot.h>
-#ifdef CONFIG_X86_32
#include <mach_ipi.h>
-#else
-#include <asm/mach_apic.h>
-#endif
/* This keeps a track of which one is crashing cpu. */
static int crashing_cpu;
@@ -121,7 +118,7 @@ static void nmi_shootdown_cpus(void)
}
#endif
-void machine_crash_shutdown(struct pt_regs *regs)
+void native_machine_crash_shutdown(struct pt_regs *regs)
{
/* This function is only called after the system
* has panicked or is otherwise in a critical state.
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index dcd918c1580d..11c11b8ec48d 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -220,11 +220,11 @@ int ds_allocate(void **dsp, size_t bts_size_in_bytes)
int ds_free(void **dsp)
{
- if (*dsp)
+ if (*dsp) {
kfree((void *)get_bts_buffer_base(*dsp));
- kfree(*dsp);
- *dsp = NULL;
-
+ kfree(*dsp);
+ *dsp = NULL;
+ }
return 0;
}
diff --git a/arch/x86/kernel/e820_32.c b/arch/x86/kernel/e820_32.c
index 80444c5c9b14..ed733e7cf4e6 100644
--- a/arch/x86/kernel/e820_32.c
+++ b/arch/x86/kernel/e820_32.c
@@ -450,45 +450,32 @@ int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map)
* thinkpad 560x, for example, does not cooperate with the memory
* detection code.)
*/
-int __init copy_e820_map(struct e820entry * biosmap, int nr_map)
+int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
{
/* Only one memory region (or negative)? Ignore it */
if (nr_map < 2)
return -1;
do {
- unsigned long long start = biosmap->addr;
- unsigned long long size = biosmap->size;
- unsigned long long end = start + size;
- unsigned long type = biosmap->type;
+ u64 start = biosmap->addr;
+ u64 size = biosmap->size;
+ u64 end = start + size;
+ u32 type = biosmap->type;
/* Overflow in 64 bits? Ignore the memory map. */
if (start > end)
return -1;
- /*
- * Some BIOSes claim RAM in the 640k - 1M region.
- * Not right. Fix it up.
- */
- if (type == E820_RAM) {
- if (start < 0x100000ULL && end > 0xA0000ULL) {
- if (start < 0xA0000ULL)
- add_memory_region(start, 0xA0000ULL-start, type);
- if (end <= 0x100000ULL)
- continue;
- start = 0x100000ULL;
- size = end - start;
- }
- }
add_memory_region(start, size, type);
- } while (biosmap++,--nr_map);
+ } while (biosmap++, --nr_map);
+
return 0;
}
/*
* Find the highest page frame number we have available
*/
-void __init find_max_pfn(void)
+void __init propagate_e820_map(void)
{
int i;
@@ -717,7 +704,7 @@ static int __init parse_memmap(char *arg)
* size before original memory map is
* reset.
*/
- find_max_pfn();
+ propagate_e820_map();
saved_max_pfn = max_pfn;
#endif
e820.nr_map = 0;
diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c
index 9be697126013..645ee5e32a27 100644
--- a/arch/x86/kernel/e820_64.c
+++ b/arch/x86/kernel/e820_64.c
@@ -27,6 +27,7 @@
#include <asm/setup.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
+#include <asm/trampoline.h>
struct e820map e820;
@@ -36,11 +37,11 @@ struct e820map e820;
unsigned long end_pfn;
/*
- * end_pfn only includes RAM, while end_pfn_map includes all e820 entries.
- * The direct mapping extends to end_pfn_map, so that we can directly access
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
* apertures, ACPI and other tables without having to play with fixmaps.
*/
-unsigned long end_pfn_map;
+unsigned long max_pfn_mapped;
/*
* Last pfn which the user wants to use.
@@ -58,8 +59,8 @@ struct early_res {
};
static struct early_res early_res[MAX_EARLY_RES] __initdata = {
{ 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
-#ifdef CONFIG_SMP
- { SMP_TRAMPOLINE_BASE, SMP_TRAMPOLINE_BASE + 2*PAGE_SIZE, "SMP_TRAMPOLINE" },
+#ifdef CONFIG_X86_TRAMPOLINE
+ { TRAMPOLINE_BASE, TRAMPOLINE_BASE + 2 * PAGE_SIZE, "TRAMPOLINE" },
#endif
{}
};
@@ -83,19 +84,47 @@ void __init reserve_early(unsigned long start, unsigned long end, char *name)
strncpy(r->name, name, sizeof(r->name) - 1);
}
-void __init early_res_to_bootmem(void)
+void __init free_early(unsigned long start, unsigned long end)
+{
+ struct early_res *r;
+ int i, j;
+
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ r = &early_res[i];
+ if (start == r->start && end == r->end)
+ break;
+ }
+ if (i >= MAX_EARLY_RES || !early_res[i].end)
+ panic("free_early on not reserved area: %lx-%lx!", start, end);
+
+ for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
+ ;
+
+ memcpy(&early_res[i], &early_res[i + 1],
+ (j - 1 - i) * sizeof(struct early_res));
+
+ early_res[j - 1].end = 0;
+}
+
+void __init early_res_to_bootmem(unsigned long start, unsigned long end)
{
int i;
+ unsigned long final_start, final_end;
for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
struct early_res *r = &early_res[i];
- printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i,
- r->start, r->end - 1, r->name);
- reserve_bootmem_generic(r->start, r->end - r->start);
+ final_start = max(start, r->start);
+ final_end = min(end, r->end);
+ if (final_start >= final_end)
+ continue;
+ printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i,
+ final_start, final_end - 1, r->name);
+ reserve_bootmem_generic(final_start, final_end - final_start);
}
}
/* Check for already reserved areas */
-static inline int bad_addr(unsigned long *addrp, unsigned long size)
+static inline int __init
+bad_addr(unsigned long *addrp, unsigned long size, unsigned long align)
{
int i;
unsigned long addr = *addrp, last;
@@ -105,7 +134,7 @@ again:
for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
struct early_res *r = &early_res[i];
if (last >= r->start && addr < r->end) {
- *addrp = addr = r->end;
+ *addrp = addr = round_up(r->end, align);
changed = 1;
goto again;
}
@@ -113,6 +142,40 @@ again:
return changed;
}
+/* Check for already reserved areas */
+static inline int __init
+bad_addr_size(unsigned long *addrp, unsigned long *sizep, unsigned long align)
+{
+ int i;
+ unsigned long addr = *addrp, last;
+ unsigned long size = *sizep;
+ int changed = 0;
+again:
+ last = addr + size;
+ for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
+ struct early_res *r = &early_res[i];
+ if (last > r->start && addr < r->start) {
+ size = r->start - addr;
+ changed = 1;
+ goto again;
+ }
+ if (last > r->end && addr < r->end) {
+ addr = round_up(r->end, align);
+ size = last - addr;
+ changed = 1;
+ goto again;
+ }
+ if (last <= r->end && addr >= r->start) {
+ (*sizep)++;
+ return 0;
+ }
+ }
+ if (changed) {
+ *addrp = addr;
+ *sizep = size;
+ }
+ return changed;
+}
/*
* This function checks if any part of the range <start,end> is mapped
* with type.
@@ -174,26 +237,27 @@ int __init e820_all_mapped(unsigned long start, unsigned long end,
* Find a free area with specified alignment in a specific range.
*/
unsigned long __init find_e820_area(unsigned long start, unsigned long end,
- unsigned size, unsigned long align)
+ unsigned long size, unsigned long align)
{
int i;
- unsigned long mask = ~(align - 1);
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
- unsigned long addr = ei->addr, last;
+ unsigned long addr, last;
+ unsigned long ei_last;
if (ei->type != E820_RAM)
continue;
+ addr = round_up(ei->addr, align);
+ ei_last = ei->addr + ei->size;
if (addr < start)
- addr = start;
- if (addr > ei->addr + ei->size)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
continue;
- while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
+ while (bad_addr(&addr, size, align) && addr+size <= ei_last)
;
- addr = (addr + align - 1) & mask;
last = addr + size;
- if (last > ei->addr + ei->size)
+ if (last > ei_last)
continue;
if (last > end)
continue;
@@ -203,6 +267,40 @@ unsigned long __init find_e820_area(unsigned long start, unsigned long end,
}
/*
+ * Find next free range after *start
+ */
+unsigned long __init find_e820_area_size(unsigned long start,
+ unsigned long *sizep,
+ unsigned long align)
+{
+ int i;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ struct e820entry *ei = &e820.map[i];
+ unsigned long addr, last;
+ unsigned long ei_last;
+
+ if (ei->type != E820_RAM)
+ continue;
+ addr = round_up(ei->addr, align);
+ ei_last = ei->addr + ei->size;
+ if (addr < start)
+ addr = round_up(start, align);
+ if (addr >= ei_last)
+ continue;
+ *sizep = ei_last - addr;
+ while (bad_addr_size(&addr, sizep, align) &&
+ addr + *sizep <= ei_last)
+ ;
+ last = addr + *sizep;
+ if (last > ei_last)
+ continue;
+ return addr;
+ }
+ return -1UL;
+
+}
+/*
* Find the highest page frame number we have available
*/
unsigned long __init e820_end_of_ram(void)
@@ -211,29 +309,29 @@ unsigned long __init e820_end_of_ram(void)
end_pfn = find_max_pfn_with_active_regions();
- if (end_pfn > end_pfn_map)
- end_pfn_map = end_pfn;
- if (end_pfn_map > MAXMEM>>PAGE_SHIFT)
- end_pfn_map = MAXMEM>>PAGE_SHIFT;
+ if (end_pfn > max_pfn_mapped)
+ max_pfn_mapped = end_pfn;
+ if (max_pfn_mapped > MAXMEM>>PAGE_SHIFT)
+ max_pfn_mapped = MAXMEM>>PAGE_SHIFT;
if (end_pfn > end_user_pfn)
end_pfn = end_user_pfn;
- if (end_pfn > end_pfn_map)
- end_pfn = end_pfn_map;
+ if (end_pfn > max_pfn_mapped)
+ end_pfn = max_pfn_mapped;
- printk(KERN_INFO "end_pfn_map = %lu\n", end_pfn_map);
+ printk(KERN_INFO "max_pfn_mapped = %lu\n", max_pfn_mapped);
return end_pfn;
}
/*
* Mark e820 reserved areas as busy for the resource manager.
*/
-void __init e820_reserve_resources(struct resource *code_resource,
- struct resource *data_resource, struct resource *bss_resource)
+void __init e820_reserve_resources(void)
{
int i;
+ struct resource *res;
+
+ res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
for (i = 0; i < e820.nr_map; i++) {
- struct resource *res;
- res = alloc_bootmem_low(sizeof(struct resource));
switch (e820.map[i].type) {
case E820_RAM: res->name = "System RAM"; break;
case E820_ACPI: res->name = "ACPI Tables"; break;
@@ -243,21 +341,8 @@ void __init e820_reserve_resources(struct resource *code_resource,
res->start = e820.map[i].addr;
res->end = res->start + e820.map[i].size - 1;
res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- request_resource(&iomem_resource, res);
- if (e820.map[i].type == E820_RAM) {
- /*
- * We don't know which RAM region contains kernel data,
- * so we try it repeatedly and let the resource manager
- * test it.
- */
- request_resource(res, code_resource);
- request_resource(res, data_resource);
- request_resource(res, bss_resource);
-#ifdef CONFIG_KEXEC
- if (crashk_res.start != crashk_res.end)
- request_resource(res, &crashk_res);
-#endif
- }
+ insert_resource(&iomem_resource, res);
+ res++;
}
}
@@ -309,9 +394,9 @@ static int __init e820_find_active_region(const struct e820entry *ei,
if (*ei_startpfn >= *ei_endpfn)
return 0;
- /* Check if end_pfn_map should be updated */
- if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map)
- end_pfn_map = *ei_endpfn;
+ /* Check if max_pfn_mapped should be updated */
+ if (ei->type != E820_RAM && *ei_endpfn > max_pfn_mapped)
+ max_pfn_mapped = *ei_endpfn;
/* Skip if map is outside the node */
if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
@@ -634,10 +719,10 @@ static int __init copy_e820_map(struct e820entry *biosmap, int nr_map)
return -1;
do {
- unsigned long start = biosmap->addr;
- unsigned long size = biosmap->size;
- unsigned long end = start + size;
- unsigned long type = biosmap->type;
+ u64 start = biosmap->addr;
+ u64 size = biosmap->size;
+ u64 end = start + size;
+ u32 type = biosmap->type;
/* Overflow in 64 bits? Ignore the memory map. */
if (start > end)
@@ -702,7 +787,7 @@ static int __init parse_memmap_opt(char *p)
saved_max_pfn = e820_end_of_ram();
remove_all_active_ranges();
#endif
- end_pfn_map = 0;
+ max_pfn_mapped = 0;
e820.nr_map = 0;
userdef = 1;
return 0;
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index cff84cd9987f..643fd861b724 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -13,7 +13,7 @@
#define VGABASE (__ISA_IO_base + 0xb8000)
static int max_ypos = 25, max_xpos = 80;
-static int current_ypos = 25, current_xpos = 0;
+static int current_ypos = 25, current_xpos;
static void early_vga_write(struct console *con, const char *str, unsigned n)
{
@@ -108,12 +108,12 @@ static __init void early_serial_init(char *s)
if (*s) {
unsigned port;
- if (!strncmp(s,"0x",2)) {
+ if (!strncmp(s, "0x", 2)) {
early_serial_base = simple_strtoul(s, &e, 16);
} else {
static int bases[] = { 0x3f8, 0x2f8 };
- if (!strncmp(s,"ttyS",4))
+ if (!strncmp(s, "ttyS", 4))
s += 4;
port = simple_strtoul(s, &e, 10);
if (port > 1 || s == e)
@@ -194,7 +194,7 @@ static struct console simnow_console = {
/* Direct interface for emergencies */
static struct console *early_console = &early_vga_console;
-static int early_console_initialized = 0;
+static int early_console_initialized;
void early_printk(const char *fmt, ...)
{
@@ -202,9 +202,9 @@ void early_printk(const char *fmt, ...)
int n;
va_list ap;
- va_start(ap,fmt);
- n = vscnprintf(buf,512,fmt,ap);
- early_console->write(early_console,buf,n);
+ va_start(ap, fmt);
+ n = vscnprintf(buf, 512, fmt, ap);
+ early_console->write(early_console, buf, n);
va_end(ap);
}
@@ -229,15 +229,15 @@ static int __init setup_early_printk(char *buf)
early_serial_init(buf);
early_console = &early_serial_console;
} else if (!strncmp(buf, "vga", 3)
- && boot_params.screen_info.orig_video_isVGA == 1) {
+ && boot_params.screen_info.orig_video_isVGA == 1) {
max_xpos = boot_params.screen_info.orig_video_cols;
max_ypos = boot_params.screen_info.orig_video_lines;
current_ypos = boot_params.screen_info.orig_y;
early_console = &early_vga_console;
- } else if (!strncmp(buf, "simnow", 6)) {
- simnow_init(buf + 6);
- early_console = &simnow_console;
- keep_early = 1;
+ } else if (!strncmp(buf, "simnow", 6)) {
+ simnow_init(buf + 6);
+ early_console = &simnow_console;
+ keep_early = 1;
#ifdef CONFIG_HVC_XEN
} else if (!strncmp(buf, "xen", 3)) {
early_console = &xenboot_console;
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 759e02bec070..77d424cf68b3 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -383,6 +383,7 @@ static void __init runtime_code_page_mkexec(void)
{
efi_memory_desc_t *md;
void *p;
+ u64 addr, npages;
/* Make EFI runtime service code area executable */
for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
@@ -391,7 +392,10 @@ static void __init runtime_code_page_mkexec(void)
if (md->type != EFI_RUNTIME_SERVICES_CODE)
continue;
- set_memory_x(md->virt_addr, md->num_pages);
+ addr = md->virt_addr;
+ npages = md->num_pages;
+ memrange_efi_to_native(&addr, &npages);
+ set_memory_x(addr, npages);
}
}
@@ -408,7 +412,7 @@ void __init efi_enter_virtual_mode(void)
efi_memory_desc_t *md;
efi_status_t status;
unsigned long size;
- u64 end, systab;
+ u64 end, systab, addr, npages;
void *p, *va;
efi.systab = NULL;
@@ -420,7 +424,7 @@ void __init efi_enter_virtual_mode(void)
size = md->num_pages << EFI_PAGE_SHIFT;
end = md->phys_addr + size;
- if ((end >> PAGE_SHIFT) <= max_pfn_mapped)
+ if (PFN_UP(end) <= max_pfn_mapped)
va = __va(md->phys_addr);
else
va = efi_ioremap(md->phys_addr, size);
@@ -433,8 +437,12 @@ void __init efi_enter_virtual_mode(void)
continue;
}
- if (!(md->attribute & EFI_MEMORY_WB))
- set_memory_uc(md->virt_addr, md->num_pages);
+ if (!(md->attribute & EFI_MEMORY_WB)) {
+ addr = md->virt_addr;
+ npages = md->num_pages;
+ memrange_efi_to_native(&addr, &npages);
+ set_memory_uc(addr, npages);
+ }
systab = (u64) (unsigned long) efi_phys.systab;
if (md->phys_addr <= systab && systab < end) {
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c
index d143a1e76b30..d0060fdcccac 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/kernel/efi_64.c
@@ -105,14 +105,14 @@ void __init efi_reserve_bootmem(void)
void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
{
- static unsigned pages_mapped;
+ static unsigned pages_mapped __initdata;
unsigned i, pages;
+ unsigned long offset;
- /* phys_addr and size must be page aligned */
- if ((phys_addr & ~PAGE_MASK) || (size & ~PAGE_MASK))
- return NULL;
+ pages = PFN_UP(phys_addr + size) - PFN_DOWN(phys_addr);
+ offset = phys_addr & ~PAGE_MASK;
+ phys_addr &= PAGE_MASK;
- pages = size >> PAGE_SHIFT;
if (pages_mapped + pages > MAX_EFI_IO_PAGES)
return NULL;
@@ -124,5 +124,5 @@ void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size)
}
return (void __iomem *)__fix_to_virt(FIX_EFI_IO_MAP_FIRST_PAGE - \
- (pages_mapped - pages));
+ (pages_mapped - pages)) + offset;
}
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 4b87c32b639f..2a609dc3271c 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1,5 +1,4 @@
/*
- * linux/arch/i386/entry.S
*
* Copyright (C) 1991, 1992 Linus Torvalds
*/
@@ -51,6 +50,7 @@
#include <asm/desc.h>
#include <asm/percpu.h>
#include <asm/dwarf2.h>
+#include <asm/processor-flags.h>
#include "irq_vectors.h"
/*
@@ -68,13 +68,6 @@
#define nr_syscalls ((syscall_table_size)/4)
-CF_MASK = 0x00000001
-TF_MASK = 0x00000100
-IF_MASK = 0x00000200
-DF_MASK = 0x00000400
-NT_MASK = 0x00004000
-VM_MASK = 0x00020000
-
#ifdef CONFIG_PREEMPT
#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
@@ -84,7 +77,7 @@ VM_MASK = 0x00020000
.macro TRACE_IRQS_IRET
#ifdef CONFIG_TRACE_IRQFLAGS
- testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off?
+ testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off?
jz 1f
TRACE_IRQS_ON
1:
@@ -246,7 +239,7 @@ ret_from_intr:
check_userspace:
movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
movb PT_CS(%esp), %al
- andl $(VM_MASK | SEGMENT_RPL_MASK), %eax
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
cmpl $USER_RPL, %eax
jb resume_kernel # not returning to v8086 or userspace
@@ -271,7 +264,7 @@ need_resched:
movl TI_flags(%ebp), %ecx # need_resched set ?
testb $_TIF_NEED_RESCHED, %cl
jz restore_all
- testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ?
+ testl $X86_EFLAGS_IF,PT_EFLAGS(%esp) # interrupts off (exception path) ?
jz restore_all
call preempt_schedule_irq
jmp need_resched
@@ -291,10 +284,10 @@ ENTRY(ia32_sysenter_target)
movl TSS_sysenter_sp0(%esp),%esp
sysenter_past_esp:
/*
- * No need to follow this irqs on/off section: the syscall
- * disabled irqs and here we enable it straight after entry:
+ * Interrupts are disabled here, but we can't trace it until
+ * enough kernel state to call TRACE_IRQS_OFF can be called - but
+ * we immediately enable interrupts at that point anyway.
*/
- ENABLE_INTERRUPTS(CLBR_NONE)
pushl $(__USER_DS)
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET ss, 0*/
@@ -302,6 +295,7 @@ sysenter_past_esp:
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET esp, 0
pushfl
+ orl $X86_EFLAGS_IF, (%esp)
CFI_ADJUST_CFA_OFFSET 4
pushl $(__USER_CS)
CFI_ADJUST_CFA_OFFSET 4
@@ -315,6 +309,11 @@ sysenter_past_esp:
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET eip, 0
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ ENABLE_INTERRUPTS(CLBR_NONE)
+
/*
* Load the potential sixth argument from user stack.
* Careful about security.
@@ -322,14 +321,12 @@ sysenter_past_esp:
cmpl $__PAGE_OFFSET-3,%ebp
jae syscall_fault
1: movl (%ebp),%ebp
+ movl %ebp,PT_EBP(%esp)
.section __ex_table,"a"
.align 4
.long 1b,syscall_fault
.previous
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
GET_THREAD_INFO(%ebp)
/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
@@ -384,7 +381,7 @@ syscall_exit:
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
- testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
+ testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
jz no_singlestep
orl $_TIF_SINGLESTEP,TI_flags(%ebp)
no_singlestep:
@@ -399,7 +396,7 @@ restore_all:
# See comments in process.c:copy_thread() for details.
movb PT_OLDSS(%esp), %ah
movb PT_CS(%esp), %al
- andl $(VM_MASK | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
+ andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
CFI_REMEMBER_STATE
je ldt_ss # returning to user-space with LDT SS
@@ -412,7 +409,7 @@ restore_nocheck_notrace:
irq_return:
INTERRUPT_RETURN
.section .fixup,"ax"
-iret_exc:
+ENTRY(iret_exc)
pushl $0 # no error code
pushl $do_iret_error
jmp error_code
@@ -486,7 +483,7 @@ work_resched:
work_notifysig: # deal with pending signals and
# notify-resume requests
#ifdef CONFIG_VM86
- testl $VM_MASK, PT_EFLAGS(%esp)
+ testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
movl %esp, %eax
jne work_notifysig_v86 # returning to kernel-space or
# vm86-space
@@ -543,9 +540,6 @@ END(syscall_exit_work)
RING0_INT_FRAME # can't unwind into user space anyway
syscall_fault:
- pushl %eax # save orig_eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
GET_THREAD_INFO(%ebp)
movl $-EFAULT,PT_EAX(%esp)
jmp resume_userspace
@@ -1023,6 +1017,13 @@ ENTRY(kernel_thread_helper)
ENDPROC(kernel_thread_helper)
#ifdef CONFIG_XEN
+/* Xen doesn't set %esp to be precisely what the normal sysenter
+ entrypoint expects, so fix it up before using the normal path. */
+ENTRY(xen_sysenter_target)
+ RING0_INT_FRAME
+ addl $5*4, %esp /* remove xen-provided frame */
+ jmp sysenter_past_esp
+
ENTRY(xen_hypervisor_callback)
CFI_STARTPROC
pushl $0
@@ -1041,8 +1042,9 @@ ENTRY(xen_hypervisor_callback)
cmpl $xen_iret_end_crit,%eax
jae 1f
- call xen_iret_crit_fixup
+ jmp xen_iret_crit_fixup
+ENTRY(xen_do_upcall)
1: mov %esp, %eax
call xen_evtchn_do_upcall
jmp ret_from_intr
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c20c9e7e08dd..556a8df522a7 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -319,19 +319,17 @@ badsys:
/* Do syscall tracing */
tracesys:
SAVE_REST
- movq $-ENOSYS,RAX(%rsp)
+ movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
FIXUP_TOP_OF_STACK %rdi
movq %rsp,%rdi
call syscall_trace_enter
LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */
RESTORE_REST
cmpq $__NR_syscall_max,%rax
- movq $-ENOSYS,%rcx
- cmova %rcx,%rax
- ja 1f
+ ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
movq %r10,%rcx /* fixup for C */
call *sys_call_table(,%rax,8)
-1: movq %rax,RAX-ARGOFFSET(%rsp)
+ movq %rax,RAX-ARGOFFSET(%rsp)
/* Use IRET because user could have changed frame */
/*
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 4ae7b6440260..021624c83583 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -15,6 +15,7 @@
#include <linux/kernel.h>
#include <linux/ctype.h>
#include <linux/init.h>
+#include <linux/hardirq.h>
#include <asm/smp.h>
#include <asm/ipi.h>
@@ -24,20 +25,20 @@
#include <acpi/acpi_bus.h>
#endif
-/* which logical CPU number maps to which CPU (physical APIC ID) */
-u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
- = { [0 ... NR_CPUS-1] = BAD_APICID };
-void *x86_cpu_to_apicid_early_ptr;
-DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
-EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+DEFINE_PER_CPU(int, x2apic_extra_bits);
struct genapic __read_mostly *genapic = &apic_flat;
+static enum uv_system_type uv_system_type;
+
/*
* Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
*/
void __init setup_apic_routing(void)
{
+ if (uv_system_type == UV_NON_UNIQUE_APIC)
+ genapic = &apic_x2apic_uv_x;
+ else
#ifdef CONFIG_ACPI
/*
* Quirk: some x86_64 machines can only use physical APIC mode
@@ -50,7 +51,7 @@ void __init setup_apic_routing(void)
else
#endif
- if (cpus_weight(cpu_possible_map) <= 8)
+ if (num_possible_cpus() <= 8)
genapic = &apic_flat;
else
genapic = &apic_physflat;
@@ -64,3 +65,37 @@ void send_IPI_self(int vector)
{
__send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
}
+
+int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if (!strcmp(oem_id, "SGI")) {
+ if (!strcmp(oem_table_id, "UVL"))
+ uv_system_type = UV_LEGACY_APIC;
+ else if (!strcmp(oem_table_id, "UVX"))
+ uv_system_type = UV_X2APIC;
+ else if (!strcmp(oem_table_id, "UVH"))
+ uv_system_type = UV_NON_UNIQUE_APIC;
+ }
+ return 0;
+}
+
+unsigned int read_apic_id(void)
+{
+ unsigned int id;
+
+ WARN_ON(preemptible());
+ id = apic_read(APIC_ID);
+ if (uv_system_type >= UV_X2APIC)
+ id |= __get_cpu_var(x2apic_extra_bits);
+ return id;
+}
+
+enum uv_system_type get_uv_system_type(void)
+{
+ return uv_system_type;
+}
+
+int is_uv_system(void)
+{
+ return uv_system_type != UV_NONE;
+}
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 07352b74bda6..1a9c68845ee8 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -97,7 +97,7 @@ static void flat_send_IPI_all(int vector)
static int flat_apic_id_registered(void)
{
- return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
+ return physid_isset(GET_APIC_ID(read_apic_id()), phys_cpu_present_map);
}
static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
@@ -138,12 +138,9 @@ static cpumask_t physflat_target_cpus(void)
static cpumask_t physflat_vector_allocation_domain(int cpu)
{
- cpumask_t domain = CPU_MASK_NONE;
- cpu_set(cpu, domain);
- return domain;
+ return cpumask_of_cpu(cpu);
}
-
static void physflat_send_IPI_mask(cpumask_t cpumask, int vector)
{
send_IPI_mask_sequence(cpumask, vector);
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
new file mode 100644
index 000000000000..ebf13908a743
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -0,0 +1,250 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * SGI UV APIC functions (note: not an Intel compatible APIC)
+ *
+ * Copyright (C) 2007 Silicon Graphics, Inc. All rights reserved.
+ */
+
+#include <linux/threads.h>
+#include <linux/cpumask.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/bootmem.h>
+#include <linux/module.h>
+#include <asm/smp.h>
+#include <asm/ipi.h>
+#include <asm/genapic.h>
+#include <asm/uv/uv_mmrs.h>
+#include <asm/uv/uv_hub.h>
+
+DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
+EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
+
+struct uv_blade_info *uv_blade_info;
+EXPORT_SYMBOL_GPL(uv_blade_info);
+
+short *uv_node_to_blade;
+EXPORT_SYMBOL_GPL(uv_node_to_blade);
+
+short *uv_cpu_to_blade;
+EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
+
+short uv_possible_blades;
+EXPORT_SYMBOL_GPL(uv_possible_blades);
+
+/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
+
+static cpumask_t uv_target_cpus(void)
+{
+ return cpumask_of_cpu(0);
+}
+
+static cpumask_t uv_vector_allocation_domain(int cpu)
+{
+ cpumask_t domain = CPU_MASK_NONE;
+ cpu_set(cpu, domain);
+ return domain;
+}
+
+int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip)
+{
+ unsigned long val;
+ int nasid;
+
+ nasid = uv_apicid_to_nasid(phys_apicid);
+ val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+ (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+ (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
+ APIC_DM_INIT;
+ uv_write_global_mmr64(nasid, UVH_IPI_INT, val);
+ mdelay(10);
+
+ val = (1UL << UVH_IPI_INT_SEND_SHFT) |
+ (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
+ (((long)start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
+ APIC_DM_STARTUP;
+ uv_write_global_mmr64(nasid, UVH_IPI_INT, val);
+ return 0;
+}
+
+static void uv_send_IPI_one(int cpu, int vector)
+{
+ unsigned long val, apicid, lapicid;
+ int nasid;
+
+ apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */
+ lapicid = apicid & 0x3f; /* ZZZ macro needed */
+ nasid = uv_apicid_to_nasid(apicid);
+ val =
+ (1UL << UVH_IPI_INT_SEND_SHFT) | (lapicid <<
+ UVH_IPI_INT_APIC_ID_SHFT) |
+ (vector << UVH_IPI_INT_VECTOR_SHFT);
+ uv_write_global_mmr64(nasid, UVH_IPI_INT, val);
+}
+
+static void uv_send_IPI_mask(cpumask_t mask, int vector)
+{
+ unsigned int cpu;
+
+ for (cpu = 0; cpu < NR_CPUS; ++cpu)
+ if (cpu_isset(cpu, mask))
+ uv_send_IPI_one(cpu, vector);
+}
+
+static void uv_send_IPI_allbutself(int vector)
+{
+ cpumask_t mask = cpu_online_map;
+
+ cpu_clear(smp_processor_id(), mask);
+
+ if (!cpus_empty(mask))
+ uv_send_IPI_mask(mask, vector);
+}
+
+static void uv_send_IPI_all(int vector)
+{
+ uv_send_IPI_mask(cpu_online_map, vector);
+}
+
+static int uv_apic_id_registered(void)
+{
+ return 1;
+}
+
+static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
+{
+ int cpu;
+
+ /*
+ * We're using fixed IRQ delivery, can only return one phys APIC ID.
+ * May as well be the first.
+ */
+ cpu = first_cpu(cpumask);
+ if ((unsigned)cpu < NR_CPUS)
+ return per_cpu(x86_cpu_to_apicid, cpu);
+ else
+ return BAD_APICID;
+}
+
+static unsigned int phys_pkg_id(int index_msb)
+{
+ return GET_APIC_ID(read_apic_id()) >> index_msb;
+}
+
+#ifdef ZZZ /* Needs x2apic patch */
+static void uv_send_IPI_self(int vector)
+{
+ apic_write(APIC_SELF_IPI, vector);
+}
+#endif
+
+struct genapic apic_x2apic_uv_x = {
+ .name = "UV large system",
+ .int_delivery_mode = dest_Fixed,
+ .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
+ .target_cpus = uv_target_cpus,
+ .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */
+ .apic_id_registered = uv_apic_id_registered,
+ .send_IPI_all = uv_send_IPI_all,
+ .send_IPI_allbutself = uv_send_IPI_allbutself,
+ .send_IPI_mask = uv_send_IPI_mask,
+ /* ZZZ.send_IPI_self = uv_send_IPI_self, */
+ .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
+ .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */
+};
+
+static __cpuinit void set_x2apic_extra_bits(int nasid)
+{
+ __get_cpu_var(x2apic_extra_bits) = ((nasid >> 1) << 6);
+}
+
+/*
+ * Called on boot cpu.
+ */
+static __init void uv_system_init(void)
+{
+ union uvh_si_addr_map_config_u m_n_config;
+ int bytes, nid, cpu, lcpu, nasid, last_nasid, blade;
+ unsigned long mmr_base;
+
+ m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
+ mmr_base =
+ uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
+ ~UV_MMR_ENABLE;
+ printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
+
+ last_nasid = -1;
+ for_each_possible_cpu(cpu) {
+ nid = cpu_to_node(cpu);
+ nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu));
+ if (nasid != last_nasid)
+ uv_possible_blades++;
+ last_nasid = nasid;
+ }
+ printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
+
+ bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
+ uv_blade_info = alloc_bootmem_pages(bytes);
+
+ bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
+ uv_node_to_blade = alloc_bootmem_pages(bytes);
+ memset(uv_node_to_blade, 255, bytes);
+
+ bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
+ uv_cpu_to_blade = alloc_bootmem_pages(bytes);
+ memset(uv_cpu_to_blade, 255, bytes);
+
+ last_nasid = -1;
+ blade = -1;
+ lcpu = -1;
+ for_each_possible_cpu(cpu) {
+ nid = cpu_to_node(cpu);
+ nasid = uv_apicid_to_nasid(per_cpu(x86_cpu_to_apicid, cpu));
+ if (nasid != last_nasid) {
+ blade++;
+ lcpu = -1;
+ uv_blade_info[blade].nr_posible_cpus = 0;
+ uv_blade_info[blade].nr_online_cpus = 0;
+ }
+ last_nasid = nasid;
+ lcpu++;
+
+ uv_cpu_hub_info(cpu)->m_val = m_n_config.s.m_skt;
+ uv_cpu_hub_info(cpu)->n_val = m_n_config.s.n_skt;
+ uv_cpu_hub_info(cpu)->numa_blade_id = blade;
+ uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
+ uv_cpu_hub_info(cpu)->local_nasid = nasid;
+ uv_cpu_hub_info(cpu)->gnode_upper =
+ nasid & ~((1 << uv_hub_info->n_val) - 1);
+ uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
+ uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */
+ uv_blade_info[blade].nasid = nasid;
+ uv_blade_info[blade].nr_posible_cpus++;
+ uv_node_to_blade[nid] = blade;
+ uv_cpu_to_blade[cpu] = blade;
+
+ printk(KERN_DEBUG "UV cpu %d, apicid 0x%x, nasid %d, nid %d\n",
+ cpu, per_cpu(x86_cpu_to_apicid, cpu), nasid, nid);
+ printk(KERN_DEBUG "UV lcpu %d, blade %d\n", lcpu, blade);
+ }
+}
+
+/*
+ * Called on each cpu to initialize the per_cpu UV data area.
+ */
+void __cpuinit uv_cpu_init(void)
+{
+ if (!uv_node_to_blade)
+ uv_system_init();
+
+ uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
+
+ if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
+ set_x2apic_extra_bits(uv_hub_info->local_nasid);
+}
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
new file mode 100644
index 000000000000..3db059058927
--- /dev/null
+++ b/arch/x86/kernel/head32.c
@@ -0,0 +1,14 @@
+/*
+ * linux/arch/i386/kernel/head32.c -- prepare to run common code
+ *
+ * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright (C) 2007 Eric Biederman <ebiederm@xmission.com>
+ */
+
+#include <linux/init.h>
+#include <linux/start_kernel.h>
+
+void __init i386_start_kernel(void)
+{
+ start_kernel();
+}
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index ad2440832de0..e25c57b8aa84 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -11,6 +11,7 @@
#include <linux/string.h>
#include <linux/percpu.h>
#include <linux/start_kernel.h>
+#include <linux/io.h>
#include <asm/processor.h>
#include <asm/proto.h>
@@ -22,6 +23,7 @@
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/e820.h>
+#include <asm/bios_ebda.h>
static void __init zap_identity_mappings(void)
{
@@ -49,39 +51,91 @@ static void __init copy_bootdata(char *real_mode_data)
}
}
-#define EBDA_ADDR_POINTER 0x40E
+#define BIOS_LOWMEM_KILOBYTES 0x413
-static __init void reserve_ebda(void)
+/*
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too. This also contains a
+ * workaround for Dell systems that neglect to reserve EBDA.
+ * The same workaround also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.
+ */
+static void __init reserve_ebda_region(void)
{
- unsigned ebda_addr, ebda_size;
+ unsigned int lowmem, ebda_addr;
+
+ /* To determine the position of the EBDA and the */
+ /* end of conventional memory, we need to look at */
+ /* the BIOS data area. In a paravirtual environment */
+ /* that area is absent. We'll just have to assume */
+ /* that the paravirt case can handle memory setup */
+ /* correctly, without our help. */
+ if (paravirt_enabled())
+ return;
- /*
- * there is a real-mode segmented pointer pointing to the
- * 4K EBDA area at 0x40E
- */
- ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
- ebda_addr <<= 4;
+ /* end of low (conventional) memory */
+ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
+ lowmem <<= 10;
- if (!ebda_addr)
- return;
+ /* start of EBDA area */
+ ebda_addr = get_bios_ebda();
+
+ /* Fixup: bios puts an EBDA in the top 64K segment */
+ /* of conventional memory, but does not adjust lowmem. */
+ if ((lowmem - ebda_addr) <= 0x10000)
+ lowmem = ebda_addr;
- ebda_size = *(unsigned short *)__va(ebda_addr);
+ /* Fixup: bios does not report an EBDA at all. */
+ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
+ if ((ebda_addr == 0) && (lowmem >= 0x9f000))
+ lowmem = 0x9f000;
- /* Round EBDA up to pages */
- if (ebda_size == 0)
- ebda_size = 1;
- ebda_size <<= 10;
- ebda_size = round_up(ebda_size + (ebda_addr & ~PAGE_MASK), PAGE_SIZE);
- if (ebda_size > 64*1024)
- ebda_size = 64*1024;
+ /* Paranoia: should never happen, but... */
+ if ((lowmem == 0) || (lowmem >= 0x100000))
+ lowmem = 0x9f000;
- reserve_early(ebda_addr, ebda_addr + ebda_size, "EBDA");
+ /* reserve all memory between lowmem and the 1MB mark */
+ reserve_early(lowmem, 0x100000, "BIOS reserved");
+}
+
+static void __init reserve_setup_data(void)
+{
+ struct setup_data *data;
+ unsigned long pa_data;
+ char buf[32];
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, sizeof(*data));
+ sprintf(buf, "setup data %x", data->type);
+ reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
+ pa_data = data->next;
+ early_iounmap(data, sizeof(*data));
+ }
}
void __init x86_64_start_kernel(char * real_mode_data)
{
int i;
+ /*
+ * Build-time sanity checks on the kernel image and module
+ * area mappings. (these are purely build-time and produce no code)
+ */
+ BUILD_BUG_ON(MODULES_VADDR < KERNEL_IMAGE_START);
+ BUILD_BUG_ON(MODULES_VADDR-KERNEL_IMAGE_START < KERNEL_IMAGE_SIZE);
+ BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE);
+ BUILD_BUG_ON((KERNEL_IMAGE_START & ~PMD_MASK) != 0);
+ BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0);
+ BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
+ BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
+ (__START_KERNEL & PGDIR_MASK)));
+
/* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
@@ -91,7 +145,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
/* Cleanup the over mapped high alias */
cleanup_highmap();
- for (i = 0; i < IDT_ENTRIES; i++) {
+ for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
#ifdef CONFIG_EARLY_PRINTK
set_intr_gate(i, &early_idt_handlers[i]);
#else
@@ -110,6 +164,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
@@ -117,8 +172,10 @@ void __init x86_64_start_kernel(char * real_mode_data)
unsigned long ramdisk_end = ramdisk_image + ramdisk_size;
reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
}
+#endif
- reserve_ebda();
+ reserve_ebda_region();
+ reserve_setup_data();
/*
* At this point everything still needed from the boot loader
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 74d87ea85b5c..90f038af3adc 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -1,5 +1,4 @@
/*
- * linux/arch/i386/kernel/head.S -- the 32-bit startup code.
*
* Copyright (C) 1991, 1992 Linus Torvalds
*
@@ -450,7 +449,7 @@ is386: movl $2,%ecx # set MP
jmp initialize_secondary # all other CPUs call initialize_secondary
1:
#endif /* CONFIG_SMP */
- jmp start_kernel
+ jmp i386_start_kernel
/*
* We depend on ET to be correct. This checks for 287/387.
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index a007454133a3..10a1955bb1d1 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -132,10 +132,6 @@ ident_complete:
addq %rbp, trampoline_level4_pgt + 0(%rip)
addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
#endif
-#ifdef CONFIG_ACPI_SLEEP
- addq %rbp, wakeup_level4_pgt + 0(%rip)
- addq %rbp, wakeup_level4_pgt + (511*8)(%rip)
-#endif
/* Due to ENTRY(), sometimes the empty space gets filled with
* zeros. Better take a jmp than relying on empty space being
@@ -267,21 +263,16 @@ ENTRY(secondary_startup_64)
bad_address:
jmp bad_address
+ .section ".init.text","ax"
#ifdef CONFIG_EARLY_PRINTK
-.macro early_idt_tramp first, last
- .ifgt \last-\first
- early_idt_tramp \first, \last-1
- .endif
- movl $\last,%esi
- jmp early_idt_handler
-.endm
-
.globl early_idt_handlers
early_idt_handlers:
- early_idt_tramp 0, 63
- early_idt_tramp 64, 127
- early_idt_tramp 128, 191
- early_idt_tramp 192, 255
+ i = 0
+ .rept NUM_EXCEPTION_VECTORS
+ movl $i, %esi
+ jmp early_idt_handler
+ i = i + 1
+ .endr
#endif
ENTRY(early_idt_handler)
@@ -327,6 +318,7 @@ early_idt_msg:
early_idt_ripmsg:
.asciz "RIP %s\n"
#endif /* CONFIG_EARLY_PRINTK */
+ .previous
.balign PAGE_SIZE
@@ -383,12 +375,12 @@ NEXT_PAGE(level2_ident_pgt)
NEXT_PAGE(level2_kernel_pgt)
/*
- * 128 MB kernel mapping. We spend a full page on this pagetable
+ * 512 MB kernel mapping. We spend a full page on this pagetable
* anyway.
*
* The kernel code+data+bss must not be bigger than that.
*
- * (NOTE: at +128MB starts the module area, see MODULES_VADDR.
+ * (NOTE: at +512MB starts the module area, see MODULES_VADDR.
* If you want to increase this then increase MODULES_VADDR
* too.)
*/
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 36652ea1a265..9007f9ea64ee 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -218,7 +218,7 @@ static void hpet_legacy_clockevent_register(void)
hpet_freq = 1000000000000000ULL;
do_div(hpet_freq, hpet_period);
hpet_clockevent.mult = div_sc((unsigned long) hpet_freq,
- NSEC_PER_SEC, 32);
+ NSEC_PER_SEC, hpet_clockevent.shift);
/* Calculate the min / max delta */
hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
&hpet_clockevent);
diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c
index 061627806a2d..deb43785e923 100644
--- a/arch/x86/kernel/i386_ksyms_32.c
+++ b/arch/x86/kernel/i386_ksyms_32.c
@@ -1,13 +1,8 @@
#include <linux/module.h>
-#include <asm/semaphore.h>
#include <asm/checksum.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
-EXPORT_SYMBOL(__down_failed);
-EXPORT_SYMBOL(__down_failed_interruptible);
-EXPORT_SYMBOL(__down_failed_trylock);
-EXPORT_SYMBOL(__up_wakeup);
/* Networking helper routines. */
EXPORT_SYMBOL(csum_partial_copy_generic);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d2e39e69aaf8..db6839b53195 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -5,51 +5,48 @@
* General FPU state handling cleanups
* Gareth Hughes <gareth@valinux.com>, May 2000
*/
-
-#include <linux/sched.h>
#include <linux/module.h>
#include <linux/regset.h>
+#include <linux/sched.h>
+
+#include <asm/sigcontext.h>
#include <asm/processor.h>
-#include <asm/i387.h>
#include <asm/math_emu.h>
-#include <asm/sigcontext.h>
-#include <asm/user.h>
-#include <asm/ptrace.h>
#include <asm/uaccess.h>
+#include <asm/ptrace.h>
+#include <asm/i387.h>
+#include <asm/user.h>
#ifdef CONFIG_X86_64
-
-#include <asm/sigcontext32.h>
-#include <asm/user32.h>
-
+# include <asm/sigcontext32.h>
+# include <asm/user32.h>
#else
-
-#define save_i387_ia32 save_i387
-#define restore_i387_ia32 restore_i387
-
-#define _fpstate_ia32 _fpstate
-#define user_i387_ia32_struct user_i387_struct
-#define user32_fxsr_struct user_fxsr_struct
-
+# define save_i387_ia32 save_i387
+# define restore_i387_ia32 restore_i387
+# define _fpstate_ia32 _fpstate
+# define user_i387_ia32_struct user_i387_struct
+# define user32_fxsr_struct user_fxsr_struct
#endif
#ifdef CONFIG_MATH_EMULATION
-#define HAVE_HWFP (boot_cpu_data.hard_math)
+# define HAVE_HWFP (boot_cpu_data.hard_math)
#else
-#define HAVE_HWFP 1
+# define HAVE_HWFP 1
#endif
-static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
+static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
+unsigned int xstate_size;
+static struct i387_fxsave_struct fx_scratch __cpuinitdata;
-void mxcsr_feature_mask_init(void)
+void __cpuinit mxcsr_feature_mask_init(void)
{
unsigned long mask = 0;
+
clts();
if (cpu_has_fxsr) {
- memset(&current->thread.i387.fxsave, 0,
- sizeof(struct i387_fxsave_struct));
- asm volatile("fxsave %0" : : "m" (current->thread.i387.fxsave));
- mask = current->thread.i387.fxsave.mxcsr_mask;
+ memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct));
+ asm volatile("fxsave %0" : : "m" (fx_scratch));
+ mask = fx_scratch.mxcsr_mask;
if (mask == 0)
mask = 0x0000ffbf;
}
@@ -57,6 +54,16 @@ void mxcsr_feature_mask_init(void)
stts();
}
+void __init init_thread_xstate(void)
+{
+ if (cpu_has_fxsr)
+ xstate_size = sizeof(struct i387_fxsave_struct);
+#ifdef CONFIG_X86_32
+ else
+ xstate_size = sizeof(struct i387_fsave_struct);
+#endif
+}
+
#ifdef CONFIG_X86_64
/*
* Called at bootup to set up the initial FPU state that is later cloned
@@ -65,14 +72,11 @@ void mxcsr_feature_mask_init(void)
void __cpuinit fpu_init(void)
{
unsigned long oldcr0 = read_cr0();
- extern void __bad_fxsave_alignment(void);
- if (offsetof(struct task_struct, thread.i387.fxsave) & 15)
- __bad_fxsave_alignment();
set_in_cr4(X86_CR4_OSFXSR);
set_in_cr4(X86_CR4_OSXMMEXCPT);
- write_cr0(oldcr0 & ~((1UL<<3)|(1UL<<2))); /* clear TS and EM */
+ write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
mxcsr_feature_mask_init();
/* clean state in init */
@@ -87,32 +91,44 @@ void __cpuinit fpu_init(void)
* value at reset if we support XMM instructions and then
* remeber the current task has used the FPU.
*/
-void init_fpu(struct task_struct *tsk)
+int init_fpu(struct task_struct *tsk)
{
if (tsk_used_math(tsk)) {
if (tsk == current)
unlazy_fpu(tsk);
- return;
+ return 0;
+ }
+
+ /*
+ * Memory allocation at the first usage of the FPU and other state.
+ */
+ if (!tsk->thread.xstate) {
+ tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
+ GFP_KERNEL);
+ if (!tsk->thread.xstate)
+ return -ENOMEM;
}
if (cpu_has_fxsr) {
- memset(&tsk->thread.i387.fxsave, 0,
- sizeof(struct i387_fxsave_struct));
- tsk->thread.i387.fxsave.cwd = 0x37f;
+ struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+
+ memset(fx, 0, xstate_size);
+ fx->cwd = 0x37f;
if (cpu_has_xmm)
- tsk->thread.i387.fxsave.mxcsr = MXCSR_DEFAULT;
+ fx->mxcsr = MXCSR_DEFAULT;
} else {
- memset(&tsk->thread.i387.fsave, 0,
- sizeof(struct i387_fsave_struct));
- tsk->thread.i387.fsave.cwd = 0xffff037fu;
- tsk->thread.i387.fsave.swd = 0xffff0000u;
- tsk->thread.i387.fsave.twd = 0xffffffffu;
- tsk->thread.i387.fsave.fos = 0xffff0000u;
+ struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
+ memset(fp, 0, xstate_size);
+ fp->cwd = 0xffff037fu;
+ fp->swd = 0xffff0000u;
+ fp->twd = 0xffffffffu;
+ fp->fos = 0xffff0000u;
}
/*
* Only the device not available exception or ptrace can call init_fpu.
*/
set_stopped_child_used_math(tsk);
+ return 0;
}
int fpregs_active(struct task_struct *target, const struct user_regset *regset)
@@ -129,13 +145,17 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf)
{
+ int ret;
+
if (!cpu_has_fxsr)
return -ENODEV;
- init_fpu(target);
+ ret = init_fpu(target);
+ if (ret)
+ return ret;
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.i387.fxsave, 0, -1);
+ &target->thread.xstate->fxsave, 0, -1);
}
int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -147,16 +167,19 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
if (!cpu_has_fxsr)
return -ENODEV;
- init_fpu(target);
+ ret = init_fpu(target);
+ if (ret)
+ return ret;
+
set_stopped_child_used_math(target);
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.i387.fxsave, 0, -1);
+ &target->thread.xstate->fxsave, 0, -1);
/*
* mxcsr reserved bits must be masked to zero for security reasons.
*/
- target->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
+ target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
return ret;
}
@@ -178,6 +201,7 @@ static inline unsigned short twd_i387_to_fxsr(unsigned short twd)
tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */
tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */
tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */
+
return tmp;
}
@@ -232,10 +256,10 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
* FXSR floating point environment conversions.
*/
-static void convert_from_fxsr(struct user_i387_ia32_struct *env,
- struct task_struct *tsk)
+static void
+convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
{
- struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave;
+ struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
int i;
@@ -252,10 +276,11 @@ static void convert_from_fxsr(struct user_i387_ia32_struct *env,
* should be actually ds/cs at fpu exception time, but
* that information is not available in 64bit mode.
*/
- asm("mov %%ds,%0" : "=r" (env->fos));
- asm("mov %%cs,%0" : "=r" (env->fcs));
+ asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos));
+ asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs));
} else {
struct pt_regs *regs = task_pt_regs(tsk);
+
env->fos = 0xffff0000 | tsk->thread.ds;
env->fcs = regs->cs;
}
@@ -274,7 +299,7 @@ static void convert_to_fxsr(struct task_struct *tsk,
const struct user_i387_ia32_struct *env)
{
- struct i387_fxsave_struct *fxsave = &tsk->thread.i387.fxsave;
+ struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
int i;
@@ -303,15 +328,20 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
void *kbuf, void __user *ubuf)
{
struct user_i387_ia32_struct env;
+ int ret;
if (!HAVE_HWFP)
return fpregs_soft_get(target, regset, pos, count, kbuf, ubuf);
- init_fpu(target);
+ ret = init_fpu(target);
+ if (ret)
+ return ret;
- if (!cpu_has_fxsr)
+ if (!cpu_has_fxsr) {
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.i387.fsave, 0, -1);
+ &target->thread.xstate->fsave, 0,
+ -1);
+ }
if (kbuf && pos == 0 && count == sizeof(env)) {
convert_from_fxsr(kbuf, target);
@@ -319,6 +349,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
}
convert_from_fxsr(&env, target);
+
return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &env, 0, -1);
}
@@ -332,12 +363,16 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
if (!HAVE_HWFP)
return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
- init_fpu(target);
+ ret = init_fpu(target);
+ if (ret)
+ return ret;
+
set_stopped_child_used_math(target);
- if (!cpu_has_fxsr)
+ if (!cpu_has_fxsr) {
return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.i387.fsave, 0, -1);
+ &target->thread.xstate->fsave, 0, -1);
+ }
if (pos > 0 || count < sizeof(env))
convert_from_fxsr(&env, target);
@@ -356,11 +391,11 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
{
struct task_struct *tsk = current;
+ struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
unlazy_fpu(tsk);
- tsk->thread.i387.fsave.status = tsk->thread.i387.fsave.swd;
- if (__copy_to_user(buf, &tsk->thread.i387.fsave,
- sizeof(struct i387_fsave_struct)))
+ fp->status = fp->swd;
+ if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
return -1;
return 1;
}
@@ -368,6 +403,7 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
{
struct task_struct *tsk = current;
+ struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
struct user_i387_ia32_struct env;
int err = 0;
@@ -377,12 +413,12 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
if (__copy_to_user(buf, &env, sizeof(env)))
return -1;
- err |= __put_user(tsk->thread.i387.fxsave.swd, &buf->status);
+ err |= __put_user(fx->swd, &buf->status);
err |= __put_user(X86_FXSR_MAGIC, &buf->magic);
if (err)
return -1;
- if (__copy_to_user(&buf->_fxsr_env[0], &tsk->thread.i387.fxsave,
+ if (__copy_to_user(&buf->_fxsr_env[0], fx,
sizeof(struct i387_fxsave_struct)))
return -1;
return 1;
@@ -392,46 +428,48 @@ int save_i387_ia32(struct _fpstate_ia32 __user *buf)
{
if (!used_math())
return 0;
-
- /* This will cause a "finit" to be triggered by the next
+ /*
+ * This will cause a "finit" to be triggered by the next
* attempted FPU operation by the 'current' process.
*/
clear_used_math();
- if (HAVE_HWFP) {
- if (cpu_has_fxsr) {
- return save_i387_fxsave(buf);
- } else {
- return save_i387_fsave(buf);
- }
- } else {
+ if (!HAVE_HWFP) {
return fpregs_soft_get(current, NULL,
0, sizeof(struct user_i387_ia32_struct),
NULL, buf) ? -1 : 1;
}
+
+ if (cpu_has_fxsr)
+ return save_i387_fxsave(buf);
+ else
+ return save_i387_fsave(buf);
}
static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
{
struct task_struct *tsk = current;
+
clear_fpu(tsk);
- return __copy_from_user(&tsk->thread.i387.fsave, buf,
+ return __copy_from_user(&tsk->thread.xstate->fsave, buf,
sizeof(struct i387_fsave_struct));
}
static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
{
- int err;
struct task_struct *tsk = current;
struct user_i387_ia32_struct env;
+ int err;
+
clear_fpu(tsk);
- err = __copy_from_user(&tsk->thread.i387.fxsave, &buf->_fxsr_env[0],
+ err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
sizeof(struct i387_fxsave_struct));
/* mxcsr reserved bits must be masked to zero for security reasons */
- tsk->thread.i387.fxsave.mxcsr &= mxcsr_feature_mask;
+ tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
if (err || __copy_from_user(&env, buf, sizeof(env)))
return 1;
convert_to_fxsr(tsk, &env);
+
return 0;
}
@@ -440,17 +478,17 @@ int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
int err;
if (HAVE_HWFP) {
- if (cpu_has_fxsr) {
+ if (cpu_has_fxsr)
err = restore_i387_fxsave(buf);
- } else {
+ else
err = restore_i387_fsave(buf);
- }
} else {
err = fpregs_soft_set(current, NULL,
0, sizeof(struct user_i387_ia32_struct),
NULL, buf) != 0;
}
set_used_math();
+
return err;
}
@@ -463,8 +501,8 @@ int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
*/
int dump_fpu(struct pt_regs *regs, struct user_i387_struct *fpu)
{
- int fpvalid;
struct task_struct *tsk = current;
+ int fpvalid;
fpvalid = !!used_math();
if (fpvalid)
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 8540abe86ade..c1b5e3ece1f2 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -115,7 +115,8 @@ void __init setup_pit_timer(void)
* IO_APIC has been initialized.
*/
pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
- pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32);
+ pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC,
+ pit_clockevent.shift);
pit_clockevent.max_delta_ns =
clockevent_delta2ns(0x7FFF, &pit_clockevent);
pit_clockevent.min_delta_ns =
@@ -224,7 +225,8 @@ static int __init init_pit_clocksource(void)
pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
return 0;
- clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
+ clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE,
+ clocksource_pit.shift);
return clocksource_register(&clocksource_pit);
}
arch_initcall(init_pit_clocksource);
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
index 4ca548632c8d..a40d54fc1fdd 100644
--- a/arch/x86/kernel/io_apic_32.c
+++ b/arch/x86/kernel/io_apic_32.c
@@ -71,6 +71,16 @@ int sis_apic_bug = -1;
*/
int nr_ioapic_registers[MAX_IO_APICS];
+/* I/O APIC entries */
+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+int nr_ioapics;
+
+/* MP IRQ source entries */
+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* # of MP IRQ source entries */
+int mp_irq_entries;
+
static int disable_timer_pin_1 __initdata;
/*
@@ -810,10 +820,7 @@ static int __init find_isa_irq_pin(int irq, int type)
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].mpc_srcbus;
- if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_MCA
- ) &&
+ if (test_bit(lbus, mp_bus_not_pci) &&
(mp_irqs[i].mpc_irqtype == type) &&
(mp_irqs[i].mpc_srcbusirq == irq))
@@ -829,10 +836,7 @@ static int __init find_isa_irq_apic(int irq, int type)
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].mpc_srcbus;
- if ((mp_bus_id_to_type[lbus] == MP_BUS_ISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_EISA ||
- mp_bus_id_to_type[lbus] == MP_BUS_MCA
- ) &&
+ if (test_bit(lbus, mp_bus_not_pci) &&
(mp_irqs[i].mpc_irqtype == type) &&
(mp_irqs[i].mpc_srcbusirq == irq))
break;
@@ -872,7 +876,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
mp_irqs[i].mpc_dstapic == MP_APIC_ALL)
break;
- if ((mp_bus_id_to_type[lbus] == MP_BUS_PCI) &&
+ if (!test_bit(lbus, mp_bus_not_pci) &&
!mp_irqs[i].mpc_irqtype &&
(bus == lbus) &&
(slot == ((mp_irqs[i].mpc_srcbusirq >> 2) & 0x1f))) {
@@ -921,6 +925,7 @@ void __init setup_ioapic_dest(void)
}
#endif
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
/*
* EISA Edge/Level control register, ELCR
*/
@@ -934,6 +939,13 @@ static int EISA_ELCR(unsigned int irq)
"Broken MPtable reports ISA irq %d\n", irq);
return 0;
}
+#endif
+
+/* ISA interrupts are always polarity zero edge triggered,
+ * when listed as conforming in the MP table. */
+
+#define default_ISA_trigger(idx) (0)
+#define default_ISA_polarity(idx) (0)
/* EISA interrupts are always polarity zero and can be edge or level
* trigger depending on the ELCR value. If an interrupt is listed as
@@ -941,13 +953,7 @@ static int EISA_ELCR(unsigned int irq)
* be read in from the ELCR */
#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mpc_srcbusirq))
-#define default_EISA_polarity(idx) (0)
-
-/* ISA interrupts are always polarity zero edge triggered,
- * when listed as conforming in the MP table. */
-
-#define default_ISA_trigger(idx) (0)
-#define default_ISA_polarity(idx) (0)
+#define default_EISA_polarity(idx) default_ISA_polarity(idx)
/* PCI interrupts are always polarity one level triggered,
* when listed as conforming in the MP table. */
@@ -959,7 +965,7 @@ static int EISA_ELCR(unsigned int irq)
* when listed as conforming in the MP table. */
#define default_MCA_trigger(idx) (1)
-#define default_MCA_polarity(idx) (0)
+#define default_MCA_polarity(idx) default_ISA_polarity(idx)
static int MPBIOS_polarity(int idx)
{
@@ -973,35 +979,9 @@ static int MPBIOS_polarity(int idx)
{
case 0: /* conforms, ie. bus-type dependent polarity */
{
- switch (mp_bus_id_to_type[bus])
- {
- case MP_BUS_ISA: /* ISA pin */
- {
- polarity = default_ISA_polarity(idx);
- break;
- }
- case MP_BUS_EISA: /* EISA pin */
- {
- polarity = default_EISA_polarity(idx);
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- polarity = default_PCI_polarity(idx);
- break;
- }
- case MP_BUS_MCA: /* MCA pin */
- {
- polarity = default_MCA_polarity(idx);
- break;
- }
- default:
- {
- printk(KERN_WARNING "broken BIOS!!\n");
- polarity = 1;
- break;
- }
- }
+ polarity = test_bit(bus, mp_bus_not_pci)?
+ default_ISA_polarity(idx):
+ default_PCI_polarity(idx);
break;
}
case 1: /* high active */
@@ -1042,11 +1022,15 @@ static int MPBIOS_trigger(int idx)
{
case 0: /* conforms, ie. bus-type dependent */
{
+ trigger = test_bit(bus, mp_bus_not_pci)?
+ default_ISA_trigger(idx):
+ default_PCI_trigger(idx);
+#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
switch (mp_bus_id_to_type[bus])
{
case MP_BUS_ISA: /* ISA pin */
{
- trigger = default_ISA_trigger(idx);
+ /* set before the switch */
break;
}
case MP_BUS_EISA: /* EISA pin */
@@ -1056,7 +1040,7 @@ static int MPBIOS_trigger(int idx)
}
case MP_BUS_PCI: /* PCI pin */
{
- trigger = default_PCI_trigger(idx);
+ /* set before the switch */
break;
}
case MP_BUS_MCA: /* MCA pin */
@@ -1071,6 +1055,7 @@ static int MPBIOS_trigger(int idx)
break;
}
}
+#endif
break;
}
case 1: /* edge */
@@ -1120,39 +1105,22 @@ static int pin_2_irq(int idx, int apic, int pin)
if (mp_irqs[idx].mpc_dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
- switch (mp_bus_id_to_type[bus])
- {
- case MP_BUS_ISA: /* ISA pin */
- case MP_BUS_EISA:
- case MP_BUS_MCA:
- {
- irq = mp_irqs[idx].mpc_srcbusirq;
- break;
- }
- case MP_BUS_PCI: /* PCI pin */
- {
- /*
- * PCI IRQs are mapped in order
- */
- i = irq = 0;
- while (i < apic)
- irq += nr_ioapic_registers[i++];
- irq += pin;
-
- /*
- * For MPS mode, so far only needed by ES7000 platform
- */
- if (ioapic_renumber_irq)
- irq = ioapic_renumber_irq(apic, irq);
+ if (test_bit(bus, mp_bus_not_pci))
+ irq = mp_irqs[idx].mpc_srcbusirq;
+ else {
+ /*
+ * PCI IRQs are mapped in order
+ */
+ i = irq = 0;
+ while (i < apic)
+ irq += nr_ioapic_registers[i++];
+ irq += pin;
- break;
- }
- default:
- {
- printk(KERN_ERR "unknown bus type %d.\n",bus);
- irq = 0;
- break;
- }
+ /*
+ * For MPS mode, so far only needed by ES7000 platform
+ */
+ if (ioapic_renumber_irq)
+ irq = ioapic_renumber_irq(apic, irq);
}
/*
@@ -1260,7 +1228,6 @@ static void __init setup_IO_APIC_irqs(void)
{
struct IO_APIC_route_entry entry;
int apic, pin, idx, irq, first_notcon = 1, vector;
- unsigned long flags;
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
@@ -1326,9 +1293,7 @@ static void __init setup_IO_APIC_irqs(void)
if (!apic && (irq < 16))
disable_8259A_irq(irq);
}
- spin_lock_irqsave(&ioapic_lock, flags);
- __ioapic_write_entry(apic, pin, entry);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry);
}
}
@@ -1524,8 +1489,8 @@ void /*__init*/ print_local_APIC(void * dummy)
printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
- v = apic_read(APIC_ID);
- printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
+ GET_APIC_ID(read_apic_id()));
v = apic_read(APIC_LVR);
printk(KERN_INFO "... APIC VERSION: %08x\n", v);
ver = GET_APIC_VERSION(v);
@@ -1734,7 +1699,7 @@ void disable_IO_APIC(void)
entry.delivery_mode = dest_ExtINT; /* ExtInt */
entry.vector = 0;
entry.dest.physical.physical_dest =
- GET_APIC_ID(apic_read(APIC_ID));
+ GET_APIC_ID(read_apic_id());
/*
* Add it to the IO-APIC irq-routing table:
@@ -2031,8 +1996,7 @@ static inline void init_IO_APIC_traps(void)
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
for (irq = 0; irq < NR_IRQS ; irq++) {
- int tmp = irq;
- if (IO_APIC_IRQ(tmp) && !irq_vector[tmp]) {
+ if (IO_APIC_IRQ(irq) && !irq_vector[irq]) {
/*
* Hmm.. We don't have an entry for this,
* so default to an old-fashioned 8259
@@ -2104,7 +2068,7 @@ static void __init setup_nmi(void)
* cycles as some i82489DX-based boards have glue logic that keeps the
* 8259A interrupt line asserted until INTA. --macro
*/
-static inline void unlock_ExtINT_logic(void)
+static inline void __init unlock_ExtINT_logic(void)
{
int apic, pin, i;
struct IO_APIC_route_entry entry0, entry1;
@@ -2156,8 +2120,6 @@ static inline void unlock_ExtINT_logic(void)
ioapic_write_entry(apic, pin, entry0);
}
-int timer_uses_ioapic_pin_0;
-
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
@@ -2168,10 +2130,14 @@ static inline void __init check_timer(void)
{
int apic1, pin1, apic2, pin2;
int vector;
+ unsigned int ver;
unsigned long flags;
local_irq_save(flags);
+ ver = apic_read(APIC_LVR);
+ ver = GET_APIC_VERSION(ver);
+
/*
* get/set the timer IRQ vector:
*/
@@ -2184,11 +2150,15 @@ static inline void __init check_timer(void)
* mode for the 8259A whenever interrupts are routed
* through I/O APICs. Also IRQ0 has to be enabled in
* the 8259A which implies the virtual wire has to be
- * disabled in the local APIC.
+ * disabled in the local APIC. Finally timer interrupts
+ * need to be acknowledged manually in the 8259A for
+ * timer_interrupt() and for the i82489DX when using
+ * the NMI watchdog.
*/
apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
- timer_ack = 1;
+ timer_ack = !cpu_has_tsc;
+ timer_ack |= (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
if (timer_over_8254 > 0)
enable_8259A_irq(0);
@@ -2197,9 +2167,6 @@ static inline void __init check_timer(void)
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
- if (pin1 == 0)
- timer_uses_ioapic_pin_0 = 1;
-
printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
vector, apic1, pin1, apic2, pin2);
@@ -2477,6 +2444,7 @@ void destroy_irq(unsigned int irq)
dynamic_irq_cleanup(irq);
spin_lock_irqsave(&vector_lock, flags);
+ clear_bit(irq_vector[irq], used_vectors);
irq_vector[irq] = 0;
spin_unlock_irqrestore(&vector_lock, flags);
}
@@ -2789,7 +2757,6 @@ int __init io_apic_get_redir_entries (int ioapic)
int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
{
struct IO_APIC_route_entry entry;
- unsigned long flags;
if (!IO_APIC_IRQ(irq)) {
printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
@@ -2830,9 +2797,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
if (!ioapic && (irq < 16))
disable_8259A_irq(irq);
- spin_lock_irqsave(&ioapic_lock, flags);
- __ioapic_write_entry(ioapic, pin, entry);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(ioapic, pin, entry);
return 0;
}
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index 1627c0d53e0b..ef1a8dfcc529 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -43,13 +43,15 @@
#include <asm/smp.h>
#include <asm/desc.h>
#include <asm/proto.h>
-#include <asm/mach_apic.h>
#include <asm/acpi.h>
#include <asm/dma.h>
#include <asm/nmi.h>
#include <asm/msidef.h>
#include <asm/hypertransport.h>
+#include <mach_ipi.h>
+#include <mach_apic.h>
+
struct irq_cfg {
cpumask_t domain;
cpumask_t old_domain;
@@ -101,6 +103,16 @@ DEFINE_SPINLOCK(vector_lock);
*/
int nr_ioapic_registers[MAX_IO_APICS];
+/* I/O APIC entries */
+struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
+int nr_ioapics;
+
+/* MP IRQ source entries */
+struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+
+/* # of MP IRQ source entries */
+int mp_irq_entries;
+
/*
* Rough estimation of how many shared IRQs there are, can
* be changed anytime.
@@ -155,11 +167,10 @@ static inline void io_apic_modify(unsigned int apic, unsigned int value)
writel(value, &io_apic->data);
}
-static int io_apic_level_ack_pending(unsigned int irq)
+static bool io_apic_level_ack_pending(unsigned int irq)
{
struct irq_pin_list *entry;
unsigned long flags;
- int pending = 0;
spin_lock_irqsave(&ioapic_lock, flags);
entry = irq_2_pin + irq;
@@ -172,13 +183,17 @@ static int io_apic_level_ack_pending(unsigned int irq)
break;
reg = io_apic_read(entry->apic, 0x10 + pin*2);
/* Is the remote IRR bit set? */
- pending |= (reg >> 14) & 1;
+ if ((reg >> 14) & 1) {
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ return true;
+ }
if (!entry->next)
break;
entry = irq_2_pin + entry->next;
}
spin_unlock_irqrestore(&ioapic_lock, flags);
- return pending;
+
+ return false;
}
/*
@@ -770,7 +785,7 @@ static void __clear_irq_vector(int irq)
per_cpu(vector_irq, cpu)[vector] = -1;
cfg->vector = 0;
- cfg->domain = CPU_MASK_NONE;
+ cpus_clear(cfg->domain);
}
void __setup_vector_irq(int cpu)
@@ -902,9 +917,8 @@ static void __init setup_IO_APIC_irqs(void)
static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, int vector)
{
struct IO_APIC_route_entry entry;
- unsigned long flags;
- memset(&entry,0,sizeof(entry));
+ memset(&entry, 0, sizeof(entry));
disable_8259A_irq(0);
@@ -932,10 +946,7 @@ static void __init setup_ExtINT_IRQ0_pin(unsigned int apic, unsigned int pin, in
/*
* Add it to the IO-APIC irq-routing table:
*/
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
- io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry);
enable_8259A_irq(0);
}
@@ -1066,8 +1077,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
- v = apic_read(APIC_ID);
- printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(v));
+ printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id()));
v = apic_read(APIC_LVR);
printk(KERN_INFO "... APIC VERSION: %08x\n", v);
ver = GET_APIC_VERSION(v);
@@ -1261,7 +1271,7 @@ void disable_IO_APIC(void)
entry.dest_mode = 0; /* Physical */
entry.delivery_mode = dest_ExtINT; /* ExtInt */
entry.vector = 0;
- entry.dest = GET_APIC_ID(apic_read(APIC_ID));
+ entry.dest = GET_APIC_ID(read_apic_id());
/*
* Add it to the IO-APIC irq-routing table:
@@ -1352,9 +1362,7 @@ static int ioapic_retrigger_irq(unsigned int irq)
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- cpus_clear(mask);
- cpu_set(first_cpu(cfg->domain), mask);
-
+ mask = cpumask_of_cpu(first_cpu(cfg->domain));
send_IPI_mask(mask, cfg->vector);
spin_unlock_irqrestore(&vector_lock, flags);
@@ -1517,8 +1525,7 @@ static inline void init_IO_APIC_traps(void)
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
for (irq = 0; irq < NR_IRQS ; irq++) {
- int tmp = irq;
- if (IO_APIC_IRQ(tmp) && !irq_cfg[tmp].vector) {
+ if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) {
/*
* Hmm.. We don't have an entry for this,
* so default to an old-fashioned 8259
@@ -1592,22 +1599,19 @@ static void __init setup_nmi(void)
* cycles as some i82489DX-based boards have glue logic that keeps the
* 8259A interrupt line asserted until INTA. --macro
*/
-static inline void unlock_ExtINT_logic(void)
+static inline void __init unlock_ExtINT_logic(void)
{
int apic, pin, i;
struct IO_APIC_route_entry entry0, entry1;
unsigned char save_control, save_freq_select;
- unsigned long flags;
pin = find_isa_irq_pin(8, mp_INT);
apic = find_isa_irq_apic(8, mp_INT);
if (pin == -1)
return;
- spin_lock_irqsave(&ioapic_lock, flags);
- *(((int *)&entry0) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
- *(((int *)&entry0) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ entry0 = ioapic_read_entry(apic, pin);
+
clear_IO_APIC_pin(apic, pin);
memset(&entry1, 0, sizeof(entry1));
@@ -1620,10 +1624,7 @@ static inline void unlock_ExtINT_logic(void)
entry1.trigger = 0;
entry1.vector = 0;
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
- io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry1);
save_control = CMOS_READ(RTC_CONTROL);
save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
@@ -1642,10 +1643,7 @@ static inline void unlock_ExtINT_logic(void)
CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
clear_IO_APIC_pin(apic, pin);
- spin_lock_irqsave(&ioapic_lock, flags);
- io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
- io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
- spin_unlock_irqrestore(&ioapic_lock, flags);
+ ioapic_write_entry(apic, pin, entry0);
}
/*
@@ -2314,7 +2312,6 @@ static struct resource * __init ioapic_setup_resources(void)
res = (void *)mem;
if (mem != NULL) {
- memset(mem, 0, n);
mem += sizeof(struct resource) * nr_ioapics;
for (i = 0; i < nr_ioapics; i++) {
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
new file mode 100644
index 000000000000..c0df7b89ca23
--- /dev/null
+++ b/arch/x86/kernel/ipi.c
@@ -0,0 +1,178 @@
+#include <linux/cpumask.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+#include <linux/module.h>
+
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/apic.h>
+#include <asm/proto.h>
+
+#ifdef CONFIG_X86_32
+#include <mach_apic.h>
+/*
+ * the following functions deal with sending IPIs between CPUs.
+ *
+ * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
+ */
+
+static inline int __prepare_ICR(unsigned int shortcut, int vector)
+{
+ unsigned int icr = shortcut | APIC_DEST_LOGICAL;
+
+ switch (vector) {
+ default:
+ icr |= APIC_DM_FIXED | vector;
+ break;
+ case NMI_VECTOR:
+ icr |= APIC_DM_NMI;
+ break;
+ }
+ return icr;
+}
+
+static inline int __prepare_ICR2(unsigned int mask)
+{
+ return SET_APIC_DEST_FIELD(mask);
+}
+
+void __send_IPI_shortcut(unsigned int shortcut, int vector)
+{
+ /*
+ * Subtle. In the case of the 'never do double writes' workaround
+ * we have to lock out interrupts to be safe. As we don't care
+ * of the value read we use an atomic rmw access to avoid costly
+ * cli/sti. Otherwise we use an even cheaper single atomic write
+ * to the APIC.
+ */
+ unsigned int cfg;
+
+ /*
+ * Wait for idle.
+ */
+ apic_wait_icr_idle();
+
+ /*
+ * No need to touch the target chip field
+ */
+ cfg = __prepare_ICR(shortcut, vector);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ apic_write_around(APIC_ICR, cfg);
+}
+
+void send_IPI_self(int vector)
+{
+ __send_IPI_shortcut(APIC_DEST_SELF, vector);
+}
+
+/*
+ * This is used to send an IPI with no shorthand notation (the destination is
+ * specified in bits 56 to 63 of the ICR).
+ */
+static inline void __send_IPI_dest_field(unsigned long mask, int vector)
+{
+ unsigned long cfg;
+
+ /*
+ * Wait for idle.
+ */
+ if (unlikely(vector == NMI_VECTOR))
+ safe_apic_wait_icr_idle();
+ else
+ apic_wait_icr_idle();
+
+ /*
+ * prepare target chip field
+ */
+ cfg = __prepare_ICR2(mask);
+ apic_write_around(APIC_ICR2, cfg);
+
+ /*
+ * program the ICR
+ */
+ cfg = __prepare_ICR(0, vector);
+
+ /*
+ * Send the IPI. The write to APIC_ICR fires this off.
+ */
+ apic_write_around(APIC_ICR, cfg);
+}
+
+/*
+ * This is only used on smaller machines.
+ */
+void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
+{
+ unsigned long mask = cpus_addr(cpumask)[0];
+ unsigned long flags;
+
+ local_irq_save(flags);
+ WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
+ __send_IPI_dest_field(mask, vector);
+ local_irq_restore(flags);
+}
+
+void send_IPI_mask_sequence(cpumask_t mask, int vector)
+{
+ unsigned long flags;
+ unsigned int query_cpu;
+
+ /*
+ * Hack. The clustered APIC addressing mode doesn't allow us to send
+ * to an arbitrary mask, so I do a unicasts to each CPU instead. This
+ * should be modified to do 1 message per cluster ID - mbligh
+ */
+
+ local_irq_save(flags);
+ for_each_possible_cpu(query_cpu) {
+ if (cpu_isset(query_cpu, mask)) {
+ __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
+ vector);
+ }
+ }
+ local_irq_restore(flags);
+}
+
+/* must come after the send_IPI functions above for inlining */
+#include <mach_ipi.h>
+static int convert_apicid_to_cpu(int apic_id)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
+ return i;
+ }
+ return -1;
+}
+
+int safe_smp_processor_id(void)
+{
+ int apicid, cpuid;
+
+ if (!boot_cpu_has(X86_FEATURE_APIC))
+ return 0;
+
+ apicid = hard_smp_processor_id();
+ if (apicid == BAD_APICID)
+ return 0;
+
+ cpuid = convert_apicid_to_cpu(apicid);
+
+ return cpuid >= 0 ? cpuid : 0;
+}
+#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index cef054b09d27..00bda7bcda63 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -79,7 +79,7 @@ unsigned int do_IRQ(struct pt_regs *regs)
if (unlikely((unsigned)irq >= NR_IRQS)) {
printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
- __FUNCTION__, irq);
+ __func__, irq);
BUG();
}
@@ -134,7 +134,7 @@ unsigned int do_IRQ(struct pt_regs *regs)
: "=a" (arg1), "=d" (arg2), "=b" (bx)
: "0" (irq), "1" (desc), "2" (isp),
"D" (desc->handle_irq)
- : "memory", "cc"
+ : "memory", "cc", "ecx"
);
} else
#endif
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 73354302fda7..c03205991718 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -6,23 +6,171 @@
*
* This file is released under the GPLv2.
*/
-
#include <linux/debugfs.h>
+#include <linux/uaccess.h>
#include <linux/stat.h>
#include <linux/init.h>
+#include <linux/io.h>
+#include <linux/mm.h>
#include <asm/setup.h>
#ifdef CONFIG_DEBUG_BOOT_PARAMS
+struct setup_data_node {
+ u64 paddr;
+ u32 type;
+ u32 len;
+};
+
+static ssize_t
+setup_data_read(struct file *file, char __user *user_buf, size_t count,
+ loff_t *ppos)
+{
+ struct setup_data_node *node = file->private_data;
+ unsigned long remain;
+ loff_t pos = *ppos;
+ struct page *pg;
+ void *p;
+ u64 pa;
+
+ if (pos < 0)
+ return -EINVAL;
+ if (pos >= node->len)
+ return 0;
+
+ if (count > node->len - pos)
+ count = node->len - pos;
+ pa = node->paddr + sizeof(struct setup_data) + pos;
+ pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT);
+ if (PageHighMem(pg)) {
+ p = ioremap_cache(pa, count);
+ if (!p)
+ return -ENXIO;
+ } else {
+ p = __va(pa);
+ }
+
+ remain = copy_to_user(user_buf, p, count);
+
+ if (PageHighMem(pg))
+ iounmap(p);
+
+ if (remain)
+ return -EFAULT;
+
+ *ppos = pos + count;
+
+ return count;
+}
+
+static int setup_data_open(struct inode *inode, struct file *file)
+{
+ file->private_data = inode->i_private;
+ return 0;
+}
+
+static const struct file_operations fops_setup_data = {
+ .read = setup_data_read,
+ .open = setup_data_open,
+};
+
+static int __init
+create_setup_data_node(struct dentry *parent, int no,
+ struct setup_data_node *node)
+{
+ struct dentry *d, *type, *data;
+ char buf[16];
+ int error;
+
+ sprintf(buf, "%d", no);
+ d = debugfs_create_dir(buf, parent);
+ if (!d) {
+ error = -ENOMEM;
+ goto err_return;
+ }
+ type = debugfs_create_x32("type", S_IRUGO, d, &node->type);
+ if (!type) {
+ error = -ENOMEM;
+ goto err_dir;
+ }
+ data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data);
+ if (!data) {
+ error = -ENOMEM;
+ goto err_type;
+ }
+ return 0;
+
+err_type:
+ debugfs_remove(type);
+err_dir:
+ debugfs_remove(d);
+err_return:
+ return error;
+}
+
+static int __init create_setup_data_nodes(struct dentry *parent)
+{
+ struct setup_data_node *node;
+ struct setup_data *data;
+ int error, no = 0;
+ struct dentry *d;
+ struct page *pg;
+ u64 pa_data;
+
+ d = debugfs_create_dir("setup_data", parent);
+ if (!d) {
+ error = -ENOMEM;
+ goto err_return;
+ }
+
+ pa_data = boot_params.hdr.setup_data;
+
+ while (pa_data) {
+ node = kmalloc(sizeof(*node), GFP_KERNEL);
+ if (!node) {
+ error = -ENOMEM;
+ goto err_dir;
+ }
+ pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT);
+ if (PageHighMem(pg)) {
+ data = ioremap_cache(pa_data, sizeof(*data));
+ if (!data) {
+ error = -ENXIO;
+ goto err_dir;
+ }
+ } else {
+ data = __va(pa_data);
+ }
+
+ node->paddr = pa_data;
+ node->type = data->type;
+ node->len = data->len;
+ error = create_setup_data_node(d, no, node);
+ pa_data = data->next;
+
+ if (PageHighMem(pg))
+ iounmap(data);
+ if (error)
+ goto err_dir;
+ no++;
+ }
+ return 0;
+
+err_dir:
+ debugfs_remove(d);
+err_return:
+ return error;
+}
+
static struct debugfs_blob_wrapper boot_params_blob = {
- .data = &boot_params,
- .size = sizeof(boot_params),
+ .data = &boot_params,
+ .size = sizeof(boot_params),
};
static int __init boot_params_kdebugfs_init(void)
{
- int error;
struct dentry *dbp, *version, *data;
+ int error;
dbp = debugfs_create_dir("boot_params", NULL);
if (!dbp) {
@@ -41,7 +189,13 @@ static int __init boot_params_kdebugfs_init(void)
error = -ENOMEM;
goto err_version;
}
+ error = create_setup_data_nodes(dbp);
+ if (error)
+ goto err_data;
return 0;
+
+err_data:
+ debugfs_remove(data);
err_version:
debugfs_remove(version);
err_dir:
@@ -61,5 +215,4 @@ static int __init arch_kdebugfs_init(void)
return error;
}
-
arch_initcall(arch_kdebugfs_init);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
new file mode 100644
index 000000000000..f47f0eb886b8
--- /dev/null
+++ b/arch/x86/kernel/kgdb.c
@@ -0,0 +1,567 @@
+/*
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ */
+
+/*
+ * Copyright (C) 2004 Amit S. Kale <amitkale@linsyssoft.com>
+ * Copyright (C) 2000-2001 VERITAS Software Corporation.
+ * Copyright (C) 2002 Andi Kleen, SuSE Labs
+ * Copyright (C) 2004 LinSysSoft Technologies Pvt. Ltd.
+ * Copyright (C) 2007 MontaVista Software, Inc.
+ * Copyright (C) 2007-2008 Jason Wessel, Wind River Systems, Inc.
+ */
+/****************************************************************************
+ * Contributor: Lake Stevens Instrument Division$
+ * Written by: Glenn Engel $
+ * Updated by: Amit Kale<akale@veritas.com>
+ * Updated by: Tom Rini <trini@kernel.crashing.org>
+ * Updated by: Jason Wessel <jason.wessel@windriver.com>
+ * Modified for 386 by Jim Kingdon, Cygnus Support.
+ * Origianl kgdb, compatibility with 2.1.xx kernel by
+ * David Grothe <dave@gcom.com>
+ * Integrated into 2.2.5 kernel by Tigran Aivazian <tigran@sco.com>
+ * X86_64 changes from Andi Kleen's patch merged by Jim Houston
+ */
+#include <linux/spinlock.h>
+#include <linux/kdebug.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/ptrace.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/kgdb.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+#include <linux/nmi.h>
+
+#include <asm/apicdef.h>
+#include <asm/system.h>
+
+#include <mach_ipi.h>
+
+/*
+ * Put the error code here just in case the user cares:
+ */
+static int gdb_x86errcode;
+
+/*
+ * Likewise, the vector number here (since GDB only gets the signal
+ * number through the usual means, and that's not very specific):
+ */
+static int gdb_x86vector = -1;
+
+/**
+ * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs
+ * @gdb_regs: A pointer to hold the registers in the order GDB wants.
+ * @regs: The &struct pt_regs of the current process.
+ *
+ * Convert the pt_regs in @regs into the format for registers that
+ * GDB expects, stored in @gdb_regs.
+ */
+void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+ gdb_regs[GDB_AX] = regs->ax;
+ gdb_regs[GDB_BX] = regs->bx;
+ gdb_regs[GDB_CX] = regs->cx;
+ gdb_regs[GDB_DX] = regs->dx;
+ gdb_regs[GDB_SI] = regs->si;
+ gdb_regs[GDB_DI] = regs->di;
+ gdb_regs[GDB_BP] = regs->bp;
+ gdb_regs[GDB_PS] = regs->flags;
+ gdb_regs[GDB_PC] = regs->ip;
+#ifdef CONFIG_X86_32
+ gdb_regs[GDB_DS] = regs->ds;
+ gdb_regs[GDB_ES] = regs->es;
+ gdb_regs[GDB_CS] = regs->cs;
+ gdb_regs[GDB_SS] = __KERNEL_DS;
+ gdb_regs[GDB_FS] = 0xFFFF;
+ gdb_regs[GDB_GS] = 0xFFFF;
+#else
+ gdb_regs[GDB_R8] = regs->r8;
+ gdb_regs[GDB_R9] = regs->r9;
+ gdb_regs[GDB_R10] = regs->r10;
+ gdb_regs[GDB_R11] = regs->r11;
+ gdb_regs[GDB_R12] = regs->r12;
+ gdb_regs[GDB_R13] = regs->r13;
+ gdb_regs[GDB_R14] = regs->r14;
+ gdb_regs[GDB_R15] = regs->r15;
+#endif
+ gdb_regs[GDB_SP] = regs->sp;
+}
+
+/**
+ * sleeping_thread_to_gdb_regs - Convert ptrace regs to GDB regs
+ * @gdb_regs: A pointer to hold the registers in the order GDB wants.
+ * @p: The &struct task_struct of the desired process.
+ *
+ * Convert the register values of the sleeping process in @p to
+ * the format that GDB expects.
+ * This function is called when kgdb does not have access to the
+ * &struct pt_regs and therefore it should fill the gdb registers
+ * @gdb_regs with what has been saved in &struct thread_struct
+ * thread field during switch_to.
+ */
+void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
+{
+ gdb_regs[GDB_AX] = 0;
+ gdb_regs[GDB_BX] = 0;
+ gdb_regs[GDB_CX] = 0;
+ gdb_regs[GDB_DX] = 0;
+ gdb_regs[GDB_SI] = 0;
+ gdb_regs[GDB_DI] = 0;
+ gdb_regs[GDB_BP] = *(unsigned long *)p->thread.sp;
+#ifdef CONFIG_X86_32
+ gdb_regs[GDB_DS] = __KERNEL_DS;
+ gdb_regs[GDB_ES] = __KERNEL_DS;
+ gdb_regs[GDB_PS] = 0;
+ gdb_regs[GDB_CS] = __KERNEL_CS;
+ gdb_regs[GDB_PC] = p->thread.ip;
+ gdb_regs[GDB_SS] = __KERNEL_DS;
+ gdb_regs[GDB_FS] = 0xFFFF;
+ gdb_regs[GDB_GS] = 0xFFFF;
+#else
+ gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
+ gdb_regs[GDB_PC] = 0;
+ gdb_regs[GDB_R8] = 0;
+ gdb_regs[GDB_R9] = 0;
+ gdb_regs[GDB_R10] = 0;
+ gdb_regs[GDB_R11] = 0;
+ gdb_regs[GDB_R12] = 0;
+ gdb_regs[GDB_R13] = 0;
+ gdb_regs[GDB_R14] = 0;
+ gdb_regs[GDB_R15] = 0;
+#endif
+ gdb_regs[GDB_SP] = p->thread.sp;
+}
+
+/**
+ * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs.
+ * @gdb_regs: A pointer to hold the registers we've received from GDB.
+ * @regs: A pointer to a &struct pt_regs to hold these values in.
+ *
+ * Convert the GDB regs in @gdb_regs into the pt_regs, and store them
+ * in @regs.
+ */
+void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+{
+ regs->ax = gdb_regs[GDB_AX];
+ regs->bx = gdb_regs[GDB_BX];
+ regs->cx = gdb_regs[GDB_CX];
+ regs->dx = gdb_regs[GDB_DX];
+ regs->si = gdb_regs[GDB_SI];
+ regs->di = gdb_regs[GDB_DI];
+ regs->bp = gdb_regs[GDB_BP];
+ regs->flags = gdb_regs[GDB_PS];
+ regs->ip = gdb_regs[GDB_PC];
+#ifdef CONFIG_X86_32
+ regs->ds = gdb_regs[GDB_DS];
+ regs->es = gdb_regs[GDB_ES];
+ regs->cs = gdb_regs[GDB_CS];
+#else
+ regs->r8 = gdb_regs[GDB_R8];
+ regs->r9 = gdb_regs[GDB_R9];
+ regs->r10 = gdb_regs[GDB_R10];
+ regs->r11 = gdb_regs[GDB_R11];
+ regs->r12 = gdb_regs[GDB_R12];
+ regs->r13 = gdb_regs[GDB_R13];
+ regs->r14 = gdb_regs[GDB_R14];
+ regs->r15 = gdb_regs[GDB_R15];
+#endif
+}
+
+static struct hw_breakpoint {
+ unsigned enabled;
+ unsigned type;
+ unsigned len;
+ unsigned long addr;
+} breakinfo[4];
+
+static void kgdb_correct_hw_break(void)
+{
+ unsigned long dr7;
+ int correctit = 0;
+ int breakbit;
+ int breakno;
+
+ get_debugreg(dr7, 7);
+ for (breakno = 0; breakno < 4; breakno++) {
+ breakbit = 2 << (breakno << 1);
+ if (!(dr7 & breakbit) && breakinfo[breakno].enabled) {
+ correctit = 1;
+ dr7 |= breakbit;
+ dr7 &= ~(0xf0000 << (breakno << 2));
+ dr7 |= ((breakinfo[breakno].len << 2) |
+ breakinfo[breakno].type) <<
+ ((breakno << 2) + 16);
+ if (breakno >= 0 && breakno <= 3)
+ set_debugreg(breakinfo[breakno].addr, breakno);
+
+ } else {
+ if ((dr7 & breakbit) && !breakinfo[breakno].enabled) {
+ correctit = 1;
+ dr7 &= ~breakbit;
+ dr7 &= ~(0xf0000 << (breakno << 2));
+ }
+ }
+ }
+ if (correctit)
+ set_debugreg(dr7, 7);
+}
+
+static int
+kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ if (breakinfo[i].addr == addr && breakinfo[i].enabled)
+ break;
+ if (i == 4)
+ return -1;
+
+ breakinfo[i].enabled = 0;
+
+ return 0;
+}
+
+static void kgdb_remove_all_hw_break(void)
+{
+ int i;
+
+ for (i = 0; i < 4; i++)
+ memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint));
+}
+
+static int
+kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
+{
+ unsigned type;
+ int i;
+
+ for (i = 0; i < 4; i++)
+ if (!breakinfo[i].enabled)
+ break;
+ if (i == 4)
+ return -1;
+
+ switch (bptype) {
+ case BP_HARDWARE_BREAKPOINT:
+ type = 0;
+ len = 1;
+ break;
+ case BP_WRITE_WATCHPOINT:
+ type = 1;
+ break;
+ case BP_ACCESS_WATCHPOINT:
+ type = 3;
+ break;
+ default:
+ return -1;
+ }
+
+ if (len == 1 || len == 2 || len == 4)
+ breakinfo[i].len = len - 1;
+ else
+ return -1;
+
+ breakinfo[i].enabled = 1;
+ breakinfo[i].addr = addr;
+ breakinfo[i].type = type;
+
+ return 0;
+}
+
+/**
+ * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
+ * @regs: Current &struct pt_regs.
+ *
+ * This function will be called if the particular architecture must
+ * disable hardware debugging while it is processing gdb packets or
+ * handling exception.
+ */
+void kgdb_disable_hw_debug(struct pt_regs *regs)
+{
+ /* Disable hardware debugging while we are in kgdb: */
+ set_debugreg(0UL, 7);
+}
+
+/**
+ * kgdb_post_primary_code - Save error vector/code numbers.
+ * @regs: Original pt_regs.
+ * @e_vector: Original error vector.
+ * @err_code: Original error code.
+ *
+ * This is needed on architectures which support SMP and KGDB.
+ * This function is called after all the slave cpus have been put
+ * to a know spin state and the primary CPU has control over KGDB.
+ */
+void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
+{
+ /* primary processor is completely in the debugger */
+ gdb_x86vector = e_vector;
+ gdb_x86errcode = err_code;
+}
+
+#ifdef CONFIG_SMP
+/**
+ * kgdb_roundup_cpus - Get other CPUs into a holding pattern
+ * @flags: Current IRQ state
+ *
+ * On SMP systems, we need to get the attention of the other CPUs
+ * and get them be in a known state. This should do what is needed
+ * to get the other CPUs to call kgdb_wait(). Note that on some arches,
+ * the NMI approach is not used for rounding up all the CPUs. For example,
+ * in case of MIPS, smp_call_function() is used to roundup CPUs. In
+ * this case, we have to make sure that interrupts are enabled before
+ * calling smp_call_function(). The argument to this function is
+ * the flags that will be used when restoring the interrupts. There is
+ * local_irq_save() call before kgdb_roundup_cpus().
+ *
+ * On non-SMP systems, this is not called.
+ */
+void kgdb_roundup_cpus(unsigned long flags)
+{
+ send_IPI_allbutself(APIC_DM_NMI);
+}
+#endif
+
+/**
+ * kgdb_arch_handle_exception - Handle architecture specific GDB packets.
+ * @vector: The error vector of the exception that happened.
+ * @signo: The signal number of the exception that happened.
+ * @err_code: The error code of the exception that happened.
+ * @remcom_in_buffer: The buffer of the packet we have read.
+ * @remcom_out_buffer: The buffer of %BUFMAX bytes to write a packet into.
+ * @regs: The &struct pt_regs of the current process.
+ *
+ * This function MUST handle the 'c' and 's' command packets,
+ * as well packets to set / remove a hardware breakpoint, if used.
+ * If there are additional packets which the hardware needs to handle,
+ * they are handled here. The code should return -1 if it wants to
+ * process more packets, and a %0 or %1 if it wants to exit from the
+ * kgdb callback.
+ */
+int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
+ char *remcomInBuffer, char *remcomOutBuffer,
+ struct pt_regs *linux_regs)
+{
+ unsigned long addr;
+ unsigned long dr6;
+ char *ptr;
+ int newPC;
+
+ switch (remcomInBuffer[0]) {
+ case 'c':
+ case 's':
+ /* try to read optional parameter, pc unchanged if no parm */
+ ptr = &remcomInBuffer[1];
+ if (kgdb_hex2long(&ptr, &addr))
+ linux_regs->ip = addr;
+ case 'D':
+ case 'k':
+ newPC = linux_regs->ip;
+
+ /* clear the trace bit */
+ linux_regs->flags &= ~X86_EFLAGS_TF;
+ atomic_set(&kgdb_cpu_doing_single_step, -1);
+
+ /* set the trace bit if we're stepping */
+ if (remcomInBuffer[0] == 's') {
+ linux_regs->flags |= X86_EFLAGS_TF;
+ kgdb_single_step = 1;
+ if (kgdb_contthread) {
+ atomic_set(&kgdb_cpu_doing_single_step,
+ raw_smp_processor_id());
+ }
+ }
+
+ get_debugreg(dr6, 6);
+ if (!(dr6 & 0x4000)) {
+ int breakno;
+
+ for (breakno = 0; breakno < 4; breakno++) {
+ if (dr6 & (1 << breakno) &&
+ breakinfo[breakno].type == 0) {
+ /* Set restore flag: */
+ linux_regs->flags |= X86_EFLAGS_RF;
+ break;
+ }
+ }
+ }
+ set_debugreg(0UL, 6);
+ kgdb_correct_hw_break();
+
+ return 0;
+ }
+
+ /* this means that we do not want to exit from the handler: */
+ return -1;
+}
+
+static inline int
+single_step_cont(struct pt_regs *regs, struct die_args *args)
+{
+ /*
+ * Single step exception from kernel space to user space so
+ * eat the exception and continue the process:
+ */
+ printk(KERN_ERR "KGDB: trap/step from kernel to user space, "
+ "resuming...\n");
+ kgdb_arch_handle_exception(args->trapnr, args->signr,
+ args->err, "c", "", regs);
+
+ return NOTIFY_STOP;
+}
+
+static int was_in_debug_nmi[NR_CPUS];
+
+static int __kgdb_notify(struct die_args *args, unsigned long cmd)
+{
+ struct pt_regs *regs = args->regs;
+
+ switch (cmd) {
+ case DIE_NMI:
+ if (atomic_read(&kgdb_active) != -1) {
+ /* KGDB CPU roundup */
+ kgdb_nmicallback(raw_smp_processor_id(), regs);
+ was_in_debug_nmi[raw_smp_processor_id()] = 1;
+ touch_nmi_watchdog();
+ return NOTIFY_STOP;
+ }
+ return NOTIFY_DONE;
+
+ case DIE_NMI_IPI:
+ if (atomic_read(&kgdb_active) != -1) {
+ /* KGDB CPU roundup */
+ kgdb_nmicallback(raw_smp_processor_id(), regs);
+ was_in_debug_nmi[raw_smp_processor_id()] = 1;
+ touch_nmi_watchdog();
+ }
+ return NOTIFY_DONE;
+
+ case DIE_NMIUNKNOWN:
+ if (was_in_debug_nmi[raw_smp_processor_id()]) {
+ was_in_debug_nmi[raw_smp_processor_id()] = 0;
+ return NOTIFY_STOP;
+ }
+ return NOTIFY_DONE;
+
+ case DIE_NMIWATCHDOG:
+ if (atomic_read(&kgdb_active) != -1) {
+ /* KGDB CPU roundup: */
+ kgdb_nmicallback(raw_smp_processor_id(), regs);
+ return NOTIFY_STOP;
+ }
+ /* Enter debugger: */
+ break;
+
+ case DIE_DEBUG:
+ if (atomic_read(&kgdb_cpu_doing_single_step) ==
+ raw_smp_processor_id() &&
+ user_mode(regs))
+ return single_step_cont(regs, args);
+ /* fall through */
+ default:
+ if (user_mode(regs))
+ return NOTIFY_DONE;
+ }
+
+ if (kgdb_handle_exception(args->trapnr, args->signr, args->err, regs))
+ return NOTIFY_DONE;
+
+ /* Must touch watchdog before return to normal operation */
+ touch_nmi_watchdog();
+ return NOTIFY_STOP;
+}
+
+static int
+kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
+{
+ unsigned long flags;
+ int ret;
+
+ local_irq_save(flags);
+ ret = __kgdb_notify(ptr, cmd);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static struct notifier_block kgdb_notifier = {
+ .notifier_call = kgdb_notify,
+
+ /*
+ * Lowest-prio notifier priority, we want to be notified last:
+ */
+ .priority = -INT_MAX,
+};
+
+/**
+ * kgdb_arch_init - Perform any architecture specific initalization.
+ *
+ * This function will handle the initalization of any architecture
+ * specific callbacks.
+ */
+int kgdb_arch_init(void)
+{
+ return register_die_notifier(&kgdb_notifier);
+}
+
+/**
+ * kgdb_arch_exit - Perform any architecture specific uninitalization.
+ *
+ * This function will handle the uninitalization of any architecture
+ * specific callbacks, for dynamic registration and unregistration.
+ */
+void kgdb_arch_exit(void)
+{
+ unregister_die_notifier(&kgdb_notifier);
+}
+
+/**
+ *
+ * kgdb_skipexception - Bail out of KGDB when we've been triggered.
+ * @exception: Exception vector number
+ * @regs: Current &struct pt_regs.
+ *
+ * On some architectures we need to skip a breakpoint exception when
+ * it occurs after a breakpoint has been removed.
+ *
+ * Skip an int3 exception when it occurs after a breakpoint has been
+ * removed. Backtrack eip by 1 since the int3 would have caused it to
+ * increment by 1.
+ */
+int kgdb_skipexception(int exception, struct pt_regs *regs)
+{
+ if (exception == 3 && kgdb_isremovedbreak(regs->ip - 1)) {
+ regs->ip -= 1;
+ return 1;
+ }
+ return 0;
+}
+
+unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs)
+{
+ if (exception == 3)
+ return instruction_pointer(regs) - 1;
+ return instruction_pointer(regs);
+}
+
+struct kgdb_arch arch_kgdb_ops = {
+ /* Breakpoint instruction: */
+ .gdb_bpt_instr = { 0xcc },
+ .flags = KGDB_HW_BREAKPOINT,
+ .set_hw_breakpoint = kgdb_set_hw_break,
+ .remove_hw_breakpoint = kgdb_remove_hw_break,
+ .remove_all_hw_break = kgdb_remove_all_hw_break,
+ .correct_hw_break = kgdb_correct_hw_break,
+};
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 34a591283f5d..b8c6743a13da 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -410,13 +410,13 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
static void __kprobes clear_btf(void)
{
if (test_thread_flag(TIF_DEBUGCTLMSR))
- wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
+ update_debugctlmsr(0);
}
static void __kprobes restore_btf(void)
{
if (test_thread_flag(TIF_DEBUGCTLMSR))
- wrmsrl(MSR_IA32_DEBUGCTLMSR, current->thread.debugctlmsr);
+ update_debugctlmsr(current->thread.debugctlmsr);
}
static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
@@ -489,7 +489,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
break;
case KPROBE_HIT_SS:
if (p == kprobe_running()) {
- regs->flags &= ~TF_MASK;
+ regs->flags &= ~X86_EFLAGS_TF;
regs->flags |= kcb->kprobe_saved_flags;
return 0;
} else {
@@ -858,15 +858,15 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs)
if (!cur)
return 0;
+ resume_execution(cur, regs, kcb);
+ regs->flags |= kcb->kprobe_saved_flags;
+ trace_hardirqs_fixup_flags(regs->flags);
+
if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
kcb->kprobe_status = KPROBE_HIT_SSDONE;
cur->post_handler(cur, regs, 0);
}
- resume_execution(cur, regs, kcb);
- regs->flags |= kcb->kprobe_saved_flags;
- trace_hardirqs_fixup_flags(regs->flags);
-
/* Restore back the original saved kprobes variables and continue. */
if (kcb->kprobe_status == KPROBE_REENTER) {
restore_previous_kprobe(kcb);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
new file mode 100644
index 000000000000..8b7a3cf37d2b
--- /dev/null
+++ b/arch/x86/kernel/kvm.c
@@ -0,0 +1,248 @@
+/*
+ * KVM paravirt_ops implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ * Copyright IBM Corporation, 2007
+ * Authors: Anthony Liguori <aliguori@us.ibm.com>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/kvm_para.h>
+#include <linux/cpu.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/hardirq.h>
+
+#define MMU_QUEUE_SIZE 1024
+
+struct kvm_para_state {
+ u8 mmu_queue[MMU_QUEUE_SIZE];
+ int mmu_queue_len;
+ enum paravirt_lazy_mode mode;
+};
+
+static DEFINE_PER_CPU(struct kvm_para_state, para_state);
+
+static struct kvm_para_state *kvm_para_state(void)
+{
+ return &per_cpu(para_state, raw_smp_processor_id());
+}
+
+/*
+ * No need for any "IO delay" on KVM
+ */
+static void kvm_io_delay(void)
+{
+}
+
+static void kvm_mmu_op(void *buffer, unsigned len)
+{
+ int r;
+ unsigned long a1, a2;
+
+ do {
+ a1 = __pa(buffer);
+ a2 = 0; /* on i386 __pa() always returns <4G */
+ r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2);
+ buffer += r;
+ len -= r;
+ } while (len);
+}
+
+static void mmu_queue_flush(struct kvm_para_state *state)
+{
+ if (state->mmu_queue_len) {
+ kvm_mmu_op(state->mmu_queue, state->mmu_queue_len);
+ state->mmu_queue_len = 0;
+ }
+}
+
+static void kvm_deferred_mmu_op(void *buffer, int len)
+{
+ struct kvm_para_state *state = kvm_para_state();
+
+ if (state->mode != PARAVIRT_LAZY_MMU) {
+ kvm_mmu_op(buffer, len);
+ return;
+ }
+ if (state->mmu_queue_len + len > sizeof state->mmu_queue)
+ mmu_queue_flush(state);
+ memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len);
+ state->mmu_queue_len += len;
+}
+
+static void kvm_mmu_write(void *dest, u64 val)
+{
+ __u64 pte_phys;
+ struct kvm_mmu_op_write_pte wpte;
+
+#ifdef CONFIG_HIGHPTE
+ struct page *page;
+ unsigned long dst = (unsigned long) dest;
+
+ page = kmap_atomic_to_page(dest);
+ pte_phys = page_to_pfn(page);
+ pte_phys <<= PAGE_SHIFT;
+ pte_phys += (dst & ~(PAGE_MASK));
+#else
+ pte_phys = (unsigned long)__pa(dest);
+#endif
+ wpte.header.op = KVM_MMU_OP_WRITE_PTE;
+ wpte.pte_val = val;
+ wpte.pte_phys = pte_phys;
+
+ kvm_deferred_mmu_op(&wpte, sizeof wpte);
+}
+
+/*
+ * We only need to hook operations that are MMU writes. We hook these so that
+ * we can use lazy MMU mode to batch these operations. We could probably
+ * improve the performance of the host code if we used some of the information
+ * here to simplify processing of batched writes.
+ */
+static void kvm_set_pte(pte_t *ptep, pte_t pte)
+{
+ kvm_mmu_write(ptep, pte_val(pte));
+}
+
+static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ kvm_mmu_write(ptep, pte_val(pte));
+}
+
+static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ kvm_mmu_write(pmdp, pmd_val(pmd));
+}
+
+#if PAGETABLE_LEVELS >= 3
+#ifdef CONFIG_X86_PAE
+static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ kvm_mmu_write(ptep, pte_val(pte));
+}
+
+static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, pte_t pte)
+{
+ kvm_mmu_write(ptep, pte_val(pte));
+}
+
+static void kvm_pte_clear(struct mm_struct *mm,
+ unsigned long addr, pte_t *ptep)
+{
+ kvm_mmu_write(ptep, 0);
+}
+
+static void kvm_pmd_clear(pmd_t *pmdp)
+{
+ kvm_mmu_write(pmdp, 0);
+}
+#endif
+
+static void kvm_set_pud(pud_t *pudp, pud_t pud)
+{
+ kvm_mmu_write(pudp, pud_val(pud));
+}
+
+#if PAGETABLE_LEVELS == 4
+static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ kvm_mmu_write(pgdp, pgd_val(pgd));
+}
+#endif
+#endif /* PAGETABLE_LEVELS >= 3 */
+
+static void kvm_flush_tlb(void)
+{
+ struct kvm_mmu_op_flush_tlb ftlb = {
+ .header.op = KVM_MMU_OP_FLUSH_TLB,
+ };
+
+ kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
+}
+
+static void kvm_release_pt(u32 pfn)
+{
+ struct kvm_mmu_op_release_pt rpt = {
+ .header.op = KVM_MMU_OP_RELEASE_PT,
+ .pt_phys = (u64)pfn << PAGE_SHIFT,
+ };
+
+ kvm_mmu_op(&rpt, sizeof rpt);
+}
+
+static void kvm_enter_lazy_mmu(void)
+{
+ struct kvm_para_state *state = kvm_para_state();
+
+ paravirt_enter_lazy_mmu();
+ state->mode = paravirt_get_lazy_mode();
+}
+
+static void kvm_leave_lazy_mmu(void)
+{
+ struct kvm_para_state *state = kvm_para_state();
+
+ mmu_queue_flush(state);
+ paravirt_leave_lazy(paravirt_get_lazy_mode());
+ state->mode = paravirt_get_lazy_mode();
+}
+
+static void paravirt_ops_setup(void)
+{
+ pv_info.name = "KVM";
+ pv_info.paravirt_enabled = 1;
+
+ if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
+ pv_cpu_ops.io_delay = kvm_io_delay;
+
+ if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
+ pv_mmu_ops.set_pte = kvm_set_pte;
+ pv_mmu_ops.set_pte_at = kvm_set_pte_at;
+ pv_mmu_ops.set_pmd = kvm_set_pmd;
+#if PAGETABLE_LEVELS >= 3
+#ifdef CONFIG_X86_PAE
+ pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic;
+ pv_mmu_ops.set_pte_present = kvm_set_pte_present;
+ pv_mmu_ops.pte_clear = kvm_pte_clear;
+ pv_mmu_ops.pmd_clear = kvm_pmd_clear;
+#endif
+ pv_mmu_ops.set_pud = kvm_set_pud;
+#if PAGETABLE_LEVELS == 4
+ pv_mmu_ops.set_pgd = kvm_set_pgd;
+#endif
+#endif
+ pv_mmu_ops.flush_tlb_user = kvm_flush_tlb;
+ pv_mmu_ops.release_pte = kvm_release_pt;
+ pv_mmu_ops.release_pmd = kvm_release_pt;
+ pv_mmu_ops.release_pud = kvm_release_pt;
+
+ pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
+ pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
+ }
+}
+
+void __init kvm_guest_init(void)
+{
+ if (!kvm_para_available())
+ return;
+
+ paravirt_ops_setup();
+}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
new file mode 100644
index 000000000000..ddee04043aeb
--- /dev/null
+++ b/arch/x86/kernel/kvmclock.c
@@ -0,0 +1,187 @@
+/* KVM paravirtual clock driver. A clocksource implementation
+ Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include <linux/clocksource.h>
+#include <linux/kvm_para.h>
+#include <asm/arch_hooks.h>
+#include <asm/msr.h>
+#include <asm/apic.h>
+#include <linux/percpu.h>
+#include <asm/reboot.h>
+
+#define KVM_SCALE 22
+
+static int kvmclock = 1;
+
+static int parse_no_kvmclock(char *arg)
+{
+ kvmclock = 0;
+ return 0;
+}
+early_param("no-kvmclock", parse_no_kvmclock);
+
+/* The hypervisor will put information about time periodically here */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock);
+#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field
+
+static inline u64 kvm_get_delta(u64 last_tsc)
+{
+ int cpu = smp_processor_id();
+ u64 delta = native_read_tsc() - last_tsc;
+ return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE;
+}
+
+static struct kvm_wall_clock wall_clock;
+static cycle_t kvm_clock_read(void);
+/*
+ * The wallclock is the time of day when we booted. Since then, some time may
+ * have elapsed since the hypervisor wrote the data. So we try to account for
+ * that with system time
+ */
+unsigned long kvm_get_wallclock(void)
+{
+ u32 wc_sec, wc_nsec;
+ u64 delta;
+ struct timespec ts;
+ int version, nsec;
+ int low, high;
+
+ low = (int)__pa(&wall_clock);
+ high = ((u64)__pa(&wall_clock) >> 32);
+
+ delta = kvm_clock_read();
+
+ native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
+ do {
+ version = wall_clock.wc_version;
+ rmb();
+ wc_sec = wall_clock.wc_sec;
+ wc_nsec = wall_clock.wc_nsec;
+ rmb();
+ } while ((wall_clock.wc_version != version) || (version & 1));
+
+ delta = kvm_clock_read() - delta;
+ delta += wc_nsec;
+ nsec = do_div(delta, NSEC_PER_SEC);
+ set_normalized_timespec(&ts, wc_sec + delta, nsec);
+ /*
+ * Of all mechanisms of time adjustment I've tested, this one
+ * was the champion!
+ */
+ return ts.tv_sec + 1;
+}
+
+int kvm_set_wallclock(unsigned long now)
+{
+ return 0;
+}
+
+/*
+ * This is our read_clock function. The host puts an tsc timestamp each time
+ * it updates a new time. Without the tsc adjustment, we can have a situation
+ * in which a vcpu starts to run earlier (smaller system_time), but probes
+ * time later (compared to another vcpu), leading to backwards time
+ */
+static cycle_t kvm_clock_read(void)
+{
+ u64 last_tsc, now;
+ int cpu;
+
+ preempt_disable();
+ cpu = smp_processor_id();
+
+ last_tsc = get_clock(cpu, tsc_timestamp);
+ now = get_clock(cpu, system_time);
+
+ now += kvm_get_delta(last_tsc);
+ preempt_enable();
+
+ return now;
+}
+static struct clocksource kvm_clock = {
+ .name = "kvm-clock",
+ .read = kvm_clock_read,
+ .rating = 400,
+ .mask = CLOCKSOURCE_MASK(64),
+ .mult = 1 << KVM_SCALE,
+ .shift = KVM_SCALE,
+ .flags = CLOCK_SOURCE_IS_CONTINUOUS,
+};
+
+static int kvm_register_clock(void)
+{
+ int cpu = smp_processor_id();
+ int low, high;
+ low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
+ high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
+
+ return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
+}
+
+static void kvm_setup_secondary_clock(void)
+{
+ /*
+ * Now that the first cpu already had this clocksource initialized,
+ * we shouldn't fail.
+ */
+ WARN_ON(kvm_register_clock());
+ /* ok, done with our trickery, call native */
+ setup_secondary_APIC_clock();
+}
+
+/*
+ * After the clock is registered, the host will keep writing to the
+ * registered memory location. If the guest happens to shutdown, this memory
+ * won't be valid. In cases like kexec, in which you install a new kernel, this
+ * means a random memory location will be kept being written. So before any
+ * kind of shutdown from our side, we unregister the clock by writting anything
+ * that does not have the 'enable' bit set in the msr
+ */
+#ifdef CONFIG_KEXEC
+static void kvm_crash_shutdown(struct pt_regs *regs)
+{
+ native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+ native_machine_crash_shutdown(regs);
+}
+#endif
+
+static void kvm_shutdown(void)
+{
+ native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+ native_machine_shutdown();
+}
+
+void __init kvmclock_init(void)
+{
+ if (!kvm_para_available())
+ return;
+
+ if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
+ if (kvm_register_clock())
+ return;
+ pv_time_ops.get_wallclock = kvm_get_wallclock;
+ pv_time_ops.set_wallclock = kvm_set_wallclock;
+ pv_time_ops.sched_clock = kvm_clock_read;
+ pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
+ machine_ops.shutdown = kvm_shutdown;
+#ifdef CONFIG_KEXEC
+ machine_ops.crash_shutdown = kvm_crash_shutdown;
+#endif
+ clocksource_register(&kvm_clock);
+ }
+}
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 9482033ed0fe..2dc183758be3 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -53,9 +53,9 @@
#include <linux/init.h>
#include <asm/arch_hooks.h>
-static unsigned char which_scsi = 0;
+static unsigned char which_scsi;
-int MCA_bus = 0;
+int MCA_bus;
EXPORT_SYMBOL(MCA_bus);
/*
@@ -68,15 +68,17 @@ static DEFINE_SPINLOCK(mca_lock);
/* Build the status info for the adapter */
-static void mca_configure_adapter_status(struct mca_device *mca_dev) {
+static void mca_configure_adapter_status(struct mca_device *mca_dev)
+{
mca_dev->status = MCA_ADAPTER_NONE;
mca_dev->pos_id = mca_dev->pos[0]
+ (mca_dev->pos[1] << 8);
- if(!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) {
+ if (!mca_dev->pos_id && mca_dev->slot < MCA_MAX_SLOT_NR) {
- /* id = 0x0000 usually indicates hardware failure,
+ /*
+ * id = 0x0000 usually indicates hardware failure,
* however, ZP Gu (zpg@castle.net> reports that his 9556
* has 0x0000 as id and everything still works. There
* also seem to be an adapter with id = 0x0000; the
@@ -87,9 +89,10 @@ static void mca_configure_adapter_status(struct mca_device *mca_dev) {
mca_dev->status = MCA_ADAPTER_ERROR;
return;
- } else if(mca_dev->pos_id != 0xffff) {
+ } else if (mca_dev->pos_id != 0xffff) {
- /* 0xffff usually indicates that there's no adapter,
+ /*
+ * 0xffff usually indicates that there's no adapter,
* however, some integrated adapters may have 0xffff as
* their id and still be valid. Examples are on-board
* VGA of the 55sx, the integrated SCSI of the 56 & 57,
@@ -99,19 +102,19 @@ static void mca_configure_adapter_status(struct mca_device *mca_dev) {
mca_dev->status = MCA_ADAPTER_NORMAL;
}
- if((mca_dev->pos_id == 0xffff ||
+ if ((mca_dev->pos_id == 0xffff ||
mca_dev->pos_id == 0x0000) && mca_dev->slot >= MCA_MAX_SLOT_NR) {
int j;
- for(j = 2; j < 8; j++) {
- if(mca_dev->pos[j] != 0xff) {
+ for (j = 2; j < 8; j++) {
+ if (mca_dev->pos[j] != 0xff) {
mca_dev->status = MCA_ADAPTER_NORMAL;
break;
}
}
}
- if(!(mca_dev->pos[2] & MCA_ENABLED)) {
+ if (!(mca_dev->pos[2] & MCA_ENABLED)) {
/* enabled bit is in POS 2 */
@@ -133,7 +136,7 @@ static struct resource mca_standard_resources[] = {
#define MCA_STANDARD_RESOURCES ARRAY_SIZE(mca_standard_resources)
-/**
+/*
* mca_read_and_store_pos - read the POS registers into a memory buffer
* @pos: a char pointer to 8 bytes, contains the POS register value on
* successful return
@@ -141,12 +144,14 @@ static struct resource mca_standard_resources[] = {
* Returns 1 if a card actually exists (i.e. the pos isn't
* all 0xff) or 0 otherwise
*/
-static int mca_read_and_store_pos(unsigned char *pos) {
+static int mca_read_and_store_pos(unsigned char *pos)
+{
int j;
int found = 0;
- for(j=0; j<8; j++) {
- if((pos[j] = inb_p(MCA_POS_REG(j))) != 0xff) {
+ for (j = 0; j < 8; j++) {
+ pos[j] = inb_p(MCA_POS_REG(j));
+ if (pos[j] != 0xff) {
/* 0xff all across means no device. 0x00 means
* something's broken, but a device is
* probably there. However, if you get 0x00
@@ -167,11 +172,11 @@ static unsigned char mca_pc_read_pos(struct mca_device *mca_dev, int reg)
unsigned char byte;
unsigned long flags;
- if(reg < 0 || reg >= 8)
+ if (reg < 0 || reg >= 8)
return 0;
spin_lock_irqsave(&mca_lock, flags);
- if(mca_dev->pos_register) {
+ if (mca_dev->pos_register) {
/* Disable adapter setup, enable motherboard setup */
outb_p(0, MCA_ADAPTER_SETUP_REG);
@@ -203,7 +208,7 @@ static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
{
unsigned long flags;
- if(reg < 0 || reg >= 8)
+ if (reg < 0 || reg >= 8)
return;
spin_lock_irqsave(&mca_lock, flags);
@@ -227,17 +232,17 @@ static void mca_pc_write_pos(struct mca_device *mca_dev, int reg,
}
/* for the primary MCA bus, we have identity transforms */
-static int mca_dummy_transform_irq(struct mca_device * mca_dev, int irq)
+static int mca_dummy_transform_irq(struct mca_device *mca_dev, int irq)
{
return irq;
}
-static int mca_dummy_transform_ioport(struct mca_device * mca_dev, int port)
+static int mca_dummy_transform_ioport(struct mca_device *mca_dev, int port)
{
return port;
}
-static void *mca_dummy_transform_memory(struct mca_device * mca_dev, void *mem)
+static void *mca_dummy_transform_memory(struct mca_device *mca_dev, void *mem)
{
return mem;
}
@@ -251,7 +256,8 @@ static int __init mca_init(void)
short mca_builtin_scsi_ports[] = {0xf7, 0xfd, 0x00};
struct mca_bus *bus;
- /* WARNING: Be careful when making changes here. Putting an adapter
+ /*
+ * WARNING: Be careful when making changes here. Putting an adapter
* and the motherboard simultaneously into setup mode may result in
* damage to chips (according to The Indispensible PC Hardware Book
* by Hans-Peter Messmer). Also, we disable system interrupts (so
@@ -283,7 +289,7 @@ static int __init mca_init(void)
/* get the motherboard device */
mca_dev = kzalloc(sizeof(struct mca_device), GFP_KERNEL);
- if(unlikely(!mca_dev))
+ if (unlikely(!mca_dev))
goto out_nomem;
/*
@@ -309,7 +315,7 @@ static int __init mca_init(void)
mca_register_device(MCA_PRIMARY_BUS, mca_dev);
mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if(unlikely(!mca_dev))
+ if (unlikely(!mca_dev))
goto out_unlock_nomem;
/* Put motherboard into video setup mode, read integrated video
@@ -326,7 +332,8 @@ static int __init mca_init(void)
mca_dev->slot = MCA_INTEGVIDEO;
mca_register_device(MCA_PRIMARY_BUS, mca_dev);
- /* Put motherboard into scsi setup mode, read integrated scsi
+ /*
+ * Put motherboard into scsi setup mode, read integrated scsi
* POS registers, and turn motherboard setup off.
*
* It seems there are two possible SCSI registers. Martin says that
@@ -338,18 +345,18 @@ static int __init mca_init(void)
* machine.
*/
- for(i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
+ for (i = 0; (which_scsi = mca_builtin_scsi_ports[i]) != 0; i++) {
outb_p(which_scsi, MCA_MOTHERBOARD_SETUP_REG);
- if(mca_read_and_store_pos(pos))
+ if (mca_read_and_store_pos(pos))
break;
}
- if(which_scsi) {
+ if (which_scsi) {
/* found a scsi card */
mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if(unlikely(!mca_dev))
+ if (unlikely(!mca_dev))
goto out_unlock_nomem;
- for(j = 0; j < 8; j++)
+ for (j = 0; j < 8; j++)
mca_dev->pos[j] = pos[j];
mca_configure_adapter_status(mca_dev);
@@ -364,21 +371,22 @@ static int __init mca_init(void)
outb_p(0xff, MCA_MOTHERBOARD_SETUP_REG);
- /* Now loop over MCA slots: put each adapter into setup mode, and
+ /*
+ * Now loop over MCA slots: put each adapter into setup mode, and
* read its POS registers. Then put adapter setup off.
*/
- for(i=0; i<MCA_MAX_SLOT_NR; i++) {
+ for (i = 0; i < MCA_MAX_SLOT_NR; i++) {
outb_p(0x8|(i&0xf), MCA_ADAPTER_SETUP_REG);
- if(!mca_read_and_store_pos(pos))
+ if (!mca_read_and_store_pos(pos))
continue;
mca_dev = kzalloc(sizeof(struct mca_device), GFP_ATOMIC);
- if(unlikely(!mca_dev))
+ if (unlikely(!mca_dev))
goto out_unlock_nomem;
- for(j=0; j<8; j++)
- mca_dev->pos[j]=pos[j];
+ for (j = 0; j < 8; j++)
+ mca_dev->pos[j] = pos[j];
mca_dev->driver_loaded = 0;
mca_dev->slot = i;
@@ -414,20 +422,20 @@ mca_handle_nmi_device(struct mca_device *mca_dev, int check_flag)
{
int slot = mca_dev->slot;
- if(slot == MCA_INTEGSCSI) {
+ if (slot == MCA_INTEGSCSI) {
printk(KERN_CRIT "NMI: caused by MCA integrated SCSI adapter (%s)\n",
mca_dev->name);
- } else if(slot == MCA_INTEGVIDEO) {
+ } else if (slot == MCA_INTEGVIDEO) {
printk(KERN_CRIT "NMI: caused by MCA integrated video adapter (%s)\n",
mca_dev->name);
- } else if(slot == MCA_MOTHERBOARD) {
+ } else if (slot == MCA_MOTHERBOARD) {
printk(KERN_CRIT "NMI: caused by motherboard (%s)\n",
mca_dev->name);
}
/* More info available in POS 6 and 7? */
- if(check_flag) {
+ if (check_flag) {
unsigned char pos6, pos7;
pos6 = mca_device_read_pos(mca_dev, 6);
@@ -447,8 +455,9 @@ static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
pos5 = mca_device_read_pos(mca_dev, 5);
- if(!(pos5 & 0x80)) {
- /* Bit 7 of POS 5 is reset when this adapter has a hardware
+ if (!(pos5 & 0x80)) {
+ /*
+ * Bit 7 of POS 5 is reset when this adapter has a hardware
* error. Bit 7 it reset if there's error information
* available in POS 6 and 7.
*/
@@ -460,7 +469,8 @@ static int __kprobes mca_handle_nmi_callback(struct device *dev, void *data)
void __kprobes mca_handle_nmi(void)
{
- /* First try - scan the various adapters and see if a specific
+ /*
+ * First try - scan the various adapters and see if a specific
* adapter was responsible for the error.
*/
bus_for_each_dev(&mca_bus_type, NULL, NULL, mca_handle_nmi_callback);
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index b402c0f3f192..3cad17fe026b 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -63,7 +63,7 @@ static int __init mfgpt_fix(char *s)
/* The following udocumented bit resets the MFGPT timers */
val = 0xFF; dummy = 0;
- wrmsr(0x5140002B, val, dummy);
+ wrmsr(MSR_MFGPT_SETUP, val, dummy);
return 1;
}
__setup("mfgptfix", mfgpt_fix);
@@ -127,17 +127,17 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
* 6; that is, resets for 7 and 8 will be ignored. Is this
* a problem? -dilinger
*/
- msr = MFGPT_NR_MSR;
+ msr = MSR_MFGPT_NR;
mask = 1 << (timer + 24);
break;
case MFGPT_EVENT_NMI:
- msr = MFGPT_NR_MSR;
+ msr = MSR_MFGPT_NR;
mask = 1 << (timer + shift);
break;
case MFGPT_EVENT_IRQ:
- msr = MFGPT_IRQ_MSR;
+ msr = MSR_MFGPT_IRQ;
mask = 1 << (timer + shift);
break;
@@ -364,7 +364,8 @@ int __init mfgpt_timer_setup(void)
geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val);
/* Set up the clock event */
- mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC, 32);
+ mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC,
+ mfgpt_clockevent.shift);
mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF,
&mfgpt_clockevent);
mfgpt_clockevent.max_delta_ns = clockevent_delta2ns(0xFFFE,
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
index f2702d01b8a8..69729e38b78a 100644
--- a/arch/x86/kernel/microcode.c
+++ b/arch/x86/kernel/microcode.c
@@ -290,7 +290,7 @@ static int get_maching_microcode(void *mc, int cpu)
}
return 0;
find:
- pr_debug("microcode: CPU %d found a matching microcode update with"
+ pr_debug("microcode: CPU%d found a matching microcode update with"
" version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
new_mc = vmalloc(total_size);
if (!new_mc) {
@@ -336,11 +336,11 @@ static void apply_microcode(int cpu)
spin_unlock_irqrestore(&microcode_update_lock, flags);
if (val[1] != uci->mc->hdr.rev) {
- printk(KERN_ERR "microcode: CPU%d updated from revision "
+ printk(KERN_ERR "microcode: CPU%d update from revision "
"0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
return;
}
- pr_debug("microcode: CPU%d updated from revision "
+ printk(KERN_INFO "microcode: CPU%d updated from revision "
"0x%x to 0x%x, date = %08x \n",
cpu_num, uci->rev, val[1], uci->mc->hdr.date);
uci->rev = val[1];
@@ -402,7 +402,7 @@ static int do_microcode_update (void)
if (!uci->valid)
continue;
- set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
error = get_maching_microcode(new_mc, cpu);
if (error < 0)
goto out;
@@ -416,7 +416,7 @@ out:
vfree(new_mc);
if (cursor < 0)
error = cursor;
- set_cpus_allowed(current, old);
+ set_cpus_allowed_ptr(current, &old);
return error;
}
@@ -534,7 +534,7 @@ static int cpu_request_microcode(int cpu)
c->x86, c->x86_model, c->x86_mask);
error = request_firmware(&firmware, name, &microcode_pdev->dev);
if (error) {
- pr_debug("ucode data file %s load failed\n", name);
+ pr_debug("microcode: ucode data file %s load failed\n", name);
return error;
}
buf = firmware->data;
@@ -579,7 +579,7 @@ static int apply_microcode_check_cpu(int cpu)
return 0;
old = current->cpus_allowed;
- set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
/* Check if the microcode we have in memory matches the CPU */
if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
@@ -610,7 +610,7 @@ static int apply_microcode_check_cpu(int cpu)
" sig=0x%x, pf=0x%x, rev=0x%x\n",
cpu, uci->sig, uci->pf, uci->rev);
- set_cpus_allowed(current, old);
+ set_cpus_allowed_ptr(current, &old);
return err;
}
@@ -621,13 +621,13 @@ static void microcode_init_cpu(int cpu, int resume)
old = current->cpus_allowed;
- set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
mutex_lock(&microcode_mutex);
collect_cpu_info(cpu);
if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
cpu_request_microcode(cpu);
mutex_unlock(&microcode_mutex);
- set_cpus_allowed(current, old);
+ set_cpus_allowed_ptr(current, &old);
}
static void microcode_fini_cpu(int cpu)
@@ -657,14 +657,14 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
old = current->cpus_allowed;
get_online_cpus();
- set_cpus_allowed(current, cpumask_of_cpu(cpu));
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
mutex_lock(&microcode_mutex);
if (uci->valid)
err = cpu_request_microcode(cpu);
mutex_unlock(&microcode_mutex);
put_online_cpus();
- set_cpus_allowed(current, old);
+ set_cpus_allowed_ptr(current, &old);
}
if (err)
return err;
@@ -709,7 +709,7 @@ static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
if (!cpu_online(cpu))
return 0;
- pr_debug("Microcode:CPU %d added\n", cpu);
+ pr_debug("microcode: CPU%d added\n", cpu);
memset(uci, 0, sizeof(*uci));
err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
@@ -733,7 +733,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
if (!cpu_online(cpu))
return 0;
- pr_debug("Microcode:CPU %d removed\n", cpu);
+ pr_debug("microcode: CPU%d removed\n", cpu);
microcode_fini_cpu(cpu);
sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
return 0;
@@ -745,7 +745,7 @@ static int mc_sysdev_resume(struct sys_device *dev)
if (!cpu_online(cpu))
return 0;
- pr_debug("Microcode:CPU %d resumed\n", cpu);
+ pr_debug("microcode: CPU%d resumed\n", cpu);
/* only CPU 0 will apply ucode here */
apply_microcode(0);
return 0;
@@ -783,7 +783,7 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
}
case CPU_DOWN_FAILED_FROZEN:
if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
- printk(KERN_ERR "Microcode: Failed to create the sysfs "
+ printk(KERN_ERR "microcode: Failed to create the sysfs "
"group for CPU%d\n", cpu);
break;
case CPU_DOWN_PREPARE:
diff --git a/arch/x86/kernel/mpparse_32.c b/arch/x86/kernel/mpparse.c
index f349e68e45a0..3e2c54dc8b29 100644
--- a/arch/x86/kernel/mpparse_32.c
+++ b/arch/x86/kernel/mpparse.c
@@ -4,82 +4,56 @@
*
* (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
* (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
- *
- * Fixes
- * Erich Boleyn : MP v1.4 and additional changes.
- * Alan Cox : Added EBDA scanning
- * Ingo Molnar : various cleanups and rewrites
- * Maciej W. Rozycki: Bits for default MP configurations
- * Paul Diefenbaugh: Added full ACPI support
+ * (c) 2008 Alexey Starikovskiy <astarikovskiy@suse.de>
*/
#include <linux/mm.h>
#include <linux/init.h>
-#include <linux/acpi.h>
#include <linux/delay.h>
#include <linux/bootmem.h>
#include <linux/kernel_stat.h>
#include <linux/mc146818rtc.h>
#include <linux/bitops.h>
+#include <linux/acpi.h>
+#include <linux/module.h>
#include <asm/smp.h>
-#include <asm/acpi.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
+#include <asm/pgalloc.h>
#include <asm/io_apic.h>
+#include <asm/proto.h>
+#include <asm/acpi.h>
+#include <asm/bios_ebda.h>
#include <mach_apic.h>
+#ifdef CONFIG_X86_32
#include <mach_apicdef.h>
#include <mach_mpparse.h>
-#include <bios_ebda.h>
+#endif
/* Have we found an MP table */
int smp_found_config;
-unsigned int __cpuinitdata maxcpus = NR_CPUS;
/*
* Various Linux-internal data structures created from the
* MP-table.
*/
-int apic_version [MAX_APICS];
-int mp_bus_id_to_type [MAX_MP_BUSSES];
-int mp_bus_id_to_node [MAX_MP_BUSSES];
-int mp_bus_id_to_local [MAX_MP_BUSSES];
-int quad_local_to_mp_bus_id [NR_CPUS/4][4];
-int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
-static int mp_current_pci_id;
-
-/* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
-
-/* # of MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+int mp_bus_id_to_type[MAX_MP_BUSSES];
+#endif
-/* MP IRQ source entries */
-int mp_irq_entries;
+DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
+int mp_bus_id_to_pci_bus[MAX_MP_BUSSES] = {[0 ... MAX_MP_BUSSES - 1] = -1 };
-int nr_ioapics;
+static int mp_current_pci_id;
int pic_mode;
-unsigned long mp_lapic_addr;
-
-unsigned int def_to_bigsmp = 0;
-
-/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid = -1U;
-/* Internal processor count */
-unsigned int num_processors;
-
-/* Bitmask of physically existing CPUs */
-physid_mask_t phys_cpu_present_map;
-
-u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
/*
* Intel MP BIOS table parsing routines:
*/
-
/*
* Checksum an MP configuration block.
*/
@@ -94,216 +68,153 @@ static int __init mpf_checksum(unsigned char *mp, int len)
return sum & 0xFF;
}
+#ifdef CONFIG_X86_NUMAQ
/*
* Have to match translation table entries to main table entries by counter
* hence the mpc_record variable .... can't see a less disgusting way of
* doing this ....
*/
-static int mpc_record;
-static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] __cpuinitdata;
+static int mpc_record;
+static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
+ __cpuinitdata;
+#endif
-static void __cpuinit MP_processor_info (struct mpc_config_processor *m)
+static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
{
- int ver, apicid;
- physid_mask_t phys_cpu;
-
- if (!(m->mpc_cpuflag & CPU_ENABLED))
- return;
+ int apicid;
+ char *bootup_cpu = "";
+ if (!(m->mpc_cpuflag & CPU_ENABLED)) {
+ disabled_cpus++;
+ return;
+ }
+#ifdef CONFIG_X86_NUMAQ
apicid = mpc_apic_id(m, translation_table[mpc_record]);
-
- if (m->mpc_featureflag&(1<<0))
- Dprintk(" Floating point unit present.\n");
- if (m->mpc_featureflag&(1<<7))
- Dprintk(" Machine Exception supported.\n");
- if (m->mpc_featureflag&(1<<8))
- Dprintk(" 64 bit compare & exchange supported.\n");
- if (m->mpc_featureflag&(1<<9))
- Dprintk(" Internal APIC present.\n");
- if (m->mpc_featureflag&(1<<11))
- Dprintk(" SEP present.\n");
- if (m->mpc_featureflag&(1<<12))
- Dprintk(" MTRR present.\n");
- if (m->mpc_featureflag&(1<<13))
- Dprintk(" PGE present.\n");
- if (m->mpc_featureflag&(1<<14))
- Dprintk(" MCA present.\n");
- if (m->mpc_featureflag&(1<<15))
- Dprintk(" CMOV present.\n");
- if (m->mpc_featureflag&(1<<16))
- Dprintk(" PAT present.\n");
- if (m->mpc_featureflag&(1<<17))
- Dprintk(" PSE present.\n");
- if (m->mpc_featureflag&(1<<18))
- Dprintk(" PSN present.\n");
- if (m->mpc_featureflag&(1<<19))
- Dprintk(" Cache Line Flush Instruction present.\n");
- /* 20 Reserved */
- if (m->mpc_featureflag&(1<<21))
- Dprintk(" Debug Trace and EMON Store present.\n");
- if (m->mpc_featureflag&(1<<22))
- Dprintk(" ACPI Thermal Throttle Registers present.\n");
- if (m->mpc_featureflag&(1<<23))
- Dprintk(" MMX present.\n");
- if (m->mpc_featureflag&(1<<24))
- Dprintk(" FXSR present.\n");
- if (m->mpc_featureflag&(1<<25))
- Dprintk(" XMM present.\n");
- if (m->mpc_featureflag&(1<<26))
- Dprintk(" Willamette New Instructions present.\n");
- if (m->mpc_featureflag&(1<<27))
- Dprintk(" Self Snoop present.\n");
- if (m->mpc_featureflag&(1<<28))
- Dprintk(" HT present.\n");
- if (m->mpc_featureflag&(1<<29))
- Dprintk(" Thermal Monitor present.\n");
- /* 30, 31 Reserved */
-
-
+#else
+ apicid = m->mpc_apicid;
+#endif
if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
- Dprintk(" Bootup CPU\n");
+ bootup_cpu = " (Bootup-CPU)";
boot_cpu_physical_apicid = m->mpc_apicid;
}
- ver = m->mpc_apicver;
-
- /*
- * Validate version
- */
- if (ver == 0x0) {
- printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! "
- "fixing up to 0x10. (tell your hw vendor)\n",
- m->mpc_apicid);
- ver = 0x10;
- }
- apic_version[m->mpc_apicid] = ver;
-
- phys_cpu = apicid_to_cpu_present(apicid);
- physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
-
- if (num_processors >= NR_CPUS) {
- printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
- " Processor ignored.\n", NR_CPUS);
- return;
- }
-
- if (num_processors >= maxcpus) {
- printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
- " Processor ignored.\n", maxcpus);
- return;
- }
-
- cpu_set(num_processors, cpu_possible_map);
- num_processors++;
-
- /*
- * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
- * but we need to work other dependencies like SMP_SUSPEND etc
- * before this can be done without some confusion.
- * if (CPU_HOTPLUG_ENABLED || num_processors > 8)
- * - Ashok Raj <ashok.raj@intel.com>
- */
- if (num_processors > 8) {
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_INTEL:
- if (!APIC_XAPIC(ver)) {
- def_to_bigsmp = 0;
- break;
- }
- /* If P4 and above fall through */
- case X86_VENDOR_AMD:
- def_to_bigsmp = 1;
- }
- }
- bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
+ printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
+ generic_processor_info(apicid, m->mpc_apicver);
}
-static void __init MP_bus_info (struct mpc_config_bus *m)
+static void __init MP_bus_info(struct mpc_config_bus *m)
{
char str[7];
memcpy(str, m->mpc_bustype, 6);
str[6] = 0;
+#ifdef CONFIG_X86_NUMAQ
mpc_oem_bus_info(m, str, translation_table[mpc_record]);
+#else
+ Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
+#endif
#if MAX_MP_BUSSES < 256
if (m->mpc_busid >= MAX_MP_BUSSES) {
printk(KERN_WARNING "MP table busid value (%d) for bustype %s "
- " is too large, max. supported is %d\n",
- m->mpc_busid, str, MAX_MP_BUSSES - 1);
+ " is too large, max. supported is %d\n",
+ m->mpc_busid, str, MAX_MP_BUSSES - 1);
return;
}
#endif
- if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
+ if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) {
+ set_bit(m->mpc_busid, mp_bus_not_pci);
+#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
- } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA)-1) == 0) {
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
- } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI)-1) == 0) {
+#endif
+ } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
+#ifdef CONFIG_X86_NUMAQ
mpc_oem_pci_bus(m, translation_table[mpc_record]);
- mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
+#endif
+ clear_bit(m->mpc_busid, mp_bus_not_pci);
mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
mp_current_pci_id++;
- } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA)-1) == 0) {
+#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
+ } else if (strncmp(str, BUSTYPE_EISA, sizeof(BUSTYPE_EISA) - 1) == 0) {
+ mp_bus_id_to_type[m->mpc_busid] = MP_BUS_EISA;
+ } else if (strncmp(str, BUSTYPE_MCA, sizeof(BUSTYPE_MCA) - 1) == 0) {
mp_bus_id_to_type[m->mpc_busid] = MP_BUS_MCA;
- } else {
+#endif
+ } else
printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
+}
+
+#ifdef CONFIG_X86_IO_APIC
+
+static int bad_ioapic(unsigned long address)
+{
+ if (nr_ioapics >= MAX_IO_APICS) {
+ printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
+ "(found %d)\n", MAX_IO_APICS, nr_ioapics);
+ panic("Recompile kernel with bigger MAX_IO_APICS!\n");
+ }
+ if (!address) {
+ printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
+ " found in table, skipping!\n");
+ return 1;
}
+ return 0;
}
-static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
+static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
{
if (!(m->mpc_flags & MPC_APIC_USABLE))
return;
printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
- m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
- if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_CRIT "Max # of I/O APICs (%d) exceeded (found %d).\n",
- MAX_IO_APICS, nr_ioapics);
- panic("Recompile kernel with bigger MAX_IO_APICS!.\n");
- }
- if (!m->mpc_apicaddr) {
- printk(KERN_ERR "WARNING: bogus zero I/O APIC address"
- " found in MP table, skipping!\n");
+ m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
+
+ if (bad_ioapic(m->mpc_apicaddr))
return;
- }
+
mp_ioapics[nr_ioapics] = *m;
nr_ioapics++;
}
-static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
+static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
{
- mp_irqs [mp_irq_entries] = *m;
+ mp_irqs[mp_irq_entries] = *m;
Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
" IRQ %02x, APIC ID %x, APIC INT %02x\n",
- m->mpc_irqtype, m->mpc_irqflag & 3,
- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
- m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
+ m->mpc_irqtype, m->mpc_irqflag & 3,
+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
+ m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
if (++mp_irq_entries == MAX_IRQ_SOURCES)
panic("Max # of irq sources exceeded!!\n");
}
-static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
+#endif
+
+static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
{
Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
" IRQ %02x, APIC ID %x, APIC LINT %02x\n",
- m->mpc_irqtype, m->mpc_irqflag & 3,
- (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
+ m->mpc_irqtype, m->mpc_irqflag & 3,
+ (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
+ m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
}
#ifdef CONFIG_X86_NUMAQ
-static void __init MP_translation_info (struct mpc_config_translation *m)
+static void __init MP_translation_info(struct mpc_config_translation *m)
{
- printk(KERN_INFO "Translation: record %d, type %d, quad %d, global %d, local %d\n", mpc_record, m->trans_type, m->trans_quad, m->trans_global, m->trans_local);
+ printk(KERN_INFO
+ "Translation: record %d, type %d, quad %d, global %d, local %d\n",
+ mpc_record, m->trans_type, m->trans_quad, m->trans_global,
+ m->trans_local);
- if (mpc_record >= MAX_MPC_ENTRY)
+ if (mpc_record >= MAX_MPC_ENTRY)
printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
else
- translation_table[mpc_record] = m; /* stash this for later */
+ translation_table[mpc_record] = m; /* stash this for later */
if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
node_set_online(m->trans_quad);
}
@@ -312,118 +223,124 @@ static void __init MP_translation_info (struct mpc_config_translation *m)
* Read/parse the MPC oem tables
*/
-static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, \
- unsigned short oemsize)
+static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
+ unsigned short oemsize)
{
- int count = sizeof (*oemtable); /* the header size */
- unsigned char *oemptr = ((unsigned char *)oemtable)+count;
-
+ int count = sizeof(*oemtable); /* the header size */
+ unsigned char *oemptr = ((unsigned char *)oemtable) + count;
+
mpc_record = 0;
- printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", oemtable);
- if (memcmp(oemtable->oem_signature,MPC_OEM_SIGNATURE,4))
- {
- printk(KERN_WARNING "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
- oemtable->oem_signature[0],
- oemtable->oem_signature[1],
- oemtable->oem_signature[2],
- oemtable->oem_signature[3]);
+ printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
+ oemtable);
+ if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
+ printk(KERN_WARNING
+ "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
+ oemtable->oem_signature[0], oemtable->oem_signature[1],
+ oemtable->oem_signature[2], oemtable->oem_signature[3]);
return;
}
- if (mpf_checksum((unsigned char *)oemtable,oemtable->oem_length))
- {
+ if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
return;
}
while (count < oemtable->oem_length) {
switch (*oemptr) {
- case MP_TRANSLATION:
+ case MP_TRANSLATION:
{
- struct mpc_config_translation *m=
- (struct mpc_config_translation *)oemptr;
+ struct mpc_config_translation *m =
+ (struct mpc_config_translation *)oemptr;
MP_translation_info(m);
oemptr += sizeof(*m);
count += sizeof(*m);
++mpc_record;
break;
}
- default:
+ default:
{
- printk(KERN_WARNING "Unrecognised OEM table entry type! - %d\n", (int) *oemptr);
+ printk(KERN_WARNING
+ "Unrecognised OEM table entry type! - %d\n",
+ (int)*oemptr);
return;
}
}
- }
+ }
}
static inline void mps_oem_check(struct mp_config_table *mpc, char *oem,
- char *productid)
+ char *productid)
{
if (strncmp(oem, "IBM NUMA", 8))
printk("Warning! May not be a NUMA-Q system!\n");
if (mpc->mpc_oemptr)
- smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
- mpc->mpc_oemsize);
+ smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
+ mpc->mpc_oemsize);
}
-#endif /* CONFIG_X86_NUMAQ */
+#endif /* CONFIG_X86_NUMAQ */
/*
* Read/parse the MPC
*/
-static int __init smp_read_mpc(struct mp_config_table *mpc)
+static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
{
char str[16];
char oem[10];
- int count=sizeof(*mpc);
- unsigned char *mpt=((unsigned char *)mpc)+count;
+ int count = sizeof(*mpc);
+ unsigned char *mpt = ((unsigned char *)mpc) + count;
- if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
- printk(KERN_ERR "SMP mptable: bad signature [0x%x]!\n",
- *(u32 *)mpc->mpc_signature);
+ if (memcmp(mpc->mpc_signature, MPC_SIGNATURE, 4)) {
+ printk(KERN_ERR "MPTABLE: bad signature [%c%c%c%c]!\n",
+ mpc->mpc_signature[0], mpc->mpc_signature[1],
+ mpc->mpc_signature[2], mpc->mpc_signature[3]);
return 0;
}
- if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
- printk(KERN_ERR "SMP mptable: checksum error!\n");
+ if (mpf_checksum((unsigned char *)mpc, mpc->mpc_length)) {
+ printk(KERN_ERR "MPTABLE: checksum error!\n");
return 0;
}
- if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
- printk(KERN_ERR "SMP mptable: bad table version (%d)!!\n",
- mpc->mpc_spec);
+ if (mpc->mpc_spec != 0x01 && mpc->mpc_spec != 0x04) {
+ printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
+ mpc->mpc_spec);
return 0;
}
if (!mpc->mpc_lapic) {
- printk(KERN_ERR "SMP mptable: null local APIC address!\n");
+ printk(KERN_ERR "MPTABLE: null local APIC address!\n");
return 0;
}
- memcpy(oem,mpc->mpc_oem,8);
- oem[8]=0;
- printk(KERN_INFO "OEM ID: %s ",oem);
+ memcpy(oem, mpc->mpc_oem, 8);
+ oem[8] = 0;
+ printk(KERN_INFO "MPTABLE: OEM ID: %s ", oem);
- memcpy(str,mpc->mpc_productid,12);
- str[12]=0;
- printk("Product ID: %s ",str);
+ memcpy(str, mpc->mpc_productid, 12);
+ str[12] = 0;
+ printk("Product ID: %s ", str);
+#ifdef CONFIG_X86_32
mps_oem_check(mpc, oem, str);
+#endif
+ printk(KERN_INFO "MPTABLE: Product ID: %s ", str);
- printk("APIC at: 0x%X\n", mpc->mpc_lapic);
+ printk(KERN_INFO "MPTABLE: APIC at: 0x%X\n", mpc->mpc_lapic);
- /*
- * Save the local APIC address (it might be non-default) -- but only
- * if we're not using ACPI.
- */
+ /* save the local APIC address, it might be non-default */
if (!acpi_lapic)
mp_lapic_addr = mpc->mpc_lapic;
+ if (early)
+ return 1;
+
/*
- * Now process the configuration blocks.
+ * Now process the configuration blocks.
*/
+#ifdef CONFIG_X86_NUMAQ
mpc_record = 0;
+#endif
while (count < mpc->mpc_length) {
- switch(*mpt) {
- case MP_PROCESSOR:
+ switch (*mpt) {
+ case MP_PROCESSOR:
{
- struct mpc_config_processor *m=
- (struct mpc_config_processor *)mpt;
+ struct mpc_config_processor *m =
+ (struct mpc_config_processor *)mpt;
/* ACPI may have already provided this data */
if (!acpi_lapic)
MP_processor_info(m);
@@ -431,57 +348,68 @@ static int __init smp_read_mpc(struct mp_config_table *mpc)
count += sizeof(*m);
break;
}
- case MP_BUS:
+ case MP_BUS:
{
- struct mpc_config_bus *m=
- (struct mpc_config_bus *)mpt;
+ struct mpc_config_bus *m =
+ (struct mpc_config_bus *)mpt;
MP_bus_info(m);
mpt += sizeof(*m);
count += sizeof(*m);
break;
}
- case MP_IOAPIC:
+ case MP_IOAPIC:
{
- struct mpc_config_ioapic *m=
- (struct mpc_config_ioapic *)mpt;
+#ifdef CONFIG_X86_IO_APIC
+ struct mpc_config_ioapic *m =
+ (struct mpc_config_ioapic *)mpt;
MP_ioapic_info(m);
- mpt+=sizeof(*m);
- count+=sizeof(*m);
+#endif
+ mpt += sizeof(struct mpc_config_ioapic);
+ count += sizeof(struct mpc_config_ioapic);
break;
}
- case MP_INTSRC:
+ case MP_INTSRC:
{
- struct mpc_config_intsrc *m=
- (struct mpc_config_intsrc *)mpt;
+#ifdef CONFIG_X86_IO_APIC
+ struct mpc_config_intsrc *m =
+ (struct mpc_config_intsrc *)mpt;
MP_intsrc_info(m);
- mpt+=sizeof(*m);
- count+=sizeof(*m);
+#endif
+ mpt += sizeof(struct mpc_config_intsrc);
+ count += sizeof(struct mpc_config_intsrc);
break;
}
- case MP_LINTSRC:
+ case MP_LINTSRC:
{
- struct mpc_config_lintsrc *m=
- (struct mpc_config_lintsrc *)mpt;
+ struct mpc_config_lintsrc *m =
+ (struct mpc_config_lintsrc *)mpt;
MP_lintsrc_info(m);
- mpt+=sizeof(*m);
- count+=sizeof(*m);
- break;
- }
- default:
- {
- count = mpc->mpc_length;
+ mpt += sizeof(*m);
+ count += sizeof(*m);
break;
}
+ default:
+ /* wrong mptable */
+ printk(KERN_ERR "Your mptable is wrong, contact your HW vendor!\n");
+ printk(KERN_ERR "type %x\n", *mpt);
+ print_hex_dump(KERN_ERR, " ", DUMP_PREFIX_ADDRESS, 16,
+ 1, mpc, mpc->mpc_length, 1);
+ count = mpc->mpc_length;
+ break;
}
+#ifdef CONFIG_X86_NUMAQ
++mpc_record;
+#endif
}
setup_apic_routing();
if (!num_processors)
- printk(KERN_ERR "SMP mptable: no processors registered!\n");
+ printk(KERN_ERR "MPTABLE: no processors registered!\n");
return num_processors;
}
+#ifdef CONFIG_X86_IO_APIC
+
static int __init ELCR_trigger(unsigned int irq)
{
unsigned int port;
@@ -497,7 +425,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
int ELCR_fallback = 0;
intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqflag = 0; /* conforming */
+ intsrc.mpc_irqflag = 0; /* conforming */
intsrc.mpc_srcbus = 0;
intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
@@ -512,12 +440,16 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
* If it does, we assume it's valid.
*/
if (mpc_default_type == 5) {
- printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
+ printk(KERN_INFO "ISA/PCI bus type with no IRQ information... "
+ "falling back to ELCR\n");
- if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
- printk(KERN_WARNING "ELCR contains invalid data... not using ELCR\n");
+ if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) ||
+ ELCR_trigger(13))
+ printk(KERN_ERR "ELCR contains invalid data... "
+ "not using ELCR\n");
else {
- printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
+ printk(KERN_INFO
+ "Using ELCR to identify PCI interrupts\n");
ELCR_fallback = 1;
}
}
@@ -546,21 +478,25 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
}
intsrc.mpc_srcbusirq = i;
- intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
+ intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
MP_intsrc_info(&intsrc);
}
intsrc.mpc_irqtype = mp_ExtINT;
intsrc.mpc_srcbusirq = 0;
- intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
+ intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
MP_intsrc_info(&intsrc);
}
+#endif
+
static inline void __init construct_default_ISA_mptable(int mpc_default_type)
{
struct mpc_config_processor processor;
struct mpc_config_bus bus;
+#ifdef CONFIG_X86_IO_APIC
struct mpc_config_ioapic ioapic;
+#endif
struct mpc_config_lintsrc lintsrc;
int linttypes[2] = { mp_ExtINT, mp_NMI };
int i;
@@ -578,8 +514,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
processor.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
processor.mpc_cpuflag = CPU_ENABLED;
processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
- (boot_cpu_data.x86_model << 4) |
- boot_cpu_data.x86_mask;
+ (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
processor.mpc_reserved[0] = 0;
processor.mpc_reserved[1] = 0;
@@ -591,23 +526,22 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
bus.mpc_type = MP_BUS;
bus.mpc_busid = 0;
switch (mpc_default_type) {
- default:
- printk("???\n");
- printk(KERN_ERR "Unknown standard configuration %d\n",
- mpc_default_type);
- /* fall through */
- case 1:
- case 5:
- memcpy(bus.mpc_bustype, "ISA ", 6);
- break;
- case 2:
- case 6:
- case 3:
- memcpy(bus.mpc_bustype, "EISA ", 6);
- break;
- case 4:
- case 7:
- memcpy(bus.mpc_bustype, "MCA ", 6);
+ default:
+ printk(KERN_ERR "???\nUnknown standard configuration %d\n",
+ mpc_default_type);
+ /* fall through */
+ case 1:
+ case 5:
+ memcpy(bus.mpc_bustype, "ISA ", 6);
+ break;
+ case 2:
+ case 6:
+ case 3:
+ memcpy(bus.mpc_bustype, "EISA ", 6);
+ break;
+ case 4:
+ case 7:
+ memcpy(bus.mpc_bustype, "MCA ", 6);
}
MP_bus_info(&bus);
if (mpc_default_type > 4) {
@@ -616,6 +550,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
MP_bus_info(&bus);
}
+#ifdef CONFIG_X86_IO_APIC
ioapic.mpc_type = MP_IOAPIC;
ioapic.mpc_apicid = 2;
ioapic.mpc_apicver = mpc_default_type > 4 ? 0x10 : 0x01;
@@ -627,9 +562,9 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
* We set up most of the low 16 IO-APIC pins according to MPS rules.
*/
construct_default_ioirq_mptable(mpc_default_type);
-
+#endif
lintsrc.mpc_type = MP_LINTSRC;
- lintsrc.mpc_irqflag = 0; /* conforming */
+ lintsrc.mpc_irqflag = 0; /* conforming */
lintsrc.mpc_srcbusid = 0;
lintsrc.mpc_srcbusirq = 0;
lintsrc.mpc_destapic = MP_APIC_ALL;
@@ -645,36 +580,49 @@ static struct intel_mp_floating *mpf_found;
/*
* Scan the memory blocks for an SMP configuration block.
*/
-void __init get_smp_config (void)
+static void __init __get_smp_config(unsigned early)
{
struct intel_mp_floating *mpf = mpf_found;
+ if (acpi_lapic && early)
+ return;
/*
- * ACPI supports both logical (e.g. Hyper-Threading) and physical
+ * ACPI supports both logical (e.g. Hyper-Threading) and physical
* processors, where MPS only supports physical.
*/
if (acpi_lapic && acpi_ioapic) {
- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
+ printk(KERN_INFO "Using ACPI (MADT) for SMP configuration "
+ "information\n");
return;
- }
- else if (acpi_lapic)
- printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
-
- printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
- if (mpf->mpf_feature2 & (1<<7)) {
+ } else if (acpi_lapic)
+ printk(KERN_INFO "Using ACPI for processor (LAPIC) "
+ "configuration information\n");
+
+ printk(KERN_INFO "Intel MultiProcessor Specification v1.%d\n",
+ mpf->mpf_specification);
+#ifdef CONFIG_X86_32
+ if (mpf->mpf_feature2 & (1 << 7)) {
printk(KERN_INFO " IMCR and PIC compatibility mode.\n");
pic_mode = 1;
} else {
printk(KERN_INFO " Virtual Wire compatibility mode.\n");
pic_mode = 0;
}
-
+#endif
/*
* Now see if we need to read further.
*/
if (mpf->mpf_feature1 != 0) {
+ if (early) {
+ /*
+ * local APIC has default address
+ */
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+ return;
+ }
- printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
+ printk(KERN_INFO "Default MP configuration #%d\n",
+ mpf->mpf_feature1);
construct_default_ISA_mptable(mpf->mpf_feature1);
} else if (mpf->mpf_physptr) {
@@ -683,12 +631,18 @@ void __init get_smp_config (void)
* Read the physical hardware table. Anything here will
* override the defaults.
*/
- if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
+ if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr), early)) {
smp_found_config = 0;
- printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
- printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
+ printk(KERN_ERR
+ "BIOS bug, MP table errors detected!...\n");
+ printk(KERN_ERR "... disabling SMP support. "
+ "(tell your hw vendor)\n");
return;
}
+
+ if (early)
+ return;
+#ifdef CONFIG_X86_IO_APIC
/*
* If there are no explicit MP IRQ entries, then we are
* broken. We set up most of the low 16 IO-APIC pins to
@@ -697,7 +651,9 @@ void __init get_smp_config (void)
if (!mp_irq_entries) {
struct mpc_config_bus bus;
- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
+ printk(KERN_ERR "BIOS bug, no explicit IRQ entries, "
+ "using default mptable. "
+ "(tell your hw vendor)\n");
bus.mpc_type = MP_BUS;
bus.mpc_busid = 0;
@@ -706,36 +662,49 @@ void __init get_smp_config (void)
construct_default_ioirq_mptable(0);
}
-
+#endif
} else
BUG();
- printk(KERN_INFO "Processors: %d\n", num_processors);
+ if (!early)
+ printk(KERN_INFO "Processors: %d\n", num_processors);
/*
* Only use the first configuration found.
*/
}
-static int __init smp_scan_config (unsigned long base, unsigned long length)
+void __init early_get_smp_config(void)
{
- unsigned long *bp = phys_to_virt(base);
+ __get_smp_config(1);
+}
+
+void __init get_smp_config(void)
+{
+ __get_smp_config(0);
+}
+
+static int __init smp_scan_config(unsigned long base, unsigned long length,
+ unsigned reserve)
+{
+ unsigned int *bp = phys_to_virt(base);
struct intel_mp_floating *mpf;
- printk(KERN_INFO "Scan SMP from %p for %ld bytes.\n", bp,length);
- if (sizeof(*mpf) != 16)
- printk("Error: MPF size\n");
+ Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length);
+ BUILD_BUG_ON(sizeof(*mpf) != 16);
while (length > 0) {
mpf = (struct intel_mp_floating *)bp;
if ((*bp == SMP_MAGIC_IDENT) &&
- (mpf->mpf_length == 1) &&
- !mpf_checksum((unsigned char *)bp, 16) &&
- ((mpf->mpf_specification == 1)
- || (mpf->mpf_specification == 4)) ) {
+ (mpf->mpf_length == 1) &&
+ !mpf_checksum((unsigned char *)bp, 16) &&
+ ((mpf->mpf_specification == 1)
+ || (mpf->mpf_specification == 4))) {
smp_found_config = 1;
+ mpf_found = mpf;
+#ifdef CONFIG_X86_32
printk(KERN_INFO "found SMP MP-table at [%p] %08lx\n",
- mpf, virt_to_phys(mpf));
+ mpf, virt_to_phys(mpf));
reserve_bootmem(virt_to_phys(mpf), PAGE_SIZE,
BOOTMEM_DEFAULT);
if (mpf->mpf_physptr) {
@@ -756,8 +725,16 @@ static int __init smp_scan_config (unsigned long base, unsigned long length)
BOOTMEM_DEFAULT);
}
- mpf_found = mpf;
- return 1;
+#else
+ if (!reserve)
+ return 1;
+
+ reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
+ if (mpf->mpf_physptr)
+ reserve_bootmem_generic(mpf->mpf_physptr,
+ PAGE_SIZE);
+#endif
+ return 1;
}
bp += 4;
length -= 16;
@@ -765,7 +742,7 @@ static int __init smp_scan_config (unsigned long base, unsigned long length)
return 0;
}
-void __init find_smp_config (void)
+static void __init __find_smp_config(unsigned reserve)
{
unsigned int address;
@@ -777,9 +754,9 @@ void __init find_smp_config (void)
* 2) Scan the top 1K of base RAM
* 3) Scan the 64K of bios
*/
- if (smp_scan_config(0x0,0x400) ||
- smp_scan_config(639*0x400,0x400) ||
- smp_scan_config(0xF0000,0x10000))
+ if (smp_scan_config(0x0, 0x400, reserve) ||
+ smp_scan_config(639 * 0x400, 0x400, reserve) ||
+ smp_scan_config(0xF0000, 0x10000, reserve))
return;
/*
* If it is an SMP machine we should know now, unless the
@@ -800,144 +777,112 @@ void __init find_smp_config (void)
address = get_bios_ebda();
if (address)
- smp_scan_config(address, 0x400);
+ smp_scan_config(address, 0x400, reserve);
}
-int es7000_plat;
-
-/* --------------------------------------------------------------------------
- ACPI-based MP Configuration
- -------------------------------------------------------------------------- */
-
-#ifdef CONFIG_ACPI
-
-void __init mp_register_lapic_address(u64 address)
+void __init early_find_smp_config(void)
{
- mp_lapic_addr = (unsigned long) address;
-
- set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
-
- if (boot_cpu_physical_apicid == -1U)
- boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
-
- Dprintk("Boot CPU = %d\n", boot_cpu_physical_apicid);
+ __find_smp_config(0);
}
-void __cpuinit mp_register_lapic (u8 id, u8 enabled)
+void __init find_smp_config(void)
{
- struct mpc_config_processor processor;
- int boot_cpu = 0;
-
- if (MAX_APICS - id <= 0) {
- printk(KERN_WARNING "Processor #%d invalid (max %d)\n",
- id, MAX_APICS);
- return;
- }
-
- if (id == boot_cpu_physical_apicid)
- boot_cpu = 1;
+ __find_smp_config(1);
+}
- processor.mpc_type = MP_PROCESSOR;
- processor.mpc_apicid = id;
- processor.mpc_apicver = GET_APIC_VERSION(apic_read(APIC_LVR));
- processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
- processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
- processor.mpc_cpufeature = (boot_cpu_data.x86 << 8) |
- (boot_cpu_data.x86_model << 4) | boot_cpu_data.x86_mask;
- processor.mpc_featureflag = boot_cpu_data.x86_capability[0];
- processor.mpc_reserved[0] = 0;
- processor.mpc_reserved[1] = 0;
+/* --------------------------------------------------------------------------
+ ACPI-based MP Configuration
+ -------------------------------------------------------------------------- */
- MP_processor_info(&processor);
-}
+#ifdef CONFIG_ACPI
#ifdef CONFIG_X86_IO_APIC
#define MP_ISA_BUS 0
-#define MP_MAX_IOAPIC_PIN 127
-static struct mp_ioapic_routing {
- int apic_id;
- int gsi_base;
- int gsi_end;
- u32 pin_programmed[4];
-} mp_ioapic_routing[MAX_IO_APICS];
+extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS];
-static int mp_find_ioapic (int gsi)
+static int mp_find_ioapic(int gsi)
{
int i = 0;
/* Find the IOAPIC that manages this GSI. */
for (i = 0; i < nr_ioapics; i++) {
if ((gsi >= mp_ioapic_routing[i].gsi_base)
- && (gsi <= mp_ioapic_routing[i].gsi_end))
+ && (gsi <= mp_ioapic_routing[i].gsi_end))
return i;
}
printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
-
return -1;
}
-void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
+static u8 __init uniq_ioapic_id(u8 id)
+{
+#ifdef CONFIG_X86_32
+ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ return io_apic_get_unique_id(nr_ioapics, id);
+ else
+ return id;
+#else
+ int i;
+ DECLARE_BITMAP(used, 256);
+ bitmap_zero(used, 256);
+ for (i = 0; i < nr_ioapics; i++) {
+ struct mpc_config_ioapic *ia = &mp_ioapics[i];
+ __set_bit(ia->mpc_apicid, used);
+ }
+ if (!test_bit(id, used))
+ return id;
+ return find_first_zero_bit(used, 256);
+#endif
+}
+
+void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
{
int idx = 0;
- int tmpid;
- if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
- "(found %d)\n", MAX_IO_APICS, nr_ioapics);
- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
- }
- if (!address) {
- printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
- " found in MADT table, skipping!\n");
+ if (bad_ioapic(address))
return;
- }
- idx = nr_ioapics++;
+ idx = nr_ioapics;
mp_ioapics[idx].mpc_type = MP_IOAPIC;
mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
mp_ioapics[idx].mpc_apicaddr = address;
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
- if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- && !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- tmpid = io_apic_get_unique_id(idx, id);
- else
- tmpid = id;
- if (tmpid == -1) {
- nr_ioapics--;
- return;
- }
- mp_ioapics[idx].mpc_apicid = tmpid;
+ mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
+#ifdef CONFIG_X86_32
mp_ioapics[idx].mpc_apicver = io_apic_get_version(idx);
-
- /*
+#else
+ mp_ioapics[idx].mpc_apicver = 0;
+#endif
+ /*
* Build basic GSI lookup table to facilitate gsi->io_apic lookups
* and to prevent reprogramming of IOAPIC pins (PCI GSIs).
*/
mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
mp_ioapic_routing[idx].gsi_base = gsi_base;
mp_ioapic_routing[idx].gsi_end = gsi_base +
- io_apic_get_redir_entries(idx);
+ io_apic_get_redir_entries(idx);
- printk("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
+ printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
"GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
mp_ioapics[idx].mpc_apicver, mp_ioapics[idx].mpc_apicaddr,
- mp_ioapic_routing[idx].gsi_base,
- mp_ioapic_routing[idx].gsi_end);
+ mp_ioapic_routing[idx].gsi_base, mp_ioapic_routing[idx].gsi_end);
+
+ nr_ioapics++;
}
-void __init
-mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
+void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
{
struct mpc_config_intsrc intsrc;
- int ioapic = -1;
- int pin = -1;
+ int ioapic = -1;
+ int pin = -1;
- /*
+ /*
* Convert 'gsi' to 'ioapic.pin'.
*/
ioapic = mp_find_ioapic(gsi);
@@ -947,7 +892,7 @@ mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
/*
* TBD: This check is for faulty timer entries, where the override
- * erroneously sets the trigger to level, resulting in a HUGE
+ * erroneously sets the trigger to level, resulting in a HUGE
* increase of timer interrupts!
*/
if ((bus_irq == 0) && (trigger == 3))
@@ -957,30 +902,28 @@ mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
intsrc.mpc_irqtype = mp_INT;
intsrc.mpc_irqflag = (trigger << 2) | polarity;
intsrc.mpc_srcbus = MP_ISA_BUS;
- intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
- intsrc.mpc_dstirq = pin; /* INTIN# */
+ intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
+ intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
+ intsrc.mpc_dstirq = pin; /* INTIN# */
- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
- intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
-
- mp_irqs[mp_irq_entries] = intsrc;
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!\n");
+ MP_intsrc_info(&intsrc);
}
-void __init mp_config_acpi_legacy_irqs (void)
+int es7000_plat;
+
+void __init mp_config_acpi_legacy_irqs(void)
{
struct mpc_config_intsrc intsrc;
int i = 0;
int ioapic = -1;
- /*
+#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
+ /*
* Fabricate the legacy ISA bus (bus #31).
*/
mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
+#endif
+ set_bit(MP_ISA_BUS, mp_bus_not_pci);
Dprintk("Bus #%d is ISA\n", MP_ISA_BUS);
/*
@@ -989,19 +932,20 @@ void __init mp_config_acpi_legacy_irqs (void)
if (es7000_plat == 1)
return;
- /*
- * Locate the IOAPIC that manages the ISA IRQs (0-15).
+ /*
+ * Locate the IOAPIC that manages the ISA IRQs (0-15).
*/
ioapic = mp_find_ioapic(0);
if (ioapic < 0)
return;
intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqflag = 0; /* Conforming */
+ intsrc.mpc_irqflag = 0; /* Conforming */
intsrc.mpc_srcbus = MP_ISA_BUS;
+#ifdef CONFIG_X86_IO_APIC
intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
-
- /*
+#endif
+ /*
* Use the default configuration for the IRQs 0-15. Unless
* overridden by (MADT) interrupt source override entries.
*/
@@ -1012,51 +956,49 @@ void __init mp_config_acpi_legacy_irqs (void)
struct mpc_config_intsrc *irq = mp_irqs + idx;
/* Do we already have a mapping for this ISA IRQ? */
- if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
+ if (irq->mpc_srcbus == MP_ISA_BUS
+ && irq->mpc_srcbusirq == i)
break;
/* Do we already have a mapping for this IOAPIC pin */
if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
- (irq->mpc_dstirq == i))
+ (irq->mpc_dstirq == i))
break;
}
if (idx != mp_irq_entries) {
printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
- continue; /* IRQ already used */
+ continue; /* IRQ already used */
}
intsrc.mpc_irqtype = mp_INT;
- intsrc.mpc_srcbusirq = i; /* Identity mapped */
+ intsrc.mpc_srcbusirq = i; /* Identity mapped */
intsrc.mpc_dstirq = i;
- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
- "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
- intsrc.mpc_dstirq);
-
- mp_irqs[mp_irq_entries] = intsrc;
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!\n");
+ MP_intsrc_info(&intsrc);
}
}
+int mp_register_gsi(u32 gsi, int triggering, int polarity)
+{
+ int ioapic;
+ int ioapic_pin;
+#ifdef CONFIG_X86_32
#define MAX_GSI_NUM 4096
#define IRQ_COMPRESSION_START 64
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
-{
- int ioapic = -1;
- int ioapic_pin = 0;
- int idx, bit = 0;
static int pci_irq = IRQ_COMPRESSION_START;
/*
* Mapping between Global System Interrupts, which
* represent all possible interrupts, and IRQs
* assigned to actual devices.
*/
- static int gsi_to_irq[MAX_GSI_NUM];
+ static int gsi_to_irq[MAX_GSI_NUM];
+#else
+
+ if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
+ return gsi;
+#endif
/* Don't set up the ACPI SCI because it's already set up */
if (acpi_gbl_FADT.sci_interrupt == gsi)
@@ -1070,35 +1012,39 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_base;
+#ifdef CONFIG_X86_32
if (ioapic_renumber_irq)
gsi = ioapic_renumber_irq(ioapic, gsi);
+#endif
- /*
- * Avoid pin reprogramming. PRTs typically include entries
+ /*
+ * Avoid pin reprogramming. PRTs typically include entries
* with redundant pin->gsi mappings (but unique PCI devices);
* we only program the IOAPIC on the first.
*/
- bit = ioapic_pin % 32;
- idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
- if (idx > 3) {
+ if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
- ioapic_pin);
+ "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+ ioapic_pin);
return gsi;
}
- if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
+ if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
+#ifdef CONFIG_X86_32
return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
+#else
+ return gsi;
+#endif
}
- mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
-
+ set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
+#ifdef CONFIG_X86_32
/*
* For GSI >= 64, use IRQ compression
*/
if ((gsi >= IRQ_COMPRESSION_START)
- && (triggering == ACPI_LEVEL_SENSITIVE)) {
+ && (triggering == ACPI_LEVEL_SENSITIVE)) {
/*
* For PCI devices assign IRQs in order, avoiding gaps
* due to unused I/O APIC pins.
@@ -1115,8 +1061,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
* So test for this condition, and if necessary, avoid
* the pin collision.
*/
- if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0))
- gsi = pci_irq++;
+ gsi = pci_irq++;
/*
* Don't assign IRQ used by ACPI SCI
*/
@@ -1128,10 +1073,10 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
return gsi;
}
}
-
+#endif
io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+ triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
+ polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
return gsi;
}
diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c
deleted file mode 100644
index 72ab1403fed7..000000000000
--- a/arch/x86/kernel/mpparse_64.c
+++ /dev/null
@@ -1,867 +0,0 @@
-/*
- * Intel Multiprocessor Specification 1.1 and 1.4
- * compliant MP-table parsing routines.
- *
- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
- *
- * Fixes
- * Erich Boleyn : MP v1.4 and additional changes.
- * Alan Cox : Added EBDA scanning
- * Ingo Molnar : various cleanups and rewrites
- * Maciej W. Rozycki: Bits for default MP configurations
- * Paul Diefenbaugh: Added full ACPI support
- */
-
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/acpi.h>
-#include <linux/module.h>
-
-#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
-#include <asm/pgalloc.h>
-#include <asm/io_apic.h>
-#include <asm/proto.h>
-#include <asm/acpi.h>
-
-/* Have we found an MP table */
-int smp_found_config;
-
-/*
- * Various Linux-internal data structures created from the
- * MP-table.
- */
-DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
-int mp_bus_id_to_pci_bus [MAX_MP_BUSSES] = { [0 ... MAX_MP_BUSSES-1] = -1 };
-
-static int mp_current_pci_id = 0;
-/* I/O APIC entries */
-struct mpc_config_ioapic mp_ioapics[MAX_IO_APICS];
-
-/* # of MP IRQ source entries */
-struct mpc_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
-
-/* MP IRQ source entries */
-int mp_irq_entries;
-
-int nr_ioapics;
-unsigned long mp_lapic_addr = 0;
-
-
-
-/* Processor that is doing the boot up */
-unsigned int boot_cpu_id = -1U;
-EXPORT_SYMBOL(boot_cpu_id);
-
-/* Internal processor count */
-unsigned int num_processors;
-
-unsigned disabled_cpus __cpuinitdata;
-
-/* Bitmask of physically existing CPUs */
-physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
-
-u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
- = { [0 ... NR_CPUS-1] = BAD_APICID };
-void *x86_bios_cpu_apicid_early_ptr;
-DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
-EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
-
-
-/*
- * Intel MP BIOS table parsing routines:
- */
-
-/*
- * Checksum an MP configuration block.
- */
-
-static int __init mpf_checksum(unsigned char *mp, int len)
-{
- int sum = 0;
-
- while (len--)
- sum += *mp++;
-
- return sum & 0xFF;
-}
-
-static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
-{
- int cpu;
- cpumask_t tmp_map;
- char *bootup_cpu = "";
-
- if (!(m->mpc_cpuflag & CPU_ENABLED)) {
- disabled_cpus++;
- return;
- }
- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
- bootup_cpu = " (Bootup-CPU)";
- boot_cpu_id = m->mpc_apicid;
- }
-
- printk(KERN_INFO "Processor #%d%s\n", m->mpc_apicid, bootup_cpu);
-
- if (num_processors >= NR_CPUS) {
- printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
- " Processor ignored.\n", NR_CPUS);
- return;
- }
-
- num_processors++;
- cpus_complement(tmp_map, cpu_present_map);
- cpu = first_cpu(tmp_map);
-
- physid_set(m->mpc_apicid, phys_cpu_present_map);
- if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
- /*
- * x86_bios_cpu_apicid is required to have processors listed
- * in same order as logical cpu numbers. Hence the first
- * entry is BSP, and so on.
- */
- cpu = 0;
- }
- /* are we being called early in kernel startup? */
- if (x86_cpu_to_apicid_early_ptr) {
- u16 *cpu_to_apicid = x86_cpu_to_apicid_early_ptr;
- u16 *bios_cpu_apicid = x86_bios_cpu_apicid_early_ptr;
-
- cpu_to_apicid[cpu] = m->mpc_apicid;
- bios_cpu_apicid[cpu] = m->mpc_apicid;
- } else {
- per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
- per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
- }
-
- cpu_set(cpu, cpu_possible_map);
- cpu_set(cpu, cpu_present_map);
-}
-
-static void __init MP_bus_info (struct mpc_config_bus *m)
-{
- char str[7];
-
- memcpy(str, m->mpc_bustype, 6);
- str[6] = 0;
- Dprintk("Bus #%d is %s\n", m->mpc_busid, str);
-
- if (strncmp(str, "ISA", 3) == 0) {
- set_bit(m->mpc_busid, mp_bus_not_pci);
- } else if (strncmp(str, "PCI", 3) == 0) {
- clear_bit(m->mpc_busid, mp_bus_not_pci);
- mp_bus_id_to_pci_bus[m->mpc_busid] = mp_current_pci_id;
- mp_current_pci_id++;
- } else {
- printk(KERN_ERR "Unknown bustype %s\n", str);
- }
-}
-
-static int bad_ioapic(unsigned long address)
-{
- if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
- "(found %d)\n", MAX_IO_APICS, nr_ioapics);
- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
- }
- if (!address) {
- printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
- " found in table, skipping!\n");
- return 1;
- }
- return 0;
-}
-
-static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
-{
- if (!(m->mpc_flags & MPC_APIC_USABLE))
- return;
-
- printk("I/O APIC #%d at 0x%X.\n",
- m->mpc_apicid, m->mpc_apicaddr);
-
- if (bad_ioapic(m->mpc_apicaddr))
- return;
-
- mp_ioapics[nr_ioapics] = *m;
- nr_ioapics++;
-}
-
-static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
-{
- mp_irqs [mp_irq_entries] = *m;
- Dprintk("Int: type %d, pol %d, trig %d, bus %d,"
- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
- m->mpc_irqtype, m->mpc_irqflag & 3,
- (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
- m->mpc_srcbusirq, m->mpc_dstapic, m->mpc_dstirq);
- if (++mp_irq_entries >= MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!!\n");
-}
-
-static void __init MP_lintsrc_info (struct mpc_config_lintsrc *m)
-{
- Dprintk("Lint: type %d, pol %d, trig %d, bus %d,"
- " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
- m->mpc_irqtype, m->mpc_irqflag & 3,
- (m->mpc_irqflag >> 2) &3, m->mpc_srcbusid,
- m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
-}
-
-/*
- * Read/parse the MPC
- */
-
-static int __init smp_read_mpc(struct mp_config_table *mpc)
-{
- char str[16];
- int count=sizeof(*mpc);
- unsigned char *mpt=((unsigned char *)mpc)+count;
-
- if (memcmp(mpc->mpc_signature,MPC_SIGNATURE,4)) {
- printk("MPTABLE: bad signature [%c%c%c%c]!\n",
- mpc->mpc_signature[0],
- mpc->mpc_signature[1],
- mpc->mpc_signature[2],
- mpc->mpc_signature[3]);
- return 0;
- }
- if (mpf_checksum((unsigned char *)mpc,mpc->mpc_length)) {
- printk("MPTABLE: checksum error!\n");
- return 0;
- }
- if (mpc->mpc_spec!=0x01 && mpc->mpc_spec!=0x04) {
- printk(KERN_ERR "MPTABLE: bad table version (%d)!!\n",
- mpc->mpc_spec);
- return 0;
- }
- if (!mpc->mpc_lapic) {
- printk(KERN_ERR "MPTABLE: null local APIC address!\n");
- return 0;
- }
- memcpy(str,mpc->mpc_oem,8);
- str[8] = 0;
- printk(KERN_INFO "MPTABLE: OEM ID: %s ",str);
-
- memcpy(str,mpc->mpc_productid,12);
- str[12] = 0;
- printk("MPTABLE: Product ID: %s ",str);
-
- printk("MPTABLE: APIC at: 0x%X\n",mpc->mpc_lapic);
-
- /* save the local APIC address, it might be non-default */
- if (!acpi_lapic)
- mp_lapic_addr = mpc->mpc_lapic;
-
- /*
- * Now process the configuration blocks.
- */
- while (count < mpc->mpc_length) {
- switch(*mpt) {
- case MP_PROCESSOR:
- {
- struct mpc_config_processor *m=
- (struct mpc_config_processor *)mpt;
- if (!acpi_lapic)
- MP_processor_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
- case MP_BUS:
- {
- struct mpc_config_bus *m=
- (struct mpc_config_bus *)mpt;
- MP_bus_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
- case MP_IOAPIC:
- {
- struct mpc_config_ioapic *m=
- (struct mpc_config_ioapic *)mpt;
- MP_ioapic_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
- case MP_INTSRC:
- {
- struct mpc_config_intsrc *m=
- (struct mpc_config_intsrc *)mpt;
-
- MP_intsrc_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
- case MP_LINTSRC:
- {
- struct mpc_config_lintsrc *m=
- (struct mpc_config_lintsrc *)mpt;
- MP_lintsrc_info(m);
- mpt += sizeof(*m);
- count += sizeof(*m);
- break;
- }
- }
- }
- setup_apic_routing();
- if (!num_processors)
- printk(KERN_ERR "MPTABLE: no processors registered!\n");
- return num_processors;
-}
-
-static int __init ELCR_trigger(unsigned int irq)
-{
- unsigned int port;
-
- port = 0x4d0 + (irq >> 3);
- return (inb(port) >> (irq & 7)) & 1;
-}
-
-static void __init construct_default_ioirq_mptable(int mpc_default_type)
-{
- struct mpc_config_intsrc intsrc;
- int i;
- int ELCR_fallback = 0;
-
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqflag = 0; /* conforming */
- intsrc.mpc_srcbus = 0;
- intsrc.mpc_dstapic = mp_ioapics[0].mpc_apicid;
-
- intsrc.mpc_irqtype = mp_INT;
-
- /*
- * If true, we have an ISA/PCI system with no IRQ entries
- * in the MP table. To prevent the PCI interrupts from being set up
- * incorrectly, we try to use the ELCR. The sanity check to see if
- * there is good ELCR data is very simple - IRQ0, 1, 2 and 13 can
- * never be level sensitive, so we simply see if the ELCR agrees.
- * If it does, we assume it's valid.
- */
- if (mpc_default_type == 5) {
- printk(KERN_INFO "ISA/PCI bus type with no IRQ information... falling back to ELCR\n");
-
- if (ELCR_trigger(0) || ELCR_trigger(1) || ELCR_trigger(2) || ELCR_trigger(13))
- printk(KERN_ERR "ELCR contains invalid data... not using ELCR\n");
- else {
- printk(KERN_INFO "Using ELCR to identify PCI interrupts\n");
- ELCR_fallback = 1;
- }
- }
-
- for (i = 0; i < 16; i++) {
- switch (mpc_default_type) {
- case 2:
- if (i == 0 || i == 13)
- continue; /* IRQ0 & IRQ13 not connected */
- /* fall through */
- default:
- if (i == 2)
- continue; /* IRQ2 is never connected */
- }
-
- if (ELCR_fallback) {
- /*
- * If the ELCR indicates a level-sensitive interrupt, we
- * copy that information over to the MP table in the
- * irqflag field (level sensitive, active high polarity).
- */
- if (ELCR_trigger(i))
- intsrc.mpc_irqflag = 13;
- else
- intsrc.mpc_irqflag = 0;
- }
-
- intsrc.mpc_srcbusirq = i;
- intsrc.mpc_dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
- MP_intsrc_info(&intsrc);
- }
-
- intsrc.mpc_irqtype = mp_ExtINT;
- intsrc.mpc_srcbusirq = 0;
- intsrc.mpc_dstirq = 0; /* 8259A to INTIN0 */
- MP_intsrc_info(&intsrc);
-}
-
-static inline void __init construct_default_ISA_mptable(int mpc_default_type)
-{
- struct mpc_config_processor processor;
- struct mpc_config_bus bus;
- struct mpc_config_ioapic ioapic;
- struct mpc_config_lintsrc lintsrc;
- int linttypes[2] = { mp_ExtINT, mp_NMI };
- int i;
-
- /*
- * local APIC has default address
- */
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
- /*
- * 2 CPUs, numbered 0 & 1.
- */
- processor.mpc_type = MP_PROCESSOR;
- processor.mpc_apicver = 0;
- processor.mpc_cpuflag = CPU_ENABLED;
- processor.mpc_cpufeature = 0;
- processor.mpc_featureflag = 0;
- processor.mpc_reserved[0] = 0;
- processor.mpc_reserved[1] = 0;
- for (i = 0; i < 2; i++) {
- processor.mpc_apicid = i;
- MP_processor_info(&processor);
- }
-
- bus.mpc_type = MP_BUS;
- bus.mpc_busid = 0;
- switch (mpc_default_type) {
- default:
- printk(KERN_ERR "???\nUnknown standard configuration %d\n",
- mpc_default_type);
- /* fall through */
- case 1:
- case 5:
- memcpy(bus.mpc_bustype, "ISA ", 6);
- break;
- }
- MP_bus_info(&bus);
- if (mpc_default_type > 4) {
- bus.mpc_busid = 1;
- memcpy(bus.mpc_bustype, "PCI ", 6);
- MP_bus_info(&bus);
- }
-
- ioapic.mpc_type = MP_IOAPIC;
- ioapic.mpc_apicid = 2;
- ioapic.mpc_apicver = 0;
- ioapic.mpc_flags = MPC_APIC_USABLE;
- ioapic.mpc_apicaddr = 0xFEC00000;
- MP_ioapic_info(&ioapic);
-
- /*
- * We set up most of the low 16 IO-APIC pins according to MPS rules.
- */
- construct_default_ioirq_mptable(mpc_default_type);
-
- lintsrc.mpc_type = MP_LINTSRC;
- lintsrc.mpc_irqflag = 0; /* conforming */
- lintsrc.mpc_srcbusid = 0;
- lintsrc.mpc_srcbusirq = 0;
- lintsrc.mpc_destapic = MP_APIC_ALL;
- for (i = 0; i < 2; i++) {
- lintsrc.mpc_irqtype = linttypes[i];
- lintsrc.mpc_destapiclint = i;
- MP_lintsrc_info(&lintsrc);
- }
-}
-
-static struct intel_mp_floating *mpf_found;
-
-/*
- * Scan the memory blocks for an SMP configuration block.
- */
-void __init get_smp_config (void)
-{
- struct intel_mp_floating *mpf = mpf_found;
-
- /*
- * ACPI supports both logical (e.g. Hyper-Threading) and physical
- * processors, where MPS only supports physical.
- */
- if (acpi_lapic && acpi_ioapic) {
- printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
- return;
- }
- else if (acpi_lapic)
- printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration information\n");
-
- printk("Intel MultiProcessor Specification v1.%d\n", mpf->mpf_specification);
-
- /*
- * Now see if we need to read further.
- */
- if (mpf->mpf_feature1 != 0) {
-
- printk(KERN_INFO "Default MP configuration #%d\n", mpf->mpf_feature1);
- construct_default_ISA_mptable(mpf->mpf_feature1);
-
- } else if (mpf->mpf_physptr) {
-
- /*
- * Read the physical hardware table. Anything here will
- * override the defaults.
- */
- if (!smp_read_mpc(phys_to_virt(mpf->mpf_physptr))) {
- smp_found_config = 0;
- printk(KERN_ERR "BIOS bug, MP table errors detected!...\n");
- printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n");
- return;
- }
- /*
- * If there are no explicit MP IRQ entries, then we are
- * broken. We set up most of the low 16 IO-APIC pins to
- * ISA defaults and hope it will work.
- */
- if (!mp_irq_entries) {
- struct mpc_config_bus bus;
-
- printk(KERN_ERR "BIOS bug, no explicit IRQ entries, using default mptable. (tell your hw vendor)\n");
-
- bus.mpc_type = MP_BUS;
- bus.mpc_busid = 0;
- memcpy(bus.mpc_bustype, "ISA ", 6);
- MP_bus_info(&bus);
-
- construct_default_ioirq_mptable(0);
- }
-
- } else
- BUG();
-
- printk(KERN_INFO "Processors: %d\n", num_processors);
- /*
- * Only use the first configuration found.
- */
-}
-
-static int __init smp_scan_config (unsigned long base, unsigned long length)
-{
- extern void __bad_mpf_size(void);
- unsigned int *bp = phys_to_virt(base);
- struct intel_mp_floating *mpf;
-
- Dprintk("Scan SMP from %p for %ld bytes.\n", bp,length);
- if (sizeof(*mpf) != 16)
- __bad_mpf_size();
-
- while (length > 0) {
- mpf = (struct intel_mp_floating *)bp;
- if ((*bp == SMP_MAGIC_IDENT) &&
- (mpf->mpf_length == 1) &&
- !mpf_checksum((unsigned char *)bp, 16) &&
- ((mpf->mpf_specification == 1)
- || (mpf->mpf_specification == 4)) ) {
-
- smp_found_config = 1;
- reserve_bootmem_generic(virt_to_phys(mpf), PAGE_SIZE);
- if (mpf->mpf_physptr)
- reserve_bootmem_generic(mpf->mpf_physptr, PAGE_SIZE);
- mpf_found = mpf;
- return 1;
- }
- bp += 4;
- length -= 16;
- }
- return 0;
-}
-
-void __init find_smp_config(void)
-{
- unsigned int address;
-
- /*
- * FIXME: Linux assumes you have 640K of base ram..
- * this continues the error...
- *
- * 1) Scan the bottom 1K for a signature
- * 2) Scan the top 1K of base RAM
- * 3) Scan the 64K of bios
- */
- if (smp_scan_config(0x0,0x400) ||
- smp_scan_config(639*0x400,0x400) ||
- smp_scan_config(0xF0000,0x10000))
- return;
- /*
- * If it is an SMP machine we should know now.
- *
- * there is a real-mode segmented pointer pointing to the
- * 4K EBDA area at 0x40E, calculate and scan it here.
- *
- * NOTE! There are Linux loaders that will corrupt the EBDA
- * area, and as such this kind of SMP config may be less
- * trustworthy, simply because the SMP table may have been
- * stomped on during early boot. These loaders are buggy and
- * should be fixed.
- */
-
- address = *(unsigned short *)phys_to_virt(0x40E);
- address <<= 4;
- if (smp_scan_config(address, 0x1000))
- return;
-
- /* If we have come this far, we did not find an MP table */
- printk(KERN_INFO "No mptable found.\n");
-}
-
-/* --------------------------------------------------------------------------
- ACPI-based MP Configuration
- -------------------------------------------------------------------------- */
-
-#ifdef CONFIG_ACPI
-
-void __init mp_register_lapic_address(u64 address)
-{
- mp_lapic_addr = (unsigned long) address;
- set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
- if (boot_cpu_id == -1U)
- boot_cpu_id = GET_APIC_ID(apic_read(APIC_ID));
-}
-
-void __cpuinit mp_register_lapic (u8 id, u8 enabled)
-{
- struct mpc_config_processor processor;
- int boot_cpu = 0;
-
- if (id == boot_cpu_id)
- boot_cpu = 1;
-
- processor.mpc_type = MP_PROCESSOR;
- processor.mpc_apicid = id;
- processor.mpc_apicver = 0;
- processor.mpc_cpuflag = (enabled ? CPU_ENABLED : 0);
- processor.mpc_cpuflag |= (boot_cpu ? CPU_BOOTPROCESSOR : 0);
- processor.mpc_cpufeature = 0;
- processor.mpc_featureflag = 0;
- processor.mpc_reserved[0] = 0;
- processor.mpc_reserved[1] = 0;
-
- MP_processor_info(&processor);
-}
-
-#define MP_ISA_BUS 0
-#define MP_MAX_IOAPIC_PIN 127
-
-static struct mp_ioapic_routing {
- int apic_id;
- int gsi_start;
- int gsi_end;
- u32 pin_programmed[4];
-} mp_ioapic_routing[MAX_IO_APICS];
-
-static int mp_find_ioapic(int gsi)
-{
- int i = 0;
-
- /* Find the IOAPIC that manages this GSI. */
- for (i = 0; i < nr_ioapics; i++) {
- if ((gsi >= mp_ioapic_routing[i].gsi_start)
- && (gsi <= mp_ioapic_routing[i].gsi_end))
- return i;
- }
-
- printk(KERN_ERR "ERROR: Unable to locate IOAPIC for GSI %d\n", gsi);
- return -1;
-}
-
-static u8 uniq_ioapic_id(u8 id)
-{
- int i;
- DECLARE_BITMAP(used, 256);
- bitmap_zero(used, 256);
- for (i = 0; i < nr_ioapics; i++) {
- struct mpc_config_ioapic *ia = &mp_ioapics[i];
- __set_bit(ia->mpc_apicid, used);
- }
- if (!test_bit(id, used))
- return id;
- return find_first_zero_bit(used, 256);
-}
-
-void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base)
-{
- int idx = 0;
-
- if (bad_ioapic(address))
- return;
-
- idx = nr_ioapics;
-
- mp_ioapics[idx].mpc_type = MP_IOAPIC;
- mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
- mp_ioapics[idx].mpc_apicaddr = address;
-
- set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
- mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id);
- mp_ioapics[idx].mpc_apicver = 0;
-
- /*
- * Build basic IRQ lookup table to facilitate gsi->io_apic lookups
- * and to prevent reprogramming of IOAPIC pins (PCI IRQs).
- */
- mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].mpc_apicid;
- mp_ioapic_routing[idx].gsi_start = gsi_base;
- mp_ioapic_routing[idx].gsi_end = gsi_base +
- io_apic_get_redir_entries(idx);
-
- printk(KERN_INFO "IOAPIC[%d]: apic_id %d, address 0x%x, "
- "GSI %d-%d\n", idx, mp_ioapics[idx].mpc_apicid,
- mp_ioapics[idx].mpc_apicaddr,
- mp_ioapic_routing[idx].gsi_start,
- mp_ioapic_routing[idx].gsi_end);
-
- nr_ioapics++;
-}
-
-void __init
-mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
-{
- struct mpc_config_intsrc intsrc;
- int ioapic = -1;
- int pin = -1;
-
- /*
- * Convert 'gsi' to 'ioapic.pin'.
- */
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return;
- pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
-
- /*
- * TBD: This check is for faulty timer entries, where the override
- * erroneously sets the trigger to level, resulting in a HUGE
- * increase of timer interrupts!
- */
- if ((bus_irq == 0) && (trigger == 3))
- trigger = 1;
-
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqtype = mp_INT;
- intsrc.mpc_irqflag = (trigger << 2) | polarity;
- intsrc.mpc_srcbus = MP_ISA_BUS;
- intsrc.mpc_srcbusirq = bus_irq; /* IRQ */
- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */
- intsrc.mpc_dstirq = pin; /* INTIN# */
-
- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n",
- intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq);
-
- mp_irqs[mp_irq_entries] = intsrc;
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!\n");
-}
-
-void __init mp_config_acpi_legacy_irqs(void)
-{
- struct mpc_config_intsrc intsrc;
- int i = 0;
- int ioapic = -1;
-
- /*
- * Fabricate the legacy ISA bus (bus #31).
- */
- set_bit(MP_ISA_BUS, mp_bus_not_pci);
-
- /*
- * Locate the IOAPIC that manages the ISA IRQs (0-15).
- */
- ioapic = mp_find_ioapic(0);
- if (ioapic < 0)
- return;
-
- intsrc.mpc_type = MP_INTSRC;
- intsrc.mpc_irqflag = 0; /* Conforming */
- intsrc.mpc_srcbus = MP_ISA_BUS;
- intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid;
-
- /*
- * Use the default configuration for the IRQs 0-15. Unless
- * overridden by (MADT) interrupt source override entries.
- */
- for (i = 0; i < 16; i++) {
- int idx;
-
- for (idx = 0; idx < mp_irq_entries; idx++) {
- struct mpc_config_intsrc *irq = mp_irqs + idx;
-
- /* Do we already have a mapping for this ISA IRQ? */
- if (irq->mpc_srcbus == MP_ISA_BUS && irq->mpc_srcbusirq == i)
- break;
-
- /* Do we already have a mapping for this IOAPIC pin */
- if ((irq->mpc_dstapic == intsrc.mpc_dstapic) &&
- (irq->mpc_dstirq == i))
- break;
- }
-
- if (idx != mp_irq_entries) {
- printk(KERN_DEBUG "ACPI: IRQ%d used by override.\n", i);
- continue; /* IRQ already used */
- }
-
- intsrc.mpc_irqtype = mp_INT;
- intsrc.mpc_srcbusirq = i; /* Identity mapped */
- intsrc.mpc_dstirq = i;
-
- Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, "
- "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3,
- (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus,
- intsrc.mpc_srcbusirq, intsrc.mpc_dstapic,
- intsrc.mpc_dstirq);
-
- mp_irqs[mp_irq_entries] = intsrc;
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!\n");
- }
-}
-
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
-{
- int ioapic = -1;
- int ioapic_pin = 0;
- int idx, bit = 0;
-
- if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
- return gsi;
-
- /* Don't set up the ACPI SCI because it's already set up */
- if (acpi_gbl_FADT.sci_interrupt == gsi)
- return gsi;
-
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0) {
- printk(KERN_WARNING "No IOAPIC for GSI %u\n", gsi);
- return gsi;
- }
-
- ioapic_pin = gsi - mp_ioapic_routing[ioapic].gsi_start;
-
- /*
- * Avoid pin reprogramming. PRTs typically include entries
- * with redundant pin->gsi mappings (but unique PCI devices);
- * we only program the IOAPIC on the first.
- */
- bit = ioapic_pin % 32;
- idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32);
- if (idx > 3) {
- printk(KERN_ERR "Invalid reference to IOAPIC pin "
- "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
- ioapic_pin);
- return gsi;
- }
- if ((1<<bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) {
- Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n",
- mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
- return gsi;
- }
-
- mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1<<bit);
-
- io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
- triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
- polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- return gsi;
-}
-#endif /*CONFIG_ACPI*/
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index af51ea8400b2..1f3abe048e93 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -65,8 +65,8 @@ static loff_t msr_seek(struct file *file, loff_t offset, int orig)
return ret;
}
-static ssize_t msr_read(struct file *file, char __user * buf,
- size_t count, loff_t * ppos)
+static ssize_t msr_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
{
u32 __user *tmp = (u32 __user *) buf;
u32 data[2];
@@ -162,12 +162,10 @@ static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb,
err = msr_device_create(cpu);
break;
case CPU_UP_CANCELED:
+ case CPU_UP_CANCELED_FROZEN:
case CPU_DEAD:
msr_device_destroy(cpu);
break;
- case CPU_UP_CANCELED_FROZEN:
- destroy_suspended_device(msr_class, MKDEV(MSR_MAJOR, cpu));
- break;
}
return err ? NOTIFY_BAD : NOTIFY_OK;
}
diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c
index 6a0aa7038685..11b14bbaa61e 100644
--- a/arch/x86/kernel/nmi_32.c
+++ b/arch/x86/kernel/nmi_32.c
@@ -22,9 +22,11 @@
#include <linux/cpumask.h>
#include <linux/kernel_stat.h>
#include <linux/kdebug.h>
+#include <linux/slab.h>
#include <asm/smp.h>
#include <asm/nmi.h>
+#include <asm/timer.h>
#include "mach_traps.h"
@@ -67,7 +69,7 @@ static __init void nmi_cpu_busy(void *data)
}
#endif
-static int __init check_nmi_watchdog(void)
+int __init check_nmi_watchdog(void)
{
unsigned int *prev_nmi_count;
int cpu;
@@ -80,7 +82,7 @@ static int __init check_nmi_watchdog(void)
prev_nmi_count = kmalloc(NR_CPUS * sizeof(int), GFP_KERNEL);
if (!prev_nmi_count)
- return -1;
+ goto error;
printk(KERN_INFO "Testing NMI watchdog ... ");
@@ -117,7 +119,7 @@ static int __init check_nmi_watchdog(void)
if (!atomic_read(&nmi_active)) {
kfree(prev_nmi_count);
atomic_set(&nmi_active, -1);
- return -1;
+ goto error;
}
printk("OK.\n");
@@ -128,9 +130,11 @@ static int __init check_nmi_watchdog(void)
kfree(prev_nmi_count);
return 0;
+error:
+ timer_ack = !cpu_has_tsc;
+
+ return -1;
}
-/* This needs to happen later in boot so counters are working */
-late_initcall(check_nmi_watchdog);
static int __init setup_nmi_watchdog(char *str)
{
@@ -317,7 +321,8 @@ EXPORT_SYMBOL(touch_nmi_watchdog);
extern void die_nmi(struct pt_regs *, const char *msg);
-__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
+notrace __kprobes int
+nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
{
/*
diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c
index 9a4fde74bee1..5a29ded994fa 100644
--- a/arch/x86/kernel/nmi_64.c
+++ b/arch/x86/kernel/nmi_64.c
@@ -26,6 +26,8 @@
#include <asm/proto.h>
#include <asm/mce.h>
+#include <mach_traps.h>
+
int unknown_nmi_panic;
int nmi_watchdog_enabled;
int panic_on_unrecovered_nmi;
@@ -311,7 +313,8 @@ void touch_nmi_watchdog(void)
}
EXPORT_SYMBOL(touch_nmi_watchdog);
-int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
+notrace __kprobes int
+nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
{
int sum;
int touched = 0;
@@ -382,7 +385,8 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason)
static unsigned ignore_nmis;
-asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code)
+asmlinkage notrace __kprobes void
+do_nmi(struct pt_regs *regs, long error_code)
{
nmi_enter();
add_pda(__nmi_count,1);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 075962cc75ab..74f0c5ea2a03 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -206,13 +206,6 @@ static struct resource reserve_ioports = {
.flags = IORESOURCE_IO | IORESOURCE_BUSY,
};
-static struct resource reserve_iomem = {
- .start = 0,
- .end = -1,
- .name = "paravirt-iomem",
- .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
-};
-
/*
* Reserve the whole legacy IO space to prevent any legacy drivers
* from wasting time probing for their hardware. This is a fairly
@@ -222,16 +215,7 @@ static struct resource reserve_iomem = {
*/
int paravirt_disable_iospace(void)
{
- int ret;
-
- ret = request_resource(&ioport_resource, &reserve_ioports);
- if (ret == 0) {
- ret = request_resource(&iomem_resource, &reserve_iomem);
- if (ret)
- release_resource(&reserve_ioports);
- }
-
- return ret;
+ return request_resource(&ioport_resource, &reserve_ioports);
}
static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
@@ -382,11 +366,13 @@ struct pv_mmu_ops pv_mmu_ops = {
.flush_tlb_single = native_flush_tlb_single,
.flush_tlb_others = native_flush_tlb_others,
- .alloc_pt = paravirt_nop,
- .alloc_pd = paravirt_nop,
- .alloc_pd_clone = paravirt_nop,
- .release_pt = paravirt_nop,
- .release_pd = paravirt_nop,
+ .alloc_pte = paravirt_nop,
+ .alloc_pmd = paravirt_nop,
+ .alloc_pmd_clone = paravirt_nop,
+ .alloc_pud = paravirt_nop,
+ .release_pte = paravirt_nop,
+ .release_pmd = paravirt_nop,
+ .release_pud = paravirt_nop,
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 1b5464c2434f..e28ec497e142 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -43,6 +43,7 @@
#include <asm/system.h>
#include <asm/dma.h>
#include <asm/rio.h>
+#include <asm/bios_ebda.h>
#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
int use_calgary __read_mostly = 1;
@@ -470,10 +471,11 @@ error:
return 0;
}
-static dma_addr_t calgary_map_single(struct device *dev, void *vaddr,
+static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
size_t size, int direction)
{
dma_addr_t dma_handle = bad_dma_address;
+ void *vaddr = phys_to_virt(paddr);
unsigned long uaddr;
unsigned int npages;
struct iommu_table *tbl = find_iommu_table(dev);
@@ -1232,8 +1234,7 @@ static int __init calgary_init(void)
error:
do {
- dev = pci_get_device_reverse(PCI_VENDOR_ID_IBM,
- PCI_ANY_ID, dev);
+ dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
if (!dev)
break;
if (!is_cal_pci_dev(dev->device))
diff --git a/arch/x86/kernel/pci-dma_64.c b/arch/x86/kernel/pci-dma.c
index 375cb2bc45be..388b113a7d88 100644
--- a/arch/x86/kernel/pci-dma_64.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -1,61 +1,370 @@
-/*
- * Dynamic DMA mapping support.
- */
-
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/pci.h>
-#include <linux/module.h>
+#include <linux/dma-mapping.h>
#include <linux/dmar.h>
-#include <asm/io.h>
+#include <linux/bootmem.h>
+#include <linux/pci.h>
+
+#include <asm/proto.h>
+#include <asm/dma.h>
#include <asm/gart.h>
#include <asm/calgary.h>
-int iommu_merge __read_mostly = 0;
+int forbid_dac __read_mostly;
+EXPORT_SYMBOL(forbid_dac);
-dma_addr_t bad_dma_address __read_mostly;
-EXPORT_SYMBOL(bad_dma_address);
+const struct dma_mapping_ops *dma_ops;
+EXPORT_SYMBOL(dma_ops);
-/* This tells the BIO block layer to assume merging. Default to off
- because we cannot guarantee merging later. */
-int iommu_bio_merge __read_mostly = 0;
-EXPORT_SYMBOL(iommu_bio_merge);
-
-static int iommu_sac_force __read_mostly = 0;
+int iommu_sac_force __read_mostly = 0;
-int no_iommu __read_mostly;
#ifdef CONFIG_IOMMU_DEBUG
int panic_on_overflow __read_mostly = 1;
int force_iommu __read_mostly = 1;
#else
int panic_on_overflow __read_mostly = 0;
-int force_iommu __read_mostly= 0;
+int force_iommu __read_mostly = 0;
#endif
+int iommu_merge __read_mostly = 0;
+
+int no_iommu __read_mostly;
/* Set this to 1 if there is a HW IOMMU in the system */
int iommu_detected __read_mostly = 0;
+/* This tells the BIO block layer to assume merging. Default to off
+ because we cannot guarantee merging later. */
+int iommu_bio_merge __read_mostly = 0;
+EXPORT_SYMBOL(iommu_bio_merge);
+
+dma_addr_t bad_dma_address __read_mostly = 0;
+EXPORT_SYMBOL(bad_dma_address);
+
/* Dummy device used for NULL arguments (normally ISA). Better would
be probably a smaller DMA mask, but this is bug-to-bug compatible
- to i386. */
+ to older i386. */
struct device fallback_dev = {
.bus_id = "fallback device",
.coherent_dma_mask = DMA_32BIT_MASK,
.dma_mask = &fallback_dev.coherent_dma_mask,
};
+int dma_set_mask(struct device *dev, u64 mask)
+{
+ if (!dev->dma_mask || !dma_supported(dev, mask))
+ return -EIO;
+
+ *dev->dma_mask = mask;
+
+ return 0;
+}
+EXPORT_SYMBOL(dma_set_mask);
+
+#ifdef CONFIG_X86_64
+static __initdata void *dma32_bootmem_ptr;
+static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
+
+static int __init parse_dma32_size_opt(char *p)
+{
+ if (!p)
+ return -EINVAL;
+ dma32_bootmem_size = memparse(p, &p);
+ return 0;
+}
+early_param("dma32_size", parse_dma32_size_opt);
+
+void __init dma32_reserve_bootmem(void)
+{
+ unsigned long size, align;
+ if (end_pfn <= MAX_DMA32_PFN)
+ return;
+
+ align = 64ULL<<20;
+ size = round_up(dma32_bootmem_size, align);
+ dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
+ __pa(MAX_DMA_ADDRESS));
+ if (dma32_bootmem_ptr)
+ dma32_bootmem_size = size;
+ else
+ dma32_bootmem_size = 0;
+}
+static void __init dma32_free_bootmem(void)
+{
+ int node;
+
+ if (end_pfn <= MAX_DMA32_PFN)
+ return;
+
+ if (!dma32_bootmem_ptr)
+ return;
+
+ for_each_online_node(node)
+ free_bootmem_node(NODE_DATA(node), __pa(dma32_bootmem_ptr),
+ dma32_bootmem_size);
+
+ dma32_bootmem_ptr = NULL;
+ dma32_bootmem_size = 0;
+}
+
+void __init pci_iommu_alloc(void)
+{
+ /* free the range so iommu could get some range less than 4G */
+ dma32_free_bootmem();
+ /*
+ * The order of these functions is important for
+ * fall-back/fail-over reasons
+ */
+#ifdef CONFIG_GART_IOMMU
+ gart_iommu_hole_init();
+#endif
+
+#ifdef CONFIG_CALGARY_IOMMU
+ detect_calgary();
+#endif
+
+ detect_intel_iommu();
+
+#ifdef CONFIG_SWIOTLB
+ pci_swiotlb_init();
+#endif
+}
+#endif
+
+/*
+ * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
+ * documentation.
+ */
+static __init int iommu_setup(char *p)
+{
+ iommu_merge = 1;
+
+ if (!p)
+ return -EINVAL;
+
+ while (*p) {
+ if (!strncmp(p, "off", 3))
+ no_iommu = 1;
+ /* gart_parse_options has more force support */
+ if (!strncmp(p, "force", 5))
+ force_iommu = 1;
+ if (!strncmp(p, "noforce", 7)) {
+ iommu_merge = 0;
+ force_iommu = 0;
+ }
+
+ if (!strncmp(p, "biomerge", 8)) {
+ iommu_bio_merge = 4096;
+ iommu_merge = 1;
+ force_iommu = 1;
+ }
+ if (!strncmp(p, "panic", 5))
+ panic_on_overflow = 1;
+ if (!strncmp(p, "nopanic", 7))
+ panic_on_overflow = 0;
+ if (!strncmp(p, "merge", 5)) {
+ iommu_merge = 1;
+ force_iommu = 1;
+ }
+ if (!strncmp(p, "nomerge", 7))
+ iommu_merge = 0;
+ if (!strncmp(p, "forcesac", 8))
+ iommu_sac_force = 1;
+ if (!strncmp(p, "allowdac", 8))
+ forbid_dac = 0;
+ if (!strncmp(p, "nodac", 5))
+ forbid_dac = -1;
+ if (!strncmp(p, "usedac", 6)) {
+ forbid_dac = -1;
+ return 1;
+ }
+#ifdef CONFIG_SWIOTLB
+ if (!strncmp(p, "soft", 4))
+ swiotlb = 1;
+#endif
+
+#ifdef CONFIG_GART_IOMMU
+ gart_parse_options(p);
+#endif
+
+#ifdef CONFIG_CALGARY_IOMMU
+ if (!strncmp(p, "calgary", 7))
+ use_calgary = 1;
+#endif /* CONFIG_CALGARY_IOMMU */
+
+ p += strcspn(p, ",");
+ if (*p == ',')
+ ++p;
+ }
+ return 0;
+}
+early_param("iommu", iommu_setup);
+
+#ifdef CONFIG_X86_32
+int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
+ dma_addr_t device_addr, size_t size, int flags)
+{
+ void __iomem *mem_base = NULL;
+ int pages = size >> PAGE_SHIFT;
+ int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
+
+ if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
+ goto out;
+ if (!size)
+ goto out;
+ if (dev->dma_mem)
+ goto out;
+
+ /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
+
+ mem_base = ioremap(bus_addr, size);
+ if (!mem_base)
+ goto out;
+
+ dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
+ if (!dev->dma_mem)
+ goto out;
+ dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ if (!dev->dma_mem->bitmap)
+ goto free1_out;
+
+ dev->dma_mem->virt_base = mem_base;
+ dev->dma_mem->device_base = device_addr;
+ dev->dma_mem->size = pages;
+ dev->dma_mem->flags = flags;
+
+ if (flags & DMA_MEMORY_MAP)
+ return DMA_MEMORY_MAP;
+
+ return DMA_MEMORY_IO;
+
+ free1_out:
+ kfree(dev->dma_mem);
+ out:
+ if (mem_base)
+ iounmap(mem_base);
+ return 0;
+}
+EXPORT_SYMBOL(dma_declare_coherent_memory);
+
+void dma_release_declared_memory(struct device *dev)
+{
+ struct dma_coherent_mem *mem = dev->dma_mem;
+
+ if (!mem)
+ return;
+ dev->dma_mem = NULL;
+ iounmap(mem->virt_base);
+ kfree(mem->bitmap);
+ kfree(mem);
+}
+EXPORT_SYMBOL(dma_release_declared_memory);
+
+void *dma_mark_declared_memory_occupied(struct device *dev,
+ dma_addr_t device_addr, size_t size)
+{
+ struct dma_coherent_mem *mem = dev->dma_mem;
+ int pos, err;
+ int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
+
+ pages >>= PAGE_SHIFT;
+
+ if (!mem)
+ return ERR_PTR(-EINVAL);
+
+ pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
+ err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
+ if (err != 0)
+ return ERR_PTR(err);
+ return mem->virt_base + (pos << PAGE_SHIFT);
+}
+EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
+
+static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
+ dma_addr_t *dma_handle, void **ret)
+{
+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+ int order = get_order(size);
+
+ if (mem) {
+ int page = bitmap_find_free_region(mem->bitmap, mem->size,
+ order);
+ if (page >= 0) {
+ *dma_handle = mem->device_base + (page << PAGE_SHIFT);
+ *ret = mem->virt_base + (page << PAGE_SHIFT);
+ memset(*ret, 0, size);
+ }
+ if (mem->flags & DMA_MEMORY_EXCLUSIVE)
+ *ret = NULL;
+ }
+ return (mem != NULL);
+}
+
+static int dma_release_coherent(struct device *dev, int order, void *vaddr)
+{
+ struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
+
+ if (mem && vaddr >= mem->virt_base && vaddr <
+ (mem->virt_base + (mem->size << PAGE_SHIFT))) {
+ int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
+
+ bitmap_release_region(mem->bitmap, page, order);
+ return 1;
+ }
+ return 0;
+}
+#else
+#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
+#define dma_release_coherent(dev, order, vaddr) (0)
+#endif /* CONFIG_X86_32 */
+
+int dma_supported(struct device *dev, u64 mask)
+{
+#ifdef CONFIG_PCI
+ if (mask > 0xffffffff && forbid_dac > 0) {
+ printk(KERN_INFO "PCI: Disallowing DAC for device %s\n",
+ dev->bus_id);
+ return 0;
+ }
+#endif
+
+ if (dma_ops->dma_supported)
+ return dma_ops->dma_supported(dev, mask);
+
+ /* Copied from i386. Doesn't make much sense, because it will
+ only work for pci_alloc_coherent.
+ The caller just has to use GFP_DMA in this case. */
+ if (mask < DMA_24BIT_MASK)
+ return 0;
+
+ /* Tell the device to use SAC when IOMMU force is on. This
+ allows the driver to use cheaper accesses in some cases.
+
+ Problem with this is that if we overflow the IOMMU area and
+ return DAC as fallback address the device may not handle it
+ correctly.
+
+ As a special case some controllers have a 39bit address
+ mode that is as efficient as 32bit (aic79xx). Don't force
+ SAC for these. Assume all masks <= 40 bits are of this
+ type. Normally this doesn't make any difference, but gives
+ more gentle handling of IOMMU overflow. */
+ if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
+ printk(KERN_INFO "%s: Force SAC with mask %Lx\n",
+ dev->bus_id, mask);
+ return 0;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL(dma_supported);
+
/* Allocate DMA memory on node near device */
-noinline static void *
+noinline struct page *
dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
{
- struct page *page;
int node;
node = dev_to_node(dev);
- page = alloc_pages_node(node, gfp, order);
- return page ? page_address(page) : NULL;
+ return alloc_pages_node(node, gfp, order);
}
/*
@@ -65,9 +374,16 @@ void *
dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t gfp)
{
- void *memory;
+ void *memory = NULL;
+ struct page *page;
unsigned long dma_mask = 0;
- u64 bus;
+ dma_addr_t bus;
+
+ /* ignore region specifiers */
+ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
+ if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
+ return memory;
if (!dev)
dev = &fallback_dev;
@@ -82,26 +398,25 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
/* Don't invoke OOM killer */
gfp |= __GFP_NORETRY;
- /* Kludge to make it bug-to-bug compatible with i386. i386
- uses the normal dma_mask for alloc_coherent. */
- dma_mask &= *dev->dma_mask;
-
+#ifdef CONFIG_X86_64
/* Why <=? Even when the mask is smaller than 4GB it is often
larger than 16MB and in this case we have a chance of
finding fitting memory in the next higher zone first. If
not retry with true GFP_DMA. -AK */
if (dma_mask <= DMA_32BIT_MASK)
gfp |= GFP_DMA32;
+#endif
again:
- memory = dma_alloc_pages(dev, gfp, get_order(size));
- if (memory == NULL)
+ page = dma_alloc_pages(dev, gfp, get_order(size));
+ if (page == NULL)
return NULL;
{
int high, mmu;
- bus = virt_to_bus(memory);
- high = (bus + size) >= dma_mask;
+ bus = page_to_phys(page);
+ memory = page_address(page);
+ high = (bus + size) >= dma_mask;
mmu = high;
if (force_iommu && !(gfp & GFP_DMA))
mmu = 1;
@@ -127,7 +442,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
memset(memory, 0, size);
if (!mmu) {
- *dma_handle = virt_to_bus(memory);
+ *dma_handle = bus;
return memory;
}
}
@@ -139,7 +454,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
}
if (dma_ops->map_simple) {
- *dma_handle = dma_ops->map_simple(dev, memory,
+ *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory),
size,
PCI_DMA_BIDIRECTIONAL);
if (*dma_handle != bad_dma_address)
@@ -147,7 +462,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
}
if (panic_on_overflow)
- panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",size);
+ panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
+ (unsigned long)size);
free_pages((unsigned long)memory, get_order(size));
return NULL;
}
@@ -160,153 +476,16 @@ EXPORT_SYMBOL(dma_alloc_coherent);
void dma_free_coherent(struct device *dev, size_t size,
void *vaddr, dma_addr_t bus)
{
+ int order = get_order(size);
WARN_ON(irqs_disabled()); /* for portability */
+ if (dma_release_coherent(dev, order, vaddr))
+ return;
if (dma_ops->unmap_single)
dma_ops->unmap_single(dev, bus, size, 0);
- free_pages((unsigned long)vaddr, get_order(size));
+ free_pages((unsigned long)vaddr, order);
}
EXPORT_SYMBOL(dma_free_coherent);
-static int forbid_dac __read_mostly;
-
-int dma_supported(struct device *dev, u64 mask)
-{
-#ifdef CONFIG_PCI
- if (mask > 0xffffffff && forbid_dac > 0) {
-
-
-
- printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", dev->bus_id);
- return 0;
- }
-#endif
-
- if (dma_ops->dma_supported)
- return dma_ops->dma_supported(dev, mask);
-
- /* Copied from i386. Doesn't make much sense, because it will
- only work for pci_alloc_coherent.
- The caller just has to use GFP_DMA in this case. */
- if (mask < DMA_24BIT_MASK)
- return 0;
-
- /* Tell the device to use SAC when IOMMU force is on. This
- allows the driver to use cheaper accesses in some cases.
-
- Problem with this is that if we overflow the IOMMU area and
- return DAC as fallback address the device may not handle it
- correctly.
-
- As a special case some controllers have a 39bit address
- mode that is as efficient as 32bit (aic79xx). Don't force
- SAC for these. Assume all masks <= 40 bits are of this
- type. Normally this doesn't make any difference, but gives
- more gentle handling of IOMMU overflow. */
- if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
- printk(KERN_INFO "%s: Force SAC with mask %Lx\n", dev->bus_id,mask);
- return 0;
- }
-
- return 1;
-}
-EXPORT_SYMBOL(dma_supported);
-
-int dma_set_mask(struct device *dev, u64 mask)
-{
- if (!dev->dma_mask || !dma_supported(dev, mask))
- return -EIO;
- *dev->dma_mask = mask;
- return 0;
-}
-EXPORT_SYMBOL(dma_set_mask);
-
-/*
- * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
- * documentation.
- */
-static __init int iommu_setup(char *p)
-{
- iommu_merge = 1;
-
- if (!p)
- return -EINVAL;
-
- while (*p) {
- if (!strncmp(p,"off",3))
- no_iommu = 1;
- /* gart_parse_options has more force support */
- if (!strncmp(p,"force",5))
- force_iommu = 1;
- if (!strncmp(p,"noforce",7)) {
- iommu_merge = 0;
- force_iommu = 0;
- }
-
- if (!strncmp(p, "biomerge",8)) {
- iommu_bio_merge = 4096;
- iommu_merge = 1;
- force_iommu = 1;
- }
- if (!strncmp(p, "panic",5))
- panic_on_overflow = 1;
- if (!strncmp(p, "nopanic",7))
- panic_on_overflow = 0;
- if (!strncmp(p, "merge",5)) {
- iommu_merge = 1;
- force_iommu = 1;
- }
- if (!strncmp(p, "nomerge",7))
- iommu_merge = 0;
- if (!strncmp(p, "forcesac",8))
- iommu_sac_force = 1;
- if (!strncmp(p, "allowdac", 8))
- forbid_dac = 0;
- if (!strncmp(p, "nodac", 5))
- forbid_dac = -1;
-
-#ifdef CONFIG_SWIOTLB
- if (!strncmp(p, "soft",4))
- swiotlb = 1;
-#endif
-
-#ifdef CONFIG_GART_IOMMU
- gart_parse_options(p);
-#endif
-
-#ifdef CONFIG_CALGARY_IOMMU
- if (!strncmp(p, "calgary", 7))
- use_calgary = 1;
-#endif /* CONFIG_CALGARY_IOMMU */
-
- p += strcspn(p, ",");
- if (*p == ',')
- ++p;
- }
- return 0;
-}
-early_param("iommu", iommu_setup);
-
-void __init pci_iommu_alloc(void)
-{
- /*
- * The order of these functions is important for
- * fall-back/fail-over reasons
- */
-#ifdef CONFIG_GART_IOMMU
- gart_iommu_hole_init();
-#endif
-
-#ifdef CONFIG_CALGARY_IOMMU
- detect_calgary();
-#endif
-
- detect_intel_iommu();
-
-#ifdef CONFIG_SWIOTLB
- pci_swiotlb_init();
-#endif
-}
-
static int __init pci_iommu_init(void)
{
#ifdef CONFIG_CALGARY_IOMMU
@@ -327,6 +506,8 @@ void pci_iommu_shutdown(void)
{
gart_iommu_shutdown();
}
+/* Must execute after PCI subsystem */
+fs_initcall(pci_iommu_init);
#ifdef CONFIG_PCI
/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
@@ -334,11 +515,10 @@ void pci_iommu_shutdown(void)
static __devinit void via_no_dac(struct pci_dev *dev)
{
if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
- printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
+ printk(KERN_INFO "PCI: VIA PCI bridge detected."
+ "Disabling DAC.\n");
forbid_dac = 1;
}
}
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
#endif
-/* Must execute after PCI subsystem */
-fs_initcall(pci_iommu_init);
diff --git a/arch/x86/kernel/pci-dma_32.c b/arch/x86/kernel/pci-dma_32.c
deleted file mode 100644
index 51330321a5d3..000000000000
--- a/arch/x86/kernel/pci-dma_32.c
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Dynamic DMA mapping support.
- *
- * On i386 there is no hardware dynamic DMA address translation,
- * so consistent alloc/free are merely page allocation/freeing.
- * The rest of the dynamic DMA mapping interface is implemented
- * in asm/pci.h.
- */
-
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/string.h>
-#include <linux/pci.h>
-#include <linux/module.h>
-#include <asm/io.h>
-
-struct dma_coherent_mem {
- void *virt_base;
- u32 device_base;
- int size;
- int flags;
- unsigned long *bitmap;
-};
-
-void *dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, gfp_t gfp)
-{
- void *ret;
- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
- int order = get_order(size);
- /* ignore region specifiers */
- gfp &= ~(__GFP_DMA | __GFP_HIGHMEM);
-
- if (mem) {
- int page = bitmap_find_free_region(mem->bitmap, mem->size,
- order);
- if (page >= 0) {
- *dma_handle = mem->device_base + (page << PAGE_SHIFT);
- ret = mem->virt_base + (page << PAGE_SHIFT);
- memset(ret, 0, size);
- return ret;
- }
- if (mem->flags & DMA_MEMORY_EXCLUSIVE)
- return NULL;
- }
-
- if (dev == NULL || (dev->coherent_dma_mask < 0xffffffff))
- gfp |= GFP_DMA;
-
- ret = (void *)__get_free_pages(gfp, order);
-
- if (ret != NULL) {
- memset(ret, 0, size);
- *dma_handle = virt_to_phys(ret);
- }
- return ret;
-}
-EXPORT_SYMBOL(dma_alloc_coherent);
-
-void dma_free_coherent(struct device *dev, size_t size,
- void *vaddr, dma_addr_t dma_handle)
-{
- struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
- int order = get_order(size);
-
- WARN_ON(irqs_disabled()); /* for portability */
- if (mem && vaddr >= mem->virt_base && vaddr < (mem->virt_base + (mem->size << PAGE_SHIFT))) {
- int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
-
- bitmap_release_region(mem->bitmap, page, order);
- } else
- free_pages((unsigned long)vaddr, order);
-}
-EXPORT_SYMBOL(dma_free_coherent);
-
-int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
- dma_addr_t device_addr, size_t size, int flags)
-{
- void __iomem *mem_base = NULL;
- int pages = size >> PAGE_SHIFT;
- int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
-
- if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
- goto out;
- if (!size)
- goto out;
- if (dev->dma_mem)
- goto out;
-
- /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
-
- mem_base = ioremap(bus_addr, size);
- if (!mem_base)
- goto out;
-
- dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
- if (!dev->dma_mem)
- goto out;
- dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
- if (!dev->dma_mem->bitmap)
- goto free1_out;
-
- dev->dma_mem->virt_base = mem_base;
- dev->dma_mem->device_base = device_addr;
- dev->dma_mem->size = pages;
- dev->dma_mem->flags = flags;
-
- if (flags & DMA_MEMORY_MAP)
- return DMA_MEMORY_MAP;
-
- return DMA_MEMORY_IO;
-
- free1_out:
- kfree(dev->dma_mem);
- out:
- if (mem_base)
- iounmap(mem_base);
- return 0;
-}
-EXPORT_SYMBOL(dma_declare_coherent_memory);
-
-void dma_release_declared_memory(struct device *dev)
-{
- struct dma_coherent_mem *mem = dev->dma_mem;
-
- if(!mem)
- return;
- dev->dma_mem = NULL;
- iounmap(mem->virt_base);
- kfree(mem->bitmap);
- kfree(mem);
-}
-EXPORT_SYMBOL(dma_release_declared_memory);
-
-void *dma_mark_declared_memory_occupied(struct device *dev,
- dma_addr_t device_addr, size_t size)
-{
- struct dma_coherent_mem *mem = dev->dma_mem;
- int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1) >> PAGE_SHIFT;
- int pos, err;
-
- if (!mem)
- return ERR_PTR(-EINVAL);
-
- pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
- err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
- if (err != 0)
- return ERR_PTR(err);
- return mem->virt_base + (pos << PAGE_SHIFT);
-}
-EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
-
-#ifdef CONFIG_PCI
-/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
-
-int forbid_dac;
-EXPORT_SYMBOL(forbid_dac);
-
-static __devinit void via_no_dac(struct pci_dev *dev)
-{
- if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
- printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n");
- forbid_dac = 1;
- }
-}
-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
-
-static int check_iommu(char *s)
-{
- if (!strcmp(s, "usedac")) {
- forbid_dac = -1;
- return 1;
- }
- return 0;
-}
-__setup("iommu=", check_iommu);
-#endif
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 700e4647dd30..c07455d1695f 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -264,9 +264,9 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
}
static dma_addr_t
-gart_map_simple(struct device *dev, char *buf, size_t size, int dir)
+gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir)
{
- dma_addr_t map = dma_map_area(dev, virt_to_bus(buf), size, dir);
+ dma_addr_t map = dma_map_area(dev, paddr, size, dir);
flush_gart();
@@ -275,18 +275,17 @@ gart_map_simple(struct device *dev, char *buf, size_t size, int dir)
/* Map a single area into the IOMMU */
static dma_addr_t
-gart_map_single(struct device *dev, void *addr, size_t size, int dir)
+gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
{
- unsigned long phys_mem, bus;
+ unsigned long bus;
if (!dev)
dev = &fallback_dev;
- phys_mem = virt_to_phys(addr);
- if (!need_iommu(dev, phys_mem, size))
- return phys_mem;
+ if (!need_iommu(dev, paddr, size))
+ return paddr;
- bus = gart_map_simple(dev, addr, size, dir);
+ bus = gart_map_simple(dev, paddr, size, dir);
return bus;
}
diff --git a/arch/x86/kernel/pci-nommu_64.c b/arch/x86/kernel/pci-nommu.c
index ab08e1832228..aec43d56f49c 100644
--- a/arch/x86/kernel/pci-nommu_64.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -14,7 +14,7 @@
static int
check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
{
- if (hwdev && bus + size > *hwdev->dma_mask) {
+ if (hwdev && bus + size > *hwdev->dma_mask) {
if (*hwdev->dma_mask >= DMA_32BIT_MASK)
printk(KERN_ERR
"nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -26,19 +26,17 @@ check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
}
static dma_addr_t
-nommu_map_single(struct device *hwdev, void *ptr, size_t size,
+nommu_map_single(struct device *hwdev, phys_addr_t paddr, size_t size,
int direction)
{
- dma_addr_t bus = virt_to_bus(ptr);
+ dma_addr_t bus = paddr;
+ WARN_ON(size == 0);
if (!check_addr("map_single", hwdev, bus, size))
return bad_dma_address;
+ flush_write_buffers();
return bus;
}
-static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size,
- int direction)
-{
-}
/* Map a set of buffers described by scatterlist in streaming
* mode for DMA. This is the scatter-gather version of the
@@ -61,30 +59,34 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
struct scatterlist *s;
int i;
+ WARN_ON(nents == 0 || sg[0].length == 0);
+
for_each_sg(sg, s, nents, i) {
BUG_ON(!sg_page(s));
- s->dma_address = virt_to_bus(sg_virt(s));
+ s->dma_address = sg_phys(s);
if (!check_addr("map_sg", hwdev, s->dma_address, s->length))
return 0;
s->dma_length = s->length;
}
+ flush_write_buffers();
return nents;
}
-/* Unmap a set of streaming mode DMA translations.
- * Again, cpu read rules concerning calls here are the same as for
- * pci_unmap_single() above.
- */
-static void nommu_unmap_sg(struct device *dev, struct scatterlist *sg,
- int nents, int dir)
+/* Make sure we keep the same behaviour */
+static int nommu_mapping_error(dma_addr_t dma_addr)
{
+#ifdef CONFIG_X86_32
+ return 0;
+#else
+ return (dma_addr == bad_dma_address);
+#endif
}
+
const struct dma_mapping_ops nommu_dma_ops = {
.map_single = nommu_map_single,
- .unmap_single = nommu_unmap_single,
.map_sg = nommu_map_sg,
- .unmap_sg = nommu_unmap_sg,
+ .mapping_error = nommu_mapping_error,
.is_phys = 1,
};
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 82a0a674a003..490da7f4b8d0 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -11,11 +11,18 @@
int swiotlb __read_mostly;
+static dma_addr_t
+swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
+ int direction)
+{
+ return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
+}
+
const struct dma_mapping_ops swiotlb_dma_ops = {
.mapping_error = swiotlb_dma_mapping_error,
.alloc_coherent = swiotlb_alloc_coherent,
.free_coherent = swiotlb_free_coherent,
- .map_single = swiotlb_map_single,
+ .map_single = swiotlb_map_single_phys,
.unmap_single = swiotlb_unmap_single,
.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
.sync_single_for_device = swiotlb_sync_single_for_device,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
new file mode 100644
index 000000000000..67e9b4a1e89d
--- /dev/null
+++ b/arch/x86/kernel/process.c
@@ -0,0 +1,161 @@
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/pm.h>
+
+struct kmem_cache *task_xstate_cachep;
+
+int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
+{
+ *dst = *src;
+ if (src->thread.xstate) {
+ dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
+ GFP_KERNEL);
+ if (!dst->thread.xstate)
+ return -ENOMEM;
+ WARN_ON((unsigned long)dst->thread.xstate & 15);
+ memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
+ }
+ return 0;
+}
+
+void free_thread_xstate(struct task_struct *tsk)
+{
+ if (tsk->thread.xstate) {
+ kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
+ tsk->thread.xstate = NULL;
+ }
+}
+
+void free_thread_info(struct thread_info *ti)
+{
+ free_thread_xstate(ti->task);
+ free_pages((unsigned long)ti, get_order(THREAD_SIZE));
+}
+
+void arch_task_cache_init(void)
+{
+ task_xstate_cachep =
+ kmem_cache_create("task_xstate", xstate_size,
+ __alignof__(union thread_xstate),
+ SLAB_PANIC, NULL);
+}
+
+static void do_nothing(void *unused)
+{
+}
+
+/*
+ * cpu_idle_wait - Used to ensure that all the CPUs discard old value of
+ * pm_idle and update to new pm_idle value. Required while changing pm_idle
+ * handler on SMP systems.
+ *
+ * Caller must have changed pm_idle to the new value before the call. Old
+ * pm_idle value will not be used by any CPU after the return of this function.
+ */
+void cpu_idle_wait(void)
+{
+ smp_mb();
+ /* kick all the CPUs so that they exit out of pm_idle */
+ smp_call_function(do_nothing, NULL, 0, 1);
+}
+EXPORT_SYMBOL_GPL(cpu_idle_wait);
+
+/*
+ * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
+ * which can obviate IPI to trigger checking of need_resched.
+ * We execute MONITOR against need_resched and enter optimized wait state
+ * through MWAIT. Whenever someone changes need_resched, we would be woken
+ * up from MWAIT (without an IPI).
+ *
+ * New with Core Duo processors, MWAIT can take some hints based on CPU
+ * capability.
+ */
+void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
+{
+ if (!need_resched()) {
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (!need_resched())
+ __mwait(ax, cx);
+ }
+}
+
+/* Default MONITOR/MWAIT with no hints, used for default C1 state */
+static void mwait_idle(void)
+{
+ if (!need_resched()) {
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ smp_mb();
+ if (!need_resched())
+ __sti_mwait(0, 0);
+ else
+ local_irq_enable();
+ } else
+ local_irq_enable();
+}
+
+
+static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
+{
+ if (force_mwait)
+ return 1;
+ /* Any C1 states supported? */
+ return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
+}
+
+/*
+ * On SMP it's slightly faster (but much more power-consuming!)
+ * to poll the ->work.need_resched flag instead of waiting for the
+ * cross-CPU IPI to arrive. Use this option with caution.
+ */
+static void poll_idle(void)
+{
+ local_irq_enable();
+ cpu_relax();
+}
+
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+{
+ static int selected;
+
+ if (selected)
+ return;
+#ifdef CONFIG_X86_SMP
+ if (pm_idle == poll_idle && smp_num_siblings > 1) {
+ printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
+ " performance may degrade.\n");
+ }
+#endif
+ if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
+ /*
+ * Skip, if setup has overridden idle.
+ * One CPU supports mwait => All CPUs supports mwait
+ */
+ if (!pm_idle) {
+ printk(KERN_INFO "using mwait in idle threads.\n");
+ pm_idle = mwait_idle;
+ }
+ }
+ selected = 1;
+}
+
+static int __init idle_setup(char *str)
+{
+ if (!strcmp(str, "poll")) {
+ printk("using polling idle threads.\n");
+ pm_idle = poll_idle;
+ } else if (!strcmp(str, "mwait"))
+ force_mwait = 1;
+ else
+ return -1;
+
+ boot_option_idle_override = 1;
+ return 0;
+}
+early_param("idle", idle_setup);
+
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index be3c7a299f02..f8476dfbb60d 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -36,6 +36,7 @@
#include <linux/personality.h>
#include <linux/tick.h>
#include <linux/percpu.h>
+#include <linux/prctl.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -45,7 +46,6 @@
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/desc.h>
-#include <asm/vm86.h>
#ifdef CONFIG_MATH_EMULATION
#include <asm/math_emu.h>
#endif
@@ -82,7 +82,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
*/
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
-static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
void disable_hlt(void)
{
@@ -112,22 +111,13 @@ void default_idle(void)
*/
smp_mb();
- local_irq_disable();
- if (!need_resched()) {
- ktime_t t0, t1;
- u64 t0n, t1n;
-
- t0 = ktime_get();
- t0n = ktime_to_ns(t0);
+ if (!need_resched())
safe_halt(); /* enables interrupts racelessly */
- local_irq_disable();
- t1 = ktime_get();
- t1n = ktime_to_ns(t1);
- sched_clock_idle_wakeup_event(t1n - t0n);
- }
- local_irq_enable();
+ else
+ local_irq_enable();
current_thread_info()->status |= TS_POLLING;
} else {
+ local_irq_enable();
/* loop is done by the caller */
cpu_relax();
}
@@ -136,16 +126,6 @@ void default_idle(void)
EXPORT_SYMBOL(default_idle);
#endif
-/*
- * On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->work.need_resched flag instead of waiting for the
- * cross-CPU IPI to arrive. Use this option with caution.
- */
-static void poll_idle(void)
-{
- cpu_relax();
-}
-
#ifdef CONFIG_HOTPLUG_CPU
#include <asm/nmi.h>
/* We don't actually take CPU down, just spin without interrupts. */
@@ -190,9 +170,6 @@ void cpu_idle(void)
while (!need_resched()) {
void (*idle)(void);
- if (__get_cpu_var(cpu_idle_state))
- __get_cpu_var(cpu_idle_state) = 0;
-
check_pgt_cache();
rmb();
idle = pm_idle;
@@ -206,6 +183,7 @@ void cpu_idle(void)
if (cpu_is_offline(cpu))
play_dead();
+ local_irq_disable();
__get_cpu_var(irq_stat).idle_timestamp = jiffies;
idle();
}
@@ -216,122 +194,6 @@ void cpu_idle(void)
}
}
-static void do_nothing(void *unused)
-{
-}
-
-void cpu_idle_wait(void)
-{
- unsigned int cpu, this_cpu = get_cpu();
- cpumask_t map, tmp = current->cpus_allowed;
-
- set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
- put_cpu();
-
- cpus_clear(map);
- for_each_online_cpu(cpu) {
- per_cpu(cpu_idle_state, cpu) = 1;
- cpu_set(cpu, map);
- }
-
- __get_cpu_var(cpu_idle_state) = 0;
-
- wmb();
- do {
- ssleep(1);
- for_each_online_cpu(cpu) {
- if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
- cpu_clear(cpu, map);
- }
- cpus_and(map, map, cpu_online_map);
- /*
- * We waited 1 sec, if a CPU still did not call idle
- * it may be because it is in idle and not waking up
- * because it has nothing to do.
- * Give all the remaining CPUS a kick.
- */
- smp_call_function_mask(map, do_nothing, NULL, 0);
- } while (!cpus_empty(map));
-
- set_cpus_allowed(current, tmp);
-}
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-{
- if (!need_resched()) {
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __mwait(ax, cx);
- }
-}
-
-/* Default MONITOR/MWAIT with no hints, used for default C1 state */
-static void mwait_idle(void)
-{
- local_irq_enable();
- mwait_idle_with_hints(0, 0);
-}
-
-static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
-{
- if (force_mwait)
- return 1;
- /* Any C1 states supported? */
- return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
-}
-
-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
-{
- static int selected;
-
- if (selected)
- return;
-#ifdef CONFIG_X86_SMP
- if (pm_idle == poll_idle && smp_num_siblings > 1) {
- printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
- " performance may degrade.\n");
- }
-#endif
- if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
- /*
- * Skip, if setup has overridden idle.
- * One CPU supports mwait => All CPUs supports mwait
- */
- if (!pm_idle) {
- printk(KERN_INFO "using mwait in idle threads.\n");
- pm_idle = mwait_idle;
- }
- }
- selected = 1;
-}
-
-static int __init idle_setup(char *str)
-{
- if (!strcmp(str, "poll")) {
- printk("using polling idle threads.\n");
- pm_idle = poll_idle;
- } else if (!strcmp(str, "mwait"))
- force_mwait = 1;
- else
- return -1;
-
- boot_option_idle_override = 1;
- return 0;
-}
-early_param("idle", idle_setup);
-
void __show_registers(struct pt_regs *regs, int all)
{
unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
@@ -357,7 +219,7 @@ void __show_registers(struct pt_regs *regs, int all)
init_utsname()->version);
printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
- 0xffff & regs->cs, regs->ip, regs->flags,
+ (u16)regs->cs, regs->ip, regs->flags,
smp_processor_id());
print_symbol("EIP is at %s\n", regs->ip);
@@ -366,8 +228,7 @@ void __show_registers(struct pt_regs *regs, int all)
printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
regs->si, regs->di, regs->bp, sp);
printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
- regs->ds & 0xffff, regs->es & 0xffff,
- regs->fs & 0xffff, gs, ss);
+ (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
if (!all)
return;
@@ -538,11 +399,30 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
return err;
}
-#ifdef CONFIG_SECCOMP
+void
+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+{
+ __asm__("movl %0, %%gs" :: "r"(0));
+ regs->fs = 0;
+ set_fs(USER_DS);
+ regs->ds = __USER_DS;
+ regs->es = __USER_DS;
+ regs->ss = __USER_DS;
+ regs->cs = __USER_CS;
+ regs->ip = new_ip;
+ regs->sp = new_sp;
+ /*
+ * Free the old FP and other extended state
+ */
+ free_thread_xstate(current);
+}
+EXPORT_SYMBOL_GPL(start_thread);
+
static void hard_disable_TSC(void)
{
write_cr4(read_cr4() | X86_CR4_TSD);
}
+
void disable_TSC(void)
{
preempt_disable();
@@ -554,11 +434,47 @@ void disable_TSC(void)
hard_disable_TSC();
preempt_enable();
}
+
static void hard_enable_TSC(void)
{
write_cr4(read_cr4() & ~X86_CR4_TSD);
}
-#endif /* CONFIG_SECCOMP */
+
+static void enable_TSC(void)
+{
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_enable_TSC();
+ preempt_enable();
+}
+
+int get_tsc_mode(unsigned long adr)
+{
+ unsigned int val;
+
+ if (test_thread_flag(TIF_NOTSC))
+ val = PR_TSC_SIGSEGV;
+ else
+ val = PR_TSC_ENABLE;
+
+ return put_user(val, (unsigned int __user *)adr);
+}
+
+int set_tsc_mode(unsigned int val)
+{
+ if (val == PR_TSC_SIGSEGV)
+ disable_TSC();
+ else if (val == PR_TSC_ENABLE)
+ enable_TSC();
+ else
+ return -EINVAL;
+
+ return 0;
+}
static noinline void
__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
@@ -575,12 +491,12 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
/* we clear debugctl to make sure DS
* is not in use when we change it */
debugctl = 0;
- wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
+ update_debugctlmsr(0);
wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
}
if (next->debugctlmsr != debugctl)
- wrmsr(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr, 0);
+ update_debugctlmsr(next->debugctlmsr);
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
set_debugreg(next->debugreg0, 0);
@@ -592,7 +508,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
set_debugreg(next->debugreg7, 7);
}
-#ifdef CONFIG_SECCOMP
if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
test_tsk_thread_flag(next_p, TIF_NOTSC)) {
/* prev and next are different */
@@ -601,7 +516,6 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
else
hard_enable_TSC();
}
-#endif
#ifdef X86_BTS
if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
@@ -683,7 +597,7 @@ struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct
/* we're going to use this soon, after a few expensive things */
if (next_p->fpu_counter > 5)
- prefetch(&next->i387.fxsave);
+ prefetch(next->xstate);
/*
* Reload esp0.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3baf9b9f4c87..e2319f39988b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -36,6 +36,7 @@
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
+#include <linux/prctl.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -63,7 +64,6 @@ EXPORT_SYMBOL(boot_option_idle_override);
*/
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
-static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
@@ -106,34 +106,13 @@ void default_idle(void)
* test NEED_RESCHED:
*/
smp_mb();
- local_irq_disable();
- if (!need_resched()) {
- ktime_t t0, t1;
- u64 t0n, t1n;
-
- t0 = ktime_get();
- t0n = ktime_to_ns(t0);
+ if (!need_resched())
safe_halt(); /* enables interrupts racelessly */
- local_irq_disable();
- t1 = ktime_get();
- t1n = ktime_to_ns(t1);
- sched_clock_idle_wakeup_event(t1n - t0n);
- }
- local_irq_enable();
+ else
+ local_irq_enable();
current_thread_info()->status |= TS_POLLING;
}
-/*
- * On SMP it's slightly faster (but much more power-consuming!)
- * to poll the ->need_resched flag instead of waiting for the
- * cross-CPU IPI to arrive. Use this option with caution.
- */
-static void poll_idle(void)
-{
- local_irq_enable();
- cpu_relax();
-}
-
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);
@@ -173,9 +152,6 @@ void cpu_idle(void)
while (!need_resched()) {
void (*idle)(void);
- if (__get_cpu_var(cpu_idle_state))
- __get_cpu_var(cpu_idle_state) = 0;
-
rmb();
idle = pm_idle;
if (!idle)
@@ -203,131 +179,6 @@ void cpu_idle(void)
}
}
-static void do_nothing(void *unused)
-{
-}
-
-void cpu_idle_wait(void)
-{
- unsigned int cpu, this_cpu = get_cpu();
- cpumask_t map, tmp = current->cpus_allowed;
-
- set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
- put_cpu();
-
- cpus_clear(map);
- for_each_online_cpu(cpu) {
- per_cpu(cpu_idle_state, cpu) = 1;
- cpu_set(cpu, map);
- }
-
- __get_cpu_var(cpu_idle_state) = 0;
-
- wmb();
- do {
- ssleep(1);
- for_each_online_cpu(cpu) {
- if (cpu_isset(cpu, map) && !per_cpu(cpu_idle_state, cpu))
- cpu_clear(cpu, map);
- }
- cpus_and(map, map, cpu_online_map);
- /*
- * We waited 1 sec, if a CPU still did not call idle
- * it may be because it is in idle and not waking up
- * because it has nothing to do.
- * Give all the remaining CPUS a kick.
- */
- smp_call_function_mask(map, do_nothing, 0, 0);
- } while (!cpus_empty(map));
-
- set_cpus_allowed(current, tmp);
-}
-EXPORT_SYMBOL_GPL(cpu_idle_wait);
-
-/*
- * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
- * which can obviate IPI to trigger checking of need_resched.
- * We execute MONITOR against need_resched and enter optimized wait state
- * through MWAIT. Whenever someone changes need_resched, we would be woken
- * up from MWAIT (without an IPI).
- *
- * New with Core Duo processors, MWAIT can take some hints based on CPU
- * capability.
- */
-void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
-{
- if (!need_resched()) {
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __mwait(ax, cx);
- }
-}
-
-/* Default MONITOR/MWAIT with no hints, used for default C1 state */
-static void mwait_idle(void)
-{
- if (!need_resched()) {
- __monitor((void *)&current_thread_info()->flags, 0, 0);
- smp_mb();
- if (!need_resched())
- __sti_mwait(0, 0);
- else
- local_irq_enable();
- } else {
- local_irq_enable();
- }
-}
-
-
-static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
-{
- if (force_mwait)
- return 1;
- /* Any C1 states supported? */
- return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0;
-}
-
-void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
-{
- static int selected;
-
- if (selected)
- return;
-#ifdef CONFIG_X86_SMP
- if (pm_idle == poll_idle && smp_num_siblings > 1) {
- printk(KERN_WARNING "WARNING: polling idle and HT enabled,"
- " performance may degrade.\n");
- }
-#endif
- if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) {
- /*
- * Skip, if setup has overridden idle.
- * One CPU supports mwait => All CPUs supports mwait
- */
- if (!pm_idle) {
- printk(KERN_INFO "using mwait in idle threads.\n");
- pm_idle = mwait_idle;
- }
- }
- selected = 1;
-}
-
-static int __init idle_setup(char *str)
-{
- if (!strcmp(str, "poll")) {
- printk("using polling idle threads.\n");
- pm_idle = poll_idle;
- } else if (!strcmp(str, "mwait"))
- force_mwait = 1;
- else
- return -1;
-
- boot_option_idle_override = 1;
- return 0;
-}
-early_param("idle", idle_setup);
-
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
@@ -553,6 +404,83 @@ out:
return err;
}
+void
+start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
+{
+ asm volatile("movl %0, %%fs; movl %0, %%es; movl %0, %%ds" :: "r"(0));
+ load_gs_index(0);
+ regs->ip = new_ip;
+ regs->sp = new_sp;
+ write_pda(oldrsp, new_sp);
+ regs->cs = __USER_CS;
+ regs->ss = __USER_DS;
+ regs->flags = 0x200;
+ set_fs(USER_DS);
+ /*
+ * Free the old FP and other extended state
+ */
+ free_thread_xstate(current);
+}
+EXPORT_SYMBOL_GPL(start_thread);
+
+static void hard_disable_TSC(void)
+{
+ write_cr4(read_cr4() | X86_CR4_TSD);
+}
+
+void disable_TSC(void)
+{
+ preempt_disable();
+ if (!test_and_set_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_disable_TSC();
+ preempt_enable();
+}
+
+static void hard_enable_TSC(void)
+{
+ write_cr4(read_cr4() & ~X86_CR4_TSD);
+}
+
+static void enable_TSC(void)
+{
+ preempt_disable();
+ if (test_and_clear_thread_flag(TIF_NOTSC))
+ /*
+ * Must flip the CPU state synchronously with
+ * TIF_NOTSC in the current running context.
+ */
+ hard_enable_TSC();
+ preempt_enable();
+}
+
+int get_tsc_mode(unsigned long adr)
+{
+ unsigned int val;
+
+ if (test_thread_flag(TIF_NOTSC))
+ val = PR_TSC_SIGSEGV;
+ else
+ val = PR_TSC_ENABLE;
+
+ return put_user(val, (unsigned int __user *)adr);
+}
+
+int set_tsc_mode(unsigned int val)
+{
+ if (val == PR_TSC_SIGSEGV)
+ disable_TSC();
+ else if (val == PR_TSC_ENABLE)
+ enable_TSC();
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
/*
* This special macro can be used to load a debugging register
*/
@@ -573,12 +501,12 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
/* we clear debugctl to make sure DS
* is not in use when we change it */
debugctl = 0;
- wrmsrl(MSR_IA32_DEBUGCTLMSR, 0);
+ update_debugctlmsr(0);
wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr);
}
if (next->debugctlmsr != debugctl)
- wrmsrl(MSR_IA32_DEBUGCTLMSR, next->debugctlmsr);
+ update_debugctlmsr(next->debugctlmsr);
if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
loaddebug(next, 0);
@@ -590,6 +518,15 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
loaddebug(next, 7);
}
+ if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
+ test_tsk_thread_flag(next_p, TIF_NOTSC)) {
+ /* prev and next are different */
+ if (test_tsk_thread_flag(next_p, TIF_NOTSC))
+ hard_disable_TSC();
+ else
+ hard_enable_TSC();
+ }
+
if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
/*
* Copy the relevant range of the IO bitmap.
@@ -632,7 +569,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* we're going to use this soon, after a few expensive things */
if (next_p->fpu_counter>5)
- prefetch(&next->i387.fxsave);
+ prefetch(next->xstate);
/*
* Reload esp0, LDT and the page table pointer:
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index eb92ccbb3502..fb03ef380f0e 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1207,97 +1207,16 @@ static int genregs32_set(struct task_struct *target,
return ret;
}
-static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data)
+long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
+ compat_ulong_t caddr, compat_ulong_t cdata)
{
- siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t));
- compat_siginfo_t __user *si32 = compat_ptr(data);
- siginfo_t ssi;
- int ret;
-
- if (request == PTRACE_SETSIGINFO) {
- memset(&ssi, 0, sizeof(siginfo_t));
- ret = copy_siginfo_from_user32(&ssi, si32);
- if (ret)
- return ret;
- if (copy_to_user(si, &ssi, sizeof(siginfo_t)))
- return -EFAULT;
- }
- ret = sys_ptrace(request, pid, addr, (unsigned long)si);
- if (ret)
- return ret;
- if (request == PTRACE_GETSIGINFO) {
- if (copy_from_user(&ssi, si, sizeof(siginfo_t)))
- return -EFAULT;
- ret = copy_siginfo_to_user32(si32, &ssi);
- }
- return ret;
-}
-
-asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
-{
- struct task_struct *child;
- struct pt_regs *childregs;
+ unsigned long addr = caddr;
+ unsigned long data = cdata;
void __user *datap = compat_ptr(data);
int ret;
__u32 val;
switch (request) {
- case PTRACE_TRACEME:
- case PTRACE_ATTACH:
- case PTRACE_KILL:
- case PTRACE_CONT:
- case PTRACE_SINGLESTEP:
- case PTRACE_SINGLEBLOCK:
- case PTRACE_DETACH:
- case PTRACE_SYSCALL:
- case PTRACE_OLDSETOPTIONS:
- case PTRACE_SETOPTIONS:
- case PTRACE_SET_THREAD_AREA:
- case PTRACE_GET_THREAD_AREA:
-#ifdef X86_BTS
- case PTRACE_BTS_CONFIG:
- case PTRACE_BTS_STATUS:
- case PTRACE_BTS_SIZE:
- case PTRACE_BTS_GET:
- case PTRACE_BTS_CLEAR:
- case PTRACE_BTS_DRAIN:
-#endif
- return sys_ptrace(request, pid, addr, data);
-
- default:
- return -EINVAL;
-
- case PTRACE_PEEKTEXT:
- case PTRACE_PEEKDATA:
- case PTRACE_POKEDATA:
- case PTRACE_POKETEXT:
- case PTRACE_POKEUSR:
- case PTRACE_PEEKUSR:
- case PTRACE_GETREGS:
- case PTRACE_SETREGS:
- case PTRACE_SETFPREGS:
- case PTRACE_GETFPREGS:
- case PTRACE_SETFPXREGS:
- case PTRACE_GETFPXREGS:
- case PTRACE_GETEVENTMSG:
- break;
-
- case PTRACE_SETSIGINFO:
- case PTRACE_GETSIGINFO:
- return ptrace32_siginfo(request, pid, addr, data);
- }
-
- child = ptrace_get_task_struct(pid);
- if (IS_ERR(child))
- return PTR_ERR(child);
-
- ret = ptrace_check_attach(child, request == PTRACE_KILL);
- if (ret < 0)
- goto out;
-
- childregs = task_pt_regs(child);
-
- switch (request) {
case PTRACE_PEEKUSR:
ret = getreg32(child, addr, &val);
if (ret == 0)
@@ -1343,12 +1262,14 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data)
sizeof(struct user32_fxsr_struct),
datap);
+ case PTRACE_GET_THREAD_AREA:
+ case PTRACE_SET_THREAD_AREA:
+ return arch_ptrace(child, request, addr, data);
+
default:
return compat_ptrace_request(child, request, addr, data);
}
- out:
- put_task_struct(child);
return ret;
}
@@ -1456,7 +1377,6 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
/* notification of system call entry/exit
* - triggered by current->work.syscall_trace
*/
-__attribute__((regparm(3)))
int do_syscall_trace(struct pt_regs *regs, int entryexit)
{
int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 484c4a80d38a..a4a838306b2c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -1,5 +1,4 @@
#include <linux/module.h>
-#include <linux/init.h>
#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/pm.h>
@@ -9,6 +8,7 @@
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/hpet.h>
+#include <asm/pgtable.h>
#include <asm/reboot_fixups.h>
#include <asm/reboot.h>
@@ -16,7 +16,6 @@
# include <linux/dmi.h>
# include <linux/ctype.h>
# include <linux/mc146818rtc.h>
-# include <asm/pgtable.h>
#else
# include <asm/iommu.h>
#endif
@@ -276,7 +275,7 @@ void machine_real_restart(unsigned char *code, int length)
/* Remap the kernel at virtual address zero, as well as offset zero
from the kernel segment. This assumes the kernel segment starts at
virtual address PAGE_OFFSET. */
- memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
+ memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
/*
@@ -400,7 +399,7 @@ static void native_machine_emergency_restart(void)
}
}
-static void native_machine_shutdown(void)
+void native_machine_shutdown(void)
{
/* Stop the cpus and apics */
#ifdef CONFIG_SMP
@@ -412,16 +411,16 @@ static void native_machine_shutdown(void)
#ifdef CONFIG_X86_32
/* See if there has been given a command line override */
if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) &&
- cpu_isset(reboot_cpu, cpu_online_map))
+ cpu_online(reboot_cpu))
reboot_cpu_id = reboot_cpu;
#endif
/* Make certain the cpu I'm about to reboot on is online */
- if (!cpu_isset(reboot_cpu_id, cpu_online_map))
+ if (!cpu_online(reboot_cpu_id))
reboot_cpu_id = smp_processor_id();
/* Make certain I only run on the appropriate processor */
- set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
+ set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id));
/* O.K Now that I'm on the appropriate processor,
* stop all of the others.
@@ -471,7 +470,10 @@ struct machine_ops machine_ops = {
.shutdown = native_machine_shutdown,
.emergency_restart = native_machine_emergency_restart,
.restart = native_machine_restart,
- .halt = native_machine_halt
+ .halt = native_machine_halt,
+#ifdef CONFIG_KEXEC
+ .crash_shutdown = native_machine_crash_shutdown,
+#endif
};
void machine_power_off(void)
@@ -499,3 +501,9 @@ void machine_halt(void)
machine_ops.halt();
}
+#ifdef CONFIG_KEXEC
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+ machine_ops.crash_shutdown(regs);
+}
+#endif
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index f151d6fae462..c30fe25d470d 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -9,18 +9,19 @@
#include <linux/linkage.h>
#include <asm/page.h>
#include <asm/kexec.h>
+#include <asm/processor-flags.h>
+#include <asm/pgtable.h>
/*
* Must be relocatable PIC code callable as a C function
*/
#define PTR(x) (x << 2)
-#define PAGE_ALIGNED (1 << PAGE_SHIFT)
-#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
-#define PAE_PGD_ATTR 0x01 /* _PAGE_PRESENT */
+#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define PAE_PGD_ATTR (_PAGE_PRESENT)
.text
- .align PAGE_ALIGNED
+ .align PAGE_SIZE
.globl relocate_kernel
relocate_kernel:
movl 8(%esp), %ebp /* list of pages */
@@ -155,7 +156,7 @@ relocate_new_kernel:
movl %eax, %cr3
/* setup a new stack at the end of the physical control page */
- lea 4096(%edi), %esp
+ lea PAGE_SIZE(%edi), %esp
/* jump to identity mapped page */
movl %edi, %eax
@@ -168,16 +169,16 @@ identity_mapped:
pushl %edx
/* Set cr0 to a known state:
- * 31 0 == Paging disabled
- * 18 0 == Alignment check disabled
- * 16 0 == Write protect disabled
- * 3 0 == No task switch
- * 2 0 == Don't do FP software emulation.
- * 0 1 == Proctected mode enabled
+ * - Paging disabled
+ * - Alignment check disabled
+ * - Write protect disabled
+ * - No task switch
+ * - Don't do FP software emulation.
+ * - Proctected mode enabled
*/
movl %cr0, %eax
- andl $~((1<<31)|(1<<18)|(1<<16)|(1<<3)|(1<<2)), %eax
- orl $(1<<0), %eax
+ andl $~(X86_CR0_PG | X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %eax
+ orl $(X86_CR0_PE), %eax
movl %eax, %cr0
/* clear cr4 if applicable */
@@ -186,8 +187,7 @@ identity_mapped:
/* Set cr4 to a known state:
* Setting everything to zero seems safe.
*/
- movl %cr4, %eax
- andl $0, %eax
+ xorl %eax, %eax
movl %eax, %cr4
jmp 1f
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 14e95872c6a3..f5afe665a82b 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -9,17 +9,18 @@
#include <linux/linkage.h>
#include <asm/page.h>
#include <asm/kexec.h>
+#include <asm/processor-flags.h>
+#include <asm/pgtable.h>
/*
* Must be relocatable PIC code callable as a C function
*/
#define PTR(x) (x << 3)
-#define PAGE_ALIGNED (1 << PAGE_SHIFT)
-#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */
+#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
.text
- .align PAGE_ALIGNED
+ .align PAGE_SIZE
.code64
.globl relocate_kernel
relocate_kernel:
@@ -160,7 +161,7 @@ relocate_new_kernel:
movq %r9, %cr3
/* setup a new stack at the end of the physical control page */
- lea 4096(%r8), %rsp
+ lea PAGE_SIZE(%r8), %rsp
/* jump to identity mapped page */
addq $(identity_mapped - relocate_kernel), %r8
@@ -172,33 +173,22 @@ identity_mapped:
pushq %rdx
/* Set cr0 to a known state:
- * 31 1 == Paging enabled
- * 18 0 == Alignment check disabled
- * 16 0 == Write protect disabled
- * 3 0 == No task switch
- * 2 0 == Don't do FP software emulation.
- * 0 1 == Proctected mode enabled
+ * - Paging enabled
+ * - Alignment check disabled
+ * - Write protect disabled
+ * - No task switch
+ * - Don't do FP software emulation.
+ * - Proctected mode enabled
*/
movq %cr0, %rax
- andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
- orl $((1<<31)|(1<<0)), %eax
+ andq $~(X86_CR0_AM | X86_CR0_WP | X86_CR0_TS | X86_CR0_EM), %rax
+ orl $(X86_CR0_PG | X86_CR0_PE), %eax
movq %rax, %cr0
/* Set cr4 to a known state:
- * 10 0 == xmm exceptions disabled
- * 9 0 == xmm registers instructions disabled
- * 8 0 == performance monitoring counter disabled
- * 7 0 == page global disabled
- * 6 0 == machine check exceptions disabled
- * 5 1 == physical address extension enabled
- * 4 0 == page size extensions disabled
- * 3 0 == Debug extensions disabled
- * 2 0 == Time stamp disable (disabled)
- * 1 0 == Protected mode virtual interrupts disabled
- * 0 0 == VME disabled
+ * - physical address extension enabled
*/
-
- movq $((1<<5)), %rax
+ movq $X86_CR4_PAE, %rax
movq %rax, %cr4
jmp 1f
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index eb9b1a198f5e..9615eee9b775 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -9,7 +9,6 @@
#include <asm/vsyscall.h>
#ifdef CONFIG_X86_32
-# define CMOS_YEARS_OFFS 1900
/*
* This is a special lock that is owned by the CPU and holds the index
* register we are working with. It is required for NMI access to the
@@ -17,14 +16,11 @@
*/
volatile unsigned long cmos_lock = 0;
EXPORT_SYMBOL(cmos_lock);
-#else
-/*
- * x86-64 systems only exists since 2002.
- * This will work up to Dec 31, 2100
- */
-# define CMOS_YEARS_OFFS 2000
#endif
+/* For two digit years assume time is always after that */
+#define CMOS_YEARS_OFFS 2000
+
DEFINE_SPINLOCK(rtc_lock);
EXPORT_SYMBOL(rtc_lock);
@@ -98,7 +94,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
unsigned long mach_get_cmos_time(void)
{
- unsigned int year, mon, day, hour, min, sec, century = 0;
+ unsigned int status, year, mon, day, hour, min, sec, century = 0;
/*
* If UIP is clear, then we have >= 244 microseconds before
@@ -116,14 +112,16 @@ unsigned long mach_get_cmos_time(void)
mon = CMOS_READ(RTC_MONTH);
year = CMOS_READ(RTC_YEAR);
-#if defined(CONFIG_ACPI) && defined(CONFIG_X86_64)
- /* CHECKME: Is this really 64bit only ??? */
+#ifdef CONFIG_ACPI
if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
acpi_gbl_FADT.century)
century = CMOS_READ(acpi_gbl_FADT.century);
#endif
- if (RTC_ALWAYS_BCD || !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY)) {
+ status = CMOS_READ(RTC_CONTROL);
+ WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
+
+ if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
BCD_TO_BIN(sec);
BCD_TO_BIN(min);
BCD_TO_BIN(hour);
@@ -136,11 +134,8 @@ unsigned long mach_get_cmos_time(void)
BCD_TO_BIN(century);
year += century * 100;
printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
- } else {
+ } else
year += CMOS_YEARS_OFFS;
- if (year < 1970)
- year += 100;
- }
return mktime(year, mon, day, hour, min, sec);
}
@@ -151,8 +146,8 @@ unsigned char rtc_cmos_read(unsigned char addr)
unsigned char val;
lock_cmos_prefix(addr);
- outb_p(addr, RTC_PORT(0));
- val = inb_p(RTC_PORT(1));
+ outb(addr, RTC_PORT(0));
+ val = inb(RTC_PORT(1));
lock_cmos_suffix(addr);
return val;
}
@@ -161,8 +156,8 @@ EXPORT_SYMBOL(rtc_cmos_read);
void rtc_cmos_write(unsigned char val, unsigned char addr)
{
lock_cmos_prefix(addr);
- outb_p(addr, RTC_PORT(0));
- outb_p(val, RTC_PORT(1));
+ outb(addr, RTC_PORT(0));
+ outb(val, RTC_PORT(1));
lock_cmos_suffix(addr);
}
EXPORT_SYMBOL(rtc_cmos_write);
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
new file mode 100644
index 000000000000..c0c68c18a788
--- /dev/null
+++ b/arch/x86/kernel/setup.c
@@ -0,0 +1,137 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/percpu.h>
+#include <asm/smp.h>
+#include <asm/percpu.h>
+#include <asm/sections.h>
+#include <asm/processor.h>
+#include <asm/setup.h>
+#include <asm/topology.h>
+#include <asm/mpspec.h>
+#include <asm/apicdef.h>
+
+unsigned int num_processors;
+unsigned disabled_cpus __cpuinitdata;
+/* Processor that is doing the boot up */
+unsigned int boot_cpu_physical_apicid = -1U;
+EXPORT_SYMBOL(boot_cpu_physical_apicid);
+
+DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
+EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
+
+/* Bitmask of physically existing CPUs */
+physid_mask_t phys_cpu_present_map;
+
+#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_SMP)
+/*
+ * Copy data used in early init routines from the initial arrays to the
+ * per cpu data areas. These arrays then become expendable and the
+ * *_early_ptr's are zeroed indicating that the static arrays are gone.
+ */
+static void __init setup_per_cpu_maps(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ per_cpu(x86_cpu_to_apicid, cpu) = x86_cpu_to_apicid_init[cpu];
+ per_cpu(x86_bios_cpu_apicid, cpu) =
+ x86_bios_cpu_apicid_init[cpu];
+#ifdef CONFIG_NUMA
+ per_cpu(x86_cpu_to_node_map, cpu) =
+ x86_cpu_to_node_map_init[cpu];
+#endif
+ }
+
+ /* indicate the early static arrays will soon be gone */
+ x86_cpu_to_apicid_early_ptr = NULL;
+ x86_bios_cpu_apicid_early_ptr = NULL;
+#ifdef CONFIG_NUMA
+ x86_cpu_to_node_map_early_ptr = NULL;
+#endif
+}
+
+#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
+cpumask_t *cpumask_of_cpu_map __read_mostly;
+EXPORT_SYMBOL(cpumask_of_cpu_map);
+
+/* requires nr_cpu_ids to be initialized */
+static void __init setup_cpumask_of_cpu(void)
+{
+ int i;
+
+ /* alloc_bootmem zeroes memory */
+ cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
+ for (i = 0; i < nr_cpu_ids; i++)
+ cpu_set(i, cpumask_of_cpu_map[i]);
+}
+#else
+static inline void setup_cpumask_of_cpu(void) { }
+#endif
+
+#ifdef CONFIG_X86_32
+/*
+ * Great future not-so-futuristic plan: make i386 and x86_64 do it
+ * the same way
+ */
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+EXPORT_SYMBOL(__per_cpu_offset);
+#endif
+
+/*
+ * Great future plan:
+ * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
+ * Always point %gs to its beginning
+ */
+void __init setup_per_cpu_areas(void)
+{
+ int i, highest_cpu = 0;
+ unsigned long size;
+
+#ifdef CONFIG_HOTPLUG_CPU
+ prefill_possible_map();
+#endif
+
+ /* Copy section for each CPU (we discard the original) */
+ size = PERCPU_ENOUGH_ROOM;
+ printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
+ size);
+
+ for_each_possible_cpu(i) {
+ char *ptr;
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ ptr = alloc_bootmem_pages(size);
+#else
+ int node = early_cpu_to_node(i);
+ if (!node_online(node) || !NODE_DATA(node)) {
+ ptr = alloc_bootmem_pages(size);
+ printk(KERN_INFO
+ "cpu %d has no node or node-local memory\n", i);
+ }
+ else
+ ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+#endif
+ if (!ptr)
+ panic("Cannot allocate cpu data for CPU %d\n", i);
+#ifdef CONFIG_X86_64
+ cpu_pda(i)->data_offset = ptr - __per_cpu_start;
+#else
+ __per_cpu_offset[i] = ptr - __per_cpu_start;
+#endif
+ memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+
+ highest_cpu = i;
+ }
+
+ nr_cpu_ids = highest_cpu + 1;
+ printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d\n", NR_CPUS, nr_cpu_ids);
+
+ /* Setup percpu data maps */
+ setup_per_cpu_maps();
+
+ /* Setup cpumask_of_cpu map */
+ setup_cpumask_of_cpu();
+}
+
+#endif
diff --git a/arch/x86/kernel/setup64.c b/arch/x86/kernel/setup64.c
index e24c45677094..aee0e8200777 100644
--- a/arch/x86/kernel/setup64.c
+++ b/arch/x86/kernel/setup64.c
@@ -11,6 +11,7 @@
#include <linux/bootmem.h>
#include <linux/bitops.h>
#include <linux/module.h>
+#include <linux/kgdb.h>
#include <asm/pda.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -23,6 +24,7 @@
#include <asm/proto.h>
#include <asm/sections.h>
#include <asm/setup.h>
+#include <asm/genapic.h>
#ifndef CONFIG_DEBUG_BOOT_PARAMS
struct boot_params __initdata boot_params;
@@ -72,8 +74,8 @@ int force_personality32 = 0;
Control non executable heap for 32bit processes.
To control the stack too use noexec=off
-on PROT_READ does not imply PROT_EXEC for 32bit processes
-off PROT_READ implies PROT_EXEC (default)
+on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
+off PROT_READ implies PROT_EXEC
*/
static int __init nonx32_setup(char *str)
{
@@ -85,83 +87,6 @@ static int __init nonx32_setup(char *str)
}
__setup("noexec32=", nonx32_setup);
-/*
- * Copy data used in early init routines from the initial arrays to the
- * per cpu data areas. These arrays then become expendable and the
- * *_early_ptr's are zeroed indicating that the static arrays are gone.
- */
-static void __init setup_per_cpu_maps(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
-#ifdef CONFIG_SMP
- if (per_cpu_offset(cpu)) {
-#endif
- per_cpu(x86_cpu_to_apicid, cpu) =
- x86_cpu_to_apicid_init[cpu];
- per_cpu(x86_bios_cpu_apicid, cpu) =
- x86_bios_cpu_apicid_init[cpu];
-#ifdef CONFIG_NUMA
- per_cpu(x86_cpu_to_node_map, cpu) =
- x86_cpu_to_node_map_init[cpu];
-#endif
-#ifdef CONFIG_SMP
- }
- else
- printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
- cpu);
-#endif
- }
-
- /* indicate the early static arrays will soon be gone */
- x86_cpu_to_apicid_early_ptr = NULL;
- x86_bios_cpu_apicid_early_ptr = NULL;
-#ifdef CONFIG_NUMA
- x86_cpu_to_node_map_early_ptr = NULL;
-#endif
-}
-
-/*
- * Great future plan:
- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
- * Always point %gs to its beginning
- */
-void __init setup_per_cpu_areas(void)
-{
- int i;
- unsigned long size;
-
-#ifdef CONFIG_HOTPLUG_CPU
- prefill_possible_map();
-#endif
-
- /* Copy section for each CPU (we discard the original) */
- size = PERCPU_ENOUGH_ROOM;
-
- printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n", size);
- for_each_cpu_mask (i, cpu_possible_map) {
- char *ptr;
-#ifndef CONFIG_NEED_MULTIPLE_NODES
- ptr = alloc_bootmem_pages(size);
-#else
- int node = early_cpu_to_node(i);
-
- if (!node_online(node) || !NODE_DATA(node))
- ptr = alloc_bootmem_pages(size);
- else
- ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
-#endif
- if (!ptr)
- panic("Cannot allocate cpu data for CPU %d\n", i);
- cpu_pda(i)->data_offset = ptr - __per_cpu_start;
- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
- }
-
- /* setup percpu data maps early */
- setup_per_cpu_maps();
-}
-
void pda_init(int cpu)
{
struct x8664_pda *pda = cpu_pda(cpu);
@@ -327,6 +252,17 @@ void __cpuinit cpu_init (void)
load_TR_desc();
load_LDT(&init_mm.context);
+#ifdef CONFIG_KGDB
+ /*
+ * If the kgdb is connected no debug regs should be altered. This
+ * is only applicable when KGDB and a KGDB I/O module are built
+ * into the kernel and you are using early debugging with
+ * kgdbwait. KGDB will control the kernel HW breakpoint registers.
+ */
+ if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
+ else {
+#endif
/*
* Clear all 6 debug registers:
*/
@@ -337,8 +273,15 @@ void __cpuinit cpu_init (void)
set_debugreg(0UL, 3);
set_debugreg(0UL, 6);
set_debugreg(0UL, 7);
+#ifdef CONFIG_KGDB
+ /* If the kgdb is connected no debug regs should be altered. */
+ }
+#endif
fpu_init();
raw_local_save_flags(kernel_eflags);
+
+ if (is_uv_system())
+ uv_cpu_init();
}
diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c
index 2b3e5d45176b..2283422af794 100644
--- a/arch/x86/kernel/setup_32.c
+++ b/arch/x86/kernel/setup_32.c
@@ -39,6 +39,7 @@
#include <linux/efi.h>
#include <linux/init.h>
#include <linux/edd.h>
+#include <linux/iscsi_ibft.h>
#include <linux/nodemask.h>
#include <linux/kexec.h>
#include <linux/crash_dump.h>
@@ -46,6 +47,7 @@
#include <linux/pfn.h>
#include <linux/pci.h>
#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
#include <video/edid.h>
@@ -62,8 +64,9 @@
#include <asm/io.h>
#include <asm/vmi.h>
#include <setup_arch.h>
-#include <bios_ebda.h>
+#include <asm/bios_ebda.h>
#include <asm/cacheflush.h>
+#include <asm/processor.h>
/* This value is set up by the early boot code to point to the value
immediately after the boot time page tables. It contains a *physical*
@@ -154,6 +157,8 @@ struct cpuinfo_x86 new_cpu_data __cpuinitdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
EXPORT_SYMBOL(boot_cpu_data);
+unsigned int def_to_bigsmp;
+
#ifndef CONFIG_X86_PAE
unsigned long mmu_cr4_features;
#else
@@ -189,7 +194,7 @@ EXPORT_SYMBOL(ist_info);
extern void early_cpu_init(void);
extern int root_mountflags;
-unsigned long saved_videomode;
+unsigned long saved_video_mode;
#define RAMDISK_IMAGE_START_MASK 0x07FF
#define RAMDISK_PROMPT_FLAG 0x8000
@@ -227,7 +232,7 @@ static inline void copy_edd(void)
}
#endif
-int __initdata user_defined_memmap = 0;
+int __initdata user_defined_memmap;
/*
* "mem=nopentium" disables the 4MB page tables.
@@ -385,19 +390,58 @@ unsigned long __init find_max_low_pfn(void)
return max_low_pfn;
}
+#define BIOS_LOWMEM_KILOBYTES 0x413
+
/*
- * workaround for Dell systems that neglect to reserve EBDA
+ * The BIOS places the EBDA/XBDA at the top of conventional
+ * memory, and usually decreases the reported amount of
+ * conventional memory (int 0x12) too. This also contains a
+ * workaround for Dell systems that neglect to reserve EBDA.
+ * The same workaround also avoids a problem with the AMD768MPX
+ * chipset: reserve a page before VGA to prevent PCI prefetch
+ * into it (errata #56). Usually the page is reserved anyways,
+ * unless you have no PS/2 mouse plugged in.
*/
static void __init reserve_ebda_region(void)
{
- unsigned int addr;
- addr = get_bios_ebda();
- if (addr)
- reserve_bootmem(addr, PAGE_SIZE, BOOTMEM_DEFAULT);
+ unsigned int lowmem, ebda_addr;
+
+ /* To determine the position of the EBDA and the */
+ /* end of conventional memory, we need to look at */
+ /* the BIOS data area. In a paravirtual environment */
+ /* that area is absent. We'll just have to assume */
+ /* that the paravirt case can handle memory setup */
+ /* correctly, without our help. */
+ if (paravirt_enabled())
+ return;
+
+ /* end of low (conventional) memory */
+ lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES);
+ lowmem <<= 10;
+
+ /* start of EBDA area */
+ ebda_addr = get_bios_ebda();
+
+ /* Fixup: bios puts an EBDA in the top 64K segment */
+ /* of conventional memory, but does not adjust lowmem. */
+ if ((lowmem - ebda_addr) <= 0x10000)
+ lowmem = ebda_addr;
+
+ /* Fixup: bios does not report an EBDA at all. */
+ /* Some old Dells seem to need 4k anyhow (bugzilla 2990) */
+ if ((ebda_addr == 0) && (lowmem >= 0x9f000))
+ lowmem = 0x9f000;
+
+ /* Paranoia: should never happen, but... */
+ if ((lowmem == 0) || (lowmem >= 0x100000))
+ lowmem = 0x9f000;
+
+ /* reserve all memory between lowmem and the 1MB mark */
+ reserve_bootmem(lowmem, 0x100000 - lowmem, BOOTMEM_DEFAULT);
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
-void __init setup_bootmem_allocator(void);
+static void __init setup_bootmem_allocator(void);
static unsigned long __init setup_memory(void)
{
/*
@@ -432,7 +476,7 @@ static unsigned long __init setup_memory(void)
return max_low_pfn;
}
-void __init zone_sizes_init(void)
+static void __init zone_sizes_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
@@ -617,16 +661,9 @@ void __init setup_bootmem_allocator(void)
*/
reserve_bootmem(0, PAGE_SIZE, BOOTMEM_DEFAULT);
- /* reserve EBDA region, it's a 4K region */
+ /* reserve EBDA region */
reserve_ebda_region();
- /* could be an AMD 768MPX chipset. Reserve a page before VGA to prevent
- PCI prefetch into it (errata #56). Usually the page is reserved anyways,
- unless you have no PS/2 mouse plugged in. */
- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
- boot_cpu_data.x86 == 6)
- reserve_bootmem(0xa0000 - 4096, 4096, BOOTMEM_DEFAULT);
-
#ifdef CONFIG_SMP
/*
* But first pinch a few for the stack/trampoline stuff
@@ -652,6 +689,8 @@ void __init setup_bootmem_allocator(void)
#endif
numa_kva_reserve();
reserve_crashkernel();
+
+ reserve_ibft_region();
}
/*
@@ -687,6 +726,18 @@ char * __init __attribute__((weak)) memory_setup(void)
return machine_specific_memory_setup();
}
+#ifdef CONFIG_NUMA
+/*
+ * In the golden day, when everything among i386 and x86_64 will be
+ * integrated, this will not live here
+ */
+void *x86_cpu_to_node_map_early_ptr;
+int x86_cpu_to_node_map_init[NR_CPUS] = {
+ [0 ... NR_CPUS-1] = NUMA_NO_NODE
+};
+DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
+#endif
+
/*
* Determine if we were loaded by an EFI loader. If so, then we have also been
* passed the efi memmap, systab, etc., so we should use these data structures
@@ -714,7 +765,7 @@ void __init setup_arch(char **cmdline_p)
edid_info = boot_params.edid_info;
apm_info.bios = boot_params.apm_bios_info;
ist_info = boot_params.ist_info;
- saved_videomode = boot_params.hdr.vid_mode;
+ saved_video_mode = boot_params.hdr.vid_mode;
if( boot_params.sys_desc_table.length != 0 ) {
set_mca_bus(boot_params.sys_desc_table.table[3] & 0x2);
machine_id = boot_params.sys_desc_table.table[0];
@@ -763,13 +814,17 @@ void __init setup_arch(char **cmdline_p)
efi_init();
/* update e820 for memory not covered by WB MTRRs */
- find_max_pfn();
+ propagate_e820_map();
mtrr_bp_init();
if (mtrr_trim_uncached_memory(max_pfn))
- find_max_pfn();
+ propagate_e820_map();
max_low_pfn = setup_memory();
+#ifdef CONFIG_KVM_CLOCK
+ kvmclock_init();
+#endif
+
#ifdef CONFIG_VMI
/*
* Must be after max_low_pfn is determined, and before kernel
@@ -777,6 +832,7 @@ void __init setup_arch(char **cmdline_p)
*/
vmi_init();
#endif
+ kvm_guest_init();
/*
* NOTE: before this point _nobody_ is allowed to allocate
@@ -820,6 +876,18 @@ void __init setup_arch(char **cmdline_p)
io_delay_init();
+#ifdef CONFIG_X86_SMP
+ /*
+ * setup to use the early static init tables during kernel startup
+ * X86_SMP will exclude sub-arches that don't deal well with it.
+ */
+ x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
+ x86_bios_cpu_apicid_early_ptr = (void *)x86_bios_cpu_apicid_init;
+#ifdef CONFIG_NUMA
+ x86_cpu_to_node_map_early_ptr = (void *)x86_cpu_to_node_map_init;
+#endif
+#endif
+
#ifdef CONFIG_X86_GENERICARCH
generic_apic_probe();
#endif
diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c
index f4f7ecfb898c..a94fb959a87a 100644
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -33,6 +33,7 @@
#include <linux/acpi.h>
#include <linux/kallsyms.h>
#include <linux/edd.h>
+#include <linux/iscsi_ibft.h>
#include <linux/mmzone.h>
#include <linux/kexec.h>
#include <linux/cpufreq.h>
@@ -41,6 +42,7 @@
#include <linux/ctype.h>
#include <linux/uaccess.h>
#include <linux/init_ohci1394_dma.h>
+#include <linux/kvm_para.h>
#include <asm/mtrr.h>
#include <asm/uaccess.h>
@@ -58,7 +60,6 @@
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/setup.h>
-#include <asm/mach_apic.h>
#include <asm/numa.h>
#include <asm/sections.h>
#include <asm/dmi.h>
@@ -66,7 +67,9 @@
#include <asm/mce.h>
#include <asm/ds.h>
#include <asm/topology.h>
+#include <asm/trampoline.h>
+#include <mach_apic.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
@@ -114,7 +117,7 @@ extern int root_mountflags;
char __initdata command_line[COMMAND_LINE_SIZE];
-struct resource standard_io_resources[] = {
+static struct resource standard_io_resources[] = {
{ .name = "dma1", .start = 0x00, .end = 0x1f,
.flags = IORESOURCE_BUSY | IORESOURCE_IO },
{ .name = "pic1", .start = 0x20, .end = 0x21,
@@ -188,6 +191,7 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
e820_register_active_regions(0, start_pfn, end_pfn);
free_bootmem_with_active_regions(0, end_pfn);
+ early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}
#endif
@@ -248,6 +252,7 @@ static void __init reserve_crashkernel(void)
(unsigned long)(total_mem >> 20));
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
+ insert_resource(&iomem_resource, &crashk_res);
}
}
#else
@@ -261,6 +266,28 @@ void __attribute__((weak)) __init memory_setup(void)
machine_specific_memory_setup();
}
+static void __init parse_setup_data(void)
+{
+ struct setup_data *data;
+ unsigned long pa_data;
+
+ if (boot_params.hdr.version < 0x0209)
+ return;
+ pa_data = boot_params.hdr.setup_data;
+ while (pa_data) {
+ data = early_ioremap(pa_data, PAGE_SIZE);
+ switch (data->type) {
+ default:
+ break;
+ }
+#ifndef CONFIG_DEBUG_BOOT_PARAMS
+ free_early(pa_data, pa_data+sizeof(*data)+data->len);
+#endif
+ pa_data = data->next;
+ early_iounmap(data, PAGE_SIZE);
+ }
+}
+
/*
* setup_arch - architecture-specific boot-time initializations
*
@@ -313,6 +340,8 @@ void __init setup_arch(char **cmdline_p)
strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
+ parse_setup_data();
+
parse_early_param();
#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
@@ -322,6 +351,11 @@ void __init setup_arch(char **cmdline_p)
finish_e820_parsing();
+ /* after parse_early_param, so could debug it */
+ insert_resource(&iomem_resource, &code_resource);
+ insert_resource(&iomem_resource, &data_resource);
+ insert_resource(&iomem_resource, &bss_resource);
+
early_gart_iommu_check();
e820_register_active_regions(0, 0, -1UL);
@@ -341,14 +375,20 @@ void __init setup_arch(char **cmdline_p)
check_efer();
- init_memory_mapping(0, (end_pfn_map << PAGE_SHIFT));
+ max_pfn_mapped = init_memory_mapping(0, (max_pfn_mapped << PAGE_SHIFT));
if (efi_enabled)
efi_init();
+ vsmp_init();
+
dmi_scan_machine();
io_delay_init();
+#ifdef CONFIG_KVM_CLOCK
+ kvmclock_init();
+#endif
+
#ifdef CONFIG_SMP
/* setup to use the early static init tables during kernel startup */
x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init;
@@ -387,7 +427,7 @@ void __init setup_arch(char **cmdline_p)
contig_initmem_init(0, end_pfn);
#endif
- early_res_to_bootmem();
+ dma32_reserve_bootmem();
#ifdef CONFIG_ACPI_SLEEP
/*
@@ -411,11 +451,14 @@ void __init setup_arch(char **cmdline_p)
unsigned long end_of_mem = end_pfn << PAGE_SHIFT;
if (ramdisk_end <= end_of_mem) {
- reserve_bootmem_generic(ramdisk_image, ramdisk_size);
+ /*
+ * don't need to reserve again, already reserved early
+ * in x86_64_start_kernel, and early_res_to_bootmem
+ * convert that to reserved in bootmem
+ */
initrd_start = ramdisk_image + PAGE_OFFSET;
initrd_end = initrd_start+ramdisk_size;
} else {
- /* Assumes everything on node 0 */
free_bootmem(ramdisk_image, ramdisk_size);
printk(KERN_ERR "initrd extends beyond end of memory "
"(0x%08lx > 0x%08lx)\ndisabling initrd\n",
@@ -425,6 +468,9 @@ void __init setup_arch(char **cmdline_p)
}
#endif
reserve_crashkernel();
+
+ reserve_ibft_region();
+
paging_init();
map_vsyscall();
@@ -447,10 +493,12 @@ void __init setup_arch(char **cmdline_p)
init_apic_mappings();
ioapic_init_mappings();
+ kvm_guest_init();
+
/*
* We trust e820 completely. No explicit ROM probing in memory.
*/
- e820_reserve_resources(&code_resource, &data_resource, &bss_resource);
+ e820_reserve_resources();
e820_mark_nosave_regions();
/* request I/O space for devices used on all i[345]86 PCs */
@@ -552,9 +600,9 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
bits = c->x86_coreid_bits;
/* Low order bits define the core id (index of core in socket) */
- c->cpu_core_id = c->phys_proc_id & ((1 << bits)-1);
- /* Convert the APIC ID into the socket ID */
- c->phys_proc_id = phys_pkg_id(bits);
+ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
+ /* Convert the initial APIC ID into the socket ID */
+ c->phys_proc_id = c->initial_apicid >> bits;
#ifdef CONFIG_NUMA
node = c->phys_proc_id;
@@ -571,7 +619,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
If that doesn't result in a usable node fall back to the
path for the previous case. */
- int ht_nodeid = apicid - (cpu_data(0).phys_proc_id << bits);
+ int ht_nodeid = c->initial_apicid;
if (ht_nodeid >= 0 &&
apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
@@ -677,7 +725,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
/* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
- clear_bit(0*32+31, (unsigned long *)&c->x86_capability);
+ clear_cpu_cap(c, 0*32+31);
/* On C+ stepping K8 rep microcode works well for copy/memset */
level = cpuid_eax(1);
@@ -721,6 +769,19 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
if (amd_apic_timer_broken())
disable_apic_timer = 1;
+
+ if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+ unsigned long long tseg;
+
+ /*
+ * Split up direct mapping around the TSEG SMM area.
+ * Don't do it for gbpages because there seems very little
+ * benefit in doing so.
+ */
+ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg) &&
+ (tseg >> PMD_SHIFT) < (max_pfn_mapped >> (PMD_SHIFT-PAGE_SHIFT)))
+ set_memory_4k((unsigned long)__va(tseg), 1);
+ }
}
void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -813,7 +874,7 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
{
if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
(c->x86 == 0x6 && c->x86_model >= 0x0e))
- set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
}
static void __cpuinit init_intel(struct cpuinfo_x86 *c)
@@ -856,9 +917,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
if (c->x86 == 15)
c->x86_cache_alignment = c->x86_clflush_size * 2;
- if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
- (c->x86 == 0x6 && c->x86_model >= 0x0e))
- set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
if (c->x86 == 6)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -867,6 +925,32 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
srat_detect_node();
}
+static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
+{
+ if (c->x86 == 0x6 && c->x86_model >= 0xf)
+ set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability);
+}
+
+static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
+{
+ /* Cache sizes */
+ unsigned n;
+
+ n = c->extended_cpuid_level;
+ if (n >= 0x80000008) {
+ unsigned eax = cpuid_eax(0x80000008);
+ c->x86_virt_bits = (eax >> 8) & 0xff;
+ c->x86_phys_bits = eax & 0xff;
+ }
+
+ if (c->x86 == 0x6 && c->x86_model >= 0xf) {
+ c->x86_cache_alignment = c->x86_clflush_size * 2;
+ set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ }
+ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+}
+
static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
@@ -875,6 +959,8 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
c->x86_vendor = X86_VENDOR_AMD;
else if (!strcmp(v, "GenuineIntel"))
c->x86_vendor = X86_VENDOR_INTEL;
+ else if (!strcmp(v, "CentaurHauls"))
+ c->x86_vendor = X86_VENDOR_CENTAUR;
else
c->x86_vendor = X86_VENDOR_UNKNOWN;
}
@@ -922,15 +1008,16 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
c->x86 += (tfms >> 20) & 0xff;
if (c->x86 >= 0x6)
c->x86_model += ((tfms >> 16) & 0xF) << 4;
- if (c->x86_capability[0] & (1<<19))
+ if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
} else {
/* Have CPUID level 0 only - unheard of */
c->x86 = 4;
}
+ c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
#ifdef CONFIG_SMP
- c->phys_proc_id = (cpuid_ebx(1) >> 24) & 0xff;
+ c->phys_proc_id = c->initial_apicid;
#endif
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
@@ -956,12 +1043,22 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level >= 0x80000007)
c->x86_power = cpuid_edx(0x80000007);
+
+ clear_cpu_cap(c, X86_FEATURE_PAT);
+
switch (c->x86_vendor) {
case X86_VENDOR_AMD:
early_init_amd(c);
+ if (c->x86 >= 0xf && c->x86 <= 0x11)
+ set_cpu_cap(c, X86_FEATURE_PAT);
break;
case X86_VENDOR_INTEL:
early_init_intel(c);
+ if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15))
+ set_cpu_cap(c, X86_FEATURE_PAT);
+ break;
+ case X86_VENDOR_CENTAUR:
+ early_init_centaur(c);
break;
}
@@ -999,6 +1096,10 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
init_intel(c);
break;
+ case X86_VENDOR_CENTAUR:
+ init_centaur(c);
+ break;
+
case X86_VENDOR_UNKNOWN:
default:
display_cacheinfo(c);
@@ -1028,14 +1129,24 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
#endif
select_idle_routine(c);
- if (c != &boot_cpu_data)
- mtrr_ap_init();
#ifdef CONFIG_NUMA
numa_add_cpu(smp_processor_id());
#endif
}
+void __cpuinit identify_boot_cpu(void)
+{
+ identify_cpu(&boot_cpu_data);
+}
+
+void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
+{
+ BUG_ON(c == &boot_cpu_data);
+ identify_cpu(c);
+ mtrr_ap_init();
+}
+
static __init int setup_noclflush(char *arg)
{
setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
@@ -1064,123 +1175,3 @@ static __init int setup_disablecpuid(char *arg)
return 1;
}
__setup("clearcpuid=", setup_disablecpuid);
-
-/*
- * Get CPU information for use by the procfs.
- */
-
-static int show_cpuinfo(struct seq_file *m, void *v)
-{
- struct cpuinfo_x86 *c = v;
- int cpu = 0, i;
-
-#ifdef CONFIG_SMP
- cpu = c->cpu_index;
-#endif
-
- seq_printf(m, "processor\t: %u\n"
- "vendor_id\t: %s\n"
- "cpu family\t: %d\n"
- "model\t\t: %d\n"
- "model name\t: %s\n",
- (unsigned)cpu,
- c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
- c->x86,
- (int)c->x86_model,
- c->x86_model_id[0] ? c->x86_model_id : "unknown");
-
- if (c->x86_mask || c->cpuid_level >= 0)
- seq_printf(m, "stepping\t: %d\n", c->x86_mask);
- else
- seq_printf(m, "stepping\t: unknown\n");
-
- if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = cpufreq_quick_get((unsigned)cpu);
-
- if (!freq)
- freq = cpu_khz;
- seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
- freq / 1000, (freq % 1000));
- }
-
- /* Cache size */
- if (c->x86_cache_size >= 0)
- seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size);
-
-#ifdef CONFIG_SMP
- if (smp_num_siblings * c->x86_max_cores > 1) {
- seq_printf(m, "physical id\t: %d\n", c->phys_proc_id);
- seq_printf(m, "siblings\t: %d\n",
- cpus_weight(per_cpu(cpu_core_map, cpu)));
- seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id);
- seq_printf(m, "cpu cores\t: %d\n", c->booted_cores);
- }
-#endif
-
- seq_printf(m,
- "fpu\t\t: yes\n"
- "fpu_exception\t: yes\n"
- "cpuid level\t: %d\n"
- "wp\t\t: yes\n"
- "flags\t\t:",
- c->cpuid_level);
-
- for (i = 0; i < 32*NCAPINTS; i++)
- if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
- seq_printf(m, " %s", x86_cap_flags[i]);
-
- seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
- c->loops_per_jiffy/(500000/HZ),
- (c->loops_per_jiffy/(5000/HZ)) % 100);
-
- if (c->x86_tlbsize > 0)
- seq_printf(m, "TLB size\t: %d 4K pages\n", c->x86_tlbsize);
- seq_printf(m, "clflush size\t: %d\n", c->x86_clflush_size);
- seq_printf(m, "cache_alignment\t: %d\n", c->x86_cache_alignment);
-
- seq_printf(m, "address sizes\t: %u bits physical, %u bits virtual\n",
- c->x86_phys_bits, c->x86_virt_bits);
-
- seq_printf(m, "power management:");
- for (i = 0; i < 32; i++) {
- if (c->x86_power & (1 << i)) {
- if (i < ARRAY_SIZE(x86_power_flags) &&
- x86_power_flags[i])
- seq_printf(m, "%s%s",
- x86_power_flags[i][0]?" ":"",
- x86_power_flags[i]);
- else
- seq_printf(m, " [%d]", i);
- }
- }
-
- seq_printf(m, "\n\n");
-
- return 0;
-}
-
-static void *c_start(struct seq_file *m, loff_t *pos)
-{
- if (*pos == 0) /* just in case, cpu 0 is not the first */
- *pos = first_cpu(cpu_online_map);
- if ((*pos) < NR_CPUS && cpu_online(*pos))
- return &cpu_data(*pos);
- return NULL;
-}
-
-static void *c_next(struct seq_file *m, void *v, loff_t *pos)
-{
- *pos = next_cpu(*pos, cpu_online_map);
- return c_start(m, pos);
-}
-
-static void c_stop(struct seq_file *m, void *v)
-{
-}
-
-const struct seq_operations cpuinfo_op = {
- .start = c_start,
- .next = c_next,
- .stop = c_stop,
- .show = show_cpuinfo,
-};
diff --git a/arch/x86/kernel/sigframe_32.h b/arch/x86/kernel/sigframe.h
index 0b2221711dad..72bbb519d2dc 100644
--- a/arch/x86/kernel/sigframe_32.h
+++ b/arch/x86/kernel/sigframe.h
@@ -1,5 +1,5 @@
-struct sigframe
-{
+#ifdef CONFIG_X86_32
+struct sigframe {
char __user *pretcode;
int sig;
struct sigcontext sc;
@@ -8,8 +8,7 @@ struct sigframe
char retcode[8];
};
-struct rt_sigframe
-{
+struct rt_sigframe {
char __user *pretcode;
int sig;
struct siginfo __user *pinfo;
@@ -19,3 +18,10 @@ struct rt_sigframe
struct _fpstate fpstate;
char retcode[8];
};
+#else
+struct rt_sigframe {
+ char __user *pretcode;
+ struct ucontext uc;
+ struct siginfo info;
+};
+#endif
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 0157a6f0f41f..8e05e7f7bd40 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -4,32 +4,44 @@
* 1997-11-28 Modified for POSIX.1b signals by Richard Henderson
* 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes
*/
+#include <linux/list.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
+#include <linux/personality.h>
+#include <linux/binfmts.h>
+#include <linux/suspend.h>
#include <linux/kernel.h>
+#include <linux/ptrace.h>
#include <linux/signal.h>
+#include <linux/stddef.h>
+#include <linux/unistd.h>
#include <linux/errno.h>
+#include <linux/sched.h>
#include <linux/wait.h>
-#include <linux/unistd.h>
-#include <linux/stddef.h>
-#include <linux/personality.h>
-#include <linux/suspend.h>
-#include <linux/ptrace.h>
#include <linux/elf.h>
-#include <linux/binfmts.h>
+#include <linux/smp.h>
+#include <linux/mm.h>
+
#include <asm/processor.h>
#include <asm/ucontext.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/vdso.h>
-#include "sigframe_32.h"
-#define DEBUG_SIG 0
+#include "sigframe.h"
#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
+ X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
+ X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
+ X86_EFLAGS_CF)
+
+#ifdef CONFIG_X86_32
+# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF)
+#else
+# define FIX_EFLAGS __FIX_EFLAGS
+#endif
+
/*
* Atomically swap in the new signal mask, and wait for a signal.
*/
@@ -46,10 +58,11 @@ sys_sigsuspend(int history0, int history1, old_sigset_t mask)
current->state = TASK_INTERRUPTIBLE;
schedule();
set_thread_flag(TIF_RESTORE_SIGMASK);
+
return -ERESTARTNOHAND;
}
-asmlinkage int
+asmlinkage int
sys_sigaction(int sig, const struct old_sigaction __user *act,
struct old_sigaction __user *oact)
{
@@ -58,10 +71,12 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
if (act) {
old_sigset_t mask;
+
if (!access_ok(VERIFY_READ, act, sizeof(*act)) ||
__get_user(new_ka.sa.sa_handler, &act->sa_handler) ||
__get_user(new_ka.sa.sa_restorer, &act->sa_restorer))
return -EFAULT;
+
__get_user(new_ka.sa.sa_flags, &act->sa_flags);
__get_user(mask, &act->sa_mask);
siginitset(&new_ka.sa.sa_mask, mask);
@@ -74,6 +89,7 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
__put_user(old_ka.sa.sa_handler, &oact->sa_handler) ||
__put_user(old_ka.sa.sa_restorer, &oact->sa_restorer))
return -EFAULT;
+
__put_user(old_ka.sa.sa_flags, &oact->sa_flags);
__put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask);
}
@@ -81,10 +97,12 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
return ret;
}
-asmlinkage int
-sys_sigaltstack(unsigned long bx)
+asmlinkage int sys_sigaltstack(unsigned long bx)
{
- /* This is needed to make gcc realize it doesn't own the "struct pt_regs" */
+ /*
+ * This is needed to make gcc realize it doesn't own the
+ * "struct pt_regs"
+ */
struct pt_regs *regs = (struct pt_regs *)&bx;
const stack_t __user *uss = (const stack_t __user *)bx;
stack_t __user *uoss = (stack_t __user *)regs->cx;
@@ -96,9 +114,9 @@ sys_sigaltstack(unsigned long bx)
/*
* Do a signal return; undo the signal stack.
*/
-
static int
-restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax)
+restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
+ unsigned long *pax)
{
unsigned int err = 0;
@@ -120,37 +138,29 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
#define GET_SEG(seg) \
{ unsigned short tmp; \
err |= __get_user(tmp, &sc->seg); \
- loadsegment(seg,tmp); }
-
-#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_RF | \
- X86_EFLAGS_OF | X86_EFLAGS_DF | \
- X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
- X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)
+ loadsegment(seg, tmp); }
GET_SEG(gs);
COPY_SEG(fs);
COPY_SEG(es);
COPY_SEG(ds);
- COPY(di);
- COPY(si);
- COPY(bp);
- COPY(sp);
- COPY(bx);
- COPY(dx);
- COPY(cx);
- COPY(ip);
+ COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
+ COPY(dx); COPY(cx); COPY(ip);
COPY_SEG_STRICT(cs);
COPY_SEG_STRICT(ss);
-
+
{
unsigned int tmpflags;
+
err |= __get_user(tmpflags, &sc->flags);
- regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+ regs->flags = (regs->flags & ~FIX_EFLAGS) |
+ (tmpflags & FIX_EFLAGS);
regs->orig_ax = -1; /* disable syscall checks */
}
{
- struct _fpstate __user * buf;
+ struct _fpstate __user *buf;
+
err |= __get_user(buf, &sc->fpstate);
if (buf) {
if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
@@ -158,6 +168,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
err |= restore_i387(buf);
} else {
struct task_struct *me = current;
+
if (used_math()) {
clear_fpu(me);
clear_used_math();
@@ -165,24 +176,26 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, int *peax
}
}
- err |= __get_user(*peax, &sc->ax);
+ err |= __get_user(*pax, &sc->ax);
return err;
badframe:
return 1;
}
-asmlinkage int sys_sigreturn(unsigned long __unused)
+asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
{
- struct pt_regs *regs = (struct pt_regs *) &__unused;
- struct sigframe __user *frame = (struct sigframe __user *)(regs->sp - 8);
+ struct sigframe __user *frame;
+ struct pt_regs *regs;
+ unsigned long ax;
sigset_t set;
- int ax;
+
+ regs = (struct pt_regs *) &__unused;
+ frame = (struct sigframe __user *)(regs->sp - 8);
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
- if (__get_user(set.sig[0], &frame->sc.oldmask)
- || (_NSIG_WORDS > 1
+ if (__get_user(set.sig[0], &frame->sc.oldmask) || (_NSIG_WORDS > 1
&& __copy_from_user(&set.sig[1], &frame->extramask,
sizeof(frame->extramask))))
goto badframe;
@@ -192,33 +205,35 @@ asmlinkage int sys_sigreturn(unsigned long __unused)
current->blocked = set;
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
-
+
if (restore_sigcontext(regs, &frame->sc, &ax))
goto badframe;
return ax;
badframe:
if (show_unhandled_signals && printk_ratelimit()) {
- printk("%s%s[%d] bad frame in sigreturn frame:%p ip:%lx"
- " sp:%lx oeax:%lx",
+ printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:"
+ "%p ip:%lx sp:%lx oeax:%lx",
task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
current->comm, task_pid_nr(current), frame, regs->ip,
regs->sp, regs->orig_ax);
print_vma_addr(" in ", regs->ip);
- printk("\n");
+ printk(KERN_CONT "\n");
}
force_sig(SIGSEGV, current);
+
return 0;
-}
+}
asmlinkage int sys_rt_sigreturn(unsigned long __unused)
{
- struct pt_regs *regs = (struct pt_regs *) &__unused;
- struct rt_sigframe __user *frame = (struct rt_sigframe __user *)(regs->sp - 4);
+ struct pt_regs *regs = (struct pt_regs *)&__unused;
+ struct rt_sigframe __user *frame;
+ unsigned long ax;
sigset_t set;
- int ax;
+ frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
@@ -229,7 +244,7 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused)
current->blocked = set;
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
-
+
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
goto badframe;
@@ -241,12 +256,11 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused)
badframe:
force_sig(SIGSEGV, current);
return 0;
-}
+}
/*
* Set up a signal frame.
*/
-
static int
setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
struct pt_regs *regs, unsigned long mask)
@@ -277,9 +291,9 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
tmp = save_i387(fpstate);
if (tmp < 0)
- err = 1;
+ err = 1;
else
- err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
+ err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
/* non-iBCS2 extensions.. */
err |= __put_user(mask, &sc->oldmask);
@@ -292,7 +306,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
* Determine which stack to use..
*/
static inline void __user *
-get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
+get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
{
unsigned long sp;
@@ -310,32 +324,30 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs * regs, size_t frame_size)
if (ka->sa.sa_flags & SA_ONSTACK) {
if (sas_ss_flags(sp) == 0)
sp = current->sas_ss_sp + current->sas_ss_size;
- }
-
- /* This is the legacy signal stack switching. */
- else if ((regs->ss & 0xffff) != __USER_DS &&
- !(ka->sa.sa_flags & SA_RESTORER) &&
- ka->sa.sa_restorer) {
- sp = (unsigned long) ka->sa.sa_restorer;
+ } else {
+ /* This is the legacy signal stack switching. */
+ if ((regs->ss & 0xffff) != __USER_DS &&
+ !(ka->sa.sa_flags & SA_RESTORER) &&
+ ka->sa.sa_restorer)
+ sp = (unsigned long) ka->sa.sa_restorer;
}
sp -= frame_size;
- /* Align the stack pointer according to the i386 ABI,
- * i.e. so that on function entry ((sp + 4) & 15) == 0. */
+ /*
+ * Align the stack pointer according to the i386 ABI,
+ * i.e. so that on function entry ((sp + 4) & 15) == 0.
+ */
sp = ((sp + 4) & -16ul) - 4;
+
return (void __user *) sp;
}
-/* These symbols are defined with the addresses in the vsyscall page.
- See vsyscall-sigreturn.S. */
-extern void __user __kernel_sigreturn;
-extern void __user __kernel_rt_sigreturn;
-
-static int setup_frame(int sig, struct k_sigaction *ka,
- sigset_t *set, struct pt_regs * regs)
+static int
+setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
+ struct pt_regs *regs)
{
- void __user *restorer;
struct sigframe __user *frame;
+ void __user *restorer;
int err = 0;
int usig;
@@ -365,7 +377,7 @@ static int setup_frame(int sig, struct k_sigaction *ka,
goto give_sigsegv;
}
- if (current->binfmt->hasvdso)
+ if (current->mm->context.vdso)
restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn);
else
restorer = &frame->retcode;
@@ -374,9 +386,9 @@ static int setup_frame(int sig, struct k_sigaction *ka,
/* Set up to return from userspace. */
err |= __put_user(restorer, &frame->pretcode);
-
+
/*
- * This is popl %eax ; movl $,%eax ; int $0x80
+ * This is popl %eax ; movl $__NR_sigreturn, %eax ; int $0x80
*
* WE DO NOT USE IT ANY MORE! It's only left here for historical
* reasons and because gdb uses it as a signature to notice
@@ -390,32 +402,17 @@ static int setup_frame(int sig, struct k_sigaction *ka,
goto give_sigsegv;
/* Set up registers for signal handler */
- regs->sp = (unsigned long) frame;
- regs->ip = (unsigned long) ka->sa.sa_handler;
- regs->ax = (unsigned long) sig;
- regs->dx = (unsigned long) 0;
- regs->cx = (unsigned long) 0;
+ regs->sp = (unsigned long)frame;
+ regs->ip = (unsigned long)ka->sa.sa_handler;
+ regs->ax = (unsigned long)sig;
+ regs->dx = 0;
+ regs->cx = 0;
regs->ds = __USER_DS;
regs->es = __USER_DS;
regs->ss = __USER_DS;
regs->cs = __USER_CS;
- /*
- * Clear TF when entering the signal handler, but
- * notify any tracer that was single-stepping it.
- * The tracer may want to single-step inside the
- * handler too.
- */
- regs->flags &= ~(TF_MASK | X86_EFLAGS_DF);
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
-
-#if DEBUG_SIG
- printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
- current->comm, current->pid, frame, regs->ip, frame->pretcode);
-#endif
-
return 0;
give_sigsegv:
@@ -424,10 +421,10 @@ give_sigsegv:
}
static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
- sigset_t *set, struct pt_regs * regs)
+ sigset_t *set, struct pt_regs *regs)
{
- void __user *restorer;
struct rt_sigframe __user *frame;
+ void __user *restorer;
int err = 0;
int usig;
@@ -457,7 +454,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
&frame->uc.uc_stack.ss_flags);
err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
- regs, set->sig[0]);
+ regs, set->sig[0]);
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
if (err)
goto give_sigsegv;
@@ -467,9 +464,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
if (ka->sa.sa_flags & SA_RESTORER)
restorer = ka->sa.sa_restorer;
err |= __put_user(restorer, &frame->pretcode);
-
+
/*
- * This is movl $,%ax ; int $0x80
+ * This is movl $__NR_rt_sigreturn, %ax ; int $0x80
*
* WE DO NOT USE IT ANY MORE! It's only left here for historical
* reasons and because gdb uses it as a signature to notice
@@ -483,32 +480,17 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
goto give_sigsegv;
/* Set up registers for signal handler */
- regs->sp = (unsigned long) frame;
- regs->ip = (unsigned long) ka->sa.sa_handler;
- regs->ax = (unsigned long) usig;
- regs->dx = (unsigned long) &frame->info;
- regs->cx = (unsigned long) &frame->uc;
+ regs->sp = (unsigned long)frame;
+ regs->ip = (unsigned long)ka->sa.sa_handler;
+ regs->ax = (unsigned long)usig;
+ regs->dx = (unsigned long)&frame->info;
+ regs->cx = (unsigned long)&frame->uc;
regs->ds = __USER_DS;
regs->es = __USER_DS;
regs->ss = __USER_DS;
regs->cs = __USER_CS;
- /*
- * Clear TF when entering the signal handler, but
- * notify any tracer that was single-stepping it.
- * The tracer may want to single-step inside the
- * handler too.
- */
- regs->flags &= ~(TF_MASK | X86_EFLAGS_DF);
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
-
-#if DEBUG_SIG
- printk("SIG deliver (%s:%d): sp=%p pc=%p ra=%p\n",
- current->comm, current->pid, frame, regs->ip, frame->pretcode);
-#endif
-
return 0;
give_sigsegv:
@@ -517,33 +499,33 @@ give_sigsegv:
}
/*
- * OK, we're invoking a handler
- */
-
+ * OK, we're invoking a handler:
+ */
static int
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
- sigset_t *oldset, struct pt_regs * regs)
+ sigset_t *oldset, struct pt_regs *regs)
{
int ret;
/* Are we from a system call? */
- if (regs->orig_ax >= 0) {
+ if ((long)regs->orig_ax >= 0) {
/* If so, check system call restarting.. */
switch (regs->ax) {
- case -ERESTART_RESTARTBLOCK:
- case -ERESTARTNOHAND:
+ case -ERESTART_RESTARTBLOCK:
+ case -ERESTARTNOHAND:
+ regs->ax = -EINTR;
+ break;
+
+ case -ERESTARTSYS:
+ if (!(ka->sa.sa_flags & SA_RESTART)) {
regs->ax = -EINTR;
break;
-
- case -ERESTARTSYS:
- if (!(ka->sa.sa_flags & SA_RESTART)) {
- regs->ax = -EINTR;
- break;
- }
- /* fallthrough */
- case -ERESTARTNOINTR:
- regs->ax = regs->orig_ax;
- regs->ip -= 2;
+ }
+ /* fallthrough */
+ case -ERESTARTNOINTR:
+ regs->ax = regs->orig_ax;
+ regs->ip -= 2;
+ break;
}
}
@@ -561,16 +543,32 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
else
ret = setup_frame(sig, ka, oldset, regs);
- if (ret == 0) {
- spin_lock_irq(&current->sighand->siglock);
- sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
- if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&current->blocked,sig);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- }
+ if (ret)
+ return ret;
- return ret;
+ /*
+ * Clear the direction flag as per the ABI for function entry.
+ */
+ regs->flags &= ~X86_EFLAGS_DF;
+
+ /*
+ * Clear TF when entering the signal handler, but
+ * notify any tracer that was single-stepping it.
+ * The tracer may want to single-step inside the
+ * handler too.
+ */
+ regs->flags &= ~X86_EFLAGS_TF;
+ if (test_thread_flag(TIF_SINGLESTEP))
+ ptrace_notify(SIGTRAP);
+
+ spin_lock_irq(&current->sighand->siglock);
+ sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
+ if (!(ka->sa.sa_flags & SA_NODEFER))
+ sigaddset(&current->blocked, sig);
+ recalc_sigpending();
+ spin_unlock_irq(&current->sighand->siglock);
+
+ return 0;
}
/*
@@ -580,18 +578,17 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
*/
static void do_signal(struct pt_regs *regs)
{
+ struct k_sigaction ka;
siginfo_t info;
int signr;
- struct k_sigaction ka;
sigset_t *oldset;
/*
- * We want the common case to go fast, which
- * is why we may in certain cases get here from
- * kernel mode. Just return without doing anything
- * if so. vm86 regs switched out by assembly code
- * before reaching here, so testing against kernel
- * CS suffices.
+ * We want the common case to go fast, which is why we may in certain
+ * cases get here from kernel mode. Just return without doing anything
+ * if so.
+ * X86_32: vm86 regs switched out by assembly code before reaching
+ * here, so testing against kernel CS suffices.
*/
if (!user_mode(regs))
return;
@@ -603,29 +600,31 @@ static void do_signal(struct pt_regs *regs)
signr = get_signal_to_deliver(&info, &ka, regs, NULL);
if (signr > 0) {
- /* Re-enable any watchpoints before delivering the
+ /*
+ * Re-enable any watchpoints before delivering the
* signal to user space. The processor register will
* have been cleared if the watchpoint triggered
* inside the kernel.
*/
- if (unlikely(current->thread.debugreg7))
+ if (current->thread.debugreg7)
set_debugreg(current->thread.debugreg7, 7);
- /* Whee! Actually deliver the signal. */
+ /* Whee! Actually deliver the signal. */
if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
- /* a signal was successfully delivered; the saved
+ /*
+ * a signal was successfully delivered; the saved
* sigmask will have been stored in the signal frame,
* and will be restored by sigreturn, so we can simply
- * clear the TIF_RESTORE_SIGMASK flag */
+ * clear the TIF_RESTORE_SIGMASK flag
+ */
if (test_thread_flag(TIF_RESTORE_SIGMASK))
clear_thread_flag(TIF_RESTORE_SIGMASK);
}
-
return;
}
/* Did we come from a system call? */
- if (regs->orig_ax >= 0) {
+ if ((long)regs->orig_ax >= 0) {
/* Restart the system call - no handlers present */
switch (regs->ax) {
case -ERESTARTNOHAND:
@@ -642,8 +641,10 @@ static void do_signal(struct pt_regs *regs)
}
}
- /* if there's no signal to deliver, we just put the saved sigmask
- * back */
+ /*
+ * If there's no signal to deliver, we just put the saved sigmask
+ * back.
+ */
if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
clear_thread_flag(TIF_RESTORE_SIGMASK);
sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
@@ -654,13 +655,12 @@ static void do_signal(struct pt_regs *regs)
* notification of userspace execution resumption
* - triggered by the TIF_WORK_MASK flags
*/
-__attribute__((regparm(3)))
-void do_notify_resume(struct pt_regs *regs, void *_unused,
- __u32 thread_info_flags)
+void
+do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
/* Pending single-step? */
if (thread_info_flags & _TIF_SINGLESTEP) {
- regs->flags |= TF_MASK;
+ regs->flags |= X86_EFLAGS_TF;
clear_thread_flag(TIF_SINGLESTEP);
}
@@ -670,6 +670,6 @@ void do_notify_resume(struct pt_regs *regs, void *_unused,
if (thread_info_flags & _TIF_HRTICK_RESCHED)
hrtick_resched();
-
+
clear_thread_flag(TIF_IRET);
}
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index 1c83e5124c65..ccb2a4560c2d 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -19,17 +19,28 @@
#include <linux/stddef.h>
#include <linux/personality.h>
#include <linux/compiler.h>
+#include <asm/processor.h>
#include <asm/ucontext.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
#include <asm/proto.h>
#include <asm/ia32_unistd.h>
#include <asm/mce.h>
-
-/* #define DEBUG_SIG 1 */
+#include "sigframe.h"
#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
+#define __FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
+ X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
+ X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
+ X86_EFLAGS_CF)
+
+#ifdef CONFIG_X86_32
+# define FIX_EFLAGS (__FIX_EFLAGS | X86_EFLAGS_RF)
+#else
+# define FIX_EFLAGS __FIX_EFLAGS
+#endif
+
int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs * regs);
int ia32_setup_frame(int sig, struct k_sigaction *ka,
@@ -46,16 +57,9 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
/*
* Do a signal return; undo the signal stack.
*/
-
-struct rt_sigframe
-{
- char __user *pretcode;
- struct ucontext uc;
- struct siginfo info;
-};
-
static int
-restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *prax)
+restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
+ unsigned long *pax)
{
unsigned int err = 0;
@@ -87,7 +91,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
{
unsigned int tmpflags;
err |= __get_user(tmpflags, &sc->flags);
- regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5);
+ regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
regs->orig_ax = -1; /* disable syscall checks */
}
@@ -108,7 +112,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned
}
}
- err |= __get_user(*prax, &sc->ax);
+ err |= __get_user(*pax, &sc->ax);
return err;
badframe:
@@ -121,13 +125,11 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
sigset_t set;
unsigned long ax;
- frame = (struct rt_sigframe __user *)(regs->sp - 8);
- if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) {
+ frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
+ if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
goto badframe;
- }
- if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set))) {
+ if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
goto badframe;
- }
sigdelsetmask(&set, ~_BLOCKABLE);
spin_lock_irq(&current->sighand->siglock);
@@ -138,10 +140,6 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
goto badframe;
-#ifdef DEBUG_SIG
- printk("%d sigreturn ip:%lx sp:%lx frame:%p ax:%lx\n",current->pid,regs->ip,regs->sp,frame,ax);
-#endif
-
if (do_sigaltstack(&frame->uc.uc_stack, NULL, regs->sp) == -EFAULT)
goto badframe;
@@ -270,10 +268,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
if (err)
goto give_sigsegv;
-#ifdef DEBUG_SIG
- printk("%d old ip %lx old sp %lx old ax %lx\n", current->pid,regs->ip,regs->sp,regs->ax);
-#endif
-
/* Set up registers for signal handler */
regs->di = sig;
/* In case the signal handler was declared without prototypes */
@@ -291,18 +285,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
even if the handler happens to be interrupting 32-bit code. */
regs->cs = __USER_CS;
- /* This, by contrast, has nothing to do with segment registers -
- see include/asm-x86_64/uaccess.h for details. */
- set_fs(USER_DS);
-
- regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF);
- if (test_thread_flag(TIF_SINGLESTEP))
- ptrace_notify(SIGTRAP);
-#ifdef DEBUG_SIG
- printk("SIG deliver (%s:%d): sp=%p pc=%lx ra=%p\n",
- current->comm, current->pid, frame, regs->ip, frame->pretcode);
-#endif
-
return 0;
give_sigsegv:
@@ -345,35 +327,29 @@ static long current_syscall_ret(struct pt_regs *regs)
static int
handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
- sigset_t *oldset, struct pt_regs *regs)
+ sigset_t *oldset, struct pt_regs *regs)
{
int ret;
-#ifdef DEBUG_SIG
- printk("handle_signal pid:%d sig:%lu ip:%lx sp:%lx regs=%p\n",
- current->pid, sig,
- regs->ip, regs->sp, regs);
-#endif
-
/* Are we from a system call? */
if (current_syscall(regs) >= 0) {
/* If so, check system call restarting.. */
switch (current_syscall_ret(regs)) {
- case -ERESTART_RESTARTBLOCK:
- case -ERESTARTNOHAND:
- regs->ax = -EINTR;
- break;
+ case -ERESTART_RESTARTBLOCK:
+ case -ERESTARTNOHAND:
+ regs->ax = -EINTR;
+ break;
- case -ERESTARTSYS:
- if (!(ka->sa.sa_flags & SA_RESTART)) {
- regs->ax = -EINTR;
- break;
- }
- /* fallthrough */
- case -ERESTARTNOINTR:
- regs->ax = regs->orig_ax;
- regs->ip -= 2;
+ case -ERESTARTSYS:
+ if (!(ka->sa.sa_flags & SA_RESTART)) {
+ regs->ax = -EINTR;
break;
+ }
+ /* fallthrough */
+ case -ERESTARTNOINTR:
+ regs->ax = regs->orig_ax;
+ regs->ip -= 2;
+ break;
}
}
@@ -396,6 +372,28 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
ret = setup_rt_frame(sig, ka, info, oldset, regs);
if (ret == 0) {
+ /*
+ * This has nothing to do with segment registers,
+ * despite the name. This magic affects uaccess.h
+ * macros' behavior. Reset it to the normal setting.
+ */
+ set_fs(USER_DS);
+
+ /*
+ * Clear the direction flag as per the ABI for function entry.
+ */
+ regs->flags &= ~X86_EFLAGS_DF;
+
+ /*
+ * Clear TF when entering the signal handler, but
+ * notify any tracer that was single-stepping it.
+ * The tracer may want to single-step inside the
+ * handler too.
+ */
+ regs->flags &= ~X86_EFLAGS_TF;
+ if (test_thread_flag(TIF_SINGLESTEP))
+ ptrace_notify(SIGTRAP);
+
spin_lock_irq(&current->sighand->siglock);
sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
if (!(ka->sa.sa_flags & SA_NODEFER))
@@ -420,10 +418,11 @@ static void do_signal(struct pt_regs *regs)
sigset_t *oldset;
/*
- * We want the common case to go fast, which
- * is why we may in certain cases get here from
- * kernel mode. Just return without doing anything
+ * We want the common case to go fast, which is why we may in certain
+ * cases get here from kernel mode. Just return without doing anything
* if so.
+ * X86_32: vm86 regs switched out by assembly code before reaching
+ * here, so testing against kernel CS suffices.
*/
if (!user_mode(regs))
return;
@@ -473,22 +472,19 @@ static void do_signal(struct pt_regs *regs)
}
}
- /* if there's no signal to deliver, we just put the saved sigmask
- back. */
+ /*
+ * If there's no signal to deliver, we just put the saved sigmask
+ * back.
+ */
if (test_thread_flag(TIF_RESTORE_SIGMASK)) {
clear_thread_flag(TIF_RESTORE_SIGMASK);
sigprocmask(SIG_SETMASK, &current->saved_sigmask, NULL);
}
}
-void
-do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
+void do_notify_resume(struct pt_regs *regs, void *unused,
+ __u32 thread_info_flags)
{
-#ifdef DEBUG_SIG
- printk("do_notify_resume flags:%x ip:%lx sp:%lx caller:%p pending:%x\n",
- thread_info_flags, regs->ip, regs->sp, __builtin_return_address(0),signal_pending(current));
-#endif
-
/* Pending single-step? */
if (thread_info_flags & _TIF_SINGLESTEP) {
regs->flags |= X86_EFLAGS_TF;
@@ -502,7 +498,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
#endif /* CONFIG_X86_MCE */
/* deal with pending signal delivery */
- if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
+ if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
do_signal(regs);
if (thread_info_flags & _TIF_HRTICK_RESCHED)
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
new file mode 100644
index 000000000000..8f75893a6467
--- /dev/null
+++ b/arch/x86/kernel/smp.c
@@ -0,0 +1,343 @@
+/*
+ * Intel SMP support routines.
+ *
+ * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
+ * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
+ * (c) 2002,2003 Andi Kleen, SuSE Labs.
+ *
+ * i386 and x86_64 integration by Glauber Costa <gcosta@redhat.com>
+ *
+ * This code is released under the GNU General Public License version 2 or
+ * later.
+ */
+
+#include <linux/init.h>
+
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/kernel_stat.h>
+#include <linux/mc146818rtc.h>
+#include <linux/cache.h>
+#include <linux/interrupt.h>
+#include <linux/cpu.h>
+
+#include <asm/mtrr.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/proto.h>
+#include <mach_ipi.h>
+#include <mach_apic.h>
+/*
+ * Some notes on x86 processor bugs affecting SMP operation:
+ *
+ * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
+ * The Linux implications for SMP are handled as follows:
+ *
+ * Pentium III / [Xeon]
+ * None of the E1AP-E3AP errata are visible to the user.
+ *
+ * E1AP. see PII A1AP
+ * E2AP. see PII A2AP
+ * E3AP. see PII A3AP
+ *
+ * Pentium II / [Xeon]
+ * None of the A1AP-A3AP errata are visible to the user.
+ *
+ * A1AP. see PPro 1AP
+ * A2AP. see PPro 2AP
+ * A3AP. see PPro 7AP
+ *
+ * Pentium Pro
+ * None of 1AP-9AP errata are visible to the normal user,
+ * except occasional delivery of 'spurious interrupt' as trap #15.
+ * This is very rare and a non-problem.
+ *
+ * 1AP. Linux maps APIC as non-cacheable
+ * 2AP. worked around in hardware
+ * 3AP. fixed in C0 and above steppings microcode update.
+ * Linux does not use excessive STARTUP_IPIs.
+ * 4AP. worked around in hardware
+ * 5AP. symmetric IO mode (normal Linux operation) not affected.
+ * 'noapic' mode has vector 0xf filled out properly.
+ * 6AP. 'noapic' mode might be affected - fixed in later steppings
+ * 7AP. We do not assume writes to the LVT deassering IRQs
+ * 8AP. We do not enable low power mode (deep sleep) during MP bootup
+ * 9AP. We do not use mixed mode
+ *
+ * Pentium
+ * There is a marginal case where REP MOVS on 100MHz SMP
+ * machines with B stepping processors can fail. XXX should provide
+ * an L1cache=Writethrough or L1cache=off option.
+ *
+ * B stepping CPUs may hang. There are hardware work arounds
+ * for this. We warn about it in case your board doesn't have the work
+ * arounds. Basically that's so I can tell anyone with a B stepping
+ * CPU and SMP problems "tough".
+ *
+ * Specific items [From Pentium Processor Specification Update]
+ *
+ * 1AP. Linux doesn't use remote read
+ * 2AP. Linux doesn't trust APIC errors
+ * 3AP. We work around this
+ * 4AP. Linux never generated 3 interrupts of the same priority
+ * to cause a lost local interrupt.
+ * 5AP. Remote read is never used
+ * 6AP. not affected - worked around in hardware
+ * 7AP. not affected - worked around in hardware
+ * 8AP. worked around in hardware - we get explicit CS errors if not
+ * 9AP. only 'noapic' mode affected. Might generate spurious
+ * interrupts, we log only the first one and count the
+ * rest silently.
+ * 10AP. not affected - worked around in hardware
+ * 11AP. Linux reads the APIC between writes to avoid this, as per
+ * the documentation. Make sure you preserve this as it affects
+ * the C stepping chips too.
+ * 12AP. not affected - worked around in hardware
+ * 13AP. not affected - worked around in hardware
+ * 14AP. we always deassert INIT during bootup
+ * 15AP. not affected - worked around in hardware
+ * 16AP. not affected - worked around in hardware
+ * 17AP. not affected - worked around in hardware
+ * 18AP. not affected - worked around in hardware
+ * 19AP. not affected - worked around in BIOS
+ *
+ * If this sounds worrying believe me these bugs are either ___RARE___,
+ * or are signal timing bugs worked around in hardware and there's
+ * about nothing of note with C stepping upwards.
+ */
+
+/*
+ * this function sends a 'reschedule' IPI to another CPU.
+ * it goes straight through and wastes no time serializing
+ * anything. Worst case is that we lose a reschedule ...
+ */
+static void native_smp_send_reschedule(int cpu)
+{
+ if (unlikely(cpu_is_offline(cpu))) {
+ WARN_ON(1);
+ return;
+ }
+ send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
+}
+
+/*
+ * Structure and data for smp_call_function(). This is designed to minimise
+ * static memory requirements. It also looks cleaner.
+ */
+static DEFINE_SPINLOCK(call_lock);
+
+struct call_data_struct {
+ void (*func) (void *info);
+ void *info;
+ atomic_t started;
+ atomic_t finished;
+ int wait;
+};
+
+void lock_ipi_call_lock(void)
+{
+ spin_lock_irq(&call_lock);
+}
+
+void unlock_ipi_call_lock(void)
+{
+ spin_unlock_irq(&call_lock);
+}
+
+static struct call_data_struct *call_data;
+
+static void __smp_call_function(void (*func) (void *info), void *info,
+ int nonatomic, int wait)
+{
+ struct call_data_struct data;
+ int cpus = num_online_cpus() - 1;
+
+ if (!cpus)
+ return;
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ call_data = &data;
+ mb();
+
+ /* Send a message to all other CPUs and wait for them to respond */
+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+
+ /* Wait for response */
+ while (atomic_read(&data.started) != cpus)
+ cpu_relax();
+
+ if (wait)
+ while (atomic_read(&data.finished) != cpus)
+ cpu_relax();
+}
+
+
+/**
+ * smp_call_function_mask(): Run a function on a set of other CPUs.
+ * @mask: The set of cpus to run on. Must not include the current cpu.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+static int
+native_smp_call_function_mask(cpumask_t mask,
+ void (*func)(void *), void *info,
+ int wait)
+{
+ struct call_data_struct data;
+ cpumask_t allbutself;
+ int cpus;
+
+ /* Can deadlock when called with interrupts disabled */
+ WARN_ON(irqs_disabled());
+
+ /* Holding any lock stops cpus from going down. */
+ spin_lock(&call_lock);
+
+ allbutself = cpu_online_map;
+ cpu_clear(smp_processor_id(), allbutself);
+
+ cpus_and(mask, mask, allbutself);
+ cpus = cpus_weight(mask);
+
+ if (!cpus) {
+ spin_unlock(&call_lock);
+ return 0;
+ }
+
+ data.func = func;
+ data.info = info;
+ atomic_set(&data.started, 0);
+ data.wait = wait;
+ if (wait)
+ atomic_set(&data.finished, 0);
+
+ call_data = &data;
+ wmb();
+
+ /* Send a message to other CPUs */
+ if (cpus_equal(mask, allbutself))
+ send_IPI_allbutself(CALL_FUNCTION_VECTOR);
+ else
+ send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
+
+ /* Wait for response */
+ while (atomic_read(&data.started) != cpus)
+ cpu_relax();
+
+ if (wait)
+ while (atomic_read(&data.finished) != cpus)
+ cpu_relax();
+ spin_unlock(&call_lock);
+
+ return 0;
+}
+
+static void stop_this_cpu(void *dummy)
+{
+ local_irq_disable();
+ /*
+ * Remove this CPU:
+ */
+ cpu_clear(smp_processor_id(), cpu_online_map);
+ disable_local_APIC();
+ if (hlt_works(smp_processor_id()))
+ for (;;) halt();
+ for (;;);
+}
+
+/*
+ * this function calls the 'stop' function on all other CPUs in the system.
+ */
+
+static void native_smp_send_stop(void)
+{
+ int nolock;
+ unsigned long flags;
+
+ if (reboot_force)
+ return;
+
+ /* Don't deadlock on the call lock in panic */
+ nolock = !spin_trylock(&call_lock);
+ local_irq_save(flags);
+ __smp_call_function(stop_this_cpu, NULL, 0, 0);
+ if (!nolock)
+ spin_unlock(&call_lock);
+ disable_local_APIC();
+ local_irq_restore(flags);
+}
+
+/*
+ * Reschedule call back. Nothing to do,
+ * all the work is done automatically when
+ * we return from the interrupt.
+ */
+void smp_reschedule_interrupt(struct pt_regs *regs)
+{
+ ack_APIC_irq();
+#ifdef CONFIG_X86_32
+ __get_cpu_var(irq_stat).irq_resched_count++;
+#else
+ add_pda(irq_resched_count, 1);
+#endif
+}
+
+void smp_call_function_interrupt(struct pt_regs *regs)
+{
+ void (*func) (void *info) = call_data->func;
+ void *info = call_data->info;
+ int wait = call_data->wait;
+
+ ack_APIC_irq();
+ /*
+ * Notify initiating CPU that I've grabbed the data and am
+ * about to execute the function
+ */
+ mb();
+ atomic_inc(&call_data->started);
+ /*
+ * At this point the info structure may be out of scope unless wait==1
+ */
+ irq_enter();
+ (*func)(info);
+#ifdef CONFIG_X86_32
+ __get_cpu_var(irq_stat).irq_call_count++;
+#else
+ add_pda(irq_call_count, 1);
+#endif
+ irq_exit();
+
+ if (wait) {
+ mb();
+ atomic_inc(&call_data->finished);
+ }
+}
+
+struct smp_ops smp_ops = {
+ .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
+ .smp_prepare_cpus = native_smp_prepare_cpus,
+ .cpu_up = native_cpu_up,
+ .smp_cpus_done = native_smp_cpus_done,
+
+ .smp_send_stop = native_smp_send_stop,
+ .smp_send_reschedule = native_smp_send_reschedule,
+ .smp_call_function_mask = native_smp_call_function_mask,
+};
+EXPORT_SYMBOL_GPL(smp_ops);
+
diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c
deleted file mode 100644
index dc0cde9d16fb..000000000000
--- a/arch/x86/kernel/smp_32.c
+++ /dev/null
@@ -1,712 +0,0 @@
-/*
- * Intel SMP support routines.
- *
- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
- *
- * This code is released under the GNU General Public License version 2 or
- * later.
- */
-
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/kernel_stat.h>
-#include <linux/mc146818rtc.h>
-#include <linux/cache.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
-#include <linux/module.h>
-
-#include <asm/mtrr.h>
-#include <asm/tlbflush.h>
-#include <asm/mmu_context.h>
-#include <mach_apic.h>
-
-/*
- * Some notes on x86 processor bugs affecting SMP operation:
- *
- * Pentium, Pentium Pro, II, III (and all CPUs) have bugs.
- * The Linux implications for SMP are handled as follows:
- *
- * Pentium III / [Xeon]
- * None of the E1AP-E3AP errata are visible to the user.
- *
- * E1AP. see PII A1AP
- * E2AP. see PII A2AP
- * E3AP. see PII A3AP
- *
- * Pentium II / [Xeon]
- * None of the A1AP-A3AP errata are visible to the user.
- *
- * A1AP. see PPro 1AP
- * A2AP. see PPro 2AP
- * A3AP. see PPro 7AP
- *
- * Pentium Pro
- * None of 1AP-9AP errata are visible to the normal user,
- * except occasional delivery of 'spurious interrupt' as trap #15.
- * This is very rare and a non-problem.
- *
- * 1AP. Linux maps APIC as non-cacheable
- * 2AP. worked around in hardware
- * 3AP. fixed in C0 and above steppings microcode update.
- * Linux does not use excessive STARTUP_IPIs.
- * 4AP. worked around in hardware
- * 5AP. symmetric IO mode (normal Linux operation) not affected.
- * 'noapic' mode has vector 0xf filled out properly.
- * 6AP. 'noapic' mode might be affected - fixed in later steppings
- * 7AP. We do not assume writes to the LVT deassering IRQs
- * 8AP. We do not enable low power mode (deep sleep) during MP bootup
- * 9AP. We do not use mixed mode
- *
- * Pentium
- * There is a marginal case where REP MOVS on 100MHz SMP
- * machines with B stepping processors can fail. XXX should provide
- * an L1cache=Writethrough or L1cache=off option.
- *
- * B stepping CPUs may hang. There are hardware work arounds
- * for this. We warn about it in case your board doesn't have the work
- * arounds. Basically that's so I can tell anyone with a B stepping
- * CPU and SMP problems "tough".
- *
- * Specific items [From Pentium Processor Specification Update]
- *
- * 1AP. Linux doesn't use remote read
- * 2AP. Linux doesn't trust APIC errors
- * 3AP. We work around this
- * 4AP. Linux never generated 3 interrupts of the same priority
- * to cause a lost local interrupt.
- * 5AP. Remote read is never used
- * 6AP. not affected - worked around in hardware
- * 7AP. not affected - worked around in hardware
- * 8AP. worked around in hardware - we get explicit CS errors if not
- * 9AP. only 'noapic' mode affected. Might generate spurious
- * interrupts, we log only the first one and count the
- * rest silently.
- * 10AP. not affected - worked around in hardware
- * 11AP. Linux reads the APIC between writes to avoid this, as per
- * the documentation. Make sure you preserve this as it affects
- * the C stepping chips too.
- * 12AP. not affected - worked around in hardware
- * 13AP. not affected - worked around in hardware
- * 14AP. we always deassert INIT during bootup
- * 15AP. not affected - worked around in hardware
- * 16AP. not affected - worked around in hardware
- * 17AP. not affected - worked around in hardware
- * 18AP. not affected - worked around in hardware
- * 19AP. not affected - worked around in BIOS
- *
- * If this sounds worrying believe me these bugs are either ___RARE___,
- * or are signal timing bugs worked around in hardware and there's
- * about nothing of note with C stepping upwards.
- */
-
-DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };
-
-/*
- * the following functions deal with sending IPIs between CPUs.
- *
- * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
- */
-
-static inline int __prepare_ICR (unsigned int shortcut, int vector)
-{
- unsigned int icr = shortcut | APIC_DEST_LOGICAL;
-
- switch (vector) {
- default:
- icr |= APIC_DM_FIXED | vector;
- break;
- case NMI_VECTOR:
- icr |= APIC_DM_NMI;
- break;
- }
- return icr;
-}
-
-static inline int __prepare_ICR2 (unsigned int mask)
-{
- return SET_APIC_DEST_FIELD(mask);
-}
-
-void __send_IPI_shortcut(unsigned int shortcut, int vector)
-{
- /*
- * Subtle. In the case of the 'never do double writes' workaround
- * we have to lock out interrupts to be safe. As we don't care
- * of the value read we use an atomic rmw access to avoid costly
- * cli/sti. Otherwise we use an even cheaper single atomic write
- * to the APIC.
- */
- unsigned int cfg;
-
- /*
- * Wait for idle.
- */
- apic_wait_icr_idle();
-
- /*
- * No need to touch the target chip field
- */
- cfg = __prepare_ICR(shortcut, vector);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write_around(APIC_ICR, cfg);
-}
-
-void send_IPI_self(int vector)
-{
- __send_IPI_shortcut(APIC_DEST_SELF, vector);
-}
-
-/*
- * This is used to send an IPI with no shorthand notation (the destination is
- * specified in bits 56 to 63 of the ICR).
- */
-static inline void __send_IPI_dest_field(unsigned long mask, int vector)
-{
- unsigned long cfg;
-
- /*
- * Wait for idle.
- */
- if (unlikely(vector == NMI_VECTOR))
- safe_apic_wait_icr_idle();
- else
- apic_wait_icr_idle();
-
- /*
- * prepare target chip field
- */
- cfg = __prepare_ICR2(mask);
- apic_write_around(APIC_ICR2, cfg);
-
- /*
- * program the ICR
- */
- cfg = __prepare_ICR(0, vector);
-
- /*
- * Send the IPI. The write to APIC_ICR fires this off.
- */
- apic_write_around(APIC_ICR, cfg);
-}
-
-/*
- * This is only used on smaller machines.
- */
-void send_IPI_mask_bitmask(cpumask_t cpumask, int vector)
-{
- unsigned long mask = cpus_addr(cpumask)[0];
- unsigned long flags;
-
- local_irq_save(flags);
- WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
- __send_IPI_dest_field(mask, vector);
- local_irq_restore(flags);
-}
-
-void send_IPI_mask_sequence(cpumask_t mask, int vector)
-{
- unsigned long flags;
- unsigned int query_cpu;
-
- /*
- * Hack. The clustered APIC addressing mode doesn't allow us to send
- * to an arbitrary mask, so I do a unicasts to each CPU instead. This
- * should be modified to do 1 message per cluster ID - mbligh
- */
-
- local_irq_save(flags);
- for_each_possible_cpu(query_cpu) {
- if (cpu_isset(query_cpu, mask)) {
- __send_IPI_dest_field(cpu_to_logical_apicid(query_cpu),
- vector);
- }
- }
- local_irq_restore(flags);
-}
-
-#include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
-
-/*
- * Smarter SMP flushing macros.
- * c/o Linus Torvalds.
- *
- * These mean you can really definitely utterly forget about
- * writing to user space from interrupts. (Its not allowed anyway).
- *
- * Optimizations Manfred Spraul <manfred@colorfullife.com>
- */
-
-static cpumask_t flush_cpumask;
-static struct mm_struct * flush_mm;
-static unsigned long flush_va;
-static DEFINE_SPINLOCK(tlbstate_lock);
-
-/*
- * We cannot call mmdrop() because we are in interrupt context,
- * instead update mm->cpu_vm_mask.
- *
- * We need to reload %cr3 since the page tables may be going
- * away from under us..
- */
-void leave_mm(int cpu)
-{
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
- BUG();
- cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
- load_cr3(swapper_pg_dir);
-}
-EXPORT_SYMBOL_GPL(leave_mm);
-
-/*
- *
- * The flush IPI assumes that a thread switch happens in this order:
- * [cpu0: the cpu that switches]
- * 1) switch_mm() either 1a) or 1b)
- * 1a) thread switch to a different mm
- * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
- * Stop ipi delivery for the old mm. This is not synchronized with
- * the other cpus, but smp_invalidate_interrupt ignore flush ipis
- * for the wrong mm, and in the worst case we perform a superfluous
- * tlb flush.
- * 1a2) set cpu_tlbstate to TLBSTATE_OK
- * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
- * was in lazy tlb mode.
- * 1a3) update cpu_tlbstate[].active_mm
- * Now cpu0 accepts tlb flushes for the new mm.
- * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
- * Now the other cpus will send tlb flush ipis.
- * 1a4) change cr3.
- * 1b) thread switch without mm change
- * cpu_tlbstate[].active_mm is correct, cpu0 already handles
- * flush ipis.
- * 1b1) set cpu_tlbstate to TLBSTATE_OK
- * 1b2) test_and_set the cpu bit in cpu_vm_mask.
- * Atomically set the bit [other cpus will start sending flush ipis],
- * and test the bit.
- * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
- * 2) switch %%esp, ie current
- *
- * The interrupt must handle 2 special cases:
- * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
- * - the cpu performs speculative tlb reads, i.e. even if the cpu only
- * runs in kernel space, the cpu could load tlb entries for user space
- * pages.
- *
- * The good news is that cpu_tlbstate is local to each cpu, no
- * write/read ordering problems.
- */
-
-/*
- * TLB flush IPI:
- *
- * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
- * 2) Leave the mm if we are in the lazy tlb mode.
- */
-
-void smp_invalidate_interrupt(struct pt_regs *regs)
-{
- unsigned long cpu;
-
- cpu = get_cpu();
-
- if (!cpu_isset(cpu, flush_cpumask))
- goto out;
- /*
- * This was a BUG() but until someone can quote me the
- * line from the intel manual that guarantees an IPI to
- * multiple CPUs is retried _only_ on the erroring CPUs
- * its staying as a return
- *
- * BUG();
- */
-
- if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
- if (flush_va == TLB_FLUSH_ALL)
- local_flush_tlb();
- else
- __flush_tlb_one(flush_va);
- } else
- leave_mm(cpu);
- }
- ack_APIC_irq();
- smp_mb__before_clear_bit();
- cpu_clear(cpu, flush_cpumask);
- smp_mb__after_clear_bit();
-out:
- put_cpu_no_resched();
- __get_cpu_var(irq_stat).irq_tlb_count++;
-}
-
-void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
- unsigned long va)
-{
- cpumask_t cpumask = *cpumaskp;
-
- /*
- * A couple of (to be removed) sanity checks:
- *
- * - current CPU must not be in mask
- * - mask must exist :)
- */
- BUG_ON(cpus_empty(cpumask));
- BUG_ON(cpu_isset(smp_processor_id(), cpumask));
- BUG_ON(!mm);
-
-#ifdef CONFIG_HOTPLUG_CPU
- /* If a CPU which we ran on has gone down, OK. */
- cpus_and(cpumask, cpumask, cpu_online_map);
- if (unlikely(cpus_empty(cpumask)))
- return;
-#endif
-
- /*
- * i'm not happy about this global shared spinlock in the
- * MM hot path, but we'll see how contended it is.
- * AK: x86-64 has a faster method that could be ported.
- */
- spin_lock(&tlbstate_lock);
-
- flush_mm = mm;
- flush_va = va;
- cpus_or(flush_cpumask, cpumask, flush_cpumask);
- /*
- * We have to send the IPI only to
- * CPUs affected.
- */
- send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
-
- while (!cpus_empty(flush_cpumask))
- /* nothing. lockup detection does not belong here */
- cpu_relax();
-
- flush_mm = NULL;
- flush_va = 0;
- spin_unlock(&tlbstate_lock);
-}
-
-void flush_tlb_current_task(void)
-{
- struct mm_struct *mm = current->mm;
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- local_flush_tlb();
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
- preempt_enable();
-}
-
-void flush_tlb_mm (struct mm_struct * mm)
-{
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- if (current->active_mm == mm) {
- if (current->mm)
- local_flush_tlb();
- else
- leave_mm(smp_processor_id());
- }
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
-
- preempt_enable();
-}
-
-void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
-{
- struct mm_struct *mm = vma->vm_mm;
- cpumask_t cpu_mask;
-
- preempt_disable();
- cpu_mask = mm->cpu_vm_mask;
- cpu_clear(smp_processor_id(), cpu_mask);
-
- if (current->active_mm == mm) {
- if(current->mm)
- __flush_tlb_one(va);
- else
- leave_mm(smp_processor_id());
- }
-
- if (!cpus_empty(cpu_mask))
- flush_tlb_others(cpu_mask, mm, va);
-
- preempt_enable();
-}
-EXPORT_SYMBOL(flush_tlb_page);
-
-static void do_flush_tlb_all(void* info)
-{
- unsigned long cpu = smp_processor_id();
-
- __flush_tlb_all();
- if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
- leave_mm(cpu);
-}
-
-void flush_tlb_all(void)
-{
- on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
-}
-
-/*
- * this function sends a 'reschedule' IPI to another CPU.
- * it goes straight through and wastes no time serializing
- * anything. Worst case is that we lose a reschedule ...
- */
-static void native_smp_send_reschedule(int cpu)
-{
- WARN_ON(cpu_is_offline(cpu));
- send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
-}
-
-/*
- * Structure and data for smp_call_function(). This is designed to minimise
- * static memory requirements. It also looks cleaner.
- */
-static DEFINE_SPINLOCK(call_lock);
-
-struct call_data_struct {
- void (*func) (void *info);
- void *info;
- atomic_t started;
- atomic_t finished;
- int wait;
-};
-
-void lock_ipi_call_lock(void)
-{
- spin_lock_irq(&call_lock);
-}
-
-void unlock_ipi_call_lock(void)
-{
- spin_unlock_irq(&call_lock);
-}
-
-static struct call_data_struct *call_data;
-
-static void __smp_call_function(void (*func) (void *info), void *info,
- int nonatomic, int wait)
-{
- struct call_data_struct data;
- int cpus = num_online_cpus() - 1;
-
- if (!cpus)
- return;
-
- data.func = func;
- data.info = info;
- atomic_set(&data.started, 0);
- data.wait = wait;
- if (wait)
- atomic_set(&data.finished, 0);
-
- call_data = &data;
- mb();
-
- /* Send a message to all other CPUs and wait for them to respond */
- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
-
- /* Wait for response */
- while (atomic_read(&data.started) != cpus)
- cpu_relax();
-
- if (wait)
- while (atomic_read(&data.finished) != cpus)
- cpu_relax();
-}
-
-
-/**
- * smp_call_function_mask(): Run a function on a set of other CPUs.
- * @mask: The set of cpus to run on. Must not include the current cpu.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code.
- *
- * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- */
-static int
-native_smp_call_function_mask(cpumask_t mask,
- void (*func)(void *), void *info,
- int wait)
-{
- struct call_data_struct data;
- cpumask_t allbutself;
- int cpus;
-
- /* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
-
- /* Holding any lock stops cpus from going down. */
- spin_lock(&call_lock);
-
- allbutself = cpu_online_map;
- cpu_clear(smp_processor_id(), allbutself);
-
- cpus_and(mask, mask, allbutself);
- cpus = cpus_weight(mask);
-
- if (!cpus) {
- spin_unlock(&call_lock);
- return 0;
- }
-
- data.func = func;
- data.info = info;
- atomic_set(&data.started, 0);
- data.wait = wait;
- if (wait)
- atomic_set(&data.finished, 0);
-
- call_data = &data;
- mb();
-
- /* Send a message to other CPUs */
- if (cpus_equal(mask, allbutself))
- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
- else
- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
-
- /* Wait for response */
- while (atomic_read(&data.started) != cpus)
- cpu_relax();
-
- if (wait)
- while (atomic_read(&data.finished) != cpus)
- cpu_relax();
- spin_unlock(&call_lock);
-
- return 0;
-}
-
-static void stop_this_cpu (void * dummy)
-{
- local_irq_disable();
- /*
- * Remove this CPU:
- */
- cpu_clear(smp_processor_id(), cpu_online_map);
- disable_local_APIC();
- if (cpu_data(smp_processor_id()).hlt_works_ok)
- for(;;) halt();
- for (;;);
-}
-
-/*
- * this function calls the 'stop' function on all other CPUs in the system.
- */
-
-static void native_smp_send_stop(void)
-{
- /* Don't deadlock on the call lock in panic */
- int nolock = !spin_trylock(&call_lock);
- unsigned long flags;
-
- local_irq_save(flags);
- __smp_call_function(stop_this_cpu, NULL, 0, 0);
- if (!nolock)
- spin_unlock(&call_lock);
- disable_local_APIC();
- local_irq_restore(flags);
-}
-
-/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
- */
-void smp_reschedule_interrupt(struct pt_regs *regs)
-{
- ack_APIC_irq();
- __get_cpu_var(irq_stat).irq_resched_count++;
-}
-
-void smp_call_function_interrupt(struct pt_regs *regs)
-{
- void (*func) (void *info) = call_data->func;
- void *info = call_data->info;
- int wait = call_data->wait;
-
- ack_APIC_irq();
- /*
- * Notify initiating CPU that I've grabbed the data and am
- * about to execute the function
- */
- mb();
- atomic_inc(&call_data->started);
- /*
- * At this point the info structure may be out of scope unless wait==1
- */
- irq_enter();
- (*func)(info);
- __get_cpu_var(irq_stat).irq_call_count++;
- irq_exit();
-
- if (wait) {
- mb();
- atomic_inc(&call_data->finished);
- }
-}
-
-static int convert_apicid_to_cpu(int apic_id)
-{
- int i;
-
- for_each_possible_cpu(i) {
- if (per_cpu(x86_cpu_to_apicid, i) == apic_id)
- return i;
- }
- return -1;
-}
-
-int safe_smp_processor_id(void)
-{
- int apicid, cpuid;
-
- if (!boot_cpu_has(X86_FEATURE_APIC))
- return 0;
-
- apicid = hard_smp_processor_id();
- if (apicid == BAD_APICID)
- return 0;
-
- cpuid = convert_apicid_to_cpu(apicid);
-
- return cpuid >= 0 ? cpuid : 0;
-}
-
-struct smp_ops smp_ops = {
- .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
- .smp_prepare_cpus = native_smp_prepare_cpus,
- .cpu_up = native_cpu_up,
- .smp_cpus_done = native_smp_cpus_done,
-
- .smp_send_stop = native_smp_send_stop,
- .smp_send_reschedule = native_smp_send_reschedule,
- .smp_call_function_mask = native_smp_call_function_mask,
-};
-EXPORT_SYMBOL_GPL(smp_ops);
diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot.c
index 579b9b740c7c..04c662ba18f1 100644
--- a/arch/x86/kernel/smpboot_32.c
+++ b/arch/x86/kernel/smpboot.c
@@ -3,6 +3,7 @@
*
* (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
* (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
+ * Copyright 2001 Andi Kleen, SuSE Labs.
*
* Much of the core SMP work is based on previous work by Thomas Radke, to
* whom a great many thanks are extended.
@@ -29,53 +30,91 @@
* Ingo Molnar : various cleanups and rewrites
* Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
* Maciej W. Rozycki : Bits for genuine 82489DX APICs
+ * Andi Kleen : Changed for SMP boot into long mode.
* Martin J. Bligh : Added support for multi-quad systems
* Dave Jones : Report invalid combinations of Athlon CPUs.
-* Rusty Russell : Hacked into shape for new "hotplug" boot process. */
+ * Rusty Russell : Hacked into shape for new "hotplug" boot process.
+ * Andi Kleen : Converted to new state machine.
+ * Ashok Raj : CPU hotplug support
+ * Glauber Costa : i386 and x86_64 integration
+ */
-#include <linux/module.h>
#include <linux/init.h>
-#include <linux/kernel.h>
-
-#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/module.h>
#include <linux/sched.h>
-#include <linux/kernel_stat.h>
-#include <linux/bootmem.h>
-#include <linux/notifier.h>
-#include <linux/cpu.h>
#include <linux/percpu.h>
+#include <linux/bootmem.h>
+#include <linux/err.h>
#include <linux/nmi.h>
-#include <linux/delay.h>
-#include <linux/mc146818rtc.h>
-#include <asm/tlbflush.h>
+#include <asm/acpi.h>
#include <asm/desc.h>
-#include <asm/arch_hooks.h>
#include <asm/nmi.h>
+#include <asm/irq.h>
+#include <asm/smp.h>
+#include <asm/trampoline.h>
+#include <asm/cpu.h>
+#include <asm/numa.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/mtrr.h>
+#include <asm/nmi.h>
+#include <asm/vmi.h>
+#include <asm/genapic.h>
+#include <linux/mc146818rtc.h>
#include <mach_apic.h>
#include <mach_wakecpu.h>
#include <smpboot_hooks.h>
-#include <asm/vmi.h>
-#include <asm/mtrr.h>
-/* Set if we find a B stepping CPU */
-static int __cpuinitdata smp_b_stepping;
+/*
+ * FIXME: For x86_64, those are defined in other files. But moving them here,
+ * would make the setup areas dependent on smp, which is a loss. When we
+ * integrate apic between arches, we can probably do a better job, but
+ * right now, they'll stay here -- glommer
+ */
+
+/* which logical CPU number maps to which CPU (physical APIC ID) */
+u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
+ { [0 ... NR_CPUS-1] = BAD_APICID };
+void *x86_cpu_to_apicid_early_ptr;
+
+u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
+ = { [0 ... NR_CPUS-1] = BAD_APICID };
+void *x86_bios_cpu_apicid_early_ptr;
+
+#ifdef CONFIG_X86_32
+u8 apicid_2_node[MAX_APICID];
+#endif
+
+/* State of each CPU */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+
+/* Store all idle threads, this can be reused instead of creating
+* a new thread. Also avoids complicated thread destroy functionality
+* for idle threads.
+*/
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
+ * removed after init for !CONFIG_HOTPLUG_CPU.
+ */
+static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
+#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
+#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
+#else
+struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
+#define get_idle_for_cpu(x) (idle_thread_array[(x)])
+#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
+#endif
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
EXPORT_SYMBOL(smp_num_siblings);
/* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU(u8, cpu_llc_id) = BAD_APICID;
-
-/* representing HT siblings of each logical CPU */
-DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
-EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
-
-/* representing HT and core siblings of each logical CPU */
-DEFINE_PER_CPU(cpumask_t, cpu_core_map);
-EXPORT_PER_CPU_SYMBOL(cpu_core_map);
+DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
/* bitmap of online cpus */
cpumask_t cpu_online_map __read_mostly;
@@ -85,125 +124,93 @@ cpumask_t cpu_callin_map;
cpumask_t cpu_callout_map;
cpumask_t cpu_possible_map;
EXPORT_SYMBOL(cpu_possible_map);
-static cpumask_t smp_commenced_mask;
+
+/* representing HT siblings of each logical CPU */
+DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
+EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
+
+/* representing HT and core siblings of each logical CPU */
+DEFINE_PER_CPU(cpumask_t, cpu_core_map);
+EXPORT_PER_CPU_SYMBOL(cpu_core_map);
/* Per CPU bogomips and other parameters */
DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
-/* which logical CPU number maps to which CPU (physical APIC ID) */
-u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
- { [0 ... NR_CPUS-1] = BAD_APICID };
-void *x86_cpu_to_apicid_early_ptr;
-DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
-EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
-
-u8 apicid_2_node[MAX_APICID];
+static atomic_t init_deasserted;
-/*
- * Trampoline 80x86 program as an array.
- */
+static int boot_cpu_logical_apicid;
-extern const unsigned char trampoline_data [];
-extern const unsigned char trampoline_end [];
-static unsigned char *trampoline_base;
+/* representing cpus for which sibling maps can be computed */
+static cpumask_t cpu_sibling_setup_map;
-static void map_cpu_to_logical_apicid(void);
+/* Set if we find a B stepping CPU */
+int __cpuinitdata smp_b_stepping;
-/* State of each CPU. */
-DEFINE_PER_CPU(int, cpu_state) = { 0 };
+#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
-/*
- * Currently trivial. Write the real->protected mode
- * bootstrap into the page concerned. The caller
- * has made sure it's suitably aligned.
- */
+/* which logical CPUs are on which nodes */
+cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly =
+ { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
+EXPORT_SYMBOL(node_to_cpumask_map);
+/* which node each logical CPU is on */
+int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
+EXPORT_SYMBOL(cpu_to_node_map);
-static unsigned long __cpuinit setup_trampoline(void)
+/* set up a mapping between cpu and node. */
+static void map_cpu_to_node(int cpu, int node)
{
- memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data);
- return virt_to_phys(trampoline_base);
+ printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
+ cpu_set(cpu, node_to_cpumask_map[node]);
+ cpu_to_node_map[cpu] = node;
}
-/*
- * We are called very early to get the low memory for the
- * SMP bootup trampoline page.
- */
-void __init smp_alloc_memory(void)
+/* undo a mapping between cpu and node. */
+static void unmap_cpu_to_node(int cpu)
{
- trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
- /*
- * Has to be in very low memory so we can execute
- * real-mode AP code.
- */
- if (__pa(trampoline_base) >= 0x9F000)
- BUG();
+ int node;
+
+ printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
+ for (node = 0; node < MAX_NUMNODES; node++)
+ cpu_clear(cpu, node_to_cpumask_map[node]);
+ cpu_to_node_map[cpu] = 0;
}
+#else /* !(CONFIG_NUMA && CONFIG_X86_32) */
+#define map_cpu_to_node(cpu, node) ({})
+#define unmap_cpu_to_node(cpu) ({})
+#endif
-/*
- * The bootstrap kernel entry code has set these up. Save them for
- * a given CPU
- */
+#ifdef CONFIG_X86_32
+u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
+ { [0 ... NR_CPUS-1] = BAD_APICID };
-void __cpuinit smp_store_cpu_info(int id)
+static void map_cpu_to_logical_apicid(void)
{
- struct cpuinfo_x86 *c = &cpu_data(id);
-
- *c = boot_cpu_data;
- c->cpu_index = id;
- if (id!=0)
- identify_secondary_cpu(c);
- /*
- * Mask B, Pentium, but not Pentium MMX
- */
- if (c->x86_vendor == X86_VENDOR_INTEL &&
- c->x86 == 5 &&
- c->x86_mask >= 1 && c->x86_mask <= 4 &&
- c->x86_model <= 3)
- /*
- * Remember we have B step Pentia with bugs
- */
- smp_b_stepping = 1;
-
- /*
- * Certain Athlons might work (for various values of 'work') in SMP
- * but they are not certified as MP capable.
- */
- if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
-
- if (num_possible_cpus() == 1)
- goto valid_k7;
-
- /* Athlon 660/661 is valid. */
- if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1)))
- goto valid_k7;
-
- /* Duron 670 is valid */
- if ((c->x86_model==7) && (c->x86_mask==0))
- goto valid_k7;
-
- /*
- * Athlon 662, Duron 671, and Athlon >model 7 have capability bit.
- * It's worth noting that the A5 stepping (662) of some Athlon XP's
- * have the MP bit set.
- * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more.
- */
- if (((c->x86_model==6) && (c->x86_mask>=2)) ||
- ((c->x86_model==7) && (c->x86_mask>=1)) ||
- (c->x86_model> 7))
- if (cpu_has_mp)
- goto valid_k7;
+ int cpu = smp_processor_id();
+ int apicid = logical_smp_processor_id();
+ int node = apicid_to_node(apicid);
- /* If we get here, it's not a certified SMP capable AMD system. */
- add_taint(TAINT_UNSAFE_SMP);
- }
+ if (!node_online(node))
+ node = first_online_node;
-valid_k7:
- ;
+ cpu_2_logical_apicid[cpu] = apicid;
+ map_cpu_to_node(cpu, node);
}
-static atomic_t init_deasserted;
+static void unmap_cpu_to_logical_apicid(int cpu)
+{
+ cpu_2_logical_apicid[cpu] = BAD_APICID;
+ unmap_cpu_to_node(cpu);
+}
+#else
+#define unmap_cpu_to_logical_apicid(cpu) do {} while (0)
+#define map_cpu_to_logical_apicid() do {} while (0)
+#endif
+/*
+ * Report back to the Boot Processor.
+ * Running on AP.
+ */
static void __cpuinit smp_callin(void)
{
int cpuid, phys_id;
@@ -220,12 +227,11 @@ static void __cpuinit smp_callin(void)
/*
* (This works even if the APIC is not enabled.)
*/
- phys_id = GET_APIC_ID(apic_read(APIC_ID));
+ phys_id = GET_APIC_ID(read_apic_id());
cpuid = smp_processor_id();
if (cpu_isset(cpuid, cpu_callin_map)) {
- printk("huh, phys CPU#%d, CPU#%d already present??\n",
+ panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
phys_id, cpuid);
- BUG();
}
Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
@@ -247,13 +253,12 @@ static void __cpuinit smp_callin(void)
*/
if (cpu_isset(cpuid, cpu_callout_map))
break;
- rep_nop();
+ cpu_relax();
}
if (!time_before(jiffies, timeout)) {
- printk("BUG: CPU%d started up but did not get a callout!\n",
- cpuid);
- BUG();
+ panic("%s: CPU%d started up but did not get a callout!\n",
+ __func__, cpuid);
}
/*
@@ -266,13 +271,19 @@ static void __cpuinit smp_callin(void)
Dprintk("CALLIN, before setup_local_APIC().\n");
smp_callin_clear_local_apic();
setup_local_APIC();
+ end_local_APIC_setup();
map_cpu_to_logical_apicid();
/*
* Get our bogomips.
+ *
+ * Need to enable IRQs because it can take longer and then
+ * the NMI watchdog might kill us.
*/
+ local_irq_enable();
calibrate_delay();
- Dprintk("Stack at about %p\n",&cpuid);
+ local_irq_disable();
+ Dprintk("Stack at about %p\n", &cpuid);
/*
* Save our processor parameters
@@ -285,91 +296,10 @@ static void __cpuinit smp_callin(void)
cpu_set(cpuid, cpu_callin_map);
}
-static int cpucount;
-
-/* maps the cpu to the sched domain representing multi-core */
-cpumask_t cpu_coregroup_map(int cpu)
-{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- /*
- * For perf, we return last level cache shared map.
- * And for power savings, we return cpu_core_map
- */
- if (sched_mc_power_savings || sched_smt_power_savings)
- return per_cpu(cpu_core_map, cpu);
- else
- return c->llc_shared_map;
-}
-
-/* representing cpus for which sibling maps can be computed */
-static cpumask_t cpu_sibling_setup_map;
-
-void __cpuinit set_cpu_sibling_map(int cpu)
-{
- int i;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- cpu_set(cpu, cpu_sibling_setup_map);
-
- if (smp_num_siblings > 1) {
- for_each_cpu_mask(i, cpu_sibling_setup_map) {
- if (c->phys_proc_id == cpu_data(i).phys_proc_id &&
- c->cpu_core_id == cpu_data(i).cpu_core_id) {
- cpu_set(i, per_cpu(cpu_sibling_map, cpu));
- cpu_set(cpu, per_cpu(cpu_sibling_map, i));
- cpu_set(i, per_cpu(cpu_core_map, cpu));
- cpu_set(cpu, per_cpu(cpu_core_map, i));
- cpu_set(i, c->llc_shared_map);
- cpu_set(cpu, cpu_data(i).llc_shared_map);
- }
- }
- } else {
- cpu_set(cpu, per_cpu(cpu_sibling_map, cpu));
- }
-
- cpu_set(cpu, c->llc_shared_map);
-
- if (current_cpu_data.x86_max_cores == 1) {
- per_cpu(cpu_core_map, cpu) = per_cpu(cpu_sibling_map, cpu);
- c->booted_cores = 1;
- return;
- }
-
- for_each_cpu_mask(i, cpu_sibling_setup_map) {
- if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
- per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
- cpu_set(i, c->llc_shared_map);
- cpu_set(cpu, cpu_data(i).llc_shared_map);
- }
- if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
- cpu_set(i, per_cpu(cpu_core_map, cpu));
- cpu_set(cpu, per_cpu(cpu_core_map, i));
- /*
- * Does this new cpu bringup a new core?
- */
- if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) {
- /*
- * for each core in package, increment
- * the booted_cores for this new cpu
- */
- if (first_cpu(per_cpu(cpu_sibling_map, i)) == i)
- c->booted_cores++;
- /*
- * increment the core count for all
- * the other cpus in this package
- */
- if (i != cpu)
- cpu_data(i).booted_cores++;
- } else if (i != cpu && !c->booted_cores)
- c->booted_cores = cpu_data(i).booted_cores;
- }
- }
-}
-
/*
* Activate a secondary processor.
*/
-static void __cpuinit start_secondary(void *unused)
+void __cpuinit start_secondary(void *unused)
{
/*
* Don't put *anything* before cpu_init(), SMP booting is too
@@ -382,24 +312,19 @@ static void __cpuinit start_secondary(void *unused)
cpu_init();
preempt_disable();
smp_callin();
- while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
- rep_nop();
+
+ /* otherwise gcc will move up smp_processor_id before the cpu_init */
+ barrier();
/*
* Check TSC synchronization with the BP:
*/
check_tsc_sync_target();
- setup_secondary_clock();
if (nmi_watchdog == NMI_IO_APIC) {
disable_8259A_irq(0);
enable_NMI_through_LVT0();
enable_8259A_irq(0);
}
- /*
- * low-memory mappings have been cleared, flush them from
- * the local TLBs too.
- */
- local_flush_tlb();
/* This must be done before setting cpu_online_map */
set_cpu_sibling_map(raw_smp_processor_id());
@@ -414,17 +339,27 @@ static void __cpuinit start_secondary(void *unused)
* smp_call_function().
*/
lock_ipi_call_lock();
+#ifdef CONFIG_X86_64
+ spin_lock(&vector_lock);
+
+ /* Setup the per cpu irq handling data structures */
+ __setup_vector_irq(smp_processor_id());
+ /*
+ * Allow the master to continue.
+ */
+ spin_unlock(&vector_lock);
+#endif
cpu_set(smp_processor_id(), cpu_online_map);
unlock_ipi_call_lock();
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
- /* We can take interrupts now: we're officially "up". */
- local_irq_enable();
+ setup_secondary_clock();
wmb();
cpu_idle();
}
+#ifdef CONFIG_X86_32
/*
* Everything has been set up for the secondary
* CPUs - they just need to reload everything
@@ -442,89 +377,233 @@ void __devinit initialize_secondary(void)
"movl %0,%%esp\n\t"
"jmp *%1"
:
- :"m" (current->thread.sp),"m" (current->thread.ip));
+ :"m" (current->thread.sp), "m" (current->thread.ip));
}
+#endif
-/* Static state in head.S used to set up a CPU */
-extern struct {
- void * sp;
- unsigned short ss;
-} stack_start;
+static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_X86_32
+ /*
+ * Mask B, Pentium, but not Pentium MMX
+ */
+ if (c->x86_vendor == X86_VENDOR_INTEL &&
+ c->x86 == 5 &&
+ c->x86_mask >= 1 && c->x86_mask <= 4 &&
+ c->x86_model <= 3)
+ /*
+ * Remember we have B step Pentia with bugs
+ */
+ smp_b_stepping = 1;
-#ifdef CONFIG_NUMA
+ /*
+ * Certain Athlons might work (for various values of 'work') in SMP
+ * but they are not certified as MP capable.
+ */
+ if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) {
-/* which logical CPUs are on which nodes */
-cpumask_t node_to_cpumask_map[MAX_NUMNODES] __read_mostly =
- { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE };
-EXPORT_SYMBOL(node_to_cpumask_map);
-/* which node each logical CPU is on */
-int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
-EXPORT_SYMBOL(cpu_to_node_map);
+ if (num_possible_cpus() == 1)
+ goto valid_k7;
-/* set up a mapping between cpu and node. */
-static inline void map_cpu_to_node(int cpu, int node)
-{
- printk("Mapping cpu %d to node %d\n", cpu, node);
- cpu_set(cpu, node_to_cpumask_map[node]);
- cpu_to_node_map[cpu] = node;
+ /* Athlon 660/661 is valid. */
+ if ((c->x86_model == 6) && ((c->x86_mask == 0) ||
+ (c->x86_mask == 1)))
+ goto valid_k7;
+
+ /* Duron 670 is valid */
+ if ((c->x86_model == 7) && (c->x86_mask == 0))
+ goto valid_k7;
+
+ /*
+ * Athlon 662, Duron 671, and Athlon >model 7 have capability
+ * bit. It's worth noting that the A5 stepping (662) of some
+ * Athlon XP's have the MP bit set.
+ * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for
+ * more.
+ */
+ if (((c->x86_model == 6) && (c->x86_mask >= 2)) ||
+ ((c->x86_model == 7) && (c->x86_mask >= 1)) ||
+ (c->x86_model > 7))
+ if (cpu_has_mp)
+ goto valid_k7;
+
+ /* If we get here, not a certified SMP capable AMD system. */
+ add_taint(TAINT_UNSAFE_SMP);
+ }
+
+valid_k7:
+ ;
+#endif
}
-/* undo a mapping between cpu and node. */
-static inline void unmap_cpu_to_node(int cpu)
+static void __cpuinit smp_checks(void)
{
- int node;
+ if (smp_b_stepping)
+ printk(KERN_WARNING "WARNING: SMP operation may be unreliable"
+ "with B stepping processors.\n");
- printk("Unmapping cpu %d from all nodes\n", cpu);
- for (node = 0; node < MAX_NUMNODES; node ++)
- cpu_clear(cpu, node_to_cpumask_map[node]);
- cpu_to_node_map[cpu] = 0;
+ /*
+ * Don't taint if we are running SMP kernel on a single non-MP
+ * approved Athlon
+ */
+ if (tainted & TAINT_UNSAFE_SMP) {
+ if (num_online_cpus())
+ printk(KERN_INFO "WARNING: This combination of AMD"
+ "processors is not suitable for SMP.\n");
+ else
+ tainted &= ~TAINT_UNSAFE_SMP;
+ }
}
-#else /* !CONFIG_NUMA */
-#define map_cpu_to_node(cpu, node) ({})
-#define unmap_cpu_to_node(cpu) ({})
+/*
+ * The bootstrap kernel entry code has set these up. Save them for
+ * a given CPU
+ */
-#endif /* CONFIG_NUMA */
+void __cpuinit smp_store_cpu_info(int id)
+{
+ struct cpuinfo_x86 *c = &cpu_data(id);
-u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+ *c = boot_cpu_data;
+ c->cpu_index = id;
+ if (id != 0)
+ identify_secondary_cpu(c);
+ smp_apply_quirks(c);
+}
-static void map_cpu_to_logical_apicid(void)
+
+void __cpuinit set_cpu_sibling_map(int cpu)
{
- int cpu = smp_processor_id();
- int apicid = logical_smp_processor_id();
- int node = apicid_to_node(apicid);
+ int i;
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
- if (!node_online(node))
- node = first_online_node;
+ cpu_set(cpu, cpu_sibling_setup_map);
- cpu_2_logical_apicid[cpu] = apicid;
- map_cpu_to_node(cpu, node);
+ if (smp_num_siblings > 1) {
+ for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (c->phys_proc_id == cpu_data(i).phys_proc_id &&
+ c->cpu_core_id == cpu_data(i).cpu_core_id) {
+ cpu_set(i, per_cpu(cpu_sibling_map, cpu));
+ cpu_set(cpu, per_cpu(cpu_sibling_map, i));
+ cpu_set(i, per_cpu(cpu_core_map, cpu));
+ cpu_set(cpu, per_cpu(cpu_core_map, i));
+ cpu_set(i, c->llc_shared_map);
+ cpu_set(cpu, cpu_data(i).llc_shared_map);
+ }
+ }
+ } else {
+ cpu_set(cpu, per_cpu(cpu_sibling_map, cpu));
+ }
+
+ cpu_set(cpu, c->llc_shared_map);
+
+ if (current_cpu_data.x86_max_cores == 1) {
+ per_cpu(cpu_core_map, cpu) = per_cpu(cpu_sibling_map, cpu);
+ c->booted_cores = 1;
+ return;
+ }
+
+ for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
+ per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
+ cpu_set(i, c->llc_shared_map);
+ cpu_set(cpu, cpu_data(i).llc_shared_map);
+ }
+ if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
+ cpu_set(i, per_cpu(cpu_core_map, cpu));
+ cpu_set(cpu, per_cpu(cpu_core_map, i));
+ /*
+ * Does this new cpu bringup a new core?
+ */
+ if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) {
+ /*
+ * for each core in package, increment
+ * the booted_cores for this new cpu
+ */
+ if (first_cpu(per_cpu(cpu_sibling_map, i)) == i)
+ c->booted_cores++;
+ /*
+ * increment the core count for all
+ * the other cpus in this package
+ */
+ if (i != cpu)
+ cpu_data(i).booted_cores++;
+ } else if (i != cpu && !c->booted_cores)
+ c->booted_cores = cpu_data(i).booted_cores;
+ }
+ }
}
-static void unmap_cpu_to_logical_apicid(int cpu)
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
{
- cpu_2_logical_apicid[cpu] = BAD_APICID;
- unmap_cpu_to_node(cpu);
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+ /*
+ * For perf, we return last level cache shared map.
+ * And for power savings, we return cpu_core_map
+ */
+ if (sched_mc_power_savings || sched_smt_power_savings)
+ return per_cpu(cpu_core_map, cpu);
+ else
+ return c->llc_shared_map;
+}
+
+#ifdef CONFIG_X86_32
+/*
+ * We are called very early to get the low memory for the
+ * SMP bootup trampoline page.
+ */
+void __init smp_alloc_memory(void)
+{
+ trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
+ /*
+ * Has to be in very low memory so we can execute
+ * real-mode AP code.
+ */
+ if (__pa(trampoline_base) >= 0x9F000)
+ BUG();
+}
+#endif
+
+static void impress_friends(void)
+{
+ int cpu;
+ unsigned long bogosum = 0;
+ /*
+ * Allow the user to impress friends.
+ */
+ Dprintk("Before bogomips.\n");
+ for_each_possible_cpu(cpu)
+ if (cpu_isset(cpu, cpu_callout_map))
+ bogosum += cpu_data(cpu).loops_per_jiffy;
+ printk(KERN_INFO
+ "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
+ num_online_cpus(),
+ bogosum/(500000/HZ),
+ (bogosum/(5000/HZ))%100);
+
+ Dprintk("Before bogocount - setting activated=1.\n");
}
static inline void __inquire_remote_apic(int apicid)
{
- int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
+ unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
char *names[] = { "ID", "VERSION", "SPIV" };
int timeout;
- unsigned long status;
+ u32 status;
- printk("Inquiring remote APIC #%d...\n", apicid);
+ printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
for (i = 0; i < ARRAY_SIZE(regs); i++) {
- printk("... APIC #%d %s: ", apicid, names[i]);
+ printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]);
/*
* Wait for idle.
*/
status = safe_apic_wait_icr_idle();
if (status)
- printk("a previous APIC delivery may have failed\n");
+ printk(KERN_CONT
+ "a previous APIC delivery may have failed\n");
apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
@@ -538,16 +617,16 @@ static inline void __inquire_remote_apic(int apicid)
switch (status) {
case APIC_ICR_RR_VALID:
status = apic_read(APIC_RRR);
- printk("%lx\n", status);
+ printk(KERN_CONT "%08x\n", status);
break;
default:
- printk("failed\n");
+ printk(KERN_CONT "failed\n");
}
}
}
#ifdef WAKE_SECONDARY_VIA_NMI
-/*
+/*
* Poke the other CPU in the eye via NMI to wake it up. Remember that the normal
* INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
* won't ... remember to clear down the APIC, etc later.
@@ -584,9 +663,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
Dprintk("NMI sent.\n");
if (send_status)
- printk("APIC never delivered???\n");
+ printk(KERN_ERR "APIC never delivered???\n");
if (accept_status)
- printk("APIC delivery error (%lx).\n", accept_status);
+ printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
return (send_status | accept_status);
}
@@ -599,6 +678,12 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
unsigned long send_status, accept_status = 0;
int maxlvt, num_starts, j;
+ if (get_uv_system_type() == UV_NON_UNIQUE_APIC) {
+ send_status = uv_wakeup_secondary(phys_apicid, start_eip);
+ atomic_set(&init_deasserted, 1);
+ return send_status;
+ }
+
/*
* Be paranoid about clearing APIC errors.
*/
@@ -637,6 +722,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
Dprintk("Waiting for send to finish...\n");
send_status = safe_apic_wait_icr_idle();
+ mb();
atomic_set(&init_deasserted, 1);
/*
@@ -655,7 +741,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
* target processor state.
*/
startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
- (unsigned long) stack_start.sp);
+#ifdef CONFIG_X86_64
+ (unsigned long)init_rsp);
+#else
+ (unsigned long)stack_start.sp);
+#endif
/*
* Run STARTUP IPI loop.
@@ -665,7 +755,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
maxlvt = lapic_get_maxlvt();
for (j = 1; j <= num_starts; j++) {
- Dprintk("Sending STARTUP #%d.\n",j);
+ Dprintk("Sending STARTUP #%d.\n", j);
apic_read_around(APIC_SPIV);
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
@@ -711,49 +801,29 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
Dprintk("After Startup.\n");
if (send_status)
- printk("APIC never delivered???\n");
+ printk(KERN_ERR "APIC never delivered???\n");
if (accept_status)
- printk("APIC delivery error (%lx).\n", accept_status);
+ printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
return (send_status | accept_status);
}
#endif /* WAKE_SECONDARY_VIA_INIT */
-extern cpumask_t cpu_initialized;
-static inline int alloc_cpu_id(void)
-{
- cpumask_t tmp_map;
+struct create_idle {
+ struct work_struct work;
+ struct task_struct *idle;
+ struct completion done;
int cpu;
- cpus_complement(tmp_map, cpu_present_map);
- cpu = first_cpu(tmp_map);
- if (cpu >= NR_CPUS)
- return -ENODEV;
- return cpu;
-}
+};
-#ifdef CONFIG_HOTPLUG_CPU
-static struct task_struct * __cpuinitdata cpu_idle_tasks[NR_CPUS];
-static inline struct task_struct * __cpuinit alloc_idle_task(int cpu)
+static void __cpuinit do_fork_idle(struct work_struct *work)
{
- struct task_struct *idle;
-
- if ((idle = cpu_idle_tasks[cpu]) != NULL) {
- /* initialize thread_struct. we really want to avoid destroy
- * idle tread
- */
- idle->thread.sp = (unsigned long)task_pt_regs(idle);
- init_idle(idle, cpu);
- return idle;
- }
- idle = fork_idle(cpu);
+ struct create_idle *c_idle =
+ container_of(work, struct create_idle, work);
- if (!IS_ERR(idle))
- cpu_idle_tasks[cpu] = idle;
- return idle;
+ c_idle->idle = fork_idle(c_idle->cpu);
+ complete(&c_idle->done);
}
-#else
-#define alloc_idle_task(cpu) fork_idle(cpu)
-#endif
static int __cpuinit do_boot_cpu(int apicid, int cpu)
/*
@@ -762,45 +832,92 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
* Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu.
*/
{
- struct task_struct *idle;
- unsigned long boot_error;
+ unsigned long boot_error = 0;
int timeout;
- unsigned long start_eip;
+ unsigned long start_ip;
unsigned short nmi_high = 0, nmi_low = 0;
+ struct create_idle c_idle = {
+ .cpu = cpu,
+ .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
+ };
+ INIT_WORK(&c_idle.work, do_fork_idle);
+#ifdef CONFIG_X86_64
+ /* allocate memory for gdts of secondary cpus. Hotplug is considered */
+ if (!cpu_gdt_descr[cpu].address &&
+ !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
+ printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
+ return -1;
+ }
- /*
- * Save current MTRR state in case it was changed since early boot
- * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
- */
- mtrr_save_state();
+ /* Allocate node local memory for AP pdas */
+ if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) {
+ struct x8664_pda *newpda, *pda;
+ int node = cpu_to_node(cpu);
+ pda = cpu_pda(cpu);
+ newpda = kmalloc_node(sizeof(struct x8664_pda), GFP_ATOMIC,
+ node);
+ if (newpda) {
+ memcpy(newpda, pda, sizeof(struct x8664_pda));
+ cpu_pda(cpu) = newpda;
+ } else
+ printk(KERN_ERR
+ "Could not allocate node local PDA for CPU %d on node %d\n",
+ cpu, node);
+ }
+#endif
+
+ alternatives_smp_switch(1);
+
+ c_idle.idle = get_idle_for_cpu(cpu);
/*
* We can't use kernel_thread since we must avoid to
* reschedule the child.
*/
- idle = alloc_idle_task(cpu);
- if (IS_ERR(idle))
- panic("failed fork for CPU %d", cpu);
+ if (c_idle.idle) {
+ c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
+ (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
+ init_idle(c_idle.idle, cpu);
+ goto do_rest;
+ }
+
+ if (!keventd_up() || current_is_keventd())
+ c_idle.work.func(&c_idle.work);
+ else {
+ schedule_work(&c_idle.work);
+ wait_for_completion(&c_idle.done);
+ }
+ if (IS_ERR(c_idle.idle)) {
+ printk("failed fork for CPU %d\n", cpu);
+ return PTR_ERR(c_idle.idle);
+ }
+
+ set_idle_for_cpu(cpu, c_idle.idle);
+do_rest:
+#ifdef CONFIG_X86_32
+ per_cpu(current_task, cpu) = c_idle.idle;
init_gdt(cpu);
- per_cpu(current_task, cpu) = idle;
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+ c_idle.idle->thread.ip = (unsigned long) start_secondary;
+ /* Stack for startup_32 can be just as for start_secondary onwards */
+ stack_start.sp = (void *) c_idle.idle->thread.sp;
+ irq_ctx_init(cpu);
+#else
+ cpu_pda(cpu)->pcurrent = c_idle.idle;
+ init_rsp = c_idle.idle->thread.sp;
+ load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
+ initial_code = (unsigned long)start_secondary;
+ clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+#endif
- idle->thread.ip = (unsigned long) start_secondary;
- /* start_eip had better be page-aligned! */
- start_eip = setup_trampoline();
-
- ++cpucount;
- alternatives_smp_switch(1);
+ /* start_ip had better be page-aligned! */
+ start_ip = setup_trampoline();
/* So we see what's up */
- printk("Booting processor %d/%d ip %lx\n", cpu, apicid, start_eip);
- /* Stack for startup_32 can be just as for start_secondary onwards */
- stack_start.sp = (void *) idle->thread.sp;
+ printk(KERN_INFO "Booting processor %d/%d ip %lx\n",
+ cpu, apicid, start_ip);
- irq_ctx_init(cpu);
-
- per_cpu(x86_cpu_to_apicid, cpu) = apicid;
/*
* This grunge runs the startup process for
* the targeted processor.
@@ -808,16 +925,24 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
atomic_set(&init_deasserted, 0);
- Dprintk("Setting warm reset code and vector.\n");
+ if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
+
+ Dprintk("Setting warm reset code and vector.\n");
- store_NMI_vector(&nmi_high, &nmi_low);
+ store_NMI_vector(&nmi_high, &nmi_low);
- smpboot_setup_warm_reset_vector(start_eip);
+ smpboot_setup_warm_reset_vector(start_ip);
+ /*
+ * Be paranoid about clearing APIC errors.
+ */
+ apic_write(APIC_ESR, 0);
+ apic_read(APIC_ESR);
+ }
/*
* Starting actual IPI sequence...
*/
- boot_error = wakeup_secondary_cpu(apicid, start_eip);
+ boot_error = wakeup_secondary_cpu(apicid, start_ip);
if (!boot_error) {
/*
@@ -839,175 +964,179 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
if (cpu_isset(cpu, cpu_callin_map)) {
/* number CPUs logically, starting from 1 (BSP is 0) */
Dprintk("OK.\n");
- printk("CPU%d: ", cpu);
+ printk(KERN_INFO "CPU%d: ", cpu);
print_cpu_info(&cpu_data(cpu));
Dprintk("CPU has booted.\n");
} else {
- boot_error= 1;
+ boot_error = 1;
if (*((volatile unsigned char *)trampoline_base)
== 0xA5)
/* trampoline started but...? */
- printk("Stuck ??\n");
+ printk(KERN_ERR "Stuck ??\n");
else
/* trampoline code not run */
- printk("Not responding.\n");
- inquire_remote_apic(apicid);
+ printk(KERN_ERR "Not responding.\n");
+ if (get_uv_system_type() != UV_NON_UNIQUE_APIC)
+ inquire_remote_apic(apicid);
}
}
if (boot_error) {
/* Try to put things back the way they were before ... */
unmap_cpu_to_logical_apicid(cpu);
- cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
+#ifdef CONFIG_X86_64
+ clear_node_cpumask(cpu); /* was set by numa_add_cpu */
+#endif
+ cpu_clear(cpu, cpu_callout_map); /* was set by do_boot_cpu() */
cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */
- cpucount--;
- } else {
- per_cpu(x86_cpu_to_apicid, cpu) = apicid;
- cpu_set(cpu, cpu_present_map);
+ cpu_clear(cpu, cpu_possible_map);
+ cpu_clear(cpu, cpu_present_map);
+ per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
}
/* mark "stuck" area as not stuck */
*((volatile unsigned long *)trampoline_base) = 0;
+ /*
+ * Cleanup possible dangling ends...
+ */
+ smpboot_restore_warm_reset_vector();
+
return boot_error;
}
-#ifdef CONFIG_HOTPLUG_CPU
-void cpu_exit_clear(void)
+int __cpuinit native_cpu_up(unsigned int cpu)
{
- int cpu = raw_smp_processor_id();
+ int apicid = cpu_present_to_apicid(cpu);
+ unsigned long flags;
+ int err;
- idle_task_exit();
+ WARN_ON(irqs_disabled());
- cpucount --;
- cpu_uninit();
- irq_ctx_exit(cpu);
+ Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu);
- cpu_clear(cpu, cpu_callout_map);
- cpu_clear(cpu, cpu_callin_map);
+ if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
+ !physid_isset(apicid, phys_cpu_present_map)) {
+ printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu);
+ return -EINVAL;
+ }
- cpu_clear(cpu, smp_commenced_mask);
- unmap_cpu_to_logical_apicid(cpu);
-}
+ /*
+ * Already booted CPU?
+ */
+ if (cpu_isset(cpu, cpu_callin_map)) {
+ Dprintk("do_boot_cpu %d Already started\n", cpu);
+ return -ENOSYS;
+ }
-struct warm_boot_cpu_info {
- struct completion *complete;
- struct work_struct task;
- int apicid;
- int cpu;
-};
+ /*
+ * Save current MTRR state in case it was changed since early boot
+ * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
+ */
+ mtrr_save_state();
-static void __cpuinit do_warm_boot_cpu(struct work_struct *work)
-{
- struct warm_boot_cpu_info *info =
- container_of(work, struct warm_boot_cpu_info, task);
- do_boot_cpu(info->apicid, info->cpu);
- complete(info->complete);
-}
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
-static int __cpuinit __smp_prepare_cpu(int cpu)
-{
- DECLARE_COMPLETION_ONSTACK(done);
- struct warm_boot_cpu_info info;
- int apicid, ret;
-
- apicid = per_cpu(x86_cpu_to_apicid, cpu);
- if (apicid == BAD_APICID) {
- ret = -ENODEV;
- goto exit;
+#ifdef CONFIG_X86_32
+ /* init low mem mapping */
+ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
+ flush_tlb_all();
+#endif
+
+ err = do_boot_cpu(apicid, cpu);
+ if (err < 0) {
+ Dprintk("do_boot_cpu failed %d\n", err);
+ return err;
}
- info.complete = &done;
- info.apicid = apicid;
- info.cpu = cpu;
- INIT_WORK(&info.task, do_warm_boot_cpu);
+ /*
+ * Check TSC synchronization with the AP (keep irqs disabled
+ * while doing so):
+ */
+ local_irq_save(flags);
+ check_tsc_sync_source(cpu);
+ local_irq_restore(flags);
- /* init low mem mapping */
- clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
- min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
- flush_tlb_all();
- schedule_work(&info.task);
- wait_for_completion(&done);
+ while (!cpu_online(cpu)) {
+ cpu_relax();
+ touch_nmi_watchdog();
+ }
- zap_low_mappings();
- ret = 0;
-exit:
- return ret;
+ return 0;
}
-#endif
/*
- * Cycle through the processors sending APIC IPIs to boot each.
+ * Fall back to non SMP mode after errors.
+ *
+ * RED-PEN audit/test this more. I bet there is more state messed up here.
*/
-
-static int boot_cpu_logical_apicid;
-/* Where the IO area was mapped on multiquad, always 0 otherwise */
-void *xquad_portio;
-#ifdef CONFIG_X86_NUMAQ
-EXPORT_SYMBOL(xquad_portio);
+static __init void disable_smp(void)
+{
+ cpu_present_map = cpumask_of_cpu(0);
+ cpu_possible_map = cpumask_of_cpu(0);
+#ifdef CONFIG_X86_32
+ smpboot_clear_io_apic_irqs();
#endif
+ if (smp_found_config)
+ phys_cpu_present_map =
+ physid_mask_of_physid(boot_cpu_physical_apicid);
+ else
+ phys_cpu_present_map = physid_mask_of_physid(0);
+ map_cpu_to_logical_apicid();
+ cpu_set(0, per_cpu(cpu_sibling_map, 0));
+ cpu_set(0, per_cpu(cpu_core_map, 0));
+}
-static void __init smp_boot_cpus(unsigned int max_cpus)
+/*
+ * Various sanity checks.
+ */
+static int __init smp_sanity_check(unsigned max_cpus)
{
- int apicid, cpu, bit, kicked;
- unsigned long bogosum = 0;
-
- /*
- * Setup boot CPU information
- */
- smp_store_cpu_info(0); /* Final full version of the data */
- printk("CPU%d: ", 0);
- print_cpu_info(&cpu_data(0));
-
- boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID));
- boot_cpu_logical_apicid = logical_smp_processor_id();
- per_cpu(x86_cpu_to_apicid, 0) = boot_cpu_physical_apicid;
-
- current_thread_info()->cpu = 0;
-
- set_cpu_sibling_map(0);
+ preempt_disable();
+ if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
+ printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
+ "by the BIOS.\n", hard_smp_processor_id());
+ physid_set(hard_smp_processor_id(), phys_cpu_present_map);
+ }
/*
* If we couldn't find an SMP configuration at boot time,
* get out of here now!
*/
if (!smp_found_config && !acpi_lapic) {
+ preempt_enable();
printk(KERN_NOTICE "SMP motherboard not detected.\n");
- smpboot_clear_io_apic_irqs();
- phys_cpu_present_map = physid_mask_of_physid(0);
+ disable_smp();
if (APIC_init_uniprocessor())
printk(KERN_NOTICE "Local APIC not detected."
" Using dummy APIC emulation.\n");
- map_cpu_to_logical_apicid();
- cpu_set(0, per_cpu(cpu_sibling_map, 0));
- cpu_set(0, per_cpu(cpu_core_map, 0));
- return;
+ return -1;
}
/*
* Should not be necessary because the MP table should list the boot
* CPU too, but we do it for the sake of robustness anyway.
- * Makes no sense to do this check in clustered apic mode, so skip it
*/
if (!check_phys_apicid_present(boot_cpu_physical_apicid)) {
- printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
- boot_cpu_physical_apicid);
+ printk(KERN_NOTICE
+ "weird, boot CPU (#%d) not listed by the BIOS.\n",
+ boot_cpu_physical_apicid);
physid_set(hard_smp_processor_id(), phys_cpu_present_map);
}
+ preempt_enable();
/*
* If we couldn't find a local APIC, then get out of here now!
*/
- if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) {
+ if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
+ !cpu_has_apic) {
printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
boot_cpu_physical_apicid);
- printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
- smpboot_clear_io_apic_irqs();
- phys_cpu_present_map = physid_mask_of_physid(0);
- map_cpu_to_logical_apicid();
- cpu_set(0, per_cpu(cpu_sibling_map, 0));
- cpu_set(0, per_cpu(cpu_core_map, 0));
- return;
+ printk(KERN_ERR "... forcing use of dummy APIC emulation."
+ "(tell your hw vendor)\n");
+ smpboot_clear_io_apic();
+ return -1;
}
verify_local_APIC();
@@ -1016,138 +1145,149 @@ static void __init smp_boot_cpus(unsigned int max_cpus)
* If SMP should be disabled, then really disable it!
*/
if (!max_cpus) {
- smp_found_config = 0;
- printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
-
+ printk(KERN_INFO "SMP mode deactivated,"
+ "forcing use of dummy APIC emulation.\n");
+ smpboot_clear_io_apic();
+#ifdef CONFIG_X86_32
if (nmi_watchdog == NMI_LOCAL_APIC) {
- printk(KERN_INFO "activating minimal APIC for NMI watchdog use.\n");
+ printk(KERN_INFO "activating minimal APIC for"
+ "NMI watchdog use.\n");
connect_bsp_APIC();
setup_local_APIC();
+ end_local_APIC_setup();
}
- smpboot_clear_io_apic_irqs();
- phys_cpu_present_map = physid_mask_of_physid(0);
- map_cpu_to_logical_apicid();
- cpu_set(0, per_cpu(cpu_sibling_map, 0));
- cpu_set(0, per_cpu(cpu_core_map, 0));
- return;
+#endif
+ return -1;
}
- connect_bsp_APIC();
- setup_local_APIC();
- map_cpu_to_logical_apicid();
+ return 0;
+}
+static void __init smp_cpu_index_default(void)
+{
+ int i;
+ struct cpuinfo_x86 *c;
- setup_portio_remap();
+ for_each_possible_cpu(i) {
+ c = &cpu_data(i);
+ /* mark all to hotplug */
+ c->cpu_index = NR_CPUS;
+ }
+}
+/*
+ * Prepare for SMP bootup. The MP table or ACPI has been read
+ * earlier. Just do some sanity checking here and enable APIC mode.
+ */
+void __init native_smp_prepare_cpus(unsigned int max_cpus)
+{
+ nmi_watchdog_default();
+ smp_cpu_index_default();
+ current_cpu_data = boot_cpu_data;
+ cpu_callin_map = cpumask_of_cpu(0);
+ mb();
/*
- * Scan the CPU present map and fire up the other CPUs via do_boot_cpu
- *
- * In clustered apic mode, phys_cpu_present_map is a constructed thus:
- * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the
- * clustered apic ID.
+ * Setup boot CPU information
*/
- Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map));
-
- kicked = 1;
- for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) {
- apicid = cpu_present_to_apicid(bit);
- /*
- * Don't even attempt to start the boot CPU!
- */
- if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID))
- continue;
+ smp_store_cpu_info(0); /* Final full version of the data */
+ boot_cpu_logical_apicid = logical_smp_processor_id();
+ current_thread_info()->cpu = 0; /* needed? */
+ set_cpu_sibling_map(0);
- if (!check_apicid_present(bit))
- continue;
- if (max_cpus <= cpucount+1)
- continue;
+ if (smp_sanity_check(max_cpus) < 0) {
+ printk(KERN_INFO "SMP disabled\n");
+ disable_smp();
+ return;
+ }
- if (((cpu = alloc_cpu_id()) <= 0) || do_boot_cpu(apicid, cpu))
- printk("CPU #%d not responding - cannot use it.\n",
- apicid);
- else
- ++kicked;
+ preempt_disable();
+ if (GET_APIC_ID(read_apic_id()) != boot_cpu_physical_apicid) {
+ panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
+ GET_APIC_ID(read_apic_id()), boot_cpu_physical_apicid);
+ /* Or can we switch back to PIC here? */
}
+ preempt_enable();
+#ifdef CONFIG_X86_32
+ connect_bsp_APIC();
+#endif
/*
- * Cleanup possible dangling ends...
+ * Switch from PIC to APIC mode.
*/
- smpboot_restore_warm_reset_vector();
+ setup_local_APIC();
+#ifdef CONFIG_X86_64
/*
- * Allow the user to impress friends.
+ * Enable IO APIC before setting up error vector
*/
- Dprintk("Before bogomips.\n");
- for_each_possible_cpu(cpu)
- if (cpu_isset(cpu, cpu_callout_map))
- bogosum += cpu_data(cpu).loops_per_jiffy;
- printk(KERN_INFO
- "Total of %d processors activated (%lu.%02lu BogoMIPS).\n",
- cpucount+1,
- bogosum/(500000/HZ),
- (bogosum/(5000/HZ))%100);
-
- Dprintk("Before bogocount - setting activated=1.\n");
-
- if (smp_b_stepping)
- printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n");
+ if (!skip_ioapic_setup && nr_ioapics)
+ enable_IO_APIC();
+#endif
+ end_local_APIC_setup();
- /*
- * Don't taint if we are running SMP kernel on a single non-MP
- * approved Athlon
- */
- if (tainted & TAINT_UNSAFE_SMP) {
- if (cpucount)
- printk (KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n");
- else
- tainted &= ~TAINT_UNSAFE_SMP;
- }
+ map_cpu_to_logical_apicid();
- Dprintk("Boot done.\n");
+ setup_portio_remap();
+ smpboot_setup_io_apic();
/*
- * construct cpu_sibling_map, so that we can tell sibling CPUs
- * efficiently.
+ * Set up local APIC timer on boot CPU.
*/
- for_each_possible_cpu(cpu) {
- cpus_clear(per_cpu(cpu_sibling_map, cpu));
- cpus_clear(per_cpu(cpu_core_map, cpu));
- }
-
- cpu_set(0, per_cpu(cpu_sibling_map, 0));
- cpu_set(0, per_cpu(cpu_core_map, 0));
-
- smpboot_setup_io_apic();
+ printk(KERN_INFO "CPU%d: ", 0);
+ print_cpu_info(&cpu_data(0));
setup_boot_clock();
}
+/*
+ * Early setup to make printk work.
+ */
+void __init native_smp_prepare_boot_cpu(void)
+{
+ int me = smp_processor_id();
+#ifdef CONFIG_X86_32
+ init_gdt(me);
+ switch_to_new_gdt();
+#endif
+ /* already set me in cpu_online_map in boot_cpu_init() */
+ cpu_set(me, cpu_callout_map);
+ per_cpu(cpu_state, me) = CPU_ONLINE;
+}
-/* These are wrappers to interface to the new boot process. Someone
- who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
-void __init native_smp_prepare_cpus(unsigned int max_cpus)
+void __init native_smp_cpus_done(unsigned int max_cpus)
{
- smp_commenced_mask = cpumask_of_cpu(0);
- cpu_callin_map = cpumask_of_cpu(0);
- mb();
- smp_boot_cpus(max_cpus);
+ Dprintk("Boot done.\n");
+
+ impress_friends();
+ smp_checks();
+#ifdef CONFIG_X86_IO_APIC
+ setup_ioapic_dest();
+#endif
+ check_nmi_watchdog();
+#ifdef CONFIG_X86_32
+ zap_low_mappings();
+#endif
}
-void __init native_smp_prepare_boot_cpu(void)
+#ifdef CONFIG_HOTPLUG_CPU
+
+# ifdef CONFIG_X86_32
+void cpu_exit_clear(void)
{
- unsigned int cpu = smp_processor_id();
+ int cpu = raw_smp_processor_id();
- init_gdt(cpu);
- switch_to_new_gdt();
+ idle_task_exit();
+
+ cpu_uninit();
+ irq_ctx_exit(cpu);
+
+ cpu_clear(cpu, cpu_callout_map);
+ cpu_clear(cpu, cpu_callin_map);
- cpu_set(cpu, cpu_online_map);
- cpu_set(cpu, cpu_callout_map);
- cpu_set(cpu, cpu_present_map);
- cpu_set(cpu, cpu_possible_map);
- __get_cpu_var(cpu_state) = CPU_ONLINE;
+ unmap_cpu_to_logical_apicid(cpu);
}
+# endif /* CONFIG_X86_32 */
-#ifdef CONFIG_HOTPLUG_CPU
-void remove_siblinginfo(int cpu)
+static void remove_siblinginfo(int cpu)
{
int sibling;
struct cpuinfo_x86 *c = &cpu_data(cpu);
@@ -1160,7 +1300,7 @@ void remove_siblinginfo(int cpu)
if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
cpu_data(sibling).booted_cores--;
}
-
+
for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu))
cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
cpus_clear(per_cpu(cpu_sibling_map, cpu));
@@ -1170,35 +1310,99 @@ void remove_siblinginfo(int cpu)
cpu_clear(cpu, cpu_sibling_setup_map);
}
+int additional_cpus __initdata = -1;
+
+static __init int setup_additional_cpus(char *s)
+{
+ return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
+}
+early_param("additional_cpus", setup_additional_cpus);
+
+/*
+ * cpu_possible_map should be static, it cannot change as cpu's
+ * are onlined, or offlined. The reason is per-cpu data-structures
+ * are allocated by some modules at init time, and dont expect to
+ * do this dynamically on cpu arrival/departure.
+ * cpu_present_map on the other hand can change dynamically.
+ * In case when cpu_hotplug is not compiled, then we resort to current
+ * behaviour, which is cpu_possible == cpu_present.
+ * - Ashok Raj
+ *
+ * Three ways to find out the number of additional hotplug CPUs:
+ * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
+ * - The user can overwrite it with additional_cpus=NUM
+ * - Otherwise don't reserve additional CPUs.
+ * We do this because additional CPUs waste a lot of memory.
+ * -AK
+ */
+__init void prefill_possible_map(void)
+{
+ int i;
+ int possible;
+
+ if (additional_cpus == -1) {
+ if (disabled_cpus > 0)
+ additional_cpus = disabled_cpus;
+ else
+ additional_cpus = 0;
+ }
+ possible = num_processors + additional_cpus;
+ if (possible > NR_CPUS)
+ possible = NR_CPUS;
+
+ printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
+ possible, max_t(int, possible - num_processors, 0));
+
+ for (i = 0; i < possible; i++)
+ cpu_set(i, cpu_possible_map);
+}
+
+static void __ref remove_cpu_from_maps(int cpu)
+{
+ cpu_clear(cpu, cpu_online_map);
+#ifdef CONFIG_X86_64
+ cpu_clear(cpu, cpu_callout_map);
+ cpu_clear(cpu, cpu_callin_map);
+ /* was set by cpu_init() */
+ clear_bit(cpu, (unsigned long *)&cpu_initialized);
+ clear_node_cpumask(cpu);
+#endif
+}
+
int __cpu_disable(void)
{
- cpumask_t map = cpu_online_map;
int cpu = smp_processor_id();
/*
* Perhaps use cpufreq to drop frequency, but that could go
* into generic code.
- *
+ *
* We won't take down the boot processor on i386 due to some
* interrupts only being able to be serviced by the BSP.
* Especially so if we're not using an IOAPIC -zwane
*/
if (cpu == 0)
return -EBUSY;
+
if (nmi_watchdog == NMI_LOCAL_APIC)
stop_apic_nmi_watchdog(NULL);
clear_local_APIC();
- /* Allow any queued timer interrupts to get serviced */
+
+ /*
+ * HACK:
+ * Allow any queued timer interrupts to get serviced
+ * This is only a temporary solution until we cleanup
+ * fixup_irqs as we do for IA64.
+ */
local_irq_enable();
mdelay(1);
- local_irq_disable();
+ local_irq_disable();
remove_siblinginfo(cpu);
- cpu_clear(cpu, map);
- fixup_irqs(map);
/* It's now safe to remove this processor from the online map */
- cpu_clear(cpu, cpu_online_map);
+ remove_cpu_from_maps(cpu);
+ fixup_irqs(cpu_online_map);
return 0;
}
@@ -1210,14 +1414,14 @@ void __cpu_die(unsigned int cpu)
for (i = 0; i < 10; i++) {
/* They ack this in play_dead by setting CPU_DEAD */
if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
- printk ("CPU %d is now offline\n", cpu);
+ printk(KERN_INFO "CPU %d is now offline\n", cpu);
if (1 == num_online_cpus())
alternatives_smp_switch(0);
return;
}
msleep(100);
}
- printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+ printk(KERN_ERR "CPU %u didn't die...\n", cpu);
}
#else /* ... !CONFIG_HOTPLUG_CPU */
int __cpu_disable(void)
@@ -1230,82 +1434,8 @@ void __cpu_die(unsigned int cpu)
/* We said "no" in __cpu_disable */
BUG();
}
-#endif /* CONFIG_HOTPLUG_CPU */
-
-int __cpuinit native_cpu_up(unsigned int cpu)
-{
- unsigned long flags;
-#ifdef CONFIG_HOTPLUG_CPU
- int ret = 0;
-
- /*
- * We do warm boot only on cpus that had booted earlier
- * Otherwise cold boot is all handled from smp_boot_cpus().
- * cpu_callin_map is set during AP kickstart process. Its reset
- * when a cpu is taken offline from cpu_exit_clear().
- */
- if (!cpu_isset(cpu, cpu_callin_map))
- ret = __smp_prepare_cpu(cpu);
-
- if (ret)
- return -EIO;
#endif
- /* In case one didn't come up */
- if (!cpu_isset(cpu, cpu_callin_map)) {
- printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
- return -EIO;
- }
-
- per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
- /* Unleash the CPU! */
- cpu_set(cpu, smp_commenced_mask);
-
- /*
- * Check TSC synchronization with the AP (keep irqs disabled
- * while doing so):
- */
- local_irq_save(flags);
- check_tsc_sync_source(cpu);
- local_irq_restore(flags);
-
- while (!cpu_isset(cpu, cpu_online_map)) {
- cpu_relax();
- touch_nmi_watchdog();
- }
-
- return 0;
-}
-
-void __init native_smp_cpus_done(unsigned int max_cpus)
-{
-#ifdef CONFIG_X86_IO_APIC
- setup_ioapic_dest();
-#endif
- zap_low_mappings();
-}
-
-void __init smp_intr_init(void)
-{
- /*
- * IRQ0 must be given a fixed assignment and initialized,
- * because it's used before the IO-APIC is set up.
- */
- set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
-
- /*
- * The reschedule interrupt is a CPU-to-CPU reschedule-helper
- * IPI, driven by wakeup.
- */
- set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
- /* IPI for invalidation */
- set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
-
- /* IPI for generic function call */
- set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-}
-
/*
* If the BIOS enumerates physical processors before logical,
* maxcpus=N at enumeration-time can be used to disable HT.
diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c
deleted file mode 100644
index 0880f2c388a9..000000000000
--- a/arch/x86/kernel/smpboot_64.c
+++ /dev/null
@@ -1,1108 +0,0 @@
-/*
- * x86 SMP booting functions
- *
- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- * (c) 1998, 1999, 2000 Ingo Molnar <mingo@redhat.com>
- * Copyright 2001 Andi Kleen, SuSE Labs.
- *
- * Much of the core SMP work is based on previous work by Thomas Radke, to
- * whom a great many thanks are extended.
- *
- * Thanks to Intel for making available several different Pentium,
- * Pentium Pro and Pentium-II/Xeon MP machines.
- * Original development of Linux SMP code supported by Caldera.
- *
- * This code is released under the GNU General Public License version 2
- *
- * Fixes
- * Felix Koop : NR_CPUS used properly
- * Jose Renau : Handle single CPU case.
- * Alan Cox : By repeated request 8) - Total BogoMIP report.
- * Greg Wright : Fix for kernel stacks panic.
- * Erich Boleyn : MP v1.4 and additional changes.
- * Matthias Sattler : Changes for 2.1 kernel map.
- * Michel Lespinasse : Changes for 2.1 kernel map.
- * Michael Chastain : Change trampoline.S to gnu as.
- * Alan Cox : Dumb bug: 'B' step PPro's are fine
- * Ingo Molnar : Added APIC timers, based on code
- * from Jose Renau
- * Ingo Molnar : various cleanups and rewrites
- * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug.
- * Maciej W. Rozycki : Bits for genuine 82489DX APICs
- * Andi Kleen : Changed for SMP boot into long mode.
- * Rusty Russell : Hacked into shape for new "hotplug" boot process.
- * Andi Kleen : Converted to new state machine.
- * Various cleanups.
- * Probably mostly hotplug CPU ready now.
- * Ashok Raj : CPU hotplug support
- */
-
-
-#include <linux/init.h>
-
-#include <linux/mm.h>
-#include <linux/kernel_stat.h>
-#include <linux/bootmem.h>
-#include <linux/thread_info.h>
-#include <linux/module.h>
-#include <linux/delay.h>
-#include <linux/mc146818rtc.h>
-#include <linux/smp.h>
-#include <linux/kdebug.h>
-
-#include <asm/mtrr.h>
-#include <asm/pgalloc.h>
-#include <asm/desc.h>
-#include <asm/tlbflush.h>
-#include <asm/proto.h>
-#include <asm/nmi.h>
-#include <asm/irq.h>
-#include <asm/hw_irq.h>
-#include <asm/numa.h>
-
-/* Number of siblings per CPU package */
-int smp_num_siblings = 1;
-EXPORT_SYMBOL(smp_num_siblings);
-
-/* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
-
-/* Bitmask of currently online CPUs */
-cpumask_t cpu_online_map __read_mostly;
-
-EXPORT_SYMBOL(cpu_online_map);
-
-/*
- * Private maps to synchronize booting between AP and BP.
- * Probably not needed anymore, but it makes for easier debugging. -AK
- */
-cpumask_t cpu_callin_map;
-cpumask_t cpu_callout_map;
-cpumask_t cpu_possible_map;
-EXPORT_SYMBOL(cpu_possible_map);
-
-/* Per CPU bogomips and other parameters */
-DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
-EXPORT_PER_CPU_SYMBOL(cpu_info);
-
-/* Set when the idlers are all forked */
-int smp_threads_ready;
-
-/* representing HT siblings of each logical CPU */
-DEFINE_PER_CPU(cpumask_t, cpu_sibling_map);
-EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
-
-/* representing HT and core siblings of each logical CPU */
-DEFINE_PER_CPU(cpumask_t, cpu_core_map);
-EXPORT_PER_CPU_SYMBOL(cpu_core_map);
-
-/*
- * Trampoline 80x86 program as an array.
- */
-
-extern const unsigned char trampoline_data[];
-extern const unsigned char trampoline_end[];
-
-/* State of each CPU */
-DEFINE_PER_CPU(int, cpu_state) = { 0 };
-
-/*
- * Store all idle threads, this can be reused instead of creating
- * a new thread. Also avoids complicated thread destroy functionality
- * for idle threads.
- */
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
- * removed after init for !CONFIG_HOTPLUG_CPU.
- */
-static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
-#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
-#define set_idle_for_cpu(x,p) (per_cpu(idle_thread_array, x) = (p))
-#else
-struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
-#define get_idle_for_cpu(x) (idle_thread_array[(x)])
-#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
-#endif
-
-
-/*
- * Currently trivial. Write the real->protected mode
- * bootstrap into the page concerned. The caller
- * has made sure it's suitably aligned.
- */
-
-static unsigned long __cpuinit setup_trampoline(void)
-{
- void *tramp = __va(SMP_TRAMPOLINE_BASE);
- memcpy(tramp, trampoline_data, trampoline_end - trampoline_data);
- return virt_to_phys(tramp);
-}
-
-/*
- * The bootstrap kernel entry code has set these up. Save them for
- * a given CPU
- */
-
-static void __cpuinit smp_store_cpu_info(int id)
-{
- struct cpuinfo_x86 *c = &cpu_data(id);
-
- *c = boot_cpu_data;
- c->cpu_index = id;
- identify_cpu(c);
- print_cpu_info(c);
-}
-
-static atomic_t init_deasserted __cpuinitdata;
-
-/*
- * Report back to the Boot Processor.
- * Running on AP.
- */
-void __cpuinit smp_callin(void)
-{
- int cpuid, phys_id;
- unsigned long timeout;
-
- /*
- * If waken up by an INIT in an 82489DX configuration
- * we may get here before an INIT-deassert IPI reaches
- * our local APIC. We have to wait for the IPI or we'll
- * lock up on an APIC access.
- */
- while (!atomic_read(&init_deasserted))
- cpu_relax();
-
- /*
- * (This works even if the APIC is not enabled.)
- */
- phys_id = GET_APIC_ID(apic_read(APIC_ID));
- cpuid = smp_processor_id();
- if (cpu_isset(cpuid, cpu_callin_map)) {
- panic("smp_callin: phys CPU#%d, CPU#%d already present??\n",
- phys_id, cpuid);
- }
- Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
-
- /*
- * STARTUP IPIs are fragile beasts as they might sometimes
- * trigger some glue motherboard logic. Complete APIC bus
- * silence for 1 second, this overestimates the time the
- * boot CPU is spending to send the up to 2 STARTUP IPIs
- * by a factor of two. This should be enough.
- */
-
- /*
- * Waiting 2s total for startup (udelay is not yet working)
- */
- timeout = jiffies + 2*HZ;
- while (time_before(jiffies, timeout)) {
- /*
- * Has the boot CPU finished it's STARTUP sequence?
- */
- if (cpu_isset(cpuid, cpu_callout_map))
- break;
- cpu_relax();
- }
-
- if (!time_before(jiffies, timeout)) {
- panic("smp_callin: CPU%d started up but did not get a callout!\n",
- cpuid);
- }
-
- /*
- * the boot CPU has finished the init stage and is spinning
- * on callin_map until we finish. We are free to set up this
- * CPU, first the APIC. (this is probably redundant on most
- * boards)
- */
-
- Dprintk("CALLIN, before setup_local_APIC().\n");
- setup_local_APIC();
- end_local_APIC_setup();
-
- /*
- * Get our bogomips.
- *
- * Need to enable IRQs because it can take longer and then
- * the NMI watchdog might kill us.
- */
- local_irq_enable();
- calibrate_delay();
- local_irq_disable();
- Dprintk("Stack at about %p\n",&cpuid);
-
- /*
- * Save our processor parameters
- */
- smp_store_cpu_info(cpuid);
-
- /*
- * Allow the master to continue.
- */
- cpu_set(cpuid, cpu_callin_map);
-}
-
-/* maps the cpu to the sched domain representing multi-core */
-cpumask_t cpu_coregroup_map(int cpu)
-{
- struct cpuinfo_x86 *c = &cpu_data(cpu);
- /*
- * For perf, we return last level cache shared map.
- * And for power savings, we return cpu_core_map
- */
- if (sched_mc_power_savings || sched_smt_power_savings)
- return per_cpu(cpu_core_map, cpu);
- else
- return c->llc_shared_map;
-}
-
-/* representing cpus for which sibling maps can be computed */
-static cpumask_t cpu_sibling_setup_map;
-
-static inline void set_cpu_sibling_map(int cpu)
-{
- int i;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- cpu_set(cpu, cpu_sibling_setup_map);
-
- if (smp_num_siblings > 1) {
- for_each_cpu_mask(i, cpu_sibling_setup_map) {
- if (c->phys_proc_id == cpu_data(i).phys_proc_id &&
- c->cpu_core_id == cpu_data(i).cpu_core_id) {
- cpu_set(i, per_cpu(cpu_sibling_map, cpu));
- cpu_set(cpu, per_cpu(cpu_sibling_map, i));
- cpu_set(i, per_cpu(cpu_core_map, cpu));
- cpu_set(cpu, per_cpu(cpu_core_map, i));
- cpu_set(i, c->llc_shared_map);
- cpu_set(cpu, cpu_data(i).llc_shared_map);
- }
- }
- } else {
- cpu_set(cpu, per_cpu(cpu_sibling_map, cpu));
- }
-
- cpu_set(cpu, c->llc_shared_map);
-
- if (current_cpu_data.x86_max_cores == 1) {
- per_cpu(cpu_core_map, cpu) = per_cpu(cpu_sibling_map, cpu);
- c->booted_cores = 1;
- return;
- }
-
- for_each_cpu_mask(i, cpu_sibling_setup_map) {
- if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
- per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
- cpu_set(i, c->llc_shared_map);
- cpu_set(cpu, cpu_data(i).llc_shared_map);
- }
- if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
- cpu_set(i, per_cpu(cpu_core_map, cpu));
- cpu_set(cpu, per_cpu(cpu_core_map, i));
- /*
- * Does this new cpu bringup a new core?
- */
- if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) {
- /*
- * for each core in package, increment
- * the booted_cores for this new cpu
- */
- if (first_cpu(per_cpu(cpu_sibling_map, i)) == i)
- c->booted_cores++;
- /*
- * increment the core count for all
- * the other cpus in this package
- */
- if (i != cpu)
- cpu_data(i).booted_cores++;
- } else if (i != cpu && !c->booted_cores)
- c->booted_cores = cpu_data(i).booted_cores;
- }
- }
-}
-
-/*
- * Setup code on secondary processor (after comming out of the trampoline)
- */
-void __cpuinit start_secondary(void)
-{
- /*
- * Dont put anything before smp_callin(), SMP
- * booting is too fragile that we want to limit the
- * things done here to the most necessary things.
- */
- cpu_init();
- preempt_disable();
- smp_callin();
-
- /* otherwise gcc will move up the smp_processor_id before the cpu_init */
- barrier();
-
- /*
- * Check TSC sync first:
- */
- check_tsc_sync_target();
-
- if (nmi_watchdog == NMI_IO_APIC) {
- disable_8259A_irq(0);
- enable_NMI_through_LVT0();
- enable_8259A_irq(0);
- }
-
- /*
- * The sibling maps must be set before turing the online map on for
- * this cpu
- */
- set_cpu_sibling_map(smp_processor_id());
-
- /*
- * We need to hold call_lock, so there is no inconsistency
- * between the time smp_call_function() determines number of
- * IPI recipients, and the time when the determination is made
- * for which cpus receive the IPI in genapic_flat.c. Holding this
- * lock helps us to not include this cpu in a currently in progress
- * smp_call_function().
- */
- lock_ipi_call_lock();
- spin_lock(&vector_lock);
-
- /* Setup the per cpu irq handling data structures */
- __setup_vector_irq(smp_processor_id());
- /*
- * Allow the master to continue.
- */
- cpu_set(smp_processor_id(), cpu_online_map);
- per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
- spin_unlock(&vector_lock);
-
- unlock_ipi_call_lock();
-
- setup_secondary_clock();
-
- cpu_idle();
-}
-
-extern volatile unsigned long init_rsp;
-extern void (*initial_code)(void);
-
-#ifdef APIC_DEBUG
-static void inquire_remote_apic(int apicid)
-{
- unsigned i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 };
- char *names[] = { "ID", "VERSION", "SPIV" };
- int timeout;
- u32 status;
-
- printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
-
- for (i = 0; i < ARRAY_SIZE(regs); i++) {
- printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]);
-
- /*
- * Wait for idle.
- */
- status = safe_apic_wait_icr_idle();
- if (status)
- printk(KERN_CONT
- "a previous APIC delivery may have failed\n");
-
- apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
- apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
-
- timeout = 0;
- do {
- udelay(100);
- status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK;
- } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000);
-
- switch (status) {
- case APIC_ICR_RR_VALID:
- status = apic_read(APIC_RRR);
- printk(KERN_CONT "%08x\n", status);
- break;
- default:
- printk(KERN_CONT "failed\n");
- }
- }
-}
-#endif
-
-/*
- * Kick the secondary to wake up.
- */
-static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int start_rip)
-{
- unsigned long send_status, accept_status = 0;
- int maxlvt, num_starts, j;
-
- Dprintk("Asserting INIT.\n");
-
- /*
- * Turn INIT on target chip
- */
- apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
- /*
- * Send IPI
- */
- apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT
- | APIC_DM_INIT);
-
- Dprintk("Waiting for send to finish...\n");
- send_status = safe_apic_wait_icr_idle();
-
- mdelay(10);
-
- Dprintk("Deasserting INIT.\n");
-
- /* Target chip */
- apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
- /* Send IPI */
- apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT);
-
- Dprintk("Waiting for send to finish...\n");
- send_status = safe_apic_wait_icr_idle();
-
- mb();
- atomic_set(&init_deasserted, 1);
-
- num_starts = 2;
-
- /*
- * Run STARTUP IPI loop.
- */
- Dprintk("#startup loops: %d.\n", num_starts);
-
- maxlvt = lapic_get_maxlvt();
-
- for (j = 1; j <= num_starts; j++) {
- Dprintk("Sending STARTUP #%d.\n",j);
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
- Dprintk("After apic_write.\n");
-
- /*
- * STARTUP IPI
- */
-
- /* Target chip */
- apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
- /* Boot on the stack */
- /* Kick the second */
- apic_write(APIC_ICR, APIC_DM_STARTUP | (start_rip >> 12));
-
- /*
- * Give the other CPU some time to accept the IPI.
- */
- udelay(300);
-
- Dprintk("Startup point 1.\n");
-
- Dprintk("Waiting for send to finish...\n");
- send_status = safe_apic_wait_icr_idle();
-
- /*
- * Give the other CPU some time to accept the IPI.
- */
- udelay(200);
- /*
- * Due to the Pentium erratum 3AP.
- */
- if (maxlvt > 3) {
- apic_write(APIC_ESR, 0);
- }
- accept_status = (apic_read(APIC_ESR) & 0xEF);
- if (send_status || accept_status)
- break;
- }
- Dprintk("After Startup.\n");
-
- if (send_status)
- printk(KERN_ERR "APIC never delivered???\n");
- if (accept_status)
- printk(KERN_ERR "APIC delivery error (%lx).\n", accept_status);
-
- return (send_status | accept_status);
-}
-
-struct create_idle {
- struct work_struct work;
- struct task_struct *idle;
- struct completion done;
- int cpu;
-};
-
-static void __cpuinit do_fork_idle(struct work_struct *work)
-{
- struct create_idle *c_idle =
- container_of(work, struct create_idle, work);
-
- c_idle->idle = fork_idle(c_idle->cpu);
- complete(&c_idle->done);
-}
-
-/*
- * Boot one CPU.
- */
-static int __cpuinit do_boot_cpu(int cpu, int apicid)
-{
- unsigned long boot_error;
- int timeout;
- unsigned long start_rip;
- struct create_idle c_idle = {
- .cpu = cpu,
- .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
- };
- INIT_WORK(&c_idle.work, do_fork_idle);
-
- /* allocate memory for gdts of secondary cpus. Hotplug is considered */
- if (!cpu_gdt_descr[cpu].address &&
- !(cpu_gdt_descr[cpu].address = get_zeroed_page(GFP_KERNEL))) {
- printk(KERN_ERR "Failed to allocate GDT for CPU %d\n", cpu);
- return -1;
- }
-
- /* Allocate node local memory for AP pdas */
- if (cpu_pda(cpu) == &boot_cpu_pda[cpu]) {
- struct x8664_pda *newpda, *pda;
- int node = cpu_to_node(cpu);
- pda = cpu_pda(cpu);
- newpda = kmalloc_node(sizeof (struct x8664_pda), GFP_ATOMIC,
- node);
- if (newpda) {
- memcpy(newpda, pda, sizeof (struct x8664_pda));
- cpu_pda(cpu) = newpda;
- } else
- printk(KERN_ERR
- "Could not allocate node local PDA for CPU %d on node %d\n",
- cpu, node);
- }
-
- alternatives_smp_switch(1);
-
- c_idle.idle = get_idle_for_cpu(cpu);
-
- if (c_idle.idle) {
- c_idle.idle->thread.sp = (unsigned long) (((struct pt_regs *)
- (THREAD_SIZE + task_stack_page(c_idle.idle))) - 1);
- init_idle(c_idle.idle, cpu);
- goto do_rest;
- }
-
- /*
- * During cold boot process, keventd thread is not spun up yet.
- * When we do cpu hot-add, we create idle threads on the fly, we should
- * not acquire any attributes from the calling context. Hence the clean
- * way to create kernel_threads() is to do that from keventd().
- * We do the current_is_keventd() due to the fact that ACPI notifier
- * was also queuing to keventd() and when the caller is already running
- * in context of keventd(), we would end up with locking up the keventd
- * thread.
- */
- if (!keventd_up() || current_is_keventd())
- c_idle.work.func(&c_idle.work);
- else {
- schedule_work(&c_idle.work);
- wait_for_completion(&c_idle.done);
- }
-
- if (IS_ERR(c_idle.idle)) {
- printk("failed fork for CPU %d\n", cpu);
- return PTR_ERR(c_idle.idle);
- }
-
- set_idle_for_cpu(cpu, c_idle.idle);
-
-do_rest:
-
- cpu_pda(cpu)->pcurrent = c_idle.idle;
-
- start_rip = setup_trampoline();
-
- init_rsp = c_idle.idle->thread.sp;
- load_sp0(&per_cpu(init_tss, cpu), &c_idle.idle->thread);
- initial_code = start_secondary;
- clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
-
- printk(KERN_INFO "Booting processor %d/%d APIC 0x%x\n", cpu,
- cpus_weight(cpu_present_map),
- apicid);
-
- /*
- * This grunge runs the startup process for
- * the targeted processor.
- */
-
- atomic_set(&init_deasserted, 0);
-
- Dprintk("Setting warm reset code and vector.\n");
-
- CMOS_WRITE(0xa, 0xf);
- local_flush_tlb();
- Dprintk("1.\n");
- *((volatile unsigned short *) phys_to_virt(0x469)) = start_rip >> 4;
- Dprintk("2.\n");
- *((volatile unsigned short *) phys_to_virt(0x467)) = start_rip & 0xf;
- Dprintk("3.\n");
-
- /*
- * Be paranoid about clearing APIC errors.
- */
- apic_write(APIC_ESR, 0);
- apic_read(APIC_ESR);
-
- /*
- * Status is now clean
- */
- boot_error = 0;
-
- /*
- * Starting actual IPI sequence...
- */
- boot_error = wakeup_secondary_via_INIT(apicid, start_rip);
-
- if (!boot_error) {
- /*
- * allow APs to start initializing.
- */
- Dprintk("Before Callout %d.\n", cpu);
- cpu_set(cpu, cpu_callout_map);
- Dprintk("After Callout %d.\n", cpu);
-
- /*
- * Wait 5s total for a response
- */
- for (timeout = 0; timeout < 50000; timeout++) {
- if (cpu_isset(cpu, cpu_callin_map))
- break; /* It has booted */
- udelay(100);
- }
-
- if (cpu_isset(cpu, cpu_callin_map)) {
- /* number CPUs logically, starting from 1 (BSP is 0) */
- Dprintk("CPU has booted.\n");
- } else {
- boot_error = 1;
- if (*((volatile unsigned char *)phys_to_virt(SMP_TRAMPOLINE_BASE))
- == 0xA5)
- /* trampoline started but...? */
- printk("Stuck ??\n");
- else
- /* trampoline code not run */
- printk("Not responding.\n");
-#ifdef APIC_DEBUG
- inquire_remote_apic(apicid);
-#endif
- }
- }
- if (boot_error) {
- cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */
- clear_bit(cpu, (unsigned long *)&cpu_initialized); /* was set by cpu_init() */
- clear_node_cpumask(cpu); /* was set by numa_add_cpu */
- cpu_clear(cpu, cpu_present_map);
- cpu_clear(cpu, cpu_possible_map);
- per_cpu(x86_cpu_to_apicid, cpu) = BAD_APICID;
- return -EIO;
- }
-
- return 0;
-}
-
-cycles_t cacheflush_time;
-unsigned long cache_decay_ticks;
-
-/*
- * Cleanup possible dangling ends...
- */
-static __cpuinit void smp_cleanup_boot(void)
-{
- /*
- * Paranoid: Set warm reset code and vector here back
- * to default values.
- */
- CMOS_WRITE(0, 0xf);
-
- /*
- * Reset trampoline flag
- */
- *((volatile int *) phys_to_virt(0x467)) = 0;
-}
-
-/*
- * Fall back to non SMP mode after errors.
- *
- * RED-PEN audit/test this more. I bet there is more state messed up here.
- */
-static __init void disable_smp(void)
-{
- cpu_present_map = cpumask_of_cpu(0);
- cpu_possible_map = cpumask_of_cpu(0);
- if (smp_found_config)
- phys_cpu_present_map = physid_mask_of_physid(boot_cpu_id);
- else
- phys_cpu_present_map = physid_mask_of_physid(0);
- cpu_set(0, per_cpu(cpu_sibling_map, 0));
- cpu_set(0, per_cpu(cpu_core_map, 0));
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-int additional_cpus __initdata = -1;
-
-/*
- * cpu_possible_map should be static, it cannot change as cpu's
- * are onlined, or offlined. The reason is per-cpu data-structures
- * are allocated by some modules at init time, and dont expect to
- * do this dynamically on cpu arrival/departure.
- * cpu_present_map on the other hand can change dynamically.
- * In case when cpu_hotplug is not compiled, then we resort to current
- * behaviour, which is cpu_possible == cpu_present.
- * - Ashok Raj
- *
- * Three ways to find out the number of additional hotplug CPUs:
- * - If the BIOS specified disabled CPUs in ACPI/mptables use that.
- * - The user can overwrite it with additional_cpus=NUM
- * - Otherwise don't reserve additional CPUs.
- * We do this because additional CPUs waste a lot of memory.
- * -AK
- */
-__init void prefill_possible_map(void)
-{
- int i;
- int possible;
-
- if (additional_cpus == -1) {
- if (disabled_cpus > 0)
- additional_cpus = disabled_cpus;
- else
- additional_cpus = 0;
- }
- possible = num_processors + additional_cpus;
- if (possible > NR_CPUS)
- possible = NR_CPUS;
-
- printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
- possible,
- max_t(int, possible - num_processors, 0));
-
- for (i = 0; i < possible; i++)
- cpu_set(i, cpu_possible_map);
-}
-#endif
-
-/*
- * Various sanity checks.
- */
-static int __init smp_sanity_check(unsigned max_cpus)
-{
- if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
- printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
- hard_smp_processor_id());
- physid_set(hard_smp_processor_id(), phys_cpu_present_map);
- }
-
- /*
- * If we couldn't find an SMP configuration at boot time,
- * get out of here now!
- */
- if (!smp_found_config) {
- printk(KERN_NOTICE "SMP motherboard not detected.\n");
- disable_smp();
- if (APIC_init_uniprocessor())
- printk(KERN_NOTICE "Local APIC not detected."
- " Using dummy APIC emulation.\n");
- return -1;
- }
-
- /*
- * Should not be necessary because the MP table should list the boot
- * CPU too, but we do it for the sake of robustness anyway.
- */
- if (!physid_isset(boot_cpu_id, phys_cpu_present_map)) {
- printk(KERN_NOTICE "weird, boot CPU (#%d) not listed by the BIOS.\n",
- boot_cpu_id);
- physid_set(hard_smp_processor_id(), phys_cpu_present_map);
- }
-
- /*
- * If we couldn't find a local APIC, then get out of here now!
- */
- if (!cpu_has_apic) {
- printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
- boot_cpu_id);
- printk(KERN_ERR "... forcing use of dummy APIC emulation. (tell your hw vendor)\n");
- nr_ioapics = 0;
- return -1;
- }
-
- /*
- * If SMP should be disabled, then really disable it!
- */
- if (!max_cpus) {
- printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n");
- nr_ioapics = 0;
- return -1;
- }
-
- return 0;
-}
-
-static void __init smp_cpu_index_default(void)
-{
- int i;
- struct cpuinfo_x86 *c;
-
- for_each_cpu_mask(i, cpu_possible_map) {
- c = &cpu_data(i);
- /* mark all to hotplug */
- c->cpu_index = NR_CPUS;
- }
-}
-
-/*
- * Prepare for SMP bootup. The MP table or ACPI has been read
- * earlier. Just do some sanity checking here and enable APIC mode.
- */
-void __init smp_prepare_cpus(unsigned int max_cpus)
-{
- nmi_watchdog_default();
- smp_cpu_index_default();
- current_cpu_data = boot_cpu_data;
- current_thread_info()->cpu = 0; /* needed? */
- set_cpu_sibling_map(0);
-
- if (smp_sanity_check(max_cpus) < 0) {
- printk(KERN_INFO "SMP disabled\n");
- disable_smp();
- return;
- }
-
-
- /*
- * Switch from PIC to APIC mode.
- */
- setup_local_APIC();
-
- /*
- * Enable IO APIC before setting up error vector
- */
- if (!skip_ioapic_setup && nr_ioapics)
- enable_IO_APIC();
- end_local_APIC_setup();
-
- if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_id) {
- panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
- GET_APIC_ID(apic_read(APIC_ID)), boot_cpu_id);
- /* Or can we switch back to PIC here? */
- }
-
- /*
- * Now start the IO-APICs
- */
- if (!skip_ioapic_setup && nr_ioapics)
- setup_IO_APIC();
- else
- nr_ioapics = 0;
-
- /*
- * Set up local APIC timer on boot CPU.
- */
-
- setup_boot_clock();
-}
-
-/*
- * Early setup to make printk work.
- */
-void __init smp_prepare_boot_cpu(void)
-{
- int me = smp_processor_id();
- /* already set me in cpu_online_map in boot_cpu_init() */
- cpu_set(me, cpu_callout_map);
- per_cpu(cpu_state, me) = CPU_ONLINE;
-}
-
-/*
- * Entry point to boot a CPU.
- */
-int __cpuinit __cpu_up(unsigned int cpu)
-{
- int apicid = cpu_present_to_apicid(cpu);
- unsigned long flags;
- int err;
-
- WARN_ON(irqs_disabled());
-
- Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu);
-
- if (apicid == BAD_APICID || apicid == boot_cpu_id ||
- !physid_isset(apicid, phys_cpu_present_map)) {
- printk("__cpu_up: bad cpu %d\n", cpu);
- return -EINVAL;
- }
-
- /*
- * Already booted CPU?
- */
- if (cpu_isset(cpu, cpu_callin_map)) {
- Dprintk("do_boot_cpu %d Already started\n", cpu);
- return -ENOSYS;
- }
-
- /*
- * Save current MTRR state in case it was changed since early boot
- * (e.g. by the ACPI SMI) to initialize new CPUs with MTRRs in sync:
- */
- mtrr_save_state();
-
- per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
- /* Boot it! */
- err = do_boot_cpu(cpu, apicid);
- if (err < 0) {
- Dprintk("do_boot_cpu failed %d\n", err);
- return err;
- }
-
- /* Unleash the CPU! */
- Dprintk("waiting for cpu %d\n", cpu);
-
- /*
- * Make sure and check TSC sync:
- */
- local_irq_save(flags);
- check_tsc_sync_source(cpu);
- local_irq_restore(flags);
-
- while (!cpu_isset(cpu, cpu_online_map))
- cpu_relax();
- err = 0;
-
- return err;
-}
-
-/*
- * Finish the SMP boot.
- */
-void __init smp_cpus_done(unsigned int max_cpus)
-{
- smp_cleanup_boot();
- setup_ioapic_dest();
- check_nmi_watchdog();
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-static void remove_siblinginfo(int cpu)
-{
- int sibling;
- struct cpuinfo_x86 *c = &cpu_data(cpu);
-
- for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) {
- cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
- /*
- * last thread sibling in this cpu core going down
- */
- if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
- cpu_data(sibling).booted_cores--;
- }
-
- for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu))
- cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
- cpus_clear(per_cpu(cpu_sibling_map, cpu));
- cpus_clear(per_cpu(cpu_core_map, cpu));
- c->phys_proc_id = 0;
- c->cpu_core_id = 0;
- cpu_clear(cpu, cpu_sibling_setup_map);
-}
-
-static void __ref remove_cpu_from_maps(void)
-{
- int cpu = smp_processor_id();
-
- cpu_clear(cpu, cpu_callout_map);
- cpu_clear(cpu, cpu_callin_map);
- clear_bit(cpu, (unsigned long *)&cpu_initialized); /* was set by cpu_init() */
- clear_node_cpumask(cpu);
-}
-
-int __cpu_disable(void)
-{
- int cpu = smp_processor_id();
-
- /*
- * Perhaps use cpufreq to drop frequency, but that could go
- * into generic code.
- *
- * We won't take down the boot processor on i386 due to some
- * interrupts only being able to be serviced by the BSP.
- * Especially so if we're not using an IOAPIC -zwane
- */
- if (cpu == 0)
- return -EBUSY;
-
- if (nmi_watchdog == NMI_LOCAL_APIC)
- stop_apic_nmi_watchdog(NULL);
- clear_local_APIC();
-
- /*
- * HACK:
- * Allow any queued timer interrupts to get serviced
- * This is only a temporary solution until we cleanup
- * fixup_irqs as we do for IA64.
- */
- local_irq_enable();
- mdelay(1);
-
- local_irq_disable();
- remove_siblinginfo(cpu);
-
- spin_lock(&vector_lock);
- /* It's now safe to remove this processor from the online map */
- cpu_clear(cpu, cpu_online_map);
- spin_unlock(&vector_lock);
- remove_cpu_from_maps();
- fixup_irqs(cpu_online_map);
- return 0;
-}
-
-void __cpu_die(unsigned int cpu)
-{
- /* We don't do anything here: idle task is faking death itself. */
- unsigned int i;
-
- for (i = 0; i < 10; i++) {
- /* They ack this in play_dead by setting CPU_DEAD */
- if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
- printk ("CPU %d is now offline\n", cpu);
- if (1 == num_online_cpus())
- alternatives_smp_switch(0);
- return;
- }
- msleep(100);
- }
- printk(KERN_ERR "CPU %u didn't die...\n", cpu);
-}
-
-static __init int setup_additional_cpus(char *s)
-{
- return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
-}
-early_param("additional_cpus", setup_additional_cpus);
-
-#else /* ... !CONFIG_HOTPLUG_CPU */
-
-int __cpu_disable(void)
-{
- return -ENOSYS;
-}
-
-void __cpu_die(unsigned int cpu)
-{
- /* We said "no" in __cpu_disable */
- BUG();
-}
-#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
new file mode 100644
index 000000000000..3449064d141a
--- /dev/null
+++ b/arch/x86/kernel/smpcommon.c
@@ -0,0 +1,83 @@
+/*
+ * SMP stuff which is common to all sub-architectures.
+ */
+#include <linux/module.h>
+#include <asm/smp.h>
+
+#ifdef CONFIG_X86_32
+DEFINE_PER_CPU(unsigned long, this_cpu_off);
+EXPORT_PER_CPU_SYMBOL(this_cpu_off);
+
+/* Initialize the CPU's GDT. This is either the boot CPU doing itself
+ (still using the master per-cpu area), or a CPU doing it for a
+ secondary which will soon come up. */
+__cpuinit void init_gdt(int cpu)
+{
+ struct desc_struct *gdt = get_cpu_gdt_table(cpu);
+
+ pack_descriptor(&gdt[GDT_ENTRY_PERCPU],
+ __per_cpu_offset[cpu], 0xFFFFF,
+ 0x2 | DESCTYPE_S, 0x8);
+
+ gdt[GDT_ENTRY_PERCPU].s = 1;
+
+ per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
+ per_cpu(cpu_number, cpu) = cpu;
+}
+#endif
+
+/**
+ * smp_call_function(): Run a function on all other CPUs.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Unused.
+ * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ *
+ * You must not call this function with disabled interrupts or from a
+ * hardware interrupt handler or from a bottom half handler.
+ */
+int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
+ int wait)
+{
+ return smp_call_function_mask(cpu_online_map, func, info, wait);
+}
+EXPORT_SYMBOL(smp_call_function);
+
+/**
+ * smp_call_function_single - Run a function on a specific CPU
+ * @cpu: The target CPU. Cannot be the calling CPU.
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @nonatomic: Unused.
+ * @wait: If true, wait until function has completed on other CPUs.
+ *
+ * Returns 0 on success, else a negative status code.
+ *
+ * If @wait is true, then returns once @func has returned; otherwise
+ * it returns just before the target cpu calls @func.
+ */
+int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+ int nonatomic, int wait)
+{
+ /* prevent preemption and reschedule on another processor */
+ int ret;
+ int me = get_cpu();
+ if (cpu == me) {
+ local_irq_disable();
+ func(info);
+ local_irq_enable();
+ put_cpu();
+ return 0;
+ }
+
+ ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
+
+ put_cpu();
+ return ret;
+}
+EXPORT_SYMBOL(smp_call_function_single);
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
index 8bc38af29aef..8b137891791f 100644
--- a/arch/x86/kernel/smpcommon_32.c
+++ b/arch/x86/kernel/smpcommon_32.c
@@ -1,82 +1 @@
-/*
- * SMP stuff which is common to all sub-architectures.
- */
-#include <linux/module.h>
-#include <asm/smp.h>
-DEFINE_PER_CPU(unsigned long, this_cpu_off);
-EXPORT_PER_CPU_SYMBOL(this_cpu_off);
-
-/* Initialize the CPU's GDT. This is either the boot CPU doing itself
- (still using the master per-cpu area), or a CPU doing it for a
- secondary which will soon come up. */
-__cpuinit void init_gdt(int cpu)
-{
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-
- pack_descriptor(&gdt[GDT_ENTRY_PERCPU],
- __per_cpu_offset[cpu], 0xFFFFF,
- 0x2 | DESCTYPE_S, 0x8);
-
- gdt[GDT_ENTRY_PERCPU].s = 1;
-
- per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
- per_cpu(cpu_number, cpu) = cpu;
-}
-
-
-/**
- * smp_call_function(): Run a function on all other CPUs.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: Unused.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code.
- *
- * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- */
-int smp_call_function(void (*func) (void *info), void *info, int nonatomic,
- int wait)
-{
- return smp_call_function_mask(cpu_online_map, func, info, wait);
-}
-EXPORT_SYMBOL(smp_call_function);
-
-/**
- * smp_call_function_single - Run a function on a specific CPU
- * @cpu: The target CPU. Cannot be the calling CPU.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: Unused.
- * @wait: If true, wait until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code.
- *
- * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func.
- */
-int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
- int nonatomic, int wait)
-{
- /* prevent preemption and reschedule on another processor */
- int ret;
- int me = get_cpu();
- if (cpu == me) {
- local_irq_disable();
- func(info);
- local_irq_enable();
- put_cpu();
- return 0;
- }
-
- ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
-
- put_cpu();
- return ret;
-}
-EXPORT_SYMBOL(smp_call_function_single);
diff --git a/arch/x86/kernel/srat_32.c b/arch/x86/kernel/srat_32.c
index b72e61359c36..70e4a374b4e8 100644
--- a/arch/x86/kernel/srat_32.c
+++ b/arch/x86/kernel/srat_32.c
@@ -277,14 +277,14 @@ int __init get_memcfg_from_srat(void)
rsdp_address = acpi_os_get_root_pointer();
if (!rsdp_address) {
printk("%s: System description tables not found\n",
- __FUNCTION__);
+ __func__);
goto out_err;
}
- printk("%s: assigning address to rsdp\n", __FUNCTION__);
+ printk("%s: assigning address to rsdp\n", __func__);
rsdp = (struct acpi_table_rsdp *)(u32)rsdp_address;
if (!rsdp) {
- printk("%s: Didn't find ACPI root!\n", __FUNCTION__);
+ printk("%s: Didn't find ACPI root!\n", __func__);
goto out_err;
}
@@ -292,7 +292,7 @@ int __init get_memcfg_from_srat(void)
rsdp->oem_id);
if (strncmp(rsdp->signature, ACPI_SIG_RSDP,strlen(ACPI_SIG_RSDP))) {
- printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __FUNCTION__);
+ printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __func__);
goto out_err;
}
@@ -302,7 +302,7 @@ int __init get_memcfg_from_srat(void)
if (!rsdt) {
printk(KERN_WARNING
"%s: ACPI: Invalid root system description tables (RSDT)\n",
- __FUNCTION__);
+ __func__);
goto out_err;
}
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 071ff4798236..92c20fee6781 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -148,7 +148,7 @@ static void write_debugctlmsr(struct task_struct *child, unsigned long val)
if (child != current)
return;
- wrmsrl(MSR_IA32_DEBUGCTLMSR, val);
+ update_debugctlmsr(val);
}
/*
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index 72f463401592..ae751094eba9 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -29,49 +29,54 @@
#include <linux/mm.h>
#include <linux/init.h>
#include <asm/io.h>
+#include <asm/bios_ebda.h>
#include <asm/mach-summit/mach_mpparse.h>
static struct rio_table_hdr *rio_table_hdr __initdata;
static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
static struct rio_detail *rio_devs[MAX_NUMNODES*4] __initdata;
+static int mp_bus_id_to_node[MAX_MP_BUSSES] __initdata;
+
static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
{
int twister = 0, node = 0;
int i, bus, num_buses;
- for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
- if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id){
+ for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
+ if (rio_devs[i]->node_id == rio_devs[wpeg_num]->owner_id) {
twister = rio_devs[i]->owner_id;
break;
}
}
- if (i == rio_table_hdr->num_rio_dev){
- printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __FUNCTION__);
+ if (i == rio_table_hdr->num_rio_dev) {
+ printk(KERN_ERR "%s: Couldn't find owner Cyclone for Winnipeg!\n", __func__);
return last_bus;
}
- for(i = 0; i < rio_table_hdr->num_scal_dev; i++){
- if (scal_devs[i]->node_id == twister){
+ for (i = 0; i < rio_table_hdr->num_scal_dev; i++) {
+ if (scal_devs[i]->node_id == twister) {
node = scal_devs[i]->node_id;
break;
}
}
- if (i == rio_table_hdr->num_scal_dev){
- printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __FUNCTION__);
+ if (i == rio_table_hdr->num_scal_dev) {
+ printk(KERN_ERR "%s: Couldn't find owner Twister for Cyclone!\n", __func__);
return last_bus;
}
- switch (rio_devs[wpeg_num]->type){
+ switch (rio_devs[wpeg_num]->type) {
case CompatWPEG:
- /* The Compatibility Winnipeg controls the 2 legacy buses,
+ /*
+ * The Compatibility Winnipeg controls the 2 legacy buses,
* the 66MHz PCI bus [2 slots] and the 2 "extra" buses in case
* a PCI-PCI bridge card is used in either slot: total 5 buses.
*/
num_buses = 5;
break;
case AltWPEG:
- /* The Alternate Winnipeg controls the 2 133MHz buses [1 slot
+ /*
+ * The Alternate Winnipeg controls the 2 133MHz buses [1 slot
* each], their 2 "extra" buses, the 100MHz bus [2 slots] and
* the "extra" buses for each of those slots: total 7 buses.
*/
@@ -79,17 +84,18 @@ static int __init setup_pci_node_map_for_wpeg(int wpeg_num, int last_bus)
break;
case LookOutAWPEG:
case LookOutBWPEG:
- /* A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
+ /*
+ * A Lookout Winnipeg controls 3 100MHz buses [2 slots each]
* & the "extra" buses for each of those slots: total 9 buses.
*/
num_buses = 9;
break;
default:
- printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __FUNCTION__);
+ printk(KERN_INFO "%s: Unsupported Winnipeg type!\n", __func__);
return last_bus;
}
- for(bus = last_bus; bus < last_bus + num_buses; bus++)
+ for (bus = last_bus; bus < last_bus + num_buses; bus++)
mp_bus_id_to_node[bus] = node;
return bus;
}
@@ -99,14 +105,14 @@ static int __init build_detail_arrays(void)
unsigned long ptr;
int i, scal_detail_size, rio_detail_size;
- if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){
- printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __FUNCTION__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
+ if (rio_table_hdr->num_scal_dev > MAX_NUMNODES) {
+ printk(KERN_WARNING "%s: MAX_NUMNODES too low! Defined as %d, but system has %d nodes.\n", __func__, MAX_NUMNODES, rio_table_hdr->num_scal_dev);
return 0;
}
- switch (rio_table_hdr->version){
+ switch (rio_table_hdr->version) {
default:
- printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __FUNCTION__, rio_table_hdr->version);
+ printk(KERN_WARNING "%s: Invalid Rio Grande Table Version: %d\n", __func__, rio_table_hdr->version);
return 0;
case 2:
scal_detail_size = 11;
@@ -119,10 +125,10 @@ static int __init build_detail_arrays(void)
}
ptr = (unsigned long)rio_table_hdr + 3;
- for(i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
+ for (i = 0; i < rio_table_hdr->num_scal_dev; i++, ptr += scal_detail_size)
scal_devs[i] = (struct scal_detail *)ptr;
- for(i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
+ for (i = 0; i < rio_table_hdr->num_rio_dev; i++, ptr += rio_detail_size)
rio_devs[i] = (struct rio_detail *)ptr;
return 1;
@@ -135,14 +141,14 @@ void __init setup_summit(void)
int i, next_wpeg, next_bus = 0;
/* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */
- ptr = *(unsigned short *)phys_to_virt(0x40Eul);
- ptr = (unsigned long)phys_to_virt(ptr << 4);
+ ptr = get_bios_ebda();
+ ptr = (unsigned long)phys_to_virt(ptr);
rio_table_hdr = NULL;
offset = 0x180;
- while (offset){
+ while (offset) {
/* The block id is stored in the 2nd word */
- if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){
+ if (*((unsigned short *)(ptr + offset + 2)) == 0x4752) {
/* set the pointer past the offset & block id */
rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4);
break;
@@ -150,8 +156,8 @@ void __init setup_summit(void)
/* The next offset is stored in the 1st word. 0 means no more */
offset = *((unsigned short *)(ptr + offset));
}
- if (!rio_table_hdr){
- printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __FUNCTION__);
+ if (!rio_table_hdr) {
+ printk(KERN_ERR "%s: Unable to locate Rio Grande Table in EBDA - bailing!\n", __func__);
return;
}
@@ -161,8 +167,8 @@ void __init setup_summit(void)
/* The first Winnipeg we're looking for has an index of 0 */
next_wpeg = 0;
do {
- for(i = 0; i < rio_table_hdr->num_rio_dev; i++){
- if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg){
+ for (i = 0; i < rio_table_hdr->num_rio_dev; i++) {
+ if (is_WPEG(rio_devs[i]) && rio_devs[i]->WP_index == next_wpeg) {
/* It's the Winnipeg we're looking for! */
next_bus = setup_pci_node_map_for_wpeg(i, next_bus);
next_wpeg++;
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 9d498c2f8eea..170d43c17487 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -1,4 +1,4 @@
-/* System call table for x86-64. */
+/* System call table for x86-64. */
#include <linux/linkage.h>
#include <linux/sys.h>
@@ -7,20 +7,23 @@
#define __NO_STUBS
-#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
+#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
#undef _ASM_X86_64_UNISTD_H_
#include <asm/unistd_64.h>
#undef __SYSCALL
-#define __SYSCALL(nr, sym) [ nr ] = sym,
+#define __SYSCALL(nr, sym) [nr] = sym,
#undef _ASM_X86_64_UNISTD_H_
-typedef void (*sys_call_ptr_t)(void);
+typedef void (*sys_call_ptr_t)(void);
extern void sys_ni_syscall(void);
const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
- /* Smells like a like a compiler bug -- it doesn't work when the & below is removed. */
+ /*
+ *Smells like a like a compiler bug -- it doesn't work
+ *when the & below is removed.
+ */
[0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/unistd_64.h>
};
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
index 10b8a6f69f84..787a5e499dd1 100644
--- a/arch/x86/kernel/test_nx.c
+++ b/arch/x86/kernel/test_nx.c
@@ -11,6 +11,8 @@
*/
#include <linux/module.h>
#include <linux/sort.h>
+#include <linux/slab.h>
+
#include <asm/uaccess.h>
#include <asm/asm.h>
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
new file mode 100644
index 000000000000..9bb2363851af
--- /dev/null
+++ b/arch/x86/kernel/tlb_32.c
@@ -0,0 +1,243 @@
+#include <linux/spinlock.h>
+#include <linux/cpu.h>
+#include <linux/interrupt.h>
+
+#include <asm/tlbflush.h>
+
+DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate)
+ ____cacheline_aligned = { &init_mm, 0, };
+
+/* must come after the send_IPI functions above for inlining */
+#include <mach_ipi.h>
+
+/*
+ * Smarter SMP flushing macros.
+ * c/o Linus Torvalds.
+ *
+ * These mean you can really definitely utterly forget about
+ * writing to user space from interrupts. (Its not allowed anyway).
+ *
+ * Optimizations Manfred Spraul <manfred@colorfullife.com>
+ */
+
+static cpumask_t flush_cpumask;
+static struct mm_struct *flush_mm;
+static unsigned long flush_va;
+static DEFINE_SPINLOCK(tlbstate_lock);
+
+/*
+ * We cannot call mmdrop() because we are in interrupt context,
+ * instead update mm->cpu_vm_mask.
+ *
+ * We need to reload %cr3 since the page tables may be going
+ * away from under us..
+ */
+void leave_mm(int cpu)
+{
+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
+ BUG();
+ cpu_clear(cpu, per_cpu(cpu_tlbstate, cpu).active_mm->cpu_vm_mask);
+ load_cr3(swapper_pg_dir);
+}
+EXPORT_SYMBOL_GPL(leave_mm);
+
+/*
+ *
+ * The flush IPI assumes that a thread switch happens in this order:
+ * [cpu0: the cpu that switches]
+ * 1) switch_mm() either 1a) or 1b)
+ * 1a) thread switch to a different mm
+ * 1a1) cpu_clear(cpu, old_mm->cpu_vm_mask);
+ * Stop ipi delivery for the old mm. This is not synchronized with
+ * the other cpus, but smp_invalidate_interrupt ignore flush ipis
+ * for the wrong mm, and in the worst case we perform a superfluous
+ * tlb flush.
+ * 1a2) set cpu_tlbstate to TLBSTATE_OK
+ * Now the smp_invalidate_interrupt won't call leave_mm if cpu0
+ * was in lazy tlb mode.
+ * 1a3) update cpu_tlbstate[].active_mm
+ * Now cpu0 accepts tlb flushes for the new mm.
+ * 1a4) cpu_set(cpu, new_mm->cpu_vm_mask);
+ * Now the other cpus will send tlb flush ipis.
+ * 1a4) change cr3.
+ * 1b) thread switch without mm change
+ * cpu_tlbstate[].active_mm is correct, cpu0 already handles
+ * flush ipis.
+ * 1b1) set cpu_tlbstate to TLBSTATE_OK
+ * 1b2) test_and_set the cpu bit in cpu_vm_mask.
+ * Atomically set the bit [other cpus will start sending flush ipis],
+ * and test the bit.
+ * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
+ * 2) switch %%esp, ie current
+ *
+ * The interrupt must handle 2 special cases:
+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+ * runs in kernel space, the cpu could load tlb entries for user space
+ * pages.
+ *
+ * The good news is that cpu_tlbstate is local to each cpu, no
+ * write/read ordering problems.
+ */
+
+/*
+ * TLB flush IPI:
+ *
+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+ * 2) Leave the mm if we are in the lazy tlb mode.
+ */
+
+void smp_invalidate_interrupt(struct pt_regs *regs)
+{
+ unsigned long cpu;
+
+ cpu = get_cpu();
+
+ if (!cpu_isset(cpu, flush_cpumask))
+ goto out;
+ /*
+ * This was a BUG() but until someone can quote me the
+ * line from the intel manual that guarantees an IPI to
+ * multiple CPUs is retried _only_ on the erroring CPUs
+ * its staying as a return
+ *
+ * BUG();
+ */
+
+ if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) {
+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) {
+ if (flush_va == TLB_FLUSH_ALL)
+ local_flush_tlb();
+ else
+ __flush_tlb_one(flush_va);
+ } else
+ leave_mm(cpu);
+ }
+ ack_APIC_irq();
+ smp_mb__before_clear_bit();
+ cpu_clear(cpu, flush_cpumask);
+ smp_mb__after_clear_bit();
+out:
+ put_cpu_no_resched();
+ __get_cpu_var(irq_stat).irq_tlb_count++;
+}
+
+void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
+ unsigned long va)
+{
+ cpumask_t cpumask = *cpumaskp;
+
+ /*
+ * A couple of (to be removed) sanity checks:
+ *
+ * - current CPU must not be in mask
+ * - mask must exist :)
+ */
+ BUG_ON(cpus_empty(cpumask));
+ BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+ BUG_ON(!mm);
+
+#ifdef CONFIG_HOTPLUG_CPU
+ /* If a CPU which we ran on has gone down, OK. */
+ cpus_and(cpumask, cpumask, cpu_online_map);
+ if (unlikely(cpus_empty(cpumask)))
+ return;
+#endif
+
+ /*
+ * i'm not happy about this global shared spinlock in the
+ * MM hot path, but we'll see how contended it is.
+ * AK: x86-64 has a faster method that could be ported.
+ */
+ spin_lock(&tlbstate_lock);
+
+ flush_mm = mm;
+ flush_va = va;
+ cpus_or(flush_cpumask, cpumask, flush_cpumask);
+ /*
+ * We have to send the IPI only to
+ * CPUs affected.
+ */
+ send_IPI_mask(cpumask, INVALIDATE_TLB_VECTOR);
+
+ while (!cpus_empty(flush_cpumask))
+ /* nothing. lockup detection does not belong here */
+ cpu_relax();
+
+ flush_mm = NULL;
+ flush_va = 0;
+ spin_unlock(&tlbstate_lock);
+}
+
+void flush_tlb_current_task(void)
+{
+ struct mm_struct *mm = current->mm;
+ cpumask_t cpu_mask;
+
+ preempt_disable();
+ cpu_mask = mm->cpu_vm_mask;
+ cpu_clear(smp_processor_id(), cpu_mask);
+
+ local_flush_tlb();
+ if (!cpus_empty(cpu_mask))
+ flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+ preempt_enable();
+}
+
+void flush_tlb_mm(struct mm_struct *mm)
+{
+ cpumask_t cpu_mask;
+
+ preempt_disable();
+ cpu_mask = mm->cpu_vm_mask;
+ cpu_clear(smp_processor_id(), cpu_mask);
+
+ if (current->active_mm == mm) {
+ if (current->mm)
+ local_flush_tlb();
+ else
+ leave_mm(smp_processor_id());
+ }
+ if (!cpus_empty(cpu_mask))
+ flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL);
+
+ preempt_enable();
+}
+
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ cpumask_t cpu_mask;
+
+ preempt_disable();
+ cpu_mask = mm->cpu_vm_mask;
+ cpu_clear(smp_processor_id(), cpu_mask);
+
+ if (current->active_mm == mm) {
+ if (current->mm)
+ __flush_tlb_one(va);
+ else
+ leave_mm(smp_processor_id());
+ }
+
+ if (!cpus_empty(cpu_mask))
+ flush_tlb_others(cpu_mask, mm, va);
+
+ preempt_enable();
+}
+EXPORT_SYMBOL(flush_tlb_page);
+
+static void do_flush_tlb_all(void *info)
+{
+ unsigned long cpu = smp_processor_id();
+
+ __flush_tlb_all();
+ if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_LAZY)
+ leave_mm(cpu);
+}
+
+void flush_tlb_all(void)
+{
+ on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
+}
+
diff --git a/arch/x86/kernel/smp_64.c b/arch/x86/kernel/tlb_64.c
index 2fd74b06db67..a1f07d793202 100644
--- a/arch/x86/kernel/smp_64.c
+++ b/arch/x86/kernel/tlb_64.c
@@ -1,14 +1,3 @@
-/*
- * Intel SMP support routines.
- *
- * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
- * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
- * (c) 2002,2003 Andi Kleen, SuSE Labs.
- *
- * This code is released under the GNU General Public License version 2 or
- * later.
- */
-
#include <linux/init.h>
#include <linux/mm.h>
@@ -22,12 +11,12 @@
#include <asm/mtrr.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
-#include <asm/mach_apic.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/apicdef.h>
#include <asm/idle.h>
+#include <mach_ipi.h>
/*
* Smarter SMP flushing macros.
* c/o Linus Torvalds.
@@ -202,13 +191,13 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm,
spin_unlock(&f->tlbstate_lock);
}
-int __cpuinit init_smp_flush(void)
+static int __cpuinit init_smp_flush(void)
{
int i;
- for_each_cpu_mask(i, cpu_possible_map) {
+ for_each_possible_cpu(i)
spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock);
- }
+
return 0;
}
core_initcall(init_smp_flush);
@@ -228,7 +217,7 @@ void flush_tlb_current_task(void)
preempt_enable();
}
-void flush_tlb_mm (struct mm_struct * mm)
+void flush_tlb_mm(struct mm_struct *mm)
{
cpumask_t cpu_mask;
@@ -248,7 +237,7 @@ void flush_tlb_mm (struct mm_struct * mm)
preempt_enable();
}
-void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
+void flush_tlb_page(struct vm_area_struct *vma, unsigned long va)
{
struct mm_struct *mm = vma->vm_mm;
cpumask_t cpu_mask;
@@ -258,7 +247,7 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
cpu_clear(smp_processor_id(), cpu_mask);
if (current->active_mm == mm) {
- if(current->mm)
+ if (current->mm)
__flush_tlb_one(va);
else
leave_mm(smp_processor_id());
@@ -270,7 +259,7 @@ void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
preempt_enable();
}
-static void do_flush_tlb_all(void* info)
+static void do_flush_tlb_all(void *info)
{
unsigned long cpu = smp_processor_id();
@@ -283,248 +272,3 @@ void flush_tlb_all(void)
{
on_each_cpu(do_flush_tlb_all, NULL, 1, 1);
}
-
-/*
- * this function sends a 'reschedule' IPI to another CPU.
- * it goes straight through and wastes no time serializing
- * anything. Worst case is that we lose a reschedule ...
- */
-
-void smp_send_reschedule(int cpu)
-{
- send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
-}
-
-/*
- * Structure and data for smp_call_function(). This is designed to minimise
- * static memory requirements. It also looks cleaner.
- */
-static DEFINE_SPINLOCK(call_lock);
-
-struct call_data_struct {
- void (*func) (void *info);
- void *info;
- atomic_t started;
- atomic_t finished;
- int wait;
-};
-
-static struct call_data_struct * call_data;
-
-void lock_ipi_call_lock(void)
-{
- spin_lock_irq(&call_lock);
-}
-
-void unlock_ipi_call_lock(void)
-{
- spin_unlock_irq(&call_lock);
-}
-
-/*
- * this function sends a 'generic call function' IPI to all other CPU
- * of the system defined in the mask.
- */
-static int __smp_call_function_mask(cpumask_t mask,
- void (*func)(void *), void *info,
- int wait)
-{
- struct call_data_struct data;
- cpumask_t allbutself;
- int cpus;
-
- allbutself = cpu_online_map;
- cpu_clear(smp_processor_id(), allbutself);
-
- cpus_and(mask, mask, allbutself);
- cpus = cpus_weight(mask);
-
- if (!cpus)
- return 0;
-
- data.func = func;
- data.info = info;
- atomic_set(&data.started, 0);
- data.wait = wait;
- if (wait)
- atomic_set(&data.finished, 0);
-
- call_data = &data;
- wmb();
-
- /* Send a message to other CPUs */
- if (cpus_equal(mask, allbutself))
- send_IPI_allbutself(CALL_FUNCTION_VECTOR);
- else
- send_IPI_mask(mask, CALL_FUNCTION_VECTOR);
-
- /* Wait for response */
- while (atomic_read(&data.started) != cpus)
- cpu_relax();
-
- if (!wait)
- return 0;
-
- while (atomic_read(&data.finished) != cpus)
- cpu_relax();
-
- return 0;
-}
-/**
- * smp_call_function_mask(): Run a function on a set of other CPUs.
- * @mask: The set of cpus to run on. Must not include the current cpu.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
- *
- * Returns 0 on success, else a negative status code.
- *
- * If @wait is true, then returns once @func has returned; otherwise
- * it returns just before the target cpu calls @func.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- */
-int smp_call_function_mask(cpumask_t mask,
- void (*func)(void *), void *info,
- int wait)
-{
- int ret;
-
- /* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
-
- spin_lock(&call_lock);
- ret = __smp_call_function_mask(mask, func, info, wait);
- spin_unlock(&call_lock);
- return ret;
-}
-EXPORT_SYMBOL(smp_call_function_mask);
-
-/*
- * smp_call_function_single - Run a function on a specific CPU
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: Currently unused.
- * @wait: If true, wait until function has completed on other CPUs.
- *
- * Retrurns 0 on success, else a negative status code.
- *
- * Does not return until the remote CPU is nearly ready to execute <func>
- * or is or has executed.
- */
-
-int smp_call_function_single (int cpu, void (*func) (void *info), void *info,
- int nonatomic, int wait)
-{
- /* prevent preemption and reschedule on another processor */
- int ret, me = get_cpu();
-
- /* Can deadlock when called with interrupts disabled */
- WARN_ON(irqs_disabled());
-
- if (cpu == me) {
- local_irq_disable();
- func(info);
- local_irq_enable();
- put_cpu();
- return 0;
- }
-
- ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait);
-
- put_cpu();
- return ret;
-}
-EXPORT_SYMBOL(smp_call_function_single);
-
-/*
- * smp_call_function - run a function on all other CPUs.
- * @func: The function to run. This must be fast and non-blocking.
- * @info: An arbitrary pointer to pass to the function.
- * @nonatomic: currently unused.
- * @wait: If true, wait (atomically) until function has completed on other
- * CPUs.
- *
- * Returns 0 on success, else a negative status code. Does not return until
- * remote CPUs are nearly ready to execute func or are or have executed.
- *
- * You must not call this function with disabled interrupts or from a
- * hardware interrupt handler or from a bottom half handler.
- * Actually there are a few legal cases, like panic.
- */
-int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
- int wait)
-{
- return smp_call_function_mask(cpu_online_map, func, info, wait);
-}
-EXPORT_SYMBOL(smp_call_function);
-
-static void stop_this_cpu(void *dummy)
-{
- local_irq_disable();
- /*
- * Remove this CPU:
- */
- cpu_clear(smp_processor_id(), cpu_online_map);
- disable_local_APIC();
- for (;;)
- halt();
-}
-
-void smp_send_stop(void)
-{
- int nolock;
- unsigned long flags;
-
- if (reboot_force)
- return;
-
- /* Don't deadlock on the call lock in panic */
- nolock = !spin_trylock(&call_lock);
- local_irq_save(flags);
- __smp_call_function_mask(cpu_online_map, stop_this_cpu, NULL, 0);
- if (!nolock)
- spin_unlock(&call_lock);
- disable_local_APIC();
- local_irq_restore(flags);
-}
-
-/*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
- */
-asmlinkage void smp_reschedule_interrupt(void)
-{
- ack_APIC_irq();
- add_pda(irq_resched_count, 1);
-}
-
-asmlinkage void smp_call_function_interrupt(void)
-{
- void (*func) (void *info) = call_data->func;
- void *info = call_data->info;
- int wait = call_data->wait;
-
- ack_APIC_irq();
- /*
- * Notify initiating CPU that I've grabbed the data and am
- * about to execute the function
- */
- mb();
- atomic_inc(&call_data->started);
- /*
- * At this point the info structure may be out of scope unless wait==1
- */
- exit_idle();
- irq_enter();
- (*func)(info);
- add_pda(irq_call_count, 1);
- irq_exit();
- if (wait) {
- mb();
- atomic_inc(&call_data->finished);
- }
-}
-
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 022bcaa3b42e..ab6bf375a307 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -92,7 +92,7 @@ int do_set_thread_area(struct task_struct *p, int idx,
asmlinkage int sys_set_thread_area(struct user_desc __user *u_info)
{
int ret = do_set_thread_area(current, -1, u_info, 1);
- prevent_tail_call(ret);
+ asmlinkage_protect(1, ret, u_info);
return ret;
}
@@ -142,7 +142,7 @@ int do_get_thread_area(struct task_struct *p, int idx,
asmlinkage int sys_get_thread_area(struct user_desc __user *u_info)
{
int ret = do_get_thread_area(current, -1, u_info);
- prevent_tail_call(ret);
+ asmlinkage_protect(1, ret, u_info);
return ret;
}
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
new file mode 100644
index 000000000000..abbf199adebb
--- /dev/null
+++ b/arch/x86/kernel/trampoline.c
@@ -0,0 +1,18 @@
+#include <linux/io.h>
+
+#include <asm/trampoline.h>
+
+/* ready for x86_64, no harm for x86, since it will overwrite after alloc */
+unsigned char *trampoline_base = __va(TRAMPOLINE_BASE);
+
+/*
+ * Currently trivial. Write the real->protected mode
+ * bootstrap into the page concerned. The caller
+ * has made sure it's suitably aligned.
+ */
+unsigned long setup_trampoline(void)
+{
+ memcpy(trampoline_base, trampoline_data,
+ trampoline_end - trampoline_data);
+ return virt_to_phys(trampoline_base);
+}
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index 64580679861e..d8ccc3c6552f 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -33,7 +33,7 @@
/* We can free up trampoline after bootup if cpu hotplug is not supported. */
#ifndef CONFIG_HOTPLUG_CPU
-.section ".init.data","aw",@progbits
+.section ".cpuinit.data","aw",@progbits
#else
.section .rodata,"a",@progbits
#endif
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 4aedd0bcee4c..894293c598db 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -30,12 +30,7 @@
#include <asm/msr.h>
#include <asm/segment.h>
-/* We can free up trampoline after bootup if cpu hotplug is not supported. */
-#ifndef CONFIG_HOTPLUG_CPU
-.section .init.data, "aw", @progbits
-#else
.section .rodata, "a", @progbits
-#endif
.code16
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index b22c01e05a18..bde6f63e15d5 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -9,26 +9,28 @@
* 'Traps.c' handles hardware traps and faults after we have saved some
* state in 'asm.s'.
*/
-#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/spinlock.h>
+#include <linux/highmem.h>
+#include <linux/kprobes.h>
+#include <linux/uaccess.h>
+#include <linux/utsname.h>
+#include <linux/kdebug.h>
#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
#include <linux/string.h>
+#include <linux/unwind.h>
+#include <linux/delay.h>
#include <linux/errno.h>
+#include <linux/kexec.h>
+#include <linux/sched.h>
#include <linux/timer.h>
-#include <linux/mm.h>
#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/spinlock.h>
-#include <linux/interrupt.h>
-#include <linux/highmem.h>
-#include <linux/kallsyms.h>
-#include <linux/ptrace.h>
-#include <linux/utsname.h>
-#include <linux/kprobes.h>
-#include <linux/kexec.h>
-#include <linux/unwind.h>
-#include <linux/uaccess.h>
-#include <linux/nmi.h>
#include <linux/bug.h>
+#include <linux/nmi.h>
+#include <linux/mm.h>
#ifdef CONFIG_EISA
#include <linux/ioport.h>
@@ -43,21 +45,18 @@
#include <linux/edac.h>
#endif
+#include <asm/arch_hooks.h>
+#include <asm/stacktrace.h>
#include <asm/processor.h>
-#include <asm/system.h>
-#include <asm/io.h>
-#include <asm/atomic.h>
#include <asm/debugreg.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/unwind.h>
#include <asm/desc.h>
#include <asm/i387.h>
#include <asm/nmi.h>
-#include <asm/unwind.h>
#include <asm/smp.h>
-#include <asm/arch_hooks.h>
-#include <linux/kdebug.h>
-#include <asm/stacktrace.h>
-
-#include <linux/module.h>
+#include <asm/io.h>
#include "mach_traps.h"
@@ -69,7 +68,7 @@ EXPORT_SYMBOL_GPL(used_vectors);
asmlinkage int system_call(void);
/* Do we ignore FPU interrupts ? */
-char ignore_fpu_irq = 0;
+char ignore_fpu_irq;
/*
* The IDT has to be page-aligned to simplify the Pentium
@@ -105,12 +104,13 @@ static unsigned int code_bytes = 64;
void printk_address(unsigned long address, int reliable)
{
#ifdef CONFIG_KALLSYMS
- unsigned long offset = 0, symsize;
+ char namebuf[KSYM_NAME_LEN];
+ unsigned long offset = 0;
+ unsigned long symsize;
const char *symname;
- char *modname;
- char *delim = ":";
- char namebuf[128];
char reliab[4] = "";
+ char *delim = ":";
+ char *modname;
symname = kallsyms_lookup(address, &symsize, &offset,
&modname, namebuf);
@@ -138,13 +138,14 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, void *p, unsigned s
/* The form of the top of the frame on the stack */
struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
+ struct stack_frame *next_frame;
+ unsigned long return_address;
};
-static inline unsigned long print_context_stack(struct thread_info *tinfo,
- unsigned long *stack, unsigned long bp,
- const struct stacktrace_ops *ops, void *data)
+static inline unsigned long
+print_context_stack(struct thread_info *tinfo,
+ unsigned long *stack, unsigned long bp,
+ const struct stacktrace_ops *ops, void *data)
{
struct stack_frame *frame = (struct stack_frame *)bp;
@@ -166,7 +167,7 @@ static inline unsigned long print_context_stack(struct thread_info *tinfo,
return bp;
}
-#define MSG(msg) ops->warning(data, msg)
+#define MSG(msg) ops->warning(data, msg)
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
@@ -177,6 +178,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
if (!stack) {
unsigned long dummy;
+
stack = &dummy;
if (task != current)
stack = (unsigned long *)task->thread.sp;
@@ -186,7 +188,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
if (!bp) {
if (task == current) {
/* Grab bp right from our regs */
- asm ("movl %%ebp, %0" : "=r" (bp) : );
+ asm("movl %%ebp, %0" : "=r" (bp) :);
} else {
/* bp is the last reg pushed by switch_to */
bp = *(unsigned long *) task->thread.sp;
@@ -196,15 +198,18 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
while (1) {
struct thread_info *context;
+
context = (struct thread_info *)
((unsigned long)stack & (~(THREAD_SIZE - 1)));
bp = print_context_stack(context, stack, bp, ops, data);
- /* Should be after the line below, but somewhere
- in early boot context comes out corrupted and we
- can't reference it -AK */
+ /*
+ * Should be after the line below, but somewhere
+ * in early boot context comes out corrupted and we
+ * can't reference it:
+ */
if (ops->stack(data, "IRQ") < 0)
break;
- stack = (unsigned long*)context->previous_esp;
+ stack = (unsigned long *)context->previous_esp;
if (!stack)
break;
touch_nmi_watchdog();
@@ -243,15 +248,15 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
}
static const struct stacktrace_ops print_trace_ops = {
- .warning = print_trace_warning,
- .warning_symbol = print_trace_warning_symbol,
- .stack = print_trace_stack,
- .address = print_trace_address,
+ .warning = print_trace_warning,
+ .warning_symbol = print_trace_warning_symbol,
+ .stack = print_trace_stack,
+ .address = print_trace_address,
};
static void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
+ unsigned long *stack, unsigned long bp, char *log_lvl)
{
dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
printk("%s =======================\n", log_lvl);
@@ -263,21 +268,22 @@ void show_trace(struct task_struct *task, struct pt_regs *regs,
show_trace_log_lvl(task, regs, stack, bp, "");
}
-static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+static void
+show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
+ unsigned long *sp, unsigned long bp, char *log_lvl)
{
unsigned long *stack;
int i;
if (sp == NULL) {
if (task)
- sp = (unsigned long*)task->thread.sp;
+ sp = (unsigned long *)task->thread.sp;
else
sp = (unsigned long *)&sp;
}
stack = sp;
- for(i = 0; i < kstack_depth_to_print; i++) {
+ for (i = 0; i < kstack_depth_to_print; i++) {
if (kstack_end(stack))
break;
if (i && ((i % 8) == 0))
@@ -285,6 +291,7 @@ static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
printk("%08lx ", *stack++);
}
printk("\n%sCall Trace:\n", log_lvl);
+
show_trace_log_lvl(task, regs, sp, bp, log_lvl);
}
@@ -299,8 +306,8 @@ void show_stack(struct task_struct *task, unsigned long *sp)
*/
void dump_stack(void)
{
- unsigned long stack;
unsigned long bp = 0;
+ unsigned long stack;
#ifdef CONFIG_FRAME_POINTER
if (!bp)
@@ -312,6 +319,7 @@ void dump_stack(void)
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
+
show_trace(current, NULL, &stack, bp);
}
@@ -323,6 +331,7 @@ void show_registers(struct pt_regs *regs)
print_modules();
__show_registers(regs, 0);
+
printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)",
TASK_COMM_LEN, current->comm, task_pid_nr(current),
current_thread_info(), current, task_thread_info(current));
@@ -331,10 +340,10 @@ void show_registers(struct pt_regs *regs)
* time of the fault..
*/
if (!user_mode_vm(regs)) {
- u8 *ip;
unsigned int code_prologue = code_bytes * 43 / 64;
unsigned int code_len = code_bytes;
unsigned char c;
+ u8 *ip;
printk("\n" KERN_EMERG "Stack: ");
show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
@@ -361,7 +370,7 @@ void show_registers(struct pt_regs *regs)
}
}
printk("\n");
-}
+}
int is_valid_bugaddr(unsigned long ip)
{
@@ -377,10 +386,10 @@ int is_valid_bugaddr(unsigned long ip)
static int die_counter;
-int __kprobes __die(const char * str, struct pt_regs * regs, long err)
+int __kprobes __die(const char *str, struct pt_regs *regs, long err)
{
- unsigned long sp;
unsigned short ss;
+ unsigned long sp;
printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
#ifdef CONFIG_PREEMPT
@@ -395,8 +404,8 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err)
printk("\n");
if (notify_die(DIE_OOPS, str, regs, err,
- current->thread.trap_no, SIGSEGV) !=
- NOTIFY_STOP) {
+ current->thread.trap_no, SIGSEGV) != NOTIFY_STOP) {
+
show_registers(regs);
/* Executive summary in case the oops scrolled away */
sp = (unsigned long) (&regs->sp);
@@ -408,17 +417,18 @@ int __kprobes __die(const char * str, struct pt_regs * regs, long err)
printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
print_symbol("%s", regs->ip);
printk(" SS:ESP %04x:%08lx\n", ss, sp);
+
return 0;
- } else {
- return 1;
}
+
+ return 1;
}
/*
- * This is gone through when something in the kernel has done something bad and
- * is about to be terminated.
+ * This is gone through when something in the kernel has done something bad
+ * and is about to be terminated:
*/
-void die(const char * str, struct pt_regs * regs, long err)
+void die(const char *str, struct pt_regs *regs, long err)
{
static struct {
raw_spinlock_t lock;
@@ -440,8 +450,9 @@ void die(const char * str, struct pt_regs * regs, long err)
die.lock_owner = smp_processor_id();
die.lock_owner_depth = 0;
bust_spinlocks(1);
- } else
+ } else {
raw_local_irq_save(flags);
+ }
if (++die.lock_owner_depth < 3) {
report_bug(regs->ip, regs);
@@ -474,19 +485,20 @@ void die(const char * str, struct pt_regs * regs, long err)
do_exit(SIGSEGV);
}
-static inline void die_if_kernel(const char * str, struct pt_regs * regs, long err)
+static inline void
+die_if_kernel(const char *str, struct pt_regs *regs, long err)
{
if (!user_mode_vm(regs))
die(str, regs, err);
}
-static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
- struct pt_regs * regs, long error_code,
- siginfo_t *info)
+static void __kprobes
+do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs,
+ long error_code, siginfo_t *info)
{
struct task_struct *tsk = current;
- if (regs->flags & VM_MASK) {
+ if (regs->flags & X86_VM_MASK) {
if (vm86)
goto vm86_trap;
goto trap_signal;
@@ -495,111 +507,112 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
if (!user_mode(regs))
goto kernel_trap;
- trap_signal: {
- /*
- * We want error_code and trap_no set for userspace faults and
- * kernelspace faults which result in die(), but not
- * kernelspace faults which are fixed up. die() gives the
- * process no chance to handle the signal and notice the
- * kernel fault information, so that won't result in polluting
- * the information about previously queued, but not yet
- * delivered, faults. See also do_general_protection below.
- */
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
+trap_signal:
+ /*
+ * We want error_code and trap_no set for userspace faults and
+ * kernelspace faults which result in die(), but not
+ * kernelspace faults which are fixed up. die() gives the
+ * process no chance to handle the signal and notice the
+ * kernel fault information, so that won't result in polluting
+ * the information about previously queued, but not yet
+ * delivered, faults. See also do_general_protection below.
+ */
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
- if (info)
- force_sig_info(signr, info, tsk);
- else
- force_sig(signr, tsk);
- return;
- }
+ if (info)
+ force_sig_info(signr, info, tsk);
+ else
+ force_sig(signr, tsk);
+ return;
- kernel_trap: {
- if (!fixup_exception(regs)) {
- tsk->thread.error_code = error_code;
- tsk->thread.trap_no = trapnr;
- die(str, regs, error_code);
- }
- return;
+kernel_trap:
+ if (!fixup_exception(regs)) {
+ tsk->thread.error_code = error_code;
+ tsk->thread.trap_no = trapnr;
+ die(str, regs, error_code);
}
+ return;
- vm86_trap: {
- int ret = handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr);
- if (ret) goto trap_signal;
- return;
- }
+vm86_trap:
+ if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
+ error_code, trapnr))
+ goto trap_signal;
+ return;
}
-#define DO_ERROR(trapnr, signr, str, name) \
-void do_##name(struct pt_regs * regs, long error_code) \
-{ \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
+#define DO_ERROR(trapnr, signr, str, name) \
+void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
}
-#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
-void do_##name(struct pt_regs * regs, long error_code) \
-{ \
- siginfo_t info; \
- if (irq) \
- local_irq_enable(); \
- info.si_signo = signr; \
- info.si_errno = 0; \
- info.si_code = sicode; \
- info.si_addr = (void __user *)siaddr; \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
+#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
+void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ siginfo_t info; \
+ if (irq) \
+ local_irq_enable(); \
+ info.si_signo = signr; \
+ info.si_errno = 0; \
+ info.si_code = sicode; \
+ info.si_addr = (void __user *)siaddr; \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
}
-#define DO_VM86_ERROR(trapnr, signr, str, name) \
-void do_##name(struct pt_regs * regs, long error_code) \
-{ \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
+#define DO_VM86_ERROR(trapnr, signr, str, name) \
+void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
}
-#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
-void do_##name(struct pt_regs * regs, long error_code) \
-{ \
- siginfo_t info; \
- info.si_signo = signr; \
- info.si_errno = 0; \
- info.si_code = sicode; \
- info.si_addr = (void __user *)siaddr; \
- trace_hardirqs_fixup(); \
- if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
- == NOTIFY_STOP) \
- return; \
- do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
+#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
+void do_##name(struct pt_regs *regs, long error_code) \
+{ \
+ siginfo_t info; \
+ info.si_signo = signr; \
+ info.si_errno = 0; \
+ info.si_code = sicode; \
+ info.si_addr = (void __user *)siaddr; \
+ trace_hardirqs_fixup(); \
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
+ == NOTIFY_STOP) \
+ return; \
+ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \
}
-DO_VM86_ERROR_INFO( 0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
+DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
#ifndef CONFIG_KPROBES
-DO_VM86_ERROR( 3, SIGTRAP, "int3", int3)
+DO_VM86_ERROR(3, SIGTRAP, "int3", int3)
#endif
-DO_VM86_ERROR( 4, SIGSEGV, "overflow", overflow)
-DO_VM86_ERROR( 5, SIGSEGV, "bounds", bounds)
-DO_ERROR_INFO( 6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
-DO_ERROR( 9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
+DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
+DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
+DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
+DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0)
-DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
+DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1)
-void __kprobes do_general_protection(struct pt_regs * regs,
- long error_code)
+void __kprobes do_general_protection(struct pt_regs *regs, long error_code)
{
- int cpu = get_cpu();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
- struct thread_struct *thread = &current->thread;
+ struct thread_struct *thread;
+ struct tss_struct *tss;
+ int cpu;
+
+ cpu = get_cpu();
+ tss = &per_cpu(init_tss, cpu);
+ thread = &current->thread;
/*
* Perform the lazy TSS's I/O bitmap copy. If the TSS has an
@@ -616,19 +629,21 @@ void __kprobes do_general_protection(struct pt_regs * regs,
* If the previously set map was extending to higher ports
* than the current one, pad extra space with 0xff (no access).
*/
- if (thread->io_bitmap_max < tss->io_bitmap_max)
+ if (thread->io_bitmap_max < tss->io_bitmap_max) {
memset((char *) tss->io_bitmap +
thread->io_bitmap_max, 0xff,
tss->io_bitmap_max - thread->io_bitmap_max);
+ }
tss->io_bitmap_max = thread->io_bitmap_max;
tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
tss->io_bitmap_owner = thread;
put_cpu();
+
return;
}
put_cpu();
- if (regs->flags & VM_MASK)
+ if (regs->flags & X86_VM_MASK)
goto gp_in_vm86;
if (!user_mode(regs))
@@ -636,6 +651,7 @@ void __kprobes do_general_protection(struct pt_regs * regs,
current->thread.error_code = error_code;
current->thread.trap_no = 13;
+
if (show_unhandled_signals && unhandled_signal(current, SIGSEGV) &&
printk_ratelimit()) {
printk(KERN_INFO
@@ -665,22 +681,25 @@ gp_in_kernel:
}
}
-static __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs * regs)
+static notrace __kprobes void
+mem_parity_error(unsigned char reason, struct pt_regs *regs)
{
- printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
- "CPU %d.\n", reason, smp_processor_id());
- printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
+ printk(KERN_EMERG
+ "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+
+ printk(KERN_EMERG
+ "You have some hardware problem, likely on the PCI bus.\n");
#if defined(CONFIG_EDAC)
- if(edac_handler_set()) {
+ if (edac_handler_set()) {
edac_atomic_assert_error();
return;
}
#endif
if (panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
+ panic("NMI: Not continuing");
printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
@@ -688,8 +707,8 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
clear_mem_error(reason);
}
-static __kprobes void
-io_check_error(unsigned char reason, struct pt_regs * regs)
+static notrace __kprobes void
+io_check_error(unsigned char reason, struct pt_regs *regs)
{
unsigned long i;
@@ -699,44 +718,52 @@ io_check_error(unsigned char reason, struct pt_regs * regs)
/* Re-enable the IOCK line, wait for a few seconds */
reason = (reason & 0xf) | 8;
outb(reason, 0x61);
+
i = 2000;
- while (--i) udelay(1000);
+ while (--i)
+ udelay(1000);
+
reason &= ~8;
outb(reason, 0x61);
}
-static __kprobes void
-unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
+static notrace __kprobes void
+unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
{
+ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+ return;
#ifdef CONFIG_MCA
- /* Might actually be able to figure out what the guilty party
- * is. */
- if( MCA_bus ) {
+ /*
+ * Might actually be able to figure out what the guilty party
+ * is:
+ */
+ if (MCA_bus) {
mca_handle_nmi();
return;
}
#endif
- printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on "
- "CPU %d.\n", reason, smp_processor_id());
+ printk(KERN_EMERG
+ "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+
printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
if (panic_on_unrecovered_nmi)
- panic("NMI: Not continuing");
+ panic("NMI: Not continuing");
printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
}
static DEFINE_SPINLOCK(nmi_print_lock);
-void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
+void notrace __kprobes die_nmi(struct pt_regs *regs, const char *msg)
{
- if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) ==
- NOTIFY_STOP)
+ if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 2, SIGINT) == NOTIFY_STOP)
return;
spin_lock(&nmi_print_lock);
/*
* We are in trouble anyway, lets at least try
- * to get a message out.
+ * to get a message out:
*/
bust_spinlocks(1);
printk(KERN_EMERG "%s", msg);
@@ -747,9 +774,10 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
spin_unlock(&nmi_print_lock);
bust_spinlocks(0);
- /* If we are in kernel we are probably nested up pretty bad
- * and might aswell get out now while we still can.
- */
+ /*
+ * If we are in kernel we are probably nested up pretty bad
+ * and might aswell get out now while we still can:
+ */
if (!user_mode_vm(regs)) {
current->thread.trap_no = 2;
crash_kexec(regs);
@@ -758,14 +786,14 @@ void __kprobes die_nmi(struct pt_regs *regs, const char *msg)
do_exit(SIGSEGV);
}
-static __kprobes void default_do_nmi(struct pt_regs * regs)
+static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
- /* Only the BSP gets external NMIs from the system. */
+ /* Only the BSP gets external NMIs from the system: */
if (!smp_processor_id())
reason = get_nmi_reason();
-
+
if (!(reason & 0xc0)) {
if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
== NOTIFY_STOP)
@@ -778,8 +806,10 @@ static __kprobes void default_do_nmi(struct pt_regs * regs)
if (nmi_watchdog_tick(regs, reason))
return;
if (!do_nmi_callback(regs, smp_processor_id()))
-#endif
unknown_nmi_error(reason, regs);
+#else
+ unknown_nmi_error(reason, regs);
+#endif
return;
}
@@ -791,14 +821,14 @@ static __kprobes void default_do_nmi(struct pt_regs * regs)
io_check_error(reason, regs);
/*
* Reassert NMI in case it became active meanwhile
- * as it's edge-triggered.
+ * as it's edge-triggered:
*/
reassert_nmi();
}
static int ignore_nmis;
-__kprobes void do_nmi(struct pt_regs * regs, long error_code)
+notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code)
{
int cpu;
@@ -834,9 +864,12 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code)
if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
== NOTIFY_STOP)
return;
- /* This is an interrupt gate, because kprobes wants interrupts
- disabled. Normal trap handlers don't. */
+ /*
+ * This is an interrupt gate, because kprobes wants interrupts
+ * disabled. Normal trap handlers don't.
+ */
restore_interrupts(regs);
+
do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL);
}
#endif
@@ -851,7 +884,7 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code)
* from user space. Such code must not hold kernel locks (since it
* can equally take a page fault), therefore it is safe to call
* force_sig_info even though that claims and releases locks.
- *
+ *
* Code in ./signal.c ensures that the debug control register
* is restored before we deliver any signal, and therefore that
* user code runs with the correct debug control register even though
@@ -863,10 +896,10 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code)
* find every occurrence of the TF bit that could be saved away even
* by user code)
*/
-void __kprobes do_debug(struct pt_regs * regs, long error_code)
+void __kprobes do_debug(struct pt_regs *regs, long error_code)
{
- unsigned int condition;
struct task_struct *tsk = current;
+ unsigned int condition;
trace_hardirqs_fixup();
@@ -891,7 +924,7 @@ void __kprobes do_debug(struct pt_regs * regs, long error_code)
goto clear_dr7;
}
- if (regs->flags & VM_MASK)
+ if (regs->flags & X86_VM_MASK)
goto debug_vm86;
/* Save debug status register where ptrace can see it */
@@ -914,7 +947,8 @@ void __kprobes do_debug(struct pt_regs * regs, long error_code)
/* Ok, finally something we can handle */
send_sigtrap(tsk, regs, error_code);
- /* Disable additional traps. They'll be re-enabled when
+ /*
+ * Disable additional traps. They'll be re-enabled when
* the signal is delivered.
*/
clear_dr7:
@@ -927,7 +961,7 @@ debug_vm86:
clear_TF_reenable:
set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
- regs->flags &= ~TF_MASK;
+ regs->flags &= ~X86_EFLAGS_TF;
return;
}
@@ -938,9 +972,10 @@ clear_TF_reenable:
*/
void math_error(void __user *ip)
{
- struct task_struct * task;
+ struct task_struct *task;
+ unsigned short cwd;
+ unsigned short swd;
siginfo_t info;
- unsigned short cwd, swd;
/*
* Save the info for the exception handler and clear the error.
@@ -966,36 +1001,36 @@ void math_error(void __user *ip)
cwd = get_fpu_cwd(task);
swd = get_fpu_swd(task);
switch (swd & ~cwd & 0x3f) {
- case 0x000: /* No unmasked exception */
- return;
- default: /* Multiple exceptions */
- break;
- case 0x001: /* Invalid Op */
- /*
- * swd & 0x240 == 0x040: Stack Underflow
- * swd & 0x240 == 0x240: Stack Overflow
- * User must clear the SF bit (0x40) if set
- */
- info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
- break;
+ case 0x000: /* No unmasked exception */
+ return;
+ default: /* Multiple exceptions */
+ break;
+ case 0x001: /* Invalid Op */
+ /*
+ * swd & 0x240 == 0x040: Stack Underflow
+ * swd & 0x240 == 0x240: Stack Overflow
+ * User must clear the SF bit (0x40) if set
+ */
+ info.si_code = FPE_FLTINV;
+ break;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ break;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ break;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ break;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
+ break;
}
force_sig_info(SIGFPE, &info, task);
}
-void do_coprocessor_error(struct pt_regs * regs, long error_code)
+void do_coprocessor_error(struct pt_regs *regs, long error_code)
{
ignore_fpu_irq = 1;
math_error((void __user *)regs->ip);
@@ -1003,9 +1038,9 @@ void do_coprocessor_error(struct pt_regs * regs, long error_code)
static void simd_math_error(void __user *ip)
{
- struct task_struct * task;
- siginfo_t info;
+ struct task_struct *task;
unsigned short mxcsr;
+ siginfo_t info;
/*
* Save the info for the exception handler and clear the error.
@@ -1026,82 +1061,80 @@ static void simd_math_error(void __user *ip)
*/
mxcsr = get_fpu_mxcsr(task);
switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
- case 0x000:
- default:
- break;
- case 0x001: /* Invalid Op */
- info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
- break;
+ case 0x000:
+ default:
+ break;
+ case 0x001: /* Invalid Op */
+ info.si_code = FPE_FLTINV;
+ break;
+ case 0x002: /* Denormalize */
+ case 0x010: /* Underflow */
+ info.si_code = FPE_FLTUND;
+ break;
+ case 0x004: /* Zero Divide */
+ info.si_code = FPE_FLTDIV;
+ break;
+ case 0x008: /* Overflow */
+ info.si_code = FPE_FLTOVF;
+ break;
+ case 0x020: /* Precision */
+ info.si_code = FPE_FLTRES;
+ break;
}
force_sig_info(SIGFPE, &info, task);
}
-void do_simd_coprocessor_error(struct pt_regs * regs,
- long error_code)
+void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
{
if (cpu_has_xmm) {
/* Handle SIMD FPU exceptions on PIII+ processors. */
ignore_fpu_irq = 1;
simd_math_error((void __user *)regs->ip);
- } else {
- /*
- * Handle strange cache flush from user space exception
- * in all other cases. This is undocumented behaviour.
- */
- if (regs->flags & VM_MASK) {
- handle_vm86_fault((struct kernel_vm86_regs *)regs,
- error_code);
- return;
- }
- current->thread.trap_no = 19;
- current->thread.error_code = error_code;
- die_if_kernel("cache flush denied", regs, error_code);
- force_sig(SIGSEGV, current);
+ return;
}
+ /*
+ * Handle strange cache flush from user space exception
+ * in all other cases. This is undocumented behaviour.
+ */
+ if (regs->flags & X86_VM_MASK) {
+ handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
+ return;
+ }
+ current->thread.trap_no = 19;
+ current->thread.error_code = error_code;
+ die_if_kernel("cache flush denied", regs, error_code);
+ force_sig(SIGSEGV, current);
}
-void do_spurious_interrupt_bug(struct pt_regs * regs,
- long error_code)
+void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
{
#if 0
/* No need to warn about this any longer. */
- printk("Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
+ printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
#endif
}
-unsigned long patch_espfix_desc(unsigned long uesp,
- unsigned long kesp)
+unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
{
struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt;
unsigned long base = (kesp - uesp) & -THREAD_SIZE;
unsigned long new_kesp = kesp - base;
unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
__u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
+
/* Set up base for espfix segment */
- desc &= 0x00f0ff0000000000ULL;
- desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
+ desc &= 0x00f0ff0000000000ULL;
+ desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
((((__u64)base) << 32) & 0xff00000000000000ULL) |
((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
(lim_pages & 0xffff);
*(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
+
return new_kesp;
}
/*
- * 'math_state_restore()' saves the current math information in the
+ * 'math_state_restore()' saves the current math information in the
* old math state array, and gets the new ones from the current task
*
* Careful.. There are problems with IBM-designed IRQ13 behaviour.
@@ -1115,9 +1148,22 @@ asmlinkage void math_state_restore(void)
struct thread_info *thread = current_thread_info();
struct task_struct *tsk = thread->task;
- clts(); /* Allow maths ops (or we recurse) */
- if (!tsk_used_math(tsk))
- init_fpu(tsk);
+ if (!tsk_used_math(tsk)) {
+ local_irq_enable();
+ /*
+ * does a slab alloc which can sleep
+ */
+ if (init_fpu(tsk)) {
+ /*
+ * ran out of memory!
+ */
+ do_group_exit(SIGKILL);
+ return;
+ }
+ local_irq_disable();
+ }
+
+ clts(); /* Allow maths ops (or we recurse) */
restore_fpu(tsk);
thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
tsk->fpu_counter++;
@@ -1128,80 +1174,76 @@ EXPORT_SYMBOL_GPL(math_state_restore);
asmlinkage void math_emulate(long arg)
{
- printk(KERN_EMERG "math-emulation not enabled and no coprocessor found.\n");
- printk(KERN_EMERG "killing %s.\n",current->comm);
- force_sig(SIGFPE,current);
+ printk(KERN_EMERG
+ "math-emulation not enabled and no coprocessor found.\n");
+ printk(KERN_EMERG "killing %s.\n", current->comm);
+ force_sig(SIGFPE, current);
schedule();
}
#endif /* CONFIG_MATH_EMULATION */
-
void __init trap_init(void)
{
int i;
#ifdef CONFIG_EISA
void __iomem *p = early_ioremap(0x0FFFD9, 4);
- if (readl(p) == 'E'+('I'<<8)+('S'<<16)+('A'<<24)) {
+
+ if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24))
EISA_bus = 1;
- }
early_iounmap(p, 4);
#endif
#ifdef CONFIG_X86_LOCAL_APIC
init_apic_mappings();
#endif
-
- set_trap_gate(0,&divide_error);
- set_intr_gate(1,&debug);
- set_intr_gate(2,&nmi);
+ set_trap_gate(0, &divide_error);
+ set_intr_gate(1, &debug);
+ set_intr_gate(2, &nmi);
set_system_intr_gate(3, &int3); /* int3/4 can be called from all */
- set_system_gate(4,&overflow);
- set_trap_gate(5,&bounds);
- set_trap_gate(6,&invalid_op);
- set_trap_gate(7,&device_not_available);
- set_task_gate(8,GDT_ENTRY_DOUBLEFAULT_TSS);
- set_trap_gate(9,&coprocessor_segment_overrun);
- set_trap_gate(10,&invalid_TSS);
- set_trap_gate(11,&segment_not_present);
- set_trap_gate(12,&stack_segment);
- set_trap_gate(13,&general_protection);
- set_intr_gate(14,&page_fault);
- set_trap_gate(15,&spurious_interrupt_bug);
- set_trap_gate(16,&coprocessor_error);
- set_trap_gate(17,&alignment_check);
+ set_system_gate(4, &overflow);
+ set_trap_gate(5, &bounds);
+ set_trap_gate(6, &invalid_op);
+ set_trap_gate(7, &device_not_available);
+ set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
+ set_trap_gate(9, &coprocessor_segment_overrun);
+ set_trap_gate(10, &invalid_TSS);
+ set_trap_gate(11, &segment_not_present);
+ set_trap_gate(12, &stack_segment);
+ set_trap_gate(13, &general_protection);
+ set_intr_gate(14, &page_fault);
+ set_trap_gate(15, &spurious_interrupt_bug);
+ set_trap_gate(16, &coprocessor_error);
+ set_trap_gate(17, &alignment_check);
#ifdef CONFIG_X86_MCE
- set_trap_gate(18,&machine_check);
+ set_trap_gate(18, &machine_check);
#endif
- set_trap_gate(19,&simd_coprocessor_error);
+ set_trap_gate(19, &simd_coprocessor_error);
- /*
- * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned.
- * Generate a build-time error if the alignment is wrong.
- */
- BUILD_BUG_ON(offsetof(struct task_struct, thread.i387.fxsave) & 15);
if (cpu_has_fxsr) {
printk(KERN_INFO "Enabling fast FPU save and restore... ");
set_in_cr4(X86_CR4_OSFXSR);
printk("done.\n");
}
if (cpu_has_xmm) {
- printk(KERN_INFO "Enabling unmasked SIMD FPU exception "
- "support... ");
+ printk(KERN_INFO
+ "Enabling unmasked SIMD FPU exception support... ");
set_in_cr4(X86_CR4_OSXMMEXCPT);
printk("done.\n");
}
- set_system_gate(SYSCALL_VECTOR,&system_call);
+ set_system_gate(SYSCALL_VECTOR, &system_call);
- /* Reserve all the builtin and the syscall vector. */
+ /* Reserve all the builtin and the syscall vector: */
for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
set_bit(i, used_vectors);
+
set_bit(SYSCALL_VECTOR, used_vectors);
+ init_thread_xstate();
/*
- * Should be a barrier for any external CPU state.
+ * Should be a barrier for any external CPU state:
*/
cpu_init();
@@ -1211,6 +1253,7 @@ void __init trap_init(void)
static int __init kstack_setup(char *s)
{
kstack_depth_to_print = simple_strtoul(s, NULL, 0);
+
return 1;
}
__setup("kstack=", kstack_setup);
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index 045466681911..adff76ea97c4 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -33,6 +33,8 @@
#include <linux/kdebug.h>
#include <linux/utsname.h>
+#include <mach_traps.h>
+
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
@@ -598,10 +600,16 @@ void die(const char * str, struct pt_regs * regs, long err)
oops_end(flags, regs, SIGSEGV);
}
-void __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
+notrace __kprobes void
+die_nmi(char *str, struct pt_regs *regs, int do_panic)
{
- unsigned long flags = oops_begin();
+ unsigned long flags;
+
+ if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) ==
+ NOTIFY_STOP)
+ return;
+ flags = oops_begin();
/*
* We are in trouble anyway, lets at least try
* to get a message out.
@@ -765,7 +773,7 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs,
die("general protection fault", regs, error_code);
}
-static __kprobes void
+static notrace __kprobes void
mem_parity_error(unsigned char reason, struct pt_regs * regs)
{
printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
@@ -789,7 +797,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs)
outb(reason, 0x61);
}
-static __kprobes void
+static notrace __kprobes void
io_check_error(unsigned char reason, struct pt_regs * regs)
{
printk("NMI: IOCK error (debug interrupt?)\n");
@@ -803,9 +811,11 @@ io_check_error(unsigned char reason, struct pt_regs * regs)
outb(reason, 0x61);
}
-static __kprobes void
+static notrace __kprobes void
unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
{
+ if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+ return;
printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
reason);
printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
@@ -818,7 +828,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
/* Runs on IST stack. This code must keep interrupts off all the time.
Nested NMIs are prevented by the CPU. */
-asmlinkage __kprobes void default_do_nmi(struct pt_regs *regs)
+asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int cpu;
@@ -1114,11 +1124,24 @@ asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
asmlinkage void math_state_restore(void)
{
struct task_struct *me = current;
- clts(); /* Allow maths ops (or we recurse) */
- if (!used_math())
- init_fpu(me);
- restore_fpu_checking(&me->thread.i387.fxsave);
+ if (!used_math()) {
+ local_irq_enable();
+ /*
+ * does a slab alloc which can sleep
+ */
+ if (init_fpu(me)) {
+ /*
+ * ran out of memory!
+ */
+ do_group_exit(SIGKILL);
+ return;
+ }
+ local_irq_disable();
+ }
+
+ clts(); /* Allow maths ops (or we recurse) */
+ restore_fpu_checking(&me->thread.xstate->fxsave);
task_thread_info(me)->status |= TS_USEDFPU;
me->fpu_counter++;
}
@@ -1154,6 +1177,10 @@ void __init trap_init(void)
#endif
/*
+ * initialize the per thread extended state:
+ */
+ init_thread_xstate();
+ /*
* Should be a barrier for any external CPU state.
*/
cpu_init();
diff --git a/arch/x86/kernel/tsc_32.c b/arch/x86/kernel/tsc_32.c
index c2241e04ea5f..e4790728b224 100644
--- a/arch/x86/kernel/tsc_32.c
+++ b/arch/x86/kernel/tsc_32.c
@@ -84,8 +84,8 @@ DEFINE_PER_CPU(unsigned long, cyc2ns);
static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
{
- unsigned long flags, prev_scale, *scale;
unsigned long long tsc_now, ns_now;
+ unsigned long flags, *scale;
local_irq_save(flags);
sched_clock_idle_sleep_event();
@@ -95,7 +95,6 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
rdtscll(tsc_now);
ns_now = __cycles_2_ns(tsc_now);
- prev_scale = *scale;
if (cpu_khz)
*scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
@@ -222,9 +221,9 @@ EXPORT_SYMBOL(recalibrate_cpu_khz);
* if the CPU frequency is scaled, TSC-based delays will need a different
* loops_per_jiffy value to function properly.
*/
-static unsigned int ref_freq = 0;
-static unsigned long loops_per_jiffy_ref = 0;
-static unsigned long cpu_khz_ref = 0;
+static unsigned int ref_freq;
+static unsigned long loops_per_jiffy_ref;
+static unsigned long cpu_khz_ref;
static int
time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
@@ -284,15 +283,28 @@ core_initcall(cpufreq_tsc);
/* clock source code */
-static unsigned long current_tsc_khz = 0;
+static unsigned long current_tsc_khz;
+static struct clocksource clocksource_tsc;
+/*
+ * We compare the TSC to the cycle_last value in the clocksource
+ * structure to avoid a nasty time-warp issue. This can be observed in
+ * a very small window right after one CPU updated cycle_last under
+ * xtime lock and the other CPU reads a TSC value which is smaller
+ * than the cycle_last reference value due to a TSC which is slighty
+ * behind. This delta is nowhere else observable, but in that case it
+ * results in a forward time jump in the range of hours due to the
+ * unsigned delta calculation of the time keeping core code, which is
+ * necessary to support wrapping clocksources like pm timer.
+ */
static cycle_t read_tsc(void)
{
cycle_t ret;
rdtscll(ret);
- return ret;
+ return ret >= clocksource_tsc.cycle_last ?
+ ret : clocksource_tsc.cycle_last;
}
static struct clocksource clocksource_tsc = {
@@ -392,13 +404,15 @@ void __init tsc_init(void)
int cpu;
if (!cpu_has_tsc)
- goto out_no_tsc;
+ return;
cpu_khz = calculate_cpu_khz();
tsc_khz = cpu_khz;
- if (!cpu_khz)
- goto out_no_tsc;
+ if (!cpu_khz) {
+ mark_tsc_unstable("could not calculate TSC khz");
+ return;
+ }
printk("Detected %lu.%03lu MHz processor.\n",
(unsigned long)cpu_khz / 1000,
@@ -431,9 +445,4 @@ void __init tsc_init(void)
tsc_enabled = 1;
clocksource_register(&clocksource_tsc);
-
- return;
-
-out_no_tsc:
- setup_clear_cpu_cap(X86_FEATURE_TSC);
}
diff --git a/arch/x86/kernel/tsc_64.c b/arch/x86/kernel/tsc_64.c
index d3bebaaad842..fcc16e58609e 100644
--- a/arch/x86/kernel/tsc_64.c
+++ b/arch/x86/kernel/tsc_64.c
@@ -11,6 +11,7 @@
#include <asm/hpet.h>
#include <asm/timex.h>
#include <asm/timer.h>
+#include <asm/vgtod.h>
static int notsc __initdata = 0;
@@ -44,8 +45,8 @@ DEFINE_PER_CPU(unsigned long, cyc2ns);
static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
{
- unsigned long flags, prev_scale, *scale;
unsigned long long tsc_now, ns_now;
+ unsigned long flags, *scale;
local_irq_save(flags);
sched_clock_idle_sleep_event();
@@ -55,7 +56,6 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
rdtscll(tsc_now);
ns_now = __cycles_2_ns(tsc_now);
- prev_scale = *scale;
if (cpu_khz)
*scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz;
@@ -288,18 +288,34 @@ int __init notsc_setup(char *s)
__setup("notsc", notsc_setup);
+static struct clocksource clocksource_tsc;
-/* clock source code: */
+/*
+ * We compare the TSC to the cycle_last value in the clocksource
+ * structure to avoid a nasty time-warp. This can be observed in a
+ * very small window right after one CPU updated cycle_last under
+ * xtime/vsyscall_gtod lock and the other CPU reads a TSC value which
+ * is smaller than the cycle_last reference value due to a TSC which
+ * is slighty behind. This delta is nowhere else observable, but in
+ * that case it results in a forward time jump in the range of hours
+ * due to the unsigned delta calculation of the time keeping core
+ * code, which is necessary to support wrapping clocksources like pm
+ * timer.
+ */
static cycle_t read_tsc(void)
{
cycle_t ret = (cycle_t)get_cycles();
- return ret;
+
+ return ret >= clocksource_tsc.cycle_last ?
+ ret : clocksource_tsc.cycle_last;
}
static cycle_t __vsyscall_fn vread_tsc(void)
{
cycle_t ret = (cycle_t)vget_cycles();
- return ret;
+
+ return ret >= __vsyscall_gtod_data.clock.cycle_last ?
+ ret : __vsyscall_gtod_data.clock.cycle_last;
}
static struct clocksource clocksource_tsc = {
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 738c2104df30..38f566fa27d2 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -64,7 +64,7 @@
#define KVM86 ((struct kernel_vm86_struct *)regs)
-#define VMPI KVM86->vm86plus
+#define VMPI KVM86->vm86plus
/*
@@ -81,7 +81,7 @@
#define VFLAGS (*(unsigned short *)&(current->thread.v86flags))
#define VEFLAGS (current->thread.v86flags)
-#define set_flags(X,new,mask) \
+#define set_flags(X, new, mask) \
((X) = ((X) & ~(mask)) | ((new) & (mask)))
#define SAFE_MASK (0xDD5)
@@ -93,8 +93,10 @@ static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
{
int ret = 0;
- /* kernel_vm86_regs is missing gs, so copy everything up to
- (but not including) orig_eax, and then rest including orig_eax. */
+ /*
+ * kernel_vm86_regs is missing gs, so copy everything up to
+ * (but not including) orig_eax, and then rest including orig_eax.
+ */
ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax));
ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax,
sizeof(struct kernel_vm86_regs) -
@@ -120,7 +122,7 @@ static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
return ret;
}
-struct pt_regs * save_v86_state(struct kernel_vm86_regs * regs)
+struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
{
struct tss_struct *tss;
struct pt_regs *ret;
@@ -137,9 +139,9 @@ struct pt_regs * save_v86_state(struct kernel_vm86_regs * regs)
printk("no vm86_info: BAD\n");
do_exit(SIGSEGV);
}
- set_flags(regs->pt.flags, VEFLAGS, VIF_MASK | current->thread.v86mask);
- tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs,regs);
- tmp += put_user(current->thread.screen_bitmap,&current->thread.vm86_info->screen_bitmap);
+ set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);
+ tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);
+ tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);
if (tmp) {
printk("vm86: could not access userspace vm86_info\n");
do_exit(SIGSEGV);
@@ -237,20 +239,21 @@ asmlinkage int sys_vm86(struct pt_regs regs)
tsk = current;
switch (regs.bx) {
- case VM86_REQUEST_IRQ:
- case VM86_FREE_IRQ:
- case VM86_GET_IRQ_BITS:
- case VM86_GET_AND_RESET_IRQ:
- ret = do_vm86_irq_handling(regs.bx, (int)regs.cx);
- goto out;
- case VM86_PLUS_INSTALL_CHECK:
- /* NOTE: on old vm86 stuff this will return the error
- from access_ok(), because the subfunction is
- interpreted as (invalid) address to vm86_struct.
- So the installation check works.
- */
- ret = 0;
- goto out;
+ case VM86_REQUEST_IRQ:
+ case VM86_FREE_IRQ:
+ case VM86_GET_IRQ_BITS:
+ case VM86_GET_AND_RESET_IRQ:
+ ret = do_vm86_irq_handling(regs.bx, (int)regs.cx);
+ goto out;
+ case VM86_PLUS_INSTALL_CHECK:
+ /*
+ * NOTE: on old vm86 stuff this will return the error
+ * from access_ok(), because the subfunction is
+ * interpreted as (invalid) address to vm86_struct.
+ * So the installation check works.
+ */
+ ret = 0;
+ goto out;
}
/* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
@@ -296,21 +299,21 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
VEFLAGS = info->regs.pt.flags;
info->regs.pt.flags &= SAFE_MASK;
info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK;
- info->regs.pt.flags |= VM_MASK;
+ info->regs.pt.flags |= X86_VM_MASK;
switch (info->cpu_type) {
- case CPU_286:
- tsk->thread.v86mask = 0;
- break;
- case CPU_386:
- tsk->thread.v86mask = NT_MASK | IOPL_MASK;
- break;
- case CPU_486:
- tsk->thread.v86mask = AC_MASK | NT_MASK | IOPL_MASK;
- break;
- default:
- tsk->thread.v86mask = ID_MASK | AC_MASK | NT_MASK | IOPL_MASK;
- break;
+ case CPU_286:
+ tsk->thread.v86mask = 0;
+ break;
+ case CPU_386:
+ tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+ break;
+ case CPU_486:
+ tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+ break;
+ default:
+ tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+ break;
}
/*
@@ -346,9 +349,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
/* we never return here */
}
-static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval)
+static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval)
{
- struct pt_regs * regs32;
+ struct pt_regs *regs32;
regs32 = save_v86_state(regs16);
regs32->ax = retval;
@@ -358,29 +361,30 @@ static inline void return_to_32bit(struct kernel_vm86_regs * regs16, int retval)
: : "r" (regs32), "r" (current_thread_info()));
}
-static inline void set_IF(struct kernel_vm86_regs * regs)
+static inline void set_IF(struct kernel_vm86_regs *regs)
{
- VEFLAGS |= VIF_MASK;
- if (VEFLAGS & VIP_MASK)
+ VEFLAGS |= X86_EFLAGS_VIF;
+ if (VEFLAGS & X86_EFLAGS_VIP)
return_to_32bit(regs, VM86_STI);
}
-static inline void clear_IF(struct kernel_vm86_regs * regs)
+static inline void clear_IF(struct kernel_vm86_regs *regs)
{
- VEFLAGS &= ~VIF_MASK;
+ VEFLAGS &= ~X86_EFLAGS_VIF;
}
-static inline void clear_TF(struct kernel_vm86_regs * regs)
+static inline void clear_TF(struct kernel_vm86_regs *regs)
{
- regs->pt.flags &= ~TF_MASK;
+ regs->pt.flags &= ~X86_EFLAGS_TF;
}
-static inline void clear_AC(struct kernel_vm86_regs * regs)
+static inline void clear_AC(struct kernel_vm86_regs *regs)
{
- regs->pt.flags &= ~AC_MASK;
+ regs->pt.flags &= ~X86_EFLAGS_AC;
}
-/* It is correct to call set_IF(regs) from the set_vflags_*
+/*
+ * It is correct to call set_IF(regs) from the set_vflags_*
* functions. However someone forgot to call clear_IF(regs)
* in the opposite case.
* After the command sequence CLI PUSHF STI POPF you should
@@ -391,41 +395,41 @@ static inline void clear_AC(struct kernel_vm86_regs * regs)
* [KD]
*/
-static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs * regs)
+static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs)
{
set_flags(VEFLAGS, flags, current->thread.v86mask);
set_flags(regs->pt.flags, flags, SAFE_MASK);
- if (flags & IF_MASK)
+ if (flags & X86_EFLAGS_IF)
set_IF(regs);
else
clear_IF(regs);
}
-static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs * regs)
+static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs)
{
set_flags(VFLAGS, flags, current->thread.v86mask);
set_flags(regs->pt.flags, flags, SAFE_MASK);
- if (flags & IF_MASK)
+ if (flags & X86_EFLAGS_IF)
set_IF(regs);
else
clear_IF(regs);
}
-static inline unsigned long get_vflags(struct kernel_vm86_regs * regs)
+static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
{
unsigned long flags = regs->pt.flags & RETURN_MASK;
- if (VEFLAGS & VIF_MASK)
- flags |= IF_MASK;
- flags |= IOPL_MASK;
+ if (VEFLAGS & X86_EFLAGS_VIF)
+ flags |= X86_EFLAGS_IF;
+ flags |= X86_EFLAGS_IOPL;
return flags | (VEFLAGS & current->thread.v86mask);
}
-static inline int is_revectored(int nr, struct revectored_struct * bitmap)
+static inline int is_revectored(int nr, struct revectored_struct *bitmap)
{
__asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0"
:"=r" (nr)
- :"m" (*bitmap),"r" (nr));
+ :"m" (*bitmap), "r" (nr));
return nr;
}
@@ -437,7 +441,7 @@ static inline int is_revectored(int nr, struct revectored_struct * bitmap)
ptr--; \
if (put_user(__val, base + ptr) < 0) \
goto err_label; \
- } while(0)
+ } while (0)
#define pushw(base, ptr, val, err_label) \
do { \
@@ -448,7 +452,7 @@ static inline int is_revectored(int nr, struct revectored_struct * bitmap)
ptr--; \
if (put_user(val_byte(__val, 0), base + ptr) < 0) \
goto err_label; \
- } while(0)
+ } while (0)
#define pushl(base, ptr, val, err_label) \
do { \
@@ -465,7 +469,7 @@ static inline int is_revectored(int nr, struct revectored_struct * bitmap)
ptr--; \
if (put_user(val_byte(__val, 0), base + ptr) < 0) \
goto err_label; \
- } while(0)
+ } while (0)
#define popb(base, ptr, err_label) \
({ \
@@ -512,7 +516,7 @@ static inline int is_revectored(int nr, struct revectored_struct * bitmap)
* in userspace is always better than an Oops anyway.) [KD]
*/
static void do_int(struct kernel_vm86_regs *regs, int i,
- unsigned char __user * ssp, unsigned short sp)
+ unsigned char __user *ssp, unsigned short sp)
{
unsigned long __user *intr_ptr;
unsigned long segoffs;
@@ -521,7 +525,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
goto cannot_handle;
if (is_revectored(i, &KVM86->int_revectored))
goto cannot_handle;
- if (i==0x21 && is_revectored(AH(regs),&KVM86->int21_revectored))
+ if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored))
goto cannot_handle;
intr_ptr = (unsigned long __user *) (i << 2);
if (get_user(segoffs, intr_ptr))
@@ -543,30 +547,23 @@ cannot_handle:
return_to_32bit(regs, VM86_INTx + (i << 8));
}
-int handle_vm86_trap(struct kernel_vm86_regs * regs, long error_code, int trapno)
+int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
{
if (VMPI.is_vm86pus) {
- if ( (trapno==3) || (trapno==1) )
+ if ((trapno == 3) || (trapno == 1))
return_to_32bit(regs, VM86_TRAP + (trapno << 8));
do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
return 0;
}
- if (trapno !=1)
+ if (trapno != 1)
return 1; /* we let this handle by the calling routine */
- if (current->ptrace & PT_PTRACED) {
- unsigned long flags;
- spin_lock_irqsave(&current->sighand->siglock, flags);
- sigdelset(&current->blocked, SIGTRAP);
- recalc_sigpending();
- spin_unlock_irqrestore(&current->sighand->siglock, flags);
- }
- send_sig(SIGTRAP, current, 1);
current->thread.trap_no = trapno;
current->thread.error_code = error_code;
+ force_sig(SIGTRAP, current);
return 0;
}
-void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
+void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
{
unsigned char opcode;
unsigned char __user *csp;
@@ -576,11 +573,11 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
#define CHECK_IF_IN_TRAP \
if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
- newflags |= TF_MASK
+ newflags |= X86_EFLAGS_TF
#define VM86_FAULT_RETURN do { \
- if (VMPI.force_return_for_pic && (VEFLAGS & (IF_MASK | VIF_MASK))) \
+ if (VMPI.force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \
return_to_32bit(regs, VM86_PICRETURN); \
- if (orig_flags & TF_MASK) \
+ if (orig_flags & X86_EFLAGS_TF) \
handle_vm86_trap(regs, 0, 1); \
return; } while (0)
@@ -595,17 +592,17 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
pref_done = 0;
do {
switch (opcode = popb(csp, ip, simulate_sigsegv)) {
- case 0x66: /* 32-bit data */ data32=1; break;
- case 0x67: /* 32-bit address */ break;
- case 0x2e: /* CS */ break;
- case 0x3e: /* DS */ break;
- case 0x26: /* ES */ break;
- case 0x36: /* SS */ break;
- case 0x65: /* GS */ break;
- case 0x64: /* FS */ break;
- case 0xf2: /* repnz */ break;
- case 0xf3: /* rep */ break;
- default: pref_done = 1;
+ case 0x66: /* 32-bit data */ data32 = 1; break;
+ case 0x67: /* 32-bit address */ break;
+ case 0x2e: /* CS */ break;
+ case 0x3e: /* DS */ break;
+ case 0x26: /* ES */ break;
+ case 0x36: /* SS */ break;
+ case 0x65: /* GS */ break;
+ case 0x64: /* FS */ break;
+ case 0xf2: /* repnz */ break;
+ case 0xf3: /* rep */ break;
+ default: pref_done = 1;
}
} while (!pref_done);
@@ -628,7 +625,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
{
unsigned long newflags;
if (data32) {
- newflags=popl(ssp, sp, simulate_sigsegv);
+ newflags = popl(ssp, sp, simulate_sigsegv);
SP(regs) += 4;
} else {
newflags = popw(ssp, sp, simulate_sigsegv);
@@ -636,20 +633,20 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
}
IP(regs) = ip;
CHECK_IF_IN_TRAP;
- if (data32) {
+ if (data32)
set_vflags_long(newflags, regs);
- } else {
+ else
set_vflags_short(newflags, regs);
- }
+
VM86_FAULT_RETURN;
}
/* int xx */
case 0xcd: {
- int intno=popb(csp, ip, simulate_sigsegv);
+ int intno = popb(csp, ip, simulate_sigsegv);
IP(regs) = ip;
if (VMPI.vm86dbg_active) {
- if ( (1 << (intno &7)) & VMPI.vm86dbg_intxxtab[intno >> 3] )
+ if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3])
return_to_32bit(regs, VM86_INTx + (intno << 8));
}
do_int(regs, intno, ssp, sp);
@@ -663,9 +660,9 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
unsigned long newcs;
unsigned long newflags;
if (data32) {
- newip=popl(ssp, sp, simulate_sigsegv);
- newcs=popl(ssp, sp, simulate_sigsegv);
- newflags=popl(ssp, sp, simulate_sigsegv);
+ newip = popl(ssp, sp, simulate_sigsegv);
+ newcs = popl(ssp, sp, simulate_sigsegv);
+ newflags = popl(ssp, sp, simulate_sigsegv);
SP(regs) += 12;
} else {
newip = popw(ssp, sp, simulate_sigsegv);
@@ -734,18 +731,18 @@ static struct vm86_irqs {
static DEFINE_SPINLOCK(irqbits_lock);
static int irqbits;
-#define ALLOWED_SIGS ( 1 /* 0 = don't send a signal */ \
+#define ALLOWED_SIGS (1 /* 0 = don't send a signal */ \
| (1 << SIGUSR1) | (1 << SIGUSR2) | (1 << SIGIO) | (1 << SIGURG) \
- | (1 << SIGUNUSED) )
-
+ | (1 << SIGUNUSED))
+
static irqreturn_t irq_handler(int intno, void *dev_id)
{
int irq_bit;
unsigned long flags;
- spin_lock_irqsave(&irqbits_lock, flags);
+ spin_lock_irqsave(&irqbits_lock, flags);
irq_bit = 1 << intno;
- if ((irqbits & irq_bit) || ! vm86_irqs[intno].tsk)
+ if ((irqbits & irq_bit) || !vm86_irqs[intno].tsk)
goto out;
irqbits |= irq_bit;
if (vm86_irqs[intno].sig)
@@ -759,7 +756,7 @@ static irqreturn_t irq_handler(int intno, void *dev_id)
return IRQ_HANDLED;
out:
- spin_unlock_irqrestore(&irqbits_lock, flags);
+ spin_unlock_irqrestore(&irqbits_lock, flags);
return IRQ_NONE;
}
@@ -770,9 +767,9 @@ static inline void free_vm86_irq(int irqnumber)
free_irq(irqnumber, NULL);
vm86_irqs[irqnumber].tsk = NULL;
- spin_lock_irqsave(&irqbits_lock, flags);
+ spin_lock_irqsave(&irqbits_lock, flags);
irqbits &= ~(1 << irqnumber);
- spin_unlock_irqrestore(&irqbits_lock, flags);
+ spin_unlock_irqrestore(&irqbits_lock, flags);
}
void release_vm86_irqs(struct task_struct *task)
@@ -788,10 +785,10 @@ static inline int get_and_reset_irq(int irqnumber)
int bit;
unsigned long flags;
int ret = 0;
-
+
if (invalid_vm86_irq(irqnumber)) return 0;
if (vm86_irqs[irqnumber].tsk != current) return 0;
- spin_lock_irqsave(&irqbits_lock, flags);
+ spin_lock_irqsave(&irqbits_lock, flags);
bit = irqbits & (1 << irqnumber);
irqbits &= ~bit;
if (bit) {
@@ -799,7 +796,7 @@ static inline int get_and_reset_irq(int irqnumber)
ret = 1;
}
- spin_unlock_irqrestore(&irqbits_lock, flags);
+ spin_unlock_irqrestore(&irqbits_lock, flags);
return ret;
}
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 12affe1f9bce..956f38927aa7 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -320,7 +320,7 @@ static void check_zeroed_page(u32 pfn, int type, struct page *page)
* pdes need to be zeroed.
*/
if (type & VMI_PAGE_CLONE)
- limit = USER_PTRS_PER_PGD;
+ limit = KERNEL_PGD_BOUNDARY;
for (i = 0; i < limit; i++)
BUG_ON(ptr[i]);
}
@@ -392,13 +392,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
}
#endif
-static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn)
{
vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
}
-static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
{
/*
* This call comes in very early, before mem_map is setup.
@@ -409,20 +409,20 @@ static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn)
vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
}
-static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
+static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
{
vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
vmi_check_page_type(clonepfn, VMI_PAGE_L2);
vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
}
-static void vmi_release_pt(u32 pfn)
+static void vmi_release_pte(u32 pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L1);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
}
-static void vmi_release_pd(u32 pfn)
+static void vmi_release_pmd(u32 pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L2);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
@@ -871,15 +871,15 @@ static inline int __init activate_vmi(void)
vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
if (vmi_ops.allocate_page) {
- pv_mmu_ops.alloc_pt = vmi_allocate_pt;
- pv_mmu_ops.alloc_pd = vmi_allocate_pd;
- pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone;
+ pv_mmu_ops.alloc_pte = vmi_allocate_pte;
+ pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
+ pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
}
vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
if (vmi_ops.release_page) {
- pv_mmu_ops.release_pt = vmi_release_pt;
- pv_mmu_ops.release_pd = vmi_release_pd;
+ pv_mmu_ops.release_pte = vmi_release_pte;
+ pv_mmu_ops.release_pmd = vmi_release_pmd;
}
/* Set linear is needed in all cases */
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 2ffa9656fe7a..ce5ed083a1e9 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -149,6 +149,11 @@ SECTIONS
*(.con_initcall.init)
__con_initcall_end = .;
}
+ .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) {
+ __x86cpuvendor_start = .;
+ *(.x86cpuvendor.init)
+ __x86cpuvendor_end = .;
+ }
SECURITY_INIT
. = ALIGN(4);
.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index fab132299735..fad3674b06a5 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -177,6 +177,11 @@ SECTIONS
*(.con_initcall.init)
}
__con_initcall_end = .;
+ __x86cpuvendor_start = .;
+ .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) {
+ *(.x86cpuvendor.init)
+ }
+ __x86cpuvendor_end = .;
SECURITY_INIT
. = ALIGN(8);
@@ -204,12 +209,6 @@ SECTIONS
EXIT_DATA
}
-/* vdso blob that is mapped into user space */
- vdso_start = . ;
- .vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) }
- . = ALIGN(PAGE_SIZE);
- vdso_end = .;
-
#ifdef CONFIG_BLK_DEV_INITRD
. = ALIGN(PAGE_SIZE);
__initramfs_start = .;
@@ -247,3 +246,9 @@ SECTIONS
DWARF_DEBUG
}
+
+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE")
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index d971210a6d36..caf2a26f5cfd 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -8,6 +8,8 @@
*
* Ravikiran Thirumalai <kiran@scalemp.com>,
* Shai Fultheim <shai@scalemp.com>
+ * Paravirt ops integration: Glauber de Oliveira Costa <gcosta@redhat.com>,
+ * Ravikiran Thirumalai <kiran@scalemp.com>
*/
#include <linux/init.h>
@@ -15,38 +17,137 @@
#include <linux/pci_regs.h>
#include <asm/pci-direct.h>
#include <asm/io.h>
+#include <asm/paravirt.h>
-static int __init vsmp_init(void)
+#if defined CONFIG_PCI && defined CONFIG_PARAVIRT
+/*
+ * Interrupt control on vSMPowered systems:
+ * ~AC is a shadow of IF. If IF is 'on' AC should be 'off'
+ * and vice versa.
+ */
+
+static unsigned long vsmp_save_fl(void)
{
- void *address;
- unsigned int cap, ctl;
+ unsigned long flags = native_save_fl();
- if (!early_pci_allowed())
- return 0;
+ if (!(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC))
+ flags &= ~X86_EFLAGS_IF;
+ return flags;
+}
- /* Check if we are running on a ScaleMP vSMP box */
- if ((read_pci_config_16(0, 0x1f, 0, PCI_VENDOR_ID) !=
- PCI_VENDOR_ID_SCALEMP) ||
- (read_pci_config_16(0, 0x1f, 0, PCI_DEVICE_ID) !=
- PCI_DEVICE_ID_SCALEMP_VSMP_CTL))
- return 0;
+static void vsmp_restore_fl(unsigned long flags)
+{
+ if (flags & X86_EFLAGS_IF)
+ flags &= ~X86_EFLAGS_AC;
+ else
+ flags |= X86_EFLAGS_AC;
+ native_restore_fl(flags);
+}
+
+static void vsmp_irq_disable(void)
+{
+ unsigned long flags = native_save_fl();
+
+ native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
+}
+
+static void vsmp_irq_enable(void)
+{
+ unsigned long flags = native_save_fl();
+
+ native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
+}
+
+static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf,
+ unsigned long addr, unsigned len)
+{
+ switch (type) {
+ case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
+ case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
+ case PARAVIRT_PATCH(pv_irq_ops.save_fl):
+ case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
+ return paravirt_patch_default(type, clobbers, ibuf, addr, len);
+ default:
+ return native_patch(type, clobbers, ibuf, addr, len);
+ }
+
+}
+
+static void __init set_vsmp_pv_ops(void)
+{
+ void *address;
+ unsigned int cap, ctl, cfg;
/* set vSMP magic bits to indicate vSMP capable kernel */
- address = ioremap(read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0), 8);
+ cfg = read_pci_config(0, 0x1f, 0, PCI_BASE_ADDRESS_0);
+ address = early_ioremap(cfg, 8);
cap = readl(address);
ctl = readl(address + 4);
printk(KERN_INFO "vSMP CTL: capabilities:0x%08x control:0x%08x\n",
cap, ctl);
if (cap & ctl & (1 << 4)) {
- /* Turn on vSMP IRQ fastpath handling (see system.h) */
+ /* Setup irq ops and turn on vSMP IRQ fastpath handling */
+ pv_irq_ops.irq_disable = vsmp_irq_disable;
+ pv_irq_ops.irq_enable = vsmp_irq_enable;
+ pv_irq_ops.save_fl = vsmp_save_fl;
+ pv_irq_ops.restore_fl = vsmp_restore_fl;
+ pv_init_ops.patch = vsmp_patch;
+
ctl &= ~(1 << 4);
writel(ctl, address + 4);
ctl = readl(address + 4);
printk(KERN_INFO "vSMP CTL: control set to:0x%08x\n", ctl);
}
- iounmap(address);
+ early_iounmap(address, 8);
+}
+#else
+static void __init set_vsmp_pv_ops(void)
+{
+}
+#endif
+
+#ifdef CONFIG_PCI
+static int is_vsmp = -1;
+
+static void __init detect_vsmp_box(void)
+{
+ is_vsmp = 0;
+
+ if (!early_pci_allowed())
+ return;
+
+ /* Check if we are running on a ScaleMP vSMPowered box */
+ if (read_pci_config(0, 0x1f, 0, PCI_VENDOR_ID) ==
+ (PCI_VENDOR_ID_SCALEMP | (PCI_DEVICE_ID_SCALEMP_VSMP_CTL << 16)))
+ is_vsmp = 1;
+}
+
+int is_vsmp_box(void)
+{
+ if (is_vsmp != -1)
+ return is_vsmp;
+ else {
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+}
+#else
+static int __init detect_vsmp_box(void)
+{
+}
+int is_vsmp_box(void)
+{
return 0;
}
+#endif
-core_initcall(vsmp_init);
+void __init vsmp_init(void)
+{
+ detect_vsmp_box();
+ if (!is_vsmp_box())
+ return;
+
+ set_vsmp_pv_ops();
+ return;
+}
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index edff4c985485..61efa2f7d564 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -216,7 +216,7 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
return 0;
}
-long __vsyscall(3) venosys_1(void)
+static long __vsyscall(3) venosys_1(void)
{
return -ENOSYS;
}
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index a66e9c1a0537..58882f9f2637 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -4,7 +4,6 @@
#include <linux/module.h>
#include <linux/smp.h>
-#include <asm/semaphore.h>
#include <asm/processor.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -12,11 +11,6 @@
EXPORT_SYMBOL(kernel_thread);
-EXPORT_SYMBOL(__down_failed);
-EXPORT_SYMBOL(__down_failed_interruptible);
-EXPORT_SYMBOL(__down_failed_trylock);
-EXPORT_SYMBOL(__up_wakeup);
-
EXPORT_SYMBOL(__get_user_1);
EXPORT_SYMBOL(__get_user_2);
EXPORT_SYMBOL(__get_user_4);
@@ -35,15 +29,17 @@ EXPORT_SYMBOL(__copy_from_user_inatomic);
EXPORT_SYMBOL(copy_page);
EXPORT_SYMBOL(clear_page);
-/* Export string functions. We normally rely on gcc builtin for most of these,
- but gcc sometimes decides not to inline them. */
+/*
+ * Export string functions. We normally rely on gcc builtin for most of these,
+ * but gcc sometimes decides not to inline them.
+ */
#undef memcpy
#undef memset
#undef memmove
-extern void * memset(void *,int,__kernel_size_t);
-extern void * memcpy(void *,const void *,__kernel_size_t);
-extern void * __memcpy(void *,const void *,__kernel_size_t);
+extern void *memset(void *, int, __kernel_size_t);
+extern void *memcpy(void *, const void *, __kernel_size_t);
+extern void *__memcpy(void *, const void *, __kernel_size_t);
EXPORT_SYMBOL(memset);
EXPORT_SYMBOL(memcpy);
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 41962e793c0f..8d45fabc5f3b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -19,7 +19,7 @@ if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
- depends on HAVE_KVM && EXPERIMENTAL
+ depends on HAVE_KVM
select PREEMPT_NOTIFIERS
select ANON_INODES
---help---
@@ -50,6 +50,17 @@ config KVM_AMD
Provides support for KVM on AMD processors equipped with the AMD-V
(SVM) extensions.
+config KVM_TRACE
+ bool "KVM trace support"
+ depends on KVM && MARKERS && SYSFS
+ select RELAY
+ select DEBUG_FS
+ default n
+ ---help---
+ This option allows reading a trace of kvm-related events through
+ relayfs. Note the ABI is not considered stable and will be
+ modified in future updates.
+
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/lguest/Kconfig
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index ffdd0b310784..c97d35c218db 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,10 +3,14 @@
#
common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o)
+ifeq ($(CONFIG_KVM_TRACE),y)
+common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
+endif
EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
-kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o
+kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
+ i8254.o
obj-$(CONFIG_KVM) += kvm.o
kvm-intel-objs = vmx.o
obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
new file mode 100644
index 000000000000..361e31611276
--- /dev/null
+++ b/arch/x86/kvm/i8254.c
@@ -0,0 +1,611 @@
+/*
+ * 8253/8254 interval timer emulation
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ * Copyright (c) 2006 Intel Corporation
+ * Copyright (c) 2007 Keir Fraser, XenSource Inc
+ * Copyright (c) 2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * Authors:
+ * Sheng Yang <sheng.yang@intel.com>
+ * Based on QEMU and Xen.
+ */
+
+#include <linux/kvm_host.h>
+
+#include "irq.h"
+#include "i8254.h"
+
+#ifndef CONFIG_X86_64
+#define mod_64(x, y) ((x) - (y) * div64_64(x, y))
+#else
+#define mod_64(x, y) ((x) % (y))
+#endif
+
+#define RW_STATE_LSB 1
+#define RW_STATE_MSB 2
+#define RW_STATE_WORD0 3
+#define RW_STATE_WORD1 4
+
+/* Compute with 96 bit intermediate result: (a*b)/c */
+static u64 muldiv64(u64 a, u32 b, u32 c)
+{
+ union {
+ u64 ll;
+ struct {
+ u32 low, high;
+ } l;
+ } u, res;
+ u64 rl, rh;
+
+ u.ll = a;
+ rl = (u64)u.l.low * (u64)b;
+ rh = (u64)u.l.high * (u64)b;
+ rh += (rl >> 32);
+ res.l.high = div64_64(rh, c);
+ res.l.low = div64_64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c);
+ return res.ll;
+}
+
+static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ switch (c->mode) {
+ default:
+ case 0:
+ case 4:
+ /* XXX: just disable/enable counting */
+ break;
+ case 1:
+ case 2:
+ case 3:
+ case 5:
+ /* Restart counting on rising edge. */
+ if (c->gate < val)
+ c->count_load_time = ktime_get();
+ break;
+ }
+
+ c->gate = val;
+}
+
+int pit_get_gate(struct kvm *kvm, int channel)
+{
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ return kvm->arch.vpit->pit_state.channels[channel].gate;
+}
+
+static int pit_get_count(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+ s64 d, t;
+ int counter;
+
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+ d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+ switch (c->mode) {
+ case 0:
+ case 1:
+ case 4:
+ case 5:
+ counter = (c->count - d) & 0xffff;
+ break;
+ case 3:
+ /* XXX: may be incorrect for odd counts */
+ counter = c->count - (mod_64((2 * d), c->count));
+ break;
+ default:
+ counter = c->count - mod_64(d, c->count);
+ break;
+ }
+ return counter;
+}
+
+static int pit_get_out(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+ s64 d, t;
+ int out;
+
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+ d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
+
+ switch (c->mode) {
+ default:
+ case 0:
+ out = (d >= c->count);
+ break;
+ case 1:
+ out = (d < c->count);
+ break;
+ case 2:
+ out = ((mod_64(d, c->count) == 0) && (d != 0));
+ break;
+ case 3:
+ out = (mod_64(d, c->count) < ((c->count + 1) >> 1));
+ break;
+ case 4:
+ case 5:
+ out = (d == c->count);
+ break;
+ }
+
+ return out;
+}
+
+static void pit_latch_count(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ if (!c->count_latched) {
+ c->latched_count = pit_get_count(kvm, channel);
+ c->count_latched = c->rw_mode;
+ }
+}
+
+static void pit_latch_status(struct kvm *kvm, int channel)
+{
+ struct kvm_kpit_channel_state *c =
+ &kvm->arch.vpit->pit_state.channels[channel];
+
+ WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
+
+ if (!c->status_latched) {
+ /* TODO: Return NULL COUNT (bit 6). */
+ c->status = ((pit_get_out(kvm, channel) << 7) |
+ (c->rw_mode << 4) |
+ (c->mode << 1) |
+ c->bcd);
+ c->status_latched = 1;
+ }
+}
+
+int __pit_timer_fn(struct kvm_kpit_state *ps)
+{
+ struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
+ struct kvm_kpit_timer *pt = &ps->pit_timer;
+
+ atomic_inc(&pt->pending);
+ smp_mb__after_atomic_inc();
+ /* FIXME: handle case where the guest is in guest mode */
+ if (vcpu0 && waitqueue_active(&vcpu0->wq)) {
+ vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ wake_up_interruptible(&vcpu0->wq);
+ }
+
+ pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period);
+ pt->scheduled = ktime_to_ns(pt->timer.expires);
+
+ return (pt->period == 0 ? 0 : 1);
+}
+
+int pit_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pit *pit = vcpu->kvm->arch.vpit;
+
+ if (pit && vcpu->vcpu_id == 0)
+ return atomic_read(&pit->pit_state.pit_timer.pending);
+
+ return 0;
+}
+
+static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
+{
+ struct kvm_kpit_state *ps;
+ int restart_timer = 0;
+
+ ps = container_of(data, struct kvm_kpit_state, pit_timer.timer);
+
+ restart_timer = __pit_timer_fn(ps);
+
+ if (restart_timer)
+ return HRTIMER_RESTART;
+ else
+ return HRTIMER_NORESTART;
+}
+
+static void destroy_pit_timer(struct kvm_kpit_timer *pt)
+{
+ pr_debug("pit: execute del timer!\n");
+ hrtimer_cancel(&pt->timer);
+}
+
+static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
+{
+ s64 interval;
+
+ interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
+
+ pr_debug("pit: create pit timer, interval is %llu nsec\n", interval);
+
+ /* TODO The new value only affected after the retriggered */
+ hrtimer_cancel(&pt->timer);
+ pt->period = (is_period == 0) ? 0 : interval;
+ pt->timer.function = pit_timer_fn;
+ atomic_set(&pt->pending, 0);
+
+ hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
+ HRTIMER_MODE_ABS);
+}
+
+static void pit_load_count(struct kvm *kvm, int channel, u32 val)
+{
+ struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+
+ WARN_ON(!mutex_is_locked(&ps->lock));
+
+ pr_debug("pit: load_count val is %d, channel is %d\n", val, channel);
+
+ /*
+ * Though spec said the state of 8254 is undefined after power-up,
+ * seems some tricky OS like Windows XP depends on IRQ0 interrupt
+ * when booting up.
+ * So here setting initialize rate for it, and not a specific number
+ */
+ if (val == 0)
+ val = 0x10000;
+
+ ps->channels[channel].count_load_time = ktime_get();
+ ps->channels[channel].count = val;
+
+ if (channel != 0)
+ return;
+
+ /* Two types of timer
+ * mode 1 is one shot, mode 2 is period, otherwise del timer */
+ switch (ps->channels[0].mode) {
+ case 1:
+ create_pit_timer(&ps->pit_timer, val, 0);
+ break;
+ case 2:
+ create_pit_timer(&ps->pit_timer, val, 1);
+ break;
+ default:
+ destroy_pit_timer(&ps->pit_timer);
+ }
+}
+
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val)
+{
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ pit_load_count(kvm, channel, val);
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+}
+
+static void pit_ioport_write(struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ int channel, access;
+ struct kvm_kpit_channel_state *s;
+ u32 val = *(u32 *) data;
+
+ val &= 0xff;
+ addr &= KVM_PIT_CHANNEL_MASK;
+
+ mutex_lock(&pit_state->lock);
+
+ if (val != 0)
+ pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n",
+ (unsigned int)addr, len, val);
+
+ if (addr == 3) {
+ channel = val >> 6;
+ if (channel == 3) {
+ /* Read-Back Command. */
+ for (channel = 0; channel < 3; channel++) {
+ s = &pit_state->channels[channel];
+ if (val & (2 << channel)) {
+ if (!(val & 0x20))
+ pit_latch_count(kvm, channel);
+ if (!(val & 0x10))
+ pit_latch_status(kvm, channel);
+ }
+ }
+ } else {
+ /* Select Counter <channel>. */
+ s = &pit_state->channels[channel];
+ access = (val >> 4) & KVM_PIT_CHANNEL_MASK;
+ if (access == 0) {
+ pit_latch_count(kvm, channel);
+ } else {
+ s->rw_mode = access;
+ s->read_state = access;
+ s->write_state = access;
+ s->mode = (val >> 1) & 7;
+ if (s->mode > 5)
+ s->mode -= 4;
+ s->bcd = val & 1;
+ }
+ }
+ } else {
+ /* Write Count. */
+ s = &pit_state->channels[addr];
+ switch (s->write_state) {
+ default:
+ case RW_STATE_LSB:
+ pit_load_count(kvm, addr, val);
+ break;
+ case RW_STATE_MSB:
+ pit_load_count(kvm, addr, val << 8);
+ break;
+ case RW_STATE_WORD0:
+ s->write_latch = val;
+ s->write_state = RW_STATE_WORD1;
+ break;
+ case RW_STATE_WORD1:
+ pit_load_count(kvm, addr, s->write_latch | (val << 8));
+ s->write_state = RW_STATE_WORD0;
+ break;
+ }
+ }
+
+ mutex_unlock(&pit_state->lock);
+}
+
+static void pit_ioport_read(struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
+{
+ struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ int ret, count;
+ struct kvm_kpit_channel_state *s;
+
+ addr &= KVM_PIT_CHANNEL_MASK;
+ s = &pit_state->channels[addr];
+
+ mutex_lock(&pit_state->lock);
+
+ if (s->status_latched) {
+ s->status_latched = 0;
+ ret = s->status;
+ } else if (s->count_latched) {
+ switch (s->count_latched) {
+ default:
+ case RW_STATE_LSB:
+ ret = s->latched_count & 0xff;
+ s->count_latched = 0;
+ break;
+ case RW_STATE_MSB:
+ ret = s->latched_count >> 8;
+ s->count_latched = 0;
+ break;
+ case RW_STATE_WORD0:
+ ret = s->latched_count & 0xff;
+ s->count_latched = RW_STATE_MSB;
+ break;
+ }
+ } else {
+ switch (s->read_state) {
+ default:
+ case RW_STATE_LSB:
+ count = pit_get_count(kvm, addr);
+ ret = count & 0xff;
+ break;
+ case RW_STATE_MSB:
+ count = pit_get_count(kvm, addr);
+ ret = (count >> 8) & 0xff;
+ break;
+ case RW_STATE_WORD0:
+ count = pit_get_count(kvm, addr);
+ ret = count & 0xff;
+ s->read_state = RW_STATE_WORD1;
+ break;
+ case RW_STATE_WORD1:
+ count = pit_get_count(kvm, addr);
+ ret = (count >> 8) & 0xff;
+ s->read_state = RW_STATE_WORD0;
+ break;
+ }
+ }
+
+ if (len > sizeof(ret))
+ len = sizeof(ret);
+ memcpy(data, (char *)&ret, len);
+
+ mutex_unlock(&pit_state->lock);
+}
+
+static int pit_in_range(struct kvm_io_device *this, gpa_t addr)
+{
+ return ((addr >= KVM_PIT_BASE_ADDRESS) &&
+ (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
+}
+
+static void speaker_ioport_write(struct kvm_io_device *this,
+ gpa_t addr, int len, const void *data)
+{
+ struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ u32 val = *(u32 *) data;
+
+ mutex_lock(&pit_state->lock);
+ pit_state->speaker_data_on = (val >> 1) & 1;
+ pit_set_gate(kvm, 2, val & 1);
+ mutex_unlock(&pit_state->lock);
+}
+
+static void speaker_ioport_read(struct kvm_io_device *this,
+ gpa_t addr, int len, void *data)
+{
+ struct kvm_pit *pit = (struct kvm_pit *)this->private;
+ struct kvm_kpit_state *pit_state = &pit->pit_state;
+ struct kvm *kvm = pit->kvm;
+ unsigned int refresh_clock;
+ int ret;
+
+ /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */
+ refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1;
+
+ mutex_lock(&pit_state->lock);
+ ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) |
+ (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4));
+ if (len > sizeof(ret))
+ len = sizeof(ret);
+ memcpy(data, (char *)&ret, len);
+ mutex_unlock(&pit_state->lock);
+}
+
+static int speaker_in_range(struct kvm_io_device *this, gpa_t addr)
+{
+ return (addr == KVM_SPEAKER_BASE_ADDRESS);
+}
+
+void kvm_pit_reset(struct kvm_pit *pit)
+{
+ int i;
+ struct kvm_kpit_channel_state *c;
+
+ mutex_lock(&pit->pit_state.lock);
+ for (i = 0; i < 3; i++) {
+ c = &pit->pit_state.channels[i];
+ c->mode = 0xff;
+ c->gate = (i != 2);
+ pit_load_count(pit->kvm, i, 0);
+ }
+ mutex_unlock(&pit->pit_state.lock);
+
+ atomic_set(&pit->pit_state.pit_timer.pending, 0);
+ pit->pit_state.inject_pending = 1;
+}
+
+struct kvm_pit *kvm_create_pit(struct kvm *kvm)
+{
+ struct kvm_pit *pit;
+ struct kvm_kpit_state *pit_state;
+
+ pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
+ if (!pit)
+ return NULL;
+
+ mutex_init(&pit->pit_state.lock);
+ mutex_lock(&pit->pit_state.lock);
+
+ /* Initialize PIO device */
+ pit->dev.read = pit_ioport_read;
+ pit->dev.write = pit_ioport_write;
+ pit->dev.in_range = pit_in_range;
+ pit->dev.private = pit;
+ kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev);
+
+ pit->speaker_dev.read = speaker_ioport_read;
+ pit->speaker_dev.write = speaker_ioport_write;
+ pit->speaker_dev.in_range = speaker_in_range;
+ pit->speaker_dev.private = pit;
+ kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev);
+
+ kvm->arch.vpit = pit;
+ pit->kvm = kvm;
+
+ pit_state = &pit->pit_state;
+ pit_state->pit = pit;
+ hrtimer_init(&pit_state->pit_timer.timer,
+ CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ mutex_unlock(&pit->pit_state.lock);
+
+ kvm_pit_reset(pit);
+
+ return pit;
+}
+
+void kvm_free_pit(struct kvm *kvm)
+{
+ struct hrtimer *timer;
+
+ if (kvm->arch.vpit) {
+ mutex_lock(&kvm->arch.vpit->pit_state.lock);
+ timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
+ hrtimer_cancel(timer);
+ mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+ kfree(kvm->arch.vpit);
+ }
+}
+
+void __inject_pit_timer_intr(struct kvm *kvm)
+{
+ mutex_lock(&kvm->lock);
+ kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1);
+ kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0);
+ kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
+ kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
+ mutex_unlock(&kvm->lock);
+}
+
+void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
+{
+ struct kvm_pit *pit = vcpu->kvm->arch.vpit;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_kpit_state *ps;
+
+ if (vcpu && pit) {
+ ps = &pit->pit_state;
+
+ /* Try to inject pending interrupts when:
+ * 1. Pending exists
+ * 2. Last interrupt was accepted or waited for too long time*/
+ if (atomic_read(&ps->pit_timer.pending) &&
+ (ps->inject_pending ||
+ (jiffies - ps->last_injected_time
+ >= KVM_MAX_PIT_INTR_INTERVAL))) {
+ ps->inject_pending = 0;
+ __inject_pit_timer_intr(kvm);
+ ps->last_injected_time = jiffies;
+ }
+ }
+}
+
+void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
+{
+ struct kvm_arch *arch = &vcpu->kvm->arch;
+ struct kvm_kpit_state *ps;
+
+ if (vcpu && arch->vpit) {
+ ps = &arch->vpit->pit_state;
+ if (atomic_read(&ps->pit_timer.pending) &&
+ (((arch->vpic->pics[0].imr & 1) == 0 &&
+ arch->vpic->pics[0].irq_base == vec) ||
+ (arch->vioapic->redirtbl[0].fields.vector == vec &&
+ arch->vioapic->redirtbl[0].fields.mask != 1))) {
+ ps->inject_pending = 1;
+ atomic_dec(&ps->pit_timer.pending);
+ ps->channels[0].count_load_time = ktime_get();
+ }
+ }
+}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
new file mode 100644
index 000000000000..db25c2a6c8c4
--- /dev/null
+++ b/arch/x86/kvm/i8254.h
@@ -0,0 +1,63 @@
+#ifndef __I8254_H
+#define __I8254_H
+
+#include "iodev.h"
+
+struct kvm_kpit_timer {
+ struct hrtimer timer;
+ int irq;
+ s64 period; /* unit: ns */
+ s64 scheduled;
+ ktime_t last_update;
+ atomic_t pending;
+};
+
+struct kvm_kpit_channel_state {
+ u32 count; /* can be 65536 */
+ u16 latched_count;
+ u8 count_latched;
+ u8 status_latched;
+ u8 status;
+ u8 read_state;
+ u8 write_state;
+ u8 write_latch;
+ u8 rw_mode;
+ u8 mode;
+ u8 bcd; /* not supported */
+ u8 gate; /* timer start */
+ ktime_t count_load_time;
+};
+
+struct kvm_kpit_state {
+ struct kvm_kpit_channel_state channels[3];
+ struct kvm_kpit_timer pit_timer;
+ u32 speaker_data_on;
+ struct mutex lock;
+ struct kvm_pit *pit;
+ bool inject_pending; /* if inject pending interrupts */
+ unsigned long last_injected_time;
+};
+
+struct kvm_pit {
+ unsigned long base_addresss;
+ struct kvm_io_device dev;
+ struct kvm_io_device speaker_dev;
+ struct kvm *kvm;
+ struct kvm_kpit_state pit_state;
+};
+
+#define KVM_PIT_BASE_ADDRESS 0x40
+#define KVM_SPEAKER_BASE_ADDRESS 0x61
+#define KVM_PIT_MEM_LENGTH 4
+#define KVM_PIT_FREQ 1193181
+#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
+#define KVM_PIT_CHANNEL_MASK 0x3
+
+void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
+void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
+void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
+struct kvm_pit *kvm_create_pit(struct kvm *kvm);
+void kvm_free_pit(struct kvm *kvm);
+void kvm_pit_reset(struct kvm_pit *pit);
+
+#endif
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index e5714759e97f..ce1f583459b1 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -23,6 +23,22 @@
#include <linux/kvm_host.h>
#include "irq.h"
+#include "i8254.h"
+
+/*
+ * check if there are pending timer events
+ * to be processed.
+ */
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ ret = pit_has_pending_timer(vcpu);
+ ret |= apic_has_pending_timer(vcpu);
+
+ return ret;
+}
+EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
/*
* check if there is pending interrupt without
@@ -66,6 +82,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
kvm_inject_apic_timer_irqs(vcpu);
+ kvm_inject_pit_timer_irqs(vcpu);
/* TODO: PIT, RTC etc. */
}
EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
@@ -73,6 +90,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
{
kvm_apic_timer_intr_post(vcpu, vec);
+ kvm_pit_timer_intr_post(vcpu, vec);
/* TODO: PIT, RTC etc. */
}
EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index fa5ed5d59b5d..1802134b836f 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -85,4 +85,7 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
+int pit_has_pending_timer(struct kvm_vcpu *vcpu);
+int apic_has_pending_timer(struct kvm_vcpu *vcpu);
+
#endif
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index ecdfe97e4635..65ef0fc2c036 100644
--- a/arch/x86/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -39,6 +39,8 @@ struct vcpu_svm {
unsigned long host_db_regs[NUM_DB_REGS];
unsigned long host_dr6;
unsigned long host_dr7;
+
+ u32 *msrpm;
};
#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 68a6b1511934..57ac4e4c556a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -338,10 +338,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
} else
apic_clear_vector(vector, apic->regs + APIC_TMR);
- if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
+ if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
kvm_vcpu_kick(vcpu);
- else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) {
- vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+ else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);
}
@@ -362,11 +362,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
case APIC_DM_INIT:
if (level) {
- if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE)
+ if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
printk(KERN_DEBUG
"INIT on a runnable vcpu %d\n",
vcpu->vcpu_id);
- vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED;
+ vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
kvm_vcpu_kick(vcpu);
} else {
printk(KERN_DEBUG
@@ -379,9 +379,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
case APIC_DM_STARTUP:
printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n",
vcpu->vcpu_id, vector);
- if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) {
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
vcpu->arch.sipi_vector = vector;
- vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED;
+ vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
if (waitqueue_active(&vcpu->wq))
wake_up_interruptible(&vcpu->wq);
}
@@ -658,7 +658,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
PRIx64 ", "
"timer initial count 0x%x, period %lldns, "
- "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__,
+ "expire @ 0x%016" PRIx64 ".\n", __func__,
APIC_BUS_CYCLE_NS, ktime_to_ns(now),
apic_get_reg(apic, APIC_TMICT),
apic->timer.period,
@@ -691,7 +691,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
/* too common printing */
if (offset != APIC_EOI)
apic_debug("%s: offset 0x%x with length 0x%x, and value is "
- "0x%x\n", __FUNCTION__, offset, len, val);
+ "0x%x\n", __func__, offset, len, val);
offset &= 0xff0;
@@ -822,6 +822,7 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
| (apic_get_reg(apic, APIC_TASKPRI) & 4));
}
+EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr);
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
{
@@ -869,7 +870,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic;
int i;
- apic_debug("%s\n", __FUNCTION__);
+ apic_debug("%s\n", __func__);
ASSERT(vcpu);
apic = vcpu->arch.apic;
@@ -907,7 +908,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
apic_update_ppr(apic);
apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
- "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__,
+ "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
vcpu, kvm_apic_id(apic),
vcpu->arch.apic_base, apic->base_address);
}
@@ -940,7 +941,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
atomic_inc(&apic->timer.pending);
if (waitqueue_active(q)) {
- apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+ apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
wake_up_interruptible(q);
}
if (apic_lvtt_period(apic)) {
@@ -952,6 +953,16 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
return result;
}
+int apic_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *lapic = vcpu->arch.apic;
+
+ if (lapic)
+ return atomic_read(&lapic->timer.pending);
+
+ return 0;
+}
+
static int __inject_apic_timer_irq(struct kvm_lapic *apic)
{
int vector;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e55af12e11b7..2ad6f5481671 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -27,11 +27,22 @@
#include <linux/highmem.h>
#include <linux/module.h>
#include <linux/swap.h>
+#include <linux/hugetlb.h>
+#include <linux/compiler.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
#include <asm/io.h>
+/*
+ * When setting this variable to true it enables Two-Dimensional-Paging
+ * where the hardware walks 2 page tables:
+ * 1. the guest-virtual to guest-physical
+ * 2. while doing 1. it walks guest-physical to host-physical
+ * If the hardware supports that we don't need to do shadow paging.
+ */
+bool tdp_enabled = false;
+
#undef MMU_DEBUG
#undef AUDIT
@@ -101,8 +112,6 @@ static int dbg = 1;
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
-#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
-
#define VALID_PAGE(x) ((x) != INVALID_PAGE)
#define PT64_LEVEL_BITS 9
@@ -159,6 +168,13 @@ static int dbg = 1;
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+struct kvm_pv_mmu_op_buffer {
+ void *ptr;
+ unsigned len;
+ unsigned processed;
+ char buf[512] __aligned(sizeof(long));
+};
+
struct kvm_rmap_desc {
u64 *shadow_ptes[RMAP_EXT];
struct kvm_rmap_desc *more;
@@ -200,11 +216,15 @@ static int is_present_pte(unsigned long pte)
static int is_shadow_present_pte(u64 pte)
{
- pte &= ~PT_SHADOW_IO_MARK;
return pte != shadow_trap_nonpresent_pte
&& pte != shadow_notrap_nonpresent_pte;
}
+static int is_large_pte(u64 pte)
+{
+ return pte & PT_PAGE_SIZE_MASK;
+}
+
static int is_writeble_pte(unsigned long pte)
{
return pte & PT_WRITABLE_MASK;
@@ -215,14 +235,14 @@ static int is_dirty_pte(unsigned long pte)
return pte & PT_DIRTY_MASK;
}
-static int is_io_pte(unsigned long pte)
+static int is_rmap_pte(u64 pte)
{
- return pte & PT_SHADOW_IO_MARK;
+ return is_shadow_present_pte(pte);
}
-static int is_rmap_pte(u64 pte)
+static pfn_t spte_to_pfn(u64 pte)
{
- return is_shadow_present_pte(pte);
+ return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
}
static gfn_t pse36_gfn_delta(u32 gpte)
@@ -349,16 +369,100 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
}
/*
+ * Return the pointer to the largepage write count for a given
+ * gfn, handling slots that are not large page aligned.
+ */
+static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot)
+{
+ unsigned long idx;
+
+ idx = (gfn / KVM_PAGES_PER_HPAGE) -
+ (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+ return &slot->lpage_info[idx].write_count;
+}
+
+static void account_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+ int *write_count;
+
+ write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+ *write_count += 1;
+ WARN_ON(*write_count > KVM_PAGES_PER_HPAGE);
+}
+
+static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
+{
+ int *write_count;
+
+ write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+ *write_count -= 1;
+ WARN_ON(*write_count < 0);
+}
+
+static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+ int *largepage_idx;
+
+ if (slot) {
+ largepage_idx = slot_largepage_idx(gfn, slot);
+ return *largepage_idx;
+ }
+
+ return 1;
+}
+
+static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
+{
+ struct vm_area_struct *vma;
+ unsigned long addr;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr))
+ return 0;
+
+ vma = find_vma(current->mm, addr);
+ if (vma && is_vm_hugetlb_page(vma))
+ return 1;
+
+ return 0;
+}
+
+static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+ struct kvm_memory_slot *slot;
+
+ if (has_wrprotected_page(vcpu->kvm, large_gfn))
+ return 0;
+
+ if (!host_largepage_backed(vcpu->kvm, large_gfn))
+ return 0;
+
+ slot = gfn_to_memslot(vcpu->kvm, large_gfn);
+ if (slot && slot->dirty_bitmap)
+ return 0;
+
+ return 1;
+}
+
+/*
* Take gfn and return the reverse mapping to it.
* Note: gfn must be unaliased before this function get called
*/
-static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
+static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage)
{
struct kvm_memory_slot *slot;
+ unsigned long idx;
slot = gfn_to_memslot(kvm, gfn);
- return &slot->rmap[gfn - slot->base_gfn];
+ if (!lpage)
+ return &slot->rmap[gfn - slot->base_gfn];
+
+ idx = (gfn / KVM_PAGES_PER_HPAGE) -
+ (slot->base_gfn / KVM_PAGES_PER_HPAGE);
+
+ return &slot->lpage_info[idx].rmap_pde;
}
/*
@@ -370,7 +474,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn)
* If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc
* containing more mappings.
*/
-static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage)
{
struct kvm_mmu_page *sp;
struct kvm_rmap_desc *desc;
@@ -382,7 +486,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
gfn = unalias_gfn(vcpu->kvm, gfn);
sp = page_header(__pa(spte));
sp->gfns[spte - sp->spt] = gfn;
- rmapp = gfn_to_rmap(vcpu->kvm, gfn);
+ rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage);
if (!*rmapp) {
rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
*rmapp = (unsigned long)spte;
@@ -435,20 +539,21 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
struct kvm_rmap_desc *desc;
struct kvm_rmap_desc *prev_desc;
struct kvm_mmu_page *sp;
- struct page *page;
+ pfn_t pfn;
unsigned long *rmapp;
int i;
if (!is_rmap_pte(*spte))
return;
sp = page_header(__pa(spte));
- page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
- mark_page_accessed(page);
+ pfn = spte_to_pfn(*spte);
+ if (*spte & PT_ACCESSED_MASK)
+ kvm_set_pfn_accessed(pfn);
if (is_writeble_pte(*spte))
- kvm_release_page_dirty(page);
+ kvm_release_pfn_dirty(pfn);
else
- kvm_release_page_clean(page);
- rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]);
+ kvm_release_pfn_clean(pfn);
+ rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte));
if (!*rmapp) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
BUG();
@@ -514,7 +619,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
int write_protected = 0;
gfn = unalias_gfn(kvm, gfn);
- rmapp = gfn_to_rmap(kvm, gfn);
+ rmapp = gfn_to_rmap(kvm, gfn, 0);
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
@@ -527,8 +632,35 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
}
spte = rmap_next(kvm, rmapp, spte);
}
+ if (write_protected) {
+ pfn_t pfn;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ pfn = spte_to_pfn(*spte);
+ kvm_set_pfn_dirty(pfn);
+ }
+
+ /* check for huge page mappings */
+ rmapp = gfn_to_rmap(kvm, gfn, 1);
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ BUG_ON(!spte);
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
+ pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
+ if (is_writeble_pte(*spte)) {
+ rmap_remove(kvm, spte);
+ --kvm->stat.lpages;
+ set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ write_protected = 1;
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+
if (write_protected)
kvm_flush_remote_tlbs(kvm);
+
+ account_shadowed(kvm, gfn);
}
#ifdef MMU_DEBUG
@@ -538,8 +670,8 @@ static int is_empty_shadow_page(u64 *spt)
u64 *end;
for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
- if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) {
- printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
+ if (*pos != shadow_trap_nonpresent_pte) {
+ printk(KERN_ERR "%s: %p %llx\n", __func__,
pos, *pos);
return 0;
}
@@ -559,7 +691,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
- return gfn;
+ return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
}
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
@@ -662,13 +794,14 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
struct kvm_mmu_page *sp;
struct hlist_node *node;
- pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
+ index = kvm_page_table_hashfn(gfn);
bucket = &kvm->arch.mmu_page_hash[index];
hlist_for_each_entry(sp, node, bucket, hash_link)
- if (sp->gfn == gfn && !sp->role.metaphysical) {
+ if (sp->gfn == gfn && !sp->role.metaphysical
+ && !sp->role.invalid) {
pgprintk("%s: found role %x\n",
- __FUNCTION__, sp->role.word);
+ __func__, sp->role.word);
return sp;
}
return NULL;
@@ -699,27 +832,27 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
+ pgprintk("%s: looking gfn %lx role %x\n", __func__,
gfn, role.word);
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry(sp, node, bucket, hash_link)
if (sp->gfn == gfn && sp->role.word == role.word) {
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
- pgprintk("%s: found\n", __FUNCTION__);
+ pgprintk("%s: found\n", __func__);
return sp;
}
++vcpu->kvm->stat.mmu_cache_miss;
sp = kvm_mmu_alloc_page(vcpu, parent_pte);
if (!sp)
return sp;
- pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
+ pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link, bucket);
- vcpu->arch.mmu.prefetch_page(vcpu, sp);
if (!metaphysical)
rmap_write_protect(vcpu->kvm, gfn);
+ vcpu->arch.mmu.prefetch_page(vcpu, sp);
return sp;
}
@@ -745,11 +878,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
ent = pt[i];
+ if (is_shadow_present_pte(ent)) {
+ if (!is_large_pte(ent)) {
+ ent &= PT64_BASE_ADDR_MASK;
+ mmu_page_remove_parent_pte(page_header(ent),
+ &pt[i]);
+ } else {
+ --kvm->stat.lpages;
+ rmap_remove(kvm, &pt[i]);
+ }
+ }
pt[i] = shadow_trap_nonpresent_pte;
- if (!is_shadow_present_pte(ent))
- continue;
- ent &= PT64_BASE_ADDR_MASK;
- mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
}
kvm_flush_remote_tlbs(kvm);
}
@@ -789,10 +928,15 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
}
kvm_mmu_page_unlink_children(kvm, sp);
if (!sp->root_count) {
+ if (!sp->role.metaphysical)
+ unaccount_shadowed(kvm, sp->gfn);
hlist_del(&sp->hash_link);
kvm_mmu_free_page(kvm, sp);
- } else
+ } else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
+ sp->role.invalid = 1;
+ kvm_reload_remote_mmus(kvm);
+ }
kvm_mmu_reset_last_pte_updated(kvm);
}
@@ -838,13 +982,13 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
struct hlist_node *node, *n;
int r;
- pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+ pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
r = 0;
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ index = kvm_page_table_hashfn(gfn);
bucket = &kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
if (sp->gfn == gfn && !sp->role.metaphysical) {
- pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
+ pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
sp->role.word);
kvm_mmu_zap_page(kvm, sp);
r = 1;
@@ -857,7 +1001,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
struct kvm_mmu_page *sp;
while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) {
- pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word);
+ pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word);
kvm_mmu_zap_page(kvm, sp);
}
}
@@ -889,26 +1033,39 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault, int dirty,
- int *ptwrite, gfn_t gfn, struct page *page)
+ int *ptwrite, int largepage, gfn_t gfn,
+ pfn_t pfn, bool speculative)
{
u64 spte;
int was_rmapped = 0;
int was_writeble = is_writeble_pte(*shadow_pte);
- hfn_t host_pfn = (*shadow_pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
pgprintk("%s: spte %llx access %x write_fault %d"
" user_fault %d gfn %lx\n",
- __FUNCTION__, *shadow_pte, pt_access,
+ __func__, *shadow_pte, pt_access,
write_fault, user_fault, gfn);
if (is_rmap_pte(*shadow_pte)) {
- if (host_pfn != page_to_pfn(page)) {
+ /*
+ * If we overwrite a PTE page pointer with a 2MB PMD, unlink
+ * the parent of the now unreachable PTE.
+ */
+ if (largepage && !is_large_pte(*shadow_pte)) {
+ struct kvm_mmu_page *child;
+ u64 pte = *shadow_pte;
+
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ mmu_page_remove_parent_pte(child, shadow_pte);
+ } else if (pfn != spte_to_pfn(*shadow_pte)) {
pgprintk("hfn old %lx new %lx\n",
- host_pfn, page_to_pfn(page));
+ spte_to_pfn(*shadow_pte), pfn);
rmap_remove(vcpu->kvm, shadow_pte);
+ } else {
+ if (largepage)
+ was_rmapped = is_large_pte(*shadow_pte);
+ else
+ was_rmapped = 1;
}
- else
- was_rmapped = 1;
}
/*
@@ -917,6 +1074,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
* demand paging).
*/
spte = PT_PRESENT_MASK | PT_DIRTY_MASK;
+ if (!speculative)
+ pte_access |= PT_ACCESSED_MASK;
if (!dirty)
pte_access &= ~ACC_WRITE_MASK;
if (!(pte_access & ACC_EXEC_MASK))
@@ -925,15 +1084,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
spte |= PT_PRESENT_MASK;
if (pte_access & ACC_USER_MASK)
spte |= PT_USER_MASK;
+ if (largepage)
+ spte |= PT_PAGE_SIZE_MASK;
- if (is_error_page(page)) {
- set_shadow_pte(shadow_pte,
- shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
- kvm_release_page_clean(page);
- return;
- }
-
- spte |= page_to_phys(page);
+ spte |= (u64)pfn << PAGE_SHIFT;
if ((pte_access & ACC_WRITE_MASK)
|| (write_fault && !is_write_protection(vcpu) && !user_fault)) {
@@ -946,9 +1100,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
}
shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
- if (shadow) {
+ if (shadow ||
+ (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
- __FUNCTION__, gfn);
+ __func__, gfn);
pte_access &= ~ACC_WRITE_MASK;
if (is_writeble_pte(spte)) {
spte &= ~PT_WRITABLE_MASK;
@@ -964,18 +1119,25 @@ unshadowed:
if (pte_access & ACC_WRITE_MASK)
mark_page_dirty(vcpu->kvm, gfn);
- pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
+ pgprintk("%s: setting spte %llx\n", __func__, spte);
+ pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n",
+ (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
+ (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
set_shadow_pte(shadow_pte, spte);
+ if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK)
+ && (spte & PT_PRESENT_MASK))
+ ++vcpu->kvm->stat.lpages;
+
page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
if (!was_rmapped) {
- rmap_add(vcpu, shadow_pte, gfn);
+ rmap_add(vcpu, shadow_pte, gfn, largepage);
if (!is_rmap_pte(*shadow_pte))
- kvm_release_page_clean(page);
+ kvm_release_pfn_clean(pfn);
} else {
if (was_writeble)
- kvm_release_page_dirty(page);
+ kvm_release_pfn_dirty(pfn);
else
- kvm_release_page_clean(page);
+ kvm_release_pfn_clean(pfn);
}
if (!ptwrite || !*ptwrite)
vcpu->arch.last_pte_updated = shadow_pte;
@@ -985,10 +1147,10 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
-static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
- gfn_t gfn, struct page *page)
+static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
+ int largepage, gfn_t gfn, pfn_t pfn,
+ int level)
{
- int level = PT32E_ROOT_LEVEL;
hpa_t table_addr = vcpu->arch.mmu.root_hpa;
int pt_write = 0;
@@ -1001,8 +1163,14 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
if (level == 1) {
mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
- 0, write, 1, &pt_write, gfn, page);
- return pt_write || is_io_pte(table[index]);
+ 0, write, 1, &pt_write, 0, gfn, pfn, false);
+ return pt_write;
+ }
+
+ if (largepage && level == 2) {
+ mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL,
+ 0, write, 1, &pt_write, 1, gfn, pfn, false);
+ return pt_write;
}
if (table[index] == shadow_trap_nonpresent_pte) {
@@ -1016,7 +1184,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
1, ACC_ALL, &table[index]);
if (!new_table) {
pgprintk("nonpaging_map: ENOMEM\n");
- kvm_release_page_clean(page);
+ kvm_release_pfn_clean(pfn);
return -ENOMEM;
}
@@ -1030,21 +1198,30 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write,
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
{
int r;
-
- struct page *page;
-
- down_read(&vcpu->kvm->slots_lock);
+ int largepage = 0;
+ pfn_t pfn;
down_read(&current->mm->mmap_sem);
- page = gfn_to_page(vcpu->kvm, gfn);
+ if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
+ gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+ largepage = 1;
+ }
+
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(&current->mm->mmap_sem);
+ /* mmio */
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return 1;
+ }
+
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
- r = __nonpaging_map(vcpu, v, write, gfn, page);
+ r = __direct_map(vcpu, v, write, largepage, gfn, pfn,
+ PT32E_ROOT_LEVEL);
spin_unlock(&vcpu->kvm->mmu_lock);
- up_read(&vcpu->kvm->slots_lock);
return r;
}
@@ -1073,6 +1250,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
sp = page_header(root);
--sp->root_count;
+ if (!sp->root_count && sp->role.invalid)
+ kvm_mmu_zap_page(vcpu->kvm, sp);
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
spin_unlock(&vcpu->kvm->mmu_lock);
return;
@@ -1085,6 +1264,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
root &= PT64_BASE_ADDR_MASK;
sp = page_header(root);
--sp->root_count;
+ if (!sp->root_count && sp->role.invalid)
+ kvm_mmu_zap_page(vcpu->kvm, sp);
}
vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
}
@@ -1097,6 +1278,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
int i;
gfn_t root_gfn;
struct kvm_mmu_page *sp;
+ int metaphysical = 0;
root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
@@ -1105,14 +1287,20 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->arch.mmu.root_hpa;
ASSERT(!VALID_PAGE(root));
+ if (tdp_enabled)
+ metaphysical = 1;
sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
- PT64_ROOT_LEVEL, 0, ACC_ALL, NULL);
+ PT64_ROOT_LEVEL, metaphysical,
+ ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
vcpu->arch.mmu.root_hpa = root;
return;
}
#endif
+ metaphysical = !is_paging(vcpu);
+ if (tdp_enabled)
+ metaphysical = 1;
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];
@@ -1126,7 +1314,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
} else if (vcpu->arch.mmu.root_level == 0)
root_gfn = 0;
sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
- PT32_ROOT_LEVEL, !is_paging(vcpu),
+ PT32_ROOT_LEVEL, metaphysical,
ACC_ALL, NULL);
root = __pa(sp->spt);
++sp->root_count;
@@ -1146,7 +1334,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
gfn_t gfn;
int r;
- pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code);
+ pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
@@ -1160,6 +1348,41 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
error_code & PFERR_WRITE_MASK, gfn);
}
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
+ u32 error_code)
+{
+ pfn_t pfn;
+ int r;
+ int largepage = 0;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+
+ ASSERT(vcpu);
+ ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ down_read(&current->mm->mmap_sem);
+ if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
+ gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+ largepage = 1;
+ }
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
+ up_read(&current->mm->mmap_sem);
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return 1;
+ }
+ spin_lock(&vcpu->kvm->mmu_lock);
+ kvm_mmu_free_some_pages(vcpu);
+ r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
+ largepage, gfn, pfn, TDP_ROOT_LEVEL);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+
+ return r;
+}
+
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
mmu_free_roots(vcpu);
@@ -1188,7 +1411,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
- pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3);
+ pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
mmu_free_roots(vcpu);
}
@@ -1253,7 +1476,35 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu)
return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
}
-static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu *context = &vcpu->arch.mmu;
+
+ context->new_cr3 = nonpaging_new_cr3;
+ context->page_fault = tdp_page_fault;
+ context->free = nonpaging_free;
+ context->prefetch_page = nonpaging_prefetch_page;
+ context->shadow_root_level = TDP_ROOT_LEVEL;
+ context->root_hpa = INVALID_PAGE;
+
+ if (!is_paging(vcpu)) {
+ context->gva_to_gpa = nonpaging_gva_to_gpa;
+ context->root_level = 0;
+ } else if (is_long_mode(vcpu)) {
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->root_level = PT64_ROOT_LEVEL;
+ } else if (is_pae(vcpu)) {
+ context->gva_to_gpa = paging64_gva_to_gpa;
+ context->root_level = PT32E_ROOT_LEVEL;
+ } else {
+ context->gva_to_gpa = paging32_gva_to_gpa;
+ context->root_level = PT32_ROOT_LEVEL;
+ }
+
+ return 0;
+}
+
+static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1268,6 +1519,16 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
return paging32_init_context(vcpu);
}
+static int init_kvm_mmu(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.update_pte.pfn = bad_pfn;
+
+ if (tdp_enabled)
+ return init_kvm_tdp_mmu(vcpu);
+ else
+ return init_kvm_softmmu(vcpu);
+}
+
static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
@@ -1316,7 +1577,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
pte = *spte;
if (is_shadow_present_pte(pte)) {
- if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+ if (sp->role.level == PT_PAGE_TABLE_LEVEL ||
+ is_large_pte(pte))
rmap_remove(vcpu->kvm, spte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
@@ -1324,24 +1586,26 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
}
}
set_shadow_pte(spte, shadow_trap_nonpresent_pte);
+ if (is_large_pte(pte))
+ --vcpu->kvm->stat.lpages;
}
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
u64 *spte,
- const void *new, int bytes,
- int offset_in_pte)
+ const void *new)
{
- if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
+ if ((sp->role.level != PT_PAGE_TABLE_LEVEL)
+ && !vcpu->arch.update_pte.largepage) {
++vcpu->kvm->stat.mmu_pde_zapped;
return;
}
++vcpu->kvm->stat.mmu_pte_updated;
if (sp->role.glevels == PT32_ROOT_LEVEL)
- paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+ paging32_update_pte(vcpu, sp, spte, new);
else
- paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte);
+ paging64_update_pte(vcpu, sp, spte, new);
}
static bool need_remote_flush(u64 old, u64 new)
@@ -1378,7 +1642,9 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
gfn_t gfn;
int r;
u64 gpte = 0;
- struct page *page;
+ pfn_t pfn;
+
+ vcpu->arch.update_pte.largepage = 0;
if (bytes != 4 && bytes != 8)
return;
@@ -1408,11 +1674,19 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
down_read(&current->mm->mmap_sem);
- page = gfn_to_page(vcpu->kvm, gfn);
+ if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
+ gfn &= ~(KVM_PAGES_PER_HPAGE-1);
+ vcpu->arch.update_pte.largepage = 1;
+ }
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
up_read(&current->mm->mmap_sem);
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return;
+ }
vcpu->arch.update_pte.gfn = gfn;
- vcpu->arch.update_pte.page = page;
+ vcpu->arch.update_pte.pfn = pfn;
}
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -1423,7 +1697,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
struct hlist_node *node, *n;
struct hlist_head *bucket;
unsigned index;
- u64 entry;
+ u64 entry, gentry;
u64 *spte;
unsigned offset = offset_in_page(gpa);
unsigned pte_size;
@@ -1433,8 +1707,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
int level;
int flooded = 0;
int npte;
+ int r;
- pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+ pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
@@ -1450,7 +1725,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
vcpu->arch.last_pt_write_count = 1;
vcpu->arch.last_pte_updated = NULL;
}
- index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ index = kvm_page_table_hashfn(gfn);
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
if (sp->gfn != gfn || sp->role.metaphysical)
@@ -1496,20 +1771,29 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
continue;
}
spte = &sp->spt[page_offset / sizeof(*spte)];
+ if ((gpa & (pte_size - 1)) || (bytes < pte_size)) {
+ gentry = 0;
+ r = kvm_read_guest_atomic(vcpu->kvm,
+ gpa & ~(u64)(pte_size - 1),
+ &gentry, pte_size);
+ new = (const void *)&gentry;
+ if (r < 0)
+ new = NULL;
+ }
while (npte--) {
entry = *spte;
mmu_pte_write_zap_pte(vcpu, sp, spte);
- mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes,
- page_offset & (pte_size - 1));
+ if (new)
+ mmu_pte_write_new_pte(vcpu, sp, spte, new);
mmu_pte_write_flush_tlb(vcpu, entry, *spte);
++spte;
}
}
kvm_mmu_audit(vcpu, "post pte write");
spin_unlock(&vcpu->kvm->mmu_lock);
- if (vcpu->arch.update_pte.page) {
- kvm_release_page_clean(vcpu->arch.update_pte.page);
- vcpu->arch.update_pte.page = NULL;
+ if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
+ kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
+ vcpu->arch.update_pte.pfn = bad_pfn;
}
}
@@ -1518,9 +1802,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
gpa_t gpa;
int r;
- down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva);
- up_read(&vcpu->kvm->slots_lock);
spin_lock(&vcpu->kvm->mmu_lock);
r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
@@ -1577,6 +1859,12 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
+void kvm_enable_tdp(void)
+{
+ tdp_enabled = true;
+}
+EXPORT_SYMBOL_GPL(kvm_enable_tdp);
+
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
struct kvm_mmu_page *sp;
@@ -1677,7 +1965,53 @@ void kvm_mmu_zap_all(struct kvm *kvm)
kvm_flush_remote_tlbs(kvm);
}
-void kvm_mmu_module_exit(void)
+void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
+{
+ struct kvm_mmu_page *page;
+
+ page = container_of(kvm->arch.active_mmu_pages.prev,
+ struct kvm_mmu_page, link);
+ kvm_mmu_zap_page(kvm, page);
+}
+
+static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
+{
+ struct kvm *kvm;
+ struct kvm *kvm_freed = NULL;
+ int cache_count = 0;
+
+ spin_lock(&kvm_lock);
+
+ list_for_each_entry(kvm, &vm_list, vm_list) {
+ int npages;
+
+ spin_lock(&kvm->mmu_lock);
+ npages = kvm->arch.n_alloc_mmu_pages -
+ kvm->arch.n_free_mmu_pages;
+ cache_count += npages;
+ if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
+ kvm_mmu_remove_one_alloc_mmu_page(kvm);
+ cache_count--;
+ kvm_freed = kvm;
+ }
+ nr_to_scan--;
+
+ spin_unlock(&kvm->mmu_lock);
+ }
+ if (kvm_freed)
+ list_move_tail(&kvm_freed->vm_list, &vm_list);
+
+ spin_unlock(&kvm_lock);
+
+ return cache_count;
+}
+
+static struct shrinker mmu_shrinker = {
+ .shrink = mmu_shrink,
+ .seeks = DEFAULT_SEEKS * 10,
+};
+
+void mmu_destroy_caches(void)
{
if (pte_chain_cache)
kmem_cache_destroy(pte_chain_cache);
@@ -1687,6 +2021,12 @@ void kvm_mmu_module_exit(void)
kmem_cache_destroy(mmu_page_header_cache);
}
+void kvm_mmu_module_exit(void)
+{
+ mmu_destroy_caches();
+ unregister_shrinker(&mmu_shrinker);
+}
+
int kvm_mmu_module_init(void)
{
pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -1706,10 +2046,12 @@ int kvm_mmu_module_init(void)
if (!mmu_page_header_cache)
goto nomem;
+ register_shrinker(&mmu_shrinker);
+
return 0;
nomem:
- kvm_mmu_module_exit();
+ mmu_destroy_caches();
return -ENOMEM;
}
@@ -1732,6 +2074,127 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
return nr_mmu_pages;
}
+static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer,
+ unsigned len)
+{
+ if (len > buffer->len)
+ return NULL;
+ return buffer->ptr;
+}
+
+static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer,
+ unsigned len)
+{
+ void *ret;
+
+ ret = pv_mmu_peek_buffer(buffer, len);
+ if (!ret)
+ return ret;
+ buffer->ptr += len;
+ buffer->len -= len;
+ buffer->processed += len;
+ return ret;
+}
+
+static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
+ gpa_t addr, gpa_t value)
+{
+ int bytes = 8;
+ int r;
+
+ if (!is_long_mode(vcpu) && !is_pae(vcpu))
+ bytes = 4;
+
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
+
+ if (!emulator_write_phys(vcpu, addr, &value, bytes))
+ return -EFAULT;
+
+ return 1;
+}
+
+static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ kvm_x86_ops->tlb_flush(vcpu);
+ return 1;
+}
+
+static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr)
+{
+ spin_lock(&vcpu->kvm->mmu_lock);
+ mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT);
+ spin_unlock(&vcpu->kvm->mmu_lock);
+ return 1;
+}
+
+static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu,
+ struct kvm_pv_mmu_op_buffer *buffer)
+{
+ struct kvm_mmu_op_header *header;
+
+ header = pv_mmu_peek_buffer(buffer, sizeof *header);
+ if (!header)
+ return 0;
+ switch (header->op) {
+ case KVM_MMU_OP_WRITE_PTE: {
+ struct kvm_mmu_op_write_pte *wpte;
+
+ wpte = pv_mmu_read_buffer(buffer, sizeof *wpte);
+ if (!wpte)
+ return 0;
+ return kvm_pv_mmu_write(vcpu, wpte->pte_phys,
+ wpte->pte_val);
+ }
+ case KVM_MMU_OP_FLUSH_TLB: {
+ struct kvm_mmu_op_flush_tlb *ftlb;
+
+ ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb);
+ if (!ftlb)
+ return 0;
+ return kvm_pv_mmu_flush_tlb(vcpu);
+ }
+ case KVM_MMU_OP_RELEASE_PT: {
+ struct kvm_mmu_op_release_pt *rpt;
+
+ rpt = pv_mmu_read_buffer(buffer, sizeof *rpt);
+ if (!rpt)
+ return 0;
+ return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys);
+ }
+ default: return 0;
+ }
+}
+
+int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
+ gpa_t addr, unsigned long *ret)
+{
+ int r;
+ struct kvm_pv_mmu_op_buffer buffer;
+
+ buffer.ptr = buffer.buf;
+ buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf);
+ buffer.processed = 0;
+
+ r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len);
+ if (r)
+ goto out;
+
+ while (buffer.len) {
+ r = kvm_pv_mmu_op_one(vcpu, &buffer);
+ if (r < 0)
+ goto out;
+ if (r == 0)
+ break;
+ }
+
+ r = 1;
+out:
+ *ret = buffer.processed;
+ return r;
+}
+
#ifdef AUDIT
static const char *audit_msg;
@@ -1768,8 +2231,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
audit_mappings_page(vcpu, ent, va, level - 1);
} else {
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
- struct page *page = gpa_to_page(vcpu, gpa);
- hpa_t hpa = page_to_phys(page);
+ hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT;
if (is_shadow_present_pte(ent)
&& (ent & PT64_BASE_ADDR_MASK) != hpa)
@@ -1782,7 +2244,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
&& !is_error_hpa(hpa))
printk(KERN_ERR "audit: (%s) notrap shadow,"
" valid guest gva %lx\n", audit_msg, va);
- kvm_release_page_clean(page);
+ kvm_release_pfn_clean(pfn);
}
}
@@ -1867,7 +2329,7 @@ static void audit_rmap(struct kvm_vcpu *vcpu)
if (n_rmap != n_actual)
printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
- __FUNCTION__, audit_msg, n_rmap, n_actual);
+ __func__, audit_msg, n_rmap, n_actual);
}
static void audit_write_protection(struct kvm_vcpu *vcpu)
@@ -1887,7 +2349,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
if (*rmapp)
printk(KERN_ERR "%s: (%s) shadow page has writable"
" mappings: gfn %lx role %x\n",
- __FUNCTION__, audit_msg, sp->gfn,
+ __func__, audit_msg, sp->gfn,
sp->role.word);
}
}
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 1fce19ec7a23..e64e9f56a65e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -3,6 +3,12 @@
#include <linux/kvm_host.h>
+#ifdef CONFIG_X86_64
+#define TDP_ROOT_LEVEL PT64_ROOT_LEVEL
+#else
+#define TDP_ROOT_LEVEL PT32E_ROOT_LEVEL
+#endif
+
static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
{
if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ecc0856268c4..156fe10288ae 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -130,7 +130,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
unsigned index, pt_access, pte_access;
gpa_t pte_gpa;
- pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
+ pgprintk("%s: addr %lx\n", __func__, addr);
walk:
walker->level = vcpu->arch.mmu.root_level;
pte = vcpu->arch.cr3;
@@ -155,7 +155,7 @@ walk:
pte_gpa += index * sizeof(pt_element_t);
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
- pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
+ pgprintk("%s: table_gfn[%d] %lx\n", __func__,
walker->level - 1, table_gfn);
kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte));
@@ -222,7 +222,7 @@ walk:
walker->pt_access = pt_access;
walker->pte_access = pte_access;
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __FUNCTION__, (u64)pte, pt_access, pte_access);
+ __func__, (u64)pte, pt_access, pte_access);
return 1;
not_present:
@@ -243,31 +243,30 @@ err:
}
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
- u64 *spte, const void *pte, int bytes,
- int offset_in_pte)
+ u64 *spte, const void *pte)
{
pt_element_t gpte;
unsigned pte_access;
- struct page *npage;
+ pfn_t pfn;
+ int largepage = vcpu->arch.update_pte.largepage;
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
- if (!offset_in_pte && !is_present_pte(gpte))
+ if (!is_present_pte(gpte))
set_shadow_pte(spte, shadow_notrap_nonpresent_pte);
return;
}
- if (bytes < sizeof(pt_element_t))
- return;
- pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte);
+ pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
return;
- npage = vcpu->arch.update_pte.page;
- if (!npage)
+ pfn = vcpu->arch.update_pte.pfn;
+ if (is_error_pfn(pfn))
return;
- get_page(npage);
+ kvm_get_pfn(pfn);
mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
- gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage);
+ gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
+ pfn, true);
}
/*
@@ -275,8 +274,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
*/
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *walker,
- int user_fault, int write_fault, int *ptwrite,
- struct page *page)
+ int user_fault, int write_fault, int largepage,
+ int *ptwrite, pfn_t pfn)
{
hpa_t shadow_addr;
int level;
@@ -304,11 +303,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
shadow_ent = ((u64 *)__va(shadow_addr)) + index;
if (level == PT_PAGE_TABLE_LEVEL)
break;
- if (is_shadow_present_pte(*shadow_ent)) {
+
+ if (largepage && level == PT_DIRECTORY_LEVEL)
+ break;
+
+ if (is_shadow_present_pte(*shadow_ent)
+ && !is_large_pte(*shadow_ent)) {
shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK;
continue;
}
+ if (is_large_pte(*shadow_ent))
+ rmap_remove(vcpu->kvm, shadow_ent);
+
if (level - 1 == PT_PAGE_TABLE_LEVEL
&& walker->level == PT_DIRECTORY_LEVEL) {
metaphysical = 1;
@@ -329,7 +336,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
walker->pte_gpa[level - 2],
&curr_pte, sizeof(curr_pte));
if (r || curr_pte != walker->ptes[level - 2]) {
- kvm_release_page_clean(page);
+ kvm_release_pfn_clean(pfn);
return NULL;
}
}
@@ -342,7 +349,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access,
user_fault, write_fault,
walker->ptes[walker->level-1] & PT_DIRTY_MASK,
- ptwrite, walker->gfn, page);
+ ptwrite, largepage, walker->gfn, pfn, false);
return shadow_ent;
}
@@ -371,16 +378,16 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
u64 *shadow_pte;
int write_pt = 0;
int r;
- struct page *page;
+ pfn_t pfn;
+ int largepage = 0;
- pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
+ pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
kvm_mmu_audit(vcpu, "pre page fault");
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
- down_read(&vcpu->kvm->slots_lock);
/*
* Look up the shadow pte for the faulting address.
*/
@@ -391,40 +398,45 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
* The page is not mapped by the guest. Let the guest handle it.
*/
if (!r) {
- pgprintk("%s: guest page fault\n", __FUNCTION__);
+ pgprintk("%s: guest page fault\n", __func__);
inject_page_fault(vcpu, addr, walker.error_code);
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
- up_read(&vcpu->kvm->slots_lock);
return 0;
}
down_read(&current->mm->mmap_sem);
- page = gfn_to_page(vcpu->kvm, walker.gfn);
+ if (walker.level == PT_DIRECTORY_LEVEL) {
+ gfn_t large_gfn;
+ large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
+ if (is_largepage_backed(vcpu, large_gfn)) {
+ walker.gfn = large_gfn;
+ largepage = 1;
+ }
+ }
+ pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
up_read(&current->mm->mmap_sem);
+ /* mmio */
+ if (is_error_pfn(pfn)) {
+ pgprintk("gfn %x is mmio\n", walker.gfn);
+ kvm_release_pfn_clean(pfn);
+ return 1;
+ }
+
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- &write_pt, page);
- pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__,
+ largepage, &write_pt, pfn);
+
+ pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
shadow_pte, *shadow_pte, write_pt);
if (!write_pt)
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
- /*
- * mmio: emulate if accessible, otherwise its a guest fault.
- */
- if (shadow_pte && is_io_pte(*shadow_pte)) {
- spin_unlock(&vcpu->kvm->mmu_lock);
- up_read(&vcpu->kvm->slots_lock);
- return 1;
- }
-
++vcpu->stat.pf_fixed;
kvm_mmu_audit(vcpu, "post page fault (fixed)");
spin_unlock(&vcpu->kvm->mmu_lock);
- up_read(&vcpu->kvm->slots_lock);
return write_pt;
}
diff --git a/arch/x86/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h
deleted file mode 100644
index 56fc4c873389..000000000000
--- a/arch/x86/kvm/segment_descriptor.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef __SEGMENT_DESCRIPTOR_H
-#define __SEGMENT_DESCRIPTOR_H
-
-struct segment_descriptor {
- u16 limit_low;
- u16 base_low;
- u8 base_mid;
- u8 type : 4;
- u8 system : 1;
- u8 dpl : 2;
- u8 present : 1;
- u8 limit_high : 4;
- u8 avl : 1;
- u8 long_mode : 1;
- u8 default_op : 1;
- u8 granularity : 1;
- u8 base_high;
-} __attribute__((packed));
-
-#ifdef CONFIG_X86_64
-/* LDT or TSS descriptor in the GDT. 16 bytes. */
-struct segment_descriptor_64 {
- struct segment_descriptor s;
- u32 base_higher;
- u32 pad_zero;
-};
-
-#endif
-#endif
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1a582f1090e8..89e0be2c10d0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -47,6 +47,18 @@ MODULE_LICENSE("GPL");
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_DEATURE_SVML (1 << 2)
+#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+
+/* enable NPT for AMD64 and X86 with PAE */
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static bool npt_enabled = true;
+#else
+static bool npt_enabled = false;
+#endif
+static int npt = 1;
+
+module_param(npt, int, S_IRUGO);
+
static void kvm_reput_irq(struct vcpu_svm *svm);
static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
@@ -54,8 +66,7 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
return container_of(vcpu, struct vcpu_svm, vcpu);
}
-unsigned long iopm_base;
-unsigned long msrpm_base;
+static unsigned long iopm_base;
struct kvm_ldttss_desc {
u16 limit0;
@@ -182,7 +193,7 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
- if (!(efer & EFER_LMA))
+ if (!npt_enabled && !(efer & EFER_LMA))
efer &= ~EFER_LME;
to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK;
@@ -219,12 +230,12 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
if (!svm->next_rip) {
- printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__);
+ printk(KERN_DEBUG "%s: NOP\n", __func__);
return;
}
if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE)
printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n",
- __FUNCTION__,
+ __func__,
svm->vmcb->save.rip,
svm->next_rip);
@@ -279,11 +290,7 @@ static void svm_hardware_enable(void *garbage)
struct svm_cpu_data *svm_data;
uint64_t efer;
-#ifdef CONFIG_X86_64
- struct desc_ptr gdt_descr;
-#else
struct desc_ptr gdt_descr;
-#endif
struct desc_struct *gdt;
int me = raw_smp_processor_id();
@@ -302,7 +309,6 @@ static void svm_hardware_enable(void *garbage)
svm_data->asid_generation = 1;
svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
svm_data->next_asid = svm_data->max_asid + 1;
- svm_features = cpuid_edx(SVM_CPUID_FUNC);
asm volatile ("sgdt %0" : "=m"(gdt_descr));
gdt = (struct desc_struct *)gdt_descr.address;
@@ -361,12 +367,51 @@ static void set_msr_interception(u32 *msrpm, unsigned msr,
BUG();
}
+static void svm_vcpu_init_msrpm(u32 *msrpm)
+{
+ memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
+
+#ifdef CONFIG_X86_64
+ set_msr_interception(msrpm, MSR_GS_BASE, 1, 1);
+ set_msr_interception(msrpm, MSR_FS_BASE, 1, 1);
+ set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1);
+ set_msr_interception(msrpm, MSR_LSTAR, 1, 1);
+ set_msr_interception(msrpm, MSR_CSTAR, 1, 1);
+ set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1);
+#endif
+ set_msr_interception(msrpm, MSR_K6_STAR, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
+}
+
+static void svm_enable_lbrv(struct vcpu_svm *svm)
+{
+ u32 *msrpm = svm->msrpm;
+
+ svm->vmcb->control.lbr_ctl = 1;
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+}
+
+static void svm_disable_lbrv(struct vcpu_svm *svm)
+{
+ u32 *msrpm = svm->msrpm;
+
+ svm->vmcb->control.lbr_ctl = 0;
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
+ set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
+ set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+}
+
static __init int svm_hardware_setup(void)
{
int cpu;
struct page *iopm_pages;
- struct page *msrpm_pages;
- void *iopm_va, *msrpm_va;
+ void *iopm_va;
int r;
iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER);
@@ -379,41 +424,33 @@ static __init int svm_hardware_setup(void)
clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */
iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+ if (boot_cpu_has(X86_FEATURE_NX))
+ kvm_enable_efer_bits(EFER_NX);
- msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ for_each_online_cpu(cpu) {
+ r = svm_cpu_init(cpu);
+ if (r)
+ goto err;
+ }
- r = -ENOMEM;
- if (!msrpm_pages)
- goto err_1;
+ svm_features = cpuid_edx(SVM_CPUID_FUNC);
- msrpm_va = page_address(msrpm_pages);
- memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER));
- msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT;
+ if (!svm_has(SVM_FEATURE_NPT))
+ npt_enabled = false;
-#ifdef CONFIG_X86_64
- set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1);
- set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1);
- set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1);
- set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1);
- set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1);
- set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1);
-#endif
- set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1);
- set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1);
- set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1);
- set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1);
+ if (npt_enabled && !npt) {
+ printk(KERN_INFO "kvm: Nested Paging disabled\n");
+ npt_enabled = false;
+ }
- for_each_online_cpu(cpu) {
- r = svm_cpu_init(cpu);
- if (r)
- goto err_2;
+ if (npt_enabled) {
+ printk(KERN_INFO "kvm: Nested Paging enabled\n");
+ kvm_enable_tdp();
}
+
return 0;
-err_2:
- __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
- msrpm_base = 0;
-err_1:
+err:
__free_pages(iopm_pages, IOPM_ALLOC_ORDER);
iopm_base = 0;
return r;
@@ -421,9 +458,8 @@ err_1:
static __exit void svm_hardware_unsetup(void)
{
- __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER);
__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
- iopm_base = msrpm_base = 0;
+ iopm_base = 0;
}
static void init_seg(struct vmcb_seg *seg)
@@ -443,15 +479,14 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
seg->base = 0;
}
-static void init_vmcb(struct vmcb *vmcb)
+static void init_vmcb(struct vcpu_svm *svm)
{
- struct vmcb_control_area *control = &vmcb->control;
- struct vmcb_save_area *save = &vmcb->save;
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ struct vmcb_save_area *save = &svm->vmcb->save;
control->intercept_cr_read = INTERCEPT_CR0_MASK |
INTERCEPT_CR3_MASK |
- INTERCEPT_CR4_MASK |
- INTERCEPT_CR8_MASK;
+ INTERCEPT_CR4_MASK;
control->intercept_cr_write = INTERCEPT_CR0_MASK |
INTERCEPT_CR3_MASK |
@@ -471,23 +506,13 @@ static void init_vmcb(struct vmcb *vmcb)
INTERCEPT_DR7_MASK;
control->intercept_exceptions = (1 << PF_VECTOR) |
- (1 << UD_VECTOR);
+ (1 << UD_VECTOR) |
+ (1 << MC_VECTOR);
control->intercept = (1ULL << INTERCEPT_INTR) |
(1ULL << INTERCEPT_NMI) |
(1ULL << INTERCEPT_SMI) |
- /*
- * selective cr0 intercept bug?
- * 0: 0f 22 d8 mov %eax,%cr3
- * 3: 0f 20 c0 mov %cr0,%eax
- * 6: 0d 00 00 00 80 or $0x80000000,%eax
- * b: 0f 22 c0 mov %eax,%cr0
- * set cr3 ->interception
- * get cr0 ->interception
- * set cr0 -> no interception
- */
- /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */
(1ULL << INTERCEPT_CPUID) |
(1ULL << INTERCEPT_INVD) |
(1ULL << INTERCEPT_HLT) |
@@ -508,7 +533,7 @@ static void init_vmcb(struct vmcb *vmcb)
(1ULL << INTERCEPT_MWAIT);
control->iopm_base_pa = iopm_base;
- control->msrpm_base_pa = msrpm_base;
+ control->msrpm_base_pa = __pa(svm->msrpm);
control->tsc_offset = 0;
control->int_ctl = V_INTR_MASKING_MASK;
@@ -550,13 +575,30 @@ static void init_vmcb(struct vmcb *vmcb)
save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP;
save->cr4 = X86_CR4_PAE;
/* rdx = ?? */
+
+ if (npt_enabled) {
+ /* Setup VMCB for Nested Paging */
+ control->nested_ctl = 1;
+ control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH);
+ control->intercept_exceptions &= ~(1 << PF_VECTOR);
+ control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
+ INTERCEPT_CR3_MASK);
+ control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
+ INTERCEPT_CR3_MASK);
+ save->g_pat = 0x0007040600070406ULL;
+ /* enable caching because the QEMU Bios doesn't enable it */
+ save->cr0 = X86_CR0_ET;
+ save->cr3 = 0;
+ save->cr4 = 0;
+ }
+ force_new_asid(&svm->vcpu);
}
static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- init_vmcb(svm->vmcb);
+ init_vmcb(svm);
if (vcpu->vcpu_id != 0) {
svm->vmcb->save.rip = 0;
@@ -571,6 +613,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
{
struct vcpu_svm *svm;
struct page *page;
+ struct page *msrpm_pages;
int err;
svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
@@ -589,12 +632,19 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
goto uninit;
}
+ err = -ENOMEM;
+ msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ if (!msrpm_pages)
+ goto uninit;
+ svm->msrpm = page_address(msrpm_pages);
+ svm_vcpu_init_msrpm(svm->msrpm);
+
svm->vmcb = page_address(page);
clear_page(svm->vmcb);
svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
svm->asid_generation = 0;
memset(svm->db_regs, 0, sizeof(svm->db_regs));
- init_vmcb(svm->vmcb);
+ init_vmcb(svm);
fx_init(&svm->vcpu);
svm->vcpu.fpu_active = 1;
@@ -617,6 +667,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
__free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
+ __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
kvm_vcpu_uninit(vcpu);
kmem_cache_free(kvm_vcpu_cache, svm);
}
@@ -731,6 +782,13 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
var->unusable = !var->present;
}
+static int svm_get_cpl(struct kvm_vcpu *vcpu)
+{
+ struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save;
+
+ return save->cpl;
+}
+
static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -784,6 +842,9 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
}
}
#endif
+ if (npt_enabled)
+ goto set;
+
if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) {
svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
vcpu->fpu_active = 1;
@@ -791,18 +852,29 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
vcpu->arch.cr0 = cr0;
cr0 |= X86_CR0_PG | X86_CR0_WP;
- cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
if (!vcpu->fpu_active) {
svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
cr0 |= X86_CR0_TS;
}
+set:
+ /*
+ * re-enable caching here because the QEMU bios
+ * does not do it - this results in some delay at
+ * reboot
+ */
+ cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
}
static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- vcpu->arch.cr4 = cr4;
- to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE;
+ unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
+
+ vcpu->arch.cr4 = cr4;
+ if (!npt_enabled)
+ cr4 |= X86_CR4_PAE;
+ cr4 |= host_cr4_mce;
+ to_svm(vcpu)->vmcb->save.cr4 = cr4;
}
static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -833,13 +905,6 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
}
-/* FIXME:
-
- svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK;
- svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK);
-
-*/
-
static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
{
return -EOPNOTSUPP;
@@ -920,7 +985,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
}
default:
printk(KERN_DEBUG "%s: unexpected dr %u\n",
- __FUNCTION__, dr);
+ __func__, dr);
*exception = UD_VECTOR;
return;
}
@@ -962,6 +1027,19 @@ static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
return 1;
}
+static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+ /*
+ * On an #MC intercept the MCE handler is not called automatically in
+ * the host. So do it by hand here.
+ */
+ asm volatile (
+ "int $0x12\n");
+ /* not sure if we ever come back to this point */
+
+ return 1;
+}
+
static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
{
/*
@@ -969,7 +1047,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
* so reinitialize it.
*/
clear_page(svm->vmcb);
- init_vmcb(svm->vmcb);
+ init_vmcb(svm);
kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
return 0;
@@ -1033,9 +1111,18 @@ static int invalid_op_interception(struct vcpu_svm *svm,
static int task_switch_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
- pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__);
- kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
- return 0;
+ u16 tss_selector;
+
+ tss_selector = (u16)svm->vmcb->control.exit_info_1;
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
+ return kvm_task_switch(&svm->vcpu, tss_selector,
+ TASK_SWITCH_IRET);
+ if (svm->vmcb->control.exit_info_2 &
+ (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
+ return kvm_task_switch(&svm->vcpu, tss_selector,
+ TASK_SWITCH_JMP);
+ return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL);
}
static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1049,7 +1136,7 @@ static int emulate_on_interception(struct vcpu_svm *svm,
struct kvm_run *kvm_run)
{
if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE)
- pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__);
+ pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
return 1;
}
@@ -1179,8 +1266,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
svm->vmcb->save.sysenter_esp = data;
break;
case MSR_IA32_DEBUGCTLMSR:
- pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __FUNCTION__, data);
+ if (!svm_has(SVM_FEATURE_LBRV)) {
+ pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
+ __func__, data);
+ break;
+ }
+ if (data & DEBUGCTL_RESERVED_BITS)
+ return 1;
+
+ svm->vmcb->save.dbgctl = data;
+ if (data & (1ULL<<0))
+ svm_enable_lbrv(svm);
+ else
+ svm_disable_lbrv(svm);
break;
case MSR_K7_EVNTSEL0:
case MSR_K7_EVNTSEL1:
@@ -1265,6 +1363,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
[SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
[SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
+ [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
[SVM_EXIT_INTR] = nop_on_interception,
[SVM_EXIT_NMI] = nop_on_interception,
[SVM_EXIT_SMI] = nop_on_interception,
@@ -1290,14 +1389,34 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
[SVM_EXIT_WBINVD] = emulate_on_interception,
[SVM_EXIT_MONITOR] = invalid_op_interception,
[SVM_EXIT_MWAIT] = invalid_op_interception,
+ [SVM_EXIT_NPF] = pf_interception,
};
-
static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
u32 exit_code = svm->vmcb->control.exit_code;
+ if (npt_enabled) {
+ int mmu_reload = 0;
+ if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
+ svm_set_cr0(vcpu, svm->vmcb->save.cr0);
+ mmu_reload = 1;
+ }
+ vcpu->arch.cr0 = svm->vmcb->save.cr0;
+ vcpu->arch.cr3 = svm->vmcb->save.cr3;
+ if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
+ if (!load_pdptrs(vcpu, vcpu->arch.cr3)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ }
+ if (mmu_reload) {
+ kvm_mmu_reset_context(vcpu);
+ kvm_mmu_load(vcpu);
+ }
+ }
+
kvm_reput_irq(svm);
if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
@@ -1308,10 +1427,11 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
}
if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
- exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR)
+ exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
+ exit_code != SVM_EXIT_NPF)
printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
"exit_code 0x%x\n",
- __FUNCTION__, svm->vmcb->control.exit_int_info,
+ __func__, svm->vmcb->control.exit_int_info,
exit_code);
if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
@@ -1364,6 +1484,27 @@ static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
svm_inject_irq(svm, irq);
}
+static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+ int max_irr, tpr;
+
+ if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr)
+ return;
+
+ vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+
+ max_irr = kvm_lapic_find_highest_irr(vcpu);
+ if (max_irr == -1)
+ return;
+
+ tpr = kvm_lapic_get_cr8(vcpu) << 4;
+
+ if (tpr >= (max_irr & 0xf0))
+ vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
+}
+
static void svm_intr_assist(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1376,14 +1517,14 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
SVM_EVTINJ_VEC_MASK;
vmcb->control.exit_int_info = 0;
svm_inject_irq(svm, intr_vector);
- return;
+ goto out;
}
if (vmcb->control.int_ctl & V_IRQ_MASK)
- return;
+ goto out;
if (!kvm_cpu_has_interrupt(vcpu))
- return;
+ goto out;
if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
@@ -1391,12 +1532,14 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu)
/* unable to deliver irq, set pending irq */
vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR);
svm_inject_irq(svm, 0x0);
- return;
+ goto out;
}
/* Okay, we can deliver the interrupt: grab it and update PIC state. */
intr_vector = kvm_cpu_get_interrupt(vcpu);
svm_inject_irq(svm, intr_vector);
kvm_timer_intr_post(vcpu, intr_vector);
+out:
+ update_cr8_intercept(vcpu);
}
static void kvm_reput_irq(struct vcpu_svm *svm)
@@ -1482,6 +1625,29 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
{
}
+static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
+ int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
+ kvm_lapic_set_tpr(vcpu, cr8);
+ }
+}
+
+static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ u64 cr8;
+
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return;
+
+ cr8 = kvm_get_cr8(vcpu);
+ svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
+ svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
+}
+
static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1491,6 +1657,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
pre_svm_run(svm);
+ sync_lapic_to_cr8(vcpu);
+
save_host_msrs(vcpu);
fs_selector = read_fs();
gs_selector = read_gs();
@@ -1499,6 +1667,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
svm->host_dr6 = read_dr6();
svm->host_dr7 = read_dr7();
svm->vmcb->save.cr2 = vcpu->arch.cr2;
+ /* required for live migration with NPT */
+ if (npt_enabled)
+ svm->vmcb->save.cr3 = vcpu->arch.cr3;
if (svm->vmcb->save.dr7 & 0xff) {
write_dr7(0);
@@ -1635,6 +1806,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
stgi();
+ sync_cr8_to_lapic(vcpu);
+
svm->next_rip = 0;
}
@@ -1642,6 +1815,12 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ if (npt_enabled) {
+ svm->vmcb->control.nested_cr3 = root;
+ force_new_asid(vcpu);
+ return;
+ }
+
svm->vmcb->save.cr3 = root;
force_new_asid(vcpu);
@@ -1709,6 +1888,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.get_segment_base = svm_get_segment_base,
.get_segment = svm_get_segment,
.set_segment = svm_set_segment,
+ .get_cpl = svm_get_cpl,
.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
.set_cr0 = svm_set_cr0,
diff --git a/arch/x86/kvm/svm.h b/arch/x86/kvm/svm.h
index 5fd50491b555..1b8afa78e869 100644
--- a/arch/x86/kvm/svm.h
+++ b/arch/x86/kvm/svm.h
@@ -238,6 +238,9 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID
#define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR
+#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36
+#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
+
#define SVM_EXIT_READ_CR0 0x000
#define SVM_EXIT_READ_CR3 0x003
#define SVM_EXIT_READ_CR4 0x004
diff --git a/arch/x86/kvm/tss.h b/arch/x86/kvm/tss.h
new file mode 100644
index 000000000000..622aa10f692f
--- /dev/null
+++ b/arch/x86/kvm/tss.h
@@ -0,0 +1,59 @@
+#ifndef __TSS_SEGMENT_H
+#define __TSS_SEGMENT_H
+
+struct tss_segment_32 {
+ u32 prev_task_link;
+ u32 esp0;
+ u32 ss0;
+ u32 esp1;
+ u32 ss1;
+ u32 esp2;
+ u32 ss2;
+ u32 cr3;
+ u32 eip;
+ u32 eflags;
+ u32 eax;
+ u32 ecx;
+ u32 edx;
+ u32 ebx;
+ u32 esp;
+ u32 ebp;
+ u32 esi;
+ u32 edi;
+ u32 es;
+ u32 cs;
+ u32 ss;
+ u32 ds;
+ u32 fs;
+ u32 gs;
+ u32 ldt_selector;
+ u16 t;
+ u16 io_map;
+};
+
+struct tss_segment_16 {
+ u16 prev_task_link;
+ u16 sp0;
+ u16 ss0;
+ u16 sp1;
+ u16 ss1;
+ u16 sp2;
+ u16 ss2;
+ u16 ip;
+ u16 flag;
+ u16 ax;
+ u16 cx;
+ u16 dx;
+ u16 bx;
+ u16 sp;
+ u16 bp;
+ u16 si;
+ u16 di;
+ u16 es;
+ u16 cs;
+ u16 ss;
+ u16 ds;
+ u16 ldt;
+};
+
+#endif
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8e1462880d1f..8e5d6645b90d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -17,7 +17,6 @@
#include "irq.h"
#include "vmx.h"
-#include "segment_descriptor.h"
#include "mmu.h"
#include <linux/kvm_host.h>
@@ -37,6 +36,12 @@ MODULE_LICENSE("GPL");
static int bypass_guest_pf = 1;
module_param(bypass_guest_pf, bool, 0);
+static int enable_vpid = 1;
+module_param(enable_vpid, bool, 0);
+
+static int flexpriority_enabled = 1;
+module_param(flexpriority_enabled, bool, 0);
+
struct vmcs {
u32 revision_id;
u32 abort;
@@ -71,6 +76,7 @@ struct vcpu_vmx {
unsigned rip;
} irq;
} rmode;
+ int vpid;
};
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -85,6 +91,10 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
static struct page *vmx_io_bitmap_a;
static struct page *vmx_io_bitmap_b;
+static struct page *vmx_msr_bitmap;
+
+static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
+static DEFINE_SPINLOCK(vmx_vpid_lock);
static struct vmcs_config {
int size;
@@ -176,6 +186,11 @@ static inline int is_external_interrupt(u32 intr_info)
== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
}
+static inline int cpu_has_vmx_msr_bitmap(void)
+{
+ return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS);
+}
+
static inline int cpu_has_vmx_tpr_shadow(void)
{
return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
@@ -194,8 +209,9 @@ static inline int cpu_has_secondary_exec_ctrls(void)
static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
{
- return (vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ return flexpriority_enabled
+ && (vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
}
static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
@@ -204,6 +220,12 @@ static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
(irqchip_in_kernel(kvm)));
}
+static inline int cpu_has_vmx_vpid(void)
+{
+ return (vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_ENABLE_VPID);
+}
+
static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -214,6 +236,20 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
return -1;
}
+static inline void __invvpid(int ext, u16 vpid, gva_t gva)
+{
+ struct {
+ u64 vpid : 16;
+ u64 rsvd : 48;
+ u64 gva;
+ } operand = { vpid, 0, gva };
+
+ asm volatile (ASM_VMX_INVVPID
+ /* CF==1 or ZF==1 --> rc = -1 */
+ "; ja 1f ; ud2 ; 1:"
+ : : "a"(&operand), "c"(ext) : "cc", "memory");
+}
+
static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
{
int i;
@@ -257,6 +293,14 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
vmx->launched = 0;
}
+static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
+{
+ if (vmx->vpid == 0)
+ return;
+
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+}
+
static unsigned long vmcs_readl(unsigned long field)
{
unsigned long value;
@@ -353,7 +397,7 @@ static void reload_tss(void)
* VT restores TR but not its size. Useless.
*/
struct descriptor_table gdt;
- struct segment_descriptor *descs;
+ struct desc_struct *descs;
get_gdt(&gdt);
descs = (void *)gdt.base;
@@ -485,11 +529,12 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 phys_addr = __pa(vmx->vmcs);
- u64 tsc_this, delta;
+ u64 tsc_this, delta, new_offset;
if (vcpu->cpu != cpu) {
vcpu_clear(vmx);
kvm_migrate_apic_timer(vcpu);
+ vpid_sync_vcpu_all(vmx);
}
if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
@@ -524,8 +569,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
* Make sure the time stamp counter is monotonous.
*/
rdtscll(tsc_this);
- delta = vcpu->arch.host_tsc - tsc_this;
- vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta);
+ if (tsc_this < vcpu->arch.host_tsc) {
+ delta = vcpu->arch.host_tsc - tsc_this;
+ new_offset = vmcs_read64(TSC_OFFSET) + delta;
+ vmcs_write64(TSC_OFFSET, new_offset);
+ }
}
}
@@ -596,7 +644,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
{
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
nr | INTR_TYPE_EXCEPTION
- | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0)
+ | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
| INTR_INFO_VALID_MASK);
if (has_error_code)
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
@@ -959,6 +1007,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_MOV_DR_EXITING |
CPU_BASED_USE_TSC_OFFSETING;
opt = CPU_BASED_TPR_SHADOW |
+ CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
&_cpu_based_exec_control) < 0)
@@ -971,7 +1020,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
min = 0;
opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
- SECONDARY_EXEC_WBINVD_EXITING;
+ SECONDARY_EXEC_WBINVD_EXITING |
+ SECONDARY_EXEC_ENABLE_VPID;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
return -EIO;
@@ -1080,6 +1130,10 @@ static __init int hardware_setup(void)
{
if (setup_vmcs_config(&vmcs_config) < 0)
return -EIO;
+
+ if (boot_cpu_has(X86_FEATURE_NX))
+ kvm_enable_efer_bits(EFER_NX);
+
return alloc_kvm_area();
}
@@ -1214,7 +1268,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
- __FUNCTION__);
+ __func__);
vmcs_write32(GUEST_TR_AR_BYTES,
(guest_tr_ar & ~AR_TYPE_MASK)
| AR_TYPE_BUSY_64_TSS);
@@ -1239,6 +1293,11 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
#endif
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+{
+ vpid_sync_vcpu_all(to_vmx(vcpu));
+}
+
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK;
@@ -1275,6 +1334,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
+ vmx_flush_tlb(vcpu);
vmcs_writel(GUEST_CR3, cr3);
if (vcpu->arch.cr0 & X86_CR0_PE)
vmx_fpu_deactivate(vcpu);
@@ -1288,14 +1348,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vcpu->arch.cr4 = cr4;
}
-#ifdef CONFIG_X86_64
-
static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
vcpu->arch.shadow_efer = efer;
+ if (!msr)
+ return;
if (efer & EFER_LMA) {
vmcs_write32(VM_ENTRY_CONTROLS,
vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1312,8 +1372,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
setup_msrs(vmx);
}
-#endif
-
static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
@@ -1344,6 +1402,20 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
var->unusable = (ar >> 16) & 1;
}
+static int vmx_get_cpl(struct kvm_vcpu *vcpu)
+{
+ struct kvm_segment kvm_seg;
+
+ if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */
+ return 0;
+
+ if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
+ return 3;
+
+ vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS);
+ return kvm_seg.selector & 3;
+}
+
static u32 vmx_segment_access_rights(struct kvm_segment *var)
{
u32 ar;
@@ -1433,7 +1505,6 @@ static int init_rmode_tss(struct kvm *kvm)
int ret = 0;
int r;
- down_read(&kvm->slots_lock);
r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
if (r < 0)
goto out;
@@ -1456,7 +1527,6 @@ static int init_rmode_tss(struct kvm *kvm)
ret = 1;
out:
- up_read(&kvm->slots_lock);
return ret;
}
@@ -1494,6 +1564,46 @@ out:
return r;
}
+static void allocate_vpid(struct vcpu_vmx *vmx)
+{
+ int vpid;
+
+ vmx->vpid = 0;
+ if (!enable_vpid || !cpu_has_vmx_vpid())
+ return;
+ spin_lock(&vmx_vpid_lock);
+ vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
+ if (vpid < VMX_NR_VPIDS) {
+ vmx->vpid = vpid;
+ __set_bit(vpid, vmx_vpid_bitmap);
+ }
+ spin_unlock(&vmx_vpid_lock);
+}
+
+void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
+{
+ void *va;
+
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ /*
+ * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+ * have the write-low and read-high bitmap offsets the wrong way round.
+ * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+ */
+ va = kmap(msr_bitmap);
+ if (msr <= 0x1fff) {
+ __clear_bit(msr, va + 0x000); /* read-low */
+ __clear_bit(msr, va + 0x800); /* write-low */
+ } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+ msr &= 0x1fff;
+ __clear_bit(msr, va + 0x400); /* read-high */
+ __clear_bit(msr, va + 0xc00); /* write-high */
+ }
+ kunmap(msr_bitmap);
+}
+
/*
* Sets up the vmcs for emulated real mode.
*/
@@ -1511,6 +1621,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
+ if (cpu_has_vmx_msr_bitmap())
+ vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap));
+
vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
/* Control */
@@ -1532,6 +1645,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
exec_control &=
~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+ if (vmx->vpid == 0)
+ exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
@@ -1613,6 +1728,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
u64 msr;
int ret;
+ down_read(&vcpu->kvm->slots_lock);
if (!init_rmode_tss(vmx->vcpu.kvm)) {
ret = -ENOMEM;
goto out;
@@ -1621,7 +1737,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmx->vcpu.arch.rmode.active = 0;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
- set_cr8(&vmx->vcpu, 0);
+ kvm_set_cr8(&vmx->vcpu, 0);
msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
if (vmx->vcpu.vcpu_id == 0)
msr |= MSR_IA32_APICBASE_BSP;
@@ -1704,18 +1820,22 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write64(APIC_ACCESS_ADDR,
page_to_phys(vmx->vcpu.kvm->arch.apic_access_page));
+ if (vmx->vpid != 0)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+
vmx->vcpu.arch.cr0 = 0x60000010;
vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */
vmx_set_cr4(&vmx->vcpu, 0);
-#ifdef CONFIG_X86_64
vmx_set_efer(&vmx->vcpu, 0);
-#endif
vmx_fpu_activate(&vmx->vcpu);
update_exception_bitmap(&vmx->vcpu);
- return 0;
+ vpid_sync_vcpu_all(vmx);
+
+ ret = 0;
out:
+ up_read(&vcpu->kvm->slots_lock);
return ret;
}
@@ -1723,6 +1843,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
+
if (vcpu->arch.rmode.active) {
vmx->rmode.irq.pending = true;
vmx->rmode.irq.vector = irq;
@@ -1844,7 +1966,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
if ((vect_info & VECTORING_INFO_VALID_MASK) &&
!is_page_fault(intr_info))
printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
- "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
+ "intr info 0x%x\n", __func__, vect_info, intr_info);
if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
@@ -1869,10 +1991,12 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
error_code = 0;
rip = vmcs_readl(GUEST_RIP);
- if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
+ if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
if (is_page_fault(intr_info)) {
cr2 = vmcs_readl(EXIT_QUALIFICATION);
+ KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
+ (u32)((u64)cr2 >> 32), handler);
return kvm_mmu_page_fault(vcpu, cr2, error_code);
}
@@ -1901,6 +2025,7 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
++vcpu->stat.irq_exits;
+ KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler);
return 1;
}
@@ -1958,25 +2083,27 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
reg = (exit_qualification >> 8) & 15;
switch ((exit_qualification >> 4) & 3) {
case 0: /* mov to cr */
+ KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg],
+ (u32)((u64)vcpu->arch.regs[reg] >> 32), handler);
switch (cr) {
case 0:
vcpu_load_rsp_rip(vcpu);
- set_cr0(vcpu, vcpu->arch.regs[reg]);
+ kvm_set_cr0(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
return 1;
case 3:
vcpu_load_rsp_rip(vcpu);
- set_cr3(vcpu, vcpu->arch.regs[reg]);
+ kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
return 1;
case 4:
vcpu_load_rsp_rip(vcpu);
- set_cr4(vcpu, vcpu->arch.regs[reg]);
+ kvm_set_cr4(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
return 1;
case 8:
vcpu_load_rsp_rip(vcpu);
- set_cr8(vcpu, vcpu->arch.regs[reg]);
+ kvm_set_cr8(vcpu, vcpu->arch.regs[reg]);
skip_emulated_instruction(vcpu);
if (irqchip_in_kernel(vcpu->kvm))
return 1;
@@ -1990,6 +2117,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->arch.cr0 &= ~X86_CR0_TS;
vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
vmx_fpu_activate(vcpu);
+ KVMTRACE_0D(CLTS, vcpu, handler);
skip_emulated_instruction(vcpu);
return 1;
case 1: /*mov from cr*/
@@ -1998,18 +2126,24 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu_load_rsp_rip(vcpu);
vcpu->arch.regs[reg] = vcpu->arch.cr3;
vcpu_put_rsp_rip(vcpu);
+ KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
+ (u32)vcpu->arch.regs[reg],
+ (u32)((u64)vcpu->arch.regs[reg] >> 32),
+ handler);
skip_emulated_instruction(vcpu);
return 1;
case 8:
vcpu_load_rsp_rip(vcpu);
- vcpu->arch.regs[reg] = get_cr8(vcpu);
+ vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
vcpu_put_rsp_rip(vcpu);
+ KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
+ (u32)vcpu->arch.regs[reg], handler);
skip_emulated_instruction(vcpu);
return 1;
}
break;
case 3: /* lmsw */
- lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
+ kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
skip_emulated_instruction(vcpu);
return 1;
@@ -2049,6 +2183,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
val = 0;
}
vcpu->arch.regs[reg] = val;
+ KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
} else {
/* mov to dr */
}
@@ -2073,6 +2208,9 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
+ KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32),
+ handler);
+
/* FIXME: handling of bits 32:63 of rax, rdx */
vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u;
vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
@@ -2086,6 +2224,9 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
| ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
+ KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32),
+ handler);
+
if (vmx_set_msr(vcpu, ecx, data) != 0) {
kvm_inject_gp(vcpu, 0);
return 1;
@@ -2110,6 +2251,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+
+ KVMTRACE_0D(PEND_INTR, vcpu, handler);
+
/*
* If the user space waits to inject interrupts, exit as soon as
* possible
@@ -2152,6 +2296,8 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
offset = exit_qualification & 0xffful;
+ KVMTRACE_1D(APIC_ACCESS, vcpu, (u32)offset, handler);
+
er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
if (er != EMULATE_DONE) {
@@ -2163,6 +2309,20 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
return 1;
}
+static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+ unsigned long exit_qualification;
+ u16 tss_selector;
+ int reason;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ reason = (u32)exit_qualification >> 30;
+ tss_selector = exit_qualification;
+
+ return kvm_task_switch(vcpu, tss_selector, reason);
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -2185,6 +2345,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
[EXIT_REASON_WBINVD] = handle_wbinvd,
+ [EXIT_REASON_TASK_SWITCH] = handle_task_switch,
};
static const int kvm_vmx_max_exit_handlers =
@@ -2200,6 +2361,9 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 vectoring_info = vmx->idt_vectoring_info;
+ KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP),
+ (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit);
+
if (unlikely(vmx->fail)) {
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
kvm_run->fail_entry.hardware_entry_failure_reason
@@ -2210,7 +2374,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
exit_reason != EXIT_REASON_EXCEPTION_NMI)
printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
- "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
+ "exit reason is 0x%x\n", __func__, exit_reason);
if (exit_reason < kvm_vmx_max_exit_handlers
&& kvm_vmx_exit_handlers[exit_reason])
return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
@@ -2221,10 +2385,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
return 0;
}
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
-{
-}
-
static void update_tpr_threshold(struct kvm_vcpu *vcpu)
{
int max_irr, tpr;
@@ -2285,11 +2445,13 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
return;
}
+ KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
+
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
- if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK))
+ if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK))
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
vmcs_read32(IDT_VECTORING_ERROR_CODE));
if (unlikely(has_ext_irq))
@@ -2470,8 +2632,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
/* We need to handle NMIs before interrupts are enabled */
- if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+ if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
+ KVMTRACE_0D(NMI, vcpu, handler);
asm("int $2");
+ }
}
static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
@@ -2489,6 +2653,10 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ spin_lock(&vmx_vpid_lock);
+ if (vmx->vpid != 0)
+ __clear_bit(vmx->vpid, vmx_vpid_bitmap);
+ spin_unlock(&vmx_vpid_lock);
vmx_free_vmcs(vcpu);
kfree(vmx->host_msrs);
kfree(vmx->guest_msrs);
@@ -2505,6 +2673,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (!vmx)
return ERR_PTR(-ENOMEM);
+ allocate_vpid(vmx);
+
err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
if (err)
goto free_vcpu;
@@ -2591,14 +2761,13 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_segment_base = vmx_get_segment_base,
.get_segment = vmx_get_segment,
.set_segment = vmx_set_segment,
+ .get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
.set_cr0 = vmx_set_cr0,
.set_cr3 = vmx_set_cr3,
.set_cr4 = vmx_set_cr4,
-#ifdef CONFIG_X86_64
.set_efer = vmx_set_efer,
-#endif
.get_idt = vmx_get_idt,
.set_idt = vmx_set_idt,
.get_gdt = vmx_get_gdt,
@@ -2626,7 +2795,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
static int __init vmx_init(void)
{
- void *iova;
+ void *va;
int r;
vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
@@ -2639,28 +2808,48 @@ static int __init vmx_init(void)
goto out;
}
+ vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ if (!vmx_msr_bitmap) {
+ r = -ENOMEM;
+ goto out1;
+ }
+
/*
* Allow direct access to the PC debug port (it is often used for I/O
* delays, but the vmexits simply slow things down).
*/
- iova = kmap(vmx_io_bitmap_a);
- memset(iova, 0xff, PAGE_SIZE);
- clear_bit(0x80, iova);
+ va = kmap(vmx_io_bitmap_a);
+ memset(va, 0xff, PAGE_SIZE);
+ clear_bit(0x80, va);
kunmap(vmx_io_bitmap_a);
- iova = kmap(vmx_io_bitmap_b);
- memset(iova, 0xff, PAGE_SIZE);
+ va = kmap(vmx_io_bitmap_b);
+ memset(va, 0xff, PAGE_SIZE);
kunmap(vmx_io_bitmap_b);
+ va = kmap(vmx_msr_bitmap);
+ memset(va, 0xff, PAGE_SIZE);
+ kunmap(vmx_msr_bitmap);
+
+ set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
+
r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
if (r)
- goto out1;
+ goto out2;
+
+ vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE);
+ vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE);
+ vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS);
+ vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
+ vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
if (bypass_guest_pf)
kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
return 0;
+out2:
+ __free_page(vmx_msr_bitmap);
out1:
__free_page(vmx_io_bitmap_b);
out:
@@ -2670,6 +2859,7 @@ out:
static void __exit vmx_exit(void)
{
+ __free_page(vmx_msr_bitmap);
__free_page(vmx_io_bitmap_b);
__free_page(vmx_io_bitmap_a);
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index d52ae8d7303d..5dff4606b988 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -49,6 +49,7 @@
* Definitions of Secondary Processor-Based VM-Execution Controls.
*/
#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
@@ -65,6 +66,7 @@
/* VMCS Encodings */
enum vmcs_field {
+ VIRTUAL_PROCESSOR_ID = 0x00000000,
GUEST_ES_SELECTOR = 0x00000800,
GUEST_CS_SELECTOR = 0x00000802,
GUEST_SS_SELECTOR = 0x00000804,
@@ -231,12 +233,12 @@ enum vmcs_field {
*/
#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
-#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */
+#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */
#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
-#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK
+#define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK
#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
@@ -321,4 +323,8 @@ enum vmcs_field {
#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
+#define VMX_NR_VPIDS (1 << 16)
+#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1
+#define VMX_VPID_EXTENT_ALL_CONTEXT 2
+
#endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6b01552bd1f1..0ce556372a4d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -15,10 +15,12 @@
*/
#include <linux/kvm_host.h>
-#include "segment_descriptor.h"
#include "irq.h"
#include "mmu.h"
+#include "i8254.h"
+#include "tss.h"
+#include <linux/clocksource.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
@@ -28,6 +30,7 @@
#include <asm/uaccess.h>
#include <asm/msr.h>
+#include <asm/desc.h>
#define MAX_IO_MSRS 256
#define CR0_RESERVED_BITS \
@@ -41,7 +44,15 @@
| X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
-#define EFER_RESERVED_BITS 0xfffffffffffff2fe
+/* EFER defaults:
+ * - enable syscall per default because its emulated by KVM
+ * - enable LME and LMA per default on 64 bit KVM
+ */
+#ifdef CONFIG_X86_64
+static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL;
+#else
+static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL;
+#endif
#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
@@ -63,6 +74,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "irq_window", VCPU_STAT(irq_window_exits) },
{ "halt_exits", VCPU_STAT(halt_exits) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
+ { "hypercalls", VCPU_STAT(hypercalls) },
{ "request_irq", VCPU_STAT(request_irq_exits) },
{ "irq_exits", VCPU_STAT(irq_exits) },
{ "host_state_reload", VCPU_STAT(host_state_reload) },
@@ -78,6 +90,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_recycled", VM_STAT(mmu_recycled) },
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
+ { "largepages", VM_STAT(lpages) },
{ NULL }
};
@@ -85,7 +98,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
unsigned long segment_base(u16 selector)
{
struct descriptor_table gdt;
- struct segment_descriptor *d;
+ struct desc_struct *d;
unsigned long table_base;
unsigned long v;
@@ -101,13 +114,12 @@ unsigned long segment_base(u16 selector)
asm("sldt %0" : "=g"(ldt_selector));
table_base = segment_base(ldt_selector);
}
- d = (struct segment_descriptor *)(table_base + (selector & ~7));
- v = d->base_low | ((unsigned long)d->base_mid << 16) |
- ((unsigned long)d->base_high << 24);
+ d = (struct desc_struct *)(table_base + (selector & ~7));
+ v = d->base0 | ((unsigned long)d->base1 << 16) |
+ ((unsigned long)d->base2 << 24);
#ifdef CONFIG_X86_64
- if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
- v |= ((unsigned long) \
- ((struct segment_descriptor_64 *)d)->base_higher) << 32;
+ if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
+ v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
#endif
return v;
}
@@ -145,11 +157,16 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
u32 error_code)
{
++vcpu->stat.pf_guest;
- if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) {
- printk(KERN_DEBUG "kvm: inject_page_fault:"
- " double fault 0x%lx\n", addr);
- vcpu->arch.exception.nr = DF_VECTOR;
- vcpu->arch.exception.error_code = 0;
+ if (vcpu->arch.exception.pending) {
+ if (vcpu->arch.exception.nr == PF_VECTOR) {
+ printk(KERN_DEBUG "kvm: inject_page_fault:"
+ " double fault 0x%lx\n", addr);
+ vcpu->arch.exception.nr = DF_VECTOR;
+ vcpu->arch.exception.error_code = 0;
+ } else if (vcpu->arch.exception.nr == DF_VECTOR) {
+ /* triple fault -> shutdown */
+ set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+ }
return;
}
vcpu->arch.cr2 = addr;
@@ -184,7 +201,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
int ret;
u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)];
- down_read(&vcpu->kvm->slots_lock);
ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
offset * sizeof(u64), sizeof(pdpte));
if (ret < 0) {
@@ -201,10 +217,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
out:
- up_read(&vcpu->kvm->slots_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(load_pdptrs);
static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
@@ -215,18 +231,16 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
if (is_long_mode(vcpu) || !is_pae(vcpu))
return false;
- down_read(&vcpu->kvm->slots_lock);
r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte));
if (r < 0)
goto out;
changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0;
out:
- up_read(&vcpu->kvm->slots_lock);
return changed;
}
-void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
if (cr0 & CR0_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
@@ -284,15 +298,18 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
kvm_mmu_reset_context(vcpu);
return;
}
-EXPORT_SYMBOL_GPL(set_cr0);
+EXPORT_SYMBOL_GPL(kvm_set_cr0);
-void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
+void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
- set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
+ kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f));
+ KVMTRACE_1D(LMSW, vcpu,
+ (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)),
+ handler);
}
-EXPORT_SYMBOL_GPL(lmsw);
+EXPORT_SYMBOL_GPL(kvm_lmsw);
-void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
if (cr4 & CR4_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
@@ -323,9 +340,9 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vcpu->arch.cr4 = cr4;
kvm_mmu_reset_context(vcpu);
}
-EXPORT_SYMBOL_GPL(set_cr4);
+EXPORT_SYMBOL_GPL(kvm_set_cr4);
-void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
kvm_mmu_flush_tlb(vcpu);
@@ -359,7 +376,6 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
*/
}
- down_read(&vcpu->kvm->slots_lock);
/*
* Does the new cr3 value map to physical memory? (Note, we
* catch an invalid cr3 even in real-mode, because it would
@@ -375,11 +391,10 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
vcpu->arch.cr3 = cr3;
vcpu->arch.mmu.new_cr3(vcpu);
}
- up_read(&vcpu->kvm->slots_lock);
}
-EXPORT_SYMBOL_GPL(set_cr3);
+EXPORT_SYMBOL_GPL(kvm_set_cr3);
-void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
if (cr8 & CR8_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
@@ -391,16 +406,16 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
else
vcpu->arch.cr8 = cr8;
}
-EXPORT_SYMBOL_GPL(set_cr8);
+EXPORT_SYMBOL_GPL(kvm_set_cr8);
-unsigned long get_cr8(struct kvm_vcpu *vcpu)
+unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
if (irqchip_in_kernel(vcpu->kvm))
return kvm_lapic_get_cr8(vcpu);
else
return vcpu->arch.cr8;
}
-EXPORT_SYMBOL_GPL(get_cr8);
+EXPORT_SYMBOL_GPL(kvm_get_cr8);
/*
* List of msr numbers which we expose to userspace through KVM_GET_MSRS
@@ -415,7 +430,8 @@ static u32 msrs_to_save[] = {
#ifdef CONFIG_X86_64
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
- MSR_IA32_TIME_STAMP_COUNTER,
+ MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_IA32_PERF_STATUS,
};
static unsigned num_msrs_to_save;
@@ -424,11 +440,9 @@ static u32 emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
};
-#ifdef CONFIG_X86_64
-
static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
- if (efer & EFER_RESERVED_BITS) {
+ if (efer & efer_reserved_bits) {
printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
efer);
kvm_inject_gp(vcpu, 0);
@@ -450,7 +464,12 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
vcpu->arch.shadow_efer = efer;
}
-#endif
+void kvm_enable_efer_bits(u64 mask)
+{
+ efer_reserved_bits &= ~mask;
+}
+EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
+
/*
* Writes msr value into into the appropriate "register".
@@ -470,26 +489,86 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
return kvm_set_msr(vcpu, index, *data);
}
+static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
+{
+ static int version;
+ struct kvm_wall_clock wc;
+ struct timespec wc_ts;
+
+ if (!wall_clock)
+ return;
+
+ version++;
+
+ kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+
+ wc_ts = current_kernel_time();
+ wc.wc_sec = wc_ts.tv_sec;
+ wc.wc_nsec = wc_ts.tv_nsec;
+ wc.wc_version = version;
+
+ kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
+
+ version++;
+ kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
+}
+
+static void kvm_write_guest_time(struct kvm_vcpu *v)
+{
+ struct timespec ts;
+ unsigned long flags;
+ struct kvm_vcpu_arch *vcpu = &v->arch;
+ void *shared_kaddr;
+
+ if ((!vcpu->time_page))
+ return;
+
+ /* Keep irq disabled to prevent changes to the clock */
+ local_irq_save(flags);
+ kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER,
+ &vcpu->hv_clock.tsc_timestamp);
+ ktime_get_ts(&ts);
+ local_irq_restore(flags);
+
+ /* With all the info we got, fill in the values */
+
+ vcpu->hv_clock.system_time = ts.tv_nsec +
+ (NSEC_PER_SEC * (u64)ts.tv_sec);
+ /*
+ * The interface expects us to write an even number signaling that the
+ * update is finished. Since the guest won't see the intermediate
+ * state, we just write "2" at the end
+ */
+ vcpu->hv_clock.version = 2;
+
+ shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0);
+
+ memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock));
+
+ kunmap_atomic(shared_kaddr, KM_USER0);
+
+ mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
-#ifdef CONFIG_X86_64
case MSR_EFER:
set_efer(vcpu, data);
break;
-#endif
case MSR_IA32_MC0_STATUS:
pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
- __FUNCTION__, data);
+ __func__, data);
break;
case MSR_IA32_MCG_STATUS:
pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
- __FUNCTION__, data);
+ __func__, data);
break;
case MSR_IA32_MCG_CTL:
pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
- __FUNCTION__, data);
+ __func__, data);
break;
case MSR_IA32_UCODE_REV:
case MSR_IA32_UCODE_WRITE:
@@ -501,6 +580,42 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
+ case MSR_KVM_WALL_CLOCK:
+ vcpu->kvm->arch.wall_clock = data;
+ kvm_write_wall_clock(vcpu->kvm, data);
+ break;
+ case MSR_KVM_SYSTEM_TIME: {
+ if (vcpu->arch.time_page) {
+ kvm_release_page_dirty(vcpu->arch.time_page);
+ vcpu->arch.time_page = NULL;
+ }
+
+ vcpu->arch.time = data;
+
+ /* we verify if the enable bit is set... */
+ if (!(data & 1))
+ break;
+
+ /* ...but clean it before doing the actual write */
+ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
+
+ vcpu->arch.hv_clock.tsc_to_system_mul =
+ clocksource_khz2mult(tsc_khz, 22);
+ vcpu->arch.hv_clock.tsc_shift = 22;
+
+ down_read(&current->mm->mmap_sem);
+ vcpu->arch.time_page =
+ gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
+ up_read(&current->mm->mmap_sem);
+
+ if (is_error_page(vcpu->arch.time_page)) {
+ kvm_release_page_clean(vcpu->arch.time_page);
+ vcpu->arch.time_page = NULL;
+ }
+
+ kvm_write_guest_time(vcpu);
+ break;
+ }
default:
pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data);
return 1;
@@ -540,7 +655,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MC0_MISC+12:
case MSR_IA32_MC0_MISC+16:
case MSR_IA32_UCODE_REV:
- case MSR_IA32_PERF_STATUS:
case MSR_IA32_EBL_CR_POWERON:
/* MTRR registers */
case 0xfe:
@@ -556,11 +670,21 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_MISC_ENABLE:
data = vcpu->arch.ia32_misc_enable_msr;
break;
-#ifdef CONFIG_X86_64
+ case MSR_IA32_PERF_STATUS:
+ /* TSC increment by tick */
+ data = 1000ULL;
+ /* CPU multiplier */
+ data |= (((uint64_t)4ULL) << 40);
+ break;
case MSR_EFER:
data = vcpu->arch.shadow_efer;
break;
-#endif
+ case MSR_KVM_WALL_CLOCK:
+ data = vcpu->kvm->arch.wall_clock;
+ break;
+ case MSR_KVM_SYSTEM_TIME:
+ data = vcpu->arch.time;
+ break;
default:
pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
return 1;
@@ -584,9 +708,11 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
vcpu_load(vcpu);
+ down_read(&vcpu->kvm->slots_lock);
for (i = 0; i < msrs->nmsrs; ++i)
if (do_msr(vcpu, entries[i].index, &entries[i].data))
break;
+ up_read(&vcpu->kvm->slots_lock);
vcpu_put(vcpu);
@@ -688,11 +814,24 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_USER_MEMORY:
case KVM_CAP_SET_TSS_ADDR:
case KVM_CAP_EXT_CPUID:
+ case KVM_CAP_CLOCKSOURCE:
+ case KVM_CAP_PIT:
+ case KVM_CAP_NOP_IO_DELAY:
+ case KVM_CAP_MP_STATE:
r = 1;
break;
case KVM_CAP_VAPIC:
r = !kvm_x86_ops->cpu_has_accelerated_tpr();
break;
+ case KVM_CAP_NR_VCPUS:
+ r = KVM_MAX_VCPUS;
+ break;
+ case KVM_CAP_NR_MEMSLOTS:
+ r = KVM_MEMORY_SLOTS;
+ break;
+ case KVM_CAP_PV_MMU:
+ r = !tdp_enabled;
+ break;
default:
r = 0;
break;
@@ -763,6 +902,7 @@ out:
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
kvm_x86_ops->vcpu_load(vcpu, cpu);
+ kvm_write_guest_time(vcpu);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -958,32 +1098,32 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
}
/* function 4 and 0xb have additional index. */
case 4: {
- int index, cache_type;
+ int i, cache_type;
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
/* read more entries until cache_type is zero */
- for (index = 1; *nent < maxnent; ++index) {
- cache_type = entry[index - 1].eax & 0x1f;
+ for (i = 1; *nent < maxnent; ++i) {
+ cache_type = entry[i - 1].eax & 0x1f;
if (!cache_type)
break;
- do_cpuid_1_ent(&entry[index], function, index);
- entry[index].flags |=
+ do_cpuid_1_ent(&entry[i], function, i);
+ entry[i].flags |=
KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
++*nent;
}
break;
}
case 0xb: {
- int index, level_type;
+ int i, level_type;
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
/* read more entries until level_type is zero */
- for (index = 1; *nent < maxnent; ++index) {
- level_type = entry[index - 1].ecx & 0xff;
+ for (i = 1; *nent < maxnent; ++i) {
+ level_type = entry[i - 1].ecx & 0xff;
if (!level_type)
break;
- do_cpuid_1_ent(&entry[index], function, index);
- entry[index].flags |=
+ do_cpuid_1_ent(&entry[i], function, i);
+ entry[i].flags |=
KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
++*nent;
}
@@ -1365,6 +1505,23 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
return r;
}
+static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ int r = 0;
+
+ memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
+ return r;
+}
+
+static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
+{
+ int r = 0;
+
+ memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
+ kvm_pit_load_count(kvm, 0, ps->channels[0].count);
+ return r;
+}
+
/*
* Get (and clear) the dirty memory log for a memory slot.
*/
@@ -1457,6 +1614,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
} else
goto out;
break;
+ case KVM_CREATE_PIT:
+ r = -ENOMEM;
+ kvm->arch.vpit = kvm_create_pit(kvm);
+ if (kvm->arch.vpit)
+ r = 0;
+ break;
case KVM_IRQ_LINE: {
struct kvm_irq_level irq_event;
@@ -1512,6 +1675,37 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
break;
}
+ case KVM_GET_PIT: {
+ struct kvm_pit_state ps;
+ r = -EFAULT;
+ if (copy_from_user(&ps, argp, sizeof ps))
+ goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_get_pit(kvm, &ps);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(argp, &ps, sizeof ps))
+ goto out;
+ r = 0;
+ break;
+ }
+ case KVM_SET_PIT: {
+ struct kvm_pit_state ps;
+ r = -EFAULT;
+ if (copy_from_user(&ps, argp, sizeof ps))
+ goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
+ r = kvm_vm_ioctl_set_pit(kvm, &ps);
+ if (r)
+ goto out;
+ r = 0;
+ break;
+ }
default:
;
}
@@ -1570,7 +1764,6 @@ int emulator_read_std(unsigned long addr,
void *data = val;
int r = X86EMUL_CONTINUE;
- down_read(&vcpu->kvm->slots_lock);
while (bytes) {
gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
unsigned offset = addr & (PAGE_SIZE-1);
@@ -1592,7 +1785,6 @@ int emulator_read_std(unsigned long addr,
addr += tocopy;
}
out:
- up_read(&vcpu->kvm->slots_lock);
return r;
}
EXPORT_SYMBOL_GPL(emulator_read_std);
@@ -1611,9 +1803,7 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
- down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
- up_read(&vcpu->kvm->slots_lock);
/* For APIC access vmexit */
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -1646,19 +1836,15 @@ mmio:
return X86EMUL_UNHANDLEABLE;
}
-static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
- const void *val, int bytes)
+int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+ const void *val, int bytes)
{
int ret;
- down_read(&vcpu->kvm->slots_lock);
ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
- if (ret < 0) {
- up_read(&vcpu->kvm->slots_lock);
+ if (ret < 0)
return 0;
- }
kvm_mmu_pte_write(vcpu, gpa, val, bytes);
- up_read(&vcpu->kvm->slots_lock);
return 1;
}
@@ -1670,9 +1856,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
struct kvm_io_device *mmio_dev;
gpa_t gpa;
- down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
- up_read(&vcpu->kvm->slots_lock);
if (gpa == UNMAPPED_GVA) {
kvm_inject_page_fault(vcpu, addr, 2);
@@ -1749,7 +1933,6 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
char *kaddr;
u64 val;
- down_read(&vcpu->kvm->slots_lock);
gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr);
if (gpa == UNMAPPED_GVA ||
@@ -1769,9 +1952,8 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
kunmap_atomic(kaddr, KM_USER0);
kvm_release_page_dirty(page);
- emul_write:
- up_read(&vcpu->kvm->slots_lock);
}
+emul_write:
#endif
return emulator_write_emulated(addr, new, bytes, vcpu);
@@ -1802,7 +1984,7 @@ int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
*dest = kvm_x86_ops->get_dr(vcpu, dr);
return X86EMUL_CONTINUE;
default:
- pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
+ pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
return X86EMUL_UNHANDLEABLE;
}
}
@@ -1840,7 +2022,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
}
EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
-struct x86_emulate_ops emulate_ops = {
+static struct x86_emulate_ops emulate_ops = {
.read_std = emulator_read_std,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
@@ -2091,6 +2273,13 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.guest_page_offset = 0;
vcpu->arch.pio.rep = 0;
+ if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
+ KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
+ handler);
+ else
+ KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
+ handler);
+
kvm_x86_ops->cache_regs(vcpu);
memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4);
kvm_x86_ops->decache_regs(vcpu);
@@ -2129,6 +2318,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
vcpu->arch.pio.guest_page_offset = offset_in_page(address);
vcpu->arch.pio.rep = rep;
+ if (vcpu->run->io.direction == KVM_EXIT_IO_IN)
+ KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size,
+ handler);
+ else
+ KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
+ handler);
+
if (!count) {
kvm_x86_ops->skip_emulated_instruction(vcpu);
return 1;
@@ -2163,10 +2359,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
kvm_x86_ops->skip_emulated_instruction(vcpu);
for (i = 0; i < nr_pages; ++i) {
- down_read(&vcpu->kvm->slots_lock);
page = gva_to_page(vcpu, address + i * PAGE_SIZE);
vcpu->arch.pio.guest_pages[i] = page;
- up_read(&vcpu->kvm->slots_lock);
if (!page) {
kvm_inject_gp(vcpu, 0);
free_pio_guest_pages(vcpu);
@@ -2238,10 +2432,13 @@ void kvm_arch_exit(void)
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.halt_exits;
+ KVMTRACE_0D(HLT, vcpu, handler);
if (irqchip_in_kernel(vcpu->kvm)) {
- vcpu->arch.mp_state = VCPU_MP_STATE_HALTED;
+ vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+ up_read(&vcpu->kvm->slots_lock);
kvm_vcpu_block(vcpu);
- if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE)
+ down_read(&vcpu->kvm->slots_lock);
+ if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
return -EINTR;
return 1;
} else {
@@ -2251,9 +2448,19 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
+ unsigned long a1)
+{
+ if (is_long_mode(vcpu))
+ return a0;
+ else
+ return a0 | ((gpa_t)a1 << 32);
+}
+
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
{
unsigned long nr, a0, a1, a2, a3, ret;
+ int r = 1;
kvm_x86_ops->cache_regs(vcpu);
@@ -2263,6 +2470,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
a2 = vcpu->arch.regs[VCPU_REGS_RDX];
a3 = vcpu->arch.regs[VCPU_REGS_RSI];
+ KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
+
if (!is_long_mode(vcpu)) {
nr &= 0xFFFFFFFF;
a0 &= 0xFFFFFFFF;
@@ -2275,13 +2484,17 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
case KVM_HC_VAPIC_POLL_IRQ:
ret = 0;
break;
+ case KVM_HC_MMU_OP:
+ r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret);
+ break;
default:
ret = -KVM_ENOSYS;
break;
}
vcpu->arch.regs[VCPU_REGS_RAX] = ret;
kvm_x86_ops->decache_regs(vcpu);
- return 0;
+ ++vcpu->stat.hypercalls;
+ return r;
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
@@ -2329,7 +2542,7 @@ void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
unsigned long *rflags)
{
- lmsw(vcpu, msw);
+ kvm_lmsw(vcpu, msw);
*rflags = kvm_x86_ops->get_rflags(vcpu);
}
@@ -2346,9 +2559,9 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
case 4:
return vcpu->arch.cr4;
case 8:
- return get_cr8(vcpu);
+ return kvm_get_cr8(vcpu);
default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
return 0;
}
}
@@ -2358,23 +2571,23 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
{
switch (cr) {
case 0:
- set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
+ kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
*rflags = kvm_x86_ops->get_rflags(vcpu);
break;
case 2:
vcpu->arch.cr2 = val;
break;
case 3:
- set_cr3(vcpu, val);
+ kvm_set_cr3(vcpu, val);
break;
case 4:
- set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
+ kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val));
break;
case 8:
- set_cr8(vcpu, val & 0xfUL);
+ kvm_set_cr8(vcpu, val & 0xfUL);
break;
default:
- vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
+ vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
}
}
@@ -2447,6 +2660,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
}
kvm_x86_ops->decache_regs(vcpu);
kvm_x86_ops->skip_emulated_instruction(vcpu);
+ KVMTRACE_5D(CPUID, vcpu, function,
+ (u32)vcpu->arch.regs[VCPU_REGS_RAX],
+ (u32)vcpu->arch.regs[VCPU_REGS_RBX],
+ (u32)vcpu->arch.regs[VCPU_REGS_RCX],
+ (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler);
}
EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
@@ -2469,7 +2687,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
struct kvm_run *kvm_run)
{
kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
- kvm_run->cr8 = get_cr8(vcpu);
+ kvm_run->cr8 = kvm_get_cr8(vcpu);
kvm_run->apic_base = kvm_get_apic_base(vcpu);
if (irqchip_in_kernel(vcpu->kvm))
kvm_run->ready_for_interrupt_injection = 1;
@@ -2509,16 +2727,17 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
int r;
- if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
+ if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
pr_debug("vcpu %d received sipi with vector # %x\n",
vcpu->vcpu_id, vcpu->arch.sipi_vector);
kvm_lapic_reset(vcpu);
r = kvm_x86_ops->vcpu_reset(vcpu);
if (r)
return r;
- vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
+ down_read(&vcpu->kvm->slots_lock);
vapic_enter(vcpu);
preempted:
@@ -2526,6 +2745,10 @@ preempted:
kvm_x86_ops->guest_debug_pre(vcpu);
again:
+ if (vcpu->requests)
+ if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
+ kvm_mmu_unload(vcpu);
+
r = kvm_mmu_reload(vcpu);
if (unlikely(r))
goto out;
@@ -2539,6 +2762,11 @@ again:
r = 0;
goto out;
}
+ if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
+ kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
+ r = 0;
+ goto out;
+ }
}
kvm_inject_pending_timer_irqs(vcpu);
@@ -2557,6 +2785,14 @@ again:
goto out;
}
+ if (vcpu->requests)
+ if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) {
+ local_irq_enable();
+ preempt_enable();
+ r = 1;
+ goto out;
+ }
+
if (signal_pending(current)) {
local_irq_enable();
preempt_enable();
@@ -2566,6 +2802,13 @@ again:
goto out;
}
+ vcpu->guest_mode = 1;
+ /*
+ * Make sure that guest_mode assignment won't happen after
+ * testing the pending IRQ vector bitmap.
+ */
+ smp_wmb();
+
if (vcpu->arch.exception.pending)
__queue_exception(vcpu);
else if (irqchip_in_kernel(vcpu->kvm))
@@ -2575,13 +2818,15 @@ again:
kvm_lapic_sync_to_vapic(vcpu);
- vcpu->guest_mode = 1;
+ up_read(&vcpu->kvm->slots_lock);
+
kvm_guest_enter();
if (vcpu->requests)
if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
kvm_x86_ops->tlb_flush(vcpu);
+ KVMTRACE_0D(VMENTRY, vcpu, entryexit);
kvm_x86_ops->run(vcpu, kvm_run);
vcpu->guest_mode = 0;
@@ -2601,6 +2846,8 @@ again:
preempt_enable();
+ down_read(&vcpu->kvm->slots_lock);
+
/*
* Profile KVM exit RIPs:
*/
@@ -2628,14 +2875,18 @@ again:
}
out:
+ up_read(&vcpu->kvm->slots_lock);
if (r > 0) {
kvm_resched(vcpu);
+ down_read(&vcpu->kvm->slots_lock);
goto preempted;
}
post_kvm_run_save(vcpu, kvm_run);
+ down_read(&vcpu->kvm->slots_lock);
vapic_exit(vcpu);
+ up_read(&vcpu->kvm->slots_lock);
return r;
}
@@ -2647,7 +2898,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu_load(vcpu);
- if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
+ if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
kvm_vcpu_block(vcpu);
vcpu_put(vcpu);
return -EAGAIN;
@@ -2658,7 +2909,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
/* re-sync apic's tpr */
if (!irqchip_in_kernel(vcpu->kvm))
- set_cr8(vcpu, kvm_run->cr8);
+ kvm_set_cr8(vcpu, kvm_run->cr8);
if (vcpu->arch.pio.cur_count) {
r = complete_pio(vcpu);
@@ -2670,9 +2921,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
vcpu->mmio_read_completed = 1;
vcpu->mmio_needed = 0;
+
+ down_read(&vcpu->kvm->slots_lock);
r = emulate_instruction(vcpu, kvm_run,
vcpu->arch.mmio_fault_cr2, 0,
EMULTYPE_NO_DECODE);
+ up_read(&vcpu->kvm->slots_lock);
if (r == EMULATE_DO_MMIO) {
/*
* Read-modify-write. Back to userspace.
@@ -2773,7 +3027,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
static void get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- return kvm_x86_ops->get_segment(vcpu, var, seg);
+ kvm_x86_ops->get_segment(vcpu, var, seg);
}
void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
@@ -2816,7 +3070,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sregs->cr2 = vcpu->arch.cr2;
sregs->cr3 = vcpu->arch.cr3;
sregs->cr4 = vcpu->arch.cr4;
- sregs->cr8 = get_cr8(vcpu);
+ sregs->cr8 = kvm_get_cr8(vcpu);
sregs->efer = vcpu->arch.shadow_efer;
sregs->apic_base = kvm_get_apic_base(vcpu);
@@ -2836,12 +3090,438 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
return 0;
}
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ vcpu_load(vcpu);
+ mp_state->mp_state = vcpu->arch.mp_state;
+ vcpu_put(vcpu);
+ return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+ struct kvm_mp_state *mp_state)
+{
+ vcpu_load(vcpu);
+ vcpu->arch.mp_state = mp_state->mp_state;
+ vcpu_put(vcpu);
+ return 0;
+}
+
static void set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- return kvm_x86_ops->set_segment(vcpu, var, seg);
+ kvm_x86_ops->set_segment(vcpu, var, seg);
+}
+
+static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
+ struct kvm_segment *kvm_desct)
+{
+ kvm_desct->base = seg_desc->base0;
+ kvm_desct->base |= seg_desc->base1 << 16;
+ kvm_desct->base |= seg_desc->base2 << 24;
+ kvm_desct->limit = seg_desc->limit0;
+ kvm_desct->limit |= seg_desc->limit << 16;
+ kvm_desct->selector = selector;
+ kvm_desct->type = seg_desc->type;
+ kvm_desct->present = seg_desc->p;
+ kvm_desct->dpl = seg_desc->dpl;
+ kvm_desct->db = seg_desc->d;
+ kvm_desct->s = seg_desc->s;
+ kvm_desct->l = seg_desc->l;
+ kvm_desct->g = seg_desc->g;
+ kvm_desct->avl = seg_desc->avl;
+ if (!selector)
+ kvm_desct->unusable = 1;
+ else
+ kvm_desct->unusable = 0;
+ kvm_desct->padding = 0;
+}
+
+static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
+ u16 selector,
+ struct descriptor_table *dtable)
+{
+ if (selector & 1 << 2) {
+ struct kvm_segment kvm_seg;
+
+ get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
+
+ if (kvm_seg.unusable)
+ dtable->limit = 0;
+ else
+ dtable->limit = kvm_seg.limit;
+ dtable->base = kvm_seg.base;
+ }
+ else
+ kvm_x86_ops->get_gdt(vcpu, dtable);
+}
+
+/* allowed just for 8 bytes segments */
+static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+ struct desc_struct *seg_desc)
+{
+ struct descriptor_table dtable;
+ u16 index = selector >> 3;
+
+ get_segment_descritptor_dtable(vcpu, selector, &dtable);
+
+ if (dtable.limit < index * 8 + 7) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
+ return 1;
+ }
+ return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8);
+}
+
+/* allowed just for 8 bytes segments */
+static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+ struct desc_struct *seg_desc)
+{
+ struct descriptor_table dtable;
+ u16 index = selector >> 3;
+
+ get_segment_descritptor_dtable(vcpu, selector, &dtable);
+
+ if (dtable.limit < index * 8 + 7)
+ return 1;
+ return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8);
+}
+
+static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
+ struct desc_struct *seg_desc)
+{
+ u32 base_addr;
+
+ base_addr = seg_desc->base0;
+ base_addr |= (seg_desc->base1 << 16);
+ base_addr |= (seg_desc->base2 << 24);
+
+ return base_addr;
+}
+
+static int load_tss_segment32(struct kvm_vcpu *vcpu,
+ struct desc_struct *seg_desc,
+ struct tss_segment_32 *tss)
+{
+ u32 base_addr;
+
+ base_addr = get_tss_base_addr(vcpu, seg_desc);
+
+ return kvm_read_guest(vcpu->kvm, base_addr, tss,
+ sizeof(struct tss_segment_32));
+}
+
+static int save_tss_segment32(struct kvm_vcpu *vcpu,
+ struct desc_struct *seg_desc,
+ struct tss_segment_32 *tss)
+{
+ u32 base_addr;
+
+ base_addr = get_tss_base_addr(vcpu, seg_desc);
+
+ return kvm_write_guest(vcpu->kvm, base_addr, tss,
+ sizeof(struct tss_segment_32));
+}
+
+static int load_tss_segment16(struct kvm_vcpu *vcpu,
+ struct desc_struct *seg_desc,
+ struct tss_segment_16 *tss)
+{
+ u32 base_addr;
+
+ base_addr = get_tss_base_addr(vcpu, seg_desc);
+
+ return kvm_read_guest(vcpu->kvm, base_addr, tss,
+ sizeof(struct tss_segment_16));
+}
+
+static int save_tss_segment16(struct kvm_vcpu *vcpu,
+ struct desc_struct *seg_desc,
+ struct tss_segment_16 *tss)
+{
+ u32 base_addr;
+
+ base_addr = get_tss_base_addr(vcpu, seg_desc);
+
+ return kvm_write_guest(vcpu->kvm, base_addr, tss,
+ sizeof(struct tss_segment_16));
+}
+
+static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
+{
+ struct kvm_segment kvm_seg;
+
+ get_segment(vcpu, &kvm_seg, seg);
+ return kvm_seg.selector;
+}
+
+static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
+ u16 selector,
+ struct kvm_segment *kvm_seg)
+{
+ struct desc_struct seg_desc;
+
+ if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
+ return 1;
+ seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
+ return 0;
+}
+
+static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
+ int type_bits, int seg)
+{
+ struct kvm_segment kvm_seg;
+
+ if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
+ return 1;
+ kvm_seg.type |= type_bits;
+
+ if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS &&
+ seg != VCPU_SREG_LDTR)
+ if (!kvm_seg.s)
+ kvm_seg.unusable = 1;
+
+ set_segment(vcpu, &kvm_seg, seg);
+ return 0;
+}
+
+static void save_state_to_tss32(struct kvm_vcpu *vcpu,
+ struct tss_segment_32 *tss)
+{
+ tss->cr3 = vcpu->arch.cr3;
+ tss->eip = vcpu->arch.rip;
+ tss->eflags = kvm_x86_ops->get_rflags(vcpu);
+ tss->eax = vcpu->arch.regs[VCPU_REGS_RAX];
+ tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX];
+ tss->edx = vcpu->arch.regs[VCPU_REGS_RDX];
+ tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX];
+ tss->esp = vcpu->arch.regs[VCPU_REGS_RSP];
+ tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP];
+ tss->esi = vcpu->arch.regs[VCPU_REGS_RSI];
+ tss->edi = vcpu->arch.regs[VCPU_REGS_RDI];
+
+ tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
+ tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
+ tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
+ tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
+ tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
+}
+
+static int load_state_from_tss32(struct kvm_vcpu *vcpu,
+ struct tss_segment_32 *tss)
+{
+ kvm_set_cr3(vcpu, tss->cr3);
+
+ vcpu->arch.rip = tss->eip;
+ kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
+
+ vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax;
+ vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx;
+ vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx;
+ vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx;
+ vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp;
+ vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp;
+ vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi;
+ vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi;
+
+ if (load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
+ return 1;
+
+ if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
+ return 1;
+
+ if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
+ return 1;
+
+ if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
+ return 1;
+
+ if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
+ return 1;
+
+ if (load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
+ return 1;
+
+ if (load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
+ return 1;
+ return 0;
+}
+
+static void save_state_to_tss16(struct kvm_vcpu *vcpu,
+ struct tss_segment_16 *tss)
+{
+ tss->ip = vcpu->arch.rip;
+ tss->flag = kvm_x86_ops->get_rflags(vcpu);
+ tss->ax = vcpu->arch.regs[VCPU_REGS_RAX];
+ tss->cx = vcpu->arch.regs[VCPU_REGS_RCX];
+ tss->dx = vcpu->arch.regs[VCPU_REGS_RDX];
+ tss->bx = vcpu->arch.regs[VCPU_REGS_RBX];
+ tss->sp = vcpu->arch.regs[VCPU_REGS_RSP];
+ tss->bp = vcpu->arch.regs[VCPU_REGS_RBP];
+ tss->si = vcpu->arch.regs[VCPU_REGS_RSI];
+ tss->di = vcpu->arch.regs[VCPU_REGS_RDI];
+
+ tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
+ tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
+ tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
+ tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
+ tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
+ tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
+}
+
+static int load_state_from_tss16(struct kvm_vcpu *vcpu,
+ struct tss_segment_16 *tss)
+{
+ vcpu->arch.rip = tss->ip;
+ kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
+ vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax;
+ vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx;
+ vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx;
+ vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx;
+ vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp;
+ vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp;
+ vcpu->arch.regs[VCPU_REGS_RSI] = tss->si;
+ vcpu->arch.regs[VCPU_REGS_RDI] = tss->di;
+
+ if (load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
+ return 1;
+
+ if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
+ return 1;
+
+ if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
+ return 1;
+
+ if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
+ return 1;
+
+ if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
+ return 1;
+ return 0;
+}
+
+int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
+ struct desc_struct *cseg_desc,
+ struct desc_struct *nseg_desc)
+{
+ struct tss_segment_16 tss_segment_16;
+ int ret = 0;
+
+ if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16))
+ goto out;
+
+ save_state_to_tss16(vcpu, &tss_segment_16);
+ save_tss_segment16(vcpu, cseg_desc, &tss_segment_16);
+
+ if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16))
+ goto out;
+ if (load_state_from_tss16(vcpu, &tss_segment_16))
+ goto out;
+
+ ret = 1;
+out:
+ return ret;
+}
+
+int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
+ struct desc_struct *cseg_desc,
+ struct desc_struct *nseg_desc)
+{
+ struct tss_segment_32 tss_segment_32;
+ int ret = 0;
+
+ if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32))
+ goto out;
+
+ save_state_to_tss32(vcpu, &tss_segment_32);
+ save_tss_segment32(vcpu, cseg_desc, &tss_segment_32);
+
+ if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32))
+ goto out;
+ if (load_state_from_tss32(vcpu, &tss_segment_32))
+ goto out;
+
+ ret = 1;
+out:
+ return ret;
}
+int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
+{
+ struct kvm_segment tr_seg;
+ struct desc_struct cseg_desc;
+ struct desc_struct nseg_desc;
+ int ret = 0;
+
+ get_segment(vcpu, &tr_seg, VCPU_SREG_TR);
+
+ if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
+ goto out;
+
+ if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc))
+ goto out;
+
+
+ if (reason != TASK_SWITCH_IRET) {
+ int cpl;
+
+ cpl = kvm_x86_ops->get_cpl(vcpu);
+ if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) {
+ kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
+ return 1;
+ }
+ }
+
+ if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) {
+ kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
+ return 1;
+ }
+
+ if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
+ cseg_desc.type &= ~(1 << 8); //clear the B flag
+ save_guest_segment_descriptor(vcpu, tr_seg.selector,
+ &cseg_desc);
+ }
+
+ if (reason == TASK_SWITCH_IRET) {
+ u32 eflags = kvm_x86_ops->get_rflags(vcpu);
+ kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
+ }
+
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+ kvm_x86_ops->cache_regs(vcpu);
+
+ if (nseg_desc.type & 8)
+ ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc,
+ &nseg_desc);
+ else
+ ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc,
+ &nseg_desc);
+
+ if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
+ u32 eflags = kvm_x86_ops->get_rflags(vcpu);
+ kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT);
+ }
+
+ if (reason != TASK_SWITCH_IRET) {
+ nseg_desc.type |= (1 << 8);
+ save_guest_segment_descriptor(vcpu, tss_selector,
+ &nseg_desc);
+ }
+
+ kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
+ seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
+ tr_seg.type = 11;
+ set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
+out:
+ kvm_x86_ops->decache_regs(vcpu);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_task_switch);
+
int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
@@ -2862,12 +3542,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
- set_cr8(vcpu, sregs->cr8);
+ kvm_set_cr8(vcpu, sregs->cr8);
mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer;
-#ifdef CONFIG_X86_64
kvm_x86_ops->set_efer(vcpu, sregs->efer);
-#endif
kvm_set_apic_base(vcpu, sregs->apic_base);
kvm_x86_ops->decache_cr4_guest_bits(vcpu);
@@ -3141,9 +3819,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
vcpu->arch.mmu.root_hpa = INVALID_PAGE;
if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
- vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE;
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
- vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED;
+ vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page) {
@@ -3175,7 +3853,9 @@ fail:
void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
{
kvm_free_lapic(vcpu);
+ down_read(&vcpu->kvm->slots_lock);
kvm_mmu_destroy(vcpu);
+ up_read(&vcpu->kvm->slots_lock);
free_page((unsigned long)vcpu->arch.pio_data);
}
@@ -3219,10 +3899,13 @@ static void kvm_free_vcpus(struct kvm *kvm)
void kvm_arch_destroy_vm(struct kvm *kvm)
{
+ kvm_free_pit(kvm);
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
kvm_free_physmem(kvm);
+ if (kvm->arch.apic_access_page)
+ put_page(kvm->arch.apic_access_page);
kfree(kvm);
}
@@ -3278,8 +3961,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE
- || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED;
+ return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
+ || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED;
}
static void vcpu_kick_intr(void *info)
@@ -3293,11 +3976,17 @@ static void vcpu_kick_intr(void *info)
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
{
int ipi_pcpu = vcpu->cpu;
+ int cpu = get_cpu();
if (waitqueue_active(&vcpu->wq)) {
wake_up_interruptible(&vcpu->wq);
++vcpu->stat.halt_wakeup;
}
- if (vcpu->guest_mode)
+ /*
+ * We may be called synchronously with irqs disabled in guest mode,
+ * So need not to call smp_call_function_single() in that case.
+ */
+ if (vcpu->guest_mode && vcpu->cpu != cpu)
smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0);
+ put_cpu();
}
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 79586003397a..2ca08386f993 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -65,6 +65,14 @@
#define MemAbs (1<<9) /* Memory operand is absolute displacement */
#define String (1<<10) /* String instruction (rep capable) */
#define Stack (1<<11) /* Stack instruction (push/pop) */
+#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
+#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
+#define GroupMask 0xff /* Group number stored in bits 0:7 */
+
+enum {
+ Group1_80, Group1_81, Group1_82, Group1_83,
+ Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
+};
static u16 opcode_table[256] = {
/* 0x00 - 0x07 */
@@ -123,14 +131,14 @@ static u16 opcode_table[256] = {
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
/* 0x80 - 0x87 */
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
- ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
+ Group | Group1_80, Group | Group1_81,
+ Group | Group1_82, Group | Group1_83,
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
/* 0x88 - 0x8F */
ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
- 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack,
+ 0, ModRM | DstReg, 0, Group | Group1A,
/* 0x90 - 0x9F */
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
@@ -164,16 +172,15 @@ static u16 opcode_table[256] = {
0, 0, 0, 0,
/* 0xF0 - 0xF7 */
0, 0, 0, 0,
- ImplicitOps, ImplicitOps,
- ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+ ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
/* 0xF8 - 0xFF */
ImplicitOps, 0, ImplicitOps, ImplicitOps,
- 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM
+ 0, 0, Group | Group4, Group | Group5,
};
static u16 twobyte_table[256] = {
/* 0x00 - 0x0F */
- 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0,
+ 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
/* 0x10 - 0x1F */
0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
@@ -229,6 +236,56 @@ static u16 twobyte_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
+static u16 group_table[] = {
+ [Group1_80*8] =
+ ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ [Group1_81*8] =
+ DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+ DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+ DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+ DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
+ [Group1_82*8] =
+ ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
+ [Group1_83*8] =
+ DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+ DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+ DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+ DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM,
+ [Group1A*8] =
+ DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
+ [Group3_Byte*8] =
+ ByteOp | SrcImm | DstMem | ModRM, 0,
+ ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+ 0, 0, 0, 0,
+ [Group3*8] =
+ DstMem | SrcImm | ModRM | SrcImm, 0,
+ DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+ 0, 0, 0, 0,
+ [Group4*8] =
+ ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+ 0, 0, 0, 0, 0, 0,
+ [Group5*8] =
+ DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0,
+ SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0,
+ [Group7*8] =
+ 0, 0, ModRM | SrcMem, ModRM | SrcMem,
+ SrcNone | ModRM | DstMem | Mov, 0,
+ SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
+};
+
+static u16 group2_table[] = {
+ [Group7*8] =
+ SrcNone | ModRM, 0, 0, 0,
+ SrcNone | ModRM | DstMem | Mov, 0,
+ SrcMem16 | ModRM | Mov, 0,
+};
+
/* EFLAGS bit definitions. */
#define EFLG_OF (1<<11)
#define EFLG_DF (1<<10)
@@ -317,7 +374,7 @@ static u16 twobyte_table[256] = {
#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
do { \
- unsigned long _tmp; \
+ unsigned long __tmp; \
switch ((_dst).bytes) { \
case 1: \
__asm__ __volatile__ ( \
@@ -325,7 +382,7 @@ static u16 twobyte_table[256] = {
_op"b %"_bx"3,%1; " \
_POST_EFLAGS("0", "4", "2") \
: "=m" (_eflags), "=m" ((_dst).val), \
- "=&r" (_tmp) \
+ "=&r" (__tmp) \
: _by ((_src).val), "i" (EFLAGS_MASK)); \
break; \
default: \
@@ -426,29 +483,40 @@ static u16 twobyte_table[256] = {
(_type)_x; \
})
+static inline unsigned long ad_mask(struct decode_cache *c)
+{
+ return (1UL << (c->ad_bytes << 3)) - 1;
+}
+
/* Access/update address held in a register, based on addressing mode. */
-#define address_mask(reg) \
- ((c->ad_bytes == sizeof(unsigned long)) ? \
- (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1)))
-#define register_address(base, reg) \
- ((base) + address_mask(reg))
-#define register_address_increment(reg, inc) \
- do { \
- /* signed type ensures sign extension to long */ \
- int _inc = (inc); \
- if (c->ad_bytes == sizeof(unsigned long)) \
- (reg) += _inc; \
- else \
- (reg) = ((reg) & \
- ~((1UL << (c->ad_bytes << 3)) - 1)) | \
- (((reg) + _inc) & \
- ((1UL << (c->ad_bytes << 3)) - 1)); \
- } while (0)
+static inline unsigned long
+address_mask(struct decode_cache *c, unsigned long reg)
+{
+ if (c->ad_bytes == sizeof(unsigned long))
+ return reg;
+ else
+ return reg & ad_mask(c);
+}
-#define JMP_REL(rel) \
- do { \
- register_address_increment(c->eip, rel); \
- } while (0)
+static inline unsigned long
+register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
+{
+ return base + address_mask(c, reg);
+}
+
+static inline void
+register_address_increment(struct decode_cache *c, unsigned long *reg, int inc)
+{
+ if (c->ad_bytes == sizeof(unsigned long))
+ *reg += inc;
+ else
+ *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c));
+}
+
+static inline void jmp_rel(struct decode_cache *c, int rel)
+{
+ register_address_increment(c, &c->eip, rel);
+}
static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
@@ -763,7 +831,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
struct decode_cache *c = &ctxt->decode;
int rc = 0;
int mode = ctxt->mode;
- int def_op_bytes, def_ad_bytes;
+ int def_op_bytes, def_ad_bytes, group;
/* Shadow copy of register state. Committed on successful emulation. */
@@ -864,12 +932,24 @@ done_prefixes:
c->b = insn_fetch(u8, 1, c->eip);
c->d = twobyte_table[c->b];
}
+ }
- /* Unrecognised? */
- if (c->d == 0) {
- DPRINTF("Cannot emulate %02x\n", c->b);
- return -1;
- }
+ if (c->d & Group) {
+ group = c->d & GroupMask;
+ c->modrm = insn_fetch(u8, 1, c->eip);
+ --c->eip;
+
+ group = (group << 3) + ((c->modrm >> 3) & 7);
+ if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
+ c->d = group2_table[group];
+ else
+ c->d = group_table[group];
+ }
+
+ /* Unrecognised? */
+ if (c->d == 0) {
+ DPRINTF("Cannot emulate %02x\n", c->b);
+ return -1;
}
if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
@@ -924,6 +1004,7 @@ done_prefixes:
*/
if ((c->d & ModRM) && c->modrm_mod == 3) {
c->src.type = OP_REG;
+ c->src.val = c->modrm_val;
break;
}
c->src.type = OP_MEM;
@@ -967,6 +1048,7 @@ done_prefixes:
case DstMem:
if ((c->d & ModRM) && c->modrm_mod == 3) {
c->dst.type = OP_REG;
+ c->dst.val = c->dst.orig_val = c->modrm_val;
break;
}
c->dst.type = OP_MEM;
@@ -984,8 +1066,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
c->dst.type = OP_MEM;
c->dst.bytes = c->op_bytes;
c->dst.val = c->src.val;
- register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes);
- c->dst.ptr = (void *) register_address(ctxt->ss_base,
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
+ c->dst.ptr = (void *) register_address(c, ctxt->ss_base,
c->regs[VCPU_REGS_RSP]);
}
@@ -995,13 +1077,13 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
int rc;
- rc = ops->read_std(register_address(ctxt->ss_base,
+ rc = ops->read_std(register_address(c, ctxt->ss_base,
c->regs[VCPU_REGS_RSP]),
&c->dst.val, c->dst.bytes, ctxt->vcpu);
if (rc != 0)
return rc;
- register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes);
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes);
return 0;
}
@@ -1043,26 +1125,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
switch (c->modrm_reg) {
case 0 ... 1: /* test */
- /*
- * Special case in Grp3: test has an immediate
- * source operand.
- */
- c->src.type = OP_IMM;
- c->src.ptr = (unsigned long *)c->eip;
- c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- if (c->src.bytes == 8)
- c->src.bytes = 4;
- switch (c->src.bytes) {
- case 1:
- c->src.val = insn_fetch(s8, 1, c->eip);
- break;
- case 2:
- c->src.val = insn_fetch(s16, 2, c->eip);
- break;
- case 4:
- c->src.val = insn_fetch(s32, 4, c->eip);
- break;
- }
emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
break;
case 2: /* not */
@@ -1076,7 +1138,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
rc = X86EMUL_UNHANDLEABLE;
break;
}
-done:
return rc;
}
@@ -1084,7 +1145,6 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops)
{
struct decode_cache *c = &ctxt->decode;
- int rc;
switch (c->modrm_reg) {
case 0: /* inc */
@@ -1094,36 +1154,11 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
emulate_1op("dec", c->dst, ctxt->eflags);
break;
case 4: /* jmp abs */
- if (c->b == 0xff)
- c->eip = c->dst.val;
- else {
- DPRINTF("Cannot emulate %02x\n", c->b);
- return X86EMUL_UNHANDLEABLE;
- }
+ c->eip = c->src.val;
break;
case 6: /* push */
-
- /* 64-bit mode: PUSH always pushes a 64-bit operand. */
-
- if (ctxt->mode == X86EMUL_MODE_PROT64) {
- c->dst.bytes = 8;
- rc = ops->read_std((unsigned long)c->dst.ptr,
- &c->dst.val, 8, ctxt->vcpu);
- if (rc != 0)
- return rc;
- }
- register_address_increment(c->regs[VCPU_REGS_RSP],
- -c->dst.bytes);
- rc = ops->write_emulated(register_address(ctxt->ss_base,
- c->regs[VCPU_REGS_RSP]), &c->dst.val,
- c->dst.bytes, ctxt->vcpu);
- if (rc != 0)
- return rc;
- c->dst.type = OP_NONE;
+ emulate_push(ctxt);
break;
- default:
- DPRINTF("Cannot emulate %02x\n", c->b);
- return X86EMUL_UNHANDLEABLE;
}
return 0;
}
@@ -1361,19 +1396,19 @@ special_insn:
c->dst.type = OP_MEM;
c->dst.bytes = c->op_bytes;
c->dst.val = c->src.val;
- register_address_increment(c->regs[VCPU_REGS_RSP],
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP],
-c->op_bytes);
c->dst.ptr = (void *) register_address(
- ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
+ c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]);
break;
case 0x58 ... 0x5f: /* pop reg */
pop_instruction:
- if ((rc = ops->read_std(register_address(ctxt->ss_base,
+ if ((rc = ops->read_std(register_address(c, ctxt->ss_base,
c->regs[VCPU_REGS_RSP]), c->dst.ptr,
c->op_bytes, ctxt->vcpu)) != 0)
goto done;
- register_address_increment(c->regs[VCPU_REGS_RSP],
+ register_address_increment(c, &c->regs[VCPU_REGS_RSP],
c->op_bytes);
c->dst.type = OP_NONE; /* Disable writeback. */
break;
@@ -1393,9 +1428,9 @@ special_insn:
1,
(c->d & ByteOp) ? 1 : c->op_bytes,
c->rep_prefix ?
- address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+ address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
(ctxt->eflags & EFLG_DF),
- register_address(ctxt->es_base,
+ register_address(c, ctxt->es_base,
c->regs[VCPU_REGS_RDI]),
c->rep_prefix,
c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1409,9 +1444,9 @@ special_insn:
0,
(c->d & ByteOp) ? 1 : c->op_bytes,
c->rep_prefix ?
- address_mask(c->regs[VCPU_REGS_RCX]) : 1,
+ address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
(ctxt->eflags & EFLG_DF),
- register_address(c->override_base ?
+ register_address(c, c->override_base ?
*c->override_base :
ctxt->ds_base,
c->regs[VCPU_REGS_RSI]),
@@ -1425,7 +1460,7 @@ special_insn:
int rel = insn_fetch(s8, 1, c->eip);
if (test_cc(c->b, ctxt->eflags))
- JMP_REL(rel);
+ jmp_rel(c, rel);
break;
}
case 0x80 ... 0x83: /* Grp1 */
@@ -1477,7 +1512,7 @@ special_insn:
case 0x88 ... 0x8b: /* mov */
goto mov;
case 0x8d: /* lea r16/r32, m */
- c->dst.val = c->modrm_val;
+ c->dst.val = c->modrm_ea;
break;
case 0x8f: /* pop (sole member of Grp1a) */
rc = emulate_grp1a(ctxt, ops);
@@ -1501,27 +1536,27 @@ special_insn:
case 0xa4 ... 0xa5: /* movs */
c->dst.type = OP_MEM;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)register_address(
+ c->dst.ptr = (unsigned long *)register_address(c,
ctxt->es_base,
c->regs[VCPU_REGS_RDI]);
- if ((rc = ops->read_emulated(register_address(
+ if ((rc = ops->read_emulated(register_address(c,
c->override_base ? *c->override_base :
ctxt->ds_base,
c->regs[VCPU_REGS_RSI]),
&c->dst.val,
c->dst.bytes, ctxt->vcpu)) != 0)
goto done;
- register_address_increment(c->regs[VCPU_REGS_RSI],
+ register_address_increment(c, &c->regs[VCPU_REGS_RSI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
: c->dst.bytes);
- register_address_increment(c->regs[VCPU_REGS_RDI],
+ register_address_increment(c, &c->regs[VCPU_REGS_RDI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
: c->dst.bytes);
break;
case 0xa6 ... 0xa7: /* cmps */
c->src.type = OP_NONE; /* Disable writeback. */
c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->src.ptr = (unsigned long *)register_address(
+ c->src.ptr = (unsigned long *)register_address(c,
c->override_base ? *c->override_base :
ctxt->ds_base,
c->regs[VCPU_REGS_RSI]);
@@ -1533,7 +1568,7 @@ special_insn:
c->dst.type = OP_NONE; /* Disable writeback. */
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)register_address(
+ c->dst.ptr = (unsigned long *)register_address(c,
ctxt->es_base,
c->regs[VCPU_REGS_RDI]);
if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
@@ -1546,10 +1581,10 @@ special_insn:
emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
- register_address_increment(c->regs[VCPU_REGS_RSI],
+ register_address_increment(c, &c->regs[VCPU_REGS_RSI],
(ctxt->eflags & EFLG_DF) ? -c->src.bytes
: c->src.bytes);
- register_address_increment(c->regs[VCPU_REGS_RDI],
+ register_address_increment(c, &c->regs[VCPU_REGS_RDI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
: c->dst.bytes);
@@ -1557,11 +1592,11 @@ special_insn:
case 0xaa ... 0xab: /* stos */
c->dst.type = OP_MEM;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.ptr = (unsigned long *)register_address(
+ c->dst.ptr = (unsigned long *)register_address(c,
ctxt->es_base,
c->regs[VCPU_REGS_RDI]);
c->dst.val = c->regs[VCPU_REGS_RAX];
- register_address_increment(c->regs[VCPU_REGS_RDI],
+ register_address_increment(c, &c->regs[VCPU_REGS_RDI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
: c->dst.bytes);
break;
@@ -1569,7 +1604,7 @@ special_insn:
c->dst.type = OP_REG;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
- if ((rc = ops->read_emulated(register_address(
+ if ((rc = ops->read_emulated(register_address(c,
c->override_base ? *c->override_base :
ctxt->ds_base,
c->regs[VCPU_REGS_RSI]),
@@ -1577,7 +1612,7 @@ special_insn:
c->dst.bytes,
ctxt->vcpu)) != 0)
goto done;
- register_address_increment(c->regs[VCPU_REGS_RSI],
+ register_address_increment(c, &c->regs[VCPU_REGS_RSI],
(ctxt->eflags & EFLG_DF) ? -c->dst.bytes
: c->dst.bytes);
break;
@@ -1616,14 +1651,14 @@ special_insn:
goto cannot_emulate;
}
c->src.val = (unsigned long) c->eip;
- JMP_REL(rel);
+ jmp_rel(c, rel);
c->op_bytes = c->ad_bytes;
emulate_push(ctxt);
break;
}
case 0xe9: /* jmp rel */
case 0xeb: /* jmp rel short */
- JMP_REL(c->src.val);
+ jmp_rel(c, c->src.val);
c->dst.type = OP_NONE; /* Disable writeback. */
break;
case 0xf4: /* hlt */
@@ -1690,6 +1725,8 @@ twobyte_insn:
goto done;
kvm_emulate_hypercall(ctxt->vcpu);
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
break;
case 2: /* lgdt */
rc = read_descriptor(ctxt, ops, c->src.ptr,
@@ -1697,6 +1734,8 @@ twobyte_insn:
if (rc)
goto done;
realmode_lgdt(ctxt->vcpu, size, address);
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
break;
case 3: /* lidt/vmmcall */
if (c->modrm_mod == 3 && c->modrm_rm == 1) {
@@ -1712,27 +1751,25 @@ twobyte_insn:
goto done;
realmode_lidt(ctxt->vcpu, size, address);
}
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
break;
case 4: /* smsw */
- if (c->modrm_mod != 3)
- goto cannot_emulate;
- *(u16 *)&c->regs[c->modrm_rm]
- = realmode_get_cr(ctxt->vcpu, 0);
+ c->dst.bytes = 2;
+ c->dst.val = realmode_get_cr(ctxt->vcpu, 0);
break;
case 6: /* lmsw */
- if (c->modrm_mod != 3)
- goto cannot_emulate;
- realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val,
- &ctxt->eflags);
+ realmode_lmsw(ctxt->vcpu, (u16)c->src.val,
+ &ctxt->eflags);
break;
case 7: /* invlpg*/
emulate_invlpg(ctxt->vcpu, memop);
+ /* Disable writeback. */
+ c->dst.type = OP_NONE;
break;
default:
goto cannot_emulate;
}
- /* Disable writeback. */
- c->dst.type = OP_NONE;
break;
case 0x06:
emulate_clts(ctxt->vcpu);
@@ -1823,7 +1860,7 @@ twobyte_insn:
goto cannot_emulate;
}
if (test_cc(c->b, ctxt->eflags))
- JMP_REL(rel);
+ jmp_rel(c, rel);
c->dst.type = OP_NONE;
break;
}
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 3335b4595efd..af65b2da3ba0 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -661,7 +661,7 @@ static int lguest_clockevent_set_next_event(unsigned long delta,
if (delta < LG_CLOCK_MIN_DELTA) {
if (printk_ratelimit())
printk(KERN_DEBUG "%s: small delta %lu ns\n",
- __FUNCTION__, delta);
+ __func__, delta);
return -ETIME;
}
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 25df1c1989fe..76f60f52a885 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -11,7 +11,7 @@ lib-y += memcpy_$(BITS).o
ifeq ($(CONFIG_X86_32),y)
lib-y += checksum_32.o
lib-y += strstr_32.o
- lib-y += bitops_32.o semaphore_32.o string_32.o
+ lib-y += semaphore_32.o string_32.o
lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
else
@@ -21,7 +21,6 @@ else
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
lib-y += thunk_64.o clear_page_64.o copy_page_64.o
- lib-y += bitops_64.o
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
endif
diff --git a/arch/x86/lib/bitops_32.c b/arch/x86/lib/bitops_32.c
deleted file mode 100644
index b65440459859..000000000000
--- a/arch/x86/lib/bitops_32.c
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <linux/bitops.h>
-#include <linux/module.h>
-
-/**
- * find_next_bit - find the next set bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The maximum size to search
- */
-int find_next_bit(const unsigned long *addr, int size, int offset)
-{
- const unsigned long *p = addr + (offset >> 5);
- int set = 0, bit = offset & 31, res;
-
- if (bit) {
- /*
- * Look for nonzero in the first 32 bits:
- */
- __asm__("bsfl %1,%0\n\t"
- "jne 1f\n\t"
- "movl $32, %0\n"
- "1:"
- : "=r" (set)
- : "r" (*p >> bit));
- if (set < (32 - bit))
- return set + offset;
- set = 32 - bit;
- p++;
- }
- /*
- * No set bit yet, search remaining full words for a bit
- */
- res = find_first_bit (p, size - 32 * (p - addr));
- return (offset + set + res);
-}
-EXPORT_SYMBOL(find_next_bit);
-
-/**
- * find_next_zero_bit - find the first zero bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The maximum size to search
- */
-int find_next_zero_bit(const unsigned long *addr, int size, int offset)
-{
- const unsigned long *p = addr + (offset >> 5);
- int set = 0, bit = offset & 31, res;
-
- if (bit) {
- /*
- * Look for zero in the first 32 bits.
- */
- __asm__("bsfl %1,%0\n\t"
- "jne 1f\n\t"
- "movl $32, %0\n"
- "1:"
- : "=r" (set)
- : "r" (~(*p >> bit)));
- if (set < (32 - bit))
- return set + offset;
- set = 32 - bit;
- p++;
- }
- /*
- * No zero yet, search remaining full bytes for a zero
- */
- res = find_first_zero_bit(p, size - 32 * (p - addr));
- return (offset + set + res);
-}
-EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c
deleted file mode 100644
index 0e8f491e6ccc..000000000000
--- a/arch/x86/lib/bitops_64.c
+++ /dev/null
@@ -1,175 +0,0 @@
-#include <linux/bitops.h>
-
-#undef find_first_zero_bit
-#undef find_next_zero_bit
-#undef find_first_bit
-#undef find_next_bit
-
-static inline long
-__find_first_zero_bit(const unsigned long * addr, unsigned long size)
-{
- long d0, d1, d2;
- long res;
-
- /*
- * We must test the size in words, not in bits, because
- * otherwise incoming sizes in the range -63..-1 will not run
- * any scasq instructions, and then the flags used by the je
- * instruction will have whatever random value was in place
- * before. Nobody should call us like that, but
- * find_next_zero_bit() does when offset and size are at the
- * same word and it fails to find a zero itself.
- */
- size += 63;
- size >>= 6;
- if (!size)
- return 0;
- asm volatile(
- " repe; scasq\n"
- " je 1f\n"
- " xorq -8(%%rdi),%%rax\n"
- " subq $8,%%rdi\n"
- " bsfq %%rax,%%rdx\n"
- "1: subq %[addr],%%rdi\n"
- " shlq $3,%%rdi\n"
- " addq %%rdi,%%rdx"
- :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
- :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL),
- [addr] "S" (addr) : "memory");
- /*
- * Any register would do for [addr] above, but GCC tends to
- * prefer rbx over rsi, even though rsi is readily available
- * and doesn't have to be saved.
- */
- return res;
-}
-
-/**
- * find_first_zero_bit - find the first zero bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit-number of the first zero bit, not the number of the byte
- * containing a bit.
- */
-long find_first_zero_bit(const unsigned long * addr, unsigned long size)
-{
- return __find_first_zero_bit (addr, size);
-}
-
-/**
- * find_next_zero_bit - find the next zero bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The maximum size to search
- */
-long find_next_zero_bit (const unsigned long * addr, long size, long offset)
-{
- const unsigned long * p = addr + (offset >> 6);
- unsigned long set = 0;
- unsigned long res, bit = offset&63;
-
- if (bit) {
- /*
- * Look for zero in first word
- */
- asm("bsfq %1,%0\n\t"
- "cmoveq %2,%0"
- : "=r" (set)
- : "r" (~(*p >> bit)), "r"(64L));
- if (set < (64 - bit))
- return set + offset;
- set = 64 - bit;
- p++;
- }
- /*
- * No zero yet, search remaining full words for a zero
- */
- res = __find_first_zero_bit (p, size - 64 * (p - addr));
-
- return (offset + set + res);
-}
-
-static inline long
-__find_first_bit(const unsigned long * addr, unsigned long size)
-{
- long d0, d1;
- long res;
-
- /*
- * We must test the size in words, not in bits, because
- * otherwise incoming sizes in the range -63..-1 will not run
- * any scasq instructions, and then the flags used by the jz
- * instruction will have whatever random value was in place
- * before. Nobody should call us like that, but
- * find_next_bit() does when offset and size are at the same
- * word and it fails to find a one itself.
- */
- size += 63;
- size >>= 6;
- if (!size)
- return 0;
- asm volatile(
- " repe; scasq\n"
- " jz 1f\n"
- " subq $8,%%rdi\n"
- " bsfq (%%rdi),%%rax\n"
- "1: subq %[addr],%%rdi\n"
- " shlq $3,%%rdi\n"
- " addq %%rdi,%%rax"
- :"=a" (res), "=&c" (d0), "=&D" (d1)
- :"0" (0ULL), "1" (size), "2" (addr),
- [addr] "r" (addr) : "memory");
- return res;
-}
-
-/**
- * find_first_bit - find the first set bit in a memory region
- * @addr: The address to start the search at
- * @size: The maximum size to search
- *
- * Returns the bit-number of the first set bit, not the number of the byte
- * containing a bit.
- */
-long find_first_bit(const unsigned long * addr, unsigned long size)
-{
- return __find_first_bit(addr,size);
-}
-
-/**
- * find_next_bit - find the first set bit in a memory region
- * @addr: The address to base the search on
- * @offset: The bitnumber to start searching at
- * @size: The maximum size to search
- */
-long find_next_bit(const unsigned long * addr, long size, long offset)
-{
- const unsigned long * p = addr + (offset >> 6);
- unsigned long set = 0, bit = offset & 63, res;
-
- if (bit) {
- /*
- * Look for nonzero in the first 64 bits:
- */
- asm("bsfq %1,%0\n\t"
- "cmoveq %2,%0\n\t"
- : "=r" (set)
- : "r" (*p >> bit), "r" (64L));
- if (set < (64 - bit))
- return set + offset;
- set = 64 - bit;
- p++;
- }
- /*
- * No set bit yet, search remaining full words for a bit
- */
- res = __find_first_bit (p, size - 64 * (p - addr));
- return (offset + set + res);
-}
-
-#include <linux/module.h>
-
-EXPORT_SYMBOL(find_next_bit);
-EXPORT_SYMBOL(find_first_bit);
-EXPORT_SYMBOL(find_first_zero_bit);
-EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index 37756b6fb329..5415a9d06f53 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -25,7 +25,7 @@ void *memmove(void *dest, const void *src, size_t n)
int d0, d1, d2;
if (dest < src) {
- memcpy(dest,src,n);
+ memcpy(dest, src, n);
} else {
__asm__ __volatile__(
"std\n\t"
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 80175e47b190..0a33909bf122 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -6,10 +6,10 @@
#include <linux/module.h>
#undef memmove
-void *memmove(void * dest,const void *src,size_t count)
+void *memmove(void *dest, const void *src, size_t count)
{
- if (dest < src) {
- return memcpy(dest,src,count);
+ if (dest < src) {
+ return memcpy(dest, src, count);
} else {
char *p = dest + count;
const char *s = src + count;
@@ -17,5 +17,5 @@ void *memmove(void * dest,const void *src,size_t count)
*--p = *--s;
}
return dest;
-}
+}
EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
index cc9b4a4450f3..c9f2d9ba8dd8 100644
--- a/arch/x86/lib/mmx_32.c
+++ b/arch/x86/lib/mmx_32.c
@@ -1,32 +1,30 @@
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/sched.h>
-#include <linux/hardirq.h>
-#include <linux/module.h>
-
-#include <asm/asm.h>
-#include <asm/i387.h>
-
-
/*
* MMX 3DNow! library helper functions
*
* To do:
- * We can use MMX just for prefetch in IRQ's. This may be a win.
+ * We can use MMX just for prefetch in IRQ's. This may be a win.
* (reported so on K6-III)
* We should use a better code neutral filler for the short jump
* leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
* We also want to clobber the filler register so we don't get any
- * register forwarding stalls on the filler.
+ * register forwarding stalls on the filler.
*
* Add *user handling. Checksums are not a win with MMX on any CPU
* tested so far for any MMX solution figured.
*
- * 22/09/2000 - Arjan van de Ven
- * Improved for non-egineering-sample Athlons
+ * 22/09/2000 - Arjan van de Ven
+ * Improved for non-egineering-sample Athlons
*
*/
-
+#include <linux/hardirq.h>
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+
+#include <asm/i387.h>
+#include <asm/asm.h>
+
void *_mmx_memcpy(void *to, const void *from, size_t len)
{
void *p;
@@ -51,12 +49,10 @@ void *_mmx_memcpy(void *to, const void *from, size_t len)
"3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
" jmp 2b\n"
".previous\n"
- _ASM_EXTABLE(1b,3b)
- : : "r" (from) );
-
-
- for(; i>5; i--)
- {
+ _ASM_EXTABLE(1b, 3b)
+ : : "r" (from));
+
+ for ( ; i > 5; i--) {
__asm__ __volatile__ (
"1: prefetch 320(%0)\n"
"2: movq (%0), %%mm0\n"
@@ -79,14 +75,14 @@ void *_mmx_memcpy(void *to, const void *from, size_t len)
"3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
" jmp 2b\n"
".previous\n"
- _ASM_EXTABLE(1b,3b)
- : : "r" (from), "r" (to) : "memory");
- from+=64;
- to+=64;
+ _ASM_EXTABLE(1b, 3b)
+ : : "r" (from), "r" (to) : "memory");
+
+ from += 64;
+ to += 64;
}
- for(; i>0; i--)
- {
+ for ( ; i > 0; i--) {
__asm__ __volatile__ (
" movq (%0), %%mm0\n"
" movq 8(%0), %%mm1\n"
@@ -104,17 +100,20 @@ void *_mmx_memcpy(void *to, const void *from, size_t len)
" movq %%mm1, 40(%1)\n"
" movq %%mm2, 48(%1)\n"
" movq %%mm3, 56(%1)\n"
- : : "r" (from), "r" (to) : "memory");
- from+=64;
- to+=64;
+ : : "r" (from), "r" (to) : "memory");
+
+ from += 64;
+ to += 64;
}
/*
- * Now do the tail of the block
+ * Now do the tail of the block:
*/
- __memcpy(to, from, len&63);
+ __memcpy(to, from, len & 63);
kernel_fpu_end();
+
return p;
}
+EXPORT_SYMBOL(_mmx_memcpy);
#ifdef CONFIG_MK7
@@ -128,13 +127,12 @@ static void fast_clear_page(void *page)
int i;
kernel_fpu_begin();
-
+
__asm__ __volatile__ (
" pxor %%mm0, %%mm0\n" : :
);
- for(i=0;i<4096/64;i++)
- {
+ for (i = 0; i < 4096/64; i++) {
__asm__ __volatile__ (
" movntq %%mm0, (%0)\n"
" movntq %%mm0, 8(%0)\n"
@@ -145,14 +143,15 @@ static void fast_clear_page(void *page)
" movntq %%mm0, 48(%0)\n"
" movntq %%mm0, 56(%0)\n"
: : "r" (page) : "memory");
- page+=64;
+ page += 64;
}
- /* since movntq is weakly-ordered, a "sfence" is needed to become
- * ordered again.
+
+ /*
+ * Since movntq is weakly-ordered, a "sfence" is needed to become
+ * ordered again:
*/
- __asm__ __volatile__ (
- " sfence \n" : :
- );
+ __asm__ __volatile__("sfence\n"::);
+
kernel_fpu_end();
}
@@ -162,10 +161,11 @@ static void fast_copy_page(void *to, void *from)
kernel_fpu_begin();
- /* maybe the prefetch stuff can go before the expensive fnsave...
+ /*
+ * maybe the prefetch stuff can go before the expensive fnsave...
* but that is for later. -AV
*/
- __asm__ __volatile__ (
+ __asm__ __volatile__(
"1: prefetch (%0)\n"
" prefetch 64(%0)\n"
" prefetch 128(%0)\n"
@@ -176,11 +176,9 @@ static void fast_copy_page(void *to, void *from)
"3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
" jmp 2b\n"
".previous\n"
- _ASM_EXTABLE(1b,3b)
- : : "r" (from) );
+ _ASM_EXTABLE(1b, 3b) : : "r" (from));
- for(i=0; i<(4096-320)/64; i++)
- {
+ for (i = 0; i < (4096-320)/64; i++) {
__asm__ __volatile__ (
"1: prefetch 320(%0)\n"
"2: movq (%0), %%mm0\n"
@@ -203,13 +201,13 @@ static void fast_copy_page(void *to, void *from)
"3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
" jmp 2b\n"
".previous\n"
- _ASM_EXTABLE(1b,3b)
- : : "r" (from), "r" (to) : "memory");
- from+=64;
- to+=64;
+ _ASM_EXTABLE(1b, 3b) : : "r" (from), "r" (to) : "memory");
+
+ from += 64;
+ to += 64;
}
- for(i=(4096-320)/64; i<4096/64; i++)
- {
+
+ for (i = (4096-320)/64; i < 4096/64; i++) {
__asm__ __volatile__ (
"2: movq (%0), %%mm0\n"
" movntq %%mm0, (%1)\n"
@@ -227,37 +225,34 @@ static void fast_copy_page(void *to, void *from)
" movntq %%mm6, 48(%1)\n"
" movq 56(%0), %%mm7\n"
" movntq %%mm7, 56(%1)\n"
- : : "r" (from), "r" (to) : "memory");
- from+=64;
- to+=64;
+ : : "r" (from), "r" (to) : "memory");
+ from += 64;
+ to += 64;
}
- /* since movntq is weakly-ordered, a "sfence" is needed to become
- * ordered again.
+ /*
+ * Since movntq is weakly-ordered, a "sfence" is needed to become
+ * ordered again:
*/
- __asm__ __volatile__ (
- " sfence \n" : :
- );
+ __asm__ __volatile__("sfence \n"::);
kernel_fpu_end();
}
-#else
+#else /* CONFIG_MK7 */
/*
* Generic MMX implementation without K7 specific streaming
*/
-
static void fast_clear_page(void *page)
{
int i;
-
+
kernel_fpu_begin();
-
+
__asm__ __volatile__ (
" pxor %%mm0, %%mm0\n" : :
);
- for(i=0;i<4096/128;i++)
- {
+ for (i = 0; i < 4096/128; i++) {
__asm__ __volatile__ (
" movq %%mm0, (%0)\n"
" movq %%mm0, 8(%0)\n"
@@ -275,8 +270,8 @@ static void fast_clear_page(void *page)
" movq %%mm0, 104(%0)\n"
" movq %%mm0, 112(%0)\n"
" movq %%mm0, 120(%0)\n"
- : : "r" (page) : "memory");
- page+=128;
+ : : "r" (page) : "memory");
+ page += 128;
}
kernel_fpu_end();
@@ -285,8 +280,7 @@ static void fast_clear_page(void *page)
static void fast_copy_page(void *to, void *from)
{
int i;
-
-
+
kernel_fpu_begin();
__asm__ __volatile__ (
@@ -300,11 +294,9 @@ static void fast_copy_page(void *to, void *from)
"3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */
" jmp 2b\n"
".previous\n"
- _ASM_EXTABLE(1b,3b)
- : : "r" (from) );
+ _ASM_EXTABLE(1b, 3b) : : "r" (from));
- for(i=0; i<4096/64; i++)
- {
+ for (i = 0; i < 4096/64; i++) {
__asm__ __volatile__ (
"1: prefetch 320(%0)\n"
"2: movq (%0), %%mm0\n"
@@ -327,60 +319,59 @@ static void fast_copy_page(void *to, void *from)
"3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */
" jmp 2b\n"
".previous\n"
- _ASM_EXTABLE(1b,3b)
- : : "r" (from), "r" (to) : "memory");
- from+=64;
- to+=64;
+ _ASM_EXTABLE(1b, 3b)
+ : : "r" (from), "r" (to) : "memory");
+
+ from += 64;
+ to += 64;
}
kernel_fpu_end();
}
-
-#endif
+#endif /* !CONFIG_MK7 */
/*
- * Favour MMX for page clear and copy.
+ * Favour MMX for page clear and copy:
*/
-
-static void slow_zero_page(void * page)
+static void slow_zero_page(void *page)
{
int d0, d1;
- __asm__ __volatile__( \
- "cld\n\t" \
- "rep ; stosl" \
- : "=&c" (d0), "=&D" (d1)
- :"a" (0),"1" (page),"0" (1024)
- :"memory");
+
+ __asm__ __volatile__(
+ "cld\n\t"
+ "rep ; stosl"
+
+ : "=&c" (d0), "=&D" (d1)
+ :"a" (0), "1" (page), "0" (1024)
+ :"memory");
}
-
-void mmx_clear_page(void * page)
+
+void mmx_clear_page(void *page)
{
- if(unlikely(in_interrupt()))
+ if (unlikely(in_interrupt()))
slow_zero_page(page);
else
fast_clear_page(page);
}
+EXPORT_SYMBOL(mmx_clear_page);
static void slow_copy_page(void *to, void *from)
{
int d0, d1, d2;
- __asm__ __volatile__( \
- "cld\n\t" \
- "rep ; movsl" \
- : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
- : "0" (1024),"1" ((long) to),"2" ((long) from) \
+
+ __asm__ __volatile__(
+ "cld\n\t"
+ "rep ; movsl"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (1024), "1" ((long) to), "2" ((long) from)
: "memory");
}
-
void mmx_copy_page(void *to, void *from)
{
- if(unlikely(in_interrupt()))
+ if (unlikely(in_interrupt()))
slow_copy_page(to, from);
else
fast_copy_page(to, from);
}
-
-EXPORT_SYMBOL(_mmx_memcpy);
-EXPORT_SYMBOL(mmx_clear_page);
EXPORT_SYMBOL(mmx_copy_page);
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
index 3899bd37fdf0..648fe4741782 100644
--- a/arch/x86/lib/semaphore_32.S
+++ b/arch/x86/lib/semaphore_32.S
@@ -30,89 +30,6 @@
* value or just clobbered..
*/
.section .sched.text, "ax"
-ENTRY(__down_failed)
- CFI_STARTPROC
- FRAME
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx,0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx,0
- call __down
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ecx
- popl %edx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE edx
- ENDFRAME
- ret
- CFI_ENDPROC
- ENDPROC(__down_failed)
-
-ENTRY(__down_failed_interruptible)
- CFI_STARTPROC
- FRAME
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx,0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx,0
- call __down_interruptible
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ecx
- popl %edx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE edx
- ENDFRAME
- ret
- CFI_ENDPROC
- ENDPROC(__down_failed_interruptible)
-
-ENTRY(__down_failed_trylock)
- CFI_STARTPROC
- FRAME
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx,0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx,0
- call __down_trylock
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ecx
- popl %edx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE edx
- ENDFRAME
- ret
- CFI_ENDPROC
- ENDPROC(__down_failed_trylock)
-
-ENTRY(__up_wakeup)
- CFI_STARTPROC
- FRAME
- pushl %edx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET edx,0
- pushl %ecx
- CFI_ADJUST_CFA_OFFSET 4
- CFI_REL_OFFSET ecx,0
- call __up
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE ecx
- popl %edx
- CFI_ADJUST_CFA_OFFSET -4
- CFI_RESTORE edx
- ENDFRAME
- ret
- CFI_ENDPROC
- ENDPROC(__up_wakeup)
/*
* rw spinlock fallbacks
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index c2c0504a3071..94972e7c094d 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -14,25 +14,25 @@
#include <linux/module.h>
#ifdef __HAVE_ARCH_STRCPY
-char *strcpy(char * dest,const char *src)
+char *strcpy(char *dest, const char *src)
{
int d0, d1, d2;
- asm volatile( "1:\tlodsb\n\t"
+ asm volatile("1:\tlodsb\n\t"
"stosb\n\t"
"testb %%al,%%al\n\t"
"jne 1b"
: "=&S" (d0), "=&D" (d1), "=&a" (d2)
- :"0" (src),"1" (dest) : "memory");
+ :"0" (src), "1" (dest) : "memory");
return dest;
}
EXPORT_SYMBOL(strcpy);
#endif
#ifdef __HAVE_ARCH_STRNCPY
-char *strncpy(char * dest,const char *src,size_t count)
+char *strncpy(char *dest, const char *src, size_t count)
{
int d0, d1, d2, d3;
- asm volatile( "1:\tdecl %2\n\t"
+ asm volatile("1:\tdecl %2\n\t"
"js 2f\n\t"
"lodsb\n\t"
"stosb\n\t"
@@ -42,17 +42,17 @@ char *strncpy(char * dest,const char *src,size_t count)
"stosb\n"
"2:"
: "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
- :"0" (src),"1" (dest),"2" (count) : "memory");
+ :"0" (src), "1" (dest), "2" (count) : "memory");
return dest;
}
EXPORT_SYMBOL(strncpy);
#endif
#ifdef __HAVE_ARCH_STRCAT
-char *strcat(char * dest,const char * src)
+char *strcat(char *dest, const char *src)
{
int d0, d1, d2, d3;
- asm volatile( "repne\n\t"
+ asm volatile("repne\n\t"
"scasb\n\t"
"decl %1\n"
"1:\tlodsb\n\t"
@@ -67,10 +67,10 @@ EXPORT_SYMBOL(strcat);
#endif
#ifdef __HAVE_ARCH_STRNCAT
-char *strncat(char * dest,const char * src,size_t count)
+char *strncat(char *dest, const char *src, size_t count)
{
int d0, d1, d2, d3;
- asm volatile( "repne\n\t"
+ asm volatile("repne\n\t"
"scasb\n\t"
"decl %1\n\t"
"movl %8,%3\n"
@@ -83,7 +83,7 @@ char *strncat(char * dest,const char * src,size_t count)
"2:\txorl %2,%2\n\t"
"stosb"
: "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
- : "0" (src),"1" (dest),"2" (0),"3" (0xffffffffu), "g" (count)
+ : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu), "g" (count)
: "memory");
return dest;
}
@@ -91,11 +91,11 @@ EXPORT_SYMBOL(strncat);
#endif
#ifdef __HAVE_ARCH_STRCMP
-int strcmp(const char * cs,const char * ct)
+int strcmp(const char *cs, const char *ct)
{
int d0, d1;
int res;
- asm volatile( "1:\tlodsb\n\t"
+ asm volatile("1:\tlodsb\n\t"
"scasb\n\t"
"jne 2f\n\t"
"testb %%al,%%al\n\t"
@@ -106,7 +106,7 @@ int strcmp(const char * cs,const char * ct)
"orb $1,%%al\n"
"3:"
:"=a" (res), "=&S" (d0), "=&D" (d1)
- :"1" (cs),"2" (ct)
+ :"1" (cs), "2" (ct)
:"memory");
return res;
}
@@ -114,11 +114,11 @@ EXPORT_SYMBOL(strcmp);
#endif
#ifdef __HAVE_ARCH_STRNCMP
-int strncmp(const char * cs,const char * ct,size_t count)
+int strncmp(const char *cs, const char *ct, size_t count)
{
int res;
int d0, d1, d2;
- asm volatile( "1:\tdecl %3\n\t"
+ asm volatile("1:\tdecl %3\n\t"
"js 2f\n\t"
"lodsb\n\t"
"scasb\n\t"
@@ -131,7 +131,7 @@ int strncmp(const char * cs,const char * ct,size_t count)
"orb $1,%%al\n"
"4:"
:"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
- :"1" (cs),"2" (ct),"3" (count)
+ :"1" (cs), "2" (ct), "3" (count)
:"memory");
return res;
}
@@ -139,11 +139,11 @@ EXPORT_SYMBOL(strncmp);
#endif
#ifdef __HAVE_ARCH_STRCHR
-char *strchr(const char * s, int c)
+char *strchr(const char *s, int c)
{
int d0;
- char * res;
- asm volatile( "movb %%al,%%ah\n"
+ char *res;
+ asm volatile("movb %%al,%%ah\n"
"1:\tlodsb\n\t"
"cmpb %%ah,%%al\n\t"
"je 2f\n\t"
@@ -153,7 +153,7 @@ char *strchr(const char * s, int c)
"2:\tmovl %1,%0\n\t"
"decl %0"
:"=a" (res), "=&S" (d0)
- :"1" (s),"0" (c)
+ :"1" (s), "0" (c)
:"memory");
return res;
}
@@ -161,16 +161,16 @@ EXPORT_SYMBOL(strchr);
#endif
#ifdef __HAVE_ARCH_STRLEN
-size_t strlen(const char * s)
+size_t strlen(const char *s)
{
int d0;
int res;
- asm volatile( "repne\n\t"
+ asm volatile("repne\n\t"
"scasb\n\t"
"notl %0\n\t"
"decl %0"
:"=c" (res), "=&D" (d0)
- :"1" (s),"a" (0), "0" (0xffffffffu)
+ :"1" (s), "a" (0), "0" (0xffffffffu)
:"memory");
return res;
}
@@ -178,19 +178,19 @@ EXPORT_SYMBOL(strlen);
#endif
#ifdef __HAVE_ARCH_MEMCHR
-void *memchr(const void *cs,int c,size_t count)
+void *memchr(const void *cs, int c, size_t count)
{
int d0;
void *res;
if (!count)
return NULL;
- asm volatile( "repne\n\t"
+ asm volatile("repne\n\t"
"scasb\n\t"
"je 1f\n\t"
"movl $1,%0\n"
"1:\tdecl %0"
:"=D" (res), "=&c" (d0)
- :"a" (c),"0" (cs),"1" (count)
+ :"a" (c), "0" (cs), "1" (count)
:"memory");
return res;
}
@@ -198,7 +198,7 @@ EXPORT_SYMBOL(memchr);
#endif
#ifdef __HAVE_ARCH_MEMSCAN
-void *memscan(void * addr, int c, size_t size)
+void *memscan(void *addr, int c, size_t size)
{
if (!size)
return addr;
@@ -219,7 +219,7 @@ size_t strnlen(const char *s, size_t count)
{
int d0;
int res;
- asm volatile( "movl %2,%0\n\t"
+ asm volatile("movl %2,%0\n\t"
"jmp 2f\n"
"1:\tcmpb $0,(%0)\n\t"
"je 3f\n\t"
@@ -229,7 +229,7 @@ size_t strnlen(const char *s, size_t count)
"jne 1b\n"
"3:\tsubl %2,%0"
:"=a" (res), "=&d" (d0)
- :"c" (s),"1" (count)
+ :"c" (s), "1" (count)
:"memory");
return res;
}
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
index a3dafbf59dae..42e8a50303f3 100644
--- a/arch/x86/lib/strstr_32.c
+++ b/arch/x86/lib/strstr_32.c
@@ -1,9 +1,9 @@
#include <linux/string.h>
-char * strstr(const char * cs,const char * ct)
+char *strstr(const char *cs, const char *ct)
{
int d0, d1;
-register char * __res;
+register char *__res;
__asm__ __volatile__(
"movl %6,%%edi\n\t"
"repne\n\t"
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index 8b92d428ab02..e009251d4e9f 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -41,11 +41,6 @@
thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
#endif
- thunk __down_failed,__down
- thunk_retrax __down_failed_interruptible,__down_interruptible
- thunk_retrax __down_failed_trylock,__down_trylock
- thunk __up_wakeup,__up
-
#ifdef CONFIG_TRACE_IRQFLAGS
thunk trace_hardirqs_on_thunk,trace_hardirqs_on
thunk trace_hardirqs_off_thunk,trace_hardirqs_off
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index e849b9998b0e..24e60944971a 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -1,4 +1,4 @@
-/*
+/*
* User address space access functions.
* The non inlined parts of asm-i386/uaccess.h are here.
*
@@ -22,14 +22,14 @@ static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned lon
#endif
return 1;
}
-#define movsl_is_ok(a1,a2,n) \
- __movsl_is_ok((unsigned long)(a1),(unsigned long)(a2),(n))
+#define movsl_is_ok(a1, a2, n) \
+ __movsl_is_ok((unsigned long)(a1), (unsigned long)(a2), (n))
/*
* Copy a null terminated string from userspace.
*/
-#define __do_strncpy_from_user(dst,src,count,res) \
+#define __do_strncpy_from_user(dst, src, count, res) \
do { \
int __d0, __d1, __d2; \
might_sleep(); \
@@ -61,7 +61,7 @@ do { \
* least @count bytes long.
* @src: Source address, in user space.
* @count: Maximum number of bytes to copy, including the trailing NUL.
- *
+ *
* Copies a NUL-terminated string from userspace to kernel space.
* Caller must check the specified block with access_ok() before calling
* this function.
@@ -90,7 +90,7 @@ EXPORT_SYMBOL(__strncpy_from_user);
* least @count bytes long.
* @src: Source address, in user space.
* @count: Maximum number of bytes to copy, including the trailing NUL.
- *
+ *
* Copies a NUL-terminated string from userspace to kernel space.
*
* On success, returns the length of the string (not including the trailing
@@ -120,7 +120,7 @@ EXPORT_SYMBOL(strncpy_from_user);
do { \
int __d0; \
might_sleep(); \
- __asm__ __volatile__( \
+ __asm__ __volatile__( \
"0: rep; stosl\n" \
" movl %2,%0\n" \
"1: rep; stosb\n" \
@@ -333,17 +333,17 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
__asm__ __volatile__(
" .align 2,0x90\n"
"0: movl 32(%4), %%eax\n"
- " cmpl $67, %0\n"
- " jbe 2f\n"
+ " cmpl $67, %0\n"
+ " jbe 2f\n"
"1: movl 64(%4), %%eax\n"
- " .align 2,0x90\n"
- "2: movl 0(%4), %%eax\n"
- "21: movl 4(%4), %%edx\n"
- " movl %%eax, 0(%3)\n"
- " movl %%edx, 4(%3)\n"
- "3: movl 8(%4), %%eax\n"
- "31: movl 12(%4),%%edx\n"
- " movl %%eax, 8(%3)\n"
+ " .align 2,0x90\n"
+ "2: movl 0(%4), %%eax\n"
+ "21: movl 4(%4), %%edx\n"
+ " movl %%eax, 0(%3)\n"
+ " movl %%edx, 4(%3)\n"
+ "3: movl 8(%4), %%eax\n"
+ "31: movl 12(%4),%%edx\n"
+ " movl %%eax, 8(%3)\n"
" movl %%edx, 12(%3)\n"
"4: movl 16(%4), %%eax\n"
"41: movl 20(%4), %%edx\n"
@@ -369,38 +369,38 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
"91: movl 60(%4), %%edx\n"
" movl %%eax, 56(%3)\n"
" movl %%edx, 60(%3)\n"
- " addl $-64, %0\n"
- " addl $64, %4\n"
- " addl $64, %3\n"
- " cmpl $63, %0\n"
- " ja 0b\n"
- "5: movl %0, %%eax\n"
- " shrl $2, %0\n"
- " andl $3, %%eax\n"
- " cld\n"
- "6: rep; movsl\n"
+ " addl $-64, %0\n"
+ " addl $64, %4\n"
+ " addl $64, %3\n"
+ " cmpl $63, %0\n"
+ " ja 0b\n"
+ "5: movl %0, %%eax\n"
+ " shrl $2, %0\n"
+ " andl $3, %%eax\n"
+ " cld\n"
+ "6: rep; movsl\n"
" movl %%eax,%0\n"
- "7: rep; movsb\n"
- "8:\n"
+ "7: rep; movsb\n"
+ "8:\n"
".section .fixup,\"ax\"\n"
- "9: lea 0(%%eax,%0,4),%0\n"
- "16: pushl %0\n"
- " pushl %%eax\n"
+ "9: lea 0(%%eax,%0,4),%0\n"
+ "16: pushl %0\n"
+ " pushl %%eax\n"
" xorl %%eax,%%eax\n"
- " rep; stosb\n"
- " popl %%eax\n"
- " popl %0\n"
- " jmp 8b\n"
- ".previous\n"
+ " rep; stosb\n"
+ " popl %%eax\n"
+ " popl %0\n"
+ " jmp 8b\n"
+ ".previous\n"
".section __ex_table,\"a\"\n"
- " .align 4\n"
- " .long 0b,16b\n"
+ " .align 4\n"
+ " .long 0b,16b\n"
" .long 1b,16b\n"
" .long 2b,16b\n"
" .long 21b,16b\n"
- " .long 3b,16b\n"
+ " .long 3b,16b\n"
" .long 31b,16b\n"
- " .long 4b,16b\n"
+ " .long 4b,16b\n"
" .long 41b,16b\n"
" .long 10b,16b\n"
" .long 51b,16b\n"
@@ -412,9 +412,9 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
" .long 81b,16b\n"
" .long 14b,16b\n"
" .long 91b,16b\n"
- " .long 6b,9b\n"
- " .long 7b,16b\n"
- ".previous"
+ " .long 6b,9b\n"
+ " .long 7b,16b\n"
+ ".previous"
: "=&c"(size), "=&D" (d0), "=&S" (d1)
: "1"(to), "2"(from), "0"(size)
: "eax", "edx", "memory");
@@ -429,7 +429,7 @@ __copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
static unsigned long __copy_user_zeroing_intel_nocache(void *to,
const void __user *from, unsigned long size)
{
- int d0, d1;
+ int d0, d1;
__asm__ __volatile__(
" .align 2,0x90\n"
@@ -526,7 +526,7 @@ static unsigned long __copy_user_zeroing_intel_nocache(void *to,
static unsigned long __copy_user_intel_nocache(void *to,
const void __user *from, unsigned long size)
{
- int d0, d1;
+ int d0, d1;
__asm__ __volatile__(
" .align 2,0x90\n"
@@ -629,7 +629,7 @@ unsigned long __copy_user_zeroing_intel_nocache(void *to,
#endif /* CONFIG_X86_INTEL_USERCOPY */
/* Generic arbitrary sized copy. */
-#define __copy_user(to,from,size) \
+#define __copy_user(to, from, size) \
do { \
int __d0, __d1, __d2; \
__asm__ __volatile__( \
@@ -665,7 +665,7 @@ do { \
: "memory"); \
} while (0)
-#define __copy_user_zeroing(to,from,size) \
+#define __copy_user_zeroing(to, from, size) \
do { \
int __d0, __d1, __d2; \
__asm__ __volatile__( \
@@ -712,7 +712,7 @@ unsigned long __copy_to_user_ll(void __user *to, const void *from,
{
#ifndef CONFIG_X86_WP_WORKS_OK
if (unlikely(boot_cpu_data.wp_works_ok == 0) &&
- ((unsigned long )to) < TASK_SIZE) {
+ ((unsigned long)to) < TASK_SIZE) {
/*
* When we are in an atomic section (see
* mm/filemap.c:file_read_actor), return the full
@@ -721,26 +721,26 @@ unsigned long __copy_to_user_ll(void __user *to, const void *from,
if (in_atomic())
return n;
- /*
+ /*
* CPU does not honor the WP bit when writing
* from supervisory mode, and due to preemption or SMP,
* the page tables can change at any time.
* Do it manually. Manfred <manfred@colorfullife.com>
*/
while (n) {
- unsigned long offset = ((unsigned long)to)%PAGE_SIZE;
+ unsigned long offset = ((unsigned long)to)%PAGE_SIZE;
unsigned long len = PAGE_SIZE - offset;
int retval;
struct page *pg;
void *maddr;
-
+
if (len > n)
len = n;
survive:
down_read(&current->mm->mmap_sem);
retval = get_user_pages(current, current->mm,
- (unsigned long )to, 1, 1, 0, &pg, NULL);
+ (unsigned long)to, 1, 1, 0, &pg, NULL);
if (retval == -ENOMEM && is_global_init(current)) {
up_read(&current->mm->mmap_sem);
@@ -750,8 +750,8 @@ survive:
if (retval != 1) {
up_read(&current->mm->mmap_sem);
- break;
- }
+ break;
+ }
maddr = kmap_atomic(pg, KM_USER0);
memcpy(maddr + offset, from, len);
@@ -802,12 +802,12 @@ unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
unsigned long n)
{
#ifdef CONFIG_X86_INTEL_USERCOPY
- if ( n > 64 && cpu_has_xmm2)
- n = __copy_user_zeroing_intel_nocache(to, from, n);
+ if (n > 64 && cpu_has_xmm2)
+ n = __copy_user_zeroing_intel_nocache(to, from, n);
else
__copy_user_zeroing(to, from, n);
#else
- __copy_user_zeroing(to, from, n);
+ __copy_user_zeroing(to, from, n);
#endif
return n;
}
@@ -817,12 +817,12 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr
unsigned long n)
{
#ifdef CONFIG_X86_INTEL_USERCOPY
- if ( n > 64 && cpu_has_xmm2)
- n = __copy_user_intel_nocache(to, from, n);
+ if (n > 64 && cpu_has_xmm2)
+ n = __copy_user_intel_nocache(to, from, n);
else
__copy_user(to, from, n);
#else
- __copy_user(to, from, n);
+ __copy_user(to, from, n);
#endif
return n;
}
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index 292a225edabe..95fc463056d0 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -1,4 +1,4 @@
-/*
+/*
* APIC driver for "bigsmp" XAPIC machines with more than 8 virtual CPUs.
* Drives the local APIC in "clustered mode".
*/
@@ -32,26 +32,26 @@ static int hp_ht_bigsmp(const struct dmi_system_id *d)
static const struct dmi_system_id bigsmp_dmi_table[] = {
- { hp_ht_bigsmp, "HP ProLiant DL760 G2", {
- DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
- DMI_MATCH(DMI_BIOS_VERSION, "P44-"),
- }},
-
- { hp_ht_bigsmp, "HP ProLiant DL740", {
- DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
- DMI_MATCH(DMI_BIOS_VERSION, "P47-"),
- }},
+ { hp_ht_bigsmp, "HP ProLiant DL760 G2",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+ DMI_MATCH(DMI_BIOS_VERSION, "P44-"),}
+ },
+
+ { hp_ht_bigsmp, "HP ProLiant DL740",
+ { DMI_MATCH(DMI_BIOS_VENDOR, "HP"),
+ DMI_MATCH(DMI_BIOS_VERSION, "P47-"),}
+ },
{ }
};
static int probe_bigsmp(void)
-{
+{
if (def_to_bigsmp)
- dmi_bigsmp = 1;
+ dmi_bigsmp = 1;
else
dmi_check_system(bigsmp_dmi_table);
- return dmi_bigsmp;
-}
+ return dmi_bigsmp;
+}
-struct genapic apic_bigsmp = APIC_INIT("bigsmp", probe_bigsmp);
+struct genapic apic_bigsmp = APIC_INIT("bigsmp", probe_bigsmp);
diff --git a/arch/x86/mach-generic/default.c b/arch/x86/mach-generic/default.c
index 1af0cc7648f0..9e835a11a13a 100644
--- a/arch/x86/mach-generic/default.c
+++ b/arch/x86/mach-generic/default.c
@@ -1,4 +1,4 @@
-/*
+/*
* Default generic APIC driver. This handles up to 8 CPUs.
*/
#define APIC_DEFINITION 1
@@ -19,8 +19,8 @@
/* should be called last. */
static int probe_default(void)
-{
+{
return 1;
-}
+}
-struct genapic apic_default = APIC_INIT("default", probe_default);
+struct genapic apic_default = APIC_INIT("default", probe_default);
diff --git a/arch/x86/mach-generic/probe.c b/arch/x86/mach-generic/probe.c
index f410d3cb5659..c5ae751b994a 100644
--- a/arch/x86/mach-generic/probe.c
+++ b/arch/x86/mach-generic/probe.c
@@ -1,8 +1,9 @@
-/* Copyright 2003 Andi Kleen, SuSE Labs.
- * Subject to the GNU Public License, v.2
- *
+/*
+ * Copyright 2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License, v.2
+ *
* Generic x86 APIC driver probe layer.
- */
+ */
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/string.h>
@@ -24,7 +25,7 @@ struct genapic *genapic = &apic_default;
static struct genapic *apic_probe[] __initdata = {
&apic_summit,
- &apic_bigsmp,
+ &apic_bigsmp,
&apic_es7000,
&apic_default, /* must be last */
NULL,
@@ -69,7 +70,7 @@ void __init generic_bigsmp_probe(void)
}
void __init generic_apic_probe(void)
-{
+{
if (!cmdline_apic) {
int i;
for (i = 0; apic_probe[i]; i++) {
@@ -83,40 +84,40 @@ void __init generic_apic_probe(void)
panic("Didn't find an APIC driver");
}
printk(KERN_INFO "Using APIC driver %s\n", genapic->name);
-}
+}
/* These functions can switch the APIC even after the initial ->probe() */
int __init mps_oem_check(struct mp_config_table *mpc, char *oem, char *productid)
-{
+{
int i;
- for (i = 0; apic_probe[i]; ++i) {
- if (apic_probe[i]->mps_oem_check(mpc,oem,productid)) {
+ for (i = 0; apic_probe[i]; ++i) {
+ if (apic_probe[i]->mps_oem_check(mpc, oem, productid)) {
if (!cmdline_apic) {
genapic = apic_probe[i];
printk(KERN_INFO "Switched to APIC driver `%s'.\n",
genapic->name);
}
return 1;
- }
- }
+ }
+ }
return 0;
-}
+}
int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
int i;
- for (i = 0; apic_probe[i]; ++i) {
- if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
+ for (i = 0; apic_probe[i]; ++i) {
+ if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
if (!cmdline_apic) {
genapic = apic_probe[i];
printk(KERN_INFO "Switched to APIC driver `%s'.\n",
genapic->name);
}
return 1;
- }
- }
- return 0;
+ }
+ }
+ return 0;
}
int hard_smp_processor_id(void)
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
index 74883ccb8f73..a97ea0f35b1e 100644
--- a/arch/x86/mach-generic/summit.c
+++ b/arch/x86/mach-generic/summit.c
@@ -1,4 +1,4 @@
-/*
+/*
* APIC driver for the IBM "Summit" chipset.
*/
#define APIC_DEFINITION 1
@@ -19,9 +19,9 @@
#include <asm/mach-summit/mach_mpparse.h>
static int probe_summit(void)
-{
+{
/* probed later in mptable/ACPI hooks */
return 0;
-}
+}
-struct genapic apic_summit = APIC_INIT("summit", probe_summit);
+struct genapic apic_summit = APIC_INIT("summit", probe_summit);
diff --git a/arch/x86/mach-rdc321x/Makefile b/arch/x86/mach-rdc321x/Makefile
index 1faac8125e3d..8325b4ca431c 100644
--- a/arch/x86/mach-rdc321x/Makefile
+++ b/arch/x86/mach-rdc321x/Makefile
@@ -1,5 +1,5 @@
#
# Makefile for the RDC321x specific parts of the kernel
#
-obj-$(CONFIG_X86_RDC321X) := gpio.o platform.o wdt.o
+obj-$(CONFIG_X86_RDC321X) := gpio.o platform.o
diff --git a/arch/x86/mach-rdc321x/wdt.c b/arch/x86/mach-rdc321x/wdt.c
deleted file mode 100644
index ec5625ae7061..000000000000
--- a/arch/x86/mach-rdc321x/wdt.c
+++ /dev/null
@@ -1,275 +0,0 @@
-/*
- * RDC321x watchdog driver
- *
- * Copyright (C) 2007 Florian Fainelli <florian@openwrt.org>
- *
- * This driver is highly inspired from the cpu5_wdt driver
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- */
-
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/miscdevice.h>
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/ioport.h>
-#include <linux/timer.h>
-#include <linux/completion.h>
-#include <linux/jiffies.h>
-#include <linux/platform_device.h>
-#include <linux/watchdog.h>
-#include <linux/io.h>
-#include <linux/uaccess.h>
-
-#include <asm/mach-rdc321x/rdc321x_defs.h>
-
-#define RDC_WDT_MASK 0x80000000 /* Mask */
-#define RDC_WDT_EN 0x00800000 /* Enable bit */
-#define RDC_WDT_WTI 0x00200000 /* Generate CPU reset/NMI/WDT on timeout */
-#define RDC_WDT_RST 0x00100000 /* Reset bit */
-#define RDC_WDT_WIF 0x00040000 /* WDT IRQ Flag */
-#define RDC_WDT_IRT 0x00000100 /* IRQ Routing table */
-#define RDC_WDT_CNT 0x00000001 /* WDT count */
-
-#define RDC_CLS_TMR 0x80003844 /* Clear timer */
-
-#define RDC_WDT_INTERVAL (HZ/10+1)
-
-int nowayout = WATCHDOG_NOWAYOUT;
-module_param(nowayout, int, 0);
-MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
-
-static int ticks = 1000;
-
-/* some device data */
-
-static struct {
- struct completion stop;
- volatile int running;
- struct timer_list timer;
- volatile int queue;
- int default_ticks;
- unsigned long inuse;
-} rdc321x_wdt_device;
-
-/* generic helper functions */
-
-static void rdc321x_wdt_trigger(unsigned long unused)
-{
- if (rdc321x_wdt_device.running)
- ticks--;
-
- /* keep watchdog alive */
- outl(RDC_WDT_EN|inl(RDC3210_CFGREG_DATA), RDC3210_CFGREG_DATA);
-
- /* requeue?? */
- if (rdc321x_wdt_device.queue && ticks)
- mod_timer(&rdc321x_wdt_device.timer,
- jiffies + RDC_WDT_INTERVAL);
- else {
- /* ticks doesn't matter anyway */
- complete(&rdc321x_wdt_device.stop);
- }
-
-}
-
-static void rdc321x_wdt_reset(void)
-{
- ticks = rdc321x_wdt_device.default_ticks;
-}
-
-static void rdc321x_wdt_start(void)
-{
- if (!rdc321x_wdt_device.queue) {
- rdc321x_wdt_device.queue = 1;
-
- /* Clear the timer */
- outl(RDC_CLS_TMR, RDC3210_CFGREG_ADDR);
-
- /* Enable watchdog and set the timeout to 81.92 us */
- outl(RDC_WDT_EN|RDC_WDT_CNT, RDC3210_CFGREG_DATA);
-
- mod_timer(&rdc321x_wdt_device.timer,
- jiffies + RDC_WDT_INTERVAL);
- }
-
- /* if process dies, counter is not decremented */
- rdc321x_wdt_device.running++;
-}
-
-static int rdc321x_wdt_stop(void)
-{
- if (rdc321x_wdt_device.running)
- rdc321x_wdt_device.running = 0;
-
- ticks = rdc321x_wdt_device.default_ticks;
-
- return -EIO;
-}
-
-/* filesystem operations */
-
-static int rdc321x_wdt_open(struct inode *inode, struct file *file)
-{
- if (test_and_set_bit(0, &rdc321x_wdt_device.inuse))
- return -EBUSY;
-
- return nonseekable_open(inode, file);
-}
-
-static int rdc321x_wdt_release(struct inode *inode, struct file *file)
-{
- clear_bit(0, &rdc321x_wdt_device.inuse);
- return 0;
-}
-
-static int rdc321x_wdt_ioctl(struct inode *inode, struct file *file,
- unsigned int cmd, unsigned long arg)
-{
- void __user *argp = (void __user *)arg;
- unsigned int value;
- static struct watchdog_info ident = {
- .options = WDIOF_CARDRESET,
- .identity = "RDC321x WDT",
- };
-
- switch (cmd) {
- case WDIOC_KEEPALIVE:
- rdc321x_wdt_reset();
- break;
- case WDIOC_GETSTATUS:
- /* Read the value from the DATA register */
- value = inl(RDC3210_CFGREG_DATA);
- if (copy_to_user(argp, &value, sizeof(int)))
- return -EFAULT;
- break;
- case WDIOC_GETSUPPORT:
- if (copy_to_user(argp, &ident, sizeof(ident)))
- return -EFAULT;
- break;
- case WDIOC_SETOPTIONS:
- if (copy_from_user(&value, argp, sizeof(int)))
- return -EFAULT;
- switch (value) {
- case WDIOS_ENABLECARD:
- rdc321x_wdt_start();
- break;
- case WDIOS_DISABLECARD:
- return rdc321x_wdt_stop();
- default:
- return -EINVAL;
- }
- break;
- default:
- return -ENOTTY;
- }
- return 0;
-}
-
-static ssize_t rdc321x_wdt_write(struct file *file, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- if (!count)
- return -EIO;
-
- rdc321x_wdt_reset();
-
- return count;
-}
-
-static const struct file_operations rdc321x_wdt_fops = {
- .owner = THIS_MODULE,
- .llseek = no_llseek,
- .ioctl = rdc321x_wdt_ioctl,
- .open = rdc321x_wdt_open,
- .write = rdc321x_wdt_write,
- .release = rdc321x_wdt_release,
-};
-
-static struct miscdevice rdc321x_wdt_misc = {
- .minor = WATCHDOG_MINOR,
- .name = "watchdog",
- .fops = &rdc321x_wdt_fops,
-};
-
-static int __devinit rdc321x_wdt_probe(struct platform_device *pdev)
-{
- int err;
-
- err = misc_register(&rdc321x_wdt_misc);
- if (err < 0) {
- printk(KERN_ERR PFX "watchdog misc_register failed\n");
- return err;
- }
-
- /* Reset the watchdog */
- outl(RDC_WDT_RST, RDC3210_CFGREG_DATA);
-
- init_completion(&rdc321x_wdt_device.stop);
- rdc321x_wdt_device.queue = 0;
-
- clear_bit(0, &rdc321x_wdt_device.inuse);
-
- setup_timer(&rdc321x_wdt_device.timer, rdc321x_wdt_trigger, 0);
-
- rdc321x_wdt_device.default_ticks = ticks;
-
- printk(KERN_INFO PFX "watchdog init success\n");
-
- return 0;
-}
-
-static int rdc321x_wdt_remove(struct platform_device *pdev)
-{
- if (rdc321x_wdt_device.queue) {
- rdc321x_wdt_device.queue = 0;
- wait_for_completion(&rdc321x_wdt_device.stop);
- }
-
- misc_deregister(&rdc321x_wdt_misc);
-
- return 0;
-}
-
-static struct platform_driver rdc321x_wdt_driver = {
- .probe = rdc321x_wdt_probe,
- .remove = rdc321x_wdt_remove,
- .driver = {
- .owner = THIS_MODULE,
- .name = "rdc321x-wdt",
- },
-};
-
-static int __init rdc321x_wdt_init(void)
-{
- return platform_driver_register(&rdc321x_wdt_driver);
-}
-
-static void __exit rdc321x_wdt_exit(void)
-{
- platform_driver_unregister(&rdc321x_wdt_driver);
-}
-
-module_init(rdc321x_wdt_init);
-module_exit(rdc321x_wdt_exit);
-
-MODULE_AUTHOR("Florian Fainelli <florian@openwrt.org>");
-MODULE_DESCRIPTION("RDC321x watchdog driver");
-MODULE_LICENSE("GPL");
-MODULE_ALIAS_MISCDEV(WATCHDOG_MINOR);
diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c
index 2a8456a1f44f..57484e91ab90 100644
--- a/arch/x86/mach-visws/mpparse.c
+++ b/arch/x86/mach-visws/mpparse.c
@@ -11,22 +11,9 @@
/* Have we found an MP table */
int smp_found_config;
-/*
- * Various Linux-internal data structures created from the
- * MP-table.
- */
-int apic_version [MAX_APICS];
-
int pic_mode;
-unsigned long mp_lapic_addr;
-
-/* Processor that is doing the boot up */
-unsigned int boot_cpu_physical_apicid = -1U;
-
-/* Bitmask of physically existing CPUs */
-physid_mask_t phys_cpu_present_map;
-unsigned int __initdata maxcpus = NR_CPUS;
+extern unsigned int __cpuinitdata maxcpus;
/*
* The Visual Workstation is Intel MP compliant in the hardware
diff --git a/arch/x86/mach-visws/visws_apic.c b/arch/x86/mach-visws/visws_apic.c
index 710faf71a650..cef9cb1d15ac 100644
--- a/arch/x86/mach-visws/visws_apic.c
+++ b/arch/x86/mach-visws/visws_apic.c
@@ -1,6 +1,4 @@
/*
- * linux/arch/i386/mach-visws/visws_apic.c
- *
* Copyright (C) 1999 Bent Hagemark, Ingo Molnar
*
* SGI Visual Workstation interrupt controller
diff --git a/arch/x86/mach-voyager/voyager_basic.c b/arch/x86/mach-voyager/voyager_basic.c
index 6a949e4edde8..46d6f8067690 100644
--- a/arch/x86/mach-voyager/voyager_basic.c
+++ b/arch/x86/mach-voyager/voyager_basic.c
@@ -2,8 +2,6 @@
*
* Author: J.E.J.Bottomley@HansenPartnership.com
*
- * linux/arch/i386/kernel/voyager.c
- *
* This file contains all the voyager specific routines for getting
* initialisation of the architecture to function. For additional
* features see:
diff --git a/arch/x86/mach-voyager/voyager_cat.c b/arch/x86/mach-voyager/voyager_cat.c
index 17a7904f75b1..ecab9fff0fd1 100644
--- a/arch/x86/mach-voyager/voyager_cat.c
+++ b/arch/x86/mach-voyager/voyager_cat.c
@@ -4,8 +4,6 @@
*
* Author: J.E.J.Bottomley@HansenPartnership.com
*
- * linux/arch/i386/kernel/voyager_cat.c
- *
* This file contains all the logic for manipulating the CAT bus
* in a level 5 machine.
*
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 3cc8eb2f36a9..8acbf0cdf1a5 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -4,8 +4,6 @@
*
* Author: J.E.J.Bottomley@HansenPartnership.com
*
- * linux/arch/i386/kernel/voyager_smp.c
- *
* This file provides all the same external entries as smp.c but uses
* the voyager hal to provide the functionality
*/
@@ -27,6 +25,7 @@
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/arch_hooks.h>
+#include <asm/trampoline.h>
/* TLB state -- visible externally, indexed physically */
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { &init_mm, 0 };
@@ -114,7 +113,7 @@ static inline void send_QIC_CPI(__u32 cpuset, __u8 cpi)
for_each_online_cpu(cpu) {
if (cpuset & (1 << cpu)) {
#ifdef VOYAGER_DEBUG
- if (!cpu_isset(cpu, cpu_online_map))
+ if (!cpu_online(cpu))
VDEBUG(("CPU%d sending cpi %d to CPU%d not in "
"cpu_online_map\n",
hard_smp_processor_id(), cpi, cpu));
@@ -207,11 +206,6 @@ static struct irq_chip vic_chip = {
/* used to count up as CPUs are brought on line (starts at 0) */
static int cpucount = 0;
-/* steal a page from the bottom of memory for the trampoline and
- * squirrel its address away here. This will be in kernel virtual
- * space */
-static __u32 trampoline_base;
-
/* The per cpu profile stuff - used in smp_local_timer_interrupt */
static DEFINE_PER_CPU(int, prof_multiplier) = 1;
static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
@@ -428,18 +422,6 @@ void __init smp_store_cpu_info(int id)
identify_secondary_cpu(c);
}
-/* set up the trampoline and return the physical address of the code */
-static __u32 __init setup_trampoline(void)
-{
- /* these two are global symbols in trampoline.S */
- extern const __u8 trampoline_end[];
- extern const __u8 trampoline_data[];
-
- memcpy((__u8 *) trampoline_base, trampoline_data,
- trampoline_end - trampoline_data);
- return virt_to_phys((__u8 *) trampoline_base);
-}
-
/* Routine initially called when a non-boot CPU is brought online */
static void __init start_secondary(void *unused)
{
@@ -520,13 +502,6 @@ static void __init do_boot_cpu(__u8 cpu)
& ~(voyager_extended_vic_processors
& voyager_allowed_boot_processors);
- /* This is an area in head.S which was used to set up the
- * initial kernel stack. We need to alter this to give the
- * booting CPU a new stack (taken from its idle process) */
- extern struct {
- __u8 *sp;
- unsigned short ss;
- } stack_start;
/* This is the format of the CPI IDT gate (in real mode) which
* we're hijacking to boot the CPU */
union IDTFormat {
@@ -568,8 +543,8 @@ static void __init do_boot_cpu(__u8 cpu)
hijack_source.idt.Offset, stack_start.sp));
/* init lowmem identity mapping */
- clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
- min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
+ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
flush_tlb_all();
if (quad_boot) {
@@ -708,9 +683,9 @@ void __init smp_boot_cpus(void)
* Code added from smpboot.c */
{
unsigned long bogosum = 0;
- for (i = 0; i < NR_CPUS; i++)
- if (cpu_isset(i, cpu_online_map))
- bogosum += cpu_data(i).loops_per_jiffy;
+
+ for_each_online_cpu(i)
+ bogosum += cpu_data(i).loops_per_jiffy;
printk(KERN_INFO "Total of %d processors activated "
"(%lu.%02lu BogoMIPS).\n",
cpucount + 1, bogosum / (500000 / HZ),
@@ -1166,7 +1141,7 @@ void flush_tlb_all(void)
* is sorted out */
void __init smp_alloc_memory(void)
{
- trampoline_base = (__u32) alloc_bootmem_low_pages(PAGE_SIZE);
+ trampoline_base = alloc_bootmem_low_pages(PAGE_SIZE);
if (__pa(trampoline_base) >= 0x93000)
BUG();
}
@@ -1863,7 +1838,7 @@ static int __cpuinit voyager_cpu_up(unsigned int cpu)
return -EIO;
/* Unleash the CPU! */
cpu_set(cpu, smp_commenced_mask);
- while (!cpu_isset(cpu, cpu_online_map))
+ while (!cpu_online(cpu))
mb();
return 0;
}
diff --git a/arch/x86/mach-voyager/voyager_thread.c b/arch/x86/mach-voyager/voyager_thread.c
index c69c931818ed..15464a20fb38 100644
--- a/arch/x86/mach-voyager/voyager_thread.c
+++ b/arch/x86/mach-voyager/voyager_thread.c
@@ -4,8 +4,6 @@
*
* Author: J.E.J.Bottomley@HansenPartnership.com
*
- * linux/arch/i386/kernel/voyager_thread.c
- *
* This module provides the machine status monitor thread for the
* voyager architecture. This allows us to monitor the machine
* environment (temp, voltage, fan function) and the front panel and
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c
index 760baeea5f07..6e38d877ea77 100644
--- a/arch/x86/math-emu/fpu_entry.c
+++ b/arch/x86/math-emu/fpu_entry.c
@@ -276,6 +276,7 @@ asmlinkage void math_emulate(long arg)
entry_sel_off.offset = FPU_ORIG_EIP;
entry_sel_off.selector = FPU_CS;
entry_sel_off.opcode = (byte1 << 8) | FPU_modrm;
+ entry_sel_off.empty = 0;
FPU_rm = FPU_modrm & 7;
@@ -677,7 +678,7 @@ int fpregs_soft_set(struct task_struct *target,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf)
{
- struct i387_soft_struct *s387 = &target->thread.i387.soft;
+ struct i387_soft_struct *s387 = &target->thread.xstate->soft;
void *space = s387->st_space;
int ret;
int offset, other, i, tags, regnr, tag, newtop;
@@ -729,7 +730,7 @@ int fpregs_soft_get(struct task_struct *target,
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf)
{
- struct i387_soft_struct *s387 = &target->thread.i387.soft;
+ struct i387_soft_struct *s387 = &target->thread.xstate->soft;
const void *space = s387->st_space;
int ret;
int offset = (S387->ftop & 7) * 10, other = 80 - offset;
diff --git a/arch/x86/math-emu/fpu_system.h b/arch/x86/math-emu/fpu_system.h
index a3ae28c49ddd..13488fa153e0 100644
--- a/arch/x86/math-emu/fpu_system.h
+++ b/arch/x86/math-emu/fpu_system.h
@@ -35,8 +35,8 @@
#define SEG_EXPAND_DOWN(s) (((s).b & ((1 << 11) | (1 << 10))) \
== (1 << 10))
-#define I387 (current->thread.i387)
-#define FPU_info (I387.soft.info)
+#define I387 (current->thread.xstate)
+#define FPU_info (I387->soft.info)
#define FPU_CS (*(unsigned short *) &(FPU_info->___cs))
#define FPU_SS (*(unsigned short *) &(FPU_info->___ss))
@@ -46,25 +46,25 @@
#define FPU_EIP (FPU_info->___eip)
#define FPU_ORIG_EIP (FPU_info->___orig_eip)
-#define FPU_lookahead (I387.soft.lookahead)
+#define FPU_lookahead (I387->soft.lookahead)
/* nz if ip_offset and cs_selector are not to be set for the current
instruction. */
-#define no_ip_update (*(u_char *)&(I387.soft.no_update))
-#define FPU_rm (*(u_char *)&(I387.soft.rm))
+#define no_ip_update (*(u_char *)&(I387->soft.no_update))
+#define FPU_rm (*(u_char *)&(I387->soft.rm))
/* Number of bytes of data which can be legally accessed by the current
instruction. This only needs to hold a number <= 108, so a byte will do. */
-#define access_limit (*(u_char *)&(I387.soft.alimit))
+#define access_limit (*(u_char *)&(I387->soft.alimit))
-#define partial_status (I387.soft.swd)
-#define control_word (I387.soft.cwd)
-#define fpu_tag_word (I387.soft.twd)
-#define registers (I387.soft.st_space)
-#define top (I387.soft.ftop)
+#define partial_status (I387->soft.swd)
+#define control_word (I387->soft.cwd)
+#define fpu_tag_word (I387->soft.twd)
+#define registers (I387->soft.st_space)
+#define top (I387->soft.ftop)
-#define instruction_address (*(struct address *)&I387.soft.fip)
-#define operand_address (*(struct address *)&I387.soft.foo)
+#define instruction_address (*(struct address *)&I387->soft.fip)
+#define operand_address (*(struct address *)&I387->soft.foo)
#define FPU_access_ok(x,y,z) if ( !access_ok(x,y,z) ) \
math_abort(FPU_info,SIGSEGV)
diff --git a/arch/x86/math-emu/reg_ld_str.c b/arch/x86/math-emu/reg_ld_str.c
index 799d4af5be66..d597fe7423c9 100644
--- a/arch/x86/math-emu/reg_ld_str.c
+++ b/arch/x86/math-emu/reg_ld_str.c
@@ -383,15 +383,15 @@ int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat)
int exp;
FPU_REG tmp;
+ l[0] = 0;
+ l[1] = 0;
if (st0_tag == TAG_Valid) {
reg_copy(st0_ptr, &tmp);
exp = exponent(&tmp);
if (exp < DOUBLE_Emin) { /* It may be a denormal */
addexponent(&tmp, -DOUBLE_Emin + 52); /* largest exp to be 51 */
-
- denormal_arg:
-
+denormal_arg:
if ((precision_loss = FPU_round_to_int(&tmp, st0_tag))) {
#ifdef PECULIAR_486
/* Did it round to a non-denormal ? */
@@ -477,8 +477,7 @@ int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat)
/* This is a special case: see sec 16.2.5.1 of the 80486 book */
/* Overflow to infinity */
- l[0] = 0x00000000; /* Set to */
- l[1] = 0x7ff00000; /* + INF */
+ l[1] = 0x7ff00000; /* Set to + INF */
} else {
if (precision_loss) {
if (increment)
@@ -492,8 +491,6 @@ int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat)
}
} else if (st0_tag == TAG_Zero) {
/* Number is zero */
- l[0] = 0;
- l[1] = 0;
} else if (st0_tag == TAG_Special) {
st0_tag = FPU_Special(st0_ptr);
if (st0_tag == TW_Denormal) {
@@ -508,7 +505,6 @@ int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat)
reg_copy(st0_ptr, &tmp);
goto denormal_arg;
} else if (st0_tag == TW_Infinity) {
- l[0] = 0;
l[1] = 0x7ff00000;
} else if (st0_tag == TW_NaN) {
/* Is it really a NaN ? */
@@ -532,7 +528,6 @@ int FPU_store_double(FPU_REG *st0_ptr, u_char st0_tag, double __user *dfloat)
EXCEPTION(EX_Invalid);
if (!(control_word & CW_Invalid))
return 0;
- l[0] = 0;
l[1] = 0xfff80000;
}
}
@@ -1185,8 +1180,8 @@ u_char __user *fstenv(fpu_addr_modes addr_modes, u_char __user *d)
control_word |= 0xffff0040;
partial_status = status_word() | 0xffff0000;
fpu_tag_word |= 0xffff0000;
- I387.soft.fcs &= ~0xf8000000;
- I387.soft.fos |= 0xffff0000;
+ I387->soft.fcs &= ~0xf8000000;
+ I387->soft.fos |= 0xffff0000;
#endif /* PECULIAR_486 */
if (__copy_to_user(d, &control_word, 7 * 4))
FPU_abort;
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 983291096848..b7b3e4c7cfc9 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,17 @@
+obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
+ pat.o pgtable.o
+
+obj-$(CONFIG_X86_32) += pgtable_32.o
+
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
+obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o
+
+obj-$(CONFIG_HIGHMEM) += highmem_32.o
+
ifeq ($(CONFIG_X86_32),y)
-include ${srctree}/arch/x86/mm/Makefile_32
+obj-$(CONFIG_NUMA) += discontig_32.o
else
-include ${srctree}/arch/x86/mm/Makefile_64
+obj-$(CONFIG_NUMA) += numa_64.o
+obj-$(CONFIG_K8_NUMA) += k8topology_64.o
+obj-$(CONFIG_ACPI_NUMA) += srat_64.o
endif
diff --git a/arch/x86/mm/Makefile_32 b/arch/x86/mm/Makefile_32
deleted file mode 100644
index c36ae88bb543..000000000000
--- a/arch/x86/mm/Makefile_32
+++ /dev/null
@@ -1,9 +0,0 @@
-#
-# Makefile for the linux i386-specific parts of the memory manager.
-#
-
-obj-y := init_32.o pgtable_32.o fault.o ioremap.o extable.o pageattr.o mmap.o
-
-obj-$(CONFIG_NUMA) += discontig_32.o
-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_HIGHMEM) += highmem_32.o
diff --git a/arch/x86/mm/Makefile_64 b/arch/x86/mm/Makefile_64
deleted file mode 100644
index 688c8c28ac8f..000000000000
--- a/arch/x86/mm/Makefile_64
+++ /dev/null
@@ -1,9 +0,0 @@
-#
-# Makefile for the linux x86_64-specific parts of the memory manager.
-#
-
-obj-y := init_64.o fault.o ioremap.o extable.o pageattr.o mmap.o
-obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
-obj-$(CONFIG_NUMA) += numa_64.o
-obj-$(CONFIG_K8_NUMA) += k8topology_64.o
-obj-$(CONFIG_ACPI_NUMA) += srat_64.o
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c
index 8e25e06ff730..18378850e25a 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/discontig_32.c
@@ -37,7 +37,7 @@
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/mmzone.h>
-#include <bios_ebda.h>
+#include <asm/bios_ebda.h>
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
@@ -120,7 +120,7 @@ int __init get_memcfg_numa_flat(void)
printk("NUMA - single node, flat memory mode\n");
/* Run the memory configuration and find the top of memory. */
- find_max_pfn();
+ propagate_e820_map();
node_start_pfn[0] = 0;
node_end_pfn[0] = max_pfn;
memory_present(0, 0, max_pfn);
@@ -134,7 +134,7 @@ int __init get_memcfg_numa_flat(void)
/*
* Find the highest page frame number we have available for the node
*/
-static void __init find_max_pfn_node(int nid)
+static void __init propagate_e820_map_node(int nid)
{
if (node_end_pfn[nid] > max_pfn)
node_end_pfn[nid] = max_pfn;
@@ -379,7 +379,7 @@ unsigned long __init setup_memory(void)
printk("High memory starts at vaddr %08lx\n",
(ulong) pfn_to_kaddr(highstart_pfn));
for_each_online_node(nid)
- find_max_pfn_node(nid);
+ propagate_e820_map_node(nid);
memset(NODE_DATA(0), 0, sizeof(struct pglist_data));
NODE_DATA(0)->bdata = &node0_bdata;
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
new file mode 100644
index 000000000000..2c24bea92c66
--- /dev/null
+++ b/arch/x86/mm/dump_pagetables.c
@@ -0,0 +1,354 @@
+/*
+ * Debug helper to dump the current kernel pagetables of the system
+ * so that we can see what the various memory ranges are set to.
+ *
+ * (C) Copyright 2008 Intel Corporation
+ *
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include <asm/pgtable.h>
+
+/*
+ * The dumper groups pagetable entries of the same type into one, and for
+ * that it needs to keep some state when walking, and flush this state
+ * when a "break" in the continuity is found.
+ */
+struct pg_state {
+ int level;
+ pgprot_t current_prot;
+ unsigned long start_address;
+ unsigned long current_address;
+ const struct addr_marker *marker;
+};
+
+struct addr_marker {
+ unsigned long start_address;
+ const char *name;
+};
+
+/* Address space markers hints */
+static struct addr_marker address_markers[] = {
+ { 0, "User Space" },
+#ifdef CONFIG_X86_64
+ { 0x8000000000000000UL, "Kernel Space" },
+ { 0xffff810000000000UL, "Low Kernel Mapping" },
+ { VMALLOC_START, "vmalloc() Area" },
+ { VMEMMAP_START, "Vmemmap" },
+ { __START_KERNEL_map, "High Kernel Mapping" },
+ { MODULES_VADDR, "Modules" },
+ { MODULES_END, "End Modules" },
+#else
+ { PAGE_OFFSET, "Kernel Mapping" },
+ { 0/* VMALLOC_START */, "vmalloc() Area" },
+ { 0/*VMALLOC_END*/, "vmalloc() End" },
+# ifdef CONFIG_HIGHMEM
+ { 0/*PKMAP_BASE*/, "Persisent kmap() Area" },
+# endif
+ { 0/*FIXADDR_START*/, "Fixmap Area" },
+#endif
+ { -1, NULL } /* End of list */
+};
+
+/* Multipliers for offsets within the PTEs */
+#define PTE_LEVEL_MULT (PAGE_SIZE)
+#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
+#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
+#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
+
+/*
+ * Print a readable form of a pgprot_t to the seq_file
+ */
+static void printk_prot(struct seq_file *m, pgprot_t prot, int level)
+{
+ pgprotval_t pr = pgprot_val(prot);
+ static const char * const level_name[] =
+ { "cr3", "pgd", "pud", "pmd", "pte" };
+
+ if (!pgprot_val(prot)) {
+ /* Not present */
+ seq_printf(m, " ");
+ } else {
+ if (pr & _PAGE_USER)
+ seq_printf(m, "USR ");
+ else
+ seq_printf(m, " ");
+ if (pr & _PAGE_RW)
+ seq_printf(m, "RW ");
+ else
+ seq_printf(m, "ro ");
+ if (pr & _PAGE_PWT)
+ seq_printf(m, "PWT ");
+ else
+ seq_printf(m, " ");
+ if (pr & _PAGE_PCD)
+ seq_printf(m, "PCD ");
+ else
+ seq_printf(m, " ");
+
+ /* Bit 9 has a different meaning on level 3 vs 4 */
+ if (level <= 3) {
+ if (pr & _PAGE_PSE)
+ seq_printf(m, "PSE ");
+ else
+ seq_printf(m, " ");
+ } else {
+ if (pr & _PAGE_PAT)
+ seq_printf(m, "pat ");
+ else
+ seq_printf(m, " ");
+ }
+ if (pr & _PAGE_GLOBAL)
+ seq_printf(m, "GLB ");
+ else
+ seq_printf(m, " ");
+ if (pr & _PAGE_NX)
+ seq_printf(m, "NX ");
+ else
+ seq_printf(m, "x ");
+ }
+ seq_printf(m, "%s\n", level_name[level]);
+}
+
+/*
+ * On 64 bits, sign-extend the 48 bit address to 64 bit
+ */
+static unsigned long normalize_addr(unsigned long u)
+{
+#ifdef CONFIG_X86_64
+ return (signed long)(u << 16) >> 16;
+#else
+ return u;
+#endif
+}
+
+/*
+ * This function gets called on a break in a continuous series
+ * of PTE entries; the next one is different so we need to
+ * print what we collected so far.
+ */
+static void note_page(struct seq_file *m, struct pg_state *st,
+ pgprot_t new_prot, int level)
+{
+ pgprotval_t prot, cur;
+ static const char units[] = "KMGTPE";
+
+ /*
+ * If we have a "break" in the series, we need to flush the state that
+ * we have now. "break" is either changing perms, levels or
+ * address space marker.
+ */
+ prot = pgprot_val(new_prot) & ~(PTE_MASK);
+ cur = pgprot_val(st->current_prot) & ~(PTE_MASK);
+
+ if (!st->level) {
+ /* First entry */
+ st->current_prot = new_prot;
+ st->level = level;
+ st->marker = address_markers;
+ seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ } else if (prot != cur || level != st->level ||
+ st->current_address >= st->marker[1].start_address) {
+ const char *unit = units;
+ unsigned long delta;
+
+ /*
+ * Now print the actual finished series
+ */
+ seq_printf(m, "0x%p-0x%p ",
+ (void *)st->start_address,
+ (void *)st->current_address);
+
+ delta = (st->current_address - st->start_address) >> 10;
+ while (!(delta & 1023) && unit[1]) {
+ delta >>= 10;
+ unit++;
+ }
+ seq_printf(m, "%9lu%c ", delta, *unit);
+ printk_prot(m, st->current_prot, st->level);
+
+ /*
+ * We print markers for special areas of address space,
+ * such as the start of vmalloc space etc.
+ * This helps in the interpretation.
+ */
+ if (st->current_address >= st->marker[1].start_address) {
+ st->marker++;
+ seq_printf(m, "---[ %s ]---\n", st->marker->name);
+ }
+
+ st->start_address = st->current_address;
+ st->current_prot = new_prot;
+ st->level = level;
+ }
+}
+
+static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
+ unsigned long P)
+{
+ int i;
+ pte_t *start;
+
+ start = (pte_t *) pmd_page_vaddr(addr);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pgprot_t prot = pte_pgprot(*start);
+
+ st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
+ note_page(m, st, prot, 4);
+ start++;
+ }
+}
+
+#if PTRS_PER_PMD > 1
+
+static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
+ unsigned long P)
+{
+ int i;
+ pmd_t *start;
+
+ start = (pmd_t *) pud_page_vaddr(addr);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
+ if (!pmd_none(*start)) {
+ pgprotval_t prot = pmd_val(*start) & ~PTE_MASK;
+
+ if (pmd_large(*start) || !pmd_present(*start))
+ note_page(m, st, __pgprot(prot), 3);
+ else
+ walk_pte_level(m, st, *start,
+ P + i * PMD_LEVEL_MULT);
+ } else
+ note_page(m, st, __pgprot(0), 3);
+ start++;
+ }
+}
+
+#else
+#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
+#define pud_large(a) pmd_large(__pmd(pud_val(a)))
+#define pud_none(a) pmd_none(__pmd(pud_val(a)))
+#endif
+
+#if PTRS_PER_PUD > 1
+
+static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
+ unsigned long P)
+{
+ int i;
+ pud_t *start;
+
+ start = (pud_t *) pgd_page_vaddr(addr);
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
+ if (!pud_none(*start)) {
+ pgprotval_t prot = pud_val(*start) & ~PTE_MASK;
+
+ if (pud_large(*start) || !pud_present(*start))
+ note_page(m, st, __pgprot(prot), 2);
+ else
+ walk_pmd_level(m, st, *start,
+ P + i * PUD_LEVEL_MULT);
+ } else
+ note_page(m, st, __pgprot(0), 2);
+
+ start++;
+ }
+}
+
+#else
+#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
+#define pgd_large(a) pud_large(__pud(pgd_val(a)))
+#define pgd_none(a) pud_none(__pud(pgd_val(a)))
+#endif
+
+static void walk_pgd_level(struct seq_file *m)
+{
+#ifdef CONFIG_X86_64
+ pgd_t *start = (pgd_t *) &init_level4_pgt;
+#else
+ pgd_t *start = swapper_pg_dir;
+#endif
+ int i;
+ struct pg_state st;
+
+ memset(&st, 0, sizeof(st));
+
+ for (i = 0; i < PTRS_PER_PGD; i++) {
+ st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
+ if (!pgd_none(*start)) {
+ pgprotval_t prot = pgd_val(*start) & ~PTE_MASK;
+
+ if (pgd_large(*start) || !pgd_present(*start))
+ note_page(m, &st, __pgprot(prot), 1);
+ else
+ walk_pud_level(m, &st, *start,
+ i * PGD_LEVEL_MULT);
+ } else
+ note_page(m, &st, __pgprot(0), 1);
+
+ start++;
+ }
+
+ /* Flush out the last page */
+ st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
+ note_page(m, &st, __pgprot(0), 0);
+}
+
+static int ptdump_show(struct seq_file *m, void *v)
+{
+ walk_pgd_level(m);
+ return 0;
+}
+
+static int ptdump_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, ptdump_show, NULL);
+}
+
+static const struct file_operations ptdump_fops = {
+ .open = ptdump_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int pt_dump_init(void)
+{
+ struct dentry *pe;
+
+#ifdef CONFIG_X86_32
+ /* Not a compile-time constant on x86-32 */
+ address_markers[2].start_address = VMALLOC_START;
+ address_markers[3].start_address = VMALLOC_END;
+# ifdef CONFIG_HIGHMEM
+ address_markers[4].start_address = PKMAP_BASE;
+ address_markers[5].start_address = FIXADDR_START;
+# else
+ address_markers[4].start_address = FIXADDR_START;
+# endif
+#endif
+
+ pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
+ &ptdump_fops);
+ if (!pe)
+ return -ENOMEM;
+
+ return 0;
+}
+
+__initcall(pt_dump_init);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
+MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ec08d8389850..fd7e1798c75a 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -639,7 +639,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
#ifdef CONFIG_X86_32
/* It's safe to allow irq's after cr2 has been saved and the vmalloc
fault has been handled. */
- if (regs->flags & (X86_EFLAGS_IF|VM_MASK))
+ if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
local_irq_enable();
/*
@@ -976,9 +976,5 @@ void vmalloc_sync_all(void)
if (address == start)
start = address + PGDIR_SIZE;
}
- /* Check that there is no need to do the same for the modules area. */
- BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
- BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
- (__START_KERNEL & PGDIR_MASK)));
#endif
}
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index ee1091a46964..de236e419cb5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -1,5 +1,4 @@
/*
- * linux/arch/i386/mm/init.c
*
* Copyright (C) 1995 Linus Torvalds
*
@@ -51,6 +50,8 @@
unsigned int __VMALLOC_RESERVE = 128 << 20;
+unsigned long max_pfn_mapped;
+
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
unsigned long highstart_pfn, highend_pfn;
@@ -70,7 +71,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
- paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
BUG_ON(pmd_table != pmd_offset(pud, 0));
@@ -99,7 +100,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
}
- paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
+ paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
BUG_ON(page_table != pte_offset_kernel(pmd, 0));
}
@@ -179,8 +180,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
/*
* Map with big pages if possible, otherwise
* create normal page tables:
+ *
+ * Don't use a large page for the first 2/4MB of memory
+ * because there are often fixed size MTRRs in there
+ * and overlapping MTRRs into large pages can cause
+ * slowdowns.
*/
- if (cpu_has_pse) {
+ if (cpu_has_pse && !(pgd_idx == 0 && pmd_idx == 0)) {
unsigned int addr2;
pgprot_t prot = PAGE_KERNEL_LARGE;
@@ -194,6 +200,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
set_pmd(pmd, pfn_pmd(pfn, prot));
pfn += PTRS_PER_PTE;
+ max_pfn_mapped = pfn;
continue;
}
pte = one_page_table_init(pmd);
@@ -208,6 +215,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
set_pte(pte, pfn_pte(pfn, prot));
}
+ max_pfn_mapped = pfn;
}
}
}
@@ -219,6 +227,25 @@ static inline int page_kills_ppro(unsigned long pagenr)
return 0;
}
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+ if (pagenr <= 256)
+ return 1;
+ if (!page_is_ram(pagenr))
+ return 1;
+ return 0;
+}
+
#ifdef CONFIG_HIGHMEM
pte_t *kmap_pte;
pgprot_t kmap_prot;
@@ -260,47 +287,17 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
pkmap_page_table = pte;
}
-static void __meminit free_new_highpage(struct page *page)
-{
- init_page_count(page);
- __free_page(page);
- totalhigh_pages++;
-}
-
void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro)
{
if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) {
ClearPageReserved(page);
- free_new_highpage(page);
+ init_page_count(page);
+ __free_page(page);
+ totalhigh_pages++;
} else
SetPageReserved(page);
}
-static int __meminit
-add_one_highpage_hotplug(struct page *page, unsigned long pfn)
-{
- free_new_highpage(page);
- totalram_pages++;
-#ifdef CONFIG_FLATMEM
- max_mapnr = max(pfn, max_mapnr);
-#endif
- num_physpages++;
-
- return 0;
-}
-
-/*
- * Not currently handling the NUMA case.
- * Assuming single node and all memory that
- * has been added dynamically that would be
- * onlined here is in HIGHMEM.
- */
-void __meminit online_page(struct page *page)
-{
- ClearPageReserved(page);
- add_one_highpage_hotplug(page, page_to_pfn(page));
-}
-
#ifndef CONFIG_NUMA
static void __init set_highmem_pages_init(int bad_ppro)
{
@@ -357,7 +354,7 @@ void __init native_pagetable_setup_start(pgd_t *base)
pte_clear(NULL, va, pte);
}
- paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT);
+ paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
}
void __init native_pagetable_setup_done(pgd_t *base)
@@ -449,7 +446,7 @@ void zap_low_mappings(void)
* Note that "pgd_clear()" doesn't do it for
* us, because pgd_clear() is a no-op on i386.
*/
- for (i = 0; i < USER_PTRS_PER_PGD; i++) {
+ for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
#ifdef CONFIG_X86_PAE
set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
#else
@@ -539,9 +536,9 @@ void __init paging_init(void)
/*
* Test if the WP bit works in supervisor mode. It isn't supported on 386's
- * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This
- * used to involve black magic jumps to work around some nasty CPU bugs,
- * but fortunately the switch to using exceptions got rid of all that.
+ * and also on some strange 486's. All 586+'s are OK. This used to involve
+ * black magic jumps to work around some nasty CPU bugs, but fortunately the
+ * switch to using exceptions got rid of all that.
*/
static void __init test_wp_bit(void)
{
@@ -723,25 +720,17 @@ void mark_rodata_ro(void)
unsigned long start = PFN_ALIGN(_text);
unsigned long size = PFN_ALIGN(_etext) - start;
-#ifndef CONFIG_KPROBES
-#ifdef CONFIG_HOTPLUG_CPU
- /* It must still be possible to apply SMP alternatives. */
- if (num_possible_cpus() <= 1)
-#endif
- {
- set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
- printk(KERN_INFO "Write protecting the kernel text: %luk\n",
- size >> 10);
+ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
+ printk(KERN_INFO "Write protecting the kernel text: %luk\n",
+ size >> 10);
#ifdef CONFIG_CPA_DEBUG
- printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
- start, start+size);
- set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
+ printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n",
+ start, start+size);
+ set_pages_rw(virt_to_page(start), size>>PAGE_SHIFT);
- printk(KERN_INFO "Testing CPA: write protecting again\n");
- set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
-#endif
- }
+ printk(KERN_INFO "Testing CPA: write protecting again\n");
+ set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT);
#endif
start += size;
size = (unsigned long)__end_rodata - start;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a02a14f0f324..32ba13b0f818 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -47,13 +47,30 @@
#include <asm/numa.h>
#include <asm/cacheflush.h>
-const struct dma_mapping_ops *dma_ops;
-EXPORT_SYMBOL(dma_ops);
-
static unsigned long dma_reserve __initdata;
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+int direct_gbpages __meminitdata
+#ifdef CONFIG_DIRECT_GBPAGES
+ = 1
+#endif
+;
+
+static int __init parse_direct_gbpages_off(char *arg)
+{
+ direct_gbpages = 0;
+ return 0;
+}
+early_param("nogbpages", parse_direct_gbpages_off);
+
+static int __init parse_direct_gbpages_on(char *arg)
+{
+ direct_gbpages = 1;
+ return 0;
+}
+early_param("gbpages", parse_direct_gbpages_on);
+
/*
* NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
* physical space so we can cache the place of the first one and move
@@ -69,9 +86,6 @@ void show_mem(void)
printk(KERN_INFO "Mem-info:\n");
show_free_areas();
- printk(KERN_INFO "Free swap: %6ldkB\n",
- nr_swap_pages << (PAGE_SHIFT-10));
-
for_each_online_pgdat(pgdat) {
for (i = 0; i < pgdat->node_spanned_pages; ++i) {
/*
@@ -121,7 +135,7 @@ static __init void *spp_getpage(void)
return ptr;
}
-static __init void
+static void
set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
{
pgd_t *pgd;
@@ -159,7 +173,7 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot)
new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
pte = pte_offset_kernel(pmd, vaddr);
- if (!pte_none(*pte) &&
+ if (!pte_none(*pte) && pte_val(new_pte) &&
pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
pte_ERROR(*pte);
set_pte(pte, new_pte);
@@ -200,8 +214,7 @@ void __init cleanup_highmap(void)
}
/* NOTE: this is meant to be run only at boot */
-void __init
-__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
+void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
unsigned long address = __fix_to_virt(idx);
@@ -296,7 +309,7 @@ __meminit void early_iounmap(void *addr, unsigned long size)
__flush_tlb_all();
}
-static void __meminit
+static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
{
int i = pmd_index(address);
@@ -318,21 +331,26 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
set_pte((pte_t *)pmd,
pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
}
+ return address;
}
-static void __meminit
+static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
pmd_t *pmd = pmd_offset(pud, 0);
+ unsigned long last_map_addr;
+
spin_lock(&init_mm.page_table_lock);
- phys_pmd_init(pmd, address, end);
+ last_map_addr = phys_pmd_init(pmd, address, end);
spin_unlock(&init_mm.page_table_lock);
__flush_tlb_all();
+ return last_map_addr;
}
-static void __meminit
+static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
{
+ unsigned long last_map_addr = end;
int i = pud_index(addr);
for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
@@ -350,7 +368,15 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
}
if (pud_val(*pud)) {
- phys_pmd_update(pud, addr, end);
+ if (!pud_large(*pud))
+ last_map_addr = phys_pmd_update(pud, addr, end);
+ continue;
+ }
+
+ if (direct_gbpages) {
+ set_pte((pte_t *)pud,
+ pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
+ last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
continue;
}
@@ -358,12 +384,14 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
spin_lock(&init_mm.page_table_lock);
set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
- phys_pmd_init(pmd, addr, end);
+ last_map_addr = phys_pmd_init(pmd, addr, end);
spin_unlock(&init_mm.page_table_lock);
unmap_low_page(pmd);
}
__flush_tlb_all();
+
+ return last_map_addr >> PAGE_SHIFT;
}
static void __init find_early_table_space(unsigned long end)
@@ -371,9 +399,11 @@ static void __init find_early_table_space(unsigned long end)
unsigned long puds, pmds, tables, start;
puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
- pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
- tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
- round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+ tables = round_up(puds * sizeof(pud_t), PAGE_SIZE);
+ if (!direct_gbpages) {
+ pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
+ tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
+ }
/*
* RED-PEN putting page tables only on node 0 could
@@ -393,16 +423,135 @@ static void __init find_early_table_space(unsigned long end)
(table_start << PAGE_SHIFT) + tables);
}
+static void __init init_gbpages(void)
+{
+ if (direct_gbpages && cpu_has_gbpages)
+ printk(KERN_INFO "Using GB pages for direct mapping\n");
+ else
+ direct_gbpages = 0;
+}
+
+#ifdef CONFIG_MEMTEST_BOOTPARAM
+
+static void __init memtest(unsigned long start_phys, unsigned long size,
+ unsigned pattern)
+{
+ unsigned long i;
+ unsigned long *start;
+ unsigned long start_bad;
+ unsigned long last_bad;
+ unsigned long val;
+ unsigned long start_phys_aligned;
+ unsigned long count;
+ unsigned long incr;
+
+ switch (pattern) {
+ case 0:
+ val = 0UL;
+ break;
+ case 1:
+ val = -1UL;
+ break;
+ case 2:
+ val = 0x5555555555555555UL;
+ break;
+ case 3:
+ val = 0xaaaaaaaaaaaaaaaaUL;
+ break;
+ default:
+ return;
+ }
+
+ incr = sizeof(unsigned long);
+ start_phys_aligned = ALIGN(start_phys, incr);
+ count = (size - (start_phys_aligned - start_phys))/incr;
+ start = __va(start_phys_aligned);
+ start_bad = 0;
+ last_bad = 0;
+
+ for (i = 0; i < count; i++)
+ start[i] = val;
+ for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
+ if (*start != val) {
+ if (start_phys_aligned == last_bad + incr) {
+ last_bad += incr;
+ } else {
+ if (start_bad) {
+ printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
+ val, start_bad, last_bad + incr);
+ reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+ }
+ start_bad = last_bad = start_phys_aligned;
+ }
+ }
+ }
+ if (start_bad) {
+ printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
+ val, start_bad, last_bad + incr);
+ reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
+ }
+
+}
+
+static int memtest_pattern __initdata = CONFIG_MEMTEST_BOOTPARAM_VALUE;
+
+static int __init parse_memtest(char *arg)
+{
+ if (arg)
+ memtest_pattern = simple_strtoul(arg, NULL, 0);
+ return 0;
+}
+
+early_param("memtest", parse_memtest);
+
+static void __init early_memtest(unsigned long start, unsigned long end)
+{
+ unsigned long t_start, t_size;
+ unsigned pattern;
+
+ if (!memtest_pattern)
+ return;
+
+ printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
+ for (pattern = 0; pattern < memtest_pattern; pattern++) {
+ t_start = start;
+ t_size = 0;
+ while (t_start < end) {
+ t_start = find_e820_area_size(t_start, &t_size, 1);
+
+ /* done ? */
+ if (t_start >= end)
+ break;
+ if (t_start + t_size > end)
+ t_size = end - t_start;
+
+ printk(KERN_CONT "\n %016lx - %016lx pattern %d",
+ t_start, t_start + t_size, pattern);
+
+ memtest(t_start, t_size, pattern);
+
+ t_start += t_size;
+ }
+ }
+ printk(KERN_CONT "\n");
+}
+#else
+static void __init early_memtest(unsigned long start, unsigned long end)
+{
+}
+#endif
+
/*
* Setup the direct mapping of the physical memory at PAGE_OFFSET.
* This runs before bootmem is initialized and gets pages directly from
* the physical memory. To access them they are temporarily mapped.
*/
-void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
+unsigned long __init_refok init_memory_mapping(unsigned long start, unsigned long end)
{
- unsigned long next;
+ unsigned long next, last_map_addr = end;
+ unsigned long start_phys = start, end_phys = end;
- pr_debug("init_memory_mapping\n");
+ printk(KERN_INFO "init_memory_mapping\n");
/*
* Find space for the kernel direct mapping tables.
@@ -411,8 +560,10 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
* memory mapped. Unfortunately this is done currently before the
* nodes are discovered.
*/
- if (!after_bootmem)
+ if (!after_bootmem) {
+ init_gbpages();
find_early_table_space(end);
+ }
start = (unsigned long)__va(start);
end = (unsigned long)__va(end);
@@ -430,7 +581,7 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
next = start + PGDIR_SIZE;
if (next > end)
next = end;
- phys_pud_init(pud, __pa(start), __pa(next));
+ last_map_addr = phys_pud_init(pud, __pa(start), __pa(next));
if (!after_bootmem)
set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
unmap_low_page(pud);
@@ -443,6 +594,11 @@ void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
if (!after_bootmem)
reserve_early(table_start << PAGE_SHIFT,
table_end << PAGE_SHIFT, "PGTABLE");
+
+ if (!after_bootmem)
+ early_memtest(start_phys, end_phys);
+
+ return last_map_addr;
}
#ifndef CONFIG_NUMA
@@ -464,15 +620,6 @@ void __init paging_init(void)
/*
* Memory hotplug specific functions
*/
-void online_page(struct page *page)
-{
- ClearPageReserved(page);
- init_page_count(page);
- __free_page(page);
- totalram_pages++;
- num_physpages++;
-}
-
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Memory is added always to NORMAL zone. This means you will never get
@@ -482,11 +629,13 @@ int arch_add_memory(int nid, u64 start, u64 size)
{
struct pglist_data *pgdat = NODE_DATA(nid);
struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
- unsigned long start_pfn = start >> PAGE_SHIFT;
+ unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
- init_memory_mapping(start, start + size-1);
+ last_mapped_pfn = init_memory_mapping(start, start + size-1);
+ if (last_mapped_pfn > max_pfn_mapped)
+ max_pfn_mapped = last_mapped_pfn;
ret = __add_pages(zone, start_pfn, nr_pages);
WARN_ON(1);
@@ -505,6 +654,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif /* CONFIG_MEMORY_HOTPLUG */
+/*
+ * devmem_is_allowed() checks to see if /dev/mem access to a certain address
+ * is valid. The argument is a physical page number.
+ *
+ *
+ * On x86, access has to be given to the first megabyte of ram because that area
+ * contains bios code and data regions used by X and dosemu and similar apps.
+ * Access has to be given to non-kernel-ram areas as well, these contain the PCI
+ * mmio resources as well as potential bios/acpi data regions.
+ */
+int devmem_is_allowed(unsigned long pagenr)
+{
+ if (pagenr <= 256)
+ return 1;
+ if (!page_is_ram(pagenr))
+ return 1;
+ return 0;
+}
+
+
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel,
kcore_modules, kcore_vsyscall;
@@ -596,24 +765,7 @@ EXPORT_SYMBOL_GPL(rodata_test_data);
void mark_rodata_ro(void)
{
- unsigned long start = (unsigned long)_stext, end;
-
-#ifdef CONFIG_HOTPLUG_CPU
- /* It must still be possible to apply SMP alternatives. */
- if (num_possible_cpus() > 1)
- start = (unsigned long)_etext;
-#endif
-
-#ifdef CONFIG_KPROBES
- start = (unsigned long)__start_rodata;
-#endif
-
- end = (unsigned long)__end_rodata;
- start = (start + PAGE_SIZE - 1) & PAGE_MASK;
- end &= PAGE_MASK;
- if (end <= start)
- return;
-
+ unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata);
printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
(end - start) >> 10);
@@ -636,6 +788,7 @@ void mark_rodata_ro(void)
set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif
}
+
#endif
#ifdef CONFIG_BLK_DEV_INITRD
@@ -648,7 +801,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
#ifdef CONFIG_NUMA
- int nid = phys_to_nid(phys);
+ int nid, next_nid;
#endif
unsigned long pfn = phys >> PAGE_SHIFT;
@@ -657,7 +810,7 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
* This can happen with kdump kernels when accessing
* firmware tables:
*/
- if (pfn < end_pfn_map)
+ if (pfn < max_pfn_mapped)
return;
printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
@@ -667,10 +820,16 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
- reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+ nid = phys_to_nid(phys);
+ next_nid = phys_to_nid(phys + len - 1);
+ if (nid == next_nid)
+ reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT);
+ else
+ reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#else
reserve_bootmem(phys, len, BOOTMEM_DEFAULT);
#endif
+
if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
dma_reserve += len / PAGE_SIZE;
set_dma_reserve(dma_reserve);
@@ -764,6 +923,10 @@ const char *arch_vma_name(struct vm_area_struct *vma)
/*
* Initialise the sparsemem vmemmap using huge-pages at the PMD level.
*/
+static long __meminitdata addr_start, addr_end;
+static void __meminitdata *p_start, *p_end;
+static int __meminitdata node_start;
+
int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
@@ -798,12 +961,32 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
PAGE_KERNEL_LARGE);
set_pmd(pmd, __pmd(pte_val(entry)));
- printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
- addr, addr + PMD_SIZE - 1, p, node);
+ /* check to see if we have contiguous blocks */
+ if (p_end != p || node_start != node) {
+ if (p_start)
+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ addr_start = addr;
+ node_start = node;
+ p_start = p;
+ }
+ addr_end = addr + PMD_SIZE;
+ p_end = p + PMD_SIZE;
} else {
vmemmap_verify((pte_t *)pmd, node, addr, next);
}
}
return 0;
}
+
+void __meminit vmemmap_populate_print_last(void)
+{
+ if (p_start) {
+ printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
+ addr_start, addr_end-1, p_start, p_end-1, node_start);
+ p_start = NULL;
+ p_end = NULL;
+ node_start = 0;
+ }
+}
#endif
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 794895c6dcc9..804de18abcc2 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -19,11 +19,7 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/pgalloc.h>
-
-enum ioremap_mode {
- IOR_MODE_UNCACHED,
- IOR_MODE_CACHED,
-};
+#include <asm/pat.h>
#ifdef CONFIG_X86_64
@@ -35,11 +31,23 @@ unsigned long __phys_addr(unsigned long x)
}
EXPORT_SYMBOL(__phys_addr);
+static inline int phys_addr_valid(unsigned long addr)
+{
+ return addr < (1UL << boot_cpu_data.x86_phys_bits);
+}
+
+#else
+
+static inline int phys_addr_valid(unsigned long addr)
+{
+ return 1;
+}
+
#endif
int page_is_ram(unsigned long pagenr)
{
- unsigned long addr, end;
+ resource_size_t addr, end;
int i;
/*
@@ -78,19 +86,22 @@ int page_is_ram(unsigned long pagenr)
* Fix up the linear direct mapping of the kernel to avoid cache attribute
* conflicts.
*/
-static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
- enum ioremap_mode mode)
+int ioremap_change_attr(unsigned long vaddr, unsigned long size,
+ unsigned long prot_val)
{
unsigned long nrpages = size >> PAGE_SHIFT;
int err;
- switch (mode) {
- case IOR_MODE_UNCACHED:
+ switch (prot_val) {
+ case _PAGE_CACHE_UC:
default:
- err = set_memory_uc(vaddr, nrpages);
+ err = _set_memory_uc(vaddr, nrpages);
+ break;
+ case _PAGE_CACHE_WC:
+ err = _set_memory_wc(vaddr, nrpages);
break;
- case IOR_MODE_CACHED:
- err = set_memory_wb(vaddr, nrpages);
+ case _PAGE_CACHE_WB:
+ err = _set_memory_wb(vaddr, nrpages);
break;
}
@@ -106,18 +117,28 @@ static int ioremap_change_attr(unsigned long vaddr, unsigned long size,
* have to convert them into an offset in a page-aligned mapping, but the
* caller shouldn't need to know that small detail.
*/
-static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
- enum ioremap_mode mode)
+static void __iomem *__ioremap_caller(resource_size_t phys_addr,
+ unsigned long size, unsigned long prot_val, void *caller)
{
- unsigned long pfn, offset, last_addr, vaddr;
+ unsigned long pfn, offset, vaddr;
+ resource_size_t last_addr;
struct vm_struct *area;
+ unsigned long new_prot_val;
pgprot_t prot;
+ int retval;
/* Don't allow wraparound or zero size */
last_addr = phys_addr + size - 1;
if (!size || last_addr < phys_addr)
return NULL;
+ if (!phys_addr_valid(phys_addr)) {
+ printk(KERN_WARNING "ioremap: invalid physical address %llx\n",
+ (unsigned long long)phys_addr);
+ WARN_ON_ONCE(1);
+ return NULL;
+ }
+
/*
* Don't remap the low PCI/ISA area, it's always mapped..
*/
@@ -127,25 +148,14 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
/*
* Don't allow anybody to remap normal RAM that we're using..
*/
- for (pfn = phys_addr >> PAGE_SHIFT; pfn < max_pfn_mapped &&
- (pfn << PAGE_SHIFT) < last_addr; pfn++) {
- if (page_is_ram(pfn) && pfn_valid(pfn) &&
- !PageReserved(pfn_to_page(pfn)))
- return NULL;
- }
+ for (pfn = phys_addr >> PAGE_SHIFT;
+ (pfn << PAGE_SHIFT) < last_addr; pfn++) {
- switch (mode) {
- case IOR_MODE_UNCACHED:
- default:
- /*
- * FIXME: we will use UC MINUS for now, as video fb drivers
- * depend on it. Upcoming ioremap_wc() will fix this behavior.
- */
- prot = PAGE_KERNEL_UC_MINUS;
- break;
- case IOR_MODE_CACHED:
- prot = PAGE_KERNEL;
- break;
+ int is_ram = page_is_ram(pfn);
+
+ if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn)))
+ return NULL;
+ WARN_ON_ONCE(is_ram);
}
/*
@@ -155,20 +165,66 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
phys_addr &= PAGE_MASK;
size = PAGE_ALIGN(last_addr+1) - phys_addr;
+ retval = reserve_memtype(phys_addr, phys_addr + size,
+ prot_val, &new_prot_val);
+ if (retval) {
+ pr_debug("Warning: reserve_memtype returned %d\n", retval);
+ return NULL;
+ }
+
+ if (prot_val != new_prot_val) {
+ /*
+ * Do not fallback to certain memory types with certain
+ * requested type:
+ * - request is uncached, return cannot be write-back
+ * - request is uncached, return cannot be write-combine
+ * - request is write-combine, return cannot be write-back
+ */
+ if ((prot_val == _PAGE_CACHE_UC &&
+ (new_prot_val == _PAGE_CACHE_WB ||
+ new_prot_val == _PAGE_CACHE_WC)) ||
+ (prot_val == _PAGE_CACHE_WC &&
+ new_prot_val == _PAGE_CACHE_WB)) {
+ pr_debug(
+ "ioremap error for 0x%llx-0x%llx, requested 0x%lx, got 0x%lx\n",
+ (unsigned long long)phys_addr,
+ (unsigned long long)(phys_addr + size),
+ prot_val, new_prot_val);
+ free_memtype(phys_addr, phys_addr + size);
+ return NULL;
+ }
+ prot_val = new_prot_val;
+ }
+
+ switch (prot_val) {
+ case _PAGE_CACHE_UC:
+ default:
+ prot = PAGE_KERNEL_NOCACHE;
+ break;
+ case _PAGE_CACHE_WC:
+ prot = PAGE_KERNEL_WC;
+ break;
+ case _PAGE_CACHE_WB:
+ prot = PAGE_KERNEL;
+ break;
+ }
+
/*
* Ok, go for it..
*/
- area = get_vm_area(size, VM_IOREMAP);
+ area = get_vm_area_caller(size, VM_IOREMAP, caller);
if (!area)
return NULL;
area->phys_addr = phys_addr;
vaddr = (unsigned long) area->addr;
if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) {
+ free_memtype(phys_addr, phys_addr + size);
free_vm_area(area);
return NULL;
}
- if (ioremap_change_attr(vaddr, size, mode) < 0) {
+ if (ioremap_change_attr(vaddr, size, prot_val) < 0) {
+ free_memtype(phys_addr, phys_addr + size);
vunmap(area->addr);
return NULL;
}
@@ -199,13 +255,35 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size,
*/
void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size)
{
- return __ioremap(phys_addr, size, IOR_MODE_UNCACHED);
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_UC,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_nocache);
+/**
+ * ioremap_wc - map memory into CPU space write combined
+ * @offset: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * This version of ioremap ensures that the memory is marked write combining.
+ * Write combining allows faster writes to some hardware devices.
+ *
+ * Must be freed with iounmap.
+ */
+void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size)
+{
+ if (pat_wc_enabled)
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC,
+ __builtin_return_address(0));
+ else
+ return ioremap_nocache(phys_addr, size);
+}
+EXPORT_SYMBOL(ioremap_wc);
+
void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
{
- return __ioremap(phys_addr, size, IOR_MODE_CACHED);
+ return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(ioremap_cache);
@@ -252,6 +330,8 @@ void iounmap(volatile void __iomem *addr)
return;
}
+ free_memtype(p->phys_addr, p->phys_addr + get_vm_area_size(p));
+
/* Finally remove it */
o = remove_vm_area((void *)addr);
BUG_ON(p != o || o == NULL);
@@ -259,6 +339,35 @@ void iounmap(volatile void __iomem *addr)
}
EXPORT_SYMBOL(iounmap);
+/*
+ * Convert a physical pointer to a virtual kernel pointer for /dev/mem
+ * access
+ */
+void *xlate_dev_mem_ptr(unsigned long phys)
+{
+ void *addr;
+ unsigned long start = phys & PAGE_MASK;
+
+ /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */
+ if (page_is_ram(start >> PAGE_SHIFT))
+ return __va(phys);
+
+ addr = (void *)ioremap(start, PAGE_SIZE);
+ if (addr)
+ addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
+
+ return addr;
+}
+
+void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
+{
+ if (page_is_ram(phys >> PAGE_SHIFT))
+ return;
+
+ iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK));
+ return;
+}
+
#ifdef CONFIG_X86_32
int __initdata early_ioremap_debug;
@@ -272,8 +381,8 @@ static int __init early_ioremap_debug_setup(char *str)
early_param("early_ioremap_debug", early_ioremap_debug_setup);
static __initdata int after_paging_init;
-static __initdata pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
- __attribute__((aligned(PAGE_SIZE)));
+static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)]
+ __section(.bss.page_aligned);
static inline pmd_t * __init early_ioremap_pmd(unsigned long addr)
{
@@ -330,7 +439,7 @@ void __init early_ioremap_clear(void)
pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
pmd_clear(pmd);
- paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT);
+ paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT);
__flush_tlb_all();
}
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 7a2ebce87df5..86808e666f9c 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -164,7 +164,7 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
if (!found)
return -1;
- memnode_shift = compute_hash_shift(nodes, 8);
+ memnode_shift = compute_hash_shift(nodes, 8, NULL);
if (memnode_shift < 0) {
printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n");
return -1;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 16b82ad34b96..c5066d519e5d 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -31,13 +31,15 @@ bootmem_data_t plat_node_bdata[MAX_NUMNODES];
struct memnode memnode;
+#ifdef CONFIG_SMP
int x86_cpu_to_node_map_init[NR_CPUS] = {
[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
void *x86_cpu_to_node_map_early_ptr;
+EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
+#endif
DEFINE_PER_CPU(int, x86_cpu_to_node_map) = NUMA_NO_NODE;
EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
-EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
@@ -58,7 +60,7 @@ unsigned long __initdata nodemap_size;
* -1 if node overlap or lost ram (shift too big)
*/
static int __init populate_memnodemap(const struct bootnode *nodes,
- int numnodes, int shift)
+ int numnodes, int shift, int *nodeids)
{
unsigned long addr, end;
int i, res = -1;
@@ -74,7 +76,12 @@ static int __init populate_memnodemap(const struct bootnode *nodes,
do {
if (memnodemap[addr >> shift] != NUMA_NO_NODE)
return -1;
- memnodemap[addr >> shift] = i;
+
+ if (!nodeids)
+ memnodemap[addr >> shift] = i;
+ else
+ memnodemap[addr >> shift] = nodeids[i];
+
addr += (1UL << shift);
} while (addr < end);
res = 1;
@@ -137,7 +144,8 @@ static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
return i;
}
-int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
+ int *nodeids)
{
int shift;
@@ -147,7 +155,7 @@ int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
shift);
- if (populate_memnodemap(nodes, numnodes, shift) != 1) {
+ if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
printk(KERN_INFO "Your memory is not aligned you need to "
"rebuild your kernel with a bigger NODEMAPSIZE "
"shift=%d\n", shift);
@@ -188,6 +196,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
unsigned long bootmap_start, nodedata_phys;
void *bootmap;
const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
+ int nid;
start = round_up(start, ZONE_ALIGN);
@@ -210,9 +219,19 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
NODE_DATA(nodeid)->node_start_pfn = start_pfn;
NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
- /* Find a place for the bootmem map */
+ /*
+ * Find a place for the bootmem map
+ * nodedata_phys could be on other nodes by alloc_bootmem,
+ * so need to sure bootmap_start not to be small, otherwise
+ * early_node_mem will get that with find_e820_area instead
+ * of alloc_bootmem, that could clash with reserved range
+ */
bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
- bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
+ nid = phys_to_nid(nodedata_phys);
+ if (nid == nodeid)
+ bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
+ else
+ bootmap_start = round_up(start, PAGE_SIZE);
/*
* SMP_CAHCE_BYTES could be enough, but init_bootmem_node like
* to use that to align to PAGE_SIZE
@@ -237,10 +256,29 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
free_bootmem_with_active_regions(nodeid, end);
- reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size,
- BOOTMEM_DEFAULT);
- reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
- bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
+ /*
+ * convert early reserve to bootmem reserve earlier
+ * otherwise early_node_mem could use early reserved mem
+ * on previous node
+ */
+ early_res_to_bootmem(start, end);
+
+ /*
+ * in some case early_node_mem could use alloc_bootmem
+ * to get range on other node, don't reserve that again
+ */
+ if (nid != nodeid)
+ printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
+ else
+ reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
+ pgdat_size, BOOTMEM_DEFAULT);
+ nid = phys_to_nid(bootmap_start);
+ if (nid != nodeid)
+ printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
+ else
+ reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
+ bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
+
#ifdef CONFIG_ACPI_NUMA
srat_reserve_add_area(nodeid);
#endif
@@ -378,9 +416,10 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
* Sets up the system RAM area from start_pfn to end_pfn according to the
* numa=fake command-line option.
*/
+static struct bootnode nodes[MAX_NUMNODES] __initdata;
+
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
- struct bootnode nodes[MAX_NUMNODES];
u64 size, addr = start_pfn << PAGE_SHIFT;
u64 max_addr = end_pfn << PAGE_SHIFT;
int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i;
@@ -460,7 +499,7 @@ done:
}
}
out:
- memnode_shift = compute_hash_shift(nodes, num_nodes);
+ memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
if (memnode_shift < 0) {
memnode_shift = 0;
printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
@@ -548,8 +587,6 @@ void __cpuinit numa_set_node(int cpu, int node)
{
int *cpu_to_node_map = x86_cpu_to_node_map_early_ptr;
- cpu_pda(cpu)->nodenumber = node;
-
if(cpu_to_node_map)
cpu_to_node_map[cpu] = node;
else if(per_cpu_offset(cpu))
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7b79f6be4e7d..bd5e05c654dc 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -9,6 +9,8 @@
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
#include <asm/e820.h>
#include <asm/processor.h>
@@ -17,6 +19,7 @@
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/proto.h>
+#include <asm/pat.h>
/*
* The current flushing context - we pass it instead of 5 arguments:
@@ -28,6 +31,7 @@ struct cpa_data {
int numpages;
int flushtlb;
unsigned long pfn;
+ unsigned force_split : 1;
};
#ifdef CONFIG_X86_64
@@ -259,6 +263,9 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
int i, do_split = 1;
unsigned int level;
+ if (cpa->force_split)
+ return 1;
+
spin_lock_irqsave(&pgd_lock, flags);
/*
* Check for races, another CPU might have split this page
@@ -476,9 +483,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
goto out_unlock;
pbase = (pte_t *)page_address(base);
-#ifdef CONFIG_X86_32
- paravirt_alloc_pt(&init_mm, page_to_pfn(base));
-#endif
+ paravirt_alloc_pte(&init_mm, page_to_pfn(base));
ref_prot = pte_pgprot(pte_clrhuge(*kpte));
#ifdef CONFIG_X86_64
@@ -535,7 +540,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary)
repeat:
kpte = lookup_address(address, &level);
if (!kpte)
- return primary ? -EINVAL : 0;
+ return 0;
old_pte = *kpte;
if (!pte_val(old_pte)) {
@@ -693,7 +698,8 @@ static inline int cache_attr(pgprot_t attr)
}
static int change_page_attr_set_clr(unsigned long addr, int numpages,
- pgprot_t mask_set, pgprot_t mask_clr)
+ pgprot_t mask_set, pgprot_t mask_clr,
+ int force_split)
{
struct cpa_data cpa;
int ret, cache, checkalias;
@@ -704,7 +710,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
*/
mask_set = canon_pgprot(mask_set);
mask_clr = canon_pgprot(mask_clr);
- if (!pgprot_val(mask_set) && !pgprot_val(mask_clr))
+ if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split)
return 0;
/* Ensure we are PAGE_SIZE aligned */
@@ -721,6 +727,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
cpa.mask_set = mask_set;
cpa.mask_clr = mask_clr;
cpa.flushtlb = 0;
+ cpa.force_split = force_split;
/* No alias checking for _NX bit modifications */
checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
@@ -759,26 +766,61 @@ out:
static inline int change_page_attr_set(unsigned long addr, int numpages,
pgprot_t mask)
{
- return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0));
+ return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0);
}
static inline int change_page_attr_clear(unsigned long addr, int numpages,
pgprot_t mask)
{
- return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask);
+ return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0);
}
-int set_memory_uc(unsigned long addr, int numpages)
+int _set_memory_uc(unsigned long addr, int numpages)
{
return change_page_attr_set(addr, numpages,
- __pgprot(_PAGE_PCD));
+ __pgprot(_PAGE_CACHE_UC));
+}
+
+int set_memory_uc(unsigned long addr, int numpages)
+{
+ if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
+ _PAGE_CACHE_UC, NULL))
+ return -EINVAL;
+
+ return _set_memory_uc(addr, numpages);
}
EXPORT_SYMBOL(set_memory_uc);
-int set_memory_wb(unsigned long addr, int numpages)
+int _set_memory_wc(unsigned long addr, int numpages)
+{
+ return change_page_attr_set(addr, numpages,
+ __pgprot(_PAGE_CACHE_WC));
+}
+
+int set_memory_wc(unsigned long addr, int numpages)
+{
+ if (!pat_wc_enabled)
+ return set_memory_uc(addr, numpages);
+
+ if (reserve_memtype(addr, addr + numpages * PAGE_SIZE,
+ _PAGE_CACHE_WC, NULL))
+ return -EINVAL;
+
+ return _set_memory_wc(addr, numpages);
+}
+EXPORT_SYMBOL(set_memory_wc);
+
+int _set_memory_wb(unsigned long addr, int numpages)
{
return change_page_attr_clear(addr, numpages,
- __pgprot(_PAGE_PCD | _PAGE_PWT));
+ __pgprot(_PAGE_CACHE_MASK));
+}
+
+int set_memory_wb(unsigned long addr, int numpages)
+{
+ free_memtype(addr, addr + numpages * PAGE_SIZE);
+
+ return _set_memory_wb(addr, numpages);
}
EXPORT_SYMBOL(set_memory_wb);
@@ -809,6 +851,12 @@ int set_memory_np(unsigned long addr, int numpages)
return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT));
}
+int set_memory_4k(unsigned long addr, int numpages)
+{
+ return change_page_attr_set_clr(addr, numpages, __pgprot(0),
+ __pgprot(0), 1);
+}
+
int set_pages_uc(struct page *page, int numpages)
{
unsigned long addr = (unsigned long)page_address(page);
@@ -918,6 +966,45 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
cpa_fill_pool(NULL);
}
+#ifdef CONFIG_DEBUG_FS
+static int dpa_show(struct seq_file *m, void *v)
+{
+ seq_puts(m, "DEBUG_PAGEALLOC\n");
+ seq_printf(m, "pool_size : %lu\n", pool_size);
+ seq_printf(m, "pool_pages : %lu\n", pool_pages);
+ seq_printf(m, "pool_low : %lu\n", pool_low);
+ seq_printf(m, "pool_used : %lu\n", pool_used);
+ seq_printf(m, "pool_failed : %lu\n", pool_failed);
+
+ return 0;
+}
+
+static int dpa_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, dpa_show, NULL);
+}
+
+static const struct file_operations dpa_fops = {
+ .open = dpa_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init debug_pagealloc_proc_init(void)
+{
+ struct dentry *de;
+
+ de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
+ &dpa_fops);
+ if (!de)
+ return -ENOMEM;
+
+ return 0;
+}
+__initcall(debug_pagealloc_proc_init);
+#endif
+
#ifdef CONFIG_HIBERNATION
bool kernel_page_present(struct page *page)
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
new file mode 100644
index 000000000000..277446cd30b6
--- /dev/null
+++ b/arch/x86/mm/pat.c
@@ -0,0 +1,596 @@
+/*
+ * Handle caching attributes in page tables (PAT)
+ *
+ * Authors: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
+ * Suresh B Siddha <suresh.b.siddha@intel.com>
+ *
+ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
+ */
+
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/bootmem.h>
+
+#include <asm/msr.h>
+#include <asm/tlbflush.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/pat.h>
+#include <asm/e820.h>
+#include <asm/cacheflush.h>
+#include <asm/fcntl.h>
+#include <asm/mtrr.h>
+#include <asm/io.h>
+
+int pat_wc_enabled = 1;
+
+static u64 __read_mostly boot_pat_state;
+
+static int nopat(char *str)
+{
+ pat_wc_enabled = 0;
+ printk(KERN_INFO "x86: PAT support disabled.\n");
+
+ return 0;
+}
+early_param("nopat", nopat);
+
+static int pat_known_cpu(void)
+{
+ if (!pat_wc_enabled)
+ return 0;
+
+ if (cpu_has_pat)
+ return 1;
+
+ pat_wc_enabled = 0;
+ printk(KERN_INFO "CPU and/or kernel does not support PAT.\n");
+ return 0;
+}
+
+enum {
+ PAT_UC = 0, /* uncached */
+ PAT_WC = 1, /* Write combining */
+ PAT_WT = 4, /* Write Through */
+ PAT_WP = 5, /* Write Protected */
+ PAT_WB = 6, /* Write Back (default) */
+ PAT_UC_MINUS = 7, /* UC, but can be overriden by MTRR */
+};
+
+#define PAT(x,y) ((u64)PAT_ ## y << ((x)*8))
+
+void pat_init(void)
+{
+ u64 pat;
+
+#ifndef CONFIG_X86_PAT
+ nopat(NULL);
+#endif
+
+ /* Boot CPU enables PAT based on CPU feature */
+ if (!smp_processor_id() && !pat_known_cpu())
+ return;
+
+ /* APs enable PAT iff boot CPU has enabled it before */
+ if (smp_processor_id() && !pat_wc_enabled)
+ return;
+
+ /* Set PWT to Write-Combining. All other bits stay the same */
+ /*
+ * PTE encoding used in Linux:
+ * PAT
+ * |PCD
+ * ||PWT
+ * |||
+ * 000 WB _PAGE_CACHE_WB
+ * 001 WC _PAGE_CACHE_WC
+ * 010 UC- _PAGE_CACHE_UC_MINUS
+ * 011 UC _PAGE_CACHE_UC
+ * PAT bit unused
+ */
+ pat = PAT(0,WB) | PAT(1,WC) | PAT(2,UC_MINUS) | PAT(3,UC) |
+ PAT(4,WB) | PAT(5,WC) | PAT(6,UC_MINUS) | PAT(7,UC);
+
+ /* Boot CPU check */
+ if (!smp_processor_id()) {
+ rdmsrl(MSR_IA32_CR_PAT, boot_pat_state);
+ }
+
+ wrmsrl(MSR_IA32_CR_PAT, pat);
+ printk(KERN_INFO "x86 PAT enabled: cpu %d, old 0x%Lx, new 0x%Lx\n",
+ smp_processor_id(), boot_pat_state, pat);
+}
+
+#undef PAT
+
+static char *cattr_name(unsigned long flags)
+{
+ switch (flags & _PAGE_CACHE_MASK) {
+ case _PAGE_CACHE_UC: return "uncached";
+ case _PAGE_CACHE_UC_MINUS: return "uncached-minus";
+ case _PAGE_CACHE_WB: return "write-back";
+ case _PAGE_CACHE_WC: return "write-combining";
+ default: return "broken";
+ }
+}
+
+/*
+ * The global memtype list keeps track of memory type for specific
+ * physical memory areas. Conflicting memory types in different
+ * mappings can cause CPU cache corruption. To avoid this we keep track.
+ *
+ * The list is sorted based on starting address and can contain multiple
+ * entries for each address (this allows reference counting for overlapping
+ * areas). All the aliases have the same cache attributes of course.
+ * Zero attributes are represented as holes.
+ *
+ * Currently the data structure is a list because the number of mappings
+ * are expected to be relatively small. If this should be a problem
+ * it could be changed to a rbtree or similar.
+ *
+ * memtype_lock protects the whole list.
+ */
+
+struct memtype {
+ u64 start;
+ u64 end;
+ unsigned long type;
+ struct list_head nd;
+};
+
+static LIST_HEAD(memtype_list);
+static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
+
+/*
+ * Does intersection of PAT memory type and MTRR memory type and returns
+ * the resulting memory type as PAT understands it.
+ * (Type in pat and mtrr will not have same value)
+ * The intersection is based on "Effective Memory Type" tables in IA-32
+ * SDM vol 3a
+ */
+static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot,
+ unsigned long *ret_prot)
+{
+ unsigned long pat_type;
+ u8 mtrr_type;
+
+ mtrr_type = mtrr_type_lookup(start, end);
+ if (mtrr_type == 0xFF) { /* MTRR not enabled */
+ *ret_prot = prot;
+ return 0;
+ }
+ if (mtrr_type == 0xFE) { /* MTRR match error */
+ *ret_prot = _PAGE_CACHE_UC;
+ return -1;
+ }
+ if (mtrr_type != MTRR_TYPE_UNCACHABLE &&
+ mtrr_type != MTRR_TYPE_WRBACK &&
+ mtrr_type != MTRR_TYPE_WRCOMB) { /* MTRR type unhandled */
+ *ret_prot = _PAGE_CACHE_UC;
+ return -1;
+ }
+
+ pat_type = prot & _PAGE_CACHE_MASK;
+ prot &= (~_PAGE_CACHE_MASK);
+
+ /* Currently doing intersection by hand. Optimize it later. */
+ if (pat_type == _PAGE_CACHE_WC) {
+ *ret_prot = prot | _PAGE_CACHE_WC;
+ } else if (pat_type == _PAGE_CACHE_UC_MINUS) {
+ *ret_prot = prot | _PAGE_CACHE_UC_MINUS;
+ } else if (pat_type == _PAGE_CACHE_UC ||
+ mtrr_type == MTRR_TYPE_UNCACHABLE) {
+ *ret_prot = prot | _PAGE_CACHE_UC;
+ } else if (mtrr_type == MTRR_TYPE_WRCOMB) {
+ *ret_prot = prot | _PAGE_CACHE_WC;
+ } else {
+ *ret_prot = prot | _PAGE_CACHE_WB;
+ }
+
+ return 0;
+}
+
+/*
+ * req_type typically has one of the:
+ * - _PAGE_CACHE_WB
+ * - _PAGE_CACHE_WC
+ * - _PAGE_CACHE_UC_MINUS
+ * - _PAGE_CACHE_UC
+ *
+ * req_type will have a special case value '-1', when requester want to inherit
+ * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
+ *
+ * If ret_type is NULL, function will return an error if it cannot reserve the
+ * region with req_type. If ret_type is non-null, function will return
+ * available type in ret_type in case of no error. In case of any error
+ * it will return a negative return value.
+ */
+int reserve_memtype(u64 start, u64 end, unsigned long req_type,
+ unsigned long *ret_type)
+{
+ struct memtype *new_entry = NULL;
+ struct memtype *parse;
+ unsigned long actual_type;
+ int err = 0;
+
+ /* Only track when pat_wc_enabled */
+ if (!pat_wc_enabled) {
+ /* This is identical to page table setting without PAT */
+ if (ret_type) {
+ if (req_type == -1) {
+ *ret_type = _PAGE_CACHE_WB;
+ } else {
+ *ret_type = req_type;
+ }
+ }
+ return 0;
+ }
+
+ /* Low ISA region is always mapped WB in page table. No need to track */
+ if (start >= ISA_START_ADDRESS && (end - 1) <= ISA_END_ADDRESS) {
+ if (ret_type)
+ *ret_type = _PAGE_CACHE_WB;
+
+ return 0;
+ }
+
+ if (req_type == -1) {
+ /*
+ * Special case where caller wants to inherit from mtrr or
+ * existing pat mapping, defaulting to UC_MINUS in case of
+ * no match.
+ */
+ u8 mtrr_type = mtrr_type_lookup(start, end);
+ if (mtrr_type == 0xFE) { /* MTRR match error */
+ err = -1;
+ }
+
+ if (mtrr_type == MTRR_TYPE_WRBACK) {
+ req_type = _PAGE_CACHE_WB;
+ actual_type = _PAGE_CACHE_WB;
+ } else {
+ req_type = _PAGE_CACHE_UC_MINUS;
+ actual_type = _PAGE_CACHE_UC_MINUS;
+ }
+ } else {
+ req_type &= _PAGE_CACHE_MASK;
+ err = pat_x_mtrr_type(start, end, req_type, &actual_type);
+ }
+
+ if (err) {
+ if (ret_type)
+ *ret_type = actual_type;
+
+ return -EINVAL;
+ }
+
+ new_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
+ if (!new_entry)
+ return -ENOMEM;
+
+ new_entry->start = start;
+ new_entry->end = end;
+ new_entry->type = actual_type;
+
+ if (ret_type)
+ *ret_type = actual_type;
+
+ spin_lock(&memtype_lock);
+
+ /* Search for existing mapping that overlaps the current range */
+ list_for_each_entry(parse, &memtype_list, nd) {
+ struct memtype *saved_ptr;
+
+ if (parse->start >= end) {
+ pr_debug("New Entry\n");
+ list_add(&new_entry->nd, parse->nd.prev);
+ new_entry = NULL;
+ break;
+ }
+
+ if (start <= parse->start && end >= parse->start) {
+ if (actual_type != parse->type && ret_type) {
+ actual_type = parse->type;
+ *ret_type = actual_type;
+ new_entry->type = actual_type;
+ }
+
+ if (actual_type != parse->type) {
+ printk(
+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid,
+ start, end,
+ cattr_name(actual_type),
+ cattr_name(parse->type));
+ err = -EBUSY;
+ break;
+ }
+
+ saved_ptr = parse;
+ /*
+ * Check to see whether the request overlaps more
+ * than one entry in the list
+ */
+ list_for_each_entry_continue(parse, &memtype_list, nd) {
+ if (end <= parse->start) {
+ break;
+ }
+
+ if (actual_type != parse->type) {
+ printk(
+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid,
+ start, end,
+ cattr_name(actual_type),
+ cattr_name(parse->type));
+ err = -EBUSY;
+ break;
+ }
+ }
+
+ if (err) {
+ break;
+ }
+
+ pr_debug("Overlap at 0x%Lx-0x%Lx\n",
+ saved_ptr->start, saved_ptr->end);
+ /* No conflict. Go ahead and add this new entry */
+ list_add(&new_entry->nd, saved_ptr->nd.prev);
+ new_entry = NULL;
+ break;
+ }
+
+ if (start < parse->end) {
+ if (actual_type != parse->type && ret_type) {
+ actual_type = parse->type;
+ *ret_type = actual_type;
+ new_entry->type = actual_type;
+ }
+
+ if (actual_type != parse->type) {
+ printk(
+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid,
+ start, end,
+ cattr_name(actual_type),
+ cattr_name(parse->type));
+ err = -EBUSY;
+ break;
+ }
+
+ saved_ptr = parse;
+ /*
+ * Check to see whether the request overlaps more
+ * than one entry in the list
+ */
+ list_for_each_entry_continue(parse, &memtype_list, nd) {
+ if (end <= parse->start) {
+ break;
+ }
+
+ if (actual_type != parse->type) {
+ printk(
+ KERN_INFO "%s:%d conflicting memory types %Lx-%Lx %s<->%s\n",
+ current->comm, current->pid,
+ start, end,
+ cattr_name(actual_type),
+ cattr_name(parse->type));
+ err = -EBUSY;
+ break;
+ }
+ }
+
+ if (err) {
+ break;
+ }
+
+ pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n",
+ saved_ptr->start, saved_ptr->end);
+ /* No conflict. Go ahead and add this new entry */
+ list_add(&new_entry->nd, &saved_ptr->nd);
+ new_entry = NULL;
+ break;
+ }
+ }
+
+ if (err) {
+ printk(KERN_INFO
+ "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n",
+ start, end, cattr_name(new_entry->type),
+ cattr_name(req_type));
+ kfree(new_entry);
+ spin_unlock(&memtype_lock);
+ return err;
+ }
+
+ if (new_entry) {
+ /* No conflict. Not yet added to the list. Add to the tail */
+ list_add_tail(&new_entry->nd, &memtype_list);
+ pr_debug("New Entry\n");
+ }
+
+ if (ret_type) {
+ pr_debug(
+ "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n",
+ start, end, cattr_name(actual_type),
+ cattr_name(req_type), cattr_name(*ret_type));
+ } else {
+ pr_debug(
+ "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n",
+ start, end, cattr_name(actual_type),
+ cattr_name(req_type));
+ }
+
+ spin_unlock(&memtype_lock);
+ return err;
+}
+
+int free_memtype(u64 start, u64 end)
+{
+ struct memtype *ml;
+ int err = -EINVAL;
+
+ /* Only track when pat_wc_enabled */
+ if (!pat_wc_enabled) {
+ return 0;
+ }
+
+ /* Low ISA region is always mapped WB. No need to track */
+ if (start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS) {
+ return 0;
+ }
+
+ spin_lock(&memtype_lock);
+ list_for_each_entry(ml, &memtype_list, nd) {
+ if (ml->start == start && ml->end == end) {
+ list_del(&ml->nd);
+ kfree(ml);
+ err = 0;
+ break;
+ }
+ }
+ spin_unlock(&memtype_lock);
+
+ if (err) {
+ printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n",
+ current->comm, current->pid, start, end);
+ }
+
+ pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end);
+ return err;
+}
+
+
+/*
+ * /dev/mem mmap interface. The memtype used for mapping varies:
+ * - Use UC for mappings with O_SYNC flag
+ * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
+ * inherit the memtype from existing mapping.
+ * - Else use UC_MINUS memtype (for backward compatibility with existing
+ * X drivers.
+ */
+pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t vma_prot)
+{
+ return vma_prot;
+}
+
+#ifdef CONFIG_NONPROMISC_DEVMEM
+/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+ return 1;
+}
+#else
+static inline int range_is_allowed(unsigned long pfn, unsigned long size)
+{
+ u64 from = ((u64)pfn) << PAGE_SHIFT;
+ u64 to = from + size;
+ u64 cursor = from;
+
+ while (cursor < to) {
+ if (!devmem_is_allowed(pfn)) {
+ printk(KERN_INFO
+ "Program %s tried to access /dev/mem between %Lx->%Lx.\n",
+ current->comm, from, to);
+ return 0;
+ }
+ cursor += PAGE_SIZE;
+ pfn++;
+ }
+ return 1;
+}
+#endif /* CONFIG_NONPROMISC_DEVMEM */
+
+int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
+ unsigned long size, pgprot_t *vma_prot)
+{
+ u64 offset = ((u64) pfn) << PAGE_SHIFT;
+ unsigned long flags = _PAGE_CACHE_UC_MINUS;
+ int retval;
+
+ if (!range_is_allowed(pfn, size))
+ return 0;
+
+ if (file->f_flags & O_SYNC) {
+ flags = _PAGE_CACHE_UC;
+ }
+
+#ifdef CONFIG_X86_32
+ /*
+ * On the PPro and successors, the MTRRs are used to set
+ * memory types for physical addresses outside main memory,
+ * so blindly setting UC or PWT on those pages is wrong.
+ * For Pentiums and earlier, the surround logic should disable
+ * caching for the high addresses through the KEN pin, but
+ * we maintain the tradition of paranoia in this code.
+ */
+ if (!pat_wc_enabled &&
+ ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) ||
+ test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) ||
+ test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) ||
+ test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) &&
+ (pfn << PAGE_SHIFT) >= __pa(high_memory)) {
+ flags = _PAGE_CACHE_UC;
+ }
+#endif
+
+ /*
+ * With O_SYNC, we can only take UC mapping. Fail if we cannot.
+ * Without O_SYNC, we want to get
+ * - WB for WB-able memory and no other conflicting mappings
+ * - UC_MINUS for non-WB-able memory with no other conflicting mappings
+ * - Inherit from confliting mappings otherwise
+ */
+ if (flags != _PAGE_CACHE_UC_MINUS) {
+ retval = reserve_memtype(offset, offset + size, flags, NULL);
+ } else {
+ retval = reserve_memtype(offset, offset + size, -1, &flags);
+ }
+
+ if (retval < 0)
+ return 0;
+
+ if (pfn <= max_pfn_mapped &&
+ ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) {
+ free_memtype(offset, offset + size);
+ printk(KERN_INFO
+ "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n",
+ current->comm, current->pid,
+ cattr_name(flags),
+ offset, offset + size);
+ return 0;
+ }
+
+ *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) |
+ flags);
+ return 1;
+}
+
+void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
+{
+ u64 addr = (u64)pfn << PAGE_SHIFT;
+ unsigned long flags;
+ unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
+
+ reserve_memtype(addr, addr + size, want_flags, &flags);
+ if (flags != want_flags) {
+ printk(KERN_INFO
+ "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n",
+ current->comm, current->pid,
+ cattr_name(want_flags),
+ addr, addr + size,
+ cattr_name(flags));
+ }
+}
+
+void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
+{
+ u64 addr = (u64)pfn << PAGE_SHIFT;
+
+ free_memtype(addr, addr + size);
+}
+
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
new file mode 100644
index 000000000000..50159764f694
--- /dev/null
+++ b/arch/x86/mm/pgtable.c
@@ -0,0 +1,276 @@
+#include <linux/mm.h>
+#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
+#include <asm/tlb.h>
+
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+ return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+}
+
+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+ struct page *pte;
+
+#ifdef CONFIG_HIGHPTE
+ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+#else
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+#endif
+ if (pte)
+ pgtable_page_ctor(pte);
+ return pte;
+}
+
+void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+{
+ pgtable_page_dtor(pte);
+ paravirt_release_pte(page_to_pfn(pte));
+ tlb_remove_page(tlb, pte);
+}
+
+#if PAGETABLE_LEVELS > 2
+void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+ paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
+ tlb_remove_page(tlb, virt_to_page(pmd));
+}
+
+#if PAGETABLE_LEVELS > 3
+void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+{
+ paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
+ tlb_remove_page(tlb, virt_to_page(pud));
+}
+#endif /* PAGETABLE_LEVELS > 3 */
+#endif /* PAGETABLE_LEVELS > 2 */
+
+static inline void pgd_list_add(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+
+ list_add(&page->lru, &pgd_list);
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+
+ list_del(&page->lru);
+}
+
+#define UNSHARED_PTRS_PER_PGD \
+ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
+
+static void pgd_ctor(void *p)
+{
+ pgd_t *pgd = p;
+ unsigned long flags;
+
+ /* Clear usermode parts of PGD */
+ memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
+
+ spin_lock_irqsave(&pgd_lock, flags);
+
+ /* If the pgd points to a shared pagetable level (either the
+ ptes in non-PAE, or shared PMD in PAE), then just copy the
+ references from swapper_pg_dir. */
+ if (PAGETABLE_LEVELS == 2 ||
+ (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
+ PAGETABLE_LEVELS == 4) {
+ clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+ paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
+ __pa(swapper_pg_dir) >> PAGE_SHIFT,
+ KERNEL_PGD_BOUNDARY,
+ KERNEL_PGD_PTRS);
+ }
+
+ /* list required to sync kernel mapping updates */
+ if (!SHARED_KERNEL_PMD)
+ pgd_list_add(pgd);
+
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+static void pgd_dtor(void *pgd)
+{
+ unsigned long flags; /* can be called from interrupt context */
+
+ if (SHARED_KERNEL_PMD)
+ return;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ pgd_list_del(pgd);
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * -- wli
+ */
+
+#ifdef CONFIG_X86_PAE
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
+{
+ int i;
+
+ for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+ pgd_t pgd = pgdp[i];
+
+ if (pgd_val(pgd) != 0) {
+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+ pgdp[i] = native_make_pgd(0);
+
+ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
+ pmd_free(mm, pmd);
+ }
+ }
+}
+
+/*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update. Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+ pud_t *pud;
+ unsigned long addr;
+ int i;
+
+ pud = pud_offset(pgd, 0);
+ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
+ i++, pud++, addr += PUD_SIZE) {
+ pmd_t *pmd = pmd_alloc_one(mm, addr);
+
+ if (!pmd) {
+ pgd_mop_up_pmds(mm, pgd);
+ return 0;
+ }
+
+ if (i >= KERNEL_PGD_BOUNDARY)
+ memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
+ sizeof(pmd_t) * PTRS_PER_PMD);
+
+ pud_populate(mm, pud, pmd);
+ }
+
+ return 1;
+}
+
+void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
+
+ /* Note: almost everything apart from _PAGE_PRESENT is
+ reserved at the pmd (PDPT) level. */
+ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+
+ /*
+ * According to Intel App note "TLBs, Paging-Structure Caches,
+ * and Their Invalidation", April 2007, document 317080-001,
+ * section 8.1: in PAE mode we explicitly have to flush the
+ * TLB via cr3 if the top-level pgd is changed...
+ */
+ if (mm == current->active_mm)
+ write_cr3(read_cr3());
+}
+#else /* !CONFIG_X86_PAE */
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+ return 1;
+}
+
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
+{
+}
+#endif /* CONFIG_X86_PAE */
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+
+ /* so that alloc_pmd can use it */
+ mm->pgd = pgd;
+ if (pgd)
+ pgd_ctor(pgd);
+
+ if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
+ pgd_dtor(pgd);
+ free_page((unsigned long)pgd);
+ pgd = NULL;
+ }
+
+ return pgd;
+}
+
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+ pgd_mop_up_pmds(mm, pgd);
+ pgd_dtor(pgd);
+ free_page((unsigned long)pgd);
+}
+
+int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ int changed = !pte_same(*ptep, entry);
+
+ if (changed && dirty) {
+ *ptep = entry;
+ pte_update_defer(vma->vm_mm, address, ptep);
+ flush_tlb_page(vma, address);
+ }
+
+ return changed;
+}
+
+int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ int ret = 0;
+
+ if (pte_young(*ptep))
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+ &ptep->pte);
+
+ if (ret)
+ pte_update(vma->vm_mm, addr, ptep);
+
+ return ret;
+}
+
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ int young;
+
+ young = ptep_test_and_clear_young(vma, address, ptep);
+ if (young)
+ flush_tlb_page(vma, address);
+
+ return young;
+}
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 2f9e9afcb9f4..9ee007be9142 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -1,7 +1,3 @@
-/*
- * linux/arch/i386/mm/pgtable.c
- */
-
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
@@ -36,7 +32,6 @@ void show_mem(void)
printk(KERN_INFO "Mem-info:\n");
show_free_areas();
- printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
for_each_online_pgdat(pgdat) {
pgdat_resize_lock(pgdat, &flags);
for (i = 0; i < pgdat->node_spanned_pages; ++i) {
@@ -178,206 +173,9 @@ void reserve_top_address(unsigned long reserve)
__VMALLOC_RESERVE += reserve;
}
-pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
-{
- return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
-}
-
-pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+int pmd_bad(pmd_t pmd)
{
- struct page *pte;
+ WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd));
-#ifdef CONFIG_HIGHPTE
- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
-#else
- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
-#endif
- if (pte)
- pgtable_page_ctor(pte);
- return pte;
+ return pmd_bad_v1(pmd);
}
-
-/*
- * List of all pgd's needed for non-PAE so it can invalidate entries
- * in both cached and uncached pgd's; not needed for PAE since the
- * kernel pmd is shared. If PAE were not to share the pmd a similar
- * tactic would be needed. This is essentially codepath-based locking
- * against pageattr.c; it is the unique case in which a valid change
- * of kernel pagetables can't be lazily synchronized by vmalloc faults.
- * vmalloc faults work because attached pagetables are never freed.
- * -- wli
- */
-static inline void pgd_list_add(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
-
- list_add(&page->lru, &pgd_list);
-}
-
-static inline void pgd_list_del(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
-
- list_del(&page->lru);
-}
-
-#define UNSHARED_PTRS_PER_PGD \
- (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
-
-static void pgd_ctor(void *p)
-{
- pgd_t *pgd = p;
- unsigned long flags;
-
- /* Clear usermode parts of PGD */
- memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
-
- spin_lock_irqsave(&pgd_lock, flags);
-
- /* If the pgd points to a shared pagetable level (either the
- ptes in non-PAE, or shared PMD in PAE), then just copy the
- references from swapper_pg_dir. */
- if (PAGETABLE_LEVELS == 2 ||
- (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
- clone_pgd_range(pgd + USER_PTRS_PER_PGD,
- swapper_pg_dir + USER_PTRS_PER_PGD,
- KERNEL_PGD_PTRS);
- paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
- __pa(swapper_pg_dir) >> PAGE_SHIFT,
- USER_PTRS_PER_PGD,
- KERNEL_PGD_PTRS);
- }
-
- /* list required to sync kernel mapping updates */
- if (!SHARED_KERNEL_PMD)
- pgd_list_add(pgd);
-
- spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-static void pgd_dtor(void *pgd)
-{
- unsigned long flags; /* can be called from interrupt context */
-
- if (SHARED_KERNEL_PMD)
- return;
-
- spin_lock_irqsave(&pgd_lock, flags);
- pgd_list_del(pgd);
- spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * Mop up any pmd pages which may still be attached to the pgd.
- * Normally they will be freed by munmap/exit_mmap, but any pmd we
- * preallocate which never got a corresponding vma will need to be
- * freed manually.
- */
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
-{
- int i;
-
- for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
- pgd_t pgd = pgdp[i];
-
- if (pgd_val(pgd) != 0) {
- pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
-
- pgdp[i] = native_make_pgd(0);
-
- paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
- pmd_free(mm, pmd);
- }
- }
-}
-
-/*
- * In PAE mode, we need to do a cr3 reload (=tlb flush) when
- * updating the top-level pagetable entries to guarantee the
- * processor notices the update. Since this is expensive, and
- * all 4 top-level entries are used almost immediately in a
- * new process's life, we just pre-populate them here.
- *
- * Also, if we're in a paravirt environment where the kernel pmd is
- * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
- * and initialize the kernel pmds here.
- */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
- pud_t *pud;
- unsigned long addr;
- int i;
-
- pud = pud_offset(pgd, 0);
- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
- i++, pud++, addr += PUD_SIZE) {
- pmd_t *pmd = pmd_alloc_one(mm, addr);
-
- if (!pmd) {
- pgd_mop_up_pmds(mm, pgd);
- return 0;
- }
-
- if (i >= USER_PTRS_PER_PGD)
- memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
- sizeof(pmd_t) * PTRS_PER_PMD);
-
- pud_populate(mm, pud, pmd);
- }
-
- return 1;
-}
-#else /* !CONFIG_X86_PAE */
-/* No need to prepopulate any pagetable entries in non-PAE modes. */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
- return 1;
-}
-
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
-{
-}
-#endif /* CONFIG_X86_PAE */
-
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-
- /* so that alloc_pd can use it */
- mm->pgd = pgd;
- if (pgd)
- pgd_ctor(pgd);
-
- if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
- pgd_dtor(pgd);
- free_page((unsigned long)pgd);
- pgd = NULL;
- }
-
- return pgd;
-}
-
-void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- pgd_mop_up_pmds(mm, pgd);
- pgd_dtor(pgd);
- free_page((unsigned long)pgd);
-}
-
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
-{
- pgtable_page_dtor(pte);
- paravirt_release_pt(page_to_pfn(pte));
- tlb_remove_page(tlb, pte);
-}
-
-#ifdef CONFIG_X86_PAE
-
-void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
-{
- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
- tlb_remove_page(tlb, virt_to_page(pmd));
-}
-
-#endif
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 845001c617cc..3890234e5b26 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -20,6 +20,7 @@
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>
+#include <asm/genapic.h>
int acpi_numa __initdata;
@@ -31,6 +32,10 @@ static struct bootnode nodes_add[MAX_NUMNODES];
static int found_add_area __initdata;
int hotadd_percent __initdata = 0;
+static int num_node_memblks __initdata;
+static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
+static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
+
/* Too small nodes confuse the VM badly. Usually they result
from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)
@@ -40,17 +45,17 @@ static __init int setup_node(int pxm)
return acpi_map_pxm_to_node(pxm);
}
-static __init int conflicting_nodes(unsigned long start, unsigned long end)
+static __init int conflicting_memblks(unsigned long start, unsigned long end)
{
int i;
- for_each_node_mask(i, nodes_parsed) {
- struct bootnode *nd = &nodes[i];
+ for (i = 0; i < num_node_memblks; i++) {
+ struct bootnode *nd = &node_memblk_range[i];
if (nd->start == nd->end)
continue;
if (nd->end > start && nd->start < end)
- return i;
+ return memblk_nodeid[i];
if (nd->end == end && nd->start == start)
- return i;
+ return memblk_nodeid[i];
}
return -1;
}
@@ -132,7 +137,6 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
int pxm, node;
int apic_id;
- apic_id = pa->apic_id;
if (srat_disabled())
return;
if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
@@ -148,13 +152,18 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
bad_srat();
return;
}
+
+ if (is_uv_system())
+ apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
+ else
+ apic_id = pa->apic_id;
apicid_to_node[apic_id] = node;
acpi_numa = 1;
printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
pxm, apic_id, node);
}
-int update_end_of_memory(unsigned long end) {return -1;}
+static int update_end_of_memory(unsigned long end) {return -1;}
static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static inline int save_add_info(void) {return 1;}
@@ -253,7 +262,7 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
bad_srat();
return;
}
- i = conflicting_nodes(start, end);
+ i = conflicting_memblks(start, end);
if (i == node) {
printk(KERN_WARNING
"SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
@@ -278,10 +287,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
nd->end = end;
}
- printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
- nd->start, nd->end);
- e820_register_active_regions(node, nd->start >> PAGE_SHIFT,
- nd->end >> PAGE_SHIFT);
+ printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
+ start, end);
+ e820_register_active_regions(node, start >> PAGE_SHIFT,
+ end >> PAGE_SHIFT);
push_node_boundaries(node, nd->start >> PAGE_SHIFT,
nd->end >> PAGE_SHIFT);
@@ -293,6 +302,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
if ((nd->start | nd->end) == 0)
node_clear(node, nodes_parsed);
}
+
+ node_memblk_range[num_node_memblks].start = start;
+ node_memblk_range[num_node_memblks].end = end;
+ memblk_nodeid[num_node_memblks] = node;
+ num_node_memblks++;
}
/* Sanity check to catch more bad SRATs (they are amazingly common).
@@ -363,7 +377,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
return -1;
}
- memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
+ memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
+ memblk_nodeid);
if (memnode_shift < 0) {
printk(KERN_ERR
"SRAT: No NUMA node hash function found. Contact maintainer\n");
diff --git a/arch/x86/oprofile/init.c b/arch/x86/oprofile/init.c
index 5341d481d92f..cdfe4c54deca 100644
--- a/arch/x86/oprofile/init.c
+++ b/arch/x86/oprofile/init.c
@@ -10,18 +10,19 @@
#include <linux/oprofile.h>
#include <linux/init.h>
#include <linux/errno.h>
-
-/* We support CPUs that have performance counters like the Pentium Pro
+
+/*
+ * We support CPUs that have performance counters like the Pentium Pro
* with the NMI mode driver.
*/
-
-extern int op_nmi_init(struct oprofile_operations * ops);
-extern int op_nmi_timer_init(struct oprofile_operations * ops);
+
+extern int op_nmi_init(struct oprofile_operations *ops);
+extern int op_nmi_timer_init(struct oprofile_operations *ops);
extern void op_nmi_exit(void);
extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth);
-int __init oprofile_arch_init(struct oprofile_operations * ops)
+int __init oprofile_arch_init(struct oprofile_operations *ops)
{
int ret;
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 1f11cf0a307f..cc48d3fde545 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -23,8 +23,8 @@
#include "op_x86_model.h"
static struct op_x86_model_spec const *model;
-static struct op_msrs cpu_msrs[NR_CPUS];
-static unsigned long saved_lvtpc[NR_CPUS];
+static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
+static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
static int nmi_start(void);
static void nmi_stop(void);
@@ -89,7 +89,7 @@ static int profile_exceptions_notify(struct notifier_block *self,
switch (val) {
case DIE_NMI:
- if (model->check_ctrs(args->regs, &cpu_msrs[cpu]))
+ if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)))
ret = NOTIFY_STOP;
break;
default:
@@ -126,7 +126,7 @@ static void nmi_cpu_save_registers(struct op_msrs *msrs)
static void nmi_save_registers(void *dummy)
{
int cpu = smp_processor_id();
- struct op_msrs *msrs = &cpu_msrs[cpu];
+ struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
nmi_cpu_save_registers(msrs);
}
@@ -134,10 +134,10 @@ static void free_msrs(void)
{
int i;
for_each_possible_cpu(i) {
- kfree(cpu_msrs[i].counters);
- cpu_msrs[i].counters = NULL;
- kfree(cpu_msrs[i].controls);
- cpu_msrs[i].controls = NULL;
+ kfree(per_cpu(cpu_msrs, i).counters);
+ per_cpu(cpu_msrs, i).counters = NULL;
+ kfree(per_cpu(cpu_msrs, i).controls);
+ per_cpu(cpu_msrs, i).controls = NULL;
}
}
@@ -149,13 +149,15 @@ static int allocate_msrs(void)
int i;
for_each_possible_cpu(i) {
- cpu_msrs[i].counters = kmalloc(counters_size, GFP_KERNEL);
- if (!cpu_msrs[i].counters) {
+ per_cpu(cpu_msrs, i).counters = kmalloc(counters_size,
+ GFP_KERNEL);
+ if (!per_cpu(cpu_msrs, i).counters) {
success = 0;
break;
}
- cpu_msrs[i].controls = kmalloc(controls_size, GFP_KERNEL);
- if (!cpu_msrs[i].controls) {
+ per_cpu(cpu_msrs, i).controls = kmalloc(controls_size,
+ GFP_KERNEL);
+ if (!per_cpu(cpu_msrs, i).controls) {
success = 0;
break;
}
@@ -170,11 +172,11 @@ static int allocate_msrs(void)
static void nmi_cpu_setup(void *dummy)
{
int cpu = smp_processor_id();
- struct op_msrs *msrs = &cpu_msrs[cpu];
+ struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu);
spin_lock(&oprofilefs_lock);
model->setup_ctrs(msrs);
spin_unlock(&oprofilefs_lock);
- saved_lvtpc[cpu] = apic_read(APIC_LVTPC);
+ per_cpu(saved_lvtpc, cpu) = apic_read(APIC_LVTPC);
apic_write(APIC_LVTPC, APIC_DM_NMI);
}
@@ -203,13 +205,15 @@ static int nmi_setup(void)
*/
/* Assume saved/restored counters are the same on all CPUs */
- model->fill_in_addresses(&cpu_msrs[0]);
+ model->fill_in_addresses(&per_cpu(cpu_msrs, 0));
for_each_possible_cpu(cpu) {
if (cpu != 0) {
- memcpy(cpu_msrs[cpu].counters, cpu_msrs[0].counters,
+ memcpy(per_cpu(cpu_msrs, cpu).counters,
+ per_cpu(cpu_msrs, 0).counters,
sizeof(struct op_msr) * model->num_counters);
- memcpy(cpu_msrs[cpu].controls, cpu_msrs[0].controls,
+ memcpy(per_cpu(cpu_msrs, cpu).controls,
+ per_cpu(cpu_msrs, 0).controls,
sizeof(struct op_msr) * model->num_controls);
}
@@ -249,7 +253,7 @@ static void nmi_cpu_shutdown(void *dummy)
{
unsigned int v;
int cpu = smp_processor_id();
- struct op_msrs *msrs = &cpu_msrs[cpu];
+ struct op_msrs *msrs = &__get_cpu_var(cpu_msrs);
/* restoring APIC_LVTPC can trigger an apic error because the delivery
* mode and vector nr combination can be illegal. That's by design: on
@@ -258,23 +262,24 @@ static void nmi_cpu_shutdown(void *dummy)
*/
v = apic_read(APIC_LVTERR);
apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
- apic_write(APIC_LVTPC, saved_lvtpc[cpu]);
+ apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu));
apic_write(APIC_LVTERR, v);
nmi_restore_registers(msrs);
}
static void nmi_shutdown(void)
{
+ struct op_msrs *msrs = &__get_cpu_var(cpu_msrs);
nmi_enabled = 0;
on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1);
unregister_die_notifier(&profile_exceptions_nb);
- model->shutdown(cpu_msrs);
+ model->shutdown(msrs);
free_msrs();
}
static void nmi_cpu_start(void *dummy)
{
- struct op_msrs const *msrs = &cpu_msrs[smp_processor_id()];
+ struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
model->start(msrs);
}
@@ -286,7 +291,7 @@ static int nmi_start(void)
static void nmi_cpu_stop(void *dummy)
{
- struct op_msrs const *msrs = &cpu_msrs[smp_processor_id()];
+ struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs);
model->stop(msrs);
}
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
index 1418e36ae7ab..e3ecb71b5790 100644
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ b/arch/x86/oprofile/nmi_timer_int.c
@@ -17,14 +17,14 @@
#include <asm/nmi.h>
#include <asm/apic.h>
#include <asm/ptrace.h>
-
+
static int profile_timer_exceptions_notify(struct notifier_block *self,
unsigned long val, void *data)
{
struct die_args *args = (struct die_args *)data;
int ret = NOTIFY_DONE;
- switch(val) {
+ switch (val) {
case DIE_NMI:
oprofile_add_sample(args->regs, 0);
ret = NOTIFY_STOP;
@@ -56,7 +56,7 @@ static void timer_stop(void)
}
-int __init op_nmi_timer_init(struct oprofile_operations * ops)
+int __init op_nmi_timer_init(struct oprofile_operations *ops)
{
if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0))
return -ENODEV;
diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c
index c3ee43333f26..3d534879a9dc 100644
--- a/arch/x86/oprofile/op_model_athlon.c
+++ b/arch/x86/oprofile/op_model_athlon.c
@@ -1,4 +1,4 @@
-/**
+/*
* @file op_model_athlon.h
* athlon / K7 / K8 / Family 10h model-specific MSR operations
*
@@ -14,28 +14,28 @@
#include <asm/ptrace.h>
#include <asm/msr.h>
#include <asm/nmi.h>
-
+
#include "op_x86_model.h"
#include "op_counter.h"
#define NUM_COUNTERS 4
#define NUM_CONTROLS 4
-#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0)
-#define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0)
-#define CTR_WRITE(l,msrs,c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1);} while (0)
+#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
+#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
+#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0)
#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
-#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0)
-#define CTRL_READ(l,h,msrs,c) do {rdmsr(msrs->controls[(c)].addr, (l), (h));} while (0)
-#define CTRL_WRITE(l,h,msrs,c) do {wrmsr(msrs->controls[(c)].addr, (l), (h));} while (0)
+#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
+#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
+#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
#define CTRL_CLEAR_LO(x) (x &= (1<<21))
#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0)
#define CTRL_SET_ENABLE(val) (val |= 1<<20)
-#define CTRL_SET_USR(val,u) (val |= ((u & 1) << 16))
-#define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17))
+#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
+#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
#define CTRL_SET_UM(val, m) (val |= (m << 8))
#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff))
#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf))
@@ -43,19 +43,19 @@
#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
static unsigned long reset_value[NUM_COUNTERS];
-
+
static void athlon_fill_in_addresses(struct op_msrs * const msrs)
{
int i;
- for (i=0; i < NUM_COUNTERS; i++) {
+ for (i = 0; i < NUM_COUNTERS; i++) {
if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
else
msrs->counters[i].addr = 0;
}
- for (i=0; i < NUM_CONTROLS; i++) {
+ for (i = 0; i < NUM_CONTROLS; i++) {
if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
else
@@ -63,15 +63,15 @@ static void athlon_fill_in_addresses(struct op_msrs * const msrs)
}
}
-
+
static void athlon_setup_ctrs(struct op_msrs const * const msrs)
{
unsigned int low, high;
int i;
-
+
/* clear all counters */
for (i = 0 ; i < NUM_CONTROLS; ++i) {
- if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
+ if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
continue;
CTRL_READ(low, high, msrs, i);
CTRL_CLEAR_LO(low);
@@ -81,14 +81,14 @@ static void athlon_setup_ctrs(struct op_msrs const * const msrs)
/* avoid a false detection of ctr overflows in NMI handler */
for (i = 0; i < NUM_COUNTERS; ++i) {
- if (unlikely(!CTR_IS_RESERVED(msrs,i)))
+ if (unlikely(!CTR_IS_RESERVED(msrs, i)))
continue;
CTR_WRITE(1, msrs, i);
}
/* enable active counters */
for (i = 0; i < NUM_COUNTERS; ++i) {
- if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs,i))) {
+ if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
reset_value[i] = counter_config[i].count;
CTR_WRITE(counter_config[i].count, msrs, i);
@@ -112,7 +112,7 @@ static void athlon_setup_ctrs(struct op_msrs const * const msrs)
}
}
-
+
static int athlon_check_ctrs(struct pt_regs * const regs,
struct op_msrs const * const msrs)
{
@@ -133,7 +133,7 @@ static int athlon_check_ctrs(struct pt_regs * const regs,
return 1;
}
-
+
static void athlon_start(struct op_msrs const * const msrs)
{
unsigned int low, high;
@@ -150,7 +150,7 @@ static void athlon_start(struct op_msrs const * const msrs)
static void athlon_stop(struct op_msrs const * const msrs)
{
- unsigned int low,high;
+ unsigned int low, high;
int i;
/* Subtle: stop on all counters to avoid race with
@@ -169,11 +169,11 @@ static void athlon_shutdown(struct op_msrs const * const msrs)
int i;
for (i = 0 ; i < NUM_COUNTERS ; ++i) {
- if (CTR_IS_RESERVED(msrs,i))
+ if (CTR_IS_RESERVED(msrs, i))
release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
}
for (i = 0 ; i < NUM_CONTROLS ; ++i) {
- if (CTRL_IS_RESERVED(msrs,i))
+ if (CTRL_IS_RESERVED(msrs, i))
release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
}
}
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index c554f52cb808..eff431f6c57b 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -1,4 +1,4 @@
-/**
+/*
* @file op_model_ppro.h
* pentium pro / P6 model-specific MSR operations
*
@@ -15,45 +15,45 @@
#include <asm/msr.h>
#include <asm/apic.h>
#include <asm/nmi.h>
-
+
#include "op_x86_model.h"
#include "op_counter.h"
#define NUM_COUNTERS 2
#define NUM_CONTROLS 2
-#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0)
-#define CTR_READ(l,h,msrs,c) do {rdmsr(msrs->counters[(c)].addr, (l), (h));} while (0)
-#define CTR_32BIT_WRITE(l,msrs,c) \
- do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0);} while (0)
+#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
+#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
+#define CTR_32BIT_WRITE(l, msrs, c) \
+ do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0); } while (0)
#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
-#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0)
-#define CTRL_READ(l,h,msrs,c) do {rdmsr((msrs->controls[(c)].addr), (l), (h));} while (0)
-#define CTRL_WRITE(l,h,msrs,c) do {wrmsr((msrs->controls[(c)].addr), (l), (h));} while (0)
+#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
+#define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0)
+#define CTRL_WRITE(l, h, msrs, c) do {wrmsr((msrs->controls[(c)].addr), (l), (h)); } while (0)
#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
#define CTRL_CLEAR(x) (x &= (1<<21))
#define CTRL_SET_ENABLE(val) (val |= 1<<20)
-#define CTRL_SET_USR(val,u) (val |= ((u & 1) << 16))
-#define CTRL_SET_KERN(val,k) (val |= ((k & 1) << 17))
+#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
+#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
#define CTRL_SET_UM(val, m) (val |= (m << 8))
#define CTRL_SET_EVENT(val, e) (val |= e)
static unsigned long reset_value[NUM_COUNTERS];
-
+
static void ppro_fill_in_addresses(struct op_msrs * const msrs)
{
int i;
- for (i=0; i < NUM_COUNTERS; i++) {
+ for (i = 0; i < NUM_COUNTERS; i++) {
if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
else
msrs->counters[i].addr = 0;
}
-
- for (i=0; i < NUM_CONTROLS; i++) {
+
+ for (i = 0; i < NUM_CONTROLS; i++) {
if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
else
@@ -69,23 +69,23 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
/* clear all counters */
for (i = 0 ; i < NUM_CONTROLS; ++i) {
- if (unlikely(!CTRL_IS_RESERVED(msrs,i)))
+ if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
continue;
CTRL_READ(low, high, msrs, i);
CTRL_CLEAR(low);
CTRL_WRITE(low, high, msrs, i);
}
-
+
/* avoid a false detection of ctr overflows in NMI handler */
for (i = 0; i < NUM_COUNTERS; ++i) {
- if (unlikely(!CTR_IS_RESERVED(msrs,i)))
+ if (unlikely(!CTR_IS_RESERVED(msrs, i)))
continue;
CTR_32BIT_WRITE(1, msrs, i);
}
/* enable active counters */
for (i = 0; i < NUM_COUNTERS; ++i) {
- if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs,i))) {
+ if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
reset_value[i] = counter_config[i].count;
CTR_32BIT_WRITE(counter_config[i].count, msrs, i);
@@ -104,13 +104,13 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
}
}
-
+
static int ppro_check_ctrs(struct pt_regs * const regs,
struct op_msrs const * const msrs)
{
unsigned int low, high;
int i;
-
+
for (i = 0 ; i < NUM_COUNTERS; ++i) {
if (!reset_value[i])
continue;
@@ -135,10 +135,10 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
return 1;
}
-
+
static void ppro_start(struct op_msrs const * const msrs)
{
- unsigned int low,high;
+ unsigned int low, high;
int i;
for (i = 0; i < NUM_COUNTERS; ++i) {
@@ -153,7 +153,7 @@ static void ppro_start(struct op_msrs const * const msrs)
static void ppro_stop(struct op_msrs const * const msrs)
{
- unsigned int low,high;
+ unsigned int low, high;
int i;
for (i = 0; i < NUM_COUNTERS; ++i) {
@@ -170,11 +170,11 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
int i;
for (i = 0 ; i < NUM_COUNTERS ; ++i) {
- if (CTR_IS_RESERVED(msrs,i))
+ if (CTR_IS_RESERVED(msrs, i))
release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
}
for (i = 0 ; i < NUM_CONTROLS ; ++i) {
- if (CTRL_IS_RESERVED(msrs,i))
+ if (CTRL_IS_RESERVED(msrs, i))
release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
}
}
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 0234f2831bf3..2664cb3fc96c 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -151,7 +151,7 @@ adjust_transparent_bridge_resources(struct pci_bus *bus)
static void
get_current_resources(struct acpi_device *device, int busnum,
- struct pci_bus *bus)
+ int domain, struct pci_bus *bus)
{
struct pci_root_info info;
size_t size;
@@ -168,10 +168,10 @@ get_current_resources(struct acpi_device *device, int busnum,
if (!info.res)
goto res_alloc_fail;
- info.name = kmalloc(12, GFP_KERNEL);
+ info.name = kmalloc(16, GFP_KERNEL);
if (!info.name)
goto name_alloc_fail;
- sprintf(info.name, "PCI Bus #%02x", busnum);
+ sprintf(info.name, "PCI Bus %04x:%02x", domain, busnum);
info.res_num = 0;
acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
@@ -219,8 +219,21 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
if (pxm >= 0)
sd->node = pxm_to_node(pxm);
#endif
+ /*
+ * Maybe the desired pci bus has been already scanned. In such case
+ * it is unnecessary to scan the pci bus with the given domain,busnum.
+ */
+ bus = pci_find_bus(domain, busnum);
+ if (bus) {
+ /*
+ * If the desired bus exits, the content of bus->sysdata will
+ * be replaced by sd.
+ */
+ memcpy(bus->sysdata, sd, sizeof(*sd));
+ kfree(sd);
+ } else
+ bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
- bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd);
if (!bus)
kfree(sd);
@@ -228,13 +241,13 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
if (bus != NULL) {
if (pxm >= 0) {
printk("bus %d -> pxm %d -> node %d\n",
- busnum, pxm, sd->node);
+ busnum, pxm, pxm_to_node(pxm));
}
}
#endif
if (bus && (pci_probe & PCI_USE__CRS))
- get_current_resources(device, busnum, bus);
+ get_current_resources(device, busnum, domain, bus);
return bus;
}
@@ -265,8 +278,7 @@ static int __init pci_acpi_init(void)
printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
for_each_pci_dev(dev)
acpi_pci_irq_enable(dev);
- } else
- printk(KERN_INFO "PCI: If a device doesn't work, try \"pci=routeirq\". If it helps, post a report\n");
+ }
#ifdef CONFIG_X86_IO_APIC
if (acpi_ioapic)
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 7b6e3bb9b28c..75fcc29ecf52 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -77,59 +77,6 @@ int pcibios_scanned;
*/
DEFINE_SPINLOCK(pci_config_lock);
-/*
- * Several buggy motherboards address only 16 devices and mirror
- * them to next 16 IDs. We try to detect this `feature' on all
- * primary buses (those containing host bridges as they are
- * expected to be unique) and remove the ghost devices.
- */
-
-static void __devinit pcibios_fixup_ghosts(struct pci_bus *b)
-{
- struct list_head *ln, *mn;
- struct pci_dev *d, *e;
- int mirror = PCI_DEVFN(16,0);
- int seen_host_bridge = 0;
- int i;
-
- DBG("PCI: Scanning for ghost devices on bus %d\n", b->number);
- list_for_each(ln, &b->devices) {
- d = pci_dev_b(ln);
- if ((d->class >> 8) == PCI_CLASS_BRIDGE_HOST)
- seen_host_bridge++;
- for (mn=ln->next; mn != &b->devices; mn=mn->next) {
- e = pci_dev_b(mn);
- if (e->devfn != d->devfn + mirror ||
- e->vendor != d->vendor ||
- e->device != d->device ||
- e->class != d->class)
- continue;
- for(i=0; i<PCI_NUM_RESOURCES; i++)
- if (e->resource[i].start != d->resource[i].start ||
- e->resource[i].end != d->resource[i].end ||
- e->resource[i].flags != d->resource[i].flags)
- continue;
- break;
- }
- if (mn == &b->devices)
- return;
- }
- if (!seen_host_bridge)
- return;
- printk(KERN_WARNING "PCI: Ignoring ghost devices on bus %02x\n", b->number);
-
- ln = &b->devices;
- while (ln->next != &b->devices) {
- d = pci_dev_b(ln->next);
- if (d->devfn >= mirror) {
- list_del(&d->global_list);
- list_del(&d->bus_list);
- kfree(d);
- } else
- ln = ln->next;
- }
-}
-
static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev)
{
struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE];
@@ -152,7 +99,6 @@ void __devinit pcibios_fixup_bus(struct pci_bus *b)
{
struct pci_dev *dev;
- pcibios_fixup_ghosts(b);
pci_read_bridge_bases(b);
list_for_each_entry(dev, &b->devices, bus_list)
pcibios_fixup_device_resources(dev);
@@ -427,10 +373,6 @@ static int __init pcibios_init(void)
if (pci_bf_sort >= pci_force_bf)
pci_sort_breadthfirst();
-#ifdef CONFIG_PCI_BIOS
- if ((pci_probe & PCI_BIOS_SORT) && !(pci_probe & PCI_NO_SORT))
- pcibios_sort();
-#endif
return 0;
}
@@ -455,9 +397,6 @@ char * __devinit pcibios_setup(char *str)
} else if (!strcmp(str, "nobios")) {
pci_probe &= ~PCI_PROBE_BIOS;
return NULL;
- } else if (!strcmp(str, "nosort")) {
- pci_probe |= PCI_NO_SORT;
- return NULL;
} else if (!strcmp(str, "biosirq")) {
pci_probe |= PCI_BIOS_IRQ_SCAN;
return NULL;
@@ -527,7 +466,7 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
{
int err;
- if ((err = pcibios_enable_resources(dev, mask)) < 0)
+ if ((err = pci_enable_resources(dev, mask)) < 0)
return err;
if (!dev->msi_enabled)
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 103b9dff1213..94f6c73a53d0 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -30,6 +30,9 @@
#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/errno.h>
+#include <linux/bootmem.h>
+
+#include <asm/pat.h>
#include "pci.h"
@@ -238,44 +241,6 @@ void __init pcibios_resource_survey(void)
*/
fs_initcall(pcibios_assign_resources);
-int pcibios_enable_resources(struct pci_dev *dev, int mask)
-{
- u16 cmd, old_cmd;
- int idx;
- struct resource *r;
-
- pci_read_config_word(dev, PCI_COMMAND, &cmd);
- old_cmd = cmd;
- for (idx = 0; idx < PCI_NUM_RESOURCES; idx++) {
- /* Only set up the requested stuff */
- if (!(mask & (1 << idx)))
- continue;
-
- r = &dev->resource[idx];
- if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
- continue;
- if ((idx == PCI_ROM_RESOURCE) &&
- (!(r->flags & IORESOURCE_ROM_ENABLE)))
- continue;
- if (!r->start && r->end) {
- printk(KERN_ERR "PCI: Device %s not available "
- "because of resource %d collisions\n",
- pci_name(dev), idx);
- return -EINVAL;
- }
- if (r->flags & IORESOURCE_IO)
- cmd |= PCI_COMMAND_IO;
- if (r->flags & IORESOURCE_MEM)
- cmd |= PCI_COMMAND_MEMORY;
- }
- if (cmd != old_cmd) {
- printk("PCI: Enabling device %s (%04x -> %04x)\n",
- pci_name(dev), old_cmd, cmd);
- pci_write_config_word(dev, PCI_COMMAND, cmd);
- }
- return 0;
-}
-
/*
* If we set up a device for bus mastering, we need to check the latency
* timer as certain crappy BIOSes forget to set it properly.
@@ -297,10 +262,35 @@ void pcibios_set_master(struct pci_dev *dev)
pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
}
+static void pci_unmap_page_range(struct vm_area_struct *vma)
+{
+ u64 addr = (u64)vma->vm_pgoff << PAGE_SHIFT;
+ free_memtype(addr, addr + vma->vm_end - vma->vm_start);
+}
+
+static void pci_track_mmap_page_range(struct vm_area_struct *vma)
+{
+ u64 addr = (u64)vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long flags = pgprot_val(vma->vm_page_prot)
+ & _PAGE_CACHE_MASK;
+
+ reserve_memtype(addr, addr + vma->vm_end - vma->vm_start, flags, NULL);
+}
+
+static struct vm_operations_struct pci_mmap_ops = {
+ .open = pci_track_mmap_page_range,
+ .close = pci_unmap_page_range,
+};
+
int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
enum pci_mmap_state mmap_state, int write_combine)
{
unsigned long prot;
+ u64 addr = vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long len = vma->vm_end - vma->vm_start;
+ unsigned long flags;
+ unsigned long new_flags;
+ int retval;
/* I/O space cannot be accessed via normal processor loads and
* stores on this platform.
@@ -308,21 +298,50 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
if (mmap_state == pci_mmap_io)
return -EINVAL;
- /* Leave vm_pgoff as-is, the PCI space address is the physical
- * address on this platform.
- */
prot = pgprot_val(vma->vm_page_prot);
- if (boot_cpu_data.x86 > 3)
- prot |= _PAGE_PCD | _PAGE_PWT;
+ if (pat_wc_enabled && write_combine)
+ prot |= _PAGE_CACHE_WC;
+ else if (boot_cpu_data.x86 > 3)
+ prot |= _PAGE_CACHE_UC;
+
vma->vm_page_prot = __pgprot(prot);
- /* Write-combine setting is ignored, it is changed via the mtrr
- * interfaces on this platform.
- */
+ flags = pgprot_val(vma->vm_page_prot) & _PAGE_CACHE_MASK;
+ retval = reserve_memtype(addr, addr + len, flags, &new_flags);
+ if (retval)
+ return retval;
+
+ if (flags != new_flags) {
+ /*
+ * Do not fallback to certain memory types with certain
+ * requested type:
+ * - request is uncached, return cannot be write-back
+ * - request is uncached, return cannot be write-combine
+ * - request is write-combine, return cannot be write-back
+ */
+ if ((flags == _PAGE_CACHE_UC &&
+ (new_flags == _PAGE_CACHE_WB ||
+ new_flags == _PAGE_CACHE_WC)) ||
+ (flags == _PAGE_CACHE_WC &&
+ new_flags == _PAGE_CACHE_WB)) {
+ free_memtype(addr, addr+len);
+ return -EINVAL;
+ }
+ flags = new_flags;
+ }
+
+ if (vma->vm_pgoff <= max_pfn_mapped &&
+ ioremap_change_attr((unsigned long)__va(addr), len, flags)) {
+ free_memtype(addr, addr + len);
+ return -EINVAL;
+ }
+
if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
vma->vm_end - vma->vm_start,
vma->vm_page_prot))
return -EAGAIN;
+ vma->vm_ops = &pci_mmap_ops;
+
return 0;
}
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index a8715861877e..579745ca6b66 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -200,7 +200,7 @@ static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
{
static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 };
- WARN_ON_ONCE(pirq >= 16);
+ WARN_ON_ONCE(pirq > 16);
return irqmap[read_config_nybble(router, 0x48, pirq-1)];
}
@@ -209,7 +209,7 @@ static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i
static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 };
unsigned int val = irqmap[irq];
- WARN_ON_ONCE(pirq >= 16);
+ WARN_ON_ONCE(pirq > 16);
if (val) {
write_config_nybble(router, 0x48, pirq-1, val);
return 1;
@@ -260,7 +260,7 @@ static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq
{
static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
- WARN_ON_ONCE(pirq >= 5);
+ WARN_ON_ONCE(pirq > 5);
return read_config_nybble(router, 0x55, pirqmap[pirq-1]);
}
@@ -268,7 +268,7 @@ static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq
{
static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 };
- WARN_ON_ONCE(pirq >= 5);
+ WARN_ON_ONCE(pirq > 5);
write_config_nybble(router, 0x55, pirqmap[pirq-1], irq);
return 1;
}
@@ -282,7 +282,7 @@ static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
{
static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
- WARN_ON_ONCE(pirq >= 4);
+ WARN_ON_ONCE(pirq > 4);
return read_config_nybble(router,0x43, pirqmap[pirq-1]);
}
@@ -290,7 +290,7 @@ static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i
{
static const unsigned char pirqmap[4] = { 1, 0, 2, 3 };
- WARN_ON_ONCE(pirq >= 4);
+ WARN_ON_ONCE(pirq > 4);
write_config_nybble(router, 0x43, pirqmap[pirq-1], irq);
return 1;
}
diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numa.c
index 55270c26237c..d9afbae5092b 100644
--- a/arch/x86/pci/numa.c
+++ b/arch/x86/pci/numa.c
@@ -11,11 +11,41 @@
#define XQUAD_PORTIO_BASE 0xfe400000
#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
+int mp_bus_id_to_node[MAX_MP_BUSSES];
#define BUS2QUAD(global) (mp_bus_id_to_node[global])
+
+int mp_bus_id_to_local[MAX_MP_BUSSES];
#define BUS2LOCAL(global) (mp_bus_id_to_local[global])
+
+void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
+ struct mpc_config_translation *translation)
+{
+ int quad = translation->trans_quad;
+ int local = translation->trans_local;
+
+ mp_bus_id_to_node[m->mpc_busid] = quad;
+ mp_bus_id_to_local[m->mpc_busid] = local;
+ printk(KERN_INFO "Bus #%d is %s (node %d)\n",
+ m->mpc_busid, name, quad);
+}
+
+int quad_local_to_mp_bus_id [NR_CPUS/4][4];
#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
+void mpc_oem_pci_bus(struct mpc_config_bus *m,
+ struct mpc_config_translation *translation)
+{
+ int quad = translation->trans_quad;
+ int local = translation->trans_local;
+
+ quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
+}
+
+/* Where the IO area was mapped on multiquad, always 0 otherwise */
+void *xquad_portio;
+#ifdef CONFIG_X86_NUMAQ
+EXPORT_SYMBOL(xquad_portio);
+#endif
-extern void *xquad_portio; /* Where the IO area was mapped */
#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 2f7109ac4c15..37472fc6f729 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -152,28 +152,6 @@ static int __devinit check_pcibios(void)
return 0;
}
-static int __devinit pci_bios_find_device (unsigned short vendor, unsigned short device_id,
- unsigned short index, unsigned char *bus, unsigned char *device_fn)
-{
- unsigned short bx;
- unsigned short ret;
-
- __asm__("lcall *(%%edi); cld\n\t"
- "jc 1f\n\t"
- "xor %%ah, %%ah\n"
- "1:"
- : "=b" (bx),
- "=a" (ret)
- : "1" (PCIBIOS_FIND_PCI_DEVICE),
- "c" (device_id),
- "d" (vendor),
- "S" ((int) index),
- "D" (&pci_indirect));
- *bus = (bx >> 8) & 0xff;
- *device_fn = bx & 0xff;
- return (int) (ret & 0xff00) >> 8;
-}
-
static int pci_bios_read(unsigned int seg, unsigned int bus,
unsigned int devfn, int reg, int len, u32 *value)
{
@@ -364,55 +342,6 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
}
/*
- * Sort the device list according to PCI BIOS. Nasty hack, but since some
- * fool forgot to define the `correct' device order in the PCI BIOS specs
- * and we want to be (possibly bug-to-bug ;-]) compatible with older kernels
- * which used BIOS ordering, we are bound to do this...
- */
-
-void __devinit pcibios_sort(void)
-{
- LIST_HEAD(sorted_devices);
- struct list_head *ln;
- struct pci_dev *dev, *d;
- int idx, found;
- unsigned char bus, devfn;
-
- DBG("PCI: Sorting device list...\n");
- while (!list_empty(&pci_devices)) {
- ln = pci_devices.next;
- dev = pci_dev_g(ln);
- idx = found = 0;
- while (pci_bios_find_device(dev->vendor, dev->device, idx, &bus, &devfn) == PCIBIOS_SUCCESSFUL) {
- idx++;
- list_for_each(ln, &pci_devices) {
- d = pci_dev_g(ln);
- if (d->bus->number == bus && d->devfn == devfn) {
- list_move_tail(&d->global_list, &sorted_devices);
- if (d == dev)
- found = 1;
- break;
- }
- }
- if (ln == &pci_devices) {
- printk(KERN_WARNING "PCI: BIOS reporting unknown device %02x:%02x\n", bus, devfn);
- /*
- * We must not continue scanning as several buggy BIOSes
- * return garbage after the last device. Grr.
- */
- break;
- }
- }
- if (!found) {
- printk(KERN_WARNING "PCI: Device %s not found by BIOS\n",
- pci_name(dev));
- list_move_tail(&dev->global_list, &sorted_devices);
- }
- }
- list_splice(&sorted_devices, &pci_devices);
-}
-
-/*
* BIOS Functions for IRQ Routing
*/
@@ -495,7 +424,6 @@ void __init pci_pcbios_init(void)
{
if ((pci_probe & PCI_PROBE_BIOS)
&& ((raw_pci_ops = pci_find_bios()))) {
- pci_probe |= PCI_BIOS_SORT;
pci_bios_present = 1;
}
}
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
index 3431518d921a..c4bddaeff619 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/pci/pci.h
@@ -19,8 +19,6 @@
#define PCI_PROBE_MASK 0x000f
#define PCI_PROBE_NOEARLY 0x0010
-#define PCI_NO_SORT 0x0100
-#define PCI_BIOS_SORT 0x0200
#define PCI_NO_CHECKS 0x0400
#define PCI_USE_PIRQ_MASK 0x0800
#define PCI_ASSIGN_ROMS 0x1000
@@ -44,7 +42,6 @@ enum pci_bf_sort_state {
extern unsigned int pcibios_max_latency;
void pcibios_resource_survey(void);
-int pcibios_enable_resources(struct pci_dev *, int);
/* pci-pc.c */
@@ -101,7 +98,6 @@ extern int pci_direct_probe(void);
extern void pci_direct_init(int type);
extern void pci_pcbios_init(void);
extern void pci_mmcfg_init(int type);
-extern void pcibios_sort(void);
/* pci-mmconfig.c */
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
index 7f9c6da04a4c..7dc5d5cf50a2 100644
--- a/arch/x86/power/cpu_32.c
+++ b/arch/x86/power/cpu_32.c
@@ -27,17 +27,17 @@ static void __save_processor_state(struct saved_context *ctxt)
/*
* descriptor tables
*/
- store_gdt(&ctxt->gdt);
- store_idt(&ctxt->idt);
- store_tr(ctxt->tr);
+ store_gdt(&ctxt->gdt);
+ store_idt(&ctxt->idt);
+ store_tr(ctxt->tr);
/*
* segment registers
*/
- savesegment(es, ctxt->es);
- savesegment(fs, ctxt->fs);
- savesegment(gs, ctxt->gs);
- savesegment(ss, ctxt->ss);
+ savesegment(es, ctxt->es);
+ savesegment(fs, ctxt->fs);
+ savesegment(gs, ctxt->gs);
+ savesegment(ss, ctxt->ss);
/*
* control registers
@@ -48,10 +48,12 @@ static void __save_processor_state(struct saved_context *ctxt)
ctxt->cr4 = read_cr4();
}
+/* Needed by apm.c */
void save_processor_state(void)
{
__save_processor_state(&saved_context);
}
+EXPORT_SYMBOL(save_processor_state);
static void do_fpu_end(void)
{
@@ -64,9 +66,14 @@ static void do_fpu_end(void)
static void fix_processor_context(void)
{
int cpu = smp_processor_id();
- struct tss_struct * t = &per_cpu(init_tss, cpu);
+ struct tss_struct *t = &per_cpu(init_tss, cpu);
- set_tss_desc(cpu,t); /* This just modifies memory; should not be necessary. But... This is necessary, because 386 hardware has concept of busy TSS or some similar stupidity. */
+ set_tss_desc(cpu, t); /*
+ * This just modifies memory; should not be
+ * necessary. But... This is necessary, because
+ * 386 hardware has concept of busy TSS or some
+ * similar stupidity.
+ */
load_TR_desc(); /* This does ltr */
load_LDT(&current->active_mm->context); /* This does lldt */
@@ -100,16 +107,16 @@ static void __restore_processor_state(struct saved_context *ctxt)
* now restore the descriptor tables to their proper values
* ltr is done i fix_processor_context().
*/
- load_gdt(&ctxt->gdt);
- load_idt(&ctxt->idt);
+ load_gdt(&ctxt->gdt);
+ load_idt(&ctxt->idt);
/*
* segment registers
*/
- loadsegment(es, ctxt->es);
- loadsegment(fs, ctxt->fs);
- loadsegment(gs, ctxt->gs);
- loadsegment(ss, ctxt->ss);
+ loadsegment(es, ctxt->es);
+ loadsegment(fs, ctxt->fs);
+ loadsegment(gs, ctxt->gs);
+ loadsegment(ss, ctxt->ss);
/*
* sysenter MSRs
@@ -123,11 +130,9 @@ static void __restore_processor_state(struct saved_context *ctxt)
mcheck_init(&boot_cpu_data);
}
+/* Needed by apm.c */
void restore_processor_state(void)
{
__restore_processor_state(&saved_context);
}
-
-/* Needed by apm.c */
-EXPORT_SYMBOL(save_processor_state);
EXPORT_SYMBOL(restore_processor_state);
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 0a8f4742ef51..b7ad9f89d21f 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -37,9 +37,10 @@ $(obj)/%.so: OBJCOPYFLAGS := -S
$(obj)/%.so: $(obj)/%.so.dbg FORCE
$(call if_changed,objcopy)
-CFL := $(PROFILING) -mcmodel=small -fPIC -g0 -O2 -fasynchronous-unwind-tables -m64
+CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
+ $(filter -g%,$(KBUILD_CFLAGS))
-$(vobjs): KBUILD_CFLAGS = $(CFL)
+$(vobjs): KBUILD_CFLAGS += $(CFL)
targets += vdso-syms.lds
obj-$(VDSO64-y) += vdso-syms.lds
diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S
index 4b1620a1529e..1d3aa6b87181 100644
--- a/arch/x86/vdso/vdso.S
+++ b/arch/x86/vdso/vdso.S
@@ -1,2 +1,10 @@
- .section ".vdso","a"
+#include <linux/init.h>
+
+__INITDATA
+
+ .globl vdso_start, vdso_end
+vdso_start:
.incbin "arch/x86/vdso/vdso.so"
+vdso_end:
+
+__FINIT
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 348f1341e1c8..e2af8eee80e3 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -210,8 +210,12 @@ static int use_sysenter __read_mostly = -1;
/* May not be __init: called during resume */
void syscall32_cpu_init(void)
{
- if (use_sysenter < 0)
- use_sysenter = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL);
+ if (use_sysenter < 0) {
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ use_sysenter = 1;
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR)
+ use_sysenter = 1;
+ }
/* Load these always in case some future AMD CPU supports
SYSENTER from compat mode too. */
@@ -325,6 +329,9 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
int ret = 0;
bool compat;
+ if (vdso_enabled == VDSO_DISABLED)
+ return 0;
+
down_write(&mm->mmap_sem);
/* Test compat mode once here, in case someone
diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c
index 48fb38d7d2c0..4db42bff8c60 100644
--- a/arch/x86/video/fbdev.c
+++ b/arch/x86/video/fbdev.c
@@ -1,5 +1,4 @@
/*
- * arch/i386/video/fbdev.c - i386 Framebuffer
*
* Copyright (C) 2007 Antonino Daplas <adaplas@gmail.com>
*
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 4d5f2649bee4..2e641be2737e 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,7 +6,7 @@ config XEN
bool "Xen guest support"
select PARAVIRT
depends on X86_32
- depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER)
+ depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER)
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 343df246bd3e..3d8df981d5fd 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
-obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \
- events.o time.o manage.o xen-asm.o
+obj-y := enlighten.o setup.o multicalls.o mmu.o \
+ time.o manage.o xen-asm.o grant-table.o
obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 27ee26aedf94..c8a56e457d61 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -25,6 +25,7 @@
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/highmem.h>
+#include <linux/console.h>
#include <xen/interface/xen.h>
#include <xen/interface/physdev.h>
@@ -154,7 +155,8 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
if (*ax == 1)
maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
(1 << X86_FEATURE_ACPI) | /* disable ACPI */
- (1 << X86_FEATURE_SEP) | /* disable SEP */
+ (1 << X86_FEATURE_MCE) | /* disable MCE */
+ (1 << X86_FEATURE_MCA) | /* disable MCA */
(1 << X86_FEATURE_ACC)); /* thermal monitoring */
asm(XEN_EMULATE_PREFIX "cpuid"
@@ -530,26 +532,37 @@ static void xen_apic_write(unsigned long reg, u32 val)
static void xen_flush_tlb(void)
{
struct mmuext_op *op;
- struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+ struct multicall_space mcs;
+
+ preempt_disable();
+
+ mcs = xen_mc_entry(sizeof(*op));
op = mcs.args;
op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
}
static void xen_flush_tlb_single(unsigned long addr)
{
struct mmuext_op *op;
- struct multicall_space mcs = xen_mc_entry(sizeof(*op));
+ struct multicall_space mcs;
+
+ preempt_disable();
+ mcs = xen_mc_entry(sizeof(*op));
op = mcs.args;
op->cmd = MMUEXT_INVLPG_LOCAL;
op->arg1.linear_addr = addr & PAGE_MASK;
MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
}
static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
@@ -654,15 +667,17 @@ static void xen_write_cr3(unsigned long cr3)
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
-static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
+static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
{
+#ifdef CONFIG_FLATMEM
BUG_ON(mem_map); /* should only be used early */
+#endif
make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
}
-/* Early release_pt assumes that all pts are pinned, since there's
+/* Early release_pte assumes that all pts are pinned, since there's
only init_mm and anything attached to that is pinned. */
-static void xen_release_pt_init(u32 pfn)
+static void xen_release_pte_init(u32 pfn)
{
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
@@ -696,12 +711,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
}
}
-static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
+static void xen_alloc_pte(struct mm_struct *mm, u32 pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PTE);
}
-static void xen_alloc_pd(struct mm_struct *mm, u32 pfn)
+static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PMD);
}
@@ -721,12 +736,12 @@ static void xen_release_ptpage(u32 pfn, unsigned level)
}
}
-static void xen_release_pt(u32 pfn)
+static void xen_release_pte(u32 pfn)
{
xen_release_ptpage(pfn, PT_PTE);
}
-static void xen_release_pd(u32 pfn)
+static void xen_release_pmd(u32 pfn)
{
xen_release_ptpage(pfn, PT_PMD);
}
@@ -848,10 +863,10 @@ static __init void xen_pagetable_setup_done(pgd_t *base)
{
/* This will work as long as patching hasn't happened yet
(which it hasn't) */
- pv_mmu_ops.alloc_pt = xen_alloc_pt;
- pv_mmu_ops.alloc_pd = xen_alloc_pd;
- pv_mmu_ops.release_pt = xen_release_pt;
- pv_mmu_ops.release_pd = xen_release_pd;
+ pv_mmu_ops.alloc_pte = xen_alloc_pte;
+ pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
+ pv_mmu_ops.release_pte = xen_release_pte;
+ pv_mmu_ops.release_pmd = xen_release_pmd;
pv_mmu_ops.set_pte = xen_set_pte;
setup_shared_info();
@@ -889,7 +904,6 @@ void __init xen_setup_vcpu_info_placement(void)
pv_irq_ops.irq_disable = xen_irq_disable_direct;
pv_irq_ops.irq_enable = xen_irq_enable_direct;
pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
- pv_cpu_ops.iret = xen_iret_direct;
}
}
@@ -993,8 +1007,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
- .iret = (void *)&hypercall_page[__HYPERVISOR_iret],
- .irq_enable_syscall_ret = NULL, /* never called */
+ .iret = xen_iret,
+ .irq_enable_syscall_ret = xen_sysexit,
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
@@ -1059,11 +1073,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
- .alloc_pt = xen_alloc_pt_init,
- .release_pt = xen_release_pt_init,
- .alloc_pd = xen_alloc_pt_init,
- .alloc_pd_clone = paravirt_nop,
- .release_pd = xen_release_pt_init,
+ .alloc_pte = xen_alloc_pte_init,
+ .release_pte = xen_release_pte_init,
+ .alloc_pmd = xen_alloc_pte_init,
+ .alloc_pmd_clone = paravirt_nop,
+ .release_pmd = xen_release_pte_init,
#ifdef CONFIG_HIGHPTE
.kmap_atomic_pte = xen_kmap_atomic_pte,
@@ -1228,6 +1242,9 @@ asmlinkage void __init xen_start_kernel(void)
? __pa(xen_start_info->mod_start) : 0;
boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
+ if (!is_initial_xendomain())
+ add_preferred_console("hvc", 0, NULL);
+
/* Start the world */
start_kernel();
}
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c
deleted file mode 100644
index dcf613e17581..000000000000
--- a/arch/x86/xen/events.c
+++ /dev/null
@@ -1,591 +0,0 @@
-/*
- * Xen event channels
- *
- * Xen models interrupts with abstract event channels. Because each
- * domain gets 1024 event channels, but NR_IRQ is not that large, we
- * must dynamically map irqs<->event channels. The event channels
- * interface with the rest of the kernel by defining a xen interrupt
- * chip. When an event is recieved, it is mapped to an irq and sent
- * through the normal interrupt processing path.
- *
- * There are four kinds of events which can be mapped to an event
- * channel:
- *
- * 1. Inter-domain notifications. This includes all the virtual
- * device events, since they're driven by front-ends in another domain
- * (typically dom0).
- * 2. VIRQs, typically used for timers. These are per-cpu events.
- * 3. IPIs.
- * 4. Hardware interrupts. Not supported at present.
- *
- * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
- */
-
-#include <linux/linkage.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
-#include <linux/module.h>
-#include <linux/string.h>
-
-#include <asm/ptrace.h>
-#include <asm/irq.h>
-#include <asm/sync_bitops.h>
-#include <asm/xen/hypercall.h>
-#include <asm/xen/hypervisor.h>
-
-#include <xen/events.h>
-#include <xen/interface/xen.h>
-#include <xen/interface/event_channel.h>
-
-#include "xen-ops.h"
-
-/*
- * This lock protects updates to the following mapping and reference-count
- * arrays. The lock does not need to be acquired to read the mapping tables.
- */
-static DEFINE_SPINLOCK(irq_mapping_update_lock);
-
-/* IRQ <-> VIRQ mapping. */
-static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1};
-
-/* IRQ <-> IPI mapping */
-static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1};
-
-/* Packed IRQ information: binding type, sub-type index, and event channel. */
-struct packed_irq
-{
- unsigned short evtchn;
- unsigned char index;
- unsigned char type;
-};
-
-static struct packed_irq irq_info[NR_IRQS];
-
-/* Binding types. */
-enum {
- IRQT_UNBOUND,
- IRQT_PIRQ,
- IRQT_VIRQ,
- IRQT_IPI,
- IRQT_EVTCHN
-};
-
-/* Convenient shorthand for packed representation of an unbound IRQ. */
-#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0)
-
-static int evtchn_to_irq[NR_EVENT_CHANNELS] = {
- [0 ... NR_EVENT_CHANNELS-1] = -1
-};
-static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG];
-static u8 cpu_evtchn[NR_EVENT_CHANNELS];
-
-/* Reference counts for bindings to IRQs. */
-static int irq_bindcount[NR_IRQS];
-
-/* Xen will never allocate port zero for any purpose. */
-#define VALID_EVTCHN(chn) ((chn) != 0)
-
-/*
- * Force a proper event-channel callback from Xen after clearing the
- * callback mask. We do this in a very simple manner, by making a call
- * down into Xen. The pending flag will be checked by Xen on return.
- */
-void force_evtchn_callback(void)
-{
- (void)HYPERVISOR_xen_version(0, NULL);
-}
-EXPORT_SYMBOL_GPL(force_evtchn_callback);
-
-static struct irq_chip xen_dynamic_chip;
-
-/* Constructor for packed IRQ information. */
-static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn)
-{
- return (struct packed_irq) { evtchn, index, type };
-}
-
-/*
- * Accessors for packed IRQ information.
- */
-static inline unsigned int evtchn_from_irq(int irq)
-{
- return irq_info[irq].evtchn;
-}
-
-static inline unsigned int index_from_irq(int irq)
-{
- return irq_info[irq].index;
-}
-
-static inline unsigned int type_from_irq(int irq)
-{
- return irq_info[irq].type;
-}
-
-static inline unsigned long active_evtchns(unsigned int cpu,
- struct shared_info *sh,
- unsigned int idx)
-{
- return (sh->evtchn_pending[idx] &
- cpu_evtchn_mask[cpu][idx] &
- ~sh->evtchn_mask[idx]);
-}
-
-static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
-{
- int irq = evtchn_to_irq[chn];
-
- BUG_ON(irq == -1);
-#ifdef CONFIG_SMP
- irq_desc[irq].affinity = cpumask_of_cpu(cpu);
-#endif
-
- __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
- __set_bit(chn, cpu_evtchn_mask[cpu]);
-
- cpu_evtchn[chn] = cpu;
-}
-
-static void init_evtchn_cpu_bindings(void)
-{
-#ifdef CONFIG_SMP
- int i;
- /* By default all event channels notify CPU#0. */
- for (i = 0; i < NR_IRQS; i++)
- irq_desc[i].affinity = cpumask_of_cpu(0);
-#endif
-
- memset(cpu_evtchn, 0, sizeof(cpu_evtchn));
- memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0]));
-}
-
-static inline unsigned int cpu_from_evtchn(unsigned int evtchn)
-{
- return cpu_evtchn[evtchn];
-}
-
-static inline void clear_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- sync_clear_bit(port, &s->evtchn_pending[0]);
-}
-
-static inline void set_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- sync_set_bit(port, &s->evtchn_pending[0]);
-}
-
-
-/**
- * notify_remote_via_irq - send event to remote end of event channel via irq
- * @irq: irq of event channel to send event to
- *
- * Unlike notify_remote_via_evtchn(), this is safe to use across
- * save/restore. Notifications on a broken connection are silently
- * dropped.
- */
-void notify_remote_via_irq(int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- notify_remote_via_evtchn(evtchn);
-}
-EXPORT_SYMBOL_GPL(notify_remote_via_irq);
-
-static void mask_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- sync_set_bit(port, &s->evtchn_mask[0]);
-}
-
-static void unmask_evtchn(int port)
-{
- struct shared_info *s = HYPERVISOR_shared_info;
- unsigned int cpu = get_cpu();
-
- BUG_ON(!irqs_disabled());
-
- /* Slow path (hypercall) if this is a non-local port. */
- if (unlikely(cpu != cpu_from_evtchn(port))) {
- struct evtchn_unmask unmask = { .port = port };
- (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask);
- } else {
- struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
-
- sync_clear_bit(port, &s->evtchn_mask[0]);
-
- /*
- * The following is basically the equivalent of
- * 'hw_resend_irq'. Just like a real IO-APIC we 'lose
- * the interrupt edge' if the channel is masked.
- */
- if (sync_test_bit(port, &s->evtchn_pending[0]) &&
- !sync_test_and_set_bit(port / BITS_PER_LONG,
- &vcpu_info->evtchn_pending_sel))
- vcpu_info->evtchn_upcall_pending = 1;
- }
-
- put_cpu();
-}
-
-static int find_unbound_irq(void)
-{
- int irq;
-
- /* Only allocate from dynirq range */
- for (irq = 0; irq < NR_IRQS; irq++)
- if (irq_bindcount[irq] == 0)
- break;
-
- if (irq == NR_IRQS)
- panic("No available IRQ to bind to: increase NR_IRQS!\n");
-
- return irq;
-}
-
-int bind_evtchn_to_irq(unsigned int evtchn)
-{
- int irq;
-
- spin_lock(&irq_mapping_update_lock);
-
- irq = evtchn_to_irq[evtchn];
-
- if (irq == -1) {
- irq = find_unbound_irq();
-
- dynamic_irq_init(irq);
- set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_level_irq, "event");
-
- evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn);
- }
-
- irq_bindcount[irq]++;
-
- spin_unlock(&irq_mapping_update_lock);
-
- return irq;
-}
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irq);
-
-static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
-{
- struct evtchn_bind_ipi bind_ipi;
- int evtchn, irq;
-
- spin_lock(&irq_mapping_update_lock);
-
- irq = per_cpu(ipi_to_irq, cpu)[ipi];
- if (irq == -1) {
- irq = find_unbound_irq();
- if (irq < 0)
- goto out;
-
- dynamic_irq_init(irq);
- set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_level_irq, "ipi");
-
- bind_ipi.vcpu = cpu;
- if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
- &bind_ipi) != 0)
- BUG();
- evtchn = bind_ipi.port;
-
- evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn);
-
- per_cpu(ipi_to_irq, cpu)[ipi] = irq;
-
- bind_evtchn_to_cpu(evtchn, cpu);
- }
-
- irq_bindcount[irq]++;
-
- out:
- spin_unlock(&irq_mapping_update_lock);
- return irq;
-}
-
-
-static int bind_virq_to_irq(unsigned int virq, unsigned int cpu)
-{
- struct evtchn_bind_virq bind_virq;
- int evtchn, irq;
-
- spin_lock(&irq_mapping_update_lock);
-
- irq = per_cpu(virq_to_irq, cpu)[virq];
-
- if (irq == -1) {
- bind_virq.virq = virq;
- bind_virq.vcpu = cpu;
- if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
- &bind_virq) != 0)
- BUG();
- evtchn = bind_virq.port;
-
- irq = find_unbound_irq();
-
- dynamic_irq_init(irq);
- set_irq_chip_and_handler_name(irq, &xen_dynamic_chip,
- handle_level_irq, "virq");
-
- evtchn_to_irq[evtchn] = irq;
- irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn);
-
- per_cpu(virq_to_irq, cpu)[virq] = irq;
-
- bind_evtchn_to_cpu(evtchn, cpu);
- }
-
- irq_bindcount[irq]++;
-
- spin_unlock(&irq_mapping_update_lock);
-
- return irq;
-}
-
-static void unbind_from_irq(unsigned int irq)
-{
- struct evtchn_close close;
- int evtchn = evtchn_from_irq(irq);
-
- spin_lock(&irq_mapping_update_lock);
-
- if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) {
- close.port = evtchn;
- if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0)
- BUG();
-
- switch (type_from_irq(irq)) {
- case IRQT_VIRQ:
- per_cpu(virq_to_irq, cpu_from_evtchn(evtchn))
- [index_from_irq(irq)] = -1;
- break;
- default:
- break;
- }
-
- /* Closed ports are implicitly re-bound to VCPU0. */
- bind_evtchn_to_cpu(evtchn, 0);
-
- evtchn_to_irq[evtchn] = -1;
- irq_info[irq] = IRQ_UNBOUND;
-
- dynamic_irq_init(irq);
- }
-
- spin_unlock(&irq_mapping_update_lock);
-}
-
-int bind_evtchn_to_irqhandler(unsigned int evtchn,
- irq_handler_t handler,
- unsigned long irqflags,
- const char *devname, void *dev_id)
-{
- unsigned int irq;
- int retval;
-
- irq = bind_evtchn_to_irq(evtchn);
- retval = request_irq(irq, handler, irqflags, devname, dev_id);
- if (retval != 0) {
- unbind_from_irq(irq);
- return retval;
- }
-
- return irq;
-}
-EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler);
-
-int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu,
- irq_handler_t handler,
- unsigned long irqflags, const char *devname, void *dev_id)
-{
- unsigned int irq;
- int retval;
-
- irq = bind_virq_to_irq(virq, cpu);
- retval = request_irq(irq, handler, irqflags, devname, dev_id);
- if (retval != 0) {
- unbind_from_irq(irq);
- return retval;
- }
-
- return irq;
-}
-EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler);
-
-int bind_ipi_to_irqhandler(enum ipi_vector ipi,
- unsigned int cpu,
- irq_handler_t handler,
- unsigned long irqflags,
- const char *devname,
- void *dev_id)
-{
- int irq, retval;
-
- irq = bind_ipi_to_irq(ipi, cpu);
- if (irq < 0)
- return irq;
-
- retval = request_irq(irq, handler, irqflags, devname, dev_id);
- if (retval != 0) {
- unbind_from_irq(irq);
- return retval;
- }
-
- return irq;
-}
-
-void unbind_from_irqhandler(unsigned int irq, void *dev_id)
-{
- free_irq(irq, dev_id);
- unbind_from_irq(irq);
-}
-EXPORT_SYMBOL_GPL(unbind_from_irqhandler);
-
-void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector)
-{
- int irq = per_cpu(ipi_to_irq, cpu)[vector];
- BUG_ON(irq < 0);
- notify_remote_via_irq(irq);
-}
-
-
-/*
- * Search the CPUs pending events bitmasks. For each one found, map
- * the event number to an irq, and feed it into do_IRQ() for
- * handling.
- *
- * Xen uses a two-level bitmap to speed searching. The first level is
- * a bitset of words which contain pending event bits. The second
- * level is a bitset of pending events themselves.
- */
-void xen_evtchn_do_upcall(struct pt_regs *regs)
-{
- int cpu = get_cpu();
- struct shared_info *s = HYPERVISOR_shared_info;
- struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
- unsigned long pending_words;
-
- vcpu_info->evtchn_upcall_pending = 0;
-
- /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
- pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
- while (pending_words != 0) {
- unsigned long pending_bits;
- int word_idx = __ffs(pending_words);
- pending_words &= ~(1UL << word_idx);
-
- while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
- int bit_idx = __ffs(pending_bits);
- int port = (word_idx * BITS_PER_LONG) + bit_idx;
- int irq = evtchn_to_irq[port];
-
- if (irq != -1) {
- regs->orig_ax = ~irq;
- do_IRQ(regs);
- }
- }
- }
-
- put_cpu();
-}
-
-/* Rebind an evtchn so that it gets delivered to a specific cpu */
-static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
-{
- struct evtchn_bind_vcpu bind_vcpu;
- int evtchn = evtchn_from_irq(irq);
-
- if (!VALID_EVTCHN(evtchn))
- return;
-
- /* Send future instances of this interrupt to other vcpu. */
- bind_vcpu.port = evtchn;
- bind_vcpu.vcpu = tcpu;
-
- /*
- * If this fails, it usually just indicates that we're dealing with a
- * virq or IPI channel, which don't actually need to be rebound. Ignore
- * it, but don't do the xenlinux-level rebind in that case.
- */
- if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
- bind_evtchn_to_cpu(evtchn, tcpu);
-}
-
-
-static void set_affinity_irq(unsigned irq, cpumask_t dest)
-{
- unsigned tcpu = first_cpu(dest);
- rebind_irq_to_cpu(irq, tcpu);
-}
-
-static void enable_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- unmask_evtchn(evtchn);
-}
-
-static void disable_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- mask_evtchn(evtchn);
-}
-
-static void ack_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
-
- move_native_irq(irq);
-
- if (VALID_EVTCHN(evtchn))
- clear_evtchn(evtchn);
-}
-
-static int retrigger_dynirq(unsigned int irq)
-{
- int evtchn = evtchn_from_irq(irq);
- int ret = 0;
-
- if (VALID_EVTCHN(evtchn)) {
- set_evtchn(evtchn);
- ret = 1;
- }
-
- return ret;
-}
-
-static struct irq_chip xen_dynamic_chip __read_mostly = {
- .name = "xen-dyn",
- .mask = disable_dynirq,
- .unmask = enable_dynirq,
- .ack = ack_dynirq,
- .set_affinity = set_affinity_irq,
- .retrigger = retrigger_dynirq,
-};
-
-void __init xen_init_IRQ(void)
-{
- int i;
-
- init_evtchn_cpu_bindings();
-
- /* No event channels are 'live' right now. */
- for (i = 0; i < NR_EVENT_CHANNELS; i++)
- mask_evtchn(i);
-
- /* Dynamic IRQ space is currently unbound. Zero the refcnts. */
- for (i = 0; i < NR_IRQS; i++)
- irq_bindcount[i] = 0;
-
- irq_ctx_init(smp_processor_id());
-}
diff --git a/arch/x86/xen/features.c b/arch/x86/xen/features.c
deleted file mode 100644
index 0707714e40d6..000000000000
--- a/arch/x86/xen/features.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/******************************************************************************
- * features.c
- *
- * Xen feature flags.
- *
- * Copyright (c) 2006, Ian Campbell, XenSource Inc.
- */
-#include <linux/types.h>
-#include <linux/cache.h>
-#include <linux/module.h>
-#include <asm/xen/hypervisor.h>
-#include <xen/features.h>
-
-u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly;
-EXPORT_SYMBOL_GPL(xen_features);
-
-void xen_setup_features(void)
-{
- struct xen_feature_info fi;
- int i, j;
-
- for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) {
- fi.submap_idx = i;
- if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0)
- break;
- for (j = 0; j < 32; j++)
- xen_features[i * 32 + j] = !!(fi.submap & 1<<j);
- }
-}
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
new file mode 100644
index 000000000000..49ba9b5224d1
--- /dev/null
+++ b/arch/x86/xen/grant-table.c
@@ -0,0 +1,91 @@
+/******************************************************************************
+ * grant_table.c
+ * x86 specific part
+ *
+ * Granting foreign access to our memory reservation.
+ *
+ * Copyright (c) 2005-2006, Christopher Clark
+ * Copyright (c) 2004-2005, K A Fraser
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan. Split out x86 specific part.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+#include <xen/interface/xen.h>
+#include <xen/page.h>
+#include <xen/grant_table.h>
+
+#include <asm/pgtable.h>
+
+static int map_pte_fn(pte_t *pte, struct page *pmd_page,
+ unsigned long addr, void *data)
+{
+ unsigned long **frames = (unsigned long **)data;
+
+ set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL));
+ (*frames)++;
+ return 0;
+}
+
+static int unmap_pte_fn(pte_t *pte, struct page *pmd_page,
+ unsigned long addr, void *data)
+{
+
+ set_pte_at(&init_mm, addr, pte, __pte(0));
+ return 0;
+}
+
+int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ struct grant_entry **__shared)
+{
+ int rc;
+ struct grant_entry *shared = *__shared;
+
+ if (shared == NULL) {
+ struct vm_struct *area =
+ xen_alloc_vm_area(PAGE_SIZE * max_nr_gframes);
+ BUG_ON(area == NULL);
+ shared = area->addr;
+ *__shared = shared;
+ }
+
+ rc = apply_to_page_range(&init_mm, (unsigned long)shared,
+ PAGE_SIZE * nr_gframes,
+ map_pte_fn, &frames);
+ return rc;
+}
+
+void arch_gnttab_unmap_shared(struct grant_entry *shared,
+ unsigned long nr_gframes)
+{
+ apply_to_page_range(&init_mm, (unsigned long)shared,
+ PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL);
+}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 2a054ef2a3da..126766d43aea 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -156,6 +156,10 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
+ /* updates to init_mm may be done without lock */
+ if (mm == &init_mm)
+ preempt_disable();
+
if (mm == current->mm || mm == &init_mm) {
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
struct multicall_space mcs;
@@ -163,14 +167,61 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
xen_mc_issue(PARAVIRT_LAZY_MMU);
- return;
+ goto out;
} else
if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
- return;
+ goto out;
}
xen_set_pte(ptep, pteval);
+
+out:
+ if (mm == &init_mm)
+ preempt_enable();
+}
+
+pteval_t xen_pte_val(pte_t pte)
+{
+ pteval_t ret = pte.pte;
+
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+
+ return ret;
+}
+
+pgdval_t xen_pgd_val(pgd_t pgd)
+{
+ pgdval_t ret = pgd.pgd;
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+ return ret;
+}
+
+pte_t xen_make_pte(pteval_t pte)
+{
+ if (pte & _PAGE_PRESENT) {
+ pte = phys_to_machine(XPADDR(pte)).maddr;
+ pte &= ~(_PAGE_PCD | _PAGE_PWT);
+ }
+
+ return (pte_t){ .pte = pte };
}
+pgd_t xen_make_pgd(pgdval_t pgd)
+{
+ if (pgd & _PAGE_PRESENT)
+ pgd = phys_to_machine(XPADDR(pgd)).maddr;
+
+ return (pgd_t){ pgd };
+}
+
+pmdval_t xen_pmd_val(pmd_t pmd)
+{
+ pmdval_t ret = native_pmd_val(pmd);
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+ return ret;
+}
#ifdef CONFIG_X86_PAE
void xen_set_pud(pud_t *ptr, pud_t val)
{
@@ -214,100 +265,18 @@ void xen_pmd_clear(pmd_t *pmdp)
xen_set_pmd(pmdp, __pmd(0));
}
-unsigned long long xen_pte_val(pte_t pte)
+pmd_t xen_make_pmd(pmdval_t pmd)
{
- unsigned long long ret = 0;
-
- if (pte.pte_low) {
- ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
- }
-
- return ret;
-}
-
-unsigned long long xen_pmd_val(pmd_t pmd)
-{
- unsigned long long ret = pmd.pmd;
- if (ret)
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
- return ret;
-}
-
-unsigned long long xen_pgd_val(pgd_t pgd)
-{
- unsigned long long ret = pgd.pgd;
- if (ret)
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
- return ret;
-}
-
-pte_t xen_make_pte(unsigned long long pte)
-{
- if (pte & _PAGE_PRESENT) {
- pte = phys_to_machine(XPADDR(pte)).maddr;
- pte &= ~(_PAGE_PCD | _PAGE_PWT);
- }
-
- return (pte_t){ .pte = pte };
-}
-
-pmd_t xen_make_pmd(unsigned long long pmd)
-{
- if (pmd & 1)
+ if (pmd & _PAGE_PRESENT)
pmd = phys_to_machine(XPADDR(pmd)).maddr;
- return (pmd_t){ pmd };
-}
-
-pgd_t xen_make_pgd(unsigned long long pgd)
-{
- if (pgd & _PAGE_PRESENT)
- pgd = phys_to_machine(XPADDR(pgd)).maddr;
-
- return (pgd_t){ pgd };
+ return native_make_pmd(pmd);
}
#else /* !PAE */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
*ptep = pte;
}
-
-unsigned long xen_pte_val(pte_t pte)
-{
- unsigned long ret = pte.pte_low;
-
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr;
-
- return ret;
-}
-
-unsigned long xen_pgd_val(pgd_t pgd)
-{
- unsigned long ret = pgd.pgd;
- if (ret)
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
- return ret;
-}
-
-pte_t xen_make_pte(unsigned long pte)
-{
- if (pte & _PAGE_PRESENT) {
- pte = phys_to_machine(XPADDR(pte)).maddr;
- pte &= ~(_PAGE_PCD | _PAGE_PWT);
- }
-
- return (pte_t){ pte };
-}
-
-pgd_t xen_make_pgd(unsigned long pgd)
-{
- if (pgd & _PAGE_PRESENT)
- pgd = phys_to_machine(XPADDR(pgd)).maddr;
-
- return (pgd_t){ pgd };
-}
#endif /* CONFIG_X86_PAE */
/*
@@ -418,7 +387,7 @@ static void xen_do_pin(unsigned level, unsigned long pfn)
static int pin_page(struct page *page, enum pt_level level)
{
- unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags);
+ unsigned pgfl = TestSetPagePinned(page);
int flush;
if (pgfl)
@@ -499,7 +468,7 @@ void __init xen_mark_init_mm_pinned(void)
static int unpin_page(struct page *page, enum pt_level level)
{
- unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags);
+ unsigned pgfl = TestClearPagePinned(page);
if (pgfl && !PageHighMem(page)) {
void *pt = lowmem_page_address(page);
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 5e6f36f6d876..5791eb2e3750 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -76,7 +76,7 @@ void xen_mc_flush(void)
if (ret) {
printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
ret, smp_processor_id());
- for(i = 0; i < b->mcidx; i++) {
+ for (i = 0; i < b->mcidx; i++) {
printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
i+1, b->mcidx,
b->debug[i].op,
@@ -93,7 +93,7 @@ void xen_mc_flush(void)
local_irq_restore(flags);
- for(i = 0; i < b->cbidx; i++) {
+ for (i = 0; i < b->cbidx; i++) {
struct callback *cb = &b->callbacks[i];
(*cb->fn)(cb->data);
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 2341492bf7a0..82517e4a752a 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -16,6 +16,7 @@
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
+#include <xen/interface/callback.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
@@ -68,6 +69,24 @@ static void __init fiddle_vdso(void)
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
}
+void xen_enable_sysenter(void)
+{
+ int cpu = smp_processor_id();
+ extern void xen_sysenter_target(void);
+ /* Mask events on entry, even though they get enabled immediately */
+ static struct callback_register sysenter = {
+ .type = CALLBACKTYPE_sysenter,
+ .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
+ .flags = CALLBACKF_mask_events,
+ };
+
+ if (!boot_cpu_has(X86_FEATURE_SEP) ||
+ HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) {
+ clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP);
+ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
+ }
+}
+
void __init xen_arch_setup(void)
{
struct physdev_set_iopl set_iopl;
@@ -82,6 +101,8 @@ void __init xen_arch_setup(void)
HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
__KERNEL_CS, (unsigned long)xen_failsafe_callback);
+ xen_enable_sysenter();
+
set_iopl.iopl = 1;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
if (rc != 0)
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index aafc54437403..94e69000f982 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -35,9 +35,10 @@
#include "xen-ops.h"
#include "mmu.h"
-static cpumask_t cpu_initialized_map;
-static DEFINE_PER_CPU(int, resched_irq);
-static DEFINE_PER_CPU(int, callfunc_irq);
+static cpumask_t xen_cpu_initialized_map;
+static DEFINE_PER_CPU(int, resched_irq) = -1;
+static DEFINE_PER_CPU(int, callfunc_irq) = -1;
+static DEFINE_PER_CPU(int, debug_irq) = -1;
/*
* Structure and data for smp_call_function(). This is designed to minimise
@@ -72,6 +73,7 @@ static __cpuinit void cpu_bringup_and_idle(void)
int cpu = smp_processor_id();
cpu_init();
+ xen_enable_sysenter();
preempt_disable();
per_cpu(cpu_state, cpu) = CPU_ONLINE;
@@ -88,9 +90,7 @@ static __cpuinit void cpu_bringup_and_idle(void)
static int xen_smp_intr_init(unsigned int cpu)
{
int rc;
- const char *resched_name, *callfunc_name;
-
- per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
+ const char *resched_name, *callfunc_name, *debug_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
@@ -114,6 +114,14 @@ static int xen_smp_intr_init(unsigned int cpu)
goto fail;
per_cpu(callfunc_irq, cpu) = rc;
+ debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
+ rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt,
+ IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING,
+ debug_name, NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(debug_irq, cpu) = rc;
+
return 0;
fail:
@@ -121,6 +129,8 @@ static int xen_smp_intr_init(unsigned int cpu)
unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
if (per_cpu(callfunc_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+ if (per_cpu(debug_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
return rc;
}
@@ -179,11 +189,11 @@ void __init xen_smp_prepare_cpus(unsigned int max_cpus)
if (xen_smp_intr_init(0))
BUG();
- cpu_initialized_map = cpumask_of_cpu(0);
+ xen_cpu_initialized_map = cpumask_of_cpu(0);
/* Restrict the possible_map according to max_cpus. */
while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
- for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--)
+ for (cpu = NR_CPUS - 1; !cpu_possible(cpu); cpu--)
continue;
cpu_clear(cpu, cpu_possible_map);
}
@@ -210,7 +220,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
struct vcpu_guest_context *ctxt;
struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
- if (cpu_test_and_set(cpu, cpu_initialized_map))
+ if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
return 0;
ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
index 6b7190449d07..2497a30f41de 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -108,6 +108,20 @@ ENDPATCH(xen_restore_fl_direct)
RELOC(xen_restore_fl_direct, 2b+1)
/*
+ We can't use sysexit directly, because we're not running in ring0.
+ But we can easily fake it up using iret. Assuming xen_sysexit
+ is jumped to with a standard stack frame, we can just strip it
+ back to a standard iret frame and use iret.
+ */
+ENTRY(xen_sysexit)
+ movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
+ orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
+ lea PT_EIP(%esp), %esp
+
+ jmp xen_iret
+ENDPROC(xen_sysexit)
+
+/*
This is run where a normal iret would be run, with the same stack setup:
8: eflags
4: cs
@@ -135,13 +149,8 @@ ENDPATCH(xen_restore_fl_direct)
current stack state in whatever form its in, we keep things
simple by only using a single register which is pushed/popped
on the stack.
-
- Non-direct iret could be done in the same way, but it would
- require an annoying amount of code duplication. We'll assume
- that direct mode will be the common case once the hypervisor
- support becomes commonplace.
*/
-ENTRY(xen_iret_direct)
+ENTRY(xen_iret)
/* test eflags for special cases */
testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
jnz hyper_iret
@@ -155,9 +164,9 @@ ENTRY(xen_iret_direct)
GET_THREAD_INFO(%eax)
movl TI_cpu(%eax),%eax
movl __per_cpu_offset(,%eax,4),%eax
- lea per_cpu__xen_vcpu_info(%eax),%eax
+ mov per_cpu__xen_vcpu(%eax),%eax
#else
- movl $per_cpu__xen_vcpu_info, %eax
+ movl per_cpu__xen_vcpu, %eax
#endif
/* check IF state we're restoring */
@@ -189,8 +198,12 @@ iret_restore_end:
region is OK. */
je xen_hypervisor_callback
- iret
+1: iret
xen_iret_end_crit:
+.section __ex_table,"a"
+ .align 4
+ .long 1b,iret_exc
+.previous
hyper_iret:
/* put this out of line since its very rarely used */
@@ -224,9 +237,7 @@ hyper_iret:
ds } SAVE_ALL state
eax }
: :
- ebx }
- ----------------
- return addr <- esp
+ ebx }<- esp
----------------
In order to deliver the nested exception properly, we need to shift
@@ -241,10 +252,8 @@ hyper_iret:
it's usermode state which we eventually need to restore.
*/
ENTRY(xen_iret_crit_fixup)
- /* offsets +4 for return address */
-
/*
- Paranoia: Make sure we're really coming from userspace.
+ Paranoia: Make sure we're really coming from kernel space.
One could imagine a case where userspace jumps into the
critical range address, but just before the CPU delivers a GP,
it decides to deliver an interrupt instead. Unlikely?
@@ -253,32 +262,32 @@ ENTRY(xen_iret_crit_fixup)
jump instruction itself, not the destination, but some virtual
environments get this wrong.
*/
- movl PT_CS+4(%esp), %ecx
+ movl PT_CS(%esp), %ecx
andl $SEGMENT_RPL_MASK, %ecx
cmpl $USER_RPL, %ecx
je 2f
- lea PT_ORIG_EAX+4(%esp), %esi
- lea PT_EFLAGS+4(%esp), %edi
+ lea PT_ORIG_EAX(%esp), %esi
+ lea PT_EFLAGS(%esp), %edi
/* If eip is before iret_restore_end then stack
hasn't been restored yet. */
cmp $iret_restore_end, %eax
jae 1f
- movl 0+4(%edi),%eax /* copy EAX */
- movl %eax, PT_EAX+4(%esp)
+ movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */
+ movl %eax, PT_EAX(%esp)
lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
/* set up the copy */
1: std
- mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
+ mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */
rep movsl
cld
lea 4(%edi),%esp /* point esp to new frame */
-2: ret
+2: jmp xen_do_upcall
/*
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index b02a909bfd4c..f1063ae08037 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -2,6 +2,8 @@
#define XEN_OPS_H
#include <linux/init.h>
+#include <linux/irqreturn.h>
+#include <xen/xen-ops.h>
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
@@ -9,7 +11,6 @@ extern const char xen_failsafe_callback[];
void xen_copy_trap_info(struct trap_info *traps);
-DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu);
DECLARE_PER_CPU(unsigned long, xen_cr3);
DECLARE_PER_CPU(unsigned long, xen_current_cr3);
@@ -19,6 +20,7 @@ extern struct shared_info *HYPERVISOR_shared_info;
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
void __init xen_init_IRQ(void);
+void xen_enable_sysenter(void);
void xen_setup_timer(int cpu);
void xen_setup_cpu_clockevents(void);
@@ -28,6 +30,8 @@ unsigned long xen_get_wallclock(void);
int xen_set_wallclock(unsigned long time);
unsigned long long xen_sched_clock(void);
+irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
+
bool xen_vcpu_stolen(int vcpu);
void xen_mark_init_mm_pinned(void);
@@ -63,5 +67,7 @@ DECL_ASM(void, xen_irq_disable_direct, void);
DECL_ASM(unsigned long, xen_save_fl_direct, void);
DECL_ASM(void, xen_restore_fl_direct, unsigned long);
-void xen_iret_direct(void);
+void xen_iret(void);
+void xen_sysexit(void);
+
#endif /* XEN_OPS_H */