summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig57
-rw-r--r--arch/x86/Kconfig.cpu5
-rw-r--r--arch/x86/Kconfig.debug15
-rw-r--r--arch/x86/boot/compressed/Makefile5
-rw-r--r--arch/x86/boot/compressed/head_64.S2
-rw-r--r--arch/x86/boot/compressed/misc.c6
-rw-r--r--arch/x86/boot/compressed/mkpiggy.c2
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S1832
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c540
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c1
-rw-r--r--arch/x86/ia32/sys_ia32.c1
-rw-r--r--arch/x86/include/asm/acpi.h11
-rw-r--r--arch/x86/include/asm/alternative.h8
-rw-r--r--arch/x86/include/asm/amd_nb.h60
-rw-r--r--arch/x86/include/asm/apic.h13
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/boot.h6
-rw-r--r--arch/x86/include/asm/cacheflush.h42
-rw-r--r--arch/x86/include/asm/cpu.h1
-rw-r--r--arch/x86/include/asm/debugreg.h2
-rw-r--r--arch/x86/include/asm/e820.h3
-rw-r--r--arch/x86/include/asm/fixmap.h8
-rw-r--r--arch/x86/include/asm/gpio.h5
-rw-r--r--arch/x86/include/asm/hypervisor.h12
-rw-r--r--arch/x86/include/asm/i387.h24
-rw-r--r--arch/x86/include/asm/io_apic.h9
-rw-r--r--arch/x86/include/asm/irq.h7
-rw-r--r--arch/x86/include/asm/jump_label.h2
-rw-r--r--arch/x86/include/asm/kdebug.h3
-rw-r--r--arch/x86/include/asm/kvm_emulate.h35
-rw-r--r--arch/x86/include/asm/kvm_host.h102
-rw-r--r--arch/x86/include/asm/kvm_para.h24
-rw-r--r--arch/x86/include/asm/mach_traps.h12
-rw-r--r--arch/x86/include/asm/mce.h3
-rw-r--r--arch/x86/include/asm/microcode.h6
-rw-r--r--arch/x86/include/asm/mpspec.h31
-rw-r--r--arch/x86/include/asm/mpspec_def.h7
-rw-r--r--arch/x86/include/asm/msr-index.h18
-rw-r--r--arch/x86/include/asm/nmi.h71
-rw-r--r--arch/x86/include/asm/numa_32.h2
-rw-r--r--arch/x86/include/asm/numa_64.h3
-rw-r--r--arch/x86/include/asm/olpc.h10
-rw-r--r--arch/x86/include/asm/olpc_ofw.h9
-rw-r--r--arch/x86/include/asm/paravirt.h36
-rw-r--r--arch/x86/include/asm/paravirt_types.h6
-rw-r--r--arch/x86/include/asm/pci.h1
-rw-r--r--arch/x86/include/asm/percpu.h158
-rw-r--r--arch/x86/include/asm/perf_event.h2
-rw-r--r--arch/x86/include/asm/perf_event_p4.h66
-rw-r--r--arch/x86/include/asm/pgalloc.h2
-rw-r--r--arch/x86/include/asm/pgtable-2level.h9
-rw-r--r--arch/x86/include/asm/pgtable-3level.h23
-rw-r--r--arch/x86/include/asm/pgtable.h143
-rw-r--r--arch/x86/include/asm/pgtable_64.h28
-rw-r--r--arch/x86/include/asm/pgtable_types.h3
-rw-r--r--arch/x86/include/asm/processor.h10
-rw-r--r--arch/x86/include/asm/prom.h1
-rw-r--r--arch/x86/include/asm/pvclock.h1
-rw-r--r--arch/x86/include/asm/smpboot_hooks.h1
-rw-r--r--arch/x86/include/asm/stacktrace.h33
-rw-r--r--arch/x86/include/asm/svm.h57
-rw-r--r--arch/x86/include/asm/system_64.h22
-rw-r--r--arch/x86/include/asm/timer.h6
-rw-r--r--arch/x86/include/asm/traps.h1
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h9
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h4
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h208
-rw-r--r--arch/x86/include/asm/vmx.h15
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h35
-rw-r--r--arch/x86/include/asm/xen/interface.h6
-rw-r--r--arch/x86/include/asm/xen/interface_32.h5
-rw-r--r--arch/x86/include/asm/xen/interface_64.h13
-rw-r--r--arch/x86/include/asm/xen/page.h21
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/acpi/boot.c60
-rw-r--r--arch/x86/kernel/alternative.c52
-rw-r--r--arch/x86/kernel/amd_iommu.c4
-rw-r--r--arch/x86/kernel/amd_nb.c142
-rw-r--r--arch/x86/kernel/apb_timer.c14
-rw-r--r--arch/x86/kernel/aperture_64.c54
-rw-r--r--arch/x86/kernel/apic/Makefile5
-rw-r--r--arch/x86/kernel/apic/apic.c201
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c44
-rw-r--r--arch/x86/kernel/apic/io_apic.c119
-rw-r--r--arch/x86/kernel/apic/nmi.c567
-rw-r--r--arch/x86/kernel/apic/probe_64.c7
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c96
-rw-r--r--arch/x86/kernel/cpu/amd.c2
-rw-r--r--arch/x86/kernel/cpu/common.c1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c4
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c154
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c5
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c20
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c135
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c41
-rw-r--r--arch/x86/kernel/cpu/perf_event.c110
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c20
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c30
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c28
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c644
-rw-r--r--arch/x86/kernel/cpuid.c1
-rw-r--r--arch/x86/kernel/dumpstack.c19
-rw-r--r--arch/x86/kernel/dumpstack_32.c25
-rw-r--r--arch/x86/kernel/dumpstack_64.c26
-rw-r--r--arch/x86/kernel/e820.c1
-rw-r--r--arch/x86/kernel/entry_32.S12
-rw-r--r--arch/x86/kernel/entry_64.S41
-rw-r--r--arch/x86/kernel/ftrace.c9
-rw-r--r--arch/x86/kernel/head_32.S93
-rw-r--r--arch/x86/kernel/hpet.c26
-rw-r--r--arch/x86/kernel/hw_breakpoint.c16
-rw-r--r--arch/x86/kernel/i387.c1
-rw-r--r--arch/x86/kernel/irq.c16
-rw-r--r--arch/x86/kernel/irq_32.c11
-rw-r--r--arch/x86/kernel/kgdb.c19
-rw-r--r--arch/x86/kernel/kprobes.c127
-rw-r--r--arch/x86/kernel/kvm.c317
-rw-r--r--arch/x86/kernel/kvmclock.c13
-rw-r--r--arch/x86/kernel/microcode_amd.c36
-rw-r--r--arch/x86/kernel/microcode_intel.c16
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c71
-rw-r--r--arch/x86/kernel/module.c17
-rw-r--r--arch/x86/kernel/mpparse.c114
-rw-r--r--arch/x86/kernel/msr.c1
-rw-r--r--arch/x86/kernel/paravirt.c3
-rw-r--r--arch/x86/kernel/pci-gart_64.c34
-rw-r--r--arch/x86/kernel/process.c45
-rw-r--r--arch/x86/kernel/process_32.c4
-rw-r--r--arch/x86/kernel/process_64.c4
-rw-r--r--arch/x86/kernel/pvclock.c43
-rw-r--r--arch/x86/kernel/reboot.c5
-rw-r--r--arch/x86/kernel/resource.c48
-rw-r--r--arch/x86/kernel/rtc.c2
-rw-r--r--arch/x86/kernel/setup.c31
-rw-r--r--arch/x86/kernel/smpboot.c55
-rw-r--r--arch/x86/kernel/stacktrace.c8
-rw-r--r--arch/x86/kernel/tboot.c2
-rw-r--r--arch/x86/kernel/time.c18
-rw-r--r--arch/x86/kernel/trampoline_64.S2
-rw-r--r--arch/x86/kernel/traps.c131
-rw-r--r--arch/x86/kernel/tsc.c100
-rw-r--r--arch/x86/kernel/verify_cpu.S (renamed from arch/x86/kernel/verify_cpu_64.S)49
-rw-r--r--arch/x86/kernel/vm86_32.c1
-rw-r--r--arch/x86/kernel/vmlinux.lds.S8
-rw-r--r--arch/x86/kernel/xsave.c3
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/Makefile3
-rw-r--r--arch/x86/kvm/emulate.c367
-rw-r--r--arch/x86/kvm/i8259.c2
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h22
-rw-r--r--arch/x86/kvm/lapic.c3
-rw-r--r--arch/x86/kvm/mmu.c379
-rw-r--r--arch/x86/kvm/mmu_audit.c39
-rw-r--r--arch/x86/kvm/paging_tmpl.h156
-rw-r--r--arch/x86/kvm/svm.c861
-rw-r--r--arch/x86/kvm/trace.h17
-rw-r--r--arch/x86/kvm/vmx.c180
-rw-r--r--arch/x86/kvm/x86.c487
-rw-r--r--arch/x86/kvm/x86.h5
-rw-r--r--arch/x86/lguest/Kconfig1
-rw-r--r--arch/x86/lguest/boot.c18
-rw-r--r--arch/x86/lib/delay.c2
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/amdtopology_64.c (renamed from arch/x86/mm/k8topology_64.c)97
-rw-r--r--arch/x86/mm/gup.c28
-rw-r--r--arch/x86/mm/init.c3
-rw-r--r--arch/x86/mm/init_32.c22
-rw-r--r--arch/x86/mm/kmemcheck/error.c2
-rw-r--r--arch/x86/mm/numa.c22
-rw-r--r--arch/x86/mm/numa_64.c199
-rw-r--r--arch/x86/mm/pageattr.c33
-rw-r--r--arch/x86/mm/pgtable.c66
-rw-r--r--arch/x86/mm/setup_nx.c2
-rw-r--r--arch/x86/mm/srat_32.c2
-rw-r--r--arch/x86/mm/srat_64.c36
-rw-r--r--arch/x86/mm/tlb.c7
-rw-r--r--arch/x86/oprofile/backtrace.c2
-rw-r--r--arch/x86/oprofile/nmi_int.c8
-rw-r--r--arch/x86/oprofile/nmi_timer_int.c5
-rw-r--r--arch/x86/oprofile/op_model_amd.c79
-rw-r--r--arch/x86/oprofile/op_model_p4.c2
-rw-r--r--arch/x86/oprofile/op_model_ppro.c8
-rw-r--r--arch/x86/pci/acpi.c103
-rw-r--r--arch/x86/pci/amd_bus.c33
-rw-r--r--arch/x86/pci/broadcom_bus.c11
-rw-r--r--arch/x86/pci/common.c41
-rw-r--r--arch/x86/pci/i386.c18
-rw-r--r--arch/x86/pci/irq.c3
-rw-r--r--arch/x86/pci/pcbios.c23
-rw-r--r--arch/x86/pci/xen.c35
-rw-r--r--arch/x86/platform/mrst/early_printk_mrst.c2
-rw-r--r--arch/x86/platform/mrst/mrst.c30
-rw-r--r--arch/x86/platform/olpc/Makefile1
-rw-r--r--arch/x86/platform/olpc/olpc-xo1.c101
-rw-r--r--arch/x86/platform/olpc/olpc_dt.c183
-rw-r--r--arch/x86/platform/olpc/olpc_ofw.c5
-rw-r--r--arch/x86/platform/sfi/sfi.c17
-rw-r--r--arch/x86/platform/uv/tlb_uv.c37
-rw-r--r--arch/x86/platform/uv/uv_time.c4
-rw-r--r--arch/x86/platform/visws/visws_quirks.c2
-rw-r--r--arch/x86/vdso/Makefile4
-rw-r--r--arch/x86/xen/Makefile3
-rw-r--r--arch/x86/xen/enlighten.c71
-rw-r--r--arch/x86/xen/irq.c2
-rw-r--r--arch/x86/xen/mmu.c454
-rw-r--r--arch/x86/xen/multicalls.h2
-rw-r--r--arch/x86/xen/p2m.c522
-rw-r--r--arch/x86/xen/platform-pci-unplug.c2
-rw-r--r--arch/x86/xen/setup.c79
-rw-r--r--arch/x86/xen/spinlock.c8
-rw-r--r--arch/x86/xen/suspend.c1
-rw-r--r--arch/x86/xen/time.c10
-rw-r--r--arch/x86/xen/xen-ops.h2
214 files changed, 8533 insertions, 4571 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6ab63107eeaf..d5ed94d30aad 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -21,7 +21,7 @@ config X86
select HAVE_UNSTABLE_SCHED_CLOCK
select HAVE_IDE
select HAVE_OPROFILE
- select HAVE_PERF_EVENTS if (!M386 && !M486)
+ select HAVE_PERF_EVENTS
select HAVE_IRQ_WORK
select HAVE_IOREMAP_PROT
select HAVE_KPROBES
@@ -51,6 +51,7 @@ config X86
select HAVE_KERNEL_GZIP
select HAVE_KERNEL_BZIP2
select HAVE_KERNEL_LZMA
+ select HAVE_KERNEL_XZ
select HAVE_KERNEL_LZO
select HAVE_HW_BREAKPOINT
select HAVE_MIXED_BREAKPOINTS_REGS
@@ -65,6 +66,7 @@ config X86
select HAVE_SPARSE_IRQ
select GENERIC_IRQ_PROBE
select GENERIC_PENDING_IRQ if SMP
+ select USE_GENERIC_SMP_HELPERS if SMP
config INSTRUCTION_DECODER
def_bool (KPROBES || PERF_EVENTS)
@@ -203,10 +205,6 @@ config HAVE_INTEL_TXT
def_bool y
depends on EXPERIMENTAL && DMAR && ACPI
-config USE_GENERIC_SMP_HELPERS
- def_bool y
- depends on SMP
-
config X86_32_SMP
def_bool y
depends on X86_32 && SMP
@@ -629,11 +627,11 @@ config APB_TIMER
as it is off-chip. APB timers are always running regardless of CPU
C states, they are used as per CPU clockevent device when possible.
-# Mark as embedded because too many people got it wrong.
+# Mark as expert because too many people got it wrong.
# The code disables itself when not needed.
config DMI
default y
- bool "Enable DMI scanning" if EMBEDDED
+ bool "Enable DMI scanning" if EXPERT
---help---
Enabled scanning of DMI to identify machine quirks. Say Y
here unless you have verified that your setup is not
@@ -641,7 +639,7 @@ config DMI
BIOS code.
config GART_IOMMU
- bool "GART IOMMU support" if EMBEDDED
+ bool "GART IOMMU support" if EXPERT
default y
select SWIOTLB
depends on X86_64 && PCI && AMD_NB
@@ -891,7 +889,7 @@ config X86_THERMAL_VECTOR
depends on X86_MCE_INTEL
config VM86
- bool "Enable VM86 support" if EMBEDDED
+ bool "Enable VM86 support" if EXPERT
default y
depends on X86_32
---help---
@@ -1075,7 +1073,7 @@ endchoice
choice
depends on EXPERIMENTAL
- prompt "Memory split" if EMBEDDED
+ prompt "Memory split" if EXPERT
default VMSPLIT_3G
depends on X86_32
---help---
@@ -1137,7 +1135,7 @@ config ARCH_DMA_ADDR_T_64BIT
def_bool X86_64 || HIGHMEM64G
config DIRECT_GBPAGES
- bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
+ bool "Enable 1GB pages for kernel pagetables" if EXPERT
default y
depends on X86_64
---help---
@@ -1170,16 +1168,16 @@ config NUMA
comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
-config K8_NUMA
+config AMD_NUMA
def_bool y
prompt "Old style AMD Opteron NUMA detection"
depends on X86_64 && NUMA && PCI
---help---
- Enable K8 NUMA node topology detection. You should say Y here if
- you have a multi processor AMD K8 system. This uses an old
- method to read the NUMA configuration directly from the builtin
- Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
- instead, which also takes priority if both are compiled in.
+ Enable AMD NUMA node topology detection. You should say Y here if
+ you have a multi processor AMD system. This uses an old method to
+ read the NUMA configuration directly from the builtin Northbridge
+ of Opteron. It is recommended to use X86_64_ACPI_NUMA instead,
+ which also takes priority if both are compiled in.
config X86_64_ACPI_NUMA
def_bool y
@@ -1371,7 +1369,7 @@ config MATH_EMULATION
config MTRR
def_bool y
- prompt "MTRR (Memory Type Range Register) support" if EMBEDDED
+ prompt "MTRR (Memory Type Range Register) support" if EXPERT
---help---
On Intel P6 family processors (Pentium Pro, Pentium II and later)
the Memory Type Range Registers (MTRRs) may be used to control
@@ -1437,7 +1435,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
config X86_PAT
def_bool y
- prompt "x86 PAT support" if EMBEDDED
+ prompt "x86 PAT support" if EXPERT
depends on MTRR
---help---
Use PAT attributes to setup page level cache control.
@@ -1541,7 +1539,7 @@ config KEXEC_JUMP
code in physical address mode via KEXEC
config PHYSICAL_START
- hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
+ hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
default "0x1000000"
---help---
This gives the physical address where the kernel is loaded.
@@ -1936,13 +1934,19 @@ config PCI_MMCONFIG
depends on X86_64 && PCI && ACPI
config PCI_CNB20LE_QUIRK
- bool "Read CNB20LE Host Bridge Windows"
- depends on PCI
+ bool "Read CNB20LE Host Bridge Windows" if EXPERT
+ default n
+ depends on PCI && EXPERIMENTAL
help
Read the PCI windows out of the CNB20LE host bridge. This allows
PCI hotplug to work on systems with the CNB20LE chipset which do
not have ACPI.
+ There's no public spec for this chipset, and this functionality
+ is known to be incomplete.
+
+ You should say N unless you know you need this.
+
config DMAR
bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
depends on PCI_MSI && ACPI && EXPERIMENTAL
@@ -2064,13 +2068,14 @@ config OLPC
bool "One Laptop Per Child support"
select GPIOLIB
select OLPC_OPENFIRMWARE
+ depends on !X86_64 && !X86_PAE
---help---
Add support for detecting the unique features of the OLPC
XO hardware.
config OLPC_XO1
tristate "OLPC XO-1 support"
- depends on OLPC && PCI
+ depends on OLPC && MFD_CS5535
---help---
Add support for non-essential features of the OLPC XO-1 laptop.
@@ -2078,11 +2083,17 @@ config OLPC_OPENFIRMWARE
bool "Support for OLPC's Open Firmware"
depends on !X86_64 && !X86_PAE
default n
+ select OF
help
This option adds support for the implementation of Open Firmware
that is used on the OLPC XO-1 Children's Machine.
If unsure, say N here.
+config OLPC_OPENFIRMWARE_DT
+ bool
+ default y if OLPC_OPENFIRMWARE && PROC_DEVICETREE
+ select OF_PROMTREE
+
endif # X86_32
config AMD_NB
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 2ac9069890cd..283c5a6a03a6 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -310,6 +310,9 @@ config X86_INTERNODE_CACHE_SHIFT
config X86_CMPXCHG
def_bool X86_64 || (X86_32 && !M386)
+config CMPXCHG_LOCAL
+ def_bool X86_64 || (X86_32 && !M386)
+
config X86_L1_CACHE_SHIFT
int
default "7" if MPENTIUM4 || MPSC
@@ -421,7 +424,7 @@ config X86_DEBUGCTLMSR
depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML
menuconfig PROCESSOR_SELECT
- bool "Supported processor vendors" if EMBEDDED
+ bool "Supported processor vendors" if EXPERT
---help---
This lets you choose what x86 vendor support code your kernel
will include.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index b59ee765414e..615e18810f48 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -31,7 +31,7 @@ config X86_VERBOSE_BOOTUP
see errors. Disable this if you want silent bootup.
config EARLY_PRINTK
- bool "Early printk" if EMBEDDED
+ bool "Early printk" if EXPERT
default y
---help---
Write kernel log output directly into the VGA buffer or to a serial
@@ -117,6 +117,17 @@ config DEBUG_RODATA_TEST
feature as well as for the change_page_attr() infrastructure.
If in doubt, say "N"
+config DEBUG_SET_MODULE_RONX
+ bool "Set loadable kernel module data as NX and text as RO"
+ depends on MODULES
+ ---help---
+ This option helps catch unintended modifications to loadable
+ kernel module's text and read-only data. It also prevents execution
+ of module data. Such protection may interfere with run-time code
+ patching and dynamic kernel tracing - and they might also protect
+ against certain classes of kernel exploits.
+ If in doubt, say "N".
+
config DEBUG_NX_TEST
tristate "Testcase for the NX non-executable stack feature"
depends on DEBUG_KERNEL && m
@@ -127,7 +138,7 @@ config DEBUG_NX_TEST
config DOUBLEFAULT
default y
- bool "Enable doublefault exception handler" if EMBEDDED
+ bool "Enable doublefault exception handler" if EXPERT
depends on X86_32
---help---
This option allows trapping of rare doublefault exceptions that
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 0c229551eead..09664efb9cee 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,7 @@
# create a compressed vmlinux image from the original vmlinux
#
-targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o
+targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o
KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
@@ -49,12 +49,15 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
$(call if_changed,bzip2)
$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
$(call if_changed,lzma)
+$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
+ $(call if_changed,xzkern)
$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
$(call if_changed,lzo)
suffix-$(CONFIG_KERNEL_GZIP) := gz
suffix-$(CONFIG_KERNEL_BZIP2) := bz2
suffix-$(CONFIG_KERNEL_LZMA) := lzma
+suffix-$(CONFIG_KERNEL_XZ) := xz
suffix-$(CONFIG_KERNEL_LZO) := lzo
quiet_cmd_mkpiggy = MKPIGGY $@
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 52f85a196fa0..35af09d13dc1 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -182,7 +182,7 @@ no_longmode:
hlt
jmp 1b
-#include "../../kernel/verify_cpu_64.S"
+#include "../../kernel/verify_cpu.S"
/*
* Be careful here startup_64 needs to be at a predictable
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 23f315c9f215..3a19d04cebeb 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -139,6 +139,10 @@ static int lines, cols;
#include "../../../../lib/decompress_unlzma.c"
#endif
+#ifdef CONFIG_KERNEL_XZ
+#include "../../../../lib/decompress_unxz.c"
+#endif
+
#ifdef CONFIG_KERNEL_LZO
#include "../../../../lib/decompress_unlzo.c"
#endif
@@ -355,7 +359,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
if (heap > 0x3fffffffffffUL)
error("Destination address too large");
#else
- if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
+ if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
error("Destination address too large");
#endif
#ifndef CONFIG_RELOCATABLE
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index 5c228129d175..646aa78ba5fd 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -74,7 +74,7 @@ int main(int argc, char *argv[])
offs = (olen > ilen) ? olen - ilen : 0;
offs += olen >> 12; /* Add 8 bytes for each 32K block */
- offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */
+ offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */
offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
printf(".section \".rodata..compressed\",\"a\",@progbits\n");
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756a51c1..8fe2a4966b7a 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -9,6 +9,20 @@
* Vinodh Gopal <vinodh.gopal@intel.com>
* Kahraman Akdemir
*
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
+ * Aidan O'Mahony (aidan.o.mahony@intel.com)
+ * Adrian Hoban <adrian.hoban@intel.com>
+ * James Guilford (james.guilford@intel.com)
+ * Gabriele Paoloni <gabriele.paoloni@intel.com>
+ * Tadeusz Struk (tadeusz.struk@intel.com)
+ * Wajdi Feghali (wajdi.k.feghali@intel.com)
+ * Copyright (c) 2010, Intel Corporation.
+ *
+ * Ported x86_64 version to x86:
+ * Author: Mathias Krause <minipli@googlemail.com>
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -18,8 +32,62 @@
#include <linux/linkage.h>
#include <asm/inst.h>
+#ifdef __x86_64__
+.data
+POLY: .octa 0xC2000000000000000000000000000001
+TWOONE: .octa 0x00000001000000000000000000000001
+
+# order of these constants should not change.
+# more specifically, ALL_F should follow SHIFT_MASK,
+# and ZERO should follow ALL_F
+
+SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
+MASK1: .octa 0x0000000000000000ffffffffffffffff
+MASK2: .octa 0xffffffffffffffff0000000000000000
+SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
+ALL_F: .octa 0xffffffffffffffffffffffffffffffff
+ZERO: .octa 0x00000000000000000000000000000000
+ONE: .octa 0x00000000000000000000000000000001
+F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
+dec: .octa 0x1
+enc: .octa 0x2
+
+
.text
+
+#define STACK_OFFSET 8*3
+#define HashKey 16*0 // store HashKey <<1 mod poly here
+#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
+#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
+#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
+#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
+ // bits of HashKey <<1 mod poly here
+ //(for Karatsuba purposes)
+#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
+ // bits of HashKey^2 <<1 mod poly here
+ // (for Karatsuba purposes)
+#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
+ // bits of HashKey^3 <<1 mod poly here
+ // (for Karatsuba purposes)
+#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
+ // bits of HashKey^4 <<1 mod poly here
+ // (for Karatsuba purposes)
+#define VARIABLE_OFFSET 16*8
+
+#define arg1 rdi
+#define arg2 rsi
+#define arg3 rdx
+#define arg4 rcx
+#define arg5 r8
+#define arg6 r9
+#define arg7 STACK_OFFSET+8(%r14)
+#define arg8 STACK_OFFSET+16(%r14)
+#define arg9 STACK_OFFSET+24(%r14)
+#define arg10 STACK_OFFSET+32(%r14)
+#endif
+
+
#define STATE1 %xmm0
#define STATE2 %xmm4
#define STATE3 %xmm5
@@ -32,12 +100,16 @@
#define IN IN1
#define KEY %xmm2
#define IV %xmm3
+
#define BSWAP_MASK %xmm10
#define CTR %xmm11
#define INC %xmm12
+#ifdef __x86_64__
+#define AREG %rax
#define KEYP %rdi
#define OUTP %rsi
+#define UKEYP OUTP
#define INP %rdx
#define LEN %rcx
#define IVP %r8
@@ -46,6 +118,1588 @@
#define TKEYP T1
#define T2 %r11
#define TCTR_LOW T2
+#else
+#define AREG %eax
+#define KEYP %edi
+#define OUTP AREG
+#define UKEYP OUTP
+#define INP %edx
+#define LEN %esi
+#define IVP %ebp
+#define KLEN %ebx
+#define T1 %ecx
+#define TKEYP T1
+#endif
+
+
+#ifdef __x86_64__
+/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
+*
+*
+* Input: A and B (128-bits each, bit-reflected)
+* Output: C = A*B*x mod poly, (i.e. >>1 )
+* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
+* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
+*
+*/
+.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
+ movdqa \GH, \TMP1
+ pshufd $78, \GH, \TMP2
+ pshufd $78, \HK, \TMP3
+ pxor \GH, \TMP2 # TMP2 = a1+a0
+ pxor \HK, \TMP3 # TMP3 = b1+b0
+ PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
+ PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
+ pxor \GH, \TMP2
+ pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0)
+ movdqa \TMP2, \TMP3
+ pslldq $8, \TMP3 # left shift TMP3 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP3, \GH
+ pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK
+
+ # first phase of the reduction
+
+ movdqa \GH, \TMP2
+ movdqa \GH, \TMP3
+ movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
+ # in in order to perform
+ # independent shifts
+ pslld $31, \TMP2 # packed right shift <<31
+ pslld $30, \TMP3 # packed right shift <<30
+ pslld $25, \TMP4 # packed right shift <<25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP5
+ psrldq $4, \TMP5 # right shift TMP5 1 DW
+ pslldq $12, \TMP2 # left shift TMP2 3 DWs
+ pxor \TMP2, \GH
+
+ # second phase of the reduction
+
+ movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
+ # in in order to perform
+ # independent shifts
+ movdqa \GH,\TMP3
+ movdqa \GH,\TMP4
+ psrld $1,\TMP2 # packed left shift >>1
+ psrld $2,\TMP3 # packed left shift >>2
+ psrld $7,\TMP4 # packed left shift >>7
+ pxor \TMP3,\TMP2 # xor the shifted versions
+ pxor \TMP4,\TMP2
+ pxor \TMP5, \TMP2
+ pxor \TMP2, \GH
+ pxor \TMP1, \GH # result is in TMP1
+.endm
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on
+* the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
+* are clobbered
+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
+*/
+
+
+.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+ mov arg7, %r10 # %r10 = AAD
+ mov arg8, %r12 # %r12 = aadLen
+ mov %r12, %r11
+ pxor %xmm\i, %xmm\i
+_get_AAD_loop\num_initial_blocks\operation:
+ movd (%r10), \TMP1
+ pslldq $12, \TMP1
+ psrldq $4, %xmm\i
+ pxor \TMP1, %xmm\i
+ add $4, %r10
+ sub $4, %r12
+ jne _get_AAD_loop\num_initial_blocks\operation
+ cmp $16, %r11
+ je _get_AAD_loop2_done\num_initial_blocks\operation
+ mov $16, %r12
+_get_AAD_loop2\num_initial_blocks\operation:
+ psrldq $4, %xmm\i
+ sub $4, %r12
+ cmp %r11, %r12
+ jne _get_AAD_loop2\num_initial_blocks\operation
+_get_AAD_loop2_done\num_initial_blocks\operation:
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+
+ xor %r11, %r11 # initialise the data pointer offset as zero
+
+ # start AES for num_initial_blocks blocks
+
+ mov %arg5, %rax # %rax = *Y0
+ movdqu (%rax), \XMM0 # XMM0 = Y0
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM0
+
+.if (\i == 5) || (\i == 6) || (\i == 7)
+.irpc index, \i_seq
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, %xmm\index
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
+
+.endr
+.irpc index, \i_seq
+ pxor 16*0(%arg1), %xmm\index
+.endr
+.irpc index, \i_seq
+ movaps 0x10(%rdi), \TMP1
+ AESENC \TMP1, %xmm\index # Round 1
+.endr
+.irpc index, \i_seq
+ movaps 0x20(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0x30(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0x40(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0x50(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0x60(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0x70(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0x80(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0x90(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0xa0(%arg1), \TMP1
+ AESENCLAST \TMP1, %xmm\index # Round 10
+.endr
+.irpc index, \i_seq
+ movdqu (%arg3 , %r11, 1), \TMP1
+ pxor \TMP1, %xmm\index
+ movdqu %xmm\index, (%arg2 , %r11, 1)
+ # write back plaintext/ciphertext for num_initial_blocks
+ add $16, %r11
+
+ movdqa \TMP1, %xmm\index
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, %xmm\index
+
+ # prepare plaintext/ciphertext for GHASH computation
+.endr
+.endif
+ GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+ pxor %xmm5, %xmm6
+ GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm6, %xmm7
+ GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+ pxor %xmm6, %xmm7
+ GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+ cmp $64, %r13
+ jl _initial_blocks_done\num_initial_blocks\operation
+ # no need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+*/
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, \XMM1
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, \XMM2
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, \XMM3
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, \XMM4
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+ pxor 16*0(%arg1), \XMM1
+ pxor 16*0(%arg1), \XMM2
+ pxor 16*0(%arg1), \XMM3
+ pxor 16*0(%arg1), \XMM4
+ movdqa \TMP3, \TMP5
+ pshufd $78, \TMP3, \TMP1
+ pxor \TMP3, \TMP1
+ movdqa \TMP1, HashKey_k(%rsp)
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^2<<1 (mod poly)
+ movdqa \TMP5, HashKey_2(%rsp)
+# HashKey_2 = HashKey^2<<1 (mod poly)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqa \TMP1, HashKey_2_k(%rsp)
+.irpc index, 1234 # do 4 rounds
+ movaps 0x10*\index(%arg1), \TMP1
+ AESENC \TMP1, \XMM1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+.endr
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+ movdqa \TMP5, HashKey_3(%rsp)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqa \TMP1, HashKey_3_k(%rsp)
+.irpc index, 56789 # do next 5 rounds
+ movaps 0x10*\index(%arg1), \TMP1
+ AESENC \TMP1, \XMM1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+.endr
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+ movdqa \TMP5, HashKey_4(%rsp)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqa \TMP1, HashKey_4_k(%rsp)
+ movaps 0xa0(%arg1), \TMP2
+ AESENCLAST \TMP2, \XMM1
+ AESENCLAST \TMP2, \XMM2
+ AESENCLAST \TMP2, \XMM3
+ AESENCLAST \TMP2, \XMM4
+ movdqu 16*0(%arg3 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM1
+ movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
+ movdqa \TMP1, \XMM1
+ movdqu 16*1(%arg3 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM2
+ movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
+ movdqa \TMP1, \XMM2
+ movdqu 16*2(%arg3 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM3
+ movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
+ movdqa \TMP1, \XMM3
+ movdqu 16*3(%arg3 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM4
+ movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
+ movdqa \TMP1, \XMM4
+ add $64, %r11
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+ pxor \XMMDst, \XMM1
+# combine GHASHed value with the corresponding ciphertext
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+_initial_blocks_done\num_initial_blocks\operation:
+
+.endm
+
+
+/*
+* if a = number of total plaintext bytes
+* b = floor(a/16)
+* num_initial_blocks = b mod 4
+* encrypt the initial num_initial_blocks blocks and apply ghash on
+* the ciphertext
+* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
+* are clobbered
+* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified
+*/
+
+
+.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
+XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
+ mov arg7, %r10 # %r10 = AAD
+ mov arg8, %r12 # %r12 = aadLen
+ mov %r12, %r11
+ pxor %xmm\i, %xmm\i
+_get_AAD_loop\num_initial_blocks\operation:
+ movd (%r10), \TMP1
+ pslldq $12, \TMP1
+ psrldq $4, %xmm\i
+ pxor \TMP1, %xmm\i
+ add $4, %r10
+ sub $4, %r12
+ jne _get_AAD_loop\num_initial_blocks\operation
+ cmp $16, %r11
+ je _get_AAD_loop2_done\num_initial_blocks\operation
+ mov $16, %r12
+_get_AAD_loop2\num_initial_blocks\operation:
+ psrldq $4, %xmm\i
+ sub $4, %r12
+ cmp %r11, %r12
+ jne _get_AAD_loop2\num_initial_blocks\operation
+_get_AAD_loop2_done\num_initial_blocks\operation:
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
+
+ xor %r11, %r11 # initialise the data pointer offset as zero
+
+ # start AES for num_initial_blocks blocks
+
+ mov %arg5, %rax # %rax = *Y0
+ movdqu (%rax), \XMM0 # XMM0 = Y0
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM0
+
+.if (\i == 5) || (\i == 6) || (\i == 7)
+.irpc index, \i_seq
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, %xmm\index
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
+
+.endr
+.irpc index, \i_seq
+ pxor 16*0(%arg1), %xmm\index
+.endr
+.irpc index, \i_seq
+ movaps 0x10(%rdi), \TMP1
+ AESENC \TMP1, %xmm\index # Round 1
+.endr
+.irpc index, \i_seq
+ movaps 0x20(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0x30(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0x40(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0x50(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0x60(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0x70(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0x80(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0x90(%arg1), \TMP1
+ AESENC \TMP1, %xmm\index # Round 2
+.endr
+.irpc index, \i_seq
+ movaps 0xa0(%arg1), \TMP1
+ AESENCLAST \TMP1, %xmm\index # Round 10
+.endr
+.irpc index, \i_seq
+ movdqu (%arg3 , %r11, 1), \TMP1
+ pxor \TMP1, %xmm\index
+ movdqu %xmm\index, (%arg2 , %r11, 1)
+ # write back plaintext/ciphertext for num_initial_blocks
+ add $16, %r11
+
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, %xmm\index
+
+ # prepare plaintext/ciphertext for GHASH computation
+.endr
+.endif
+ GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ # apply GHASH on num_initial_blocks blocks
+
+.if \i == 5
+ pxor %xmm5, %xmm6
+ GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm6, %xmm7
+ GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 6
+ pxor %xmm6, %xmm7
+ GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.elseif \i == 7
+ pxor %xmm7, %xmm8
+ GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
+.endif
+ cmp $64, %r13
+ jl _initial_blocks_done\num_initial_blocks\operation
+ # no need for precomputed values
+/*
+*
+* Precomputations for HashKey parallel with encryption of first 4 blocks.
+* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
+*/
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, \XMM1
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, \XMM2
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, \XMM3
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+
+ paddd ONE(%rip), \XMM0 # INCR Y0
+ movdqa \XMM0, \XMM4
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+ pxor 16*0(%arg1), \XMM1
+ pxor 16*0(%arg1), \XMM2
+ pxor 16*0(%arg1), \XMM3
+ pxor 16*0(%arg1), \XMM4
+ movdqa \TMP3, \TMP5
+ pshufd $78, \TMP3, \TMP1
+ pxor \TMP3, \TMP1
+ movdqa \TMP1, HashKey_k(%rsp)
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^2<<1 (mod poly)
+ movdqa \TMP5, HashKey_2(%rsp)
+# HashKey_2 = HashKey^2<<1 (mod poly)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqa \TMP1, HashKey_2_k(%rsp)
+.irpc index, 1234 # do 4 rounds
+ movaps 0x10*\index(%arg1), \TMP1
+ AESENC \TMP1, \XMM1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+.endr
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+ movdqa \TMP5, HashKey_3(%rsp)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqa \TMP1, HashKey_3_k(%rsp)
+.irpc index, 56789 # do next 5 rounds
+ movaps 0x10*\index(%arg1), \TMP1
+ AESENC \TMP1, \XMM1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+.endr
+ GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
+# TMP5 = HashKey^3<<1 (mod poly)
+ movdqa \TMP5, HashKey_4(%rsp)
+ pshufd $78, \TMP5, \TMP1
+ pxor \TMP5, \TMP1
+ movdqa \TMP1, HashKey_4_k(%rsp)
+ movaps 0xa0(%arg1), \TMP2
+ AESENCLAST \TMP2, \XMM1
+ AESENCLAST \TMP2, \XMM2
+ AESENCLAST \TMP2, \XMM3
+ AESENCLAST \TMP2, \XMM4
+ movdqu 16*0(%arg3 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM1
+ movdqu 16*1(%arg3 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM2
+ movdqu 16*2(%arg3 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM3
+ movdqu 16*3(%arg3 , %r11 , 1), \TMP1
+ pxor \TMP1, \XMM4
+ movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
+ movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
+ movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
+ movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
+
+ add $64, %r11
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
+ pxor \XMMDst, \XMM1
+# combine GHASHed value with the corresponding ciphertext
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
+ movdqa SHUF_MASK(%rip), %xmm14
+ PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
+
+_initial_blocks_done\num_initial_blocks\operation:
+
+.endm
+
+/*
+* encrypt 4 blocks at a time
+* ghash the 4 previously encrypted ciphertext blocks
+* arg1, %arg2, %arg3 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+ movdqa \XMM1, \XMM5
+ movdqa \XMM2, \XMM6
+ movdqa \XMM3, \XMM7
+ movdqa \XMM4, \XMM8
+
+ movdqa SHUF_MASK(%rip), %xmm15
+ # multiply TMP5 * HashKey using karatsuba
+
+ movdqa \XMM5, \TMP4
+ pshufd $78, \XMM5, \TMP6
+ pxor \XMM5, \TMP6
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa HashKey_4(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
+ movdqa \XMM0, \XMM1
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM2
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM3
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM4
+ PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
+ PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
+ PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+
+ pxor (%arg1), \XMM1
+ pxor (%arg1), \XMM2
+ pxor (%arg1), \XMM3
+ pxor (%arg1), \XMM4
+ movdqa HashKey_4_k(%rsp), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
+ movaps 0x10(%arg1), \TMP1
+ AESENC \TMP1, \XMM1 # Round 1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+ movaps 0x20(%arg1), \TMP1
+ AESENC \TMP1, \XMM1 # Round 2
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+ movdqa \XMM6, \TMP1
+ pshufd $78, \XMM6, \TMP2
+ pxor \XMM6, \TMP2
+ movdqa HashKey_3(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
+ movaps 0x30(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 3
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
+ movaps 0x40(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 4
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ movdqa HashKey_3_k(%rsp), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movaps 0x50(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 5
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ pxor \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+ pxor \XMM6, \XMM5
+ pxor \TMP2, \TMP6
+ movdqa \XMM7, \TMP1
+ pshufd $78, \XMM7, \TMP2
+ pxor \XMM7, \TMP2
+ movdqa HashKey_2(%rsp ), \TMP5
+
+ # Multiply TMP5 * HashKey using karatsuba
+
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ movaps 0x60(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 6
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
+ movaps 0x70(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 7
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ movdqa HashKey_2_k(%rsp), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movaps 0x80(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 8
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ pxor \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+ pxor \XMM7, \XMM5
+ pxor \TMP2, \TMP6
+
+ # Multiply XMM8 * HashKey
+ # XMM8 and TMP5 hold the values for the two operands
+
+ movdqa \XMM8, \TMP1
+ pshufd $78, \XMM8, \TMP2
+ pxor \XMM8, \TMP2
+ movdqa HashKey(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ movaps 0x90(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 9
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
+ movaps 0xa0(%arg1), \TMP3
+ AESENCLAST \TMP3, \XMM1 # Round 10
+ AESENCLAST \TMP3, \XMM2
+ AESENCLAST \TMP3, \XMM3
+ AESENCLAST \TMP3, \XMM4
+ movdqa HashKey_k(%rsp), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movdqu (%arg3,%r11,1), \TMP3
+ pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
+ movdqu 16(%arg3,%r11,1), \TMP3
+ pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
+ movdqu 32(%arg3,%r11,1), \TMP3
+ pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
+ movdqu 48(%arg3,%r11,1), \TMP3
+ pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
+ movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
+ movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
+ movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
+ PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+
+ pxor \TMP4, \TMP1
+ pxor \XMM8, \XMM5
+ pxor \TMP6, \TMP2
+ pxor \TMP1, \TMP2
+ pxor \XMM5, \TMP2
+ movdqa \TMP2, \TMP3
+ pslldq $8, \TMP3 # left shift TMP3 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP3, \XMM5
+ pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
+
+ # first phase of reduction
+
+ movdqa \XMM5, \TMP2
+ movdqa \XMM5, \TMP3
+ movdqa \XMM5, \TMP4
+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+ pslld $31, \TMP2 # packed right shift << 31
+ pslld $30, \TMP3 # packed right shift << 30
+ pslld $25, \TMP4 # packed right shift << 25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP5
+ psrldq $4, \TMP5 # right shift T5 1 DW
+ pslldq $12, \TMP2 # left shift T2 3 DWs
+ pxor \TMP2, \XMM5
+
+ # second phase of reduction
+
+ movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+ movdqa \XMM5,\TMP3
+ movdqa \XMM5,\TMP4
+ psrld $1, \TMP2 # packed left shift >>1
+ psrld $2, \TMP3 # packed left shift >>2
+ psrld $7, \TMP4 # packed left shift >>7
+ pxor \TMP3,\TMP2 # xor the shifted versions
+ pxor \TMP4,\TMP2
+ pxor \TMP5, \TMP2
+ pxor \TMP2, \XMM5
+ pxor \TMP1, \XMM5 # result is in TMP1
+
+ pxor \XMM5, \XMM1
+.endm
+
+/*
+* decrypt 4 blocks at a time
+* ghash the 4 previously decrypted ciphertext blocks
+* arg1, %arg2, %arg3 are used as pointers only, not modified
+* %r11 is the data offset value
+*/
+.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
+TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
+
+ movdqa \XMM1, \XMM5
+ movdqa \XMM2, \XMM6
+ movdqa \XMM3, \XMM7
+ movdqa \XMM4, \XMM8
+
+ movdqa SHUF_MASK(%rip), %xmm15
+ # multiply TMP5 * HashKey using karatsuba
+
+ movdqa \XMM5, \TMP4
+ pshufd $78, \XMM5, \TMP6
+ pxor \XMM5, \TMP6
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa HashKey_4(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
+ movdqa \XMM0, \XMM1
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM2
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM3
+ paddd ONE(%rip), \XMM0 # INCR CNT
+ movdqa \XMM0, \XMM4
+ PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
+ PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
+ PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+
+ pxor (%arg1), \XMM1
+ pxor (%arg1), \XMM2
+ pxor (%arg1), \XMM3
+ pxor (%arg1), \XMM4
+ movdqa HashKey_4_k(%rsp), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
+ movaps 0x10(%arg1), \TMP1
+ AESENC \TMP1, \XMM1 # Round 1
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+ movaps 0x20(%arg1), \TMP1
+ AESENC \TMP1, \XMM1 # Round 2
+ AESENC \TMP1, \XMM2
+ AESENC \TMP1, \XMM3
+ AESENC \TMP1, \XMM4
+ movdqa \XMM6, \TMP1
+ pshufd $78, \XMM6, \TMP2
+ pxor \XMM6, \TMP2
+ movdqa HashKey_3(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
+ movaps 0x30(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 3
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
+ movaps 0x40(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 4
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ movdqa HashKey_3_k(%rsp), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movaps 0x50(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 5
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ pxor \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+ pxor \XMM6, \XMM5
+ pxor \TMP2, \TMP6
+ movdqa \XMM7, \TMP1
+ pshufd $78, \XMM7, \TMP2
+ pxor \XMM7, \TMP2
+ movdqa HashKey_2(%rsp ), \TMP5
+
+ # Multiply TMP5 * HashKey using karatsuba
+
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ movaps 0x60(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 6
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
+ movaps 0x70(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 7
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ movdqa HashKey_2_k(%rsp), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movaps 0x80(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 8
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ pxor \TMP1, \TMP4
+# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
+ pxor \XMM7, \XMM5
+ pxor \TMP2, \TMP6
+
+ # Multiply XMM8 * HashKey
+ # XMM8 and TMP5 hold the values for the two operands
+
+ movdqa \XMM8, \TMP1
+ pshufd $78, \XMM8, \TMP2
+ pxor \XMM8, \TMP2
+ movdqa HashKey(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ movaps 0x90(%arg1), \TMP3
+ AESENC \TMP3, \XMM1 # Round 9
+ AESENC \TMP3, \XMM2
+ AESENC \TMP3, \XMM3
+ AESENC \TMP3, \XMM4
+ PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
+ movaps 0xa0(%arg1), \TMP3
+ AESENCLAST \TMP3, \XMM1 # Round 10
+ AESENCLAST \TMP3, \XMM2
+ AESENCLAST \TMP3, \XMM3
+ AESENCLAST \TMP3, \XMM4
+ movdqa HashKey_k(%rsp), \TMP5
+ PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movdqu (%arg3,%r11,1), \TMP3
+ pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
+ movdqa \TMP3, \XMM1
+ movdqu 16(%arg3,%r11,1), \TMP3
+ pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
+ movdqa \TMP3, \XMM2
+ movdqu 32(%arg3,%r11,1), \TMP3
+ pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
+ movdqa \TMP3, \XMM3
+ movdqu 48(%arg3,%r11,1), \TMP3
+ pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
+ movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
+ movdqa \TMP3, \XMM4
+ PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
+ PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
+
+ pxor \TMP4, \TMP1
+ pxor \XMM8, \XMM5
+ pxor \TMP6, \TMP2
+ pxor \TMP1, \TMP2
+ pxor \XMM5, \TMP2
+ movdqa \TMP2, \TMP3
+ pslldq $8, \TMP3 # left shift TMP3 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP3, \XMM5
+ pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
+
+ # first phase of reduction
+
+ movdqa \XMM5, \TMP2
+ movdqa \XMM5, \TMP3
+ movdqa \XMM5, \TMP4
+# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
+ pslld $31, \TMP2 # packed right shift << 31
+ pslld $30, \TMP3 # packed right shift << 30
+ pslld $25, \TMP4 # packed right shift << 25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP5
+ psrldq $4, \TMP5 # right shift T5 1 DW
+ pslldq $12, \TMP2 # left shift T2 3 DWs
+ pxor \TMP2, \XMM5
+
+ # second phase of reduction
+
+ movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
+ movdqa \XMM5,\TMP3
+ movdqa \XMM5,\TMP4
+ psrld $1, \TMP2 # packed left shift >>1
+ psrld $2, \TMP3 # packed left shift >>2
+ psrld $7, \TMP4 # packed left shift >>7
+ pxor \TMP3,\TMP2 # xor the shifted versions
+ pxor \TMP4,\TMP2
+ pxor \TMP5, \TMP2
+ pxor \TMP2, \XMM5
+ pxor \TMP1, \XMM5 # result is in TMP1
+
+ pxor \XMM5, \XMM1
+.endm
+
+/* GHASH the last 4 ciphertext blocks. */
+.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
+TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
+
+ # Multiply TMP6 * HashKey (using Karatsuba)
+
+ movdqa \XMM1, \TMP6
+ pshufd $78, \XMM1, \TMP2
+ pxor \XMM1, \TMP2
+ movdqa HashKey_4(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
+ PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
+ movdqa HashKey_4_k(%rsp), \TMP4
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ movdqa \XMM1, \XMMDst
+ movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
+
+ # Multiply TMP1 * HashKey (using Karatsuba)
+
+ movdqa \XMM2, \TMP1
+ pshufd $78, \XMM2, \TMP2
+ pxor \XMM2, \TMP2
+ movdqa HashKey_3(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
+ movdqa HashKey_3_k(%rsp), \TMP4
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pxor \TMP1, \TMP6
+ pxor \XMM2, \XMMDst
+ pxor \TMP2, \XMM1
+# results accumulated in TMP6, XMMDst, XMM1
+
+ # Multiply TMP1 * HashKey (using Karatsuba)
+
+ movdqa \XMM3, \TMP1
+ pshufd $78, \XMM3, \TMP2
+ pxor \XMM3, \TMP2
+ movdqa HashKey_2(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
+ movdqa HashKey_2_k(%rsp), \TMP4
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pxor \TMP1, \TMP6
+ pxor \XMM3, \XMMDst
+ pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
+
+ # Multiply TMP1 * HashKey (using Karatsuba)
+ movdqa \XMM4, \TMP1
+ pshufd $78, \XMM4, \TMP2
+ pxor \XMM4, \TMP2
+ movdqa HashKey(%rsp), \TMP5
+ PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
+ PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
+ movdqa HashKey_k(%rsp), \TMP4
+ PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
+ pxor \TMP1, \TMP6
+ pxor \XMM4, \XMMDst
+ pxor \XMM1, \TMP2
+ pxor \TMP6, \TMP2
+ pxor \XMMDst, \TMP2
+ # middle section of the temp results combined as in karatsuba algorithm
+ movdqa \TMP2, \TMP4
+ pslldq $8, \TMP4 # left shift TMP4 2 DWs
+ psrldq $8, \TMP2 # right shift TMP2 2 DWs
+ pxor \TMP4, \XMMDst
+ pxor \TMP2, \TMP6
+# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
+ # first phase of the reduction
+ movdqa \XMMDst, \TMP2
+ movdqa \XMMDst, \TMP3
+ movdqa \XMMDst, \TMP4
+# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
+ pslld $31, \TMP2 # packed right shifting << 31
+ pslld $30, \TMP3 # packed right shifting << 30
+ pslld $25, \TMP4 # packed right shifting << 25
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ movdqa \TMP2, \TMP7
+ psrldq $4, \TMP7 # right shift TMP7 1 DW
+ pslldq $12, \TMP2 # left shift TMP2 3 DWs
+ pxor \TMP2, \XMMDst
+
+ # second phase of the reduction
+ movdqa \XMMDst, \TMP2
+ # make 3 copies of XMMDst for doing 3 shift operations
+ movdqa \XMMDst, \TMP3
+ movdqa \XMMDst, \TMP4
+ psrld $1, \TMP2 # packed left shift >> 1
+ psrld $2, \TMP3 # packed left shift >> 2
+ psrld $7, \TMP4 # packed left shift >> 7
+ pxor \TMP3, \TMP2 # xor the shifted versions
+ pxor \TMP4, \TMP2
+ pxor \TMP7, \TMP2
+ pxor \TMP2, \XMMDst
+ pxor \TMP6, \XMMDst # reduced result is in XMMDst
+.endm
+
+/* Encryption of a single block done*/
+.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
+
+ pxor (%arg1), \XMM0
+ movaps 16(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 32(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 48(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 64(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 80(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 96(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 112(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 128(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 144(%arg1), \TMP1
+ AESENC \TMP1, \XMM0
+ movaps 160(%arg1), \TMP1
+ AESENCLAST \TMP1, \XMM0
+.endm
+
+
+/*****************************************************************************
+* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
+* u8 *out, // Plaintext output. Encrypt in-place is allowed.
+* const u8 *in, // Ciphertext input
+* u64 plaintext_len, // Length of data in bytes for decryption.
+* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
+* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+* // concatenated with 0x00000001. 16-byte aligned pointer.
+* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
+* const u8 *aad, // Additional Authentication Data (AAD)
+* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
+* // given authentication tag and only return the plaintext if they match.
+* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
+* // (most likely), 12 or 8.
+*
+* Assumptions:
+*
+* keys:
+* keys are pre-expanded and aligned to 16 bytes. we are using the first
+* set of 11 keys in the data structure void *aes_ctx
+*
+* iv:
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | Salt (From the SA) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | Initialization Vector |
+* | (This is the sequence number from IPSec header) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x1 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+* AAD padded to 128 bits with 0
+* for example, assume AAD is a u32 vector
+*
+* if AAD is 8 bytes:
+* AAD[3] = {A0, A1};
+* padded AAD in xmm register = {A1 A0 0 0}
+*
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | SPI (A1) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 32-bit Sequence Number (A0) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x0 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+* AAD Format with 32-bit Sequence Number
+*
+* if AAD is 12 bytes:
+* AAD[3] = {A0, A1, A2};
+* padded AAD in xmm register = {A2 A1 A0 0}
+*
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | SPI (A2) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 64-bit Extended Sequence Number {A1,A0} |
+* | |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x0 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+* AAD Format with 64-bit Extended Sequence Number
+*
+* aadLen:
+* from the definition of the spec, aadLen can only be 8 or 12 bytes.
+* The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+* For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+*
+*****************************************************************************/
+
+ENTRY(aesni_gcm_dec)
+ push %r12
+ push %r13
+ push %r14
+ mov %rsp, %r14
+/*
+* states of %xmm registers %xmm6:%xmm15 not saved
+* all %xmm registers are clobbered
+*/
+ sub $VARIABLE_OFFSET, %rsp
+ and $~63, %rsp # align rsp to 64 bytes
+ mov %arg6, %r12
+ movdqu (%r12), %xmm13 # %xmm13 = HashKey
+ movdqa SHUF_MASK(%rip), %xmm2
+ PSHUFB_XMM %xmm2, %xmm13
+
+
+# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
+
+ movdqa %xmm13, %xmm2
+ psllq $1, %xmm13
+ psrlq $63, %xmm2
+ movdqa %xmm2, %xmm1
+ pslldq $8, %xmm2
+ psrldq $8, %xmm1
+ por %xmm2, %xmm13
+
+ # Reduction
+
+ pshufd $0x24, %xmm1, %xmm2
+ pcmpeqd TWOONE(%rip), %xmm2
+ pand POLY(%rip), %xmm2
+ pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
+
+
+ # Decrypt first few blocks
+
+ movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
+ mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
+ and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
+ mov %r13, %r12
+ and $(3<<4), %r12
+ jz _initial_num_blocks_is_0_decrypt
+ cmp $(2<<4), %r12
+ jb _initial_num_blocks_is_1_decrypt
+ je _initial_num_blocks_is_2_decrypt
+_initial_num_blocks_is_3_decrypt:
+ INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
+ sub $48, %r13
+ jmp _initial_blocks_decrypted
+_initial_num_blocks_is_2_decrypt:
+ INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
+ sub $32, %r13
+ jmp _initial_blocks_decrypted
+_initial_num_blocks_is_1_decrypt:
+ INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
+ sub $16, %r13
+ jmp _initial_blocks_decrypted
+_initial_num_blocks_is_0_decrypt:
+ INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
+_initial_blocks_decrypted:
+ cmp $0, %r13
+ je _zero_cipher_left_decrypt
+ sub $64, %r13
+ je _four_cipher_left_decrypt
+_decrypt_by_4:
+ GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
+%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
+ add $64, %r11
+ sub $64, %r13
+ jne _decrypt_by_4
+_four_cipher_left_decrypt:
+ GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_decrypt:
+ mov %arg4, %r13
+ and $15, %r13 # %r13 = arg4 (mod 16)
+ je _multiple_of_16_bytes_decrypt
+
+ # Handle the last <16 byte block seperately
+
+ paddd ONE(%rip), %xmm0 # increment CNT to get Yn
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10, %xmm0
+
+ ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
+ sub $16, %r11
+ add %r13, %r11
+ movdqu (%arg3,%r11,1), %xmm1 # recieve the last <16 byte block
+ lea SHIFT_MASK+16(%rip), %r12
+ sub %r13, %r12
+# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
+# (%r13 is the number of bytes in plaintext mod 16)
+ movdqu (%r12), %xmm2 # get the appropriate shuffle mask
+ PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 butes
+
+ movdqa %xmm1, %xmm2
+ pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
+ movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
+ pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
+ pand %xmm1, %xmm2
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10 ,%xmm2
+
+ pxor %xmm2, %xmm8
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+ # GHASH computation for the last <16 byte block
+ sub %r13, %r11
+ add $16, %r11
+
+ # output %r13 bytes
+ MOVQ_R64_XMM %xmm0, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left_decrypt
+ mov %rax, (%arg2 , %r11, 1)
+ add $8, %r11
+ psrldq $8, %xmm0
+ MOVQ_R64_XMM %xmm0, %rax
+ sub $8, %r13
+_less_than_8_bytes_left_decrypt:
+ mov %al, (%arg2, %r11, 1)
+ add $1, %r11
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left_decrypt
+_multiple_of_16_bytes_decrypt:
+ mov arg8, %r12 # %r13 = aadLen (number of bytes)
+ shl $3, %r12 # convert into number of bits
+ movd %r12d, %xmm15 # len(A) in %xmm15
+ shl $3, %arg4 # len(C) in bits (*128)
+ MOVQ_R64_XMM %arg4, %xmm1
+ pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
+ pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
+ pxor %xmm15, %xmm8
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+ # final GHASH computation
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10, %xmm8
+
+ mov %arg5, %rax # %rax = *Y0
+ movdqu (%rax), %xmm0 # %xmm0 = Y0
+ ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
+ pxor %xmm8, %xmm0
+_return_T_decrypt:
+ mov arg9, %r10 # %r10 = authTag
+ mov arg10, %r11 # %r11 = auth_tag_len
+ cmp $16, %r11
+ je _T_16_decrypt
+ cmp $12, %r11
+ je _T_12_decrypt
+_T_8_decrypt:
+ MOVQ_R64_XMM %xmm0, %rax
+ mov %rax, (%r10)
+ jmp _return_T_done_decrypt
+_T_12_decrypt:
+ MOVQ_R64_XMM %xmm0, %rax
+ mov %rax, (%r10)
+ psrldq $8, %xmm0
+ movd %xmm0, %eax
+ mov %eax, 8(%r10)
+ jmp _return_T_done_decrypt
+_T_16_decrypt:
+ movdqu %xmm0, (%r10)
+_return_T_done_decrypt:
+ mov %r14, %rsp
+ pop %r14
+ pop %r13
+ pop %r12
+ ret
+
+
+/*****************************************************************************
+* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
+* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
+* const u8 *in, // Plaintext input
+* u64 plaintext_len, // Length of data in bytes for encryption.
+* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
+* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
+* // concatenated with 0x00000001. 16-byte aligned pointer.
+* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
+* const u8 *aad, // Additional Authentication Data (AAD)
+* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
+* u8 *auth_tag, // Authenticated Tag output.
+* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
+* // 12 or 8.
+*
+* Assumptions:
+*
+* keys:
+* keys are pre-expanded and aligned to 16 bytes. we are using the
+* first set of 11 keys in the data structure void *aes_ctx
+*
+*
+* iv:
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | Salt (From the SA) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | Initialization Vector |
+* | (This is the sequence number from IPSec header) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x1 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+*
+*
+* AAD:
+* AAD padded to 128 bits with 0
+* for example, assume AAD is a u32 vector
+*
+* if AAD is 8 bytes:
+* AAD[3] = {A0, A1};
+* padded AAD in xmm register = {A1 A0 0 0}
+*
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | SPI (A1) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 32-bit Sequence Number (A0) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x0 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+* AAD Format with 32-bit Sequence Number
+*
+* if AAD is 12 bytes:
+* AAD[3] = {A0, A1, A2};
+* padded AAD in xmm register = {A2 A1 A0 0}
+*
+* 0 1 2 3
+* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | SPI (A2) |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 64-bit Extended Sequence Number {A1,A0} |
+* | |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+* | 0x0 |
+* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*
+* AAD Format with 64-bit Extended Sequence Number
+*
+* aadLen:
+* from the definition of the spec, aadLen can only be 8 or 12 bytes.
+* The code supports 16 too but for other sizes, the code will fail.
+*
+* TLen:
+* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
+* For other sizes, the code will fail.
+*
+* poly = x^128 + x^127 + x^126 + x^121 + 1
+***************************************************************************/
+ENTRY(aesni_gcm_enc)
+ push %r12
+ push %r13
+ push %r14
+ mov %rsp, %r14
+#
+# states of %xmm registers %xmm6:%xmm15 not saved
+# all %xmm registers are clobbered
+#
+ sub $VARIABLE_OFFSET, %rsp
+ and $~63, %rsp
+ mov %arg6, %r12
+ movdqu (%r12), %xmm13
+ movdqa SHUF_MASK(%rip), %xmm2
+ PSHUFB_XMM %xmm2, %xmm13
+
+
+# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
+
+ movdqa %xmm13, %xmm2
+ psllq $1, %xmm13
+ psrlq $63, %xmm2
+ movdqa %xmm2, %xmm1
+ pslldq $8, %xmm2
+ psrldq $8, %xmm1
+ por %xmm2, %xmm13
+
+ # reduce HashKey<<1
+
+ pshufd $0x24, %xmm1, %xmm2
+ pcmpeqd TWOONE(%rip), %xmm2
+ pand POLY(%rip), %xmm2
+ pxor %xmm2, %xmm13
+ movdqa %xmm13, HashKey(%rsp)
+ mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly)
+ and $-16, %r13
+ mov %r13, %r12
+
+ # Encrypt first few blocks
+
+ and $(3<<4), %r12
+ jz _initial_num_blocks_is_0_encrypt
+ cmp $(2<<4), %r12
+ jb _initial_num_blocks_is_1_encrypt
+ je _initial_num_blocks_is_2_encrypt
+_initial_num_blocks_is_3_encrypt:
+ INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
+ sub $48, %r13
+ jmp _initial_blocks_encrypted
+_initial_num_blocks_is_2_encrypt:
+ INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
+ sub $32, %r13
+ jmp _initial_blocks_encrypted
+_initial_num_blocks_is_1_encrypt:
+ INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
+ sub $16, %r13
+ jmp _initial_blocks_encrypted
+_initial_num_blocks_is_0_encrypt:
+ INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
+%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
+_initial_blocks_encrypted:
+
+ # Main loop - Encrypt remaining blocks
+
+ cmp $0, %r13
+ je _zero_cipher_left_encrypt
+ sub $64, %r13
+ je _four_cipher_left_encrypt
+_encrypt_by_4_encrypt:
+ GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
+%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
+ add $64, %r11
+ sub $64, %r13
+ jne _encrypt_by_4_encrypt
+_four_cipher_left_encrypt:
+ GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
+%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
+_zero_cipher_left_encrypt:
+ mov %arg4, %r13
+ and $15, %r13 # %r13 = arg4 (mod 16)
+ je _multiple_of_16_bytes_encrypt
+
+ # Handle the last <16 Byte block seperately
+ paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10, %xmm0
+
+ ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
+ sub $16, %r11
+ add %r13, %r11
+ movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
+ lea SHIFT_MASK+16(%rip), %r12
+ sub %r13, %r12
+ # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+ # (%r13 is the number of bytes in plaintext mod 16)
+ movdqu (%r12), %xmm2 # get the appropriate shuffle mask
+ PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte
+ pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
+ movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ # get the appropriate mask to mask out top 16-r13 bytes of xmm0
+ pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10,%xmm0
+
+ pxor %xmm0, %xmm8
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+ # GHASH computation for the last <16 byte block
+ sub %r13, %r11
+ add $16, %r11
+ PSHUFB_XMM %xmm10, %xmm1
+
+ # shuffle xmm0 back to output as ciphertext
+
+ # Output %r13 bytes
+ MOVQ_R64_XMM %xmm0, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left_encrypt
+ mov %rax, (%arg2 , %r11, 1)
+ add $8, %r11
+ psrldq $8, %xmm0
+ MOVQ_R64_XMM %xmm0, %rax
+ sub $8, %r13
+_less_than_8_bytes_left_encrypt:
+ mov %al, (%arg2, %r11, 1)
+ add $1, %r11
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left_encrypt
+_multiple_of_16_bytes_encrypt:
+ mov arg8, %r12 # %r12 = addLen (number of bytes)
+ shl $3, %r12
+ movd %r12d, %xmm15 # len(A) in %xmm15
+ shl $3, %arg4 # len(C) in bits (*128)
+ MOVQ_R64_XMM %arg4, %xmm1
+ pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
+ pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
+ pxor %xmm15, %xmm8
+ GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
+ # final GHASH computation
+ movdqa SHUF_MASK(%rip), %xmm10
+ PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
+
+ mov %arg5, %rax # %rax = *Y0
+ movdqu (%rax), %xmm0 # %xmm0 = Y0
+ ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
+ pxor %xmm8, %xmm0
+_return_T_encrypt:
+ mov arg9, %r10 # %r10 = authTag
+ mov arg10, %r11 # %r11 = auth_tag_len
+ cmp $16, %r11
+ je _T_16_encrypt
+ cmp $12, %r11
+ je _T_12_encrypt
+_T_8_encrypt:
+ MOVQ_R64_XMM %xmm0, %rax
+ mov %rax, (%r10)
+ jmp _return_T_done_encrypt
+_T_12_encrypt:
+ MOVQ_R64_XMM %xmm0, %rax
+ mov %rax, (%r10)
+ psrldq $8, %xmm0
+ movd %xmm0, %eax
+ mov %eax, 8(%r10)
+ jmp _return_T_done_encrypt
+_T_16_encrypt:
+ movdqu %xmm0, (%r10)
+_return_T_done_encrypt:
+ mov %r14, %rsp
+ pop %r14
+ pop %r13
+ pop %r12
+ ret
+
+#endif
+
_key_expansion_128:
_key_expansion_256a:
@@ -55,10 +1709,11 @@ _key_expansion_256a:
shufps $0b10001100, %xmm0, %xmm4
pxor %xmm4, %xmm0
pxor %xmm1, %xmm0
- movaps %xmm0, (%rcx)
- add $0x10, %rcx
+ movaps %xmm0, (TKEYP)
+ add $0x10, TKEYP
ret
+.align 4
_key_expansion_192a:
pshufd $0b01010101, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
@@ -76,12 +1731,13 @@ _key_expansion_192a:
movaps %xmm0, %xmm1
shufps $0b01000100, %xmm0, %xmm6
- movaps %xmm6, (%rcx)
+ movaps %xmm6, (TKEYP)
shufps $0b01001110, %xmm2, %xmm1
- movaps %xmm1, 16(%rcx)
- add $0x20, %rcx
+ movaps %xmm1, 0x10(TKEYP)
+ add $0x20, TKEYP
ret
+.align 4
_key_expansion_192b:
pshufd $0b01010101, %xmm1, %xmm1
shufps $0b00010000, %xmm0, %xmm4
@@ -96,10 +1752,11 @@ _key_expansion_192b:
pxor %xmm3, %xmm2
pxor %xmm5, %xmm2
- movaps %xmm0, (%rcx)
- add $0x10, %rcx
+ movaps %xmm0, (TKEYP)
+ add $0x10, TKEYP
ret
+.align 4
_key_expansion_256b:
pshufd $0b10101010, %xmm1, %xmm1
shufps $0b00010000, %xmm2, %xmm4
@@ -107,8 +1764,8 @@ _key_expansion_256b:
shufps $0b10001100, %xmm2, %xmm4
pxor %xmm4, %xmm2
pxor %xmm1, %xmm2
- movaps %xmm2, (%rcx)
- add $0x10, %rcx
+ movaps %xmm2, (TKEYP)
+ add $0x10, TKEYP
ret
/*
@@ -116,17 +1773,23 @@ _key_expansion_256b:
* unsigned int key_len)
*/
ENTRY(aesni_set_key)
- movups (%rsi), %xmm0 # user key (first 16 bytes)
- movaps %xmm0, (%rdi)
- lea 0x10(%rdi), %rcx # key addr
- movl %edx, 480(%rdi)
+#ifndef __x86_64__
+ pushl KEYP
+ movl 8(%esp), KEYP # ctx
+ movl 12(%esp), UKEYP # in_key
+ movl 16(%esp), %edx # key_len
+#endif
+ movups (UKEYP), %xmm0 # user key (first 16 bytes)
+ movaps %xmm0, (KEYP)
+ lea 0x10(KEYP), TKEYP # key addr
+ movl %edx, 480(KEYP)
pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
cmp $24, %dl
jb .Lenc_key128
je .Lenc_key192
- movups 0x10(%rsi), %xmm2 # other user key
- movaps %xmm2, (%rcx)
- add $0x10, %rcx
+ movups 0x10(UKEYP), %xmm2 # other user key
+ movaps %xmm2, (TKEYP)
+ add $0x10, TKEYP
AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
call _key_expansion_256a
AESKEYGENASSIST 0x1 %xmm0 %xmm1
@@ -155,7 +1818,7 @@ ENTRY(aesni_set_key)
call _key_expansion_256a
jmp .Ldec_key
.Lenc_key192:
- movq 0x10(%rsi), %xmm2 # other user key
+ movq 0x10(UKEYP), %xmm2 # other user key
AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
call _key_expansion_192a
AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
@@ -195,33 +1858,47 @@ ENTRY(aesni_set_key)
AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
call _key_expansion_128
.Ldec_key:
- sub $0x10, %rcx
- movaps (%rdi), %xmm0
- movaps (%rcx), %xmm1
- movaps %xmm0, 240(%rcx)
- movaps %xmm1, 240(%rdi)
- add $0x10, %rdi
- lea 240-16(%rcx), %rsi
+ sub $0x10, TKEYP
+ movaps (KEYP), %xmm0
+ movaps (TKEYP), %xmm1
+ movaps %xmm0, 240(TKEYP)
+ movaps %xmm1, 240(KEYP)
+ add $0x10, KEYP
+ lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
- movaps (%rdi), %xmm0
+ movaps (KEYP), %xmm0
AESIMC %xmm0 %xmm1
- movaps %xmm1, (%rsi)
- add $0x10, %rdi
- sub $0x10, %rsi
- cmp %rcx, %rdi
+ movaps %xmm1, (UKEYP)
+ add $0x10, KEYP
+ sub $0x10, UKEYP
+ cmp TKEYP, KEYP
jb .Ldec_key_loop
- xor %rax, %rax
+ xor AREG, AREG
+#ifndef __x86_64__
+ popl KEYP
+#endif
ret
/*
* void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
*/
ENTRY(aesni_enc)
+#ifndef __x86_64__
+ pushl KEYP
+ pushl KLEN
+ movl 12(%esp), KEYP
+ movl 16(%esp), OUTP
+ movl 20(%esp), INP
+#endif
movl 480(KEYP), KLEN # key length
movups (INP), STATE # input
call _aesni_enc1
movups STATE, (OUTP) # output
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+#endif
ret
/*
@@ -236,6 +1913,7 @@ ENTRY(aesni_enc)
* KEY
* TKEYP (T1)
*/
+.align 4
_aesni_enc1:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
@@ -298,6 +1976,7 @@ _aesni_enc1:
* KEY
* TKEYP (T1)
*/
+.align 4
_aesni_enc4:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
@@ -391,11 +2070,22 @@ _aesni_enc4:
* void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
*/
ENTRY(aesni_dec)
+#ifndef __x86_64__
+ pushl KEYP
+ pushl KLEN
+ movl 12(%esp), KEYP
+ movl 16(%esp), OUTP
+ movl 20(%esp), INP
+#endif
mov 480(KEYP), KLEN # key length
add $240, KEYP
movups (INP), STATE # input
call _aesni_dec1
movups STATE, (OUTP) #output
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+#endif
ret
/*
@@ -410,6 +2100,7 @@ ENTRY(aesni_dec)
* KEY
* TKEYP (T1)
*/
+.align 4
_aesni_dec1:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
@@ -472,6 +2163,7 @@ _aesni_dec1:
* KEY
* TKEYP (T1)
*/
+.align 4
_aesni_dec4:
movaps (KEYP), KEY # key
mov KEYP, TKEYP
@@ -566,6 +2258,15 @@ _aesni_dec4:
* size_t len)
*/
ENTRY(aesni_ecb_enc)
+#ifndef __x86_64__
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl 16(%esp), KEYP
+ movl 20(%esp), OUTP
+ movl 24(%esp), INP
+ movl 28(%esp), LEN
+#endif
test LEN, LEN # check length
jz .Lecb_enc_ret
mov 480(KEYP), KLEN
@@ -602,6 +2303,11 @@ ENTRY(aesni_ecb_enc)
cmp $16, LEN
jge .Lecb_enc_loop1
.Lecb_enc_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+#endif
ret
/*
@@ -609,6 +2315,15 @@ ENTRY(aesni_ecb_enc)
* size_t len);
*/
ENTRY(aesni_ecb_dec)
+#ifndef __x86_64__
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl 16(%esp), KEYP
+ movl 20(%esp), OUTP
+ movl 24(%esp), INP
+ movl 28(%esp), LEN
+#endif
test LEN, LEN
jz .Lecb_dec_ret
mov 480(KEYP), KLEN
@@ -646,6 +2361,11 @@ ENTRY(aesni_ecb_dec)
cmp $16, LEN
jge .Lecb_dec_loop1
.Lecb_dec_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+#endif
ret
/*
@@ -653,6 +2373,17 @@ ENTRY(aesni_ecb_dec)
* size_t len, u8 *iv)
*/
ENTRY(aesni_cbc_enc)
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl 20(%esp), KEYP
+ movl 24(%esp), OUTP
+ movl 28(%esp), INP
+ movl 32(%esp), LEN
+ movl 36(%esp), IVP
+#endif
cmp $16, LEN
jb .Lcbc_enc_ret
mov 480(KEYP), KLEN
@@ -670,6 +2401,12 @@ ENTRY(aesni_cbc_enc)
jge .Lcbc_enc_loop
movups STATE, (IVP)
.Lcbc_enc_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
ret
/*
@@ -677,6 +2414,17 @@ ENTRY(aesni_cbc_enc)
* size_t len, u8 *iv)
*/
ENTRY(aesni_cbc_dec)
+#ifndef __x86_64__
+ pushl IVP
+ pushl LEN
+ pushl KEYP
+ pushl KLEN
+ movl 20(%esp), KEYP
+ movl 24(%esp), OUTP
+ movl 28(%esp), INP
+ movl 32(%esp), LEN
+ movl 36(%esp), IVP
+#endif
cmp $16, LEN
jb .Lcbc_dec_just_ret
mov 480(KEYP), KLEN
@@ -690,16 +2438,30 @@ ENTRY(aesni_cbc_dec)
movaps IN1, STATE1
movups 0x10(INP), IN2
movaps IN2, STATE2
+#ifdef __x86_64__
movups 0x20(INP), IN3
movaps IN3, STATE3
movups 0x30(INP), IN4
movaps IN4, STATE4
+#else
+ movups 0x20(INP), IN1
+ movaps IN1, STATE3
+ movups 0x30(INP), IN2
+ movaps IN2, STATE4
+#endif
call _aesni_dec4
pxor IV, STATE1
+#ifdef __x86_64__
pxor IN1, STATE2
pxor IN2, STATE3
pxor IN3, STATE4
movaps IN4, IV
+#else
+ pxor (INP), STATE2
+ pxor 0x10(INP), STATE3
+ pxor IN1, STATE4
+ movaps IN2, IV
+#endif
movups STATE1, (OUTP)
movups STATE2, 0x10(OUTP)
movups STATE3, 0x20(OUTP)
@@ -727,8 +2489,15 @@ ENTRY(aesni_cbc_dec)
.Lcbc_dec_ret:
movups IV, (IVP)
.Lcbc_dec_just_ret:
+#ifndef __x86_64__
+ popl KLEN
+ popl KEYP
+ popl LEN
+ popl IVP
+#endif
ret
+#ifdef __x86_64__
.align 16
.Lbswap_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
@@ -744,6 +2513,7 @@ ENTRY(aesni_cbc_dec)
* INC: == 1, in little endian
* BSWAP_MASK == endian swapping mask
*/
+.align 4
_aesni_inc_init:
movaps .Lbswap_mask, BSWAP_MASK
movaps IV, CTR
@@ -768,6 +2538,7 @@ _aesni_inc_init:
* CTR: == output IV, in little endian
* TCTR_LOW: == lower qword of CTR
*/
+.align 4
_aesni_inc:
paddq INC, CTR
add $1, TCTR_LOW
@@ -839,3 +2610,4 @@ ENTRY(aesni_ctr_enc)
movups IV, (IVP)
.Lctr_enc_just_ret:
ret
+#endif
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc4490a..e1e60c7d5813 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -5,6 +5,14 @@
* Copyright (C) 2008, Intel Corp.
* Author: Huang Ying <ying.huang@intel.com>
*
+ * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
+ * interface for 64-bit kernels.
+ * Authors: Adrian Hoban <adrian.hoban@intel.com>
+ * Gabriele Paoloni <gabriele.paoloni@intel.com>
+ * Tadeusz Struk (tadeusz.struk@intel.com)
+ * Aidan O'Mahony (aidan.o.mahony@intel.com)
+ * Copyright (c) 2010, Intel Corporation.
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +29,10 @@
#include <crypto/ctr.h>
#include <asm/i387.h>
#include <asm/aes.h>
+#include <crypto/scatterwalk.h>
+#include <crypto/internal/aead.h>
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
#define HAS_CTR
@@ -42,8 +54,31 @@ struct async_aes_ctx {
struct cryptd_ablkcipher *cryptd_tfm;
};
-#define AESNI_ALIGN 16
+/* This data is stored at the end of the crypto_tfm struct.
+ * It's a type of per "session" data storage location.
+ * This needs to be 16 byte aligned.
+ */
+struct aesni_rfc4106_gcm_ctx {
+ u8 hash_subkey[16];
+ struct crypto_aes_ctx aes_key_expanded;
+ u8 nonce[4];
+ struct cryptd_aead *cryptd_tfm;
+};
+
+struct aesni_gcm_set_hash_subkey_result {
+ int err;
+ struct completion completion;
+};
+
+struct aesni_hash_subkey_req_data {
+ u8 iv[16];
+ struct aesni_gcm_set_hash_subkey_result result;
+ struct scatterlist sg;
+};
+
+#define AESNI_ALIGN (16)
#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1))
+#define RFC4106_HASH_SUBKEY_SIZE 16
asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
unsigned int key_len);
@@ -59,9 +94,62 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
+#ifdef CONFIG_X86_64
asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
const u8 *in, unsigned int len, u8 *iv);
+/* asmlinkage void aesni_gcm_enc()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Ciphertext output. Encrypt in-place is allowed.
+ * const u8 *in, Plaintext input
+ * unsigned long plaintext_len, Length of data in bytes for encryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ * concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this
+ * is going to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
+ * Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
+ const u8 *in, unsigned long plaintext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+/* asmlinkage void aesni_gcm_dec()
+ * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
+ * u8 *out, Plaintext output. Decrypt in-place is allowed.
+ * const u8 *in, Ciphertext input
+ * unsigned long ciphertext_len, Length of data in bytes for decryption.
+ * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
+ * concatenated with 8 byte Initialisation Vector (from IPSec ESP
+ * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
+ * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
+ * const u8 *aad, Additional Authentication Data (AAD)
+ * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
+ * to be 8 or 12 bytes
+ * u8 *auth_tag, Authenticated Tag output.
+ * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
+ * Valid values are 16 (most likely), 12 or 8.
+ */
+asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
+ const u8 *in, unsigned long ciphertext_len, u8 *iv,
+ u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
+ u8 *auth_tag, unsigned long auth_tag_len);
+
+static inline struct
+aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
+{
+ return
+ (struct aesni_rfc4106_gcm_ctx *)
+ PTR_ALIGN((u8 *)
+ crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN);
+}
+#endif
+
static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
{
unsigned long addr = (unsigned long)raw_ctx;
@@ -324,6 +412,7 @@ static struct crypto_alg blk_cbc_alg = {
},
};
+#ifdef CONFIG_X86_64
static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
struct blkcipher_walk *walk)
{
@@ -389,6 +478,7 @@ static struct crypto_alg blk_ctr_alg = {
},
},
};
+#endif
static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
unsigned int key_len)
@@ -536,6 +626,7 @@ static struct crypto_alg ablk_cbc_alg = {
},
};
+#ifdef CONFIG_X86_64
static int ablk_ctr_init(struct crypto_tfm *tfm)
{
struct cryptd_ablkcipher *cryptd_tfm;
@@ -612,6 +703,7 @@ static struct crypto_alg ablk_rfc3686_ctr_alg = {
},
};
#endif
+#endif
#ifdef HAS_LRW
static int ablk_lrw_init(struct crypto_tfm *tfm)
@@ -730,6 +822,424 @@ static struct crypto_alg ablk_xts_alg = {
};
#endif
+#ifdef CONFIG_X86_64
+static int rfc4106_init(struct crypto_tfm *tfm)
+{
+ struct cryptd_aead *cryptd_tfm;
+ struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
+ PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+ cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
+ if (IS_ERR(cryptd_tfm))
+ return PTR_ERR(cryptd_tfm);
+ ctx->cryptd_tfm = cryptd_tfm;
+ tfm->crt_aead.reqsize = sizeof(struct aead_request)
+ + crypto_aead_reqsize(&cryptd_tfm->base);
+ return 0;
+}
+
+static void rfc4106_exit(struct crypto_tfm *tfm)
+{
+ struct aesni_rfc4106_gcm_ctx *ctx =
+ (struct aesni_rfc4106_gcm_ctx *)
+ PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
+ if (!IS_ERR(ctx->cryptd_tfm))
+ cryptd_free_aead(ctx->cryptd_tfm);
+ return;
+}
+
+static void
+rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
+{
+ struct aesni_gcm_set_hash_subkey_result *result = req->data;
+
+ if (err == -EINPROGRESS)
+ return;
+ result->err = err;
+ complete(&result->completion);
+}
+
+static int
+rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
+{
+ struct crypto_ablkcipher *ctr_tfm;
+ struct ablkcipher_request *req;
+ int ret = -EINVAL;
+ struct aesni_hash_subkey_req_data *req_data;
+
+ ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
+ if (IS_ERR(ctr_tfm))
+ return PTR_ERR(ctr_tfm);
+
+ crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
+
+ ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
+ if (ret) {
+ crypto_free_ablkcipher(ctr_tfm);
+ return ret;
+ }
+
+ req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
+ if (!req) {
+ crypto_free_ablkcipher(ctr_tfm);
+ return -EINVAL;
+ }
+
+ req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
+ if (!req_data) {
+ crypto_free_ablkcipher(ctr_tfm);
+ return -ENOMEM;
+ }
+ memset(req_data->iv, 0, sizeof(req_data->iv));
+
+ /* Clear the data in the hash sub key container to zero.*/
+ /* We want to cipher all zeros to create the hash sub key. */
+ memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
+
+ init_completion(&req_data->result.completion);
+ sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
+ ablkcipher_request_set_tfm(req, ctr_tfm);
+ ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
+ CRYPTO_TFM_REQ_MAY_BACKLOG,
+ rfc4106_set_hash_subkey_done,
+ &req_data->result);
+
+ ablkcipher_request_set_crypt(req, &req_data->sg,
+ &req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
+
+ ret = crypto_ablkcipher_encrypt(req);
+ if (ret == -EINPROGRESS || ret == -EBUSY) {
+ ret = wait_for_completion_interruptible
+ (&req_data->result.completion);
+ if (!ret)
+ ret = req_data->result.err;
+ }
+ ablkcipher_request_free(req);
+ kfree(req_data);
+ crypto_free_ablkcipher(ctr_tfm);
+ return ret;
+}
+
+static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
+ unsigned int key_len)
+{
+ int ret = 0;
+ struct crypto_tfm *tfm = crypto_aead_tfm(parent);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+ u8 *new_key_mem = NULL;
+
+ if (key_len < 4) {
+ crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ /*Account for 4 byte nonce at the end.*/
+ key_len -= 4;
+ if (key_len != AES_KEYSIZE_128) {
+ crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+
+ memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
+ /*This must be on a 16 byte boundary!*/
+ if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN)
+ return -EINVAL;
+
+ if ((unsigned long)key % AESNI_ALIGN) {
+ /*key is not aligned: use an auxuliar aligned pointer*/
+ new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL);
+ if (!new_key_mem)
+ return -ENOMEM;
+
+ new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
+ memcpy(new_key_mem, key, key_len);
+ key = new_key_mem;
+ }
+
+ if (!irq_fpu_usable())
+ ret = crypto_aes_expand_key(&(ctx->aes_key_expanded),
+ key, key_len);
+ else {
+ kernel_fpu_begin();
+ ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len);
+ kernel_fpu_end();
+ }
+ /*This must be on a 16 byte boundary!*/
+ if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) {
+ ret = -EINVAL;
+ goto exit;
+ }
+ ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
+exit:
+ kfree(new_key_mem);
+ return ret;
+}
+
+/* This is the Integrity Check Value (aka the authentication tag length and can
+ * be 8, 12 or 16 bytes long. */
+static int rfc4106_set_authsize(struct crypto_aead *parent,
+ unsigned int authsize)
+{
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+ struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+ switch (authsize) {
+ case 8:
+ case 12:
+ case 16:
+ break;
+ default:
+ return -EINVAL;
+ }
+ crypto_aead_crt(parent)->authsize = authsize;
+ crypto_aead_crt(cryptd_child)->authsize = authsize;
+ return 0;
+}
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+ int ret;
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+ if (!irq_fpu_usable()) {
+ struct aead_request *cryptd_req =
+ (struct aead_request *) aead_request_ctx(req);
+ memcpy(cryptd_req, req, sizeof(*req));
+ aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+ return crypto_aead_encrypt(cryptd_req);
+ } else {
+ kernel_fpu_begin();
+ ret = cryptd_child->base.crt_aead.encrypt(req);
+ kernel_fpu_end();
+ return ret;
+ }
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+ int ret;
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+
+ if (!irq_fpu_usable()) {
+ struct aead_request *cryptd_req =
+ (struct aead_request *) aead_request_ctx(req);
+ memcpy(cryptd_req, req, sizeof(*req));
+ aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+ return crypto_aead_decrypt(cryptd_req);
+ } else {
+ kernel_fpu_begin();
+ ret = cryptd_child->base.crt_aead.decrypt(req);
+ kernel_fpu_end();
+ return ret;
+ }
+}
+
+static struct crypto_alg rfc4106_alg = {
+ .cra_name = "rfc4106(gcm(aes))",
+ .cra_driver_name = "rfc4106-gcm-aesni",
+ .cra_priority = 400,
+ .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+ .cra_alignmask = 0,
+ .cra_type = &crypto_nivaead_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
+ .cra_init = rfc4106_init,
+ .cra_exit = rfc4106_exit,
+ .cra_u = {
+ .aead = {
+ .setkey = rfc4106_set_key,
+ .setauthsize = rfc4106_set_authsize,
+ .encrypt = rfc4106_encrypt,
+ .decrypt = rfc4106_decrypt,
+ .geniv = "seqiv",
+ .ivsize = 8,
+ .maxauthsize = 16,
+ },
+ },
+};
+
+static int __driver_rfc4106_encrypt(struct aead_request *req)
+{
+ u8 one_entry_in_sg = 0;
+ u8 *src, *dst, *assoc;
+ __be32 counter = cpu_to_be32(1);
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ void *aes_ctx = &(ctx->aes_key_expanded);
+ unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+ u8 iv_tab[16+AESNI_ALIGN];
+ u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN);
+ struct scatter_walk src_sg_walk;
+ struct scatter_walk assoc_sg_walk;
+ struct scatter_walk dst_sg_walk;
+ unsigned int i;
+
+ /* Assuming we are supporting rfc4106 64-bit extended */
+ /* sequence numbers We need to have the AAD length equal */
+ /* to 8 or 12 bytes */
+ if (unlikely(req->assoclen != 8 && req->assoclen != 12))
+ return -EINVAL;
+ /* IV below built */
+ for (i = 0; i < 4; i++)
+ *(iv+i) = ctx->nonce[i];
+ for (i = 0; i < 8; i++)
+ *(iv+4+i) = req->iv[i];
+ *((__be32 *)(iv+12)) = counter;
+
+ if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+ one_entry_in_sg = 1;
+ scatterwalk_start(&src_sg_walk, req->src);
+ scatterwalk_start(&assoc_sg_walk, req->assoc);
+ src = scatterwalk_map(&src_sg_walk, 0);
+ assoc = scatterwalk_map(&assoc_sg_walk, 0);
+ dst = src;
+ if (unlikely(req->src != req->dst)) {
+ scatterwalk_start(&dst_sg_walk, req->dst);
+ dst = scatterwalk_map(&dst_sg_walk, 0);
+ }
+
+ } else {
+ /* Allocate memory for src, dst, assoc */
+ src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
+ GFP_ATOMIC);
+ if (unlikely(!src))
+ return -ENOMEM;
+ assoc = (src + req->cryptlen + auth_tag_len);
+ scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
+ scatterwalk_map_and_copy(assoc, req->assoc, 0,
+ req->assoclen, 0);
+ dst = src;
+ }
+
+ aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
+ ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
+ + ((unsigned long)req->cryptlen), auth_tag_len);
+
+ /* The authTag (aka the Integrity Check Value) needs to be written
+ * back to the packet. */
+ if (one_entry_in_sg) {
+ if (unlikely(req->src != req->dst)) {
+ scatterwalk_unmap(dst, 0);
+ scatterwalk_done(&dst_sg_walk, 0, 0);
+ }
+ scatterwalk_unmap(src, 0);
+ scatterwalk_unmap(assoc, 0);
+ scatterwalk_done(&src_sg_walk, 0, 0);
+ scatterwalk_done(&assoc_sg_walk, 0, 0);
+ } else {
+ scatterwalk_map_and_copy(dst, req->dst, 0,
+ req->cryptlen + auth_tag_len, 1);
+ kfree(src);
+ }
+ return 0;
+}
+
+static int __driver_rfc4106_decrypt(struct aead_request *req)
+{
+ u8 one_entry_in_sg = 0;
+ u8 *src, *dst, *assoc;
+ unsigned long tempCipherLen = 0;
+ __be32 counter = cpu_to_be32(1);
+ int retval = 0;
+ struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+ struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+ void *aes_ctx = &(ctx->aes_key_expanded);
+ unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+ u8 iv_and_authTag[32+AESNI_ALIGN];
+ u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN);
+ u8 *authTag = iv + 16;
+ struct scatter_walk src_sg_walk;
+ struct scatter_walk assoc_sg_walk;
+ struct scatter_walk dst_sg_walk;
+ unsigned int i;
+
+ if (unlikely((req->cryptlen < auth_tag_len) ||
+ (req->assoclen != 8 && req->assoclen != 12)))
+ return -EINVAL;
+ /* Assuming we are supporting rfc4106 64-bit extended */
+ /* sequence numbers We need to have the AAD length */
+ /* equal to 8 or 12 bytes */
+
+ tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
+ /* IV below built */
+ for (i = 0; i < 4; i++)
+ *(iv+i) = ctx->nonce[i];
+ for (i = 0; i < 8; i++)
+ *(iv+4+i) = req->iv[i];
+ *((__be32 *)(iv+12)) = counter;
+
+ if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
+ one_entry_in_sg = 1;
+ scatterwalk_start(&src_sg_walk, req->src);
+ scatterwalk_start(&assoc_sg_walk, req->assoc);
+ src = scatterwalk_map(&src_sg_walk, 0);
+ assoc = scatterwalk_map(&assoc_sg_walk, 0);
+ dst = src;
+ if (unlikely(req->src != req->dst)) {
+ scatterwalk_start(&dst_sg_walk, req->dst);
+ dst = scatterwalk_map(&dst_sg_walk, 0);
+ }
+
+ } else {
+ /* Allocate memory for src, dst, assoc */
+ src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
+ if (!src)
+ return -ENOMEM;
+ assoc = (src + req->cryptlen + auth_tag_len);
+ scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
+ scatterwalk_map_and_copy(assoc, req->assoc, 0,
+ req->assoclen, 0);
+ dst = src;
+ }
+
+ aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
+ ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
+ authTag, auth_tag_len);
+
+ /* Compare generated tag with passed in tag. */
+ retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
+ -EBADMSG : 0;
+
+ if (one_entry_in_sg) {
+ if (unlikely(req->src != req->dst)) {
+ scatterwalk_unmap(dst, 0);
+ scatterwalk_done(&dst_sg_walk, 0, 0);
+ }
+ scatterwalk_unmap(src, 0);
+ scatterwalk_unmap(assoc, 0);
+ scatterwalk_done(&src_sg_walk, 0, 0);
+ scatterwalk_done(&assoc_sg_walk, 0, 0);
+ } else {
+ scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1);
+ kfree(src);
+ }
+ return retval;
+}
+
+static struct crypto_alg __rfc4106_alg = {
+ .cra_name = "__gcm-aes-aesni",
+ .cra_driver_name = "__driver-gcm-aes-aesni",
+ .cra_priority = 0,
+ .cra_flags = CRYPTO_ALG_TYPE_AEAD,
+ .cra_blocksize = 1,
+ .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
+ .cra_alignmask = 0,
+ .cra_type = &crypto_aead_type,
+ .cra_module = THIS_MODULE,
+ .cra_list = LIST_HEAD_INIT(__rfc4106_alg.cra_list),
+ .cra_u = {
+ .aead = {
+ .encrypt = __driver_rfc4106_encrypt,
+ .decrypt = __driver_rfc4106_decrypt,
+ },
+ },
+};
+#endif
+
static int __init aesni_init(void)
{
int err;
@@ -738,6 +1248,7 @@ static int __init aesni_init(void)
printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
return -ENODEV;
}
+
if ((err = crypto_register_alg(&aesni_alg)))
goto aes_err;
if ((err = crypto_register_alg(&__aesni_alg)))
@@ -746,18 +1257,24 @@ static int __init aesni_init(void)
goto blk_ecb_err;
if ((err = crypto_register_alg(&blk_cbc_alg)))
goto blk_cbc_err;
- if ((err = crypto_register_alg(&blk_ctr_alg)))
- goto blk_ctr_err;
if ((err = crypto_register_alg(&ablk_ecb_alg)))
goto ablk_ecb_err;
if ((err = crypto_register_alg(&ablk_cbc_alg)))
goto ablk_cbc_err;
+#ifdef CONFIG_X86_64
+ if ((err = crypto_register_alg(&blk_ctr_alg)))
+ goto blk_ctr_err;
if ((err = crypto_register_alg(&ablk_ctr_alg)))
goto ablk_ctr_err;
+ if ((err = crypto_register_alg(&__rfc4106_alg)))
+ goto __aead_gcm_err;
+ if ((err = crypto_register_alg(&rfc4106_alg)))
+ goto aead_gcm_err;
#ifdef HAS_CTR
if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
goto ablk_rfc3686_ctr_err;
#endif
+#endif
#ifdef HAS_LRW
if ((err = crypto_register_alg(&ablk_lrw_alg)))
goto ablk_lrw_err;
@@ -770,7 +1287,6 @@ static int __init aesni_init(void)
if ((err = crypto_register_alg(&ablk_xts_alg)))
goto ablk_xts_err;
#endif
-
return err;
#ifdef HAS_XTS
@@ -784,18 +1300,24 @@ ablk_pcbc_err:
crypto_unregister_alg(&ablk_lrw_alg);
ablk_lrw_err:
#endif
+#ifdef CONFIG_X86_64
#ifdef HAS_CTR
crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
ablk_rfc3686_ctr_err:
#endif
+ crypto_unregister_alg(&rfc4106_alg);
+aead_gcm_err:
+ crypto_unregister_alg(&__rfc4106_alg);
+__aead_gcm_err:
crypto_unregister_alg(&ablk_ctr_alg);
ablk_ctr_err:
+ crypto_unregister_alg(&blk_ctr_alg);
+blk_ctr_err:
+#endif
crypto_unregister_alg(&ablk_cbc_alg);
ablk_cbc_err:
crypto_unregister_alg(&ablk_ecb_alg);
ablk_ecb_err:
- crypto_unregister_alg(&blk_ctr_alg);
-blk_ctr_err:
crypto_unregister_alg(&blk_cbc_alg);
blk_cbc_err:
crypto_unregister_alg(&blk_ecb_alg);
@@ -818,13 +1340,17 @@ static void __exit aesni_exit(void)
#ifdef HAS_LRW
crypto_unregister_alg(&ablk_lrw_alg);
#endif
+#ifdef CONFIG_X86_64
#ifdef HAS_CTR
crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
#endif
+ crypto_unregister_alg(&rfc4106_alg);
+ crypto_unregister_alg(&__rfc4106_alg);
crypto_unregister_alg(&ablk_ctr_alg);
+ crypto_unregister_alg(&blk_ctr_alg);
+#endif
crypto_unregister_alg(&ablk_cbc_alg);
crypto_unregister_alg(&ablk_ecb_alg);
- crypto_unregister_alg(&blk_ctr_alg);
crypto_unregister_alg(&blk_cbc_alg);
crypto_unregister_alg(&blk_ecb_alg);
crypto_unregister_alg(&__aesni_alg);
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index cbcc8d8ea93a..7a6e68e4f748 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -10,6 +10,7 @@
* by the Free Software Foundation.
*/
+#include <linux/err.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 849813f398e7..5852519b2d0f 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -28,7 +28,6 @@
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/utsname.h>
-#include <linux/smp_lock.h>
#include <linux/mm.h>
#include <linux/uio.h>
#include <linux/poll.h>
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 55d106b5e31b..211ca3f7fd16 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -185,17 +185,16 @@ struct bootnode;
#ifdef CONFIG_ACPI_NUMA
extern int acpi_numa;
-extern int acpi_get_nodes(struct bootnode *physnodes);
+extern void acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
+ unsigned long end);
extern int acpi_scan_nodes(unsigned long start, unsigned long end);
#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
+
+#ifdef CONFIG_NUMA_EMU
extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
int num_nodes);
-#else
-static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
- int num_nodes)
-{
-}
#endif
+#endif /* CONFIG_ACPI_NUMA */
#define acpi_unlazy_tlb(x) leave_mm(x)
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 76561d20ea2f..13009d1af99a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -66,6 +66,7 @@ extern void alternatives_smp_module_add(struct module *mod, char *name,
extern void alternatives_smp_module_del(struct module *mod);
extern void alternatives_smp_switch(int smp);
extern int alternatives_text_reserved(void *start, void *end);
+extern bool skip_smp_alternatives;
#else
static inline void alternatives_smp_module_add(struct module *mod, char *name,
void *locks, void *locks_end,
@@ -180,8 +181,15 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len);
* On the local CPU you need to be protected again NMI or MCE handlers seeing an
* inconsistent instruction while you patch.
*/
+struct text_poke_param {
+ void *addr;
+ const void *opcode;
+ size_t len;
+};
+
extern void *text_poke(void *addr, const void *opcode, size_t len);
extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
+extern void text_poke_smp_batch(struct text_poke_param *params, int n);
#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
#define IDEAL_NOP_SIZE_5 5
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index c8517f81b21e..64dc82ee19f0 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -3,36 +3,64 @@
#include <linux/pci.h>
-extern struct pci_device_id k8_nb_ids[];
+struct amd_nb_bus_dev_range {
+ u8 bus;
+ u8 dev_base;
+ u8 dev_limit;
+};
+
+extern struct pci_device_id amd_nb_misc_ids[];
+extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
struct bootnode;
-extern int early_is_k8_nb(u32 value);
-extern int cache_k8_northbridges(void);
-extern void k8_flush_garts(void);
-extern int k8_get_nodes(struct bootnode *nodes);
-extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
-extern int k8_scan_nodes(void);
+extern int early_is_amd_nb(u32 value);
+extern int amd_cache_northbridges(void);
+extern void amd_flush_garts(void);
+extern int amd_numa_init(unsigned long start_pfn, unsigned long end_pfn);
+extern int amd_scan_nodes(void);
+
+#ifdef CONFIG_NUMA_EMU
+extern void amd_fake_nodes(const struct bootnode *nodes, int nr_nodes);
+extern void amd_get_nodes(struct bootnode *nodes);
+#endif
-struct k8_northbridge_info {
+struct amd_northbridge {
+ struct pci_dev *misc;
+};
+
+struct amd_northbridge_info {
u16 num;
- u8 gart_supported;
- struct pci_dev **nb_misc;
+ u64 flags;
+ struct amd_northbridge *nb;
};
-extern struct k8_northbridge_info k8_northbridges;
+extern struct amd_northbridge_info amd_northbridges;
+
+#define AMD_NB_GART 0x1
+#define AMD_NB_L3_INDEX_DISABLE 0x2
#ifdef CONFIG_AMD_NB
-static inline struct pci_dev *node_to_k8_nb_misc(int node)
+static inline int amd_nb_num(void)
{
- return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL;
+ return amd_northbridges.num;
}
-#else
+static inline int amd_nb_has_feature(int feature)
+{
+ return ((amd_northbridges.flags & feature) == feature);
+}
-static inline struct pci_dev *node_to_k8_nb_misc(int node)
+static inline struct amd_northbridge *node_to_amd_nb(int node)
{
- return NULL;
+ return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
}
+
+#else
+
+#define amd_nb_num(x) 0
+#define amd_nb_has_feature(x) false
+#define node_to_amd_nb(x) NULL
+
#endif
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 286de34b0ed6..5e3969c36d7f 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -141,13 +141,13 @@ static inline void native_apic_msr_write(u32 reg, u32 v)
static inline u32 native_apic_msr_read(u32 reg)
{
- u32 low, high;
+ u64 msr;
if (reg == APIC_DFR)
return -1;
- rdmsr(APIC_BASE_MSR + (reg >> 4), low, high);
- return low;
+ rdmsrl(APIC_BASE_MSR + (reg >> 4), msr);
+ return (u32)msr;
}
static inline void native_x2apic_wait_icr_idle(void)
@@ -181,12 +181,12 @@ extern void enable_x2apic(void);
extern void x2apic_icr_write(u32 low, u32 id);
static inline int x2apic_enabled(void)
{
- int msr, msr2;
+ u64 msr;
if (!cpu_has_x2apic)
return 0;
- rdmsr(MSR_IA32_APICBASE, msr, msr2);
+ rdmsrl(MSR_IA32_APICBASE, msr);
if (msr & X2APIC_ENABLE)
return 1;
return 0;
@@ -234,16 +234,17 @@ extern void init_bsp_APIC(void);
extern void setup_local_APIC(void);
extern void end_local_APIC_setup(void);
extern void init_apic_mappings(void);
+void register_lapic_address(unsigned long address);
extern void setup_boot_APIC_clock(void);
extern void setup_secondary_APIC_clock(void);
extern int APIC_init_uniprocessor(void);
extern void enable_NMI_through_LVT0(void);
+extern int apic_force_enable(void);
/*
* On 32bit this is mach-xxx local
*/
#ifdef CONFIG_X86_64
-extern void early_init_lapic_mapping(void);
extern int apic_is_clustered_box(void);
#else
static inline int apic_is_clustered_box(void)
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index a859ca461fb0..47a30ff8e517 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -145,6 +145,7 @@
#ifdef CONFIG_X86_32
# define MAX_IO_APICS 64
+# define MAX_LOCAL_APIC 256
#else
# define MAX_IO_APICS 128
# define MAX_LOCAL_APIC 32768
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 3b62ab56c7a0..5e1a2eef3e7c 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -32,11 +32,7 @@
#define BOOT_HEAP_SIZE 0x400000
#else /* !CONFIG_KERNEL_BZIP2 */
-#ifdef CONFIG_X86_64
-#define BOOT_HEAP_SIZE 0x7000
-#else
-#define BOOT_HEAP_SIZE 0x4000
-#endif
+#define BOOT_HEAP_SIZE 0x8000
#endif /* !CONFIG_KERNEL_BZIP2 */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 63e35ec9075c..62f084478f7e 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -1,48 +1,8 @@
#ifndef _ASM_X86_CACHEFLUSH_H
#define _ASM_X86_CACHEFLUSH_H
-/* Keep includes the same across arches. */
-#include <linux/mm.h>
-
/* Caches aren't brain-dead on the intel. */
-static inline void flush_cache_all(void) { }
-static inline void flush_cache_mm(struct mm_struct *mm) { }
-static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
-static inline void flush_cache_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end) { }
-static inline void flush_cache_page(struct vm_area_struct *vma,
- unsigned long vmaddr, unsigned long pfn) { }
-#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
-static inline void flush_dcache_page(struct page *page) { }
-static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
-static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
-static inline void flush_icache_range(unsigned long start,
- unsigned long end) { }
-static inline void flush_icache_page(struct vm_area_struct *vma,
- struct page *page) { }
-static inline void flush_icache_user_range(struct vm_area_struct *vma,
- struct page *page,
- unsigned long addr,
- unsigned long len) { }
-static inline void flush_cache_vmap(unsigned long start, unsigned long end) { }
-static inline void flush_cache_vunmap(unsigned long start,
- unsigned long end) { }
-
-static inline void copy_to_user_page(struct vm_area_struct *vma,
- struct page *page, unsigned long vaddr,
- void *dst, const void *src,
- unsigned long len)
-{
- memcpy(dst, src, len);
-}
-
-static inline void copy_from_user_page(struct vm_area_struct *vma,
- struct page *page, unsigned long vaddr,
- void *dst, const void *src,
- unsigned long len)
-{
- memcpy(dst, src, len);
-}
+#include <asm-generic/cacheflush.h>
#ifdef CONFIG_X86_PAT
/*
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index 4fab24de26b1..6e6e7558e702 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -32,5 +32,6 @@ extern void arch_unregister_cpu(int);
DECLARE_PER_CPU(int, cpu_state);
+int __cpuinit mwait_usable(const struct cpuinfo_x86 *);
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index b81002f23614..078ad0caefc6 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -94,7 +94,7 @@ static inline void hw_breakpoint_disable(void)
static inline int hw_breakpoint_active(void)
{
- return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
+ return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
}
extern void aout_dump_debugregs(struct user *dump);
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 5be1542fbfaf..e99d55d74df5 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -72,6 +72,9 @@ struct e820map {
#define BIOS_BEGIN 0x000a0000
#define BIOS_END 0x00100000
+#define BIOS_ROM_BASE 0xffe00000
+#define BIOS_ROM_END 0xffffffff
+
#ifdef __KERNEL__
/* see comment in arch/x86/kernel/e820.c */
extern struct e820map e820;
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 139591a933f6..4729b2b63117 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -116,11 +116,11 @@ enum fixed_addresses {
#endif
FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
- __end_of_permanent_fixed_addresses,
-
#ifdef CONFIG_X86_MRST
FIX_LNW_VRTC,
#endif
+ __end_of_permanent_fixed_addresses,
+
/*
* 256 temporary boot-time mappings, used by early_ioremap(),
* before ioremap() is functional.
@@ -220,8 +220,8 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
}
/* Return an pointer with offset calculated */
-static inline unsigned long __set_fixmap_offset(enum fixed_addresses idx,
- phys_addr_t phys, pgprot_t flags)
+static __always_inline unsigned long
+__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
{
__set_fixmap(idx, phys, flags);
return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h
index 49dbfdfa50f9..91d915a65259 100644
--- a/arch/x86/include/asm/gpio.h
+++ b/arch/x86/include/asm/gpio.h
@@ -38,12 +38,9 @@ static inline int gpio_cansleep(unsigned int gpio)
return __gpio_cansleep(gpio);
}
-/*
- * Not implemented, yet.
- */
static inline int gpio_to_irq(unsigned int gpio)
{
- return -ENOSYS;
+ return __gpio_to_irq(gpio);
}
static inline int irq_to_gpio(unsigned int irq)
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index ff2546ce7178..7a15153c675d 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -20,6 +20,9 @@
#ifndef _ASM_X86_HYPERVISOR_H
#define _ASM_X86_HYPERVISOR_H
+#include <asm/kvm_para.h>
+#include <asm/xen/hypervisor.h>
+
extern void init_hypervisor(struct cpuinfo_x86 *c);
extern void init_hypervisor_platform(void);
@@ -47,4 +50,13 @@ extern const struct hypervisor_x86 x86_hyper_vmware;
extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
extern const struct hypervisor_x86 x86_hyper_xen_hvm;
+static inline bool hypervisor_x2apic_available(void)
+{
+ if (kvm_para_available())
+ return true;
+ if (xen_x2apic_para_available())
+ return true;
+ return false;
+}
+
#endif
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 4aa2bb3b242a..ef328901c802 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -93,6 +93,17 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
int err;
/* See comment in fxsave() below. */
+#ifdef CONFIG_AS_FXSAVEQ
+ asm volatile("1: fxrstorq %[fx]\n\t"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : [err] "=r" (err)
+ : [fx] "m" (*fx), "0" (0));
+#else
asm volatile("1: rex64/fxrstor (%[fx])\n\t"
"2:\n"
".section .fixup,\"ax\"\n"
@@ -102,6 +113,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
_ASM_EXTABLE(1b, 3b)
: [err] "=r" (err)
: [fx] "R" (fx), "m" (*fx), "0" (0));
+#endif
return err;
}
@@ -119,6 +131,17 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
return -EFAULT;
/* See comment in fxsave() below. */
+#ifdef CONFIG_AS_FXSAVEQ
+ asm volatile("1: fxsaveq %[fx]\n\t"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : [err] "=r" (err), [fx] "=m" (*fx)
+ : "0" (0));
+#else
asm volatile("1: rex64/fxsave (%[fx])\n\t"
"2:\n"
".section .fixup,\"ax\"\n"
@@ -128,6 +151,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
_ASM_EXTABLE(1b, 3b)
: [err] "=r" (err), "=m" (*fx)
: [fx] "R" (fx), "0" (0));
+#endif
if (unlikely(err) &&
__clear_user(fx, sizeof(struct i387_fxsave_struct)))
err = -EFAULT;
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index a6b28d017c2f..f327d386d6cc 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -159,7 +159,7 @@ struct io_apic_irq_attr;
extern int io_apic_set_pci_routing(struct device *dev, int irq,
struct io_apic_irq_attr *irq_attr);
void setup_IO_APIC_irq_extra(u32 gsi);
-extern void ioapic_init_mappings(void);
+extern void ioapic_and_gsi_init(void);
extern void ioapic_insert_resources(void);
extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
@@ -168,10 +168,10 @@ extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-extern void probe_nr_irqs_gsi(void);
extern int get_nr_irqs_gsi(void);
extern void setup_ioapic_ids_from_mpc(void);
+extern void setup_ioapic_ids_from_mpc_nocheck(void);
struct mp_ioapic_gsi{
u32 gsi_base;
@@ -184,14 +184,15 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi);
void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
extern void __init pre_init_apic_IRQ0(void);
+extern void mp_save_irq(struct mpc_intsrc *m);
+
#else /* !CONFIG_X86_IO_APIC */
#define io_apic_assign_pci_irqs 0
#define setup_ioapic_ids_from_mpc x86_init_noop
static const int timer_through_8259 = 0;
-static inline void ioapic_init_mappings(void) { }
+static inline void ioapic_and_gsi_init(void) { }
static inline void ioapic_insert_resources(void) { }
-static inline void probe_nr_irqs_gsi(void) { }
#define gsi_top (NR_IRQS_LEGACY)
static inline int mp_find_ioapic(u32 gsi) { return 0; }
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 13b0ebaa512f..c704b38c57a2 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -10,15 +10,14 @@
#include <asm/apicdef.h>
#include <asm/irq_vectors.h>
+/* Even though we don't support this, supply it to appease OF */
+static inline void irq_dispose_mapping(unsigned int virq) { }
+
static inline int irq_canonicalize(int irq)
{
return ((irq == 2) ? 9 : irq);
}
-#ifdef CONFIG_X86_LOCAL_APIC
-# define ARCH_HAS_NMI_WATCHDOG
-#endif
-
#ifdef CONFIG_X86_32
extern void irq_ctx_init(int cpu);
#else
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index f52d42e80585..574dbc22893a 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -14,7 +14,7 @@
do { \
asm goto("1:" \
JUMP_LABEL_INITIAL_NOP \
- ".pushsection __jump_table, \"a\" \n\t"\
+ ".pushsection __jump_table, \"aw\" \n\t"\
_ASM_PTR "1b, %l[" #label "], %c0 \n\t" \
".popsection \n\t" \
: : "i" (key) : : label); \
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 5bdfca86581b..ca242d35e873 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -18,7 +18,6 @@ enum die_val {
DIE_TRAP,
DIE_GPF,
DIE_CALL,
- DIE_NMI_IPI,
DIE_PAGE_FAULT,
DIE_NMIUNKNOWN,
};
@@ -28,7 +27,7 @@ extern void die(const char *, struct pt_regs *,long);
extern int __must_check __die(const char *, struct pt_regs *, long);
extern void show_registers(struct pt_regs *regs);
extern void show_trace(struct task_struct *t, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp);
+ unsigned long *sp);
extern void __show_regs(struct pt_regs *regs, int all);
extern void show_regs(struct pt_regs *regs);
extern unsigned long oops_begin(void);
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b36c6b3fe144..8e37deb1eb38 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -15,6 +15,14 @@
struct x86_emulate_ctxt;
+struct x86_exception {
+ u8 vector;
+ bool error_code_valid;
+ u16 error_code;
+ bool nested_page_fault;
+ u64 address; /* cr2 or nested page fault gpa */
+};
+
/*
* x86_emulate_ops:
*
@@ -64,7 +72,8 @@ struct x86_emulate_ops {
* @bytes: [IN ] Number of bytes to read from memory.
*/
int (*read_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+ unsigned int bytes, struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
/*
* write_std: Write bytes of standard (non-emulated/special) memory.
@@ -74,7 +83,8 @@ struct x86_emulate_ops {
* @bytes: [IN ] Number of bytes to write to memory.
*/
int (*write_std)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+ unsigned int bytes, struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
/*
* fetch: Read bytes of standard (non-emulated/special) memory.
* Used for instruction fetch.
@@ -83,7 +93,8 @@ struct x86_emulate_ops {
* @bytes: [IN ] Number of bytes to read from memory.
*/
int (*fetch)(unsigned long addr, void *val,
- unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
+ unsigned int bytes, struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
/*
* read_emulated: Read bytes from emulated/special memory area.
@@ -94,7 +105,7 @@ struct x86_emulate_ops {
int (*read_emulated)(unsigned long addr,
void *val,
unsigned int bytes,
- unsigned int *error,
+ struct x86_exception *fault,
struct kvm_vcpu *vcpu);
/*
@@ -107,7 +118,7 @@ struct x86_emulate_ops {
int (*write_emulated)(unsigned long addr,
const void *val,
unsigned int bytes,
- unsigned int *error,
+ struct x86_exception *fault,
struct kvm_vcpu *vcpu);
/*
@@ -122,7 +133,7 @@ struct x86_emulate_ops {
const void *old,
const void *new,
unsigned int bytes,
- unsigned int *error,
+ struct x86_exception *fault,
struct kvm_vcpu *vcpu);
int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -159,7 +170,10 @@ struct operand {
};
union {
unsigned long *reg;
- unsigned long mem;
+ struct segmented_address {
+ ulong ea;
+ unsigned seg;
+ } mem;
} addr;
union {
unsigned long val;
@@ -226,9 +240,8 @@ struct x86_emulate_ctxt {
bool perm_ok; /* do not check permissions if true */
- int exception; /* exception that happens during emulation or -1 */
- u32 error_code; /* error code for exception */
- bool error_code_valid;
+ bool have_exception;
+ struct x86_exception exception;
/* decode cache */
struct decode_cache decode;
@@ -252,7 +265,7 @@ struct x86_emulate_ctxt {
#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
#endif
-int x86_decode_insn(struct x86_emulate_ctxt *ctxt);
+int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
#define EMULATION_FAILED -1
#define EMULATION_OK 0
#define EMULATION_RESTART 1
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9e6fe391094e..ffd7f8d29187 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -79,15 +79,18 @@
#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
-#define KVM_MAX_CPUID_ENTRIES 40
+#define KVM_MAX_CPUID_ENTRIES 80
#define KVM_NR_FIXED_MTRR_REGION 88
#define KVM_NR_VAR_MTRR 8
+#define ASYNC_PF_PER_VCPU 64
+
extern spinlock_t kvm_lock;
extern struct list_head vm_list;
struct kvm_vcpu;
struct kvm;
+struct kvm_async_pf;
enum kvm_reg {
VCPU_REGS_RAX = 0,
@@ -114,6 +117,7 @@ enum kvm_reg {
enum kvm_reg_ex {
VCPU_EXREG_PDPTR = NR_VCPU_REGS,
+ VCPU_EXREG_CR3,
};
enum {
@@ -238,16 +242,18 @@ struct kvm_mmu {
void (*new_cr3)(struct kvm_vcpu *vcpu);
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
- int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
- void (*inject_page_fault)(struct kvm_vcpu *vcpu);
+ int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
+ bool prefault);
+ void (*inject_page_fault)(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
void (*free)(struct kvm_vcpu *vcpu);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
- u32 *error);
+ struct x86_exception *exception);
gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, bool clear_unsync);
+ struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
hpa_t root_hpa;
int root_level;
@@ -315,16 +321,6 @@ struct kvm_vcpu_arch {
*/
struct kvm_mmu *walk_mmu;
- /*
- * This struct is filled with the necessary information to propagate a
- * page fault into the guest
- */
- struct {
- u64 address;
- unsigned error_code;
- bool nested;
- } fault;
-
/* only needed in kvm_pv_mmu_op() path, but it's hot so
* put it here to avoid allocation */
struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -412,6 +408,15 @@ struct kvm_vcpu_arch {
u64 hv_vapic;
cpumask_var_t wbinvd_dirty_mask;
+
+ struct {
+ bool halted;
+ gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
+ struct gfn_to_hva_cache data;
+ u64 msr_val;
+ u32 id;
+ bool send_user_only;
+ } apf;
};
struct kvm_arch {
@@ -456,6 +461,10 @@ struct kvm_arch {
/* fields used by HYPER-V emulation */
u64 hv_guest_os_id;
u64 hv_hypercall;
+
+ #ifdef CONFIG_KVM_MMU_AUDIT
+ int audit_point;
+ #endif
};
struct kvm_vm_stat {
@@ -529,6 +538,7 @@ struct kvm_x86_ops {
struct kvm_segment *var, int seg);
void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
+ void (*decache_cr3)(struct kvm_vcpu *vcpu);
void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -582,9 +592,17 @@ struct kvm_x86_ops {
void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
+ void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
const struct trace_print_flags *exit_reasons_str;
};
+struct kvm_arch_async_pf {
+ u32 token;
+ gfn_t gfn;
+ unsigned long cr3;
+ bool direct_map;
+};
+
extern struct kvm_x86_ops *kvm_x86_ops;
int kvm_mmu_module_init(void);
@@ -594,7 +612,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
int kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
-void kvm_mmu_set_base_ptes(u64 base_pte);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask);
@@ -623,8 +640,15 @@ enum emulation_result {
#define EMULTYPE_NO_DECODE (1 << 0)
#define EMULTYPE_TRAP_UD (1 << 1)
#define EMULTYPE_SKIP (1 << 2)
-int emulate_instruction(struct kvm_vcpu *vcpu,
- unsigned long cr2, u16 error_code, int emulation_type);
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
+ int emulation_type, void *insn, int insn_len);
+
+static inline int emulate_instruction(struct kvm_vcpu *vcpu,
+ int emulation_type)
+{
+ return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
+}
+
void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -650,7 +674,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
@@ -668,11 +692,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gfn_t gfn, void *data, int offset, int len,
u32 access);
-void kvm_propagate_fault(struct kvm_vcpu *vcpu);
+void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
int kvm_pic_set_irq(void *opaque, int irq, int level);
@@ -690,16 +714,21 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
int kvm_mmu_load(struct kvm_vcpu *vcpu);
void kvm_mmu_unload(struct kvm_vcpu *vcpu);
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
-gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception);
int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+ void *insn, int insn_len);
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
void kvm_enable_tdp(void);
@@ -766,20 +795,25 @@ enum {
#define HF_VINTR_MASK (1 << 2)
#define HF_NMI_MASK (1 << 3)
#define HF_IRET_MASK (1 << 4)
+#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
/*
* Hardware virtualization extension instructions may fault if a
* reboot turns off virtualization while processes are running.
* Trap the fault and ignore the instruction if that happens.
*/
-asmlinkage void kvm_handle_fault_on_reboot(void);
+asmlinkage void kvm_spurious_fault(void);
+extern bool kvm_rebooting;
#define __kvm_handle_fault_on_reboot(insn) \
"666: " insn "\n\t" \
+ "668: \n\t" \
".pushsection .fixup, \"ax\" \n" \
"667: \n\t" \
+ "cmpb $0, kvm_rebooting \n\t" \
+ "jne 668b \n\t" \
__ASM_SIZE(push) " $666b \n\t" \
- "jmp kvm_handle_fault_on_reboot \n\t" \
+ "call kvm_spurious_fault \n\t" \
".popsection \n\t" \
".pushsection __ex_table, \"a\" \n\t" \
_ASM_PTR " 666b, 667b \n\t" \
@@ -788,6 +822,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
#define KVM_ARCH_WANT_MMU_NOTIFIER
int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
@@ -799,4 +834,15 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
+extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
+
+void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 7b562b6184bc..a427bf77a93d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -20,6 +20,7 @@
* are available. The use of 0x11 and 0x12 is deprecated
*/
#define KVM_FEATURE_CLOCKSOURCE2 3
+#define KVM_FEATURE_ASYNC_PF 4
/* The last 8 bits are used to indicate how to interpret the flags field
* in pvclock structure. If no bits are set, all flags are ignored.
@@ -32,9 +33,13 @@
/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
+#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
#define KVM_MAX_MMU_OP_BATCH 32
+#define KVM_ASYNC_PF_ENABLED (1 << 0)
+#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
+
/* Operations for KVM_HC_MMU_OP */
#define KVM_MMU_OP_WRITE_PTE 1
#define KVM_MMU_OP_FLUSH_TLB 2
@@ -61,10 +66,20 @@ struct kvm_mmu_op_release_pt {
__u64 pt_phys;
};
+#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
+#define KVM_PV_REASON_PAGE_READY 2
+
+struct kvm_vcpu_pv_apf_data {
+ __u32 reason;
+ __u8 pad[60];
+ __u32 enabled;
+};
+
#ifdef __KERNEL__
#include <asm/processor.h>
extern void kvmclock_init(void);
+extern int kvm_register_clock(char *txt);
/* This instruction is vmcall. On non-VT architectures, it will generate a
@@ -160,8 +175,17 @@ static inline unsigned int kvm_arch_para_features(void)
#ifdef CONFIG_KVM_GUEST
void __init kvm_guest_init(void);
+void kvm_async_pf_task_wait(u32 token);
+void kvm_async_pf_task_wake(u32 token);
+u32 kvm_read_and_reset_pf_reason(void);
#else
#define kvm_guest_init() do { } while (0)
+#define kvm_async_pf_task_wait(T) do {} while(0)
+#define kvm_async_pf_task_wake(T) do {} while(0)
+static inline u32 kvm_read_and_reset_pf_reason(void)
+{
+ return 0;
+}
#endif
#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h
index f7920601e472..72a8b52e7dfd 100644
--- a/arch/x86/include/asm/mach_traps.h
+++ b/arch/x86/include/asm/mach_traps.h
@@ -7,9 +7,19 @@
#include <asm/mc146818rtc.h>
+#define NMI_REASON_PORT 0x61
+
+#define NMI_REASON_SERR 0x80
+#define NMI_REASON_IOCHK 0x40
+#define NMI_REASON_MASK (NMI_REASON_SERR | NMI_REASON_IOCHK)
+
+#define NMI_REASON_CLEAR_SERR 0x04
+#define NMI_REASON_CLEAR_IOCHK 0x08
+#define NMI_REASON_CLEAR_MASK 0x0f
+
static inline unsigned char get_nmi_reason(void)
{
- return inb(0x61);
+ return inb(NMI_REASON_PORT);
}
static inline void reassert_nmi(void)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c62c13cb9788..eb16e94ae04f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -223,6 +223,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c);
void mce_log_therm_throt_event(__u64 status);
+/* Interrupt Handler for core thermal thresholds */
+extern int (*platform_thermal_notify)(__u64 msr_val);
+
#ifdef CONFIG_X86_THERMAL_VECTOR
extern void mcheck_intel_therm_init(void);
#else
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index ef51b501e22a..24215072d0e1 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -48,6 +48,12 @@ static inline struct microcode_ops * __init init_intel_microcode(void)
#ifdef CONFIG_MICROCODE_AMD
extern struct microcode_ops * __init init_amd_microcode(void);
+
+static inline void get_ucode_data(void *to, const u8 *from, size_t n)
+{
+ memcpy(to, from, n);
+}
+
#else
static inline struct microcode_ops * __init init_amd_microcode(void)
{
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index c82868e9f905..0c90dd9f0505 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -5,8 +5,9 @@
#include <asm/mpspec_def.h>
#include <asm/x86_init.h>
+#include <asm/apicdef.h>
-extern int apic_version[MAX_APICS];
+extern int apic_version[];
extern int pic_mode;
#ifdef CONFIG_X86_32
@@ -107,7 +108,7 @@ extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
int active_high_low);
#endif /* CONFIG_ACPI */
-#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS)
+#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
struct physid_mask {
unsigned long mask[PHYSID_ARRAY_SIZE];
@@ -122,31 +123,31 @@ typedef struct physid_mask physid_mask_t;
test_and_set_bit(physid, (map).mask)
#define physids_and(dst, src1, src2) \
- bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+ bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
#define physids_or(dst, src1, src2) \
- bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
+ bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
#define physids_clear(map) \
- bitmap_zero((map).mask, MAX_APICS)
+ bitmap_zero((map).mask, MAX_LOCAL_APIC)
#define physids_complement(dst, src) \
- bitmap_complement((dst).mask, (src).mask, MAX_APICS)
+ bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC)
#define physids_empty(map) \
- bitmap_empty((map).mask, MAX_APICS)
+ bitmap_empty((map).mask, MAX_LOCAL_APIC)
#define physids_equal(map1, map2) \
- bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
+ bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC)
#define physids_weight(map) \
- bitmap_weight((map).mask, MAX_APICS)
+ bitmap_weight((map).mask, MAX_LOCAL_APIC)
#define physids_shift_right(d, s, n) \
- bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
+ bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC)
#define physids_shift_left(d, s, n) \
- bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
+ bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC)
static inline unsigned long physids_coerce(physid_mask_t *map)
{
@@ -159,14 +160,6 @@ static inline void physids_promote(unsigned long physids, physid_mask_t *map)
map->mask[0] = physids;
}
-/* Note: will create very large stack frames if physid_mask_t is big */
-#define physid_mask_of_physid(physid) \
- ({ \
- physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
- physid_set(physid, __physid_mask); \
- __physid_mask; \
- })
-
static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
{
physids_clear(*map);
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
index 4a7f96d7c188..c0a955a9a087 100644
--- a/arch/x86/include/asm/mpspec_def.h
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -15,13 +15,6 @@
#ifdef CONFIG_X86_32
# define MAX_MPC_ENTRY 1024
-# define MAX_APICS 256
-#else
-# if NR_CPUS <= 255
-# define MAX_APICS 255
-# else
-# define MAX_APICS 32768
-# endif
#endif
/* Intel MP Floating Pointer Structure */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 3ea3dc487047..4d0dfa0d998e 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -123,12 +123,16 @@
#define MSR_AMD64_IBSCTL 0xc001103a
#define MSR_AMD64_IBSBRTARGET 0xc001103b
+/* Fam 15h MSRs */
+#define MSR_F15H_PERF_CTL 0xc0010200
+#define MSR_F15H_PERF_CTR 0xc0010201
+
/* Fam 10h MSRs */
#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
#define FAM10H_MMIO_CONF_ENABLE (1<<0)
#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
-#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff
+#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL
#define FAM10H_MMIO_CONF_BASE_SHIFT 20
#define MSR_FAM10H_NODE_ID 0xc001100c
@@ -253,6 +257,18 @@
#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1)
#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24)
+/* Thermal Thresholds Support */
+#define THERM_INT_THRESHOLD0_ENABLE (1 << 15)
+#define THERM_SHIFT_THRESHOLD0 8
+#define THERM_MASK_THRESHOLD0 (0x7f << THERM_SHIFT_THRESHOLD0)
+#define THERM_INT_THRESHOLD1_ENABLE (1 << 23)
+#define THERM_SHIFT_THRESHOLD1 16
+#define THERM_MASK_THRESHOLD1 (0x7f << THERM_SHIFT_THRESHOLD1)
+#define THERM_STATUS_THRESHOLD0 (1 << 6)
+#define THERM_LOG_THRESHOLD0 (1 << 7)
+#define THERM_STATUS_THRESHOLD1 (1 << 8)
+#define THERM_LOG_THRESHOLD1 (1 << 9)
+
/* MISC_ENABLE bits: architectural */
#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 932f0f86b4b7..c76f5b92b840 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -5,41 +5,15 @@
#include <asm/irq.h>
#include <asm/io.h>
-#ifdef ARCH_HAS_NMI_WATCHDOG
-
-/**
- * do_nmi_callback
- *
- * Check to see if a callback exists and execute it. Return 1
- * if the handler exists and was handled successfully.
- */
-int do_nmi_callback(struct pt_regs *regs, int cpu);
+#ifdef CONFIG_X86_LOCAL_APIC
extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
-extern int check_nmi_watchdog(void);
-#if !defined(CONFIG_LOCKUP_DETECTOR)
-extern int nmi_watchdog_enabled;
-#endif
extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
extern int reserve_perfctr_nmi(unsigned int);
extern void release_perfctr_nmi(unsigned int);
extern int reserve_evntsel_nmi(unsigned int);
extern void release_evntsel_nmi(unsigned int);
-extern void setup_apic_nmi_watchdog(void *);
-extern void stop_apic_nmi_watchdog(void *);
-extern void disable_timer_nmi_watchdog(void);
-extern void enable_timer_nmi_watchdog(void);
-extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
-extern void cpu_nmi_set_wd_enabled(void);
-
-extern atomic_t nmi_active;
-extern unsigned int nmi_watchdog;
-#define NMI_NONE 0
-#define NMI_IO_APIC 1
-#define NMI_LOCAL_APIC 2
-#define NMI_INVALID 3
-
struct ctl_table;
extern int proc_nmi_enabled(struct ctl_table *, int ,
void __user *, size_t *, loff_t *);
@@ -47,33 +21,28 @@ extern int unknown_nmi_panic;
void arch_trigger_all_cpu_backtrace(void);
#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
+#endif
-static inline void localise_nmi_watchdog(void)
-{
- if (nmi_watchdog == NMI_IO_APIC)
- nmi_watchdog = NMI_LOCAL_APIC;
-}
+/*
+ * Define some priorities for the nmi notifier call chain.
+ *
+ * Create a local nmi bit that has a higher priority than
+ * external nmis, because the local ones are more frequent.
+ *
+ * Also setup some default high/normal/low settings for
+ * subsystems to registers with. Using 4 bits to seperate
+ * the priorities. This can go alot higher if needed be.
+ */
-/* check if nmi_watchdog is active (ie was specified at boot) */
-static inline int nmi_watchdog_active(void)
-{
- /*
- * actually it should be:
- * return (nmi_watchdog == NMI_LOCAL_APIC ||
- * nmi_watchdog == NMI_IO_APIC)
- * but since they are power of two we could use a
- * cheaper way --cvg
- */
- return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
-}
-#endif
+#define NMI_LOCAL_SHIFT 16 /* randomly picked */
+#define NMI_LOCAL_BIT (1ULL << NMI_LOCAL_SHIFT)
+#define NMI_HIGH_PRIOR (1ULL << 8)
+#define NMI_NORMAL_PRIOR (1ULL << 4)
+#define NMI_LOW_PRIOR (1ULL << 0)
+#define NMI_LOCAL_HIGH_PRIOR (NMI_LOCAL_BIT | NMI_HIGH_PRIOR)
+#define NMI_LOCAL_NORMAL_PRIOR (NMI_LOCAL_BIT | NMI_NORMAL_PRIOR)
+#define NMI_LOCAL_LOW_PRIOR (NMI_LOCAL_BIT | NMI_LOW_PRIOR)
-void lapic_watchdog_stop(void);
-int lapic_watchdog_init(unsigned nmi_hz);
-int lapic_wd_event(unsigned nmi_hz);
-unsigned lapic_adjust_nmi_hz(unsigned hz);
-void disable_lapic_nmi_watchdog(void);
-void enable_lapic_nmi_watchdog(void);
void stop_nmi(void);
void restart_nmi(void);
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index a37229011b56..b0ef2b449a9d 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_NUMA_32_H
#define _ASM_X86_NUMA_32_H
+extern int numa_off;
+
extern int pxm_to_nid(int pxm);
extern void numa_remove_cpu(int cpu);
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 823e070e7c26..0493be39607c 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -38,8 +38,9 @@ extern void __cpuinit numa_add_cpu(int cpu);
extern void __cpuinit numa_remove_cpu(int cpu);
#ifdef CONFIG_NUMA_EMU
-#define FAKE_NODE_MIN_SIZE ((u64)64 << 20)
+#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
+void numa_emu_cmdline(char *);
#endif /* CONFIG_NUMA_EMU */
#else
static inline void init_cpu_to_node(void) { }
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 42a978c0c1b3..f482010350fb 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -107,10 +107,14 @@ extern int olpc_ec_mask_unset(uint8_t bits);
/* GPIO assignments */
#define OLPC_GPIO_MIC_AC 1
-#define OLPC_GPIO_DCON_IRQ geode_gpio(7)
+#define OLPC_GPIO_DCON_STAT0 5
+#define OLPC_GPIO_DCON_STAT1 6
+#define OLPC_GPIO_DCON_IRQ 7
#define OLPC_GPIO_THRM_ALRM geode_gpio(10)
-#define OLPC_GPIO_SMB_CLK geode_gpio(14)
-#define OLPC_GPIO_SMB_DATA geode_gpio(15)
+#define OLPC_GPIO_DCON_LOAD 11
+#define OLPC_GPIO_DCON_BLANK 12
+#define OLPC_GPIO_SMB_CLK 14
+#define OLPC_GPIO_SMB_DATA 15
#define OLPC_GPIO_WORKAUX geode_gpio(24)
#define OLPC_GPIO_LID geode_gpio(26)
#define OLPC_GPIO_ECSCI geode_gpio(27)
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 2a8478140bb3..641988efe063 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -8,6 +8,8 @@
#ifdef CONFIG_OLPC_OPENFIRMWARE
+extern bool olpc_ofw_is_installed(void);
+
/* run an OFW command by calling into the firmware */
#define olpc_ofw(name, args, res) \
__olpc_ofw((name), ARRAY_SIZE(args), args, ARRAY_SIZE(res), res)
@@ -26,10 +28,17 @@ extern bool olpc_ofw_present(void);
#else /* !CONFIG_OLPC_OPENFIRMWARE */
+static inline bool olpc_ofw_is_installed(void) { return false; }
static inline void olpc_ofw_detect(void) { }
static inline void setup_olpc_ofw_pgd(void) { }
static inline bool olpc_ofw_present(void) { return false; }
#endif /* !CONFIG_OLPC_OPENFIRMWARE */
+#ifdef CONFIG_OLPC_OPENFIRMWARE_DT
+extern void olpc_dt_build_devicetree(void);
+#else
+static inline void olpc_dt_build_devicetree(void) { }
+#endif /* CONFIG_OLPC_OPENFIRMWARE_DT */
+
#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 18e3b8a8709f..ebbc4d8ab170 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -112,7 +112,7 @@ static inline void arch_safe_halt(void)
static inline void halt(void)
{
- PVOP_VCALL0(pv_irq_ops.safe_halt);
+ PVOP_VCALL0(pv_irq_ops.halt);
}
static inline void wbinvd(void)
@@ -435,6 +435,11 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr,
{
PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
}
+static inline void pmd_update(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp);
+}
static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
@@ -442,6 +447,12 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
}
+static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp);
+}
+
static inline pte_t __pte(pteval_t val)
{
pteval_t ret;
@@ -543,6 +554,19 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmd)
+{
+ if (sizeof(pmdval_t) > sizeof(long))
+ /* 5 arg words */
+ pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd);
+ else
+ PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp,
+ native_pmd_val(pmd));
+}
+#endif
+
static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
{
pmdval_t val = native_pmd_val(pmd);
@@ -824,27 +848,27 @@ static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
#define __PV_IS_CALLEE_SAVE(func) \
((struct paravirt_callee_save) { func })
-static inline unsigned long arch_local_save_flags(void)
+static inline notrace unsigned long arch_local_save_flags(void)
{
return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
}
-static inline void arch_local_irq_restore(unsigned long f)
+static inline notrace void arch_local_irq_restore(unsigned long f)
{
PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
}
-static inline void arch_local_irq_disable(void)
+static inline notrace void arch_local_irq_disable(void)
{
PVOP_VCALLEE0(pv_irq_ops.irq_disable);
}
-static inline void arch_local_irq_enable(void)
+static inline notrace void arch_local_irq_enable(void)
{
PVOP_VCALLEE0(pv_irq_ops.irq_enable);
}
-static inline unsigned long arch_local_irq_save(void)
+static inline notrace unsigned long arch_local_irq_save(void)
{
unsigned long f;
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index b82bac975250..82885099c869 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -265,10 +265,16 @@ struct pv_mmu_ops {
void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
+ void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp, pmd_t pmdval);
void (*pte_update)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
void (*pte_update_defer)(struct mm_struct *mm,
unsigned long addr, pte_t *ptep);
+ void (*pmd_update)(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp);
+ void (*pmd_update_defer)(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp);
pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
pte_t *ptep);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index ca0437c714b2..676129229630 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -65,6 +65,7 @@ extern unsigned long pci_mem_start;
#define PCIBIOS_MIN_CARDBUS_IO 0x4000
+extern int pcibios_enabled;
void pcibios_config_init(void);
struct pci_bus *pcibios_scan_root(int bus);
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index f899e01a8ac9..7e172955ee57 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -230,6 +230,125 @@ do { \
})
/*
+ * Add return operation
+ */
+#define percpu_add_return_op(var, val) \
+({ \
+ typeof(var) paro_ret__ = val; \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm("xaddb %0, "__percpu_arg(1) \
+ : "+q" (paro_ret__), "+m" (var) \
+ : : "memory"); \
+ break; \
+ case 2: \
+ asm("xaddw %0, "__percpu_arg(1) \
+ : "+r" (paro_ret__), "+m" (var) \
+ : : "memory"); \
+ break; \
+ case 4: \
+ asm("xaddl %0, "__percpu_arg(1) \
+ : "+r" (paro_ret__), "+m" (var) \
+ : : "memory"); \
+ break; \
+ case 8: \
+ asm("xaddq %0, "__percpu_arg(1) \
+ : "+re" (paro_ret__), "+m" (var) \
+ : : "memory"); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+ paro_ret__ += val; \
+ paro_ret__; \
+})
+
+/*
+ * xchg is implemented using cmpxchg without a lock prefix. xchg is
+ * expensive due to the implied lock prefix. The processor cannot prefetch
+ * cachelines if xchg is used.
+ */
+#define percpu_xchg_op(var, nval) \
+({ \
+ typeof(var) pxo_ret__; \
+ typeof(var) pxo_new__ = (nval); \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm("\n\tmov "__percpu_arg(1)",%%al" \
+ "\n1:\tcmpxchgb %2, "__percpu_arg(1) \
+ "\n\tjnz 1b" \
+ : "=&a" (pxo_ret__), "+m" (var) \
+ : "q" (pxo_new__) \
+ : "memory"); \
+ break; \
+ case 2: \
+ asm("\n\tmov "__percpu_arg(1)",%%ax" \
+ "\n1:\tcmpxchgw %2, "__percpu_arg(1) \
+ "\n\tjnz 1b" \
+ : "=&a" (pxo_ret__), "+m" (var) \
+ : "r" (pxo_new__) \
+ : "memory"); \
+ break; \
+ case 4: \
+ asm("\n\tmov "__percpu_arg(1)",%%eax" \
+ "\n1:\tcmpxchgl %2, "__percpu_arg(1) \
+ "\n\tjnz 1b" \
+ : "=&a" (pxo_ret__), "+m" (var) \
+ : "r" (pxo_new__) \
+ : "memory"); \
+ break; \
+ case 8: \
+ asm("\n\tmov "__percpu_arg(1)",%%rax" \
+ "\n1:\tcmpxchgq %2, "__percpu_arg(1) \
+ "\n\tjnz 1b" \
+ : "=&a" (pxo_ret__), "+m" (var) \
+ : "r" (pxo_new__) \
+ : "memory"); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+ pxo_ret__; \
+})
+
+/*
+ * cmpxchg has no such implied lock semantics as a result it is much
+ * more efficient for cpu local operations.
+ */
+#define percpu_cmpxchg_op(var, oval, nval) \
+({ \
+ typeof(var) pco_ret__; \
+ typeof(var) pco_old__ = (oval); \
+ typeof(var) pco_new__ = (nval); \
+ switch (sizeof(var)) { \
+ case 1: \
+ asm("cmpxchgb %2, "__percpu_arg(1) \
+ : "=a" (pco_ret__), "+m" (var) \
+ : "q" (pco_new__), "0" (pco_old__) \
+ : "memory"); \
+ break; \
+ case 2: \
+ asm("cmpxchgw %2, "__percpu_arg(1) \
+ : "=a" (pco_ret__), "+m" (var) \
+ : "r" (pco_new__), "0" (pco_old__) \
+ : "memory"); \
+ break; \
+ case 4: \
+ asm("cmpxchgl %2, "__percpu_arg(1) \
+ : "=a" (pco_ret__), "+m" (var) \
+ : "r" (pco_new__), "0" (pco_old__) \
+ : "memory"); \
+ break; \
+ case 8: \
+ asm("cmpxchgq %2, "__percpu_arg(1) \
+ : "=a" (pco_ret__), "+m" (var) \
+ : "r" (pco_new__), "0" (pco_old__) \
+ : "memory"); \
+ break; \
+ default: __bad_percpu_size(); \
+ } \
+ pco_ret__; \
+})
+
+/*
* percpu_read() makes gcc load the percpu variable every time it is
* accessed while percpu_read_stable() allows the value to be cached.
* percpu_read_stable() is more efficient and can be used if its value
@@ -267,6 +386,12 @@ do { \
#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
+/*
+ * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much
+ * faster than an xchg with forced lock semantics.
+ */
+#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
@@ -286,6 +411,9 @@ do { \
#define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
+#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
@@ -299,6 +427,29 @@ do { \
#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
+#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
+
+#ifndef CONFIG_M386
+#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
+#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
+#define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
+#define __this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+
+#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
+#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
+#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
+#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+
+#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#endif /* !CONFIG_M386 */
/*
* Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -311,6 +462,7 @@ do { \
#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
+#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
@@ -318,12 +470,16 @@ do { \
#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
+#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
+#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
-
+#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
#endif
/* This is not atomic against other CPUs -- CPU preemption needs to be off */
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 550e26b1dbb3..d9d4dae305f6 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -125,7 +125,6 @@ union cpuid10_edx {
#define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */
#ifdef CONFIG_PERF_EVENTS
-extern void init_hw_perf_events(void);
extern void perf_events_lapic_init(void);
#define PERF_EVENT_INDEX_OFFSET 0
@@ -156,7 +155,6 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
}
#else
-static inline void init_hw_perf_events(void) { }
static inline void perf_events_lapic_init(void) { }
#endif
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index a70cd216be5d..e2f6a99f14ab 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -20,6 +20,9 @@
#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
#define ARCH_P4_MAX_CCCR (18)
+#define ARCH_P4_CNTRVAL_BITS (40)
+#define ARCH_P4_CNTRVAL_MASK ((1ULL << ARCH_P4_CNTRVAL_BITS) - 1)
+
#define P4_ESCR_EVENT_MASK 0x7e000000U
#define P4_ESCR_EVENT_SHIFT 25
#define P4_ESCR_EVENTMASK_MASK 0x01fffe00U
@@ -744,14 +747,6 @@ enum P4_ESCR_EMASKS {
};
/*
- * P4 PEBS specifics (Replay Event only)
- *
- * Format (bits):
- * 0-6: metric from P4_PEBS_METRIC enum
- * 7 : reserved
- * 8 : reserved
- * 9-11 : reserved
- *
* Note we have UOP and PEBS bits reserved for now
* just in case if we will need them once
*/
@@ -788,5 +783,60 @@ enum P4_PEBS_METRIC {
P4_PEBS_METRIC__max
};
+/*
+ * Notes on internal configuration of ESCR+CCCR tuples
+ *
+ * Since P4 has quite the different architecture of
+ * performance registers in compare with "architectural"
+ * once and we have on 64 bits to keep configuration
+ * of performance event, the following trick is used.
+ *
+ * 1) Since both ESCR and CCCR registers have only low
+ * 32 bits valuable, we pack them into a single 64 bit
+ * configuration. Low 32 bits of such config correspond
+ * to low 32 bits of CCCR register and high 32 bits
+ * correspond to low 32 bits of ESCR register.
+ *
+ * 2) The meaning of every bit of such config field can
+ * be found in Intel SDM but it should be noted that
+ * we "borrow" some reserved bits for own usage and
+ * clean them or set to a proper value when we do
+ * a real write to hardware registers.
+ *
+ * 3) The format of bits of config is the following
+ * and should be either 0 or set to some predefined
+ * values:
+ *
+ * Low 32 bits
+ * -----------
+ * 0-6: P4_PEBS_METRIC enum
+ * 7-11: reserved
+ * 12: reserved (Enable)
+ * 13-15: reserved (ESCR select)
+ * 16-17: Active Thread
+ * 18: Compare
+ * 19: Complement
+ * 20-23: Threshold
+ * 24: Edge
+ * 25: reserved (FORCE_OVF)
+ * 26: reserved (OVF_PMI_T0)
+ * 27: reserved (OVF_PMI_T1)
+ * 28-29: reserved
+ * 30: reserved (Cascade)
+ * 31: reserved (OVF)
+ *
+ * High 32 bits
+ * ------------
+ * 0: reserved (T1_USR)
+ * 1: reserved (T1_OS)
+ * 2: reserved (T0_USR)
+ * 3: reserved (T0_OS)
+ * 4: Tag Enable
+ * 5-8: Tag Value
+ * 9-24: Event Mask (may use P4_ESCR_EMASK_BIT helper)
+ * 25-30: enum P4_EVENTS
+ * 31: reserved (HT thread)
+ */
+
#endif /* PERF_EVENT_P4_H */
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 271de94c3810..b4389a468fb6 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -92,7 +92,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
- unsigned long adddress)
+ unsigned long address)
{
___pmd_free_tlb(tlb, pmd);
}
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 2334982b339e..98391db840c6 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#endif
+#ifdef CONFIG_SMP
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
+{
+ return __pmd(xchg((pmdval_t *)xp, 0));
+}
+#else
+#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
+#endif
+
/*
* Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
* split up the 29 bits of offset into this range:
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 177b0165ea01..94b979d1b58d 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
#endif
+#ifdef CONFIG_SMP
+union split_pmd {
+ struct {
+ u32 pmd_low;
+ u32 pmd_high;
+ };
+ pmd_t pmd;
+};
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
+{
+ union split_pmd res, *orig = (union split_pmd *)pmdp;
+
+ /* xchg acts as a barrier before setting of the high bits */
+ res.pmd_low = xchg(&orig->pmd_low, 0);
+ res.pmd_high = orig->pmd_high;
+ orig->pmd_high = 0;
+
+ return res.pmd;
+}
+#else
+#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
+#endif
+
/*
* Bits 0, 6 and 7 are taken in the low part of the pte,
* put the 32 bits of offset into the high part.
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index ada823a13c7c..18601c86fab1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -35,6 +35,7 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
#else /* !CONFIG_PARAVIRT */
#define set_pte(ptep, pte) native_set_pte(ptep, pte)
#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
+#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd)
#define set_pte_atomic(ptep, pte) \
native_set_pte_atomic(ptep, pte)
@@ -59,6 +60,8 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page);
#define pte_update(mm, addr, ptep) do { } while (0)
#define pte_update_defer(mm, addr, ptep) do { } while (0)
+#define pmd_update(mm, addr, ptep) do { } while (0)
+#define pmd_update_defer(mm, addr, ptep) do { } while (0)
#define pgd_val(x) native_pgd_val(x)
#define __pgd(x) native_make_pgd(x)
@@ -94,6 +97,11 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
}
+static inline int pmd_young(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_ACCESSED;
+}
+
static inline int pte_write(pte_t pte)
{
return pte_flags(pte) & _PAGE_RW;
@@ -142,6 +150,23 @@ static inline int pmd_large(pmd_t pte)
(_PAGE_PSE | _PAGE_PRESENT);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int pmd_trans_splitting(pmd_t pmd)
+{
+ return pmd_val(pmd) & _PAGE_SPLITTING;
+}
+
+static inline int pmd_trans_huge(pmd_t pmd)
+{
+ return pmd_val(pmd) & _PAGE_PSE;
+}
+
+static inline int has_transparent_hugepage(void)
+{
+ return cpu_has_pse;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
{
pteval_t v = native_pte_val(pte);
@@ -216,6 +241,55 @@ static inline pte_t pte_mkspecial(pte_t pte)
return pte_set_flags(pte, _PAGE_SPECIAL);
}
+static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ return __pmd(v | set);
+}
+
+static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
+{
+ pmdval_t v = native_pmd_val(pmd);
+
+ return __pmd(v & ~clear);
+}
+
+static inline pmd_t pmd_mkold(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_wrprotect(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_RW);
+}
+
+static inline pmd_t pmd_mkdirty(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_DIRTY);
+}
+
+static inline pmd_t pmd_mkhuge(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_PSE);
+}
+
+static inline pmd_t pmd_mkyoung(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_ACCESSED);
+}
+
+static inline pmd_t pmd_mkwrite(pmd_t pmd)
+{
+ return pmd_set_flags(pmd, _PAGE_RW);
+}
+
+static inline pmd_t pmd_mknotpresent(pmd_t pmd)
+{
+ return pmd_clear_flags(pmd, _PAGE_PRESENT);
+}
+
/*
* Mask out unsupported bits in a present pgprot. Non-present pgprots
* can use those bits for other purposes, so leave them be.
@@ -256,6 +330,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
return __pte(val);
}
+static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
+{
+ pmdval_t val = pmd_val(pmd);
+
+ val &= _HPAGE_CHG_MASK;
+ val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
+
+ return __pmd(val);
+}
+
/* mprotect needs to preserve PAT bits when updating vm_page_prot */
#define pgprot_modify pgprot_modify
static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
@@ -350,7 +434,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
* Currently stuck as a macro due to indirect forward reference to
* linux/mmzone.h's __section_mem_map_addr() definition:
*/
-#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)
+#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
/*
* the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -524,12 +608,26 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
return res;
}
+static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
+{
+ pmd_t res = *pmdp;
+
+ native_pmd_clear(pmdp);
+ return res;
+}
+
static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep , pte_t pte)
{
native_set_pte(ptep, pte);
}
+static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp , pmd_t pmd)
+{
+ native_set_pmd(pmdp, pmd);
+}
+
#ifndef CONFIG_PARAVIRT
/*
* Rules for using pte_update - it must be called after any PTE update which
@@ -607,6 +705,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
#define flush_tlb_fix_spurious_fault(vma, address)
+#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
+
+#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
+extern int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty);
+
+#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
+extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
+extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp);
+
+
+#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
+extern void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp);
+
+#define __HAVE_ARCH_PMD_WRITE
+static inline int pmd_write(pmd_t pmd)
+{
+ return pmd_flags(pmd) & _PAGE_RW;
+}
+
+#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
+static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmdp)
+{
+ pmd_t pmd = native_pmdp_get_and_clear(pmdp);
+ pmd_update(mm, addr, pmdp);
+ return pmd;
+}
+
+#define __HAVE_ARCH_PMDP_SET_WRPROTECT
+static inline void pmdp_set_wrprotect(struct mm_struct *mm,
+ unsigned long addr, pmd_t *pmdp)
+{
+ clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
+ pmd_update(mm, addr, pmdp);
+}
+
/*
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
*
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index f86da20347f2..975f709e09ae 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -59,6 +59,16 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
native_set_pte(ptep, pte);
}
+static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ *pmdp = pmd;
+}
+
+static inline void native_pmd_clear(pmd_t *pmd)
+{
+ native_set_pmd(pmd, native_make_pmd(0));
+}
+
static inline pte_t native_ptep_get_and_clear(pte_t *xp)
{
#ifdef CONFIG_SMP
@@ -72,14 +82,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
#endif
}
-static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
+static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
{
- *pmdp = pmd;
-}
-
-static inline void native_pmd_clear(pmd_t *pmd)
-{
- native_set_pmd(pmd, native_make_pmd(0));
+#ifdef CONFIG_SMP
+ return native_make_pmd(xchg(&xp->pmd, 0));
+#else
+ /* native_local_pmdp_get_and_clear,
+ but duplicated because of cyclic dependency */
+ pmd_t ret = *xp;
+ native_pmd_clear(xp);
+ return ret;
+#endif
}
static inline void native_set_pud(pud_t *pudp, pud_t pud)
@@ -168,6 +181,7 @@ extern void cleanup_highmap(void);
#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
#define __HAVE_ARCH_PTE_SAME
+
#endif /* !__ASSEMBLY__ */
#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d1f4a760be23..7db7723d1f32 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -22,6 +22,7 @@
#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
+#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
/* If _PAGE_BIT_PRESENT is clear, we use these: */
@@ -45,6 +46,7 @@
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
#define __HAVE_ARCH_PTE_SPECIAL
#ifdef CONFIG_KMEMCHECK
@@ -70,6 +72,7 @@
/* Set of bits not changed in pte_modify */
#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
_PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
#define _PAGE_CACHE_WB (0)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index cae9c3cb95cf..45636cefa186 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -141,10 +141,9 @@ extern __u32 cpu_caps_set[NCAPINTS];
#ifdef CONFIG_SMP
DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
#define cpu_data(cpu) per_cpu(cpu_info, cpu)
-#define current_cpu_data __get_cpu_var(cpu_info)
#else
+#define cpu_info boot_cpu_data
#define cpu_data(cpu) boot_cpu_data
-#define current_cpu_data boot_cpu_data
#endif
extern const struct seq_operations cpuinfo_op;
@@ -762,10 +761,11 @@ extern void select_idle_routine(const struct cpuinfo_x86 *c);
extern void init_c1e_mask(void);
extern unsigned long boot_option_idle_override;
-extern unsigned long idle_halt;
-extern unsigned long idle_nomwait;
extern bool c1e_detected;
+enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
+ IDLE_POLL, IDLE_FORCE_MWAIT};
+
extern void enable_sep_cpu(void);
extern int sysenter_setup(void);
@@ -902,7 +902,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
/*
* The below -8 is to reserve 8 bytes on top of the ring0 stack.
* This is necessary to guarantee that the entire "struct pt_regs"
- * is accessable even if the CPU haven't stored the SS/ESP registers
+ * is accessible even if the CPU haven't stored the SS/ESP registers
* on the stack (interrupt gate does not save these registers
* when switching to the same priv ring).
* Therefore beware: accessing the ss/esp fields of the
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
new file mode 100644
index 000000000000..b4ec95f07518
--- /dev/null
+++ b/arch/x86/include/asm/prom.h
@@ -0,0 +1 @@
+/* dummy prom.h; here to make linux/of.h's #includes happy */
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 7f7e577a0e39..31d84acc1512 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -11,6 +11,7 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
struct pvclock_vcpu_time_info *vcpu,
struct timespec *ts);
+void pvclock_resume(void);
/*
* Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 1def60114906..6c22bf353f26 100644
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -48,7 +48,6 @@ static inline void __init smpboot_setup_io_apic(void)
setup_IO_APIC();
else {
nr_ioapics = 0;
- localise_nmi_watchdog();
}
#endif
}
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 2b16a2ad23dc..52b5c7ed3608 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -7,6 +7,7 @@
#define _ASM_X86_STACKTRACE_H
#include <linux/uaccess.h>
+#include <linux/ptrace.h>
extern int kstack_depth_to_print;
@@ -46,7 +47,7 @@ struct stacktrace_ops {
};
void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
+ unsigned long *stack,
const struct stacktrace_ops *ops, void *data);
#ifdef CONFIG_X86_32
@@ -57,13 +58,39 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
#endif
+#ifdef CONFIG_FRAME_POINTER
+static inline unsigned long
+stack_frame(struct task_struct *task, struct pt_regs *regs)
+{
+ unsigned long bp;
+
+ if (regs)
+ return regs->bp;
+
+ if (task == current) {
+ /* Grab bp right from our regs */
+ get_bp(bp);
+ return bp;
+ }
+
+ /* bp is the last reg pushed by switch_to */
+ return *(unsigned long *)task->thread.sp;
+}
+#else
+static inline unsigned long
+stack_frame(struct task_struct *task, struct pt_regs *regs)
+{
+ return 0;
+}
+#endif
+
extern void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl);
+ unsigned long *stack, char *log_lvl);
extern void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl);
+ unsigned long *sp, char *log_lvl);
extern unsigned int code_bytes;
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0e831059ac5a..f2b83bc7d784 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -47,14 +47,13 @@ enum {
INTERCEPT_MONITOR,
INTERCEPT_MWAIT,
INTERCEPT_MWAIT_COND,
+ INTERCEPT_XSETBV,
};
struct __attribute__ ((__packed__)) vmcb_control_area {
- u16 intercept_cr_read;
- u16 intercept_cr_write;
- u16 intercept_dr_read;
- u16 intercept_dr_write;
+ u32 intercept_cr;
+ u32 intercept_dr;
u32 intercept_exceptions;
u64 intercept;
u8 reserved_1[42];
@@ -81,14 +80,19 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
u32 event_inj_err;
u64 nested_cr3;
u64 lbr_ctl;
- u64 reserved_5;
+ u32 clean;
+ u32 reserved_5;
u64 next_rip;
- u8 reserved_6[816];
+ u8 insn_len;
+ u8 insn_bytes[15];
+ u8 reserved_6[800];
};
#define TLB_CONTROL_DO_NOTHING 0
#define TLB_CONTROL_FLUSH_ALL_ASID 1
+#define TLB_CONTROL_FLUSH_ASID 3
+#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
#define V_TPR_MASK 0x0f
@@ -204,19 +208,31 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
#define SVM_SELECTOR_CODE_MASK (1 << 3)
-#define INTERCEPT_CR0_MASK 1
-#define INTERCEPT_CR3_MASK (1 << 3)
-#define INTERCEPT_CR4_MASK (1 << 4)
-#define INTERCEPT_CR8_MASK (1 << 8)
-
-#define INTERCEPT_DR0_MASK 1
-#define INTERCEPT_DR1_MASK (1 << 1)
-#define INTERCEPT_DR2_MASK (1 << 2)
-#define INTERCEPT_DR3_MASK (1 << 3)
-#define INTERCEPT_DR4_MASK (1 << 4)
-#define INTERCEPT_DR5_MASK (1 << 5)
-#define INTERCEPT_DR6_MASK (1 << 6)
-#define INTERCEPT_DR7_MASK (1 << 7)
+#define INTERCEPT_CR0_READ 0
+#define INTERCEPT_CR3_READ 3
+#define INTERCEPT_CR4_READ 4
+#define INTERCEPT_CR8_READ 8
+#define INTERCEPT_CR0_WRITE (16 + 0)
+#define INTERCEPT_CR3_WRITE (16 + 3)
+#define INTERCEPT_CR4_WRITE (16 + 4)
+#define INTERCEPT_CR8_WRITE (16 + 8)
+
+#define INTERCEPT_DR0_READ 0
+#define INTERCEPT_DR1_READ 1
+#define INTERCEPT_DR2_READ 2
+#define INTERCEPT_DR3_READ 3
+#define INTERCEPT_DR4_READ 4
+#define INTERCEPT_DR5_READ 5
+#define INTERCEPT_DR6_READ 6
+#define INTERCEPT_DR7_READ 7
+#define INTERCEPT_DR0_WRITE (16 + 0)
+#define INTERCEPT_DR1_WRITE (16 + 1)
+#define INTERCEPT_DR2_WRITE (16 + 2)
+#define INTERCEPT_DR3_WRITE (16 + 3)
+#define INTERCEPT_DR4_WRITE (16 + 4)
+#define INTERCEPT_DR5_WRITE (16 + 5)
+#define INTERCEPT_DR6_WRITE (16 + 6)
+#define INTERCEPT_DR7_WRITE (16 + 7)
#define SVM_EVTINJ_VEC_MASK 0xff
@@ -246,6 +262,8 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
+#define SVM_EXITINFO_REG_MASK 0x0F
+
#define SVM_EXIT_READ_CR0 0x000
#define SVM_EXIT_READ_CR3 0x003
#define SVM_EXIT_READ_CR4 0x004
@@ -316,6 +334,7 @@ struct __attribute__ ((__packed__)) vmcb {
#define SVM_EXIT_MONITOR 0x08a
#define SVM_EXIT_MWAIT 0x08b
#define SVM_EXIT_MWAIT_COND 0x08c
+#define SVM_EXIT_XSETBV 0x08d
#define SVM_EXIT_NPF 0x400
#define SVM_EXIT_ERR -1
diff --git a/arch/x86/include/asm/system_64.h b/arch/x86/include/asm/system_64.h
deleted file mode 100644
index 1159e091ad09..000000000000
--- a/arch/x86/include/asm/system_64.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef _ASM_X86_SYSTEM_64_H
-#define _ASM_X86_SYSTEM_64_H
-
-#include <asm/segment.h>
-#include <asm/cmpxchg.h>
-
-
-static inline unsigned long read_cr8(void)
-{
- unsigned long cr8;
- asm volatile("movq %%cr8,%0" : "=r" (cr8));
- return cr8;
-}
-
-static inline void write_cr8(unsigned long val)
-{
- asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
-}
-
-#include <linux/irqflags.h>
-
-#endif /* _ASM_X86_SYSTEM_64_H */
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 5469630b27f5..fa7b9176b76c 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -10,12 +10,6 @@
unsigned long long native_sched_clock(void);
extern int recalibrate_cpu_khz(void);
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
-extern int timer_ack;
-#else
-# define timer_ack (0)
-#endif
-
extern int no_timer_check;
/* Accelerators for sched_clock()
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index f66cda56781d..0310da67307f 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void);
asmlinkage void stack_segment(void);
asmlinkage void general_protection(void);
asmlinkage void page_fault(void);
+asmlinkage void async_page_fault(void);
asmlinkage void spurious_interrupt_bug(void);
asmlinkage void coprocessor_error(void);
asmlinkage void alignment_check(void);
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 42d412fd8b02..ce1d54c8a433 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -26,20 +26,22 @@
* BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512,
* set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
*
- * We will use 31 sets, one for sending BAU messages from each of the 32
+ * We will use one set for sending BAU messages from each of the
* cpu's on the uvhub.
*
* TLB shootdown will use the first of the 8 descriptors of each set.
* Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
*/
+#define MAX_CPUS_PER_UVHUB 64
+#define MAX_CPUS_PER_SOCKET 32
+#define UV_ADP_SIZE 64 /* hardware-provided max. */
+#define UV_CPUS_PER_ACT_STATUS 32 /* hardware-provided max. */
#define UV_ITEMS_PER_DESCRIPTOR 8
/* the 'throttle' to prevent the hardware stay-busy bug */
#define MAX_BAU_CONCURRENT 3
-#define UV_CPUS_PER_ACT_STATUS 32
#define UV_ACT_STATUS_MASK 0x3
#define UV_ACT_STATUS_SIZE 2
-#define UV_ADP_SIZE 32
#define UV_DISTRIBUTION_SIZE 256
#define UV_SW_ACK_NPENDING 8
#define UV_NET_ENDPOINT_INTD 0x38
@@ -100,7 +102,6 @@
* number of destination side software ack resources
*/
#define DEST_NUM_RESOURCES 8
-#define MAX_CPUS_PER_NODE 32
/*
* completion statuses for sending a TLB flush message
*/
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index e969f691cbfd..a501741c2335 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -199,6 +199,8 @@ union uvh_apicid {
#define UVH_APICID 0x002D0E00L
#define UV_APIC_PNODE_SHIFT 6
+#define UV_APICID_HIBIT_MASK 0xffff0000
+
/* Local Bus from cpu's perspective */
#define LOCAL_BUS_BASE 0x1c00000
#define LOCAL_BUS_SIZE (4 * 1024 * 1024)
@@ -491,8 +493,10 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
}
}
+extern unsigned int uv_apicid_hibits;
static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode)
{
+ apicid |= uv_apicid_hibits;
return (1UL << UVH_IPI_INT_SEND_SHFT) |
((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
(mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index b2f2d2e05cec..20cafeac7455 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -5,7 +5,7 @@
*
* SGI UV MMR definitions
*
- * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
+ * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
*/
#ifndef _ASM_X86_UV_UV_MMRS_H
@@ -754,6 +754,23 @@ union uvh_lb_bau_sb_descriptor_base_u {
};
/* ========================================================================= */
+/* UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK */
+/* ========================================================================= */
+#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL
+#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x009f0
+
+#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0
+#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL
+
+union uvh_lb_target_physical_apic_id_mask_u {
+ unsigned long v;
+ struct uvh_lb_target_physical_apic_id_mask_s {
+ unsigned long bit_enables : 32; /* RW */
+ unsigned long rsvd_32_63 : 32; /* */
+ } s;
+};
+
+/* ========================================================================= */
/* UVH_NODE_ID */
/* ========================================================================= */
#define UVH_NODE_ID 0x0UL
@@ -806,6 +823,78 @@ union uvh_node_present_table_u {
};
/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL
+
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_rh_gam_alias210_overlay_config_0_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_overlay_config_0_mmr_s {
+ unsigned long rsvd_0_23: 24; /* */
+ unsigned long base : 8; /* RW */
+ unsigned long rsvd_32_47: 16; /* */
+ unsigned long m_alias : 5; /* RW */
+ unsigned long rsvd_53_62: 10; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL
+
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_rh_gam_alias210_overlay_config_1_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_overlay_config_1_mmr_s {
+ unsigned long rsvd_0_23: 24; /* */
+ unsigned long base : 8; /* RW */
+ unsigned long rsvd_32_47: 16; /* */
+ unsigned long m_alias : 5; /* RW */
+ unsigned long rsvd_53_62: 10; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
+/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL
+
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63
+#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL
+
+union uvh_rh_gam_alias210_overlay_config_2_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_alias210_overlay_config_2_mmr_s {
+ unsigned long rsvd_0_23: 24; /* */
+ unsigned long base : 8; /* RW */
+ unsigned long rsvd_32_47: 16; /* */
+ unsigned long m_alias : 5; /* RW */
+ unsigned long rsvd_53_62: 10; /* */
+ unsigned long enable : 1; /* RW */
+ } s;
+};
+
+/* ========================================================================= */
/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */
/* ========================================================================= */
#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
@@ -857,6 +946,29 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
};
/* ========================================================================= */
+/* UVH_RH_GAM_CONFIG_MMR */
+/* ========================================================================= */
+#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL
+
+#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
+#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
+#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
+#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
+#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12
+#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL
+
+union uvh_rh_gam_config_mmr_u {
+ unsigned long v;
+ struct uvh_rh_gam_config_mmr_s {
+ unsigned long m_skt : 6; /* RW */
+ unsigned long n_skt : 4; /* RW */
+ unsigned long rsvd_10_11: 2; /* */
+ unsigned long mmiol_cfg : 1; /* RW */
+ unsigned long rsvd_13_63: 51; /* */
+ } s;
+};
+
+/* ========================================================================= */
/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */
/* ========================================================================= */
#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
@@ -987,97 +1099,5 @@ union uvh_rtc1_int_config_u {
} s;
};
-/* ========================================================================= */
-/* UVH_SI_ADDR_MAP_CONFIG */
-/* ========================================================================= */
-#define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL
-
-#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_SHFT 0
-#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL
-#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT 8
-#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK 0x0000000000000f00UL
-
-union uvh_si_addr_map_config_u {
- unsigned long v;
- struct uvh_si_addr_map_config_s {
- unsigned long m_skt : 6; /* RW */
- unsigned long rsvd_6_7: 2; /* */
- unsigned long n_skt : 4; /* RW */
- unsigned long rsvd_12_63: 52; /* */
- } s;
-};
-
-/* ========================================================================= */
-/* UVH_SI_ALIAS0_OVERLAY_CONFIG */
-/* ========================================================================= */
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL
-
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63
-#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_si_alias0_overlay_config_u {
- unsigned long v;
- struct uvh_si_alias0_overlay_config_s {
- unsigned long rsvd_0_23: 24; /* */
- unsigned long base : 8; /* RW */
- unsigned long rsvd_32_47: 16; /* */
- unsigned long m_alias : 5; /* RW */
- unsigned long rsvd_53_62: 10; /* */
- unsigned long enable : 1; /* RW */
- } s;
-};
-
-/* ========================================================================= */
-/* UVH_SI_ALIAS1_OVERLAY_CONFIG */
-/* ========================================================================= */
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL
-
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63
-#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_si_alias1_overlay_config_u {
- unsigned long v;
- struct uvh_si_alias1_overlay_config_s {
- unsigned long rsvd_0_23: 24; /* */
- unsigned long base : 8; /* RW */
- unsigned long rsvd_32_47: 16; /* */
- unsigned long m_alias : 5; /* RW */
- unsigned long rsvd_53_62: 10; /* */
- unsigned long enable : 1; /* RW */
- } s;
-};
-
-/* ========================================================================= */
-/* UVH_SI_ALIAS2_OVERLAY_CONFIG */
-/* ========================================================================= */
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL
-
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63
-#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
-
-union uvh_si_alias2_overlay_config_u {
- unsigned long v;
- struct uvh_si_alias2_overlay_config_s {
- unsigned long rsvd_0_23: 24; /* */
- unsigned long base : 8; /* RW */
- unsigned long rsvd_32_47: 16; /* */
- unsigned long m_alias : 5; /* RW */
- unsigned long rsvd_53_62: 10; /* */
- unsigned long enable : 1; /* RW */
- } s;
-};
-
-#endif /* _ASM_X86_UV_UV_MMRS_H */
+#endif /* __ASM_UV_MMRS_X86_H__ */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9f0cbd987d50..84471b810460 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -66,15 +66,23 @@
#define PIN_BASED_NMI_EXITING 0x00000008
#define PIN_BASED_VIRTUAL_NMIS 0x00000020
+#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
+#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
#define VM_EXIT_SAVE_IA32_PAT 0x00040000
#define VM_EXIT_LOAD_IA32_PAT 0x00080000
+#define VM_EXIT_SAVE_IA32_EFER 0x00100000
+#define VM_EXIT_LOAD_IA32_EFER 0x00200000
+#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002
#define VM_ENTRY_IA32E_MODE 0x00000200
#define VM_ENTRY_SMM 0x00000400
#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
+#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
+#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
/* VMCS Encodings */
enum vmcs_field {
@@ -239,6 +247,7 @@ enum vmcs_field {
#define EXIT_REASON_TASK_SWITCH 9
#define EXIT_REASON_CPUID 10
#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
#define EXIT_REASON_INVLPG 14
#define EXIT_REASON_RDPMC 15
#define EXIT_REASON_RDTSC 16
@@ -296,6 +305,12 @@ enum vmcs_field {
#define GUEST_INTR_STATE_SMI 0x00000004
#define GUEST_INTR_STATE_NMI 0x00000008
+/* GUEST_ACTIVITY_STATE flags */
+#define GUEST_ACTIVITY_ACTIVE 0
+#define GUEST_ACTIVITY_HLT 1
+#define GUEST_ACTIVITY_SHUTDOWN 2
+#define GUEST_ACTIVITY_WAIT_SIPI 3
+
/*
* Exit Qualifications for MOV for Control Register Access
*/
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 396ff4cc8ed4..66d0fff1ee84 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -37,4 +37,39 @@
extern struct shared_info *HYPERVISOR_shared_info;
extern struct start_info *xen_start_info;
+#include <asm/processor.h>
+
+static inline uint32_t xen_cpuid_base(void)
+{
+ uint32_t base, eax, ebx, ecx, edx;
+ char signature[13];
+
+ for (base = 0x40000000; base < 0x40010000; base += 0x100) {
+ cpuid(base, &eax, &ebx, &ecx, &edx);
+ *(uint32_t *)(signature + 0) = ebx;
+ *(uint32_t *)(signature + 4) = ecx;
+ *(uint32_t *)(signature + 8) = edx;
+ signature[12] = 0;
+
+ if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
+ return base;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_XEN
+extern bool xen_hvm_need_lapic(void);
+
+static inline bool xen_x2apic_para_available(void)
+{
+ return xen_hvm_need_lapic();
+}
+#else
+static inline bool xen_x2apic_para_available(void)
+{
+ return (xen_cpuid_base() != 0);
+}
+#endif
+
#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index e8506c1f0c55..1c10c88ee4e1 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -61,9 +61,9 @@ DEFINE_GUEST_HANDLE(void);
#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
#endif
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
+#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT)
/* Maximum number of virtual CPUs in multi-processor guests. */
#define MAX_VIRT_CPUS 32
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
index 42a7e004ae5c..8413688b2571 100644
--- a/arch/x86/include/asm/xen/interface_32.h
+++ b/arch/x86/include/asm/xen/interface_32.h
@@ -32,6 +32,11 @@
/* And the trap vector is... */
#define TRAP_INSTR "int $0x82"
+#define __MACH2PHYS_VIRT_START 0xF5800000
+#define __MACH2PHYS_VIRT_END 0xF6800000
+
+#define __MACH2PHYS_SHIFT 2
+
/*
* Virtual addresses beyond this are not modifiable by guest OSes. The
* machine->physical mapping table starts at this address, read-only.
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
index 100d2662b97c..839a4811cf98 100644
--- a/arch/x86/include/asm/xen/interface_64.h
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -39,18 +39,7 @@
#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
-
-#ifndef HYPERVISOR_VIRT_START
-#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
-#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
-#endif
-
-#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
-#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
-#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
+#define __MACH2PHYS_SHIFT 3
/*
* int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index dd8c1414b3d5..f25bdf238a33 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/pfn.h>
+#include <linux/mm.h>
#include <asm/uaccess.h>
#include <asm/page.h>
@@ -35,10 +36,17 @@ typedef struct xpaddr {
#define MAX_DOMAIN_PAGES \
((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
+extern unsigned long *machine_to_phys_mapping;
+extern unsigned int machine_to_phys_order;
extern unsigned long get_phys_to_machine(unsigned long pfn);
extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+extern int m2p_add_override(unsigned long mfn, struct page *page);
+extern int m2p_remove_override(struct page *page);
+extern struct page *m2p_find_override(unsigned long mfn);
+extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
+
static inline unsigned long pfn_to_mfn(unsigned long pfn)
{
unsigned long mfn;
@@ -69,11 +77,6 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
if (xen_feature(XENFEAT_auto_translated_physmap))
return mfn;
-#if 0
- if (unlikely((mfn >> machine_to_phys_order) != 0))
- return max_mapnr;
-#endif
-
pfn = 0;
/*
* The array access can fail (e.g., device space beyond end of RAM).
@@ -82,6 +85,14 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
*/
__get_user(pfn, &machine_to_phys_mapping[mfn]);
+ /*
+ * If this appears to be a foreign mfn (because the pfn
+ * doesn't map back to the mfn), then check the local override
+ * table to see if there's a better pfn to use.
+ */
+ if (get_phys_to_machine(pfn) != mfn)
+ pfn = m2p_find_override_pfn(mfn, pfn);
+
return pfn;
}
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index f60153d5de57..34244b2cd880 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -45,6 +45,7 @@ obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
obj-y += tsc.o io_delay.o rtc.o
obj-y += pci-iommu_table.o
+obj-y += resource.o
obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-y += process.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 71232b941b6c..b3a71137983a 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -198,6 +198,11 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
{
unsigned int ver = 0;
+ if (id >= (MAX_LOCAL_APIC-1)) {
+ printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
+ return;
+ }
+
if (!enabled) {
++disabled_cpus;
return;
@@ -504,6 +509,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
return 0;
}
+EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
{
@@ -847,18 +853,6 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
* returns 0 on success, < 0 on error
*/
-static void __init acpi_register_lapic_address(unsigned long address)
-{
- mp_lapic_addr = address;
-
- set_fixmap_nocache(FIX_APIC_BASE, address);
- if (boot_cpu_physical_apicid == -1U) {
- boot_cpu_physical_apicid = read_apic_id();
- apic_version[boot_cpu_physical_apicid] =
- GET_APIC_VERSION(apic_read(APIC_LVR));
- }
-}
-
static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
{
int count;
@@ -880,7 +874,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
return count;
}
- acpi_register_lapic_address(acpi_lapic_addr);
+ register_lapic_address(acpi_lapic_addr);
return count;
}
@@ -907,16 +901,16 @@ static int __init acpi_parse_madt_lapic_entries(void)
return count;
}
- acpi_register_lapic_address(acpi_lapic_addr);
+ register_lapic_address(acpi_lapic_addr);
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
- acpi_parse_sapic, MAX_APICS);
+ acpi_parse_sapic, MAX_LOCAL_APIC);
if (!count) {
x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
- acpi_parse_x2apic, MAX_APICS);
+ acpi_parse_x2apic, MAX_LOCAL_APIC);
count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
- acpi_parse_lapic, MAX_APICS);
+ acpi_parse_lapic, MAX_LOCAL_APIC);
}
if (!count && !x2count) {
printk(KERN_ERR PREFIX "No LAPIC entries present\n");
@@ -949,32 +943,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
extern int es7000_plat;
#endif
-static void assign_to_mp_irq(struct mpc_intsrc *m,
- struct mpc_intsrc *mp_irq)
-{
- memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
-}
-
-static int mp_irq_cmp(struct mpc_intsrc *mp_irq,
- struct mpc_intsrc *m)
-{
- return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
-}
-
-static void save_mp_irq(struct mpc_intsrc *m)
-{
- int i;
-
- for (i = 0; i < mp_irq_entries; i++) {
- if (!mp_irq_cmp(&mp_irqs[i], m))
- return;
- }
-
- assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!!\n");
-}
-
void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
{
int ioapic;
@@ -1005,7 +973,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */
mp_irq.dstirq = pin; /* INTIN# */
- save_mp_irq(&mp_irq);
+ mp_save_irq(&mp_irq);
isa_irq_to_gsi[bus_irq] = gsi;
}
@@ -1080,7 +1048,7 @@ void __init mp_config_acpi_legacy_irqs(void)
mp_irq.srcbusirq = i; /* Identity mapped */
mp_irq.dstirq = pin;
- save_mp_irq(&mp_irq);
+ mp_save_irq(&mp_irq);
}
}
@@ -1117,7 +1085,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
mp_irq.dstapic = mp_ioapics[ioapic].apicid;
mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
- save_mp_irq(&mp_irq);
+ mp_save_irq(&mp_irq);
#endif
return 0;
}
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 5079f24c955a..123608531c8f 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -353,6 +353,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
mutex_unlock(&smp_alt);
}
+bool skip_smp_alternatives;
void alternatives_smp_switch(int smp)
{
struct smp_alt_module *mod;
@@ -368,7 +369,7 @@ void alternatives_smp_switch(int smp)
printk("lockdep: fixing up alternatives.\n");
#endif
- if (noreplace_smp || smp_alt_once)
+ if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
return;
BUG_ON(!smp && (num_online_cpus() > 1));
@@ -591,17 +592,21 @@ static atomic_t stop_machine_first;
static int wrote_text;
struct text_poke_params {
- void *addr;
- const void *opcode;
- size_t len;
+ struct text_poke_param *params;
+ int nparams;
};
static int __kprobes stop_machine_text_poke(void *data)
{
struct text_poke_params *tpp = data;
+ struct text_poke_param *p;
+ int i;
if (atomic_dec_and_test(&stop_machine_first)) {
- text_poke(tpp->addr, tpp->opcode, tpp->len);
+ for (i = 0; i < tpp->nparams; i++) {
+ p = &tpp->params[i];
+ text_poke(p->addr, p->opcode, p->len);
+ }
smp_wmb(); /* Make sure other cpus see that this has run */
wrote_text = 1;
} else {
@@ -610,8 +615,12 @@ static int __kprobes stop_machine_text_poke(void *data)
smp_mb(); /* Load wrote_text before following execution */
}
- flush_icache_range((unsigned long)tpp->addr,
- (unsigned long)tpp->addr + tpp->len);
+ for (i = 0; i < tpp->nparams; i++) {
+ p = &tpp->params[i];
+ flush_icache_range((unsigned long)p->addr,
+ (unsigned long)p->addr + p->len);
+ }
+
return 0;
}
@@ -631,10 +640,13 @@ static int __kprobes stop_machine_text_poke(void *data)
void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
{
struct text_poke_params tpp;
+ struct text_poke_param p;
- tpp.addr = addr;
- tpp.opcode = opcode;
- tpp.len = len;
+ p.addr = addr;
+ p.opcode = opcode;
+ p.len = len;
+ tpp.params = &p;
+ tpp.nparams = 1;
atomic_set(&stop_machine_first, 1);
wrote_text = 0;
/* Use __stop_machine() because the caller already got online_cpus. */
@@ -642,6 +654,26 @@ void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
return addr;
}
+/**
+ * text_poke_smp_batch - Update instructions on a live kernel on SMP
+ * @params: an array of text_poke parameters
+ * @n: the number of elements in params.
+ *
+ * Modify multi-byte instruction by using stop_machine() on SMP. Since the
+ * stop_machine() is heavy task, it is better to aggregate text_poke requests
+ * and do it once if possible.
+ *
+ * Note: Must be called under get_online_cpus() and text_mutex.
+ */
+void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
+{
+ struct text_poke_params tpp = {.params = params, .nparams = n};
+
+ atomic_set(&stop_machine_first, 1);
+ wrote_text = 0;
+ stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
+}
+
#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index d2fdb0826df2..57ca77787220 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1086,7 +1086,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
dma_dom->aperture_size += APERTURE_RANGE_SIZE;
- /* Intialize the exclusion range if necessary */
+ /* Initialize the exclusion range if necessary */
for_each_iommu(iommu) {
if (iommu->exclusion_start &&
iommu->exclusion_start >= dma_dom->aperture[index]->offset
@@ -1353,7 +1353,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
/*
* Allocates a new protection domain usable for the dma_ops functions.
- * It also intializes the page table and the address allocator data
+ * It also initializes the page table and the address allocator data
* structures required for the dma_ops interface
*/
static struct dma_ops_domain *dma_ops_domain_alloc(void)
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 8f6463d8ed0d..0a99f7198bc3 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -12,95 +12,123 @@
static u32 *flush_words;
-struct pci_device_id k8_nb_ids[] = {
+struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
{}
};
-EXPORT_SYMBOL(k8_nb_ids);
+EXPORT_SYMBOL(amd_nb_misc_ids);
-struct k8_northbridge_info k8_northbridges;
-EXPORT_SYMBOL(k8_northbridges);
+const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
+ { 0x00, 0x18, 0x20 },
+ { 0xff, 0x00, 0x20 },
+ { 0xfe, 0x00, 0x20 },
+ { }
+};
+
+struct amd_northbridge_info amd_northbridges;
+EXPORT_SYMBOL(amd_northbridges);
-static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
+static struct pci_dev *next_northbridge(struct pci_dev *dev,
+ struct pci_device_id *ids)
{
do {
dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
if (!dev)
break;
- } while (!pci_match_id(&k8_nb_ids[0], dev));
+ } while (!pci_match_id(ids, dev));
return dev;
}
-int cache_k8_northbridges(void)
+int amd_cache_northbridges(void)
{
- int i;
- struct pci_dev *dev;
+ int i = 0;
+ struct amd_northbridge *nb;
+ struct pci_dev *misc;
- if (k8_northbridges.num)
+ if (amd_nb_num())
return 0;
- dev = NULL;
- while ((dev = next_k8_northbridge(dev)) != NULL)
- k8_northbridges.num++;
+ misc = NULL;
+ while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
+ i++;
- /* some CPU families (e.g. family 0x11) do not support GART */
- if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
- boot_cpu_data.x86 == 0x15)
- k8_northbridges.gart_supported = 1;
+ if (i == 0)
+ return 0;
- k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) *
- sizeof(void *), GFP_KERNEL);
- if (!k8_northbridges.nb_misc)
+ nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
+ if (!nb)
return -ENOMEM;
- if (!k8_northbridges.num) {
- k8_northbridges.nb_misc[0] = NULL;
- return 0;
- }
+ amd_northbridges.nb = nb;
+ amd_northbridges.num = i;
- if (k8_northbridges.gart_supported) {
- flush_words = kmalloc(k8_northbridges.num * sizeof(u32),
- GFP_KERNEL);
- if (!flush_words) {
- kfree(k8_northbridges.nb_misc);
- return -ENOMEM;
- }
- }
+ misc = NULL;
+ for (i = 0; i != amd_nb_num(); i++) {
+ node_to_amd_nb(i)->misc = misc =
+ next_northbridge(misc, amd_nb_misc_ids);
+ }
+
+ /* some CPU families (e.g. family 0x11) do not support GART */
+ if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
+ boot_cpu_data.x86 == 0x15)
+ amd_northbridges.flags |= AMD_NB_GART;
+
+ /*
+ * Some CPU families support L3 Cache Index Disable. There are some
+ * limitations because of E382 and E388 on family 0x10.
+ */
+ if (boot_cpu_data.x86 == 0x10 &&
+ boot_cpu_data.x86_model >= 0x8 &&
+ (boot_cpu_data.x86_model > 0x9 ||
+ boot_cpu_data.x86_mask >= 0x1))
+ amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
- dev = NULL;
- i = 0;
- while ((dev = next_k8_northbridge(dev)) != NULL) {
- k8_northbridges.nb_misc[i] = dev;
- if (k8_northbridges.gart_supported)
- pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
- }
- k8_northbridges.nb_misc[i] = NULL;
return 0;
}
-EXPORT_SYMBOL_GPL(cache_k8_northbridges);
+EXPORT_SYMBOL_GPL(amd_cache_northbridges);
/* Ignores subdevice/subvendor but as far as I can figure out
they're useless anyways */
-int __init early_is_k8_nb(u32 device)
+int __init early_is_amd_nb(u32 device)
{
struct pci_device_id *id;
u32 vendor = device & 0xffff;
device >>= 16;
- for (id = k8_nb_ids; id->vendor; id++)
+ for (id = amd_nb_misc_ids; id->vendor; id++)
if (vendor == id->vendor && device == id->device)
return 1;
return 0;
}
-void k8_flush_garts(void)
+int amd_cache_gart(void)
+{
+ int i;
+
+ if (!amd_nb_has_feature(AMD_NB_GART))
+ return 0;
+
+ flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
+ if (!flush_words) {
+ amd_northbridges.flags &= ~AMD_NB_GART;
+ return -ENOMEM;
+ }
+
+ for (i = 0; i != amd_nb_num(); i++)
+ pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+ &flush_words[i]);
+
+ return 0;
+}
+
+void amd_flush_garts(void)
{
int flushed, i;
unsigned long flags;
static DEFINE_SPINLOCK(gart_lock);
- if (!k8_northbridges.gart_supported)
+ if (!amd_nb_has_feature(AMD_NB_GART))
return;
/* Avoid races between AGP and IOMMU. In theory it's not needed
@@ -109,16 +137,16 @@ void k8_flush_garts(void)
that it doesn't matter to serialize more. -AK */
spin_lock_irqsave(&gart_lock, flags);
flushed = 0;
- for (i = 0; i < k8_northbridges.num; i++) {
- pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c,
- flush_words[i]|1);
+ for (i = 0; i < amd_nb_num(); i++) {
+ pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
+ flush_words[i] | 1);
flushed++;
}
- for (i = 0; i < k8_northbridges.num; i++) {
+ for (i = 0; i < amd_nb_num(); i++) {
u32 w;
/* Make sure the hardware actually executed the flush*/
for (;;) {
- pci_read_config_dword(k8_northbridges.nb_misc[i],
+ pci_read_config_dword(node_to_amd_nb(i)->misc,
0x9c, &w);
if (!(w & 1))
break;
@@ -129,19 +157,23 @@ void k8_flush_garts(void)
if (!flushed)
printk("nothing to flush?\n");
}
-EXPORT_SYMBOL_GPL(k8_flush_garts);
+EXPORT_SYMBOL_GPL(amd_flush_garts);
-static __init int init_k8_nbs(void)
+static __init int init_amd_nbs(void)
{
int err = 0;
- err = cache_k8_northbridges();
+ err = amd_cache_northbridges();
if (err < 0)
- printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
+ printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n");
+
+ if (amd_cache_gart() < 0)
+ printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, "
+ "GART support disabled.\n");
return err;
}
/* This has to go after the PCI subsystem */
-fs_initcall(init_k8_nbs);
+fs_initcall(init_amd_nbs);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index afc406498c9d..671d5aad7a0c 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -313,14 +313,16 @@ static void apbt_setup_irq(struct apbt_dev *adev)
if (adev->irq == 0)
return;
+ irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
+ irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
+ /* APB timer irqs are set up as mp_irqs, timer is edge type */
+ __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
+
if (system_state == SYSTEM_BOOTING) {
- irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
- irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
- /* APB timer irqs are set up as mp_irqs, timer is edge type */
- __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
if (request_irq(adev->irq, apbt_interrupt_handler,
- IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
- adev->name, adev)) {
+ IRQF_TIMER | IRQF_DISABLED |
+ IRQF_NOBALANCING,
+ adev->name, adev)) {
printk(KERN_ERR "Failed request IRQ for APBT%d\n",
adev->num);
}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b3a16e8f0703..5955a7800a96 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -39,18 +39,6 @@ int fallback_aper_force __initdata;
int fix_aperture __initdata = 1;
-struct bus_dev_range {
- int bus;
- int dev_base;
- int dev_limit;
-};
-
-static struct bus_dev_range bus_dev_ranges[] __initdata = {
- { 0x00, 0x18, 0x20},
- { 0xff, 0x00, 0x20},
- { 0xfe, 0x00, 0x20}
-};
-
static struct resource gart_resource = {
.name = "GART",
.flags = IORESOURCE_MEM,
@@ -206,7 +194,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
* Do an PCI bus scan by hand because we're running before the PCI
* subsystem.
*
- * All K8 AGP bridges are AGPv3 compliant, so we can do this scan
+ * All AMD AGP bridges are AGPv3 compliant, so we can do this scan
* generically. It's probably overkill to always scan all slots because
* the AGP bridges should be always an own bus on the HT hierarchy,
* but do it here for future safety.
@@ -294,16 +282,16 @@ void __init early_gart_iommu_check(void)
search_agp_bridge(&agp_aper_order, &valid_agp);
fix = 0;
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
- if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
@@ -349,16 +337,16 @@ void __init early_gart_iommu_check(void)
return;
/* disable them all at first */
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
- if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
@@ -390,17 +378,17 @@ int __init gart_iommu_hole_init(void)
fix = 0;
node = 0;
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus;
int dev_base, dev_limit;
u32 ctl;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
- if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
iommu_detected = 1;
@@ -505,7 +493,7 @@ out:
}
/* Fix up the north bridges */
- for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
+ for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
int bus, dev_base, dev_limit;
/*
@@ -514,11 +502,11 @@ out:
*/
u32 ctl = DISTLBWALKPRB | aper_order << 1;
- bus = bus_dev_ranges[i].bus;
- dev_base = bus_dev_ranges[i].dev_base;
- dev_limit = bus_dev_ranges[i].dev_limit;
+ bus = amd_nb_bus_dev_ranges[i].bus;
+ dev_base = amd_nb_bus_dev_ranges[i].dev_base;
+ dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
for (slot = dev_base; slot < dev_limit; slot++) {
- if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
+ if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
continue;
write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 910f20b457c4..3966b564ea47 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -3,10 +3,7 @@
#
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o
-ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
-obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
-endif
-obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o
+obj-y += hw_nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SMP) += ipi.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 850657d1b0ed..06c196d7e59c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -31,7 +31,6 @@
#include <linux/init.h>
#include <linux/cpu.h>
#include <linux/dmi.h>
-#include <linux/nmi.h>
#include <linux/smp.h>
#include <linux/mm.h>
@@ -50,9 +49,8 @@
#include <asm/mtrr.h>
#include <asm/smp.h>
#include <asm/mce.h>
-#include <asm/kvm_para.h>
#include <asm/tsc.h>
-#include <asm/atomic.h>
+#include <asm/hypervisor.h>
unsigned int num_processors;
@@ -433,17 +431,18 @@ int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
reserved = reserve_eilvt_offset(offset, new);
if (reserved != new) {
- pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but "
- "vector 0x%x was already reserved by another core, "
- "APIC%lX=0x%x\n",
- smp_processor_id(), new, reserved, reg, old);
+ pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+ "vector 0x%x, but the register is already in use for "
+ "vector 0x%x on another cpu\n",
+ smp_processor_id(), reg, offset, new, reserved);
return -EINVAL;
}
if (!eilvt_entry_is_changeable(old, new)) {
- pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but "
- "register already in use, APIC%lX=0x%x\n",
- smp_processor_id(), new, reg, old);
+ pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for "
+ "vector 0x%x, but the register is already in use for "
+ "vector 0x%x on this cpu\n",
+ smp_processor_id(), reg, offset, new, old);
return -EBUSY;
}
@@ -517,7 +516,7 @@ static void __cpuinit setup_APIC_timer(void)
{
struct clock_event_device *levt = &__get_cpu_var(lapic_events);
- if (cpu_has(&current_cpu_data, X86_FEATURE_ARAT)) {
+ if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) {
lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
/* Make LAPIC timer preferrable over percpu HPET */
lapic_clockevent.rating = 150;
@@ -685,7 +684,7 @@ static int __init calibrate_APIC_clock(void)
lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
lapic_clockevent.shift);
lapic_clockevent.max_delta_ns =
- clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
+ clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
lapic_clockevent.min_delta_ns =
clockevent_delta2ns(0xF, &lapic_clockevent);
@@ -800,11 +799,7 @@ void __init setup_boot_APIC_clock(void)
* PIT/HPET going. Otherwise register lapic as a dummy
* device.
*/
- if (nmi_watchdog != NMI_IO_APIC)
- lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
- else
- pr_warning("APIC timer registered as dummy,"
- " due to nmi_watchdog=%d!\n", nmi_watchdog);
+ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
/* Setup the lapic or request the broadcast */
setup_APIC_timer();
@@ -1196,12 +1191,15 @@ static void __cpuinit lapic_setup_esr(void)
oldvalue, value);
}
-
/**
* setup_local_APIC - setup the local APIC
+ *
+ * Used to setup local APIC while initializing BSP or bringin up APs.
+ * Always called with preemption disabled.
*/
void __cpuinit setup_local_APIC(void)
{
+ int cpu = smp_processor_id();
unsigned int value, queued;
int i, j, acked = 0;
unsigned long long tsc = 0, ntsc;
@@ -1226,8 +1224,6 @@ void __cpuinit setup_local_APIC(void)
#endif
perf_events_lapic_init();
- preempt_disable();
-
/*
* Double-check whether this APIC is really registered.
* This is meaningless in clustered apic mode, so we skip it.
@@ -1343,21 +1339,19 @@ void __cpuinit setup_local_APIC(void)
* TODO: set up through-local-APIC from through-I/O-APIC? --macro
*/
value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
- if (!smp_processor_id() && (pic_mode || !value)) {
+ if (!cpu && (pic_mode || !value)) {
value = APIC_DM_EXTINT;
- apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
- smp_processor_id());
+ apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu);
} else {
value = APIC_DM_EXTINT | APIC_LVT_MASKED;
- apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
- smp_processor_id());
+ apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu);
}
apic_write(APIC_LVT0, value);
/*
* only the BP should see the LINT1 NMI signal, obviously.
*/
- if (!smp_processor_id())
+ if (!cpu)
value = APIC_DM_NMI;
else
value = APIC_DM_NMI | APIC_LVT_MASKED;
@@ -1365,11 +1359,9 @@ void __cpuinit setup_local_APIC(void)
value |= APIC_LVT_LEVEL_TRIGGER;
apic_write(APIC_LVT1, value);
- preempt_enable();
-
#ifdef CONFIG_X86_MCE_INTEL
/* Recheck CMCI information after local APIC is up on CPU #0 */
- if (smp_processor_id() == 0)
+ if (!cpu)
cmci_recheck();
#endif
}
@@ -1388,8 +1380,15 @@ void __cpuinit end_local_APIC_setup(void)
}
#endif
- setup_apic_nmi_watchdog(NULL);
apic_pm_activate();
+
+ /*
+ * Now that local APIC setup is completed for BP, configure the fault
+ * handling for interrupt remapping.
+ */
+ if (!smp_processor_id() && intr_remapping_enabled)
+ enable_drhd_fault_handling();
+
}
#ifdef CONFIG_X86_X2APIC
@@ -1477,7 +1476,8 @@ void __init enable_IR_x2apic(void)
/* IR is required if there is APIC ID > 255 even when running
* under KVM
*/
- if (max_physical_apicid > 255 || !kvm_para_available())
+ if (max_physical_apicid > 255 ||
+ !hypervisor_x2apic_available())
goto nox2apic;
/*
* without IR all CPUs can be addressed by IOAPIC/MSI
@@ -1531,13 +1531,60 @@ static int __init detect_init_APIC(void)
return 0;
}
#else
+
+static int apic_verify(void)
+{
+ u32 features, h, l;
+
+ /*
+ * The APIC feature bit should now be enabled
+ * in `cpuid'
+ */
+ features = cpuid_edx(1);
+ if (!(features & (1 << X86_FEATURE_APIC))) {
+ pr_warning("Could not enable APIC!\n");
+ return -1;
+ }
+ set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
+ mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
+
+ /* The BIOS may have set up the APIC at some other address */
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (l & MSR_IA32_APICBASE_ENABLE)
+ mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
+
+ pr_info("Found and enabled local APIC!\n");
+ return 0;
+}
+
+int apic_force_enable(void)
+{
+ u32 h, l;
+
+ if (disable_apic)
+ return -1;
+
+ /*
+ * Some BIOSes disable the local APIC in the APIC_BASE
+ * MSR. This can only be done in software for Intel P6 or later
+ * and AMD K7 (Model > 1) or later.
+ */
+ rdmsr(MSR_IA32_APICBASE, l, h);
+ if (!(l & MSR_IA32_APICBASE_ENABLE)) {
+ pr_info("Local APIC disabled by BIOS -- reenabling.\n");
+ l &= ~MSR_IA32_APICBASE_BASE;
+ l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
+ wrmsr(MSR_IA32_APICBASE, l, h);
+ enabled_via_apicbase = 1;
+ }
+ return apic_verify();
+}
+
/*
* Detect and initialize APIC
*/
static int __init detect_init_APIC(void)
{
- u32 h, l, features;
-
/* Disabled by kernel option? */
if (disable_apic)
return -1;
@@ -1567,38 +1614,12 @@ static int __init detect_init_APIC(void)
"you can enable it with \"lapic\"\n");
return -1;
}
- /*
- * Some BIOSes disable the local APIC in the APIC_BASE
- * MSR. This can only be done in software for Intel P6 or later
- * and AMD K7 (Model > 1) or later.
- */
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (!(l & MSR_IA32_APICBASE_ENABLE)) {
- pr_info("Local APIC disabled by BIOS -- reenabling.\n");
- l &= ~MSR_IA32_APICBASE_BASE;
- l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
- wrmsr(MSR_IA32_APICBASE, l, h);
- enabled_via_apicbase = 1;
- }
- }
- /*
- * The APIC feature bit should now be enabled
- * in `cpuid'
- */
- features = cpuid_edx(1);
- if (!(features & (1 << X86_FEATURE_APIC))) {
- pr_warning("Could not enable APIC!\n");
- return -1;
+ if (apic_force_enable())
+ return -1;
+ } else {
+ if (apic_verify())
+ return -1;
}
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
- mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-
- /* The BIOS may have set up the APIC at some other address */
- rdmsr(MSR_IA32_APICBASE, l, h);
- if (l & MSR_IA32_APICBASE_ENABLE)
- mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
-
- pr_info("Found and enabled local APIC!\n");
apic_pm_activate();
@@ -1610,28 +1631,6 @@ no_apic:
}
#endif
-#ifdef CONFIG_X86_64
-void __init early_init_lapic_mapping(void)
-{
- /*
- * If no local APIC can be found then go out
- * : it means there is no mpatable and MADT
- */
- if (!smp_found_config)
- return;
-
- set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
- apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
- APIC_BASE, mp_lapic_addr);
-
- /*
- * Fetch the APIC ID of the BSP in case we have a
- * default configuration (or the MP table is broken).
- */
- boot_cpu_physical_apicid = read_apic_id();
-}
-#endif
-
/**
* init_apic_mappings - initialize APIC mappings
*/
@@ -1657,10 +1656,7 @@ void __init init_apic_mappings(void)
* acpi_register_lapic_address()
*/
if (!acpi_lapic && !smp_found_config)
- set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
-
- apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
- APIC_BASE, apic_phys);
+ register_lapic_address(apic_phys);
}
/*
@@ -1682,11 +1678,27 @@ void __init init_apic_mappings(void)
}
}
+void __init register_lapic_address(unsigned long address)
+{
+ mp_lapic_addr = address;
+
+ if (!x2apic_mode) {
+ set_fixmap_nocache(FIX_APIC_BASE, address);
+ apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
+ APIC_BASE, mp_lapic_addr);
+ }
+ if (boot_cpu_physical_apicid == -1U) {
+ boot_cpu_physical_apicid = read_apic_id();
+ apic_version[boot_cpu_physical_apicid] =
+ GET_APIC_VERSION(apic_read(APIC_LVR));
+ }
+}
+
/*
* This initializes the IO-APIC and APIC hardware if this is
* a UP kernel.
*/
-int apic_version[MAX_APICS];
+int apic_version[MAX_LOCAL_APIC];
int __init APIC_init_uniprocessor(void)
{
@@ -1751,17 +1763,10 @@ int __init APIC_init_uniprocessor(void)
setup_IO_APIC();
else {
nr_ioapics = 0;
- localise_nmi_watchdog();
}
-#else
- localise_nmi_watchdog();
#endif
x86_init.timers.setup_percpu_clockev();
-#ifdef CONFIG_X86_64
- check_nmi_watchdog();
-#endif
-
return 0;
}
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index cefd6942f0e9..79fd43ca6f96 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -17,19 +17,31 @@
#include <linux/nmi.h>
#include <linux/module.h>
-/* For reliability, we're prepared to waste bits here. */
-static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
u64 hw_nmi_get_sample_period(void)
{
return (u64)(cpu_khz) * 1000 * 60;
}
+#endif
+
+#ifdef arch_trigger_all_cpu_backtrace
+/* For reliability, we're prepared to waste bits here. */
+static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
+
+/* "in progress" flag of arch_trigger_all_cpu_backtrace */
+static unsigned long backtrace_flag;
-#ifdef ARCH_HAS_NMI_WATCHDOG
void arch_trigger_all_cpu_backtrace(void)
{
int i;
+ if (test_and_set_bit(0, &backtrace_flag))
+ /*
+ * If there is already a trigger_all_cpu_backtrace() in progress
+ * (backtrace_flag == 1), don't output double cpu dump infos.
+ */
+ return;
+
cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
printk(KERN_INFO "sending NMI to all CPUs:\n");
@@ -41,6 +53,9 @@ void arch_trigger_all_cpu_backtrace(void)
break;
mdelay(1);
}
+
+ clear_bit(0, &backtrace_flag);
+ smp_mb__after_clear_bit();
}
static int __kprobes
@@ -49,11 +64,10 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
{
struct die_args *args = __args;
struct pt_regs *regs;
- int cpu = smp_processor_id();
+ int cpu;
switch (cmd) {
case DIE_NMI:
- case DIE_NMI_IPI:
break;
default:
@@ -61,6 +75,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
}
regs = args->regs;
+ cpu = smp_processor_id();
if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -80,7 +95,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
static __read_mostly struct notifier_block backtrace_notifier = {
.notifier_call = arch_trigger_all_cpu_backtrace_handler,
.next = NULL,
- .priority = 1
+ .priority = NMI_LOCAL_LOW_PRIOR,
};
static int __init register_trigger_all_cpu_backtrace(void)
@@ -90,18 +105,3 @@ static int __init register_trigger_all_cpu_backtrace(void)
}
early_initcall(register_trigger_all_cpu_backtrace);
#endif
-
-/* STUB calls to mimic old nmi_watchdog behaviour */
-#if defined(CONFIG_X86_LOCAL_APIC)
-unsigned int nmi_watchdog = NMI_NONE;
-EXPORT_SYMBOL(nmi_watchdog);
-void acpi_nmi_enable(void) { return; }
-void acpi_nmi_disable(void) { return; }
-#endif
-atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
-EXPORT_SYMBOL(nmi_active);
-int unknown_nmi_panic;
-void cpu_nmi_set_wd_enabled(void) { return; }
-void stop_apic_nmi_watchdog(void *unused) { return; }
-void setup_apic_nmi_watchdog(void *unused) { return; }
-int __init check_nmi_watchdog(void) { return 0; }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7cc0a721f628..697dc34b7b87 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -54,7 +54,6 @@
#include <asm/dma.h>
#include <asm/timer.h>
#include <asm/i8259.h>
-#include <asm/nmi.h>
#include <asm/msidef.h>
#include <asm/hypertransport.h>
#include <asm/setup.h>
@@ -126,6 +125,26 @@ static int __init parse_noapic(char *str)
}
early_param("noapic", parse_noapic);
+/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
+void mp_save_irq(struct mpc_intsrc *m)
+{
+ int i;
+
+ apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
+ " IRQ %02x, APIC ID %x, APIC INT %02x\n",
+ m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
+ m->srcbusirq, m->dstapic, m->dstirq);
+
+ for (i = 0; i < mp_irq_entries; i++) {
+ if (!memcmp(&mp_irqs[i], m, sizeof(*m)))
+ return;
+ }
+
+ memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m));
+ if (++mp_irq_entries == MAX_IRQ_SOURCES)
+ panic("Max # of irq sources exceeded!!\n");
+}
+
struct irq_pin_list {
int apic, pin;
struct irq_pin_list *next;
@@ -136,6 +155,7 @@ static struct irq_pin_list *alloc_irq_pin_list(int node)
return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
}
+
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
#ifdef CONFIG_SPARSE_IRQ
static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
@@ -1934,8 +1954,7 @@ void disable_IO_APIC(void)
*
* by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
*/
-
-void __init setup_ioapic_ids_from_mpc(void)
+void __init setup_ioapic_ids_from_mpc_nocheck(void)
{
union IO_APIC_reg_00 reg_00;
physid_mask_t phys_id_present_map;
@@ -1944,15 +1963,6 @@ void __init setup_ioapic_ids_from_mpc(void)
unsigned char old_id;
unsigned long flags;
- if (acpi_ioapic)
- return;
- /*
- * Don't check I/O APIC IDs for xAPIC systems. They have
- * no meaning without the serial APIC bus.
- */
- if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
- || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- return;
/*
* This is broken; anything with a real cpu count has to
* circumvent this idiocy regardless.
@@ -2006,7 +2016,6 @@ void __init setup_ioapic_ids_from_mpc(void)
physids_or(phys_id_present_map, phys_id_present_map, tmp);
}
-
/*
* We need to adjust the IRQ routing table
* if the ID changed.
@@ -2018,9 +2027,12 @@ void __init setup_ioapic_ids_from_mpc(void)
= mp_ioapics[apic_id].apicid;
/*
- * Read the right value from the MPC table and
- * write it into the ID register.
+ * Update the ID register according to the right value
+ * from the MPC table if they are different.
*/
+ if (mp_ioapics[apic_id].apicid == reg_00.bits.ID)
+ continue;
+
apic_printk(APIC_VERBOSE, KERN_INFO
"...changing IO-APIC physical APIC ID to %d ...",
mp_ioapics[apic_id].apicid);
@@ -2042,6 +2054,21 @@ void __init setup_ioapic_ids_from_mpc(void)
apic_printk(APIC_VERBOSE, " ok.\n");
}
}
+
+void __init setup_ioapic_ids_from_mpc(void)
+{
+
+ if (acpi_ioapic)
+ return;
+ /*
+ * Don't check I/O APIC IDs for xAPIC systems. They have
+ * no meaning without the serial APIC bus.
+ */
+ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
+ return;
+ setup_ioapic_ids_from_mpc_nocheck();
+}
#endif
int no_timer_check __initdata;
@@ -2302,7 +2329,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
unsigned int irr;
struct irq_desc *desc;
struct irq_cfg *cfg;
- irq = __get_cpu_var(vector_irq)[vector];
+ irq = __this_cpu_read(vector_irq[vector]);
if (irq == -1)
continue;
@@ -2336,7 +2363,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
goto unlock;
}
- __get_cpu_var(vector_irq)[vector] = -1;
+ __this_cpu_write(vector_irq[vector], -1);
unlock:
raw_spin_unlock(&desc->lock);
}
@@ -2430,13 +2457,12 @@ static void ack_apic_level(struct irq_data *data)
{
struct irq_cfg *cfg = data->chip_data;
int i, do_unmask_irq = 0, irq = data->irq;
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long v;
irq_complete_move(cfg);
#ifdef CONFIG_GENERIC_PENDING_IRQ
/* If we are moving the irq we need to mask it */
- if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
+ if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
do_unmask_irq = 1;
mask_ioapic(cfg);
}
@@ -2643,24 +2669,6 @@ static void lapic_register_intr(int irq)
"edge");
}
-static void __init setup_nmi(void)
-{
- /*
- * Dirty trick to enable the NMI watchdog ...
- * We put the 8259A master into AEOI mode and
- * unmask on all local APICs LVT0 as NMI.
- *
- * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
- * is from Maciej W. Rozycki - so we do not have to EOI from
- * the NMI handler or the timer interrupt.
- */
- apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
-
- enable_NMI_through_LVT0();
-
- apic_printk(APIC_VERBOSE, " done.\n");
-}
-
/*
* This looks a bit hackish but it's about the only one way of sending
* a few INTA cycles to 8259As and any associated glue logic. ICR does
@@ -2766,15 +2774,6 @@ static inline void __init check_timer(void)
*/
apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
legacy_pic->init(1);
-#ifdef CONFIG_X86_32
- {
- unsigned int ver;
-
- ver = apic_read(APIC_LVR);
- ver = GET_APIC_VERSION(ver);
- timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
- }
-#endif
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
@@ -2822,10 +2821,6 @@ static inline void __init check_timer(void)
unmask_ioapic(cfg);
}
if (timer_irq_works()) {
- if (nmi_watchdog == NMI_IO_APIC) {
- setup_nmi();
- legacy_pic->unmask(0);
- }
if (disable_timer_pin_1 > 0)
clear_IO_APIC_pin(0, pin1);
goto out;
@@ -2851,11 +2846,6 @@ static inline void __init check_timer(void)
if (timer_irq_works()) {
apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
timer_through_8259 = 1;
- if (nmi_watchdog == NMI_IO_APIC) {
- legacy_pic->mask(0);
- setup_nmi();
- legacy_pic->unmask(0);
- }
goto out;
}
/*
@@ -2867,15 +2857,6 @@ static inline void __init check_timer(void)
apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
}
- if (nmi_watchdog == NMI_IO_APIC) {
- apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
- "through the IO-APIC - disabling NMI Watchdog!\n");
- nmi_watchdog = NMI_NONE;
- }
-#ifdef CONFIG_X86_32
- timer_ack = 0;
-#endif
-
apic_printk(APIC_QUIET, KERN_INFO
"...trying to set up timer as Virtual Wire IRQ...\n");
@@ -3413,6 +3394,7 @@ dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
msg.data |= MSI_DATA_VECTOR(cfg->vector);
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+ msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
dmar_msi_write(irq, &msg);
@@ -3639,7 +3621,7 @@ int __init io_apic_get_redir_entries (int ioapic)
return reg_01.bits.entries + 1;
}
-void __init probe_nr_irqs_gsi(void)
+static void __init probe_nr_irqs_gsi(void)
{
int nr;
@@ -3956,7 +3938,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
return res;
}
-void __init ioapic_init_mappings(void)
+void __init ioapic_and_gsi_init(void)
{
unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
struct resource *ioapic_res;
@@ -3994,6 +3976,8 @@ fake_ioapic_page:
ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
ioapic_res++;
}
+
+ probe_nr_irqs_gsi();
}
void __init ioapic_insert_resources(void)
@@ -4103,7 +4087,8 @@ void __init pre_init_apic_IRQ0(void)
printk(KERN_INFO "Early APIC setup for system timer0\n");
#ifndef CONFIG_SMP
- phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
+ physid_set_mask_of_physid(boot_cpu_physical_apicid,
+ &phys_cpu_present_map);
#endif
/* Make sure the irq descriptor is set up */
cfg = alloc_irq_and_cfg_at(0, 0);
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
deleted file mode 100644
index c90041ccb742..000000000000
--- a/arch/x86/kernel/apic/nmi.c
+++ /dev/null
@@ -1,567 +0,0 @@
-/*
- * NMI watchdog support on APIC systems
- *
- * Started by Ingo Molnar <mingo@redhat.com>
- *
- * Fixes:
- * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
- * Mikael Pettersson : Power Management for local APIC NMI watchdog.
- * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
- * Pavel Machek and
- * Mikael Pettersson : PM converted to driver model. Disable/enable API.
- */
-
-#include <asm/apic.h>
-
-#include <linux/nmi.h>
-#include <linux/mm.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/module.h>
-#include <linux/slab.h>
-#include <linux/sysdev.h>
-#include <linux/sysctl.h>
-#include <linux/percpu.h>
-#include <linux/kprobes.h>
-#include <linux/cpumask.h>
-#include <linux/kernel_stat.h>
-#include <linux/kdebug.h>
-#include <linux/smp.h>
-
-#include <asm/i8259.h>
-#include <asm/io_apic.h>
-#include <asm/proto.h>
-#include <asm/timer.h>
-
-#include <asm/mce.h>
-
-#include <asm/mach_traps.h>
-
-int unknown_nmi_panic;
-int nmi_watchdog_enabled;
-
-/* For reliability, we're prepared to waste bits here. */
-static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
-
-/* nmi_active:
- * >0: the lapic NMI watchdog is active, but can be disabled
- * <0: the lapic NMI watchdog has not been set up, and cannot
- * be enabled
- * 0: the lapic NMI watchdog is disabled, but can be enabled
- */
-atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
-EXPORT_SYMBOL(nmi_active);
-
-unsigned int nmi_watchdog = NMI_NONE;
-EXPORT_SYMBOL(nmi_watchdog);
-
-static int panic_on_timeout;
-
-static unsigned int nmi_hz = HZ;
-static DEFINE_PER_CPU(short, wd_enabled);
-static int endflag __initdata;
-
-static inline unsigned int get_nmi_count(int cpu)
-{
- return per_cpu(irq_stat, cpu).__nmi_count;
-}
-
-static inline int mce_in_progress(void)
-{
-#if defined(CONFIG_X86_MCE)
- return atomic_read(&mce_entry) > 0;
-#endif
- return 0;
-}
-
-/*
- * Take the local apic timer and PIT/HPET into account. We don't
- * know which one is active, when we have highres/dyntick on
- */
-static inline unsigned int get_timer_irqs(int cpu)
-{
- return per_cpu(irq_stat, cpu).apic_timer_irqs +
- per_cpu(irq_stat, cpu).irq0_irqs;
-}
-
-#ifdef CONFIG_SMP
-/*
- * The performance counters used by NMI_LOCAL_APIC don't trigger when
- * the CPU is idle. To make sure the NMI watchdog really ticks on all
- * CPUs during the test make them busy.
- */
-static __init void nmi_cpu_busy(void *data)
-{
- local_irq_enable_in_hardirq();
- /*
- * Intentionally don't use cpu_relax here. This is
- * to make sure that the performance counter really ticks,
- * even if there is a simulator or similar that catches the
- * pause instruction. On a real HT machine this is fine because
- * all other CPUs are busy with "useless" delay loops and don't
- * care if they get somewhat less cycles.
- */
- while (endflag == 0)
- mb();
-}
-#endif
-
-static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
-{
- printk(KERN_CONT "\n");
-
- printk(KERN_WARNING
- "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
- cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
-
- printk(KERN_WARNING
- "Please report this to bugzilla.kernel.org,\n");
- printk(KERN_WARNING
- "and attach the output of the 'dmesg' command.\n");
-
- per_cpu(wd_enabled, cpu) = 0;
- atomic_dec(&nmi_active);
-}
-
-static void __acpi_nmi_disable(void *__unused)
-{
- apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
-}
-
-int __init check_nmi_watchdog(void)
-{
- unsigned int *prev_nmi_count;
- int cpu;
-
- if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
- return 0;
-
- prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
- if (!prev_nmi_count)
- goto error;
-
- printk(KERN_INFO "Testing NMI watchdog ... ");
-
-#ifdef CONFIG_SMP
- if (nmi_watchdog == NMI_LOCAL_APIC)
- smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
-#endif
-
- for_each_possible_cpu(cpu)
- prev_nmi_count[cpu] = get_nmi_count(cpu);
- local_irq_enable();
- mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
-
- for_each_online_cpu(cpu) {
- if (!per_cpu(wd_enabled, cpu))
- continue;
- if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
- report_broken_nmi(cpu, prev_nmi_count);
- }
- endflag = 1;
- if (!atomic_read(&nmi_active)) {
- kfree(prev_nmi_count);
- atomic_set(&nmi_active, -1);
- goto error;
- }
- printk("OK.\n");
-
- /*
- * now that we know it works we can reduce NMI frequency to
- * something more reasonable; makes a difference in some configs
- */
- if (nmi_watchdog == NMI_LOCAL_APIC)
- nmi_hz = lapic_adjust_nmi_hz(1);
-
- kfree(prev_nmi_count);
- return 0;
-error:
- if (nmi_watchdog == NMI_IO_APIC) {
- if (!timer_through_8259)
- legacy_pic->mask(0);
- on_each_cpu(__acpi_nmi_disable, NULL, 1);
- }
-
-#ifdef CONFIG_X86_32
- timer_ack = 0;
-#endif
- return -1;
-}
-
-static int __init setup_nmi_watchdog(char *str)
-{
- unsigned int nmi;
-
- if (!strncmp(str, "panic", 5)) {
- panic_on_timeout = 1;
- str = strchr(str, ',');
- if (!str)
- return 1;
- ++str;
- }
-
- if (!strncmp(str, "lapic", 5))
- nmi_watchdog = NMI_LOCAL_APIC;
- else if (!strncmp(str, "ioapic", 6))
- nmi_watchdog = NMI_IO_APIC;
- else {
- get_option(&str, &nmi);
- if (nmi >= NMI_INVALID)
- return 0;
- nmi_watchdog = nmi;
- }
-
- return 1;
-}
-__setup("nmi_watchdog=", setup_nmi_watchdog);
-
-/*
- * Suspend/resume support
- */
-#ifdef CONFIG_PM
-
-static int nmi_pm_active; /* nmi_active before suspend */
-
-static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
-{
- /* only CPU0 goes here, other CPUs should be offline */
- nmi_pm_active = atomic_read(&nmi_active);
- stop_apic_nmi_watchdog(NULL);
- BUG_ON(atomic_read(&nmi_active) != 0);
- return 0;
-}
-
-static int lapic_nmi_resume(struct sys_device *dev)
-{
- /* only CPU0 goes here, other CPUs should be offline */
- if (nmi_pm_active > 0) {
- setup_apic_nmi_watchdog(NULL);
- touch_nmi_watchdog();
- }
- return 0;
-}
-
-static struct sysdev_class nmi_sysclass = {
- .name = "lapic_nmi",
- .resume = lapic_nmi_resume,
- .suspend = lapic_nmi_suspend,
-};
-
-static struct sys_device device_lapic_nmi = {
- .id = 0,
- .cls = &nmi_sysclass,
-};
-
-static int __init init_lapic_nmi_sysfs(void)
-{
- int error;
-
- /*
- * should really be a BUG_ON but b/c this is an
- * init call, it just doesn't work. -dcz
- */
- if (nmi_watchdog != NMI_LOCAL_APIC)
- return 0;
-
- if (atomic_read(&nmi_active) < 0)
- return 0;
-
- error = sysdev_class_register(&nmi_sysclass);
- if (!error)
- error = sysdev_register(&device_lapic_nmi);
- return error;
-}
-
-/* must come after the local APIC's device_initcall() */
-late_initcall(init_lapic_nmi_sysfs);
-
-#endif /* CONFIG_PM */
-
-static void __acpi_nmi_enable(void *__unused)
-{
- apic_write(APIC_LVT0, APIC_DM_NMI);
-}
-
-/*
- * Enable timer based NMIs on all CPUs:
- */
-void acpi_nmi_enable(void)
-{
- if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
- on_each_cpu(__acpi_nmi_enable, NULL, 1);
-}
-
-/*
- * Disable timer based NMIs on all CPUs:
- */
-void acpi_nmi_disable(void)
-{
- if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
- on_each_cpu(__acpi_nmi_disable, NULL, 1);
-}
-
-/*
- * This function is called as soon the LAPIC NMI watchdog driver has everything
- * in place and it's ready to check if the NMIs belong to the NMI watchdog
- */
-void cpu_nmi_set_wd_enabled(void)
-{
- __get_cpu_var(wd_enabled) = 1;
-}
-
-void setup_apic_nmi_watchdog(void *unused)
-{
- if (__get_cpu_var(wd_enabled))
- return;
-
- /* cheap hack to support suspend/resume */
- /* if cpu0 is not active neither should the other cpus */
- if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
- return;
-
- switch (nmi_watchdog) {
- case NMI_LOCAL_APIC:
- if (lapic_watchdog_init(nmi_hz) < 0) {
- __get_cpu_var(wd_enabled) = 0;
- return;
- }
- /* FALL THROUGH */
- case NMI_IO_APIC:
- __get_cpu_var(wd_enabled) = 1;
- atomic_inc(&nmi_active);
- }
-}
-
-void stop_apic_nmi_watchdog(void *unused)
-{
- /* only support LOCAL and IO APICs for now */
- if (!nmi_watchdog_active())
- return;
- if (__get_cpu_var(wd_enabled) == 0)
- return;
- if (nmi_watchdog == NMI_LOCAL_APIC)
- lapic_watchdog_stop();
- else
- __acpi_nmi_disable(NULL);
- __get_cpu_var(wd_enabled) = 0;
- atomic_dec(&nmi_active);
-}
-
-/*
- * the best way to detect whether a CPU has a 'hard lockup' problem
- * is to check it's local APIC timer IRQ counts. If they are not
- * changing then that CPU has some problem.
- *
- * as these watchdog NMI IRQs are generated on every CPU, we only
- * have to check the current processor.
- *
- * since NMIs don't listen to _any_ locks, we have to be extremely
- * careful not to rely on unsafe variables. The printk might lock
- * up though, so we have to break up any console locks first ...
- * [when there will be more tty-related locks, break them up here too!]
- */
-
-static DEFINE_PER_CPU(unsigned, last_irq_sum);
-static DEFINE_PER_CPU(long, alert_counter);
-static DEFINE_PER_CPU(int, nmi_touch);
-
-void touch_nmi_watchdog(void)
-{
- if (nmi_watchdog_active()) {
- unsigned cpu;
-
- /*
- * Tell other CPUs to reset their alert counters. We cannot
- * do it ourselves because the alert count increase is not
- * atomic.
- */
- for_each_present_cpu(cpu) {
- if (per_cpu(nmi_touch, cpu) != 1)
- per_cpu(nmi_touch, cpu) = 1;
- }
- }
-
- /*
- * Tickle the softlockup detector too:
- */
- touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-notrace __kprobes int
-nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
-{
- /*
- * Since current_thread_info()-> is always on the stack, and we
- * always switch the stack NMI-atomically, it's safe to use
- * smp_processor_id().
- */
- unsigned int sum;
- int touched = 0;
- int cpu = smp_processor_id();
- int rc = 0;
-
- sum = get_timer_irqs(cpu);
-
- if (__get_cpu_var(nmi_touch)) {
- __get_cpu_var(nmi_touch) = 0;
- touched = 1;
- }
-
- /* We can be called before check_nmi_watchdog, hence NULL check. */
- if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
- static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
-
- raw_spin_lock(&lock);
- printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
- show_regs(regs);
- dump_stack();
- raw_spin_unlock(&lock);
- cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
-
- rc = 1;
- }
-
- /* Could check oops_in_progress here too, but it's safer not to */
- if (mce_in_progress())
- touched = 1;
-
- /* if the none of the timers isn't firing, this cpu isn't doing much */
- if (!touched && __get_cpu_var(last_irq_sum) == sum) {
- /*
- * Ayiee, looks like this CPU is stuck ...
- * wait a few IRQs (5 seconds) before doing the oops ...
- */
- __this_cpu_inc(alert_counter);
- if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
- /*
- * die_nmi will return ONLY if NOTIFY_STOP happens..
- */
- die_nmi("BUG: NMI Watchdog detected LOCKUP",
- regs, panic_on_timeout);
- } else {
- __get_cpu_var(last_irq_sum) = sum;
- __this_cpu_write(alert_counter, 0);
- }
-
- /* see if the nmi watchdog went off */
- if (!__get_cpu_var(wd_enabled))
- return rc;
- switch (nmi_watchdog) {
- case NMI_LOCAL_APIC:
- rc |= lapic_wd_event(nmi_hz);
- break;
- case NMI_IO_APIC:
- /*
- * don't know how to accurately check for this.
- * just assume it was a watchdog timer interrupt
- * This matches the old behaviour.
- */
- rc = 1;
- break;
- }
- return rc;
-}
-
-#ifdef CONFIG_SYSCTL
-
-static void enable_ioapic_nmi_watchdog_single(void *unused)
-{
- __get_cpu_var(wd_enabled) = 1;
- atomic_inc(&nmi_active);
- __acpi_nmi_enable(NULL);
-}
-
-static void enable_ioapic_nmi_watchdog(void)
-{
- on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
- touch_nmi_watchdog();
-}
-
-static void disable_ioapic_nmi_watchdog(void)
-{
- on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
-}
-
-static int __init setup_unknown_nmi_panic(char *str)
-{
- unknown_nmi_panic = 1;
- return 1;
-}
-__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
-
-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
-{
- unsigned char reason = get_nmi_reason();
- char buf[64];
-
- sprintf(buf, "NMI received for unknown reason %02x\n", reason);
- die_nmi(buf, regs, 1); /* Always panic here */
- return 0;
-}
-
-/*
- * proc handler for /proc/sys/kernel/nmi
- */
-int proc_nmi_enabled(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
-{
- int old_state;
-
- nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
- old_state = nmi_watchdog_enabled;
- proc_dointvec(table, write, buffer, length, ppos);
- if (!!old_state == !!nmi_watchdog_enabled)
- return 0;
-
- if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
- printk(KERN_WARNING
- "NMI watchdog is permanently disabled\n");
- return -EIO;
- }
-
- if (nmi_watchdog == NMI_LOCAL_APIC) {
- if (nmi_watchdog_enabled)
- enable_lapic_nmi_watchdog();
- else
- disable_lapic_nmi_watchdog();
- } else if (nmi_watchdog == NMI_IO_APIC) {
- if (nmi_watchdog_enabled)
- enable_ioapic_nmi_watchdog();
- else
- disable_ioapic_nmi_watchdog();
- } else {
- printk(KERN_WARNING
- "NMI watchdog doesn't know what hardware to touch\n");
- return -EIO;
- }
- return 0;
-}
-
-#endif /* CONFIG_SYSCTL */
-
-int do_nmi_callback(struct pt_regs *regs, int cpu)
-{
-#ifdef CONFIG_SYSCTL
- if (unknown_nmi_panic)
- return unknown_nmi_panic_callback(regs, cpu);
-#endif
- return 0;
-}
-
-void arch_trigger_all_cpu_backtrace(void)
-{
- int i;
-
- cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
-
- printk(KERN_INFO "sending NMI to all CPUs:\n");
- apic->send_IPI_all(NMI_VECTOR);
-
- /* Wait for up to 10 seconds for all CPUs to do the backtrace */
- for (i = 0; i < 10 * 1000; i++) {
- if (cpumask_empty(to_cpumask(backtrace_mask)))
- break;
- mdelay(1);
- }
-}
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index f9e4e6a54073..d8c4a6feb286 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -79,13 +79,6 @@ void __init default_setup_apic_routing(void)
/* need to update phys_pkg_id */
apic->phys_pkg_id = apicid_phys_pkg_id;
}
-
- /*
- * Now that apic routing model is selected, configure the
- * fault handling for intr remapping.
- */
- if (intr_remapping_enabled)
- enable_drhd_fault_handling();
}
/* Same for both flat and physical. */
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index ed4118de249e..bd16b58b8850 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -44,8 +44,20 @@ static u64 gru_start_paddr, gru_end_paddr;
static union uvh_apicid uvh_apicid;
int uv_min_hub_revision_id;
EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
+unsigned int uv_apicid_hibits;
+EXPORT_SYMBOL_GPL(uv_apicid_hibits);
static DEFINE_SPINLOCK(uv_nmi_lock);
+static unsigned long __init uv_early_read_mmr(unsigned long addr)
+{
+ unsigned long val, *mmr;
+
+ mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
+ val = *mmr;
+ early_iounmap(mmr, sizeof(*mmr));
+ return val;
+}
+
static inline bool is_GRU_range(u64 start, u64 end)
{
return start >= gru_start_paddr && end <= gru_end_paddr;
@@ -56,28 +68,24 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end)
return is_ISA_range(start, end) || is_GRU_range(start, end);
}
-static int early_get_nodeid(void)
+static int __init early_get_pnodeid(void)
{
union uvh_node_id_u node_id;
- unsigned long *mmr;
-
- mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
- node_id.v = *mmr;
- early_iounmap(mmr, sizeof(*mmr));
+ union uvh_rh_gam_config_mmr_u m_n_config;
+ int pnode;
/* Currently, all blades have same revision number */
+ node_id.v = uv_early_read_mmr(UVH_NODE_ID);
+ m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
uv_min_hub_revision_id = node_id.s.revision;
- return node_id.s.node_id;
+ pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
+ return pnode;
}
static void __init early_get_apic_pnode_shift(void)
{
- unsigned long *mmr;
-
- mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_APICID, sizeof(*mmr));
- uvh_apicid.v = *mmr;
- early_iounmap(mmr, sizeof(*mmr));
+ uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
if (!uvh_apicid.v)
/*
* Old bios, use default value
@@ -85,12 +93,25 @@ static void __init early_get_apic_pnode_shift(void)
uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT;
}
+/*
+ * Add an extra bit as dictated by bios to the destination apicid of
+ * interrupts potentially passing through the UV HUB. This prevents
+ * a deadlock between interrupts and IO port operations.
+ */
+static void __init uv_set_apicid_hibit(void)
+{
+ union uvh_lb_target_physical_apic_id_mask_u apicid_mask;
+
+ apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK);
+ uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK;
+}
+
static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- int nodeid;
+ int pnodeid;
if (!strcmp(oem_id, "SGI")) {
- nodeid = early_get_nodeid();
+ pnodeid = early_get_pnodeid();
early_get_apic_pnode_shift();
x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
x86_platform.nmi_init = uv_nmi_init;
@@ -99,9 +120,10 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
else if (!strcmp(oem_table_id, "UVX"))
uv_system_type = UV_X2APIC;
else if (!strcmp(oem_table_id, "UVH")) {
- __get_cpu_var(x2apic_extra_bits) =
- nodeid << (uvh_apicid.s.pnode_shift - 1);
+ __this_cpu_write(x2apic_extra_bits,
+ pnodeid << uvh_apicid.s.pnode_shift);
uv_system_type = UV_NON_UNIQUE_APIC;
+ uv_set_apicid_hibit();
return 1;
}
}
@@ -155,6 +177,7 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri
int pnode;
pnode = uv_apicid_to_pnode(phys_apicid);
+ phys_apicid |= uv_apicid_hibits;
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
@@ -236,7 +259,7 @@ static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
int cpu = cpumask_first(cpumask);
if ((unsigned)cpu < nr_cpu_ids)
- return per_cpu(x86_cpu_to_apicid, cpu);
+ return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
else
return BAD_APICID;
}
@@ -255,7 +278,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
if (cpumask_test_cpu(cpu, cpu_online_mask))
break;
}
- return per_cpu(x86_cpu_to_apicid, cpu);
+ return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
}
static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -263,7 +286,7 @@ static unsigned int x2apic_get_apic_id(unsigned long x)
unsigned int id;
WARN_ON(preemptible() && num_online_cpus() > 1);
- id = x | __get_cpu_var(x2apic_extra_bits);
+ id = x | __this_cpu_read(x2apic_extra_bits);
return id;
}
@@ -355,7 +378,7 @@ struct apic __refdata apic_x2apic_uv_x = {
static __cpuinit void set_x2apic_extra_bits(int pnode)
{
- __get_cpu_var(x2apic_extra_bits) = (pnode << 6);
+ __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift);
}
/*
@@ -379,14 +402,14 @@ struct redir_addr {
#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
static __initdata struct redir_addr redir_addrs[] = {
- {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG},
- {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG},
- {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG},
+ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR},
+ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR},
+ {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR},
};
static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
{
- union uvh_si_alias0_overlay_config_u alias;
+ union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias;
union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
int i;
@@ -618,7 +641,7 @@ void __cpuinit uv_cpu_init(void)
*/
int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
{
- if (reason != DIE_NMI_IPI)
+ if (reason != DIE_NMIUNKNOWN)
return NOTIFY_OK;
if (in_crash_kexec)
@@ -660,28 +683,33 @@ void uv_nmi_init(void)
void __init uv_system_init(void)
{
- union uvh_si_addr_map_config_u m_n_config;
+ union uvh_rh_gam_config_mmr_u m_n_config;
+ union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
union uvh_node_id_u node_id;
unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
- int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
+ int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io;
int gnode_extra, max_pnode = 0;
unsigned long mmr_base, present, paddr;
- unsigned short pnode_mask;
+ unsigned short pnode_mask, pnode_io_mask;
map_low_mmrs();
- m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
+ m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
m_val = m_n_config.s.m_skt;
n_val = m_n_config.s.n_skt;
+ mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
+ n_io = mmioh.s.n_io;
mmr_base =
uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
~UV_MMR_ENABLE;
pnode_mask = (1 << n_val) - 1;
+ pnode_io_mask = (1 << n_io) - 1;
+
node_id.v = uv_read_local_mmr(UVH_NODE_ID);
gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
gnode_upper = ((unsigned long)gnode_extra << m_val);
- printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
- n_val, m_val, gnode_upper, gnode_extra);
+ printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n",
+ n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask);
printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
@@ -714,7 +742,7 @@ void __init uv_system_init(void)
for (j = 0; j < 64; j++) {
if (!test_bit(j, &present))
continue;
- pnode = (i * 64 + j);
+ pnode = (i * 64 + j) & pnode_mask;
uv_blade_info[blade].pnode = pnode;
uv_blade_info[blade].nr_possible_cpus = 0;
uv_blade_info[blade].nr_online_cpus = 0;
@@ -735,6 +763,7 @@ void __init uv_system_init(void)
/*
* apic_pnode_shift must be set before calling uv_apicid_to_pnode();
*/
+ uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
pnode = uv_apicid_to_pnode(apicid);
blade = boot_pnode_to_blade(pnode);
@@ -751,7 +780,6 @@ void __init uv_system_init(void)
uv_cpu_hub_info(cpu)->numa_blade_id = blade;
uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
uv_cpu_hub_info(cpu)->pnode = pnode;
- uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
@@ -775,7 +803,7 @@ void __init uv_system_init(void)
map_gru_high(max_pnode);
map_mmr_high(max_pnode);
- map_mmioh_high(max_pnode);
+ map_mmioh_high(max_pnode & pnode_io_mask);
uv_cpu_init();
uv_scir_register_cpu_notifier();
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 9e093f8fe78c..7c7bedb83c5a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -668,7 +668,7 @@ EXPORT_SYMBOL_GPL(amd_erratum_383);
bool cpu_has_amd_erratum(const int *erratum)
{
- struct cpuinfo_x86 *cpu = &current_cpu_data;
+ struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info);
int osvw_id = *erratum++;
u32 range;
u32 ms;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4b68bda30938..1d59834396bd 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -894,7 +894,6 @@ void __init identify_boot_cpu(void)
#else
vgetcpu_set_mode();
#endif
- init_hw_perf_events();
}
void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 491977baf6c0..35c7e65e59be 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -521,7 +521,7 @@ static void check_supported_cpu(void *_rc)
*rc = -ENODEV;
- if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
+ if (__this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_AMD)
return;
eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
@@ -1377,7 +1377,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
static void query_values_on_cpu(void *_err)
{
int *err = _err;
- struct powernow_k8_data *data = __get_cpu_var(powernow_data);
+ struct powernow_k8_data *data = __this_cpu_read(powernow_data);
*err = query_current_values_with_pending_wait(data);
}
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 17ad03366211..ec2c19a7b8ef 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -45,6 +45,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
{ 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
{ 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
{ 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
+ { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */
{ 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
{ 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
{ 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
@@ -66,6 +67,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
{ 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
{ 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
{ 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
+ { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */
{ 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
{ 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
{ 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
@@ -87,6 +89,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
{ 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
{ 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
{ 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
+ { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */
{ 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
{ 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
{ 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
@@ -149,8 +152,7 @@ union _cpuid4_leaf_ecx {
};
struct amd_l3_cache {
- struct pci_dev *dev;
- bool can_disable;
+ struct amd_northbridge *nb;
unsigned indices;
u8 subcaches[4];
};
@@ -266,7 +268,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
line_size = l2.line_size;
lines_per_tag = l2.lines_per_tag;
/* cpu_data has errata corrections for K7 applied */
- size_in_kb = current_cpu_data.x86_cache_size;
+ size_in_kb = __this_cpu_read(cpu_info.x86_cache_size);
break;
case 3:
if (!l3.val)
@@ -288,7 +290,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
eax->split.type = types[leaf];
eax->split.level = levels[leaf];
eax->split.num_threads_sharing = 0;
- eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
+ eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1;
if (assoc == 0xffff)
@@ -311,14 +313,12 @@ struct _cache_attr {
/*
* L3 cache descriptors
*/
-static struct amd_l3_cache **__cpuinitdata l3_caches;
-
static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
{
unsigned int sc0, sc1, sc2, sc3;
u32 val = 0;
- pci_read_config_dword(l3->dev, 0x1C4, &val);
+ pci_read_config_dword(l3->nb->misc, 0x1C4, &val);
/* calculate subcache sizes */
l3->subcaches[0] = sc0 = !(val & BIT(0));
@@ -330,47 +330,14 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
}
-static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
-{
- struct amd_l3_cache *l3;
- struct pci_dev *dev = node_to_k8_nb_misc(node);
-
- l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
- if (!l3) {
- printk(KERN_WARNING "Error allocating L3 struct\n");
- return NULL;
- }
-
- l3->dev = dev;
-
- amd_calc_l3_indices(l3);
-
- return l3;
-}
-
-static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
- int index)
+static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
+ int index)
{
+ static struct amd_l3_cache *__cpuinitdata l3_caches;
int node;
- if (boot_cpu_data.x86 != 0x10)
- return;
-
- if (index < 3)
- return;
-
- /* see errata #382 and #388 */
- if (boot_cpu_data.x86_model < 0x8)
- return;
-
- if ((boot_cpu_data.x86_model == 0x8 ||
- boot_cpu_data.x86_model == 0x9)
- &&
- boot_cpu_data.x86_mask < 0x1)
- return;
-
- /* not in virtualized environments */
- if (k8_northbridges.num == 0)
+ /* only for L3, and not in virtualized environments */
+ if (index < 3 || amd_nb_num() == 0)
return;
/*
@@ -378,7 +345,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
* never freed but this is done only on shutdown so it doesn't matter.
*/
if (!l3_caches) {
- int size = k8_northbridges.num * sizeof(struct amd_l3_cache *);
+ int size = amd_nb_num() * sizeof(struct amd_l3_cache);
l3_caches = kzalloc(size, GFP_ATOMIC);
if (!l3_caches)
@@ -387,14 +354,12 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
node = amd_get_nb_id(smp_processor_id());
- if (!l3_caches[node]) {
- l3_caches[node] = amd_init_l3_cache(node);
- l3_caches[node]->can_disable = true;
+ if (!l3_caches[node].nb) {
+ l3_caches[node].nb = node_to_amd_nb(node);
+ amd_calc_l3_indices(&l3_caches[node]);
}
- WARN_ON(!l3_caches[node]);
-
- this_leaf->l3 = l3_caches[node];
+ this_leaf->l3 = &l3_caches[node];
}
/*
@@ -408,7 +373,7 @@ int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
{
unsigned int reg = 0;
- pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
+ pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, &reg);
/* check whether this slot is activated already */
if (reg & (3UL << 30))
@@ -422,7 +387,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
{
int index;
- if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+ if (!this_leaf->l3 ||
+ !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
return -EINVAL;
index = amd_get_l3_disable_slot(this_leaf->l3, slot);
@@ -457,7 +423,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
if (!l3->subcaches[i])
continue;
- pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+ pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
/*
* We need to WBINVD on a core on the node containing the L3
@@ -467,7 +433,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
wbinvd_on_cpu(cpu);
reg |= BIT(31);
- pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+ pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
}
}
@@ -524,7 +490,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!this_leaf->l3 || !this_leaf->l3->can_disable)
+ if (!this_leaf->l3 ||
+ !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
return -EINVAL;
cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
@@ -545,7 +512,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
#define STORE_CACHE_DISABLE(slot) \
static ssize_t \
store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
- const char *buf, size_t count) \
+ const char *buf, size_t count) \
{ \
return store_cache_disable(this_leaf, buf, count, slot); \
}
@@ -558,10 +525,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
show_cache_disable_1, store_cache_disable_1);
#else /* CONFIG_AMD_NB */
-static void __cpuinit
-amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
-{
-};
+#define amd_init_l3_cache(x, y)
#endif /* CONFIG_AMD_NB */
static int
@@ -575,7 +539,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
amd_cpuid4(index, &eax, &ebx, &ecx);
- amd_check_l3_disable(this_leaf, index);
+ amd_init_l3_cache(this_leaf, index);
} else {
cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
}
@@ -983,30 +947,48 @@ define_one_ro(size);
define_one_ro(shared_cpu_map);
define_one_ro(shared_cpu_list);
-#define DEFAULT_SYSFS_CACHE_ATTRS \
- &type.attr, \
- &level.attr, \
- &coherency_line_size.attr, \
- &physical_line_partition.attr, \
- &ways_of_associativity.attr, \
- &number_of_sets.attr, \
- &size.attr, \
- &shared_cpu_map.attr, \
- &shared_cpu_list.attr
-
static struct attribute *default_attrs[] = {
- DEFAULT_SYSFS_CACHE_ATTRS,
+ &type.attr,
+ &level.attr,
+ &coherency_line_size.attr,
+ &physical_line_partition.attr,
+ &ways_of_associativity.attr,
+ &number_of_sets.attr,
+ &size.attr,
+ &shared_cpu_map.attr,
+ &shared_cpu_list.attr,
NULL
};
-static struct attribute *default_l3_attrs[] = {
- DEFAULT_SYSFS_CACHE_ATTRS,
#ifdef CONFIG_AMD_NB
- &cache_disable_0.attr,
- &cache_disable_1.attr,
+static struct attribute ** __cpuinit amd_l3_attrs(void)
+{
+ static struct attribute **attrs;
+ int n;
+
+ if (attrs)
+ return attrs;
+
+ n = sizeof (default_attrs) / sizeof (struct attribute *);
+
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ n += 2;
+
+ attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
+ if (attrs == NULL)
+ return attrs = default_attrs;
+
+ for (n = 0; default_attrs[n]; n++)
+ attrs[n] = default_attrs[n];
+
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+ attrs[n++] = &cache_disable_0.attr;
+ attrs[n++] = &cache_disable_1.attr;
+ }
+
+ return attrs;
+}
#endif
- NULL
-};
static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
{
@@ -1117,11 +1099,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
this_leaf = CPUID4_INFO_IDX(cpu, i);
- if (this_leaf->l3 && this_leaf->l3->can_disable)
- ktype_cache.default_attrs = default_l3_attrs;
- else
- ktype_cache.default_attrs = default_attrs;
-
+ ktype_cache.default_attrs = default_attrs;
+#ifdef CONFIG_AMD_NB
+ if (this_leaf->l3)
+ ktype_cache.default_attrs = amd_l3_attrs();
+#endif
retval = kobject_init_and_add(&(this_object->kobj),
&ktype_cache,
per_cpu(ici_cache_kobject, cpu),
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index e7dbde7bfedb..a77971979564 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -25,6 +25,7 @@
#include <linux/gfp.h>
#include <asm/mce.h>
#include <asm/apic.h>
+#include <asm/nmi.h>
/* Update fake mce registers on current CPU. */
static void inject_mce(struct mce *m)
@@ -83,7 +84,7 @@ static int mce_raise_notify(struct notifier_block *self,
struct die_args *args = (struct die_args *)data;
int cpu = smp_processor_id();
struct mce *m = &__get_cpu_var(injectm);
- if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
+ if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
return NOTIFY_DONE;
cpumask_clear_cpu(cpu, mce_inject_cpumask);
if (m->inject_flags & MCJ_EXCEPTION)
@@ -95,7 +96,7 @@ static int mce_raise_notify(struct notifier_block *self,
static struct notifier_block mce_raise_nb = {
.notifier_call = mce_raise_notify,
- .priority = 1000,
+ .priority = NMI_LOCAL_NORMAL_PRIOR,
};
/* Inject mce on current CPU */
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 7a35b72d7c03..d916183b7f9c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -326,7 +326,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
static int msr_to_offset(u32 msr)
{
- unsigned bank = __get_cpu_var(injectm.bank);
+ unsigned bank = __this_cpu_read(injectm.bank);
if (msr == rip_msr)
return offsetof(struct mce, ip);
@@ -346,7 +346,7 @@ static u64 mce_rdmsrl(u32 msr)
{
u64 v;
- if (__get_cpu_var(injectm).finished) {
+ if (__this_cpu_read(injectm.finished)) {
int offset = msr_to_offset(msr);
if (offset < 0)
@@ -369,7 +369,7 @@ static u64 mce_rdmsrl(u32 msr)
static void mce_wrmsrl(u32 msr, u64 v)
{
- if (__get_cpu_var(injectm).finished) {
+ if (__this_cpu_read(injectm.finished)) {
int offset = msr_to_offset(msr);
if (offset >= 0)
@@ -1159,7 +1159,7 @@ static void mce_start_timer(unsigned long data)
WARN_ON(smp_processor_id() != data);
- if (mce_available(&current_cpu_data)) {
+ if (mce_available(__this_cpu_ptr(&cpu_info))) {
machine_check_poll(MCP_TIMESTAMP,
&__get_cpu_var(mce_poll_banks));
}
@@ -1767,7 +1767,7 @@ static int mce_shutdown(struct sys_device *dev)
static int mce_resume(struct sys_device *dev)
{
__mcheck_cpu_init_generic();
- __mcheck_cpu_init_vendor(&current_cpu_data);
+ __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
return 0;
}
@@ -1775,7 +1775,7 @@ static int mce_resume(struct sys_device *dev)
static void mce_cpu_restart(void *data)
{
del_timer_sync(&__get_cpu_var(mce_timer));
- if (!mce_available(&current_cpu_data))
+ if (!mce_available(__this_cpu_ptr(&cpu_info)))
return;
__mcheck_cpu_init_generic();
__mcheck_cpu_init_timer();
@@ -1790,7 +1790,7 @@ static void mce_restart(void)
/* Toggle features for corrected errors */
static void mce_disable_ce(void *all)
{
- if (!mce_available(&current_cpu_data))
+ if (!mce_available(__this_cpu_ptr(&cpu_info)))
return;
if (all)
del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1799,7 +1799,7 @@ static void mce_disable_ce(void *all)
static void mce_enable_ce(void *all)
{
- if (!mce_available(&current_cpu_data))
+ if (!mce_available(__this_cpu_ptr(&cpu_info)))
return;
cmci_reenable();
cmci_recheck();
@@ -2022,7 +2022,7 @@ static void __cpuinit mce_disable_cpu(void *h)
unsigned long action = *(unsigned long *)h;
int i;
- if (!mce_available(&current_cpu_data))
+ if (!mce_available(__this_cpu_ptr(&cpu_info)))
return;
if (!(action & CPU_TASKS_FROZEN))
@@ -2040,7 +2040,7 @@ static void __cpuinit mce_reenable_cpu(void *h)
unsigned long action = *(unsigned long *)h;
int i;
- if (!mce_available(&current_cpu_data))
+ if (!mce_available(__this_cpu_ptr(&cpu_info)))
return;
if (!(action & CPU_TASKS_FROZEN))
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 80c482382d5c..5bf2fac52aca 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -31,8 +31,6 @@
#include <asm/mce.h>
#include <asm/msr.h>
-#define PFX "mce_threshold: "
-#define VERSION "version 1.1.1"
#define NR_BANKS 6
#define NR_BLOCKS 9
#define THRESHOLD_MAX 0xFFF
@@ -59,12 +57,6 @@ struct threshold_block {
struct list_head miscj;
};
-/* defaults used early on boot */
-static struct threshold_block threshold_defaults = {
- .interrupt_enable = 0,
- .threshold_limit = THRESHOLD_MAX,
-};
-
struct threshold_bank {
struct kobject *kobj;
struct threshold_block *blocks;
@@ -89,50 +81,101 @@ static void amd_threshold_interrupt(void);
struct thresh_restart {
struct threshold_block *b;
int reset;
+ int set_lvt_off;
+ int lvt_off;
u16 old_limit;
};
+static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
+{
+ int msr = (hi & MASK_LVTOFF_HI) >> 20;
+
+ if (apic < 0) {
+ pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
+ "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
+ b->bank, b->block, b->address, hi, lo);
+ return 0;
+ }
+
+ if (apic != msr) {
+ pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
+ "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
+ b->cpu, apic, b->bank, b->block, b->address, hi, lo);
+ return 0;
+ }
+
+ return 1;
+};
+
/* must be called with correct cpu affinity */
/* Called via smp_call_function_single() */
static void threshold_restart_bank(void *_tr)
{
struct thresh_restart *tr = _tr;
- u32 mci_misc_hi, mci_misc_lo;
+ u32 hi, lo;
- rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+ rdmsr(tr->b->address, lo, hi);
- if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX))
+ if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
tr->reset = 1; /* limit cannot be lower than err count */
if (tr->reset) { /* reset err count and overflow bit */
- mci_misc_hi =
- (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
+ hi =
+ (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
(THRESHOLD_MAX - tr->b->threshold_limit);
} else if (tr->old_limit) { /* change limit w/o reset */
- int new_count = (mci_misc_hi & THRESHOLD_MAX) +
+ int new_count = (hi & THRESHOLD_MAX) +
(tr->old_limit - tr->b->threshold_limit);
- mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) |
+ hi = (hi & ~MASK_ERR_COUNT_HI) |
(new_count & THRESHOLD_MAX);
}
+ if (tr->set_lvt_off) {
+ if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
+ /* set new lvt offset */
+ hi &= ~MASK_LVTOFF_HI;
+ hi |= tr->lvt_off << 20;
+ }
+ }
+
tr->b->interrupt_enable ?
- (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
- (mci_misc_hi &= ~MASK_INT_TYPE_HI);
+ (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
+ (hi &= ~MASK_INT_TYPE_HI);
- mci_misc_hi |= MASK_COUNT_EN_HI;
- wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi);
+ hi |= MASK_COUNT_EN_HI;
+ wrmsr(tr->b->address, lo, hi);
+}
+
+static void mce_threshold_block_init(struct threshold_block *b, int offset)
+{
+ struct thresh_restart tr = {
+ .b = b,
+ .set_lvt_off = 1,
+ .lvt_off = offset,
+ };
+
+ b->threshold_limit = THRESHOLD_MAX;
+ threshold_restart_bank(&tr);
+};
+
+static int setup_APIC_mce(int reserved, int new)
+{
+ if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
+ APIC_EILVT_MSG_FIX, 0))
+ return new;
+
+ return reserved;
}
/* cpu init entry point, called from mce.c with preempt off */
void mce_amd_feature_init(struct cpuinfo_x86 *c)
{
+ struct threshold_block b;
unsigned int cpu = smp_processor_id();
u32 low = 0, high = 0, address = 0;
unsigned int bank, block;
- struct thresh_restart tr;
- int lvt_off = -1;
- u8 offset;
+ int offset = -1;
for (bank = 0; bank < NR_BANKS; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
@@ -163,39 +206,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (shared_bank[bank] && c->cpu_core_id)
break;
#endif
- offset = (high & MASK_LVTOFF_HI) >> 20;
- if (lvt_off < 0) {
- if (setup_APIC_eilvt(offset,
- THRESHOLD_APIC_VECTOR,
- APIC_EILVT_MSG_FIX, 0)) {
- pr_err(FW_BUG "cpu %d, failed to "
- "setup threshold interrupt "
- "for bank %d, block %d "
- "(MSR%08X=0x%x%08x)",
- smp_processor_id(), bank, block,
- address, high, low);
- continue;
- }
- lvt_off = offset;
- } else if (lvt_off != offset) {
- pr_err(FW_BUG "cpu %d, invalid threshold "
- "interrupt offset %d for bank %d,"
- "block %d (MSR%08X=0x%x%08x)",
- smp_processor_id(), lvt_off, bank,
- block, address, high, low);
- continue;
- }
-
- high &= ~MASK_LVTOFF_HI;
- high |= lvt_off << 20;
- wrmsr(address, low, high);
+ offset = setup_APIC_mce(offset,
+ (high & MASK_LVTOFF_HI) >> 20);
- threshold_defaults.address = address;
- tr.b = &threshold_defaults;
- tr.reset = 0;
- tr.old_limit = 0;
- threshold_restart_bank(&tr);
+ memset(&b, 0, sizeof(b));
+ b.cpu = cpu;
+ b.bank = bank;
+ b.block = block;
+ b.address = address;
+ mce_threshold_block_init(&b, offset);
mce_threshold_vector = amd_threshold_interrupt;
}
}
@@ -298,9 +318,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
b->interrupt_enable = !!new;
+ memset(&tr, 0, sizeof(tr));
tr.b = b;
- tr.reset = 0;
- tr.old_limit = 0;
smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
@@ -321,10 +340,10 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
if (new < 1)
new = 1;
+ memset(&tr, 0, sizeof(tr));
tr.old_limit = b->threshold_limit;
b->threshold_limit = new;
tr.b = b;
- tr.reset = 0;
smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
@@ -603,9 +622,9 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
continue;
err = threshold_create_bank(cpu, bank);
if (err)
- goto out;
+ return err;
}
-out:
+
return err;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 6fcd0936194f..8694ef56459d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -130,7 +130,7 @@ void cmci_recheck(void)
unsigned long flags;
int banks;
- if (!mce_available(&current_cpu_data) || !cmci_supported(&banks))
+ if (!mce_available(__this_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
return;
local_irq_save(flags);
machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 4b683267eca5..6f8c5e9da97f 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -53,8 +53,14 @@ struct thermal_state {
struct _thermal_state core_power_limit;
struct _thermal_state package_throttle;
struct _thermal_state package_power_limit;
+ struct _thermal_state core_thresh0;
+ struct _thermal_state core_thresh1;
};
+/* Callback to handle core threshold interrupts */
+int (*platform_thermal_notify)(__u64 msr_val);
+EXPORT_SYMBOL(platform_thermal_notify);
+
static DEFINE_PER_CPU(struct thermal_state, thermal_state);
static atomic_t therm_throt_en = ATOMIC_INIT(0);
@@ -200,6 +206,22 @@ static int therm_throt_process(bool new_event, int event, int level)
return 0;
}
+static int thresh_event_valid(int event)
+{
+ struct _thermal_state *state;
+ unsigned int this_cpu = smp_processor_id();
+ struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
+ u64 now = get_jiffies_64();
+
+ state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1;
+
+ if (time_before64(now, state->next_check))
+ return 0;
+
+ state->next_check = now + CHECK_INTERVAL;
+ return 1;
+}
+
#ifdef CONFIG_SYSFS
/* Add/Remove thermal_throttle interface for CPU device: */
static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
@@ -313,6 +335,22 @@ device_initcall(thermal_throttle_init_device);
#define PACKAGE_THROTTLED ((__u64)2 << 62)
#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
+static void notify_thresholds(__u64 msr_val)
+{
+ /* check whether the interrupt handler is defined;
+ * otherwise simply return
+ */
+ if (!platform_thermal_notify)
+ return;
+
+ /* lower threshold reached */
+ if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0))
+ platform_thermal_notify(msr_val);
+ /* higher threshold reached */
+ if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1))
+ platform_thermal_notify(msr_val);
+}
+
/* Thermal transition interrupt handler */
static void intel_thermal_interrupt(void)
{
@@ -321,6 +359,9 @@ static void intel_thermal_interrupt(void)
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
+ /* Check for violation of core thermal thresholds*/
+ notify_thresholds(msr_val);
+
if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
CORE_LEVEL) != 0)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ed6310183efb..9d977a2ea693 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -330,9 +330,6 @@ static bool reserve_pmc_hardware(void)
{
int i;
- if (nmi_watchdog == NMI_LOCAL_APIC)
- disable_lapic_nmi_watchdog();
-
for (i = 0; i < x86_pmu.num_counters; i++) {
if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
goto perfctr_fail;
@@ -355,9 +352,6 @@ perfctr_fail:
for (i--; i >= 0; i--)
release_perfctr_nmi(x86_pmu.perfctr + i);
- if (nmi_watchdog == NMI_LOCAL_APIC)
- enable_lapic_nmi_watchdog();
-
return false;
}
@@ -369,9 +363,6 @@ static void release_pmc_hardware(void)
release_perfctr_nmi(x86_pmu.perfctr + i);
release_evntsel_nmi(x86_pmu.eventsel + i);
}
-
- if (nmi_watchdog == NMI_LOCAL_APIC)
- enable_lapic_nmi_watchdog();
}
#else
@@ -381,6 +372,58 @@ static void release_pmc_hardware(void) {}
#endif
+static bool check_hw_exists(void)
+{
+ u64 val, val_new = 0;
+ int i, reg, ret = 0;
+
+ /*
+ * Check to see if the BIOS enabled any of the counters, if so
+ * complain and bail.
+ */
+ for (i = 0; i < x86_pmu.num_counters; i++) {
+ reg = x86_pmu.eventsel + i;
+ ret = rdmsrl_safe(reg, &val);
+ if (ret)
+ goto msr_fail;
+ if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
+ goto bios_fail;
+ }
+
+ if (x86_pmu.num_counters_fixed) {
+ reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+ ret = rdmsrl_safe(reg, &val);
+ if (ret)
+ goto msr_fail;
+ for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
+ if (val & (0x03 << i*4))
+ goto bios_fail;
+ }
+ }
+
+ /*
+ * Now write a value and read it back to see if it matches,
+ * this is needed to detect certain hardware emulators (qemu/kvm)
+ * that don't trap on the MSR access and always return 0s.
+ */
+ val = 0xabcdUL;
+ ret = checking_wrmsrl(x86_pmu.perfctr, val);
+ ret |= rdmsrl_safe(x86_pmu.perfctr, &val_new);
+ if (ret || val != val_new)
+ goto msr_fail;
+
+ return true;
+
+bios_fail:
+ printk(KERN_CONT "Broken BIOS detected, using software events only.\n");
+ printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
+ return false;
+
+msr_fail:
+ printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
+ return false;
+}
+
static void reserve_ds_buffers(void);
static void release_ds_buffers(void);
@@ -437,7 +480,7 @@ static int x86_setup_perfctr(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
u64 config;
- if (!hwc->sample_period) {
+ if (!is_sampling_event(event)) {
hwc->sample_period = x86_pmu.max_period;
hwc->last_period = hwc->sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
@@ -954,8 +997,7 @@ x86_perf_event_set_period(struct perf_event *event)
static void x86_pmu_enable_event(struct perf_event *event)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- if (cpuc->enabled)
+ if (__this_cpu_read(cpu_hw_events.enabled))
__x86_pmu_enable_event(&event->hw,
ARCH_PERFMON_EVENTSEL_ENABLE);
}
@@ -1225,11 +1267,10 @@ perf_event_nmi_handler(struct notifier_block *self,
switch (cmd) {
case DIE_NMI:
- case DIE_NMI_IPI:
break;
case DIE_NMIUNKNOWN:
this_nmi = percpu_read(irq_stat.__nmi_count);
- if (this_nmi != __get_cpu_var(pmu_nmi).marked)
+ if (this_nmi != __this_cpu_read(pmu_nmi.marked))
/* let the kernel handle the unknown nmi */
return NOTIFY_DONE;
/*
@@ -1253,8 +1294,8 @@ perf_event_nmi_handler(struct notifier_block *self,
this_nmi = percpu_read(irq_stat.__nmi_count);
if ((handled > 1) ||
/* the next nmi could be a back-to-back nmi */
- ((__get_cpu_var(pmu_nmi).marked == this_nmi) &&
- (__get_cpu_var(pmu_nmi).handled > 1))) {
+ ((__this_cpu_read(pmu_nmi.marked) == this_nmi) &&
+ (__this_cpu_read(pmu_nmi.handled) > 1))) {
/*
* We could have two subsequent back-to-back nmis: The
* first handles more than one counter, the 2nd
@@ -1265,8 +1306,8 @@ perf_event_nmi_handler(struct notifier_block *self,
* handling more than one counter. We will mark the
* next (3rd) and then drop it if unhandled.
*/
- __get_cpu_var(pmu_nmi).marked = this_nmi + 1;
- __get_cpu_var(pmu_nmi).handled = handled;
+ __this_cpu_write(pmu_nmi.marked, this_nmi + 1);
+ __this_cpu_write(pmu_nmi.handled, handled);
}
return NOTIFY_STOP;
@@ -1275,7 +1316,7 @@ perf_event_nmi_handler(struct notifier_block *self,
static __read_mostly struct notifier_block perf_event_nmi_notifier = {
.notifier_call = perf_event_nmi_handler,
.next = NULL,
- .priority = 1
+ .priority = NMI_LOCAL_LOW_PRIOR,
};
static struct event_constraint unconstrained;
@@ -1348,7 +1389,7 @@ static void __init pmu_check_apic(void)
pr_info("no hardware sampling interrupt available.\n");
}
-void __init init_hw_perf_events(void)
+int __init init_hw_perf_events(void)
{
struct event_constraint *c;
int err;
@@ -1363,15 +1404,19 @@ void __init init_hw_perf_events(void)
err = amd_pmu_init();
break;
default:
- return;
+ return 0;
}
if (err != 0) {
pr_cont("no PMU driver, software events only.\n");
- return;
+ return 0;
}
pmu_check_apic();
+ /* sanity check that the hardware exists or is emulated */
+ if (!check_hw_exists())
+ return 0;
+
pr_cont("%s PMU driver.\n", x86_pmu.name);
if (x86_pmu.quirks)
@@ -1418,9 +1463,12 @@ void __init init_hw_perf_events(void)
pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
- perf_pmu_register(&pmu);
+ perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
perf_cpu_notifier(x86_pmu_notifier);
+
+ return 0;
}
+early_initcall(init_hw_perf_events);
static inline void x86_pmu_read(struct perf_event *event)
{
@@ -1434,11 +1482,9 @@ static inline void x86_pmu_read(struct perf_event *event)
*/
static void x86_pmu_start_txn(struct pmu *pmu)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
perf_pmu_disable(pmu);
- cpuc->group_flag |= PERF_EVENT_TXN;
- cpuc->n_txn = 0;
+ __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
+ __this_cpu_write(cpu_hw_events.n_txn, 0);
}
/*
@@ -1448,14 +1494,12 @@ static void x86_pmu_start_txn(struct pmu *pmu)
*/
static void x86_pmu_cancel_txn(struct pmu *pmu)
{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
-
- cpuc->group_flag &= ~PERF_EVENT_TXN;
+ __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
/*
* Truncate the collected events.
*/
- cpuc->n_added -= cpuc->n_txn;
- cpuc->n_events -= cpuc->n_txn;
+ __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
+ __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
perf_pmu_enable(pmu);
}
@@ -1666,7 +1710,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
perf_callchain_store(entry, regs->ip);
- dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
+ dump_trace(NULL, regs, NULL, &backtrace_ops, entry);
}
#ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 46d58448c3af..67e2202a6039 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,7 +1,5 @@
#ifdef CONFIG_CPU_SUP_AMD
-static DEFINE_RAW_SPINLOCK(amd_nb_lock);
-
static __initconst const u64 amd_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -275,17 +273,17 @@ done:
return &emptyconstraint;
}
-static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
+static struct amd_nb *amd_alloc_nb(int cpu)
{
struct amd_nb *nb;
int i;
- nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL);
+ nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO,
+ cpu_to_node(cpu));
if (!nb)
return NULL;
- memset(nb, 0, sizeof(*nb));
- nb->nb_id = nb_id;
+ nb->nb_id = -1;
/*
* initialize all possible NB constraints
@@ -306,7 +304,7 @@ static int amd_pmu_cpu_prepare(int cpu)
if (boot_cpu_data.x86_max_cores < 2)
return NOTIFY_OK;
- cpuc->amd_nb = amd_alloc_nb(cpu, -1);
+ cpuc->amd_nb = amd_alloc_nb(cpu);
if (!cpuc->amd_nb)
return NOTIFY_BAD;
@@ -325,8 +323,6 @@ static void amd_pmu_cpu_starting(int cpu)
nb_id = amd_get_nb_id(cpu);
WARN_ON_ONCE(nb_id == BAD_APICID);
- raw_spin_lock(&amd_nb_lock);
-
for_each_online_cpu(i) {
nb = per_cpu(cpu_hw_events, i).amd_nb;
if (WARN_ON_ONCE(!nb))
@@ -341,8 +337,6 @@ static void amd_pmu_cpu_starting(int cpu)
cpuc->amd_nb->nb_id = nb_id;
cpuc->amd_nb->refcnt++;
-
- raw_spin_unlock(&amd_nb_lock);
}
static void amd_pmu_cpu_dead(int cpu)
@@ -354,8 +348,6 @@ static void amd_pmu_cpu_dead(int cpu)
cpuhw = &per_cpu(cpu_hw_events, cpu);
- raw_spin_lock(&amd_nb_lock);
-
if (cpuhw->amd_nb) {
struct amd_nb *nb = cpuhw->amd_nb;
@@ -364,8 +356,6 @@ static void amd_pmu_cpu_dead(int cpu)
cpuhw->amd_nb = NULL;
}
-
- raw_spin_unlock(&amd_nb_lock);
}
static __initconst const struct x86_pmu amd_pmu = {
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index c8f5c088cad1..008835c1d79c 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -649,7 +649,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
- if (!__get_cpu_var(cpu_hw_events).enabled)
+ if (!__this_cpu_read(cpu_hw_events.enabled))
return;
intel_pmu_enable_bts(hwc->config);
@@ -679,7 +679,7 @@ static int intel_pmu_save_and_restart(struct perf_event *event)
static void intel_pmu_reset(void)
{
- struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
+ struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
unsigned long flags;
int idx;
@@ -816,6 +816,32 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (ret)
return ret;
+ if (event->attr.precise_ip &&
+ (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
+ /*
+ * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
+ * (0x003c) so that we can use it with PEBS.
+ *
+ * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
+ * PEBS capable. However we can use INST_RETIRED.ANY_P
+ * (0x00c0), which is a PEBS capable event, to get the same
+ * count.
+ *
+ * INST_RETIRED.ANY_P counts the number of cycles that retires
+ * CNTMASK instructions. By setting CNTMASK to a value (16)
+ * larger than the maximum number of instructions that can be
+ * retired per cycle (4) and then inverting the condition, we
+ * count all cycles that retire 16 or less instructions, which
+ * is every cycle.
+ *
+ * Thereby we gain a PEBS capable cycle counter.
+ */
+ u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */
+
+ alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
+ event->hw.config = alt_config;
+ }
+
if (event->attr.type != PERF_TYPE_RAW)
return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 81400b93e694..e56b9bfbabd1 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -753,19 +753,21 @@ out:
static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
{
- int overflow = 0;
- u32 low, high;
+ u64 v;
- rdmsr(hwc->config_base + hwc->idx, low, high);
-
- /* we need to check high bit for unflagged overflows */
- if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
- overflow = 1;
- (void)checking_wrmsrl(hwc->config_base + hwc->idx,
- ((u64)low) & ~P4_CCCR_OVF);
+ /* an official way for overflow indication */
+ rdmsrl(hwc->config_base + hwc->idx, v);
+ if (v & P4_CCCR_OVF) {
+ wrmsrl(hwc->config_base + hwc->idx, v & ~P4_CCCR_OVF);
+ return 1;
}
- return overflow;
+ /* it might be unflagged overflow */
+ rdmsrl(hwc->event_base + hwc->idx, v);
+ if (!(v & ARCH_P4_CNTRVAL_MASK))
+ return 1;
+
+ return 0;
}
static void p4_pmu_disable_pebs(void)
@@ -1152,9 +1154,9 @@ static __initconst const struct x86_pmu p4_pmu = {
*/
.num_counters = ARCH_P4_MAX_CCCR,
.apic = 1,
- .cntval_bits = 40,
- .cntval_mask = (1ULL << 40) - 1,
- .max_period = (1ULL << 39) - 1,
+ .cntval_bits = ARCH_P4_CNTRVAL_BITS,
+ .cntval_mask = ARCH_P4_CNTRVAL_MASK,
+ .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
.hw_config = p4_hw_config,
.schedule_events = p4_pmu_schedule_events,
/*
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index d9f4ff8fcd69..d5a236615501 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -16,32 +16,12 @@
#include <linux/kernel.h>
#include <linux/bitops.h>
#include <linux/smp.h>
-#include <linux/nmi.h>
+#include <asm/nmi.h>
#include <linux/kprobes.h>
#include <asm/apic.h>
#include <asm/perf_event.h>
-struct nmi_watchdog_ctlblk {
- unsigned int cccr_msr;
- unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
- unsigned int evntsel_msr; /* the MSR to select the events to handle */
-};
-
-/* Interface defining a CPU specific perfctr watchdog */
-struct wd_ops {
- int (*reserve)(void);
- void (*unreserve)(void);
- int (*setup)(unsigned nmi_hz);
- void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
- void (*stop)(void);
- unsigned perfctr;
- unsigned evntsel;
- u64 checkbit;
-};
-
-static const struct wd_ops *wd_ops;
-
/*
* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
* offset from MSR_P4_BSU_ESCR0.
@@ -60,8 +40,6 @@ static const struct wd_ops *wd_ops;
static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
-static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
-
/* converts an msr to an appropriate reservation bit */
static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
{
@@ -172,623 +150,3 @@ void release_evntsel_nmi(unsigned int msr)
clear_bit(counter, evntsel_nmi_owner);
}
EXPORT_SYMBOL(release_evntsel_nmi);
-
-void disable_lapic_nmi_watchdog(void)
-{
- BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
- if (atomic_read(&nmi_active) <= 0)
- return;
-
- on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
-
- if (wd_ops)
- wd_ops->unreserve();
-
- BUG_ON(atomic_read(&nmi_active) != 0);
-}
-
-void enable_lapic_nmi_watchdog(void)
-{
- BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
-
- /* are we already enabled */
- if (atomic_read(&nmi_active) != 0)
- return;
-
- /* are we lapic aware */
- if (!wd_ops)
- return;
- if (!wd_ops->reserve()) {
- printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
- return;
- }
-
- on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
- touch_nmi_watchdog();
-}
-
-/*
- * Activate the NMI watchdog via the local APIC.
- */
-
-static unsigned int adjust_for_32bit_ctr(unsigned int hz)
-{
- u64 counter_val;
- unsigned int retval = hz;
-
- /*
- * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
- * are writable, with higher bits sign extending from bit 31.
- * So, we can only program the counter with 31 bit values and
- * 32nd bit should be 1, for 33.. to be 1.
- * Find the appropriate nmi_hz
- */
- counter_val = (u64)cpu_khz * 1000;
- do_div(counter_val, retval);
- if (counter_val > 0x7fffffffULL) {
- u64 count = (u64)cpu_khz * 1000;
- do_div(count, 0x7fffffffUL);
- retval = count + 1;
- }
- return retval;
-}
-
-static void write_watchdog_counter(unsigned int perfctr_msr,
- const char *descr, unsigned nmi_hz)
-{
- u64 count = (u64)cpu_khz * 1000;
-
- do_div(count, nmi_hz);
- if (descr)
- pr_debug("setting %s to -0x%08Lx\n", descr, count);
- wrmsrl(perfctr_msr, 0 - count);
-}
-
-static void write_watchdog_counter32(unsigned int perfctr_msr,
- const char *descr, unsigned nmi_hz)
-{
- u64 count = (u64)cpu_khz * 1000;
-
- do_div(count, nmi_hz);
- if (descr)
- pr_debug("setting %s to -0x%08Lx\n", descr, count);
- wrmsr(perfctr_msr, (u32)(-count), 0);
-}
-
-/*
- * AMD K7/K8/Family10h/Family11h support.
- * AMD keeps this interface nicely stable so there is not much variety
- */
-#define K7_EVNTSEL_ENABLE (1 << 22)
-#define K7_EVNTSEL_INT (1 << 20)
-#define K7_EVNTSEL_OS (1 << 17)
-#define K7_EVNTSEL_USR (1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
-#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
-
-static int setup_k7_watchdog(unsigned nmi_hz)
-{
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- perfctr_msr = wd_ops->perfctr;
- evntsel_msr = wd_ops->evntsel;
-
- wrmsrl(perfctr_msr, 0UL);
-
- evntsel = K7_EVNTSEL_INT
- | K7_EVNTSEL_OS
- | K7_EVNTSEL_USR
- | K7_NMI_EVENT;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz);
-
- /* initialize the wd struct before enabling */
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= K7_EVNTSEL_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
-
- return 1;
-}
-
-static void single_msr_stop_watchdog(void)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- wrmsr(wd->evntsel_msr, 0, 0);
-}
-
-static int single_msr_reserve(void)
-{
- if (!reserve_perfctr_nmi(wd_ops->perfctr))
- return 0;
-
- if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
- release_perfctr_nmi(wd_ops->perfctr);
- return 0;
- }
- return 1;
-}
-
-static void single_msr_unreserve(void)
-{
- release_evntsel_nmi(wd_ops->evntsel);
- release_perfctr_nmi(wd_ops->perfctr);
-}
-
-static void __kprobes
-single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
- /* start the cycle over again */
- write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops k7_wd_ops = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_k7_watchdog,
- .rearm = single_msr_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_K7_PERFCTR0,
- .evntsel = MSR_K7_EVNTSEL0,
- .checkbit = 1ULL << 47,
-};
-
-/*
- * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
- */
-#define P6_EVNTSEL0_ENABLE (1 << 22)
-#define P6_EVNTSEL_INT (1 << 20)
-#define P6_EVNTSEL_OS (1 << 17)
-#define P6_EVNTSEL_USR (1 << 16)
-#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
-#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
-
-static int setup_p6_watchdog(unsigned nmi_hz)
-{
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- perfctr_msr = wd_ops->perfctr;
- evntsel_msr = wd_ops->evntsel;
-
- /* KVM doesn't implement this MSR */
- if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
- return 0;
-
- evntsel = P6_EVNTSEL_INT
- | P6_EVNTSEL_OS
- | P6_EVNTSEL_USR
- | P6_NMI_EVENT;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- nmi_hz = adjust_for_32bit_ctr(nmi_hz);
- write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz);
-
- /* initialize the wd struct before enabling */
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= P6_EVNTSEL0_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
-
- return 1;
-}
-
-static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
- /*
- * P6 based Pentium M need to re-unmask
- * the apic vector but it doesn't hurt
- * other P6 variant.
- * ArchPerfom/Core Duo also needs this
- */
- apic_write(APIC_LVTPC, APIC_DM_NMI);
-
- /* P6/ARCH_PERFMON has 32 bit counter write */
- write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops p6_wd_ops = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_p6_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_P6_PERFCTR0,
- .evntsel = MSR_P6_EVNTSEL0,
- .checkbit = 1ULL << 39,
-};
-
-/*
- * Intel P4 performance counters.
- * By far the most complicated of all.
- */
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7)
-#define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
-#define P4_ESCR_OS (1 << 3)
-#define P4_ESCR_USR (1 << 2)
-#define P4_CCCR_OVF_PMI0 (1 << 26)
-#define P4_CCCR_OVF_PMI1 (1 << 27)
-#define P4_CCCR_THRESHOLD(N) ((N) << 20)
-#define P4_CCCR_COMPLEMENT (1 << 19)
-#define P4_CCCR_COMPARE (1 << 18)
-#define P4_CCCR_REQUIRED (3 << 16)
-#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
-#define P4_CCCR_ENABLE (1 << 12)
-#define P4_CCCR_OVF (1 << 31)
-
-#define P4_CONTROLS 18
-static unsigned int p4_controls[18] = {
- MSR_P4_BPU_CCCR0,
- MSR_P4_BPU_CCCR1,
- MSR_P4_BPU_CCCR2,
- MSR_P4_BPU_CCCR3,
- MSR_P4_MS_CCCR0,
- MSR_P4_MS_CCCR1,
- MSR_P4_MS_CCCR2,
- MSR_P4_MS_CCCR3,
- MSR_P4_FLAME_CCCR0,
- MSR_P4_FLAME_CCCR1,
- MSR_P4_FLAME_CCCR2,
- MSR_P4_FLAME_CCCR3,
- MSR_P4_IQ_CCCR0,
- MSR_P4_IQ_CCCR1,
- MSR_P4_IQ_CCCR2,
- MSR_P4_IQ_CCCR3,
- MSR_P4_IQ_CCCR4,
- MSR_P4_IQ_CCCR5,
-};
-/*
- * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
- * CRU_ESCR0 (with any non-null event selector) through a complemented
- * max threshold. [IA32-Vol3, Section 14.9.9]
- */
-static int setup_p4_watchdog(unsigned nmi_hz)
-{
- unsigned int perfctr_msr, evntsel_msr, cccr_msr;
- unsigned int evntsel, cccr_val;
- unsigned int misc_enable, dummy;
- unsigned int ht_num;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
- if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
- return 0;
-
-#ifdef CONFIG_SMP
- /* detect which hyperthread we are on */
- if (smp_num_siblings == 2) {
- unsigned int ebx, apicid;
-
- ebx = cpuid_ebx(1);
- apicid = (ebx >> 24) & 0xff;
- ht_num = apicid & 1;
- } else
-#endif
- ht_num = 0;
-
- /*
- * performance counters are shared resources
- * assign each hyperthread its own set
- * (re-use the ESCR0 register, seems safe
- * and keeps the cccr_val the same)
- */
- if (!ht_num) {
- /* logical cpu 0 */
- perfctr_msr = MSR_P4_IQ_PERFCTR0;
- evntsel_msr = MSR_P4_CRU_ESCR0;
- cccr_msr = MSR_P4_IQ_CCCR0;
- cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
-
- /*
- * If we're on the kdump kernel or other situation, we may
- * still have other performance counter registers set to
- * interrupt and they'll keep interrupting forever because
- * of the P4_CCCR_OVF quirk. So we need to ACK all the
- * pending interrupts and disable all the registers here,
- * before reenabling the NMI delivery. Refer to p4_rearm()
- * about the P4_CCCR_OVF quirk.
- */
- if (reset_devices) {
- unsigned int low, high;
- int i;
-
- for (i = 0; i < P4_CONTROLS; i++) {
- rdmsr(p4_controls[i], low, high);
- low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
- wrmsr(p4_controls[i], low, high);
- }
- }
- } else {
- /* logical cpu 1 */
- perfctr_msr = MSR_P4_IQ_PERFCTR1;
- evntsel_msr = MSR_P4_CRU_ESCR0;
- cccr_msr = MSR_P4_IQ_CCCR1;
-
- /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
- if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
- cccr_val = P4_CCCR_OVF_PMI0;
- else
- cccr_val = P4_CCCR_OVF_PMI1;
- cccr_val |= P4_CCCR_ESCR_SELECT(4);
- }
-
- evntsel = P4_ESCR_EVENT_SELECT(0x3F)
- | P4_ESCR_OS
- | P4_ESCR_USR;
-
- cccr_val |= P4_CCCR_THRESHOLD(15)
- | P4_CCCR_COMPLEMENT
- | P4_CCCR_COMPARE
- | P4_CCCR_REQUIRED;
-
- wrmsr(evntsel_msr, evntsel, 0);
- wrmsr(cccr_msr, cccr_val, 0);
- write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
-
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = cccr_msr;
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- cccr_val |= P4_CCCR_ENABLE;
- wrmsr(cccr_msr, cccr_val, 0);
- return 1;
-}
-
-static void stop_p4_watchdog(void)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- wrmsr(wd->cccr_msr, 0, 0);
- wrmsr(wd->evntsel_msr, 0, 0);
-}
-
-static int p4_reserve(void)
-{
- if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
- return 0;
-#ifdef CONFIG_SMP
- if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
- goto fail1;
-#endif
- if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
- goto fail2;
- /* RED-PEN why is ESCR1 not reserved here? */
- return 1;
- fail2:
-#ifdef CONFIG_SMP
- if (smp_num_siblings > 1)
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
- fail1:
-#endif
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
- return 0;
-}
-
-static void p4_unreserve(void)
-{
-#ifdef CONFIG_SMP
- if (smp_num_siblings > 1)
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
-#endif
- release_evntsel_nmi(MSR_P4_CRU_ESCR0);
- release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
-}
-
-static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
-{
- unsigned dummy;
- /*
- * P4 quirks:
- * - An overflown perfctr will assert its interrupt
- * until the OVF flag in its CCCR is cleared.
- * - LVTPC is masked on interrupt and must be
- * unmasked by the LVTPC handler.
- */
- rdmsrl(wd->cccr_msr, dummy);
- dummy &= ~P4_CCCR_OVF;
- wrmsrl(wd->cccr_msr, dummy);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- /* start the cycle over again */
- write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
-}
-
-static const struct wd_ops p4_wd_ops = {
- .reserve = p4_reserve,
- .unreserve = p4_unreserve,
- .setup = setup_p4_watchdog,
- .rearm = p4_rearm,
- .stop = stop_p4_watchdog,
- /* RED-PEN this is wrong for the other sibling */
- .perfctr = MSR_P4_BPU_PERFCTR0,
- .evntsel = MSR_P4_BSU_ESCR0,
- .checkbit = 1ULL << 39,
-};
-
-/*
- * Watchdog using the Intel architected PerfMon.
- * Used for Core2 and hopefully all future Intel CPUs.
- */
-#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
-#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
-
-static struct wd_ops intel_arch_wd_ops;
-
-static int setup_intel_arch_watchdog(unsigned nmi_hz)
-{
- unsigned int ebx;
- union cpuid10_eax eax;
- unsigned int unused;
- unsigned int perfctr_msr, evntsel_msr;
- unsigned int evntsel;
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
-
- /*
- * Check whether the Architectural PerfMon supports
- * Unhalted Core Cycles Event or not.
- * NOTE: Corresponding bit = 0 in ebx indicates event present.
- */
- cpuid(10, &(eax.full), &ebx, &unused, &unused);
- if ((eax.split.mask_length <
- (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
- (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
- return 0;
-
- perfctr_msr = wd_ops->perfctr;
- evntsel_msr = wd_ops->evntsel;
-
- wrmsrl(perfctr_msr, 0UL);
-
- evntsel = ARCH_PERFMON_EVENTSEL_INT
- | ARCH_PERFMON_EVENTSEL_OS
- | ARCH_PERFMON_EVENTSEL_USR
- | ARCH_PERFMON_NMI_EVENT_SEL
- | ARCH_PERFMON_NMI_EVENT_UMASK;
-
- /* setup the timer */
- wrmsr(evntsel_msr, evntsel, 0);
- nmi_hz = adjust_for_32bit_ctr(nmi_hz);
- write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
-
- wd->perfctr_msr = perfctr_msr;
- wd->evntsel_msr = evntsel_msr;
- wd->cccr_msr = 0; /* unused */
-
- /* ok, everything is initialized, announce that we're set */
- cpu_nmi_set_wd_enabled();
-
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
- wrmsr(evntsel_msr, evntsel, 0);
- intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
- return 1;
-}
-
-static struct wd_ops intel_arch_wd_ops __read_mostly = {
- .reserve = single_msr_reserve,
- .unreserve = single_msr_unreserve,
- .setup = setup_intel_arch_watchdog,
- .rearm = p6_rearm,
- .stop = single_msr_stop_watchdog,
- .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
- .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
-};
-
-static void probe_nmi_watchdog(void)
-{
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 == 6 ||
- (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15))
- wd_ops = &k7_wd_ops;
- return;
- case X86_VENDOR_INTEL:
- /* Work around where perfctr1 doesn't have a working enable
- * bit as described in the following errata:
- * AE49 Core Duo and Intel Core Solo 65 nm
- * AN49 Intel Pentium Dual-Core
- * AF49 Dual-Core Intel Xeon Processor LV
- */
- if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
- ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
- boot_cpu_data.x86_mask == 4))) {
- intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
- intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
- }
- if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- wd_ops = &intel_arch_wd_ops;
- break;
- }
- switch (boot_cpu_data.x86) {
- case 6:
- if (boot_cpu_data.x86_model > 13)
- return;
-
- wd_ops = &p6_wd_ops;
- break;
- case 15:
- wd_ops = &p4_wd_ops;
- break;
- default:
- return;
- }
- break;
- }
-}
-
-/* Interface to nmi.c */
-
-int lapic_watchdog_init(unsigned nmi_hz)
-{
- if (!wd_ops) {
- probe_nmi_watchdog();
- if (!wd_ops) {
- printk(KERN_INFO "NMI watchdog: CPU not supported\n");
- return -1;
- }
-
- if (!wd_ops->reserve()) {
- printk(KERN_ERR
- "NMI watchdog: cannot reserve perfctrs\n");
- return -1;
- }
- }
-
- if (!(wd_ops->setup(nmi_hz))) {
- printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
- raw_smp_processor_id());
- return -1;
- }
-
- return 0;
-}
-
-void lapic_watchdog_stop(void)
-{
- if (wd_ops)
- wd_ops->stop();
-}
-
-unsigned lapic_adjust_nmi_hz(unsigned hz)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
- wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
- hz = adjust_for_32bit_ctr(hz);
- return hz;
-}
-
-int __kprobes lapic_wd_event(unsigned nmi_hz)
-{
- struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
- u64 ctr;
-
- rdmsrl(wd->perfctr_msr, ctr);
- if (ctr & wd_ops->checkbit) /* perfctr still running? */
- return 0;
-
- wd_ops->rearm(wd, nmi_hz);
- return 1;
-}
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 1b7b31ab7d86..212a6a42527c 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -33,7 +33,6 @@
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/smp.h>
-#include <linux/smp_lock.h>
#include <linux/major.h>
#include <linux/fs.h>
#include <linux/device.h>
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6e8752c1bd52..df20723a6a1b 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -175,21 +175,21 @@ static const struct stacktrace_ops print_trace_ops = {
void
show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl)
+ unsigned long *stack, char *log_lvl)
{
printk("%sCall Trace:\n", log_lvl);
- dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
+ dump_trace(task, regs, stack, &print_trace_ops, log_lvl);
}
void show_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp)
+ unsigned long *stack)
{
- show_trace_log_lvl(task, regs, stack, bp, "");
+ show_trace_log_lvl(task, regs, stack, "");
}
void show_stack(struct task_struct *task, unsigned long *sp)
{
- show_stack_log_lvl(task, NULL, sp, 0, "");
+ show_stack_log_lvl(task, NULL, sp, "");
}
/*
@@ -197,20 +197,14 @@ void show_stack(struct task_struct *task, unsigned long *sp)
*/
void dump_stack(void)
{
- unsigned long bp = 0;
unsigned long stack;
-#ifdef CONFIG_FRAME_POINTER
- if (!bp)
- get_bp(bp);
-#endif
-
printk("Pid: %d, comm: %.20s %s %s %.*s\n",
current->pid, current->comm, print_tainted(),
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
- show_trace(NULL, NULL, &stack, bp);
+ show_trace(NULL, NULL, &stack);
}
EXPORT_SYMBOL(dump_stack);
@@ -240,6 +234,7 @@ unsigned __kprobes long oops_begin(void)
bust_spinlocks(1);
return flags;
}
+EXPORT_SYMBOL_GPL(oops_begin);
void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
{
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 1bc7f75a5bda..74cc1eda384b 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -17,11 +17,12 @@
#include <asm/stacktrace.h>
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
+void dump_trace(struct task_struct *task,
+ struct pt_regs *regs, unsigned long *stack,
const struct stacktrace_ops *ops, void *data)
{
int graph = 0;
+ unsigned long bp;
if (!task)
task = current;
@@ -34,18 +35,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
stack = (unsigned long *)task->thread.sp;
}
-#ifdef CONFIG_FRAME_POINTER
- if (!bp) {
- if (task == current) {
- /* Grab bp right from our regs */
- get_bp(bp);
- } else {
- /* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) task->thread.sp;
- }
- }
-#endif
-
+ bp = stack_frame(task, regs);
for (;;) {
struct thread_info *context;
@@ -65,7 +55,7 @@ EXPORT_SYMBOL(dump_trace);
void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+ unsigned long *sp, char *log_lvl)
{
unsigned long *stack;
int i;
@@ -87,7 +77,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
touch_nmi_watchdog();
}
printk(KERN_CONT "\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ show_trace_log_lvl(task, regs, sp, log_lvl);
}
@@ -112,8 +102,7 @@ void show_registers(struct pt_regs *regs)
u8 *ip;
printk(KERN_EMERG "Stack:\n");
- show_stack_log_lvl(NULL, regs, &regs->sp,
- 0, KERN_EMERG);
+ show_stack_log_lvl(NULL, regs, &regs->sp, KERN_EMERG);
printk(KERN_EMERG "Code: ");
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 6a340485249a..a6b6fcf7f0ae 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -139,8 +139,8 @@ fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
* severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
*/
-void dump_trace(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp,
+void dump_trace(struct task_struct *task,
+ struct pt_regs *regs, unsigned long *stack,
const struct stacktrace_ops *ops, void *data)
{
const unsigned cpu = get_cpu();
@@ -149,29 +149,19 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned used = 0;
struct thread_info *tinfo;
int graph = 0;
+ unsigned long dummy;
+ unsigned long bp;
if (!task)
task = current;
if (!stack) {
- unsigned long dummy;
stack = &dummy;
if (task && task != current)
stack = (unsigned long *)task->thread.sp;
}
-#ifdef CONFIG_FRAME_POINTER
- if (!bp) {
- if (task == current) {
- /* Grab bp right from our regs */
- get_bp(bp);
- } else {
- /* bp is the last reg pushed by switch_to */
- bp = *(unsigned long *) task->thread.sp;
- }
- }
-#endif
-
+ bp = stack_frame(task, regs);
/*
* Print function call entries in all stacks, starting at the
* current stack address. If the stacks consist of nested
@@ -235,7 +225,7 @@ EXPORT_SYMBOL(dump_trace);
void
show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl)
+ unsigned long *sp, char *log_lvl)
{
unsigned long *irq_stack_end;
unsigned long *irq_stack;
@@ -279,7 +269,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
preempt_enable();
printk(KERN_CONT "\n");
- show_trace_log_lvl(task, regs, sp, bp, log_lvl);
+ show_trace_log_lvl(task, regs, sp, log_lvl);
}
void show_registers(struct pt_regs *regs)
@@ -308,7 +298,7 @@ void show_registers(struct pt_regs *regs)
printk(KERN_EMERG "Stack:\n");
show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
- regs->bp, KERN_EMERG);
+ KERN_EMERG);
printk(KERN_EMERG "Code: ");
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0c2b7ef7a34d..294f26da0c0c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -14,6 +14,7 @@
#include <linux/bootmem.h>
#include <linux/pfn.h>
#include <linux/suspend.h>
+#include <linux/acpi.h>
#include <linux/firmware-map.h>
#include <linux/memblock.h>
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 59e175e89599..c8b4efad7ebb 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -395,7 +395,7 @@ sysenter_past_esp:
* A tiny bit of offset fixup is necessary - 4*4 means the 4 words
* pushed above; +8 corresponds to copy_thread's esp0 setting.
*/
- pushl_cfi (TI_sysenter_return-THREAD_SIZE_asm+8+4*4)(%esp)
+ pushl_cfi ((TI_sysenter_return)-THREAD_SIZE_asm+8+4*4)(%esp)
CFI_REL_OFFSET eip, 0
pushl_cfi %eax
@@ -1406,6 +1406,16 @@ ENTRY(general_protection)
CFI_ENDPROC
END(general_protection)
+#ifdef CONFIG_KVM_GUEST
+ENTRY(async_page_fault)
+ RING0_EC_FRAME
+ pushl $do_async_page_fault
+ CFI_ADJUST_CFA_OFFSET 4
+ jmp error_code
+ CFI_ENDPROC
+END(apf_page_fault)
+#endif
+
/*
* End of kprobes section
*/
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index fe2690d71c0c..aed1ffbeb0c9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -295,20 +295,25 @@ ENDPROC(native_usergs_sysret64)
.endm
/* save partial stack frame */
+ .pushsection .kprobes.text, "ax"
ENTRY(save_args)
XCPT_FRAME
cld
- movq_cfi rdi, RDI+16-ARGOFFSET
- movq_cfi rsi, RSI+16-ARGOFFSET
- movq_cfi rdx, RDX+16-ARGOFFSET
- movq_cfi rcx, RCX+16-ARGOFFSET
- movq_cfi rax, RAX+16-ARGOFFSET
- movq_cfi r8, R8+16-ARGOFFSET
- movq_cfi r9, R9+16-ARGOFFSET
- movq_cfi r10, R10+16-ARGOFFSET
- movq_cfi r11, R11+16-ARGOFFSET
-
- leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */
+ /*
+ * start from rbp in pt_regs and jump over
+ * return address.
+ */
+ movq_cfi rdi, RDI+8-RBP
+ movq_cfi rsi, RSI+8-RBP
+ movq_cfi rdx, RDX+8-RBP
+ movq_cfi rcx, RCX+8-RBP
+ movq_cfi rax, RAX+8-RBP
+ movq_cfi r8, R8+8-RBP
+ movq_cfi r9, R9+8-RBP
+ movq_cfi r10, R10+8-RBP
+ movq_cfi r11, R11+8-RBP
+
+ leaq -RBP+8(%rsp),%rdi /* arg1 for handler */
movq_cfi rbp, 8 /* push %rbp */
leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
testl $3, CS(%rdi)
@@ -334,6 +339,7 @@ ENTRY(save_args)
ret
CFI_ENDPROC
END(save_args)
+ .popsection
ENTRY(save_rest)
PARTIAL_FRAME 1 REST_SKIP+8
@@ -780,8 +786,9 @@ END(interrupt)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
- subq $ORIG_RAX-ARGOFFSET+8, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8
+ /* reserve pt_regs for scratch regs and rbp */
+ subq $ORIG_RAX-RBP, %rsp
+ CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
call save_args
PARTIAL_FRAME 0
call \func
@@ -806,9 +813,14 @@ ret_from_intr:
TRACE_IRQS_OFF
decl PER_CPU_VAR(irq_count)
leaveq
+
CFI_RESTORE rbp
CFI_DEF_CFA_REGISTER rsp
CFI_ADJUST_CFA_OFFSET -8
+
+ /* we did not save rbx, restore only from ARGOFFSET */
+ addq $8, %rsp
+ CFI_ADJUST_CFA_OFFSET -8
exit_intr:
GET_THREAD_INFO(%rcx)
testl $3,CS-ARGOFFSET(%rsp)
@@ -1317,6 +1329,9 @@ errorentry xen_stack_segment do_stack_segment
#endif
errorentry general_protection do_general_protection
errorentry page_fault do_page_fault
+#ifdef CONFIG_KVM_GUEST
+errorentry async_page_fault do_async_page_fault
+#endif
#ifdef CONFIG_X86_MCE
paranoidzeroentry machine_check *machine_check_vector(%rip)
#endif
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 3afb33f14d2d..382eb2936d4d 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -19,6 +19,7 @@
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/list.h>
+#include <linux/module.h>
#include <trace/syscall.h>
@@ -49,6 +50,7 @@ static DEFINE_PER_CPU(int, save_modifying_code);
int ftrace_arch_code_modify_prepare(void)
{
set_kernel_text_rw();
+ set_all_modules_text_rw();
modifying_code = 1;
return 0;
}
@@ -56,6 +58,7 @@ int ftrace_arch_code_modify_prepare(void)
int ftrace_arch_code_modify_post_process(void)
{
modifying_code = 0;
+ set_all_modules_text_ro();
set_kernel_text_ro();
return 0;
}
@@ -167,9 +170,9 @@ static void ftrace_mod_code(void)
void ftrace_nmi_enter(void)
{
- __get_cpu_var(save_modifying_code) = modifying_code;
+ __this_cpu_write(save_modifying_code, modifying_code);
- if (!__get_cpu_var(save_modifying_code))
+ if (!__this_cpu_read(save_modifying_code))
return;
if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
@@ -183,7 +186,7 @@ void ftrace_nmi_enter(void)
void ftrace_nmi_exit(void)
{
- if (!__get_cpu_var(save_modifying_code))
+ if (!__this_cpu_read(save_modifying_code))
return;
/* Finish all executions before clearing nmi_running */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index bcece91dd311..fc293dc8dc35 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -60,16 +60,18 @@
#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
#endif
+/* Number of possible pages in the lowmem region */
+LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT)
+
/* Enough space to fit pagetables for the low memory linear map */
-MAPPING_BEYOND_END = \
- PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
+MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
/*
* Worst-case size of the kernel mapping we need to make:
- * the worst-case size of the kernel itself, plus the extra we need
- * to map for the linear map.
+ * a relocatable kernel can live anywhere in lowmem, so we need to be able
+ * to map all of lowmem.
*/
-KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT
+KERNEL_PAGES = LOWMEM_PAGES
INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm
RESERVE_BRK(pagetables, INIT_MAP_SIZE)
@@ -124,7 +126,7 @@ ENTRY(startup_32)
movsl
movl pa(boot_params) + NEW_CL_POINTER,%esi
andl %esi,%esi
- jz 1f # No comand line
+ jz 1f # No command line
movl $pa(boot_command_line),%edi
movl $(COMMAND_LINE_SIZE/4),%ecx
rep
@@ -137,39 +139,6 @@ ENTRY(startup_32)
movl %eax, pa(olpc_ofw_pgd)
#endif
-#ifdef CONFIG_PARAVIRT
- /* This is can only trip for a broken bootloader... */
- cmpw $0x207, pa(boot_params + BP_version)
- jb default_entry
-
- /* Paravirt-compatible boot parameters. Look to see what architecture
- we're booting under. */
- movl pa(boot_params + BP_hardware_subarch), %eax
- cmpl $num_subarch_entries, %eax
- jae bad_subarch
-
- movl pa(subarch_entries)(,%eax,4), %eax
- subl $__PAGE_OFFSET, %eax
- jmp *%eax
-
-bad_subarch:
-WEAK(lguest_entry)
-WEAK(xen_entry)
- /* Unknown implementation; there's really
- nothing we can do at this point. */
- ud2a
-
- __INITDATA
-
-subarch_entries:
- .long default_entry /* normal x86/PC */
- .long lguest_entry /* lguest hypervisor */
- .long xen_entry /* Xen hypervisor */
- .long default_entry /* Moorestown MID */
-num_subarch_entries = (. - subarch_entries) / 4
-.previous
-#endif /* CONFIG_PARAVIRT */
-
/*
* Initialize page tables. This creates a PDE and a set of page
* tables, which are located immediately beyond __brk_base. The variable
@@ -179,7 +148,6 @@ num_subarch_entries = (. - subarch_entries) / 4
*
* Note that the stack is not yet set up!
*/
-default_entry:
#ifdef CONFIG_X86_PAE
/*
@@ -259,7 +227,42 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
movl %eax,pa(initial_page_table+0xffc)
#endif
- jmp 3f
+
+#ifdef CONFIG_PARAVIRT
+ /* This is can only trip for a broken bootloader... */
+ cmpw $0x207, pa(boot_params + BP_version)
+ jb default_entry
+
+ /* Paravirt-compatible boot parameters. Look to see what architecture
+ we're booting under. */
+ movl pa(boot_params + BP_hardware_subarch), %eax
+ cmpl $num_subarch_entries, %eax
+ jae bad_subarch
+
+ movl pa(subarch_entries)(,%eax,4), %eax
+ subl $__PAGE_OFFSET, %eax
+ jmp *%eax
+
+bad_subarch:
+WEAK(lguest_entry)
+WEAK(xen_entry)
+ /* Unknown implementation; there's really
+ nothing we can do at this point. */
+ ud2a
+
+ __INITDATA
+
+subarch_entries:
+ .long default_entry /* normal x86/PC */
+ .long lguest_entry /* lguest hypervisor */
+ .long xen_entry /* Xen hypervisor */
+ .long default_entry /* Moorestown MID */
+num_subarch_entries = (. - subarch_entries) / 4
+.previous
+#else
+ jmp default_entry
+#endif /* CONFIG_PARAVIRT */
+
/*
* Non-boot CPU entry point; entered from trampoline.S
* We can't lgdt here, because lgdt itself uses a data segment, but
@@ -280,7 +283,7 @@ ENTRY(startup_32_smp)
movl %eax,%fs
movl %eax,%gs
#endif /* CONFIG_SMP */
-3:
+default_entry:
/*
* New page tables may be in 4Mbyte page mode and may
@@ -314,6 +317,10 @@ ENTRY(startup_32_smp)
subl $0x80000001, %eax
cmpl $(0x8000ffff-0x80000001), %eax
ja 6f
+
+ /* Clear bogus XD_DISABLE bits */
+ call verify_cpu
+
mov $0x80000001, %eax
cpuid
/* Execute Disable bit supported? */
@@ -609,6 +616,8 @@ ignore_int:
#endif
iret
+#include "verify_cpu.S"
+
__REFDATA
.align 4
ENTRY(initial_code)
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index ae03cab4352e..4ff5968f12d2 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -27,6 +27,9 @@
#define HPET_DEV_FSB_CAP 0x1000
#define HPET_DEV_PERI_CAP 0x2000
+#define HPET_MIN_CYCLES 128
+#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
+
#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
/*
@@ -299,8 +302,9 @@ static void hpet_legacy_clockevent_register(void)
/* Calculate the min / max delta */
hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
&hpet_clockevent);
- /* 5 usec minimum reprogramming delta. */
- hpet_clockevent.min_delta_ns = 5000;
+ /* Setup minimum reprogramming delta. */
+ hpet_clockevent.min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA,
+ &hpet_clockevent);
/*
* Start hpet with the boot cpu mask and make it
@@ -393,22 +397,24 @@ static int hpet_next_event(unsigned long delta,
* the wraparound into account) nor a simple count down event
* mode. Further the write to the comparator register is
* delayed internally up to two HPET clock cycles in certain
- * chipsets (ATI, ICH9,10). We worked around that by reading
- * back the compare register, but that required another
- * workaround for ICH9,10 chips where the first readout after
- * write can return the old stale value. We already have a
- * minimum delta of 5us enforced, but a NMI or SMI hitting
+ * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even
+ * longer delays. We worked around that by reading back the
+ * compare register, but that required another workaround for
+ * ICH9,10 chips where the first readout after write can
+ * return the old stale value. We already had a minimum
+ * programming delta of 5us enforced, but a NMI or SMI hitting
* between the counter readout and the comparator write can
* move us behind that point easily. Now instead of reading
* the compare register back several times, we make the ETIME
* decision based on the following: Return ETIME if the
- * counter value after the write is less than 8 HPET cycles
+ * counter value after the write is less than HPET_MIN_CYCLES
* away from the event or if the counter is already ahead of
- * the event.
+ * the event. The minimum programming delta for the generic
+ * clockevents code is set to 1.5 * HPET_MIN_CYCLES.
*/
res = (s32)(cnt - hpet_readl(HPET_COUNTER));
- return res < 8 ? -ETIME : 0;
+ return res < HPET_MIN_CYCLES ? -ETIME : 0;
}
static void hpet_legacy_set_mode(enum clock_event_mode mode,
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index ff15c9dcc25d..02f07634d265 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -122,7 +122,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
return -EBUSY;
set_debugreg(info->address, i);
- __get_cpu_var(cpu_debugreg[i]) = info->address;
+ __this_cpu_write(cpu_debugreg[i], info->address);
dr7 = &__get_cpu_var(cpu_dr7);
*dr7 |= encode_dr7(i, info->len, info->type);
@@ -397,12 +397,12 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
void hw_breakpoint_restore(void)
{
- set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
- set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
- set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
- set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
+ set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0);
+ set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1);
+ set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2);
+ set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3);
set_debugreg(current->thread.debugreg6, 6);
- set_debugreg(__get_cpu_var(cpu_dr7), 7);
+ set_debugreg(__this_cpu_read(cpu_dr7), 7);
}
EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
@@ -433,6 +433,10 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
dr6_p = (unsigned long *)ERR_PTR(args->err);
dr6 = *dr6_p;
+ /* If it's a single step, TRAP bits are random */
+ if (dr6 & DR_STEP)
+ return NOTIFY_DONE;
+
/* Do an early return if no trap bits are set in DR6 */
if ((dr6 & DR_TRAP_BITS) == 0)
return NOTIFY_DONE;
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 58bb239a2fd7..e60c38cc0eed 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk)
set_stopped_child_used_math(tsk);
return 0;
}
+EXPORT_SYMBOL_GPL(init_fpu);
/*
* The xstateregs_active() routine is the same as the fpregs_active() routine,
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 83ec0175f986..52945da52a94 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -4,6 +4,7 @@
#include <linux/cpu.h>
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
+#include <linux/of.h>
#include <linux/seq_file.h>
#include <linux/smp.h>
#include <linux/ftrace.h>
@@ -234,7 +235,7 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
exit_idle();
irq_enter();
- irq = __get_cpu_var(vector_irq)[vector];
+ irq = __this_cpu_read(vector_irq[vector]);
if (!handle_irq(irq, regs)) {
ack_APIC_irq();
@@ -275,6 +276,15 @@ void smp_x86_platform_ipi(struct pt_regs *regs)
EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
+#ifdef CONFIG_OF
+unsigned int irq_create_of_mapping(struct device_node *controller,
+ const u32 *intspec, unsigned int intsize)
+{
+ return intspec[0];
+}
+EXPORT_SYMBOL_GPL(irq_create_of_mapping);
+#endif
+
#ifdef CONFIG_HOTPLUG_CPU
/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
void fixup_irqs(void)
@@ -350,12 +360,12 @@ void fixup_irqs(void)
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
unsigned int irr;
- if (__get_cpu_var(vector_irq)[vector] < 0)
+ if (__this_cpu_read(vector_irq[vector]) < 0)
continue;
irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
if (irr & (1 << (vector % 32))) {
- irq = __get_cpu_var(vector_irq)[vector];
+ irq = __this_cpu_read(vector_irq[vector]);
data = irq_get_irq_data(irq);
raw_spin_lock(&desc->lock);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 96656f207751..9974d21048fd 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -79,7 +79,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
u32 *isp, arg1, arg2;
curctx = (union irq_ctx *) current_thread_info();
- irqctx = __get_cpu_var(hardirq_ctx);
+ irqctx = __this_cpu_read(hardirq_ctx);
/*
* this is where we switch to the IRQ stack. However, if we are
@@ -129,8 +129,7 @@ void __cpuinit irq_ctx_init(int cpu)
irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
THREAD_FLAGS,
THREAD_ORDER));
- irqctx->tinfo.task = NULL;
- irqctx->tinfo.exec_domain = NULL;
+ memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
irqctx->tinfo.cpu = cpu;
irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
@@ -140,10 +139,8 @@ void __cpuinit irq_ctx_init(int cpu)
irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
THREAD_FLAGS,
THREAD_ORDER));
- irqctx->tinfo.task = NULL;
- irqctx->tinfo.exec_domain = NULL;
+ memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.preempt_count = 0;
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
per_cpu(softirq_ctx, cpu) = irqctx;
@@ -166,7 +163,7 @@ asmlinkage void do_softirq(void)
if (local_softirq_pending()) {
curctx = current_thread_info();
- irqctx = __get_cpu_var(softirq_ctx);
+ irqctx = __this_cpu_read(softirq_ctx);
irqctx->tinfo.task = curctx->task;
irqctx->tinfo.previous_esp = current_stack_pointer;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index ec592caac4b4..a4130005028a 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -48,6 +48,7 @@
#include <asm/apicdef.h>
#include <asm/system.h>
#include <asm/apic.h>
+#include <asm/nmi.h>
struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{
@@ -315,14 +316,18 @@ static void kgdb_remove_all_hw_break(void)
if (!breakinfo[i].enabled)
continue;
bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
- if (bp->attr.disabled == 1)
+ if (!bp->attr.disabled) {
+ arch_uninstall_hw_breakpoint(bp);
+ bp->attr.disabled = 1;
continue;
+ }
if (dbg_is_early)
early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
breakinfo[i].type);
- else
- arch_uninstall_hw_breakpoint(bp);
- bp->attr.disabled = 1;
+ else if (hw_break_release_slot(i))
+ printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n",
+ breakinfo[i].addr);
+ breakinfo[i].enabled = 0;
}
}
@@ -521,10 +526,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
}
return NOTIFY_DONE;
- case DIE_NMI_IPI:
- /* Just ignore, we will handle the roundup on DIE_NMI. */
- return NOTIFY_DONE;
-
case DIE_NMIUNKNOWN:
if (was_in_debug_nmi[raw_smp_processor_id()]) {
was_in_debug_nmi[raw_smp_processor_id()] = 0;
@@ -602,7 +603,7 @@ static struct notifier_block kgdb_notifier = {
/*
* Lowest-prio notifier priority, we want to be notified last:
*/
- .priority = -INT_MAX,
+ .priority = NMI_LOCAL_LOW_PRIOR,
};
/**
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 1cbd54c0df99..d91c477b3f62 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -403,7 +403,7 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
- __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp;
+ __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
kcb->kprobe_status = kcb->prev_kprobe.status;
kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
@@ -412,7 +412,7 @@ static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb)
{
- __get_cpu_var(current_kprobe) = p;
+ __this_cpu_write(current_kprobe, p);
kcb->kprobe_saved_flags = kcb->kprobe_old_flags
= (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
if (is_IF_modifier(p->ainsn.insn))
@@ -586,7 +586,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
preempt_enable_no_resched();
return 1;
} else if (kprobe_running()) {
- p = __get_cpu_var(current_kprobe);
+ p = __this_cpu_read(current_kprobe);
if (p->break_handler && p->break_handler(p, regs)) {
setup_singlestep(p, regs, kcb, 0);
return 1;
@@ -759,11 +759,11 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
orig_ret_address = (unsigned long)ri->ret_addr;
if (ri->rp && ri->rp->handler) {
- __get_cpu_var(current_kprobe) = &ri->rp->kp;
+ __this_cpu_write(current_kprobe, &ri->rp->kp);
get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
ri->ret_addr = correct_ret_addr;
ri->rp->handler(ri, regs);
- __get_cpu_var(current_kprobe) = NULL;
+ __this_cpu_write(current_kprobe, NULL);
}
recycle_rp_inst(ri, &empty_rp);
@@ -1184,6 +1184,10 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+ /* This is possible if op is under delayed unoptimizing */
+ if (kprobe_disabled(&op->kp))
+ return;
+
preempt_disable();
if (kprobe_running()) {
kprobes_inc_nmissed_count(&op->kp);
@@ -1198,10 +1202,10 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
regs->orig_ax = ~0UL;
- __get_cpu_var(current_kprobe) = &op->kp;
+ __this_cpu_write(current_kprobe, &op->kp);
kcb->kprobe_status = KPROBE_HIT_ACTIVE;
opt_pre_handler(&op->kp, regs);
- __get_cpu_var(current_kprobe) = NULL;
+ __this_cpu_write(current_kprobe, NULL);
}
preempt_enable_no_resched();
}
@@ -1401,10 +1405,16 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
return 0;
}
-/* Replace a breakpoint (int3) with a relative jump. */
-int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
+#define MAX_OPTIMIZE_PROBES 256
+static struct text_poke_param *jump_poke_params;
+static struct jump_poke_buffer {
+ u8 buf[RELATIVEJUMP_SIZE];
+} *jump_poke_bufs;
+
+static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
+ u8 *insn_buf,
+ struct optimized_kprobe *op)
{
- unsigned char jmp_code[RELATIVEJUMP_SIZE];
s32 rel = (s32)((long)op->optinsn.insn -
((long)op->kp.addr + RELATIVEJUMP_SIZE));
@@ -1412,16 +1422,79 @@ int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
RELATIVE_ADDR_SIZE);
- jmp_code[0] = RELATIVEJUMP_OPCODE;
- *(s32 *)(&jmp_code[1]) = rel;
+ insn_buf[0] = RELATIVEJUMP_OPCODE;
+ *(s32 *)(&insn_buf[1]) = rel;
+
+ tprm->addr = op->kp.addr;
+ tprm->opcode = insn_buf;
+ tprm->len = RELATIVEJUMP_SIZE;
+}
+
+/*
+ * Replace breakpoints (int3) with relative jumps.
+ * Caller must call with locking kprobe_mutex and text_mutex.
+ */
+void __kprobes arch_optimize_kprobes(struct list_head *oplist)
+{
+ struct optimized_kprobe *op, *tmp;
+ int c = 0;
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ WARN_ON(kprobe_disabled(&op->kp));
+ /* Setup param */
+ setup_optimize_kprobe(&jump_poke_params[c],
+ jump_poke_bufs[c].buf, op);
+ list_del_init(&op->list);
+ if (++c >= MAX_OPTIMIZE_PROBES)
+ break;
+ }
/*
* text_poke_smp doesn't support NMI/MCE code modifying.
* However, since kprobes itself also doesn't support NMI/MCE
* code probing, it's not a problem.
*/
- text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
- return 0;
+ text_poke_smp_batch(jump_poke_params, c);
+}
+
+static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
+ u8 *insn_buf,
+ struct optimized_kprobe *op)
+{
+ /* Set int3 to first byte for kprobes */
+ insn_buf[0] = BREAKPOINT_INSTRUCTION;
+ memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
+
+ tprm->addr = op->kp.addr;
+ tprm->opcode = insn_buf;
+ tprm->len = RELATIVEJUMP_SIZE;
+}
+
+/*
+ * Recover original instructions and breakpoints from relative jumps.
+ * Caller must call with locking kprobe_mutex.
+ */
+extern void arch_unoptimize_kprobes(struct list_head *oplist,
+ struct list_head *done_list)
+{
+ struct optimized_kprobe *op, *tmp;
+ int c = 0;
+
+ list_for_each_entry_safe(op, tmp, oplist, list) {
+ /* Setup param */
+ setup_unoptimize_kprobe(&jump_poke_params[c],
+ jump_poke_bufs[c].buf, op);
+ list_move(&op->list, done_list);
+ if (++c >= MAX_OPTIMIZE_PROBES)
+ break;
+ }
+
+ /*
+ * text_poke_smp doesn't support NMI/MCE code modifying.
+ * However, since kprobes itself also doesn't support NMI/MCE
+ * code probing, it's not a problem.
+ */
+ text_poke_smp_batch(jump_poke_params, c);
}
/* Replace a relative jump with a breakpoint (int3). */
@@ -1453,11 +1526,35 @@ static int __kprobes setup_detour_execution(struct kprobe *p,
}
return 0;
}
+
+static int __kprobes init_poke_params(void)
+{
+ /* Allocate code buffer and parameter array */
+ jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
+ MAX_OPTIMIZE_PROBES, GFP_KERNEL);
+ if (!jump_poke_bufs)
+ return -ENOMEM;
+
+ jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
+ MAX_OPTIMIZE_PROBES, GFP_KERNEL);
+ if (!jump_poke_params) {
+ kfree(jump_poke_bufs);
+ jump_poke_bufs = NULL;
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+#else /* !CONFIG_OPTPROBES */
+static int __kprobes init_poke_params(void)
+{
+ return 0;
+}
#endif
int __init arch_init_kprobes(void)
{
- return 0;
+ return init_poke_params();
}
int __kprobes arch_trampoline_kprobe(struct kprobe *p)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 63b0ec8d3d4a..8dc44662394b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,16 +27,37 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/kprobes.h>
#include <asm/timer.h>
+#include <asm/cpu.h>
+#include <asm/traps.h>
+#include <asm/desc.h>
+#include <asm/tlbflush.h>
#define MMU_QUEUE_SIZE 1024
+static int kvmapf = 1;
+
+static int parse_no_kvmapf(char *arg)
+{
+ kvmapf = 0;
+ return 0;
+}
+
+early_param("no-kvmapf", parse_no_kvmapf);
+
struct kvm_para_state {
u8 mmu_queue[MMU_QUEUE_SIZE];
int mmu_queue_len;
};
static DEFINE_PER_CPU(struct kvm_para_state, para_state);
+static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
static struct kvm_para_state *kvm_para_state(void)
{
@@ -50,6 +71,195 @@ static void kvm_io_delay(void)
{
}
+#define KVM_TASK_SLEEP_HASHBITS 8
+#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
+
+struct kvm_task_sleep_node {
+ struct hlist_node link;
+ wait_queue_head_t wq;
+ u32 token;
+ int cpu;
+ bool halted;
+ struct mm_struct *mm;
+};
+
+static struct kvm_task_sleep_head {
+ spinlock_t lock;
+ struct hlist_head list;
+} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
+
+static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
+ u32 token)
+{
+ struct hlist_node *p;
+
+ hlist_for_each(p, &b->list) {
+ struct kvm_task_sleep_node *n =
+ hlist_entry(p, typeof(*n), link);
+ if (n->token == token)
+ return n;
+ }
+
+ return NULL;
+}
+
+void kvm_async_pf_task_wait(u32 token)
+{
+ u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+ struct kvm_task_sleep_node n, *e;
+ DEFINE_WAIT(wait);
+ int cpu, idle;
+
+ cpu = get_cpu();
+ idle = idle_cpu(cpu);
+ put_cpu();
+
+ spin_lock(&b->lock);
+ e = _find_apf_task(b, token);
+ if (e) {
+ /* dummy entry exist -> wake up was delivered ahead of PF */
+ hlist_del(&e->link);
+ kfree(e);
+ spin_unlock(&b->lock);
+ return;
+ }
+
+ n.token = token;
+ n.cpu = smp_processor_id();
+ n.mm = current->active_mm;
+ n.halted = idle || preempt_count() > 1;
+ atomic_inc(&n.mm->mm_count);
+ init_waitqueue_head(&n.wq);
+ hlist_add_head(&n.link, &b->list);
+ spin_unlock(&b->lock);
+
+ for (;;) {
+ if (!n.halted)
+ prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+ if (hlist_unhashed(&n.link))
+ break;
+
+ if (!n.halted) {
+ local_irq_enable();
+ schedule();
+ local_irq_disable();
+ } else {
+ /*
+ * We cannot reschedule. So halt.
+ */
+ native_safe_halt();
+ local_irq_disable();
+ }
+ }
+ if (!n.halted)
+ finish_wait(&n.wq, &wait);
+
+ return;
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
+
+static void apf_task_wake_one(struct kvm_task_sleep_node *n)
+{
+ hlist_del_init(&n->link);
+ if (!n->mm)
+ return;
+ mmdrop(n->mm);
+ if (n->halted)
+ smp_send_reschedule(n->cpu);
+ else if (waitqueue_active(&n->wq))
+ wake_up(&n->wq);
+}
+
+static void apf_task_wake_all(void)
+{
+ int i;
+
+ for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
+ struct hlist_node *p, *next;
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
+ spin_lock(&b->lock);
+ hlist_for_each_safe(p, next, &b->list) {
+ struct kvm_task_sleep_node *n =
+ hlist_entry(p, typeof(*n), link);
+ if (n->cpu == smp_processor_id())
+ apf_task_wake_one(n);
+ }
+ spin_unlock(&b->lock);
+ }
+}
+
+void kvm_async_pf_task_wake(u32 token)
+{
+ u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+ struct kvm_task_sleep_node *n;
+
+ if (token == ~0) {
+ apf_task_wake_all();
+ return;
+ }
+
+again:
+ spin_lock(&b->lock);
+ n = _find_apf_task(b, token);
+ if (!n) {
+ /*
+ * async PF was not yet handled.
+ * Add dummy entry for the token.
+ */
+ n = kmalloc(sizeof(*n), GFP_ATOMIC);
+ if (!n) {
+ /*
+ * Allocation failed! Busy wait while other cpu
+ * handles async PF.
+ */
+ spin_unlock(&b->lock);
+ cpu_relax();
+ goto again;
+ }
+ n->token = token;
+ n->cpu = smp_processor_id();
+ n->mm = NULL;
+ init_waitqueue_head(&n->wq);
+ hlist_add_head(&n->link, &b->list);
+ } else
+ apf_task_wake_one(n);
+ spin_unlock(&b->lock);
+ return;
+}
+EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
+
+u32 kvm_read_and_reset_pf_reason(void)
+{
+ u32 reason = 0;
+
+ if (__get_cpu_var(apf_reason).enabled) {
+ reason = __get_cpu_var(apf_reason).reason;
+ __get_cpu_var(apf_reason).reason = 0;
+ }
+
+ return reason;
+}
+EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
+
+dotraplinkage void __kprobes
+do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
+{
+ switch (kvm_read_and_reset_pf_reason()) {
+ default:
+ do_page_fault(regs, error_code);
+ break;
+ case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ /* page is swapped out by the host. */
+ kvm_async_pf_task_wait((u32)read_cr2());
+ break;
+ case KVM_PV_REASON_PAGE_READY:
+ kvm_async_pf_task_wake((u32)read_cr2());
+ break;
+ }
+}
+
static void kvm_mmu_op(void *buffer, unsigned len)
{
int r;
@@ -231,10 +441,117 @@ static void __init paravirt_ops_setup(void)
#endif
}
+void __cpuinit kvm_guest_cpu_init(void)
+{
+ if (!kvm_para_available())
+ return;
+
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
+ u64 pa = __pa(&__get_cpu_var(apf_reason));
+
+#ifdef CONFIG_PREEMPT
+ pa |= KVM_ASYNC_PF_SEND_ALWAYS;
+#endif
+ wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
+ __get_cpu_var(apf_reason).enabled = 1;
+ printk(KERN_INFO"KVM setup async PF for cpu %d\n",
+ smp_processor_id());
+ }
+}
+
+static void kvm_pv_disable_apf(void *unused)
+{
+ if (!__get_cpu_var(apf_reason).enabled)
+ return;
+
+ wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
+ __get_cpu_var(apf_reason).enabled = 0;
+
+ printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
+ smp_processor_id());
+}
+
+static int kvm_pv_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
+{
+ if (code == SYS_RESTART)
+ on_each_cpu(kvm_pv_disable_apf, NULL, 1);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block kvm_pv_reboot_nb = {
+ .notifier_call = kvm_pv_reboot_notify,
+};
+
+#ifdef CONFIG_SMP
+static void __init kvm_smp_prepare_boot_cpu(void)
+{
+#ifdef CONFIG_KVM_CLOCK
+ WARN_ON(kvm_register_clock("primary cpu clock"));
+#endif
+ kvm_guest_cpu_init();
+ native_smp_prepare_boot_cpu();
+}
+
+static void kvm_guest_cpu_online(void *dummy)
+{
+ kvm_guest_cpu_init();
+}
+
+static void kvm_guest_cpu_offline(void *dummy)
+{
+ kvm_pv_disable_apf(NULL);
+ apf_task_wake_all();
+}
+
+static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ int cpu = (unsigned long)hcpu;
+ switch (action) {
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ case CPU_ONLINE_FROZEN:
+ smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
+ break;
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
+ .notifier_call = kvm_cpu_notify,
+};
+#endif
+
+static void __init kvm_apf_trap_init(void)
+{
+ set_intr_gate(14, &async_page_fault);
+}
+
void __init kvm_guest_init(void)
{
+ int i;
+
if (!kvm_para_available())
return;
paravirt_ops_setup();
+ register_reboot_notifier(&kvm_pv_reboot_nb);
+ for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+ spin_lock_init(&async_pf_sleepers[i].lock);
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
+ x86_init.irqs.trap_init = kvm_apf_trap_init;
+
+#ifdef CONFIG_SMP
+ smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+ register_cpu_notifier(&kvm_cpu_notifier);
+#else
+ kvm_guest_cpu_init();
+#endif
}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index ca43ce31a19c..f98d3eafe07a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -125,7 +125,7 @@ static struct clocksource kvm_clock = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
-static int kvm_register_clock(char *txt)
+int kvm_register_clock(char *txt)
{
int cpu = smp_processor_id();
int low, high, ret;
@@ -152,14 +152,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
}
#endif
-#ifdef CONFIG_SMP
-static void __init kvm_smp_prepare_boot_cpu(void)
-{
- WARN_ON(kvm_register_clock("primary cpu clock"));
- native_smp_prepare_boot_cpu();
-}
-#endif
-
/*
* After the clock is registered, the host will keep writing to the
* registered memory location. If the guest happens to shutdown, this memory
@@ -206,9 +198,6 @@ void __init kvmclock_init(void)
x86_cpuinit.setup_percpu_clockev =
kvm_setup_secondary_clock;
#endif
-#ifdef CONFIG_SMP
- smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
-#endif
machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC
machine_ops.crash_shutdown = kvm_crash_shutdown;
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index e1af7c055c7d..0fe6d1a66c38 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -155,12 +155,6 @@ static int apply_microcode_amd(int cpu)
return 0;
}
-static int get_ucode_data(void *to, const u8 *from, size_t n)
-{
- memcpy(to, from, n);
- return 0;
-}
-
static void *
get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
{
@@ -168,8 +162,7 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
void *mc;
- if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR))
- return NULL;
+ get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR);
if (section_hdr[0] != UCODE_UCODE_TYPE) {
pr_err("error: invalid type field in container file section header\n");
@@ -183,16 +176,13 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
return NULL;
}
- mc = vmalloc(UCODE_MAX_SIZE);
- if (mc) {
- memset(mc, 0, UCODE_MAX_SIZE);
- if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR,
- total_size)) {
- vfree(mc);
- mc = NULL;
- } else
- *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
- }
+ mc = vzalloc(UCODE_MAX_SIZE);
+ if (!mc)
+ return NULL;
+
+ get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size);
+ *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
+
return mc;
}
@@ -202,8 +192,7 @@ static int install_equiv_cpu_table(const u8 *buf)
unsigned int *buf_pos = (unsigned int *)container_hdr;
unsigned long size;
- if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE))
- return 0;
+ get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE);
size = buf_pos[2];
@@ -212,17 +201,14 @@ static int install_equiv_cpu_table(const u8 *buf)
return 0;
}
- equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
+ equiv_cpu_table = vmalloc(size);
if (!equiv_cpu_table) {
pr_err("failed to allocate equivalent CPU table\n");
return 0;
}
buf += UCODE_CONTAINER_HEADER_SIZE;
- if (get_ucode_data(equiv_cpu_table, buf, size)) {
- vfree(equiv_cpu_table);
- return 0;
- }
+ get_ucode_data(equiv_cpu_table, buf, size);
return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
}
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index dcb65cc0a053..1a1b606d3e92 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -364,8 +364,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
/* For performance reasons, reuse mc area when possible */
if (!mc || mc_size > curr_mc_size) {
- if (mc)
- vfree(mc);
+ vfree(mc);
mc = vmalloc(mc_size);
if (!mc)
break;
@@ -374,13 +373,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
if (get_ucode_data(mc, ucode_ptr, mc_size) ||
microcode_sanity_check(mc) < 0) {
- vfree(mc);
break;
}
if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
- if (new_mc)
- vfree(new_mc);
+ vfree(new_mc);
new_rev = mc_header.rev;
new_mc = mc;
mc = NULL; /* trigger new vmalloc */
@@ -390,12 +387,10 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
leftover -= mc_size;
}
- if (mc)
- vfree(mc);
+ vfree(mc);
if (leftover) {
- if (new_mc)
- vfree(new_mc);
+ vfree(new_mc);
state = UCODE_ERROR;
goto out;
}
@@ -405,8 +400,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
goto out;
}
- if (uci->mc)
- vfree(uci->mc);
+ vfree(uci->mc);
uci->mc = (struct microcode_intel *)new_mc;
pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 71825806cd44..ac861b8348e2 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -25,7 +25,6 @@ struct pci_hostbridge_probe {
};
static u64 __cpuinitdata fam10h_pci_mmconf_base;
-static int __cpuinitdata fam10h_pci_mmconf_base_status;
static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
{ 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 },
@@ -44,10 +43,12 @@ static int __cpuinit cmp_range(const void *x1, const void *x2)
return start1 - start2;
}
-/*[47:0] */
-/* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */
+#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT)
+#define MMCONF_MASK (~(MMCONF_UNIT - 1))
+#define MMCONF_SIZE (MMCONF_UNIT << 8)
+/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */
#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32)
-#define BASE_VALID(b) ((b != (0xfdULL << 32)) && (b != (0xfeULL << 32)))
+#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40))
static void __cpuinit get_fam10h_pci_mmconf_base(void)
{
int i;
@@ -64,12 +65,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
struct range range[8];
/* only try to get setting from BSP */
- /* -1 or 1 */
- if (fam10h_pci_mmconf_base_status)
+ if (fam10h_pci_mmconf_base)
return;
if (!early_pci_allowed())
- goto fail;
+ return;
found = 0;
for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
@@ -91,7 +91,7 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
}
if (!found)
- goto fail;
+ return;
/* SYS_CFG */
address = MSR_K8_SYSCFG;
@@ -99,16 +99,16 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
/* TOP_MEM2 is not enabled? */
if (!(val & (1<<21))) {
- tom2 = 0;
+ tom2 = 1ULL << 32;
} else {
/* TOP_MEM2 */
address = MSR_K8_TOP_MEM2;
rdmsrl(address, val);
- tom2 = val & (0xffffULL<<32);
+ tom2 = max(val & 0xffffff800000ULL, 1ULL << 32);
}
if (base <= tom2)
- base = tom2 + (1ULL<<32);
+ base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;
/*
* need to check if the range is in the high mmio range that is
@@ -123,11 +123,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
if (!(reg & 3))
continue;
- start = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/
+ start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/
reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3));
- end = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/
+ end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/
- if (!end)
+ if (end < tom2)
continue;
range[hi_mmio_num].start = start;
@@ -143,32 +143,27 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
if (range[hi_mmio_num - 1].end < base)
goto out;
- if (range[0].start > base)
+ if (range[0].start > base + MMCONF_SIZE)
goto out;
/* need to find one window */
- base = range[0].start - (1ULL << 32);
+ base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT;
if ((base > tom2) && BASE_VALID(base))
goto out;
- base = range[hi_mmio_num - 1].end + (1ULL << 32);
- if ((base > tom2) && BASE_VALID(base))
+ base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK;
+ if (BASE_VALID(base))
goto out;
/* need to find window between ranges */
- if (hi_mmio_num > 1)
- for (i = 0; i < hi_mmio_num - 1; i++) {
- if (range[i + 1].start > (range[i].end + (1ULL << 32))) {
- base = range[i].end + (1ULL << 32);
- if ((base > tom2) && BASE_VALID(base))
- goto out;
- }
+ for (i = 1; i < hi_mmio_num; i++) {
+ base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK;
+ val = range[i].start & MMCONF_MASK;
+ if (val >= base + MMCONF_SIZE && BASE_VALID(base))
+ goto out;
}
-
-fail:
- fam10h_pci_mmconf_base_status = -1;
return;
+
out:
fam10h_pci_mmconf_base = base;
- fam10h_pci_mmconf_base_status = 1;
}
void __cpuinit fam10h_check_enable_mmcfg(void)
@@ -190,11 +185,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
/* only trust the one handle 256 buses, if acpi=off */
if (!acpi_pci_disabled || busnbits >= 8) {
- u64 base;
- base = val & (0xffffULL << 32);
- if (fam10h_pci_mmconf_base_status <= 0) {
+ u64 base = val & MMCONF_MASK;
+
+ if (!fam10h_pci_mmconf_base) {
fam10h_pci_mmconf_base = base;
- fam10h_pci_mmconf_base_status = 1;
return;
} else if (fam10h_pci_mmconf_base == base)
return;
@@ -206,8 +200,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
* with 256 buses
*/
get_fam10h_pci_mmconf_base();
- if (fam10h_pci_mmconf_base_status <= 0)
+ if (!fam10h_pci_mmconf_base) {
+ pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
return;
+ }
printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n");
val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) |
@@ -217,13 +213,13 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
wrmsrl(address, val);
}
-static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d)
+static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d)
{
pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF;
return 0;
}
-static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
+static const struct dmi_system_id __initconst mmconf_dmi_table[] = {
{
.callback = set_check_enable_amd_mmconf,
.ident = "Sun Microsystems Machine",
@@ -234,7 +230,8 @@ static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
{}
};
-void __cpuinit check_enable_amd_mmconf_dmi(void)
+/* Called from a __cpuinit function, but only on the BSP. */
+void __ref check_enable_amd_mmconf_dmi(void)
{
dmi_check_system(mmconf_dmi_table);
}
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 8f2956091735..ab23f1ad4bf1 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -37,20 +37,11 @@
void *module_alloc(unsigned long size)
{
- struct vm_struct *area;
-
- if (!size)
- return NULL;
- size = PAGE_ALIGN(size);
- if (size > MODULES_LEN)
+ if (PAGE_ALIGN(size) > MODULES_LEN)
return NULL;
-
- area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
- if (!area)
- return NULL;
-
- return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM,
- PAGE_KERNEL_EXEC);
+ return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+ GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
+ -1, __builtin_return_address(0));
}
/* Free memory returned from module_alloc */
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 9af64d9c4b67..01b0f6d06451 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -118,21 +118,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
static void __init MP_ioapic_info(struct mpc_ioapic *m)
{
- if (!(m->flags & MPC_APIC_USABLE))
- return;
-
- printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
- m->apicid, m->apicver, m->apicaddr);
-
- mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
-}
-
-static void print_MP_intsrc_info(struct mpc_intsrc *m)
-{
- apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
- " IRQ %02x, APIC ID %x, APIC INT %02x\n",
- m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
- m->srcbusirq, m->dstapic, m->dstirq);
+ if (m->flags & MPC_APIC_USABLE)
+ mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
}
static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
@@ -144,73 +131,11 @@ static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
}
-static void __init assign_to_mp_irq(struct mpc_intsrc *m,
- struct mpc_intsrc *mp_irq)
-{
- mp_irq->dstapic = m->dstapic;
- mp_irq->type = m->type;
- mp_irq->irqtype = m->irqtype;
- mp_irq->irqflag = m->irqflag;
- mp_irq->srcbus = m->srcbus;
- mp_irq->srcbusirq = m->srcbusirq;
- mp_irq->dstirq = m->dstirq;
-}
-
-static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
- struct mpc_intsrc *m)
-{
- m->dstapic = mp_irq->dstapic;
- m->type = mp_irq->type;
- m->irqtype = mp_irq->irqtype;
- m->irqflag = mp_irq->irqflag;
- m->srcbus = mp_irq->srcbus;
- m->srcbusirq = mp_irq->srcbusirq;
- m->dstirq = mp_irq->dstirq;
-}
-
-static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
- struct mpc_intsrc *m)
-{
- if (mp_irq->dstapic != m->dstapic)
- return 1;
- if (mp_irq->type != m->type)
- return 2;
- if (mp_irq->irqtype != m->irqtype)
- return 3;
- if (mp_irq->irqflag != m->irqflag)
- return 4;
- if (mp_irq->srcbus != m->srcbus)
- return 5;
- if (mp_irq->srcbusirq != m->srcbusirq)
- return 6;
- if (mp_irq->dstirq != m->dstirq)
- return 7;
-
- return 0;
-}
-
-static void __init MP_intsrc_info(struct mpc_intsrc *m)
-{
- int i;
-
- print_MP_intsrc_info(m);
-
- for (i = 0; i < mp_irq_entries; i++) {
- if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
- return;
- }
-
- assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!!\n");
-}
#else /* CONFIG_X86_IO_APIC */
static inline void __init MP_bus_info(struct mpc_bus *m) {}
static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
-static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
#endif /* CONFIG_X86_IO_APIC */
-
static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
{
apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
@@ -222,7 +147,6 @@ static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
/*
* Read/parse the MPC
*/
-
static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
{
@@ -275,18 +199,6 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
-static void __init smp_register_lapic_address(unsigned long address)
-{
- mp_lapic_addr = address;
-
- set_fixmap_nocache(FIX_APIC_BASE, address);
- if (boot_cpu_physical_apicid == -1U) {
- boot_cpu_physical_apicid = read_apic_id();
- apic_version[boot_cpu_physical_apicid] =
- GET_APIC_VERSION(apic_read(APIC_LVR));
- }
-}
-
static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
{
char str[16];
@@ -301,17 +213,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
#ifdef CONFIG_X86_32
generic_mps_oem_check(mpc, oem, str);
#endif
- /* save the local APIC address, it might be non-default */
+ /* Initialize the lapic mapping */
if (!acpi_lapic)
- mp_lapic_addr = mpc->lapic;
+ register_lapic_address(mpc->lapic);
if (early)
return 1;
- /* Initialize the lapic mapping */
- if (!acpi_lapic)
- smp_register_lapic_address(mpc->lapic);
-
if (mpc->oemptr)
x86_init.mpparse.smp_read_mpc_oem(mpc);
@@ -337,7 +245,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
break;
case MP_INTSRC:
- MP_intsrc_info((struct mpc_intsrc *)mpt);
+ mp_save_irq((struct mpc_intsrc *)mpt);
skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
break;
case MP_LINTSRC:
@@ -429,13 +337,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
intsrc.srcbusirq = i;
intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
- MP_intsrc_info(&intsrc);
+ mp_save_irq(&intsrc);
}
intsrc.irqtype = mp_ExtINT;
intsrc.srcbusirq = 0;
intsrc.dstirq = 0; /* 8259A to INTIN0 */
- MP_intsrc_info(&intsrc);
+ mp_save_irq(&intsrc);
}
@@ -784,11 +692,11 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
int i;
apic_printk(APIC_VERBOSE, "OLD ");
- print_MP_intsrc_info(m);
+ print_mp_irq_info(m);
i = get_MP_intsrc_index(m);
if (i > 0) {
- assign_to_mpc_intsrc(&mp_irqs[i], m);
+ memcpy(m, &mp_irqs[i], sizeof(*m));
apic_printk(APIC_VERBOSE, "NEW ");
print_mp_irq_info(&mp_irqs[i]);
return;
@@ -875,14 +783,14 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
if (nr_m_spare > 0) {
apic_printk(APIC_VERBOSE, "*NEW* found\n");
nr_m_spare--;
- assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]);
+ memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i]));
m_spare[nr_m_spare] = NULL;
} else {
struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
count += sizeof(struct mpc_intsrc);
if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
goto out;
- assign_to_mpc_intsrc(&mp_irqs[i], m);
+ memcpy(m, &mp_irqs[i], sizeof(*m));
mpc->length = count;
mpt += sizeof(struct mpc_intsrc);
}
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7bf2dc4c8f70..12fcbe2c143e 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -30,7 +30,6 @@
#include <linux/init.h>
#include <linux/poll.h>
#include <linux/smp.h>
-#include <linux/smp_lock.h>
#include <linux/major.h>
#include <linux/fs.h>
#include <linux/device.h>
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c5b250011fd4..869e1aeeb71b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -421,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = {
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
.set_pmd = native_set_pmd,
+ .set_pmd_at = native_set_pmd_at,
.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
+ .pmd_update = paravirt_nop,
+ .pmd_update_defer = paravirt_nop,
.ptep_modify_prot_start = __ptep_modify_prot_start,
.ptep_modify_prot_commit = __ptep_modify_prot_commit,
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index ba0f0ca9f280..c01ffa5b9b87 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -143,7 +143,7 @@ static void flush_gart(void)
spin_lock_irqsave(&iommu_bitmap_lock, flags);
if (need_flush) {
- k8_flush_garts();
+ amd_flush_garts();
need_flush = false;
}
spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -561,17 +561,17 @@ static void enable_gart_translations(void)
{
int i;
- if (!k8_northbridges.gart_supported)
+ if (!amd_nb_has_feature(AMD_NB_GART))
return;
- for (i = 0; i < k8_northbridges.num; i++) {
- struct pci_dev *dev = k8_northbridges.nb_misc[i];
+ for (i = 0; i < amd_nb_num(); i++) {
+ struct pci_dev *dev = node_to_amd_nb(i)->misc;
enable_gart_translation(dev, __pa(agp_gatt_table));
}
/* Flush the GART-TLB to remove stale entries */
- k8_flush_garts();
+ amd_flush_garts();
}
/*
@@ -596,13 +596,13 @@ static void gart_fixup_northbridges(struct sys_device *dev)
if (!fix_up_north_bridges)
return;
- if (!k8_northbridges.gart_supported)
+ if (!amd_nb_has_feature(AMD_NB_GART))
return;
pr_info("PCI-DMA: Restoring GART aperture settings\n");
- for (i = 0; i < k8_northbridges.num; i++) {
- struct pci_dev *dev = k8_northbridges.nb_misc[i];
+ for (i = 0; i < amd_nb_num(); i++) {
+ struct pci_dev *dev = node_to_amd_nb(i)->misc;
/*
* Don't enable translations just yet. That is the next
@@ -644,7 +644,7 @@ static struct sys_device device_gart = {
* Private Northbridge GATT initialization in case we cannot use the
* AGP driver for some reason.
*/
-static __init int init_k8_gatt(struct agp_kern_info *info)
+static __init int init_amd_gatt(struct agp_kern_info *info)
{
unsigned aper_size, gatt_size, new_aper_size;
unsigned aper_base, new_aper_base;
@@ -656,8 +656,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
aper_size = aper_base = info->aper_size = 0;
dev = NULL;
- for (i = 0; i < k8_northbridges.num; i++) {
- dev = k8_northbridges.nb_misc[i];
+ for (i = 0; i < amd_nb_num(); i++) {
+ dev = node_to_amd_nb(i)->misc;
new_aper_base = read_aperture(dev, &new_aper_size);
if (!new_aper_base)
goto nommu;
@@ -725,13 +725,13 @@ static void gart_iommu_shutdown(void)
if (!no_agp)
return;
- if (!k8_northbridges.gart_supported)
+ if (!amd_nb_has_feature(AMD_NB_GART))
return;
- for (i = 0; i < k8_northbridges.num; i++) {
+ for (i = 0; i < amd_nb_num(); i++) {
u32 ctl;
- dev = k8_northbridges.nb_misc[i];
+ dev = node_to_amd_nb(i)->misc;
pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
ctl &= ~GARTEN;
@@ -749,14 +749,14 @@ int __init gart_iommu_init(void)
unsigned long scratch;
long i;
- if (!k8_northbridges.gart_supported)
+ if (!amd_nb_has_feature(AMD_NB_GART))
return 0;
#ifndef CONFIG_AGP_AMD64
no_agp = 1;
#else
/* Makefile puts PCI initialization via subsys_initcall first. */
- /* Add other K8 AGP bridge drivers here */
+ /* Add other AMD AGP bridge drivers here */
no_agp = no_agp ||
(agp_amd64_init() < 0) ||
(agp_copy_info(agp_bridge, &info) < 0);
@@ -765,7 +765,7 @@ int __init gart_iommu_init(void)
if (no_iommu ||
(!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
!gart_iommu_aperture ||
- (no_agp && init_k8_gatt(&info) < 0)) {
+ (no_agp && init_amd_gatt(&info) < 0)) {
if (max_pfn > MAX_DMA32_PFN) {
pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
pr_warning("falling back to iommu=soft.\n");
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 57d1868a86aa..e764fc05d700 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -14,6 +14,7 @@
#include <linux/utsname.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
+#include <asm/cpu.h>
#include <asm/system.h>
#include <asm/apic.h>
#include <asm/syscalls.h>
@@ -22,11 +23,6 @@
#include <asm/i387.h>
#include <asm/debugreg.h>
-unsigned long idle_halt;
-EXPORT_SYMBOL(idle_halt);
-unsigned long idle_nomwait;
-EXPORT_SYMBOL(idle_nomwait);
-
struct kmem_cache *task_xstate_cachep;
EXPORT_SYMBOL_GPL(task_xstate_cachep);
@@ -91,8 +87,7 @@ void exit_thread(void)
void show_regs(struct pt_regs *regs)
{
show_registers(regs);
- show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs),
- regs->bp);
+ show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs));
}
void show_regs_common(void)
@@ -328,7 +323,7 @@ long sys_execve(const char __user *name,
/*
* Idle related variables and functions
*/
-unsigned long boot_option_idle_override = 0;
+unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);
/*
@@ -374,6 +369,7 @@ void default_idle(void)
{
if (hlt_use_halt()) {
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
+ trace_cpu_idle(1, smp_processor_id());
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
@@ -386,6 +382,8 @@ void default_idle(void)
else
local_irq_enable();
current_thread_info()->status |= TS_POLLING;
+ trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
} else {
local_irq_enable();
/* loop is done by the caller */
@@ -443,9 +441,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
*/
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
- trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
if (!need_resched()) {
- if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
+ if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
__monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -460,7 +457,8 @@ static void mwait_idle(void)
{
if (!need_resched()) {
trace_power_start(POWER_CSTATE, 1, smp_processor_id());
- if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
+ trace_cpu_idle(1, smp_processor_id());
+ if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
__monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -469,6 +467,8 @@ static void mwait_idle(void)
__sti_mwait(0, 0);
else
local_irq_enable();
+ trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
} else
local_irq_enable();
}
@@ -481,10 +481,12 @@ static void mwait_idle(void)
static void poll_idle(void)
{
trace_power_start(POWER_CSTATE, 0, smp_processor_id());
+ trace_cpu_idle(0, smp_processor_id());
local_irq_enable();
while (!need_resched())
cpu_relax();
- trace_power_end(0);
+ trace_power_end(smp_processor_id());
+ trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
}
/*
@@ -499,17 +501,16 @@ static void poll_idle(void)
*
* idle=mwait overrides this decision and forces the usage of mwait.
*/
-static int __cpuinitdata force_mwait;
#define MWAIT_INFO 0x05
#define MWAIT_ECX_EXTENDED_INFO 0x01
#define MWAIT_EDX_C1 0xf0
-static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
+int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
- if (force_mwait)
+ if (boot_option_idle_override == IDLE_FORCE_MWAIT)
return 1;
if (c->cpuid_level < MWAIT_INFO)
@@ -629,9 +630,10 @@ static int __init idle_setup(char *str)
if (!strcmp(str, "poll")) {
printk("using polling idle threads.\n");
pm_idle = poll_idle;
- } else if (!strcmp(str, "mwait"))
- force_mwait = 1;
- else if (!strcmp(str, "halt")) {
+ boot_option_idle_override = IDLE_POLL;
+ } else if (!strcmp(str, "mwait")) {
+ boot_option_idle_override = IDLE_FORCE_MWAIT;
+ } else if (!strcmp(str, "halt")) {
/*
* When the boot option of idle=halt is added, halt is
* forced to be used for CPU idle. In such case CPU C2/C3
@@ -640,8 +642,7 @@ static int __init idle_setup(char *str)
* the boot_option_idle_override.
*/
pm_idle = default_idle;
- idle_halt = 1;
- return 0;
+ boot_option_idle_override = IDLE_HALT;
} else if (!strcmp(str, "nomwait")) {
/*
* If the boot option of "idle=nomwait" is added,
@@ -649,12 +650,10 @@ static int __init idle_setup(char *str)
* states. In such case it won't touch the variable
* of boot_option_idle_override.
*/
- idle_nomwait = 1;
- return 0;
+ boot_option_idle_override = IDLE_NOMWAIT;
} else
return -1;
- boot_option_idle_override = 1;
return 0;
}
early_param("idle", idle_setup);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 96586c3cbbbf..8d128783af47 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,8 +57,6 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
-#include <trace/events/power.h>
-
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
/*
@@ -113,8 +111,6 @@ void cpu_idle(void)
stop_critical_timings();
pm_idle();
start_critical_timings();
-
- trace_power_end(smp_processor_id());
}
tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b3d7a3a04f38..bd387e8f73b4 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,8 +51,6 @@
#include <asm/syscalls.h>
#include <asm/debugreg.h>
-#include <trace/events/power.h>
-
asmlinkage extern void ret_from_fork(void);
DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -141,8 +139,6 @@ void cpu_idle(void)
pm_idle();
start_critical_timings();
- trace_power_end(smp_processor_id());
-
/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
loops can be woken up without interrupt. */
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index bab3b9e6f66d..42eb3300dfc6 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -41,44 +41,6 @@ void pvclock_set_flags(u8 flags)
valid_flags = flags;
}
-/*
- * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
- * yielding a 64-bit result.
- */
-static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
-{
- u64 product;
-#ifdef __i386__
- u32 tmp1, tmp2;
-#endif
-
- if (shift < 0)
- delta >>= -shift;
- else
- delta <<= shift;
-
-#ifdef __i386__
- __asm__ (
- "mul %5 ; "
- "mov %4,%%eax ; "
- "mov %%edx,%4 ; "
- "mul %5 ; "
- "xor %5,%5 ; "
- "add %4,%%eax ; "
- "adc %5,%%edx ; "
- : "=A" (product), "=r" (tmp1), "=r" (tmp2)
- : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
-#elif defined(__x86_64__)
- __asm__ (
- "mul %%rdx ; shrd $32,%%rdx,%%rax"
- : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
-#else
-#error implement me!
-#endif
-
- return product;
-}
-
static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
{
u64 delta = native_read_tsc() - shadow->tsc_timestamp;
@@ -121,6 +83,11 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
static atomic64_t last_value = ATOMIC64_INIT(0);
+void pvclock_resume(void)
+{
+ atomic64_set(&last_value, 0);
+}
+
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
struct pvclock_shadow_time shadow;
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c495aa8d4815..fc7aae1e2bc7 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -18,6 +18,7 @@
#include <asm/pci_x86.h>
#include <asm/virtext.h>
#include <asm/cpu.h>
+#include <asm/nmi.h>
#ifdef CONFIG_X86_32
# include <linux/ctype.h>
@@ -747,7 +748,7 @@ static int crash_nmi_callback(struct notifier_block *self,
{
int cpu;
- if (val != DIE_NMI_IPI)
+ if (val != DIE_NMI)
return NOTIFY_OK;
cpu = raw_smp_processor_id();
@@ -778,6 +779,8 @@ static void smp_send_nmi_allbutself(void)
static struct notifier_block crash_nmi_nb = {
.notifier_call = crash_nmi_callback,
+ /* we want to be the first one called */
+ .priority = NMI_LOCAL_HIGH_PRIOR+1,
};
/* Halt all other CPUs, calling the specified function on each of them
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
new file mode 100644
index 000000000000..2a26819bb6a8
--- /dev/null
+++ b/arch/x86/kernel/resource.c
@@ -0,0 +1,48 @@
+#include <linux/ioport.h>
+#include <asm/e820.h>
+
+static void resource_clip(struct resource *res, resource_size_t start,
+ resource_size_t end)
+{
+ resource_size_t low = 0, high = 0;
+
+ if (res->end < start || res->start > end)
+ return; /* no conflict */
+
+ if (res->start < start)
+ low = start - res->start;
+
+ if (res->end > end)
+ high = res->end - end;
+
+ /* Keep the area above or below the conflict, whichever is larger */
+ if (low > high)
+ res->end = start - 1;
+ else
+ res->start = end + 1;
+}
+
+static void remove_e820_regions(struct resource *avail)
+{
+ int i;
+ struct e820entry *entry;
+
+ for (i = 0; i < e820.nr_map; i++) {
+ entry = &e820.map[i];
+
+ resource_clip(avail, entry->addr,
+ entry->addr + entry->size - 1);
+ }
+}
+
+void arch_remove_reservations(struct resource *avail)
+{
+ /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */
+ if (avail->flags & IORESOURCE_MEM) {
+ if (avail->start < BIOS_END)
+ avail->start = BIOS_END;
+ resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
+
+ remove_e820_regions(avail);
+ }
+}
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 1cfbbfc3ae26..6f39cab052d5 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -76,7 +76,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
CMOS_WRITE(real_seconds, RTC_SECONDS);
CMOS_WRITE(real_minutes, RTC_MINUTES);
} else {
- printk(KERN_WARNING
+ printk_once(KERN_NOTICE
"set_rtc_mmss: can't update from %d to %d\n",
cmos_minutes, real_minutes);
retval = -1;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 68d535a77df0..ca2f10622a79 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -501,7 +501,18 @@ static inline unsigned long long get_total_mem(void)
return total << PAGE_SHIFT;
}
-#define DEFAULT_BZIMAGE_ADDR_MAX 0x37FFFFFF
+/*
+ * Keep the crash kernel below this limit. On 32 bits earlier kernels
+ * would limit the kernel to the low 512 MiB due to mapping restrictions.
+ * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
+ * limit once kexec-tools are fixed.
+ */
+#ifdef CONFIG_X86_32
+# define CRASH_KERNEL_ADDR_MAX (512 << 20)
+#else
+# define CRASH_KERNEL_ADDR_MAX (896 << 20)
+#endif
+
static void __init reserve_crashkernel(void)
{
unsigned long long total_mem;
@@ -520,10 +531,10 @@ static void __init reserve_crashkernel(void)
const unsigned long long alignment = 16<<20; /* 16M */
/*
- * kexec want bzImage is below DEFAULT_BZIMAGE_ADDR_MAX
+ * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX
*/
crash_base = memblock_find_in_range(alignment,
- DEFAULT_BZIMAGE_ADDR_MAX, crash_size, alignment);
+ CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
if (crash_base == MEMBLOCK_ERROR) {
pr_info("crashkernel reservation failed - No suitable area found.\n");
@@ -694,7 +705,7 @@ static u64 __init get_max_mapped(void)
void __init setup_arch(char **cmdline_p)
{
int acpi = 0;
- int k8 = 0;
+ int amd = 0;
unsigned long flags;
#ifdef CONFIG_X86_32
@@ -769,7 +780,6 @@ void __init setup_arch(char **cmdline_p)
x86_init.oem.arch_setup();
- resource_alloc_from_bottom = 0;
iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
setup_memory_map();
parse_setup_data();
@@ -981,12 +991,12 @@ void __init setup_arch(char **cmdline_p)
acpi = acpi_numa_init();
#endif
-#ifdef CONFIG_K8_NUMA
+#ifdef CONFIG_AMD_NUMA
if (!acpi)
- k8 = !k8_numa_init(0, max_pfn);
+ amd = !amd_numa_init(0, max_pfn);
#endif
- initmem_init(0, max_pfn, acpi, k8);
+ initmem_init(0, max_pfn, acpi, amd);
memblock_find_dma_reserve();
dma32_reserve_bootmem();
@@ -1035,10 +1045,7 @@ void __init setup_arch(char **cmdline_p)
#endif
init_apic_mappings();
- ioapic_init_mappings();
-
- /* need to wait for io_apic is mapped */
- probe_nr_irqs_gsi();
+ ioapic_and_gsi_init();
kvm_guest_init();
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 083e99d1b7df..0cbe8c0b35ed 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -97,12 +97,12 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
*/
static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
-void cpu_hotplug_driver_lock()
+void cpu_hotplug_driver_lock(void)
{
mutex_lock(&x86_cpu_hotplug_driver_mutex);
}
-void cpu_hotplug_driver_unlock()
+void cpu_hotplug_driver_unlock(void)
{
mutex_unlock(&x86_cpu_hotplug_driver_mutex);
}
@@ -281,6 +281,13 @@ static void __cpuinit smp_callin(void)
*/
smp_store_cpu_info(cpuid);
+ /*
+ * This must be done before setting cpu_online_mask
+ * or calling notify_cpu_starting.
+ */
+ set_cpu_sibling_map(raw_smp_processor_id());
+ wmb();
+
notify_cpu_starting(cpuid);
/*
@@ -316,16 +323,6 @@ notrace static void __cpuinit start_secondary(void *unused)
*/
check_tsc_sync_target();
- if (nmi_watchdog == NMI_IO_APIC) {
- legacy_pic->mask(0);
- enable_NMI_through_LVT0();
- legacy_pic->unmask(0);
- }
-
- /* This must be done before setting cpu_online_mask */
- set_cpu_sibling_map(raw_smp_processor_id());
- wmb();
-
/*
* We need to hold call_lock, so there is no inconsistency
* between the time smp_call_function() determines number of
@@ -430,7 +427,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
cpumask_set_cpu(cpu, c->llc_shared_map);
- if (current_cpu_data.x86_max_cores == 1) {
+ if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
c->booted_cores = 1;
return;
@@ -1061,8 +1058,6 @@ static int __init smp_sanity_check(unsigned max_cpus)
printk(KERN_INFO "SMP mode deactivated.\n");
smpboot_clear_io_apic();
- localise_nmi_watchdog();
-
connect_bsp_APIC();
setup_local_APIC();
end_local_APIC_setup();
@@ -1094,7 +1089,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
preempt_disable();
smp_cpu_index_default();
- current_cpu_data = boot_cpu_data;
+ memcpy(__this_cpu_ptr(&cpu_info), &boot_cpu_data, sizeof(cpu_info));
cpumask_copy(cpu_callin_mask, cpumask_of(0));
mb();
/*
@@ -1166,6 +1161,20 @@ out:
preempt_enable();
}
+void arch_disable_nonboot_cpus_begin(void)
+{
+ /*
+ * Avoid the smp alternatives switch during the disable_nonboot_cpus().
+ * In the suspend path, we will be back in the SMP mode shortly anyways.
+ */
+ skip_smp_alternatives = true;
+}
+
+void arch_disable_nonboot_cpus_end(void)
+{
+ skip_smp_alternatives = false;
+}
+
void arch_enable_nonboot_cpus_begin(void)
{
set_mtrr_aps_delayed_init();
@@ -1196,7 +1205,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
#endif
- check_nmi_watchdog();
mtrr_aps_init();
}
@@ -1341,8 +1349,6 @@ int native_cpu_disable(void)
if (cpu == 0)
return -EBUSY;
- if (nmi_watchdog == NMI_LOCAL_APIC)
- stop_apic_nmi_watchdog(NULL);
clear_local_APIC();
cpu_disable_common();
@@ -1377,7 +1383,7 @@ void play_dead_common(void)
mb();
/* Ack it */
- __get_cpu_var(cpu_state) = CPU_DEAD;
+ __this_cpu_write(cpu_state, CPU_DEAD);
/*
* With physical CPU hotplug, we should halt the cpu
@@ -1396,12 +1402,13 @@ static inline void mwait_play_dead(void)
unsigned int highest_subcstate = 0;
int i;
void *mwait_ptr;
+ struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
- if (!cpu_has(&current_cpu_data, X86_FEATURE_MWAIT))
+ if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)))
return;
- if (!cpu_has(&current_cpu_data, X86_FEATURE_CLFLSH))
+ if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH))
return;
- if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+ if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
return;
eax = CPUID_MWAIT_LEAF;
@@ -1452,7 +1459,7 @@ static inline void mwait_play_dead(void)
static inline void hlt_play_dead(void)
{
- if (current_cpu_data.x86 >= 4)
+ if (__this_cpu_read(cpu_info.x86) >= 4)
wbinvd();
while (1) {
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index b53c525368a7..938c8e10a19a 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -73,22 +73,22 @@ static const struct stacktrace_ops save_stack_ops_nosched = {
*/
void save_stack_trace(struct stack_trace *trace)
{
- dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace);
+ dump_trace(current, NULL, NULL, &save_stack_ops, trace);
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
EXPORT_SYMBOL_GPL(save_stack_trace);
-void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp)
+void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
{
- dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace);
+ dump_trace(current, regs, NULL, &save_stack_ops, trace);
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
{
- dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace);
+ dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace);
if (trace->nr_entries < trace->max_entries)
trace->entries[trace->nr_entries++] = ULONG_MAX;
}
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index c2f1b26141e2..998e972f3b1a 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
pmd = pmd_alloc(&tboot_mm, pud, vaddr);
if (!pmd)
return -1;
- pte = pte_alloc_map(&tboot_mm, pmd, vaddr);
+ pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
if (!pte)
return -1;
set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index fb5cc5e14cfa..25a28a245937 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -22,10 +22,6 @@
#include <asm/hpet.h>
#include <asm/time.h>
-#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
-int timer_ack;
-#endif
-
#ifdef CONFIG_X86_64
volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES;
#endif
@@ -63,20 +59,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
/* Keep nmi watchdog up to date */
inc_irq_stat(irq0_irqs);
- /* Optimized out for !IO_APIC and x86_64 */
- if (timer_ack) {
- /*
- * Subtle, when I/O APICs are used we have to ack timer IRQ
- * manually to deassert NMI lines for the watchdog if run
- * on an 82489DX-based system.
- */
- raw_spin_lock(&i8259A_lock);
- outb(0x0c, PIC_MASTER_OCW3);
- /* Ack the IRQ; AEOI will end it automatically. */
- inb(PIC_MASTER_POLL);
- raw_spin_unlock(&i8259A_lock);
- }
-
global_clock_event->event_handler(global_clock_event);
/* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 3af2dff58b21..075d130efcf9 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -127,7 +127,7 @@ startup_64:
no_longmode:
hlt
jmp no_longmode
-#include "verify_cpu_64.S"
+#include "verify_cpu.S"
# Careful these need to be in the same 64K segment as the above;
tidt:
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index cb838ca42c96..b9b67166f9de 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -83,6 +83,13 @@ EXPORT_SYMBOL_GPL(used_vectors);
static int ignore_nmis;
+int unknown_nmi_panic;
+/*
+ * Prevent NMI reason port (0x61) being accessed simultaneously, can
+ * only be used in NMI handler.
+ */
+static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
+
static inline void conditional_sti(struct pt_regs *regs)
{
if (regs->flags & X86_EFLAGS_IF)
@@ -300,16 +307,23 @@ gp_in_kernel:
die("general protection fault", regs, error_code);
}
-static notrace __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs *regs)
+static int __init setup_unknown_nmi_panic(char *str)
{
- printk(KERN_EMERG
- "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
+ unknown_nmi_panic = 1;
+ return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
- printk(KERN_EMERG
- "You have some hardware problem, likely on the PCI bus.\n");
+static notrace __kprobes void
+pci_serr_error(unsigned char reason, struct pt_regs *regs)
+{
+ pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+ /*
+ * On some machines, PCI SERR line is used to report memory
+ * errors. EDAC makes use of it.
+ */
#if defined(CONFIG_EDAC)
if (edac_handler_set()) {
edac_atomic_assert_error();
@@ -320,11 +334,11 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
if (panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
- printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+ pr_emerg("Dazed and confused, but trying to continue\n");
- /* Clear and disable the memory parity error line. */
- reason = (reason & 0xf) | 4;
- outb(reason, 0x61);
+ /* Clear and disable the PCI SERR error line. */
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
+ outb(reason, NMI_REASON_PORT);
}
static notrace __kprobes void
@@ -332,22 +346,26 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
{
unsigned long i;
- printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
+ pr_emerg(
+ "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
show_registers(regs);
if (panic_on_io_nmi)
panic("NMI IOCK error: Not continuing");
/* Re-enable the IOCK line, wait for a few seconds */
- reason = (reason & 0xf) | 8;
- outb(reason, 0x61);
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
- i = 2000;
- while (--i)
- udelay(1000);
+ i = 20000;
+ while (--i) {
+ touch_nmi_watchdog();
+ udelay(100);
+ }
- reason &= ~8;
- outb(reason, 0x61);
+ reason &= ~NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
}
static notrace __kprobes void
@@ -366,69 +384,50 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
return;
}
#endif
- printk(KERN_EMERG
- "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
+ pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
- printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
- if (panic_on_unrecovered_nmi)
+ pr_emerg("Do you have a strange power saving mode enabled?\n");
+ if (unknown_nmi_panic || panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
- printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+ pr_emerg("Dazed and confused, but trying to continue\n");
}
static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
- int cpu;
- cpu = smp_processor_id();
-
- /* Only the BSP gets external NMIs from the system. */
- if (!cpu)
- reason = get_nmi_reason();
-
- if (!(reason & 0xc0)) {
- if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP)
- return;
+ /*
+ * CPU-specific NMI must be processed before non-CPU-specific
+ * NMI, otherwise we may lose it, because the CPU-specific
+ * NMI can not be detected/processed on other CPUs.
+ */
+ if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
+ return;
-#ifdef CONFIG_X86_LOCAL_APIC
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP)
- return;
+ /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
+ raw_spin_lock(&nmi_reason_lock);
+ reason = get_nmi_reason();
-#ifndef CONFIG_LOCKUP_DETECTOR
+ if (reason & NMI_REASON_MASK) {
+ if (reason & NMI_REASON_SERR)
+ pci_serr_error(reason, regs);
+ else if (reason & NMI_REASON_IOCHK)
+ io_check_error(reason, regs);
+#ifdef CONFIG_X86_32
/*
- * Ok, so this is none of the documented NMI sources,
- * so it must be the NMI watchdog.
+ * Reassert NMI in case it became active
+ * meanwhile as it's edge-triggered:
*/
- if (nmi_watchdog_tick(regs, reason))
- return;
- if (!do_nmi_callback(regs, cpu))
-#endif /* !CONFIG_LOCKUP_DETECTOR */
- unknown_nmi_error(reason, regs);
-#else
- unknown_nmi_error(reason, regs);
+ reassert_nmi();
#endif
-
+ raw_spin_unlock(&nmi_reason_lock);
return;
}
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
- return;
+ raw_spin_unlock(&nmi_reason_lock);
- /* AK: following checks seem to be broken on modern chipsets. FIXME */
- if (reason & 0x80)
- mem_parity_error(reason, regs);
- if (reason & 0x40)
- io_check_error(reason, regs);
-#ifdef CONFIG_X86_32
- /*
- * Reassert NMI in case it became active meanwhile
- * as it's edge-triggered:
- */
- reassert_nmi();
-#endif
+ unknown_nmi_error(reason, regs);
}
dotraplinkage notrace __kprobes void
@@ -446,14 +445,12 @@ do_nmi(struct pt_regs *regs, long error_code)
void stop_nmi(void)
{
- acpi_nmi_disable();
ignore_nmis++;
}
void restart_nmi(void)
{
ignore_nmis--;
- acpi_nmi_enable();
}
/* May run on IST stack. */
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 0c40d8b72416..ffe5755caa8b 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -464,7 +464,7 @@ unsigned long native_calibrate_tsc(void)
tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
/* hpet or pmtimer available ? */
- if (!hpet && !ref1 && !ref2)
+ if (ref1 == ref2)
continue;
/* Check, whether the sampling was disturbed by an SMI */
@@ -659,7 +659,7 @@ void restore_sched_clock_state(void)
local_irq_save(flags);
- __get_cpu_var(cyc2ns_offset) = 0;
+ __this_cpu_write(cyc2ns_offset, 0);
offset = cyc2ns_suspend - sched_clock();
for_each_possible_cpu(cpu)
@@ -872,6 +872,9 @@ __cpuinit int unsynchronized_tsc(void)
if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
return 0;
+
+ if (tsc_clocksource_reliable)
+ return 0;
/*
* Intel systems are normally all synchronized.
* Exceptions must mark TSC as unstable:
@@ -879,14 +882,92 @@ __cpuinit int unsynchronized_tsc(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
/* assume multi socket systems are not synchronized: */
if (num_possible_cpus() > 1)
- tsc_unstable = 1;
+ return 1;
}
- return tsc_unstable;
+ return 0;
+}
+
+
+static void tsc_refine_calibration_work(struct work_struct *work);
+static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
+/**
+ * tsc_refine_calibration_work - Further refine tsc freq calibration
+ * @work - ignored.
+ *
+ * This functions uses delayed work over a period of a
+ * second to further refine the TSC freq value. Since this is
+ * timer based, instead of loop based, we don't block the boot
+ * process while this longer calibration is done.
+ *
+ * If there are any calibration anomolies (too many SMIs, etc),
+ * or the refined calibration is off by 1% of the fast early
+ * calibration, we throw out the new calibration and use the
+ * early calibration.
+ */
+static void tsc_refine_calibration_work(struct work_struct *work)
+{
+ static u64 tsc_start = -1, ref_start;
+ static int hpet;
+ u64 tsc_stop, ref_stop, delta;
+ unsigned long freq;
+
+ /* Don't bother refining TSC on unstable systems */
+ if (check_tsc_unstable())
+ goto out;
+
+ /*
+ * Since the work is started early in boot, we may be
+ * delayed the first time we expire. So set the workqueue
+ * again once we know timers are working.
+ */
+ if (tsc_start == -1) {
+ /*
+ * Only set hpet once, to avoid mixing hardware
+ * if the hpet becomes enabled later.
+ */
+ hpet = is_hpet_enabled();
+ schedule_delayed_work(&tsc_irqwork, HZ);
+ tsc_start = tsc_read_refs(&ref_start, hpet);
+ return;
+ }
+
+ tsc_stop = tsc_read_refs(&ref_stop, hpet);
+
+ /* hpet or pmtimer available ? */
+ if (ref_start == ref_stop)
+ goto out;
+
+ /* Check, whether the sampling was disturbed by an SMI */
+ if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
+ goto out;
+
+ delta = tsc_stop - tsc_start;
+ delta *= 1000000LL;
+ if (hpet)
+ freq = calc_hpet_ref(delta, ref_start, ref_stop);
+ else
+ freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
+
+ /* Make sure we're within 1% */
+ if (abs(tsc_khz - freq) > tsc_khz/100)
+ goto out;
+
+ tsc_khz = freq;
+ printk(KERN_INFO "Refined TSC clocksource calibration: "
+ "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
+ (unsigned long)tsc_khz % 1000);
+
+out:
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
}
-static void __init init_tsc_clocksource(void)
+
+static int __init init_tsc_clocksource(void)
{
+ if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz)
+ return 0;
+
if (tsc_clocksource_reliable)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
/* lower the rating if we already know its unstable: */
@@ -894,8 +975,14 @@ static void __init init_tsc_clocksource(void)
clocksource_tsc.rating = 0;
clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
}
- clocksource_register_khz(&clocksource_tsc, tsc_khz);
+ schedule_delayed_work(&tsc_irqwork, 0);
+ return 0;
}
+/*
+ * We use device_initcall here, to ensure we run after the hpet
+ * is fully initialized, which may occur at fs_initcall time.
+ */
+device_initcall(init_tsc_clocksource);
void __init tsc_init(void)
{
@@ -949,6 +1036,5 @@ void __init tsc_init(void)
mark_tsc_unstable("TSCs unsynchronized");
check_system_tsc_reliable();
- init_tsc_clocksource();
}
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu.S
index 56a8c2a867d9..0edefc19a113 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -7,6 +7,7 @@
* Copyright (c) 2007 Andi Kleen (ak@suse.de)
* Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
* Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
+ * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com)
*
* This source code is licensed under the GNU General Public License,
* Version 2. See the file COPYING for more details.
@@ -14,18 +15,17 @@
* This is a common code for verification whether CPU supports
* long mode and SSE or not. It is not called directly instead this
* file is included at various places and compiled in that context.
- * Following are the current usage.
+ * This file is expected to run in 32bit code. Currently:
*
- * This file is included by both 16bit and 32bit code.
+ * arch/x86/boot/compressed/head_64.S: Boot cpu verification
+ * arch/x86/kernel/trampoline_64.S: secondary processor verfication
+ * arch/x86/kernel/head_32.S: processor startup
*
- * arch/x86_64/boot/setup.S : Boot cpu verification (16bit)
- * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
- * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit)
- * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit)
- *
- * verify_cpu, returns the status of cpu check in register %eax.
+ * verify_cpu, returns the status of longmode and SSE in register %eax.
* 0: Success 1: Failure
*
+ * On Intel, the XD_DISABLE flag will be cleared as a side-effect.
+ *
* The caller needs to check for the error code and take the action
* appropriately. Either display a message or halt.
*/
@@ -62,8 +62,41 @@ verify_cpu:
cmpl $0x444d4163,%ecx
jnz verify_cpu_noamd
mov $1,%di # cpu is from AMD
+ jmp verify_cpu_check
verify_cpu_noamd:
+ cmpl $0x756e6547,%ebx # GenuineIntel?
+ jnz verify_cpu_check
+ cmpl $0x49656e69,%edx
+ jnz verify_cpu_check
+ cmpl $0x6c65746e,%ecx
+ jnz verify_cpu_check
+
+ # only call IA32_MISC_ENABLE when:
+ # family > 6 || (family == 6 && model >= 0xd)
+ movl $0x1, %eax # check CPU family and model
+ cpuid
+ movl %eax, %ecx
+
+ andl $0x0ff00f00, %eax # mask family and extended family
+ shrl $8, %eax
+ cmpl $6, %eax
+ ja verify_cpu_clear_xd # family > 6, ok
+ jb verify_cpu_check # family < 6, skip
+
+ andl $0x000f00f0, %ecx # mask model and extended model
+ shrl $4, %ecx
+ cmpl $0xd, %ecx
+ jb verify_cpu_check # family == 6, model < 0xd, skip
+
+verify_cpu_clear_xd:
+ movl $MSR_IA32_MISC_ENABLE, %ecx
+ rdmsr
+ btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
+ jnc verify_cpu_check # only write MSR if bit was changed
+ wrmsr
+
+verify_cpu_check:
movl $0x1,%eax # Does the cpu have what it takes
cpuid
andl $REQUIRED_MASK0,%edx
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 61fb98519622..863f8753ab0a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA0000);
+ split_huge_page_pmd(mm, pmd);
if (pmd_none_or_clear_bad(pmd))
goto out;
pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e03530aebfd0..bf4700755184 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -69,7 +69,7 @@ jiffies_64 = jiffies;
PHDRS {
text PT_LOAD FLAGS(5); /* R_E */
- data PT_LOAD FLAGS(7); /* RWE */
+ data PT_LOAD FLAGS(6); /* RW_ */
#ifdef CONFIG_X86_64
user PT_LOAD FLAGS(5); /* R_E */
#ifdef CONFIG_SMP
@@ -116,6 +116,10 @@ SECTIONS
EXCEPTION_TABLE(16) :text = 0x9090
+#if defined(CONFIG_DEBUG_RODATA)
+ /* .text should occupy whole number of pages */
+ . = ALIGN(PAGE_SIZE);
+#endif
X64_ALIGN_DEBUG_RODATA_BEGIN
RO_DATA(PAGE_SIZE)
X64_ALIGN_DEBUG_RODATA_END
@@ -335,7 +339,7 @@ SECTIONS
__bss_start = .;
*(.bss..page_aligned)
*(.bss)
- . = ALIGN(4);
+ . = ALIGN(PAGE_SIZE);
__bss_stop = .;
}
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 9c253bd65e24..547128546cc3 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -394,7 +394,8 @@ static void __init setup_xstate_init(void)
* Setup init_xstate_buf to represent the init state of
* all the features managed by the xsave
*/
- init_xstate_buf = alloc_bootmem(xstate_size);
+ init_xstate_buf = alloc_bootmem_align(xstate_size,
+ __alignof__(struct xsave_struct));
init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
clts();
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index ddc131ff438f..50f63648ce1b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
select HAVE_KVM_IRQCHIP
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
+ select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
---help---
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31a7035c4bd9..f15501f431c8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,5 +1,5 @@
-EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
+ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
CFLAGS_x86.o := -I.
CFLAGS_svm.o := -I.
@@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
coalesced_mmio.o irq_comm.o eventfd.o \
assigned-dev.o)
kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
+kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
i8254.o timer.o
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 38b6e8dafaff..caf966781d25 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -20,16 +20,8 @@
* From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
*/
-#ifndef __KERNEL__
-#include <stdio.h>
-#include <stdint.h>
-#include <public/xen.h>
-#define DPRINTF(_f, _a ...) printf(_f , ## _a)
-#else
#include <linux/kvm_host.h>
#include "kvm_cache_regs.h"
-#define DPRINTF(x...) do {} while (0)
-#endif
#include <linux/module.h>
#include <asm/kvm_emulate.h>
@@ -418,9 +410,9 @@ address_mask(struct decode_cache *c, unsigned long reg)
}
static inline unsigned long
-register_address(struct decode_cache *c, unsigned long base, unsigned long reg)
+register_address(struct decode_cache *c, unsigned long reg)
{
- return base + address_mask(c, reg);
+ return address_mask(c, reg);
}
static inline void
@@ -452,60 +444,55 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
return ops->get_cached_segment_base(seg, ctxt->vcpu);
}
-static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops,
- struct decode_cache *c)
+static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
+ struct x86_emulate_ops *ops,
+ struct decode_cache *c)
{
if (!c->has_seg_override)
return 0;
- return seg_base(ctxt, ops, c->seg_override);
+ return c->seg_override;
}
-static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
+static ulong linear(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr)
{
- return seg_base(ctxt, ops, VCPU_SREG_ES);
-}
-
-static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
- struct x86_emulate_ops *ops)
-{
- return seg_base(ctxt, ops, VCPU_SREG_SS);
-}
+ struct decode_cache *c = &ctxt->decode;
+ ulong la;
-static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
- u32 error, bool valid)
-{
- ctxt->exception = vec;
- ctxt->error_code = error;
- ctxt->error_code_valid = valid;
+ la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
+ if (c->ad_bytes != 8)
+ la &= (u32)-1;
+ return la;
}
-static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+ u32 error, bool valid)
{
- emulate_exception(ctxt, GP_VECTOR, err, true);
+ ctxt->exception.vector = vec;
+ ctxt->exception.error_code = error;
+ ctxt->exception.error_code_valid = valid;
+ return X86EMUL_PROPAGATE_FAULT;
}
-static void emulate_pf(struct x86_emulate_ctxt *ctxt)
+static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
{
- emulate_exception(ctxt, PF_VECTOR, 0, true);
+ return emulate_exception(ctxt, GP_VECTOR, err, true);
}
-static void emulate_ud(struct x86_emulate_ctxt *ctxt)
+static int emulate_ud(struct x86_emulate_ctxt *ctxt)
{
- emulate_exception(ctxt, UD_VECTOR, 0, false);
+ return emulate_exception(ctxt, UD_VECTOR, 0, false);
}
-static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
{
- emulate_exception(ctxt, TS_VECTOR, err, true);
+ return emulate_exception(ctxt, TS_VECTOR, err, true);
}
static int emulate_de(struct x86_emulate_ctxt *ctxt)
{
- emulate_exception(ctxt, DE_VECTOR, 0, false);
- return X86EMUL_PROPAGATE_FAULT;
+ return emulate_exception(ctxt, DE_VECTOR, 0, false);
}
static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
@@ -520,7 +507,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
cur_size = fc->end - fc->start;
size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size,
- size, ctxt->vcpu, NULL);
+ size, ctxt->vcpu, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
fc->end += size;
@@ -564,7 +551,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
static int read_descriptor(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops,
- ulong addr,
+ struct segmented_address addr,
u16 *size, unsigned long *address, int op_bytes)
{
int rc;
@@ -572,10 +559,13 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
if (op_bytes == 2)
op_bytes = 3;
*address = 0;
- rc = ops->read_std(addr, (unsigned long *)size, 2, ctxt->vcpu, NULL);
+ rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2,
+ ctxt->vcpu, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = ops->read_std(addr + 2, address, op_bytes, ctxt->vcpu, NULL);
+ addr.ea += 2;
+ rc = ops->read_std(linear(ctxt, addr), address, op_bytes,
+ ctxt->vcpu, &ctxt->exception);
return rc;
}
@@ -768,7 +758,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
break;
}
}
- op->addr.mem = modrm_ea;
+ op->addr.mem.ea = modrm_ea;
done:
return rc;
}
@@ -783,13 +773,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt,
op->type = OP_MEM;
switch (c->ad_bytes) {
case 2:
- op->addr.mem = insn_fetch(u16, 2, c->eip);
+ op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
break;
case 4:
- op->addr.mem = insn_fetch(u32, 4, c->eip);
+ op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
break;
case 8:
- op->addr.mem = insn_fetch(u64, 8, c->eip);
+ op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
break;
}
done:
@@ -808,7 +798,7 @@ static void fetch_bit_operand(struct decode_cache *c)
else if (c->src.bytes == 4)
sv = (s32)c->src.val & (s32)mask;
- c->dst.addr.mem += (sv >> 3);
+ c->dst.addr.mem.ea += (sv >> 3);
}
/* only subword offset */
@@ -821,7 +811,6 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
{
int rc;
struct read_cache *mc = &ctxt->decode.mem_read;
- u32 err;
while (size) {
int n = min(size, 8u);
@@ -829,10 +818,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
if (mc->pos < mc->end)
goto read_cached;
- rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
- ctxt->vcpu);
- if (rc == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt);
+ rc = ops->read_emulated(addr, mc->data + mc->end, n,
+ &ctxt->exception, ctxt->vcpu);
if (rc != X86EMUL_CONTINUE)
return rc;
mc->end += n;
@@ -907,19 +894,15 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
struct desc_ptr dt;
u16 index = selector >> 3;
int ret;
- u32 err;
ulong addr;
get_descriptor_table_ptr(ctxt, ops, selector, &dt);
- if (dt.size < index * 8 + 7) {
- emulate_gp(ctxt, selector & 0xfffc);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, selector & 0xfffc);
addr = dt.address + index * 8;
- ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt);
+ ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,
+ &ctxt->exception);
return ret;
}
@@ -931,21 +914,17 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
{
struct desc_ptr dt;
u16 index = selector >> 3;
- u32 err;
ulong addr;
int ret;
get_descriptor_table_ptr(ctxt, ops, selector, &dt);
- if (dt.size < index * 8 + 7) {
- emulate_gp(ctxt, selector & 0xfffc);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (dt.size < index * 8 + 7)
+ return emulate_gp(ctxt, selector & 0xfffc);
addr = dt.address + index * 8;
- ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt);
+ ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu,
+ &ctxt->exception);
return ret;
}
@@ -1092,7 +1071,6 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
{
int rc;
struct decode_cache *c = &ctxt->decode;
- u32 err;
switch (c->dst.type) {
case OP_REG:
@@ -1101,21 +1079,19 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
case OP_MEM:
if (c->lock_prefix)
rc = ops->cmpxchg_emulated(
- c->dst.addr.mem,
+ linear(ctxt, c->dst.addr.mem),
&c->dst.orig_val,
&c->dst.val,
c->dst.bytes,
- &err,
+ &ctxt->exception,
ctxt->vcpu);
else
rc = ops->write_emulated(
- c->dst.addr.mem,
+ linear(ctxt, c->dst.addr.mem),
&c->dst.val,
c->dst.bytes,
- &err,
+ &ctxt->exception,
ctxt->vcpu);
- if (rc == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt);
if (rc != X86EMUL_CONTINUE)
return rc;
break;
@@ -1137,8 +1113,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
c->dst.bytes = c->op_bytes;
c->dst.val = c->src.val;
register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
- c->dst.addr.mem = register_address(c, ss_base(ctxt, ops),
- c->regs[VCPU_REGS_RSP]);
+ c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+ c->dst.addr.mem.seg = VCPU_SREG_SS;
}
static int emulate_pop(struct x86_emulate_ctxt *ctxt,
@@ -1147,10 +1123,11 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
{
struct decode_cache *c = &ctxt->decode;
int rc;
+ struct segmented_address addr;
- rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
- c->regs[VCPU_REGS_RSP]),
- dest, len);
+ addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
+ addr.seg = VCPU_SREG_SS;
+ rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1184,10 +1161,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
change_mask |= EFLG_IF;
break;
case X86EMUL_MODE_VM86:
- if (iopl < 3) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (iopl < 3)
+ return emulate_gp(ctxt, 0);
change_mask |= EFLG_IF;
break;
default: /* real mode */
@@ -1198,9 +1173,6 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
*(unsigned long *)dest =
(ctxt->eflags & ~change_mask) | (val & change_mask);
- if (rc == X86EMUL_PROPAGATE_FAULT)
- emulate_pf(ctxt);
-
return rc;
}
@@ -1287,7 +1259,6 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
gva_t cs_addr;
gva_t eip_addr;
u16 cs, eip;
- u32 err;
/* TODO: Add limit checks */
c->src.val = ctxt->eflags;
@@ -1317,11 +1288,11 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
eip_addr = dt.address + (irq << 2);
cs_addr = dt.address + (irq << 2) + 2;
- rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &err);
+ rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &err);
+ rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -1370,10 +1341,8 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
if (rc != X86EMUL_CONTINUE)
return rc;
- if (temp_eip & ~0xffff) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (temp_eip & ~0xffff)
+ return emulate_gp(ctxt, 0);
rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
@@ -1624,10 +1593,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
/* syscall is not available in real mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
- ctxt->mode == X86EMUL_MODE_VM86) {
- emulate_ud(ctxt);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_ud(ctxt);
setup_syscalls_segments(ctxt, ops, &cs, &ss);
@@ -1678,34 +1645,26 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
u16 cs_sel, ss_sel;
/* inject #GP if in real mode */
- if (ctxt->mode == X86EMUL_MODE_REAL) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (ctxt->mode == X86EMUL_MODE_REAL)
+ return emulate_gp(ctxt, 0);
/* XXX sysenter/sysexit have not been tested in 64bit mode.
* Therefore, we inject an #UD.
*/
- if (ctxt->mode == X86EMUL_MODE_PROT64) {
- emulate_ud(ctxt);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (ctxt->mode == X86EMUL_MODE_PROT64)
+ return emulate_ud(ctxt);
setup_syscalls_segments(ctxt, ops, &cs, &ss);
ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
switch (ctxt->mode) {
case X86EMUL_MODE_PROT32:
- if ((msr_data & 0xfffc) == 0x0) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
break;
case X86EMUL_MODE_PROT64:
- if (msr_data == 0x0) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (msr_data == 0x0)
+ return emulate_gp(ctxt, 0);
break;
}
@@ -1745,10 +1704,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
/* inject #GP if in real mode or Virtual 8086 mode */
if (ctxt->mode == X86EMUL_MODE_REAL ||
- ctxt->mode == X86EMUL_MODE_VM86) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ ctxt->mode == X86EMUL_MODE_VM86)
+ return emulate_gp(ctxt, 0);
setup_syscalls_segments(ctxt, ops, &cs, &ss);
@@ -1763,18 +1720,14 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
switch (usermode) {
case X86EMUL_MODE_PROT32:
cs_sel = (u16)(msr_data + 16);
- if ((msr_data & 0xfffc) == 0x0) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if ((msr_data & 0xfffc) == 0x0)
+ return emulate_gp(ctxt, 0);
ss_sel = (u16)(msr_data + 24);
break;
case X86EMUL_MODE_PROT64:
cs_sel = (u16)(msr_data + 32);
- if (msr_data == 0x0) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (msr_data == 0x0)
+ return emulate_gp(ctxt, 0);
ss_sel = cs_sel + 8;
cs.d = 0;
cs.l = 1;
@@ -1934,33 +1887,27 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
{
struct tss_segment_16 tss_seg;
int ret;
- u32 err, new_tss_base = get_desc_base(new_desc);
+ u32 new_tss_base = get_desc_base(new_desc);
ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
save_state_to_tss16(ctxt, ops, &tss_seg);
ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
if (old_tss_sel != 0xffff) {
tss_seg.prev_task_link = old_tss_sel;
@@ -1968,12 +1915,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
ret = ops->write_std(new_tss_base,
&tss_seg.prev_task_link,
sizeof tss_seg.prev_task_link,
- ctxt->vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ ctxt->vcpu, &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
}
return load_state_from_tss16(ctxt, ops, &tss_seg);
@@ -2013,10 +1958,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
struct decode_cache *c = &ctxt->decode;
int ret;
- if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (ops->set_cr(3, tss->cr3, ctxt->vcpu))
+ return emulate_gp(ctxt, 0);
c->eip = tss->eip;
ctxt->eflags = tss->eflags | 2;
c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2076,33 +2019,27 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
{
struct tss_segment_32 tss_seg;
int ret;
- u32 err, new_tss_base = get_desc_base(new_desc);
+ u32 new_tss_base = get_desc_base(new_desc);
ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
save_state_to_tss32(ctxt, ops, &tss_seg);
ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu,
- &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
if (old_tss_sel != 0xffff) {
tss_seg.prev_task_link = old_tss_sel;
@@ -2110,12 +2047,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
ret = ops->write_std(new_tss_base,
&tss_seg.prev_task_link,
sizeof tss_seg.prev_task_link,
- ctxt->vcpu, &err);
- if (ret == X86EMUL_PROPAGATE_FAULT) {
+ ctxt->vcpu, &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
/* FIXME: need to provide precise fault address */
- emulate_pf(ctxt);
return ret;
- }
}
return load_state_from_tss32(ctxt, ops, &tss_seg);
@@ -2146,10 +2081,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if (reason != TASK_SWITCH_IRET) {
if ((tss_selector & 3) > next_tss_desc.dpl ||
- ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ ops->cpl(ctxt->vcpu) > next_tss_desc.dpl)
+ return emulate_gp(ctxt, 0);
}
desc_limit = desc_limit_scaled(&next_tss_desc);
@@ -2231,14 +2164,15 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
}
-static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base,
+static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
int reg, struct operand *op)
{
struct decode_cache *c = &ctxt->decode;
int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
register_address_increment(c, &c->regs[reg], df * op->bytes);
- op->addr.mem = register_address(c, base, c->regs[reg]);
+ op->addr.mem.ea = register_address(c, c->regs[reg]);
+ op->addr.mem.seg = seg;
}
static int em_push(struct x86_emulate_ctxt *ctxt)
@@ -2369,10 +2303,8 @@ static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
struct decode_cache *c = &ctxt->decode;
u64 tsc = 0;
- if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) {
- emulate_gp(ctxt, 0);
- return X86EMUL_PROPAGATE_FAULT;
- }
+ if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD))
+ return emulate_gp(ctxt, 0);
ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
c->regs[VCPU_REGS_RAX] = (u32)tsc;
c->regs[VCPU_REGS_RDX] = tsc >> 32;
@@ -2647,7 +2579,7 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->type = OP_IMM;
op->bytes = size;
- op->addr.mem = c->eip;
+ op->addr.mem.ea = c->eip;
/* NB. Immediates are sign-extended as necessary. */
switch (op->bytes) {
case 1:
@@ -2678,7 +2610,7 @@ done:
}
int
-x86_decode_insn(struct x86_emulate_ctxt *ctxt)
+x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
{
struct x86_emulate_ops *ops = ctxt->ops;
struct decode_cache *c = &ctxt->decode;
@@ -2689,7 +2621,10 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt)
struct operand memop = { .type = OP_NONE };
c->eip = ctxt->eip;
- c->fetch.start = c->fetch.end = c->eip;
+ c->fetch.start = c->eip;
+ c->fetch.end = c->fetch.start + insn_len;
+ if (insn_len > 0)
+ memcpy(c->fetch.data, insn, insn_len);
ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
switch (mode) {
@@ -2803,10 +2738,8 @@ done_prefixes:
c->execute = opcode.u.execute;
/* Unrecognised? */
- if (c->d == 0 || (c->d & Undefined)) {
- DPRINTF("Cannot emulate %02x\n", c->b);
+ if (c->d == 0 || (c->d & Undefined))
return -1;
- }
if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
c->op_bytes = 8;
@@ -2831,14 +2764,13 @@ done_prefixes:
if (!c->has_seg_override)
set_seg_override(c, VCPU_SREG_DS);
- if (memop.type == OP_MEM && !(!c->twobyte && c->b == 0x8d))
- memop.addr.mem += seg_override_base(ctxt, ops, c);
+ memop.addr.mem.seg = seg_override(ctxt, ops, c);
if (memop.type == OP_MEM && c->ad_bytes != 8)
- memop.addr.mem = (u32)memop.addr.mem;
+ memop.addr.mem.ea = (u32)memop.addr.mem.ea;
if (memop.type == OP_MEM && c->rip_relative)
- memop.addr.mem += c->eip;
+ memop.addr.mem.ea += c->eip;
/*
* Decode and fetch the source operand: register, memory
@@ -2890,14 +2822,14 @@ done_prefixes:
case SrcSI:
c->src.type = OP_MEM;
c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->src.addr.mem =
- register_address(c, seg_override_base(ctxt, ops, c),
- c->regs[VCPU_REGS_RSI]);
+ c->src.addr.mem.ea =
+ register_address(c, c->regs[VCPU_REGS_RSI]);
+ c->src.addr.mem.seg = seg_override(ctxt, ops, c),
c->src.val = 0;
break;
case SrcImmFAddr:
c->src.type = OP_IMM;
- c->src.addr.mem = c->eip;
+ c->src.addr.mem.ea = c->eip;
c->src.bytes = c->op_bytes + 2;
insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
break;
@@ -2944,7 +2876,7 @@ done_prefixes:
break;
case DstImmUByte:
c->dst.type = OP_IMM;
- c->dst.addr.mem = c->eip;
+ c->dst.addr.mem.ea = c->eip;
c->dst.bytes = 1;
c->dst.val = insn_fetch(u8, 1, c->eip);
break;
@@ -2969,9 +2901,9 @@ done_prefixes:
case DstDI:
c->dst.type = OP_MEM;
c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
- c->dst.addr.mem =
- register_address(c, es_base(ctxt, ops),
- c->regs[VCPU_REGS_RDI]);
+ c->dst.addr.mem.ea =
+ register_address(c, c->regs[VCPU_REGS_RDI]);
+ c->dst.addr.mem.seg = VCPU_SREG_ES;
c->dst.val = 0;
break;
case ImplicitOps:
@@ -3020,24 +2952,24 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
ctxt->decode.mem_read.pos = 0;
if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
- emulate_ud(ctxt);
+ rc = emulate_ud(ctxt);
goto done;
}
/* LOCK prefix is allowed only with some instructions */
if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
- emulate_ud(ctxt);
+ rc = emulate_ud(ctxt);
goto done;
}
if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
- emulate_ud(ctxt);
+ rc = emulate_ud(ctxt);
goto done;
}
/* Privileged instruction can be executed only in CPL=0 */
if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
- emulate_gp(ctxt, 0);
+ rc = emulate_gp(ctxt, 0);
goto done;
}
@@ -3050,7 +2982,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
}
if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
- rc = read_emulated(ctxt, ops, c->src.addr.mem,
+ rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem),
c->src.valptr, c->src.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
@@ -3058,7 +2990,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
}
if (c->src2.type == OP_MEM) {
- rc = read_emulated(ctxt, ops, c->src2.addr.mem,
+ rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem),
&c->src2.val, c->src2.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
@@ -3070,7 +3002,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
/* optimisation - avoid slow emulated read if Mov */
- rc = read_emulated(ctxt, ops, c->dst.addr.mem,
+ rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem),
&c->dst.val, c->dst.bytes);
if (rc != X86EMUL_CONTINUE)
goto done;
@@ -3215,13 +3147,13 @@ special_insn:
break;
case 0x8c: /* mov r/m, sreg */
if (c->modrm_reg > VCPU_SREG_GS) {
- emulate_ud(ctxt);
+ rc = emulate_ud(ctxt);
goto done;
}
c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
break;
case 0x8d: /* lea r16/r32, m */
- c->dst.val = c->src.addr.mem;
+ c->dst.val = c->src.addr.mem.ea;
break;
case 0x8e: { /* mov seg, r/m16 */
uint16_t sel;
@@ -3230,7 +3162,7 @@ special_insn:
if (c->modrm_reg == VCPU_SREG_CS ||
c->modrm_reg > VCPU_SREG_GS) {
- emulate_ud(ctxt);
+ rc = emulate_ud(ctxt);
goto done;
}
@@ -3268,7 +3200,6 @@ special_insn:
break;
case 0xa6 ... 0xa7: /* cmps */
c->dst.type = OP_NONE; /* Disable writeback. */
- DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.addr.mem, c->dst.addr.mem);
goto cmp;
case 0xa8 ... 0xa9: /* test ax, imm */
goto test;
@@ -3363,7 +3294,7 @@ special_insn:
do_io_in:
c->dst.bytes = min(c->dst.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
- emulate_gp(ctxt, 0);
+ rc = emulate_gp(ctxt, 0);
goto done;
}
if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
@@ -3377,7 +3308,7 @@ special_insn:
c->src.bytes = min(c->src.bytes, 4u);
if (!emulator_io_permited(ctxt, ops, c->dst.val,
c->src.bytes)) {
- emulate_gp(ctxt, 0);
+ rc = emulate_gp(ctxt, 0);
goto done;
}
ops->pio_out_emulated(c->src.bytes, c->dst.val,
@@ -3402,14 +3333,14 @@ special_insn:
break;
case 0xfa: /* cli */
if (emulator_bad_iopl(ctxt, ops)) {
- emulate_gp(ctxt, 0);
+ rc = emulate_gp(ctxt, 0);
goto done;
} else
ctxt->eflags &= ~X86_EFLAGS_IF;
break;
case 0xfb: /* sti */
if (emulator_bad_iopl(ctxt, ops)) {
- emulate_gp(ctxt, 0);
+ rc = emulate_gp(ctxt, 0);
goto done;
} else {
ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
@@ -3449,11 +3380,11 @@ writeback:
c->dst.type = saved_dst_type;
if ((c->d & SrcMask) == SrcSI)
- string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
+ string_addr_inc(ctxt, seg_override(ctxt, ops, c),
VCPU_REGS_RSI, &c->src);
if ((c->d & DstMask) == DstDI)
- string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
+ string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
&c->dst);
if (c->rep_prefix && (c->d & String)) {
@@ -3482,6 +3413,8 @@ writeback:
ctxt->eip = c->eip;
done:
+ if (rc == X86EMUL_PROPAGATE_FAULT)
+ ctxt->have_exception = true;
return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
twobyte_insn:
@@ -3544,9 +3477,11 @@ twobyte_insn:
break;
case 5: /* not defined */
emulate_ud(ctxt);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
case 7: /* invlpg*/
- emulate_invlpg(ctxt->vcpu, c->src.addr.mem);
+ emulate_invlpg(ctxt->vcpu,
+ linear(ctxt, c->src.addr.mem));
/* Disable writeback. */
c->dst.type = OP_NONE;
break;
@@ -3573,6 +3508,7 @@ twobyte_insn:
case 5 ... 7:
case 9 ... 15:
emulate_ud(ctxt);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3581,6 +3517,7 @@ twobyte_insn:
if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
(c->modrm_reg == 4 || c->modrm_reg == 5)) {
emulate_ud(ctxt);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
@@ -3588,6 +3525,7 @@ twobyte_insn:
case 0x22: /* mov reg, cr */
if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) {
emulate_gp(ctxt, 0);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
c->dst.type = OP_NONE;
@@ -3596,6 +3534,7 @@ twobyte_insn:
if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
(c->modrm_reg == 4 || c->modrm_reg == 5)) {
emulate_ud(ctxt);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
@@ -3604,6 +3543,7 @@ twobyte_insn:
~0ULL : ~0U), ctxt->vcpu) < 0) {
/* #UD condition is already handled by the code above */
emulate_gp(ctxt, 0);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
@@ -3615,6 +3555,7 @@ twobyte_insn:
| ((u64)c->regs[VCPU_REGS_RDX] << 32);
if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
emulate_gp(ctxt, 0);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
}
rc = X86EMUL_CONTINUE;
@@ -3623,6 +3564,7 @@ twobyte_insn:
/* rdmsr */
if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
emulate_gp(ctxt, 0);
+ rc = X86EMUL_PROPAGATE_FAULT;
goto done;
} else {
c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3785,6 +3727,5 @@ twobyte_insn:
goto writeback;
cannot_emulate:
- DPRINTF("Cannot emulate %02x\n", c->b);
return -1;
}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index f628234fbeca..3cece05e4ac4 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -575,6 +575,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
s->pics[1].elcr_mask = 0xde;
s->pics[0].pics_state = s;
s->pics[1].pics_state = s;
+ s->pics[0].isr_ack = 0xff;
+ s->pics[1].isr_ack = 0xff;
/*
* Initialize PIO device
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 975bb45329a1..3377d53fcd36 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -73,6 +73,13 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
return vcpu->arch.cr4 & mask;
}
+static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
+{
+ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ kvm_x86_ops->decache_cr3(vcpu);
+ return vcpu->arch.cr3;
+}
+
static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, ~0UL);
@@ -84,4 +91,19 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
}
+static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags |= HF_GUEST_MASK;
+}
+
+static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.hflags &= ~HF_GUEST_MASK;
+}
+
+static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.hflags & HF_GUEST_MASK;
+}
+
#endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 413f8973a855..93cf9d0d3653 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -277,7 +277,8 @@ static void apic_update_ppr(struct kvm_lapic *apic)
if (old_ppr != ppr) {
apic_set_reg(apic, APIC_PROCPRI, ppr);
- kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
+ if (ppr < old_ppr)
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
}
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fb8b376bf28c..f02b8edc3d44 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,9 +18,11 @@
*
*/
+#include "irq.h"
#include "mmu.h"
#include "x86.h"
#include "kvm_cache_regs.h"
+#include "x86.h"
#include <linux/kvm_host.h>
#include <linux/types.h>
@@ -194,7 +196,6 @@ static struct percpu_counter kvm_total_used_mmu_pages;
static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;
-static u64 __read_mostly shadow_base_present_pte;
static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
@@ -213,12 +214,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
-void kvm_mmu_set_base_ptes(u64 base_pte)
-{
- shadow_base_present_pte = base_pte;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
-
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
@@ -482,46 +477,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
}
/*
- * Return the pointer to the largepage write count for a given
- * gfn, handling slots that are not large page aligned.
+ * Return the pointer to the large page information for a given gfn,
+ * handling slots that are not large page aligned.
*/
-static int *slot_largepage_idx(gfn_t gfn,
- struct kvm_memory_slot *slot,
- int level)
+static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
+ struct kvm_memory_slot *slot,
+ int level)
{
unsigned long idx;
idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
(slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
- return &slot->lpage_info[level - 2][idx].write_count;
+ return &slot->lpage_info[level - 2][idx];
}
static void account_shadowed(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
- int *write_count;
+ struct kvm_lpage_info *linfo;
int i;
slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- write_count = slot_largepage_idx(gfn, slot, i);
- *write_count += 1;
+ linfo = lpage_info_slot(gfn, slot, i);
+ linfo->write_count += 1;
}
}
static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
- int *write_count;
+ struct kvm_lpage_info *linfo;
int i;
slot = gfn_to_memslot(kvm, gfn);
for (i = PT_DIRECTORY_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
- write_count = slot_largepage_idx(gfn, slot, i);
- *write_count -= 1;
- WARN_ON(*write_count < 0);
+ linfo = lpage_info_slot(gfn, slot, i);
+ linfo->write_count -= 1;
+ WARN_ON(linfo->write_count < 0);
}
}
@@ -530,12 +525,12 @@ static int has_wrprotected_page(struct kvm *kvm,
int level)
{
struct kvm_memory_slot *slot;
- int *largepage_idx;
+ struct kvm_lpage_info *linfo;
slot = gfn_to_memslot(kvm, gfn);
if (slot) {
- largepage_idx = slot_largepage_idx(gfn, slot, level);
- return *largepage_idx;
+ linfo = lpage_info_slot(gfn, slot, level);
+ return linfo->write_count;
}
return 1;
@@ -559,14 +554,18 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
return ret;
}
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
struct kvm_memory_slot *slot;
- int host_level, level, max_level;
-
slot = gfn_to_memslot(vcpu->kvm, large_gfn);
if (slot && slot->dirty_bitmap)
- return PT_PAGE_TABLE_LEVEL;
+ return true;
+ return false;
+}
+
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+{
+ int host_level, level, max_level;
host_level = host_mapping_level(vcpu->kvm, large_gfn);
@@ -590,16 +589,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
{
struct kvm_memory_slot *slot;
- unsigned long idx;
+ struct kvm_lpage_info *linfo;
slot = gfn_to_memslot(kvm, gfn);
if (likely(level == PT_PAGE_TABLE_LEVEL))
return &slot->rmap[gfn - slot->base_gfn];
- idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
- (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
+ linfo = lpage_info_slot(gfn, slot, level);
- return &slot->lpage_info[level - 2][idx].rmap_pde;
+ return &linfo->rmap_pde;
}
/*
@@ -887,19 +885,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
end = start + (memslot->npages << PAGE_SHIFT);
if (hva >= start && hva < end) {
gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
+ gfn_t gfn = memslot->base_gfn + gfn_offset;
ret = handler(kvm, &memslot->rmap[gfn_offset], data);
for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
- unsigned long idx;
- int sh;
-
- sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
- idx = ((memslot->base_gfn+gfn_offset) >> sh) -
- (memslot->base_gfn >> sh);
- ret |= handler(kvm,
- &memslot->lpage_info[j][idx].rmap_pde,
- data);
+ struct kvm_lpage_info *linfo;
+
+ linfo = lpage_info_slot(gfn, memslot,
+ PT_DIRECTORY_LEVEL + j);
+ ret |= handler(kvm, &linfo->rmap_pde, data);
}
trace_kvm_age_page(hva, memslot, ret);
retval |= ret;
@@ -950,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
return young;
}
+static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long data)
+{
+ u64 *spte;
+ int young = 0;
+
+ /*
+ * If there's no access bit in the secondary pte set by the
+ * hardware it's up to gup-fast/gup to set the access bit in
+ * the primary pte or in the page structure.
+ */
+ if (!shadow_accessed_mask)
+ goto out;
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ u64 _spte = *spte;
+ BUG_ON(!(_spte & PT_PRESENT_MASK));
+ young = _spte & PT_ACCESSED_MASK;
+ if (young) {
+ young = 1;
+ break;
+ }
+ spte = rmap_next(kvm, rmapp, spte);
+ }
+out:
+ return young;
+}
+
#define RMAP_RECYCLE_THRESHOLD 1000
static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -970,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
}
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
+}
+
#ifdef MMU_DEBUG
static int is_empty_shadow_page(u64 *spt)
{
@@ -1161,7 +1190,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
}
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, bool clear_unsync)
+ struct kvm_mmu_page *sp)
{
return 1;
}
@@ -1291,7 +1320,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (clear_unsync)
kvm_unlink_unsync_page(vcpu->kvm, sp);
- if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
+ if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return 1;
}
@@ -1332,12 +1361,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
continue;
WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ kvm_unlink_unsync_page(vcpu->kvm, s);
if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
- (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
+ (vcpu->arch.mmu.sync_page(vcpu, s))) {
kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
continue;
}
- kvm_unlink_unsync_page(vcpu->kvm, s);
flush = true;
}
@@ -1963,9 +1992,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
- bool can_unsync, bool reset_host_protection)
+ bool can_unsync, bool host_writable)
{
- u64 spte;
+ u64 spte, entry = *sptep;
int ret = 0;
/*
@@ -1973,7 +2002,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
* whether the guest actually used the pte (in order to detect
* demand paging).
*/
- spte = shadow_base_present_pte;
+ spte = PT_PRESENT_MASK;
if (!speculative)
spte |= shadow_accessed_mask;
if (!dirty)
@@ -1990,8 +2019,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
kvm_is_mmio_pfn(pfn));
- if (reset_host_protection)
+ if (host_writable)
spte |= SPTE_HOST_WRITEABLE;
+ else
+ pte_access &= ~ACC_WRITE_MASK;
spte |= (u64)pfn << PAGE_SHIFT;
@@ -2036,6 +2067,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
set_pte:
update_spte(sptep, spte);
+ /*
+ * If we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB.
+ */
+ if (is_writable_pte(entry) && !is_writable_pte(*sptep))
+ kvm_flush_remote_tlbs(vcpu->kvm);
done:
return ret;
}
@@ -2045,7 +2084,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
int user_fault, int write_fault, int dirty,
int *ptwrite, int level, gfn_t gfn,
pfn_t pfn, bool speculative,
- bool reset_host_protection)
+ bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
@@ -2080,7 +2119,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
dirty, level, gfn, pfn, speculative, true,
- reset_host_protection)) {
+ host_writable)) {
if (write_fault)
*ptwrite = 1;
kvm_mmu_flush_tlb(vcpu);
@@ -2211,7 +2250,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
}
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int level, gfn_t gfn, pfn_t pfn)
+ int map_writable, int level, gfn_t gfn, pfn_t pfn,
+ bool prefault)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
@@ -2220,9 +2260,11 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
if (iterator.level == level) {
- mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
+ unsigned pte_access = ACC_ALL;
+
+ mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
0, write, 1, &pt_write,
- level, gfn, pfn, false, true);
+ level, gfn, pfn, prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
@@ -2277,27 +2319,81 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
return 1;
}
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
+static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
+ gfn_t *gfnp, pfn_t *pfnp, int *levelp)
+{
+ pfn_t pfn = *pfnp;
+ gfn_t gfn = *gfnp;
+ int level = *levelp;
+
+ /*
+ * Check if it's a transparent hugepage. If this would be an
+ * hugetlbfs page, level wouldn't be set to
+ * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
+ * here.
+ */
+ if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+ level == PT_PAGE_TABLE_LEVEL &&
+ PageTransCompound(pfn_to_page(pfn)) &&
+ !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
+ unsigned long mask;
+ /*
+ * mmu_notifier_retry was successful and we hold the
+ * mmu_lock here, so the pmd can't become splitting
+ * from under us, and in turn
+ * __split_huge_page_refcount() can't run from under
+ * us and we can safely transfer the refcount from
+ * PG_tail to PG_head as we switch the pfn to tail to
+ * head.
+ */
+ *levelp = level = PT_DIRECTORY_LEVEL;
+ mask = KVM_PAGES_PER_HPAGE(level) - 1;
+ VM_BUG_ON((gfn & mask) != (pfn & mask));
+ if (pfn & mask) {
+ gfn &= ~mask;
+ *gfnp = gfn;
+ kvm_release_pfn_clean(pfn);
+ pfn &= ~mask;
+ if (!get_page_unless_zero(pfn_to_page(pfn)))
+ BUG();
+ *pfnp = pfn;
+ }
+ }
+}
+
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+ gva_t gva, pfn_t *pfn, bool write, bool *writable);
+
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
+ bool prefault)
{
int r;
int level;
+ int force_pt_level;
pfn_t pfn;
unsigned long mmu_seq;
+ bool map_writable;
- level = mapping_level(vcpu, gfn);
-
- /*
- * This path builds a PAE pagetable - so we can map 2mb pages at
- * maximum. Therefore check if the level is larger than that.
- */
- if (level > PT_DIRECTORY_LEVEL)
- level = PT_DIRECTORY_LEVEL;
+ force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+ if (likely(!force_pt_level)) {
+ level = mapping_level(vcpu, gfn);
+ /*
+ * This path builds a PAE pagetable - so we can map
+ * 2mb pages at maximum. Therefore check if the level
+ * is larger than that.
+ */
+ if (level > PT_DIRECTORY_LEVEL)
+ level = PT_DIRECTORY_LEVEL;
- gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ } else
+ level = PT_PAGE_TABLE_LEVEL;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, gfn);
+
+ if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
+ return 0;
/* mmio */
if (is_error_pfn(pfn))
@@ -2307,7 +2403,10 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, v, write, level, gfn, pfn);
+ if (likely(!force_pt_level))
+ transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
+ r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
+ prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -2394,7 +2493,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
ASSERT(!VALID_PAGE(root));
spin_lock(&vcpu->kvm->mmu_lock);
kvm_mmu_free_some_pages(vcpu);
- sp = kvm_mmu_get_page(vcpu, i << 30, i << 30,
+ sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
+ i << 30,
PT32_ROOT_LEVEL, 1, ACC_ALL,
NULL);
root = __pa(sp->spt);
@@ -2529,6 +2629,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
+ trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
return;
}
for (i = 0; i < 4; ++i) {
@@ -2551,23 +2652,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
}
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
- u32 access, u32 *error)
+ u32 access, struct x86_exception *exception)
{
- if (error)
- *error = 0;
+ if (exception)
+ exception->error_code = 0;
return vaddr;
}
static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
- u32 access, u32 *error)
+ u32 access,
+ struct x86_exception *exception)
{
- if (error)
- *error = 0;
+ if (exception)
+ exception->error_code = 0;
return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
}
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
- u32 error_code)
+ u32 error_code, bool prefault)
{
gfn_t gfn;
int r;
@@ -2583,17 +2685,68 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
gfn = gva >> PAGE_SHIFT;
return nonpaging_map(vcpu, gva & PAGE_MASK,
- error_code & PFERR_WRITE_MASK, gfn);
+ error_code & PFERR_WRITE_MASK, gfn, prefault);
+}
+
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
+{
+ struct kvm_arch_async_pf arch;
+
+ arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
+ arch.gfn = gfn;
+ arch.direct_map = vcpu->arch.mmu.direct_map;
+ arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
+
+ return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
+}
+
+static bool can_do_async_pf(struct kvm_vcpu *vcpu)
+{
+ if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
+ kvm_event_needs_reinjection(vcpu)))
+ return false;
+
+ return kvm_x86_ops->interrupt_allowed(vcpu);
}
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
- u32 error_code)
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
+ gva_t gva, pfn_t *pfn, bool write, bool *writable)
+{
+ bool async;
+
+ *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
+
+ if (!async)
+ return false; /* *pfn has correct page already */
+
+ put_page(pfn_to_page(*pfn));
+
+ if (!prefault && can_do_async_pf(vcpu)) {
+ trace_kvm_try_async_get_page(gva, gfn);
+ if (kvm_find_async_pf_gfn(vcpu, gfn)) {
+ trace_kvm_async_pf_doublefault(gva, gfn);
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ return true;
+ } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
+ return true;
+ }
+
+ *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
+
+ return false;
+}
+
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
+ bool prefault)
{
pfn_t pfn;
int r;
int level;
+ int force_pt_level;
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
+ int write = error_code & PFERR_WRITE_MASK;
+ bool map_writable;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -2602,21 +2755,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
if (r)
return r;
- level = mapping_level(vcpu, gfn);
-
- gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+ if (likely(!force_pt_level)) {
+ level = mapping_level(vcpu, gfn);
+ gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
+ } else
+ level = PT_PAGE_TABLE_LEVEL;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, gfn);
+
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
+ return 0;
+
+ /* mmio */
if (is_error_pfn(pfn))
return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
- level, gfn, pfn);
+ if (likely(!force_pt_level))
+ transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
+ r = __direct_map(vcpu, gpa, write, map_writable,
+ level, gfn, pfn, prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
@@ -2658,18 +2820,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
- pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3);
+ pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
mmu_free_roots(vcpu);
}
static unsigned long get_cr3(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.cr3;
+ return kvm_read_cr3(vcpu);
}
-static void inject_page_fault(struct kvm_vcpu *vcpu)
+static void inject_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
{
- vcpu->arch.mmu.inject_page_fault(vcpu);
+ vcpu->arch.mmu.inject_page_fault(vcpu, fault);
}
static void paging_free(struct kvm_vcpu *vcpu)
@@ -2815,6 +2978,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = vcpu->arch.walk_mmu;
+ context->base_role.word = 0;
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = tdp_page_fault;
context->free = nonpaging_free;
@@ -3007,9 +3171,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
return;
}
- if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
- return;
-
++vcpu->kvm->stat.mmu_pte_updated;
if (!sp->role.cr4_pae)
paging32_update_pte(vcpu, sp, spte, new);
@@ -3263,12 +3424,13 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
}
}
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
+ void *insn, int insn_len)
{
int r;
enum emulation_result er;
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
if (r < 0)
goto out;
@@ -3281,7 +3443,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
if (r)
goto out;
- er = emulate_instruction(vcpu, cr2, error_code, 0);
+ er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
switch (er) {
case EMULATE_DONE:
@@ -3376,11 +3538,14 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
if (!test_bit(slot, sp->slot_bitmap))
continue;
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ continue;
+
pt = sp->spt;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */
if (is_writable_pte(pt[i]))
- pt[i] &= ~PT_WRITABLE_MASK;
+ update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
}
kvm_flush_remote_tlbs(kvm);
}
@@ -3462,13 +3627,6 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}
-void kvm_mmu_module_exit(void)
-{
- mmu_destroy_caches();
- percpu_counter_destroy(&kvm_total_used_mmu_pages);
- unregister_shrinker(&mmu_shrinker);
-}
-
int kvm_mmu_module_init(void)
{
pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -3565,7 +3723,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
- (void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
+ (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
return 1;
}
@@ -3661,12 +3819,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
}
EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void mmu_audit_disable(void) { }
-#endif
-
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
@@ -3674,5 +3826,18 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
destroy_kvm_mmu(vcpu);
free_mmu_pages(vcpu);
mmu_free_memory_caches(vcpu);
+}
+
+#ifdef CONFIG_KVM_MMU_AUDIT
+#include "mmu_audit.c"
+#else
+static void mmu_audit_disable(void) { }
+#endif
+
+void kvm_mmu_module_exit(void)
+{
+ mmu_destroy_caches();
+ percpu_counter_destroy(&kvm_total_used_mmu_pages);
+ unregister_shrinker(&mmu_shrinker);
mmu_audit_disable();
}
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ba2bcdde6221..5f6223b8bcf7 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,11 +19,9 @@
#include <linux/ratelimit.h>
-static int audit_point;
-
-#define audit_printk(fmt, args...) \
+#define audit_printk(kvm, fmt, args...) \
printk(KERN_ERR "audit: (%s) error: " \
- fmt, audit_point_name[audit_point], ##args)
+ fmt, audit_point_name[kvm->arch.audit_point], ##args)
typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
@@ -97,18 +95,21 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
if (sp->unsync) {
if (level != PT_PAGE_TABLE_LEVEL) {
- audit_printk("unsync sp: %p level = %d\n", sp, level);
+ audit_printk(vcpu->kvm, "unsync sp: %p "
+ "level = %d\n", sp, level);
return;
}
if (*sptep == shadow_notrap_nonpresent_pte) {
- audit_printk("notrap spte in unsync sp: %p\n", sp);
+ audit_printk(vcpu->kvm, "notrap spte in unsync "
+ "sp: %p\n", sp);
return;
}
}
if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
- audit_printk("notrap spte in direct sp: %p\n", sp);
+ audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
+ sp);
return;
}
@@ -125,8 +126,9 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
hpa = pfn << PAGE_SHIFT;
if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
- audit_printk("levels %d pfn %llx hpa %llx ent %llxn",
- vcpu->arch.mmu.root_level, pfn, hpa, *sptep);
+ audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
+ "ent %llxn", vcpu->arch.mmu.root_level, pfn,
+ hpa, *sptep);
}
static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
@@ -142,8 +144,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
if (!gfn_to_memslot(kvm, gfn)) {
if (!printk_ratelimit())
return;
- audit_printk("no memslot for gfn %llx\n", gfn);
- audit_printk("index %ld of sp (gfn=%llx)\n",
+ audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
+ audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
(long int)(sptep - rev_sp->spt), rev_sp->gfn);
dump_stack();
return;
@@ -153,7 +155,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
if (!*rmapp) {
if (!printk_ratelimit())
return;
- audit_printk("no rmap for writable spte %llx\n", *sptep);
+ audit_printk(kvm, "no rmap for writable spte %llx\n",
+ *sptep);
dump_stack();
}
}
@@ -168,8 +171,9 @@ static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
{
struct kvm_mmu_page *sp = page_header(__pa(sptep));
- if (audit_point == AUDIT_POST_SYNC && sp->unsync)
- audit_printk("meet unsync sp(%p) after sync root.\n", sp);
+ if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
+ audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
+ "root.\n", sp);
}
static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -202,8 +206,9 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
if (is_writable_pte(*spte))
- audit_printk("shadow page has writable mappings: gfn "
- "%llx role %x\n", sp->gfn, sp->role.word);
+ audit_printk(kvm, "shadow page has writable "
+ "mappings: gfn %llx role %x\n",
+ sp->gfn, sp->role.word);
spte = rmap_next(kvm, rmapp, spte);
}
}
@@ -238,7 +243,7 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
if (!__ratelimit(&ratelimit_state))
return;
- audit_point = point;
+ vcpu->kvm->arch.audit_point = point;
audit_all_active_sps(vcpu->kvm);
audit_vcpu_spte(vcpu);
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index cd7a833a3b52..6bccc24c4181 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -72,7 +72,7 @@ struct guest_walker {
unsigned pt_access;
unsigned pte_access;
gfn_t gfn;
- u32 error_code;
+ struct x86_exception fault;
};
static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
@@ -266,21 +266,23 @@ walk:
return 1;
error:
- walker->error_code = 0;
+ walker->fault.vector = PF_VECTOR;
+ walker->fault.error_code_valid = true;
+ walker->fault.error_code = 0;
if (present)
- walker->error_code |= PFERR_PRESENT_MASK;
+ walker->fault.error_code |= PFERR_PRESENT_MASK;
- walker->error_code |= write_fault | user_fault;
+ walker->fault.error_code |= write_fault | user_fault;
if (fetch_fault && mmu->nx)
- walker->error_code |= PFERR_FETCH_MASK;
+ walker->fault.error_code |= PFERR_FETCH_MASK;
if (rsvd_fault)
- walker->error_code |= PFERR_RSVD_MASK;
+ walker->fault.error_code |= PFERR_RSVD_MASK;
- vcpu->arch.fault.address = addr;
- vcpu->arch.fault.error_code = walker->error_code;
+ walker->fault.address = addr;
+ walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
- trace_kvm_mmu_walker_error(walker->error_code);
+ trace_kvm_mmu_walker_error(walker->fault.error_code);
return 0;
}
@@ -299,25 +301,42 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
addr, access);
}
+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ pt_element_t gpte)
+{
+ u64 nonpresent = shadow_trap_nonpresent_pte;
+
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ goto no_present;
+
+ if (!is_present_gpte(gpte)) {
+ if (!sp->unsync)
+ nonpresent = shadow_notrap_nonpresent_pte;
+ goto no_present;
+ }
+
+ if (!(gpte & PT_ACCESSED_MASK))
+ goto no_present;
+
+ return false;
+
+no_present:
+ drop_spte(vcpu->kvm, spte, nonpresent);
+ return true;
+}
+
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte)
{
pt_element_t gpte;
unsigned pte_access;
pfn_t pfn;
- u64 new_spte;
gpte = *(const pt_element_t *)pte;
- if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
- if (!is_present_gpte(gpte)) {
- if (sp->unsync)
- new_spte = shadow_trap_nonpresent_pte;
- else
- new_spte = shadow_notrap_nonpresent_pte;
- __set_spte(spte, new_spte);
- }
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return;
- }
+
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
@@ -329,7 +348,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return;
kvm_get_pfn(pfn);
/*
- * we call mmu_set_spte() with reset_host_protection = true beacuse that
+ * we call mmu_set_spte() with host_writable = true beacuse that
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
*/
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -364,7 +383,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
u64 *sptep)
{
struct kvm_mmu_page *sp;
- struct kvm_mmu *mmu = &vcpu->arch.mmu;
pt_element_t *gptep = gw->prefetch_ptes;
u64 *spte;
int i;
@@ -395,14 +413,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
gpte = gptep[i];
- if (!is_present_gpte(gpte) ||
- is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
- if (!sp->unsync)
- __set_spte(spte, shadow_notrap_nonpresent_pte);
- continue;
- }
-
- if (!(gpte & PT_ACCESSED_MASK))
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
continue;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
@@ -427,7 +438,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int user_fault, int write_fault, int hlevel,
- int *ptwrite, pfn_t pfn)
+ int *ptwrite, pfn_t pfn, bool map_writable,
+ bool prefault)
{
unsigned access = gw->pt_access;
struct kvm_mmu_page *sp = NULL;
@@ -501,7 +513,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
user_fault, write_fault, dirty, ptwrite, it.level,
- gw->gfn, pfn, false, true);
+ gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
return it.sptep;
@@ -527,8 +539,8 @@ out_gpte_changed:
* Returns: 1 if we need to emulate the instruction, 0 otherwise, or
* a negative value on error.
*/
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
- u32 error_code)
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
+ bool prefault)
{
int write_fault = error_code & PFERR_WRITE_MASK;
int user_fault = error_code & PFERR_USER_MASK;
@@ -538,7 +550,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
int r;
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
+ int force_pt_level;
unsigned long mmu_seq;
+ bool map_writable;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
@@ -556,19 +570,29 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
*/
if (!r) {
pgprintk("%s: guest page fault\n", __func__);
- inject_page_fault(vcpu);
- vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+ if (!prefault) {
+ inject_page_fault(vcpu, &walker.fault);
+ /* reset fork detector */
+ vcpu->arch.last_pt_write_count = 0;
+ }
return 0;
}
- if (walker.level >= PT_DIRECTORY_LEVEL) {
+ if (walker.level >= PT_DIRECTORY_LEVEL)
+ force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
+ else
+ force_pt_level = 1;
+ if (!force_pt_level) {
level = min(walker.level, mapping_level(vcpu, walker.gfn));
walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
}
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
+
+ if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
+ &map_writable))
+ return 0;
/* mmio */
if (is_error_pfn(pfn))
@@ -580,8 +604,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
kvm_mmu_free_some_pages(vcpu);
+ if (!force_pt_level)
+ transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- level, &write_pt, pfn);
+ level, &write_pt, pfn, map_writable, prefault);
(void)sptep;
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
sptep, *sptep, write_pt);
@@ -661,7 +687,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
- u32 *error)
+ struct x86_exception *exception)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
@@ -672,14 +698,15 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
if (r) {
gpa = gfn_to_gpa(walker.gfn);
gpa |= vaddr & ~PAGE_MASK;
- } else if (error)
- *error = walker.error_code;
+ } else if (exception)
+ *exception = walker.fault;
return gpa;
}
static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
- u32 access, u32 *error)
+ u32 access,
+ struct x86_exception *exception)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
@@ -690,8 +717,8 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
if (r) {
gpa = gfn_to_gpa(walker.gfn);
gpa |= vaddr & ~PAGE_MASK;
- } else if (error)
- *error = walker.error_code;
+ } else if (exception)
+ *exception = walker.fault;
return gpa;
}
@@ -730,12 +757,19 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
+ *
+ * Note:
+ * We should flush all tlbs if spte is dropped even though guest is
+ * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
+ * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
+ * used by guest then tlbs are not flushed, so guest is allowed to access the
+ * freed pages.
+ * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
*/
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- bool clear_unsync)
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
int i, offset, nr_present;
- bool reset_host_protection;
+ bool host_writable;
gpa_t first_pte_gpa;
offset = nr_present = 0;
@@ -764,31 +798,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return -EINVAL;
gfn = gpte_to_gfn(gpte);
- if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
- || gfn != sp->gfns[i] || !is_present_gpte(gpte)
- || !(gpte & PT_ACCESSED_MASK)) {
- u64 nonpresent;
- if (is_present_gpte(gpte) || !clear_unsync)
- nonpresent = shadow_trap_nonpresent_pte;
- else
- nonpresent = shadow_notrap_nonpresent_pte;
- drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+ vcpu->kvm->tlbs_dirty++;
+ continue;
+ }
+
+ if (gfn != sp->gfns[i]) {
+ drop_spte(vcpu->kvm, &sp->spt[i],
+ shadow_trap_nonpresent_pte);
+ vcpu->kvm->tlbs_dirty++;
continue;
}
nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
- if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
- pte_access &= ~ACC_WRITE_MASK;
- reset_host_protection = 0;
- } else {
- reset_host_protection = 1;
- }
+ host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
+
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
- reset_host_protection);
+ host_writable);
}
return !nr_present;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 82e144a4e514..25bd1bc5aad2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -31,6 +31,7 @@
#include <asm/tlbflush.h>
#include <asm/desc.h>
+#include <asm/kvm_para.h>
#include <asm/virtext.h>
#include "trace.h"
@@ -50,6 +51,10 @@ MODULE_LICENSE("GPL");
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
#define SVM_FEATURE_NRIP (1 << 3)
+#define SVM_FEATURE_TSC_RATE (1 << 4)
+#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
+#define SVM_FEATURE_FLUSH_ASID (1 << 6)
+#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
@@ -97,10 +102,8 @@ struct nested_state {
unsigned long vmexit_rax;
/* cache for intercepts of the guest */
- u16 intercept_cr_read;
- u16 intercept_cr_write;
- u16 intercept_dr_read;
- u16 intercept_dr_write;
+ u32 intercept_cr;
+ u32 intercept_dr;
u32 intercept_exceptions;
u64 intercept;
@@ -123,7 +126,12 @@ struct vcpu_svm {
u64 next_rip;
u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
- u64 host_gs_base;
+ struct {
+ u16 fs;
+ u16 gs;
+ u16 ldt;
+ u64 gs_base;
+ } host;
u32 *msrpm;
@@ -133,6 +141,7 @@ struct vcpu_svm {
unsigned int3_injected;
unsigned long int3_rip;
+ u32 apf_reason;
};
#define MSR_INVALID 0xffffffffU
@@ -180,14 +189,151 @@ static int nested_svm_vmexit(struct vcpu_svm *svm);
static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
bool has_error_code, u32 error_code);
+enum {
+ VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
+ pause filter count */
+ VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
+ VMCB_ASID, /* ASID */
+ VMCB_INTR, /* int_ctl, int_vector */
+ VMCB_NPT, /* npt_en, nCR3, gPAT */
+ VMCB_CR, /* CR0, CR3, CR4, EFER */
+ VMCB_DR, /* DR6, DR7 */
+ VMCB_DT, /* GDT, IDT */
+ VMCB_SEG, /* CS, DS, SS, ES, CPL */
+ VMCB_CR2, /* CR2 only */
+ VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
+ VMCB_DIRTY_MAX,
+};
+
+/* TPR and CR2 are always written before VMRUN */
+#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
+
+static inline void mark_all_dirty(struct vmcb *vmcb)
+{
+ vmcb->control.clean = 0;
+}
+
+static inline void mark_all_clean(struct vmcb *vmcb)
+{
+ vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
+ & ~VMCB_ALWAYS_DIRTY_MASK;
+}
+
+static inline void mark_dirty(struct vmcb *vmcb, int bit)
+{
+ vmcb->control.clean &= ~(1 << bit);
+}
+
static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
{
return container_of(vcpu, struct vcpu_svm, vcpu);
}
-static inline bool is_nested(struct vcpu_svm *svm)
+static void recalc_intercepts(struct vcpu_svm *svm)
+{
+ struct vmcb_control_area *c, *h;
+ struct nested_state *g;
+
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ if (!is_guest_mode(&svm->vcpu))
+ return;
+
+ c = &svm->vmcb->control;
+ h = &svm->nested.hsave->control;
+ g = &svm->nested;
+
+ c->intercept_cr = h->intercept_cr | g->intercept_cr;
+ c->intercept_dr = h->intercept_dr | g->intercept_dr;
+ c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
+ c->intercept = h->intercept | g->intercept;
+}
+
+static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
+{
+ if (is_guest_mode(&svm->vcpu))
+ return svm->nested.hsave;
+ else
+ return svm->vmcb;
+}
+
+static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_cr |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_cr &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ return vmcb->control.intercept_cr & (1U << bit);
+}
+
+static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_dr |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_dr &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_exceptions |= (1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
{
- return svm->nested.vmcb;
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept_exceptions &= ~(1U << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void set_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept |= (1ULL << bit);
+
+ recalc_intercepts(svm);
+}
+
+static inline void clr_intercept(struct vcpu_svm *svm, int bit)
+{
+ struct vmcb *vmcb = get_host_vmcb(svm);
+
+ vmcb->control.intercept &= ~(1ULL << bit);
+
+ recalc_intercepts(svm);
}
static inline void enable_gif(struct vcpu_svm *svm)
@@ -264,11 +410,6 @@ static u32 svm_msrpm_offset(u32 msr)
#define MAX_INST_SIZE 15
-static inline u32 svm_has(u32 feat)
-{
- return svm_features & feat;
-}
-
static inline void clgi(void)
{
asm volatile (__ex(SVM_CLGI));
@@ -284,16 +425,6 @@ static inline void invlpga(unsigned long addr, u32 asid)
asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
}
-static inline void force_new_asid(struct kvm_vcpu *vcpu)
-{
- to_svm(vcpu)->asid_generation--;
-}
-
-static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
-{
- force_new_asid(vcpu);
-}
-
static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
@@ -310,6 +441,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
efer &= ~EFER_LME;
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
+ mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
}
static int is_external_interrupt(u32 info)
@@ -347,7 +479,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
svm->next_rip = svm->vmcb->control.next_rip;
if (!svm->next_rip) {
- if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
+ if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
EMULATE_DONE)
printk(KERN_DEBUG "%s: NOP\n", __func__);
return;
@@ -374,7 +506,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
nested_svm_check_exception(svm, nr, has_error_code, error_code))
return;
- if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) {
+ if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
/*
@@ -670,7 +802,7 @@ static __init int svm_hardware_setup(void)
svm_features = cpuid_edx(SVM_CPUID_FUNC);
- if (!svm_has(SVM_FEATURE_NPT))
+ if (!boot_cpu_has(X86_FEATURE_NPT))
npt_enabled = false;
if (npt_enabled && !npt) {
@@ -725,13 +857,15 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
struct vcpu_svm *svm = to_svm(vcpu);
u64 g_tsc_offset = 0;
- if (is_nested(svm)) {
+ if (is_guest_mode(vcpu)) {
g_tsc_offset = svm->vmcb->control.tsc_offset -
svm->nested.hsave->control.tsc_offset;
svm->nested.hsave->control.tsc_offset = offset;
}
svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
+
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -739,8 +873,9 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.tsc_offset += adjustment;
- if (is_nested(svm))
+ if (is_guest_mode(vcpu))
svm->nested.hsave->control.tsc_offset += adjustment;
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}
static void init_vmcb(struct vcpu_svm *svm)
@@ -749,62 +884,62 @@ static void init_vmcb(struct vcpu_svm *svm)
struct vmcb_save_area *save = &svm->vmcb->save;
svm->vcpu.fpu_active = 1;
+ svm->vcpu.arch.hflags = 0;
- control->intercept_cr_read = INTERCEPT_CR0_MASK |
- INTERCEPT_CR3_MASK |
- INTERCEPT_CR4_MASK;
-
- control->intercept_cr_write = INTERCEPT_CR0_MASK |
- INTERCEPT_CR3_MASK |
- INTERCEPT_CR4_MASK |
- INTERCEPT_CR8_MASK;
-
- control->intercept_dr_read = INTERCEPT_DR0_MASK |
- INTERCEPT_DR1_MASK |
- INTERCEPT_DR2_MASK |
- INTERCEPT_DR3_MASK |
- INTERCEPT_DR4_MASK |
- INTERCEPT_DR5_MASK |
- INTERCEPT_DR6_MASK |
- INTERCEPT_DR7_MASK;
-
- control->intercept_dr_write = INTERCEPT_DR0_MASK |
- INTERCEPT_DR1_MASK |
- INTERCEPT_DR2_MASK |
- INTERCEPT_DR3_MASK |
- INTERCEPT_DR4_MASK |
- INTERCEPT_DR5_MASK |
- INTERCEPT_DR6_MASK |
- INTERCEPT_DR7_MASK;
-
- control->intercept_exceptions = (1 << PF_VECTOR) |
- (1 << UD_VECTOR) |
- (1 << MC_VECTOR);
-
-
- control->intercept = (1ULL << INTERCEPT_INTR) |
- (1ULL << INTERCEPT_NMI) |
- (1ULL << INTERCEPT_SMI) |
- (1ULL << INTERCEPT_SELECTIVE_CR0) |
- (1ULL << INTERCEPT_CPUID) |
- (1ULL << INTERCEPT_INVD) |
- (1ULL << INTERCEPT_HLT) |
- (1ULL << INTERCEPT_INVLPG) |
- (1ULL << INTERCEPT_INVLPGA) |
- (1ULL << INTERCEPT_IOIO_PROT) |
- (1ULL << INTERCEPT_MSR_PROT) |
- (1ULL << INTERCEPT_TASK_SWITCH) |
- (1ULL << INTERCEPT_SHUTDOWN) |
- (1ULL << INTERCEPT_VMRUN) |
- (1ULL << INTERCEPT_VMMCALL) |
- (1ULL << INTERCEPT_VMLOAD) |
- (1ULL << INTERCEPT_VMSAVE) |
- (1ULL << INTERCEPT_STGI) |
- (1ULL << INTERCEPT_CLGI) |
- (1ULL << INTERCEPT_SKINIT) |
- (1ULL << INTERCEPT_WBINVD) |
- (1ULL << INTERCEPT_MONITOR) |
- (1ULL << INTERCEPT_MWAIT);
+ set_cr_intercept(svm, INTERCEPT_CR0_READ);
+ set_cr_intercept(svm, INTERCEPT_CR3_READ);
+ set_cr_intercept(svm, INTERCEPT_CR4_READ);
+ set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
+ set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
+ set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
+ set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+
+ set_dr_intercept(svm, INTERCEPT_DR0_READ);
+ set_dr_intercept(svm, INTERCEPT_DR1_READ);
+ set_dr_intercept(svm, INTERCEPT_DR2_READ);
+ set_dr_intercept(svm, INTERCEPT_DR3_READ);
+ set_dr_intercept(svm, INTERCEPT_DR4_READ);
+ set_dr_intercept(svm, INTERCEPT_DR5_READ);
+ set_dr_intercept(svm, INTERCEPT_DR6_READ);
+ set_dr_intercept(svm, INTERCEPT_DR7_READ);
+
+ set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
+ set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
+
+ set_exception_intercept(svm, PF_VECTOR);
+ set_exception_intercept(svm, UD_VECTOR);
+ set_exception_intercept(svm, MC_VECTOR);
+
+ set_intercept(svm, INTERCEPT_INTR);
+ set_intercept(svm, INTERCEPT_NMI);
+ set_intercept(svm, INTERCEPT_SMI);
+ set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
+ set_intercept(svm, INTERCEPT_CPUID);
+ set_intercept(svm, INTERCEPT_INVD);
+ set_intercept(svm, INTERCEPT_HLT);
+ set_intercept(svm, INTERCEPT_INVLPG);
+ set_intercept(svm, INTERCEPT_INVLPGA);
+ set_intercept(svm, INTERCEPT_IOIO_PROT);
+ set_intercept(svm, INTERCEPT_MSR_PROT);
+ set_intercept(svm, INTERCEPT_TASK_SWITCH);
+ set_intercept(svm, INTERCEPT_SHUTDOWN);
+ set_intercept(svm, INTERCEPT_VMRUN);
+ set_intercept(svm, INTERCEPT_VMMCALL);
+ set_intercept(svm, INTERCEPT_VMLOAD);
+ set_intercept(svm, INTERCEPT_VMSAVE);
+ set_intercept(svm, INTERCEPT_STGI);
+ set_intercept(svm, INTERCEPT_CLGI);
+ set_intercept(svm, INTERCEPT_SKINIT);
+ set_intercept(svm, INTERCEPT_WBINVD);
+ set_intercept(svm, INTERCEPT_MONITOR);
+ set_intercept(svm, INTERCEPT_MWAIT);
+ set_intercept(svm, INTERCEPT_XSETBV);
control->iopm_base_pa = iopm_base;
control->msrpm_base_pa = __pa(svm->msrpm);
@@ -855,25 +990,27 @@ static void init_vmcb(struct vcpu_svm *svm)
if (npt_enabled) {
/* Setup VMCB for Nested Paging */
control->nested_ctl = 1;
- control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
- (1ULL << INTERCEPT_INVLPG));
- control->intercept_exceptions &= ~(1 << PF_VECTOR);
- control->intercept_cr_read &= ~INTERCEPT_CR3_MASK;
- control->intercept_cr_write &= ~INTERCEPT_CR3_MASK;
+ clr_intercept(svm, INTERCEPT_TASK_SWITCH);
+ clr_intercept(svm, INTERCEPT_INVLPG);
+ clr_exception_intercept(svm, PF_VECTOR);
+ clr_cr_intercept(svm, INTERCEPT_CR3_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
save->g_pat = 0x0007040600070406ULL;
save->cr3 = 0;
save->cr4 = 0;
}
- force_new_asid(&svm->vcpu);
+ svm->asid_generation = 0;
svm->nested.vmcb = 0;
svm->vcpu.arch.hflags = 0;
- if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
+ if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
control->pause_filter_count = 3000;
- control->intercept |= (1ULL << INTERCEPT_PAUSE);
+ set_intercept(svm, INTERCEPT_PAUSE);
}
+ mark_all_dirty(svm->vmcb);
+
enable_gif(svm);
}
@@ -990,8 +1127,16 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (unlikely(cpu != vcpu->cpu)) {
svm->asid_generation = 0;
+ mark_all_dirty(svm->vmcb);
}
+#ifdef CONFIG_X86_64
+ rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
+#endif
+ savesegment(fs, svm->host.fs);
+ savesegment(gs, svm->host.gs);
+ svm->host.ldt = kvm_read_ldt();
+
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
}
@@ -1002,6 +1147,14 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
int i;
++vcpu->stat.host_state_reload;
+ kvm_load_ldt(svm->host.ldt);
+#ifdef CONFIG_X86_64
+ loadsegment(fs, svm->host.fs);
+ load_gs_index(svm->host.gs);
+ wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+#else
+ loadsegment(gs, svm->host.gs);
+#endif
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
}
@@ -1021,7 +1174,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
switch (reg) {
case VCPU_EXREG_PDPTR:
BUG_ON(!npt_enabled);
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
break;
default:
BUG();
@@ -1030,12 +1183,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
static void svm_set_vintr(struct vcpu_svm *svm)
{
- svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR;
+ set_intercept(svm, INTERCEPT_VINTR);
}
static void svm_clear_vintr(struct vcpu_svm *svm)
{
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
+ clr_intercept(svm, INTERCEPT_VINTR);
}
static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@ -1150,6 +1303,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
svm->vmcb->save.idtr.limit = dt->size;
svm->vmcb->save.idtr.base = dt->address ;
+ mark_dirty(svm->vmcb, VMCB_DT);
}
static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
@@ -1166,19 +1320,23 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
svm->vmcb->save.gdtr.limit = dt->size;
svm->vmcb->save.gdtr.base = dt->address ;
+ mark_dirty(svm->vmcb, VMCB_DT);
}
static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
{
}
+static void svm_decache_cr3(struct kvm_vcpu *vcpu)
+{
+}
+
static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
}
static void update_cr0_intercept(struct vcpu_svm *svm)
{
- struct vmcb *vmcb = svm->vmcb;
ulong gcr0 = svm->vcpu.arch.cr0;
u64 *hcr0 = &svm->vmcb->save.cr0;
@@ -1188,27 +1346,14 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
*hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
| (gcr0 & SVM_CR0_SELECTIVE_MASK);
+ mark_dirty(svm->vmcb, VMCB_CR);
if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
- vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
- vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
- if (is_nested(svm)) {
- struct vmcb *hsave = svm->nested.hsave;
-
- hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
- hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
- vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
- vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
- }
+ clr_cr_intercept(svm, INTERCEPT_CR0_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
} else {
- svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
- svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
- if (is_nested(svm)) {
- struct vmcb *hsave = svm->nested.hsave;
-
- hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
- hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
- }
+ set_cr_intercept(svm, INTERCEPT_CR0_READ);
+ set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
}
}
@@ -1216,7 +1361,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_nested(svm)) {
+ if (is_guest_mode(vcpu)) {
/*
* We are here because we run in nested mode, the host kvm
* intercepts cr0 writes but the l1 hypervisor does not.
@@ -1268,6 +1413,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
*/
cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
svm->vmcb->save.cr0 = cr0;
+ mark_dirty(svm->vmcb, VMCB_CR);
update_cr0_intercept(svm);
}
@@ -1277,13 +1423,14 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
- force_new_asid(vcpu);
+ svm_flush_tlb(vcpu);
vcpu->arch.cr4 = cr4;
if (!npt_enabled)
cr4 |= X86_CR4_PAE;
cr4 |= host_cr4_mce;
to_svm(vcpu)->vmcb->save.cr4 = cr4;
+ mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
}
static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -1312,26 +1459,25 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
= (svm->vmcb->save.cs.attrib
>> SVM_SELECTOR_DPL_SHIFT) & 3;
+ mark_dirty(svm->vmcb, VMCB_SEG);
}
static void update_db_intercept(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.intercept_exceptions &=
- ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
+ clr_exception_intercept(svm, DB_VECTOR);
+ clr_exception_intercept(svm, BP_VECTOR);
if (svm->nmi_singlestep)
- svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
+ set_exception_intercept(svm, DB_VECTOR);
if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
if (vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- svm->vmcb->control.intercept_exceptions |=
- 1 << DB_VECTOR;
+ set_exception_intercept(svm, DB_VECTOR);
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- svm->vmcb->control.intercept_exceptions |=
- 1 << BP_VECTOR;
+ set_exception_intercept(svm, BP_VECTOR);
} else
vcpu->guest_debug = 0;
}
@@ -1345,21 +1491,9 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
else
svm->vmcb->save.dr7 = vcpu->arch.dr7;
- update_db_intercept(vcpu);
-}
-
-static void load_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
-}
+ mark_dirty(svm->vmcb, VMCB_DR);
-static void save_host_msrs(struct kvm_vcpu *vcpu)
-{
-#ifdef CONFIG_X86_64
- rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
-#endif
+ update_db_intercept(vcpu);
}
static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
@@ -1372,6 +1506,8 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
svm->asid_generation = sd->asid_generation;
svm->vmcb->control.asid = sd->next_asid++;
+
+ mark_dirty(svm->vmcb, VMCB_ASID);
}
static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
@@ -1379,20 +1515,40 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.dr7 = value;
+ mark_dirty(svm->vmcb, VMCB_DR);
}
static int pf_interception(struct vcpu_svm *svm)
{
- u64 fault_address;
+ u64 fault_address = svm->vmcb->control.exit_info_2;
u32 error_code;
+ int r = 1;
- fault_address = svm->vmcb->control.exit_info_2;
- error_code = svm->vmcb->control.exit_info_1;
-
- trace_kvm_page_fault(fault_address, error_code);
- if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
- kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
- return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
+ switch (svm->apf_reason) {
+ default:
+ error_code = svm->vmcb->control.exit_info_1;
+
+ trace_kvm_page_fault(fault_address, error_code);
+ if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
+ kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
+ r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+ svm->vmcb->control.insn_bytes,
+ svm->vmcb->control.insn_len);
+ break;
+ case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ svm->apf_reason = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wait(fault_address);
+ local_irq_enable();
+ break;
+ case KVM_PV_REASON_PAGE_READY:
+ svm->apf_reason = 0;
+ local_irq_disable();
+ kvm_async_pf_task_wake(fault_address);
+ local_irq_enable();
+ break;
+ }
+ return r;
}
static int db_interception(struct vcpu_svm *svm)
@@ -1440,7 +1596,7 @@ static int ud_interception(struct vcpu_svm *svm)
{
int er;
- er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
+ er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
@@ -1449,21 +1605,8 @@ static int ud_interception(struct vcpu_svm *svm)
static void svm_fpu_activate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u32 excp;
-
- if (is_nested(svm)) {
- u32 h_excp, n_excp;
-
- h_excp = svm->nested.hsave->control.intercept_exceptions;
- n_excp = svm->nested.intercept_exceptions;
- h_excp &= ~(1 << NM_VECTOR);
- excp = h_excp | n_excp;
- } else {
- excp = svm->vmcb->control.intercept_exceptions;
- excp &= ~(1 << NM_VECTOR);
- }
- svm->vmcb->control.intercept_exceptions = excp;
+ clr_exception_intercept(svm, NM_VECTOR);
svm->vcpu.fpu_active = 1;
update_cr0_intercept(svm);
@@ -1570,7 +1713,7 @@ static int io_interception(struct vcpu_svm *svm)
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
if (string || in)
- return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1624,17 +1767,19 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.nested_cr3 = root;
- force_new_asid(vcpu);
+ mark_dirty(svm->vmcb, VMCB_NPT);
+ svm_flush_tlb(vcpu);
}
-static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu)
+static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
{
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.exit_code = SVM_EXIT_NPF;
svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = vcpu->arch.fault.error_code;
- svm->vmcb->control.exit_info_2 = vcpu->arch.fault.address;
+ svm->vmcb->control.exit_info_1 = fault->error_code;
+ svm->vmcb->control.exit_info_2 = fault->address;
nested_svm_vmexit(svm);
}
@@ -1680,7 +1825,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
{
int vmexit;
- if (!is_nested(svm))
+ if (!is_guest_mode(&svm->vcpu))
return 0;
svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
@@ -1698,7 +1843,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
/* This function returns true if it is save to enable the irq window */
static inline bool nested_svm_intr(struct vcpu_svm *svm)
{
- if (!is_nested(svm))
+ if (!is_guest_mode(&svm->vcpu))
return true;
if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
@@ -1737,7 +1882,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
/* This function returns true if it is save to enable the nmi window */
static inline bool nested_svm_nmi(struct vcpu_svm *svm)
{
- if (!is_nested(svm))
+ if (!is_guest_mode(&svm->vcpu))
return true;
if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
@@ -1836,8 +1981,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
return NESTED_EXIT_HOST;
break;
case SVM_EXIT_EXCP_BASE + PF_VECTOR:
- /* When we're shadowing, trap PFs */
- if (!npt_enabled)
+ /* When we're shadowing, trap PFs, but not async PF */
+ if (!npt_enabled && svm->apf_reason == 0)
return NESTED_EXIT_HOST;
break;
case SVM_EXIT_EXCP_BASE + NM_VECTOR:
@@ -1865,27 +2010,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
case SVM_EXIT_IOIO:
vmexit = nested_svm_intercept_ioio(svm);
break;
- case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: {
- u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0);
- if (svm->nested.intercept_cr_read & cr_bits)
+ case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
+ u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
+ if (svm->nested.intercept_cr & bit)
vmexit = NESTED_EXIT_DONE;
break;
}
- case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
- u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
- if (svm->nested.intercept_cr_write & cr_bits)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
- u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
- if (svm->nested.intercept_dr_read & dr_bits)
- vmexit = NESTED_EXIT_DONE;
- break;
- }
- case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: {
- u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0);
- if (svm->nested.intercept_dr_write & dr_bits)
+ case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
+ u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
+ if (svm->nested.intercept_dr & bit)
vmexit = NESTED_EXIT_DONE;
break;
}
@@ -1893,6 +2026,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
if (svm->nested.intercept_exceptions & excp_bits)
vmexit = NESTED_EXIT_DONE;
+ /* async page fault always cause vmexit */
+ else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
+ svm->apf_reason != 0)
+ vmexit = NESTED_EXIT_DONE;
break;
}
case SVM_EXIT_ERR: {
@@ -1926,10 +2063,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
struct vmcb_control_area *dst = &dst_vmcb->control;
struct vmcb_control_area *from = &from_vmcb->control;
- dst->intercept_cr_read = from->intercept_cr_read;
- dst->intercept_cr_write = from->intercept_cr_write;
- dst->intercept_dr_read = from->intercept_dr_read;
- dst->intercept_dr_write = from->intercept_dr_write;
+ dst->intercept_cr = from->intercept_cr;
+ dst->intercept_dr = from->intercept_dr;
dst->intercept_exceptions = from->intercept_exceptions;
dst->intercept = from->intercept;
dst->iopm_base_pa = from->iopm_base_pa;
@@ -1970,7 +2105,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
if (!nested_vmcb)
return 1;
- /* Exit nested SVM mode */
+ /* Exit Guest-Mode */
+ leave_guest_mode(&svm->vcpu);
svm->nested.vmcb = 0;
/* Give the current vmcb to the guest */
@@ -1984,7 +2120,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
nested_vmcb->save.idtr = vmcb->save.idtr;
nested_vmcb->save.efer = svm->vcpu.arch.efer;
nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
- nested_vmcb->save.cr3 = svm->vcpu.arch.cr3;
+ nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
nested_vmcb->save.cr2 = vmcb->save.cr2;
nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
nested_vmcb->save.rflags = vmcb->save.rflags;
@@ -2061,6 +2197,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
svm->vmcb->save.cpl = 0;
svm->vmcb->control.exit_int_info = 0;
+ mark_all_dirty(svm->vmcb);
+
nested_svm_unmap(page);
nested_svm_uninit_mmu_context(&svm->vcpu);
@@ -2148,8 +2286,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
nested_vmcb->control.event_inj,
nested_vmcb->control.nested_ctl);
- trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read,
- nested_vmcb->control.intercept_cr_write,
+ trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
+ nested_vmcb->control.intercept_cr >> 16,
nested_vmcb->control.intercept_exceptions,
nested_vmcb->control.intercept);
@@ -2177,7 +2315,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
if (npt_enabled)
hsave->save.cr3 = vmcb->save.cr3;
else
- hsave->save.cr3 = svm->vcpu.arch.cr3;
+ hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
copy_vmcb_control_area(hsave, vmcb);
@@ -2229,14 +2367,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
/* cache intercepts */
- svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read;
- svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write;
- svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read;
- svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write;
+ svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
+ svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
svm->nested.intercept = nested_vmcb->control.intercept;
- force_new_asid(&svm->vcpu);
+ svm_flush_tlb(&svm->vcpu);
svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -2245,29 +2381,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
/* We only want the cr8 intercept bits of the guest */
- svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK;
- svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
+ clr_cr_intercept(svm, INTERCEPT_CR8_READ);
+ clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
}
/* We don't want to see VMMCALLs from a nested guest */
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL);
-
- /*
- * We don't want a nested guest to be more powerful than the guest, so
- * all intercepts are ORed
- */
- svm->vmcb->control.intercept_cr_read |=
- nested_vmcb->control.intercept_cr_read;
- svm->vmcb->control.intercept_cr_write |=
- nested_vmcb->control.intercept_cr_write;
- svm->vmcb->control.intercept_dr_read |=
- nested_vmcb->control.intercept_dr_read;
- svm->vmcb->control.intercept_dr_write |=
- nested_vmcb->control.intercept_dr_write;
- svm->vmcb->control.intercept_exceptions |=
- nested_vmcb->control.intercept_exceptions;
-
- svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
+ clr_intercept(svm, INTERCEPT_VMMCALL);
svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
@@ -2278,11 +2397,21 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
nested_svm_unmap(page);
- /* nested_vmcb is our indicator if nested SVM is activated */
+ /* Enter Guest-Mode */
+ enter_guest_mode(&svm->vcpu);
+
+ /*
+ * Merge guest and host intercepts - must be called with vcpu in
+ * guest-mode to take affect here
+ */
+ recalc_intercepts(svm);
+
svm->nested.vmcb = vmcb_gpa;
enable_gif(svm);
+ mark_all_dirty(svm->vmcb);
+
return true;
}
@@ -2400,6 +2529,8 @@ static int clgi_interception(struct vcpu_svm *svm)
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+ mark_dirty(svm->vmcb, VMCB_INTR);
+
return 1;
}
@@ -2426,6 +2557,19 @@ static int skinit_interception(struct vcpu_svm *svm)
return 1;
}
+static int xsetbv_interception(struct vcpu_svm *svm)
+{
+ u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
+ u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+
+ if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
+ svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ skip_emulated_instruction(&svm->vcpu);
+ }
+
+ return 1;
+}
+
static int invalid_op_interception(struct vcpu_svm *svm)
{
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
@@ -2507,19 +2651,92 @@ static int cpuid_interception(struct vcpu_svm *svm)
static int iret_interception(struct vcpu_svm *svm)
{
++svm->vcpu.stat.nmi_window_exits;
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
+ clr_intercept(svm, INTERCEPT_IRET);
svm->vcpu.arch.hflags |= HF_IRET_MASK;
return 1;
}
static int invlpg_interception(struct vcpu_svm *svm)
{
- return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+
+ kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
+ skip_emulated_instruction(&svm->vcpu);
+ return 1;
}
static int emulate_on_interception(struct vcpu_svm *svm)
{
- return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
+ return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+}
+
+#define CR_VALID (1ULL << 63)
+
+static int cr_interception(struct vcpu_svm *svm)
+{
+ int reg, cr;
+ unsigned long val;
+ int err;
+
+ if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(svm);
+
+ if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
+ return emulate_on_interception(svm);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
+
+ err = 0;
+ if (cr >= 16) { /* mov to cr */
+ cr -= 16;
+ val = kvm_register_read(&svm->vcpu, reg);
+ switch (cr) {
+ case 0:
+ err = kvm_set_cr0(&svm->vcpu, val);
+ break;
+ case 3:
+ err = kvm_set_cr3(&svm->vcpu, val);
+ break;
+ case 4:
+ err = kvm_set_cr4(&svm->vcpu, val);
+ break;
+ case 8:
+ err = kvm_set_cr8(&svm->vcpu, val);
+ break;
+ default:
+ WARN(1, "unhandled write to CR%d", cr);
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+ } else { /* mov from cr */
+ switch (cr) {
+ case 0:
+ val = kvm_read_cr0(&svm->vcpu);
+ break;
+ case 2:
+ val = svm->vcpu.arch.cr2;
+ break;
+ case 3:
+ val = kvm_read_cr3(&svm->vcpu);
+ break;
+ case 4:
+ val = kvm_read_cr4(&svm->vcpu);
+ break;
+ case 8:
+ val = kvm_get_cr8(&svm->vcpu);
+ break;
+ default:
+ WARN(1, "unhandled read from CR%d", cr);
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+ }
+ kvm_register_write(&svm->vcpu, reg, val);
+ }
+ kvm_complete_insn_gp(&svm->vcpu, err);
+
+ return 1;
}
static int cr0_write_interception(struct vcpu_svm *svm)
@@ -2527,7 +2744,7 @@ static int cr0_write_interception(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
int r;
- r = emulate_instruction(&svm->vcpu, 0, 0, 0);
+ r = cr_interception(svm);
if (svm->nested.vmexit_rip) {
kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
@@ -2536,22 +2753,47 @@ static int cr0_write_interception(struct vcpu_svm *svm)
svm->nested.vmexit_rip = 0;
}
- return r == EMULATE_DONE;
+ return r;
+}
+
+static int dr_interception(struct vcpu_svm *svm)
+{
+ int reg, dr;
+ unsigned long val;
+ int err;
+
+ if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
+ return emulate_on_interception(svm);
+
+ reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
+ dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
+
+ if (dr >= 16) { /* mov to DRn */
+ val = kvm_register_read(&svm->vcpu, reg);
+ kvm_set_dr(&svm->vcpu, dr - 16, val);
+ } else {
+ err = kvm_get_dr(&svm->vcpu, dr, &val);
+ if (!err)
+ kvm_register_write(&svm->vcpu, reg, val);
+ }
+
+ return 1;
}
static int cr8_write_interception(struct vcpu_svm *svm)
{
struct kvm_run *kvm_run = svm->vcpu.run;
+ int r;
u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
/* instruction emulation calls kvm_set_cr8() */
- emulate_instruction(&svm->vcpu, 0, 0, 0);
+ r = cr_interception(svm);
if (irqchip_in_kernel(svm->vcpu.kvm)) {
- svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
- return 1;
+ clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
+ return r;
}
if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
- return 1;
+ return r;
kvm_run->exit_reason = KVM_EXIT_SET_TPR;
return 0;
}
@@ -2562,14 +2804,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
switch (ecx) {
case MSR_IA32_TSC: {
- u64 tsc_offset;
+ struct vmcb *vmcb = get_host_vmcb(svm);
- if (is_nested(svm))
- tsc_offset = svm->nested.hsave->control.tsc_offset;
- else
- tsc_offset = svm->vmcb->control.tsc_offset;
-
- *data = tsc_offset + native_read_tsc();
+ *data = vmcb->control.tsc_offset + native_read_tsc();
break;
}
case MSR_STAR:
@@ -2714,7 +2951,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
svm->vmcb->save.sysenter_esp = data;
break;
case MSR_IA32_DEBUGCTLMSR:
- if (!svm_has(SVM_FEATURE_LBRV)) {
+ if (!boot_cpu_has(X86_FEATURE_LBRV)) {
pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
__func__, data);
break;
@@ -2723,6 +2960,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
return 1;
svm->vmcb->save.dbgctl = data;
+ mark_dirty(svm->vmcb, VMCB_LBR);
if (data & (1ULL<<0))
svm_enable_lbrv(svm);
else
@@ -2775,6 +3013,7 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
svm_clear_vintr(svm);
svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
+ mark_dirty(svm->vmcb, VMCB_INTR);
/*
* If the user space waits to inject interrupts, exit as soon as
* possible
@@ -2797,31 +3036,31 @@ static int pause_interception(struct vcpu_svm *svm)
}
static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
- [SVM_EXIT_READ_CR0] = emulate_on_interception,
- [SVM_EXIT_READ_CR3] = emulate_on_interception,
- [SVM_EXIT_READ_CR4] = emulate_on_interception,
- [SVM_EXIT_READ_CR8] = emulate_on_interception,
+ [SVM_EXIT_READ_CR0] = cr_interception,
+ [SVM_EXIT_READ_CR3] = cr_interception,
+ [SVM_EXIT_READ_CR4] = cr_interception,
+ [SVM_EXIT_READ_CR8] = cr_interception,
[SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
[SVM_EXIT_WRITE_CR0] = cr0_write_interception,
- [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
- [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
+ [SVM_EXIT_WRITE_CR3] = cr_interception,
+ [SVM_EXIT_WRITE_CR4] = cr_interception,
[SVM_EXIT_WRITE_CR8] = cr8_write_interception,
- [SVM_EXIT_READ_DR0] = emulate_on_interception,
- [SVM_EXIT_READ_DR1] = emulate_on_interception,
- [SVM_EXIT_READ_DR2] = emulate_on_interception,
- [SVM_EXIT_READ_DR3] = emulate_on_interception,
- [SVM_EXIT_READ_DR4] = emulate_on_interception,
- [SVM_EXIT_READ_DR5] = emulate_on_interception,
- [SVM_EXIT_READ_DR6] = emulate_on_interception,
- [SVM_EXIT_READ_DR7] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR4] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR6] = emulate_on_interception,
- [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
+ [SVM_EXIT_READ_DR0] = dr_interception,
+ [SVM_EXIT_READ_DR1] = dr_interception,
+ [SVM_EXIT_READ_DR2] = dr_interception,
+ [SVM_EXIT_READ_DR3] = dr_interception,
+ [SVM_EXIT_READ_DR4] = dr_interception,
+ [SVM_EXIT_READ_DR5] = dr_interception,
+ [SVM_EXIT_READ_DR6] = dr_interception,
+ [SVM_EXIT_READ_DR7] = dr_interception,
+ [SVM_EXIT_WRITE_DR0] = dr_interception,
+ [SVM_EXIT_WRITE_DR1] = dr_interception,
+ [SVM_EXIT_WRITE_DR2] = dr_interception,
+ [SVM_EXIT_WRITE_DR3] = dr_interception,
+ [SVM_EXIT_WRITE_DR4] = dr_interception,
+ [SVM_EXIT_WRITE_DR5] = dr_interception,
+ [SVM_EXIT_WRITE_DR6] = dr_interception,
+ [SVM_EXIT_WRITE_DR7] = dr_interception,
[SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
[SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
[SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
@@ -2854,6 +3093,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_WBINVD] = emulate_on_interception,
[SVM_EXIT_MONITOR] = invalid_op_interception,
[SVM_EXIT_MWAIT] = invalid_op_interception,
+ [SVM_EXIT_XSETBV] = xsetbv_interception,
[SVM_EXIT_NPF] = pf_interception,
};
@@ -2864,10 +3104,10 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
struct vmcb_save_area *save = &svm->vmcb->save;
pr_err("VMCB Control Area:\n");
- pr_err("cr_read: %04x\n", control->intercept_cr_read);
- pr_err("cr_write: %04x\n", control->intercept_cr_write);
- pr_err("dr_read: %04x\n", control->intercept_dr_read);
- pr_err("dr_write: %04x\n", control->intercept_dr_write);
+ pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff);
+ pr_err("cr_write: %04x\n", control->intercept_cr >> 16);
+ pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff);
+ pr_err("dr_write: %04x\n", control->intercept_dr >> 16);
pr_err("exceptions: %08x\n", control->intercept_exceptions);
pr_err("intercepts: %016llx\n", control->intercept);
pr_err("pause filter count: %d\n", control->pause_filter_count);
@@ -2950,15 +3190,23 @@ void dump_vmcb(struct kvm_vcpu *vcpu)
}
+static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+ struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
+
+ *info1 = control->exit_info_1;
+ *info2 = control->exit_info_2;
+}
+
static int handle_exit(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct kvm_run *kvm_run = vcpu->run;
u32 exit_code = svm->vmcb->control.exit_code;
- trace_kvm_exit(exit_code, vcpu);
+ trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
- if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
+ if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
if (npt_enabled)
vcpu->arch.cr3 = svm->vmcb->save.cr3;
@@ -2970,7 +3218,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
return 1;
}
- if (is_nested(svm)) {
+ if (is_guest_mode(vcpu)) {
int vmexit;
trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
@@ -3033,7 +3281,6 @@ static void pre_svm_run(struct vcpu_svm *svm)
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
/* FIXME: handle wraparound of asid_generation */
if (svm->asid_generation != sd->asid_generation)
new_asid(svm, sd);
@@ -3045,7 +3292,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
vcpu->arch.hflags |= HF_NMI_MASK;
- svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
+ set_intercept(svm, INTERCEPT_IRET);
++vcpu->stat.nmi_injections;
}
@@ -3058,6 +3305,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
control->int_ctl &= ~V_INTR_PRIO_MASK;
control->int_ctl |= V_IRQ_MASK |
((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
+ mark_dirty(svm->vmcb, VMCB_INTR);
}
static void svm_set_irq(struct kvm_vcpu *vcpu)
@@ -3077,14 +3325,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
return;
if (irr == -1)
return;
if (tpr >= irr)
- svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
+ set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
}
static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -3112,10 +3360,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
if (masked) {
svm->vcpu.arch.hflags |= HF_NMI_MASK;
- svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
+ set_intercept(svm, INTERCEPT_IRET);
} else {
svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
- svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
+ clr_intercept(svm, INTERCEPT_IRET);
}
}
@@ -3131,7 +3379,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
- if (is_nested(svm))
+ if (is_guest_mode(vcpu))
return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
return ret;
@@ -3177,7 +3425,12 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
static void svm_flush_tlb(struct kvm_vcpu *vcpu)
{
- force_new_asid(vcpu);
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
+ else
+ svm->asid_generation--;
}
static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
@@ -3188,10 +3441,10 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
return;
- if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
+ if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
kvm_set_cr8(vcpu, cr8);
}
@@ -3202,7 +3455,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
u64 cr8;
- if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK))
+ if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
return;
cr8 = kvm_get_cr8(vcpu);
@@ -3289,9 +3542,6 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
static void svm_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- u16 fs_selector;
- u16 gs_selector;
- u16 ldt_selector;
svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -3308,10 +3558,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
sync_lapic_to_cr8(vcpu);
- save_host_msrs(vcpu);
- savesegment(fs, fs_selector);
- savesegment(gs, gs_selector);
- ldt_selector = kvm_read_ldt();
svm->vmcb->save.cr2 = vcpu->arch.cr2;
clgi();
@@ -3389,20 +3635,11 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
#endif
);
- vcpu->arch.cr2 = svm->vmcb->save.cr2;
- vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
- vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
- vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
-
- load_host_msrs(vcpu);
- loadsegment(fs, fs_selector);
#ifdef CONFIG_X86_64
- load_gs_index(gs_selector);
- wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
+ wrmsrl(MSR_GS_BASE, svm->host.gs_base);
#else
- loadsegment(gs, gs_selector);
+ loadsegment(fs, svm->host.fs);
#endif
- kvm_load_ldt(ldt_selector);
reload_tss(vcpu);
@@ -3410,10 +3647,21 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
stgi();
+ vcpu->arch.cr2 = svm->vmcb->save.cr2;
+ vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
+ vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
+ vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
+
sync_cr8_to_lapic(vcpu);
svm->next_rip = 0;
+ svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+
+ /* if exit due to PF check for async PF */
+ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
+ svm->apf_reason = kvm_read_and_reset_pf_reason();
+
if (npt_enabled) {
vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
@@ -3426,6 +3674,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
if (unlikely(svm->vmcb->control.exit_code ==
SVM_EXIT_EXCP_BASE + MC_VECTOR))
svm_handle_mce(svm);
+
+ mark_all_clean(svm->vmcb);
}
#undef R
@@ -3435,7 +3685,8 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->save.cr3 = root;
- force_new_asid(vcpu);
+ mark_dirty(svm->vmcb, VMCB_CR);
+ svm_flush_tlb(vcpu);
}
static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
@@ -3443,11 +3694,13 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
struct vcpu_svm *svm = to_svm(vcpu);
svm->vmcb->control.nested_cr3 = root;
+ mark_dirty(svm->vmcb, VMCB_NPT);
/* Also sync guest cr3 here in case we live migrate */
- svm->vmcb->save.cr3 = vcpu->arch.cr3;
+ svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
+ mark_dirty(svm->vmcb, VMCB_CR);
- force_new_asid(vcpu);
+ svm_flush_tlb(vcpu);
}
static int is_disabled(void)
@@ -3507,7 +3760,7 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
additional features */
/* Support next_rip if host supports it */
- if (svm_has(SVM_FEATURE_NRIP))
+ if (boot_cpu_has(X86_FEATURE_NRIPS))
entry->edx |= SVM_FEATURE_NRIP;
/* Support NPT for the guest if enabled */
@@ -3567,6 +3820,7 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
{ SVM_EXIT_WBINVD, "wbinvd" },
{ SVM_EXIT_MONITOR, "monitor" },
{ SVM_EXIT_MWAIT, "mwait" },
+ { SVM_EXIT_XSETBV, "xsetbv" },
{ SVM_EXIT_NPF, "npf" },
{ -1, NULL }
};
@@ -3590,9 +3844,7 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
- if (is_nested(svm))
- svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
+ set_exception_intercept(svm, NM_VECTOR);
update_cr0_intercept(svm);
}
@@ -3623,6 +3875,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
.decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
+ .decache_cr3 = svm_decache_cr3,
.decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
.set_cr0 = svm_set_cr0,
.set_cr3 = svm_set_cr3,
@@ -3663,7 +3916,9 @@ static struct kvm_x86_ops svm_x86_ops = {
.get_tdp_level = get_npt_level,
.get_mt_mask = svm_get_mt_mask,
+ .get_exit_info = svm_get_exit_info,
.exit_reasons_str = svm_exit_reasons_str,
+
.get_lpage_level = svm_get_lpage_level,
.cpuid_update = svm_cpuid_update,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index a6544b8e7c0f..1357d7cf4ec8 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -178,27 +178,36 @@ TRACE_EVENT(kvm_apic,
#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
+#define KVM_ISA_VMX 1
+#define KVM_ISA_SVM 2
+
/*
* Tracepoint for kvm guest exit:
*/
TRACE_EVENT(kvm_exit,
- TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu),
- TP_ARGS(exit_reason, vcpu),
+ TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
+ TP_ARGS(exit_reason, vcpu, isa),
TP_STRUCT__entry(
__field( unsigned int, exit_reason )
__field( unsigned long, guest_rip )
+ __field( u32, isa )
+ __field( u64, info1 )
+ __field( u64, info2 )
),
TP_fast_assign(
__entry->exit_reason = exit_reason;
__entry->guest_rip = kvm_rip_read(vcpu);
+ __entry->isa = isa;
+ kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
+ &__entry->info2);
),
- TP_printk("reason %s rip 0x%lx",
+ TP_printk("reason %s rip 0x%lx info %llx %llx",
ftrace_print_symbols_seq(p, __entry->exit_reason,
kvm_x86_ops->exit_reasons_str),
- __entry->guest_rip)
+ __entry->guest_rip, __entry->info1, __entry->info2)
);
/*
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8da0e45ff7c9..bf89ec2cfb82 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
static int __read_mostly vmm_exclusive = 1;
module_param(vmm_exclusive, bool, S_IRUGO);
+static int __read_mostly yield_on_hlt = 1;
+module_param(yield_on_hlt, bool, S_IRUGO);
+
#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
#define KVM_GUEST_CR0_MASK \
@@ -177,6 +180,7 @@ static int init_rmode(struct kvm *kvm);
static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
+static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -188,6 +192,8 @@ static unsigned long *vmx_io_bitmap_b;
static unsigned long *vmx_msr_bitmap_legacy;
static unsigned long *vmx_msr_bitmap_longmode;
+static bool cpu_has_load_ia32_efer;
+
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -472,7 +478,7 @@ static void vmcs_clear(struct vmcs *vmcs)
u8 error;
asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
- : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+ : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
if (error)
printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
@@ -485,7 +491,7 @@ static void vmcs_load(struct vmcs *vmcs)
u8 error;
asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
- : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+ : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
: "cc", "memory");
if (error)
printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
@@ -565,10 +571,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
static unsigned long vmcs_readl(unsigned long field)
{
- unsigned long value;
+ unsigned long value = 0;
asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
- : "=a"(value) : "d"(field) : "cc");
+ : "+a"(value) : "d"(field) : "cc");
return value;
}
@@ -661,6 +667,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
unsigned i;
struct msr_autoload *m = &vmx->msr_autoload;
+ if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
+ vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
+ vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
+ return;
+ }
+
for (i = 0; i < m->nr; ++i)
if (m->guest[i].index == msr)
break;
@@ -680,6 +692,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
unsigned i;
struct msr_autoload *m = &vmx->msr_autoload;
+ if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
+ vmcs_write64(GUEST_IA32_EFER, guest_val);
+ vmcs_write64(HOST_IA32_EFER, host_val);
+ vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
+ vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
+ return;
+ }
+
for (i = 0; i < m->nr; ++i)
if (m->guest[i].index == msr)
break;
@@ -821,10 +841,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
#endif
#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu)) {
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
+ if (is_long_mode(&vmx->vcpu))
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
- }
#endif
for (i = 0; i < vmx->save_nmsrs; ++i)
kvm_set_shared_msr(vmx->guest_msrs[i].index,
@@ -839,23 +858,23 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
++vmx->vcpu.stat.host_state_reload;
vmx->host_state.loaded = 0;
- if (vmx->host_state.fs_reload_needed)
- loadsegment(fs, vmx->host_state.fs_sel);
+#ifdef CONFIG_X86_64
+ if (is_long_mode(&vmx->vcpu))
+ rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
+#endif
if (vmx->host_state.gs_ldt_reload_needed) {
kvm_load_ldt(vmx->host_state.ldt_sel);
#ifdef CONFIG_X86_64
load_gs_index(vmx->host_state.gs_sel);
- wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
#else
loadsegment(gs, vmx->host_state.gs_sel);
#endif
}
+ if (vmx->host_state.fs_reload_needed)
+ loadsegment(fs, vmx->host_state.fs_sel);
reload_tss();
#ifdef CONFIG_X86_64
- if (is_long_mode(&vmx->vcpu)) {
- rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
- wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
- }
+ wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
if (current_thread_info()->status & TS_USEDFPU)
clts();
@@ -1010,6 +1029,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
vmx_set_interrupt_shadow(vcpu, 0);
}
+static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
+{
+ /* Ensure that we clear the HLT state in the VMCS. We don't need to
+ * explicitly skip the instruction because if the HLT state is set, then
+ * the instruction is already executing and RIP has already been
+ * advanced. */
+ if (!yield_on_hlt &&
+ vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
+}
+
static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
bool has_error_code, u32 error_code,
bool reinject)
@@ -1036,6 +1066,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
intr_info |= INTR_TYPE_HARD_EXCEPTION;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
+ vmx_clear_hlt(vcpu);
}
static bool vmx_rdtscp_supported(void)
@@ -1306,8 +1337,11 @@ static __init int vmx_disabled_by_bios(void)
&& tboot_enabled())
return 1;
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
- && !tboot_enabled())
+ && !tboot_enabled()) {
+ printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
+ " activate TXT before enabling KVM\n");
return 1;
+ }
}
return 0;
@@ -1401,6 +1435,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
return 0;
}
+static __init bool allow_1_setting(u32 msr, u32 ctl)
+{
+ u32 vmx_msr_low, vmx_msr_high;
+
+ rdmsr(msr, vmx_msr_low, vmx_msr_high);
+ return vmx_msr_high & ctl;
+}
+
static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
{
u32 vmx_msr_low, vmx_msr_high;
@@ -1417,7 +1459,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
&_pin_based_exec_control) < 0)
return -EIO;
- min = CPU_BASED_HLT_EXITING |
+ min =
#ifdef CONFIG_X86_64
CPU_BASED_CR8_LOAD_EXITING |
CPU_BASED_CR8_STORE_EXITING |
@@ -1430,6 +1472,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING;
+
+ if (yield_on_hlt)
+ min |= CPU_BASED_HLT_EXITING;
+
opt = CPU_BASED_TPR_SHADOW |
CPU_BASED_USE_MSR_BITMAPS |
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1511,6 +1557,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
+ cpu_has_load_ia32_efer =
+ allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
+ VM_ENTRY_LOAD_IA32_EFER)
+ && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
+ VM_EXIT_LOAD_IA32_EFER);
+
return 0;
}
@@ -1684,9 +1736,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
save->limit = vmcs_read32(sf->limit);
save->ar = vmcs_read32(sf->ar_bytes);
vmcs_write16(sf->selector, save->base >> 4);
- vmcs_write32(sf->base, save->base & 0xfffff);
+ vmcs_write32(sf->base, save->base & 0xffff0);
vmcs_write32(sf->limit, 0xffff);
vmcs_write32(sf->ar_bytes, 0xf3);
+ if (save->base & 0xf)
+ printk_once(KERN_WARNING "kvm: segment base is not paragraph"
+ " aligned when entering protected mode (seg=%d)",
+ seg);
}
static void enter_rmode(struct kvm_vcpu *vcpu)
@@ -1815,6 +1871,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
}
+static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
+{
+ if (enable_ept && is_paging(vcpu))
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+}
+
static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
{
ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
@@ -1858,6 +1921,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
unsigned long cr0,
struct kvm_vcpu *vcpu)
{
+ vmx_decache_cr3(vcpu);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1938,7 +2002,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (enable_ept) {
eptp = construct_eptp(cr3);
vmcs_write64(EPT_POINTER, eptp);
- guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
+ guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
vcpu->kvm->arch.ept_identity_map_addr;
ept_load_pdptrs(vcpu);
}
@@ -2726,7 +2790,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_writel(GUEST_IDTR_BASE, 0);
vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
- vmcs_write32(GUEST_ACTIVITY_STATE, 0);
+ vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
@@ -2788,6 +2852,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
return;
}
+ if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+ enable_irq_window(vcpu);
+ return;
+ }
cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -2815,6 +2883,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
} else
intr |= INTR_TYPE_EXT_INTR;
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
+ vmx_clear_hlt(vcpu);
}
static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2842,6 +2911,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
}
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
+ vmx_clear_hlt(vcpu);
}
static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -2850,7 +2920,8 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
return 0;
return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI));
+ (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
+ | GUEST_INTR_STATE_NMI));
}
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
@@ -2911,7 +2982,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
* Cause the #SS fault with 0 error code in VM86 mode.
*/
if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
- if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
+ if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
return 1;
/*
* Forward all other exceptions that are valid in real mode.
@@ -3008,7 +3079,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
}
if (is_invalid_opcode(intr_info)) {
- er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
+ er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
if (er != EMULATE_DONE)
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
@@ -3027,7 +3098,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, cr2);
- return kvm_mmu_page_fault(vcpu, cr2, error_code);
+ return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
}
if (vmx->rmode.vm86_active &&
@@ -3099,7 +3170,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
++vcpu->stat.io_exits;
if (string || in)
- return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
@@ -3119,14 +3190,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
hypercall[2] = 0xc1;
}
-static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
-{
- if (err)
- kvm_inject_gp(vcpu, 0);
- else
- skip_emulated_instruction(vcpu);
-}
-
static int handle_cr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification, val;
@@ -3144,21 +3207,21 @@ static int handle_cr(struct kvm_vcpu *vcpu)
switch (cr) {
case 0:
err = kvm_set_cr0(vcpu, val);
- complete_insn_gp(vcpu, err);
+ kvm_complete_insn_gp(vcpu, err);
return 1;
case 3:
err = kvm_set_cr3(vcpu, val);
- complete_insn_gp(vcpu, err);
+ kvm_complete_insn_gp(vcpu, err);
return 1;
case 4:
err = kvm_set_cr4(vcpu, val);
- complete_insn_gp(vcpu, err);
+ kvm_complete_insn_gp(vcpu, err);
return 1;
case 8: {
u8 cr8_prev = kvm_get_cr8(vcpu);
u8 cr8 = kvm_register_read(vcpu, reg);
- kvm_set_cr8(vcpu, cr8);
- skip_emulated_instruction(vcpu);
+ err = kvm_set_cr8(vcpu, cr8);
+ kvm_complete_insn_gp(vcpu, err);
if (irqchip_in_kernel(vcpu->kvm))
return 1;
if (cr8_prev <= cr8)
@@ -3177,8 +3240,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
case 1: /*mov from cr*/
switch (cr) {
case 3:
- kvm_register_write(vcpu, reg, vcpu->arch.cr3);
- trace_kvm_cr_read(cr, vcpu->arch.cr3);
+ val = kvm_read_cr3(vcpu);
+ kvm_register_write(vcpu, reg, val);
+ trace_kvm_cr_read(cr, val);
skip_emulated_instruction(vcpu);
return 1;
case 8:
@@ -3350,6 +3414,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_invd(struct kvm_vcpu *vcpu)
+{
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
+}
+
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3378,7 +3447,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
static int handle_apic_access(struct kvm_vcpu *vcpu)
{
- return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+ return emulate_instruction(vcpu, 0) == EMULATE_DONE;
}
static int handle_task_switch(struct kvm_vcpu *vcpu)
@@ -3477,7 +3546,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
- return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);
+ return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
}
static u64 ept_rsvd_mask(u64 spte, int level)
@@ -3593,7 +3662,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
&& (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
return handle_interrupt_window(&vmx->vcpu);
- err = emulate_instruction(vcpu, 0, 0, 0);
+ err = emulate_instruction(vcpu, 0);
if (err == EMULATE_DO_MMIO) {
ret = 0;
@@ -3650,6 +3719,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_MSR_WRITE] = handle_wrmsr,
[EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
[EXIT_REASON_HLT] = handle_halt,
+ [EXIT_REASON_INVD] = handle_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_VMCALL] = handle_vmcall,
[EXIT_REASON_VMCLEAR] = handle_vmx_insn,
@@ -3677,6 +3747,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+{
+ *info1 = vmcs_readl(EXIT_QUALIFICATION);
+ *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+}
+
/*
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
@@ -3687,17 +3763,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
- trace_kvm_exit(exit_reason, vcpu);
+ trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
/* If guest state is invalid, start emulating */
if (vmx->emulation_required && emulate_invalid_guest_state)
return handle_invalid_guest_state(vcpu);
- /* Access CR3 don't cause VMExit in paging mode, so we need
- * to sync with guest real CR3. */
- if (enable_ept && is_paging(vcpu))
- vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
-
if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -4014,7 +4085,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
);
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
- | (1 << VCPU_EXREG_PDPTR));
+ | (1 << VCPU_EXREG_PDPTR)
+ | (1 << VCPU_EXREG_CR3));
vcpu->arch.regs_dirty = 0;
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
@@ -4228,11 +4300,6 @@ static int vmx_get_lpage_level(void)
return PT_PDPE_LEVEL;
}
-static inline u32 bit(int bitno)
-{
- return 1 << (bitno & 31);
-}
-
static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -4286,6 +4353,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
.decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
+ .decache_cr3 = vmx_decache_cr3,
.decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
.set_cr0 = vmx_set_cr0,
.set_cr3 = vmx_set_cr3,
@@ -4326,7 +4394,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
.get_tdp_level = get_ept_level,
.get_mt_mask = vmx_get_mt_mask,
+ .get_exit_info = vmx_get_exit_info,
.exit_reasons_str = vmx_exit_reasons_str,
+
.get_lpage_level = vmx_get_lpage_level,
.cpuid_update = vmx_cpuid_update,
@@ -4402,8 +4472,6 @@ static int __init vmx_init(void)
if (enable_ept) {
bypass_guest_pf = 0;
- kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
- VMX_EPT_WRITABLE_MASK);
kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
VMX_EPT_EXECUTABLE_MASK);
kvm_enable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cdac9e592aa5..bcc0efce85bf 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -43,6 +43,7 @@
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/uaccess.h>
+#include <linux/hash.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
@@ -155,9 +156,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
u64 __read_mostly host_xcr0;
-static inline u32 bit(int bitno)
+static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
- return 1 << (bitno & 31);
+ int i;
+ for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+ vcpu->arch.apf.gfns[i] = ~0;
}
static void kvm_on_user_return(struct user_return_notifier *urn)
@@ -331,23 +334,28 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);
-void kvm_inject_page_fault(struct kvm_vcpu *vcpu)
+void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
- unsigned error_code = vcpu->arch.fault.error_code;
+ if (err)
+ kvm_inject_gp(vcpu, 0);
+ else
+ kvm_x86_ops->skip_emulated_instruction(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
+void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+{
++vcpu->stat.pf_guest;
- vcpu->arch.cr2 = vcpu->arch.fault.address;
- kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
+ vcpu->arch.cr2 = fault->address;
+ kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
}
-void kvm_propagate_fault(struct kvm_vcpu *vcpu)
+void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
- if (mmu_is_nested(vcpu) && !vcpu->arch.fault.nested)
- vcpu->arch.nested_mmu.inject_page_fault(vcpu);
+ if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
+ vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
else
- vcpu->arch.mmu.inject_page_fault(vcpu);
-
- vcpu->arch.fault.nested = false;
+ vcpu->arch.mmu.inject_page_fault(vcpu, fault);
}
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -465,8 +473,8 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
(unsigned long *)&vcpu->arch.regs_avail))
return true;
- gfn = (vcpu->arch.cr3 & ~31u) >> PAGE_SHIFT;
- offset = (vcpu->arch.cr3 & ~31u) & (PAGE_SIZE - 1);
+ gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
+ offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
PFERR_USER_MASK | PFERR_WRITE_MASK);
if (r < 0)
@@ -511,12 +519,15 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
} else
#endif
if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
- vcpu->arch.cr3))
+ kvm_read_cr3(vcpu)))
return 1;
}
kvm_x86_ops->set_cr0(vcpu, cr0);
+ if ((cr0 ^ old_cr0) & X86_CR0_PG)
+ kvm_clear_async_pf_completion_queue(vcpu);
+
if ((cr0 ^ old_cr0) & update_bits)
kvm_mmu_reset_context(vcpu);
return 0;
@@ -600,7 +611,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
&& ((cr4 ^ old_cr4) & pdptr_bits)
- && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3))
+ && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
+ kvm_read_cr3(vcpu)))
return 1;
if (cr4 & X86_CR4_VMXE)
@@ -620,7 +632,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
- if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
+ if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
kvm_mmu_sync_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
return 0;
@@ -655,12 +667,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
return 1;
vcpu->arch.cr3 = cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
vcpu->arch.mmu.new_cr3(vcpu);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);
-int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
if (cr8 & CR8_RESERVED_BITS)
return 1;
@@ -670,12 +683,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
vcpu->arch.cr8 = cr8;
return 0;
}
-
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
-{
- if (__kvm_set_cr8(vcpu, cr8))
- kvm_inject_gp(vcpu, 0);
-}
EXPORT_SYMBOL_GPL(kvm_set_cr8);
unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
@@ -780,12 +787,12 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
* kvm-specific. Those are put in the beginning of the list.
*/
-#define KVM_SAVE_MSRS_BEGIN 7
+#define KVM_SAVE_MSRS_BEGIN 8
static u32 msrs_to_save[] = {
MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
- HV_X64_MSR_APIC_ASSIST_PAGE,
+ HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_STAR,
#ifdef CONFIG_X86_64
@@ -835,7 +842,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
kvm_x86_ops->set_efer(vcpu, efer);
vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
- kvm_mmu_reset_context(vcpu);
/* Update reserved bits */
if ((efer ^ old_efer) & EFER_NX)
@@ -981,7 +987,7 @@ static inline u64 nsec_to_cycles(u64 nsec)
if (kvm_tsc_changes_freq())
printk_once(KERN_WARNING
"kvm: unreliable cycle conversion on adjustable rate TSC\n");
- ret = nsec * __get_cpu_var(cpu_tsc_khz);
+ ret = nsec * __this_cpu_read(cpu_tsc_khz);
do_div(ret, USEC_PER_SEC);
return ret;
}
@@ -1066,7 +1072,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
local_irq_save(flags);
kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
kernel_ns = get_kernel_ns();
- this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
+ this_tsc_khz = __this_cpu_read(cpu_tsc_khz);
if (unlikely(this_tsc_khz == 0)) {
local_irq_restore(flags);
@@ -1423,6 +1429,30 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
return 0;
}
+static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
+{
+ gpa_t gpa = data & ~0x3f;
+
+ /* Bits 2:5 are resrved, Should be zero */
+ if (data & 0x3c)
+ return 1;
+
+ vcpu->arch.apf.msr_val = data;
+
+ if (!(data & KVM_ASYNC_PF_ENABLED)) {
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ return 0;
+ }
+
+ if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
+ return 1;
+
+ vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
+ kvm_async_pf_wakeup_all(vcpu);
+ return 0;
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
@@ -1504,6 +1534,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
}
break;
}
+ case MSR_KVM_ASYNC_PF_EN:
+ if (kvm_pv_enable_async_pf(vcpu, data))
+ return 1;
+ break;
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1780,6 +1814,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_KVM_SYSTEM_TIME_NEW:
data = vcpu->arch.time;
break;
+ case MSR_KVM_ASYNC_PF_EN:
+ data = vcpu->arch.apf.msr_val;
+ break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
case MSR_IA32_MCG_CAP:
@@ -1909,6 +1946,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_NOP_IO_DELAY:
case KVM_CAP_MP_STATE:
case KVM_CAP_SYNC_MMU:
+ case KVM_CAP_USER_NMI:
case KVM_CAP_REINJECT_CONTROL:
case KVM_CAP_IRQ_INJECT_STATUS:
case KVM_CAP_ASSIGN_DEV_IRQ:
@@ -1927,6 +1965,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_DEBUGREGS:
case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:
+ case KVM_CAP_ASYNC_PF:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -2190,6 +2229,11 @@ out:
return r;
}
+static void cpuid_mask(u32 *word, int wordnum)
+{
+ *word &= boot_cpu_data.x86_capability[wordnum];
+}
+
static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u32 index)
{
@@ -2264,7 +2308,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
break;
case 1:
entry->edx &= kvm_supported_word0_x86_features;
+ cpuid_mask(&entry->edx, 0);
entry->ecx &= kvm_supported_word4_x86_features;
+ cpuid_mask(&entry->ecx, 4);
/* we support x2apic emulation even if host does not support
* it since we emulate x2apic in software */
entry->ecx |= F(X2APIC);
@@ -2355,7 +2401,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
break;
case 0x80000001:
entry->edx &= kvm_supported_word1_x86_features;
+ cpuid_mask(&entry->edx, 1);
entry->ecx &= kvm_supported_word6_x86_features;
+ cpuid_mask(&entry->ecx, 6);
break;
}
@@ -3174,20 +3222,18 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memslots *slots, *old_slots;
unsigned long *dirty_bitmap;
- r = -ENOMEM;
- dirty_bitmap = vmalloc(n);
- if (!dirty_bitmap)
- goto out;
+ dirty_bitmap = memslot->dirty_bitmap_head;
+ if (memslot->dirty_bitmap == dirty_bitmap)
+ dirty_bitmap += n / sizeof(long);
memset(dirty_bitmap, 0, n);
r = -ENOMEM;
slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
- if (!slots) {
- vfree(dirty_bitmap);
+ if (!slots)
goto out;
- }
memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
+ slots->generation++;
old_slots = kvm->memslots;
rcu_assign_pointer(kvm->memslots, slots);
@@ -3200,11 +3246,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
spin_unlock(&kvm->mmu_lock);
r = -EFAULT;
- if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
- vfree(dirty_bitmap);
+ if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
goto out;
- }
- vfree(dirty_bitmap);
} else {
r = -EFAULT;
if (clear_user(log->dirty_bitmap, n))
@@ -3271,8 +3314,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (vpic) {
r = kvm_ioapic_init(kvm);
if (r) {
+ mutex_lock(&kvm->slots_lock);
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
&vpic->dev);
+ mutex_unlock(&kvm->slots_lock);
kfree(vpic);
goto create_irqchip_unlock;
}
@@ -3283,10 +3328,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
smp_wmb();
r = kvm_setup_default_irq_routing(kvm);
if (r) {
+ mutex_lock(&kvm->slots_lock);
mutex_lock(&kvm->irq_lock);
kvm_ioapic_destroy(kvm);
kvm_destroy_pic(kvm);
mutex_unlock(&kvm->irq_lock);
+ mutex_unlock(&kvm->slots_lock);
}
create_irqchip_unlock:
mutex_unlock(&kvm->lock);
@@ -3562,63 +3609,63 @@ static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
{
gpa_t t_gpa;
- u32 error;
+ struct x86_exception exception;
BUG_ON(!mmu_is_nested(vcpu));
/* NPT walks are always user-walks */
access |= PFERR_USER_MASK;
- t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &error);
- if (t_gpa == UNMAPPED_GVA)
- vcpu->arch.fault.nested = true;
+ t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
return t_gpa;
}
-gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
- gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+ gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
-gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, error);
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
/* uses this to access any guest's mapped memory without checking CPL */
-gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
+gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
+ struct x86_exception *exception)
{
- return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, error);
+ return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
}
static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
struct kvm_vcpu *vcpu, u32 access,
- u32 *error)
+ struct x86_exception *exception)
{
void *data = val;
int r = X86EMUL_CONTINUE;
while (bytes) {
gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
- error);
+ exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
- if (gpa == UNMAPPED_GVA) {
- r = X86EMUL_PROPAGATE_FAULT;
- goto out;
- }
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
@@ -3635,31 +3682,35 @@ out:
/* used for instruction fetching */
static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu, u32 *error)
+ struct kvm_vcpu *vcpu,
+ struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
- access | PFERR_FETCH_MASK, error);
+ access | PFERR_FETCH_MASK,
+ exception);
}
static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu, u32 *error)
+ struct kvm_vcpu *vcpu,
+ struct x86_exception *exception)
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
- error);
+ exception);
}
static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
- struct kvm_vcpu *vcpu, u32 *error)
+ struct kvm_vcpu *vcpu,
+ struct x86_exception *exception)
{
- return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
+ return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
}
static int kvm_write_guest_virt_system(gva_t addr, void *val,
unsigned int bytes,
struct kvm_vcpu *vcpu,
- u32 *error)
+ struct x86_exception *exception)
{
void *data = val;
int r = X86EMUL_CONTINUE;
@@ -3667,15 +3718,13 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
while (bytes) {
gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
PFERR_WRITE_MASK,
- error);
+ exception);
unsigned offset = addr & (PAGE_SIZE-1);
unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
int ret;
- if (gpa == UNMAPPED_GVA) {
- r = X86EMUL_PROPAGATE_FAULT;
- goto out;
- }
+ if (gpa == UNMAPPED_GVA)
+ return X86EMUL_PROPAGATE_FAULT;
ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
if (ret < 0) {
r = X86EMUL_IO_NEEDED;
@@ -3693,7 +3742,7 @@ out:
static int emulator_read_emulated(unsigned long addr,
void *val,
unsigned int bytes,
- unsigned int *error_code,
+ struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
@@ -3706,7 +3755,7 @@ static int emulator_read_emulated(unsigned long addr,
return X86EMUL_CONTINUE;
}
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
+ gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
@@ -3715,8 +3764,8 @@ static int emulator_read_emulated(unsigned long addr,
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto mmio;
- if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
- == X86EMUL_CONTINUE)
+ if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception)
+ == X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
mmio:
@@ -3740,7 +3789,7 @@ mmio:
}
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
- const void *val, int bytes)
+ const void *val, int bytes)
{
int ret;
@@ -3754,12 +3803,12 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
static int emulator_write_emulated_onepage(unsigned long addr,
const void *val,
unsigned int bytes,
- unsigned int *error_code,
+ struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
@@ -3792,7 +3841,7 @@ mmio:
int emulator_write_emulated(unsigned long addr,
const void *val,
unsigned int bytes,
- unsigned int *error_code,
+ struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
/* Crossing a page boundary? */
@@ -3800,7 +3849,7 @@ int emulator_write_emulated(unsigned long addr,
int rc, now;
now = -addr & ~PAGE_MASK;
- rc = emulator_write_emulated_onepage(addr, val, now, error_code,
+ rc = emulator_write_emulated_onepage(addr, val, now, exception,
vcpu);
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -3808,7 +3857,7 @@ int emulator_write_emulated(unsigned long addr,
val += now;
bytes -= now;
}
- return emulator_write_emulated_onepage(addr, val, bytes, error_code,
+ return emulator_write_emulated_onepage(addr, val, bytes, exception,
vcpu);
}
@@ -3826,7 +3875,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
const void *old,
const void *new,
unsigned int bytes,
- unsigned int *error_code,
+ struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
gpa_t gpa;
@@ -3884,7 +3933,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
emul_write:
printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
- return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
+ return emulator_write_emulated(addr, new, bytes, exception, vcpu);
}
static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3909,7 +3958,7 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
if (vcpu->arch.pio.count)
goto data_avail;
- trace_kvm_pio(0, port, size, 1);
+ trace_kvm_pio(0, port, size, count);
vcpu->arch.pio.port = port;
vcpu->arch.pio.in = 1;
@@ -3937,7 +3986,7 @@ static int emulator_pio_out_emulated(int size, unsigned short port,
const void *val, unsigned int count,
struct kvm_vcpu *vcpu)
{
- trace_kvm_pio(1, port, size, 1);
+ trace_kvm_pio(1, port, size, count);
vcpu->arch.pio.port = port;
vcpu->arch.pio.in = 0;
@@ -3978,13 +4027,15 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
return X86EMUL_CONTINUE;
if (kvm_x86_ops->has_wbinvd_exit()) {
- preempt_disable();
+ int cpu = get_cpu();
+
+ cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
wbinvd_ipi, NULL, 1);
- preempt_enable();
+ put_cpu();
cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
- }
- wbinvd();
+ } else
+ wbinvd();
return X86EMUL_CONTINUE;
}
EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
@@ -4024,7 +4075,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
value = vcpu->arch.cr2;
break;
case 3:
- value = vcpu->arch.cr3;
+ value = kvm_read_cr3(vcpu);
break;
case 4:
value = kvm_read_cr4(vcpu);
@@ -4058,7 +4109,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
break;
case 8:
- res = __kvm_set_cr8(vcpu, val & 0xfUL);
+ res = kvm_set_cr8(vcpu, val);
break;
default:
vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
@@ -4211,12 +4262,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
static void inject_emulated_exception(struct kvm_vcpu *vcpu)
{
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
- if (ctxt->exception == PF_VECTOR)
- kvm_propagate_fault(vcpu);
- else if (ctxt->error_code_valid)
- kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
+ if (ctxt->exception.vector == PF_VECTOR)
+ kvm_propagate_fault(vcpu, &ctxt->exception);
+ else if (ctxt->exception.error_code_valid)
+ kvm_queue_exception_e(vcpu, ctxt->exception.vector,
+ ctxt->exception.error_code);
else
- kvm_queue_exception(vcpu, ctxt->exception);
+ kvm_queue_exception(vcpu, ctxt->exception.vector);
}
static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
@@ -4272,13 +4324,19 @@ EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
static int handle_emulation_failure(struct kvm_vcpu *vcpu)
{
+ int r = EMULATE_DONE;
+
++vcpu->stat.insn_emulation_fail;
trace_kvm_emulate_insn_failed(vcpu);
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- vcpu->run->internal.ndata = 0;
+ if (!is_guest_mode(vcpu)) {
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ r = EMULATE_FAIL;
+ }
kvm_queue_exception(vcpu, UD_VECTOR);
- return EMULATE_FAIL;
+
+ return r;
}
static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
@@ -4307,10 +4365,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
return false;
}
-int emulate_instruction(struct kvm_vcpu *vcpu,
- unsigned long cr2,
- u16 error_code,
- int emulation_type)
+int x86_emulate_instruction(struct kvm_vcpu *vcpu,
+ unsigned long cr2,
+ int emulation_type,
+ void *insn,
+ int insn_len)
{
int r;
struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
@@ -4328,10 +4387,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
init_emulate_ctxt(vcpu);
vcpu->arch.emulate_ctxt.interruptibility = 0;
- vcpu->arch.emulate_ctxt.exception = -1;
+ vcpu->arch.emulate_ctxt.have_exception = false;
vcpu->arch.emulate_ctxt.perm_ok = false;
- r = x86_decode_insn(&vcpu->arch.emulate_ctxt);
+ r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
if (r == X86EMUL_PROPAGATE_FAULT)
goto done;
@@ -4394,7 +4453,7 @@ restart:
}
done:
- if (vcpu->arch.emulate_ctxt.exception >= 0) {
+ if (vcpu->arch.emulate_ctxt.have_exception) {
inject_emulated_exception(vcpu);
r = EMULATE_DONE;
} else if (vcpu->arch.pio.count) {
@@ -4418,7 +4477,7 @@ done:
return r;
}
-EXPORT_SYMBOL_GPL(emulate_instruction);
+EXPORT_SYMBOL_GPL(x86_emulate_instruction);
int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
{
@@ -4432,7 +4491,7 @@ EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
static void tsc_bad(void *info)
{
- __get_cpu_var(cpu_tsc_khz) = 0;
+ __this_cpu_write(cpu_tsc_khz, 0);
}
static void tsc_khz_changed(void *data)
@@ -4446,7 +4505,7 @@ static void tsc_khz_changed(void *data)
khz = cpufreq_quick_get(raw_smp_processor_id());
if (!khz)
khz = tsc_khz;
- __get_cpu_var(cpu_tsc_khz) = khz;
+ __this_cpu_write(cpu_tsc_khz, khz);
}
static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -4569,9 +4628,11 @@ static void kvm_timer_init(void)
#ifdef CONFIG_CPU_FREQ
struct cpufreq_policy policy;
memset(&policy, 0, sizeof(policy));
- cpufreq_get_policy(&policy, get_cpu());
+ cpu = get_cpu();
+ cpufreq_get_policy(&policy, cpu);
if (policy.cpuinfo.max_freq)
max_tsc_khz = policy.cpuinfo.max_freq;
+ put_cpu();
#endif
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
@@ -4656,7 +4717,6 @@ int kvm_arch_init(void *opaque)
kvm_x86_ops = ops;
kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
- kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);
@@ -5119,6 +5179,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->fpu_active = 0;
kvm_x86_ops->fpu_deactivate(vcpu);
}
+ if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
+ /* Page is swapped out. Do synthetic halt */
+ vcpu->arch.apf.halted = true;
+ r = 1;
+ goto out;
+ }
}
r = kvm_mmu_reload(vcpu);
@@ -5247,7 +5313,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
r = 1;
while (r > 0) {
- if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
+ if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ !vcpu->arch.apf.halted)
r = vcpu_enter_guest(vcpu);
else {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -5260,6 +5327,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
case KVM_MP_STATE_RUNNABLE:
+ vcpu->arch.apf.halted = false;
break;
case KVM_MP_STATE_SIPI_RECEIVED:
default:
@@ -5281,6 +5349,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
vcpu->run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.request_irq_exits;
}
+
+ kvm_check_async_pf_completion(vcpu);
+
if (signal_pending(current)) {
r = -EINTR;
vcpu->run->exit_reason = KVM_EXIT_INTR;
@@ -5305,6 +5376,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int r;
sigset_t sigsaved;
+ if (!tsk_used_math(current) && init_fpu(current))
+ return -ENOMEM;
+
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
@@ -5316,8 +5390,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
}
/* re-sync apic's tpr */
- if (!irqchip_in_kernel(vcpu->kvm))
- kvm_set_cr8(vcpu, kvm_run->cr8);
+ if (!irqchip_in_kernel(vcpu->kvm)) {
+ if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
+ r = -EINVAL;
+ goto out;
+ }
+ }
if (vcpu->arch.pio.count || vcpu->mmio_needed) {
if (vcpu->mmio_needed) {
@@ -5326,7 +5404,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
vcpu->mmio_needed = 0;
}
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
+ r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
if (r != EMULATE_DONE) {
r = 0;
@@ -5439,7 +5517,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
sregs->cr0 = kvm_read_cr0(vcpu);
sregs->cr2 = vcpu->arch.cr2;
- sregs->cr3 = vcpu->arch.cr3;
+ sregs->cr3 = kvm_read_cr3(vcpu);
sregs->cr4 = kvm_read_cr4(vcpu);
sregs->cr8 = kvm_get_cr8(vcpu);
sregs->efer = vcpu->arch.efer;
@@ -5507,8 +5585,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_x86_ops->set_gdt(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
- mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
+ mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
+ __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
kvm_set_cr8(vcpu, sregs->cr8);
@@ -5522,8 +5601,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
+ if (sregs->cr4 & X86_CR4_OSXSAVE)
+ update_cpuid(vcpu);
if (!is_long_mode(vcpu) && is_pae(vcpu)) {
- load_pdptrs(vcpu, vcpu->arch.walk_mmu, vcpu->arch.cr3);
+ load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
mmu_reset_needed = 1;
}
@@ -5774,6 +5855,8 @@ free_vcpu:
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.apf.msr_val = 0;
+
vcpu_load(vcpu);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
@@ -5793,6 +5876,11 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
vcpu->arch.dr7 = DR7_FIXED_1;
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ vcpu->arch.apf.msr_val = 0;
+
+ kvm_clear_async_pf_completion_queue(vcpu);
+ kvm_async_pf_hash_reset(vcpu);
+ vcpu->arch.apf.halted = false;
return kvm_x86_ops->vcpu_reset(vcpu);
}
@@ -5882,6 +5970,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
goto fail_free_mce_banks;
+ kvm_async_pf_hash_reset(vcpu);
+
return 0;
fail_free_mce_banks:
kfree(vcpu->arch.mce_banks);
@@ -5907,13 +5997,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
free_page((unsigned long)vcpu->arch.pio_data);
}
-struct kvm *kvm_arch_create_vm(void)
+int kvm_arch_init_vm(struct kvm *kvm)
{
- struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
-
- if (!kvm)
- return ERR_PTR(-ENOMEM);
-
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
@@ -5922,7 +6007,7 @@ struct kvm *kvm_arch_create_vm(void)
spin_lock_init(&kvm->arch.tsc_write_lock);
- return kvm;
+ return 0;
}
static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -5940,8 +6025,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
/*
* Unpin any mmu pages first.
*/
- kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_clear_async_pf_completion_queue(vcpu);
kvm_unload_vcpu_mmu(vcpu);
+ }
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_arch_vcpu_free(vcpu);
@@ -5965,13 +6052,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
- kvm_free_physmem(kvm);
if (kvm->arch.apic_access_page)
put_page(kvm->arch.apic_access_page);
if (kvm->arch.ept_identity_pagetable)
put_page(kvm->arch.ept_identity_pagetable);
- cleanup_srcu_struct(&kvm->srcu);
- kfree(kvm);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -6052,7 +6136,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
+ return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
+ !vcpu->arch.apf.halted)
+ || !list_empty_careful(&vcpu->async_pf.done)
|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
|| vcpu->arch.nmi_pending ||
(kvm_arch_interrupt_allowed(vcpu) &&
@@ -6111,6 +6197,147 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
+{
+ int r;
+
+ if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
+ is_error_page(work->page))
+ return;
+
+ r = kvm_mmu_reload(vcpu);
+ if (unlikely(r))
+ return;
+
+ if (!vcpu->arch.mmu.direct_map &&
+ work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
+ return;
+
+ vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
+}
+
+static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
+{
+ return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
+}
+
+static inline u32 kvm_async_pf_next_probe(u32 key)
+{
+ return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
+}
+
+static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u32 key = kvm_async_pf_hash_fn(gfn);
+
+ while (vcpu->arch.apf.gfns[key] != ~0)
+ key = kvm_async_pf_next_probe(key);
+
+ vcpu->arch.apf.gfns[key] = gfn;
+}
+
+static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ int i;
+ u32 key = kvm_async_pf_hash_fn(gfn);
+
+ for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
+ (vcpu->arch.apf.gfns[key] != gfn &&
+ vcpu->arch.apf.gfns[key] != ~0); i++)
+ key = kvm_async_pf_next_probe(key);
+
+ return key;
+}
+
+bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
+}
+
+static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ u32 i, j, k;
+
+ i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
+ while (true) {
+ vcpu->arch.apf.gfns[i] = ~0;
+ do {
+ j = kvm_async_pf_next_probe(j);
+ if (vcpu->arch.apf.gfns[j] == ~0)
+ return;
+ k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
+ /*
+ * k lies cyclically in ]i,j]
+ * | i.k.j |
+ * |....j i.k.| or |.k..j i...|
+ */
+ } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
+ vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
+ i = j;
+ }
+}
+
+static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
+{
+
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
+ sizeof(val));
+}
+
+void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ struct x86_exception fault;
+
+ trace_kvm_async_pf_not_present(work->arch.token, work->gva);
+ kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
+
+ if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
+ (vcpu->arch.apf.send_user_only &&
+ kvm_x86_ops->get_cpl(vcpu) == 0))
+ kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = 0;
+ fault.nested_page_fault = false;
+ fault.address = work->arch.token;
+ kvm_inject_page_fault(vcpu, &fault);
+ }
+}
+
+void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ struct x86_exception fault;
+
+ trace_kvm_async_pf_ready(work->arch.token, work->gva);
+ if (is_error_page(work->page))
+ work->arch.token = ~0; /* broadcast wakeup */
+ else
+ kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
+
+ if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
+ !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = 0;
+ fault.nested_page_fault = false;
+ fault.address = work->arch.token;
+ kvm_inject_page_fault(vcpu, &fault);
+ }
+ vcpu->arch.apf.halted = false;
+}
+
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
+ return true;
+ else
+ return !kvm_event_needs_reinjection(vcpu) &&
+ kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2cea414489f3..c600da830ce0 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -70,6 +70,11 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
}
+static inline u32 bit(int bitno)
+{
+ return 1 << (bitno & 31);
+}
+
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq);
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 38718041efc3..6e121a2a49e1 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,6 +2,7 @@ config LGUEST_GUEST
bool "Lguest guest support"
select PARAVIRT
depends on X86_32
+ select VIRTUALIZATION
select VIRTIO
select VIRTIO_RING
select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 73b1e1a1f489..eba687f0cc0c 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -531,7 +531,10 @@ static void lguest_write_cr3(unsigned long cr3)
{
lguest_data.pgdir = cr3;
lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
- cr3_changed = true;
+
+ /* These two page tables are simple, linear, and used during boot */
+ if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
+ cr3_changed = true;
}
static unsigned long lguest_read_cr3(void)
@@ -703,9 +706,9 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
* to forget all of them. Fortunately, this is very rare.
*
* ... except in early boot when the kernel sets up the initial pagetables,
- * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell
- * the Host anything changed until we've done the first page table switch,
- * which brings boot back to 0.25 seconds.
+ * which makes booting astonishingly slow: 48 seconds! So we don't even tell
+ * the Host anything changed until we've done the first real page table switch,
+ * which brings boot back to 4.3 seconds.
*/
static void lguest_set_pte(pte_t *ptep, pte_t pteval)
{
@@ -821,7 +824,7 @@ static void __init lguest_init_IRQ(void)
for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
/* Some systems map "vectors" to interrupts weirdly. Not us! */
- __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
+ __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
if (i != SYSCALL_VECTOR)
set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
}
@@ -1002,7 +1005,7 @@ static void lguest_time_init(void)
clockevents_register_device(&lguest_clockevent);
/* Finally, we unblock the timer interrupt. */
- enable_lguest_irq(0);
+ clear_bit(0, lguest_data.blocked_interrupts);
}
/*
@@ -1349,9 +1352,6 @@ __init void lguest_init(void)
*/
switch_to_new_gdt(0);
- /* We actually boot with all memory mapped, but let's say 128MB. */
- max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
-
/*
* The Host<->Guest Switcher lives at the top of our address space, and
* the Host told us how big it is when we made LGUEST_INIT hypercall:
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index ff485d361182..fc45ba887d05 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -121,7 +121,7 @@ inline void __const_udelay(unsigned long xloops)
asm("mull %%edx"
:"=d" (xloops), "=&a" (d0)
:"1" (xloops), "0"
- (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4)));
+ (this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4)));
__delay(++xloops);
}
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 55543397a8a7..09df2f9a3d69 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -23,7 +23,7 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
-obj-$(CONFIG_K8_NUMA) += k8topology_64.o
+obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o
obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/amdtopology_64.c
index 804a3b6c6e14..f21962c435ed 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/amdtopology_64.c
@@ -1,8 +1,8 @@
/*
- * AMD K8 NUMA support.
+ * AMD NUMA support.
* Discover the memory map and associated nodes.
*
- * This version reads it directly from the K8 northbridge.
+ * This version reads it directly from the AMD northbridge.
*
* Copyright 2002,2003 Andi Kleen, SuSE Labs.
*/
@@ -27,6 +27,7 @@
#include <asm/amd_nb.h>
static struct bootnode __initdata nodes[8];
+static unsigned char __initdata nodeids[8];
static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
static __init int find_northbridge(void)
@@ -57,7 +58,7 @@ static __init void early_get_boot_cpu_id(void)
{
/*
* need to get the APIC ID of the BSP so can use that to
- * create apicid_to_node in k8_scan_nodes()
+ * create apicid_to_node in amd_scan_nodes()
*/
#ifdef CONFIG_X86_MPPARSE
/*
@@ -66,23 +67,9 @@ static __init void early_get_boot_cpu_id(void)
if (smp_found_config)
early_get_smp_config();
#endif
- early_init_lapic_mapping();
}
-int __init k8_get_nodes(struct bootnode *physnodes)
-{
- int i;
- int ret = 0;
-
- for_each_node_mask(i, nodes_parsed) {
- physnodes[ret].start = nodes[i].start;
- physnodes[ret].end = nodes[i].end;
- ret++;
- }
- return ret;
-}
-
-int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
+int __init amd_numa_init(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long start = PFN_PHYS(start_pfn);
unsigned long end = PFN_PHYS(end_pfn);
@@ -114,7 +101,7 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
base = read_pci_config(0, nb, 1, 0x40 + i*8);
limit = read_pci_config(0, nb, 1, 0x44 + i*8);
- nodeid = limit & 7;
+ nodeids[i] = nodeid = limit & 7;
if ((base & 3) == 0) {
if (i < numnodes)
pr_info("Skipping disabled node %d\n", i);
@@ -194,7 +181,77 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
return 0;
}
-int __init k8_scan_nodes(void)
+#ifdef CONFIG_NUMA_EMU
+static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
+ [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
+
+void __init amd_get_nodes(struct bootnode *physnodes)
+{
+ int i;
+
+ for_each_node_mask(i, nodes_parsed) {
+ physnodes[i].start = nodes[i].start;
+ physnodes[i].end = nodes[i].end;
+ }
+}
+
+static int __init find_node_by_addr(unsigned long addr)
+{
+ int ret = NUMA_NO_NODE;
+ int i;
+
+ for (i = 0; i < 8; i++)
+ if (addr >= nodes[i].start && addr < nodes[i].end) {
+ ret = i;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * For NUMA emulation, fake proximity domain (_PXM) to node id mappings must be
+ * setup to represent the physical topology but reflect the emulated
+ * environment. For each emulated node, the real node which it appears on is
+ * found and a fake pxm to nid mapping is created which mirrors the actual
+ * locality. node_distance() then represents the correct distances between
+ * emulated nodes by using the fake acpi mappings to pxms.
+ */
+void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
+{
+ unsigned int bits;
+ unsigned int cores;
+ unsigned int apicid_base = 0;
+ int i;
+
+ bits = boot_cpu_data.x86_coreid_bits;
+ cores = 1 << bits;
+ early_get_boot_cpu_id();
+ if (boot_cpu_physical_apicid > 0)
+ apicid_base = boot_cpu_physical_apicid;
+
+ for (i = 0; i < nr_nodes; i++) {
+ int index;
+ int nid;
+ int j;
+
+ nid = find_node_by_addr(nodes[i].start);
+ if (nid == NUMA_NO_NODE)
+ continue;
+
+ index = nodeids[nid] << bits;
+ if (fake_apicid_to_node[index + apicid_base] == NUMA_NO_NODE)
+ for (j = apicid_base; j < cores + apicid_base; j++)
+ fake_apicid_to_node[index + j] = i;
+#ifdef CONFIG_ACPI_NUMA
+ __acpi_map_pxm_to_node(nid, i);
+#endif
+ }
+ memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
+}
+#endif /* CONFIG_NUMA_EMU */
+
+int __init amd_scan_nodes(void)
{
unsigned int bits;
unsigned int cores;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799d..dbe34b931374 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>
+#include <linux/swap.h>
#include <asm/pgtable.h>
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
get_page(page);
+ SetPageReferenced(page);
pages[*nr] = page;
(*nr)++;
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr)
VM_BUG_ON(page != compound_head(page));
VM_BUG_ON(page_count(page) == 0);
atomic_add(nr, &page->_count);
+ SetPageReferenced(page);
+}
+
+static inline void get_huge_page_tail(struct page *page)
+{
+ /*
+ * __split_huge_page_refcount() cannot run
+ * from under us.
+ */
+ VM_BUG_ON(atomic_read(&page->_count) < 0);
+ atomic_inc(&page->_count);
}
static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
do {
VM_BUG_ON(compound_head(page) != head);
pages[*nr] = page;
+ if (PageTail(page))
+ get_huge_page_tail(page);
(*nr)++;
page++;
refs++;
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmd_t pmd = *pmdp;
next = pmd_addr_end(addr, end);
- if (pmd_none(pmd))
+ /*
+ * The pmd_trans_splitting() check below explains why
+ * pmdp_splitting_flush has to flush the tlb, to stop
+ * this gup-fast code from running while we set the
+ * splitting bit in the pmd. Returning zero will take
+ * the slow path that will call wait_split_huge_page()
+ * if the pmd is still in splitting state. gup-fast
+ * can't because it has irq disabled and
+ * wait_split_huge_page() would never return as the
+ * tlb flush IPI wouldn't run.
+ */
+ if (pmd_none(pmd) || pmd_trans_splitting(pmd))
return 0;
if (unlikely(pmd_large(pmd))) {
if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c0e28a13de7d..947f42abe820 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -364,8 +364,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
/*
* We just marked the kernel text read only above, now that
* we are going to free part of that, we need to make that
- * writeable first.
+ * writeable and non-executable first.
*/
+ set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 0e969f9f401b..c821074b7f0b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -45,6 +45,7 @@
#include <asm/bugs.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
+#include <asm/olpc_ofw.h>
#include <asm/pgalloc.h>
#include <asm/sections.h>
#include <asm/paravirt.h>
@@ -226,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
static inline int is_kernel_text(unsigned long addr)
{
- if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end)
+ if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
return 1;
return 0;
}
@@ -715,6 +716,7 @@ void __init paging_init(void)
/*
* NOTE: at this point the bootmem allocator is fully available.
*/
+ olpc_dt_build_devicetree();
sparse_init();
zone_sizes_init();
}
@@ -912,6 +914,23 @@ void set_kernel_text_ro(void)
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
}
+static void mark_nxdata_nx(void)
+{
+ /*
+ * When this called, init has already been executed and released,
+ * so everything past _etext sould be NX.
+ */
+ unsigned long start = PFN_ALIGN(_etext);
+ /*
+ * This comes from is_kernel_text upper limit. Also HPAGE where used:
+ */
+ unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
+
+ if (__supported_pte_mask & _PAGE_NX)
+ printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
+ set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
+}
+
void mark_rodata_ro(void)
{
unsigned long start = PFN_ALIGN(_text);
@@ -946,6 +965,7 @@ void mark_rodata_ro(void)
printk(KERN_INFO "Testing CPA: write protecting again\n");
set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
#endif
+ mark_nxdata_nx();
}
#endif
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index af3b6c8a436f..704a37cedddb 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
e->trace.entries = e->trace_entries;
e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
e->trace.skip = 0;
- save_stack_trace_bp(&e->trace, regs->bp);
+ save_stack_trace_regs(&e->trace, regs);
/* Round address down to nearest 16 bytes */
shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 787c52ca49c3..ebf6d7887a38 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -2,6 +2,28 @@
#include <linux/topology.h>
#include <linux/module.h>
#include <linux/bootmem.h>
+#include <asm/numa.h>
+#include <asm/acpi.h>
+
+int __initdata numa_off;
+
+static __init int numa_setup(char *opt)
+{
+ if (!opt)
+ return -EINVAL;
+ if (!strncmp(opt, "off", 3))
+ numa_off = 1;
+#ifdef CONFIG_NUMA_EMU
+ if (!strncmp(opt, "fake=", 5))
+ numa_emu_cmdline(opt + 5);
+#endif
+#ifdef CONFIG_ACPI_NUMA
+ if (!strncmp(opt, "noacpi", 6))
+ acpi_numa = -1;
+#endif
+ return 0;
+}
+early_param("numa", numa_setup);
/*
* Which logical CPUs are on which nodes
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 7ffc9b727efd..95ea1551eebc 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -30,7 +30,6 @@ s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
-int numa_off __initdata;
static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;
@@ -260,30 +259,35 @@ void __init numa_init_array(void)
#ifdef CONFIG_NUMA_EMU
/* Numa emulation */
static struct bootnode nodes[MAX_NUMNODES] __initdata;
-static struct bootnode physnodes[MAX_NUMNODES] __initdata;
+static struct bootnode physnodes[MAX_NUMNODES] __cpuinitdata;
static char *cmdline __initdata;
+void __init numa_emu_cmdline(char *str)
+{
+ cmdline = str;
+}
+
static int __init setup_physnodes(unsigned long start, unsigned long end,
- int acpi, int k8)
+ int acpi, int amd)
{
- int nr_nodes = 0;
int ret = 0;
int i;
+ memset(physnodes, 0, sizeof(physnodes));
#ifdef CONFIG_ACPI_NUMA
if (acpi)
- nr_nodes = acpi_get_nodes(physnodes);
+ acpi_get_nodes(physnodes, start, end);
#endif
-#ifdef CONFIG_K8_NUMA
- if (k8)
- nr_nodes = k8_get_nodes(physnodes);
+#ifdef CONFIG_AMD_NUMA
+ if (amd)
+ amd_get_nodes(physnodes);
#endif
/*
* Basic sanity checking on the physical node map: there may be errors
- * if the SRAT or K8 incorrectly reported the topology or the mem=
+ * if the SRAT or AMD code incorrectly reported the topology or the mem=
* kernel parameter is used.
*/
- for (i = 0; i < nr_nodes; i++) {
+ for (i = 0; i < MAX_NUMNODES; i++) {
if (physnodes[i].start == physnodes[i].end)
continue;
if (physnodes[i].start > end) {
@@ -298,17 +302,6 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
physnodes[i].start = start;
if (physnodes[i].end > end)
physnodes[i].end = end;
- }
-
- /*
- * Remove all nodes that have no memory or were truncated because of the
- * limited address range.
- */
- for (i = 0; i < nr_nodes; i++) {
- if (physnodes[i].start == physnodes[i].end)
- continue;
- physnodes[ret].start = physnodes[i].start;
- physnodes[ret].end = physnodes[i].end;
ret++;
}
@@ -324,6 +317,24 @@ static int __init setup_physnodes(unsigned long start, unsigned long end,
return ret;
}
+static void __init fake_physnodes(int acpi, int amd, int nr_nodes)
+{
+ int i;
+
+ BUG_ON(acpi && amd);
+#ifdef CONFIG_ACPI_NUMA
+ if (acpi)
+ acpi_fake_nodes(nodes, nr_nodes);
+#endif
+#ifdef CONFIG_AMD_NUMA
+ if (amd)
+ amd_fake_nodes(nodes, nr_nodes);
+#endif
+ if (!acpi && !amd)
+ for (i = 0; i < nr_cpu_ids; i++)
+ numa_set_node(i, 0);
+}
+
/*
* Setups up nid to range from addr to addr + size. If the end
* boundary is greater than max_addr, then max_addr is used instead.
@@ -352,8 +363,7 @@ static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
* Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
* to max_addr. The return value is the number of nodes allocated.
*/
-static int __init split_nodes_interleave(u64 addr, u64 max_addr,
- int nr_phys_nodes, int nr_nodes)
+static int __init split_nodes_interleave(u64 addr, u64 max_addr, int nr_nodes)
{
nodemask_t physnode_mask = NODE_MASK_NONE;
u64 size;
@@ -384,7 +394,7 @@ static int __init split_nodes_interleave(u64 addr, u64 max_addr,
return -1;
}
- for (i = 0; i < nr_phys_nodes; i++)
+ for (i = 0; i < MAX_NUMNODES; i++)
if (physnodes[i].start != physnodes[i].end)
node_set(i, physnode_mask);
@@ -549,15 +559,13 @@ static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
* numa=fake command-line option.
*/
static int __init numa_emulation(unsigned long start_pfn,
- unsigned long last_pfn, int acpi, int k8)
+ unsigned long last_pfn, int acpi, int amd)
{
u64 addr = start_pfn << PAGE_SHIFT;
u64 max_addr = last_pfn << PAGE_SHIFT;
- int num_phys_nodes;
int num_nodes;
int i;
- num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
/*
* If the numa=fake command-line contains a 'M' or 'G', it represents
* the fixed node size. Otherwise, if it is just a single number N,
@@ -572,7 +580,7 @@ static int __init numa_emulation(unsigned long start_pfn,
unsigned long n;
n = simple_strtoul(cmdline, NULL, 0);
- num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
+ num_nodes = split_nodes_interleave(addr, max_addr, n);
}
if (num_nodes < 0)
@@ -595,14 +603,15 @@ static int __init numa_emulation(unsigned long start_pfn,
nodes[i].end >> PAGE_SHIFT);
setup_node_bootmem(i, nodes[i].start, nodes[i].end);
}
- acpi_fake_nodes(nodes, num_nodes);
+ setup_physnodes(addr, max_addr, acpi, amd);
+ fake_physnodes(acpi, amd, num_nodes);
numa_init_array();
return 0;
}
#endif /* CONFIG_NUMA_EMU */
void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
- int acpi, int k8)
+ int acpi, int amd)
{
int i;
@@ -610,8 +619,12 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
nodes_clear(node_online_map);
#ifdef CONFIG_NUMA_EMU
- if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
+ setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
+ acpi, amd);
+ if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, amd))
return;
+ setup_physnodes(start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT,
+ acpi, amd);
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
#endif
@@ -624,8 +637,8 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
nodes_clear(node_online_map);
#endif
-#ifdef CONFIG_K8_NUMA
- if (!numa_off && k8 && !k8_scan_nodes())
+#ifdef CONFIG_AMD_NUMA
+ if (!numa_off && amd && !amd_scan_nodes())
return;
nodes_clear(node_possible_map);
nodes_clear(node_online_map);
@@ -661,24 +674,6 @@ unsigned long __init numa_free_all_bootmem(void)
return pages;
}
-static __init int numa_setup(char *opt)
-{
- if (!opt)
- return -EINVAL;
- if (!strncmp(opt, "off", 3))
- numa_off = 1;
-#ifdef CONFIG_NUMA_EMU
- if (!strncmp(opt, "fake=", 5))
- cmdline = opt + 5;
-#endif
-#ifdef CONFIG_ACPI_NUMA
- if (!strncmp(opt, "noacpi", 6))
- acpi_numa = -1;
-#endif
- return 0;
-}
-early_param("numa", numa_setup);
-
#ifdef CONFIG_NUMA
static __init int find_near_online_node(int node)
@@ -767,6 +762,7 @@ void __cpuinit numa_clear_node(int cpu)
#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+#ifndef CONFIG_NUMA_EMU
void __cpuinit numa_add_cpu(int cpu)
{
cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
@@ -776,34 +772,115 @@ void __cpuinit numa_remove_cpu(int cpu)
{
cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
+#else
+void __cpuinit numa_add_cpu(int cpu)
+{
+ unsigned long addr;
+ u16 apicid;
+ int physnid;
+ int nid = NUMA_NO_NODE;
+
+ apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
+ if (apicid != BAD_APICID)
+ nid = apicid_to_node[apicid];
+ if (nid == NUMA_NO_NODE)
+ nid = early_cpu_to_node(cpu);
+ BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+
+ /*
+ * Use the starting address of the emulated node to find which physical
+ * node it is allocated on.
+ */
+ addr = node_start_pfn(nid) << PAGE_SHIFT;
+ for (physnid = 0; physnid < MAX_NUMNODES; physnid++)
+ if (addr >= physnodes[physnid].start &&
+ addr < physnodes[physnid].end)
+ break;
+
+ /*
+ * Map the cpu to each emulated node that is allocated on the physical
+ * node of the cpu's apic id.
+ */
+ for_each_online_node(nid) {
+ addr = node_start_pfn(nid) << PAGE_SHIFT;
+ if (addr >= physnodes[physnid].start &&
+ addr < physnodes[physnid].end)
+ cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+ }
+}
+
+void __cpuinit numa_remove_cpu(int cpu)
+{
+ int i;
+
+ for_each_online_node(i)
+ cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
+}
+#endif /* !CONFIG_NUMA_EMU */
#else /* CONFIG_DEBUG_PER_CPU_MAPS */
+static struct cpumask __cpuinit *debug_cpumask_set_cpu(int cpu, int enable)
+{
+ int node = early_cpu_to_node(cpu);
+ struct cpumask *mask;
+ char buf[64];
+
+ mask = node_to_cpumask_map[node];
+ if (!mask) {
+ pr_err("node_to_cpumask_map[%i] NULL\n", node);
+ dump_stack();
+ return NULL;
+ }
+
+ cpulist_scnprintf(buf, sizeof(buf), mask);
+ printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
+ enable ? "numa_add_cpu" : "numa_remove_cpu",
+ cpu, node, buf);
+ return mask;
+}
/*
* --------- debug versions of the numa functions ---------
*/
+#ifndef CONFIG_NUMA_EMU
static void __cpuinit numa_set_cpumask(int cpu, int enable)
{
- int node = early_cpu_to_node(cpu);
struct cpumask *mask;
- char buf[64];
- mask = node_to_cpumask_map[node];
- if (mask == NULL) {
- printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
- dump_stack();
+ mask = debug_cpumask_set_cpu(cpu, enable);
+ if (!mask)
return;
- }
if (enable)
cpumask_set_cpu(cpu, mask);
else
cpumask_clear_cpu(cpu, mask);
+}
+#else
+static void __cpuinit numa_set_cpumask(int cpu, int enable)
+{
+ int node = early_cpu_to_node(cpu);
+ struct cpumask *mask;
+ int i;
- cpulist_scnprintf(buf, sizeof(buf), mask);
- printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
- enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
+ for_each_online_node(i) {
+ unsigned long addr;
+
+ addr = node_start_pfn(i) << PAGE_SHIFT;
+ if (addr < physnodes[node].start ||
+ addr >= physnodes[node].end)
+ continue;
+ mask = debug_cpumask_set_cpu(cpu, enable);
+ if (!mask)
+ return;
+
+ if (enable)
+ cpumask_set_cpu(cpu, mask);
+ else
+ cpumask_clear_cpu(cpu, mask);
+ }
}
+#endif /* CONFIG_NUMA_EMU */
void __cpuinit numa_add_cpu(int cpu)
{
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 532e7933d606..8b830ca14ac4 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -13,6 +13,7 @@
#include <linux/pfn.h>
#include <linux/percpu.h>
#include <linux/gfp.h>
+#include <linux/pci.h>
#include <asm/e820.h>
#include <asm/processor.h>
@@ -255,13 +256,16 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
unsigned long pfn)
{
pgprot_t forbidden = __pgprot(0);
+ pgprot_t required = __pgprot(0);
/*
* The BIOS area between 640k and 1Mb needs to be executable for
* PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
*/
- if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
+#ifdef CONFIG_PCI_BIOS
+ if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_NX;
+#endif
/*
* The kernel text needs to be executable for obvious reasons
@@ -278,6 +282,12 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT,
__pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
pgprot_val(forbidden) |= _PAGE_RW;
+ /*
+ * .data and .bss should always be writable.
+ */
+ if (within(address, (unsigned long)_sdata, (unsigned long)_edata) ||
+ within(address, (unsigned long)__bss_start, (unsigned long)__bss_stop))
+ pgprot_val(required) |= _PAGE_RW;
#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
/*
@@ -317,6 +327,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
#endif
prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
+ prot = __pgprot(pgprot_val(prot) | pgprot_val(required));
return prot;
}
@@ -393,7 +404,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
{
unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn;
pte_t new_pte, old_pte, *tmp;
- pgprot_t old_prot, new_prot;
+ pgprot_t old_prot, new_prot, req_prot;
int i, do_split = 1;
unsigned int level;
@@ -438,10 +449,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* We are safe now. Check whether the new pgprot is the same:
*/
old_pte = *kpte;
- old_prot = new_prot = pte_pgprot(old_pte);
+ old_prot = new_prot = req_prot = pte_pgprot(old_pte);
- pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr);
- pgprot_val(new_prot) |= pgprot_val(cpa->mask_set);
+ pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
+ pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
/*
* old_pte points to the large page base address. So we need
@@ -450,17 +461,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
cpa->pfn = pfn;
- new_prot = static_protections(new_prot, address, pfn);
+ new_prot = static_protections(req_prot, address, pfn);
/*
* We need to check the full range, whether
* static_protection() requires a different pgprot for one of
* the pages in the range we try to preserve:
*/
- addr = address + PAGE_SIZE;
- pfn++;
- for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) {
- pgprot_t chk_prot = static_protections(new_prot, addr, pfn);
+ addr = address & pmask;
+ pfn = pte_pfn(old_pte);
+ for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
+ pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
if (pgprot_val(chk_prot) != pgprot_val(new_prot))
goto out_unlock;
@@ -483,7 +494,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
* that we limited the number of possible pages already to
* the number of pages in the large page.
*/
- if (address == (nextpage_addr - psize) && cpa->numpages == numpages) {
+ if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
/*
* The address is aligned and the number of pages
* covers the full page.
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 8be8c7d7bc89..500242d3c96d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -320,6 +320,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
return changed;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp,
+ pmd_t entry, int dirty)
+{
+ int changed = !pmd_same(*pmdp, entry);
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ if (changed && dirty) {
+ *pmdp = entry;
+ pmd_update_defer(vma->vm_mm, address, pmdp);
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ }
+
+ return changed;
+}
+#endif
+
int ptep_test_and_clear_young(struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
@@ -335,6 +354,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
return ret;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ int ret = 0;
+
+ if (pmd_young(*pmdp))
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+ (unsigned long *)pmdp);
+
+ if (ret)
+ pmd_update(vma->vm_mm, addr, pmdp);
+
+ return ret;
+}
+#endif
+
int ptep_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep)
{
@@ -347,6 +383,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
return young;
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int pmdp_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int young;
+
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+
+ young = pmdp_test_and_clear_young(vma, address, pmdp);
+ if (young)
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+ return young;
+}
+
+void pmdp_splitting_flush(struct vm_area_struct *vma,
+ unsigned long address, pmd_t *pmdp)
+{
+ int set;
+ VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+ set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
+ (unsigned long *)pmdp);
+ if (set) {
+ pmd_update(vma->vm_mm, address, pmdp);
+ /* need tlb flush only to serialize against gup-fast */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+ }
+}
+#endif
+
/**
* reserve_top_address - reserves a hole in the top of kernel address space
* @reserve - size of hole to reserve
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index a3250aa34086..410531d3c292 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -41,7 +41,7 @@ void __init x86_report_nx(void)
{
if (!cpu_has_nx) {
printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
- "missing in CPU or disabled in BIOS!\n");
+ "missing in CPU!\n");
} else {
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
if (disable_nx) {
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index a17dffd136c1..ae96e7b8051d 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -59,7 +59,6 @@ static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
static int __initdata num_memory_chunks; /* total number of memory chunks */
static u8 __initdata apicid_to_pxm[MAX_APICID];
-int numa_off __initdata;
int acpi_numa __initdata;
static __init void bad_srat(void)
@@ -92,6 +91,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
/* mark this node as "seen" in node bitmap */
BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
+ /* don't need to check apic_id here, because it is always 8 bits */
apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index a35cb9d8b060..603d285d1daa 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -134,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
}
apic_id = pa->apic_id;
+ if (apic_id >= MAX_LOCAL_APIC) {
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+ return;
+ }
apicid_to_node[apic_id] = node;
node_set(node, cpu_nodes_parsed);
acpi_numa = 1;
@@ -168,6 +172,12 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
else
apic_id = pa->apic_id;
+
+ if (apic_id >= MAX_LOCAL_APIC) {
+ printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
+ return;
+ }
+
apicid_to_node[apic_id] = node;
node_set(node, cpu_nodes_parsed);
acpi_numa = 1;
@@ -339,18 +349,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
void __init acpi_numa_arch_fixup(void) {}
-int __init acpi_get_nodes(struct bootnode *physnodes)
+#ifdef CONFIG_NUMA_EMU
+void __init acpi_get_nodes(struct bootnode *physnodes, unsigned long start,
+ unsigned long end)
{
int i;
- int ret = 0;
for_each_node_mask(i, nodes_parsed) {
- physnodes[ret].start = nodes[i].start;
- physnodes[ret].end = nodes[i].end;
- ret++;
+ cutoff_node(i, start, end);
+ physnodes[i].start = nodes[i].start;
+ physnodes[i].end = nodes[i].end;
}
- return ret;
}
+#endif /* CONFIG_NUMA_EMU */
/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
@@ -495,8 +506,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
int i, j;
- printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
- "topology.\n");
for (i = 0; i < num_nodes; i++) {
int nid, pxm;
@@ -516,6 +525,17 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
fake_apicid_to_node[j] == NUMA_NO_NODE)
fake_apicid_to_node[j] = i;
}
+
+ /*
+ * If there are apicid-to-node mappings for physical nodes that do not
+ * have a corresponding emulated node, it should default to a guaranteed
+ * value.
+ */
+ for (i = 0; i < MAX_LOCAL_APIC; i++)
+ if (apicid_to_node[i] != NUMA_NO_NODE &&
+ fake_apicid_to_node[i] == NUMA_NO_NODE)
+ fake_apicid_to_node[i] = 0;
+
for (i = 0; i < num_nodes; i++)
__acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 49358481c733..6acc724d5d8f 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -223,7 +223,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
static void __cpuinit calculate_tlb_offset(void)
{
- int cpu, node, nr_node_vecs;
+ int cpu, node, nr_node_vecs, idx = 0;
/*
* we are changing tlb_vector_offset for each CPU in runtime, but this
* will not cause inconsistency, as the write is atomic under X86. we
@@ -239,7 +239,7 @@ static void __cpuinit calculate_tlb_offset(void)
nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
for_each_online_node(node) {
- int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
+ int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
nr_node_vecs;
int cpu_offset = 0;
for_each_cpu(cpu, cpumask_of_node(node)) {
@@ -248,10 +248,11 @@ static void __cpuinit calculate_tlb_offset(void)
cpu_offset++;
cpu_offset = cpu_offset % nr_node_vecs;
}
+ idx++;
}
}
-static int tlb_cpuhp_notify(struct notifier_block *n,
+static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n,
unsigned long action, void *hcpu)
{
switch (action & 0xf) {
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 2d49d4e19a36..72cbec14d783 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -126,7 +126,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
if (!user_mode_vm(regs)) {
unsigned long stack = kernel_stack_pointer(regs);
if (depth)
- dump_trace(NULL, regs, (unsigned long *)stack, 0,
+ dump_trace(NULL, regs, (unsigned long *)stack,
&backtrace_ops, &depth);
return;
}
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 4e8baad36d37..e2b7b0c06cdf 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -65,7 +65,6 @@ static int profile_exceptions_notify(struct notifier_block *self,
switch (val) {
case DIE_NMI:
- case DIE_NMI_IPI:
if (ctr_running)
model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs));
else if (!nmi_enabled)
@@ -143,7 +142,7 @@ static inline int has_mux(void)
inline int op_x86_phys_to_virt(int phys)
{
- return __get_cpu_var(switch_index) + phys;
+ return __this_cpu_read(switch_index) + phys;
}
inline int op_x86_virt_to_phys(int virt)
@@ -361,7 +360,7 @@ static void nmi_cpu_setup(void *dummy)
static struct notifier_block profile_exceptions_nb = {
.notifier_call = profile_exceptions_notify,
.next = NULL,
- .priority = 2
+ .priority = NMI_LOCAL_LOW_PRIOR,
};
static void nmi_cpu_restore_registers(struct op_msrs *msrs)
@@ -732,6 +731,9 @@ int __init op_nmi_init(struct oprofile_operations *ops)
case 0x14:
cpu_type = "x86-64/family14h";
break;
+ case 0x15:
+ cpu_type = "x86-64/family15h";
+ break;
default:
return -ENODEV;
}
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
index e3ecb71b5790..720bf5a53c51 100644
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ b/arch/x86/oprofile/nmi_timer_int.c
@@ -38,7 +38,7 @@ static int profile_timer_exceptions_notify(struct notifier_block *self,
static struct notifier_block profile_timer_exceptions_nb = {
.notifier_call = profile_timer_exceptions_notify,
.next = NULL,
- .priority = 0
+ .priority = NMI_LOW_PRIOR,
};
static int timer_start(void)
@@ -58,9 +58,6 @@ static void timer_stop(void)
int __init op_nmi_timer_init(struct oprofile_operations *ops)
{
- if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0))
- return -ENODEV;
-
ops->start = timer_start;
ops->stop = timer_stop;
ops->cpu_type = "timer";
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index a011bcc0f943..c3b8e24f2b16 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -29,11 +29,12 @@
#include "op_x86_model.h"
#include "op_counter.h"
-#define NUM_COUNTERS 4
+#define NUM_COUNTERS 4
+#define NUM_COUNTERS_F15H 6
#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
-#define NUM_VIRT_COUNTERS 32
+#define NUM_VIRT_COUNTERS 32
#else
-#define NUM_VIRT_COUNTERS NUM_COUNTERS
+#define NUM_VIRT_COUNTERS 0
#endif
#define OP_EVENT_MASK 0x0FFF
@@ -41,7 +42,8 @@
#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21))
-static unsigned long reset_value[NUM_VIRT_COUNTERS];
+static int num_counters;
+static unsigned long reset_value[OP_MAX_COUNTER];
#define IBS_FETCH_SIZE 6
#define IBS_OP_SIZE 12
@@ -387,7 +389,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
int i;
/* enable active counters */
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
int virt = op_x86_phys_to_virt(i);
if (!reset_value[virt])
continue;
@@ -406,7 +408,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
{
int i;
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
if (!msrs->counters[i].addr)
continue;
release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
@@ -418,7 +420,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
{
int i;
- for (i = 0; i < NUM_COUNTERS; i++) {
+ for (i = 0; i < num_counters; i++) {
if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
goto fail;
if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
@@ -426,8 +428,13 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
goto fail;
}
/* both registers must be reserved */
- msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
- msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+ if (num_counters == NUM_COUNTERS_F15H) {
+ msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
+ msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
+ } else {
+ msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
+ msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
+ }
continue;
fail:
if (!counter_config[i].enabled)
@@ -447,7 +454,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
int i;
/* setup reset_value */
- for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
+ for (i = 0; i < OP_MAX_COUNTER; ++i) {
if (counter_config[i].enabled
&& msrs->counters[op_x86_virt_to_phys(i)].addr)
reset_value[i] = counter_config[i].count;
@@ -456,7 +463,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
}
/* clear all counters */
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
if (!msrs->controls[i].addr)
continue;
rdmsrl(msrs->controls[i].addr, val);
@@ -472,7 +479,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
}
/* enable active counters */
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
int virt = op_x86_phys_to_virt(i);
if (!reset_value[virt])
continue;
@@ -503,7 +510,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs,
u64 val;
int i;
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
int virt = op_x86_phys_to_virt(i);
if (!reset_value[virt])
continue;
@@ -526,7 +533,7 @@ static void op_amd_start(struct op_msrs const * const msrs)
u64 val;
int i;
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
if (!reset_value[op_x86_phys_to_virt(i)])
continue;
rdmsrl(msrs->controls[i].addr, val);
@@ -546,7 +553,7 @@ static void op_amd_stop(struct op_msrs const * const msrs)
* Subtle: stop on all counters to avoid race with setting our
* pm callback
*/
- for (i = 0; i < NUM_COUNTERS; ++i) {
+ for (i = 0; i < num_counters; ++i) {
if (!reset_value[op_x86_phys_to_virt(i)])
continue;
rdmsrl(msrs->controls[i].addr, val);
@@ -603,6 +610,7 @@ static int force_ibs_eilvt_setup(void)
ret = setup_ibs_ctl(i);
if (ret)
return ret;
+ pr_err(FW_BUG "using offset %d for IBS interrupts\n", i);
return 0;
}
@@ -630,21 +638,29 @@ static int __init_ibs_nmi(void)
return 0;
}
-/* initialize the APIC for the IBS interrupts if available */
+/*
+ * check and reserve APIC extended interrupt LVT offset for IBS if
+ * available
+ *
+ * init_ibs() preforms implicitly cpu-local operations, so pin this
+ * thread to its current CPU
+ */
+
static void init_ibs(void)
{
- ibs_caps = get_ibs_caps();
+ preempt_disable();
+ ibs_caps = get_ibs_caps();
if (!ibs_caps)
- return;
+ goto out;
- if (__init_ibs_nmi()) {
+ if (__init_ibs_nmi() < 0)
ibs_caps = 0;
- return;
- }
+ else
+ printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
- printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n",
- (unsigned)ibs_caps);
+out:
+ preempt_enable();
}
static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
@@ -698,18 +714,29 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
return 0;
}
+struct op_x86_model_spec op_amd_spec;
+
static int op_amd_init(struct oprofile_operations *ops)
{
init_ibs();
create_arch_files = ops->create_files;
ops->create_files = setup_ibs_files;
+
+ if (boot_cpu_data.x86 == 0x15) {
+ num_counters = NUM_COUNTERS_F15H;
+ } else {
+ num_counters = NUM_COUNTERS;
+ }
+
+ op_amd_spec.num_counters = num_counters;
+ op_amd_spec.num_controls = num_counters;
+ op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS);
+
return 0;
}
struct op_x86_model_spec op_amd_spec = {
- .num_counters = NUM_COUNTERS,
- .num_controls = NUM_COUNTERS,
- .num_virt_counters = NUM_VIRT_COUNTERS,
+ /* num_counters/num_controls filled in at runtime */
.reserved = MSR_AMD_EVENTSEL_RESERVED,
.event_mask = OP_EVENT_MASK,
.init = op_amd_init,
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 182558dd5515..9fadec074142 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -11,7 +11,7 @@
#include <linux/oprofile.h>
#include <linux/smp.h>
#include <linux/ptrace.h>
-#include <linux/nmi.h>
+#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/fixmap.h>
#include <asm/apic.h>
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index d769cda54082..94b745045e45 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -95,8 +95,8 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
* counter width:
*/
if (!(eax.split.version_id == 0 &&
- current_cpu_data.x86 == 6 &&
- current_cpu_data.x86_model == 15)) {
+ __this_cpu_read(cpu_info.x86) == 6 &&
+ __this_cpu_read(cpu_info.x86_model) == 15)) {
if (counter_width < eax.split.bit_width)
counter_width = eax.split.bit_width;
@@ -235,8 +235,8 @@ static void arch_perfmon_setup_counters(void)
eax.full = cpuid_eax(0xa);
/* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */
- if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
- current_cpu_data.x86_model == 15) {
+ if (eax.split.version_id == 0 && __this_cpu_read(cpu_info.x86) == 6 &&
+ __this_cpu_read(cpu_info.x86_model) == 15) {
eax.split.version_id = 2;
eax.split.num_counters = 2;
eax.split.bit_width = 40;
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 15466c096ba5..0972315c3860 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -138,7 +138,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
struct acpi_resource_address64 addr;
acpi_status status;
unsigned long flags;
- struct resource *root, *conflict;
u64 start, end;
status = resource_to_addr(acpi_res, &addr);
@@ -146,12 +145,10 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
return AE_OK;
if (addr.resource_type == ACPI_MEMORY_RANGE) {
- root = &iomem_resource;
flags = IORESOURCE_MEM;
if (addr.info.mem.caching == ACPI_PREFETCHABLE_MEMORY)
flags |= IORESOURCE_PREFETCH;
} else if (addr.resource_type == ACPI_IO_RANGE) {
- root = &ioport_resource;
flags = IORESOURCE_IO;
} else
return AE_OK;
@@ -172,25 +169,90 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
return AE_OK;
}
- conflict = insert_resource_conflict(root, res);
- if (conflict) {
- dev_err(&info->bridge->dev,
- "address space collision: host bridge window %pR "
- "conflicts with %s %pR\n",
- res, conflict->name, conflict);
- } else {
- pci_bus_add_resource(info->bus, res, 0);
- info->res_num++;
- if (addr.translation_offset)
- dev_info(&info->bridge->dev, "host bridge window %pR "
- "(PCI address [%#llx-%#llx])\n",
- res, res->start - addr.translation_offset,
- res->end - addr.translation_offset);
+ info->res_num++;
+ if (addr.translation_offset)
+ dev_info(&info->bridge->dev, "host bridge window %pR "
+ "(PCI address [%#llx-%#llx])\n",
+ res, res->start - addr.translation_offset,
+ res->end - addr.translation_offset);
+ else
+ dev_info(&info->bridge->dev, "host bridge window %pR\n", res);
+
+ return AE_OK;
+}
+
+static bool resource_contains(struct resource *res, resource_size_t point)
+{
+ if (res->start <= point && point <= res->end)
+ return true;
+ return false;
+}
+
+static void coalesce_windows(struct pci_root_info *info, int type)
+{
+ int i, j;
+ struct resource *res1, *res2;
+
+ for (i = 0; i < info->res_num; i++) {
+ res1 = &info->res[i];
+ if (!(res1->flags & type))
+ continue;
+
+ for (j = i + 1; j < info->res_num; j++) {
+ res2 = &info->res[j];
+ if (!(res2->flags & type))
+ continue;
+
+ /*
+ * I don't like throwing away windows because then
+ * our resources no longer match the ACPI _CRS, but
+ * the kernel resource tree doesn't allow overlaps.
+ */
+ if (resource_contains(res1, res2->start) ||
+ resource_contains(res1, res2->end) ||
+ resource_contains(res2, res1->start) ||
+ resource_contains(res2, res1->end)) {
+ res1->start = min(res1->start, res2->start);
+ res1->end = max(res1->end, res2->end);
+ dev_info(&info->bridge->dev,
+ "host bridge window expanded to %pR; %pR ignored\n",
+ res1, res2);
+ res2->flags = 0;
+ }
+ }
+ }
+}
+
+static void add_resources(struct pci_root_info *info)
+{
+ int i;
+ struct resource *res, *root, *conflict;
+
+ if (!pci_use_crs)
+ return;
+
+ coalesce_windows(info, IORESOURCE_MEM);
+ coalesce_windows(info, IORESOURCE_IO);
+
+ for (i = 0; i < info->res_num; i++) {
+ res = &info->res[i];
+
+ if (res->flags & IORESOURCE_MEM)
+ root = &iomem_resource;
+ else if (res->flags & IORESOURCE_IO)
+ root = &ioport_resource;
else
- dev_info(&info->bridge->dev,
- "host bridge window %pR\n", res);
+ continue;
+
+ conflict = insert_resource_conflict(root, res);
+ if (conflict)
+ dev_err(&info->bridge->dev,
+ "address space collision: host bridge window %pR "
+ "conflicts with %s %pR\n",
+ res, conflict->name, conflict);
+ else
+ pci_bus_add_resource(info->bus, res, 0);
}
- return AE_OK;
}
static void
@@ -224,6 +286,7 @@ get_current_resources(struct acpi_device *device, int busnum,
acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
&info);
+ add_resources(&info);
return;
name_alloc_fail:
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index fc1e8fe07e5c..e27dffbbb1a7 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -4,6 +4,7 @@
#include <linux/cpu.h>
#include <linux/range.h>
+#include <asm/amd_nb.h>
#include <asm/pci_x86.h>
#include <asm/pci-direct.h>
@@ -378,6 +379,34 @@ static struct notifier_block __cpuinitdata amd_cpu_notifier = {
.notifier_call = amd_cpu_notify,
};
+static void __init pci_enable_pci_io_ecs(void)
+{
+#ifdef CONFIG_AMD_NB
+ unsigned int i, n;
+
+ for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) {
+ u8 bus = amd_nb_bus_dev_ranges[i].bus;
+ u8 slot = amd_nb_bus_dev_ranges[i].dev_base;
+ u8 limit = amd_nb_bus_dev_ranges[i].dev_limit;
+
+ for (; slot < limit; ++slot) {
+ u32 val = read_pci_config(bus, slot, 3, 0);
+
+ if (!early_is_amd_nb(val))
+ continue;
+
+ val = read_pci_config(bus, slot, 3, 0x8c);
+ if (!(val & (ENABLE_CF8_EXT_CFG >> 32))) {
+ val |= ENABLE_CF8_EXT_CFG >> 32;
+ write_pci_config(bus, slot, 3, 0x8c, val);
+ }
+ ++n;
+ }
+ }
+ pr_info("Extended Config Space enabled on %u nodes\n", n);
+#endif
+}
+
static int __init pci_io_ecs_init(void)
{
int cpu;
@@ -386,6 +415,10 @@ static int __init pci_io_ecs_init(void)
if (boot_cpu_data.x86 < 0x10)
return 0;
+ /* Try the PCI method first. */
+ if (early_pci_allowed())
+ pci_enable_pci_io_ecs();
+
register_cpu_notifier(&amd_cpu_notifier);
for_each_online_cpu(cpu)
amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
index 0846a5bbbfbd..ab8269b0da29 100644
--- a/arch/x86/pci/broadcom_bus.c
+++ b/arch/x86/pci/broadcom_bus.c
@@ -9,6 +9,7 @@
* option) any later version.
*/
+#include <linux/acpi.h>
#include <linux/delay.h>
#include <linux/dmi.h>
#include <linux/pci.h>
@@ -25,12 +26,14 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
u8 fbus, lbus;
int i;
+#ifdef CONFIG_ACPI
/*
- * The x86_pci_root_bus_res_quirks() function already refuses to use
- * this information if ACPI _CRS was used. Therefore, we don't bother
- * checking if ACPI is enabled, and just generate the information
- * for both the ACPI _CRS and no ACPI cases.
+ * We should get host bridge information from ACPI unless the BIOS
+ * doesn't support it.
*/
+ if (acpi_os_get_root_pointer())
+ return;
+#endif
info = &pci_root_info[pci_root_num];
pci_root_num++;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index f7c8a399978c..5fe75026ecc2 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -22,6 +22,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
unsigned int pci_early_dump_regs;
static int pci_bf_sort;
+static int smbios_type_b1_flag;
int pci_routeirq;
int noioapicquirk;
#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
@@ -185,6 +186,39 @@ static int __devinit set_bf_sort(const struct dmi_system_id *d)
return 0;
}
+static void __devinit read_dmi_type_b1(const struct dmi_header *dm,
+ void *private_data)
+{
+ u8 *d = (u8 *)dm + 4;
+
+ if (dm->type != 0xB1)
+ return;
+ switch (((*(u32 *)d) >> 9) & 0x03) {
+ case 0x00:
+ printk(KERN_INFO "dmi type 0xB1 record - unknown flag\n");
+ break;
+ case 0x01: /* set pci=bfsort */
+ smbios_type_b1_flag = 1;
+ break;
+ case 0x02: /* do not set pci=bfsort */
+ smbios_type_b1_flag = 2;
+ break;
+ default:
+ break;
+ }
+}
+
+static int __devinit find_sort_method(const struct dmi_system_id *d)
+{
+ dmi_walk(read_dmi_type_b1, NULL);
+
+ if (smbios_type_b1_flag == 1) {
+ set_bf_sort(d);
+ return 0;
+ }
+ return -1;
+}
+
/*
* Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus)
*/
@@ -213,6 +247,13 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
},
#endif /* __i386__ */
{
+ .callback = find_sort_method,
+ .ident = "Dell System",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
+ },
+ },
+ {
.callback = set_bf_sort,
.ident = "Dell PowerEdge 1950",
.matches = {
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index c4bb261c106e..b1805b78842f 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -65,21 +65,13 @@ pcibios_align_resource(void *data, const struct resource *res,
resource_size_t size, resource_size_t align)
{
struct pci_dev *dev = data;
- resource_size_t start = round_down(res->end - size + 1, align);
+ resource_size_t start = res->start;
if (res->flags & IORESOURCE_IO) {
-
- /*
- * If we're avoiding ISA aliases, the largest contiguous I/O
- * port space is 256 bytes. Clearing bits 9 and 10 preserves
- * all 256-byte and smaller alignments, so the result will
- * still be correctly aligned.
- */
- if (!skip_isa_ioresource_align(dev))
- start &= ~0x300;
- } else if (res->flags & IORESOURCE_MEM) {
- if (start < BIOS_END)
- start = res->end; /* fail; no space */
+ if (skip_isa_ioresource_align(dev))
+ return start;
+ if (start & 0x300)
+ start = (start + 0x3ff) & ~0x3ff;
}
return start;
}
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 9f9bfb705cf9..87e6c8323117 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -589,7 +589,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
case PCI_DEVICE_ID_INTEL_ICH10_1:
case PCI_DEVICE_ID_INTEL_ICH10_2:
case PCI_DEVICE_ID_INTEL_ICH10_3:
- case PCI_DEVICE_ID_INTEL_PATSBURG_LPC:
+ case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_0:
+ case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_1:
r->name = "PIIX/ICH";
r->get = pirq_piix_get;
r->set = pirq_piix_set;
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 2492d165096a..a5f7d0d63de0 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -9,6 +9,7 @@
#include <linux/uaccess.h>
#include <asm/pci_x86.h>
#include <asm/pci-functions.h>
+#include <asm/cacheflush.h>
/* BIOS32 signature: "_32_" */
#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
@@ -25,6 +26,27 @@
#define PCIBIOS_HW_TYPE1_SPEC 0x10
#define PCIBIOS_HW_TYPE2_SPEC 0x20
+int pcibios_enabled;
+
+/* According to the BIOS specification at:
+ * http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could
+ * restrict the x zone to some pages and make it ro. But this may be
+ * broken on some bios, complex to handle with static_protections.
+ * We could make the 0xe0000-0x100000 range rox, but this can break
+ * some ISA mapping.
+ *
+ * So we let's an rw and x hole when pcibios is used. This shouldn't
+ * happen for modern system with mmconfig, and if you don't want it
+ * you could disable pcibios...
+ */
+static inline void set_bios_x(void)
+{
+ pcibios_enabled = 1;
+ set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
+ if (__supported_pte_mask & _PAGE_NX)
+ printk(KERN_INFO "PCI : PCI BIOS aera is rw and x. Use pci=nobios if you want it NX.\n");
+}
+
/*
* This is the standard structure used to identify the entry point
* to the BIOS32 Service Directory, as documented in
@@ -332,6 +354,7 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
bios32_entry);
bios32_indirect.address = bios32_entry + PAGE_OFFSET;
+ set_bios_x();
if (check_pcibios())
return &pci_bios_access;
}
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 117f5b8daf75..25cd4a07d09f 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -70,6 +70,9 @@ static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
struct xen_pci_frontend_ops *xen_pci_frontend;
EXPORT_SYMBOL_GPL(xen_pci_frontend);
+#define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \
+ MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0))
+
static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
struct msi_msg *msg)
{
@@ -83,12 +86,7 @@ static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
MSI_ADDR_REDIRECTION_CPU |
MSI_ADDR_DEST_ID(pirq);
- msg->data =
- MSI_DATA_TRIGGER_EDGE |
- MSI_DATA_LEVEL_ASSERT |
- /* delivery mode reserved */
- (3 << 8) |
- MSI_DATA_VECTOR(0);
+ msg->data = XEN_PIRQ_MSI_DATA;
}
static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
@@ -98,8 +96,23 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
struct msi_msg msg;
list_for_each_entry(msidesc, &dev->msi_list, list) {
+ __read_msi_msg(msidesc, &msg);
+ pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
+ ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
+ if (xen_irq_from_pirq(pirq) >= 0 && msg.data == XEN_PIRQ_MSI_DATA) {
+ xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
+ "msi-x" : "msi", &irq, &pirq, XEN_ALLOC_IRQ);
+ if (irq < 0)
+ goto error;
+ ret = set_irq_msi(irq, msidesc);
+ if (ret < 0)
+ goto error_while;
+ printk(KERN_DEBUG "xen: msi already setup: msi --> irq=%d"
+ " pirq=%d\n", irq, pirq);
+ return 0;
+ }
xen_allocate_pirq_msi((type == PCI_CAP_ID_MSIX) ?
- "msi-x" : "msi", &irq, &pirq);
+ "msi-x" : "msi", &irq, &pirq, (XEN_ALLOC_IRQ | XEN_ALLOC_PIRQ));
if (irq < 0 || pirq < 0)
goto error;
printk(KERN_DEBUG "xen: msi --> irq=%d, pirq=%d\n", irq, pirq);
@@ -147,8 +160,10 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
irq = xen_allocate_pirq(v[i], 0, /* not sharable */
(type == PCI_CAP_ID_MSIX) ?
"pcifront-msi-x" : "pcifront-msi");
- if (irq < 0)
- return -1;
+ if (irq < 0) {
+ ret = -1;
+ goto free;
+ }
ret = set_irq_msi(irq, msidesc);
if (ret)
@@ -164,7 +179,7 @@ error:
if (ret == -ENODEV)
dev_err(&dev->dev, "Xen PCI frontend has not registered" \
" MSI/MSI-X support!\n");
-
+free:
kfree(v);
return ret;
}
diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c
index 65df603622b2..25bfdbb5b130 100644
--- a/arch/x86/platform/mrst/early_printk_mrst.c
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
@@ -103,7 +103,7 @@ struct dw_spi_reg {
static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
static u32 *pclk_spi0;
-/* Always contains an accessable address, start with 0 */
+/* Always contains an accessible address, start with 0 */
static struct dw_spi_reg *pspi;
static struct kmsg_dumper dw_dumper;
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 4c542c757cb4..5c0207bf959b 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -72,32 +72,6 @@ struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
EXPORT_SYMBOL_GPL(sfi_mrtc_array);
int sfi_mrtc_num;
-static inline void assign_to_mp_irq(struct mpc_intsrc *m,
- struct mpc_intsrc *mp_irq)
-{
- memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
-}
-
-static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq,
- struct mpc_intsrc *m)
-{
- return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
-}
-
-static void save_mp_irq(struct mpc_intsrc *m)
-{
- int i;
-
- for (i = 0; i < mp_irq_entries; i++) {
- if (!mp_irq_cmp(&mp_irqs[i], m))
- return;
- }
-
- assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
- if (++mp_irq_entries == MAX_IRQ_SOURCES)
- panic("Max # of irq sources exceeded!!\n");
-}
-
/* parse all the mtimer info to a static mtimer array */
static int __init sfi_parse_mtmr(struct sfi_table_header *table)
{
@@ -131,7 +105,7 @@ static int __init sfi_parse_mtmr(struct sfi_table_header *table)
mp_irq.srcbusirq = pentry->irq; /* IRQ */
mp_irq.dstapic = MP_APIC_ALL;
mp_irq.dstirq = pentry->irq;
- save_mp_irq(&mp_irq);
+ mp_save_irq(&mp_irq);
}
return 0;
@@ -201,7 +175,7 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
mp_irq.srcbusirq = pentry->irq; /* IRQ */
mp_irq.dstapic = MP_APIC_ALL;
mp_irq.dstirq = pentry->irq;
- save_mp_irq(&mp_irq);
+ mp_save_irq(&mp_irq);
}
return 0;
}
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index c31b8fcb5a86..e797428b163b 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,3 +1,4 @@
obj-$(CONFIG_OLPC) += olpc.o
obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
+obj-$(CONFIG_OLPC_OPENFIRMWARE_DT) += olpc_dt.o
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
index f5442c03abc3..127775696d6c 100644
--- a/arch/x86/platform/olpc/olpc-xo1.c
+++ b/arch/x86/platform/olpc/olpc-xo1.c
@@ -1,6 +1,7 @@
/*
* Support for features of the OLPC XO-1 laptop
*
+ * Copyright (C) 2010 Andres Salomon <dilinger@queued.net>
* Copyright (C) 2010 One Laptop per Child
* Copyright (C) 2006 Red Hat, Inc.
* Copyright (C) 2006 Advanced Micro Devices, Inc.
@@ -12,8 +13,6 @@
*/
#include <linux/module.h>
-#include <linux/pci.h>
-#include <linux/pci_ids.h>
#include <linux/platform_device.h>
#include <linux/pm.h>
@@ -22,9 +21,6 @@
#define DRV_NAME "olpc-xo1"
-#define PMS_BAR 4
-#define ACPI_BAR 5
-
/* PMC registers (PMS block) */
#define PM_SCLK 0x10
#define PM_IN_SLPCTL 0x20
@@ -57,65 +53,67 @@ static void xo1_power_off(void)
outl(0x00002000, acpi_base + PM1_CNT);
}
-/* Read the base addresses from the PCI BAR info */
-static int __devinit setup_bases(struct pci_dev *pdev)
+static int __devinit olpc_xo1_probe(struct platform_device *pdev)
{
- int r;
+ struct resource *res;
- r = pci_enable_device_io(pdev);
- if (r) {
- dev_err(&pdev->dev, "can't enable device IO\n");
- return r;
- }
+ /* don't run on non-XOs */
+ if (!machine_is_olpc())
+ return -ENODEV;
- r = pci_request_region(pdev, ACPI_BAR, DRV_NAME);
- if (r) {
- dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR);
- return r;
+ res = platform_get_resource(pdev, IORESOURCE_IO, 0);
+ if (!res) {
+ dev_err(&pdev->dev, "can't fetch device resource info\n");
+ return -EIO;
}
- r = pci_request_region(pdev, PMS_BAR, DRV_NAME);
- if (r) {
- dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR);
- pci_release_region(pdev, ACPI_BAR);
- return r;
+ if (!request_region(res->start, resource_size(res), DRV_NAME)) {
+ dev_err(&pdev->dev, "can't request region\n");
+ return -EIO;
}
- acpi_base = pci_resource_start(pdev, ACPI_BAR);
- pms_base = pci_resource_start(pdev, PMS_BAR);
+ if (strcmp(pdev->name, "cs5535-pms") == 0)
+ pms_base = res->start;
+ else if (strcmp(pdev->name, "cs5535-acpi") == 0)
+ acpi_base = res->start;
+
+ /* If we have both addresses, we can override the poweroff hook */
+ if (pms_base && acpi_base) {
+ pm_power_off = xo1_power_off;
+ printk(KERN_INFO "OLPC XO-1 support registered\n");
+ }
return 0;
}
-static int __devinit olpc_xo1_probe(struct platform_device *pdev)
+static int __devexit olpc_xo1_remove(struct platform_device *pdev)
{
- struct pci_dev *pcidev;
- int r;
-
- pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA,
- NULL);
- if (!pdev)
- return -ENODEV;
-
- r = setup_bases(pcidev);
- if (r)
- return r;
+ struct resource *r;
- pm_power_off = xo1_power_off;
+ r = platform_get_resource(pdev, IORESOURCE_IO, 0);
+ release_region(r->start, resource_size(r));
- printk(KERN_INFO "OLPC XO-1 support registered\n");
- return 0;
-}
+ if (strcmp(pdev->name, "cs5535-pms") == 0)
+ pms_base = 0;
+ else if (strcmp(pdev->name, "cs5535-acpi") == 0)
+ acpi_base = 0;
-static int __devexit olpc_xo1_remove(struct platform_device *pdev)
-{
pm_power_off = NULL;
return 0;
}
-static struct platform_driver olpc_xo1_driver = {
+static struct platform_driver cs5535_pms_drv = {
+ .driver = {
+ .name = "cs5535-pms",
+ .owner = THIS_MODULE,
+ },
+ .probe = olpc_xo1_probe,
+ .remove = __devexit_p(olpc_xo1_remove),
+};
+
+static struct platform_driver cs5535_acpi_drv = {
.driver = {
- .name = DRV_NAME,
+ .name = "cs5535-acpi",
.owner = THIS_MODULE,
},
.probe = olpc_xo1_probe,
@@ -124,12 +122,23 @@ static struct platform_driver olpc_xo1_driver = {
static int __init olpc_xo1_init(void)
{
- return platform_driver_register(&olpc_xo1_driver);
+ int r;
+
+ r = platform_driver_register(&cs5535_pms_drv);
+ if (r)
+ return r;
+
+ r = platform_driver_register(&cs5535_acpi_drv);
+ if (r)
+ platform_driver_unregister(&cs5535_pms_drv);
+
+ return r;
}
static void __exit olpc_xo1_exit(void)
{
- platform_driver_unregister(&olpc_xo1_driver);
+ platform_driver_unregister(&cs5535_acpi_drv);
+ platform_driver_unregister(&cs5535_pms_drv);
}
MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
new file mode 100644
index 000000000000..dab874647530
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -0,0 +1,183 @@
+/*
+ * OLPC-specific OFW device tree support code.
+ *
+ * Paul Mackerras August 1996.
+ * Copyright (C) 1996-2005 Paul Mackerras.
+ *
+ * Adapted for 64bit PowerPC by Dave Engebretsen and Peter Bergner.
+ * {engebret|bergner}@us.ibm.com
+ *
+ * Adapted for sparc by David S. Miller davem@davemloft.net
+ * Adapted for x86/OLPC by Andres Salomon <dilinger@queued.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/bootmem.h>
+#include <linux/of.h>
+#include <linux/of_pdt.h>
+#include <asm/olpc_ofw.h>
+
+static phandle __init olpc_dt_getsibling(phandle node)
+{
+ const void *args[] = { (void *)node };
+ void *res[] = { &node };
+
+ if ((s32)node == -1)
+ return 0;
+
+ if (olpc_ofw("peer", args, res) || (s32)node == -1)
+ return 0;
+
+ return node;
+}
+
+static phandle __init olpc_dt_getchild(phandle node)
+{
+ const void *args[] = { (void *)node };
+ void *res[] = { &node };
+
+ if ((s32)node == -1)
+ return 0;
+
+ if (olpc_ofw("child", args, res) || (s32)node == -1) {
+ pr_err("PROM: %s: fetching child failed!\n", __func__);
+ return 0;
+ }
+
+ return node;
+}
+
+static int __init olpc_dt_getproplen(phandle node, const char *prop)
+{
+ const void *args[] = { (void *)node, prop };
+ int len;
+ void *res[] = { &len };
+
+ if ((s32)node == -1)
+ return -1;
+
+ if (olpc_ofw("getproplen", args, res)) {
+ pr_err("PROM: %s: getproplen failed!\n", __func__);
+ return -1;
+ }
+
+ return len;
+}
+
+static int __init olpc_dt_getproperty(phandle node, const char *prop,
+ char *buf, int bufsize)
+{
+ int plen;
+
+ plen = olpc_dt_getproplen(node, prop);
+ if (plen > bufsize || plen < 1) {
+ return -1;
+ } else {
+ const void *args[] = { (void *)node, prop, buf, (void *)plen };
+ void *res[] = { &plen };
+
+ if (olpc_ofw("getprop", args, res)) {
+ pr_err("PROM: %s: getprop failed!\n", __func__);
+ return -1;
+ }
+ }
+
+ return plen;
+}
+
+static int __init olpc_dt_nextprop(phandle node, char *prev, char *buf)
+{
+ const void *args[] = { (void *)node, prev, buf };
+ int success;
+ void *res[] = { &success };
+
+ buf[0] = '\0';
+
+ if ((s32)node == -1)
+ return -1;
+
+ if (olpc_ofw("nextprop", args, res) || success != 1)
+ return -1;
+
+ return 0;
+}
+
+static int __init olpc_dt_pkg2path(phandle node, char *buf,
+ const int buflen, int *len)
+{
+ const void *args[] = { (void *)node, buf, (void *)buflen };
+ void *res[] = { len };
+
+ if ((s32)node == -1)
+ return -1;
+
+ if (olpc_ofw("package-to-path", args, res) || *len < 1)
+ return -1;
+
+ return 0;
+}
+
+static unsigned int prom_early_allocated __initdata;
+
+void * __init prom_early_alloc(unsigned long size)
+{
+ static u8 *mem;
+ static size_t free_mem;
+ void *res;
+
+ if (free_mem < size) {
+ const size_t chunk_size = max(PAGE_SIZE, size);
+
+ /*
+ * To mimimize the number of allocations, grab at least
+ * PAGE_SIZE of memory (that's an arbitrary choice that's
+ * fast enough on the platforms we care about while minimizing
+ * wasted bootmem) and hand off chunks of it to callers.
+ */
+ res = alloc_bootmem(chunk_size);
+ if (!res)
+ return NULL;
+ prom_early_allocated += chunk_size;
+ memset(res, 0, chunk_size);
+ free_mem = chunk_size;
+ mem = res;
+ }
+
+ /* allocate from the local cache */
+ free_mem -= size;
+ res = mem;
+ mem += size;
+ return res;
+}
+
+static struct of_pdt_ops prom_olpc_ops __initdata = {
+ .nextprop = olpc_dt_nextprop,
+ .getproplen = olpc_dt_getproplen,
+ .getproperty = olpc_dt_getproperty,
+ .getchild = olpc_dt_getchild,
+ .getsibling = olpc_dt_getsibling,
+ .pkg2path = olpc_dt_pkg2path,
+};
+
+void __init olpc_dt_build_devicetree(void)
+{
+ phandle root;
+
+ if (!olpc_ofw_is_installed())
+ return;
+
+ root = olpc_dt_getsibling(0);
+ if (!root) {
+ pr_err("PROM: unable to get root node from OFW!\n");
+ return;
+ }
+ of_pdt_build_devicetree(root, &prom_olpc_ops);
+
+ pr_info("PROM DT: Built device tree with %u bytes of memory.\n",
+ prom_early_allocated);
+}
diff --git a/arch/x86/platform/olpc/olpc_ofw.c b/arch/x86/platform/olpc/olpc_ofw.c
index 787320464379..e7604f62870d 100644
--- a/arch/x86/platform/olpc/olpc_ofw.c
+++ b/arch/x86/platform/olpc/olpc_ofw.c
@@ -110,3 +110,8 @@ void __init olpc_ofw_detect(void)
(unsigned long)olpc_ofw_cif, (-start) >> 20);
reserve_top_address(-start);
}
+
+bool __init olpc_ofw_is_installed(void)
+{
+ return olpc_ofw_cif != NULL;
+}
diff --git a/arch/x86/platform/sfi/sfi.c b/arch/x86/platform/sfi/sfi.c
index dd4c281ffe57..7785b72ecc3a 100644
--- a/arch/x86/platform/sfi/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -34,23 +34,12 @@
#ifdef CONFIG_X86_LOCAL_APIC
static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
-static void __init mp_sfi_register_lapic_address(unsigned long address)
-{
- mp_lapic_addr = address;
-
- set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
- if (boot_cpu_physical_apicid == -1U)
- boot_cpu_physical_apicid = read_apic_id();
-
- pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid);
-}
-
/* All CPUs enumerated by SFI must be present and enabled */
static void __cpuinit mp_sfi_register_lapic(u8 id)
{
- if (MAX_APICS - id <= 0) {
+ if (MAX_LOCAL_APIC - id <= 0) {
pr_warning("Processor #%d invalid (max %d)\n",
- id, MAX_APICS);
+ id, MAX_LOCAL_APIC);
return;
}
@@ -110,7 +99,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
int __init sfi_platform_init(void)
{
#ifdef CONFIG_X86_LOCAL_APIC
- mp_sfi_register_lapic_address(sfi_lapic_addr);
+ register_lapic_address(sfi_lapic_addr);
sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus);
#endif
#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 20ea20a39e2a..df58e9cad96a 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1341,10 +1341,10 @@ uv_activation_descriptor_init(int node, int pnode)
/*
* each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
- * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
+ * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE)
*/
- bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
- UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
+ bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE
+ * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
BUG_ON(!bau_desc);
pa = uv_gpa(bau_desc); /* need the real nasid*/
@@ -1402,9 +1402,9 @@ uv_payload_queue_init(int node, int pnode)
struct bau_payload_queue_entry *pqp_malloc;
struct bau_control *bcp;
- pqp = (struct bau_payload_queue_entry *) kmalloc_node(
- (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
- GFP_KERNEL, node);
+ pqp = kmalloc_node((DEST_Q_SIZE + 1)
+ * sizeof(struct bau_payload_queue_entry),
+ GFP_KERNEL, node);
BUG_ON(!pqp);
pqp_malloc = pqp;
@@ -1455,7 +1455,7 @@ static void __init uv_init_uvhub(int uvhub, int vector)
* the below initialization can't be in firmware because the
* messaging IRQ will be determined by the OS
*/
- apicid = uvhub_to_first_apicid(uvhub);
+ apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
((apicid << 32) | vector));
}
@@ -1490,7 +1490,7 @@ calculate_destination_timeout(void)
/*
* initialize the bau_control structure for each cpu
*/
-static void __init uv_init_per_cpu(int nuvhubs)
+static int __init uv_init_per_cpu(int nuvhubs)
{
int i;
int cpu;
@@ -1507,7 +1507,7 @@ static void __init uv_init_per_cpu(int nuvhubs)
struct bau_control *smaster = NULL;
struct socket_desc {
short num_cpus;
- short cpu_number[16];
+ short cpu_number[MAX_CPUS_PER_SOCKET];
};
struct uvhub_desc {
unsigned short socket_mask;
@@ -1520,8 +1520,7 @@ static void __init uv_init_per_cpu(int nuvhubs)
timeout_us = calculate_destination_timeout();
- uvhub_descs = (struct uvhub_desc *)
- kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
+ uvhub_descs = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
for_each_present_cpu(cpu) {
@@ -1541,6 +1540,10 @@ static void __init uv_init_per_cpu(int nuvhubs)
sdp = &bdp->socket[socket];
sdp->cpu_number[sdp->num_cpus] = cpu;
sdp->num_cpus++;
+ if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
+ printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus);
+ return 1;
+ }
}
for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
@@ -1571,6 +1574,12 @@ static void __init uv_init_per_cpu(int nuvhubs)
bcp->uvhub_master = hmaster;
bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
blade_processor_id;
+ if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
+ printk(KERN_EMERG
+ "%d cpus per uvhub invalid\n",
+ bcp->uvhub_cpu);
+ return 1;
+ }
}
nextsocket:
socket++;
@@ -1596,6 +1605,7 @@ nextsocket:
bcp->congested_reps = congested_reps;
bcp->congested_period = congested_period;
}
+ return 0;
}
/*
@@ -1626,7 +1636,10 @@ static int __init uv_bau_init(void)
spin_lock_init(&disable_lock);
congested_cycles = microsec_2_cycles(congested_response_us);
- uv_init_per_cpu(nuvhubs);
+ if (uv_init_per_cpu(nuvhubs)) {
+ nobau = 1;
+ return 0;
+ }
uv_partition_base_pnode = 0x7fffffff;
for (uvhub = 0; uvhub < nuvhubs; uvhub++)
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 56e421bc379b..9daf5d1af9f1 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -89,6 +89,7 @@ static void uv_rtc_send_IPI(int cpu)
apicid = cpu_physical_id(cpu);
pnode = uv_apicid_to_pnode(apicid);
+ apicid |= uv_apicid_hibits;
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
(apicid << UVH_IPI_INT_APIC_ID_SHFT) |
(X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
@@ -107,6 +108,7 @@ static int uv_intr_pending(int pnode)
static int uv_setup_intr(int cpu, u64 expires)
{
u64 val;
+ unsigned long apicid = cpu_physical_id(cpu) | uv_apicid_hibits;
int pnode = uv_cpu_to_pnode(cpu);
uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
@@ -117,7 +119,7 @@ static int uv_setup_intr(int cpu, u64 expires)
UVH_EVENT_OCCURRED0_RTC1_MASK);
val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
- ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
+ ((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
/* Set configuration */
uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
diff --git a/arch/x86/platform/visws/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
index 3371bd053b89..632037671746 100644
--- a/arch/x86/platform/visws/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
@@ -171,7 +171,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
ver = m->apicver;
if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) {
printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
- m->apicid, MAX_APICS);
+ m->apicid, MAX_LOCAL_APIC);
return;
}
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 4a2afa1bac51..b6552b189bcd 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -25,7 +25,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
export CPPFLAGS_vdso.lds += -P -C
-VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \
+VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
-Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
@@ -69,7 +69,7 @@ vdso32.so-$(VDSO32-y) += sysenter
vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
-VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1
+VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-soname=linux-gate.so.1
# This makes sure the $(obj) subdirectory exists even though vdso32/
# is not a kbuild sub-make subdirectory.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 779385158915..17c565de3d64 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp)
obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
time.o xen-asm.o xen-asm_$(BITS).o \
- grant-table.o suspend.o platform-pci-unplug.o
+ grant-table.o suspend.o platform-pci-unplug.o \
+ p2m.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 235c0f4d3861..50542efe45fb 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -75,6 +75,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
enum xen_domain_type xen_domain_type = XEN_NATIVE;
EXPORT_SYMBOL_GPL(xen_domain_type);
+unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
+EXPORT_SYMBOL(machine_to_phys_mapping);
+unsigned int machine_to_phys_order;
+EXPORT_SYMBOL(machine_to_phys_order);
+
struct start_info *xen_start_info;
EXPORT_SYMBOL_GPL(xen_start_info);
@@ -569,8 +574,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
preempt_disable();
- start = __get_cpu_var(idt_desc).address;
- end = start + __get_cpu_var(idt_desc).size + 1;
+ start = __this_cpu_read(idt_desc.address);
+ end = start + __this_cpu_read(idt_desc.size) + 1;
xen_mc_flush();
@@ -1016,10 +1021,6 @@ static void xen_reboot(int reason)
{
struct sched_shutdown r = { .reason = reason };
-#ifdef CONFIG_SMP
- stop_other_cpus();
-#endif
-
if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
BUG();
}
@@ -1090,6 +1091,8 @@ static void __init xen_setup_stackprotector(void)
/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
+ struct physdev_set_iopl set_iopl;
+ int rc;
pgd_t *pgd;
if (!xen_start_info)
@@ -1097,6 +1100,8 @@ asmlinkage void __init xen_start_kernel(void)
xen_domain_type = XEN_PV_DOMAIN;
+ xen_setup_machphys_mapping();
+
/* Install Xen paravirt ops */
pv_info = xen_info;
pv_init_ops = xen_init_ops;
@@ -1169,6 +1174,15 @@ asmlinkage void __init xen_start_kernel(void)
xen_smp_init();
+#ifdef CONFIG_ACPI_NUMA
+ /*
+ * The pages we from Xen are not related to machine pages, so
+ * any NUMA information the kernel tries to get from ACPI will
+ * be meaningless. Prevent it from trying.
+ */
+ acpi_numa = -1;
+#endif
+
pgd = (pgd_t *)xen_start_info->pt_base;
if (!xen_initial_domain())
@@ -1180,7 +1194,7 @@ asmlinkage void __init xen_start_kernel(void)
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
local_irq_disable();
- early_boot_irqs_off();
+ early_boot_irqs_disabled = true;
memblock_init();
@@ -1191,8 +1205,6 @@ asmlinkage void __init xen_start_kernel(void)
/* Allocate and initialize top and mid mfn levels for p2m structure */
xen_build_mfn_list_list();
- init_mm.pgd = pgd;
-
/* keep using Xen gdt for now; no urgent need to change it */
#ifdef CONFIG_X86_32
@@ -1202,10 +1214,18 @@ asmlinkage void __init xen_start_kernel(void)
#else
pv_info.kernel_rpl = 0;
#endif
-
/* set the limit of our address space */
xen_reserve_top();
+ /* We used to do this in xen_arch_setup, but that is too late on AMD
+ * were early_cpu_init (run before ->arch_setup()) calls early_amd_init
+ * which pokes 0xcf8 port.
+ */
+ set_iopl.iopl = 1;
+ rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
+ if (rc != 0)
+ xen_raw_printk("physdev_op failed %d\n", rc);
+
#ifdef CONFIG_X86_32
/* set up basic CPUID stuff */
cpu_detect(&new_cpu_data);
@@ -1245,25 +1265,6 @@ asmlinkage void __init xen_start_kernel(void)
#endif
}
-static uint32_t xen_cpuid_base(void)
-{
- uint32_t base, eax, ebx, ecx, edx;
- char signature[13];
-
- for (base = 0x40000000; base < 0x40010000; base += 0x100) {
- cpuid(base, &eax, &ebx, &ecx, &edx);
- *(uint32_t *)(signature + 0) = ebx;
- *(uint32_t *)(signature + 4) = ecx;
- *(uint32_t *)(signature + 8) = edx;
- signature[12] = 0;
-
- if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
- return base;
- }
-
- return 0;
-}
-
static int init_hvm_pv_info(int *major, int *minor)
{
uint32_t eax, ebx, ecx, edx, pages, msr, base;
@@ -1373,6 +1374,18 @@ static bool __init xen_hvm_platform(void)
return true;
}
+bool xen_hvm_need_lapic(void)
+{
+ if (xen_pv_domain())
+ return false;
+ if (!xen_hvm_domain())
+ return false;
+ if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
+ return false;
+ return true;
+}
+EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
+
const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = {
.name = "Xen HVM",
.detect = xen_hvm_platform,
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 9d30105a0c4a..6a6fe8939645 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -126,7 +126,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
#endif
};
-void __init xen_init_irq_ops()
+void __init xen_init_irq_ops(void)
{
pv_irq_ops = xen_irq_ops;
x86_init.irqs.intr_init = xen_init_IRQ;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index c237b810b03f..5e92b61ad574 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -173,371 +173,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
*/
#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
-/*
- * Xen leaves the responsibility for maintaining p2m mappings to the
- * guests themselves, but it must also access and update the p2m array
- * during suspend/resume when all the pages are reallocated.
- *
- * The p2m table is logically a flat array, but we implement it as a
- * three-level tree to allow the address space to be sparse.
- *
- * Xen
- * |
- * p2m_top p2m_top_mfn
- * / \ / \
- * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
- * / \ / \ / /
- * p2m p2m p2m p2m p2m p2m p2m ...
- *
- * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
- *
- * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
- * maximum representable pseudo-physical address space is:
- * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
- *
- * P2M_PER_PAGE depends on the architecture, as a mfn is always
- * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
- * 512 and 1024 entries respectively.
- */
-
-unsigned long xen_max_p2m_pfn __read_mostly;
-
-#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
-#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
-#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
-
-#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
-
-/* Placeholders for holes in the address space */
-static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
-
-static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
-
-RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
-RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
-
-static inline unsigned p2m_top_index(unsigned long pfn)
-{
- BUG_ON(pfn >= MAX_P2M_PFN);
- return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
-}
-
-static inline unsigned p2m_mid_index(unsigned long pfn)
-{
- return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
-}
-
-static inline unsigned p2m_index(unsigned long pfn)
-{
- return pfn % P2M_PER_PAGE;
-}
-
-static void p2m_top_init(unsigned long ***top)
-{
- unsigned i;
-
- for (i = 0; i < P2M_TOP_PER_PAGE; i++)
- top[i] = p2m_mid_missing;
-}
-
-static void p2m_top_mfn_init(unsigned long *top)
-{
- unsigned i;
-
- for (i = 0; i < P2M_TOP_PER_PAGE; i++)
- top[i] = virt_to_mfn(p2m_mid_missing_mfn);
-}
-
-static void p2m_top_mfn_p_init(unsigned long **top)
-{
- unsigned i;
-
- for (i = 0; i < P2M_TOP_PER_PAGE; i++)
- top[i] = p2m_mid_missing_mfn;
-}
-
-static void p2m_mid_init(unsigned long **mid)
-{
- unsigned i;
-
- for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = p2m_missing;
-}
-
-static void p2m_mid_mfn_init(unsigned long *mid)
-{
- unsigned i;
-
- for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = virt_to_mfn(p2m_missing);
-}
-
-static void p2m_init(unsigned long *p2m)
-{
- unsigned i;
-
- for (i = 0; i < P2M_MID_PER_PAGE; i++)
- p2m[i] = INVALID_P2M_ENTRY;
-}
-
-/*
- * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
- *
- * This is called both at boot time, and after resuming from suspend:
- * - At boot time we're called very early, and must use extend_brk()
- * to allocate memory.
- *
- * - After resume we're called from within stop_machine, but the mfn
- * tree should alreay be completely allocated.
- */
-void xen_build_mfn_list_list(void)
-{
- unsigned long pfn;
-
- /* Pre-initialize p2m_top_mfn to be completely missing */
- if (p2m_top_mfn == NULL) {
- p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(p2m_mid_missing_mfn);
-
- p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_top_mfn_p_init(p2m_top_mfn_p);
-
- p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_top_mfn_init(p2m_top_mfn);
- } else {
- /* Reinitialise, mfn's all change after migration */
- p2m_mid_mfn_init(p2m_mid_missing_mfn);
- }
-
- for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx = p2m_mid_index(pfn);
- unsigned long **mid;
- unsigned long *mid_mfn_p;
-
- mid = p2m_top[topidx];
- mid_mfn_p = p2m_top_mfn_p[topidx];
-
- /* Don't bother allocating any mfn mid levels if
- * they're just missing, just update the stored mfn,
- * since all could have changed over a migrate.
- */
- if (mid == p2m_mid_missing) {
- BUG_ON(mididx);
- BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
- p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
- pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
- continue;
- }
-
- if (mid_mfn_p == p2m_mid_missing_mfn) {
- /*
- * XXX boot-time only! We should never find
- * missing parts of the mfn tree after
- * runtime. extend_brk() will BUG if we call
- * it too late.
- */
- mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_mfn_init(mid_mfn_p);
-
- p2m_top_mfn_p[topidx] = mid_mfn_p;
- }
-
- p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
- mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
- }
-}
-
-void xen_setup_mfn_list_list(void)
-{
- BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
-
- HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
- virt_to_mfn(p2m_top_mfn);
- HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
-}
-
-/* Set up p2m_top to point to the domain-builder provided p2m pages */
-void __init xen_build_dynamic_phys_to_machine(void)
-{
- unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
- unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
- unsigned long pfn;
-
- xen_max_p2m_pfn = max_pfn;
-
- p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_init(p2m_missing);
-
- p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(p2m_mid_missing);
-
- p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_top_init(p2m_top);
-
- /*
- * The domain builder gives us a pre-constructed p2m array in
- * mfn_list for all the pages initially given to us, so we just
- * need to graft that into our tree structure.
- */
- for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx = p2m_mid_index(pfn);
-
- if (p2m_top[topidx] == p2m_mid_missing) {
- unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(mid);
-
- p2m_top[topidx] = mid;
- }
-
- p2m_top[topidx][mididx] = &mfn_list[pfn];
- }
-}
-
-unsigned long get_phys_to_machine(unsigned long pfn)
-{
- unsigned topidx, mididx, idx;
-
- if (unlikely(pfn >= MAX_P2M_PFN))
- return INVALID_P2M_ENTRY;
-
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
- idx = p2m_index(pfn);
-
- return p2m_top[topidx][mididx][idx];
-}
-EXPORT_SYMBOL_GPL(get_phys_to_machine);
-
-static void *alloc_p2m_page(void)
-{
- return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
-}
-
-static void free_p2m_page(void *p)
-{
- free_page((unsigned long)p);
-}
-
-/*
- * Fully allocate the p2m structure for a given pfn. We need to check
- * that both the top and mid levels are allocated, and make sure the
- * parallel mfn tree is kept in sync. We may race with other cpus, so
- * the new pages are installed with cmpxchg; if we lose the race then
- * simply free the page we allocated and use the one that's there.
- */
-static bool alloc_p2m(unsigned long pfn)
-{
- unsigned topidx, mididx;
- unsigned long ***top_p, **mid;
- unsigned long *top_mfn_p, *mid_mfn;
-
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
-
- top_p = &p2m_top[topidx];
- mid = *top_p;
-
- if (mid == p2m_mid_missing) {
- /* Mid level is missing, allocate a new one */
- mid = alloc_p2m_page();
- if (!mid)
- return false;
-
- p2m_mid_init(mid);
-
- if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
- free_p2m_page(mid);
- }
-
- top_mfn_p = &p2m_top_mfn[topidx];
- mid_mfn = p2m_top_mfn_p[topidx];
-
- BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
-
- if (mid_mfn == p2m_mid_missing_mfn) {
- /* Separately check the mid mfn level */
- unsigned long missing_mfn;
- unsigned long mid_mfn_mfn;
-
- mid_mfn = alloc_p2m_page();
- if (!mid_mfn)
- return false;
-
- p2m_mid_mfn_init(mid_mfn);
-
- missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
- mid_mfn_mfn = virt_to_mfn(mid_mfn);
- if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
- free_p2m_page(mid_mfn);
- else
- p2m_top_mfn_p[topidx] = mid_mfn;
- }
-
- if (p2m_top[topidx][mididx] == p2m_missing) {
- /* p2m leaf page is missing */
- unsigned long *p2m;
-
- p2m = alloc_p2m_page();
- if (!p2m)
- return false;
-
- p2m_init(p2m);
-
- if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
- free_p2m_page(p2m);
- else
- mid_mfn[mididx] = virt_to_mfn(p2m);
- }
-
- return true;
-}
-
-/* Try to install p2m mapping; fail if intermediate bits missing */
-bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
- unsigned topidx, mididx, idx;
-
- if (unlikely(pfn >= MAX_P2M_PFN)) {
- BUG_ON(mfn != INVALID_P2M_ENTRY);
- return true;
- }
-
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
- idx = p2m_index(pfn);
-
- if (p2m_top[topidx][mididx] == p2m_missing)
- return mfn == INVALID_P2M_ENTRY;
-
- p2m_top[topidx][mididx][idx] = mfn;
-
- return true;
-}
-
-bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
- if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
- BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
- return true;
- }
-
- if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
- if (!alloc_p2m(pfn))
- return false;
-
- if (!__set_phys_to_machine(pfn, mfn))
- return false;
- }
-
- return true;
-}
-
unsigned long arbitrary_virt_to_mfn(void *vaddr)
{
xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
@@ -566,6 +201,7 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
offset = address & ~PAGE_MASK;
return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
+EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
void make_lowmem_page_readonly(void *vaddr)
{
@@ -2034,6 +1670,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
set_page_prot(pmd, PAGE_KERNEL_RO);
}
+void __init xen_setup_machphys_mapping(void)
+{
+ struct xen_machphys_mapping mapping;
+ unsigned long machine_to_phys_nr_ents;
+
+ if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
+ machine_to_phys_mapping = (unsigned long *)mapping.v_start;
+ machine_to_phys_nr_ents = mapping.max_mfn + 1;
+ } else {
+ machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
+ }
+ machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
+}
+
#ifdef CONFIG_X86_64
static void convert_pfn_mfn(void *v)
{
@@ -2119,44 +1769,83 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
return pgd;
}
#else /* !CONFIG_X86_64 */
-static RESERVE_BRK_ARRAY(pmd_t, level2_kernel_pgt, PTRS_PER_PMD);
+static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
+static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
+
+static __init void xen_write_cr3_init(unsigned long cr3)
+{
+ unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
+
+ BUG_ON(read_cr3() != __pa(initial_page_table));
+ BUG_ON(cr3 != __pa(swapper_pg_dir));
+
+ /*
+ * We are switching to swapper_pg_dir for the first time (from
+ * initial_page_table) and therefore need to mark that page
+ * read-only and then pin it.
+ *
+ * Xen disallows sharing of kernel PMDs for PAE
+ * guests. Therefore we must copy the kernel PMD from
+ * initial_page_table into a new kernel PMD to be used in
+ * swapper_pg_dir.
+ */
+ swapper_kernel_pmd =
+ extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
+ memcpy(swapper_kernel_pmd, initial_kernel_pmd,
+ sizeof(pmd_t) * PTRS_PER_PMD);
+ swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
+ __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
+ set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
+
+ set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+ xen_write_cr3(cr3);
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
+
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
+ PFN_DOWN(__pa(initial_page_table)));
+ set_page_prot(initial_page_table, PAGE_KERNEL);
+ set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
+
+ pv_mmu_ops.write_cr3 = &xen_write_cr3;
+}
__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
unsigned long max_pfn)
{
pmd_t *kernel_pmd;
- level2_kernel_pgt = extend_brk(sizeof(pmd_t *) * PTRS_PER_PMD, PAGE_SIZE);
+ initial_kernel_pmd =
+ extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
xen_start_info->nr_pt_frames * PAGE_SIZE +
512*1024);
kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
- memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
+ memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
- xen_map_identity_early(level2_kernel_pgt, max_pfn);
+ xen_map_identity_early(initial_kernel_pmd, max_pfn);
- memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
- set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
- __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+ memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+ initial_page_table[KERNEL_PGD_BOUNDARY] =
+ __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
- set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+ set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
+ set_page_prot(initial_page_table, PAGE_KERNEL_RO);
set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
- xen_write_cr3(__pa(swapper_pg_dir));
-
- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
+ PFN_DOWN(__pa(initial_page_table)));
+ xen_write_cr3(__pa(initial_page_table));
memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
__pa(xen_start_info->pt_base +
xen_start_info->nr_pt_frames * PAGE_SIZE),
"XEN PAGETABLES");
- return swapper_pg_dir;
+ return initial_page_table;
}
#endif /* CONFIG_X86_64 */
@@ -2290,7 +1979,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
.write_cr2 = xen_write_cr2,
.read_cr3 = xen_read_cr3,
+#ifdef CONFIG_X86_32
+ .write_cr3 = xen_write_cr3_init,
+#else
.write_cr3 = xen_write_cr3,
+#endif
.flush_tlb_user = xen_flush_tlb,
.flush_tlb_kernel = xen_flush_tlb,
@@ -2358,8 +2051,6 @@ void __init xen_init_mmu_ops(void)
x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
pv_mmu_ops = xen_mmu_ops;
- vmap_lazy_unmap = false;
-
memset(dummy_mapping, 0xff, PAGE_SIZE);
}
@@ -2627,7 +2318,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
- vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+ BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
+ (VM_PFNMAP | VM_RESERVED | VM_IO)));
rmd.mfn = mfn;
rmd.prot = prot;
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 9e565da5d1f7..4ec8035e3216 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -22,7 +22,7 @@ static inline void xen_mc_batch(void)
unsigned long flags;
/* need to disable interrupts until this entry is complete */
local_irq_save(flags);
- __get_cpu_var(xen_mc_irq_flags) = flags;
+ __this_cpu_write(xen_mc_irq_flags, flags);
}
static inline struct multicall_space xen_mc_entry(size_t args)
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644
index 000000000000..fd12d7ce7ff9
--- /dev/null
+++ b/arch/x86/xen/p2m.c
@@ -0,0 +1,522 @@
+/*
+ * Xen leaves the responsibility for maintaining p2m mappings to the
+ * guests themselves, but it must also access and update the p2m array
+ * during suspend/resume when all the pages are reallocated.
+ *
+ * The p2m table is logically a flat array, but we implement it as a
+ * three-level tree to allow the address space to be sparse.
+ *
+ * Xen
+ * |
+ * p2m_top p2m_top_mfn
+ * / \ / \
+ * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
+ * / \ / \ / /
+ * p2m p2m p2m p2m p2m p2m p2m ...
+ *
+ * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
+ *
+ * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
+ * maximum representable pseudo-physical address space is:
+ * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
+ *
+ * P2M_PER_PAGE depends on the architecture, as a mfn is always
+ * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
+ * 512 and 1024 entries respectively.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
+
+#include <asm/cache.h>
+#include <asm/setup.h>
+
+#include <asm/xen/page.h>
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include "xen-ops.h"
+
+static void __init m2p_override_init(void);
+
+unsigned long xen_max_p2m_pfn __read_mostly;
+
+#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
+#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
+/* Placeholders for holes in the address space */
+static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
+
+static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
+static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
+
+RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
+
+static inline unsigned p2m_top_index(unsigned long pfn)
+{
+ BUG_ON(pfn >= MAX_P2M_PFN);
+ return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
+}
+
+static inline unsigned p2m_mid_index(unsigned long pfn)
+{
+ return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
+}
+
+static inline unsigned p2m_index(unsigned long pfn)
+{
+ return pfn % P2M_PER_PAGE;
+}
+
+static void p2m_top_init(unsigned long ***top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing;
+}
+
+static void p2m_top_mfn_init(unsigned long *top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = virt_to_mfn(p2m_mid_missing_mfn);
+}
+
+static void p2m_top_mfn_p_init(unsigned long **top)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_TOP_PER_PAGE; i++)
+ top[i] = p2m_mid_missing_mfn;
+}
+
+static void p2m_mid_init(unsigned long **mid)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = p2m_missing;
+}
+
+static void p2m_mid_mfn_init(unsigned long *mid)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ mid[i] = virt_to_mfn(p2m_missing);
+}
+
+static void p2m_init(unsigned long *p2m)
+{
+ unsigned i;
+
+ for (i = 0; i < P2M_MID_PER_PAGE; i++)
+ p2m[i] = INVALID_P2M_ENTRY;
+}
+
+/*
+ * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
+ *
+ * This is called both at boot time, and after resuming from suspend:
+ * - At boot time we're called very early, and must use extend_brk()
+ * to allocate memory.
+ *
+ * - After resume we're called from within stop_machine, but the mfn
+ * tree should alreay be completely allocated.
+ */
+void xen_build_mfn_list_list(void)
+{
+ unsigned long pfn;
+
+ /* Pre-initialize p2m_top_mfn to be completely missing */
+ if (p2m_top_mfn == NULL) {
+ p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(p2m_mid_missing_mfn);
+
+ p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_p_init(p2m_top_mfn_p);
+
+ p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_init(p2m_top_mfn);
+ } else {
+ /* Reinitialise, mfn's all change after migration */
+ p2m_mid_mfn_init(p2m_mid_missing_mfn);
+ }
+
+ for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+ unsigned long **mid;
+ unsigned long *mid_mfn_p;
+
+ mid = p2m_top[topidx];
+ mid_mfn_p = p2m_top_mfn_p[topidx];
+
+ /* Don't bother allocating any mfn mid levels if
+ * they're just missing, just update the stored mfn,
+ * since all could have changed over a migrate.
+ */
+ if (mid == p2m_mid_missing) {
+ BUG_ON(mididx);
+ BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
+ p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
+ pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
+ continue;
+ }
+
+ if (mid_mfn_p == p2m_mid_missing_mfn) {
+ /*
+ * XXX boot-time only! We should never find
+ * missing parts of the mfn tree after
+ * runtime. extend_brk() will BUG if we call
+ * it too late.
+ */
+ mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_mfn_init(mid_mfn_p);
+
+ p2m_top_mfn_p[topidx] = mid_mfn_p;
+ }
+
+ p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
+ mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
+ }
+}
+
+void xen_setup_mfn_list_list(void)
+{
+ BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
+
+ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
+ virt_to_mfn(p2m_top_mfn);
+ HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
+}
+
+/* Set up p2m_top to point to the domain-builder provided p2m pages */
+void __init xen_build_dynamic_phys_to_machine(void)
+{
+ unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
+ unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
+ unsigned long pfn;
+
+ xen_max_p2m_pfn = max_pfn;
+
+ p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_init(p2m_missing);
+
+ p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(p2m_mid_missing);
+
+ p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_init(p2m_top);
+
+ /*
+ * The domain builder gives us a pre-constructed p2m array in
+ * mfn_list for all the pages initially given to us, so we just
+ * need to graft that into our tree structure.
+ */
+ for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
+ unsigned topidx = p2m_top_index(pfn);
+ unsigned mididx = p2m_mid_index(pfn);
+
+ if (p2m_top[topidx] == p2m_mid_missing) {
+ unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_init(mid);
+
+ p2m_top[topidx] = mid;
+ }
+
+ /*
+ * As long as the mfn_list has enough entries to completely
+ * fill a p2m page, pointing into the array is ok. But if
+ * not the entries beyond the last pfn will be undefined.
+ */
+ if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
+ unsigned long p2midx;
+
+ p2midx = max_pfn % P2M_PER_PAGE;
+ for ( ; p2midx < P2M_PER_PAGE; p2midx++)
+ mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
+ }
+ p2m_top[topidx][mididx] = &mfn_list[pfn];
+ }
+
+ m2p_override_init();
+}
+
+unsigned long get_phys_to_machine(unsigned long pfn)
+{
+ unsigned topidx, mididx, idx;
+
+ if (unlikely(pfn >= MAX_P2M_PFN))
+ return INVALID_P2M_ENTRY;
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ return p2m_top[topidx][mididx][idx];
+}
+EXPORT_SYMBOL_GPL(get_phys_to_machine);
+
+static void *alloc_p2m_page(void)
+{
+ return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
+}
+
+static void free_p2m_page(void *p)
+{
+ free_page((unsigned long)p);
+}
+
+/*
+ * Fully allocate the p2m structure for a given pfn. We need to check
+ * that both the top and mid levels are allocated, and make sure the
+ * parallel mfn tree is kept in sync. We may race with other cpus, so
+ * the new pages are installed with cmpxchg; if we lose the race then
+ * simply free the page we allocated and use the one that's there.
+ */
+static bool alloc_p2m(unsigned long pfn)
+{
+ unsigned topidx, mididx;
+ unsigned long ***top_p, **mid;
+ unsigned long *top_mfn_p, *mid_mfn;
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+
+ top_p = &p2m_top[topidx];
+ mid = *top_p;
+
+ if (mid == p2m_mid_missing) {
+ /* Mid level is missing, allocate a new one */
+ mid = alloc_p2m_page();
+ if (!mid)
+ return false;
+
+ p2m_mid_init(mid);
+
+ if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
+ free_p2m_page(mid);
+ }
+
+ top_mfn_p = &p2m_top_mfn[topidx];
+ mid_mfn = p2m_top_mfn_p[topidx];
+
+ BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
+
+ if (mid_mfn == p2m_mid_missing_mfn) {
+ /* Separately check the mid mfn level */
+ unsigned long missing_mfn;
+ unsigned long mid_mfn_mfn;
+
+ mid_mfn = alloc_p2m_page();
+ if (!mid_mfn)
+ return false;
+
+ p2m_mid_mfn_init(mid_mfn);
+
+ missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
+ mid_mfn_mfn = virt_to_mfn(mid_mfn);
+ if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
+ free_p2m_page(mid_mfn);
+ else
+ p2m_top_mfn_p[topidx] = mid_mfn;
+ }
+
+ if (p2m_top[topidx][mididx] == p2m_missing) {
+ /* p2m leaf page is missing */
+ unsigned long *p2m;
+
+ p2m = alloc_p2m_page();
+ if (!p2m)
+ return false;
+
+ p2m_init(p2m);
+
+ if (cmpxchg(&mid[mididx], p2m_missing, p2m) != p2m_missing)
+ free_p2m_page(p2m);
+ else
+ mid_mfn[mididx] = virt_to_mfn(p2m);
+ }
+
+ return true;
+}
+
+/* Try to install p2m mapping; fail if intermediate bits missing */
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ unsigned topidx, mididx, idx;
+
+ if (unlikely(pfn >= MAX_P2M_PFN)) {
+ BUG_ON(mfn != INVALID_P2M_ENTRY);
+ return true;
+ }
+
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
+ idx = p2m_index(pfn);
+
+ if (p2m_top[topidx][mididx] == p2m_missing)
+ return mfn == INVALID_P2M_ENTRY;
+
+ p2m_top[topidx][mididx][idx] = mfn;
+
+ return true;
+}
+
+bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+ if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+ BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+ return true;
+ }
+
+ if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
+ if (!alloc_p2m(pfn))
+ return false;
+
+ if (!__set_phys_to_machine(pfn, mfn))
+ return false;
+ }
+
+ return true;
+}
+
+#define M2P_OVERRIDE_HASH_SHIFT 10
+#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
+
+static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
+static DEFINE_SPINLOCK(m2p_override_lock);
+
+static void __init m2p_override_init(void)
+{
+ unsigned i;
+
+ m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
+ sizeof(unsigned long));
+
+ for (i = 0; i < M2P_OVERRIDE_HASH; i++)
+ INIT_LIST_HEAD(&m2p_overrides[i]);
+}
+
+static unsigned long mfn_hash(unsigned long mfn)
+{
+ return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
+}
+
+/* Add an MFN override for a particular page */
+int m2p_add_override(unsigned long mfn, struct page *page)
+{
+ unsigned long flags;
+ unsigned long pfn;
+ unsigned long address;
+ unsigned level;
+ pte_t *ptep = NULL;
+
+ pfn = page_to_pfn(page);
+ if (!PageHighMem(page)) {
+ address = (unsigned long)__va(pfn << PAGE_SHIFT);
+ ptep = lookup_address(address, &level);
+
+ if (WARN(ptep == NULL || level != PG_LEVEL_4K,
+ "m2p_add_override: pfn %lx not mapped", pfn))
+ return -EINVAL;
+ }
+
+ page->private = mfn;
+ page->index = pfn_to_mfn(pfn);
+
+ __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
+ if (!PageHighMem(page))
+ /* Just zap old mapping for now */
+ pte_clear(&init_mm, address, ptep);
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+ list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+ return 0;
+}
+
+int m2p_remove_override(struct page *page)
+{
+ unsigned long flags;
+ unsigned long mfn;
+ unsigned long pfn;
+ unsigned long address;
+ unsigned level;
+ pte_t *ptep = NULL;
+
+ pfn = page_to_pfn(page);
+ mfn = get_phys_to_machine(pfn);
+ if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
+ return -EINVAL;
+
+ if (!PageHighMem(page)) {
+ address = (unsigned long)__va(pfn << PAGE_SHIFT);
+ ptep = lookup_address(address, &level);
+
+ if (WARN(ptep == NULL || level != PG_LEVEL_4K,
+ "m2p_remove_override: pfn %lx not mapped", pfn))
+ return -EINVAL;
+ }
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+ list_del(&page->lru);
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+ __set_phys_to_machine(pfn, page->index);
+
+ if (!PageHighMem(page))
+ set_pte_at(&init_mm, address, ptep,
+ pfn_pte(pfn, PAGE_KERNEL));
+ /* No tlb flush necessary because the caller already
+ * left the pte unmapped. */
+
+ return 0;
+}
+
+struct page *m2p_find_override(unsigned long mfn)
+{
+ unsigned long flags;
+ struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
+ struct page *p, *ret;
+
+ ret = NULL;
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+
+ list_for_each_entry(p, bucket, lru) {
+ if (p->private == mfn) {
+ ret = p;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+ return ret;
+}
+
+unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
+{
+ struct page *p = m2p_find_override(mfn);
+ unsigned long ret = pfn;
+
+ if (p)
+ ret = page_to_pfn(p);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 0f456386cce5..25c52f94a27c 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -68,7 +68,7 @@ static int __init check_platform_magic(void)
return 0;
}
-void __init xen_unplug_emulated_devices(void)
+void xen_unplug_emulated_devices(void)
{
int r;
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index b1dbdaa23ecc..a8a66a50d446 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -23,7 +23,6 @@
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
-#include <xen/interface/memory.h>
#include <xen/features.h>
#include "xen-ops.h"
@@ -118,16 +117,18 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
const struct e820map *e820)
{
phys_addr_t max_addr = PFN_PHYS(max_pfn);
- phys_addr_t last_end = 0;
+ phys_addr_t last_end = ISA_END_ADDRESS;
unsigned long released = 0;
int i;
+ /* Free any unused memory above the low 1Mbyte. */
for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
phys_addr_t end = e820->map[i].addr;
end = min(max_addr, end);
- released += xen_release_chunk(last_end, end);
- last_end = e820->map[i].addr + e820->map[i].size;
+ if (last_end < end)
+ released += xen_release_chunk(last_end, end);
+ last_end = max(last_end, e820->map[i].addr + e820->map[i].size);
}
if (last_end < max_addr)
@@ -164,6 +165,7 @@ char * __init xen_memory_setup(void)
XENMEM_memory_map;
rc = HYPERVISOR_memory_op(op, &memmap);
if (rc == -ENOSYS) {
+ BUG_ON(xen_initial_domain());
memmap.nr_entries = 1;
map[0].addr = 0ULL;
map[0].size = mem_end;
@@ -177,36 +179,39 @@ char * __init xen_memory_setup(void)
e820.nr_map = 0;
xen_extra_mem_start = mem_end;
for (i = 0; i < memmap.nr_entries; i++) {
- unsigned long long end = map[i].addr + map[i].size;
+ unsigned long long end;
+
+ /* Guard against non-page aligned E820 entries. */
+ if (map[i].type == E820_RAM)
+ map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;
- if (map[i].type == E820_RAM) {
- if (map[i].addr < mem_end && end > mem_end) {
- /* Truncate region to max_mem. */
- u64 delta = end - mem_end;
+ end = map[i].addr + map[i].size;
+ if (map[i].type == E820_RAM && end > mem_end) {
+ /* RAM off the end - may be partially included */
+ u64 delta = min(map[i].size, end - mem_end);
- map[i].size -= delta;
- extra_pages += PFN_DOWN(delta);
+ map[i].size -= delta;
+ end -= delta;
- end = mem_end;
- }
+ extra_pages += PFN_DOWN(delta);
}
- if (end > xen_extra_mem_start)
+ if (map[i].size > 0 && end > xen_extra_mem_start)
xen_extra_mem_start = end;
- /* If region is non-RAM or below mem_end, add what remains */
- if ((map[i].type != E820_RAM || map[i].addr < mem_end) &&
- map[i].size > 0)
+ /* Add region if any remains */
+ if (map[i].size > 0)
e820_add_region(map[i].addr, map[i].size, map[i].type);
}
/*
- * Even though this is normal, usable memory under Xen, reserve
- * ISA memory anyway because too many things think they can poke
+ * In domU, the ISA region is normal, usable memory, but we
+ * reserve ISA memory anyway because too many things poke
* about in there.
*
- * In a dom0 kernel, this region is identity mapped with the
- * hardware ISA area, so it really is out of bounds.
+ * In Dom0, the host E820 information can leave gaps in the
+ * ISA range, which would cause us to release those pages. To
+ * avoid this, we unconditionally reserve them here.
*/
e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
E820_RESERVED);
@@ -244,26 +249,11 @@ char * __init xen_memory_setup(void)
else
extra_pages = 0;
- if (!xen_initial_domain())
- xen_add_extra_mem(extra_pages);
+ xen_add_extra_mem(extra_pages);
return "Xen";
}
-static void xen_idle(void)
-{
- local_irq_disable();
-
- if (need_resched())
- local_irq_enable();
- else {
- current_thread_info()->status &= ~TS_POLLING;
- smp_mb__after_clear_bit();
- safe_halt();
- current_thread_info()->status |= TS_POLLING;
- }
-}
-
/*
* Set the bit indicating "nosegneg" library variants should be used.
* We only need to bother in pure 32-bit mode; compat 32-bit processes
@@ -333,9 +323,6 @@ void __cpuinit xen_enable_syscall(void)
void __init xen_arch_setup(void)
{
- struct physdev_set_iopl set_iopl;
- int rc;
-
xen_panic_handler_init();
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
@@ -352,11 +339,6 @@ void __init xen_arch_setup(void)
xen_enable_sysenter();
xen_enable_syscall();
- set_iopl.iopl = 1;
- rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
- if (rc != 0)
- printk(KERN_INFO "physdev_op failed %d\n", rc);
-
#ifdef CONFIG_ACPI
if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
@@ -368,7 +350,12 @@ void __init xen_arch_setup(void)
MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
- pm_idle = xen_idle;
+ /* Set up idle, making sure it calls safe_halt() pvop */
+#ifdef CONFIG_X86_32
+ boot_cpu_data.hlt_works_ok = 1;
+#endif
+ pm_idle = default_idle;
+ boot_option_idle_override = IDLE_HALT;
fiddle_vdso();
}
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 23e061b9327b..cc9b1e182fcf 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -159,8 +159,8 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
{
struct xen_spinlock *prev;
- prev = __get_cpu_var(lock_spinners);
- __get_cpu_var(lock_spinners) = xl;
+ prev = __this_cpu_read(lock_spinners);
+ __this_cpu_write(lock_spinners, xl);
wmb(); /* set lock of interest before count */
@@ -179,14 +179,14 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock
asm(LOCK_PREFIX " decw %0"
: "+m" (xl->spinners) : : "memory");
wmb(); /* decrement count before restoring lock */
- __get_cpu_var(lock_spinners) = prev;
+ __this_cpu_write(lock_spinners, prev);
}
static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
{
struct xen_spinlock *xl = (struct xen_spinlock *)lock;
struct xen_spinlock *prev;
- int irq = __get_cpu_var(lock_kicker_irq);
+ int irq = __this_cpu_read(lock_kicker_irq);
int ret;
u64 start;
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 1d789d56877c..9bbd63a129b5 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -31,6 +31,7 @@ void xen_hvm_post_suspend(int suspend_cancelled)
int cpu;
xen_hvm_init_shared_info();
xen_callback_vector();
+ xen_unplug_emulated_devices();
if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
for_each_online_cpu(cpu) {
xen_setup_runstate_info(cpu);
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index b2bb5aa3b054..067759e3d6a5 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -135,24 +135,24 @@ static void do_stolen_accounting(void)
/* Add the appropriate number of ticks of stolen time,
including any left-overs from last time. */
- stolen = runnable + offline + __get_cpu_var(xen_residual_stolen);
+ stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);
if (stolen < 0)
stolen = 0;
ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
- __get_cpu_var(xen_residual_stolen) = stolen;
+ __this_cpu_write(xen_residual_stolen, stolen);
account_steal_ticks(ticks);
/* Add the appropriate number of ticks of blocked time,
including any left-overs from last time. */
- blocked += __get_cpu_var(xen_residual_blocked);
+ blocked += __this_cpu_read(xen_residual_blocked);
if (blocked < 0)
blocked = 0;
ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
- __get_cpu_var(xen_residual_blocked) = blocked;
+ __this_cpu_write(xen_residual_blocked, blocked);
account_idle_ticks(ticks);
}
@@ -426,6 +426,8 @@ void xen_timer_resume(void)
{
int cpu;
+ pvclock_resume();
+
if (xen_clockevent != &xen_vcpuop_clockevent)
return;
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 64044747348e..9d41bf985757 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -43,7 +43,7 @@ void xen_vcpu_restore(void);
void xen_callback_vector(void);
void xen_hvm_init_shared_info(void);
-void __init xen_unplug_emulated_devices(void);
+void xen_unplug_emulated_devices(void);
void __init xen_build_dynamic_phys_to_machine(void);