summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2017-12-06 23:37:06 +0100
committerIngo Molnar <mingo@kernel.org>2017-12-06 23:37:06 +0100
commitd0300e5e8d2fb74852f8116f26546e12cfa66735 (patch)
treea6580aa74a0ebd7965dbc37767717b4f73dd9d79 /arch/x86
parentMerge tag 'perf-core-for-mingo-4.16-20171206' of git://git.kernel.org/pub/scm... (diff)
parenttooling/headers: Synchronize updated s390 and x86 UAPI headers (diff)
downloadlinux-d0300e5e8d2fb74852f8116f26546e12cfa66735.tar.xz
linux-d0300e5e8d2fb74852f8116f26546e12cfa66735.zip
Merge branch 'perf/urgent' into perf/core, to pick up fixes and to refresh to v4.15
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig17
-rw-r--r--arch/x86/Makefile5
-rw-r--r--arch/x86/boot/compressed/kaslr.c5
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c10
-rw-r--r--arch/x86/crypto/crc32-pclmul_asm.S17
-rw-r--r--arch/x86/entry/entry_64.S14
-rw-r--r--arch/x86/entry/vdso/Makefile4
-rw-r--r--arch/x86/entry/vdso/vma.c2
-rw-r--r--arch/x86/events/intel/uncore.c4
-rw-r--r--arch/x86/events/intel/uncore.h2
-rw-r--r--arch/x86/events/intel/uncore_snbep.c10
-rw-r--r--arch/x86/hyperv/hv_init.c15
-rw-r--r--arch/x86/include/asm/compat.h1
-rw-r--r--arch/x86/include/asm/dma-mapping.h8
-rw-r--r--arch/x86/include/asm/elf.h1
-rw-r--r--arch/x86/include/asm/fixmap.h6
-rw-r--r--arch/x86/include/asm/hw_irq.h8
-rw-r--r--arch/x86/include/asm/hypertransport.h46
-rw-r--r--arch/x86/include/asm/insn-eval.h2
-rw-r--r--arch/x86/include/asm/io.h4
-rw-r--r--arch/x86/include/asm/irqdomain.h6
-rw-r--r--arch/x86/include/asm/kmemcheck.h42
-rw-r--r--arch/x86/include/asm/kvm_emulate.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h11
-rw-r--r--arch/x86/include/asm/mshyperv.h2
-rw-r--r--arch/x86/include/asm/pci.h2
-rw-r--r--arch/x86/include/asm/pgtable.h13
-rw-r--r--arch/x86/include/asm/pgtable_types.h13
-rw-r--r--arch/x86/include/asm/processor.h1
-rw-r--r--arch/x86/include/asm/pvclock.h19
-rw-r--r--arch/x86/include/asm/string_32.h9
-rw-r--r--arch/x86/include/asm/string_64.h8
-rw-r--r--arch/x86/include/asm/vmx.h4
-rw-r--r--arch/x86/include/asm/xen/cpuid.h42
-rw-r--r--arch/x86/include/asm/xen/page.h11
-rw-r--r--arch/x86/include/asm/xor.h5
-rw-r--r--arch/x86/kernel/acpi/apei.c5
-rw-r--r--arch/x86/kernel/acpi/boot.c61
-rw-r--r--arch/x86/kernel/apic/Makefile1
-rw-r--r--arch/x86/kernel/apic/htirq.c198
-rw-r--r--arch/x86/kernel/apic/vector.c5
-rw-r--r--arch/x86/kernel/cpu/Makefile2
-rw-r--r--arch/x86/kernel/cpu/aperfmperf.c74
-rw-r--r--arch/x86/kernel/cpu/common.c2
-rw-r--r--arch/x86/kernel/cpu/cpu.h3
-rw-r--r--arch/x86/kernel/cpu/intel.c15
-rw-r--r--arch/x86/kernel/cpu/proc.c6
-rw-r--r--arch/x86/kernel/espfix_64.c2
-rw-r--r--arch/x86/kernel/kvmclock.c7
-rw-r--r--arch/x86/kernel/mpparse.c6
-rw-r--r--arch/x86/kernel/pvclock.c14
-rw-r--r--arch/x86/kernel/smpboot.c128
-rw-r--r--arch/x86/kernel/sys_x86_64.c10
-rw-r--r--arch/x86/kernel/traps.c5
-rw-r--r--arch/x86/kernel/umip.c88
-rw-r--r--arch/x86/kvm/cpuid.h2
-rw-r--r--arch/x86/kvm/emulate.c48
-rw-r--r--arch/x86/kvm/ioapic.c34
-rw-r--r--arch/x86/kvm/lapic.c103
-rw-r--r--arch/x86/kvm/mmu.c115
-rw-r--r--arch/x86/kvm/mmu.h3
-rw-r--r--arch/x86/kvm/paging_tmpl.h18
-rw-r--r--arch/x86/kvm/svm.c259
-rw-r--r--arch/x86/kvm/vmx.c442
-rw-r--r--arch/x86/kvm/x86.c136
-rw-r--r--arch/x86/lib/insn-eval.c4
-rw-r--r--arch/x86/lib/x86-opcode-map.txt2
-rw-r--r--arch/x86/mm/Makefile2
-rw-r--r--arch/x86/mm/extable.c2
-rw-r--r--arch/x86/mm/fault.c8
-rw-r--r--arch/x86/mm/hugetlbpage.c11
-rw-r--r--arch/x86/mm/init.c8
-rw-r--r--arch/x86/mm/init_64.c13
-rw-r--r--arch/x86/mm/kasan_init_64.c143
-rw-r--r--arch/x86/mm/kmemcheck/Makefile1
-rw-r--r--arch/x86/mm/kmemcheck/error.c227
-rw-r--r--arch/x86/mm/kmemcheck/error.h15
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c658
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c106
-rw-r--r--arch/x86/mm/kmemcheck/opcode.h9
-rw-r--r--arch/x86/mm/kmemcheck/pte.c22
-rw-r--r--arch/x86/mm/kmemcheck/pte.h10
-rw-r--r--arch/x86/mm/kmemcheck/selftest.c70
-rw-r--r--arch/x86/mm/kmemcheck/selftest.h6
-rw-r--r--arch/x86/mm/kmemcheck/shadow.c173
-rw-r--r--arch/x86/mm/kmemcheck/shadow.h18
-rw-r--r--arch/x86/mm/mmap.c62
-rw-r--r--arch/x86/mm/pageattr.c10
-rw-r--r--arch/x86/mm/pgtable.c2
-rw-r--r--arch/x86/oprofile/nmi_int.c2
-rw-r--r--arch/x86/pci/fixup.c85
-rw-r--r--arch/x86/pci/intel_mid_pci.c2
-rw-r--r--arch/x86/platform/efi/efi_64.c2
-rw-r--r--arch/x86/xen/grant-table.c60
-rw-r--r--arch/x86/xen/mmu.c14
-rw-r--r--arch/x86/xen/mmu_pv.c4
-rw-r--r--arch/x86/xen/suspend.c4
-rw-r--r--arch/x86/xen/time.c99
-rw-r--r--arch/x86/xen/xen-ops.h2
99 files changed, 1667 insertions, 2352 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f08977d82ca0..8eed3f94bfc7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -110,9 +110,8 @@ config X86
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE
select HAVE_ARCH_JUMP_LABEL
- select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
+ select HAVE_ARCH_KASAN if X86_64
select HAVE_ARCH_KGDB
- select HAVE_ARCH_KMEMCHECK
select HAVE_ARCH_MMAP_RND_BITS if MMU
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT
select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT
@@ -1430,7 +1429,7 @@ config ARCH_DMA_ADDR_T_64BIT
config X86_DIRECT_GBPAGES
def_bool y
- depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
+ depends on X86_64 && !DEBUG_PAGEALLOC
---help---
Certain kernel features effectively disable kernel
linear 1 GB mappings (even if the CPU otherwise
@@ -1805,14 +1804,20 @@ config X86_SMAP
If unsure, say Y.
config X86_INTEL_UMIP
- def_bool n
+ def_bool y
depends on CPU_SUP_INTEL
prompt "Intel User Mode Instruction Prevention" if EXPERT
---help---
The User Mode Instruction Prevention (UMIP) is a security
feature in newer Intel processors. If enabled, a general
- protection fault is issued if the instructions SGDT, SLDT,
- SIDT, SMSW and STR are executed in user mode.
+ protection fault is issued if the SGDT, SLDT, SIDT, SMSW
+ or STR instructions are executed in user mode. These instructions
+ unnecessarily expose information about the hardware state.
+
+ The vast majority of applications do not use these instructions.
+ For the very few that do, software emulation is provided in
+ specific cases in protected and virtual-8086 modes. Emulated
+ results are dummy.
config X86_INTEL_MPX
prompt "Intel MPX (Memory Protection Extensions)"
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index a20eacd9c7e9..3e73bc255e4e 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -158,11 +158,6 @@ ifdef CONFIG_X86_X32
endif
export CONFIG_X86_X32_ABI
-# Don't unroll struct assignments with kmemcheck enabled
-ifeq ($(CONFIG_KMEMCHECK),y)
- KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
-endif
-
#
# If the function graph tracer is used with mcount instead of fentry,
# '-maccumulate-outgoing-args' is needed to prevent a GCC bug
diff --git a/arch/x86/boot/compressed/kaslr.c b/arch/x86/boot/compressed/kaslr.c
index a63fbc25ce84..8199a6187251 100644
--- a/arch/x86/boot/compressed/kaslr.c
+++ b/arch/x86/boot/compressed/kaslr.c
@@ -171,7 +171,6 @@ parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
static void mem_avoid_memmap(char *str)
{
static int i;
- int rc;
if (i >= MAX_MEMMAP_REGIONS)
return;
@@ -219,7 +218,7 @@ static int handle_mem_memmap(void)
return 0;
tmp_cmdline = malloc(len + 1);
- if (!tmp_cmdline )
+ if (!tmp_cmdline)
error("Failed to allocate space for tmp_cmdline");
memcpy(tmp_cmdline, args, len);
@@ -363,7 +362,7 @@ static void mem_avoid_init(unsigned long input, unsigned long input_size,
cmd_line |= boot_params->hdr.cmd_line_ptr;
/* Calculate size of cmd_line. */
ptr = (char *)(unsigned long)cmd_line;
- for (cmd_line_size = 0; ptr[cmd_line_size++]; )
+ for (cmd_line_size = 0; ptr[cmd_line_size++];)
;
mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 5c15d6b57329..3bf3dcf29825 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -28,6 +28,7 @@
#include <crypto/cryptd.h>
#include <crypto/ctr.h>
#include <crypto/b128ops.h>
+#include <crypto/gcm.h>
#include <crypto/xts.h>
#include <asm/cpu_device_id.h>
#include <asm/fpu/api.h>
@@ -1067,9 +1068,10 @@ static struct skcipher_alg aesni_skciphers[] = {
}
};
+static
struct simd_skcipher_alg *aesni_simd_skciphers[ARRAY_SIZE(aesni_skciphers)];
-struct {
+static struct {
const char *algname;
const char *drvname;
const char *basename;
@@ -1131,7 +1133,7 @@ static struct aead_alg aesni_aead_algs[] = { {
.setauthsize = common_rfc4106_set_authsize,
.encrypt = helper_rfc4106_encrypt,
.decrypt = helper_rfc4106_decrypt,
- .ivsize = 8,
+ .ivsize = GCM_RFC4106_IV_SIZE,
.maxauthsize = 16,
.base = {
.cra_name = "__gcm-aes-aesni",
@@ -1149,7 +1151,7 @@ static struct aead_alg aesni_aead_algs[] = { {
.setauthsize = rfc4106_set_authsize,
.encrypt = rfc4106_encrypt,
.decrypt = rfc4106_decrypt,
- .ivsize = 8,
+ .ivsize = GCM_RFC4106_IV_SIZE,
.maxauthsize = 16,
.base = {
.cra_name = "rfc4106(gcm(aes))",
@@ -1165,7 +1167,7 @@ static struct aead_alg aesni_aead_algs[] = { {
.setauthsize = generic_gcmaes_set_authsize,
.encrypt = generic_gcmaes_encrypt,
.decrypt = generic_gcmaes_decrypt,
- .ivsize = 12,
+ .ivsize = GCM_AES_IV_SIZE,
.maxauthsize = 16,
.base = {
.cra_name = "gcm(aes)",
diff --git a/arch/x86/crypto/crc32-pclmul_asm.S b/arch/x86/crypto/crc32-pclmul_asm.S
index f247304299a2..1c099dc08cc3 100644
--- a/arch/x86/crypto/crc32-pclmul_asm.S
+++ b/arch/x86/crypto/crc32-pclmul_asm.S
@@ -41,6 +41,7 @@
#include <asm/inst.h>
+.section .rodata
.align 16
/*
* [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
@@ -111,19 +112,13 @@ ENTRY(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
pxor CONSTANT, %xmm1
sub $0x40, LEN
add $0x40, BUF
-#ifndef __x86_64__
- /* This is for position independent code(-fPIC) support for 32bit */
- call delta
-delta:
- pop %ecx
-#endif
cmp $0x40, LEN
jb less_64
#ifdef __x86_64__
movdqa .Lconstant_R2R1(%rip), CONSTANT
#else
- movdqa .Lconstant_R2R1 - delta(%ecx), CONSTANT
+ movdqa .Lconstant_R2R1, CONSTANT
#endif
loop_64:/* 64 bytes Full cache line folding */
@@ -172,7 +167,7 @@ less_64:/* Folding cache line into 128bit */
#ifdef __x86_64__
movdqa .Lconstant_R4R3(%rip), CONSTANT
#else
- movdqa .Lconstant_R4R3 - delta(%ecx), CONSTANT
+ movdqa .Lconstant_R4R3, CONSTANT
#endif
prefetchnta (BUF)
@@ -220,8 +215,8 @@ fold_64:
movdqa .Lconstant_R5(%rip), CONSTANT
movdqa .Lconstant_mask32(%rip), %xmm3
#else
- movdqa .Lconstant_R5 - delta(%ecx), CONSTANT
- movdqa .Lconstant_mask32 - delta(%ecx), %xmm3
+ movdqa .Lconstant_R5, CONSTANT
+ movdqa .Lconstant_mask32, %xmm3
#endif
psrldq $0x04, %xmm2
pand %xmm3, %xmm1
@@ -232,7 +227,7 @@ fold_64:
#ifdef __x86_64__
movdqa .Lconstant_RUpoly(%rip), CONSTANT
#else
- movdqa .Lconstant_RUpoly - delta(%ecx), CONSTANT
+ movdqa .Lconstant_RUpoly, CONSTANT
#endif
movdqa %xmm1, %xmm2
pand %xmm3, %xmm1
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a2b30ec69497..f81d50d7ceac 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -51,15 +51,19 @@ ENTRY(native_usergs_sysret64)
END(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */
-.macro TRACE_IRQS_IRETQ
+.macro TRACE_IRQS_FLAGS flags:req
#ifdef CONFIG_TRACE_IRQFLAGS
- bt $9, EFLAGS(%rsp) /* interrupts off? */
+ bt $9, \flags /* interrupts off? */
jnc 1f
TRACE_IRQS_ON
1:
#endif
.endm
+.macro TRACE_IRQS_IRETQ
+ TRACE_IRQS_FLAGS EFLAGS(%rsp)
+.endm
+
/*
* When dynamic function tracer is enabled it will add a breakpoint
* to all locations that it is about to modify, sync CPUs, update
@@ -148,8 +152,6 @@ ENTRY(entry_SYSCALL_64)
movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- TRACE_IRQS_OFF
-
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
@@ -170,6 +172,8 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
UNWIND_HINT_REGS extra=0
+ TRACE_IRQS_OFF
+
/*
* If we need to do entry work or if we guess we'll need to do
* exit work, go straight to the slow path.
@@ -943,11 +947,13 @@ ENTRY(native_load_gs_index)
FRAME_BEGIN
pushfq
DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
+ TRACE_IRQS_OFF
SWAPGS
.Lgs_change:
movl %edi, %gs
2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
SWAPGS
+ TRACE_IRQS_FLAGS (%rsp)
popfq
FRAME_END
ret
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index c366c0adeb40..1943aebadede 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -130,10 +130,6 @@ $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1
-# This makes sure the $(obj) subdirectory exists even though vdso32/
-# is not a kbuild sub-make subdirectory.
-override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
-
targets += vdso32/vdso32.lds
targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o
targets += vdso32/vclock_gettime.o
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index d63053142b16..5b8b556dbb12 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -112,7 +112,7 @@ static int vvar_fault(const struct vm_special_mapping *sm,
__pa_symbol(&__vvar_page) >> PAGE_SHIFT);
} else if (sym_offset == image->sym_pvclock_page) {
struct pvclock_vsyscall_time_info *pvti =
- pvclock_pvti_cpu0_va();
+ pvclock_get_pvti_cpu0_va();
if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
ret = vm_insert_pfn_prot(
vma,
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index d45e06346f14..7874c980d569 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -975,10 +975,10 @@ static void uncore_pci_remove(struct pci_dev *pdev)
int i, phys_id, pkg;
phys_id = uncore_pcibus_to_physid(pdev->bus);
- pkg = topology_phys_to_logical_pkg(phys_id);
box = pci_get_drvdata(pdev);
if (!box) {
+ pkg = topology_phys_to_logical_pkg(phys_id);
for (i = 0; i < UNCORE_EXTRA_PCI_DEV_MAX; i++) {
if (uncore_extra_pci_dev[pkg].dev[i] == pdev) {
uncore_extra_pci_dev[pkg].dev[i] = NULL;
@@ -994,7 +994,7 @@ static void uncore_pci_remove(struct pci_dev *pdev)
return;
pci_set_drvdata(pdev, NULL);
- pmu->boxes[pkg] = NULL;
+ pmu->boxes[box->pkgid] = NULL;
if (atomic_dec_return(&pmu->activeboxes) == 0)
uncore_pmu_unregister(pmu);
uncore_box_exit(box);
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index 4364191e7c6b..414dc7e7c950 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -100,7 +100,7 @@ struct intel_uncore_extra_reg {
struct intel_uncore_box {
int pci_phys_id;
- int pkgid;
+ int pkgid; /* Logical package ID */
int n_active; /* number of active events */
int n_events;
int cpu; /* cpu to collect events */
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 95cb19f4e06f..6d8044ab1060 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -1057,7 +1057,7 @@ static void snbep_qpi_enable_event(struct intel_uncore_box *box, struct perf_eve
if (reg1->idx != EXTRA_REG_NONE) {
int idx = box->pmu->pmu_idx + SNBEP_PCI_QPI_PORT0_FILTER;
- int pkg = topology_phys_to_logical_pkg(box->pci_phys_id);
+ int pkg = box->pkgid;
struct pci_dev *filter_pdev = uncore_extra_pci_dev[pkg].dev[idx];
if (filter_pdev) {
@@ -3035,11 +3035,19 @@ static struct intel_uncore_type *bdx_msr_uncores[] = {
NULL,
};
+/* Bit 7 'Use Occupancy' is not available for counter 0 on BDX */
+static struct event_constraint bdx_uncore_pcu_constraints[] = {
+ EVENT_CONSTRAINT(0x80, 0xe, 0x80),
+ EVENT_CONSTRAINT_END
+};
+
void bdx_uncore_cpu_init(void)
{
if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
uncore_msr_uncores = bdx_msr_uncores;
+
+ hswep_uncore_pcu.constraints = bdx_uncore_pcu_constraints;
}
static struct intel_uncore_type bdx_uncore_ha = {
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index a0b86cf486e0..189a398290db 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -210,9 +210,10 @@ void hyperv_cleanup(void)
}
EXPORT_SYMBOL_GPL(hyperv_cleanup);
-void hyperv_report_panic(struct pt_regs *regs)
+void hyperv_report_panic(struct pt_regs *regs, long err)
{
static bool panic_reported;
+ u64 guest_id;
/*
* We prefer to report panic on 'die' chain as we have proper
@@ -223,11 +224,13 @@ void hyperv_report_panic(struct pt_regs *regs)
return;
panic_reported = true;
- wrmsrl(HV_X64_MSR_CRASH_P0, regs->ip);
- wrmsrl(HV_X64_MSR_CRASH_P1, regs->ax);
- wrmsrl(HV_X64_MSR_CRASH_P2, regs->bx);
- wrmsrl(HV_X64_MSR_CRASH_P3, regs->cx);
- wrmsrl(HV_X64_MSR_CRASH_P4, regs->dx);
+ rdmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id);
+
+ wrmsrl(HV_X64_MSR_CRASH_P0, err);
+ wrmsrl(HV_X64_MSR_CRASH_P1, guest_id);
+ wrmsrl(HV_X64_MSR_CRASH_P2, regs->ip);
+ wrmsrl(HV_X64_MSR_CRASH_P3, regs->ax);
+ wrmsrl(HV_X64_MSR_CRASH_P4, regs->sp);
/*
* Let Hyper-V know there is crash data available
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index a600a6cda9ec..2cbd75dd2fd3 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -210,7 +210,6 @@ typedef struct compat_siginfo {
} compat_siginfo_t;
#define COMPAT_OFF_T_MAX 0x7fffffff
-#define COMPAT_LOFF_T_MAX 0x7fffffffffffffffL
struct compat_ipc64_perm {
compat_key_t key;
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 836ca1178a6a..0350d99bb8fd 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -7,7 +7,6 @@
* Documentation/DMA-API.txt for documentation.
*/
-#include <linux/kmemcheck.h>
#include <linux/scatterlist.h>
#include <linux/dma-debug.h>
#include <asm/io.h>
@@ -68,13 +67,6 @@ static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr)
}
#endif /* CONFIG_X86_DMA_REMAP */
-static inline void
-dma_cache_sync(struct device *dev, void *vaddr, size_t size,
- enum dma_data_direction dir)
-{
- flush_write_buffers();
-}
-
static inline unsigned long dma_alloc_coherent_mask(struct device *dev,
gfp_t gfp)
{
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 3a091cea36c5..0d157d2a1e2a 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -309,6 +309,7 @@ static inline int mmap_is_ia32(void)
extern unsigned long task_size_32bit(void);
extern unsigned long task_size_64bit(int full_addr_space);
extern unsigned long get_mmap_base(int is_legacy);
+extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len);
#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index dcd9fb55e679..b0c505fe9a95 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -104,6 +104,12 @@ enum fixed_addresses {
FIX_GDT_REMAP_BEGIN,
FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
+#ifdef CONFIG_ACPI_APEI_GHES
+ /* Used for GHES mapping from assorted contexts */
+ FIX_APEI_GHES_IRQ,
+ FIX_APEI_GHES_NMI,
+#endif
+
__end_of_permanent_fixed_addresses,
/*
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b80e46733909..2851077b6051 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -99,14 +99,6 @@ struct irq_alloc_info {
void *dmar_data;
};
#endif
-#ifdef CONFIG_HT_IRQ
- struct {
- int ht_pos;
- int ht_idx;
- struct pci_dev *ht_dev;
- void *ht_update;
- };
-#endif
#ifdef CONFIG_X86_UV
struct {
int uv_limit;
diff --git a/arch/x86/include/asm/hypertransport.h b/arch/x86/include/asm/hypertransport.h
deleted file mode 100644
index 5d55df352879..000000000000
--- a/arch/x86/include/asm/hypertransport.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _ASM_X86_HYPERTRANSPORT_H
-#define _ASM_X86_HYPERTRANSPORT_H
-
-/*
- * Constants for x86 Hypertransport Interrupts.
- */
-
-#define HT_IRQ_LOW_BASE 0xf8000000
-
-#define HT_IRQ_LOW_VECTOR_SHIFT 16
-#define HT_IRQ_LOW_VECTOR_MASK 0x00ff0000
-#define HT_IRQ_LOW_VECTOR(v) \
- (((v) << HT_IRQ_LOW_VECTOR_SHIFT) & HT_IRQ_LOW_VECTOR_MASK)
-
-#define HT_IRQ_LOW_DEST_ID_SHIFT 8
-#define HT_IRQ_LOW_DEST_ID_MASK 0x0000ff00
-#define HT_IRQ_LOW_DEST_ID(v) \
- (((v) << HT_IRQ_LOW_DEST_ID_SHIFT) & HT_IRQ_LOW_DEST_ID_MASK)
-
-#define HT_IRQ_LOW_DM_PHYSICAL 0x0000000
-#define HT_IRQ_LOW_DM_LOGICAL 0x0000040
-
-#define HT_IRQ_LOW_RQEOI_EDGE 0x0000000
-#define HT_IRQ_LOW_RQEOI_LEVEL 0x0000020
-
-
-#define HT_IRQ_LOW_MT_FIXED 0x0000000
-#define HT_IRQ_LOW_MT_ARBITRATED 0x0000004
-#define HT_IRQ_LOW_MT_SMI 0x0000008
-#define HT_IRQ_LOW_MT_NMI 0x000000c
-#define HT_IRQ_LOW_MT_INIT 0x0000010
-#define HT_IRQ_LOW_MT_STARTUP 0x0000014
-#define HT_IRQ_LOW_MT_EXTINT 0x0000018
-#define HT_IRQ_LOW_MT_LINT1 0x000008c
-#define HT_IRQ_LOW_MT_LINT0 0x0000098
-
-#define HT_IRQ_LOW_IRQ_MASKED 0x0000001
-
-
-#define HT_IRQ_HIGH_DEST_ID_SHIFT 0
-#define HT_IRQ_HIGH_DEST_ID_MASK 0x00ffffff
-#define HT_IRQ_HIGH_DEST_ID(v) \
- ((((v) >> 8) << HT_IRQ_HIGH_DEST_ID_SHIFT) & HT_IRQ_HIGH_DEST_ID_MASK)
-
-#endif /* _ASM_X86_HYPERTRANSPORT_H */
diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
index e1d3b4ce8a92..2b6ccf2c49f1 100644
--- a/arch/x86/include/asm/insn-eval.h
+++ b/arch/x86/include/asm/insn-eval.h
@@ -18,6 +18,6 @@
void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs);
int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs);
unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx);
-char insn_get_code_seg_params(struct pt_regs *regs);
+int insn_get_code_seg_params(struct pt_regs *regs);
#endif /* _ASM_X86_INSN_EVAL_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 93ae8aee1780..95e948627fd0 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -111,6 +111,10 @@ build_mmio_write(__writeq, "q", unsigned long, "r", )
#endif
+#define ARCH_HAS_VALID_PHYS_ADDR_RANGE
+extern int valid_phys_addr_range(phys_addr_t addr, size_t size);
+extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size);
+
/**
* virt_to_phys - map virtual addresses to physical
* @address: address to remap
diff --git a/arch/x86/include/asm/irqdomain.h b/arch/x86/include/asm/irqdomain.h
index f695cc6b8e1f..139feef467f7 100644
--- a/arch/x86/include/asm/irqdomain.h
+++ b/arch/x86/include/asm/irqdomain.h
@@ -56,10 +56,4 @@ extern void arch_init_msi_domain(struct irq_domain *domain);
static inline void arch_init_msi_domain(struct irq_domain *domain) { }
#endif
-#ifdef CONFIG_HT_IRQ
-extern void arch_init_htirq_domain(struct irq_domain *domain);
-#else
-static inline void arch_init_htirq_domain(struct irq_domain *domain) { }
-#endif
-
#endif
diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h
index 945a0337fbcf..ea32a7d3cf1b 100644
--- a/arch/x86/include/asm/kmemcheck.h
+++ b/arch/x86/include/asm/kmemcheck.h
@@ -1,43 +1 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ASM_X86_KMEMCHECK_H
-#define ASM_X86_KMEMCHECK_H
-
-#include <linux/types.h>
-#include <asm/ptrace.h>
-
-#ifdef CONFIG_KMEMCHECK
-bool kmemcheck_active(struct pt_regs *regs);
-
-void kmemcheck_show(struct pt_regs *regs);
-void kmemcheck_hide(struct pt_regs *regs);
-
-bool kmemcheck_fault(struct pt_regs *regs,
- unsigned long address, unsigned long error_code);
-bool kmemcheck_trap(struct pt_regs *regs);
-#else
-static inline bool kmemcheck_active(struct pt_regs *regs)
-{
- return false;
-}
-
-static inline void kmemcheck_show(struct pt_regs *regs)
-{
-}
-
-static inline void kmemcheck_hide(struct pt_regs *regs)
-{
-}
-
-static inline bool kmemcheck_fault(struct pt_regs *regs,
- unsigned long address, unsigned long error_code)
-{
- return false;
-}
-
-static inline bool kmemcheck_trap(struct pt_regs *regs)
-{
- return false;
-}
-#endif /* CONFIG_KMEMCHECK */
-
-#endif
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index ee23a43386a2..034caa1a084e 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -226,6 +226,8 @@ struct x86_emulate_ops {
unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt);
void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags);
+ int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase);
+
};
typedef u32 __attribute__((vector_size(16))) sse128_t;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9d7d856b2d89..977de5fb968b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1061,6 +1061,11 @@ struct kvm_x86_ops {
void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
void (*setup_mce)(struct kvm_vcpu *vcpu);
+
+ int (*smi_allowed)(struct kvm_vcpu *vcpu);
+ int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
+ int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
+ int (*enable_smi_window)(struct kvm_vcpu *vcpu);
};
struct kvm_arch_async_pf {
@@ -1156,7 +1161,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
static inline int emulate_instruction(struct kvm_vcpu *vcpu,
int emulation_type)
{
- return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
+ return x86_emulate_instruction(vcpu, 0,
+ emulation_type | EMULTYPE_NO_REEXECUTE, NULL, 0);
}
void kvm_enable_efer_bits(u64);
@@ -1426,4 +1432,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
#endif
}
+#define put_smstate(type, buf, offset, val) \
+ *(type *)((buf) + (offset) - 0x7e00) = val
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h
index 581bb54dd464..5400add2885b 100644
--- a/arch/x86/include/asm/mshyperv.h
+++ b/arch/x86/include/asm/mshyperv.h
@@ -311,7 +311,7 @@ static inline int hv_cpu_number_to_vp_number(int cpu_number)
void hyperv_init(void);
void hyperv_setup_mmu_ops(void);
void hyper_alloc_mmu(void);
-void hyperv_report_panic(struct pt_regs *regs);
+void hyperv_report_panic(struct pt_regs *regs, long err);
bool hv_is_hypercall_page_setup(void);
void hyperv_cleanup(void);
#else /* CONFIG_HYPERV */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 09c06b0fb964..d32175e30259 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -89,10 +89,8 @@ extern unsigned long pci_mem_start;
#define PCIBIOS_MIN_CARDBUS_IO 0x4000
extern int pcibios_enabled;
-void pcibios_config_init(void);
void pcibios_scan_root(int bus);
-void pcibios_set_master(struct pci_dev *dev);
struct irq_routing_table *pcibios_get_irq_routing_table(void);
int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index f735c3016325..95e2dfd75521 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -667,11 +667,6 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
return false;
}
-static inline int pte_hidden(pte_t pte)
-{
- return pte_flags(pte) & _PAGE_HIDDEN;
-}
-
static inline int pmd_present(pmd_t pmd)
{
/*
@@ -1066,7 +1061,7 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmdp);
-#define __HAVE_ARCH_PMD_WRITE
+#define pmd_write pmd_write
static inline int pmd_write(pmd_t pmd)
{
return pmd_flags(pmd) & _PAGE_RW;
@@ -1093,6 +1088,12 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
}
+#define pud_write pud_write
+static inline int pud_write(pud_t pud)
+{
+ return pud_flags(pud) & _PAGE_RW;
+}
+
/*
* clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
*
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 9e9b05fc4860..3696398a9475 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -32,7 +32,6 @@
#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
-#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4
@@ -79,18 +78,6 @@
#define _PAGE_KNL_ERRATUM_MASK 0
#endif
-#ifdef CONFIG_KMEMCHECK
-#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN)
-#else
-#define _PAGE_HIDDEN (_AT(pteval_t, 0))
-#endif
-
-/*
- * The same hidden bit is used by kmemcheck, but since kmemcheck
- * works on kernel pages while soft-dirty engine on user space,
- * they do not conflict with each other.
- */
-
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY)
#else
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 2db7cf720b04..cc16fa882e3e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -132,6 +132,7 @@ struct cpuinfo_x86 {
/* Index into per_cpu list: */
u16 cpu_index;
u32 microcode;
+ unsigned initialized : 1;
} __randomize_layout;
struct cpuid_regs {
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 3e4ed8fb5f91..a7471dcd2205 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -5,15 +5,6 @@
#include <linux/clocksource.h>
#include <asm/pvclock-abi.h>
-#ifdef CONFIG_KVM_GUEST
-extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void);
-#else
-static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
-{
- return NULL;
-}
-#endif
-
/* some helper functions for xen and kvm pv clock sources */
u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src);
@@ -102,4 +93,14 @@ struct pvclock_vsyscall_time_info {
#define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
+#ifdef CONFIG_PARAVIRT_CLOCK
+void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti);
+struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void);
+#else
+static inline struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void)
+{
+ return NULL;
+}
+#endif
+
#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index 076502241eae..55d392c6bd29 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -179,8 +179,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
* No 3D Now!
*/
-#ifndef CONFIG_KMEMCHECK
-
#if (__GNUC__ >= 4)
#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
#else
@@ -189,13 +187,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
? __constant_memcpy((t), (f), (n)) \
: __memcpy((t), (f), (n)))
#endif
-#else
-/*
- * kmemcheck becomes very happy if we use the REP instructions unconditionally,
- * because it means that we know both memory operands in advance.
- */
-#define memcpy(t, f, n) __memcpy((t), (f), (n))
-#endif
#endif
#endif /* !CONFIG_FORTIFY_SOURCE */
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 0b1b4445f4c5..533f74c300c2 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -33,7 +33,6 @@ extern void *memcpy(void *to, const void *from, size_t len);
extern void *__memcpy(void *to, const void *from, size_t len);
#ifndef CONFIG_FORTIFY_SOURCE
-#ifndef CONFIG_KMEMCHECK
#if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4
#define memcpy(dst, src, len) \
({ \
@@ -46,13 +45,6 @@ extern void *__memcpy(void *to, const void *from, size_t len);
__ret; \
})
#endif
-#else
-/*
- * kmemcheck becomes very happy if we use the REP instructions unconditionally,
- * because it means that we know both memory operands in advance.
- */
-#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len))
-#endif
#endif /* !CONFIG_FORTIFY_SOURCE */
#define __HAVE_ARCH_MEMSET
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index caec8417539f..8b6780751132 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -70,11 +70,11 @@
#define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100
#define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200
#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
-#define SECONDARY_EXEC_RDRAND 0x00000800
+#define SECONDARY_EXEC_RDRAND_EXITING 0x00000800
#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
#define SECONDARY_EXEC_ENABLE_VMFUNC 0x00002000
#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
-#define SECONDARY_EXEC_RDSEED 0x00010000
+#define SECONDARY_EXEC_RDSEED_EXITING 0x00010000
#define SECONDARY_EXEC_ENABLE_PML 0x00020000
#define SECONDARY_EXEC_XSAVES 0x00100000
#define SECONDARY_EXEC_TSC_SCALING 0x02000000
diff --git a/arch/x86/include/asm/xen/cpuid.h b/arch/x86/include/asm/xen/cpuid.h
index 3bdd10d71223..a9630104f1c4 100644
--- a/arch/x86/include/asm/xen/cpuid.h
+++ b/arch/x86/include/asm/xen/cpuid.h
@@ -74,21 +74,43 @@
#define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD (1u<<0)
/*
+ * Leaf 4 (0x40000x03)
+ * Sub-leaf 0: EAX: bit 0: emulated tsc
+ * bit 1: host tsc is known to be reliable
+ * bit 2: RDTSCP instruction available
+ * EBX: tsc_mode: 0=default (emulate if necessary), 1=emulate,
+ * 2=no emulation, 3=no emulation + TSC_AUX support
+ * ECX: guest tsc frequency in kHz
+ * EDX: guest tsc incarnation (migration count)
+ * Sub-leaf 1: EAX: tsc offset low part
+ * EBX: tsc offset high part
+ * ECX: multiplicator for tsc->ns conversion
+ * EDX: shift amount for tsc->ns conversion
+ * Sub-leaf 2: EAX: host tsc frequency in kHz
+ */
+
+/*
* Leaf 5 (0x40000x04)
* HVM-specific features
- * EAX: Features
- * EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag)
+ * Sub-leaf 0: EAX: Features
+ * Sub-leaf 0: EBX: vcpu id (iff EAX has XEN_HVM_CPUID_VCPU_ID_PRESENT flag)
*/
-
-/* Virtualized APIC registers */
-#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0)
-/* Virtualized x2APIC accesses */
-#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1)
+#define XEN_HVM_CPUID_APIC_ACCESS_VIRT (1u << 0) /* Virtualized APIC registers */
+#define XEN_HVM_CPUID_X2APIC_VIRT (1u << 1) /* Virtualized x2APIC accesses */
/* Memory mapped from other domains has valid IOMMU entries */
#define XEN_HVM_CPUID_IOMMU_MAPPINGS (1u << 2)
-/* vcpu id is present in EBX */
-#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3)
+#define XEN_HVM_CPUID_VCPU_ID_PRESENT (1u << 3) /* vcpu id is present in EBX */
+
+/*
+ * Leaf 6 (0x40000x05)
+ * PV-specific parameters
+ * Sub-leaf 0: EAX: max available sub-leaf
+ * Sub-leaf 0: EBX: bits 0-7: max machine address width
+ */
+
+/* Max. address width in bits taking memory hotplug into account. */
+#define XEN_CPUID_MACHINE_ADDRESS_WIDTH_MASK (0xffu << 0)
-#define XEN_CPUID_MAX_NUM_LEAVES 4
+#define XEN_CPUID_MAX_NUM_LEAVES 5
#endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c6b84245e5ab..123e669bf363 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -27,6 +27,15 @@ typedef struct xpaddr {
phys_addr_t paddr;
} xpaddr_t;
+#ifdef CONFIG_X86_64
+#define XEN_PHYSICAL_MASK __sme_clr((1UL << 52) - 1)
+#else
+#define XEN_PHYSICAL_MASK __PHYSICAL_MASK
+#endif
+
+#define XEN_PTE_MFN_MASK ((pteval_t)(((signed long)PAGE_MASK) & \
+ XEN_PHYSICAL_MASK))
+
#define XMADDR(x) ((xmaddr_t) { .maddr = (x) })
#define XPADDR(x) ((xpaddr_t) { .paddr = (x) })
@@ -278,7 +287,7 @@ static inline unsigned long bfn_to_local_pfn(unsigned long mfn)
static inline unsigned long pte_mfn(pte_t pte)
{
- return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
+ return (pte.pte & XEN_PTE_MFN_MASK) >> PAGE_SHIFT;
}
static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index 1f5c5161ead6..45c8605467f1 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -1,7 +1,4 @@
-#ifdef CONFIG_KMEMCHECK
-/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */
-# include <asm-generic/xor.h>
-#elif !defined(_ASM_X86_XOR_H)
+#ifndef _ASM_X86_XOR_H
#define _ASM_X86_XOR_H
/*
diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c
index ea3046e0b0cf..bb8d300fecbd 100644
--- a/arch/x86/kernel/acpi/apei.c
+++ b/arch/x86/kernel/acpi/apei.c
@@ -52,8 +52,3 @@ void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
apei_mce_report_mem_error(sev, mem_err);
#endif
}
-
-void arch_apei_flush_tlb_one(unsigned long addr)
-{
- __flush_tlb_one(addr);
-}
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index ef9e02e614d0..f4c463df8b08 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -342,13 +342,12 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e
#ifdef CONFIG_X86_IO_APIC
#define MP_ISA_BUS 0
+static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity,
+ u8 trigger, u32 gsi);
+
static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
u32 gsi)
{
- int ioapic;
- int pin;
- struct mpc_intsrc mp_irq;
-
/*
* Check bus_irq boundary.
*/
@@ -358,14 +357,6 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
}
/*
- * Convert 'gsi' to 'ioapic.pin'.
- */
- ioapic = mp_find_ioapic(gsi);
- if (ioapic < 0)
- return;
- pin = mp_find_ioapic_pin(ioapic, gsi);
-
- /*
* TBD: This check is for faulty timer entries, where the override
* erroneously sets the trigger to level, resulting in a HUGE
* increase of timer interrupts!
@@ -373,16 +364,8 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
if ((bus_irq == 0) && (trigger == 3))
trigger = 1;
- mp_irq.type = MP_INTSRC;
- mp_irq.irqtype = mp_INT;
- mp_irq.irqflag = (trigger << 2) | polarity;
- mp_irq.srcbus = MP_ISA_BUS;
- mp_irq.srcbusirq = bus_irq; /* IRQ */
- mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
- mp_irq.dstirq = pin; /* INTIN# */
-
- mp_save_irq(&mp_irq);
-
+ if (mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi) < 0)
+ return;
/*
* Reset default identity mapping if gsi is also an legacy IRQ,
* otherwise there will be more than one entry with the same GSI
@@ -429,6 +412,34 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
return 0;
}
+static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity,
+ u8 trigger, u32 gsi)
+{
+ struct mpc_intsrc mp_irq;
+ int ioapic, pin;
+
+ /* Convert 'gsi' to 'ioapic.pin'(INTIN#) */
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0) {
+ pr_warn("Failed to find ioapic for gsi : %u\n", gsi);
+ return ioapic;
+ }
+
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+
+ mp_irq.type = MP_INTSRC;
+ mp_irq.irqtype = mp_INT;
+ mp_irq.irqflag = (trigger << 2) | polarity;
+ mp_irq.srcbus = MP_ISA_BUS;
+ mp_irq.srcbusirq = bus_irq;
+ mp_irq.dstapic = mpc_ioapic_id(ioapic);
+ mp_irq.dstirq = pin;
+
+ mp_save_irq(&mp_irq);
+
+ return 0;
+}
+
static int __init
acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
{
@@ -473,7 +484,11 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger,
if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK)
polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK;
- mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
+ if (bus_irq < NR_IRQS_LEGACY)
+ mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
+ else
+ mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi);
+
acpi_penalize_sci_irq(bus_irq, trigger, polarity);
/*
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index a9e08924927e..a6fcaf16cdbf 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -12,7 +12,6 @@ obj-y += hw_nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_PCI_MSI) += msi.o
-obj-$(CONFIG_HT_IRQ) += htirq.o
obj-$(CONFIG_SMP) += ipi.o
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c
deleted file mode 100644
index b07075dce8b7..000000000000
--- a/arch/x86/kernel/apic/htirq.c
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Support Hypertransport IRQ
- *
- * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
- * Moved from arch/x86/kernel/apic/io_apic.c.
- * Jiang Liu <jiang.liu@linux.intel.com>
- * Add support of hierarchical irqdomain
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/device.h>
-#include <linux/pci.h>
-#include <linux/htirq.h>
-#include <asm/irqdomain.h>
-#include <asm/hw_irq.h>
-#include <asm/apic.h>
-#include <asm/hypertransport.h>
-
-static struct irq_domain *htirq_domain;
-
-/*
- * Hypertransport interrupt support
- */
-static int
-ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
-{
- struct irq_data *parent = data->parent_data;
- int ret;
-
- ret = parent->chip->irq_set_affinity(parent, mask, force);
- if (ret >= 0) {
- struct ht_irq_msg msg;
- struct irq_cfg *cfg = irqd_cfg(data);
-
- fetch_ht_irq_msg(data->irq, &msg);
- msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK |
- HT_IRQ_LOW_DEST_ID_MASK);
- msg.address_lo |= HT_IRQ_LOW_VECTOR(cfg->vector) |
- HT_IRQ_LOW_DEST_ID(cfg->dest_apicid);
- msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
- msg.address_hi |= HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid);
- write_ht_irq_msg(data->irq, &msg);
- }
-
- return ret;
-}
-
-static struct irq_chip ht_irq_chip = {
- .name = "PCI-HT",
- .irq_mask = mask_ht_irq,
- .irq_unmask = unmask_ht_irq,
- .irq_ack = irq_chip_ack_parent,
- .irq_set_affinity = ht_set_affinity,
- .irq_retrigger = irq_chip_retrigger_hierarchy,
- .flags = IRQCHIP_SKIP_SET_WAKE,
-};
-
-static int htirq_domain_alloc(struct irq_domain *domain, unsigned int virq,
- unsigned int nr_irqs, void *arg)
-{
- struct ht_irq_cfg *ht_cfg;
- struct irq_alloc_info *info = arg;
- struct pci_dev *dev;
- irq_hw_number_t hwirq;
- int ret;
-
- if (nr_irqs > 1 || !info)
- return -EINVAL;
-
- dev = info->ht_dev;
- hwirq = (info->ht_idx & 0xFF) |
- PCI_DEVID(dev->bus->number, dev->devfn) << 8 |
- (pci_domain_nr(dev->bus) & 0xFFFFFFFF) << 24;
- if (irq_find_mapping(domain, hwirq) > 0)
- return -EEXIST;
-
- ht_cfg = kmalloc(sizeof(*ht_cfg), GFP_KERNEL);
- if (!ht_cfg)
- return -ENOMEM;
-
- ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, info);
- if (ret < 0) {
- kfree(ht_cfg);
- return ret;
- }
-
- /* Initialize msg to a value that will never match the first write. */
- ht_cfg->msg.address_lo = 0xffffffff;
- ht_cfg->msg.address_hi = 0xffffffff;
- ht_cfg->dev = info->ht_dev;
- ht_cfg->update = info->ht_update;
- ht_cfg->pos = info->ht_pos;
- ht_cfg->idx = 0x10 + (info->ht_idx * 2);
- irq_domain_set_info(domain, virq, hwirq, &ht_irq_chip, ht_cfg,
- handle_edge_irq, ht_cfg, "edge");
-
- return 0;
-}
-
-static void htirq_domain_free(struct irq_domain *domain, unsigned int virq,
- unsigned int nr_irqs)
-{
- struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq);
-
- BUG_ON(nr_irqs != 1);
- kfree(irq_data->chip_data);
- irq_domain_free_irqs_top(domain, virq, nr_irqs);
-}
-
-static int htirq_domain_activate(struct irq_domain *domain,
- struct irq_data *irq_data, bool early)
-{
- struct ht_irq_msg msg;
- struct irq_cfg *cfg = irqd_cfg(irq_data);
-
- msg.address_hi = HT_IRQ_HIGH_DEST_ID(cfg->dest_apicid);
- msg.address_lo =
- HT_IRQ_LOW_BASE |
- HT_IRQ_LOW_DEST_ID(cfg->dest_apicid) |
- HT_IRQ_LOW_VECTOR(cfg->vector) |
- ((apic->irq_dest_mode == 0) ?
- HT_IRQ_LOW_DM_PHYSICAL :
- HT_IRQ_LOW_DM_LOGICAL) |
- HT_IRQ_LOW_RQEOI_EDGE |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- HT_IRQ_LOW_MT_FIXED :
- HT_IRQ_LOW_MT_ARBITRATED) |
- HT_IRQ_LOW_IRQ_MASKED;
- write_ht_irq_msg(irq_data->irq, &msg);
- return 0;
-}
-
-static void htirq_domain_deactivate(struct irq_domain *domain,
- struct irq_data *irq_data)
-{
- struct ht_irq_msg msg;
-
- memset(&msg, 0, sizeof(msg));
- write_ht_irq_msg(irq_data->irq, &msg);
-}
-
-static const struct irq_domain_ops htirq_domain_ops = {
- .alloc = htirq_domain_alloc,
- .free = htirq_domain_free,
- .activate = htirq_domain_activate,
- .deactivate = htirq_domain_deactivate,
-};
-
-void __init arch_init_htirq_domain(struct irq_domain *parent)
-{
- struct fwnode_handle *fn;
-
- if (disable_apic)
- return;
-
- fn = irq_domain_alloc_named_fwnode("PCI-HT");
- if (!fn)
- goto warn;
-
- htirq_domain = irq_domain_create_tree(fn, &htirq_domain_ops, NULL);
- irq_domain_free_fwnode(fn);
- if (!htirq_domain)
- goto warn;
-
- htirq_domain->parent = parent;
- return;
-
-warn:
- pr_warn("Failed to initialize irqdomain for HTIRQ.\n");
-}
-
-int arch_setup_ht_irq(int idx, int pos, struct pci_dev *dev,
- ht_irq_update_t *update)
-{
- struct irq_alloc_info info;
-
- if (!htirq_domain)
- return -ENOSYS;
-
- init_irq_alloc_info(&info, NULL);
- info.ht_idx = idx;
- info.ht_pos = pos;
- info.ht_dev = dev;
- info.ht_update = update;
-
- return irq_domain_alloc_irqs(htirq_domain, 1, dev_to_node(&dev->dev),
- &info);
-}
-
-void arch_teardown_ht_irq(unsigned int irq)
-{
- irq_domain_free_irqs(irq, 1);
-}
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 05c85e693a5d..6a823a25eaff 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -1,5 +1,5 @@
/*
- * Local APIC related interfaces to support IOAPIC, MSI, HT_IRQ etc.
+ * Local APIC related interfaces to support IOAPIC, MSI, etc.
*
* Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
* Moved from arch/x86/kernel/apic/io_apic.c.
@@ -601,7 +601,7 @@ int __init arch_probe_nr_irqs(void)
nr_irqs = NR_VECTORS * nr_cpu_ids;
nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids;
-#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
+#if defined(CONFIG_PCI_MSI)
/*
* for MSI and HT dyn irq
*/
@@ -663,7 +663,6 @@ int __init arch_early_irq_init(void)
irq_set_default_host(x86_vector_domain);
arch_init_msi_domain(x86_vector_domain);
- arch_init_htirq_domain(x86_vector_domain);
BUG_ON(!alloc_cpumask_var(&vector_searchmask, GFP_KERNEL));
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 90cb82dbba57..570e8bb1f386 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -22,7 +22,7 @@ obj-y += common.o
obj-y += rdrand.o
obj-y += match.o
obj-y += bugs.o
-obj-$(CONFIG_CPU_FREQ) += aperfmperf.o
+obj-y += aperfmperf.o
obj-y += cpuid-deps.o
obj-$(CONFIG_PROC_FS) += proc.o
diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c
index 957813e0180d..7eba34df54c3 100644
--- a/arch/x86/kernel/cpu/aperfmperf.c
+++ b/arch/x86/kernel/cpu/aperfmperf.c
@@ -14,6 +14,8 @@
#include <linux/percpu.h>
#include <linux/smp.h>
+#include "cpu.h"
+
struct aperfmperf_sample {
unsigned int khz;
ktime_t time;
@@ -24,7 +26,7 @@ struct aperfmperf_sample {
static DEFINE_PER_CPU(struct aperfmperf_sample, samples);
#define APERFMPERF_CACHE_THRESHOLD_MS 10
-#define APERFMPERF_REFRESH_DELAY_MS 20
+#define APERFMPERF_REFRESH_DELAY_MS 10
#define APERFMPERF_STALE_THRESHOLD_MS 1000
/*
@@ -38,8 +40,6 @@ static void aperfmperf_snapshot_khz(void *dummy)
u64 aperf, aperf_delta;
u64 mperf, mperf_delta;
struct aperfmperf_sample *s = this_cpu_ptr(&samples);
- ktime_t now = ktime_get();
- s64 time_delta = ktime_ms_delta(now, s->time);
unsigned long flags;
local_irq_save(flags);
@@ -57,38 +57,68 @@ static void aperfmperf_snapshot_khz(void *dummy)
if (mperf_delta == 0)
return;
- s->time = now;
+ s->time = ktime_get();
s->aperf = aperf;
s->mperf = mperf;
-
- /* If the previous iteration was too long ago, discard it. */
- if (time_delta > APERFMPERF_STALE_THRESHOLD_MS)
- s->khz = 0;
- else
- s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
+ s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta);
}
-unsigned int arch_freq_get_on_cpu(int cpu)
+static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait)
{
- s64 time_delta;
- unsigned int khz;
+ s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu));
+
+ /* Don't bother re-computing within the cache threshold time. */
+ if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
+ return true;
+
+ smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait);
+
+ /* Return false if the previous iteration was too long ago. */
+ return time_delta <= APERFMPERF_STALE_THRESHOLD_MS;
+}
+unsigned int aperfmperf_get_khz(int cpu)
+{
if (!cpu_khz)
return 0;
if (!static_cpu_has(X86_FEATURE_APERFMPERF))
return 0;
- /* Don't bother re-computing within the cache threshold time. */
- time_delta = ktime_ms_delta(ktime_get(), per_cpu(samples.time, cpu));
- khz = per_cpu(samples.khz, cpu);
- if (khz && time_delta < APERFMPERF_CACHE_THRESHOLD_MS)
- return khz;
+ aperfmperf_snapshot_cpu(cpu, ktime_get(), true);
+ return per_cpu(samples.khz, cpu);
+}
- smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
- khz = per_cpu(samples.khz, cpu);
- if (khz)
- return khz;
+void arch_freq_prepare_all(void)
+{
+ ktime_t now = ktime_get();
+ bool wait = false;
+ int cpu;
+
+ if (!cpu_khz)
+ return;
+
+ if (!static_cpu_has(X86_FEATURE_APERFMPERF))
+ return;
+
+ for_each_online_cpu(cpu)
+ if (!aperfmperf_snapshot_cpu(cpu, now, false))
+ wait = true;
+
+ if (wait)
+ msleep(APERFMPERF_REFRESH_DELAY_MS);
+}
+
+unsigned int arch_freq_get_on_cpu(int cpu)
+{
+ if (!cpu_khz)
+ return 0;
+
+ if (!static_cpu_has(X86_FEATURE_APERFMPERF))
+ return 0;
+
+ if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true))
+ return per_cpu(samples.khz, cpu);
msleep(APERFMPERF_REFRESH_DELAY_MS);
smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 13ae9e5eec2f..fa998ca8aa5a 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -341,6 +341,8 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c)
cr4_set_bits(X86_CR4_UMIP);
+ pr_info("x86/cpu: Activated the Intel User Mode Instruction Prevention (UMIP) CPU feature\n");
+
return;
out:
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index f52a370b6c00..e806b11a99af 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -47,4 +47,7 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[],
extern void get_cpu_cap(struct cpuinfo_x86 *c);
extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
+
+unsigned int aperfmperf_get_khz(int cpu);
+
#endif /* ARCH_X86_CPU_H */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b720dacac051..b1af22073e28 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -187,21 +187,6 @@ static void early_init_intel(struct cpuinfo_x86 *c)
if (c->x86 == 6 && c->x86_model < 15)
clear_cpu_cap(c, X86_FEATURE_PAT);
-#ifdef CONFIG_KMEMCHECK
- /*
- * P4s have a "fast strings" feature which causes single-
- * stepping REP instructions to only generate a #DB on
- * cache-line boundaries.
- *
- * Ingo Molnar reported a Pentium D (model 6) and a Xeon
- * (model 2) with the same problem.
- */
- if (c->x86 == 15)
- if (msr_clear_bit(MSR_IA32_MISC_ENABLE,
- MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0)
- pr_info("kmemcheck: Disabling fast string operations\n");
-#endif
-
/*
* If fast string is not enabled in IA32_MISC_ENABLE for any reason,
* clear the fast string and enhanced fast string CPU capabilities.
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 6b7e17bf0b71..e7ecedafa1c8 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -5,6 +5,8 @@
#include <linux/seq_file.h>
#include <linux/cpufreq.h>
+#include "cpu.h"
+
/*
* Get CPU information for use by the procfs.
*/
@@ -78,9 +80,11 @@ static int show_cpuinfo(struct seq_file *m, void *v)
seq_printf(m, "microcode\t: 0x%x\n", c->microcode);
if (cpu_has(c, X86_FEATURE_TSC)) {
- unsigned int freq = cpufreq_quick_get(cpu);
+ unsigned int freq = aperfmperf_get_khz(cpu);
if (!freq)
+ freq = cpufreq_quick_get(cpu);
+ if (!freq)
freq = cpu_khz;
seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
freq / 1000, (freq % 1000));
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 7d7715dde901..e5ec3cafa72e 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -57,7 +57,7 @@
# error "Need more virtual address space for the ESPFIX hack"
#endif
-#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO)
/* This contains the *bottom* address of the espfix stack */
DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack);
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 77b492c2d658..8b26c9e01cc4 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -48,12 +48,6 @@ early_param("no-kvmclock", parse_no_kvmclock);
static struct pvclock_vsyscall_time_info *hv_clock;
static struct pvclock_wall_clock *wall_clock;
-struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
-{
- return hv_clock;
-}
-EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va);
-
/*
* The wallclock is the time of day when we booted. Since then, some time may
* have elapsed since the hypervisor wrote the data. So we try to account for
@@ -377,6 +371,7 @@ int __init kvm_setup_vsyscall_timeinfo(void)
return 1;
}
+ pvclock_set_pvti_cpu0_va(hv_clock);
put_cpu();
kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 410c5dadcee3..3a4b12809ab5 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -431,6 +431,7 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
}
static unsigned long mpf_base;
+static bool mpf_found;
static unsigned long __init get_mpc_size(unsigned long physptr)
{
@@ -504,7 +505,7 @@ void __init default_get_smp_config(unsigned int early)
if (!smp_found_config)
return;
- if (!mpf_base)
+ if (!mpf_found)
return;
if (acpi_lapic && early)
@@ -593,6 +594,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
smp_found_config = 1;
#endif
mpf_base = base;
+ mpf_found = true;
pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n",
base, base + sizeof(*mpf) - 1, mpf);
@@ -858,7 +860,7 @@ static int __init update_mp_table(void)
if (!enable_update_mptable)
return 0;
- if (!mpf_base)
+ if (!mpf_found)
return 0;
mpf = early_memremap(mpf_base, sizeof(*mpf));
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 5c3f6d6a5078..761f6af6efa5 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -25,8 +25,10 @@
#include <asm/fixmap.h>
#include <asm/pvclock.h>
+#include <asm/vgtod.h>
static u8 valid_flags __read_mostly = 0;
+static struct pvclock_vsyscall_time_info *pvti_cpu0_va __read_mostly;
void pvclock_set_flags(u8 flags)
{
@@ -144,3 +146,15 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
+
+void pvclock_set_pvti_cpu0_va(struct pvclock_vsyscall_time_info *pvti)
+{
+ WARN_ON(vclock_was_used(VCLOCK_PVCLOCK));
+ pvti_cpu0_va = pvti;
+}
+
+struct pvclock_vsyscall_time_info *pvclock_get_pvti_cpu0_va(void)
+{
+ return pvti_cpu0_va;
+}
+EXPORT_SYMBOL_GPL(pvclock_get_pvti_cpu0_va);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 5f59e6bee123..3d01df7d7cf6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -101,9 +101,6 @@ DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
/* Logical package management. We might want to allocate that dynamically */
-static int *physical_to_logical_pkg __read_mostly;
-static unsigned long *physical_package_map __read_mostly;;
-static unsigned int max_physical_pkg_id __read_mostly;
unsigned int __max_logical_packages __read_mostly;
EXPORT_SYMBOL(__max_logical_packages);
static unsigned int logical_packages __read_mostly;
@@ -281,108 +278,48 @@ static void notrace start_secondary(void *unused)
}
/**
+ * topology_phys_to_logical_pkg - Map a physical package id to a logical
+ *
+ * Returns logical package id or -1 if not found
+ */
+int topology_phys_to_logical_pkg(unsigned int phys_pkg)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+ if (c->initialized && c->phys_proc_id == phys_pkg)
+ return c->logical_proc_id;
+ }
+ return -1;
+}
+EXPORT_SYMBOL(topology_phys_to_logical_pkg);
+
+/**
* topology_update_package_map - Update the physical to logical package map
* @pkg: The physical package id as retrieved via CPUID
* @cpu: The cpu for which this is updated
*/
int topology_update_package_map(unsigned int pkg, unsigned int cpu)
{
- unsigned int new;
+ int new;
- /* Called from early boot ? */
- if (!physical_package_map)
- return 0;
-
- if (pkg >= max_physical_pkg_id)
- return -EINVAL;
-
- /* Set the logical package id */
- if (test_and_set_bit(pkg, physical_package_map))
+ /* Already available somewhere? */
+ new = topology_phys_to_logical_pkg(pkg);
+ if (new >= 0)
goto found;
- if (logical_packages >= __max_logical_packages) {
- pr_warn("Package %u of CPU %u exceeds BIOS package data %u.\n",
- logical_packages, cpu, __max_logical_packages);
- return -ENOSPC;
- }
-
new = logical_packages++;
if (new != pkg) {
pr_info("CPU %u Converting physical %u to logical package %u\n",
cpu, pkg, new);
}
- physical_to_logical_pkg[pkg] = new;
-
found:
- cpu_data(cpu).logical_proc_id = physical_to_logical_pkg[pkg];
+ cpu_data(cpu).logical_proc_id = new;
return 0;
}
-/**
- * topology_phys_to_logical_pkg - Map a physical package id to a logical
- *
- * Returns logical package id or -1 if not found
- */
-int topology_phys_to_logical_pkg(unsigned int phys_pkg)
-{
- if (phys_pkg >= max_physical_pkg_id)
- return -1;
- return physical_to_logical_pkg[phys_pkg];
-}
-EXPORT_SYMBOL(topology_phys_to_logical_pkg);
-
-static void __init smp_init_package_map(struct cpuinfo_x86 *c, unsigned int cpu)
-{
- unsigned int ncpus;
- size_t size;
-
- /*
- * Today neither Intel nor AMD support heterogenous systems. That
- * might change in the future....
- *
- * While ideally we'd want '* smp_num_siblings' in the below @ncpus
- * computation, this won't actually work since some Intel BIOSes
- * report inconsistent HT data when they disable HT.
- *
- * In particular, they reduce the APIC-IDs to only include the cores,
- * but leave the CPUID topology to say there are (2) siblings.
- * This means we don't know how many threads there will be until
- * after the APIC enumeration.
- *
- * By not including this we'll sometimes over-estimate the number of
- * logical packages by the amount of !present siblings, but this is
- * still better than MAX_LOCAL_APIC.
- *
- * We use total_cpus not nr_cpu_ids because nr_cpu_ids can be limited
- * on the command line leading to a similar issue as the HT disable
- * problem because the hyperthreads are usually enumerated after the
- * primary cores.
- */
- ncpus = boot_cpu_data.x86_max_cores;
- if (!ncpus) {
- pr_warn("x86_max_cores == zero !?!?");
- ncpus = 1;
- }
-
- __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus);
- logical_packages = 0;
-
- /*
- * Possibly larger than what we need as the number of apic ids per
- * package can be smaller than the actual used apic ids.
- */
- max_physical_pkg_id = DIV_ROUND_UP(MAX_LOCAL_APIC, ncpus);
- size = max_physical_pkg_id * sizeof(unsigned int);
- physical_to_logical_pkg = kmalloc(size, GFP_KERNEL);
- memset(physical_to_logical_pkg, 0xff, size);
- size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long);
- physical_package_map = kzalloc(size, GFP_KERNEL);
-
- pr_info("Max logical packages: %u\n", __max_logical_packages);
-
- topology_update_package_map(c->phys_proc_id, cpu);
-}
-
void __init smp_store_boot_cpu_info(void)
{
int id = 0; /* CPU 0 */
@@ -390,7 +327,8 @@ void __init smp_store_boot_cpu_info(void)
*c = boot_cpu_data;
c->cpu_index = id;
- smp_init_package_map(c, id);
+ topology_update_package_map(c->phys_proc_id, id);
+ c->initialized = true;
}
/*
@@ -401,13 +339,16 @@ void smp_store_cpu_info(int id)
{
struct cpuinfo_x86 *c = &cpu_data(id);
- *c = boot_cpu_data;
+ /* Copy boot_cpu_data only on the first bringup */
+ if (!c->initialized)
+ *c = boot_cpu_data;
c->cpu_index = id;
/*
* During boot time, CPU0 has this setup already. Save the info when
* bringing up AP or offlined CPU0.
*/
identify_secondary_cpu(c);
+ c->initialized = true;
}
static bool
@@ -1356,7 +1297,16 @@ void __init native_smp_prepare_boot_cpu(void)
void __init native_smp_cpus_done(unsigned int max_cpus)
{
+ int ncpus;
+
pr_debug("Boot done\n");
+ /*
+ * Today neither Intel nor AMD support heterogenous systems so
+ * extrapolate the boot cpu's data to all packages.
+ */
+ ncpus = cpu_data(0).booted_cores * smp_num_siblings;
+ __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus);
+ pr_info("Max logical packages: %u\n", __max_logical_packages);
if (x86_has_numa_in_package)
set_sched_topology(x86_numa_in_package_topology);
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index a63fe77b3217..676774b9bb8d 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -188,6 +188,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
if (len > TASK_SIZE)
return -ENOMEM;
+ /* No address checking. See comment at mmap_address_hint_valid() */
if (flags & MAP_FIXED)
return addr;
@@ -197,12 +198,15 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
/* requesting a specific address */
if (addr) {
- addr = PAGE_ALIGN(addr);
+ addr &= PAGE_MASK;
+ if (!mmap_address_hint_valid(addr, len))
+ goto get_unmapped_area;
+
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vm_start_gap(vma)))
+ if (!vma || addr + len <= vm_start_gap(vma))
return addr;
}
+get_unmapped_area:
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index b7b0f74a2150..989514c94a55 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -42,7 +42,6 @@
#include <linux/edac.h>
#endif
-#include <asm/kmemcheck.h>
#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
@@ -749,10 +748,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
if (!dr6 && user_mode(regs))
user_icebp = 1;
- /* Catch kmemcheck conditions! */
- if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
- goto exit;
-
/* Store the virtualized DR6 value */
tsk->thread.debugreg6 = dr6;
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
index 6ba82be68cff..f44ce0fb3583 100644
--- a/arch/x86/kernel/umip.c
+++ b/arch/x86/kernel/umip.c
@@ -78,7 +78,60 @@
#define UMIP_INST_SGDT 0 /* 0F 01 /0 */
#define UMIP_INST_SIDT 1 /* 0F 01 /1 */
-#define UMIP_INST_SMSW 3 /* 0F 01 /4 */
+#define UMIP_INST_SMSW 2 /* 0F 01 /4 */
+#define UMIP_INST_SLDT 3 /* 0F 00 /0 */
+#define UMIP_INST_STR 4 /* 0F 00 /1 */
+
+const char * const umip_insns[5] = {
+ [UMIP_INST_SGDT] = "SGDT",
+ [UMIP_INST_SIDT] = "SIDT",
+ [UMIP_INST_SMSW] = "SMSW",
+ [UMIP_INST_SLDT] = "SLDT",
+ [UMIP_INST_STR] = "STR",
+};
+
+#define umip_pr_err(regs, fmt, ...) \
+ umip_printk(regs, KERN_ERR, fmt, ##__VA_ARGS__)
+#define umip_pr_warning(regs, fmt, ...) \
+ umip_printk(regs, KERN_WARNING, fmt, ##__VA_ARGS__)
+
+/**
+ * umip_printk() - Print a rate-limited message
+ * @regs: Register set with the context in which the warning is printed
+ * @log_level: Kernel log level to print the message
+ * @fmt: The text string to print
+ *
+ * Print the text contained in @fmt. The print rate is limited to bursts of 5
+ * messages every two minutes. The purpose of this customized version of
+ * printk() is to print messages when user space processes use any of the
+ * UMIP-protected instructions. Thus, the printed text is prepended with the
+ * task name and process ID number of the current task as well as the
+ * instruction and stack pointers in @regs as seen when entering kernel mode.
+ *
+ * Returns:
+ *
+ * None.
+ */
+static __printf(3, 4)
+void umip_printk(const struct pt_regs *regs, const char *log_level,
+ const char *fmt, ...)
+{
+ /* Bursts of 5 messages every two minutes */
+ static DEFINE_RATELIMIT_STATE(ratelimit, 2 * 60 * HZ, 5);
+ struct task_struct *tsk = current;
+ struct va_format vaf;
+ va_list args;
+
+ if (!__ratelimit(&ratelimit))
+ return;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ printk("%s" pr_fmt("%s[%d] ip:%lx sp:%lx: %pV"), log_level, tsk->comm,
+ task_pid_nr(tsk), regs->ip, regs->sp, &vaf);
+ va_end(args);
+}
/**
* identify_insn() - Identify a UMIP-protected instruction
@@ -118,10 +171,16 @@ static int identify_insn(struct insn *insn)
default:
return -EINVAL;
}
+ } else if (insn->opcode.bytes[1] == 0x0) {
+ if (X86_MODRM_REG(insn->modrm.value) == 0)
+ return UMIP_INST_SLDT;
+ else if (X86_MODRM_REG(insn->modrm.value) == 1)
+ return UMIP_INST_STR;
+ else
+ return -EINVAL;
+ } else {
+ return -EINVAL;
}
-
- /* SLDT AND STR are not emulated */
- return -EINVAL;
}
/**
@@ -228,10 +287,8 @@ static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV)))
return;
- pr_err_ratelimited("%s[%d] umip emulation segfault ip:%lx sp:%lx error:%x in %lx\n",
- tsk->comm, task_pid_nr(tsk), regs->ip,
- regs->sp, X86_PF_USER | X86_PF_WRITE,
- regs->ip);
+ umip_pr_err(regs, "segfault in emulation. error%x\n",
+ X86_PF_USER | X86_PF_WRITE);
}
/**
@@ -262,15 +319,11 @@ bool fixup_umip_exception(struct pt_regs *regs)
unsigned char buf[MAX_INSN_SIZE];
void __user *uaddr;
struct insn insn;
- char seg_defs;
+ int seg_defs;
if (!regs)
return false;
- /* Do not emulate 64-bit processes. */
- if (user_64bit_mode(regs))
- return false;
-
/*
* If not in user-space long mode, a custom code segment could be in
* use. This is true in protected mode (if the process defined a local
@@ -322,6 +375,15 @@ bool fixup_umip_exception(struct pt_regs *regs)
if (umip_inst < 0)
return false;
+ umip_pr_warning(regs, "%s instruction cannot be used by applications.\n",
+ umip_insns[umip_inst]);
+
+ /* Do not emulate SLDT, STR or user long mode processes. */
+ if (umip_inst == UMIP_INST_STR || umip_inst == UMIP_INST_SLDT || user_64bit_mode(regs))
+ return false;
+
+ umip_pr_warning(regs, "For now, expensive software emulation returns the result.\n");
+
if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size))
return false;
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index cdc70a3a6583..c2cea6651279 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -44,7 +44,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},
[CPUID_1_ECX] = { 1, 0, CPUID_ECX},
[CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX},
- [CPUID_8000_0001_ECX] = {0xc0000001, 0, CPUID_ECX},
+ [CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},
[CPUID_7_0_EBX] = { 7, 0, CPUID_EBX},
[CPUID_D_1_EAX] = { 0xd, 1, CPUID_EAX},
[CPUID_F_0_EDX] = { 0xf, 0, CPUID_EDX},
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d90cdc77e077..e7d04d0c8008 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2591,6 +2591,15 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
smbase = ctxt->ops->get_smbase(ctxt);
+
+ /*
+ * Give pre_leave_smm() a chance to make ISA-specific changes to the
+ * vCPU state (e.g. enter guest mode) before loading state from the SMM
+ * state-save area.
+ */
+ if (ctxt->ops->pre_leave_smm(ctxt, smbase))
+ return X86EMUL_UNHANDLEABLE;
+
if (emulator_has_longmode(ctxt))
ret = rsm_load_state_64(ctxt, smbase + 0x8000);
else
@@ -4005,6 +4014,26 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt)
fxstate_size(ctxt));
}
+/*
+ * FXRSTOR might restore XMM registers not provided by the guest. Fill
+ * in the host registers (via FXSAVE) instead, so they won't be modified.
+ * (preemption has to stay disabled until FXRSTOR).
+ *
+ * Use noinline to keep the stack for other functions called by callers small.
+ */
+static noinline int fxregs_fixup(struct fxregs_state *fx_state,
+ const size_t used_size)
+{
+ struct fxregs_state fx_tmp;
+ int rc;
+
+ rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_tmp));
+ memcpy((void *)fx_state + used_size, (void *)&fx_tmp + used_size,
+ __fxstate_size(16) - used_size);
+
+ return rc;
+}
+
static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
{
struct fxregs_state fx_state;
@@ -4015,19 +4044,19 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
+ size = fxstate_size(ctxt);
+ rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
ctxt->ops->get_fpu(ctxt);
- size = fxstate_size(ctxt);
if (size < __fxstate_size(16)) {
- rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+ rc = fxregs_fixup(&fx_state, size);
if (rc != X86EMUL_CONTINUE)
goto out;
}
- rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size);
- if (rc != X86EMUL_CONTINUE)
- goto out;
-
if (fx_state.mxcsr >> 16) {
rc = emulate_gp(ctxt, 0);
goto out;
@@ -4991,6 +5020,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
bool op_prefix = false;
bool has_seg_override = false;
struct opcode opcode;
+ u16 dummy;
+ struct desc_struct desc;
ctxt->memop.type = OP_NONE;
ctxt->memopp = NULL;
@@ -5009,6 +5040,11 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
switch (mode) {
case X86EMUL_MODE_REAL:
case X86EMUL_MODE_VM86:
+ def_op_bytes = def_ad_bytes = 2;
+ ctxt->ops->get_segment(ctxt, &dummy, &desc, NULL, VCPU_SREG_CS);
+ if (desc.d)
+ def_op_bytes = def_ad_bytes = 4;
+ break;
case X86EMUL_MODE_PROT16:
def_op_bytes = def_ad_bytes = 2;
break;
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index bdff437acbcb..4e822ad363f3 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -209,12 +209,12 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
old_irr = ioapic->irr;
ioapic->irr |= mask;
- if (edge)
+ if (edge) {
ioapic->irr_delivered &= ~mask;
- if ((edge && old_irr == ioapic->irr) ||
- (!edge && entry.fields.remote_irr)) {
- ret = 0;
- goto out;
+ if (old_irr == ioapic->irr) {
+ ret = 0;
+ goto out;
+ }
}
ret = ioapic_service(ioapic, irq, line_status);
@@ -257,8 +257,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)
index == RTC_GSI) {
if (kvm_apic_match_dest(vcpu, NULL, 0,
e->fields.dest_id, e->fields.dest_mode) ||
- (e->fields.trig_mode == IOAPIC_EDGE_TRIG &&
- kvm_apic_pending_eoi(vcpu, e->fields.vector)))
+ kvm_apic_pending_eoi(vcpu, e->fields.vector))
__set_bit(e->fields.vector,
ioapic_handled_vectors);
}
@@ -277,6 +276,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
{
unsigned index;
bool mask_before, mask_after;
+ int old_remote_irr, old_delivery_status;
union kvm_ioapic_redirect_entry *e;
switch (ioapic->ioregsel) {
@@ -299,14 +299,28 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
return;
e = &ioapic->redirtbl[index];
mask_before = e->fields.mask;
+ /* Preserve read-only fields */
+ old_remote_irr = e->fields.remote_irr;
+ old_delivery_status = e->fields.delivery_status;
if (ioapic->ioregsel & 1) {
e->bits &= 0xffffffff;
e->bits |= (u64) val << 32;
} else {
e->bits &= ~0xffffffffULL;
e->bits |= (u32) val;
- e->fields.remote_irr = 0;
}
+ e->fields.remote_irr = old_remote_irr;
+ e->fields.delivery_status = old_delivery_status;
+
+ /*
+ * Some OSes (Linux, Xen) assume that Remote IRR bit will
+ * be cleared by IOAPIC hardware when the entry is configured
+ * as edge-triggered. This behavior is used to simulate an
+ * explicit EOI on IOAPICs that don't have the EOI register.
+ */
+ if (e->fields.trig_mode == IOAPIC_EDGE_TRIG)
+ e->fields.remote_irr = 0;
+
mask_after = e->fields.mask;
if (mask_before != mask_after)
kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
@@ -324,7 +338,9 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
struct kvm_lapic_irq irqe;
int ret;
- if (entry->fields.mask)
+ if (entry->fields.mask ||
+ (entry->fields.trig_mode == IOAPIC_LEVEL_TRIG &&
+ entry->fields.remote_irr))
return -1;
ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 36c90d631096..e2c1fb8d35ce 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -266,9 +266,14 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
recalculate_apic_map(apic->vcpu->kvm);
}
+static inline u32 kvm_apic_calc_x2apic_ldr(u32 id)
+{
+ return ((id >> 4) << 16) | (1 << (id & 0xf));
+}
+
static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
{
- u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
+ u32 ldr = kvm_apic_calc_x2apic_ldr(id);
WARN_ON_ONCE(id != apic->vcpu->vcpu_id);
@@ -1301,14 +1306,42 @@ static void update_divide_count(struct kvm_lapic *apic)
apic->divide_count);
}
+static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
+{
+ /*
+ * Do not allow the guest to program periodic timers with small
+ * interval, since the hrtimers are not throttled by the host
+ * scheduler.
+ */
+ if (apic_lvtt_period(apic) && apic->lapic_timer.period) {
+ s64 min_period = min_timer_period_us * 1000LL;
+
+ if (apic->lapic_timer.period < min_period) {
+ pr_info_ratelimited(
+ "kvm: vcpu %i: requested %lld ns "
+ "lapic timer period limited to %lld ns\n",
+ apic->vcpu->vcpu_id,
+ apic->lapic_timer.period, min_period);
+ apic->lapic_timer.period = min_period;
+ }
+ }
+}
+
static void apic_update_lvtt(struct kvm_lapic *apic)
{
u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
apic->lapic_timer.timer_mode_mask;
if (apic->lapic_timer.timer_mode != timer_mode) {
+ if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
+ APIC_LVT_TIMER_TSCDEADLINE)) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ kvm_lapic_set_reg(apic, APIC_TMICT, 0);
+ apic->lapic_timer.period = 0;
+ apic->lapic_timer.tscdeadline = 0;
+ }
apic->lapic_timer.timer_mode = timer_mode;
- hrtimer_cancel(&apic->lapic_timer.timer);
+ limit_periodic_timer_frequency(apic);
}
}
@@ -1430,6 +1463,30 @@ static void start_sw_period(struct kvm_lapic *apic)
HRTIMER_MODE_ABS_PINNED);
}
+static void update_target_expiration(struct kvm_lapic *apic, uint32_t old_divisor)
+{
+ ktime_t now, remaining;
+ u64 ns_remaining_old, ns_remaining_new;
+
+ apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
+ * APIC_BUS_CYCLE_NS * apic->divide_count;
+ limit_periodic_timer_frequency(apic);
+
+ now = ktime_get();
+ remaining = ktime_sub(apic->lapic_timer.target_expiration, now);
+ if (ktime_to_ns(remaining) < 0)
+ remaining = 0;
+
+ ns_remaining_old = ktime_to_ns(remaining);
+ ns_remaining_new = mul_u64_u32_div(ns_remaining_old,
+ apic->divide_count, old_divisor);
+
+ apic->lapic_timer.tscdeadline +=
+ nsec_to_cycles(apic->vcpu, ns_remaining_new) -
+ nsec_to_cycles(apic->vcpu, ns_remaining_old);
+ apic->lapic_timer.target_expiration = ktime_add_ns(now, ns_remaining_new);
+}
+
static bool set_target_expiration(struct kvm_lapic *apic)
{
ktime_t now;
@@ -1439,27 +1496,13 @@ static bool set_target_expiration(struct kvm_lapic *apic)
apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT)
* APIC_BUS_CYCLE_NS * apic->divide_count;
- if (!apic->lapic_timer.period)
+ if (!apic->lapic_timer.period) {
+ apic->lapic_timer.tscdeadline = 0;
return false;
-
- /*
- * Do not allow the guest to program periodic timers with small
- * interval, since the hrtimers are not throttled by the host
- * scheduler.
- */
- if (apic_lvtt_period(apic)) {
- s64 min_period = min_timer_period_us * 1000LL;
-
- if (apic->lapic_timer.period < min_period) {
- pr_info_ratelimited(
- "kvm: vcpu %i: requested %lld ns "
- "lapic timer period limited to %lld ns\n",
- apic->vcpu->vcpu_id,
- apic->lapic_timer.period, min_period);
- apic->lapic_timer.period = min_period;
- }
}
+ limit_periodic_timer_frequency(apic);
+
apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
PRIx64 ", "
"timer initial count 0x%x, period %lldns, "
@@ -1515,6 +1558,9 @@ static bool start_hv_timer(struct kvm_lapic *apic)
if (!apic_lvtt_period(apic) && atomic_read(&ktimer->pending))
return false;
+ if (!ktimer->tscdeadline)
+ return false;
+
r = kvm_x86_ops->set_hv_timer(apic->vcpu, ktimer->tscdeadline);
if (r < 0)
return false;
@@ -1738,13 +1784,21 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
start_apic_timer(apic);
break;
- case APIC_TDCR:
+ case APIC_TDCR: {
+ uint32_t old_divisor = apic->divide_count;
+
if (val & 4)
apic_debug("KVM_WRITE:TDCR %x\n", val);
kvm_lapic_set_reg(apic, APIC_TDCR, val);
update_divide_count(apic);
+ if (apic->divide_count != old_divisor &&
+ apic->lapic_timer.period) {
+ hrtimer_cancel(&apic->lapic_timer.timer);
+ update_target_expiration(apic, old_divisor);
+ restart_apic_timer(apic);
+ }
break;
-
+ }
case APIC_ESR:
if (apic_x2apic_mode(apic) && val != 0) {
apic_debug("KVM_WRITE:ESR not zero %x\n", val);
@@ -2196,6 +2250,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
{
if (apic_x2apic_mode(vcpu->arch.apic)) {
u32 *id = (u32 *)(s->regs + APIC_ID);
+ u32 *ldr = (u32 *)(s->regs + APIC_LDR);
if (vcpu->kvm->arch.x2apic_format) {
if (*id != vcpu->vcpu_id)
@@ -2206,6 +2261,10 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
else
*id <<= 24;
}
+
+ /* In x2APIC mode, the LDR is fixed and based on the id */
+ if (set)
+ *ldr = kvm_apic_calc_x2apic_ldr(*id);
}
return 0;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a119b361b8b7..e5e66e5c6640 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -150,6 +150,20 @@ module_param(dbg, bool, 0644);
/* make pte_list_desc fit well in cache line */
#define PTE_LIST_EXT 3
+/*
+ * Return values of handle_mmio_page_fault and mmu.page_fault:
+ * RET_PF_RETRY: let CPU fault again on the address.
+ * RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
+ *
+ * For handle_mmio_page_fault only:
+ * RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
+ */
+enum {
+ RET_PF_RETRY = 0,
+ RET_PF_EMULATE = 1,
+ RET_PF_INVALID = 2,
+};
+
struct pte_list_desc {
u64 *sptes[PTE_LIST_EXT];
struct pte_list_desc *more;
@@ -2424,7 +2438,7 @@ static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
- return __shadow_walk_next(iterator, *iterator->sptep);
+ __shadow_walk_next(iterator, *iterator->sptep);
}
static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
@@ -2794,13 +2808,13 @@ done:
return ret;
}
-static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
- int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
- bool speculative, bool host_writable)
+static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
+ int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
+ bool speculative, bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
- bool emulate = false;
+ int ret = RET_PF_RETRY;
pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
*sptep, write_fault, gfn);
@@ -2830,12 +2844,12 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
true, host_writable)) {
if (write_fault)
- emulate = true;
+ ret = RET_PF_EMULATE;
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
}
if (unlikely(is_mmio_spte(*sptep)))
- emulate = true;
+ ret = RET_PF_EMULATE;
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
@@ -2855,7 +2869,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
kvm_release_pfn_clean(pfn);
- return emulate;
+ return ret;
}
static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
@@ -2994,14 +3008,13 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
* Do not cache the mmio info caused by writing the readonly gfn
* into the spte otherwise read access on readonly gfn also can
* caused mmio page fault and treat it as mmio access.
- * Return 1 to tell kvm to emulate it.
*/
if (pfn == KVM_PFN_ERR_RO_FAULT)
- return 1;
+ return RET_PF_EMULATE;
if (pfn == KVM_PFN_ERR_HWPOISON) {
kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
- return 0;
+ return RET_PF_RETRY;
}
return -EFAULT;
@@ -3286,13 +3299,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
}
if (fast_page_fault(vcpu, v, level, error_code))
- return 0;
+ return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
return r;
@@ -3312,7 +3325,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
@@ -3659,54 +3672,38 @@ exit:
return reserved;
}
-/*
- * Return values of handle_mmio_page_fault:
- * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction
- * directly.
- * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page
- * fault path update the mmio spte.
- * RET_MMIO_PF_RETRY: let CPU fault again on the address.
- * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed).
- */
-enum {
- RET_MMIO_PF_EMULATE = 1,
- RET_MMIO_PF_INVALID = 2,
- RET_MMIO_PF_RETRY = 0,
- RET_MMIO_PF_BUG = -1
-};
-
static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
{
u64 spte;
bool reserved;
if (mmio_info_in_cache(vcpu, addr, direct))
- return RET_MMIO_PF_EMULATE;
+ return RET_PF_EMULATE;
reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
if (WARN_ON(reserved))
- return RET_MMIO_PF_BUG;
+ return -EINVAL;
if (is_mmio_spte(spte)) {
gfn_t gfn = get_mmio_spte_gfn(spte);
unsigned access = get_mmio_spte_access(spte);
if (!check_mmio_spte(vcpu, spte))
- return RET_MMIO_PF_INVALID;
+ return RET_PF_INVALID;
if (direct)
addr = 0;
trace_handle_mmio_page_fault(addr, gfn, access);
vcpu_cache_mmio_info(vcpu, addr, gfn, access);
- return RET_MMIO_PF_EMULATE;
+ return RET_PF_EMULATE;
}
/*
* If the page table is zapped by other cpus, let CPU fault again on
* the address.
*/
- return RET_MMIO_PF_RETRY;
+ return RET_PF_RETRY;
}
EXPORT_SYMBOL_GPL(handle_mmio_page_fault);
@@ -3756,7 +3753,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return 1;
+ return RET_PF_EMULATE;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3820,8 +3817,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
}
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
- u64 fault_address, char *insn, int insn_len,
- bool need_unprotect)
+ u64 fault_address, char *insn, int insn_len)
{
int r = 1;
@@ -3829,7 +3825,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
default:
trace_kvm_page_fault(fault_address, error_code);
- if (need_unprotect && kvm_event_needs_reinjection(vcpu))
+ if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, fault_address);
r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
insn_len);
@@ -3876,7 +3872,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
if (page_fault_handle_page_track(vcpu, error_code, gfn))
- return 1;
+ return RET_PF_EMULATE;
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -3893,13 +3889,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
}
if (fast_page_fault(vcpu, gpa, level, error_code))
- return 0;
+ return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
return r;
@@ -3919,7 +3915,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
static void nonpaging_init_context(struct kvm_vcpu *vcpu,
@@ -4918,25 +4914,25 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
vcpu->arch.gpa_val = cr2;
}
+ r = RET_PF_INVALID;
if (unlikely(error_code & PFERR_RSVD_MASK)) {
r = handle_mmio_page_fault(vcpu, cr2, direct);
- if (r == RET_MMIO_PF_EMULATE) {
+ if (r == RET_PF_EMULATE) {
emulation_type = 0;
goto emulate;
}
- if (r == RET_MMIO_PF_RETRY)
- return 1;
- if (r < 0)
- return r;
- /* Must be RET_MMIO_PF_INVALID. */
}
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
- false);
+ if (r == RET_PF_INVALID) {
+ r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
+ false);
+ WARN_ON(r == RET_PF_INVALID);
+ }
+
+ if (r == RET_PF_RETRY)
+ return 1;
if (r < 0)
return r;
- if (!r)
- return 1;
/*
* Before emulating the instruction, check if the error code
@@ -4993,8 +4989,7 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
free_page((unsigned long)vcpu->arch.mmu.pae_root);
- if (vcpu->arch.mmu.lm_root != NULL)
- free_page((unsigned long)vcpu->arch.mmu.lm_root);
+ free_page((unsigned long)vcpu->arch.mmu.lm_root);
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -5464,10 +5459,8 @@ static struct shrinker mmu_shrinker = {
static void mmu_destroy_caches(void)
{
- if (pte_list_desc_cache)
- kmem_cache_destroy(pte_list_desc_cache);
- if (mmu_page_header_cache)
- kmem_cache_destroy(mmu_page_header_cache);
+ kmem_cache_destroy(pte_list_desc_cache);
+ kmem_cache_destroy(mmu_page_header_cache);
}
int kvm_mmu_module_init(void)
@@ -5476,13 +5469,13 @@ int kvm_mmu_module_init(void)
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
- 0, 0, NULL);
+ 0, SLAB_ACCOUNT, NULL);
if (!pte_list_desc_cache)
goto nomem;
mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
sizeof(struct kvm_mmu_page),
- 0, 0, NULL);
+ 0, SLAB_ACCOUNT, NULL);
if (!mmu_page_header_cache)
goto nomem;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index efc857615d8e..5b408c0ad612 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -66,8 +66,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
bool accessed_dirty);
bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
- u64 fault_address, char *insn, int insn_len,
- bool need_unprotect);
+ u64 fault_address, char *insn, int insn_len);
static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index f18d1f8d332b..5abae72266b7 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -593,7 +593,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
- int top_level, emulate;
+ int top_level, ret;
direct_access = gw->pte_access;
@@ -659,15 +659,15 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
}
clear_sp_write_flooding_count(it.sptep);
- emulate = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
- it.level, gw->gfn, pfn, prefault, map_writable);
+ ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
+ it.level, gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
- return emulate;
+ return ret;
out_gpte_changed:
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
/*
@@ -762,12 +762,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (!prefault)
inject_page_fault(vcpu, &walker.fault);
- return 0;
+ return RET_PF_RETRY;
}
if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
shadow_page_table_clear_flood(vcpu, addr);
- return 1;
+ return RET_PF_EMULATE;
}
vcpu->arch.write_fault_to_shadow_pgtable = false;
@@ -789,7 +789,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
&map_writable))
- return 0;
+ return RET_PF_RETRY;
if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
return r;
@@ -834,7 +834,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
- return 0;
+ return RET_PF_RETRY;
}
static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0e68f0b3cbf7..eb714f1cdf7e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -361,6 +361,7 @@ static void recalc_intercepts(struct vcpu_svm *svm)
{
struct vmcb_control_area *c, *h;
struct nested_state *g;
+ u32 h_intercept_exceptions;
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
@@ -371,9 +372,14 @@ static void recalc_intercepts(struct vcpu_svm *svm)
h = &svm->nested.hsave->control;
g = &svm->nested;
+ /* No need to intercept #UD if L1 doesn't intercept it */
+ h_intercept_exceptions =
+ h->intercept_exceptions & ~(1U << UD_VECTOR);
+
c->intercept_cr = h->intercept_cr | g->intercept_cr;
c->intercept_dr = h->intercept_dr | g->intercept_dr;
- c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
+ c->intercept_exceptions =
+ h_intercept_exceptions | g->intercept_exceptions;
c->intercept = h->intercept | g->intercept;
}
@@ -1034,15 +1040,12 @@ static int avic_ga_log_notifier(u32 ga_tag)
}
spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags);
- if (!vcpu)
- return 0;
-
/* Note:
* At this point, the IOMMU should have already set the pending
* bit in the vAPIC backing page. So, we just need to schedule
* in the vcpu.
*/
- if (vcpu->mode == OUTSIDE_GUEST_MODE)
+ if (vcpu)
kvm_vcpu_wake_up(vcpu);
return 0;
@@ -2144,7 +2147,18 @@ static int pf_interception(struct vcpu_svm *svm)
return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address,
svm->vmcb->control.insn_bytes,
- svm->vmcb->control.insn_len, !npt_enabled);
+ svm->vmcb->control.insn_len);
+}
+
+static int npf_interception(struct vcpu_svm *svm)
+{
+ u64 fault_address = svm->vmcb->control.exit_info_2;
+ u64 error_code = svm->vmcb->control.exit_info_1;
+
+ trace_kvm_page_fault(fault_address, error_code);
+ return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
+ svm->vmcb->control.insn_bytes,
+ svm->vmcb->control.insn_len);
}
static int db_interception(struct vcpu_svm *svm)
@@ -2188,7 +2202,10 @@ static int ud_interception(struct vcpu_svm *svm)
{
int er;
+ WARN_ON_ONCE(is_guest_mode(&svm->vcpu));
er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
+ if (er == EMULATE_USER_EXIT)
+ return 0;
if (er != EMULATE_DONE)
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
@@ -2916,70 +2933,9 @@ static bool nested_vmcb_checks(struct vmcb *vmcb)
return true;
}
-static bool nested_svm_vmrun(struct vcpu_svm *svm)
+static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
+ struct vmcb *nested_vmcb, struct page *page)
{
- struct vmcb *nested_vmcb;
- struct vmcb *hsave = svm->nested.hsave;
- struct vmcb *vmcb = svm->vmcb;
- struct page *page;
- u64 vmcb_gpa;
-
- vmcb_gpa = svm->vmcb->save.rax;
-
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
- if (!nested_vmcb)
- return false;
-
- if (!nested_vmcb_checks(nested_vmcb)) {
- nested_vmcb->control.exit_code = SVM_EXIT_ERR;
- nested_vmcb->control.exit_code_hi = 0;
- nested_vmcb->control.exit_info_1 = 0;
- nested_vmcb->control.exit_info_2 = 0;
-
- nested_svm_unmap(page);
-
- return false;
- }
-
- trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
- nested_vmcb->save.rip,
- nested_vmcb->control.int_ctl,
- nested_vmcb->control.event_inj,
- nested_vmcb->control.nested_ctl);
-
- trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
- nested_vmcb->control.intercept_cr >> 16,
- nested_vmcb->control.intercept_exceptions,
- nested_vmcb->control.intercept);
-
- /* Clear internal status */
- kvm_clear_exception_queue(&svm->vcpu);
- kvm_clear_interrupt_queue(&svm->vcpu);
-
- /*
- * Save the old vmcb, so we don't need to pick what we save, but can
- * restore everything when a VMEXIT occurs
- */
- hsave->save.es = vmcb->save.es;
- hsave->save.cs = vmcb->save.cs;
- hsave->save.ss = vmcb->save.ss;
- hsave->save.ds = vmcb->save.ds;
- hsave->save.gdtr = vmcb->save.gdtr;
- hsave->save.idtr = vmcb->save.idtr;
- hsave->save.efer = svm->vcpu.arch.efer;
- hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
- hsave->save.cr4 = svm->vcpu.arch.cr4;
- hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
- hsave->save.rip = kvm_rip_read(&svm->vcpu);
- hsave->save.rsp = vmcb->save.rsp;
- hsave->save.rax = vmcb->save.rax;
- if (npt_enabled)
- hsave->save.cr3 = vmcb->save.cr3;
- else
- hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
-
- copy_vmcb_control_area(hsave, vmcb);
-
if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
else
@@ -3072,6 +3028,73 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
enable_gif(svm);
mark_all_dirty(svm->vmcb);
+}
+
+static bool nested_svm_vmrun(struct vcpu_svm *svm)
+{
+ struct vmcb *nested_vmcb;
+ struct vmcb *hsave = svm->nested.hsave;
+ struct vmcb *vmcb = svm->vmcb;
+ struct page *page;
+ u64 vmcb_gpa;
+
+ vmcb_gpa = svm->vmcb->save.rax;
+
+ nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
+ if (!nested_vmcb)
+ return false;
+
+ if (!nested_vmcb_checks(nested_vmcb)) {
+ nested_vmcb->control.exit_code = SVM_EXIT_ERR;
+ nested_vmcb->control.exit_code_hi = 0;
+ nested_vmcb->control.exit_info_1 = 0;
+ nested_vmcb->control.exit_info_2 = 0;
+
+ nested_svm_unmap(page);
+
+ return false;
+ }
+
+ trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
+ nested_vmcb->save.rip,
+ nested_vmcb->control.int_ctl,
+ nested_vmcb->control.event_inj,
+ nested_vmcb->control.nested_ctl);
+
+ trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
+ nested_vmcb->control.intercept_cr >> 16,
+ nested_vmcb->control.intercept_exceptions,
+ nested_vmcb->control.intercept);
+
+ /* Clear internal status */
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
+ /*
+ * Save the old vmcb, so we don't need to pick what we save, but can
+ * restore everything when a VMEXIT occurs
+ */
+ hsave->save.es = vmcb->save.es;
+ hsave->save.cs = vmcb->save.cs;
+ hsave->save.ss = vmcb->save.ss;
+ hsave->save.ds = vmcb->save.ds;
+ hsave->save.gdtr = vmcb->save.gdtr;
+ hsave->save.idtr = vmcb->save.idtr;
+ hsave->save.efer = svm->vcpu.arch.efer;
+ hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
+ hsave->save.cr4 = svm->vcpu.arch.cr4;
+ hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
+ hsave->save.rip = kvm_rip_read(&svm->vcpu);
+ hsave->save.rsp = vmcb->save.rsp;
+ hsave->save.rax = vmcb->save.rax;
+ if (npt_enabled)
+ hsave->save.cr3 = vmcb->save.cr3;
+ else
+ hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
+
+ copy_vmcb_control_area(hsave, vmcb);
+
+ enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
return true;
}
@@ -3173,7 +3196,7 @@ static int stgi_interception(struct vcpu_svm *svm)
/*
* If VGIF is enabled, the STGI intercept is only added to
- * detect the opening of the NMI window; remove it now.
+ * detect the opening of the SMI/NMI window; remove it now.
*/
if (vgif_enabled(svm))
clr_intercept(svm, INTERCEPT_STGI);
@@ -3657,6 +3680,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
u32 ecx = msr->index;
u64 data = msr->data;
switch (ecx) {
+ case MSR_IA32_CR_PAT:
+ if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
+ return 1;
+ vcpu->arch.pat = data;
+ svm->vmcb->save.g_pat = data;
+ mark_dirty(svm->vmcb, VMCB_NPT);
+ break;
case MSR_IA32_TSC:
kvm_write_tsc(vcpu, msr);
break;
@@ -4131,7 +4161,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_MONITOR] = monitor_interception,
[SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
- [SVM_EXIT_NPF] = pf_interception,
+ [SVM_EXIT_NPF] = npf_interception,
[SVM_EXIT_RSM] = emulate_on_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
[SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception,
@@ -5393,6 +5423,88 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu)
vcpu->arch.mcg_cap &= 0x1ff;
}
+static int svm_smi_allowed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /* Per APM Vol.2 15.22.2 "Response to SMI" */
+ if (!gif_set(svm))
+ return 0;
+
+ if (is_guest_mode(&svm->vcpu) &&
+ svm->nested.intercept & (1ULL << INTERCEPT_SMI)) {
+ /* TODO: Might need to set exit_info_1 and exit_info_2 here */
+ svm->vmcb->control.exit_code = SVM_EXIT_SMI;
+ svm->nested.exit_required = true;
+ return 0;
+ }
+
+ return 1;
+}
+
+static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ int ret;
+
+ if (is_guest_mode(vcpu)) {
+ /* FED8h - SVM Guest */
+ put_smstate(u64, smstate, 0x7ed8, 1);
+ /* FEE0h - SVM Guest VMCB Physical Address */
+ put_smstate(u64, smstate, 0x7ee0, svm->nested.vmcb);
+
+ svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
+ svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
+ svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
+
+ ret = nested_svm_vmexit(svm);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
+static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *nested_vmcb;
+ struct page *page;
+ struct {
+ u64 guest;
+ u64 vmcb;
+ } svm_state_save;
+ int ret;
+
+ ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save,
+ sizeof(svm_state_save));
+ if (ret)
+ return ret;
+
+ if (svm_state_save.guest) {
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page);
+ if (nested_vmcb)
+ enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page);
+ else
+ ret = 1;
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ }
+ return ret;
+}
+
+static int enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ if (!gif_set(svm)) {
+ if (vgif_enabled(svm))
+ set_intercept(svm, INTERCEPT_STGI);
+ /* STGI will cause a vm exit */
+ return 1;
+ }
+ return 0;
+}
+
static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -5503,6 +5615,11 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.deliver_posted_interrupt = svm_deliver_avic_intr,
.update_pi_irte = svm_update_pi_irte,
.setup_mce = svm_setup_mce,
+
+ .smi_allowed = svm_smi_allowed,
+ .pre_enter_smm = svm_pre_enter_smm,
+ .pre_leave_smm = svm_pre_leave_smm,
+ .enable_smi_window = enable_smi_window,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a6f4f095f8f4..4704aaf6d19e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -70,6 +70,9 @@ MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
static bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);
+static bool __read_mostly enable_vnmi = 1;
+module_param_named(vnmi, enable_vnmi, bool, S_IRUGO);
+
static bool __read_mostly flexpriority_enabled = 1;
module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
@@ -202,6 +205,10 @@ struct loaded_vmcs {
bool nmi_known_unmasked;
unsigned long vmcs_host_cr3; /* May not match real cr3 */
unsigned long vmcs_host_cr4; /* May not match real cr4 */
+ /* Support for vnmi-less CPUs */
+ int soft_vnmi_blocked;
+ ktime_t entry_time;
+ s64 vnmi_blocked_time;
struct list_head loaded_vmcss_on_cpu_link;
};
@@ -486,6 +493,14 @@ struct nested_vmx {
u64 nested_vmx_cr4_fixed1;
u64 nested_vmx_vmcs_enum;
u64 nested_vmx_vmfunc_controls;
+
+ /* SMM related state */
+ struct {
+ /* in VMX operation on SMM entry? */
+ bool vmxon;
+ /* in guest mode on SMM entry? */
+ bool guest_mode;
+ } smm;
};
#define POSTED_INTR_ON 0
@@ -900,16 +915,13 @@ static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
static bool vmx_xsaves_supported(void);
-static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
-static int alloc_identity_pagetable(struct kvm *kvm);
static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
@@ -1286,6 +1298,11 @@ static inline bool cpu_has_vmx_invpcid(void)
SECONDARY_EXEC_ENABLE_INVPCID;
}
+static inline bool cpu_has_virtual_nmis(void)
+{
+ return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
+}
+
static inline bool cpu_has_vmx_wbinvd_exit(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -1343,11 +1360,6 @@ static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
(vmcs12->secondary_vm_exec_control & bit);
}
-static inline bool nested_cpu_has_virtual_nmis(struct vmcs12 *vmcs12)
-{
- return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
-}
-
static inline bool nested_cpu_has_preemption_timer(struct vmcs12 *vmcs12)
{
return vmcs12->pin_based_vm_exec_control &
@@ -1598,18 +1610,15 @@ static inline void vpid_sync_context(int vpid)
static inline void ept_sync_global(void)
{
- if (cpu_has_vmx_invept_global())
- __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
+ __invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);
}
static inline void ept_sync_context(u64 eptp)
{
- if (enable_ept) {
- if (cpu_has_vmx_invept_context())
- __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
- else
- ept_sync_global();
- }
+ if (cpu_has_vmx_invept_context())
+ __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
+ else
+ ept_sync_global();
}
static __always_inline void vmcs_check16(unsigned long field)
@@ -1878,7 +1887,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
{
u32 eb;
- eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
+ eb = (1u << PF_VECTOR) | (1u << MC_VECTOR) |
(1u << DB_VECTOR) | (1u << AC_VECTOR);
if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
@@ -1896,6 +1905,8 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
*/
if (is_guest_mode(vcpu))
eb |= get_vmcs12(vcpu)->exception_bitmap;
+ else
+ eb |= 1u << UD_VECTOR;
vmcs_write32(EXCEPTION_BITMAP, eb);
}
@@ -2831,8 +2842,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
SECONDARY_EXEC_ENABLE_PML;
vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT;
}
- } else
- vmx->nested.nested_vmx_ept_caps = 0;
+ }
if (cpu_has_vmx_vmfunc()) {
vmx->nested.nested_vmx_secondary_ctls_high |=
@@ -2841,8 +2851,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
* Advertise EPTP switching unconditionally
* since we emulate it
*/
- vmx->nested.nested_vmx_vmfunc_controls =
- VMX_VMFUNC_EPTP_SWITCHING;
+ if (enable_ept)
+ vmx->nested.nested_vmx_vmfunc_controls =
+ VMX_VMFUNC_EPTP_SWITCHING;
}
/*
@@ -2856,8 +2867,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
SECONDARY_EXEC_ENABLE_VPID;
vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
VMX_VPID_EXTENT_SUPPORTED_MASK;
- } else
- vmx->nested.nested_vmx_vpid_caps = 0;
+ }
if (enable_unrestricted_guest)
vmx->nested.nested_vmx_secondary_ctls_high |=
@@ -3544,7 +3554,8 @@ static int hardware_enable(void)
wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
}
kvm_cpu_vmxon(phys_addr);
- ept_sync_global();
+ if (enable_ept)
+ ept_sync_global();
return 0;
}
@@ -3657,8 +3668,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_SHADOW_VMCS |
SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_RDSEED |
- SECONDARY_EXEC_RDRAND |
+ SECONDARY_EXEC_RDSEED_EXITING |
+ SECONDARY_EXEC_RDRAND_EXITING |
SECONDARY_EXEC_ENABLE_PML |
SECONDARY_EXEC_TSC_SCALING |
SECONDARY_EXEC_ENABLE_VMFUNC;
@@ -3679,14 +3690,25 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+ rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
+ &vmx_capability.ept, &vmx_capability.vpid);
+
if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
/* CR3 accesses and invlpg don't need to cause VM Exits when EPT
enabled */
_cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_INVLPG_EXITING);
- rdmsr(MSR_IA32_VMX_EPT_VPID_CAP,
- vmx_capability.ept, vmx_capability.vpid);
+ } else if (vmx_capability.ept) {
+ vmx_capability.ept = 0;
+ pr_warn_once("EPT CAP should not exist if not support "
+ "1-setting enable EPT VM-execution control\n");
+ }
+ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
+ vmx_capability.vpid) {
+ vmx_capability.vpid = 0;
+ pr_warn_once("VPID CAP should not exist if not support "
+ "1-setting enable VPID VM-execution control\n");
}
min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
@@ -3699,9 +3721,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
&_vmexit_control) < 0)
return -EIO;
- min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING |
- PIN_BASED_VIRTUAL_NMIS;
- opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER;
+ min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
+ opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+ PIN_BASED_VMX_PREEMPTION_TIMER;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
&_pin_based_exec_control) < 0)
return -EIO;
@@ -4781,18 +4803,18 @@ static int init_rmode_identity_map(struct kvm *kvm)
kvm_pfn_t identity_map_pfn;
u32 tmp;
- if (!enable_ept)
- return 0;
-
/* Protect kvm->arch.ept_identity_pagetable_done. */
mutex_lock(&kvm->slots_lock);
if (likely(kvm->arch.ept_identity_pagetable_done))
goto out2;
+ if (!kvm->arch.ept_identity_map_addr)
+ kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
- r = alloc_identity_pagetable(kvm);
+ r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ kvm->arch.ept_identity_map_addr, PAGE_SIZE);
if (r < 0)
goto out2;
@@ -4864,20 +4886,6 @@ out:
return r;
}
-static int alloc_identity_pagetable(struct kvm *kvm)
-{
- /* Called with kvm->slots_lock held. */
-
- int r = 0;
-
- BUG_ON(kvm->arch.ept_identity_pagetable_done);
-
- r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
- kvm->arch.ept_identity_map_addr, PAGE_SIZE);
-
- return r;
-}
-
static int allocate_vpid(void)
{
int vpid;
@@ -5233,6 +5241,10 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
if (!kvm_vcpu_apicv_active(&vmx->vcpu))
pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+
+ if (!enable_vnmi)
+ pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
+
/* Enable the preemption timer dynamically */
pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
return pin_based_exec_ctrl;
@@ -5282,13 +5294,13 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
static bool vmx_rdrand_supported(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_RDRAND;
+ SECONDARY_EXEC_RDRAND_EXITING;
}
static bool vmx_rdseed_supported(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_RDSEED;
+ SECONDARY_EXEC_RDSEED_EXITING;
}
static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
@@ -5382,30 +5394,30 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
if (vmx_rdrand_supported()) {
bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
if (rdrand_enabled)
- exec_control &= ~SECONDARY_EXEC_RDRAND;
+ exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
if (nested) {
if (rdrand_enabled)
vmx->nested.nested_vmx_secondary_ctls_high |=
- SECONDARY_EXEC_RDRAND;
+ SECONDARY_EXEC_RDRAND_EXITING;
else
vmx->nested.nested_vmx_secondary_ctls_high &=
- ~SECONDARY_EXEC_RDRAND;
+ ~SECONDARY_EXEC_RDRAND_EXITING;
}
}
if (vmx_rdseed_supported()) {
bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
if (rdseed_enabled)
- exec_control &= ~SECONDARY_EXEC_RDSEED;
+ exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
if (nested) {
if (rdseed_enabled)
vmx->nested.nested_vmx_secondary_ctls_high |=
- SECONDARY_EXEC_RDSEED;
+ SECONDARY_EXEC_RDSEED_EXITING;
else
vmx->nested.nested_vmx_secondary_ctls_high &=
- ~SECONDARY_EXEC_RDSEED;
+ ~SECONDARY_EXEC_RDSEED_EXITING;
}
}
@@ -5426,7 +5438,7 @@ static void ept_set_mmio_spte_mask(void)
/*
* Sets up the vmcs for emulated real mode.
*/
-static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
+static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
{
#ifdef CONFIG_X86_64
unsigned long a;
@@ -5539,8 +5551,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
}
-
- return 0;
}
static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -5592,7 +5602,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
}
- vmcs_writel(GUEST_RFLAGS, 0x02);
+ kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
kvm_rip_write(vcpu, 0xfff0);
vmcs_writel(GUEST_GDTR_BASE, 0);
@@ -5604,6 +5614,8 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, 0);
+ if (kvm_mpx_supported())
+ vmcs_write64(GUEST_BNDCFGS, 0);
setup_msrs(vmx);
@@ -5667,7 +5679,8 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
static void enable_nmi_window(struct kvm_vcpu *vcpu)
{
- if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
+ if (!enable_vnmi ||
+ vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
enable_irq_window(vcpu);
return;
}
@@ -5707,6 +5720,19 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ if (!enable_vnmi) {
+ /*
+ * Tracking the NMI-blocked state in software is built upon
+ * finding the next open IRQ window. This, in turn, depends on
+ * well-behaving guests: They have to keep IRQs disabled at
+ * least as long as the NMI handler runs. Otherwise we may
+ * cause NMI nesting, maybe breaking the guest. But as this is
+ * highly unlikely, we can live with the residual risk.
+ */
+ vmx->loaded_vmcs->soft_vnmi_blocked = 1;
+ vmx->loaded_vmcs->vnmi_blocked_time = 0;
+ }
+
++vcpu->stat.nmi_injections;
vmx->loaded_vmcs->nmi_known_unmasked = false;
@@ -5725,6 +5751,8 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool masked;
+ if (!enable_vnmi)
+ return vmx->loaded_vmcs->soft_vnmi_blocked;
if (vmx->loaded_vmcs->nmi_known_unmasked)
return false;
masked = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
@@ -5736,13 +5764,20 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- vmx->loaded_vmcs->nmi_known_unmasked = !masked;
- if (masked)
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- else
- vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
+ if (!enable_vnmi) {
+ if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
+ vmx->loaded_vmcs->soft_vnmi_blocked = masked;
+ vmx->loaded_vmcs->vnmi_blocked_time = 0;
+ }
+ } else {
+ vmx->loaded_vmcs->nmi_known_unmasked = !masked;
+ if (masked)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ }
}
static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -5750,6 +5785,10 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
if (to_vmx(vcpu)->nested.nested_run_pending)
return 0;
+ if (!enable_vnmi &&
+ to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
+ return 0;
+
return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
| GUEST_INTR_STATE_NMI));
@@ -5878,11 +5917,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
return 1; /* already handled by vmx_vcpu_run() */
if (is_invalid_opcode(intr_info)) {
- if (is_guest_mode(vcpu)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
+ WARN_ON_ONCE(is_guest_mode(vcpu));
er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
+ if (er == EMULATE_USER_EXIT)
+ return 0;
if (er != EMULATE_DONE)
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
@@ -5912,8 +5950,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
cr2 = vmcs_readl(EXIT_QUALIFICATION);
/* EPT won't cause page fault directly */
WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
- return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0,
- true);
+ return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
}
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@@ -6478,6 +6515,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
* AAK134, BY25.
*/
if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ enable_vnmi &&
(exit_qualification & INTR_INFO_UNBLOCK_NMI))
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
@@ -6537,6 +6575,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
+ WARN_ON_ONCE(!enable_vnmi);
vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
CPU_BASED_VIRTUAL_NMI_PENDING);
++vcpu->stat.nmi_window_exits;
@@ -6564,7 +6603,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (kvm_test_request(KVM_REQ_EVENT, vcpu))
return 1;
- err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE);
+ err = emulate_instruction(vcpu, 0);
if (err == EMULATE_USER_EXIT) {
++vcpu->stat.mmio_exits;
@@ -6747,21 +6786,22 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_ept() ||
!cpu_has_vmx_ept_4levels() ||
- !cpu_has_vmx_ept_mt_wb()) {
+ !cpu_has_vmx_ept_mt_wb() ||
+ !cpu_has_vmx_invept_global())
enable_ept = 0;
- enable_unrestricted_guest = 0;
- enable_ept_ad_bits = 0;
- }
if (!cpu_has_vmx_ept_ad_bits() || !enable_ept)
enable_ept_ad_bits = 0;
- if (!cpu_has_vmx_unrestricted_guest())
+ if (!cpu_has_vmx_unrestricted_guest() || !enable_ept)
enable_unrestricted_guest = 0;
if (!cpu_has_vmx_flexpriority())
flexpriority_enabled = 0;
+ if (!cpu_has_virtual_nmis())
+ enable_vnmi = 0;
+
/*
* set_apic_access_page_addr() is used to reload apic access
* page upon invalidation. No need to do anything if not
@@ -6776,8 +6816,13 @@ static __init int hardware_setup(void)
if (enable_ept && !cpu_has_vmx_ept_2m_page())
kvm_disable_largepages();
- if (!cpu_has_vmx_ple())
+ if (!cpu_has_vmx_ple()) {
ple_gap = 0;
+ ple_window = 0;
+ ple_window_grow = 0;
+ ple_window_max = 0;
+ ple_window_shrink = 0;
+ }
if (!cpu_has_vmx_apicv()) {
enable_apicv = 0;
@@ -6961,7 +7006,7 @@ static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
}
/* Create a new VMCS */
- item = kmalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
+ item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
if (!item)
return NULL;
item->vmcs02.vmcs = alloc_vmcs();
@@ -7370,10 +7415,11 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
*/
static void free_nested(struct vcpu_vmx *vmx)
{
- if (!vmx->nested.vmxon)
+ if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
return;
vmx->nested.vmxon = false;
+ vmx->nested.smm.vmxon = false;
free_vpid(vmx->nested.vpid02);
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
@@ -7978,6 +8024,7 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
* "blocked by NMI" bit has to be set before next VM entry.
*/
if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+ enable_vnmi &&
(exit_qualification & INTR_INFO_UNBLOCK_NMI))
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
GUEST_INTR_STATE_NMI);
@@ -8415,9 +8462,9 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
case EXIT_REASON_RDPMC:
return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
case EXIT_REASON_RDRAND:
- return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND);
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND_EXITING);
case EXIT_REASON_RDSEED:
- return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED);
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
@@ -8822,6 +8869,25 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
return 0;
}
+ if (unlikely(!enable_vnmi &&
+ vmx->loaded_vmcs->soft_vnmi_blocked)) {
+ if (vmx_interrupt_allowed(vcpu)) {
+ vmx->loaded_vmcs->soft_vnmi_blocked = 0;
+ } else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
+ vcpu->arch.nmi_pending) {
+ /*
+ * This CPU don't support us in finding the end of an
+ * NMI-blocked window if the guest runs with IRQs
+ * disabled. So we pull the trigger after 1 s of
+ * futile waiting, but inform the user about this.
+ */
+ printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+ "state on VCPU %d after 1 s timeout\n",
+ __func__, vcpu->vcpu_id);
+ vmx->loaded_vmcs->soft_vnmi_blocked = 0;
+ }
+ }
+
if (exit_reason < kvm_vmx_max_exit_handlers
&& kvm_vmx_exit_handlers[exit_reason])
return kvm_vmx_exit_handlers[exit_reason](vcpu);
@@ -9104,33 +9170,38 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
- if (vmx->loaded_vmcs->nmi_known_unmasked)
- return;
- /*
- * Can't use vmx->exit_intr_info since we're not sure what
- * the exit reason is.
- */
- exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
- vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
- /*
- * SDM 3: 27.7.1.2 (September 2008)
- * Re-set bit "block by NMI" before VM entry if vmexit caused by
- * a guest IRET fault.
- * SDM 3: 23.2.2 (September 2008)
- * Bit 12 is undefined in any of the following cases:
- * If the VM exit sets the valid bit in the IDT-vectoring
- * information field.
- * If the VM exit is due to a double fault.
- */
- if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
- vector != DF_VECTOR && !idtv_info_valid)
- vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
- GUEST_INTR_STATE_NMI);
- else
- vmx->loaded_vmcs->nmi_known_unmasked =
- !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
- & GUEST_INTR_STATE_NMI);
+ if (enable_vnmi) {
+ if (vmx->loaded_vmcs->nmi_known_unmasked)
+ return;
+ /*
+ * Can't use vmx->exit_intr_info since we're not sure what
+ * the exit reason is.
+ */
+ exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
+ vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
+ /*
+ * SDM 3: 27.7.1.2 (September 2008)
+ * Re-set bit "block by NMI" before VM entry if vmexit caused by
+ * a guest IRET fault.
+ * SDM 3: 23.2.2 (September 2008)
+ * Bit 12 is undefined in any of the following cases:
+ * If the VM exit sets the valid bit in the IDT-vectoring
+ * information field.
+ * If the VM exit is due to a double fault.
+ */
+ if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+ vector != DF_VECTOR && !idtv_info_valid)
+ vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+ GUEST_INTR_STATE_NMI);
+ else
+ vmx->loaded_vmcs->nmi_known_unmasked =
+ !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
+ & GUEST_INTR_STATE_NMI);
+ } else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
+ vmx->loaded_vmcs->vnmi_blocked_time +=
+ ktime_to_ns(ktime_sub(ktime_get(),
+ vmx->loaded_vmcs->entry_time));
}
static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
@@ -9247,6 +9318,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long debugctlmsr, cr3, cr4;
+ /* Record the guest's net vcpu time for enforced NMI injections. */
+ if (unlikely(!enable_vnmi &&
+ vmx->loaded_vmcs->soft_vnmi_blocked))
+ vmx->loaded_vmcs->entry_time = ktime_get();
+
/* Don't enter VMX if guest state is invalid, let the exit handler
start emulation until we arrive back to a valid state */
if (vmx->emulation_required)
@@ -9475,7 +9551,6 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
vmx->loaded_vmcs = vmcs;
vmx_vcpu_put(vcpu);
vmx_vcpu_load(vcpu, cpu);
- vcpu->cpu = cpu;
put_cpu();
}
@@ -9556,11 +9631,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
cpu = get_cpu();
vmx_vcpu_load(&vmx->vcpu, cpu);
vmx->vcpu.cpu = cpu;
- err = vmx_vcpu_setup(vmx);
+ vmx_vcpu_setup(vmx);
vmx_vcpu_put(&vmx->vcpu);
put_cpu();
- if (err)
- goto free_vmcs;
if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
err = alloc_apic_access_page(kvm);
if (err)
@@ -9568,9 +9641,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
}
if (enable_ept) {
- if (!kvm->arch.ept_identity_map_addr)
- kvm->arch.ept_identity_map_addr =
- VMX_EPT_IDENTITY_PAGETABLE_ADDR;
err = init_rmode_identity_map(kvm);
if (err)
goto free_vmcs;
@@ -9732,8 +9802,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP));
cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
- /* TODO: Use X86_CR4_UMIP and X86_FEATURE_UMIP macros */
- cr4_fixed1_update(bit(11), ecx, bit(2));
+ cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP));
#undef cr4_fixed1_update
}
@@ -10807,6 +10876,11 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
return 1;
}
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) &&
+ (is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) ||
+ (vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD)))
+ return 1;
+
return 0;
}
@@ -11031,13 +11105,12 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qual;
-
- if (kvm_event_needs_reinjection(vcpu))
- return -EBUSY;
+ bool block_nested_events =
+ vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
if (vcpu->arch.exception.pending &&
nested_vmx_check_exception(vcpu, &exit_qual)) {
- if (vmx->nested.nested_run_pending)
+ if (block_nested_events)
return -EBUSY;
nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
vcpu->arch.exception.pending = false;
@@ -11046,14 +11119,14 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
vmx->nested.preemption_timer_expired) {
- if (vmx->nested.nested_run_pending)
+ if (block_nested_events)
return -EBUSY;
nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
return 0;
}
if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
- if (vmx->nested.nested_run_pending)
+ if (block_nested_events)
return -EBUSY;
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
NMI_VECTOR | INTR_TYPE_NMI_INTR |
@@ -11069,7 +11142,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
nested_exit_on_intr(vcpu)) {
- if (vmx->nested.nested_run_pending)
+ if (block_nested_events)
return -EBUSY;
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
return 0;
@@ -11256,6 +11329,24 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
kvm_clear_interrupt_queue(vcpu);
}
+static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ u32 entry_failure_code;
+
+ nested_ept_uninit_mmu_context(vcpu);
+
+ /*
+ * Only PDPTE load can fail as the value of cr3 was checked on entry and
+ * couldn't have changed.
+ */
+ if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
+ nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
+
+ if (!enable_ept)
+ vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+}
+
/*
* A part of what we need to when the nested L2 guest exits and we want to
* run its L1 parent, is to reset L1's guest state to the host state specified
@@ -11269,7 +11360,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
struct kvm_segment seg;
- u32 entry_failure_code;
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -11296,17 +11386,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
vmx_set_cr4(vcpu, vmcs12->host_cr4);
- nested_ept_uninit_mmu_context(vcpu);
-
- /*
- * Only PDPTE load can fail as the value of cr3 was checked on entry and
- * couldn't have changed.
- */
- if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
- nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
-
- if (!enable_ept)
- vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
+ load_vmcs12_mmu_host_state(vcpu, vmcs12);
if (enable_vpid) {
/*
@@ -11325,6 +11405,8 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->host_idtr_base);
vmcs_writel(GUEST_GDTR_BASE, vmcs12->host_gdtr_base);
+ vmcs_write32(GUEST_IDTR_LIMIT, 0xFFFF);
+ vmcs_write32(GUEST_GDTR_LIMIT, 0xFFFF);
/* If not VM_EXIT_CLEAR_BNDCFGS, the L2 value propagates to L1. */
if (vmcs12->vm_exit_controls & VM_EXIT_CLEAR_BNDCFGS)
@@ -11421,8 +11503,11 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
leave_guest_mode(vcpu);
if (likely(!vmx->fail)) {
- prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
- exit_qualification);
+ if (exit_reason == -1)
+ sync_vmcs12(vcpu, vmcs12);
+ else
+ prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
+ exit_qualification);
if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
vmcs12->vm_exit_msr_store_count))
@@ -11486,7 +11571,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
*/
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- if (enable_shadow_vmcs)
+ if (enable_shadow_vmcs && exit_reason != -1)
vmx->nested.sync_shadow_vmcs = true;
/* in case we halted in L2 */
@@ -11510,12 +11595,13 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
}
- trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
- vmcs12->exit_qualification,
- vmcs12->idt_vectoring_info_field,
- vmcs12->vm_exit_intr_info,
- vmcs12->vm_exit_intr_error_code,
- KVM_ISA_VMX);
+ if (exit_reason != -1)
+ trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
+ vmcs12->exit_qualification,
+ vmcs12->idt_vectoring_info_field,
+ vmcs12->vm_exit_intr_info,
+ vmcs12->vm_exit_intr_error_code,
+ KVM_ISA_VMX);
load_vmcs12_host_state(vcpu, vmcs12);
@@ -11530,6 +11616,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
* accordingly.
*/
nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+
+ load_vmcs12_mmu_host_state(vcpu, vmcs12);
+
/*
* The emulated instruction was already skipped in
* nested_vmx_run, but the updated RIP was never
@@ -11938,6 +12027,54 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu)
~FEATURE_CONTROL_LMCE;
}
+static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
+{
+ /* we need a nested vmexit to enter SMM, postpone if run is pending */
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return 0;
+ return 1;
+}
+
+static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
+ if (vmx->nested.smm.guest_mode)
+ nested_vmx_vmexit(vcpu, -1, 0, 0);
+
+ vmx->nested.smm.vmxon = vmx->nested.vmxon;
+ vmx->nested.vmxon = false;
+ return 0;
+}
+
+static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int ret;
+
+ if (vmx->nested.smm.vmxon) {
+ vmx->nested.vmxon = true;
+ vmx->nested.smm.vmxon = false;
+ }
+
+ if (vmx->nested.smm.guest_mode) {
+ vcpu->arch.hflags &= ~HF_SMM_MASK;
+ ret = enter_vmx_non_root_mode(vcpu, false);
+ vcpu->arch.hflags |= HF_SMM_MASK;
+ if (ret)
+ return ret;
+
+ vmx->nested.smm.guest_mode = false;
+ }
+ return 0;
+}
+
+static int enable_smi_window(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -12063,6 +12200,11 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
#endif
.setup_mce = vmx_setup_mce,
+
+ .smi_allowed = vmx_smi_allowed,
+ .pre_enter_smm = vmx_pre_enter_smm,
+ .pre_leave_smm = vmx_pre_leave_smm,
+ .enable_smi_window = enable_smi_window,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03869eb7fcd6..eee8e7faf1af 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -107,6 +107,9 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
static bool __read_mostly ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);
+static bool __read_mostly report_ignored_msrs = true;
+module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR);
+
unsigned int min_timer_period_us = 500;
module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
@@ -1795,10 +1798,13 @@ u64 get_kvmclock_ns(struct kvm *kvm)
/* both __this_cpu_read() and rdtsc() should be on the same cpu */
get_cpu();
- kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
- &hv_clock.tsc_shift,
- &hv_clock.tsc_to_system_mul);
- ret = __pvclock_read_cycles(&hv_clock, rdtsc());
+ if (__this_cpu_read(cpu_tsc_khz)) {
+ kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
+ &hv_clock.tsc_shift,
+ &hv_clock.tsc_to_system_mul);
+ ret = __pvclock_read_cycles(&hv_clock, rdtsc());
+ } else
+ ret = ktime_get_boot_ns() + ka->kvmclock_offset;
put_cpu();
@@ -1830,6 +1836,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
*/
BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+ if (guest_hv_clock.version & 1)
+ ++guest_hv_clock.version; /* first time write, random junk */
+
vcpu->hv_clock.version = guest_hv_clock.version + 1;
kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
&vcpu->hv_clock,
@@ -2006,10 +2015,12 @@ static void kvmclock_sync_fn(struct work_struct *work)
KVMCLOCK_SYNC_PERIOD);
}
-static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
u64 mcg_cap = vcpu->arch.mcg_cap;
unsigned bank_num = mcg_cap & 0xff;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
switch (msr) {
case MSR_IA32_MCG_STATUS:
@@ -2034,6 +2045,9 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if ((offset & 0x3) == 0 &&
data != 0 && (data | (1 << 10)) != ~(u64)0)
return -1;
+ if (!msr_info->host_initiated &&
+ (offset & 0x3) == 1 && data != 0)
+ return -1;
vcpu->arch.mce_banks[offset] = data;
break;
}
@@ -2283,7 +2297,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
- return set_msr_mce(vcpu, msr, data);
+ return set_msr_mce(vcpu, msr_info);
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
@@ -2317,7 +2331,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
/* Drop writes to this legacy MSR -- see rdmsr
* counterpart for further detail.
*/
- vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data);
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
+ msr, data);
break;
case MSR_AMD64_OSVW_ID_LENGTH:
if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
@@ -2354,8 +2370,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr, data);
return 1;
} else {
- vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
- msr, data);
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu,
+ "ignored wrmsr: 0x%x data 0x%llx\n",
+ msr, data);
break;
}
}
@@ -2573,7 +2591,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->index);
return 1;
} else {
- vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index);
+ if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
+ msr_info->index);
msr_info->data = 0;
}
break;
@@ -4034,10 +4054,16 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_SET_IDENTITY_MAP_ADDR: {
u64 ident_addr;
+ mutex_lock(&kvm->lock);
+ r = -EINVAL;
+ if (kvm->created_vcpus)
+ goto set_identity_unlock;
r = -EFAULT;
if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
- goto out;
+ goto set_identity_unlock;
r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
+set_identity_unlock:
+ mutex_unlock(&kvm->lock);
break;
}
case KVM_SET_NR_MMU_PAGES:
@@ -5275,6 +5301,11 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla
kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags);
}
+static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase)
+{
+ return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase);
+}
+
static const struct x86_emulate_ops emulate_ops = {
.read_gpr = emulator_read_gpr,
.write_gpr = emulator_write_gpr,
@@ -5316,6 +5347,7 @@ static const struct x86_emulate_ops emulate_ops = {
.set_nmi_mask = emulator_set_nmi_mask,
.get_hflags = emulator_get_hflags,
.set_hflags = emulator_set_hflags,
+ .pre_leave_smm = emulator_pre_leave_smm,
};
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
@@ -5413,7 +5445,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
- r = EMULATE_FAIL;
+ r = EMULATE_USER_EXIT;
}
kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5705,6 +5737,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
emulation_type))
return EMULATE_DONE;
+ if (ctxt->have_exception && inject_emulated_exception(vcpu))
+ return EMULATE_DONE;
if (emulation_type & EMULTYPE_SKIP)
return EMULATE_FAIL;
return handle_emulation_failure(vcpu);
@@ -6426,7 +6460,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
}
kvm_x86_ops->queue_exception(vcpu);
- } else if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+ } else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) {
vcpu->arch.smi_pending = false;
enter_smm(vcpu);
} else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
@@ -6473,9 +6507,6 @@ static void process_nmi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
-#define put_smstate(type, buf, offset, val) \
- *(type *)((buf) + (offset) - 0x7e00) = val
-
static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
{
u32 flags = 0;
@@ -6641,13 +6672,20 @@ static void enter_smm(struct kvm_vcpu *vcpu)
u32 cr0;
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
- vcpu->arch.hflags |= HF_SMM_MASK;
memset(buf, 0, 512);
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
enter_smm_save_state_64(vcpu, buf);
else
enter_smm_save_state_32(vcpu, buf);
+ /*
+ * Give pre_enter_smm() a chance to make ISA-specific changes to the
+ * vCPU state (e.g. leave guest mode) after we've saved the state into
+ * the SMM state-save area.
+ */
+ kvm_x86_ops->pre_enter_smm(vcpu, buf);
+
+ vcpu->arch.hflags |= HF_SMM_MASK;
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
if (kvm_x86_ops->get_nmi_mask(vcpu))
@@ -6876,17 +6914,23 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (inject_pending_event(vcpu, req_int_win) != 0)
req_immediate_exit = true;
else {
- /* Enable NMI/IRQ window open exits if needed.
+ /* Enable SMI/NMI/IRQ window open exits if needed.
*
- * SMIs have two cases: 1) they can be nested, and
- * then there is nothing to do here because RSM will
- * cause a vmexit anyway; 2) or the SMI can be pending
- * because inject_pending_event has completed the
- * injection of an IRQ or NMI from the previous vmexit,
- * and then we request an immediate exit to inject the SMI.
+ * SMIs have three cases:
+ * 1) They can be nested, and then there is nothing to
+ * do here because RSM will cause a vmexit anyway.
+ * 2) There is an ISA-specific reason why SMI cannot be
+ * injected, and the moment when this changes can be
+ * intercepted.
+ * 3) Or the SMI can be pending because
+ * inject_pending_event has completed the injection
+ * of an IRQ or NMI from the previous vmexit, and
+ * then we request an immediate exit to inject the
+ * SMI.
*/
if (vcpu->arch.smi_pending && !is_smm(vcpu))
- req_immediate_exit = true;
+ if (!kvm_x86_ops->enable_smi_window(vcpu))
+ req_immediate_exit = true;
if (vcpu->arch.nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
@@ -7223,12 +7267,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
struct fpu *fpu = &current->thread.fpu;
int r;
- sigset_t sigsaved;
fpu__initialize(fpu);
- if (vcpu->sigset_active)
- sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+ kvm_sigset_activate(vcpu);
if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
if (kvm_run->immediate_exit) {
@@ -7271,8 +7313,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
out:
post_kvm_run_save(vcpu);
- if (vcpu->sigset_active)
- sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+ kvm_sigset_deactivate(vcpu);
return r;
}
@@ -7798,18 +7839,40 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false;
+ if (kvm_mpx_supported()) {
+ void *mpx_state_buffer;
+
+ /*
+ * To avoid have the INIT path from kvm_apic_has_events() that be
+ * called with loaded FPU and does not let userspace fix the state.
+ */
+ kvm_put_guest_fpu(vcpu);
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
+ XFEATURE_MASK_BNDREGS);
+ if (mpx_state_buffer)
+ memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
+ mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,
+ XFEATURE_MASK_BNDCSR);
+ if (mpx_state_buffer)
+ memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
+ }
+
if (!init_event) {
kvm_pmu_reset(vcpu);
vcpu->arch.smbase = 0x30000;
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
vcpu->arch.msr_misc_features_enables = 0;
+
+ vcpu->arch.xcr0 = XFEATURE_MASK_FP;
}
memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
vcpu->arch.regs_avail = ~0;
vcpu->arch.regs_dirty = ~0;
+ vcpu->arch.ia32_xss = 0;
+
kvm_x86_ops->vcpu_reset(vcpu, init_event);
}
@@ -7974,16 +8037,11 @@ EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
struct page *page;
- struct kvm *kvm;
int r;
- BUG_ON(vcpu->kvm == NULL);
- kvm = vcpu->kvm;
-
vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
- vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.emulate_ctxt.ops = &emulate_ops;
- if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
@@ -8001,7 +8059,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
if (r < 0)
goto fail_free_pio_data;
- if (irqchip_in_kernel(kvm)) {
+ if (irqchip_in_kernel(vcpu->kvm)) {
r = kvm_create_lapic(vcpu);
if (r < 0)
goto fail_mmu_destroy;
@@ -8023,10 +8081,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
fx_init(vcpu);
- vcpu->arch.ia32_tsc_adjust_msr = 0x0;
- vcpu->arch.pv_time_enabled = false;
-
- vcpu->arch.guest_supported_xcr0 = 0;
vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
index 35625d279458..9119d8e41f1f 100644
--- a/arch/x86/lib/insn-eval.c
+++ b/arch/x86/lib/insn-eval.c
@@ -733,11 +733,11 @@ static unsigned long get_seg_limit(struct pt_regs *regs, int seg_reg_idx)
*
* Returns:
*
- * A signed 8-bit value containing the default parameters on success.
+ * An int containing ORed-in default parameters on success.
*
* -EINVAL on error.
*/
-char insn_get_code_seg_params(struct pt_regs *regs)
+int insn_get_code_seg_params(struct pt_regs *regs)
{
struct desc_struct *desc;
short sel;
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 12e377184ee4..c4d55919fac1 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -896,7 +896,7 @@ EndTable
GrpTable: Grp3_1
0: TEST Eb,Ib
-1:
+1: TEST Eb,Ib
2: NOT Eb
3: NEG Eb
4: MUL AL,Eb
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 7ba7f3d7f477..8e13b8cc6bed 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -29,8 +29,6 @@ obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o
obj-$(CONFIG_HIGHMEM) += highmem_32.o
-obj-$(CONFIG_KMEMCHECK) += kmemcheck/
-
KASAN_SANITIZE_kasan_init_$(BITS).o := n
obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 3321b446b66c..30bc4812ceb8 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -82,7 +82,7 @@ bool ex_handler_refcount(const struct exception_table_entry *fixup,
return true;
}
-EXPORT_SYMBOL_GPL(ex_handler_refcount);
+EXPORT_SYMBOL(ex_handler_refcount);
/*
* Handler for when we fail to restore a task's FPU state. We should never get
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 3109ba6c6ede..febf6980e653 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,7 +20,6 @@
#include <asm/cpufeature.h> /* boot_cpu_has, ... */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
-#include <asm/kmemcheck.h> /* kmemcheck_*(), ... */
#include <asm/fixmap.h> /* VSYSCALL_ADDR */
#include <asm/vsyscall.h> /* emulate_vsyscall */
#include <asm/vm86.h> /* struct vm86 */
@@ -702,7 +701,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
else
printk(KERN_CONT "paging request");
- printk(KERN_CONT " at %p\n", (void *) address);
+ printk(KERN_CONT " at %px\n", (void *) address);
printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip);
dump_pagetable(address);
@@ -1256,8 +1255,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
* Detect and handle instructions that would cause a page fault for
* both a tracked kernel page and a userspace page.
*/
- if (kmemcheck_active(regs))
- kmemcheck_hide(regs);
prefetchw(&mm->mmap_sem);
if (unlikely(kmmio_fault(regs, address)))
@@ -1280,9 +1277,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
if (vmalloc_fault(address) >= 0)
return;
-
- if (kmemcheck_fault(regs, address, error_code))
- return;
}
/* Can handle a stale RO->RW TLB: */
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 8ae0000cbdb3..00b296617ca4 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -158,6 +158,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
if (len > TASK_SIZE)
return -ENOMEM;
+ /* No address checking. See comment at mmap_address_hint_valid() */
if (flags & MAP_FIXED) {
if (prepare_hugepage_range(file, addr, len))
return -EINVAL;
@@ -165,12 +166,16 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
}
if (addr) {
- addr = ALIGN(addr, huge_page_size(h));
+ addr &= huge_page_mask(h);
+ if (!mmap_address_hint_valid(addr, len))
+ goto get_unmapped_area;
+
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
- (!vma || addr + len <= vm_start_gap(vma)))
+ if (!vma || addr + len <= vm_start_gap(vma))
return addr;
}
+
+get_unmapped_area:
if (mm->get_unmapped_area == arch_get_unmapped_area)
return hugetlb_get_unmapped_area_bottomup(file, addr, len,
pgoff, flags);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a22c2b95e513..6fdf91ef130a 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -92,8 +92,7 @@ __ref void *alloc_low_pages(unsigned int num)
unsigned int order;
order = get_order((unsigned long)num << PAGE_SHIFT);
- return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK |
- __GFP_ZERO, order);
+ return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
}
if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
@@ -164,12 +163,11 @@ static int page_size_mask;
static void __init probe_page_size_mask(void)
{
/*
- * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will
- * use small pages.
+ * For pagealloc debugging, identity mapping will use small pages.
* This will simplify cpa(), which otherwise needs to support splitting
* large pages into small in interrupt context, etc.
*/
- if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK))
+ if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled())
page_size_mask |= 1 << PG_LEVEL_2M;
else
direct_gbpages = 0;
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index adcea90a2046..4a837289f2ad 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -184,7 +184,7 @@ static __ref void *spp_getpage(void)
void *ptr;
if (after_bootmem)
- ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
+ ptr = (void *) get_zeroed_page(GFP_ATOMIC);
else
ptr = alloc_bootmem_pages(PAGE_SIZE);
@@ -1173,12 +1173,18 @@ void __init mem_init(void)
/* clear_bss() already clear the empty_zero_page */
- register_page_bootmem_info();
-
/* this will put all memory onto the freelists */
free_all_bootmem();
after_bootmem = 1;
+ /*
+ * Must be done after boot memory is put on freelist, because here we
+ * might set fields in deferred struct pages that have not yet been
+ * initialized, and free_all_bootmem() initializes all the reserved
+ * deferred pages for us.
+ */
+ register_page_bootmem_info();
+
/* Register memory areas for /proc/kcore */
kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR,
PAGE_SIZE, KCORE_OTHER);
@@ -1399,7 +1405,6 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start,
vmemmap_verify((pte_t *)pmd, node, addr, next);
continue;
}
- pr_warn_once("vmemmap: falling back to regular page backing\n");
if (vmemmap_populate_basepages(addr, next, node))
return -ENOMEM;
}
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 2b60dc6e64b1..99dfed6dfef8 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -4,12 +4,14 @@
#include <linux/bootmem.h>
#include <linux/kasan.h>
#include <linux/kdebug.h>
+#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/vmalloc.h>
#include <asm/e820/types.h>
+#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
@@ -18,7 +20,134 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES];
static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
-static int __init map_range(struct range *range)
+static __init void *early_alloc(size_t size, int nid)
+{
+ return memblock_virt_alloc_try_nid_nopanic(size, size,
+ __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid);
+}
+
+static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr,
+ unsigned long end, int nid)
+{
+ pte_t *pte;
+
+ if (pmd_none(*pmd)) {
+ void *p;
+
+ if (boot_cpu_has(X86_FEATURE_PSE) &&
+ ((end - addr) == PMD_SIZE) &&
+ IS_ALIGNED(addr, PMD_SIZE)) {
+ p = early_alloc(PMD_SIZE, nid);
+ if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL))
+ return;
+ else if (p)
+ memblock_free(__pa(p), PMD_SIZE);
+ }
+
+ p = early_alloc(PAGE_SIZE, nid);
+ pmd_populate_kernel(&init_mm, pmd, p);
+ }
+
+ pte = pte_offset_kernel(pmd, addr);
+ do {
+ pte_t entry;
+ void *p;
+
+ if (!pte_none(*pte))
+ continue;
+
+ p = early_alloc(PAGE_SIZE, nid);
+ entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL);
+ set_pte_at(&init_mm, addr, pte, entry);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+}
+
+static void __init kasan_populate_pud(pud_t *pud, unsigned long addr,
+ unsigned long end, int nid)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ if (pud_none(*pud)) {
+ void *p;
+
+ if (boot_cpu_has(X86_FEATURE_GBPAGES) &&
+ ((end - addr) == PUD_SIZE) &&
+ IS_ALIGNED(addr, PUD_SIZE)) {
+ p = early_alloc(PUD_SIZE, nid);
+ if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL))
+ return;
+ else if (p)
+ memblock_free(__pa(p), PUD_SIZE);
+ }
+
+ p = early_alloc(PAGE_SIZE, nid);
+ pud_populate(&init_mm, pud, p);
+ }
+
+ pmd = pmd_offset(pud, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ if (!pmd_large(*pmd))
+ kasan_populate_pmd(pmd, addr, next, nid);
+ } while (pmd++, addr = next, addr != end);
+}
+
+static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr,
+ unsigned long end, int nid)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ if (p4d_none(*p4d)) {
+ void *p = early_alloc(PAGE_SIZE, nid);
+
+ p4d_populate(&init_mm, p4d, p);
+ }
+
+ pud = pud_offset(p4d, addr);
+ do {
+ next = pud_addr_end(addr, end);
+ if (!pud_large(*pud))
+ kasan_populate_pud(pud, addr, next, nid);
+ } while (pud++, addr = next, addr != end);
+}
+
+static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr,
+ unsigned long end, int nid)
+{
+ void *p;
+ p4d_t *p4d;
+ unsigned long next;
+
+ if (pgd_none(*pgd)) {
+ p = early_alloc(PAGE_SIZE, nid);
+ pgd_populate(&init_mm, pgd, p);
+ }
+
+ p4d = p4d_offset(pgd, addr);
+ do {
+ next = p4d_addr_end(addr, end);
+ kasan_populate_p4d(p4d, addr, next, nid);
+ } while (p4d++, addr = next, addr != end);
+}
+
+static void __init kasan_populate_shadow(unsigned long addr, unsigned long end,
+ int nid)
+{
+ pgd_t *pgd;
+ unsigned long next;
+
+ addr = addr & PAGE_MASK;
+ end = round_up(end, PAGE_SIZE);
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ kasan_populate_pgd(pgd, addr, next, nid);
+ } while (pgd++, addr = next, addr != end);
+}
+
+static void __init map_range(struct range *range)
{
unsigned long start;
unsigned long end;
@@ -26,7 +155,7 @@ static int __init map_range(struct range *range)
start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start));
end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end));
- return vmemmap_populate(start, end, NUMA_NO_NODE);
+ kasan_populate_shadow(start, end, early_pfn_to_nid(range->start));
}
static void __init clear_pgds(unsigned long start,
@@ -189,16 +318,16 @@ void __init kasan_init(void)
if (pfn_mapped[i].end == 0)
break;
- if (map_range(&pfn_mapped[i]))
- panic("kasan: unable to allocate shadow!");
+ map_range(&pfn_mapped[i]);
}
+
kasan_populate_zero_shadow(
kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
kasan_mem_to_shadow((void *)__START_KERNEL_map));
- vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext),
- (unsigned long)kasan_mem_to_shadow(_end),
- NUMA_NO_NODE);
+ kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
+ (unsigned long)kasan_mem_to_shadow(_end),
+ early_pfn_to_nid(__pa(_stext)));
kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
(void *)KASAN_SHADOW_END);
diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile
deleted file mode 100644
index 520b3bce4095..000000000000
--- a/arch/x86/mm/kmemcheck/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 872ec4159a68..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -1,228 +1 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/interrupt.h>
-#include <linux/kdebug.h>
-#include <linux/kmemcheck.h>
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/ptrace.h>
-#include <linux/stacktrace.h>
-#include <linux/string.h>
-
-#include "error.h"
-#include "shadow.h"
-
-enum kmemcheck_error_type {
- KMEMCHECK_ERROR_INVALID_ACCESS,
- KMEMCHECK_ERROR_BUG,
-};
-
-#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT)
-
-struct kmemcheck_error {
- enum kmemcheck_error_type type;
-
- union {
- /* KMEMCHECK_ERROR_INVALID_ACCESS */
- struct {
- /* Kind of access that caused the error */
- enum kmemcheck_shadow state;
- /* Address and size of the erroneous read */
- unsigned long address;
- unsigned int size;
- };
- };
-
- struct pt_regs regs;
- struct stack_trace trace;
- unsigned long trace_entries[32];
-
- /* We compress it to a char. */
- unsigned char shadow_copy[SHADOW_COPY_SIZE];
- unsigned char memory_copy[SHADOW_COPY_SIZE];
-};
-
-/*
- * Create a ring queue of errors to output. We can't call printk() directly
- * from the kmemcheck traps, since this may call the console drivers and
- * result in a recursive fault.
- */
-static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE];
-static unsigned int error_count;
-static unsigned int error_rd;
-static unsigned int error_wr;
-static unsigned int error_missed_count;
-
-static struct kmemcheck_error *error_next_wr(void)
-{
- struct kmemcheck_error *e;
-
- if (error_count == ARRAY_SIZE(error_fifo)) {
- ++error_missed_count;
- return NULL;
- }
-
- e = &error_fifo[error_wr];
- if (++error_wr == ARRAY_SIZE(error_fifo))
- error_wr = 0;
- ++error_count;
- return e;
-}
-
-static struct kmemcheck_error *error_next_rd(void)
-{
- struct kmemcheck_error *e;
-
- if (error_count == 0)
- return NULL;
-
- e = &error_fifo[error_rd];
- if (++error_rd == ARRAY_SIZE(error_fifo))
- error_rd = 0;
- --error_count;
- return e;
-}
-
-void kmemcheck_error_recall(void)
-{
- static const char *desc[] = {
- [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated",
- [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized",
- [KMEMCHECK_SHADOW_INITIALIZED] = "initialized",
- [KMEMCHECK_SHADOW_FREED] = "freed",
- };
-
- static const char short_desc[] = {
- [KMEMCHECK_SHADOW_UNALLOCATED] = 'a',
- [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u',
- [KMEMCHECK_SHADOW_INITIALIZED] = 'i',
- [KMEMCHECK_SHADOW_FREED] = 'f',
- };
-
- struct kmemcheck_error *e;
- unsigned int i;
-
- e = error_next_rd();
- if (!e)
- return;
-
- switch (e->type) {
- case KMEMCHECK_ERROR_INVALID_ACCESS:
- printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n",
- 8 * e->size, e->state < ARRAY_SIZE(desc) ?
- desc[e->state] : "(invalid shadow state)",
- (void *) e->address);
-
- printk(KERN_WARNING);
- for (i = 0; i < SHADOW_COPY_SIZE; ++i)
- printk(KERN_CONT "%02x", e->memory_copy[i]);
- printk(KERN_CONT "\n");
-
- printk(KERN_WARNING);
- for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
- if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
- printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]);
- else
- printk(KERN_CONT " ?");
- }
- printk(KERN_CONT "\n");
- printk(KERN_WARNING "%*c\n", 2 + 2
- * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
- break;
- case KMEMCHECK_ERROR_BUG:
- printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n");
- break;
- }
-
- __show_regs(&e->regs, 1);
- print_stack_trace(&e->trace, 0);
-}
-
-static void do_wakeup(unsigned long data)
-{
- while (error_count > 0)
- kmemcheck_error_recall();
-
- if (error_missed_count > 0) {
- printk(KERN_WARNING "kmemcheck: Lost %d error reports because "
- "the queue was too small\n", error_missed_count);
- error_missed_count = 0;
- }
-}
-
-static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0);
-
-/*
- * Save the context of an error report.
- */
-void kmemcheck_error_save(enum kmemcheck_shadow state,
- unsigned long address, unsigned int size, struct pt_regs *regs)
-{
- static unsigned long prev_ip;
-
- struct kmemcheck_error *e;
- void *shadow_copy;
- void *memory_copy;
-
- /* Don't report several adjacent errors from the same EIP. */
- if (regs->ip == prev_ip)
- return;
- prev_ip = regs->ip;
-
- e = error_next_wr();
- if (!e)
- return;
-
- e->type = KMEMCHECK_ERROR_INVALID_ACCESS;
-
- e->state = state;
- e->address = address;
- e->size = size;
-
- /* Save regs */
- memcpy(&e->regs, regs, sizeof(*regs));
-
- /* Save stack trace */
- e->trace.nr_entries = 0;
- e->trace.entries = e->trace_entries;
- e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
- e->trace.skip = 0;
- save_stack_trace_regs(regs, &e->trace);
-
- /* Round address down to nearest 16 bytes */
- shadow_copy = kmemcheck_shadow_lookup(address
- & ~(SHADOW_COPY_SIZE - 1));
- BUG_ON(!shadow_copy);
-
- memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE);
-
- kmemcheck_show_addr(address);
- memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1));
- memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE);
- kmemcheck_hide_addr(address);
-
- tasklet_hi_schedule_first(&kmemcheck_tasklet);
-}
-
-/*
- * Save the context of a kmemcheck bug.
- */
-void kmemcheck_error_save_bug(struct pt_regs *regs)
-{
- struct kmemcheck_error *e;
-
- e = error_next_wr();
- if (!e)
- return;
-
- e->type = KMEMCHECK_ERROR_BUG;
-
- memcpy(&e->regs, regs, sizeof(*regs));
-
- e->trace.nr_entries = 0;
- e->trace.entries = e->trace_entries;
- e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
- e->trace.skip = 1;
- save_stack_trace(&e->trace);
-
- tasklet_hi_schedule_first(&kmemcheck_tasklet);
-}
diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h
index 39f80d7a874d..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/error.h
+++ b/arch/x86/mm/kmemcheck/error.h
@@ -1,16 +1 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H
-#define ARCH__X86__MM__KMEMCHECK__ERROR_H
-
-#include <linux/ptrace.h>
-
-#include "shadow.h"
-
-void kmemcheck_error_save(enum kmemcheck_shadow state,
- unsigned long address, unsigned int size, struct pt_regs *regs);
-
-void kmemcheck_error_save_bug(struct pt_regs *regs);
-
-void kmemcheck_error_recall(void);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
deleted file mode 100644
index 4515bae36bbe..000000000000
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ /dev/null
@@ -1,658 +0,0 @@
-/**
- * kmemcheck - a heavyweight memory checker for the linux kernel
- * Copyright (C) 2007, 2008 Vegard Nossum <vegardno@ifi.uio.no>
- * (With a lot of help from Ingo Molnar and Pekka Enberg.)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License (version 2) as
- * published by the Free Software Foundation.
- */
-
-#include <linux/init.h>
-#include <linux/interrupt.h>
-#include <linux/kallsyms.h>
-#include <linux/kernel.h>
-#include <linux/kmemcheck.h>
-#include <linux/mm.h>
-#include <linux/page-flags.h>
-#include <linux/percpu.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/types.h>
-
-#include <asm/cacheflush.h>
-#include <asm/kmemcheck.h>
-#include <asm/pgtable.h>
-#include <asm/tlbflush.h>
-
-#include "error.h"
-#include "opcode.h"
-#include "pte.h"
-#include "selftest.h"
-#include "shadow.h"
-
-
-#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT
-# define KMEMCHECK_ENABLED 0
-#endif
-
-#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT
-# define KMEMCHECK_ENABLED 1
-#endif
-
-#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT
-# define KMEMCHECK_ENABLED 2
-#endif
-
-int kmemcheck_enabled = KMEMCHECK_ENABLED;
-
-int __init kmemcheck_init(void)
-{
-#ifdef CONFIG_SMP
- /*
- * Limit SMP to use a single CPU. We rely on the fact that this code
- * runs before SMP is set up.
- */
- if (setup_max_cpus > 1) {
- printk(KERN_INFO
- "kmemcheck: Limiting number of CPUs to 1.\n");
- setup_max_cpus = 1;
- }
-#endif
-
- if (!kmemcheck_selftest()) {
- printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n");
- kmemcheck_enabled = 0;
- return -EINVAL;
- }
-
- printk(KERN_INFO "kmemcheck: Initialized\n");
- return 0;
-}
-
-early_initcall(kmemcheck_init);
-
-/*
- * We need to parse the kmemcheck= option before any memory is allocated.
- */
-static int __init param_kmemcheck(char *str)
-{
- int val;
- int ret;
-
- if (!str)
- return -EINVAL;
-
- ret = kstrtoint(str, 0, &val);
- if (ret)
- return ret;
- kmemcheck_enabled = val;
- return 0;
-}
-
-early_param("kmemcheck", param_kmemcheck);
-
-int kmemcheck_show_addr(unsigned long address)
-{
- pte_t *pte;
-
- pte = kmemcheck_pte_lookup(address);
- if (!pte)
- return 0;
-
- set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
- __flush_tlb_one(address);
- return 1;
-}
-
-int kmemcheck_hide_addr(unsigned long address)
-{
- pte_t *pte;
-
- pte = kmemcheck_pte_lookup(address);
- if (!pte)
- return 0;
-
- set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
- __flush_tlb_one(address);
- return 1;
-}
-
-struct kmemcheck_context {
- bool busy;
- int balance;
-
- /*
- * There can be at most two memory operands to an instruction, but
- * each address can cross a page boundary -- so we may need up to
- * four addresses that must be hidden/revealed for each fault.
- */
- unsigned long addr[4];
- unsigned long n_addrs;
- unsigned long flags;
-
- /* Data size of the instruction that caused a fault. */
- unsigned int size;
-};
-
-static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context);
-
-bool kmemcheck_active(struct pt_regs *regs)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
- return data->balance > 0;
-}
-
-/* Save an address that needs to be shown/hidden */
-static void kmemcheck_save_addr(unsigned long addr)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
- BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr));
- data->addr[data->n_addrs++] = addr;
-}
-
-static unsigned int kmemcheck_show_all(void)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
- unsigned int i;
- unsigned int n;
-
- n = 0;
- for (i = 0; i < data->n_addrs; ++i)
- n += kmemcheck_show_addr(data->addr[i]);
-
- return n;
-}
-
-static unsigned int kmemcheck_hide_all(void)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
- unsigned int i;
- unsigned int n;
-
- n = 0;
- for (i = 0; i < data->n_addrs; ++i)
- n += kmemcheck_hide_addr(data->addr[i]);
-
- return n;
-}
-
-/*
- * Called from the #PF handler.
- */
-void kmemcheck_show(struct pt_regs *regs)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
- BUG_ON(!irqs_disabled());
-
- if (unlikely(data->balance != 0)) {
- kmemcheck_show_all();
- kmemcheck_error_save_bug(regs);
- data->balance = 0;
- return;
- }
-
- /*
- * None of the addresses actually belonged to kmemcheck. Note that
- * this is not an error.
- */
- if (kmemcheck_show_all() == 0)
- return;
-
- ++data->balance;
-
- /*
- * The IF needs to be cleared as well, so that the faulting
- * instruction can run "uninterrupted". Otherwise, we might take
- * an interrupt and start executing that before we've had a chance
- * to hide the page again.
- *
- * NOTE: In the rare case of multiple faults, we must not override
- * the original flags:
- */
- if (!(regs->flags & X86_EFLAGS_TF))
- data->flags = regs->flags;
-
- regs->flags |= X86_EFLAGS_TF;
- regs->flags &= ~X86_EFLAGS_IF;
-}
-
-/*
- * Called from the #DB handler.
- */
-void kmemcheck_hide(struct pt_regs *regs)
-{
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
- int n;
-
- BUG_ON(!irqs_disabled());
-
- if (unlikely(data->balance != 1)) {
- kmemcheck_show_all();
- kmemcheck_error_save_bug(regs);
- data->n_addrs = 0;
- data->balance = 0;
-
- if (!(data->flags & X86_EFLAGS_TF))
- regs->flags &= ~X86_EFLAGS_TF;
- if (data->flags & X86_EFLAGS_IF)
- regs->flags |= X86_EFLAGS_IF;
- return;
- }
-
- if (kmemcheck_enabled)
- n = kmemcheck_hide_all();
- else
- n = kmemcheck_show_all();
-
- if (n == 0)
- return;
-
- --data->balance;
-
- data->n_addrs = 0;
-
- if (!(data->flags & X86_EFLAGS_TF))
- regs->flags &= ~X86_EFLAGS_TF;
- if (data->flags & X86_EFLAGS_IF)
- regs->flags |= X86_EFLAGS_IF;
-}
-
-void kmemcheck_show_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i) {
- unsigned long address;
- pte_t *pte;
- unsigned int level;
-
- address = (unsigned long) page_address(&p[i]);
- pte = lookup_address(address, &level);
- BUG_ON(!pte);
- BUG_ON(level != PG_LEVEL_4K);
-
- set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT));
- set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN));
- __flush_tlb_one(address);
- }
-}
-
-bool kmemcheck_page_is_tracked(struct page *p)
-{
- /* This will also check the "hidden" flag of the PTE. */
- return kmemcheck_pte_lookup((unsigned long) page_address(p));
-}
-
-void kmemcheck_hide_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i) {
- unsigned long address;
- pte_t *pte;
- unsigned int level;
-
- address = (unsigned long) page_address(&p[i]);
- pte = lookup_address(address, &level);
- BUG_ON(!pte);
- BUG_ON(level != PG_LEVEL_4K);
-
- set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT));
- set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN));
- __flush_tlb_one(address);
- }
-}
-
-/* Access may NOT cross page boundary */
-static void kmemcheck_read_strict(struct pt_regs *regs,
- unsigned long addr, unsigned int size)
-{
- void *shadow;
- enum kmemcheck_shadow status;
-
- shadow = kmemcheck_shadow_lookup(addr);
- if (!shadow)
- return;
-
- kmemcheck_save_addr(addr);
- status = kmemcheck_shadow_test(shadow, size);
- if (status == KMEMCHECK_SHADOW_INITIALIZED)
- return;
-
- if (kmemcheck_enabled)
- kmemcheck_error_save(status, addr, size, regs);
-
- if (kmemcheck_enabled == 2)
- kmemcheck_enabled = 0;
-
- /* Don't warn about it again. */
- kmemcheck_shadow_set(shadow, size);
-}
-
-bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
-{
- enum kmemcheck_shadow status;
- void *shadow;
-
- shadow = kmemcheck_shadow_lookup(addr);
- if (!shadow)
- return true;
-
- status = kmemcheck_shadow_test_all(shadow, size);
-
- return status == KMEMCHECK_SHADOW_INITIALIZED;
-}
-
-/* Access may cross page boundary */
-static void kmemcheck_read(struct pt_regs *regs,
- unsigned long addr, unsigned int size)
-{
- unsigned long page = addr & PAGE_MASK;
- unsigned long next_addr = addr + size - 1;
- unsigned long next_page = next_addr & PAGE_MASK;
-
- if (likely(page == next_page)) {
- kmemcheck_read_strict(regs, addr, size);
- return;
- }
-
- /*
- * What we do is basically to split the access across the
- * two pages and handle each part separately. Yes, this means
- * that we may now see reads that are 3 + 5 bytes, for
- * example (and if both are uninitialized, there will be two
- * reports), but it makes the code a lot simpler.
- */
- kmemcheck_read_strict(regs, addr, next_page - addr);
- kmemcheck_read_strict(regs, next_page, next_addr - next_page);
-}
-
-static void kmemcheck_write_strict(struct pt_regs *regs,
- unsigned long addr, unsigned int size)
-{
- void *shadow;
-
- shadow = kmemcheck_shadow_lookup(addr);
- if (!shadow)
- return;
-
- kmemcheck_save_addr(addr);
- kmemcheck_shadow_set(shadow, size);
-}
-
-static void kmemcheck_write(struct pt_regs *regs,
- unsigned long addr, unsigned int size)
-{
- unsigned long page = addr & PAGE_MASK;
- unsigned long next_addr = addr + size - 1;
- unsigned long next_page = next_addr & PAGE_MASK;
-
- if (likely(page == next_page)) {
- kmemcheck_write_strict(regs, addr, size);
- return;
- }
-
- /* See comment in kmemcheck_read(). */
- kmemcheck_write_strict(regs, addr, next_page - addr);
- kmemcheck_write_strict(regs, next_page, next_addr - next_page);
-}
-
-/*
- * Copying is hard. We have two addresses, each of which may be split across
- * a page (and each page will have different shadow addresses).
- */
-static void kmemcheck_copy(struct pt_regs *regs,
- unsigned long src_addr, unsigned long dst_addr, unsigned int size)
-{
- uint8_t shadow[8];
- enum kmemcheck_shadow status;
-
- unsigned long page;
- unsigned long next_addr;
- unsigned long next_page;
-
- uint8_t *x;
- unsigned int i;
- unsigned int n;
-
- BUG_ON(size > sizeof(shadow));
-
- page = src_addr & PAGE_MASK;
- next_addr = src_addr + size - 1;
- next_page = next_addr & PAGE_MASK;
-
- if (likely(page == next_page)) {
- /* Same page */
- x = kmemcheck_shadow_lookup(src_addr);
- if (x) {
- kmemcheck_save_addr(src_addr);
- for (i = 0; i < size; ++i)
- shadow[i] = x[i];
- } else {
- for (i = 0; i < size; ++i)
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- } else {
- n = next_page - src_addr;
- BUG_ON(n > sizeof(shadow));
-
- /* First page */
- x = kmemcheck_shadow_lookup(src_addr);
- if (x) {
- kmemcheck_save_addr(src_addr);
- for (i = 0; i < n; ++i)
- shadow[i] = x[i];
- } else {
- /* Not tracked */
- for (i = 0; i < n; ++i)
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
-
- /* Second page */
- x = kmemcheck_shadow_lookup(next_page);
- if (x) {
- kmemcheck_save_addr(next_page);
- for (i = n; i < size; ++i)
- shadow[i] = x[i - n];
- } else {
- /* Not tracked */
- for (i = n; i < size; ++i)
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- }
-
- page = dst_addr & PAGE_MASK;
- next_addr = dst_addr + size - 1;
- next_page = next_addr & PAGE_MASK;
-
- if (likely(page == next_page)) {
- /* Same page */
- x = kmemcheck_shadow_lookup(dst_addr);
- if (x) {
- kmemcheck_save_addr(dst_addr);
- for (i = 0; i < size; ++i) {
- x[i] = shadow[i];
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- }
- } else {
- n = next_page - dst_addr;
- BUG_ON(n > sizeof(shadow));
-
- /* First page */
- x = kmemcheck_shadow_lookup(dst_addr);
- if (x) {
- kmemcheck_save_addr(dst_addr);
- for (i = 0; i < n; ++i) {
- x[i] = shadow[i];
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- }
-
- /* Second page */
- x = kmemcheck_shadow_lookup(next_page);
- if (x) {
- kmemcheck_save_addr(next_page);
- for (i = n; i < size; ++i) {
- x[i - n] = shadow[i];
- shadow[i] = KMEMCHECK_SHADOW_INITIALIZED;
- }
- }
- }
-
- status = kmemcheck_shadow_test(shadow, size);
- if (status == KMEMCHECK_SHADOW_INITIALIZED)
- return;
-
- if (kmemcheck_enabled)
- kmemcheck_error_save(status, src_addr, size, regs);
-
- if (kmemcheck_enabled == 2)
- kmemcheck_enabled = 0;
-}
-
-enum kmemcheck_method {
- KMEMCHECK_READ,
- KMEMCHECK_WRITE,
-};
-
-static void kmemcheck_access(struct pt_regs *regs,
- unsigned long fallback_address, enum kmemcheck_method fallback_method)
-{
- const uint8_t *insn;
- const uint8_t *insn_primary;
- unsigned int size;
-
- struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context);
-
- /* Recursive fault -- ouch. */
- if (data->busy) {
- kmemcheck_show_addr(fallback_address);
- kmemcheck_error_save_bug(regs);
- return;
- }
-
- data->busy = true;
-
- insn = (const uint8_t *) regs->ip;
- insn_primary = kmemcheck_opcode_get_primary(insn);
-
- kmemcheck_opcode_decode(insn, &size);
-
- switch (insn_primary[0]) {
-#ifdef CONFIG_KMEMCHECK_BITOPS_OK
- /* AND, OR, XOR */
- /*
- * Unfortunately, these instructions have to be excluded from
- * our regular checking since they access only some (and not
- * all) bits. This clears out "bogus" bitfield-access warnings.
- */
- case 0x80:
- case 0x81:
- case 0x82:
- case 0x83:
- switch ((insn_primary[1] >> 3) & 7) {
- /* OR */
- case 1:
- /* AND */
- case 4:
- /* XOR */
- case 6:
- kmemcheck_write(regs, fallback_address, size);
- goto out;
-
- /* ADD */
- case 0:
- /* ADC */
- case 2:
- /* SBB */
- case 3:
- /* SUB */
- case 5:
- /* CMP */
- case 7:
- break;
- }
- break;
-#endif
-
- /* MOVS, MOVSB, MOVSW, MOVSD */
- case 0xa4:
- case 0xa5:
- /*
- * These instructions are special because they take two
- * addresses, but we only get one page fault.
- */
- kmemcheck_copy(regs, regs->si, regs->di, size);
- goto out;
-
- /* CMPS, CMPSB, CMPSW, CMPSD */
- case 0xa6:
- case 0xa7:
- kmemcheck_read(regs, regs->si, size);
- kmemcheck_read(regs, regs->di, size);
- goto out;
- }
-
- /*
- * If the opcode isn't special in any way, we use the data from the
- * page fault handler to determine the address and type of memory
- * access.
- */
- switch (fallback_method) {
- case KMEMCHECK_READ:
- kmemcheck_read(regs, fallback_address, size);
- goto out;
- case KMEMCHECK_WRITE:
- kmemcheck_write(regs, fallback_address, size);
- goto out;
- }
-
-out:
- data->busy = false;
-}
-
-bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
- unsigned long error_code)
-{
- pte_t *pte;
-
- /*
- * XXX: Is it safe to assume that memory accesses from virtual 86
- * mode or non-kernel code segments will _never_ access kernel
- * memory (e.g. tracked pages)? For now, we need this to avoid
- * invoking kmemcheck for PnP BIOS calls.
- */
- if (regs->flags & X86_VM_MASK)
- return false;
- if (regs->cs != __KERNEL_CS)
- return false;
-
- pte = kmemcheck_pte_lookup(address);
- if (!pte)
- return false;
-
- WARN_ON_ONCE(in_nmi());
-
- if (error_code & 2)
- kmemcheck_access(regs, address, KMEMCHECK_WRITE);
- else
- kmemcheck_access(regs, address, KMEMCHECK_READ);
-
- kmemcheck_show(regs);
- return true;
-}
-
-bool kmemcheck_trap(struct pt_regs *regs)
-{
- if (!kmemcheck_active(regs))
- return false;
-
- /* We're done. */
- kmemcheck_hide(regs);
- return true;
-}
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index df8109ddf7fe..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -1,107 +1 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/types.h>
-
-#include "opcode.h"
-
-static bool opcode_is_prefix(uint8_t b)
-{
- return
- /* Group 1 */
- b == 0xf0 || b == 0xf2 || b == 0xf3
- /* Group 2 */
- || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
- || b == 0x64 || b == 0x65
- /* Group 3 */
- || b == 0x66
- /* Group 4 */
- || b == 0x67;
-}
-
-#ifdef CONFIG_X86_64
-static bool opcode_is_rex_prefix(uint8_t b)
-{
- return (b & 0xf0) == 0x40;
-}
-#else
-static bool opcode_is_rex_prefix(uint8_t b)
-{
- return false;
-}
-#endif
-
-#define REX_W (1 << 3)
-
-/*
- * This is a VERY crude opcode decoder. We only need to find the size of the
- * load/store that caused our #PF and this should work for all the opcodes
- * that we care about. Moreover, the ones who invented this instruction set
- * should be shot.
- */
-void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size)
-{
- /* Default operand size */
- int operand_size_override = 4;
-
- /* prefixes */
- for (; opcode_is_prefix(*op); ++op) {
- if (*op == 0x66)
- operand_size_override = 2;
- }
-
- /* REX prefix */
- if (opcode_is_rex_prefix(*op)) {
- uint8_t rex = *op;
-
- ++op;
- if (rex & REX_W) {
- switch (*op) {
- case 0x63:
- *size = 4;
- return;
- case 0x0f:
- ++op;
-
- switch (*op) {
- case 0xb6:
- case 0xbe:
- *size = 1;
- return;
- case 0xb7:
- case 0xbf:
- *size = 2;
- return;
- }
-
- break;
- }
-
- *size = 8;
- return;
- }
- }
-
- /* escape opcode */
- if (*op == 0x0f) {
- ++op;
-
- /*
- * This is move with zero-extend and sign-extend, respectively;
- * we don't have to think about 0xb6/0xbe, because this is
- * already handled in the conditional below.
- */
- if (*op == 0xb7 || *op == 0xbf)
- operand_size_override = 2;
- }
-
- *size = (*op & 1) ? operand_size_override : 1;
-}
-
-const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op)
-{
- /* skip prefixes */
- while (opcode_is_prefix(*op))
- ++op;
- if (opcode_is_rex_prefix(*op))
- ++op;
- return op;
-}
diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h
index 51a1ce94c24a..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/opcode.h
+++ b/arch/x86/mm/kmemcheck/opcode.h
@@ -1,10 +1 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H
-#define ARCH__X86__MM__KMEMCHECK__OPCODE_H
-
-#include <linux/types.h>
-
-void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size);
-const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c
index 8a03be90272a..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/pte.c
+++ b/arch/x86/mm/kmemcheck/pte.c
@@ -1,23 +1 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
-
-#include <asm/pgtable.h>
-
-#include "pte.h"
-
-pte_t *kmemcheck_pte_lookup(unsigned long address)
-{
- pte_t *pte;
- unsigned int level;
-
- pte = lookup_address(address, &level);
- if (!pte)
- return NULL;
- if (level != PG_LEVEL_4K)
- return NULL;
- if (!pte_hidden(*pte))
- return NULL;
-
- return pte;
-}
-
diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h
index b595612382c2..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/pte.h
+++ b/arch/x86/mm/kmemcheck/pte.h
@@ -1,11 +1 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H
-#define ARCH__X86__MM__KMEMCHECK__PTE_H
-
-#include <linux/mm.h>
-
-#include <asm/pgtable.h>
-
-pte_t *kmemcheck_pte_lookup(unsigned long address);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c
index 7ce0be1f99eb..cec594032515 100644
--- a/arch/x86/mm/kmemcheck/selftest.c
+++ b/arch/x86/mm/kmemcheck/selftest.c
@@ -1,71 +1 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/bug.h>
-#include <linux/kernel.h>
-
-#include "opcode.h"
-#include "selftest.h"
-
-struct selftest_opcode {
- unsigned int expected_size;
- const uint8_t *insn;
- const char *desc;
-};
-
-static const struct selftest_opcode selftest_opcodes[] = {
- /* REP MOVS */
- {1, "\xf3\xa4", "rep movsb <mem8>, <mem8>"},
- {4, "\xf3\xa5", "rep movsl <mem32>, <mem32>"},
-
- /* MOVZX / MOVZXD */
- {1, "\x66\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg16>"},
- {1, "\x0f\xb6\x51\xf8", "movzwq <mem8>, <reg32>"},
-
- /* MOVSX / MOVSXD */
- {1, "\x66\x0f\xbe\x51\xf8", "movswq <mem8>, <reg16>"},
- {1, "\x0f\xbe\x51\xf8", "movswq <mem8>, <reg32>"},
-
-#ifdef CONFIG_X86_64
- /* MOVZX / MOVZXD */
- {1, "\x49\x0f\xb6\x51\xf8", "movzbq <mem8>, <reg64>"},
- {2, "\x49\x0f\xb7\x51\xf8", "movzbq <mem16>, <reg64>"},
-
- /* MOVSX / MOVSXD */
- {1, "\x49\x0f\xbe\x51\xf8", "movsbq <mem8>, <reg64>"},
- {2, "\x49\x0f\xbf\x51\xf8", "movsbq <mem16>, <reg64>"},
- {4, "\x49\x63\x51\xf8", "movslq <mem32>, <reg64>"},
-#endif
-};
-
-static bool selftest_opcode_one(const struct selftest_opcode *op)
-{
- unsigned size;
-
- kmemcheck_opcode_decode(op->insn, &size);
-
- if (size == op->expected_size)
- return true;
-
- printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n",
- op->desc, op->expected_size, size);
- return false;
-}
-
-static bool selftest_opcodes_all(void)
-{
- bool pass = true;
- unsigned int i;
-
- for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i)
- pass = pass && selftest_opcode_one(&selftest_opcodes[i]);
-
- return pass;
-}
-
-bool kmemcheck_selftest(void)
-{
- bool pass = true;
-
- pass = pass && selftest_opcodes_all();
-
- return pass;
-}
diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h
index 8d759aae453d..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/selftest.h
+++ b/arch/x86/mm/kmemcheck/selftest.h
@@ -1,7 +1 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H
-#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H
-
-bool kmemcheck_selftest(void);
-
-#endif
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
deleted file mode 100644
index c2638a7d2c10..000000000000
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ /dev/null
@@ -1,173 +0,0 @@
-#include <linux/kmemcheck.h>
-#include <linux/export.h>
-#include <linux/mm.h>
-
-#include <asm/page.h>
-#include <asm/pgtable.h>
-
-#include "pte.h"
-#include "shadow.h"
-
-/*
- * Return the shadow address for the given address. Returns NULL if the
- * address is not tracked.
- *
- * We need to be extremely careful not to follow any invalid pointers,
- * because this function can be called for *any* possible address.
- */
-void *kmemcheck_shadow_lookup(unsigned long address)
-{
- pte_t *pte;
- struct page *page;
-
- if (!virt_addr_valid(address))
- return NULL;
-
- pte = kmemcheck_pte_lookup(address);
- if (!pte)
- return NULL;
-
- page = virt_to_page(address);
- if (!page->shadow)
- return NULL;
- return page->shadow + (address & (PAGE_SIZE - 1));
-}
-
-static void mark_shadow(void *address, unsigned int n,
- enum kmemcheck_shadow status)
-{
- unsigned long addr = (unsigned long) address;
- unsigned long last_addr = addr + n - 1;
- unsigned long page = addr & PAGE_MASK;
- unsigned long last_page = last_addr & PAGE_MASK;
- unsigned int first_n;
- void *shadow;
-
- /* If the memory range crosses a page boundary, stop there. */
- if (page == last_page)
- first_n = n;
- else
- first_n = page + PAGE_SIZE - addr;
-
- shadow = kmemcheck_shadow_lookup(addr);
- if (shadow)
- memset(shadow, status, first_n);
-
- addr += first_n;
- n -= first_n;
-
- /* Do full-page memset()s. */
- while (n >= PAGE_SIZE) {
- shadow = kmemcheck_shadow_lookup(addr);
- if (shadow)
- memset(shadow, status, PAGE_SIZE);
-
- addr += PAGE_SIZE;
- n -= PAGE_SIZE;
- }
-
- /* Do the remaining page, if any. */
- if (n > 0) {
- shadow = kmemcheck_shadow_lookup(addr);
- if (shadow)
- memset(shadow, status, n);
- }
-}
-
-void kmemcheck_mark_unallocated(void *address, unsigned int n)
-{
- mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED);
-}
-
-void kmemcheck_mark_uninitialized(void *address, unsigned int n)
-{
- mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED);
-}
-
-/*
- * Fill the shadow memory of the given address such that the memory at that
- * address is marked as being initialized.
- */
-void kmemcheck_mark_initialized(void *address, unsigned int n)
-{
- mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED);
-}
-EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized);
-
-void kmemcheck_mark_freed(void *address, unsigned int n)
-{
- mark_shadow(address, n, KMEMCHECK_SHADOW_FREED);
-}
-
-void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i)
- kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE);
-}
-
-void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i)
- kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE);
-}
-
-void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
-{
- unsigned int i;
-
- for (i = 0; i < n; ++i)
- kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE);
-}
-
-enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
-{
-#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
- uint8_t *x;
- unsigned int i;
-
- x = shadow;
-
- /*
- * Make sure _some_ bytes are initialized. Gcc frequently generates
- * code to access neighboring bytes.
- */
- for (i = 0; i < size; ++i) {
- if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
- return x[i];
- }
-
- return x[0];
-#else
- return kmemcheck_shadow_test_all(shadow, size);
-#endif
-}
-
-enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
-{
- uint8_t *x;
- unsigned int i;
-
- x = shadow;
-
- /* All bytes must be initialized. */
- for (i = 0; i < size; ++i) {
- if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
- return x[i];
- }
-
- return x[0];
-}
-
-void kmemcheck_shadow_set(void *shadow, unsigned int size)
-{
- uint8_t *x;
- unsigned int i;
-
- x = shadow;
- for (i = 0; i < size; ++i)
- x[i] = KMEMCHECK_SHADOW_INITIALIZED;
-}
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
index 49768dc18664..ea32a7d3cf1b 100644
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -1,19 +1 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H
-#define ARCH__X86__MM__KMEMCHECK__SHADOW_H
-
-enum kmemcheck_shadow {
- KMEMCHECK_SHADOW_UNALLOCATED,
- KMEMCHECK_SHADOW_UNINITIALIZED,
- KMEMCHECK_SHADOW_INITIALIZED,
- KMEMCHECK_SHADOW_FREED,
-};
-
-void *kmemcheck_shadow_lookup(unsigned long address);
-
-enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
-enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
- unsigned int size);
-void kmemcheck_shadow_set(void *shadow, unsigned int size);
-
-#endif
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index a99679826846..155ecbac9e28 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -33,6 +33,8 @@
#include <linux/compat.h>
#include <asm/elf.h>
+#include "physaddr.h"
+
struct va_alignment __read_mostly va_align = {
.flags = -1,
};
@@ -174,3 +176,63 @@ const char *arch_vma_name(struct vm_area_struct *vma)
return "[mpx]";
return NULL;
}
+
+/**
+ * mmap_address_hint_valid - Validate the address hint of mmap
+ * @addr: Address hint
+ * @len: Mapping length
+ *
+ * Check whether @addr and @addr + @len result in a valid mapping.
+ *
+ * On 32bit this only checks whether @addr + @len is <= TASK_SIZE.
+ *
+ * On 64bit with 5-level page tables another sanity check is required
+ * because mappings requested by mmap(@addr, 0) which cross the 47-bit
+ * virtual address boundary can cause the following theoretical issue:
+ *
+ * An application calls mmap(addr, 0), i.e. without MAP_FIXED, where @addr
+ * is below the border of the 47-bit address space and @addr + @len is
+ * above the border.
+ *
+ * With 4-level paging this request succeeds, but the resulting mapping
+ * address will always be within the 47-bit virtual address space, because
+ * the hint address does not result in a valid mapping and is
+ * ignored. Hence applications which are not prepared to handle virtual
+ * addresses above 47-bit work correctly.
+ *
+ * With 5-level paging this request would be granted and result in a
+ * mapping which crosses the border of the 47-bit virtual address
+ * space. If the application cannot handle addresses above 47-bit this
+ * will lead to misbehaviour and hard to diagnose failures.
+ *
+ * Therefore ignore address hints which would result in a mapping crossing
+ * the 47-bit virtual address boundary.
+ *
+ * Note, that in the same scenario with MAP_FIXED the behaviour is
+ * different. The request with @addr < 47-bit and @addr + @len > 47-bit
+ * fails on a 4-level paging machine but succeeds on a 5-level paging
+ * machine. It is reasonable to expect that an application does not rely on
+ * the failure of such a fixed mapping request, so the restriction is not
+ * applied.
+ */
+bool mmap_address_hint_valid(unsigned long addr, unsigned long len)
+{
+ if (TASK_SIZE - len < addr)
+ return false;
+
+ return (addr > DEFAULT_MAP_WINDOW) == (addr + len > DEFAULT_MAP_WINDOW);
+}
+
+/* Can we access it for direct reading/writing? Must be RAM: */
+int valid_phys_addr_range(phys_addr_t addr, size_t count)
+{
+ return addr + count <= __pa(high_memory);
+}
+
+/* Can we access it through mmap? Must be a valid physical address: */
+int valid_mmap_phys_addr_range(unsigned long pfn, size_t count)
+{
+ phys_addr_t addr = (phys_addr_t)pfn << PAGE_SHIFT;
+
+ return phys_addr_valid(addr + count - 1);
+}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 3fe68483463c..85cf12219dea 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -753,7 +753,7 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
if (!debug_pagealloc_enabled())
spin_unlock(&cpa_lock);
- base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0);
+ base = alloc_pages(GFP_KERNEL, 0);
if (!debug_pagealloc_enabled())
spin_lock(&cpa_lock);
if (!base)
@@ -904,7 +904,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end)
static int alloc_pte_page(pmd_t *pmd)
{
- pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
if (!pte)
return -1;
@@ -914,7 +914,7 @@ static int alloc_pte_page(pmd_t *pmd)
static int alloc_pmd_page(pud_t *pud)
{
- pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
if (!pmd)
return -1;
@@ -1120,7 +1120,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
pgd_entry = cpa->pgd + pgd_index(addr);
if (pgd_none(*pgd_entry)) {
- p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL);
if (!p4d)
return -1;
@@ -1132,7 +1132,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr)
*/
p4d = p4d_offset(pgd_entry, addr);
if (p4d_none(*p4d)) {
- pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
+ pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
if (!pud)
return -1;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 17ebc5a978cc..96d456a94b03 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -7,7 +7,7 @@
#include <asm/fixmap.h>
#include <asm/mtrr.h>
-#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO)
+#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO)
#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index ffdbc4836b4f..174c59774cc9 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -592,7 +592,7 @@ enum __force_cpu_type {
static int force_cpu_type;
-static int set_cpu_type(const char *str, struct kernel_param *kp)
+static int set_cpu_type(const char *str, const struct kernel_param *kp)
{
if (!strcmp(str, "timer")) {
force_cpu_type = timer;
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 4210da7b44de..1e996df687a3 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -636,3 +636,88 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2030, quirk_no_aersid);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2031, quirk_no_aersid);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2032, quirk_no_aersid);
DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid);
+
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+
+#define AMD_141b_MMIO_BASE(x) (0x80 + (x) * 0x8)
+#define AMD_141b_MMIO_BASE_RE_MASK BIT(0)
+#define AMD_141b_MMIO_BASE_WE_MASK BIT(1)
+#define AMD_141b_MMIO_BASE_MMIOBASE_MASK GENMASK(31,8)
+
+#define AMD_141b_MMIO_LIMIT(x) (0x84 + (x) * 0x8)
+#define AMD_141b_MMIO_LIMIT_MMIOLIMIT_MASK GENMASK(31,8)
+
+#define AMD_141b_MMIO_HIGH(x) (0x180 + (x) * 0x4)
+#define AMD_141b_MMIO_HIGH_MMIOBASE_MASK GENMASK(7,0)
+#define AMD_141b_MMIO_HIGH_MMIOLIMIT_SHIFT 16
+#define AMD_141b_MMIO_HIGH_MMIOLIMIT_MASK GENMASK(23,16)
+
+/*
+ * The PCI Firmware Spec, rev 3.2, notes that ACPI should optionally allow
+ * configuring host bridge windows using the _PRS and _SRS methods.
+ *
+ * But this is rarely implemented, so we manually enable a large 64bit BAR for
+ * PCIe device on AMD Family 15h (Models 00h-1fh, 30h-3fh, 60h-7fh) Processors
+ * here.
+ */
+static void pci_amd_enable_64bit_bar(struct pci_dev *dev)
+{
+ unsigned i;
+ u32 base, limit, high;
+ struct resource *res, *conflict;
+
+ for (i = 0; i < 8; i++) {
+ pci_read_config_dword(dev, AMD_141b_MMIO_BASE(i), &base);
+ pci_read_config_dword(dev, AMD_141b_MMIO_HIGH(i), &high);
+
+ /* Is this slot free? */
+ if (!(base & (AMD_141b_MMIO_BASE_RE_MASK |
+ AMD_141b_MMIO_BASE_WE_MASK)))
+ break;
+
+ base >>= 8;
+ base |= high << 24;
+
+ /* Abort if a slot already configures a 64bit BAR. */
+ if (base > 0x10000)
+ return;
+ }
+ if (i == 8)
+ return;
+
+ res = kzalloc(sizeof(*res), GFP_KERNEL);
+ if (!res)
+ return;
+
+ res->name = "PCI Bus 0000:00";
+ res->flags = IORESOURCE_PREFETCH | IORESOURCE_MEM |
+ IORESOURCE_MEM_64 | IORESOURCE_WINDOW;
+ res->start = 0x100000000ull;
+ res->end = 0xfd00000000ull - 1;
+
+ /* Just grab the free area behind system memory for this */
+ while ((conflict = request_resource_conflict(&iomem_resource, res)))
+ res->start = conflict->end + 1;
+
+ dev_info(&dev->dev, "adding root bus resource %pR\n", res);
+
+ base = ((res->start >> 8) & AMD_141b_MMIO_BASE_MMIOBASE_MASK) |
+ AMD_141b_MMIO_BASE_RE_MASK | AMD_141b_MMIO_BASE_WE_MASK;
+ limit = ((res->end + 1) >> 8) & AMD_141b_MMIO_LIMIT_MMIOLIMIT_MASK;
+ high = ((res->start >> 40) & AMD_141b_MMIO_HIGH_MMIOBASE_MASK) |
+ ((((res->end + 1) >> 40) << AMD_141b_MMIO_HIGH_MMIOLIMIT_SHIFT)
+ & AMD_141b_MMIO_HIGH_MMIOLIMIT_MASK);
+
+ pci_write_config_dword(dev, AMD_141b_MMIO_HIGH(i), high);
+ pci_write_config_dword(dev, AMD_141b_MMIO_LIMIT(i), limit);
+ pci_write_config_dword(dev, AMD_141b_MMIO_BASE(i), base);
+
+ pci_bus_add_resource(dev->bus, res, 0);
+}
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1401, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x141b, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1571, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x15b1, pci_amd_enable_64bit_bar);
+DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_AMD, 0x1601, pci_amd_enable_64bit_bar);
+
+#endif
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index 1012a5f0f98d..511921045312 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -280,7 +280,7 @@ static void intel_mid_pci_irq_disable(struct pci_dev *dev)
}
}
-static struct pci_ops intel_mid_pci_ops = {
+static const struct pci_ops intel_mid_pci_ops __initconst = {
.read = pci_read,
.write = pci_write,
};
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 9e4ee5b04b2d..6a151ce70e86 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -207,7 +207,7 @@ int __init efi_alloc_page_tables(void)
if (efi_enabled(EFI_OLD_MEMMAP))
return 0;
- gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO;
+ gfp_mask = GFP_KERNEL | __GFP_ZERO;
efi_pgd = (pgd_t *)__get_free_page(gfp_mask);
if (!efi_pgd)
return -ENOMEM;
diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c
index 809b6c812654..92ccc718152d 100644
--- a/arch/x86/xen/grant-table.c
+++ b/arch/x86/xen/grant-table.c
@@ -49,7 +49,7 @@
static struct gnttab_vm_area {
struct vm_struct *area;
pte_t **ptes;
-} gnttab_shared_vm_area;
+} gnttab_shared_vm_area, gnttab_status_vm_area;
int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
unsigned long max_nr_gframes,
@@ -73,16 +73,43 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes,
return 0;
}
+int arch_gnttab_map_status(uint64_t *frames, unsigned long nr_gframes,
+ unsigned long max_nr_gframes,
+ grant_status_t **__shared)
+{
+ grant_status_t *shared = *__shared;
+ unsigned long addr;
+ unsigned long i;
+
+ if (shared == NULL)
+ *__shared = shared = gnttab_status_vm_area.area->addr;
+
+ addr = (unsigned long)shared;
+
+ for (i = 0; i < nr_gframes; i++) {
+ set_pte_at(&init_mm, addr, gnttab_status_vm_area.ptes[i],
+ mfn_pte(frames[i], PAGE_KERNEL));
+ addr += PAGE_SIZE;
+ }
+
+ return 0;
+}
+
void arch_gnttab_unmap(void *shared, unsigned long nr_gframes)
{
+ pte_t **ptes;
unsigned long addr;
unsigned long i;
+ if (shared == gnttab_status_vm_area.area->addr)
+ ptes = gnttab_status_vm_area.ptes;
+ else
+ ptes = gnttab_shared_vm_area.ptes;
+
addr = (unsigned long)shared;
for (i = 0; i < nr_gframes; i++) {
- set_pte_at(&init_mm, addr, gnttab_shared_vm_area.ptes[i],
- __pte(0));
+ set_pte_at(&init_mm, addr, ptes[i], __pte(0));
addr += PAGE_SIZE;
}
}
@@ -102,12 +129,35 @@ static int arch_gnttab_valloc(struct gnttab_vm_area *area, unsigned nr_frames)
return 0;
}
-int arch_gnttab_init(unsigned long nr_shared)
+static void arch_gnttab_vfree(struct gnttab_vm_area *area)
{
+ free_vm_area(area->area);
+ kfree(area->ptes);
+}
+
+int arch_gnttab_init(unsigned long nr_shared, unsigned long nr_status)
+{
+ int ret;
+
if (!xen_pv_domain())
return 0;
- return arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
+ ret = arch_gnttab_valloc(&gnttab_shared_vm_area, nr_shared);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Always allocate the space for the status frames in case
+ * we're migrated to a host with V2 support.
+ */
+ ret = arch_gnttab_valloc(&gnttab_status_vm_area, nr_status);
+ if (ret < 0)
+ goto err;
+
+ return 0;
+err:
+ arch_gnttab_vfree(&gnttab_shared_vm_area);
+ return -ENOMEM;
}
#ifdef CONFIG_XEN_PVH
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3e15345abfe7..d33e7dbe3129 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -172,6 +172,9 @@ int xen_remap_domain_gfn_range(struct vm_area_struct *vma,
pgprot_t prot, unsigned domid,
struct page **pages)
{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return -EOPNOTSUPP;
+
return do_remap_gfn(vma, addr, &gfn, nr, NULL, prot, domid, pages);
}
EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_range);
@@ -182,6 +185,10 @@ int xen_remap_domain_gfn_array(struct vm_area_struct *vma,
int *err_ptr, pgprot_t prot,
unsigned domid, struct page **pages)
{
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return xen_xlate_remap_gfn_array(vma, addr, gfn, nr, err_ptr,
+ prot, domid, pages);
+
/* We BUG_ON because it's a programmer error to pass a NULL err_ptr,
* and the consequences later is quite hard to detect what the actual
* cause of "wrong memory was mapped in".
@@ -193,9 +200,12 @@ EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array);
/* Returns: 0 success */
int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
- int numpgs, struct page **pages)
+ int nr, struct page **pages)
{
- if (!pages || !xen_feature(XENFEAT_auto_translated_physmap))
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return xen_xlate_unmap_gfn_range(vma, nr, pages);
+
+ if (!pages)
return 0;
return -EINVAL;
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 2ccdaba31a07..fc048ec686e7 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -315,7 +315,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
static pteval_t pte_mfn_to_pfn(pteval_t val)
{
if (val & _PAGE_PRESENT) {
- unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
+ unsigned long mfn = (val & XEN_PTE_MFN_MASK) >> PAGE_SHIFT;
unsigned long pfn = mfn_to_pfn(mfn);
pteval_t flags = val & PTE_FLAGS_MASK;
@@ -1721,7 +1721,7 @@ static unsigned long __init m2p(phys_addr_t maddr)
{
phys_addr_t paddr;
- maddr &= PTE_PFN_MASK;
+ maddr &= XEN_PTE_MFN_MASK;
paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
return paddr;
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 92bf5ecb6baf..d9f96cc5d743 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -17,6 +17,8 @@
void xen_arch_pre_suspend(void)
{
+ xen_save_time_memory_area();
+
if (xen_pv_domain())
xen_pv_pre_suspend();
}
@@ -27,6 +29,8 @@ void xen_arch_post_suspend(int cancelled)
xen_pv_post_suspend(cancelled);
else
xen_hvm_post_suspend(cancelled);
+
+ xen_restore_time_memory_area();
}
static void xen_vcpu_notify_restore(void *data)
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 80c2a4bdf230..29163c43ebbd 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -75,7 +75,7 @@ static void xen_get_wallclock(struct timespec *now)
static int xen_set_wallclock(const struct timespec *now)
{
- return -1;
+ return -ENODEV;
}
static int xen_pvclock_gtod_notify(struct notifier_block *nb,
@@ -371,8 +371,95 @@ static const struct pv_time_ops xen_time_ops __initconst = {
.steal_clock = xen_steal_clock,
};
+static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
+
+void xen_save_time_memory_area(void)
+{
+ struct vcpu_register_time_memory_area t;
+ int ret;
+
+ if (!xen_clock)
+ return;
+
+ t.addr.v = NULL;
+
+ ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
+ if (ret != 0)
+ pr_notice("Cannot save secondary vcpu_time_info (err %d)",
+ ret);
+ else
+ clear_page(xen_clock);
+}
+
+void xen_restore_time_memory_area(void)
+{
+ struct vcpu_register_time_memory_area t;
+ int ret;
+
+ if (!xen_clock)
+ return;
+
+ t.addr.v = &xen_clock->pvti;
+
+ ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
+
+ /*
+ * We don't disable VCLOCK_PVCLOCK entirely if it fails to register the
+ * secondary time info with Xen or if we migrated to a host without the
+ * necessary flags. On both of these cases what happens is either
+ * process seeing a zeroed out pvti or seeing no PVCLOCK_TSC_STABLE_BIT
+ * bit set. Userspace checks the latter and if 0, it discards the data
+ * in pvti and fallbacks to a system call for a reliable timestamp.
+ */
+ if (ret != 0)
+ pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
+ ret);
+}
+
+static void xen_setup_vsyscall_time_info(void)
+{
+ struct vcpu_register_time_memory_area t;
+ struct pvclock_vsyscall_time_info *ti;
+ int ret;
+
+ ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL);
+ if (!ti)
+ return;
+
+ t.addr.v = &ti->pvti;
+
+ ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
+ if (ret) {
+ pr_notice("xen: VCLOCK_PVCLOCK not supported (err %d)\n", ret);
+ free_page((unsigned long)ti);
+ return;
+ }
+
+ /*
+ * If primary time info had this bit set, secondary should too since
+ * it's the same data on both just different memory regions. But we
+ * still check it in case hypervisor is buggy.
+ */
+ if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) {
+ t.addr.v = NULL;
+ ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area,
+ 0, &t);
+ if (!ret)
+ free_page((unsigned long)ti);
+
+ pr_notice("xen: VCLOCK_PVCLOCK not supported (tsc unstable)\n");
+ return;
+ }
+
+ xen_clock = ti;
+ pvclock_set_pvti_cpu0_va(xen_clock);
+
+ xen_clocksource.archdata.vclock_mode = VCLOCK_PVCLOCK;
+}
+
static void __init xen_time_init(void)
{
+ struct pvclock_vcpu_time_info *pvti;
int cpu = smp_processor_id();
struct timespec tp;
@@ -396,6 +483,16 @@ static void __init xen_time_init(void)
setup_force_cpu_cap(X86_FEATURE_TSC);
+ /*
+ * We check ahead on the primary time info if this
+ * bit is supported hence speeding up Xen clocksource.
+ */
+ pvti = &__this_cpu_read(xen_vcpu)->time;
+ if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) {
+ pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+ xen_setup_vsyscall_time_info();
+ }
+
xen_setup_runstate_info(cpu);
xen_setup_timer(cpu);
xen_setup_cpu_clockevents();
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f377e1820c6c..75011b80660f 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -70,6 +70,8 @@ void xen_setup_runstate_info(int cpu);
void xen_teardown_timer(int cpu);
u64 xen_clocksource_read(void);
void xen_setup_cpu_clockevents(void);
+void xen_save_time_memory_area(void);
+void xen_restore_time_memory_area(void);
void __init xen_init_time_ops(void);
void __init xen_hvm_init_time_ops(void);