summaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-02-23 03:22:53 +0100
committerLinus Torvalds <torvalds@linux-foundation.org>2017-02-23 03:22:53 +0100
commitfd7e9a88348472521d999434ee02f25735c7dadf (patch)
tree90e6249e58d90ba9d590cfed4481c29ca36a05dc /arch
parentMerge tag 'iommu-fix-v4.11-rc0' of git://git.kernel.org/pub/scm/linux/kernel/... (diff)
parentx86/kvm: Provide optimized version of vcpu_is_preempted() for x86-64 (diff)
downloadlinux-fd7e9a88348472521d999434ee02f25735c7dadf.tar.xz
linux-fd7e9a88348472521d999434ee02f25735c7dadf.zip
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull KVM updates from Paolo Bonzini: "4.11 is going to be a relatively large release for KVM, with a little over 200 commits and noteworthy changes for most architectures. ARM: - GICv3 save/restore - cache flushing fixes - working MSI injection for GICv3 ITS - physical timer emulation MIPS: - various improvements under the hood - support for SMP guests - a large rewrite of MMU emulation. KVM MIPS can now use MMU notifiers to support copy-on-write, KSM, idle page tracking, swapping, ballooning and everything else. KVM_CAP_READONLY_MEM is also supported, so that writes to some memory regions can be treated as MMIO. The new MMU also paves the way for hardware virtualization support. PPC: - support for POWER9 using the radix-tree MMU for host and guest - resizable hashed page table - bugfixes. s390: - expose more features to the guest - more SIMD extensions - instruction execution protection - ESOP2 x86: - improved hashing in the MMU - faster PageLRU tracking for Intel CPUs without EPT A/D bits - some refactoring of nested VMX entry/exit code, preparing for live migration support of nested hypervisors - expose yet another AVX512 CPUID bit - host-to-guest PTP support - refactoring of interrupt injection, with some optimizations thrown in and some duct tape removed. - remove lazy FPU handling - optimizations of user-mode exits - optimizations of vcpu_is_preempted() for KVM guests generic: - alternative signaling mechanism that doesn't pound on tsk->sighand->siglock" * tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (195 commits) x86/kvm: Provide optimized version of vcpu_is_preempted() for x86-64 x86/paravirt: Change vcp_is_preempted() arg type to long KVM: VMX: use correct vmcs_read/write for guest segment selector/base x86/kvm/vmx: Defer TR reload after VM exit x86/asm/64: Drop __cacheline_aligned from struct x86_hw_tss x86/kvm/vmx: Simplify segment_base() x86/kvm/vmx: Get rid of segment_base() on 64-bit kernels x86/kvm/vmx: Don't fetch the TSS base from the GDT x86/asm: Define the kernel TSS limit in a macro kvm: fix page struct leak in handle_vmon KVM: PPC: Book3S HV: Disable HPT resizing on POWER9 for now KVM: Return an error code only as a constant in kvm_get_dirty_log() KVM: Return an error code only as a constant in kvm_get_dirty_log_protect() KVM: Return directly after a failed copy_from_user() in kvm_vm_compat_ioctl() KVM: x86: remove code for lazy FPU handling KVM: race-free exit from KVM_RUN without POSIX signals KVM: PPC: Book3S HV: Turn "KVM guest htab" message into a debug message KVM: PPC: Book3S PR: Ratelimit copy data failure error messages KVM: Support vCPU-based gfn->hva cache KVM: use separate generations for each address space ...
Diffstat (limited to 'arch')
-rw-r--r--arch/arm/include/asm/kvm_host.h3
-rw-r--r--arch/arm/include/asm/kvm_mmu.h12
-rw-r--r--arch/arm/include/uapi/asm/kvm.h13
-rw-r--r--arch/arm/kvm/Makefile5
-rw-r--r--arch/arm/kvm/arm.c8
-rw-r--r--arch/arm/kvm/mmu.c20
-rw-r--r--arch/arm/kvm/reset.c9
-rw-r--r--arch/arm/kvm/vgic-v3-coproc.c35
-rw-r--r--arch/arm64/include/asm/kvm_host.h3
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h6
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h13
-rw-r--r--arch/arm64/kvm/Makefile4
-rw-r--r--arch/arm64/kvm/reset.c9
-rw-r--r--arch/arm64/kvm/sys_regs.c92
-rw-r--r--arch/arm64/kvm/sys_regs.h4
-rw-r--r--arch/arm64/kvm/vgic-sys-reg-v3.c346
-rw-r--r--arch/mips/include/asm/kvm_host.h183
-rw-r--r--arch/mips/include/asm/mmu_context.h9
-rw-r--r--arch/mips/include/uapi/asm/kvm.h2
-rw-r--r--arch/mips/kvm/Kconfig2
-rw-r--r--arch/mips/kvm/dyntrans.c52
-rw-r--r--arch/mips/kvm/emulate.c432
-rw-r--r--arch/mips/kvm/entry.c155
-rw-r--r--arch/mips/kvm/interrupt.c5
-rw-r--r--arch/mips/kvm/mips.c503
-rw-r--r--arch/mips/kvm/mmu.c1329
-rw-r--r--arch/mips/kvm/tlb.c291
-rw-r--r--arch/mips/kvm/trap_emul.c734
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h16
-rw-r--r--arch/powerpc/include/asm/kvm_host.h21
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h15
-rw-r--r--arch/powerpc/include/uapi/asm/kvm.h2
-rw-r--r--arch/powerpc/kvm/book3s_32_mmu.c3
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu.c3
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_hv.c635
-rw-r--r--arch/powerpc/kvm/book3s_64_vio.c1
-rw-r--r--arch/powerpc/kvm/book3s_hv.c65
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c8
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_mmu.c62
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_xics.c138
-rw-r--r--arch/powerpc/kvm/book3s_pr.c130
-rw-r--r--arch/powerpc/kvm/book3s_xics.c192
-rw-r--r--arch/powerpc/kvm/book3s_xics.h7
-rw-r--r--arch/powerpc/kvm/powerpc.c10
-rw-r--r--arch/s390/kvm/gaccess.c26
-rw-r--r--arch/s390/kvm/gaccess.h19
-rw-r--r--arch/s390/kvm/guestdbg.c120
-rw-r--r--arch/s390/kvm/intercept.c25
-rw-r--r--arch/s390/kvm/kvm-s390.c46
-rw-r--r--arch/s390/kvm/kvm-s390.h12
-rw-r--r--arch/s390/kvm/priv.c30
-rw-r--r--arch/s390/kvm/vsie.c3
-rw-r--r--arch/s390/mm/pgtable.c2
-rw-r--r--arch/s390/tools/gen_facilities.c2
-rw-r--r--arch/x86/include/asm/desc.h58
-rw-r--r--arch/x86/include/asm/kvm_emulate.h1
-rw-r--r--arch/x86/include/asm/kvm_host.h30
-rw-r--r--arch/x86/include/asm/kvmclock.h6
-rw-r--r--arch/x86/include/asm/paravirt.h2
-rw-r--r--arch/x86/include/asm/processor.h12
-rw-r--r--arch/x86/include/asm/qspinlock.h2
-rw-r--r--arch/x86/include/asm/vmx.h28
-rw-r--r--arch/x86/include/uapi/asm/kvm_para.h9
-rw-r--r--arch/x86/kernel/asm-offsets_64.c9
-rw-r--r--arch/x86/kernel/ioport.c5
-rw-r--r--arch/x86/kernel/kvm.c26
-rw-r--r--arch/x86/kernel/kvmclock.c5
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c2
-rw-r--r--arch/x86/kernel/process.c10
-rw-r--r--arch/x86/kvm/cpuid.c10
-rw-r--r--arch/x86/kvm/emulate.c20
-rw-r--r--arch/x86/kvm/hyperv.c4
-rw-r--r--arch/x86/kvm/i8259.c16
-rw-r--r--arch/x86/kvm/irq.h19
-rw-r--r--arch/x86/kvm/irq_comm.c29
-rw-r--r--arch/x86/kvm/lapic.c197
-rw-r--r--arch/x86/kvm/lapic.h16
-rw-r--r--arch/x86/kvm/mmu.c509
-rw-r--r--arch/x86/kvm/svm.c57
-rw-r--r--arch/x86/kvm/vmx.c909
-rw-r--r--arch/x86/kvm/x86.c274
81 files changed, 5526 insertions, 2611 deletions
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index d5423ab15ed5..cc495d799c67 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -60,9 +60,6 @@ struct kvm_arch {
/* The last vcpu id that ran on each physical CPU */
int __percpu *last_vcpu_ran;
- /* Timer */
- struct arch_timer_kvm timer;
-
/*
* Anything that is not used directly from assembly code goes
* here.
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 74a44727f8e1..95f38dcd611d 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -129,8 +129,7 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
kvm_pfn_t pfn,
- unsigned long size,
- bool ipa_uncached)
+ unsigned long size)
{
/*
* If we are going to insert an instruction page and the icache is
@@ -150,18 +149,12 @@ static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
* and iterate over the range.
*/
- bool need_flush = !vcpu_has_cache_enabled(vcpu) || ipa_uncached;
-
VM_BUG_ON(size & ~PAGE_MASK);
- if (!need_flush && !icache_is_pipt())
- goto vipt_cache;
-
while (size) {
void *va = kmap_atomic_pfn(pfn);
- if (need_flush)
- kvm_flush_dcache_to_poc(va, PAGE_SIZE);
+ kvm_flush_dcache_to_poc(va, PAGE_SIZE);
if (icache_is_pipt())
__cpuc_coherent_user_range((unsigned long)va,
@@ -173,7 +166,6 @@ static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
kunmap_atomic(va);
}
-vipt_cache:
if (!icache_is_pipt() && !icache_is_vivt_asid_tagged()) {
/* any kind of VIPT cache */
__flush_icache_all();
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index af05f8e0903e..6ebd3e6a1fd1 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -181,10 +181,23 @@ struct kvm_arch_memory_slot {
#define KVM_DEV_ARM_VGIC_GRP_CPU_REGS 2
#define KVM_DEV_ARM_VGIC_CPUID_SHIFT 32
#define KVM_DEV_ARM_VGIC_CPUID_MASK (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT)
+#define KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT 32
+#define KVM_DEV_ARM_VGIC_V3_MPIDR_MASK \
+ (0xffffffffULL << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT)
#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0
#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
+#define KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK (0xffff)
#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3
#define KVM_DEV_ARM_VGIC_GRP_CTRL 4
+#define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5
+#define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6
+#define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \
+ (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT)
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff
+#define VGIC_LEVEL_INFO_LINE_LEVEL 0
+
#define KVM_DEV_ARM_VGIC_CTRL_INIT 0
/* KVM_IRQ_LINE irq field index values */
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index d571243ab4d1..7b3670c2ae7b 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -7,7 +7,7 @@ ifeq ($(plus_virt),+virt)
plus_virt_def := -DREQUIRES_VIRT=1
endif
-ccflags-y += -Iarch/arm/kvm
+ccflags-y += -Iarch/arm/kvm -Ivirt/kvm/arm/vgic
CFLAGS_arm.o := -I. $(plus_virt_def)
CFLAGS_mmu.o := -I.
@@ -20,7 +20,7 @@ kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vf
obj-$(CONFIG_KVM_ARM_HOST) += hyp/
obj-y += kvm-arm.o init.o interrupts.o
obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
-obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o
+obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o vgic-v3-coproc.o
obj-y += $(KVM)/arm/aarch32.o
obj-y += $(KVM)/arm/vgic/vgic.o
@@ -33,5 +33,6 @@ obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o
obj-y += $(KVM)/arm/vgic/vgic-mmio-v3.o
obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o
obj-y += $(KVM)/arm/vgic/vgic-its.o
+obj-y += $(KVM)/arm/vgic/vgic-debug.o
obj-y += $(KVM)/irqchip.o
obj-y += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 9d7446456e0c..c9a2103faeb9 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -135,7 +135,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
goto out_free_stage2_pgd;
kvm_vgic_early_init(kvm);
- kvm_timer_init(kvm);
/* Mark the initial VMID generation invalid */
kvm->arch.vmid_gen = 0;
@@ -207,6 +206,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_PSCI_0_2:
case KVM_CAP_READONLY_MEM:
case KVM_CAP_MP_STATE:
+ case KVM_CAP_IMMEDIATE_EXIT:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -301,7 +301,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
- return kvm_timer_should_fire(vcpu);
+ return kvm_timer_should_fire(vcpu_vtimer(vcpu)) ||
+ kvm_timer_should_fire(vcpu_ptimer(vcpu));
}
void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
@@ -604,6 +605,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
return ret;
}
+ if (run->immediate_exit)
+ return -EINTR;
+
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index a5265edbeeab..962616fd4ddd 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -1232,9 +1232,9 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
}
static void coherent_cache_guest_page(struct kvm_vcpu *vcpu, kvm_pfn_t pfn,
- unsigned long size, bool uncached)
+ unsigned long size)
{
- __coherent_cache_guest_page(vcpu, pfn, size, uncached);
+ __coherent_cache_guest_page(vcpu, pfn, size);
}
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
@@ -1250,7 +1250,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct vm_area_struct *vma;
kvm_pfn_t pfn;
pgprot_t mem_type = PAGE_S2;
- bool fault_ipa_uncached;
bool logging_active = memslot_is_logging(memslot);
unsigned long flags = 0;
@@ -1337,8 +1336,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (!hugetlb && !force_pte)
hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
- fault_ipa_uncached = memslot->flags & KVM_MEMSLOT_INCOHERENT;
-
if (hugetlb) {
pmd_t new_pmd = pfn_pmd(pfn, mem_type);
new_pmd = pmd_mkhuge(new_pmd);
@@ -1346,7 +1343,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
new_pmd = kvm_s2pmd_mkwrite(new_pmd);
kvm_set_pfn_dirty(pfn);
}
- coherent_cache_guest_page(vcpu, pfn, PMD_SIZE, fault_ipa_uncached);
+ coherent_cache_guest_page(vcpu, pfn, PMD_SIZE);
ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
} else {
pte_t new_pte = pfn_pte(pfn, mem_type);
@@ -1356,7 +1353,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
kvm_set_pfn_dirty(pfn);
mark_page_dirty(kvm, gfn);
}
- coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE, fault_ipa_uncached);
+ coherent_cache_guest_page(vcpu, pfn, PAGE_SIZE);
ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
}
@@ -1879,15 +1876,6 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
unsigned long npages)
{
- /*
- * Readonly memslots are not incoherent with the caches by definition,
- * but in practice, they are used mostly to emulate ROMs or NOR flashes
- * that the guest may consider devices and hence map as uncached.
- * To prevent incoherency issues in these cases, tag all readonly
- * regions as incoherent.
- */
- if (slot->flags & KVM_MEM_READONLY)
- slot->flags |= KVM_MEMSLOT_INCOHERENT;
return 0;
}
diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c
index 4b5e802e57d1..1da8b2d14550 100644
--- a/arch/arm/kvm/reset.c
+++ b/arch/arm/kvm/reset.c
@@ -37,6 +37,11 @@ static struct kvm_regs cortexa_regs_reset = {
.usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT,
};
+static const struct kvm_irq_level cortexa_ptimer_irq = {
+ { .irq = 30 },
+ .level = 1,
+};
+
static const struct kvm_irq_level cortexa_vtimer_irq = {
{ .irq = 27 },
.level = 1,
@@ -58,6 +63,7 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
{
struct kvm_regs *reset_regs;
const struct kvm_irq_level *cpu_vtimer_irq;
+ const struct kvm_irq_level *cpu_ptimer_irq;
switch (vcpu->arch.target) {
case KVM_ARM_TARGET_CORTEX_A7:
@@ -65,6 +71,7 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
reset_regs = &cortexa_regs_reset;
vcpu->arch.midr = read_cpuid_id();
cpu_vtimer_irq = &cortexa_vtimer_irq;
+ cpu_ptimer_irq = &cortexa_ptimer_irq;
break;
default:
return -ENODEV;
@@ -77,5 +84,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
kvm_reset_coprocs(vcpu);
/* Reset arch_timer context */
- return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
+ return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq, cpu_ptimer_irq);
}
diff --git a/arch/arm/kvm/vgic-v3-coproc.c b/arch/arm/kvm/vgic-v3-coproc.c
new file mode 100644
index 000000000000..f41abf76366f
--- /dev/null
+++ b/arch/arm/kvm/vgic-v3-coproc.c
@@ -0,0 +1,35 @@
+/*
+ * VGIC system registers handling functions for AArch32 mode
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include "vgic.h"
+
+int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id,
+ u64 *reg)
+{
+ /*
+ * TODO: Implement for AArch32
+ */
+ return -ENXIO;
+}
+
+int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, u64 id,
+ u64 *reg)
+{
+ /*
+ * TODO: Implement for AArch32
+ */
+ return -ENXIO;
+}
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 443b387021f2..f21fd3894370 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -70,9 +70,6 @@ struct kvm_arch {
/* Interrupt controller */
struct vgic_dist vgic;
-
- /* Timer */
- struct arch_timer_kvm timer;
};
#define KVM_NR_MEM_OBJS 40
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 55772c13a375..ed1246014901 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -236,13 +236,11 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,
kvm_pfn_t pfn,
- unsigned long size,
- bool ipa_uncached)
+ unsigned long size)
{
void *va = page_address(pfn_to_page(pfn));
- if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached)
- kvm_flush_dcache_to_poc(va, size);
+ kvm_flush_dcache_to_poc(va, size);
if (!icache_is_aliasing()) { /* PIPT */
flush_icache_range((unsigned long)va,
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index 3051f86a9b5f..c2860358ae3e 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -201,10 +201,23 @@ struct kvm_arch_memory_slot {
#define KVM_DEV_ARM_VGIC_GRP_CPU_REGS 2
#define KVM_DEV_ARM_VGIC_CPUID_SHIFT 32
#define KVM_DEV_ARM_VGIC_CPUID_MASK (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT)
+#define KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT 32
+#define KVM_DEV_ARM_VGIC_V3_MPIDR_MASK \
+ (0xffffffffULL << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT)
#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0
#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT)
+#define KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK (0xffff)
#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3
#define KVM_DEV_ARM_VGIC_GRP_CTRL 4
+#define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5
+#define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6
+#define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \
+ (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT)
+#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff
+#define VGIC_LEVEL_INFO_LINE_LEVEL 0
+
#define KVM_DEV_ARM_VGIC_CTRL_INIT 0
/* Device Control API on vcpu fd */
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index d50a82a16ff6..afd51bebb9c5 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -2,7 +2,7 @@
# Makefile for Kernel-based Virtual Machine module
#
-ccflags-y += -Iarch/arm64/kvm
+ccflags-y += -Iarch/arm64/kvm -Ivirt/kvm/arm/vgic
CFLAGS_arm.o := -I.
CFLAGS_mmu.o := -I.
@@ -19,6 +19,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
kvm-$(CONFIG_KVM_ARM_HOST) += inject_fault.o regmap.o
kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
+kvm-$(CONFIG_KVM_ARM_HOST) += vgic-sys-reg-v3.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/aarch32.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o
@@ -31,6 +32,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-its.o
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-debug.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/irqchip.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index e95d4f68bf54..d9e9697de1b2 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -46,6 +46,11 @@ static const struct kvm_regs default_regs_reset32 = {
COMPAT_PSR_I_BIT | COMPAT_PSR_F_BIT),
};
+static const struct kvm_irq_level default_ptimer_irq = {
+ .irq = 30,
+ .level = 1,
+};
+
static const struct kvm_irq_level default_vtimer_irq = {
.irq = 27,
.level = 1,
@@ -104,6 +109,7 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
{
const struct kvm_irq_level *cpu_vtimer_irq;
+ const struct kvm_irq_level *cpu_ptimer_irq;
const struct kvm_regs *cpu_reset;
switch (vcpu->arch.target) {
@@ -117,6 +123,7 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
}
cpu_vtimer_irq = &default_vtimer_irq;
+ cpu_ptimer_irq = &default_ptimer_irq;
break;
}
@@ -130,5 +137,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
kvm_pmu_vcpu_reset(vcpu);
/* Reset timer */
- return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
+ return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq, cpu_ptimer_irq);
}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index 87e7e6608cd8..0e26f8c2b56f 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -820,6 +820,61 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
CRm((0b1100 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \
access_pmu_evtyper, reset_unknown, (PMEVTYPER0_EL0 + n), }
+static bool access_cntp_tval(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+ u64 now = kvm_phys_timer_read();
+
+ if (p->is_write)
+ ptimer->cnt_cval = p->regval + now;
+ else
+ p->regval = ptimer->cnt_cval - now;
+
+ return true;
+}
+
+static bool access_cntp_ctl(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+
+ if (p->is_write) {
+ /* ISTATUS bit is read-only */
+ ptimer->cnt_ctl = p->regval & ~ARCH_TIMER_CTRL_IT_STAT;
+ } else {
+ u64 now = kvm_phys_timer_read();
+
+ p->regval = ptimer->cnt_ctl;
+ /*
+ * Set ISTATUS bit if it's expired.
+ * Note that according to ARMv8 ARM Issue A.k, ISTATUS bit is
+ * UNKNOWN when ENABLE bit is 0, so we chose to set ISTATUS bit
+ * regardless of ENABLE bit for our implementation convenience.
+ */
+ if (ptimer->cnt_cval <= now)
+ p->regval |= ARCH_TIMER_CTRL_IT_STAT;
+ }
+
+ return true;
+}
+
+static bool access_cntp_cval(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ struct arch_timer_context *ptimer = vcpu_ptimer(vcpu);
+
+ if (p->is_write)
+ ptimer->cnt_cval = p->regval;
+ else
+ p->regval = ptimer->cnt_cval;
+
+ return true;
+}
+
/*
* Architected system registers.
* Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2
@@ -1029,6 +1084,16 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b011),
NULL, reset_unknown, TPIDRRO_EL0 },
+ /* CNTP_TVAL_EL0 */
+ { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b0010), Op2(0b000),
+ access_cntp_tval },
+ /* CNTP_CTL_EL0 */
+ { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b0010), Op2(0b001),
+ access_cntp_ctl },
+ /* CNTP_CVAL_EL0 */
+ { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b0010), Op2(0b010),
+ access_cntp_cval },
+
/* PMEVCNTRn_EL0 */
PMU_PMEVCNTR_EL0(0),
PMU_PMEVCNTR_EL0(1),
@@ -1795,6 +1860,17 @@ static bool index_to_params(u64 id, struct sys_reg_params *params)
}
}
+const struct sys_reg_desc *find_reg_by_id(u64 id,
+ struct sys_reg_params *params,
+ const struct sys_reg_desc table[],
+ unsigned int num)
+{
+ if (!index_to_params(id, params))
+ return NULL;
+
+ return find_reg(params, table, num);
+}
+
/* Decode an index value, and find the sys_reg_desc entry. */
static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu,
u64 id)
@@ -1807,11 +1883,8 @@ static const struct sys_reg_desc *index_to_sys_reg_desc(struct kvm_vcpu *vcpu,
if ((id & KVM_REG_ARM_COPROC_MASK) != KVM_REG_ARM64_SYSREG)
return NULL;
- if (!index_to_params(id, &params))
- return NULL;
-
table = get_target_table(vcpu->arch.target, true, &num);
- r = find_reg(&params, table, num);
+ r = find_reg_by_id(id, &params, table, num);
if (!r)
r = find_reg(&params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs));
@@ -1918,10 +1991,8 @@ static int get_invariant_sys_reg(u64 id, void __user *uaddr)
struct sys_reg_params params;
const struct sys_reg_desc *r;
- if (!index_to_params(id, &params))
- return -ENOENT;
-
- r = find_reg(&params, invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs));
+ r = find_reg_by_id(id, &params, invariant_sys_regs,
+ ARRAY_SIZE(invariant_sys_regs));
if (!r)
return -ENOENT;
@@ -1935,9 +2006,8 @@ static int set_invariant_sys_reg(u64 id, void __user *uaddr)
int err;
u64 val = 0; /* Make sure high bits are 0 for 32-bit regs */
- if (!index_to_params(id, &params))
- return -ENOENT;
- r = find_reg(&params, invariant_sys_regs, ARRAY_SIZE(invariant_sys_regs));
+ r = find_reg_by_id(id, &params, invariant_sys_regs,
+ ARRAY_SIZE(invariant_sys_regs));
if (!r)
return -ENOENT;
diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h
index dbbb01cfbee9..9c6ffd0f0196 100644
--- a/arch/arm64/kvm/sys_regs.h
+++ b/arch/arm64/kvm/sys_regs.h
@@ -136,6 +136,10 @@ static inline int cmp_sys_reg(const struct sys_reg_desc *i1,
return i1->Op2 - i2->Op2;
}
+const struct sys_reg_desc *find_reg_by_id(u64 id,
+ struct sys_reg_params *params,
+ const struct sys_reg_desc table[],
+ unsigned int num);
#define Op0(_x) .Op0 = _x
#define Op1(_x) .Op1 = _x
diff --git a/arch/arm64/kvm/vgic-sys-reg-v3.c b/arch/arm64/kvm/vgic-sys-reg-v3.c
new file mode 100644
index 000000000000..79f37e37d367
--- /dev/null
+++ b/arch/arm64/kvm/vgic-sys-reg-v3.c
@@ -0,0 +1,346 @@
+/*
+ * VGIC system registers handling functions for AArch64 mode
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/irqchip/arm-gic-v3.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <asm/kvm_emulate.h>
+#include "vgic.h"
+#include "sys_regs.h"
+
+static bool access_gic_ctlr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ u32 host_pri_bits, host_id_bits, host_seis, host_a3v, seis, a3v;
+ struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_vmcr vmcr;
+ u64 val;
+
+ vgic_get_vmcr(vcpu, &vmcr);
+ if (p->is_write) {
+ val = p->regval;
+
+ /*
+ * Disallow restoring VM state if not supported by this
+ * hardware.
+ */
+ host_pri_bits = ((val & ICC_CTLR_EL1_PRI_BITS_MASK) >>
+ ICC_CTLR_EL1_PRI_BITS_SHIFT) + 1;
+ if (host_pri_bits > vgic_v3_cpu->num_pri_bits)
+ return false;
+
+ vgic_v3_cpu->num_pri_bits = host_pri_bits;
+
+ host_id_bits = (val & ICC_CTLR_EL1_ID_BITS_MASK) >>
+ ICC_CTLR_EL1_ID_BITS_SHIFT;
+ if (host_id_bits > vgic_v3_cpu->num_id_bits)
+ return false;
+
+ vgic_v3_cpu->num_id_bits = host_id_bits;
+
+ host_seis = ((kvm_vgic_global_state.ich_vtr_el2 &
+ ICH_VTR_SEIS_MASK) >> ICH_VTR_SEIS_SHIFT);
+ seis = (val & ICC_CTLR_EL1_SEIS_MASK) >>
+ ICC_CTLR_EL1_SEIS_SHIFT;
+ if (host_seis != seis)
+ return false;
+
+ host_a3v = ((kvm_vgic_global_state.ich_vtr_el2 &
+ ICH_VTR_A3V_MASK) >> ICH_VTR_A3V_SHIFT);
+ a3v = (val & ICC_CTLR_EL1_A3V_MASK) >> ICC_CTLR_EL1_A3V_SHIFT;
+ if (host_a3v != a3v)
+ return false;
+
+ /*
+ * Here set VMCR.CTLR in ICC_CTLR_EL1 layout.
+ * The vgic_set_vmcr() will convert to ICH_VMCR layout.
+ */
+ vmcr.ctlr = val & ICC_CTLR_EL1_CBPR_MASK;
+ vmcr.ctlr |= val & ICC_CTLR_EL1_EOImode_MASK;
+ vgic_set_vmcr(vcpu, &vmcr);
+ } else {
+ val = 0;
+ val |= (vgic_v3_cpu->num_pri_bits - 1) <<
+ ICC_CTLR_EL1_PRI_BITS_SHIFT;
+ val |= vgic_v3_cpu->num_id_bits << ICC_CTLR_EL1_ID_BITS_SHIFT;
+ val |= ((kvm_vgic_global_state.ich_vtr_el2 &
+ ICH_VTR_SEIS_MASK) >> ICH_VTR_SEIS_SHIFT) <<
+ ICC_CTLR_EL1_SEIS_SHIFT;
+ val |= ((kvm_vgic_global_state.ich_vtr_el2 &
+ ICH_VTR_A3V_MASK) >> ICH_VTR_A3V_SHIFT) <<
+ ICC_CTLR_EL1_A3V_SHIFT;
+ /*
+ * The VMCR.CTLR value is in ICC_CTLR_EL1 layout.
+ * Extract it directly using ICC_CTLR_EL1 reg definitions.
+ */
+ val |= vmcr.ctlr & ICC_CTLR_EL1_CBPR_MASK;
+ val |= vmcr.ctlr & ICC_CTLR_EL1_EOImode_MASK;
+
+ p->regval = val;
+ }
+
+ return true;
+}
+
+static bool access_gic_pmr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ struct vgic_vmcr vmcr;
+
+ vgic_get_vmcr(vcpu, &vmcr);
+ if (p->is_write) {
+ vmcr.pmr = (p->regval & ICC_PMR_EL1_MASK) >> ICC_PMR_EL1_SHIFT;
+ vgic_set_vmcr(vcpu, &vmcr);
+ } else {
+ p->regval = (vmcr.pmr << ICC_PMR_EL1_SHIFT) & ICC_PMR_EL1_MASK;
+ }
+
+ return true;
+}
+
+static bool access_gic_bpr0(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ struct vgic_vmcr vmcr;
+
+ vgic_get_vmcr(vcpu, &vmcr);
+ if (p->is_write) {
+ vmcr.bpr = (p->regval & ICC_BPR0_EL1_MASK) >>
+ ICC_BPR0_EL1_SHIFT;
+ vgic_set_vmcr(vcpu, &vmcr);
+ } else {
+ p->regval = (vmcr.bpr << ICC_BPR0_EL1_SHIFT) &
+ ICC_BPR0_EL1_MASK;
+ }
+
+ return true;
+}
+
+static bool access_gic_bpr1(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ struct vgic_vmcr vmcr;
+
+ if (!p->is_write)
+ p->regval = 0;
+
+ vgic_get_vmcr(vcpu, &vmcr);
+ if (!((vmcr.ctlr & ICH_VMCR_CBPR_MASK) >> ICH_VMCR_CBPR_SHIFT)) {
+ if (p->is_write) {
+ vmcr.abpr = (p->regval & ICC_BPR1_EL1_MASK) >>
+ ICC_BPR1_EL1_SHIFT;
+ vgic_set_vmcr(vcpu, &vmcr);
+ } else {
+ p->regval = (vmcr.abpr << ICC_BPR1_EL1_SHIFT) &
+ ICC_BPR1_EL1_MASK;
+ }
+ } else {
+ if (!p->is_write)
+ p->regval = min((vmcr.bpr + 1), 7U);
+ }
+
+ return true;
+}
+
+static bool access_gic_grpen0(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ struct vgic_vmcr vmcr;
+
+ vgic_get_vmcr(vcpu, &vmcr);
+ if (p->is_write) {
+ vmcr.grpen0 = (p->regval & ICC_IGRPEN0_EL1_MASK) >>
+ ICC_IGRPEN0_EL1_SHIFT;
+ vgic_set_vmcr(vcpu, &vmcr);
+ } else {
+ p->regval = (vmcr.grpen0 << ICC_IGRPEN0_EL1_SHIFT) &
+ ICC_IGRPEN0_EL1_MASK;
+ }
+
+ return true;
+}
+
+static bool access_gic_grpen1(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ struct vgic_vmcr vmcr;
+
+ vgic_get_vmcr(vcpu, &vmcr);
+ if (p->is_write) {
+ vmcr.grpen1 = (p->regval & ICC_IGRPEN1_EL1_MASK) >>
+ ICC_IGRPEN1_EL1_SHIFT;
+ vgic_set_vmcr(vcpu, &vmcr);
+ } else {
+ p->regval = (vmcr.grpen1 << ICC_IGRPEN1_EL1_SHIFT) &
+ ICC_IGRPEN1_EL1_MASK;
+ }
+
+ return true;
+}
+
+static void vgic_v3_access_apr_reg(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p, u8 apr, u8 idx)
+{
+ struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
+ uint32_t *ap_reg;
+
+ if (apr)
+ ap_reg = &vgicv3->vgic_ap1r[idx];
+ else
+ ap_reg = &vgicv3->vgic_ap0r[idx];
+
+ if (p->is_write)
+ *ap_reg = p->regval;
+ else
+ p->regval = *ap_reg;
+}
+
+static bool access_gic_aprn(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r, u8 apr)
+{
+ struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu;
+ u8 idx = r->Op2 & 3;
+
+ /*
+ * num_pri_bits are initialized with HW supported values.
+ * We can rely safely on num_pri_bits even if VM has not
+ * restored ICC_CTLR_EL1 before restoring APnR registers.
+ */
+ switch (vgic_v3_cpu->num_pri_bits) {
+ case 7:
+ vgic_v3_access_apr_reg(vcpu, p, apr, idx);
+ break;
+ case 6:
+ if (idx > 1)
+ goto err;
+ vgic_v3_access_apr_reg(vcpu, p, apr, idx);
+ break;
+ default:
+ if (idx > 0)
+ goto err;
+ vgic_v3_access_apr_reg(vcpu, p, apr, idx);
+ }
+
+ return true;
+err:
+ if (!p->is_write)
+ p->regval = 0;
+
+ return false;
+}
+
+static bool access_gic_ap0r(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+
+{
+ return access_gic_aprn(vcpu, p, r, 0);
+}
+
+static bool access_gic_ap1r(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ return access_gic_aprn(vcpu, p, r, 1);
+}
+
+static bool access_gic_sre(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ struct vgic_v3_cpu_if *vgicv3 = &vcpu->arch.vgic_cpu.vgic_v3;
+
+ /* Validate SRE bit */
+ if (p->is_write) {
+ if (!(p->regval & ICC_SRE_EL1_SRE))
+ return false;
+ } else {
+ p->regval = vgicv3->vgic_sre;
+ }
+
+ return true;
+}
+static const struct sys_reg_desc gic_v3_icc_reg_descs[] = {
+ /* ICC_PMR_EL1 */
+ { Op0(3), Op1(0), CRn(4), CRm(6), Op2(0), access_gic_pmr },
+ /* ICC_BPR0_EL1 */
+ { Op0(3), Op1(0), CRn(12), CRm(8), Op2(3), access_gic_bpr0 },
+ /* ICC_AP0R0_EL1 */
+ { Op0(3), Op1(0), CRn(12), CRm(8), Op2(4), access_gic_ap0r },
+ /* ICC_AP0R1_EL1 */
+ { Op0(3), Op1(0), CRn(12), CRm(8), Op2(5), access_gic_ap0r },
+ /* ICC_AP0R2_EL1 */
+ { Op0(3), Op1(0), CRn(12), CRm(8), Op2(6), access_gic_ap0r },
+ /* ICC_AP0R3_EL1 */
+ { Op0(3), Op1(0), CRn(12), CRm(8), Op2(7), access_gic_ap0r },
+ /* ICC_AP1R0_EL1 */
+ { Op0(3), Op1(0), CRn(12), CRm(9), Op2(0), access_gic_ap1r },
+ /* ICC_AP1R1_EL1 */
+ { Op0(3), Op1(0), CRn(12), CRm(9), Op2(1), access_gic_ap1r },
+ /* ICC_AP1R2_EL1 */
+ { Op0(3), Op1(0), CRn(12), CRm(9), Op2(2), access_gic_ap1r },
+ /* ICC_AP1R3_EL1 */
+ { Op0(3), Op1(0), CRn(12), CRm(9), Op2(3), access_gic_ap1r },
+ /* ICC_BPR1_EL1 */
+ { Op0(3), Op1(0), CRn(12), CRm(12), Op2(3), access_gic_bpr1 },
+ /* ICC_CTLR_EL1 */
+ { Op0(3), Op1(0), CRn(12), CRm(12), Op2(4), access_gic_ctlr },
+ /* ICC_SRE_EL1 */
+ { Op0(3), Op1(0), CRn(12), CRm(12), Op2(5), access_gic_sre },
+ /* ICC_IGRPEN0_EL1 */
+ { Op0(3), Op1(0), CRn(12), CRm(12), Op2(6), access_gic_grpen0 },
+ /* ICC_GRPEN1_EL1 */
+ { Op0(3), Op1(0), CRn(12), CRm(12), Op2(7), access_gic_grpen1 },
+};
+
+int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id,
+ u64 *reg)
+{
+ struct sys_reg_params params;
+ u64 sysreg = (id & KVM_DEV_ARM_VGIC_SYSREG_MASK) | KVM_REG_SIZE_U64;
+
+ params.regval = *reg;
+ params.is_write = is_write;
+ params.is_aarch32 = false;
+ params.is_32bit = false;
+
+ if (find_reg_by_id(sysreg, &params, gic_v3_icc_reg_descs,
+ ARRAY_SIZE(gic_v3_icc_reg_descs)))
+ return 0;
+
+ return -ENXIO;
+}
+
+int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, u64 id,
+ u64 *reg)
+{
+ struct sys_reg_params params;
+ const struct sys_reg_desc *r;
+ u64 sysreg = (id & KVM_DEV_ARM_VGIC_SYSREG_MASK) | KVM_REG_SIZE_U64;
+
+ if (is_write)
+ params.regval = *reg;
+ params.is_write = is_write;
+ params.is_aarch32 = false;
+ params.is_32bit = false;
+
+ r = find_reg_by_id(sysreg, &params, gic_v3_icc_reg_descs,
+ ARRAY_SIZE(gic_v3_icc_reg_descs));
+ if (!r)
+ return -ENXIO;
+
+ if (!r->access(vcpu, &params, r))
+ return -EINVAL;
+
+ if (!is_write)
+ *reg = params.regval;
+
+ return 0;
+}
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index bebec370324f..05e785fc061d 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -43,6 +43,7 @@
#define KVM_REG_MIPS_CP0_ENTRYHI MIPS_CP0_64(10, 0)
#define KVM_REG_MIPS_CP0_COMPARE MIPS_CP0_32(11, 0)
#define KVM_REG_MIPS_CP0_STATUS MIPS_CP0_32(12, 0)
+#define KVM_REG_MIPS_CP0_INTCTL MIPS_CP0_32(12, 1)
#define KVM_REG_MIPS_CP0_CAUSE MIPS_CP0_32(13, 0)
#define KVM_REG_MIPS_CP0_EPC MIPS_CP0_64(14, 0)
#define KVM_REG_MIPS_CP0_PRID MIPS_CP0_32(15, 0)
@@ -64,7 +65,7 @@
#define KVM_REG_MIPS_CP0_KSCRATCH6 MIPS_CP0_64(31, 7)
-#define KVM_MAX_VCPUS 1
+#define KVM_MAX_VCPUS 8
#define KVM_USER_MEM_SLOTS 8
/* memory slots that does not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 0
@@ -88,6 +89,7 @@
#define KVM_GUEST_KUSEG 0x00000000UL
#define KVM_GUEST_KSEG0 0x40000000UL
+#define KVM_GUEST_KSEG1 0x40000000UL
#define KVM_GUEST_KSEG23 0x60000000UL
#define KVM_GUEST_KSEGX(a) ((_ACAST32_(a)) & 0xe0000000)
#define KVM_GUEST_CPHYSADDR(a) ((_ACAST32_(a)) & 0x1fffffff)
@@ -104,7 +106,6 @@
#define KVM_GUEST_KSEG23ADDR(a) (KVM_GUEST_CPHYSADDR(a) | KVM_GUEST_KSEG23)
#define KVM_INVALID_PAGE 0xdeadbeef
-#define KVM_INVALID_INST 0xdeadbeef
#define KVM_INVALID_ADDR 0xdeadbeef
/*
@@ -121,8 +122,6 @@ static inline bool kvm_is_error_hva(unsigned long addr)
return IS_ERR_VALUE(addr);
}
-extern atomic_t kvm_mips_instance;
-
struct kvm_vm_stat {
ulong remote_tlb_flush;
};
@@ -156,12 +155,8 @@ struct kvm_arch_memory_slot {
};
struct kvm_arch {
- /* Guest GVA->HPA page table */
- unsigned long *guest_pmap;
- unsigned long guest_pmap_npages;
-
- /* Wired host TLB used for the commpage */
- int commpage_tlb;
+ /* Guest physical mm */
+ struct mm_struct gpa_mm;
};
#define N_MIPS_COPROC_REGS 32
@@ -233,6 +228,7 @@ enum emulation_result {
EMULATE_FAIL, /* can't emulate this instruction */
EMULATE_WAIT, /* WAIT instruction */
EMULATE_PRIV_FAIL,
+ EMULATE_EXCEPT, /* A guest exception has been generated */
};
#define mips3_paddr_to_tlbpfn(x) \
@@ -250,6 +246,7 @@ enum emulation_result {
#define TLB_ASID(x) ((x).tlb_hi & KVM_ENTRYHI_ASID)
#define TLB_LO_IDX(x, va) (((va) >> PAGE_SHIFT) & 1)
#define TLB_IS_VALID(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_V)
+#define TLB_IS_DIRTY(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_D)
#define TLB_HI_VPN2_HIT(x, y) ((TLB_VPN2(x) & ~(x).tlb_mask) == \
((y) & VPN2_MASK & ~(x).tlb_mask))
#define TLB_HI_ASID_HIT(x, y) (TLB_IS_GLOBAL(x) || \
@@ -261,6 +258,17 @@ struct kvm_mips_tlb {
long tlb_lo[2];
};
+#define KVM_NR_MEM_OBJS 4
+
+/*
+ * We don't want allocation failures within the mmu code, so we preallocate
+ * enough memory for a single page fault in a cache.
+ */
+struct kvm_mmu_memory_cache {
+ int nobjs;
+ void *objects[KVM_NR_MEM_OBJS];
+};
+
#define KVM_MIPS_AUX_FPU 0x1
#define KVM_MIPS_AUX_MSA 0x2
@@ -275,6 +283,8 @@ struct kvm_vcpu_arch {
unsigned long host_cp0_badvaddr;
unsigned long host_cp0_epc;
u32 host_cp0_cause;
+ u32 host_cp0_badinstr;
+ u32 host_cp0_badinstrp;
/* GPRS */
unsigned long gprs[32];
@@ -318,20 +328,18 @@ struct kvm_vcpu_arch {
/* Bitmask of pending exceptions to be cleared */
unsigned long pending_exceptions_clr;
- /* Save/Restore the entryhi register when are are preempted/scheduled back in */
- unsigned long preempt_entryhi;
-
/* S/W Based TLB for guest */
struct kvm_mips_tlb guest_tlb[KVM_MIPS_GUEST_TLB_SIZE];
- /* Cached guest kernel/user ASIDs */
- u32 guest_user_asid[NR_CPUS];
- u32 guest_kernel_asid[NR_CPUS];
+ /* Guest kernel/user [partial] mm */
struct mm_struct guest_kernel_mm, guest_user_mm;
/* Guest ASID of last user mode execution */
unsigned int last_user_gasid;
+ /* Cache some mmu pages needed inside spinlock regions */
+ struct kvm_mmu_memory_cache mmu_page_cache;
+
int last_sched_cpu;
/* WAIT executed */
@@ -339,14 +347,15 @@ struct kvm_vcpu_arch {
u8 fpu_enabled;
u8 msa_enabled;
- u8 kscratch_enabled;
};
#define kvm_read_c0_guest_index(cop0) (cop0->reg[MIPS_CP0_TLB_INDEX][0])
#define kvm_write_c0_guest_index(cop0, val) (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val)
#define kvm_read_c0_guest_entrylo0(cop0) (cop0->reg[MIPS_CP0_TLB_LO0][0])
+#define kvm_write_c0_guest_entrylo0(cop0, val) (cop0->reg[MIPS_CP0_TLB_LO0][0] = (val))
#define kvm_read_c0_guest_entrylo1(cop0) (cop0->reg[MIPS_CP0_TLB_LO1][0])
+#define kvm_write_c0_guest_entrylo1(cop0, val) (cop0->reg[MIPS_CP0_TLB_LO1][0] = (val))
#define kvm_read_c0_guest_context(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0])
#define kvm_write_c0_guest_context(cop0, val) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val))
#define kvm_read_c0_guest_userlocal(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][2])
@@ -522,9 +531,17 @@ struct kvm_mips_callbacks {
int (*handle_msa_fpe)(struct kvm_vcpu *vcpu);
int (*handle_fpe)(struct kvm_vcpu *vcpu);
int (*handle_msa_disabled)(struct kvm_vcpu *vcpu);
- int (*vm_init)(struct kvm *kvm);
int (*vcpu_init)(struct kvm_vcpu *vcpu);
+ void (*vcpu_uninit)(struct kvm_vcpu *vcpu);
int (*vcpu_setup)(struct kvm_vcpu *vcpu);
+ void (*flush_shadow_all)(struct kvm *kvm);
+ /*
+ * Must take care of flushing any cached GPA PTEs (e.g. guest entries in
+ * VZ root TLB, or T&E GVA page tables and corresponding root TLB
+ * mappings).
+ */
+ void (*flush_shadow_memslot)(struct kvm *kvm,
+ const struct kvm_memory_slot *slot);
gpa_t (*gva_to_gpa)(gva_t gva);
void (*queue_timer_int)(struct kvm_vcpu *vcpu);
void (*dequeue_timer_int)(struct kvm_vcpu *vcpu);
@@ -542,8 +559,10 @@ struct kvm_mips_callbacks {
const struct kvm_one_reg *reg, s64 *v);
int (*set_one_reg)(struct kvm_vcpu *vcpu,
const struct kvm_one_reg *reg, s64 v);
- int (*vcpu_get_regs)(struct kvm_vcpu *vcpu);
- int (*vcpu_set_regs)(struct kvm_vcpu *vcpu);
+ int (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
+ int (*vcpu_put)(struct kvm_vcpu *vcpu, int cpu);
+ int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
+ void (*vcpu_reenter)(struct kvm_run *run, struct kvm_vcpu *vcpu);
};
extern struct kvm_mips_callbacks *kvm_mips_callbacks;
int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks);
@@ -556,6 +575,7 @@ extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu);
/* Building of entry/exception code */
int kvm_mips_entry_setup(void);
void *kvm_mips_build_vcpu_run(void *addr);
+void *kvm_mips_build_tlb_refill_exception(void *addr, void *handler);
void *kvm_mips_build_exception(void *addr, void *handler);
void *kvm_mips_build_exit(void *addr);
@@ -580,54 +600,125 @@ u32 kvm_get_user_asid(struct kvm_vcpu *vcpu);
u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu);
extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr,
- struct kvm_vcpu *vcpu);
+ struct kvm_vcpu *vcpu,
+ bool write_fault);
extern int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
struct kvm_vcpu *vcpu);
extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
- struct kvm_mips_tlb *tlb);
+ struct kvm_mips_tlb *tlb,
+ unsigned long gva,
+ bool write_fault);
extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
u32 *opc,
struct kvm_run *run,
- struct kvm_vcpu *vcpu);
-
-extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause,
- u32 *opc,
- struct kvm_run *run,
- struct kvm_vcpu *vcpu);
+ struct kvm_vcpu *vcpu,
+ bool write_fault);
extern void kvm_mips_dump_host_tlbs(void);
extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu);
-extern int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
- unsigned long entrylo0,
- unsigned long entrylo1,
- int flush_dcache_mask);
-extern void kvm_mips_flush_host_tlb(int skip_kseg0);
-extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi);
+extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi,
+ bool user, bool kernel);
extern int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu,
unsigned long entryhi);
-extern int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr);
-extern unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu,
- unsigned long gva);
-extern void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu,
- struct kvm_vcpu *vcpu);
-extern void kvm_local_flush_tlb_all(void);
-extern void kvm_mips_alloc_new_mmu_context(struct kvm_vcpu *vcpu);
-extern void kvm_mips_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
-extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu);
+
+void kvm_mips_suspend_mm(int cpu);
+void kvm_mips_resume_mm(int cpu);
+
+/* MMU handling */
+
+/**
+ * enum kvm_mips_flush - Types of MMU flushes.
+ * @KMF_USER: Flush guest user virtual memory mappings.
+ * Guest USeg only.
+ * @KMF_KERN: Flush guest kernel virtual memory mappings.
+ * Guest USeg and KSeg2/3.
+ * @KMF_GPA: Flush guest physical memory mappings.
+ * Also includes KSeg0 if KMF_KERN is set.
+ */
+enum kvm_mips_flush {
+ KMF_USER = 0x0,
+ KMF_KERN = 0x1,
+ KMF_GPA = 0x2,
+};
+void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags);
+bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn);
+int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn);
+pgd_t *kvm_pgd_alloc(void);
+void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
+void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr,
+ bool user);
+void kvm_trap_emul_gva_lockless_begin(struct kvm_vcpu *vcpu);
+void kvm_trap_emul_gva_lockless_end(struct kvm_vcpu *vcpu);
+
+enum kvm_mips_fault_result {
+ KVM_MIPS_MAPPED = 0,
+ KVM_MIPS_GVA,
+ KVM_MIPS_GPA,
+ KVM_MIPS_TLB,
+ KVM_MIPS_TLBINV,
+ KVM_MIPS_TLBMOD,
+};
+enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu,
+ unsigned long gva,
+ bool write);
+
+#define KVM_ARCH_WANT_MMU_NOTIFIER
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
+int kvm_unmap_hva_range(struct kvm *kvm,
+ unsigned long start, unsigned long end);
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
+
+static inline void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
+ unsigned long address)
+{
+}
/* Emulation */
-u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu);
+int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause);
+int kvm_get_badinstr(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
+int kvm_get_badinstrp(u32 *opc, struct kvm_vcpu *vcpu, u32 *out);
+
+/**
+ * kvm_is_ifetch_fault() - Find whether a TLBL exception is due to ifetch fault.
+ * @vcpu: Virtual CPU.
+ *
+ * Returns: Whether the TLBL exception was likely due to an instruction
+ * fetch fault rather than a data load fault.
+ */
+static inline bool kvm_is_ifetch_fault(struct kvm_vcpu_arch *vcpu)
+{
+ unsigned long badvaddr = vcpu->host_cp0_badvaddr;
+ unsigned long epc = msk_isa16_mode(vcpu->pc);
+ u32 cause = vcpu->host_cp0_cause;
+
+ if (epc == badvaddr)
+ return true;
+
+ /*
+ * Branches may be 32-bit or 16-bit instructions.
+ * This isn't exact, but we don't really support MIPS16 or microMIPS yet
+ * in KVM anyway.
+ */
+ if ((cause & CAUSEF_BD) && badvaddr - epc <= 4)
+ return true;
+
+ return false;
+}
extern enum emulation_result kvm_mips_emulate_inst(u32 cause,
u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
+long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu);
+
extern enum emulation_result kvm_mips_emulate_syscall(u32 cause,
u32 *opc,
struct kvm_run *run,
@@ -761,10 +852,6 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {}
static inline void kvm_arch_free_memslot(struct kvm *kvm,
struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
-static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
-static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
- struct kvm_memory_slot *slot) {}
-static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
diff --git a/arch/mips/include/asm/mmu_context.h b/arch/mips/include/asm/mmu_context.h
index ddd57ade1aa8..2abf94f72c0a 100644
--- a/arch/mips/include/asm/mmu_context.h
+++ b/arch/mips/include/asm/mmu_context.h
@@ -29,9 +29,11 @@ do { \
} \
} while (0)
+extern void tlbmiss_handler_setup_pgd(unsigned long);
+
+/* Note: This is also implemented with uasm in arch/mips/kvm/entry.c */
#define TLBMISS_HANDLER_SETUP_PGD(pgd) \
do { \
- extern void tlbmiss_handler_setup_pgd(unsigned long); \
tlbmiss_handler_setup_pgd((unsigned long)(pgd)); \
htw_set_pwbase((unsigned long)pgd); \
} while (0)
@@ -97,17 +99,12 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
static inline void
get_new_mmu_context(struct mm_struct *mm, unsigned long cpu)
{
- extern void kvm_local_flush_tlb_all(void);
unsigned long asid = asid_cache(cpu);
if (!((asid += cpu_asid_inc()) & cpu_asid_mask(&cpu_data[cpu]))) {
if (cpu_has_vtag_icache)
flush_icache_all();
-#ifdef CONFIG_KVM
- kvm_local_flush_tlb_all(); /* start new asid cycle */
-#else
local_flush_tlb_all(); /* start new asid cycle */
-#endif
if (!asid) /* fix version if needed */
asid = asid_first_version(cpu);
}
diff --git a/arch/mips/include/uapi/asm/kvm.h b/arch/mips/include/uapi/asm/kvm.h
index 6985eb59b085..a8a0199bf760 100644
--- a/arch/mips/include/uapi/asm/kvm.h
+++ b/arch/mips/include/uapi/asm/kvm.h
@@ -19,6 +19,8 @@
* Some parts derived from the x86 version of this file.
*/
+#define __KVM_HAVE_READONLY_MEM
+
/*
* for KVM_GET_REGS and KVM_SET_REGS
*
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index 7c56d6b124d1..65067327db12 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -20,7 +20,9 @@ config KVM
select EXPORT_UASM
select PREEMPT_NOTIFIERS
select ANON_INODES
+ select KVM_GENERIC_DIRTYLOG_READ_PROTECT
select KVM_MMIO
+ select MMU_NOTIFIER
select SRCU
---help---
Support for hosting Guest kernels.
diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c
index 010cef240688..f8e772564d74 100644
--- a/arch/mips/kvm/dyntrans.c
+++ b/arch/mips/kvm/dyntrans.c
@@ -13,6 +13,7 @@
#include <linux/err.h>
#include <linux/highmem.h>
#include <linux/kvm_host.h>
+#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
#include <linux/bootmem.h>
@@ -29,28 +30,37 @@
static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc,
union mips_instruction replace)
{
- unsigned long paddr, flags;
- void *vaddr;
-
- if (KVM_GUEST_KSEGX((unsigned long)opc) == KVM_GUEST_KSEG0) {
- paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu,
- (unsigned long)opc);
- vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr)));
- vaddr += paddr & ~PAGE_MASK;
- memcpy(vaddr, (void *)&replace, sizeof(u32));
- local_flush_icache_range((unsigned long)vaddr,
- (unsigned long)vaddr + 32);
- kunmap_atomic(vaddr);
- } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
- local_irq_save(flags);
- memcpy((void *)opc, (void *)&replace, sizeof(u32));
- __local_flush_icache_user_range((unsigned long)opc,
- (unsigned long)opc + 32);
- local_irq_restore(flags);
- } else {
- kvm_err("%s: Invalid address: %p\n", __func__, opc);
- return -EFAULT;
+ unsigned long vaddr = (unsigned long)opc;
+ int err;
+
+retry:
+ /* The GVA page table is still active so use the Linux TLB handlers */
+ kvm_trap_emul_gva_lockless_begin(vcpu);
+ err = put_user(replace.word, opc);
+ kvm_trap_emul_gva_lockless_end(vcpu);
+
+ if (unlikely(err)) {
+ /*
+ * We write protect clean pages in GVA page table so normal
+ * Linux TLB mod handler doesn't silently dirty the page.
+ * Its also possible we raced with a GVA invalidation.
+ * Try to force the page to become dirty.
+ */
+ err = kvm_trap_emul_gva_fault(vcpu, vaddr, true);
+ if (unlikely(err)) {
+ kvm_info("%s: Address unwriteable: %p\n",
+ __func__, opc);
+ return -EFAULT;
+ }
+
+ /*
+ * Try again. This will likely trigger a TLB refill, which will
+ * fetch the new dirty entry from the GVA page table, which
+ * should then succeed.
+ */
+ goto retry;
}
+ __local_flush_icache_user_range(vaddr, vaddr + 4);
return 0;
}
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index aa0937423e28..d40cfaad4529 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -38,23 +38,25 @@
* Compute the return address and do emulate branch simulation, if required.
* This function should be called only in branch delay slot active.
*/
-unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
- unsigned long instpc)
+static int kvm_compute_return_epc(struct kvm_vcpu *vcpu, unsigned long instpc,
+ unsigned long *out)
{
unsigned int dspcontrol;
union mips_instruction insn;
struct kvm_vcpu_arch *arch = &vcpu->arch;
long epc = instpc;
- long nextpc = KVM_INVALID_INST;
+ long nextpc;
+ int err;
- if (epc & 3)
- goto unaligned;
+ if (epc & 3) {
+ kvm_err("%s: unaligned epc\n", __func__);
+ return -EINVAL;
+ }
/* Read the instruction */
- insn.word = kvm_get_inst((u32 *) epc, vcpu);
-
- if (insn.word == KVM_INVALID_INST)
- return KVM_INVALID_INST;
+ err = kvm_get_badinstrp((u32 *)epc, vcpu, &insn.word);
+ if (err)
+ return err;
switch (insn.i_format.opcode) {
/* jr and jalr are in r_format format. */
@@ -66,6 +68,8 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
case jr_op:
nextpc = arch->gprs[insn.r_format.rs];
break;
+ default:
+ return -EINVAL;
}
break;
@@ -114,8 +118,11 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
nextpc = epc;
break;
case bposge32_op:
- if (!cpu_has_dsp)
- goto sigill;
+ if (!cpu_has_dsp) {
+ kvm_err("%s: DSP branch but not DSP ASE\n",
+ __func__);
+ return -EINVAL;
+ }
dspcontrol = rddsp(0x01);
@@ -125,6 +132,8 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
epc += 8;
nextpc = epc;
break;
+ default:
+ return -EINVAL;
}
break;
@@ -189,7 +198,7 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
/* And now the FPA/cp1 branch instructions. */
case cop1_op:
kvm_err("%s: unsupported cop1_op\n", __func__);
- break;
+ return -EINVAL;
#ifdef CONFIG_CPU_MIPSR6
/* R6 added the following compact branches with forbidden slots */
@@ -198,19 +207,19 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
/* only rt == 0 isn't compact branch */
if (insn.i_format.rt != 0)
goto compact_branch;
- break;
+ return -EINVAL;
case pop10_op:
case pop30_op:
/* only rs == rt == 0 is reserved, rest are compact branches */
if (insn.i_format.rs != 0 || insn.i_format.rt != 0)
goto compact_branch;
- break;
+ return -EINVAL;
case pop66_op:
case pop76_op:
/* only rs == 0 isn't compact branch */
if (insn.i_format.rs != 0)
goto compact_branch;
- break;
+ return -EINVAL;
compact_branch:
/*
* If we've hit an exception on the forbidden slot, then
@@ -221,42 +230,74 @@ compact_branch:
break;
#else
compact_branch:
- /* Compact branches not supported before R6 */
- break;
+ /* Fall through - Compact branches not supported before R6 */
#endif
+ default:
+ return -EINVAL;
}
- return nextpc;
-
-unaligned:
- kvm_err("%s: unaligned epc\n", __func__);
- return nextpc;
-
-sigill:
- kvm_err("%s: DSP branch but not DSP ASE\n", __func__);
- return nextpc;
+ *out = nextpc;
+ return 0;
}
enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause)
{
- unsigned long branch_pc;
- enum emulation_result er = EMULATE_DONE;
+ int err;
if (cause & CAUSEF_BD) {
- branch_pc = kvm_compute_return_epc(vcpu, vcpu->arch.pc);
- if (branch_pc == KVM_INVALID_INST) {
- er = EMULATE_FAIL;
- } else {
- vcpu->arch.pc = branch_pc;
- kvm_debug("BD update_pc(): New PC: %#lx\n",
- vcpu->arch.pc);
- }
- } else
+ err = kvm_compute_return_epc(vcpu, vcpu->arch.pc,
+ &vcpu->arch.pc);
+ if (err)
+ return EMULATE_FAIL;
+ } else {
vcpu->arch.pc += 4;
+ }
kvm_debug("update_pc(): New PC: %#lx\n", vcpu->arch.pc);
- return er;
+ return EMULATE_DONE;
+}
+
+/**
+ * kvm_get_badinstr() - Get bad instruction encoding.
+ * @opc: Guest pointer to faulting instruction.
+ * @vcpu: KVM VCPU information.
+ *
+ * Gets the instruction encoding of the faulting instruction, using the saved
+ * BadInstr register value if it exists, otherwise falling back to reading guest
+ * memory at @opc.
+ *
+ * Returns: The instruction encoding of the faulting instruction.
+ */
+int kvm_get_badinstr(u32 *opc, struct kvm_vcpu *vcpu, u32 *out)
+{
+ if (cpu_has_badinstr) {
+ *out = vcpu->arch.host_cp0_badinstr;
+ return 0;
+ } else {
+ return kvm_get_inst(opc, vcpu, out);
+ }
+}
+
+/**
+ * kvm_get_badinstrp() - Get bad prior instruction encoding.
+ * @opc: Guest pointer to prior faulting instruction.
+ * @vcpu: KVM VCPU information.
+ *
+ * Gets the instruction encoding of the prior faulting instruction (the branch
+ * containing the delay slot which faulted), using the saved BadInstrP register
+ * value if it exists, otherwise falling back to reading guest memory at @opc.
+ *
+ * Returns: The instruction encoding of the prior faulting instruction.
+ */
+int kvm_get_badinstrp(u32 *opc, struct kvm_vcpu *vcpu, u32 *out)
+{
+ if (cpu_has_badinstrp) {
+ *out = vcpu->arch.host_cp0_badinstrp;
+ return 0;
+ } else {
+ return kvm_get_inst(opc, vcpu, out);
+ }
}
/**
@@ -856,22 +897,30 @@ enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu)
static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu,
struct kvm_mips_tlb *tlb)
{
+ struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
+ struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
int cpu, i;
bool user;
/* No need to flush for entries which are already invalid */
if (!((tlb->tlb_lo[0] | tlb->tlb_lo[1]) & ENTRYLO_V))
return;
+ /* Don't touch host kernel page tables or TLB mappings */
+ if ((unsigned long)tlb->tlb_hi > 0x7fffffff)
+ return;
/* User address space doesn't need flushing for KSeg2/3 changes */
user = tlb->tlb_hi < KVM_GUEST_KSEG0;
preempt_disable();
+ /* Invalidate page table entries */
+ kvm_trap_emul_invalidate_gva(vcpu, tlb->tlb_hi & VPN2_MASK, user);
+
/*
* Probe the shadow host TLB for the entry being overwritten, if one
* matches, invalidate it
*/
- kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi);
+ kvm_mips_host_tlb_inv(vcpu, tlb->tlb_hi, user, true);
/* Invalidate the whole ASID on other CPUs */
cpu = smp_processor_id();
@@ -879,8 +928,8 @@ static void kvm_mips_invalidate_guest_tlb(struct kvm_vcpu *vcpu,
if (i == cpu)
continue;
if (user)
- vcpu->arch.guest_user_asid[i] = 0;
- vcpu->arch.guest_kernel_asid[i] = 0;
+ cpu_context(i, user_mm) = 0;
+ cpu_context(i, kern_mm) = 0;
}
preempt_enable();
@@ -1017,7 +1066,7 @@ unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu)
unsigned int mask = MIPS_CONF_M;
/* KScrExist */
- mask |= (unsigned int)vcpu->arch.kscratch_enabled << 16;
+ mask |= 0xfc << MIPS_CONF4_KSCREXIST_SHIFT;
return mask;
}
@@ -1056,6 +1105,7 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
+ struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
enum emulation_result er = EMULATE_DONE;
u32 rt, rd, sel;
unsigned long curr_pc;
@@ -1150,14 +1200,13 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
er = EMULATE_FAIL;
break;
}
-#define C0_EBASE_CORE_MASK 0xff
if ((rd == MIPS_CP0_PRID) && (sel == 1)) {
- /* Preserve CORE number */
- kvm_change_c0_guest_ebase(cop0,
- ~(C0_EBASE_CORE_MASK),
+ /*
+ * Preserve core number, and keep the exception
+ * base in guest KSeg0.
+ */
+ kvm_change_c0_guest_ebase(cop0, 0x1ffff000,
vcpu->arch.gprs[rt]);
- kvm_err("MTCz, cop0->reg[EBASE]: %#lx\n",
- kvm_read_c0_guest_ebase(cop0));
} else if (rd == MIPS_CP0_TLB_HI && sel == 0) {
u32 nasid =
vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID;
@@ -1169,6 +1218,17 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
nasid);
/*
+ * Flush entries from the GVA page
+ * tables.
+ * Guest user page table will get
+ * flushed lazily on re-entry to guest
+ * user if the guest ASID actually
+ * changes.
+ */
+ kvm_mips_flush_gva_pt(kern_mm->pgd,
+ KMF_KERN);
+
+ /*
* Regenerate/invalidate kernel MMU
* context.
* The user MMU context will be
@@ -1178,13 +1238,10 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
*/
preempt_disable();
cpu = smp_processor_id();
- kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm,
- cpu, vcpu);
- vcpu->arch.guest_kernel_asid[cpu] =
- vcpu->arch.guest_kernel_mm.context.asid[cpu];
+ get_new_mmu_context(kern_mm, cpu);
for_each_possible_cpu(i)
if (i != cpu)
- vcpu->arch.guest_kernel_asid[i] = 0;
+ cpu_context(i, kern_mm) = 0;
preempt_enable();
}
kvm_write_c0_guest_entryhi(cop0,
@@ -1639,12 +1696,56 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
return er;
}
+static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long),
+ unsigned long curr_pc,
+ unsigned long addr,
+ struct kvm_run *run,
+ struct kvm_vcpu *vcpu,
+ u32 cause)
+{
+ int err;
+
+ for (;;) {
+ /* Carefully attempt the cache operation */
+ kvm_trap_emul_gva_lockless_begin(vcpu);
+ err = fn(addr);
+ kvm_trap_emul_gva_lockless_end(vcpu);
+
+ if (likely(!err))
+ return EMULATE_DONE;
+
+ /*
+ * Try to handle the fault and retry, maybe we just raced with a
+ * GVA invalidation.
+ */
+ switch (kvm_trap_emul_gva_fault(vcpu, addr, false)) {
+ case KVM_MIPS_GVA:
+ case KVM_MIPS_GPA:
+ /* bad virtual or physical address */
+ return EMULATE_FAIL;
+ case KVM_MIPS_TLB:
+ /* no matching guest TLB */
+ vcpu->arch.host_cp0_badvaddr = addr;
+ vcpu->arch.pc = curr_pc;
+ kvm_mips_emulate_tlbmiss_ld(cause, NULL, run, vcpu);
+ return EMULATE_EXCEPT;
+ case KVM_MIPS_TLBINV:
+ /* invalid matching guest TLB */
+ vcpu->arch.host_cp0_badvaddr = addr;
+ vcpu->arch.pc = curr_pc;
+ kvm_mips_emulate_tlbinv_ld(cause, NULL, run, vcpu);
+ return EMULATE_EXCEPT;
+ default:
+ break;
+ };
+ }
+}
+
enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
u32 *opc, u32 cause,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
- struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result er = EMULATE_DONE;
u32 cache, op_inst, op, base;
s16 offset;
@@ -1701,80 +1802,16 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
goto done;
}
- preempt_disable();
- if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) {
- if (kvm_mips_host_tlb_lookup(vcpu, va) < 0 &&
- kvm_mips_handle_kseg0_tlb_fault(va, vcpu)) {
- kvm_err("%s: handling mapped kseg0 tlb fault for %lx, vcpu: %p, ASID: %#lx\n",
- __func__, va, vcpu, read_c0_entryhi());
- er = EMULATE_FAIL;
- preempt_enable();
- goto done;
- }
- } else if ((KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0) ||
- KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) {
- int index;
-
- /* If an entry already exists then skip */
- if (kvm_mips_host_tlb_lookup(vcpu, va) >= 0)
- goto skip_fault;
-
- /*
- * If address not in the guest TLB, then give the guest a fault,
- * the resulting handler will do the right thing
- */
- index = kvm_mips_guest_tlb_lookup(vcpu, (va & VPN2_MASK) |
- (kvm_read_c0_guest_entryhi
- (cop0) & KVM_ENTRYHI_ASID));
-
- if (index < 0) {
- vcpu->arch.host_cp0_badvaddr = va;
- vcpu->arch.pc = curr_pc;
- er = kvm_mips_emulate_tlbmiss_ld(cause, NULL, run,
- vcpu);
- preempt_enable();
- goto dont_update_pc;
- } else {
- struct kvm_mips_tlb *tlb = &vcpu->arch.guest_tlb[index];
- /*
- * Check if the entry is valid, if not then setup a TLB
- * invalid exception to the guest
- */
- if (!TLB_IS_VALID(*tlb, va)) {
- vcpu->arch.host_cp0_badvaddr = va;
- vcpu->arch.pc = curr_pc;
- er = kvm_mips_emulate_tlbinv_ld(cause, NULL,
- run, vcpu);
- preempt_enable();
- goto dont_update_pc;
- }
- /*
- * We fault an entry from the guest tlb to the
- * shadow host TLB
- */
- if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) {
- kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n",
- __func__, va, index, vcpu,
- read_c0_entryhi());
- er = EMULATE_FAIL;
- preempt_enable();
- goto done;
- }
- }
- } else {
- kvm_err("INVALID CACHE INDEX/ADDRESS (cache: %#x, op: %#x, base[%d]: %#lx, offset: %#x\n",
- cache, op, base, arch->gprs[base], offset);
- er = EMULATE_FAIL;
- preempt_enable();
- goto done;
-
- }
-
-skip_fault:
/* XXXKYMA: Only a subset of cache ops are supported, used by Linux */
if (op_inst == Hit_Writeback_Inv_D || op_inst == Hit_Invalidate_D) {
- flush_dcache_line(va);
-
+ /*
+ * Perform the dcache part of icache synchronisation on the
+ * guest's behalf.
+ */
+ er = kvm_mips_guest_cache_op(protected_writeback_dcache_line,
+ curr_pc, va, run, vcpu, cause);
+ if (er != EMULATE_DONE)
+ goto done;
#ifdef CONFIG_KVM_MIPS_DYN_TRANS
/*
* Replace the CACHE instruction, with a SYNCI, not the same,
@@ -1783,8 +1820,15 @@ skip_fault:
kvm_mips_trans_cache_va(inst, opc, vcpu);
#endif
} else if (op_inst == Hit_Invalidate_I) {
- flush_dcache_line(va);
- flush_icache_line(va);
+ /* Perform the icache synchronisation on the guest's behalf */
+ er = kvm_mips_guest_cache_op(protected_writeback_dcache_line,
+ curr_pc, va, run, vcpu, cause);
+ if (er != EMULATE_DONE)
+ goto done;
+ er = kvm_mips_guest_cache_op(protected_flush_icache_line,
+ curr_pc, va, run, vcpu, cause);
+ if (er != EMULATE_DONE)
+ goto done;
#ifdef CONFIG_KVM_MIPS_DYN_TRANS
/* Replace the CACHE instruction, with a SYNCI */
@@ -1796,17 +1840,13 @@ skip_fault:
er = EMULATE_FAIL;
}
- preempt_enable();
done:
/* Rollback PC only if emulation was unsuccessful */
if (er == EMULATE_FAIL)
vcpu->arch.pc = curr_pc;
-
-dont_update_pc:
- /*
- * This is for exceptions whose emulation updates the PC, so do not
- * overwrite the PC under any circumstances
- */
+ /* Guest exception needs guest to resume */
+ if (er == EMULATE_EXCEPT)
+ er = EMULATE_DONE;
return er;
}
@@ -1817,12 +1857,14 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
{
union mips_instruction inst;
enum emulation_result er = EMULATE_DONE;
+ int err;
/* Fetch the instruction. */
if (cause & CAUSEF_BD)
opc += 1;
-
- inst.word = kvm_get_inst(opc, vcpu);
+ err = kvm_get_badinstr(opc, vcpu, &inst.word);
+ if (err)
+ return EMULATE_FAIL;
switch (inst.r_format.opcode) {
case cop0_op:
@@ -1874,6 +1916,22 @@ unknown:
return er;
}
+/**
+ * kvm_mips_guest_exception_base() - Find guest exception vector base address.
+ *
+ * Returns: The base address of the current guest exception vector, taking
+ * both Guest.CP0_Status.BEV and Guest.CP0_EBase into account.
+ */
+long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu)
+{
+ struct mips_coproc *cop0 = vcpu->arch.cop0;
+
+ if (kvm_read_c0_guest_status(cop0) & ST0_BEV)
+ return KVM_GUEST_CKSEG1ADDR(0x1fc00200);
+ else
+ return kvm_read_c0_guest_ebase(cop0) & MIPS_EBASE_BASE;
+}
+
enum emulation_result kvm_mips_emulate_syscall(u32 cause,
u32 *opc,
struct kvm_run *run,
@@ -1899,7 +1957,7 @@ enum emulation_result kvm_mips_emulate_syscall(u32 cause,
(EXCCODE_SYS << CAUSEB_EXCCODE));
/* Set PC to the exception entry point */
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
+ arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
} else {
kvm_err("Trying to deliver SYSCALL when EXL is already set\n");
@@ -1933,13 +1991,13 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
arch->pc);
/* set pc to the exception entry point */
- arch->pc = KVM_GUEST_KSEG0 + 0x0;
+ arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x0;
} else {
kvm_debug("[EXL == 1] delivering TLB MISS @ pc %#lx\n",
arch->pc);
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
+ arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
}
kvm_change_c0_guest_cause(cop0, (0xff),
@@ -1949,8 +2007,6 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
/* XXXKYMA: is the context register used by linux??? */
kvm_write_c0_guest_entryhi(cop0, entryhi);
- /* Blow away the shadow host TLBs */
- kvm_mips_flush_host_tlb(1);
return EMULATE_DONE;
}
@@ -1978,16 +2034,14 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
kvm_debug("[EXL == 0] delivering TLB INV @ pc %#lx\n",
arch->pc);
-
- /* set pc to the exception entry point */
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
-
} else {
kvm_debug("[EXL == 1] delivering TLB MISS @ pc %#lx\n",
arch->pc);
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
}
+ /* set pc to the exception entry point */
+ arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
+
kvm_change_c0_guest_cause(cop0, (0xff),
(EXCCODE_TLBL << CAUSEB_EXCCODE));
@@ -1995,8 +2049,6 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
/* XXXKYMA: is the context register used by linux??? */
kvm_write_c0_guest_entryhi(cop0, entryhi);
- /* Blow away the shadow host TLBs */
- kvm_mips_flush_host_tlb(1);
return EMULATE_DONE;
}
@@ -2025,11 +2077,11 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
arch->pc);
/* Set PC to the exception entry point */
- arch->pc = KVM_GUEST_KSEG0 + 0x0;
+ arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x0;
} else {
kvm_debug("[EXL == 1] Delivering TLB MISS @ pc %#lx\n",
arch->pc);
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
+ arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
}
kvm_change_c0_guest_cause(cop0, (0xff),
@@ -2039,8 +2091,6 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
/* XXXKYMA: is the context register used by linux??? */
kvm_write_c0_guest_entryhi(cop0, entryhi);
- /* Blow away the shadow host TLBs */
- kvm_mips_flush_host_tlb(1);
return EMULATE_DONE;
}
@@ -2067,15 +2117,14 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
kvm_debug("[EXL == 0] Delivering TLB MISS @ pc %#lx\n",
arch->pc);
-
- /* Set PC to the exception entry point */
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
} else {
kvm_debug("[EXL == 1] Delivering TLB MISS @ pc %#lx\n",
arch->pc);
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
}
+ /* Set PC to the exception entry point */
+ arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
+
kvm_change_c0_guest_cause(cop0, (0xff),
(EXCCODE_TLBS << CAUSEB_EXCCODE));
@@ -2083,41 +2132,10 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
/* XXXKYMA: is the context register used by linux??? */
kvm_write_c0_guest_entryhi(cop0, entryhi);
- /* Blow away the shadow host TLBs */
- kvm_mips_flush_host_tlb(1);
return EMULATE_DONE;
}
-/* TLBMOD: store into address matching TLB with Dirty bit off */
-enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc,
- struct kvm_run *run,
- struct kvm_vcpu *vcpu)
-{
- enum emulation_result er = EMULATE_DONE;
-#ifdef DEBUG
- struct mips_coproc *cop0 = vcpu->arch.cop0;
- unsigned long entryhi = (vcpu->arch.host_cp0_badvaddr & VPN2_MASK) |
- (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
- int index;
-
- /* If address not in the guest TLB, then we are in trouble */
- index = kvm_mips_guest_tlb_lookup(vcpu, entryhi);
- if (index < 0) {
- /* XXXKYMA Invalidate and retry */
- kvm_mips_host_tlb_inv(vcpu, vcpu->arch.host_cp0_badvaddr);
- kvm_err("%s: host got TLBMOD for %#lx but entry not present in Guest TLB\n",
- __func__, entryhi);
- kvm_mips_dump_guest_tlbs(vcpu);
- kvm_mips_dump_host_tlbs();
- return EMULATE_FAIL;
- }
-#endif
-
- er = kvm_mips_emulate_tlbmod(cause, opc, run, vcpu);
- return er;
-}
-
enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
u32 *opc,
struct kvm_run *run,
@@ -2140,14 +2158,13 @@ enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
kvm_debug("[EXL == 0] Delivering TLB MOD @ pc %#lx\n",
arch->pc);
-
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
} else {
kvm_debug("[EXL == 1] Delivering TLB MOD @ pc %#lx\n",
arch->pc);
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
}
+ arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
+
kvm_change_c0_guest_cause(cop0, (0xff),
(EXCCODE_MOD << CAUSEB_EXCCODE));
@@ -2155,8 +2172,6 @@ enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
/* XXXKYMA: is the context register used by linux??? */
kvm_write_c0_guest_entryhi(cop0, entryhi);
- /* Blow away the shadow host TLBs */
- kvm_mips_flush_host_tlb(1);
return EMULATE_DONE;
}
@@ -2181,7 +2196,7 @@ enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause,
}
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
+ arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
kvm_change_c0_guest_cause(cop0, (0xff),
(EXCCODE_CPU << CAUSEB_EXCCODE));
@@ -2215,7 +2230,7 @@ enum emulation_result kvm_mips_emulate_ri_exc(u32 cause,
(EXCCODE_RI << CAUSEB_EXCCODE));
/* Set PC to the exception entry point */
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
+ arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
} else {
kvm_err("Trying to deliver RI when EXL is already set\n");
@@ -2250,7 +2265,7 @@ enum emulation_result kvm_mips_emulate_bp_exc(u32 cause,
(EXCCODE_BP << CAUSEB_EXCCODE));
/* Set PC to the exception entry point */
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
+ arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
} else {
kvm_err("Trying to deliver BP when EXL is already set\n");
@@ -2285,7 +2300,7 @@ enum emulation_result kvm_mips_emulate_trap_exc(u32 cause,
(EXCCODE_TR << CAUSEB_EXCCODE));
/* Set PC to the exception entry point */
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
+ arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
} else {
kvm_err("Trying to deliver TRAP when EXL is already set\n");
@@ -2320,7 +2335,7 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause,
(EXCCODE_MSAFPE << CAUSEB_EXCCODE));
/* Set PC to the exception entry point */
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
+ arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
} else {
kvm_err("Trying to deliver MSAFPE when EXL is already set\n");
@@ -2355,7 +2370,7 @@ enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause,
(EXCCODE_FPE << CAUSEB_EXCCODE));
/* Set PC to the exception entry point */
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
+ arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
} else {
kvm_err("Trying to deliver FPE when EXL is already set\n");
@@ -2390,7 +2405,7 @@ enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause,
(EXCCODE_MSADIS << CAUSEB_EXCCODE));
/* Set PC to the exception entry point */
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
+ arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
} else {
kvm_err("Trying to deliver MSADIS when EXL is already set\n");
@@ -2409,6 +2424,7 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
enum emulation_result er = EMULATE_DONE;
unsigned long curr_pc;
union mips_instruction inst;
+ int err;
/*
* Update PC and hold onto current PC in case there is
@@ -2422,11 +2438,9 @@ enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
/* Fetch the instruction. */
if (cause & CAUSEF_BD)
opc += 1;
-
- inst.word = kvm_get_inst(opc, vcpu);
-
- if (inst.word == KVM_INVALID_INST) {
- kvm_err("%s: Cannot get inst @ %p\n", __func__, opc);
+ err = kvm_get_badinstr(opc, vcpu, &inst.word);
+ if (err) {
+ kvm_err("%s: Cannot get inst @ %p (%d)\n", __func__, opc, err);
return EMULATE_FAIL;
}
@@ -2557,7 +2571,7 @@ static enum emulation_result kvm_mips_emulate_exc(u32 cause,
(exccode << CAUSEB_EXCCODE));
/* Set PC to the exception entry point */
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
+ arch->pc = kvm_mips_guest_exception_base(vcpu) + 0x180;
kvm_write_c0_guest_badvaddr(cop0, vcpu->arch.host_cp0_badvaddr);
kvm_debug("Delivering EXC %d @ pc %#lx, badVaddr: %#lx\n",
@@ -2670,7 +2684,8 @@ enum emulation_result kvm_mips_check_privilege(u32 cause,
enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
u32 *opc,
struct kvm_run *run,
- struct kvm_vcpu *vcpu)
+ struct kvm_vcpu *vcpu,
+ bool write_fault)
{
enum emulation_result er = EMULATE_DONE;
u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
@@ -2726,7 +2741,8 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
* OK we have a Guest TLB entry, now inject it into the
* shadow host TLB
*/
- if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb)) {
+ if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, va,
+ write_fault)) {
kvm_err("%s: handling mapped seg tlb fault for %lx, index: %u, vcpu: %p, ASID: %#lx\n",
__func__, va, index, vcpu,
read_c0_entryhi());
diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
index e92fb190e2d6..c5b254c4d0da 100644
--- a/arch/mips/kvm/entry.c
+++ b/arch/mips/kvm/entry.c
@@ -12,8 +12,11 @@
*/
#include <linux/kvm_host.h>
+#include <linux/log2.h>
+#include <asm/mmu_context.h>
#include <asm/msa.h>
#include <asm/setup.h>
+#include <asm/tlbex.h>
#include <asm/uasm.h>
/* Register names */
@@ -50,6 +53,8 @@
/* Some CP0 registers */
#define C0_HWRENA 7, 0
#define C0_BADVADDR 8, 0
+#define C0_BADINSTR 8, 1
+#define C0_BADINSTRP 8, 2
#define C0_ENTRYHI 10, 0
#define C0_STATUS 12, 0
#define C0_CAUSE 13, 0
@@ -89,6 +94,21 @@ static void *kvm_mips_build_ret_from_exit(void *addr);
static void *kvm_mips_build_ret_to_guest(void *addr);
static void *kvm_mips_build_ret_to_host(void *addr);
+/*
+ * The version of this function in tlbex.c uses current_cpu_type(), but for KVM
+ * we assume symmetry.
+ */
+static int c0_kscratch(void)
+{
+ switch (boot_cpu_type()) {
+ case CPU_XLP:
+ case CPU_XLR:
+ return 22;
+ default:
+ return 31;
+ }
+}
+
/**
* kvm_mips_entry_setup() - Perform global setup for entry code.
*
@@ -103,18 +123,21 @@ int kvm_mips_entry_setup(void)
* We prefer to use KScratchN registers if they are available over the
* defaults above, which may not work on all cores.
*/
- unsigned int kscratch_mask = cpu_data[0].kscratch_mask & 0xfc;
+ unsigned int kscratch_mask = cpu_data[0].kscratch_mask;
+
+ if (pgd_reg != -1)
+ kscratch_mask &= ~BIT(pgd_reg);
/* Pick a scratch register for storing VCPU */
if (kscratch_mask) {
- scratch_vcpu[0] = 31;
+ scratch_vcpu[0] = c0_kscratch();
scratch_vcpu[1] = ffs(kscratch_mask) - 1;
kscratch_mask &= ~BIT(scratch_vcpu[1]);
}
/* Pick a scratch register to use as a temp for saving state */
if (kscratch_mask) {
- scratch_tmp[0] = 31;
+ scratch_tmp[0] = c0_kscratch();
scratch_tmp[1] = ffs(kscratch_mask) - 1;
kscratch_mask &= ~BIT(scratch_tmp[1]);
}
@@ -130,7 +153,7 @@ static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp,
UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
/* Save the temp scratch register value in cp0_cause of stack frame */
- if (scratch_tmp[0] == 31) {
+ if (scratch_tmp[0] == c0_kscratch()) {
UASM_i_MFC0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
}
@@ -146,7 +169,7 @@ static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp,
UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
UASM_i_MTC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
- if (scratch_tmp[0] == 31) {
+ if (scratch_tmp[0] == c0_kscratch()) {
UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
UASM_i_MTC0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
}
@@ -286,23 +309,26 @@ static void *kvm_mips_build_enter_guest(void *addr)
uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL);
uasm_i_xori(&p, T0, T0, KSU_USER);
uasm_il_bnez(&p, &r, T0, label_kernel_asid);
- UASM_i_ADDIU(&p, T1, K1,
- offsetof(struct kvm_vcpu_arch, guest_kernel_asid));
+ UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch,
+ guest_kernel_mm.context.asid));
/* else user */
- UASM_i_ADDIU(&p, T1, K1,
- offsetof(struct kvm_vcpu_arch, guest_user_asid));
+ UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch,
+ guest_user_mm.context.asid));
uasm_l_kernel_asid(&l, p);
/* t1: contains the base of the ASID array, need to get the cpu id */
/* smp_processor_id */
uasm_i_lw(&p, T2, offsetof(struct thread_info, cpu), GP);
- /* x4 */
- uasm_i_sll(&p, T2, T2, 2);
+ /* index the ASID array */
+ uasm_i_sll(&p, T2, T2, ilog2(sizeof(long)));
UASM_i_ADDU(&p, T3, T1, T2);
- uasm_i_lw(&p, K0, 0, T3);
+ UASM_i_LW(&p, K0, 0, T3);
#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
- /* x sizeof(struct cpuinfo_mips)/4 */
- uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4);
+ /*
+ * reuse ASID array offset
+ * cpuinfo_mips is a multiple of sizeof(long)
+ */
+ uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/sizeof(long));
uasm_i_mul(&p, T2, T2, T3);
UASM_i_LA_mostly(&p, AT, (long)&cpu_data[0].asid_mask);
@@ -312,7 +338,20 @@ static void *kvm_mips_build_enter_guest(void *addr)
#else
uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID);
#endif
- uasm_i_mtc0(&p, K0, C0_ENTRYHI);
+
+ /*
+ * Set up KVM T&E GVA pgd.
+ * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD():
+ * - call tlbmiss_handler_setup_pgd(mm->pgd)
+ * - but skips write into CP0_PWBase for now
+ */
+ UASM_i_LW(&p, A0, (int)offsetof(struct mm_struct, pgd) -
+ (int)offsetof(struct mm_struct, context.asid), T1);
+
+ UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd);
+ uasm_i_jalr(&p, RA, T9);
+ uasm_i_mtc0(&p, K0, C0_ENTRYHI);
+
uasm_i_ehb(&p);
/* Disable RDHWR access */
@@ -348,6 +387,80 @@ static void *kvm_mips_build_enter_guest(void *addr)
}
/**
+ * kvm_mips_build_tlb_refill_exception() - Assemble TLB refill handler.
+ * @addr: Address to start writing code.
+ * @handler: Address of common handler (within range of @addr).
+ *
+ * Assemble TLB refill exception fast path handler for guest execution.
+ *
+ * Returns: Next address after end of written function.
+ */
+void *kvm_mips_build_tlb_refill_exception(void *addr, void *handler)
+{
+ u32 *p = addr;
+ struct uasm_label labels[2];
+ struct uasm_reloc relocs[2];
+ struct uasm_label *l = labels;
+ struct uasm_reloc *r = relocs;
+
+ memset(labels, 0, sizeof(labels));
+ memset(relocs, 0, sizeof(relocs));
+
+ /* Save guest k1 into scratch register */
+ UASM_i_MTC0(&p, K1, scratch_tmp[0], scratch_tmp[1]);
+
+ /* Get the VCPU pointer from the VCPU scratch register */
+ UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]);
+
+ /* Save guest k0 into VCPU structure */
+ UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu, arch.gprs[K0]), K1);
+
+ /*
+ * Some of the common tlbex code uses current_cpu_type(). For KVM we
+ * assume symmetry and just disable preemption to silence the warning.
+ */
+ preempt_disable();
+
+ /*
+ * Now for the actual refill bit. A lot of this can be common with the
+ * Linux TLB refill handler, however we don't need to handle so many
+ * cases. We only need to handle user mode refills, and user mode runs
+ * with 32-bit addressing.
+ *
+ * Therefore the branch to label_vmalloc generated by build_get_pmde64()
+ * that isn't resolved should never actually get taken and is harmless
+ * to leave in place for now.
+ */
+
+#ifdef CONFIG_64BIT
+ build_get_pmde64(&p, &l, &r, K0, K1); /* get pmd in K1 */
+#else
+ build_get_pgde32(&p, K0, K1); /* get pgd in K1 */
+#endif
+
+ /* we don't support huge pages yet */
+
+ build_get_ptep(&p, K0, K1);
+ build_update_entries(&p, K0, K1);
+ build_tlb_write_entry(&p, &l, &r, tlb_random);
+
+ preempt_enable();
+
+ /* Get the VCPU pointer from the VCPU scratch register again */
+ UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]);
+
+ /* Restore the guest's k0/k1 registers */
+ UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu, arch.gprs[K0]), K1);
+ uasm_i_ehb(&p);
+ UASM_i_MFC0(&p, K1, scratch_tmp[0], scratch_tmp[1]);
+
+ /* Jump to guest */
+ uasm_i_eret(&p);
+
+ return p;
+}
+
+/**
* kvm_mips_build_exception() - Assemble first level guest exception handler.
* @addr: Address to start writing code.
* @handler: Address of common handler (within range of @addr).
@@ -468,6 +581,18 @@ void *kvm_mips_build_exit(void *addr)
uasm_i_mfc0(&p, K0, C0_CAUSE);
uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_cause), K1);
+ if (cpu_has_badinstr) {
+ uasm_i_mfc0(&p, K0, C0_BADINSTR);
+ uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch,
+ host_cp0_badinstr), K1);
+ }
+
+ if (cpu_has_badinstrp) {
+ uasm_i_mfc0(&p, K0, C0_BADINSTRP);
+ uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch,
+ host_cp0_badinstrp), K1);
+ }
+
/* Now restore the host state just enough to run the handlers */
/* Switch EBASE to the one used by Linux */
diff --git a/arch/mips/kvm/interrupt.c b/arch/mips/kvm/interrupt.c
index e88403b3dcdd..aa0a1a00faf6 100644
--- a/arch/mips/kvm/interrupt.c
+++ b/arch/mips/kvm/interrupt.c
@@ -183,10 +183,11 @@ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
(exccode << CAUSEB_EXCCODE));
/* XXXSL Set PC to the interrupt exception entry point */
+ arch->pc = kvm_mips_guest_exception_base(vcpu);
if (kvm_read_c0_guest_cause(cop0) & CAUSEF_IV)
- arch->pc = KVM_GUEST_KSEG0 + 0x200;
+ arch->pc += 0x200;
else
- arch->pc = KVM_GUEST_KSEG0 + 0x180;
+ arch->pc += 0x180;
clear_bit(priority, &vcpu->arch.pending_exceptions);
}
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 29ec9ab3fd55..ed81e5ac1426 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -22,6 +22,7 @@
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <linux/kvm_host.h>
@@ -63,18 +64,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{NULL}
};
-static int kvm_mips_reset_vcpu(struct kvm_vcpu *vcpu)
-{
- int i;
-
- for_each_possible_cpu(i) {
- vcpu->arch.guest_kernel_asid[i] = 0;
- vcpu->arch.guest_user_asid[i] = 0;
- }
-
- return 0;
-}
-
/*
* XXXKYMA: We are simulatoring a processor that has the WII bit set in
* Config7, so we are "runnable" if interrupts are pending
@@ -104,39 +93,12 @@ void kvm_arch_check_processor_compat(void *rtn)
*(int *)rtn = 0;
}
-static void kvm_mips_init_tlbs(struct kvm *kvm)
-{
- unsigned long wired;
-
- /*
- * Add a wired entry to the TLB, it is used to map the commpage to
- * the Guest kernel
- */
- wired = read_c0_wired();
- write_c0_wired(wired + 1);
- mtc0_tlbw_hazard();
- kvm->arch.commpage_tlb = wired;
-
- kvm_debug("[%d] commpage TLB: %d\n", smp_processor_id(),
- kvm->arch.commpage_tlb);
-}
-
-static void kvm_mips_init_vm_percpu(void *arg)
-{
- struct kvm *kvm = (struct kvm *)arg;
-
- kvm_mips_init_tlbs(kvm);
- kvm_mips_callbacks->vm_init(kvm);
-
-}
-
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
- if (atomic_inc_return(&kvm_mips_instance) == 1) {
- kvm_debug("%s: 1st KVM instance, setup host TLB parameters\n",
- __func__);
- on_each_cpu(kvm_mips_init_vm_percpu, kvm, 1);
- }
+ /* Allocate page table to map GPA -> RPA */
+ kvm->arch.gpa_mm.pgd = kvm_pgd_alloc();
+ if (!kvm->arch.gpa_mm.pgd)
+ return -ENOMEM;
return 0;
}
@@ -156,13 +118,6 @@ void kvm_mips_free_vcpus(struct kvm *kvm)
unsigned int i;
struct kvm_vcpu *vcpu;
- /* Put the pages we reserved for the guest pmap */
- for (i = 0; i < kvm->arch.guest_pmap_npages; i++) {
- if (kvm->arch.guest_pmap[i] != KVM_INVALID_PAGE)
- kvm_release_pfn_clean(kvm->arch.guest_pmap[i]);
- }
- kfree(kvm->arch.guest_pmap);
-
kvm_for_each_vcpu(i, vcpu, kvm) {
kvm_arch_vcpu_free(vcpu);
}
@@ -177,25 +132,17 @@ void kvm_mips_free_vcpus(struct kvm *kvm)
mutex_unlock(&kvm->lock);
}
-static void kvm_mips_uninit_tlbs(void *arg)
+static void kvm_mips_free_gpa_pt(struct kvm *kvm)
{
- /* Restore wired count */
- write_c0_wired(0);
- mtc0_tlbw_hazard();
- /* Clear out all the TLBs */
- kvm_local_flush_tlb_all();
+ /* It should always be safe to remove after flushing the whole range */
+ WARN_ON(!kvm_mips_flush_gpa_pt(kvm, 0, ~0));
+ pgd_free(NULL, kvm->arch.gpa_mm.pgd);
}
void kvm_arch_destroy_vm(struct kvm *kvm)
{
kvm_mips_free_vcpus(kvm);
-
- /* If this is the last instance, restore wired count */
- if (atomic_dec_return(&kvm_mips_instance) == 0) {
- kvm_debug("%s: last KVM instance, restoring TLB parameters\n",
- __func__);
- on_each_cpu(kvm_mips_uninit_tlbs, NULL, 1);
- }
+ kvm_mips_free_gpa_pt(kvm);
}
long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl,
@@ -210,6 +157,32 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
return 0;
}
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+ /* Flush whole GPA */
+ kvm_mips_flush_gpa_pt(kvm, 0, ~0);
+
+ /* Let implementation do the rest */
+ kvm_mips_callbacks->flush_shadow_all(kvm);
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+ struct kvm_memory_slot *slot)
+{
+ /*
+ * The slot has been made invalid (ready for moving or deletion), so we
+ * need to ensure that it can no longer be accessed by any guest VCPUs.
+ */
+
+ spin_lock(&kvm->mmu_lock);
+ /* Flush slot from GPA */
+ kvm_mips_flush_gpa_pt(kvm, slot->base_gfn,
+ slot->base_gfn + slot->npages - 1);
+ /* Let implementation do the rest */
+ kvm_mips_callbacks->flush_shadow_memslot(kvm, slot);
+ spin_unlock(&kvm->mmu_lock);
+}
+
int kvm_arch_prepare_memory_region(struct kvm *kvm,
struct kvm_memory_slot *memslot,
const struct kvm_userspace_memory_region *mem,
@@ -224,35 +197,32 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- unsigned long npages = 0;
- int i;
+ int needs_flush;
kvm_debug("%s: kvm: %p slot: %d, GPA: %llx, size: %llx, QVA: %llx\n",
__func__, kvm, mem->slot, mem->guest_phys_addr,
mem->memory_size, mem->userspace_addr);
- /* Setup Guest PMAP table */
- if (!kvm->arch.guest_pmap) {
- if (mem->slot == 0)
- npages = mem->memory_size >> PAGE_SHIFT;
-
- if (npages) {
- kvm->arch.guest_pmap_npages = npages;
- kvm->arch.guest_pmap =
- kzalloc(npages * sizeof(unsigned long), GFP_KERNEL);
-
- if (!kvm->arch.guest_pmap) {
- kvm_err("Failed to allocate guest PMAP\n");
- return;
- }
-
- kvm_debug("Allocated space for Guest PMAP Table (%ld pages) @ %p\n",
- npages, kvm->arch.guest_pmap);
-
- /* Now setup the page table */
- for (i = 0; i < npages; i++)
- kvm->arch.guest_pmap[i] = KVM_INVALID_PAGE;
- }
+ /*
+ * If dirty page logging is enabled, write protect all pages in the slot
+ * ready for dirty logging.
+ *
+ * There is no need to do this in any of the following cases:
+ * CREATE: No dirty mappings will already exist.
+ * MOVE/DELETE: The old mappings will already have been cleaned up by
+ * kvm_arch_flush_shadow_memslot()
+ */
+ if (change == KVM_MR_FLAGS_ONLY &&
+ (!(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
+ new->flags & KVM_MEM_LOG_DIRTY_PAGES)) {
+ spin_lock(&kvm->mmu_lock);
+ /* Write protect GPA page table entries */
+ needs_flush = kvm_mips_mkclean_gpa_pt(kvm, new->base_gfn,
+ new->base_gfn + new->npages - 1);
+ /* Let implementation do the rest */
+ if (needs_flush)
+ kvm_mips_callbacks->flush_shadow_memslot(kvm, new);
+ spin_unlock(&kvm->mmu_lock);
}
}
@@ -276,7 +246,7 @@ static inline void dump_handler(const char *symbol, void *start, void *end)
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
{
int err, size;
- void *gebase, *p, *handler;
+ void *gebase, *p, *handler, *refill_start, *refill_end;
int i;
struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
@@ -329,8 +299,9 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
/* Build guest exception vectors dynamically in unmapped memory */
handler = gebase + 0x2000;
- /* TLB Refill, EXL = 0 */
- kvm_mips_build_exception(gebase, handler);
+ /* TLB refill */
+ refill_start = gebase;
+ refill_end = kvm_mips_build_tlb_refill_exception(refill_start, handler);
/* General Exception Entry point */
kvm_mips_build_exception(gebase + 0x180, handler);
@@ -356,6 +327,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
pr_debug("#include <asm/regdef.h>\n");
pr_debug("\n");
dump_handler("kvm_vcpu_run", vcpu->arch.vcpu_run, p);
+ dump_handler("kvm_tlb_refill", refill_start, refill_end);
dump_handler("kvm_gen_exc", gebase + 0x180, gebase + 0x200);
dump_handler("kvm_exit", gebase + 0x2000, vcpu->arch.vcpu_run);
@@ -406,6 +378,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
kvm_mips_dump_stats(vcpu);
+ kvm_mmu_free_memory_caches(vcpu);
kfree(vcpu->arch.guest_ebase);
kfree(vcpu->arch.kseg0_commpage);
kfree(vcpu);
@@ -422,37 +395,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
return -ENOIOCTLCMD;
}
-/* Must be called with preemption disabled, just before entering guest */
-static void kvm_mips_check_asids(struct kvm_vcpu *vcpu)
-{
- struct mips_coproc *cop0 = vcpu->arch.cop0;
- int i, cpu = smp_processor_id();
- unsigned int gasid;
-
- /*
- * Lazy host ASID regeneration for guest user mode.
- * If the guest ASID has changed since the last guest usermode
- * execution, regenerate the host ASID so as to invalidate stale TLB
- * entries.
- */
- if (!KVM_GUEST_KERNEL_MODE(vcpu)) {
- gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID;
- if (gasid != vcpu->arch.last_user_gasid) {
- kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu,
- vcpu);
- vcpu->arch.guest_user_asid[cpu] =
- vcpu->arch.guest_user_mm.context.asid[cpu];
- for_each_possible_cpu(i)
- if (i != cpu)
- vcpu->arch.guest_user_asid[cpu] = 0;
- vcpu->arch.last_user_gasid = gasid;
- }
- }
-}
-
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
- int r = 0;
+ int r = -EINTR;
sigset_t sigsaved;
if (vcpu->sigset_active)
@@ -464,31 +409,30 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
vcpu->mmio_needed = 0;
}
+ if (run->immediate_exit)
+ goto out;
+
lose_fpu(1);
local_irq_disable();
- /* Check if we have any exceptions/interrupts pending */
- kvm_mips_deliver_interrupts(vcpu,
- kvm_read_c0_guest_cause(vcpu->arch.cop0));
-
guest_enter_irqoff();
-
- /* Disable hardware page table walking while in guest */
- htw_stop();
-
trace_kvm_enter(vcpu);
- kvm_mips_check_asids(vcpu);
-
- r = vcpu->arch.vcpu_run(run, vcpu);
- trace_kvm_out(vcpu);
+ /*
+ * Make sure the read of VCPU requests in vcpu_run() callback is not
+ * reordered ahead of the write to vcpu->mode, or we could miss a TLB
+ * flush request while the requester sees the VCPU as outside of guest
+ * mode and not needing an IPI.
+ */
+ smp_store_mb(vcpu->mode, IN_GUEST_MODE);
- /* Re-enable HTW before enabling interrupts */
- htw_start();
+ r = kvm_mips_callbacks->vcpu_run(run, vcpu);
+ trace_kvm_out(vcpu);
guest_exit_irqoff();
local_irq_enable();
+out:
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
@@ -580,33 +524,6 @@ static u64 kvm_mips_get_one_regs[] = {
KVM_REG_MIPS_LO,
#endif
KVM_REG_MIPS_PC,
-
- KVM_REG_MIPS_CP0_INDEX,
- KVM_REG_MIPS_CP0_CONTEXT,
- KVM_REG_MIPS_CP0_USERLOCAL,
- KVM_REG_MIPS_CP0_PAGEMASK,
- KVM_REG_MIPS_CP0_WIRED,
- KVM_REG_MIPS_CP0_HWRENA,
- KVM_REG_MIPS_CP0_BADVADDR,
- KVM_REG_MIPS_CP0_COUNT,
- KVM_REG_MIPS_CP0_ENTRYHI,
- KVM_REG_MIPS_CP0_COMPARE,
- KVM_REG_MIPS_CP0_STATUS,
- KVM_REG_MIPS_CP0_CAUSE,
- KVM_REG_MIPS_CP0_EPC,
- KVM_REG_MIPS_CP0_PRID,
- KVM_REG_MIPS_CP0_CONFIG,
- KVM_REG_MIPS_CP0_CONFIG1,
- KVM_REG_MIPS_CP0_CONFIG2,
- KVM_REG_MIPS_CP0_CONFIG3,
- KVM_REG_MIPS_CP0_CONFIG4,
- KVM_REG_MIPS_CP0_CONFIG5,
- KVM_REG_MIPS_CP0_CONFIG7,
- KVM_REG_MIPS_CP0_ERROREPC,
-
- KVM_REG_MIPS_COUNT_CTL,
- KVM_REG_MIPS_COUNT_RESUME,
- KVM_REG_MIPS_COUNT_HZ,
};
static u64 kvm_mips_get_one_regs_fpu[] = {
@@ -619,15 +536,6 @@ static u64 kvm_mips_get_one_regs_msa[] = {
KVM_REG_MIPS_MSA_CSR,
};
-static u64 kvm_mips_get_one_regs_kscratch[] = {
- KVM_REG_MIPS_CP0_KSCRATCH1,
- KVM_REG_MIPS_CP0_KSCRATCH2,
- KVM_REG_MIPS_CP0_KSCRATCH3,
- KVM_REG_MIPS_CP0_KSCRATCH4,
- KVM_REG_MIPS_CP0_KSCRATCH5,
- KVM_REG_MIPS_CP0_KSCRATCH6,
-};
-
static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu)
{
unsigned long ret;
@@ -641,7 +549,6 @@ static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu)
}
if (kvm_mips_guest_can_have_msa(&vcpu->arch))
ret += ARRAY_SIZE(kvm_mips_get_one_regs_msa) + 32;
- ret += __arch_hweight8(vcpu->arch.kscratch_enabled);
ret += kvm_mips_callbacks->num_regs(vcpu);
return ret;
@@ -694,16 +601,6 @@ static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
}
}
- for (i = 0; i < 6; ++i) {
- if (!(vcpu->arch.kscratch_enabled & BIT(i + 2)))
- continue;
-
- if (copy_to_user(indices, &kvm_mips_get_one_regs_kscratch[i],
- sizeof(kvm_mips_get_one_regs_kscratch[i])))
- return -EFAULT;
- ++indices;
- }
-
return kvm_mips_callbacks->copy_reg_indices(vcpu, indices);
}
@@ -794,95 +691,6 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
v = fpu->msacsr;
break;
- /* Co-processor 0 registers */
- case KVM_REG_MIPS_CP0_INDEX:
- v = (long)kvm_read_c0_guest_index(cop0);
- break;
- case KVM_REG_MIPS_CP0_CONTEXT:
- v = (long)kvm_read_c0_guest_context(cop0);
- break;
- case KVM_REG_MIPS_CP0_USERLOCAL:
- v = (long)kvm_read_c0_guest_userlocal(cop0);
- break;
- case KVM_REG_MIPS_CP0_PAGEMASK:
- v = (long)kvm_read_c0_guest_pagemask(cop0);
- break;
- case KVM_REG_MIPS_CP0_WIRED:
- v = (long)kvm_read_c0_guest_wired(cop0);
- break;
- case KVM_REG_MIPS_CP0_HWRENA:
- v = (long)kvm_read_c0_guest_hwrena(cop0);
- break;
- case KVM_REG_MIPS_CP0_BADVADDR:
- v = (long)kvm_read_c0_guest_badvaddr(cop0);
- break;
- case KVM_REG_MIPS_CP0_ENTRYHI:
- v = (long)kvm_read_c0_guest_entryhi(cop0);
- break;
- case KVM_REG_MIPS_CP0_COMPARE:
- v = (long)kvm_read_c0_guest_compare(cop0);
- break;
- case KVM_REG_MIPS_CP0_STATUS:
- v = (long)kvm_read_c0_guest_status(cop0);
- break;
- case KVM_REG_MIPS_CP0_CAUSE:
- v = (long)kvm_read_c0_guest_cause(cop0);
- break;
- case KVM_REG_MIPS_CP0_EPC:
- v = (long)kvm_read_c0_guest_epc(cop0);
- break;
- case KVM_REG_MIPS_CP0_PRID:
- v = (long)kvm_read_c0_guest_prid(cop0);
- break;
- case KVM_REG_MIPS_CP0_CONFIG:
- v = (long)kvm_read_c0_guest_config(cop0);
- break;
- case KVM_REG_MIPS_CP0_CONFIG1:
- v = (long)kvm_read_c0_guest_config1(cop0);
- break;
- case KVM_REG_MIPS_CP0_CONFIG2:
- v = (long)kvm_read_c0_guest_config2(cop0);
- break;
- case KVM_REG_MIPS_CP0_CONFIG3:
- v = (long)kvm_read_c0_guest_config3(cop0);
- break;
- case KVM_REG_MIPS_CP0_CONFIG4:
- v = (long)kvm_read_c0_guest_config4(cop0);
- break;
- case KVM_REG_MIPS_CP0_CONFIG5:
- v = (long)kvm_read_c0_guest_config5(cop0);
- break;
- case KVM_REG_MIPS_CP0_CONFIG7:
- v = (long)kvm_read_c0_guest_config7(cop0);
- break;
- case KVM_REG_MIPS_CP0_ERROREPC:
- v = (long)kvm_read_c0_guest_errorepc(cop0);
- break;
- case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
- idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
- if (!(vcpu->arch.kscratch_enabled & BIT(idx)))
- return -EINVAL;
- switch (idx) {
- case 2:
- v = (long)kvm_read_c0_guest_kscratch1(cop0);
- break;
- case 3:
- v = (long)kvm_read_c0_guest_kscratch2(cop0);
- break;
- case 4:
- v = (long)kvm_read_c0_guest_kscratch3(cop0);
- break;
- case 5:
- v = (long)kvm_read_c0_guest_kscratch4(cop0);
- break;
- case 6:
- v = (long)kvm_read_c0_guest_kscratch5(cop0);
- break;
- case 7:
- v = (long)kvm_read_c0_guest_kscratch6(cop0);
- break;
- }
- break;
/* registers to be handled specially */
default:
ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v);
@@ -1014,68 +822,6 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
fpu->msacsr = v;
break;
- /* Co-processor 0 registers */
- case KVM_REG_MIPS_CP0_INDEX:
- kvm_write_c0_guest_index(cop0, v);
- break;
- case KVM_REG_MIPS_CP0_CONTEXT:
- kvm_write_c0_guest_context(cop0, v);
- break;
- case KVM_REG_MIPS_CP0_USERLOCAL:
- kvm_write_c0_guest_userlocal(cop0, v);
- break;
- case KVM_REG_MIPS_CP0_PAGEMASK:
- kvm_write_c0_guest_pagemask(cop0, v);
- break;
- case KVM_REG_MIPS_CP0_WIRED:
- kvm_write_c0_guest_wired(cop0, v);
- break;
- case KVM_REG_MIPS_CP0_HWRENA:
- kvm_write_c0_guest_hwrena(cop0, v);
- break;
- case KVM_REG_MIPS_CP0_BADVADDR:
- kvm_write_c0_guest_badvaddr(cop0, v);
- break;
- case KVM_REG_MIPS_CP0_ENTRYHI:
- kvm_write_c0_guest_entryhi(cop0, v);
- break;
- case KVM_REG_MIPS_CP0_STATUS:
- kvm_write_c0_guest_status(cop0, v);
- break;
- case KVM_REG_MIPS_CP0_EPC:
- kvm_write_c0_guest_epc(cop0, v);
- break;
- case KVM_REG_MIPS_CP0_PRID:
- kvm_write_c0_guest_prid(cop0, v);
- break;
- case KVM_REG_MIPS_CP0_ERROREPC:
- kvm_write_c0_guest_errorepc(cop0, v);
- break;
- case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
- idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
- if (!(vcpu->arch.kscratch_enabled & BIT(idx)))
- return -EINVAL;
- switch (idx) {
- case 2:
- kvm_write_c0_guest_kscratch1(cop0, v);
- break;
- case 3:
- kvm_write_c0_guest_kscratch2(cop0, v);
- break;
- case 4:
- kvm_write_c0_guest_kscratch3(cop0, v);
- break;
- case 5:
- kvm_write_c0_guest_kscratch4(cop0, v);
- break;
- case 6:
- kvm_write_c0_guest_kscratch5(cop0, v);
- break;
- case 7:
- kvm_write_c0_guest_kscratch6(cop0, v);
- break;
- }
- break;
/* registers to be handled specially */
default:
return kvm_mips_callbacks->set_one_reg(vcpu, reg, v);
@@ -1144,18 +890,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
return -E2BIG;
return kvm_mips_copy_reg_indices(vcpu, user_list->reg);
}
- case KVM_NMI:
- /* Treat the NMI as a CPU reset */
- r = kvm_mips_reset_vcpu(vcpu);
- break;
case KVM_INTERRUPT:
{
struct kvm_mips_interrupt irq;
- r = -EFAULT;
if (copy_from_user(&irq, argp, sizeof(irq)))
- goto out;
-
+ return -EFAULT;
kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__,
irq.irq);
@@ -1165,56 +905,57 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
case KVM_ENABLE_CAP: {
struct kvm_enable_cap cap;
- r = -EFAULT;
if (copy_from_user(&cap, argp, sizeof(cap)))
- goto out;
+ return -EFAULT;
r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
break;
}
default:
r = -ENOIOCTLCMD;
}
-
-out:
return r;
}
-/* Get (and clear) the dirty memory log for a memory slot. */
+/**
+ * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
+ * @kvm: kvm instance
+ * @log: slot id and address to which we copy the log
+ *
+ * Steps 1-4 below provide general overview of dirty page logging. See
+ * kvm_get_dirty_log_protect() function description for additional details.
+ *
+ * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
+ * always flush the TLB (step 4) even if previous step failed and the dirty
+ * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
+ * does not preclude user space subsequent dirty log read. Flushing TLB ensures
+ * writes will be marked dirty for next log read.
+ *
+ * 1. Take a snapshot of the bit and clear it if needed.
+ * 2. Write protect the corresponding page.
+ * 3. Copy the snapshot to the userspace.
+ * 4. Flush TLB's if needed.
+ */
int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
- unsigned long ga, ga_end;
- int is_dirty = 0;
+ bool is_dirty = false;
int r;
- unsigned long n;
mutex_lock(&kvm->slots_lock);
- r = kvm_get_dirty_log(kvm, log, &is_dirty);
- if (r)
- goto out;
+ r = kvm_get_dirty_log_protect(kvm, log, &is_dirty);
- /* If nothing is dirty, don't bother messing with page tables. */
if (is_dirty) {
slots = kvm_memslots(kvm);
memslot = id_to_memslot(slots, log->slot);
- ga = memslot->base_gfn << PAGE_SHIFT;
- ga_end = ga + (memslot->npages << PAGE_SHIFT);
-
- kvm_info("%s: dirty, ga: %#lx, ga_end %#lx\n", __func__, ga,
- ga_end);
-
- n = kvm_dirty_bitmap_bytes(memslot);
- memset(memslot->dirty_bitmap, 0, n);
+ /* Let implementation handle TLB/GVA invalidation */
+ kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot);
}
- r = 0;
-out:
mutex_unlock(&kvm->slots_lock);
return r;
-
}
long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
@@ -1282,11 +1023,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
switch (ext) {
case KVM_CAP_ONE_REG:
case KVM_CAP_ENABLE_CAP:
+ case KVM_CAP_READONLY_MEM:
+ case KVM_CAP_SYNC_MMU:
+ case KVM_CAP_IMMEDIATE_EXIT:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
r = KVM_COALESCED_MMIO_PAGE_OFFSET;
break;
+ case KVM_CAP_NR_VCPUS:
+ r = num_online_cpus();
+ break;
+ case KVM_CAP_MAX_VCPUS:
+ r = KVM_MAX_VCPUS;
+ break;
case KVM_CAP_MIPS_FPU:
/* We don't handle systems with inconsistent cpu_has_fpu */
r = !!raw_cpu_has_fpu;
@@ -1400,13 +1150,23 @@ static enum hrtimer_restart kvm_mips_comparecount_wakeup(struct hrtimer *timer)
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
- kvm_mips_callbacks->vcpu_init(vcpu);
+ int err;
+
+ err = kvm_mips_callbacks->vcpu_init(vcpu);
+ if (err)
+ return err;
+
hrtimer_init(&vcpu->arch.comparecount_timer, CLOCK_MONOTONIC,
HRTIMER_MODE_REL);
vcpu->arch.comparecount_timer.function = kvm_mips_comparecount_wakeup;
return 0;
}
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+ kvm_mips_callbacks->vcpu_uninit(vcpu);
+}
+
int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
struct kvm_translation *tr)
{
@@ -1440,8 +1200,11 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
u32 __user *opc = (u32 __user *) vcpu->arch.pc;
unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
enum emulation_result er = EMULATE_DONE;
+ u32 inst;
int ret = RESUME_GUEST;
+ vcpu->mode = OUTSIDE_GUEST_MODE;
+
/* re-enable HTW before enabling interrupts */
htw_start();
@@ -1564,8 +1327,12 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
break;
default:
+ if (cause & CAUSEF_BD)
+ opc += 1;
+ inst = 0;
+ kvm_get_badinstr(opc, vcpu, &inst);
kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#lx\n",
- exccode, opc, kvm_get_inst(opc, vcpu), badvaddr,
+ exccode, opc, inst, badvaddr,
kvm_read_c0_guest_status(vcpu->arch.cop0));
kvm_arch_vcpu_dump_regs(vcpu);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -1593,7 +1360,15 @@ skip_emul:
if (ret == RESUME_GUEST) {
trace_kvm_reenter(vcpu);
- kvm_mips_check_asids(vcpu);
+ /*
+ * Make sure the read of VCPU requests in vcpu_reenter()
+ * callback is not reordered ahead of the write to vcpu->mode,
+ * or we could miss a TLB flush request while the requester sees
+ * the VCPU as outside of guest mode and not needing an IPI.
+ */
+ smp_store_mb(vcpu->mode, IN_GUEST_MODE);
+
+ kvm_mips_callbacks->vcpu_reenter(run, vcpu);
/*
* If FPU / MSA are enabled (i.e. the guest's FPU / MSA context
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
index 3b677c851be0..cb0faade311e 100644
--- a/arch/mips/kvm/mmu.c
+++ b/arch/mips/kvm/mmu.c
@@ -11,86 +11,995 @@
#include <linux/highmem.h>
#include <linux/kvm_host.h>
+#include <linux/uaccess.h>
#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
-static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
+/*
+ * KVM_MMU_CACHE_MIN_PAGES is the number of GPA page table translation levels
+ * for which pages need to be cached.
+ */
+#if defined(__PAGETABLE_PMD_FOLDED)
+#define KVM_MMU_CACHE_MIN_PAGES 1
+#else
+#define KVM_MMU_CACHE_MIN_PAGES 2
+#endif
+
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+ int min, int max)
{
- int cpu = smp_processor_id();
+ void *page;
+
+ BUG_ON(max > KVM_NR_MEM_OBJS);
+ if (cache->nobjs >= min)
+ return 0;
+ while (cache->nobjs < max) {
+ page = (void *)__get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ cache->objects[cache->nobjs++] = page;
+ }
+ return 0;
+}
- return vcpu->arch.guest_kernel_asid[cpu] &
- cpu_asid_mask(&cpu_data[cpu]);
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+ while (mc->nobjs)
+ free_page((unsigned long)mc->objects[--mc->nobjs]);
}
-static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
{
- int cpu = smp_processor_id();
+ void *p;
- return vcpu->arch.guest_user_asid[cpu] &
- cpu_asid_mask(&cpu_data[cpu]);
+ BUG_ON(!mc || !mc->nobjs);
+ p = mc->objects[--mc->nobjs];
+ return p;
}
-static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
+void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
{
- int srcu_idx, err = 0;
- kvm_pfn_t pfn;
+ mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
+}
+
+/**
+ * kvm_pgd_init() - Initialise KVM GPA page directory.
+ * @page: Pointer to page directory (PGD) for KVM GPA.
+ *
+ * Initialise a KVM GPA page directory with pointers to the invalid table, i.e.
+ * representing no mappings. This is similar to pgd_init(), however it
+ * initialises all the page directory pointers, not just the ones corresponding
+ * to the userland address space (since it is for the guest physical address
+ * space rather than a virtual address space).
+ */
+static void kvm_pgd_init(void *page)
+{
+ unsigned long *p, *end;
+ unsigned long entry;
+
+#ifdef __PAGETABLE_PMD_FOLDED
+ entry = (unsigned long)invalid_pte_table;
+#else
+ entry = (unsigned long)invalid_pmd_table;
+#endif
+
+ p = (unsigned long *)page;
+ end = p + PTRS_PER_PGD;
+
+ do {
+ p[0] = entry;
+ p[1] = entry;
+ p[2] = entry;
+ p[3] = entry;
+ p[4] = entry;
+ p += 8;
+ p[-3] = entry;
+ p[-2] = entry;
+ p[-1] = entry;
+ } while (p != end);
+}
+
+/**
+ * kvm_pgd_alloc() - Allocate and initialise a KVM GPA page directory.
+ *
+ * Allocate a blank KVM GPA page directory (PGD) for representing guest physical
+ * to host physical page mappings.
+ *
+ * Returns: Pointer to new KVM GPA page directory.
+ * NULL on allocation failure.
+ */
+pgd_t *kvm_pgd_alloc(void)
+{
+ pgd_t *ret;
+
+ ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGD_ORDER);
+ if (ret)
+ kvm_pgd_init(ret);
+
+ return ret;
+}
+
+/**
+ * kvm_mips_walk_pgd() - Walk page table with optional allocation.
+ * @pgd: Page directory pointer.
+ * @addr: Address to index page table using.
+ * @cache: MMU page cache to allocate new page tables from, or NULL.
+ *
+ * Walk the page tables pointed to by @pgd to find the PTE corresponding to the
+ * address @addr. If page tables don't exist for @addr, they will be created
+ * from the MMU cache if @cache is not NULL.
+ *
+ * Returns: Pointer to pte_t corresponding to @addr.
+ * NULL if a page table doesn't exist for @addr and !@cache.
+ * NULL if a page table allocation failed.
+ */
+static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache,
+ unsigned long addr)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pgd += pgd_index(addr);
+ if (pgd_none(*pgd)) {
+ /* Not used on MIPS yet */
+ BUG();
+ return NULL;
+ }
+ pud = pud_offset(pgd, addr);
+ if (pud_none(*pud)) {
+ pmd_t *new_pmd;
+
+ if (!cache)
+ return NULL;
+ new_pmd = mmu_memory_cache_alloc(cache);
+ pmd_init((unsigned long)new_pmd,
+ (unsigned long)invalid_pte_table);
+ pud_populate(NULL, pud, new_pmd);
+ }
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd)) {
+ pte_t *new_pte;
+
+ if (!cache)
+ return NULL;
+ new_pte = mmu_memory_cache_alloc(cache);
+ clear_page(new_pte);
+ pmd_populate_kernel(NULL, pmd, new_pte);
+ }
+ return pte_offset(pmd, addr);
+}
+
+/* Caller must hold kvm->mm_lock */
+static pte_t *kvm_mips_pte_for_gpa(struct kvm *kvm,
+ struct kvm_mmu_memory_cache *cache,
+ unsigned long addr)
+{
+ return kvm_mips_walk_pgd(kvm->arch.gpa_mm.pgd, cache, addr);
+}
+
+/*
+ * kvm_mips_flush_gpa_{pte,pmd,pud,pgd,pt}.
+ * Flush a range of guest physical address space from the VM's GPA page tables.
+ */
+
+static bool kvm_mips_flush_gpa_pte(pte_t *pte, unsigned long start_gpa,
+ unsigned long end_gpa)
+{
+ int i_min = __pte_offset(start_gpa);
+ int i_max = __pte_offset(end_gpa);
+ bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PTE - 1);
+ int i;
+
+ for (i = i_min; i <= i_max; ++i) {
+ if (!pte_present(pte[i]))
+ continue;
+
+ set_pte(pte + i, __pte(0));
+ }
+ return safe_to_remove;
+}
+
+static bool kvm_mips_flush_gpa_pmd(pmd_t *pmd, unsigned long start_gpa,
+ unsigned long end_gpa)
+{
+ pte_t *pte;
+ unsigned long end = ~0ul;
+ int i_min = __pmd_offset(start_gpa);
+ int i_max = __pmd_offset(end_gpa);
+ bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PMD - 1);
+ int i;
+
+ for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
+ if (!pmd_present(pmd[i]))
+ continue;
+
+ pte = pte_offset(pmd + i, 0);
+ if (i == i_max)
+ end = end_gpa;
+
+ if (kvm_mips_flush_gpa_pte(pte, start_gpa, end)) {
+ pmd_clear(pmd + i);
+ pte_free_kernel(NULL, pte);
+ } else {
+ safe_to_remove = false;
+ }
+ }
+ return safe_to_remove;
+}
+
+static bool kvm_mips_flush_gpa_pud(pud_t *pud, unsigned long start_gpa,
+ unsigned long end_gpa)
+{
+ pmd_t *pmd;
+ unsigned long end = ~0ul;
+ int i_min = __pud_offset(start_gpa);
+ int i_max = __pud_offset(end_gpa);
+ bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PUD - 1);
+ int i;
+
+ for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
+ if (!pud_present(pud[i]))
+ continue;
+
+ pmd = pmd_offset(pud + i, 0);
+ if (i == i_max)
+ end = end_gpa;
+
+ if (kvm_mips_flush_gpa_pmd(pmd, start_gpa, end)) {
+ pud_clear(pud + i);
+ pmd_free(NULL, pmd);
+ } else {
+ safe_to_remove = false;
+ }
+ }
+ return safe_to_remove;
+}
+
+static bool kvm_mips_flush_gpa_pgd(pgd_t *pgd, unsigned long start_gpa,
+ unsigned long end_gpa)
+{
+ pud_t *pud;
+ unsigned long end = ~0ul;
+ int i_min = pgd_index(start_gpa);
+ int i_max = pgd_index(end_gpa);
+ bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PGD - 1);
+ int i;
+
+ for (i = i_min; i <= i_max; ++i, start_gpa = 0) {
+ if (!pgd_present(pgd[i]))
+ continue;
+
+ pud = pud_offset(pgd + i, 0);
+ if (i == i_max)
+ end = end_gpa;
+
+ if (kvm_mips_flush_gpa_pud(pud, start_gpa, end)) {
+ pgd_clear(pgd + i);
+ pud_free(NULL, pud);
+ } else {
+ safe_to_remove = false;
+ }
+ }
+ return safe_to_remove;
+}
+
+/**
+ * kvm_mips_flush_gpa_pt() - Flush a range of guest physical addresses.
+ * @kvm: KVM pointer.
+ * @start_gfn: Guest frame number of first page in GPA range to flush.
+ * @end_gfn: Guest frame number of last page in GPA range to flush.
+ *
+ * Flushes a range of GPA mappings from the GPA page tables.
+ *
+ * The caller must hold the @kvm->mmu_lock spinlock.
+ *
+ * Returns: Whether its safe to remove the top level page directory because
+ * all lower levels have been removed.
+ */
+bool kvm_mips_flush_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
+{
+ return kvm_mips_flush_gpa_pgd(kvm->arch.gpa_mm.pgd,
+ start_gfn << PAGE_SHIFT,
+ end_gfn << PAGE_SHIFT);
+}
+
+#define BUILD_PTE_RANGE_OP(name, op) \
+static int kvm_mips_##name##_pte(pte_t *pte, unsigned long start, \
+ unsigned long end) \
+{ \
+ int ret = 0; \
+ int i_min = __pte_offset(start); \
+ int i_max = __pte_offset(end); \
+ int i; \
+ pte_t old, new; \
+ \
+ for (i = i_min; i <= i_max; ++i) { \
+ if (!pte_present(pte[i])) \
+ continue; \
+ \
+ old = pte[i]; \
+ new = op(old); \
+ if (pte_val(new) == pte_val(old)) \
+ continue; \
+ set_pte(pte + i, new); \
+ ret = 1; \
+ } \
+ return ret; \
+} \
+ \
+/* returns true if anything was done */ \
+static int kvm_mips_##name##_pmd(pmd_t *pmd, unsigned long start, \
+ unsigned long end) \
+{ \
+ int ret = 0; \
+ pte_t *pte; \
+ unsigned long cur_end = ~0ul; \
+ int i_min = __pmd_offset(start); \
+ int i_max = __pmd_offset(end); \
+ int i; \
+ \
+ for (i = i_min; i <= i_max; ++i, start = 0) { \
+ if (!pmd_present(pmd[i])) \
+ continue; \
+ \
+ pte = pte_offset(pmd + i, 0); \
+ if (i == i_max) \
+ cur_end = end; \
+ \
+ ret |= kvm_mips_##name##_pte(pte, start, cur_end); \
+ } \
+ return ret; \
+} \
+ \
+static int kvm_mips_##name##_pud(pud_t *pud, unsigned long start, \
+ unsigned long end) \
+{ \
+ int ret = 0; \
+ pmd_t *pmd; \
+ unsigned long cur_end = ~0ul; \
+ int i_min = __pud_offset(start); \
+ int i_max = __pud_offset(end); \
+ int i; \
+ \
+ for (i = i_min; i <= i_max; ++i, start = 0) { \
+ if (!pud_present(pud[i])) \
+ continue; \
+ \
+ pmd = pmd_offset(pud + i, 0); \
+ if (i == i_max) \
+ cur_end = end; \
+ \
+ ret |= kvm_mips_##name##_pmd(pmd, start, cur_end); \
+ } \
+ return ret; \
+} \
+ \
+static int kvm_mips_##name##_pgd(pgd_t *pgd, unsigned long start, \
+ unsigned long end) \
+{ \
+ int ret = 0; \
+ pud_t *pud; \
+ unsigned long cur_end = ~0ul; \
+ int i_min = pgd_index(start); \
+ int i_max = pgd_index(end); \
+ int i; \
+ \
+ for (i = i_min; i <= i_max; ++i, start = 0) { \
+ if (!pgd_present(pgd[i])) \
+ continue; \
+ \
+ pud = pud_offset(pgd + i, 0); \
+ if (i == i_max) \
+ cur_end = end; \
+ \
+ ret |= kvm_mips_##name##_pud(pud, start, cur_end); \
+ } \
+ return ret; \
+}
+
+/*
+ * kvm_mips_mkclean_gpa_pt.
+ * Mark a range of guest physical address space clean (writes fault) in the VM's
+ * GPA page table to allow dirty page tracking.
+ */
- if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
+BUILD_PTE_RANGE_OP(mkclean, pte_mkclean)
+
+/**
+ * kvm_mips_mkclean_gpa_pt() - Make a range of guest physical addresses clean.
+ * @kvm: KVM pointer.
+ * @start_gfn: Guest frame number of first page in GPA range to flush.
+ * @end_gfn: Guest frame number of last page in GPA range to flush.
+ *
+ * Make a range of GPA mappings clean so that guest writes will fault and
+ * trigger dirty page logging.
+ *
+ * The caller must hold the @kvm->mmu_lock spinlock.
+ *
+ * Returns: Whether any GPA mappings were modified, which would require
+ * derived mappings (GVA page tables & TLB enties) to be
+ * invalidated.
+ */
+int kvm_mips_mkclean_gpa_pt(struct kvm *kvm, gfn_t start_gfn, gfn_t end_gfn)
+{
+ return kvm_mips_mkclean_pgd(kvm->arch.gpa_mm.pgd,
+ start_gfn << PAGE_SHIFT,
+ end_gfn << PAGE_SHIFT);
+}
+
+/**
+ * kvm_arch_mmu_enable_log_dirty_pt_masked() - write protect dirty pages
+ * @kvm: The KVM pointer
+ * @slot: The memory slot associated with mask
+ * @gfn_offset: The gfn offset in memory slot
+ * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory
+ * slot to be write protected
+ *
+ * Walks bits set in mask write protects the associated pte's. Caller must
+ * acquire @kvm->mmu_lock.
+ */
+void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ gfn_t gfn_offset, unsigned long mask)
+{
+ gfn_t base_gfn = slot->base_gfn + gfn_offset;
+ gfn_t start = base_gfn + __ffs(mask);
+ gfn_t end = base_gfn + __fls(mask);
+
+ kvm_mips_mkclean_gpa_pt(kvm, start, end);
+}
+
+/*
+ * kvm_mips_mkold_gpa_pt.
+ * Mark a range of guest physical address space old (all accesses fault) in the
+ * VM's GPA page table to allow detection of commonly used pages.
+ */
+
+BUILD_PTE_RANGE_OP(mkold, pte_mkold)
+
+static int kvm_mips_mkold_gpa_pt(struct kvm *kvm, gfn_t start_gfn,
+ gfn_t end_gfn)
+{
+ return kvm_mips_mkold_pgd(kvm->arch.gpa_mm.pgd,
+ start_gfn << PAGE_SHIFT,
+ end_gfn << PAGE_SHIFT);
+}
+
+static int handle_hva_to_gpa(struct kvm *kvm,
+ unsigned long start,
+ unsigned long end,
+ int (*handler)(struct kvm *kvm, gfn_t gfn,
+ gpa_t gfn_end,
+ struct kvm_memory_slot *memslot,
+ void *data),
+ void *data)
+{
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+ int ret = 0;
+
+ slots = kvm_memslots(kvm);
+
+ /* we only care about the pages that the guest sees */
+ kvm_for_each_memslot(memslot, slots) {
+ unsigned long hva_start, hva_end;
+ gfn_t gfn, gfn_end;
+
+ hva_start = max(start, memslot->userspace_addr);
+ hva_end = min(end, memslot->userspace_addr +
+ (memslot->npages << PAGE_SHIFT));
+ if (hva_start >= hva_end)
+ continue;
+
+ /*
+ * {gfn(page) | page intersects with [hva_start, hva_end)} =
+ * {gfn_start, gfn_start+1, ..., gfn_end-1}.
+ */
+ gfn = hva_to_gfn_memslot(hva_start, memslot);
+ gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
+
+ ret |= handler(kvm, gfn, gfn_end, memslot, data);
+ }
+
+ return ret;
+}
+
+
+static int kvm_unmap_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
+ struct kvm_memory_slot *memslot, void *data)
+{
+ kvm_mips_flush_gpa_pt(kvm, gfn, gfn_end);
+ return 1;
+}
+
+int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
+{
+ unsigned long end = hva + PAGE_SIZE;
+
+ handle_hva_to_gpa(kvm, hva, end, &kvm_unmap_hva_handler, NULL);
+
+ kvm_mips_callbacks->flush_shadow_all(kvm);
+ return 0;
+}
+
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+ handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
+
+ kvm_mips_callbacks->flush_shadow_all(kvm);
+ return 0;
+}
+
+static int kvm_set_spte_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
+ struct kvm_memory_slot *memslot, void *data)
+{
+ gpa_t gpa = gfn << PAGE_SHIFT;
+ pte_t hva_pte = *(pte_t *)data;
+ pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
+ pte_t old_pte;
+
+ if (!gpa_pte)
+ return 0;
+
+ /* Mapping may need adjusting depending on memslot flags */
+ old_pte = *gpa_pte;
+ if (memslot->flags & KVM_MEM_LOG_DIRTY_PAGES && !pte_dirty(old_pte))
+ hva_pte = pte_mkclean(hva_pte);
+ else if (memslot->flags & KVM_MEM_READONLY)
+ hva_pte = pte_wrprotect(hva_pte);
+
+ set_pte(gpa_pte, hva_pte);
+
+ /* Replacing an absent or old page doesn't need flushes */
+ if (!pte_present(old_pte) || !pte_young(old_pte))
return 0;
+ /* Pages swapped, aged, moved, or cleaned require flushes */
+ return !pte_present(hva_pte) ||
+ !pte_young(hva_pte) ||
+ pte_pfn(old_pte) != pte_pfn(hva_pte) ||
+ (pte_dirty(old_pte) && !pte_dirty(hva_pte));
+}
+
+void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+{
+ unsigned long end = hva + PAGE_SIZE;
+ int ret;
+
+ ret = handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &pte);
+ if (ret)
+ kvm_mips_callbacks->flush_shadow_all(kvm);
+}
+
+static int kvm_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
+ struct kvm_memory_slot *memslot, void *data)
+{
+ return kvm_mips_mkold_gpa_pt(kvm, gfn, gfn_end);
+}
+
+static int kvm_test_age_hva_handler(struct kvm *kvm, gfn_t gfn, gfn_t gfn_end,
+ struct kvm_memory_slot *memslot, void *data)
+{
+ gpa_t gpa = gfn << PAGE_SHIFT;
+ pte_t *gpa_pte = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
+
+ if (!gpa_pte)
+ return 0;
+ return pte_young(*gpa_pte);
+}
+
+int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
+{
+ return handle_hva_to_gpa(kvm, start, end, kvm_age_hva_handler, NULL);
+}
+
+int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
+{
+ return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
+}
+
+/**
+ * _kvm_mips_map_page_fast() - Fast path GPA fault handler.
+ * @vcpu: VCPU pointer.
+ * @gpa: Guest physical address of fault.
+ * @write_fault: Whether the fault was due to a write.
+ * @out_entry: New PTE for @gpa (written on success unless NULL).
+ * @out_buddy: New PTE for @gpa's buddy (written on success unless
+ * NULL).
+ *
+ * Perform fast path GPA fault handling, doing all that can be done without
+ * calling into KVM. This handles marking old pages young (for idle page
+ * tracking), and dirtying of clean pages (for dirty page logging).
+ *
+ * Returns: 0 on success, in which case we can update derived mappings and
+ * resume guest execution.
+ * -EFAULT on failure due to absent GPA mapping or write to
+ * read-only page, in which case KVM must be consulted.
+ */
+static int _kvm_mips_map_page_fast(struct kvm_vcpu *vcpu, unsigned long gpa,
+ bool write_fault,
+ pte_t *out_entry, pte_t *out_buddy)
+{
+ struct kvm *kvm = vcpu->kvm;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ pte_t *ptep;
+ kvm_pfn_t pfn = 0; /* silence bogus GCC warning */
+ bool pfn_valid = false;
+ int ret = 0;
+
+ spin_lock(&kvm->mmu_lock);
+
+ /* Fast path - just check GPA page table for an existing entry */
+ ptep = kvm_mips_pte_for_gpa(kvm, NULL, gpa);
+ if (!ptep || !pte_present(*ptep)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* Track access to pages marked old */
+ if (!pte_young(*ptep)) {
+ set_pte(ptep, pte_mkyoung(*ptep));
+ pfn = pte_pfn(*ptep);
+ pfn_valid = true;
+ /* call kvm_set_pfn_accessed() after unlock */
+ }
+ if (write_fault && !pte_dirty(*ptep)) {
+ if (!pte_write(*ptep)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* Track dirtying of writeable pages */
+ set_pte(ptep, pte_mkdirty(*ptep));
+ pfn = pte_pfn(*ptep);
+ mark_page_dirty(kvm, gfn);
+ kvm_set_pfn_dirty(pfn);
+ }
+
+ if (out_entry)
+ *out_entry = *ptep;
+ if (out_buddy)
+ *out_buddy = *ptep_buddy(ptep);
+
+out:
+ spin_unlock(&kvm->mmu_lock);
+ if (pfn_valid)
+ kvm_set_pfn_accessed(pfn);
+ return ret;
+}
+
+/**
+ * kvm_mips_map_page() - Map a guest physical page.
+ * @vcpu: VCPU pointer.
+ * @gpa: Guest physical address of fault.
+ * @write_fault: Whether the fault was due to a write.
+ * @out_entry: New PTE for @gpa (written on success unless NULL).
+ * @out_buddy: New PTE for @gpa's buddy (written on success unless
+ * NULL).
+ *
+ * Handle GPA faults by creating a new GPA mapping (or updating an existing
+ * one).
+ *
+ * This takes care of marking pages young or dirty (idle/dirty page tracking),
+ * asking KVM for the corresponding PFN, and creating a mapping in the GPA page
+ * tables. Derived mappings (GVA page tables and TLBs) must be handled by the
+ * caller.
+ *
+ * Returns: 0 on success, in which case the caller may use the @out_entry
+ * and @out_buddy PTEs to update derived mappings and resume guest
+ * execution.
+ * -EFAULT if there is no memory region at @gpa or a write was
+ * attempted to a read-only memory region. This is usually handled
+ * as an MMIO access.
+ */
+static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa,
+ bool write_fault,
+ pte_t *out_entry, pte_t *out_buddy)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ int srcu_idx, err;
+ kvm_pfn_t pfn;
+ pte_t *ptep, entry, old_pte;
+ bool writeable;
+ unsigned long prot_bits;
+ unsigned long mmu_seq;
+
+ /* Try the fast path to handle old / clean pages */
srcu_idx = srcu_read_lock(&kvm->srcu);
- pfn = gfn_to_pfn(kvm, gfn);
+ err = _kvm_mips_map_page_fast(vcpu, gpa, write_fault, out_entry,
+ out_buddy);
+ if (!err)
+ goto out;
+ /* We need a minimum of cached pages ready for page table creation */
+ err = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
+ KVM_NR_MEM_OBJS);
+ if (err)
+ goto out;
+
+retry:
+ /*
+ * Used to check for invalidations in progress, of the pfn that is
+ * returned by pfn_to_pfn_prot below.
+ */
+ mmu_seq = kvm->mmu_notifier_seq;
+ /*
+ * Ensure the read of mmu_notifier_seq isn't reordered with PTE reads in
+ * gfn_to_pfn_prot() (which calls get_user_pages()), so that we don't
+ * risk the page we get a reference to getting unmapped before we have a
+ * chance to grab the mmu_lock without mmu_notifier_retry() noticing.
+ *
+ * This smp_rmb() pairs with the effective smp_wmb() of the combination
+ * of the pte_unmap_unlock() after the PTE is zapped, and the
+ * spin_lock() in kvm_mmu_notifier_invalidate_<page|range_end>() before
+ * mmu_notifier_seq is incremented.
+ */
+ smp_rmb();
+
+ /* Slow path - ask KVM core whether we can access this GPA */
+ pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writeable);
if (is_error_noslot_pfn(pfn)) {
- kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn);
err = -EFAULT;
goto out;
}
- kvm->arch.guest_pmap[gfn] = pfn;
+ spin_lock(&kvm->mmu_lock);
+ /* Check if an invalidation has taken place since we got pfn */
+ if (mmu_notifier_retry(kvm, mmu_seq)) {
+ /*
+ * This can happen when mappings are changed asynchronously, but
+ * also synchronously if a COW is triggered by
+ * gfn_to_pfn_prot().
+ */
+ spin_unlock(&kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ goto retry;
+ }
+
+ /* Ensure page tables are allocated */
+ ptep = kvm_mips_pte_for_gpa(kvm, memcache, gpa);
+
+ /* Set up the PTE */
+ prot_bits = _PAGE_PRESENT | __READABLE | _page_cachable_default;
+ if (writeable) {
+ prot_bits |= _PAGE_WRITE;
+ if (write_fault) {
+ prot_bits |= __WRITEABLE;
+ mark_page_dirty(kvm, gfn);
+ kvm_set_pfn_dirty(pfn);
+ }
+ }
+ entry = pfn_pte(pfn, __pgprot(prot_bits));
+
+ /* Write the PTE */
+ old_pte = *ptep;
+ set_pte(ptep, entry);
+
+ err = 0;
+ if (out_entry)
+ *out_entry = *ptep;
+ if (out_buddy)
+ *out_buddy = *ptep_buddy(ptep);
+
+ spin_unlock(&kvm->mmu_lock);
+ kvm_release_pfn_clean(pfn);
+ kvm_set_pfn_accessed(pfn);
out:
srcu_read_unlock(&kvm->srcu, srcu_idx);
return err;
}
-/* Translate guest KSEG0 addresses to Host PA */
-unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu,
- unsigned long gva)
+static pte_t *kvm_trap_emul_pte_for_gva(struct kvm_vcpu *vcpu,
+ unsigned long addr)
{
- gfn_t gfn;
- unsigned long offset = gva & ~PAGE_MASK;
- struct kvm *kvm = vcpu->kvm;
+ struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
+ pgd_t *pgdp;
+ int ret;
+
+ /* We need a minimum of cached pages ready for page table creation */
+ ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES,
+ KVM_NR_MEM_OBJS);
+ if (ret)
+ return NULL;
+
+ if (KVM_GUEST_KERNEL_MODE(vcpu))
+ pgdp = vcpu->arch.guest_kernel_mm.pgd;
+ else
+ pgdp = vcpu->arch.guest_user_mm.pgd;
+
+ return kvm_mips_walk_pgd(pgdp, memcache, addr);
+}
- if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) {
- kvm_err("%s/%p: Invalid gva: %#lx\n", __func__,
- __builtin_return_address(0), gva);
- return KVM_INVALID_PAGE;
+void kvm_trap_emul_invalidate_gva(struct kvm_vcpu *vcpu, unsigned long addr,
+ bool user)
+{
+ pgd_t *pgdp;
+ pte_t *ptep;
+
+ addr &= PAGE_MASK << 1;
+
+ pgdp = vcpu->arch.guest_kernel_mm.pgd;
+ ptep = kvm_mips_walk_pgd(pgdp, NULL, addr);
+ if (ptep) {
+ ptep[0] = pfn_pte(0, __pgprot(0));
+ ptep[1] = pfn_pte(0, __pgprot(0));
+ }
+
+ if (user) {
+ pgdp = vcpu->arch.guest_user_mm.pgd;
+ ptep = kvm_mips_walk_pgd(pgdp, NULL, addr);
+ if (ptep) {
+ ptep[0] = pfn_pte(0, __pgprot(0));
+ ptep[1] = pfn_pte(0, __pgprot(0));
+ }
}
+}
- gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT);
+/*
+ * kvm_mips_flush_gva_{pte,pmd,pud,pgd,pt}.
+ * Flush a range of guest physical address space from the VM's GPA page tables.
+ */
- if (gfn >= kvm->arch.guest_pmap_npages) {
- kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn,
- gva);
- return KVM_INVALID_PAGE;
+static bool kvm_mips_flush_gva_pte(pte_t *pte, unsigned long start_gva,
+ unsigned long end_gva)
+{
+ int i_min = __pte_offset(start_gva);
+ int i_max = __pte_offset(end_gva);
+ bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PTE - 1);
+ int i;
+
+ /*
+ * There's no freeing to do, so there's no point clearing individual
+ * entries unless only part of the last level page table needs flushing.
+ */
+ if (safe_to_remove)
+ return true;
+
+ for (i = i_min; i <= i_max; ++i) {
+ if (!pte_present(pte[i]))
+ continue;
+
+ set_pte(pte + i, __pte(0));
}
+ return false;
+}
- if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
- return KVM_INVALID_ADDR;
+static bool kvm_mips_flush_gva_pmd(pmd_t *pmd, unsigned long start_gva,
+ unsigned long end_gva)
+{
+ pte_t *pte;
+ unsigned long end = ~0ul;
+ int i_min = __pmd_offset(start_gva);
+ int i_max = __pmd_offset(end_gva);
+ bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PMD - 1);
+ int i;
+
+ for (i = i_min; i <= i_max; ++i, start_gva = 0) {
+ if (!pmd_present(pmd[i]))
+ continue;
+
+ pte = pte_offset(pmd + i, 0);
+ if (i == i_max)
+ end = end_gva;
+
+ if (kvm_mips_flush_gva_pte(pte, start_gva, end)) {
+ pmd_clear(pmd + i);
+ pte_free_kernel(NULL, pte);
+ } else {
+ safe_to_remove = false;
+ }
+ }
+ return safe_to_remove;
+}
- return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset;
+static bool kvm_mips_flush_gva_pud(pud_t *pud, unsigned long start_gva,
+ unsigned long end_gva)
+{
+ pmd_t *pmd;
+ unsigned long end = ~0ul;
+ int i_min = __pud_offset(start_gva);
+ int i_max = __pud_offset(end_gva);
+ bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PUD - 1);
+ int i;
+
+ for (i = i_min; i <= i_max; ++i, start_gva = 0) {
+ if (!pud_present(pud[i]))
+ continue;
+
+ pmd = pmd_offset(pud + i, 0);
+ if (i == i_max)
+ end = end_gva;
+
+ if (kvm_mips_flush_gva_pmd(pmd, start_gva, end)) {
+ pud_clear(pud + i);
+ pmd_free(NULL, pmd);
+ } else {
+ safe_to_remove = false;
+ }
+ }
+ return safe_to_remove;
+}
+
+static bool kvm_mips_flush_gva_pgd(pgd_t *pgd, unsigned long start_gva,
+ unsigned long end_gva)
+{
+ pud_t *pud;
+ unsigned long end = ~0ul;
+ int i_min = pgd_index(start_gva);
+ int i_max = pgd_index(end_gva);
+ bool safe_to_remove = (i_min == 0 && i_max == PTRS_PER_PGD - 1);
+ int i;
+
+ for (i = i_min; i <= i_max; ++i, start_gva = 0) {
+ if (!pgd_present(pgd[i]))
+ continue;
+
+ pud = pud_offset(pgd + i, 0);
+ if (i == i_max)
+ end = end_gva;
+
+ if (kvm_mips_flush_gva_pud(pud, start_gva, end)) {
+ pgd_clear(pgd + i);
+ pud_free(NULL, pud);
+ } else {
+ safe_to_remove = false;
+ }
+ }
+ return safe_to_remove;
+}
+
+void kvm_mips_flush_gva_pt(pgd_t *pgd, enum kvm_mips_flush flags)
+{
+ if (flags & KMF_GPA) {
+ /* all of guest virtual address space could be affected */
+ if (flags & KMF_KERN)
+ /* useg, kseg0, seg2/3 */
+ kvm_mips_flush_gva_pgd(pgd, 0, 0x7fffffff);
+ else
+ /* useg */
+ kvm_mips_flush_gva_pgd(pgd, 0, 0x3fffffff);
+ } else {
+ /* useg */
+ kvm_mips_flush_gva_pgd(pgd, 0, 0x3fffffff);
+
+ /* kseg2/3 */
+ if (flags & KMF_KERN)
+ kvm_mips_flush_gva_pgd(pgd, 0x60000000, 0x7fffffff);
+ }
+}
+
+static pte_t kvm_mips_gpa_pte_to_gva_unmapped(pte_t pte)
+{
+ /*
+ * Don't leak writeable but clean entries from GPA page tables. We don't
+ * want the normal Linux tlbmod handler to handle dirtying when KVM
+ * accesses guest memory.
+ */
+ if (!pte_dirty(pte))
+ pte = pte_wrprotect(pte);
+
+ return pte;
+}
+
+static pte_t kvm_mips_gpa_pte_to_gva_mapped(pte_t pte, long entrylo)
+{
+ /* Guest EntryLo overrides host EntryLo */
+ if (!(entrylo & ENTRYLO_D))
+ pte = pte_mkclean(pte);
+
+ return kvm_mips_gpa_pte_to_gva_unmapped(pte);
}
/* XXXKYMA: Must be called with interrupts disabled */
int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
- struct kvm_vcpu *vcpu)
+ struct kvm_vcpu *vcpu,
+ bool write_fault)
{
- gfn_t gfn;
- kvm_pfn_t pfn0, pfn1;
- unsigned long vaddr = 0;
- unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
- struct kvm *kvm = vcpu->kvm;
- const int flush_dcache_mask = 0;
- int ret;
+ unsigned long gpa;
+ pte_t pte_gpa[2], *ptep_gva;
+ int idx;
if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) {
kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr);
@@ -98,49 +1007,39 @@ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
return -1;
}
- gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT);
- if ((gfn | 1) >= kvm->arch.guest_pmap_npages) {
- kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__,
- gfn, badvaddr);
- kvm_mips_dump_host_tlbs();
+ /* Get the GPA page table entry */
+ gpa = KVM_GUEST_CPHYSADDR(badvaddr);
+ idx = (badvaddr >> PAGE_SHIFT) & 1;
+ if (kvm_mips_map_page(vcpu, gpa, write_fault, &pte_gpa[idx],
+ &pte_gpa[!idx]) < 0)
return -1;
- }
- vaddr = badvaddr & (PAGE_MASK << 1);
- if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
+ /* Get the GVA page table entry */
+ ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, badvaddr & ~PAGE_SIZE);
+ if (!ptep_gva) {
+ kvm_err("No ptep for gva %lx\n", badvaddr);
return -1;
+ }
- if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0)
- return -1;
-
- pfn0 = kvm->arch.guest_pmap[gfn & ~0x1];
- pfn1 = kvm->arch.guest_pmap[gfn | 0x1];
-
- entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
- ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
- ENTRYLO_D | ENTRYLO_V;
- entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
- ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
- ENTRYLO_D | ENTRYLO_V;
-
- preempt_disable();
- entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
- ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
- flush_dcache_mask);
- preempt_enable();
+ /* Copy a pair of entries from GPA page table to GVA page table */
+ ptep_gva[0] = kvm_mips_gpa_pte_to_gva_unmapped(pte_gpa[0]);
+ ptep_gva[1] = kvm_mips_gpa_pte_to_gva_unmapped(pte_gpa[1]);
- return ret;
+ /* Invalidate this entry in the TLB, guest kernel ASID only */
+ kvm_mips_host_tlb_inv(vcpu, badvaddr, false, true);
+ return 0;
}
int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
- struct kvm_mips_tlb *tlb)
+ struct kvm_mips_tlb *tlb,
+ unsigned long gva,
+ bool write_fault)
{
- unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
struct kvm *kvm = vcpu->kvm;
- kvm_pfn_t pfn0, pfn1;
- gfn_t gfn0, gfn1;
long tlb_lo[2];
- int ret;
+ pte_t pte_gpa[2], *ptep_buddy, *ptep_gva;
+ unsigned int idx = TLB_LO_IDX(*tlb, gva);
+ bool kernel = KVM_GUEST_KERNEL_MODE(vcpu);
tlb_lo[0] = tlb->tlb_lo[0];
tlb_lo[1] = tlb->tlb_lo[1];
@@ -149,70 +1048,64 @@ int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
* The commpage address must not be mapped to anything else if the guest
* TLB contains entries nearby, or commpage accesses will break.
*/
- if (!((tlb->tlb_hi ^ KVM_GUEST_COMMPAGE_ADDR) &
- VPN2_MASK & (PAGE_MASK << 1)))
- tlb_lo[(KVM_GUEST_COMMPAGE_ADDR >> PAGE_SHIFT) & 1] = 0;
-
- gfn0 = mips3_tlbpfn_to_paddr(tlb_lo[0]) >> PAGE_SHIFT;
- gfn1 = mips3_tlbpfn_to_paddr(tlb_lo[1]) >> PAGE_SHIFT;
- if (gfn0 >= kvm->arch.guest_pmap_npages ||
- gfn1 >= kvm->arch.guest_pmap_npages) {
- kvm_err("%s: Invalid gfn: [%#llx, %#llx], EHi: %#lx\n",
- __func__, gfn0, gfn1, tlb->tlb_hi);
- kvm_mips_dump_guest_tlbs(vcpu);
- return -1;
- }
+ if (!((gva ^ KVM_GUEST_COMMPAGE_ADDR) & VPN2_MASK & (PAGE_MASK << 1)))
+ tlb_lo[TLB_LO_IDX(*tlb, KVM_GUEST_COMMPAGE_ADDR)] = 0;
- if (kvm_mips_map_page(kvm, gfn0) < 0)
+ /* Get the GPA page table entry */
+ if (kvm_mips_map_page(vcpu, mips3_tlbpfn_to_paddr(tlb_lo[idx]),
+ write_fault, &pte_gpa[idx], NULL) < 0)
return -1;
- if (kvm_mips_map_page(kvm, gfn1) < 0)
+ /* And its GVA buddy's GPA page table entry if it also exists */
+ pte_gpa[!idx] = pfn_pte(0, __pgprot(0));
+ if (tlb_lo[!idx] & ENTRYLO_V) {
+ spin_lock(&kvm->mmu_lock);
+ ptep_buddy = kvm_mips_pte_for_gpa(kvm, NULL,
+ mips3_tlbpfn_to_paddr(tlb_lo[!idx]));
+ if (ptep_buddy)
+ pte_gpa[!idx] = *ptep_buddy;
+ spin_unlock(&kvm->mmu_lock);
+ }
+
+ /* Get the GVA page table entry pair */
+ ptep_gva = kvm_trap_emul_pte_for_gva(vcpu, gva & ~PAGE_SIZE);
+ if (!ptep_gva) {
+ kvm_err("No ptep for gva %lx\n", gva);
return -1;
+ }
- pfn0 = kvm->arch.guest_pmap[gfn0];
- pfn1 = kvm->arch.guest_pmap[gfn1];
+ /* Copy a pair of entries from GPA page table to GVA page table */
+ ptep_gva[0] = kvm_mips_gpa_pte_to_gva_mapped(pte_gpa[0], tlb_lo[0]);
+ ptep_gva[1] = kvm_mips_gpa_pte_to_gva_mapped(pte_gpa[1], tlb_lo[1]);
- /* Get attributes from the Guest TLB */
- entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
- ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
- (tlb_lo[0] & ENTRYLO_D) |
- (tlb_lo[0] & ENTRYLO_V);
- entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
- ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
- (tlb_lo[1] & ENTRYLO_D) |
- (tlb_lo[1] & ENTRYLO_V);
+ /* Invalidate this entry in the TLB, current guest mode ASID only */
+ kvm_mips_host_tlb_inv(vcpu, gva, !kernel, kernel);
kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
tlb->tlb_lo[0], tlb->tlb_lo[1]);
- preempt_disable();
- entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
- kvm_mips_get_kernel_asid(vcpu) :
- kvm_mips_get_user_asid(vcpu));
- ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
- tlb->tlb_mask);
- preempt_enable();
-
- return ret;
+ return 0;
}
-void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu,
- struct kvm_vcpu *vcpu)
+int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
+ struct kvm_vcpu *vcpu)
{
- unsigned long asid = asid_cache(cpu);
-
- asid += cpu_asid_inc();
- if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) {
- if (cpu_has_vtag_icache)
- flush_icache_all();
-
- kvm_local_flush_tlb_all(); /* start new asid cycle */
+ kvm_pfn_t pfn;
+ pte_t *ptep;
- if (!asid) /* fix version if needed */
- asid = asid_first_version(cpu);
+ ptep = kvm_trap_emul_pte_for_gva(vcpu, badvaddr);
+ if (!ptep) {
+ kvm_err("No ptep for commpage %lx\n", badvaddr);
+ return -1;
}
- cpu_context(cpu, mm) = asid_cache(cpu) = asid;
+ pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage));
+ /* Also set valid and dirty, so refill handler doesn't have to */
+ *ptep = pte_mkyoung(pte_mkdirty(pfn_pte(pfn, PAGE_SHARED)));
+
+ /* Invalidate this entry in the TLB, guest kernel ASID only */
+ kvm_mips_host_tlb_inv(vcpu, badvaddr, false, true);
+ return 0;
}
/**
@@ -235,42 +1128,13 @@ static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu)
/* Restore ASID once we are scheduled back after preemption */
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);
unsigned long flags;
- int newasid = 0;
kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu);
- /* Allocate new kernel and user ASIDs if needed */
-
local_irq_save(flags);
- if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) &
- asid_version_mask(cpu)) {
- kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
- vcpu->arch.guest_kernel_asid[cpu] =
- vcpu->arch.guest_kernel_mm.context.asid[cpu];
- newasid++;
-
- kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
- cpu_context(cpu, current->mm));
- kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
- cpu, vcpu->arch.guest_kernel_asid[cpu]);
- }
-
- if ((vcpu->arch.guest_user_asid[cpu] ^ asid_cache(cpu)) &
- asid_version_mask(cpu)) {
- kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
- vcpu->arch.guest_user_asid[cpu] =
- vcpu->arch.guest_user_mm.context.asid[cpu];
- newasid++;
-
- kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
- cpu_context(cpu, current->mm));
- kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
- vcpu->arch.guest_user_asid[cpu]);
- }
-
+ vcpu->cpu = cpu;
if (vcpu->arch.last_sched_cpu != cpu) {
kvm_debug("[%d->%d]KVM VCPU[%d] switch\n",
vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id);
@@ -282,42 +1146,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_mips_migrate_count(vcpu);
}
- if (!newasid) {
- /*
- * If we preempted while the guest was executing, then reload
- * the pre-empted ASID
- */
- if (current->flags & PF_VCPU) {
- write_c0_entryhi(vcpu->arch.
- preempt_entryhi & asid_mask);
- ehb();
- }
- } else {
- /* New ASIDs were allocated for the VM */
-
- /*
- * Were we in guest context? If so then the pre-empted ASID is
- * no longer valid, we need to set it to what it should be based
- * on the mode of the Guest (Kernel/User)
- */
- if (current->flags & PF_VCPU) {
- if (KVM_GUEST_KERNEL_MODE(vcpu))
- write_c0_entryhi(vcpu->arch.
- guest_kernel_asid[cpu] &
- asid_mask);
- else
- write_c0_entryhi(vcpu->arch.
- guest_user_asid[cpu] &
- asid_mask);
- ehb();
- }
- }
-
/* restore guest state to registers */
- kvm_mips_callbacks->vcpu_set_regs(vcpu);
+ kvm_mips_callbacks->vcpu_load(vcpu, cpu);
local_irq_restore(flags);
-
}
/* ASID can change if another task is scheduled during preemption */
@@ -329,75 +1161,90 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
local_irq_save(flags);
cpu = smp_processor_id();
-
- vcpu->arch.preempt_entryhi = read_c0_entryhi();
vcpu->arch.last_sched_cpu = cpu;
+ vcpu->cpu = -1;
/* save guest state in registers */
- kvm_mips_callbacks->vcpu_get_regs(vcpu);
-
- if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
- asid_version_mask(cpu))) {
- kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__,
- cpu_context(cpu, current->mm));
- drop_mmu_context(current->mm, cpu);
- }
- write_c0_entryhi(cpu_asid(cpu, current->mm));
- ehb();
+ kvm_mips_callbacks->vcpu_put(vcpu, cpu);
local_irq_restore(flags);
}
-u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
+/**
+ * kvm_trap_emul_gva_fault() - Safely attempt to handle a GVA access fault.
+ * @vcpu: Virtual CPU.
+ * @gva: Guest virtual address to be accessed.
+ * @write: True if write attempted (must be dirtied and made writable).
+ *
+ * Safely attempt to handle a GVA fault, mapping GVA pages if necessary, and
+ * dirtying the page if @write so that guest instructions can be modified.
+ *
+ * Returns: KVM_MIPS_MAPPED on success.
+ * KVM_MIPS_GVA if bad guest virtual address.
+ * KVM_MIPS_GPA if bad guest physical address.
+ * KVM_MIPS_TLB if guest TLB not present.
+ * KVM_MIPS_TLBINV if guest TLB present but not valid.
+ * KVM_MIPS_TLBMOD if guest TLB read only.
+ */
+enum kvm_mips_fault_result kvm_trap_emul_gva_fault(struct kvm_vcpu *vcpu,
+ unsigned long gva,
+ bool write)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
- unsigned long paddr, flags, vpn2, asid;
- unsigned long va = (unsigned long)opc;
- void *vaddr;
- u32 inst;
+ struct kvm_mips_tlb *tlb;
int index;
- if (KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0 ||
- KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) {
- local_irq_save(flags);
- index = kvm_mips_host_tlb_lookup(vcpu, va);
- if (index >= 0) {
- inst = *(opc);
- } else {
- vpn2 = va & VPN2_MASK;
- asid = kvm_read_c0_guest_entryhi(cop0) &
- KVM_ENTRYHI_ASID;
- index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid);
- if (index < 0) {
- kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n",
- __func__, opc, vcpu, read_c0_entryhi());
- kvm_mips_dump_host_tlbs();
- kvm_mips_dump_guest_tlbs(vcpu);
- local_irq_restore(flags);
- return KVM_INVALID_INST;
- }
- if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
- &vcpu->arch.guest_tlb[index])) {
- kvm_err("%s: handling mapped seg tlb fault failed for %p, index: %u, vcpu: %p, ASID: %#lx\n",
- __func__, opc, index, vcpu,
- read_c0_entryhi());
- kvm_mips_dump_guest_tlbs(vcpu);
- local_irq_restore(flags);
- return KVM_INVALID_INST;
- }
- inst = *(opc);
- }
- local_irq_restore(flags);
- } else if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) {
- paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, va);
- vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr)));
- vaddr += paddr & ~PAGE_MASK;
- inst = *(u32 *)vaddr;
- kunmap_atomic(vaddr);
+ if (KVM_GUEST_KSEGX(gva) == KVM_GUEST_KSEG0) {
+ if (kvm_mips_handle_kseg0_tlb_fault(gva, vcpu, write) < 0)
+ return KVM_MIPS_GPA;
+ } else if ((KVM_GUEST_KSEGX(gva) < KVM_GUEST_KSEG0) ||
+ KVM_GUEST_KSEGX(gva) == KVM_GUEST_KSEG23) {
+ /* Address should be in the guest TLB */
+ index = kvm_mips_guest_tlb_lookup(vcpu, (gva & VPN2_MASK) |
+ (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID));
+ if (index < 0)
+ return KVM_MIPS_TLB;
+ tlb = &vcpu->arch.guest_tlb[index];
+
+ /* Entry should be valid, and dirty for writes */
+ if (!TLB_IS_VALID(*tlb, gva))
+ return KVM_MIPS_TLBINV;
+ if (write && !TLB_IS_DIRTY(*tlb, gva))
+ return KVM_MIPS_TLBMOD;
+
+ if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, gva, write))
+ return KVM_MIPS_GPA;
} else {
- kvm_err("%s: illegal address: %p\n", __func__, opc);
- return KVM_INVALID_INST;
+ return KVM_MIPS_GVA;
}
- return inst;
+ return KVM_MIPS_MAPPED;
+}
+
+int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out)
+{
+ int err;
+
+retry:
+ kvm_trap_emul_gva_lockless_begin(vcpu);
+ err = get_user(*out, opc);
+ kvm_trap_emul_gva_lockless_end(vcpu);
+
+ if (unlikely(err)) {
+ /*
+ * Try to handle the fault, maybe we just raced with a GVA
+ * invalidation.
+ */
+ err = kvm_trap_emul_gva_fault(vcpu, (unsigned long)opc,
+ false);
+ if (unlikely(err)) {
+ kvm_err("%s: illegal address: %p\n",
+ __func__, opc);
+ return -EFAULT;
+ }
+
+ /* Hopefully it'll work now */
+ goto retry;
+ }
+ return 0;
}
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index 254377d8e0b9..2819eb793345 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -33,28 +33,20 @@
#define KVM_GUEST_PC_TLB 0
#define KVM_GUEST_SP_TLB 1
-atomic_t kvm_mips_instance;
-EXPORT_SYMBOL_GPL(kvm_mips_instance);
-
static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
{
+ struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
int cpu = smp_processor_id();
- return vcpu->arch.guest_kernel_asid[cpu] &
- cpu_asid_mask(&cpu_data[cpu]);
+ return cpu_asid(cpu, kern_mm);
}
static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
{
+ struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
int cpu = smp_processor_id();
- return vcpu->arch.guest_user_asid[cpu] &
- cpu_asid_mask(&cpu_data[cpu]);
-}
-
-inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
-{
- return vcpu->kvm->arch.commpage_tlb;
+ return cpu_asid(cpu, user_mm);
}
/* Structure defining an tlb entry data set. */
@@ -104,109 +96,6 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs);
-/* XXXKYMA: Must be called with interrupts disabled */
-/* set flush_dcache_mask == 0 if no dcache flush required */
-int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
- unsigned long entrylo0, unsigned long entrylo1,
- int flush_dcache_mask)
-{
- unsigned long flags;
- unsigned long old_entryhi;
- int idx;
-
- local_irq_save(flags);
-
- old_entryhi = read_c0_entryhi();
- write_c0_entryhi(entryhi);
- mtc0_tlbw_hazard();
-
- tlb_probe();
- tlb_probe_hazard();
- idx = read_c0_index();
-
- if (idx > current_cpu_data.tlbsize) {
- kvm_err("%s: Invalid Index: %d\n", __func__, idx);
- kvm_mips_dump_host_tlbs();
- local_irq_restore(flags);
- return -1;
- }
-
- write_c0_entrylo0(entrylo0);
- write_c0_entrylo1(entrylo1);
- mtc0_tlbw_hazard();
-
- if (idx < 0)
- tlb_write_random();
- else
- tlb_write_indexed();
- tlbw_use_hazard();
-
- kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0(R): 0x%08lx, entrylo1(R): 0x%08lx\n",
- vcpu->arch.pc, idx, read_c0_entryhi(),
- read_c0_entrylo0(), read_c0_entrylo1());
-
- /* Flush D-cache */
- if (flush_dcache_mask) {
- if (entrylo0 & ENTRYLO_V) {
- ++vcpu->stat.flush_dcache_exits;
- flush_data_cache_page((entryhi & VPN2_MASK) &
- ~flush_dcache_mask);
- }
- if (entrylo1 & ENTRYLO_V) {
- ++vcpu->stat.flush_dcache_exits;
- flush_data_cache_page(((entryhi & VPN2_MASK) &
- ~flush_dcache_mask) |
- (0x1 << PAGE_SHIFT));
- }
- }
-
- /* Restore old ASID */
- write_c0_entryhi(old_entryhi);
- mtc0_tlbw_hazard();
- local_irq_restore(flags);
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_write);
-
-int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
- struct kvm_vcpu *vcpu)
-{
- kvm_pfn_t pfn;
- unsigned long flags, old_entryhi = 0, vaddr = 0;
- unsigned long entrylo[2] = { 0, 0 };
- unsigned int pair_idx;
-
- pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage));
- pair_idx = (badvaddr >> PAGE_SHIFT) & 1;
- entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) |
- ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
- ENTRYLO_D | ENTRYLO_V;
-
- local_irq_save(flags);
-
- old_entryhi = read_c0_entryhi();
- vaddr = badvaddr & (PAGE_MASK << 1);
- write_c0_entryhi(vaddr | kvm_mips_get_kernel_asid(vcpu));
- write_c0_entrylo0(entrylo[0]);
- write_c0_entrylo1(entrylo[1]);
- write_c0_index(kvm_mips_get_commpage_asid(vcpu));
- mtc0_tlbw_hazard();
- tlb_write_indexed();
- tlbw_use_hazard();
-
- kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0 (R): 0x%08lx, entrylo1(R): 0x%08lx\n",
- vcpu->arch.pc, read_c0_index(), read_c0_entryhi(),
- read_c0_entrylo0(), read_c0_entrylo1());
-
- /* Restore old ASID */
- write_c0_entryhi(old_entryhi);
- mtc0_tlbw_hazard();
- local_irq_restore(flags);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_handle_commpage_tlb_fault);
-
int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
{
int i;
@@ -228,51 +117,11 @@ int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
}
EXPORT_SYMBOL_GPL(kvm_mips_guest_tlb_lookup);
-int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr)
-{
- unsigned long old_entryhi, flags;
- int idx;
-
- local_irq_save(flags);
-
- old_entryhi = read_c0_entryhi();
-
- if (KVM_GUEST_KERNEL_MODE(vcpu))
- write_c0_entryhi((vaddr & VPN2_MASK) |
- kvm_mips_get_kernel_asid(vcpu));
- else {
- write_c0_entryhi((vaddr & VPN2_MASK) |
- kvm_mips_get_user_asid(vcpu));
- }
-
- mtc0_tlbw_hazard();
-
- tlb_probe();
- tlb_probe_hazard();
- idx = read_c0_index();
-
- /* Restore old ASID */
- write_c0_entryhi(old_entryhi);
- mtc0_tlbw_hazard();
-
- local_irq_restore(flags);
-
- kvm_debug("Host TLB lookup, %#lx, idx: %2d\n", vaddr, idx);
-
- return idx;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_lookup);
-
-int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
+static int _kvm_mips_host_tlb_inv(unsigned long entryhi)
{
int idx;
- unsigned long flags, old_entryhi;
-
- local_irq_save(flags);
-
- old_entryhi = read_c0_entryhi();
- write_c0_entryhi((va & VPN2_MASK) | kvm_mips_get_user_asid(vcpu));
+ write_c0_entryhi(entryhi);
mtc0_tlbw_hazard();
tlb_probe();
@@ -282,7 +131,7 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
if (idx >= current_cpu_data.tlbsize)
BUG();
- if (idx > 0) {
+ if (idx >= 0) {
write_c0_entryhi(UNIQUE_ENTRYHI(idx));
write_c0_entrylo0(0);
write_c0_entrylo1(0);
@@ -292,93 +141,75 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
tlbw_use_hazard();
}
- write_c0_entryhi(old_entryhi);
- mtc0_tlbw_hazard();
-
- local_irq_restore(flags);
-
- if (idx > 0)
- kvm_debug("%s: Invalidated entryhi %#lx @ idx %d\n", __func__,
- (va & VPN2_MASK) | kvm_mips_get_user_asid(vcpu), idx);
-
- return 0;
+ return idx;
}
-EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_inv);
-void kvm_mips_flush_host_tlb(int skip_kseg0)
+int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va,
+ bool user, bool kernel)
{
- unsigned long flags;
- unsigned long old_entryhi, entryhi;
- unsigned long old_pagemask;
- int entry = 0;
- int maxentry = current_cpu_data.tlbsize;
+ int idx_user, idx_kernel;
+ unsigned long flags, old_entryhi;
local_irq_save(flags);
old_entryhi = read_c0_entryhi();
- old_pagemask = read_c0_pagemask();
-
- /* Blast 'em all away. */
- for (entry = 0; entry < maxentry; entry++) {
- write_c0_index(entry);
-
- if (skip_kseg0) {
- mtc0_tlbr_hazard();
- tlb_read();
- tlb_read_hazard();
-
- entryhi = read_c0_entryhi();
- /* Don't blow away guest kernel entries */
- if (KVM_GUEST_KSEGX(entryhi) == KVM_GUEST_KSEG0)
- continue;
-
- write_c0_pagemask(old_pagemask);
- }
-
- /* Make sure all entries differ. */
- write_c0_entryhi(UNIQUE_ENTRYHI(entry));
- write_c0_entrylo0(0);
- write_c0_entrylo1(0);
- mtc0_tlbw_hazard();
-
- tlb_write_indexed();
- tlbw_use_hazard();
- }
+ if (user)
+ idx_user = _kvm_mips_host_tlb_inv((va & VPN2_MASK) |
+ kvm_mips_get_user_asid(vcpu));
+ if (kernel)
+ idx_kernel = _kvm_mips_host_tlb_inv((va & VPN2_MASK) |
+ kvm_mips_get_kernel_asid(vcpu));
write_c0_entryhi(old_entryhi);
- write_c0_pagemask(old_pagemask);
mtc0_tlbw_hazard();
local_irq_restore(flags);
+
+ if (user && idx_user >= 0)
+ kvm_debug("%s: Invalidated guest user entryhi %#lx @ idx %d\n",
+ __func__, (va & VPN2_MASK) |
+ kvm_mips_get_user_asid(vcpu), idx_user);
+ if (kernel && idx_kernel >= 0)
+ kvm_debug("%s: Invalidated guest kernel entryhi %#lx @ idx %d\n",
+ __func__, (va & VPN2_MASK) |
+ kvm_mips_get_kernel_asid(vcpu), idx_kernel);
+
+ return 0;
}
-EXPORT_SYMBOL_GPL(kvm_mips_flush_host_tlb);
+EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_inv);
-void kvm_local_flush_tlb_all(void)
+/**
+ * kvm_mips_suspend_mm() - Suspend the active mm.
+ * @cpu The CPU we're running on.
+ *
+ * Suspend the active_mm, ready for a switch to a KVM guest virtual address
+ * space. This is left active for the duration of guest context, including time
+ * with interrupts enabled, so we need to be careful not to confuse e.g. cache
+ * management IPIs.
+ *
+ * kvm_mips_resume_mm() should be called before context switching to a different
+ * process so we don't need to worry about reference counting.
+ *
+ * This needs to be in static kernel code to avoid exporting init_mm.
+ */
+void kvm_mips_suspend_mm(int cpu)
{
- unsigned long flags;
- unsigned long old_ctx;
- int entry = 0;
-
- local_irq_save(flags);
- /* Save old context and create impossible VPN2 value */
- old_ctx = read_c0_entryhi();
- write_c0_entrylo0(0);
- write_c0_entrylo1(0);
-
- /* Blast 'em all away. */
- while (entry < current_cpu_data.tlbsize) {
- /* Make sure all entries differ. */
- write_c0_entryhi(UNIQUE_ENTRYHI(entry));
- write_c0_index(entry);
- mtc0_tlbw_hazard();
- tlb_write_indexed();
- tlbw_use_hazard();
- entry++;
- }
- write_c0_entryhi(old_ctx);
- mtc0_tlbw_hazard();
+ cpumask_clear_cpu(cpu, mm_cpumask(current->active_mm));
+ current->active_mm = &init_mm;
+}
+EXPORT_SYMBOL_GPL(kvm_mips_suspend_mm);
- local_irq_restore(flags);
+/**
+ * kvm_mips_resume_mm() - Resume the current process mm.
+ * @cpu The CPU we're running on.
+ *
+ * Resume the mm of the current process, after a switch back from a KVM guest
+ * virtual address space (see kvm_mips_suspend_mm()).
+ */
+void kvm_mips_resume_mm(int cpu)
+{
+ cpumask_set_cpu(cpu, mm_cpumask(current->mm));
+ current->active_mm = current->mm;
}
-EXPORT_SYMBOL_GPL(kvm_local_flush_tlb_all);
+EXPORT_SYMBOL_GPL(kvm_mips_resume_mm);
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index 3b20441f2beb..b1fa53b252ea 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -11,9 +11,11 @@
#include <linux/errno.h>
#include <linux/err.h>
-#include <linux/vmalloc.h>
-
#include <linux/kvm_host.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <asm/mmu_context.h>
+#include <asm/pgalloc.h>
#include "interrupt.h"
@@ -21,9 +23,12 @@ static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva)
{
gpa_t gpa;
gva_t kseg = KSEGX(gva);
+ gva_t gkseg = KVM_GUEST_KSEGX(gva);
if ((kseg == CKSEG0) || (kseg == CKSEG1))
gpa = CPHYSADDR(gva);
+ else if (gkseg == KVM_GUEST_KSEG0)
+ gpa = KVM_GUEST_CPHYSADDR(gva);
else {
kvm_err("%s: cannot find GPA for GVA: %#lx\n", __func__, gva);
kvm_mips_dump_host_tlbs();
@@ -83,48 +88,134 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
return ret;
}
+static int kvm_mips_bad_load(u32 cause, u32 *opc, struct kvm_run *run,
+ struct kvm_vcpu *vcpu)
+{
+ enum emulation_result er;
+ union mips_instruction inst;
+ int err;
+
+ /* A code fetch fault doesn't count as an MMIO */
+ if (kvm_is_ifetch_fault(&vcpu->arch)) {
+ run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ return RESUME_HOST;
+ }
+
+ /* Fetch the instruction. */
+ if (cause & CAUSEF_BD)
+ opc += 1;
+ err = kvm_get_badinstr(opc, vcpu, &inst.word);
+ if (err) {
+ run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ return RESUME_HOST;
+ }
+
+ /* Emulate the load */
+ er = kvm_mips_emulate_load(inst, cause, run, vcpu);
+ if (er == EMULATE_FAIL) {
+ kvm_err("Emulate load from MMIO space failed\n");
+ run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ } else {
+ run->exit_reason = KVM_EXIT_MMIO;
+ }
+ return RESUME_HOST;
+}
+
+static int kvm_mips_bad_store(u32 cause, u32 *opc, struct kvm_run *run,
+ struct kvm_vcpu *vcpu)
+{
+ enum emulation_result er;
+ union mips_instruction inst;
+ int err;
+
+ /* Fetch the instruction. */
+ if (cause & CAUSEF_BD)
+ opc += 1;
+ err = kvm_get_badinstr(opc, vcpu, &inst.word);
+ if (err) {
+ run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ return RESUME_HOST;
+ }
+
+ /* Emulate the store */
+ er = kvm_mips_emulate_store(inst, cause, run, vcpu);
+ if (er == EMULATE_FAIL) {
+ kvm_err("Emulate store to MMIO space failed\n");
+ run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ } else {
+ run->exit_reason = KVM_EXIT_MMIO;
+ }
+ return RESUME_HOST;
+}
+
+static int kvm_mips_bad_access(u32 cause, u32 *opc, struct kvm_run *run,
+ struct kvm_vcpu *vcpu, bool store)
+{
+ if (store)
+ return kvm_mips_bad_store(cause, opc, run, vcpu);
+ else
+ return kvm_mips_bad_load(cause, opc, run, vcpu);
+}
+
static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
{
+ struct mips_coproc *cop0 = vcpu->arch.cop0;
struct kvm_run *run = vcpu->run;
u32 __user *opc = (u32 __user *) vcpu->arch.pc;
unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
u32 cause = vcpu->arch.host_cp0_cause;
- enum emulation_result er = EMULATE_DONE;
- int ret = RESUME_GUEST;
+ struct kvm_mips_tlb *tlb;
+ unsigned long entryhi;
+ int index;
if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
|| KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
- kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#x, PC: %p, BadVaddr: %#lx\n",
- cause, opc, badvaddr);
- er = kvm_mips_handle_tlbmod(cause, opc, run, vcpu);
+ /*
+ * First find the mapping in the guest TLB. If the failure to
+ * write was due to the guest TLB, it should be up to the guest
+ * to handle it.
+ */
+ entryhi = (badvaddr & VPN2_MASK) |
+ (kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID);
+ index = kvm_mips_guest_tlb_lookup(vcpu, entryhi);
- if (er == EMULATE_DONE)
- ret = RESUME_GUEST;
- else {
+ /*
+ * These should never happen.
+ * They would indicate stale host TLB entries.
+ */
+ if (unlikely(index < 0)) {
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- ret = RESUME_HOST;
+ return RESUME_HOST;
}
- } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) {
+ tlb = vcpu->arch.guest_tlb + index;
+ if (unlikely(!TLB_IS_VALID(*tlb, badvaddr))) {
+ run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ return RESUME_HOST;
+ }
+
/*
- * XXXKYMA: The guest kernel does not expect to get this fault
- * when we are not using HIGHMEM. Need to address this in a
- * HIGHMEM kernel
+ * Guest entry not dirty? That would explain the TLB modified
+ * exception. Relay that on to the guest so it can handle it.
*/
- kvm_err("TLB MOD fault not handled, cause %#x, PC: %p, BadVaddr: %#lx\n",
- cause, opc, badvaddr);
- kvm_mips_dump_host_tlbs();
- kvm_arch_vcpu_dump_regs(vcpu);
- run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- ret = RESUME_HOST;
+ if (!TLB_IS_DIRTY(*tlb, badvaddr)) {
+ kvm_mips_emulate_tlbmod(cause, opc, run, vcpu);
+ return RESUME_GUEST;
+ }
+
+ if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, badvaddr,
+ true))
+ /* Not writable, needs handling as MMIO */
+ return kvm_mips_bad_store(cause, opc, run, vcpu);
+ return RESUME_GUEST;
+ } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) {
+ if (kvm_mips_handle_kseg0_tlb_fault(badvaddr, vcpu, true) < 0)
+ /* Not writable, needs handling as MMIO */
+ return kvm_mips_bad_store(cause, opc, run, vcpu);
+ return RESUME_GUEST;
} else {
- kvm_err("Illegal TLB Mod fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
- cause, opc, badvaddr);
- kvm_mips_dump_host_tlbs();
- kvm_arch_vcpu_dump_regs(vcpu);
- run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- ret = RESUME_HOST;
+ /* host kernel addresses are all handled as MMIO */
+ return kvm_mips_bad_store(cause, opc, run, vcpu);
}
- return ret;
}
static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
@@ -157,7 +248,7 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
* into the shadow host TLB
*/
- er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu);
+ er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu, store);
if (er == EMULATE_DONE)
ret = RESUME_GUEST;
else {
@@ -169,29 +260,15 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
* All KSEG0 faults are handled by KVM, as the guest kernel does
* not expect to ever get them
*/
- if (kvm_mips_handle_kseg0_tlb_fault
- (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) {
- run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- ret = RESUME_HOST;
- }
+ if (kvm_mips_handle_kseg0_tlb_fault(badvaddr, vcpu, store) < 0)
+ ret = kvm_mips_bad_access(cause, opc, run, vcpu, store);
} else if (KVM_GUEST_KERNEL_MODE(vcpu)
&& (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) {
/*
* With EVA we may get a TLB exception instead of an address
* error when the guest performs MMIO to KSeg1 addresses.
*/
- kvm_debug("Emulate %s MMIO space\n",
- store ? "Store to" : "Load from");
- er = kvm_mips_emulate_inst(cause, opc, run, vcpu);
- if (er == EMULATE_FAIL) {
- kvm_err("Emulate %s MMIO space failed\n",
- store ? "Store to" : "Load from");
- run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- ret = RESUME_HOST;
- } else {
- run->exit_reason = KVM_EXIT_MMIO;
- ret = RESUME_HOST;
- }
+ ret = kvm_mips_bad_access(cause, opc, run, vcpu, store);
} else {
kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
store ? "ST" : "LD", cause, opc, badvaddr);
@@ -219,21 +296,11 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
u32 __user *opc = (u32 __user *) vcpu->arch.pc;
unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
u32 cause = vcpu->arch.host_cp0_cause;
- enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
if (KVM_GUEST_KERNEL_MODE(vcpu)
&& (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) {
- kvm_debug("Emulate Store to MMIO space\n");
- er = kvm_mips_emulate_inst(cause, opc, run, vcpu);
- if (er == EMULATE_FAIL) {
- kvm_err("Emulate Store to MMIO space failed\n");
- run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- ret = RESUME_HOST;
- } else {
- run->exit_reason = KVM_EXIT_MMIO;
- ret = RESUME_HOST;
- }
+ ret = kvm_mips_bad_store(cause, opc, run, vcpu);
} else {
kvm_err("Address Error (STORE): cause %#x, PC: %p, BadVaddr: %#lx\n",
cause, opc, badvaddr);
@@ -249,26 +316,15 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
u32 __user *opc = (u32 __user *) vcpu->arch.pc;
unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
u32 cause = vcpu->arch.host_cp0_cause;
- enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
if (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1) {
- kvm_debug("Emulate Load from MMIO space @ %#lx\n", badvaddr);
- er = kvm_mips_emulate_inst(cause, opc, run, vcpu);
- if (er == EMULATE_FAIL) {
- kvm_err("Emulate Load from MMIO space failed\n");
- run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- ret = RESUME_HOST;
- } else {
- run->exit_reason = KVM_EXIT_MMIO;
- ret = RESUME_HOST;
- }
+ ret = kvm_mips_bad_load(cause, opc, run, vcpu);
} else {
kvm_err("Address Error (LOAD): cause %#x, PC: %p, BadVaddr: %#lx\n",
cause, opc, badvaddr);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
ret = RESUME_HOST;
- er = EMULATE_FAIL;
}
return ret;
}
@@ -428,16 +484,75 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu)
return ret;
}
-static int kvm_trap_emul_vm_init(struct kvm *kvm)
+static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu)
{
+ struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
+ struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
+
+ /*
+ * Allocate GVA -> HPA page tables.
+ * MIPS doesn't use the mm_struct pointer argument.
+ */
+ kern_mm->pgd = pgd_alloc(kern_mm);
+ if (!kern_mm->pgd)
+ return -ENOMEM;
+
+ user_mm->pgd = pgd_alloc(user_mm);
+ if (!user_mm->pgd) {
+ pgd_free(kern_mm, kern_mm->pgd);
+ return -ENOMEM;
+ }
+
return 0;
}
-static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu)
+static void kvm_mips_emul_free_gva_pt(pgd_t *pgd)
{
- vcpu->arch.kscratch_enabled = 0xfc;
+ /* Don't free host kernel page tables copied from init_mm.pgd */
+ const unsigned long end = 0x80000000;
+ unsigned long pgd_va, pud_va, pmd_va;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+ int i, j, k;
+
+ for (i = 0; i < USER_PTRS_PER_PGD; i++) {
+ if (pgd_none(pgd[i]))
+ continue;
+
+ pgd_va = (unsigned long)i << PGDIR_SHIFT;
+ if (pgd_va >= end)
+ break;
+ pud = pud_offset(pgd + i, 0);
+ for (j = 0; j < PTRS_PER_PUD; j++) {
+ if (pud_none(pud[j]))
+ continue;
+
+ pud_va = pgd_va | ((unsigned long)j << PUD_SHIFT);
+ if (pud_va >= end)
+ break;
+ pmd = pmd_offset(pud + j, 0);
+ for (k = 0; k < PTRS_PER_PMD; k++) {
+ if (pmd_none(pmd[k]))
+ continue;
+
+ pmd_va = pud_va | (k << PMD_SHIFT);
+ if (pmd_va >= end)
+ break;
+ pte = pte_offset(pmd + k, 0);
+ pte_free_kernel(NULL, pte);
+ }
+ pmd_free(NULL, pmd);
+ }
+ pud_free(NULL, pud);
+ }
+ pgd_free(NULL, pgd);
+}
- return 0;
+static void kvm_trap_emul_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+ kvm_mips_emul_free_gva_pt(vcpu->arch.guest_kernel_mm.pgd);
+ kvm_mips_emul_free_gva_pt(vcpu->arch.guest_user_mm.pgd);
}
static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
@@ -499,6 +614,9 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
/* Set Wait IE/IXMT Ignore in Config7, IAR, AR */
kvm_write_c0_guest_config7(cop0, (MIPS_CONF7_WII) | (1 << 10));
+ /* Status */
+ kvm_write_c0_guest_status(cop0, ST0_BEV | ST0_ERL);
+
/*
* Setup IntCtl defaults, compatibility mode for timer interrupts (HW5)
*/
@@ -508,17 +626,76 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
kvm_write_c0_guest_ebase(cop0, KVM_GUEST_KSEG0 |
(vcpu_id & MIPS_EBASE_CPUNUM));
+ /* Put PC at guest reset vector */
+ vcpu->arch.pc = KVM_GUEST_CKSEG1ADDR(0x1fc00000);
+
return 0;
}
+static void kvm_trap_emul_flush_shadow_all(struct kvm *kvm)
+{
+ /* Flush GVA page tables and invalidate GVA ASIDs on all VCPUs */
+ kvm_flush_remote_tlbs(kvm);
+}
+
+static void kvm_trap_emul_flush_shadow_memslot(struct kvm *kvm,
+ const struct kvm_memory_slot *slot)
+{
+ kvm_trap_emul_flush_shadow_all(kvm);
+}
+
+static u64 kvm_trap_emul_get_one_regs[] = {
+ KVM_REG_MIPS_CP0_INDEX,
+ KVM_REG_MIPS_CP0_ENTRYLO0,
+ KVM_REG_MIPS_CP0_ENTRYLO1,
+ KVM_REG_MIPS_CP0_CONTEXT,
+ KVM_REG_MIPS_CP0_USERLOCAL,
+ KVM_REG_MIPS_CP0_PAGEMASK,
+ KVM_REG_MIPS_CP0_WIRED,
+ KVM_REG_MIPS_CP0_HWRENA,
+ KVM_REG_MIPS_CP0_BADVADDR,
+ KVM_REG_MIPS_CP0_COUNT,
+ KVM_REG_MIPS_CP0_ENTRYHI,
+ KVM_REG_MIPS_CP0_COMPARE,
+ KVM_REG_MIPS_CP0_STATUS,
+ KVM_REG_MIPS_CP0_INTCTL,
+ KVM_REG_MIPS_CP0_CAUSE,
+ KVM_REG_MIPS_CP0_EPC,
+ KVM_REG_MIPS_CP0_PRID,
+ KVM_REG_MIPS_CP0_EBASE,
+ KVM_REG_MIPS_CP0_CONFIG,
+ KVM_REG_MIPS_CP0_CONFIG1,
+ KVM_REG_MIPS_CP0_CONFIG2,
+ KVM_REG_MIPS_CP0_CONFIG3,
+ KVM_REG_MIPS_CP0_CONFIG4,
+ KVM_REG_MIPS_CP0_CONFIG5,
+ KVM_REG_MIPS_CP0_CONFIG7,
+ KVM_REG_MIPS_CP0_ERROREPC,
+ KVM_REG_MIPS_CP0_KSCRATCH1,
+ KVM_REG_MIPS_CP0_KSCRATCH2,
+ KVM_REG_MIPS_CP0_KSCRATCH3,
+ KVM_REG_MIPS_CP0_KSCRATCH4,
+ KVM_REG_MIPS_CP0_KSCRATCH5,
+ KVM_REG_MIPS_CP0_KSCRATCH6,
+
+ KVM_REG_MIPS_COUNT_CTL,
+ KVM_REG_MIPS_COUNT_RESUME,
+ KVM_REG_MIPS_COUNT_HZ,
+};
+
static unsigned long kvm_trap_emul_num_regs(struct kvm_vcpu *vcpu)
{
- return 0;
+ return ARRAY_SIZE(kvm_trap_emul_get_one_regs);
}
static int kvm_trap_emul_copy_reg_indices(struct kvm_vcpu *vcpu,
u64 __user *indices)
{
+ if (copy_to_user(indices, kvm_trap_emul_get_one_regs,
+ sizeof(kvm_trap_emul_get_one_regs)))
+ return -EFAULT;
+ indices += ARRAY_SIZE(kvm_trap_emul_get_one_regs);
+
return 0;
}
@@ -526,7 +703,81 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu,
const struct kvm_one_reg *reg,
s64 *v)
{
+ struct mips_coproc *cop0 = vcpu->arch.cop0;
+
switch (reg->id) {
+ case KVM_REG_MIPS_CP0_INDEX:
+ *v = (long)kvm_read_c0_guest_index(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_ENTRYLO0:
+ *v = kvm_read_c0_guest_entrylo0(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_ENTRYLO1:
+ *v = kvm_read_c0_guest_entrylo1(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_CONTEXT:
+ *v = (long)kvm_read_c0_guest_context(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_USERLOCAL:
+ *v = (long)kvm_read_c0_guest_userlocal(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_PAGEMASK:
+ *v = (long)kvm_read_c0_guest_pagemask(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_WIRED:
+ *v = (long)kvm_read_c0_guest_wired(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_HWRENA:
+ *v = (long)kvm_read_c0_guest_hwrena(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_BADVADDR:
+ *v = (long)kvm_read_c0_guest_badvaddr(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_ENTRYHI:
+ *v = (long)kvm_read_c0_guest_entryhi(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_COMPARE:
+ *v = (long)kvm_read_c0_guest_compare(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_STATUS:
+ *v = (long)kvm_read_c0_guest_status(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_INTCTL:
+ *v = (long)kvm_read_c0_guest_intctl(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_CAUSE:
+ *v = (long)kvm_read_c0_guest_cause(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_EPC:
+ *v = (long)kvm_read_c0_guest_epc(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_PRID:
+ *v = (long)kvm_read_c0_guest_prid(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_EBASE:
+ *v = (long)kvm_read_c0_guest_ebase(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_CONFIG:
+ *v = (long)kvm_read_c0_guest_config(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_CONFIG1:
+ *v = (long)kvm_read_c0_guest_config1(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_CONFIG2:
+ *v = (long)kvm_read_c0_guest_config2(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_CONFIG3:
+ *v = (long)kvm_read_c0_guest_config3(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_CONFIG4:
+ *v = (long)kvm_read_c0_guest_config4(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_CONFIG5:
+ *v = (long)kvm_read_c0_guest_config5(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_CONFIG7:
+ *v = (long)kvm_read_c0_guest_config7(cop0);
+ break;
case KVM_REG_MIPS_CP0_COUNT:
*v = kvm_mips_read_count(vcpu);
break;
@@ -539,6 +790,27 @@ static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu,
case KVM_REG_MIPS_COUNT_HZ:
*v = vcpu->arch.count_hz;
break;
+ case KVM_REG_MIPS_CP0_ERROREPC:
+ *v = (long)kvm_read_c0_guest_errorepc(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_KSCRATCH1:
+ *v = (long)kvm_read_c0_guest_kscratch1(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_KSCRATCH2:
+ *v = (long)kvm_read_c0_guest_kscratch2(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_KSCRATCH3:
+ *v = (long)kvm_read_c0_guest_kscratch3(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_KSCRATCH4:
+ *v = (long)kvm_read_c0_guest_kscratch4(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_KSCRATCH5:
+ *v = (long)kvm_read_c0_guest_kscratch5(cop0);
+ break;
+ case KVM_REG_MIPS_CP0_KSCRATCH6:
+ *v = (long)kvm_read_c0_guest_kscratch6(cop0);
+ break;
default:
return -EINVAL;
}
@@ -554,6 +826,56 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu,
unsigned int cur, change;
switch (reg->id) {
+ case KVM_REG_MIPS_CP0_INDEX:
+ kvm_write_c0_guest_index(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_ENTRYLO0:
+ kvm_write_c0_guest_entrylo0(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_ENTRYLO1:
+ kvm_write_c0_guest_entrylo1(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_CONTEXT:
+ kvm_write_c0_guest_context(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_USERLOCAL:
+ kvm_write_c0_guest_userlocal(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_PAGEMASK:
+ kvm_write_c0_guest_pagemask(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_WIRED:
+ kvm_write_c0_guest_wired(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_HWRENA:
+ kvm_write_c0_guest_hwrena(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_BADVADDR:
+ kvm_write_c0_guest_badvaddr(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_ENTRYHI:
+ kvm_write_c0_guest_entryhi(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_STATUS:
+ kvm_write_c0_guest_status(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_INTCTL:
+ /* No VInt, so no VS, read-only for now */
+ break;
+ case KVM_REG_MIPS_CP0_EPC:
+ kvm_write_c0_guest_epc(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_PRID:
+ kvm_write_c0_guest_prid(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_EBASE:
+ /*
+ * Allow core number to be written, but the exception base must
+ * remain in guest KSeg0.
+ */
+ kvm_change_c0_guest_ebase(cop0, 0x1ffff000 | MIPS_EBASE_CPUNUM,
+ v);
+ break;
case KVM_REG_MIPS_CP0_COUNT:
kvm_mips_write_count(vcpu, v);
break;
@@ -618,6 +940,9 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu,
kvm_write_c0_guest_config5(cop0, v);
}
break;
+ case KVM_REG_MIPS_CP0_CONFIG7:
+ /* writes ignored */
+ break;
case KVM_REG_MIPS_COUNT_CTL:
ret = kvm_mips_set_count_ctl(vcpu, v);
break;
@@ -627,24 +952,269 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu,
case KVM_REG_MIPS_COUNT_HZ:
ret = kvm_mips_set_count_hz(vcpu, v);
break;
+ case KVM_REG_MIPS_CP0_ERROREPC:
+ kvm_write_c0_guest_errorepc(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_KSCRATCH1:
+ kvm_write_c0_guest_kscratch1(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_KSCRATCH2:
+ kvm_write_c0_guest_kscratch2(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_KSCRATCH3:
+ kvm_write_c0_guest_kscratch3(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_KSCRATCH4:
+ kvm_write_c0_guest_kscratch4(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_KSCRATCH5:
+ kvm_write_c0_guest_kscratch5(cop0, v);
+ break;
+ case KVM_REG_MIPS_CP0_KSCRATCH6:
+ kvm_write_c0_guest_kscratch6(cop0, v);
+ break;
default:
return -EINVAL;
}
return ret;
}
-static int kvm_trap_emul_vcpu_get_regs(struct kvm_vcpu *vcpu)
+static int kvm_trap_emul_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
- kvm_lose_fpu(vcpu);
+ struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
+ struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
+ struct mm_struct *mm;
+
+ /*
+ * Were we in guest context? If so, restore the appropriate ASID based
+ * on the mode of the Guest (Kernel/User).
+ */
+ if (current->flags & PF_VCPU) {
+ mm = KVM_GUEST_KERNEL_MODE(vcpu) ? kern_mm : user_mm;
+ if ((cpu_context(cpu, mm) ^ asid_cache(cpu)) &
+ asid_version_mask(cpu))
+ get_new_mmu_context(mm, cpu);
+ write_c0_entryhi(cpu_asid(cpu, mm));
+ TLBMISS_HANDLER_SETUP_PGD(mm->pgd);
+ kvm_mips_suspend_mm(cpu);
+ ehb();
+ }
return 0;
}
-static int kvm_trap_emul_vcpu_set_regs(struct kvm_vcpu *vcpu)
+static int kvm_trap_emul_vcpu_put(struct kvm_vcpu *vcpu, int cpu)
{
+ kvm_lose_fpu(vcpu);
+
+ if (current->flags & PF_VCPU) {
+ /* Restore normal Linux process memory map */
+ if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
+ asid_version_mask(cpu)))
+ get_new_mmu_context(current->mm, cpu);
+ write_c0_entryhi(cpu_asid(cpu, current->mm));
+ TLBMISS_HANDLER_SETUP_PGD(current->mm->pgd);
+ kvm_mips_resume_mm(cpu);
+ ehb();
+ }
+
return 0;
}
+static void kvm_trap_emul_check_requests(struct kvm_vcpu *vcpu, int cpu,
+ bool reload_asid)
+{
+ struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
+ struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
+ struct mm_struct *mm;
+ int i;
+
+ if (likely(!vcpu->requests))
+ return;
+
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
+ /*
+ * Both kernel & user GVA mappings must be invalidated. The
+ * caller is just about to check whether the ASID is stale
+ * anyway so no need to reload it here.
+ */
+ kvm_mips_flush_gva_pt(kern_mm->pgd, KMF_GPA | KMF_KERN);
+ kvm_mips_flush_gva_pt(user_mm->pgd, KMF_GPA | KMF_USER);
+ for_each_possible_cpu(i) {
+ cpu_context(i, kern_mm) = 0;
+ cpu_context(i, user_mm) = 0;
+ }
+
+ /* Generate new ASID for current mode */
+ if (reload_asid) {
+ mm = KVM_GUEST_KERNEL_MODE(vcpu) ? kern_mm : user_mm;
+ get_new_mmu_context(mm, cpu);
+ htw_stop();
+ write_c0_entryhi(cpu_asid(cpu, mm));
+ TLBMISS_HANDLER_SETUP_PGD(mm->pgd);
+ htw_start();
+ }
+ }
+}
+
+/**
+ * kvm_trap_emul_gva_lockless_begin() - Begin lockless access to GVA space.
+ * @vcpu: VCPU pointer.
+ *
+ * Call before a GVA space access outside of guest mode, to ensure that
+ * asynchronous TLB flush requests are handled or delayed until completion of
+ * the GVA access (as indicated by a matching kvm_trap_emul_gva_lockless_end()).
+ *
+ * Should be called with IRQs already enabled.
+ */
+void kvm_trap_emul_gva_lockless_begin(struct kvm_vcpu *vcpu)
+{
+ /* We re-enable IRQs in kvm_trap_emul_gva_lockless_end() */
+ WARN_ON_ONCE(irqs_disabled());
+
+ /*
+ * The caller is about to access the GVA space, so we set the mode to
+ * force TLB flush requests to send an IPI, and also disable IRQs to
+ * delay IPI handling until kvm_trap_emul_gva_lockless_end().
+ */
+ local_irq_disable();
+
+ /*
+ * Make sure the read of VCPU requests is not reordered ahead of the
+ * write to vcpu->mode, or we could miss a TLB flush request while
+ * the requester sees the VCPU as outside of guest mode and not needing
+ * an IPI.
+ */
+ smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
+
+ /*
+ * If a TLB flush has been requested (potentially while
+ * OUTSIDE_GUEST_MODE and assumed immediately effective), perform it
+ * before accessing the GVA space, and be sure to reload the ASID if
+ * necessary as it'll be immediately used.
+ *
+ * TLB flush requests after this check will trigger an IPI due to the
+ * mode change above, which will be delayed due to IRQs disabled.
+ */
+ kvm_trap_emul_check_requests(vcpu, smp_processor_id(), true);
+}
+
+/**
+ * kvm_trap_emul_gva_lockless_end() - End lockless access to GVA space.
+ * @vcpu: VCPU pointer.
+ *
+ * Called after a GVA space access outside of guest mode. Should have a matching
+ * call to kvm_trap_emul_gva_lockless_begin().
+ */
+void kvm_trap_emul_gva_lockless_end(struct kvm_vcpu *vcpu)
+{
+ /*
+ * Make sure the write to vcpu->mode is not reordered in front of GVA
+ * accesses, or a TLB flush requester may not think it necessary to send
+ * an IPI.
+ */
+ smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
+
+ /*
+ * Now that the access to GVA space is complete, its safe for pending
+ * TLB flush request IPIs to be handled (which indicates completion).
+ */
+ local_irq_enable();
+}
+
+static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run,
+ struct kvm_vcpu *vcpu)
+{
+ struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm;
+ struct mm_struct *user_mm = &vcpu->arch.guest_user_mm;
+ struct mm_struct *mm;
+ struct mips_coproc *cop0 = vcpu->arch.cop0;
+ int i, cpu = smp_processor_id();
+ unsigned int gasid;
+
+ /*
+ * No need to reload ASID, IRQs are disabled already so there's no rush,
+ * and we'll check if we need to regenerate below anyway before
+ * re-entering the guest.
+ */
+ kvm_trap_emul_check_requests(vcpu, cpu, false);
+
+ if (KVM_GUEST_KERNEL_MODE(vcpu)) {
+ mm = kern_mm;
+ } else {
+ mm = user_mm;
+
+ /*
+ * Lazy host ASID regeneration / PT flush for guest user mode.
+ * If the guest ASID has changed since the last guest usermode
+ * execution, invalidate the stale TLB entries and flush GVA PT
+ * entries too.
+ */
+ gasid = kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID;
+ if (gasid != vcpu->arch.last_user_gasid) {
+ kvm_mips_flush_gva_pt(user_mm->pgd, KMF_USER);
+ for_each_possible_cpu(i)
+ cpu_context(i, user_mm) = 0;
+ vcpu->arch.last_user_gasid = gasid;
+ }
+ }
+
+ /*
+ * Check if ASID is stale. This may happen due to a TLB flush request or
+ * a lazy user MM invalidation.
+ */
+ if ((cpu_context(cpu, mm) ^ asid_cache(cpu)) &
+ asid_version_mask(cpu))
+ get_new_mmu_context(mm, cpu);
+}
+
+static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+ int cpu = smp_processor_id();
+ int r;
+
+ /* Check if we have any exceptions/interrupts pending */
+ kvm_mips_deliver_interrupts(vcpu,
+ kvm_read_c0_guest_cause(vcpu->arch.cop0));
+
+ kvm_trap_emul_vcpu_reenter(run, vcpu);
+
+ /*
+ * We use user accessors to access guest memory, but we don't want to
+ * invoke Linux page faulting.
+ */
+ pagefault_disable();
+
+ /* Disable hardware page table walking while in guest */
+ htw_stop();
+
+ /*
+ * While in guest context we're in the guest's address space, not the
+ * host process address space, so we need to be careful not to confuse
+ * e.g. cache management IPIs.
+ */
+ kvm_mips_suspend_mm(cpu);
+
+ r = vcpu->arch.vcpu_run(run, vcpu);
+
+ /* We may have migrated while handling guest exits */
+ cpu = smp_processor_id();
+
+ /* Restore normal Linux process memory map */
+ if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
+ asid_version_mask(cpu)))
+ get_new_mmu_context(current->mm, cpu);
+ write_c0_entryhi(cpu_asid(cpu, current->mm));
+ TLBMISS_HANDLER_SETUP_PGD(current->mm->pgd);
+ kvm_mips_resume_mm(cpu);
+
+ htw_start();
+
+ pagefault_enable();
+
+ return r;
+}
+
static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
/* exit handlers */
.handle_cop_unusable = kvm_trap_emul_handle_cop_unusable,
@@ -661,9 +1231,11 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
.handle_fpe = kvm_trap_emul_handle_fpe,
.handle_msa_disabled = kvm_trap_emul_handle_msa_disabled,
- .vm_init = kvm_trap_emul_vm_init,
.vcpu_init = kvm_trap_emul_vcpu_init,
+ .vcpu_uninit = kvm_trap_emul_vcpu_uninit,
.vcpu_setup = kvm_trap_emul_vcpu_setup,
+ .flush_shadow_all = kvm_trap_emul_flush_shadow_all,
+ .flush_shadow_memslot = kvm_trap_emul_flush_shadow_memslot,
.gva_to_gpa = kvm_trap_emul_gva_to_gpa_cb,
.queue_timer_int = kvm_mips_queue_timer_int_cb,
.dequeue_timer_int = kvm_mips_dequeue_timer_int_cb,
@@ -675,8 +1247,10 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
.copy_reg_indices = kvm_trap_emul_copy_reg_indices,
.get_one_reg = kvm_trap_emul_get_one_reg,
.set_one_reg = kvm_trap_emul_set_one_reg,
- .vcpu_get_regs = kvm_trap_emul_vcpu_get_regs,
- .vcpu_set_regs = kvm_trap_emul_vcpu_set_regs,
+ .vcpu_load = kvm_trap_emul_vcpu_load,
+ .vcpu_put = kvm_trap_emul_vcpu_put,
+ .vcpu_run = kvm_trap_emul_vcpu_run,
+ .vcpu_reenter = kvm_trap_emul_vcpu_reenter,
};
int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks)
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 0db010cc4e65..d9b48f5bb606 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -22,6 +22,10 @@
#include <asm/book3s/64/mmu-hash.h>
+/* Power architecture requires HPT is at least 256kiB, at most 64TiB */
+#define PPC_MIN_HPT_ORDER 18
+#define PPC_MAX_HPT_ORDER 46
+
#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
static inline struct kvmppc_book3s_shadow_vcpu *svcpu_get(struct kvm_vcpu *vcpu)
{
@@ -356,6 +360,18 @@ extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
extern void kvmhv_rm_send_ipi(int cpu);
+static inline unsigned long kvmppc_hpt_npte(struct kvm_hpt_info *hpt)
+{
+ /* HPTEs are 2**4 bytes long */
+ return 1UL << (hpt->order - 4);
+}
+
+static inline unsigned long kvmppc_hpt_mask(struct kvm_hpt_info *hpt)
+{
+ /* 128 (2**7) bytes in each HPTEG */
+ return (1UL << (hpt->order - 7)) - 1;
+}
+
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index b2dbeac3f450..7bba8f415627 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -241,12 +241,24 @@ struct kvm_arch_memory_slot {
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
};
+struct kvm_hpt_info {
+ /* Host virtual (linear mapping) address of guest HPT */
+ unsigned long virt;
+ /* Array of reverse mapping entries for each guest HPTE */
+ struct revmap_entry *rev;
+ /* Guest HPT size is 2**(order) bytes */
+ u32 order;
+ /* 1 if HPT allocated with CMA, 0 otherwise */
+ int cma;
+};
+
+struct kvm_resize_hpt;
+
struct kvm_arch {
unsigned int lpid;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
unsigned int tlb_sets;
- unsigned long hpt_virt;
- struct revmap_entry *revmap;
+ struct kvm_hpt_info hpt;
atomic64_t mmio_update;
unsigned int host_lpid;
unsigned long host_lpcr;
@@ -256,20 +268,17 @@ struct kvm_arch {
unsigned long lpcr;
unsigned long vrma_slb_v;
int hpte_setup_done;
- u32 hpt_order;
atomic_t vcpus_running;
u32 online_vcores;
- unsigned long hpt_npte;
- unsigned long hpt_mask;
atomic_t hpte_mod_interest;
cpumask_t need_tlb_flush;
cpumask_t cpu_in_guest;
- int hpt_cma_alloc;
u8 radix;
pgd_t *pgtable;
u64 process_table;
struct dentry *debugfs_dir;
struct dentry *htab_dentry;
+ struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
struct mutex hpt_mutex;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 48c760f89590..dd11c4c8c56a 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -155,9 +155,10 @@ extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
extern int kvmppc_kvm_pv(struct kvm_vcpu *vcpu);
extern void kvmppc_map_magic(struct kvm_vcpu *vcpu);
-extern long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp);
-extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp);
-extern void kvmppc_free_hpt(struct kvm *kvm);
+extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order);
+extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info);
+extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order);
+extern void kvmppc_free_hpt(struct kvm_hpt_info *info);
extern long kvmppc_prepare_vrma(struct kvm *kvm,
struct kvm_userspace_memory_region *mem);
extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
@@ -186,8 +187,8 @@ extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
unsigned long tce_value, unsigned long npages);
extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
unsigned long ioba);
-extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
-extern void kvm_release_hpt(struct page *page, unsigned long nr_pages);
+extern struct page *kvm_alloc_hpt_cma(unsigned long nr_pages);
+extern void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages);
extern int kvmppc_core_init_vm(struct kvm *kvm);
extern void kvmppc_core_destroy_vm(struct kvm *kvm);
extern void kvmppc_core_free_memslot(struct kvm *kvm,
@@ -214,6 +215,10 @@ extern void kvmppc_bookehv_exit(void);
extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu);
extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *);
+extern long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
+ struct kvm_ppc_resize_hpt *rhpt);
+extern long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
+ struct kvm_ppc_resize_hpt *rhpt);
int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index cc0908b6c2a0..4edbe4bb0e8b 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -633,5 +633,7 @@ struct kvm_ppc_rmmu_info {
#define KVM_XICS_LEVEL_SENSITIVE (1ULL << 40)
#define KVM_XICS_MASKED (1ULL << 41)
#define KVM_XICS_PENDING (1ULL << 42)
+#define KVM_XICS_PRESENTED (1ULL << 43)
+#define KVM_XICS_QUEUED (1ULL << 44)
#endif /* __LINUX_KVM_POWERPC_H */
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index a2eb6d354a57..1992676c7a94 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -224,7 +224,8 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr,
ptem = kvmppc_mmu_book3s_32_get_ptem(sre, eaddr, primary);
if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) {
- printk(KERN_ERR "KVM: Can't copy data from 0x%lx!\n", ptegp);
+ printk_ratelimited(KERN_ERR
+ "KVM: Can't copy data from 0x%lx!\n", ptegp);
goto no_page_found;
}
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index b9131aa1aedf..70153578131a 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -265,7 +265,8 @@ do_second:
goto no_page_found;
if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) {
- printk(KERN_ERR "KVM can't copy data from 0x%lx!\n", ptegp);
+ printk_ratelimited(KERN_ERR
+ "KVM: Can't copy data from 0x%lx!\n", ptegp);
goto no_page_found;
}
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 9df3d940acec..f3158fb16de3 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -40,84 +40,101 @@
#include "trace_hv.h"
-/* Power architecture requires HPT is at least 256kB */
-#define PPC_MIN_HPT_ORDER 18
+//#define DEBUG_RESIZE_HPT 1
+
+#ifdef DEBUG_RESIZE_HPT
+#define resize_hpt_debug(resize, ...) \
+ do { \
+ printk(KERN_DEBUG "RESIZE HPT %p: ", resize); \
+ printk(__VA_ARGS__); \
+ } while (0)
+#else
+#define resize_hpt_debug(resize, ...) \
+ do { } while (0)
+#endif
static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
long pte_index, unsigned long pteh,
unsigned long ptel, unsigned long *pte_idx_ret);
+
+struct kvm_resize_hpt {
+ /* These fields read-only after init */
+ struct kvm *kvm;
+ struct work_struct work;
+ u32 order;
+
+ /* These fields protected by kvm->lock */
+ int error;
+ bool prepare_done;
+
+ /* Private to the work thread, until prepare_done is true,
+ * then protected by kvm->resize_hpt_sem */
+ struct kvm_hpt_info hpt;
+};
+
static void kvmppc_rmap_reset(struct kvm *kvm);
-long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
+int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
{
unsigned long hpt = 0;
- struct revmap_entry *rev;
+ int cma = 0;
struct page *page = NULL;
- long order = KVM_DEFAULT_HPT_ORDER;
+ struct revmap_entry *rev;
+ unsigned long npte;
- if (htab_orderp) {
- order = *htab_orderp;
- if (order < PPC_MIN_HPT_ORDER)
- order = PPC_MIN_HPT_ORDER;
- }
+ if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER))
+ return -EINVAL;
- kvm->arch.hpt_cma_alloc = 0;
- page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT));
+ page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
if (page) {
hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
memset((void *)hpt, 0, (1ul << order));
- kvm->arch.hpt_cma_alloc = 1;
+ cma = 1;
}
- /* Lastly try successively smaller sizes from the page allocator */
- /* Only do this if userspace didn't specify a size via ioctl */
- while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) {
- hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
- __GFP_NOWARN, order - PAGE_SHIFT);
- if (!hpt)
- --order;
- }
+ if (!hpt)
+ hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT
+ |__GFP_NOWARN, order - PAGE_SHIFT);
if (!hpt)
return -ENOMEM;
- kvm->arch.hpt_virt = hpt;
- kvm->arch.hpt_order = order;
/* HPTEs are 2**4 bytes long */
- kvm->arch.hpt_npte = 1ul << (order - 4);
- /* 128 (2**7) bytes in each HPTEG */
- kvm->arch.hpt_mask = (1ul << (order - 7)) - 1;
-
- atomic64_set(&kvm->arch.mmio_update, 0);
+ npte = 1ul << (order - 4);
/* Allocate reverse map array */
- rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte);
+ rev = vmalloc(sizeof(struct revmap_entry) * npte);
if (!rev) {
- pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n");
- goto out_freehpt;
+ pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n");
+ if (cma)
+ kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
+ else
+ free_pages(hpt, order - PAGE_SHIFT);
+ return -ENOMEM;
}
- kvm->arch.revmap = rev;
- kvm->arch.sdr1 = __pa(hpt) | (order - 18);
- pr_info("KVM guest htab at %lx (order %ld), LPID %x\n",
- hpt, order, kvm->arch.lpid);
+ info->order = order;
+ info->virt = hpt;
+ info->cma = cma;
+ info->rev = rev;
- if (htab_orderp)
- *htab_orderp = order;
return 0;
+}
- out_freehpt:
- if (kvm->arch.hpt_cma_alloc)
- kvm_release_hpt(page, 1 << (order - PAGE_SHIFT));
- else
- free_pages(hpt, order - PAGE_SHIFT);
- return -ENOMEM;
+void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info)
+{
+ atomic64_set(&kvm->arch.mmio_update, 0);
+ kvm->arch.hpt = *info;
+ kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18);
+
+ pr_debug("KVM guest htab at %lx (order %ld), LPID %x\n",
+ info->virt, (long)info->order, kvm->arch.lpid);
}
-long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
+long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
{
long err = -EBUSY;
- long order;
+ struct kvm_hpt_info info;
if (kvm_is_radix(kvm))
return -EINVAL;
@@ -132,36 +149,44 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
goto out;
}
}
- if (kvm->arch.hpt_virt) {
- order = kvm->arch.hpt_order;
+ if (kvm->arch.hpt.order == order) {
+ /* We already have a suitable HPT */
+
/* Set the entire HPT to 0, i.e. invalid HPTEs */
- memset((void *)kvm->arch.hpt_virt, 0, 1ul << order);
+ memset((void *)kvm->arch.hpt.virt, 0, 1ul << order);
/*
* Reset all the reverse-mapping chains for all memslots
*/
kvmppc_rmap_reset(kvm);
/* Ensure that each vcpu will flush its TLB on next entry. */
cpumask_setall(&kvm->arch.need_tlb_flush);
- *htab_orderp = order;
err = 0;
- } else {
- err = kvmppc_alloc_hpt(kvm, htab_orderp);
- order = *htab_orderp;
+ goto out;
}
- out:
+
+ if (kvm->arch.hpt.virt)
+ kvmppc_free_hpt(&kvm->arch.hpt);
+
+ err = kvmppc_allocate_hpt(&info, order);
+ if (err < 0)
+ goto out;
+ kvmppc_set_hpt(kvm, &info);
+
+out:
mutex_unlock(&kvm->lock);
return err;
}
-void kvmppc_free_hpt(struct kvm *kvm)
+void kvmppc_free_hpt(struct kvm_hpt_info *info)
{
- vfree(kvm->arch.revmap);
- if (kvm->arch.hpt_cma_alloc)
- kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt),
- 1 << (kvm->arch.hpt_order - PAGE_SHIFT));
- else if (kvm->arch.hpt_virt)
- free_pages(kvm->arch.hpt_virt,
- kvm->arch.hpt_order - PAGE_SHIFT);
+ vfree(info->rev);
+ if (info->cma)
+ kvm_free_hpt_cma(virt_to_page(info->virt),
+ 1 << (info->order - PAGE_SHIFT));
+ else if (info->virt)
+ free_pages(info->virt, info->order - PAGE_SHIFT);
+ info->virt = 0;
+ info->order = 0;
}
/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
@@ -196,8 +221,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
if (npages > 1ul << (40 - porder))
npages = 1ul << (40 - porder);
/* Can't use more than 1 HPTE per HPTEG */
- if (npages > kvm->arch.hpt_mask + 1)
- npages = kvm->arch.hpt_mask + 1;
+ if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1)
+ npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1;
hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
@@ -207,7 +232,8 @@ void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
for (i = 0; i < npages; ++i) {
addr = i << porder;
/* can't use hpt_hash since va > 64 bits */
- hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask;
+ hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25)))
+ & kvmppc_hpt_mask(&kvm->arch.hpt);
/*
* We assume that the hash table is empty and no
* vcpus are using it at this stage. Since we create
@@ -340,11 +366,11 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
preempt_enable();
return -ENOENT;
}
- hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
+ hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
if (cpu_has_feature(CPU_FTR_ARCH_300))
v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1]));
- gr = kvm->arch.revmap[index].guest_rpte;
+ gr = kvm->arch.hpt.rev[index].guest_rpte;
unlock_hpte(hptep, orig_v);
preempt_enable();
@@ -485,8 +511,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
}
}
index = vcpu->arch.pgfault_index;
- hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
- rev = &kvm->arch.revmap[index];
+ hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
+ rev = &kvm->arch.hpt.rev[index];
preempt_disable();
while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
cpu_relax();
@@ -745,13 +771,53 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
return kvm_handle_hva_range(kvm, hva, hva + 1, handler);
}
+/* Must be called with both HPTE and rmap locked */
+static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
+ unsigned long *rmapp, unsigned long gfn)
+{
+ __be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
+ struct revmap_entry *rev = kvm->arch.hpt.rev;
+ unsigned long j, h;
+ unsigned long ptel, psize, rcbits;
+
+ j = rev[i].forw;
+ if (j == i) {
+ /* chain is now empty */
+ *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
+ } else {
+ /* remove i from chain */
+ h = rev[i].back;
+ rev[h].forw = j;
+ rev[j].back = h;
+ rev[i].forw = rev[i].back = i;
+ *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
+ }
+
+ /* Now check and modify the HPTE */
+ ptel = rev[i].guest_rpte;
+ psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
+ if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
+ hpte_rpn(ptel, psize) == gfn) {
+ hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
+ kvmppc_invalidate_hpte(kvm, hptep, i);
+ hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
+ /* Harvest R and C */
+ rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
+ *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+ if (rcbits & HPTE_R_C)
+ kvmppc_update_rmap_change(rmapp, psize);
+ if (rcbits & ~rev[i].guest_rpte) {
+ rev[i].guest_rpte = ptel | rcbits;
+ note_hpte_modification(kvm, &rev[i]);
+ }
+ }
+}
+
static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn)
{
- struct revmap_entry *rev = kvm->arch.revmap;
- unsigned long h, i, j;
+ unsigned long i;
__be64 *hptep;
- unsigned long ptel, psize, rcbits;
unsigned long *rmapp;
rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
@@ -768,7 +834,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
* rmap chain lock.
*/
i = *rmapp & KVMPPC_RMAP_INDEX;
- hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
+ hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) {
/* unlock rmap before spinning on the HPTE lock */
unlock_rmap(rmapp);
@@ -776,37 +842,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
cpu_relax();
continue;
}
- j = rev[i].forw;
- if (j == i) {
- /* chain is now empty */
- *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX);
- } else {
- /* remove i from chain */
- h = rev[i].back;
- rev[h].forw = j;
- rev[j].back = h;
- rev[i].forw = rev[i].back = i;
- *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j;
- }
- /* Now check and modify the HPTE */
- ptel = rev[i].guest_rpte;
- psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
- if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
- hpte_rpn(ptel, psize) == gfn) {
- hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
- kvmppc_invalidate_hpte(kvm, hptep, i);
- hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO);
- /* Harvest R and C */
- rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
- *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
- if (rcbits & HPTE_R_C)
- kvmppc_update_rmap_change(rmapp, psize);
- if (rcbits & ~rev[i].guest_rpte) {
- rev[i].guest_rpte = ptel | rcbits;
- note_hpte_modification(kvm, &rev[i]);
- }
- }
+ kvmppc_unmap_hpte(kvm, i, rmapp, gfn);
unlock_rmap(rmapp);
__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
}
@@ -860,7 +897,7 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm,
static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn)
{
- struct revmap_entry *rev = kvm->arch.revmap;
+ struct revmap_entry *rev = kvm->arch.hpt.rev;
unsigned long head, i, j;
__be64 *hptep;
int ret = 0;
@@ -880,7 +917,7 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
i = head = *rmapp & KVMPPC_RMAP_INDEX;
do {
- hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
+ hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
j = rev[i].forw;
/* If this HPTE isn't referenced, ignore it */
@@ -923,7 +960,7 @@ int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end)
static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn)
{
- struct revmap_entry *rev = kvm->arch.revmap;
+ struct revmap_entry *rev = kvm->arch.hpt.rev;
unsigned long head, i, j;
unsigned long *hp;
int ret = 1;
@@ -940,7 +977,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
if (*rmapp & KVMPPC_RMAP_PRESENT) {
i = head = *rmapp & KVMPPC_RMAP_INDEX;
do {
- hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4));
+ hp = (unsigned long *)(kvm->arch.hpt.virt + (i << 4));
j = rev[i].forw;
if (be64_to_cpu(hp[1]) & HPTE_R_R)
goto out;
@@ -980,7 +1017,7 @@ static int vcpus_running(struct kvm *kvm)
*/
static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
{
- struct revmap_entry *rev = kvm->arch.revmap;
+ struct revmap_entry *rev = kvm->arch.hpt.rev;
unsigned long head, i, j;
unsigned long n;
unsigned long v, r;
@@ -1005,7 +1042,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
i = head = *rmapp & KVMPPC_RMAP_INDEX;
do {
unsigned long hptep1;
- hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4));
+ hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
j = rev[i].forw;
/*
@@ -1172,6 +1209,363 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
}
/*
+ * HPT resizing
+ */
+static int resize_hpt_allocate(struct kvm_resize_hpt *resize)
+{
+ int rc;
+
+ rc = kvmppc_allocate_hpt(&resize->hpt, resize->order);
+ if (rc < 0)
+ return rc;
+
+ resize_hpt_debug(resize, "resize_hpt_allocate(): HPT @ 0x%lx\n",
+ resize->hpt.virt);
+
+ return 0;
+}
+
+static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
+ unsigned long idx)
+{
+ struct kvm *kvm = resize->kvm;
+ struct kvm_hpt_info *old = &kvm->arch.hpt;
+ struct kvm_hpt_info *new = &resize->hpt;
+ unsigned long old_hash_mask = (1ULL << (old->order - 7)) - 1;
+ unsigned long new_hash_mask = (1ULL << (new->order - 7)) - 1;
+ __be64 *hptep, *new_hptep;
+ unsigned long vpte, rpte, guest_rpte;
+ int ret;
+ struct revmap_entry *rev;
+ unsigned long apsize, psize, avpn, pteg, hash;
+ unsigned long new_idx, new_pteg, replace_vpte;
+
+ hptep = (__be64 *)(old->virt + (idx << 4));
+
+ /* Guest is stopped, so new HPTEs can't be added or faulted
+ * in, only unmapped or altered by host actions. So, it's
+ * safe to check this before we take the HPTE lock */
+ vpte = be64_to_cpu(hptep[0]);
+ if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
+ return 0; /* nothing to do */
+
+ while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
+ cpu_relax();
+
+ vpte = be64_to_cpu(hptep[0]);
+
+ ret = 0;
+ if (!(vpte & HPTE_V_VALID) && !(vpte & HPTE_V_ABSENT))
+ /* Nothing to do */
+ goto out;
+
+ /* Unmap */
+ rev = &old->rev[idx];
+ guest_rpte = rev->guest_rpte;
+
+ ret = -EIO;
+ apsize = hpte_page_size(vpte, guest_rpte);
+ if (!apsize)
+ goto out;
+
+ if (vpte & HPTE_V_VALID) {
+ unsigned long gfn = hpte_rpn(guest_rpte, apsize);
+ int srcu_idx = srcu_read_lock(&kvm->srcu);
+ struct kvm_memory_slot *memslot =
+ __gfn_to_memslot(kvm_memslots(kvm), gfn);
+
+ if (memslot) {
+ unsigned long *rmapp;
+ rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
+
+ lock_rmap(rmapp);
+ kvmppc_unmap_hpte(kvm, idx, rmapp, gfn);
+ unlock_rmap(rmapp);
+ }
+
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+ }
+
+ /* Reload PTE after unmap */
+ vpte = be64_to_cpu(hptep[0]);
+
+ BUG_ON(vpte & HPTE_V_VALID);
+ BUG_ON(!(vpte & HPTE_V_ABSENT));
+
+ ret = 0;
+ if (!(vpte & HPTE_V_BOLTED))
+ goto out;
+
+ rpte = be64_to_cpu(hptep[1]);
+ psize = hpte_base_page_size(vpte, rpte);
+ avpn = HPTE_V_AVPN_VAL(vpte) & ~((psize - 1) >> 23);
+ pteg = idx / HPTES_PER_GROUP;
+ if (vpte & HPTE_V_SECONDARY)
+ pteg = ~pteg;
+
+ if (!(vpte & HPTE_V_1TB_SEG)) {
+ unsigned long offset, vsid;
+
+ /* We only have 28 - 23 bits of offset in avpn */
+ offset = (avpn & 0x1f) << 23;
+ vsid = avpn >> 5;
+ /* We can find more bits from the pteg value */
+ if (psize < (1ULL << 23))
+ offset |= ((vsid ^ pteg) & old_hash_mask) * psize;
+
+ hash = vsid ^ (offset / psize);
+ } else {
+ unsigned long offset, vsid;
+
+ /* We only have 40 - 23 bits of seg_off in avpn */
+ offset = (avpn & 0x1ffff) << 23;
+ vsid = avpn >> 17;
+ if (psize < (1ULL << 23))
+ offset |= ((vsid ^ (vsid << 25) ^ pteg) & old_hash_mask) * psize;
+
+ hash = vsid ^ (vsid << 25) ^ (offset / psize);
+ }
+
+ new_pteg = hash & new_hash_mask;
+ if (vpte & HPTE_V_SECONDARY) {
+ BUG_ON(~pteg != (hash & old_hash_mask));
+ new_pteg = ~new_pteg;
+ } else {
+ BUG_ON(pteg != (hash & old_hash_mask));
+ }
+
+ new_idx = new_pteg * HPTES_PER_GROUP + (idx % HPTES_PER_GROUP);
+ new_hptep = (__be64 *)(new->virt + (new_idx << 4));
+
+ replace_vpte = be64_to_cpu(new_hptep[0]);
+
+ if (replace_vpte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
+ BUG_ON(new->order >= old->order);
+
+ if (replace_vpte & HPTE_V_BOLTED) {
+ if (vpte & HPTE_V_BOLTED)
+ /* Bolted collision, nothing we can do */
+ ret = -ENOSPC;
+ /* Discard the new HPTE */
+ goto out;
+ }
+
+ /* Discard the previous HPTE */
+ }
+
+ new_hptep[1] = cpu_to_be64(rpte);
+ new->rev[new_idx].guest_rpte = guest_rpte;
+ /* No need for a barrier, since new HPT isn't active */
+ new_hptep[0] = cpu_to_be64(vpte);
+ unlock_hpte(new_hptep, vpte);
+
+out:
+ unlock_hpte(hptep, vpte);
+ return ret;
+}
+
+static int resize_hpt_rehash(struct kvm_resize_hpt *resize)
+{
+ struct kvm *kvm = resize->kvm;
+ unsigned long i;
+ int rc;
+
+ /*
+ * resize_hpt_rehash_hpte() doesn't handle the new-format HPTEs
+ * that POWER9 uses, and could well hit a BUG_ON on POWER9.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ return -EIO;
+ for (i = 0; i < kvmppc_hpt_npte(&kvm->arch.hpt); i++) {
+ rc = resize_hpt_rehash_hpte(resize, i);
+ if (rc != 0)
+ return rc;
+ }
+
+ return 0;
+}
+
+static void resize_hpt_pivot(struct kvm_resize_hpt *resize)
+{
+ struct kvm *kvm = resize->kvm;
+ struct kvm_hpt_info hpt_tmp;
+
+ /* Exchange the pending tables in the resize structure with
+ * the active tables */
+
+ resize_hpt_debug(resize, "resize_hpt_pivot()\n");
+
+ spin_lock(&kvm->mmu_lock);
+ asm volatile("ptesync" : : : "memory");
+
+ hpt_tmp = kvm->arch.hpt;
+ kvmppc_set_hpt(kvm, &resize->hpt);
+ resize->hpt = hpt_tmp;
+
+ spin_unlock(&kvm->mmu_lock);
+
+ synchronize_srcu_expedited(&kvm->srcu);
+
+ resize_hpt_debug(resize, "resize_hpt_pivot() done\n");
+}
+
+static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
+{
+ BUG_ON(kvm->arch.resize_hpt != resize);
+
+ if (!resize)
+ return;
+
+ if (resize->hpt.virt)
+ kvmppc_free_hpt(&resize->hpt);
+
+ kvm->arch.resize_hpt = NULL;
+ kfree(resize);
+}
+
+static void resize_hpt_prepare_work(struct work_struct *work)
+{
+ struct kvm_resize_hpt *resize = container_of(work,
+ struct kvm_resize_hpt,
+ work);
+ struct kvm *kvm = resize->kvm;
+ int err;
+
+ resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
+ resize->order);
+
+ err = resize_hpt_allocate(resize);
+
+ mutex_lock(&kvm->lock);
+
+ resize->error = err;
+ resize->prepare_done = true;
+
+ mutex_unlock(&kvm->lock);
+}
+
+long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
+ struct kvm_ppc_resize_hpt *rhpt)
+{
+ unsigned long flags = rhpt->flags;
+ unsigned long shift = rhpt->shift;
+ struct kvm_resize_hpt *resize;
+ int ret;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ if (shift && ((shift < 18) || (shift > 46)))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+
+ resize = kvm->arch.resize_hpt;
+
+ if (resize) {
+ if (resize->order == shift) {
+ /* Suitable resize in progress */
+ if (resize->prepare_done) {
+ ret = resize->error;
+ if (ret != 0)
+ resize_hpt_release(kvm, resize);
+ } else {
+ ret = 100; /* estimated time in ms */
+ }
+
+ goto out;
+ }
+
+ /* not suitable, cancel it */
+ resize_hpt_release(kvm, resize);
+ }
+
+ ret = 0;
+ if (!shift)
+ goto out; /* nothing to do */
+
+ /* start new resize */
+
+ resize = kzalloc(sizeof(*resize), GFP_KERNEL);
+ resize->order = shift;
+ resize->kvm = kvm;
+ INIT_WORK(&resize->work, resize_hpt_prepare_work);
+ kvm->arch.resize_hpt = resize;
+
+ schedule_work(&resize->work);
+
+ ret = 100; /* estimated time in ms */
+
+out:
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+static void resize_hpt_boot_vcpu(void *opaque)
+{
+ /* Nothing to do, just force a KVM exit */
+}
+
+long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
+ struct kvm_ppc_resize_hpt *rhpt)
+{
+ unsigned long flags = rhpt->flags;
+ unsigned long shift = rhpt->shift;
+ struct kvm_resize_hpt *resize;
+ long ret;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ if (shift && ((shift < 18) || (shift > 46)))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+
+ resize = kvm->arch.resize_hpt;
+
+ /* This shouldn't be possible */
+ ret = -EIO;
+ if (WARN_ON(!kvm->arch.hpte_setup_done))
+ goto out_no_hpt;
+
+ /* Stop VCPUs from running while we mess with the HPT */
+ kvm->arch.hpte_setup_done = 0;
+ smp_mb();
+
+ /* Boot all CPUs out of the guest so they re-read
+ * hpte_setup_done */
+ on_each_cpu(resize_hpt_boot_vcpu, NULL, 1);
+
+ ret = -ENXIO;
+ if (!resize || (resize->order != shift))
+ goto out;
+
+ ret = -EBUSY;
+ if (!resize->prepare_done)
+ goto out;
+
+ ret = resize->error;
+ if (ret != 0)
+ goto out;
+
+ ret = resize_hpt_rehash(resize);
+ if (ret != 0)
+ goto out;
+
+ resize_hpt_pivot(resize);
+
+out:
+ /* Let VCPUs run again */
+ kvm->arch.hpte_setup_done = 1;
+ smp_mb();
+out_no_hpt:
+ resize_hpt_release(kvm, resize);
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+/*
* Functions for reading and writing the hash table via reads and
* writes on a file descriptor.
*
@@ -1311,8 +1705,8 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
flags = ctx->flags;
i = ctx->index;
- hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
- revp = kvm->arch.revmap + i;
+ hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
+ revp = kvm->arch.hpt.rev + i;
lbuf = (unsigned long __user *)buf;
nb = 0;
@@ -1327,7 +1721,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
/* Skip uninteresting entries, i.e. clean on not-first pass */
if (!first_pass) {
- while (i < kvm->arch.hpt_npte &&
+ while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
!hpte_dirty(revp, hptp)) {
++i;
hptp += 2;
@@ -1337,7 +1731,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
hdr.index = i;
/* Grab a series of valid entries */
- while (i < kvm->arch.hpt_npte &&
+ while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
hdr.n_valid < 0xffff &&
nb + HPTE_SIZE < count &&
record_hpte(flags, hptp, hpte, revp, 1, first_pass)) {
@@ -1353,7 +1747,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
++revp;
}
/* Now skip invalid entries while we can */
- while (i < kvm->arch.hpt_npte &&
+ while (i < kvmppc_hpt_npte(&kvm->arch.hpt) &&
hdr.n_invalid < 0xffff &&
record_hpte(flags, hptp, hpte, revp, 0, first_pass)) {
/* found an invalid entry */
@@ -1374,7 +1768,7 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
}
/* Check if we've wrapped around the hash table */
- if (i >= kvm->arch.hpt_npte) {
+ if (i >= kvmppc_hpt_npte(&kvm->arch.hpt)) {
i = 0;
ctx->first_pass = 0;
break;
@@ -1433,11 +1827,11 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
err = -EINVAL;
i = hdr.index;
- if (i >= kvm->arch.hpt_npte ||
- i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte)
+ if (i >= kvmppc_hpt_npte(&kvm->arch.hpt) ||
+ i + hdr.n_valid + hdr.n_invalid > kvmppc_hpt_npte(&kvm->arch.hpt))
break;
- hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+ hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
lbuf = (unsigned long __user *)buf;
for (j = 0; j < hdr.n_valid; ++j) {
__be64 hpte_v;
@@ -1624,8 +2018,9 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
kvm = p->kvm;
i = p->hpt_index;
- hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
- for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) {
+ hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
+ for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt);
+ ++i, hptp += 2) {
if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
continue;
@@ -1635,7 +2030,7 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
cpu_relax();
v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
hr = be64_to_cpu(hptp[1]);
- gr = kvm->arch.revmap[i].guest_rpte;
+ gr = kvm->arch.hpt.rev[i].guest_rpte;
unlock_hpte(hptp, v);
preempt_enable();
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index c379ff5a4438..491c5d8120f7 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -171,6 +171,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
goto fail;
}
+ ret = -ENOMEM;
stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
GFP_KERNEL);
if (!stt)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e4a79679342e..1e107ece4e37 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -182,7 +182,8 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
++vcpu->stat.halt_wakeup;
}
- if (kvmppc_ipi_thread(vcpu->arch.thread_cpu))
+ cpu = READ_ONCE(vcpu->arch.thread_cpu);
+ if (cpu >= 0 && kvmppc_ipi_thread(cpu))
return;
/* CPU points to the first thread of the core */
@@ -773,12 +774,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
}
tvcpu->arch.prodded = 1;
smp_mb();
- if (vcpu->arch.ceded) {
- if (swait_active(&vcpu->wq)) {
- swake_up(&vcpu->wq);
- vcpu->stat.halt_wakeup++;
- }
- }
+ if (tvcpu->arch.ceded)
+ kvmppc_fast_vcpu_kick_hv(tvcpu);
break;
case H_CONFER:
target = kvmppc_get_gpr(vcpu, 4);
@@ -2665,7 +2662,8 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
int i;
for_each_runnable_thread(i, vcpu, vc) {
- if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded)
+ if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded ||
+ vcpu->arch.prodded)
return 1;
}
@@ -2851,7 +2849,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
break;
n_ceded = 0;
for_each_runnable_thread(i, v, vc) {
- if (!v->arch.pending_exceptions)
+ if (!v->arch.pending_exceptions && !v->arch.prodded)
n_ceded += v->arch.ceded;
else
v->arch.ceded = 0;
@@ -3199,12 +3197,23 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
goto out; /* another vcpu beat us to it */
/* Allocate hashed page table (if not done already) and reset it */
- if (!kvm->arch.hpt_virt) {
- err = kvmppc_alloc_hpt(kvm, NULL);
- if (err) {
+ if (!kvm->arch.hpt.virt) {
+ int order = KVM_DEFAULT_HPT_ORDER;
+ struct kvm_hpt_info info;
+
+ err = kvmppc_allocate_hpt(&info, order);
+ /* If we get here, it means userspace didn't specify a
+ * size explicitly. So, try successively smaller
+ * sizes if the default failed. */
+ while ((err == -ENOMEM) && --order >= PPC_MIN_HPT_ORDER)
+ err = kvmppc_allocate_hpt(&info, order);
+
+ if (err < 0) {
pr_err("KVM: Couldn't alloc HPT\n");
goto out;
}
+
+ kvmppc_set_hpt(kvm, &info);
}
/* Look up the memslot for guest physical address 0 */
@@ -3413,6 +3422,9 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
kvm->arch.lpcr = lpcr;
+ /* Initialization for future HPT resizes */
+ kvm->arch.resize_hpt = NULL;
+
/*
* Work out how many sets the TLB has, for the use of
* the TLB invalidation loop in book3s_hv_rmhandlers.S.
@@ -3469,7 +3481,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
if (kvm_is_radix(kvm))
kvmppc_free_radix(kvm);
else
- kvmppc_free_hpt(kvm);
+ kvmppc_free_hpt(&kvm->arch.hpt);
kvmppc_free_pimap(kvm);
}
@@ -3695,12 +3707,9 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
r = -EFAULT;
if (get_user(htab_order, (u32 __user *)argp))
break;
- r = kvmppc_alloc_reset_hpt(kvm, &htab_order);
+ r = kvmppc_alloc_reset_hpt(kvm, htab_order);
if (r)
break;
- r = -EFAULT;
- if (put_user(htab_order, (u32 __user *)argp))
- break;
r = 0;
break;
}
@@ -3715,6 +3724,28 @@ static long kvm_arch_vm_ioctl_hv(struct file *filp,
break;
}
+ case KVM_PPC_RESIZE_HPT_PREPARE: {
+ struct kvm_ppc_resize_hpt rhpt;
+
+ r = -EFAULT;
+ if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
+ break;
+
+ r = kvm_vm_ioctl_resize_hpt_prepare(kvm, &rhpt);
+ break;
+ }
+
+ case KVM_PPC_RESIZE_HPT_COMMIT: {
+ struct kvm_ppc_resize_hpt rhpt;
+
+ r = -EFAULT;
+ if (copy_from_user(&rhpt, argp, sizeof(rhpt)))
+ break;
+
+ r = kvm_vm_ioctl_resize_hpt_commit(kvm, &rhpt);
+ break;
+ }
+
default:
r = -ENOTTY;
}
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 2f69fbc19bb0..c42a7e63b39e 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -52,19 +52,19 @@ static int __init early_parse_kvm_cma_resv(char *p)
}
early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv);
-struct page *kvm_alloc_hpt(unsigned long nr_pages)
+struct page *kvm_alloc_hpt_cma(unsigned long nr_pages)
{
VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT);
return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES));
}
-EXPORT_SYMBOL_GPL(kvm_alloc_hpt);
+EXPORT_SYMBOL_GPL(kvm_alloc_hpt_cma);
-void kvm_release_hpt(struct page *page, unsigned long nr_pages)
+void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages)
{
cma_release(kvm_cma, page, nr_pages);
}
-EXPORT_SYMBOL_GPL(kvm_release_hpt);
+EXPORT_SYMBOL_GPL(kvm_free_hpt_cma);
/**
* kvm_cma_reserve() - reserve area for kvm hash pagetable
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index b095afcd4309..6fca970373ee 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -86,10 +86,10 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
if (*rmap & KVMPPC_RMAP_PRESENT) {
i = *rmap & KVMPPC_RMAP_INDEX;
- head = &kvm->arch.revmap[i];
+ head = &kvm->arch.hpt.rev[i];
if (realmode)
head = real_vmalloc_addr(head);
- tail = &kvm->arch.revmap[head->back];
+ tail = &kvm->arch.hpt.rev[head->back];
if (realmode)
tail = real_vmalloc_addr(tail);
rev->forw = i;
@@ -154,8 +154,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
lock_rmap(rmap);
head = *rmap & KVMPPC_RMAP_INDEX;
- next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]);
- prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]);
+ next = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->forw]);
+ prev = real_vmalloc_addr(&kvm->arch.hpt.rev[rev->back]);
next->back = rev->back;
prev->forw = rev->forw;
if (head == pte_index) {
@@ -292,11 +292,11 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
/* Find and lock the HPTEG slot to use */
do_insert:
- if (pte_index >= kvm->arch.hpt_npte)
+ if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
return H_PARAMETER;
if (likely((flags & H_EXACT) == 0)) {
pte_index &= ~7UL;
- hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
for (i = 0; i < 8; ++i) {
if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0 &&
try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
@@ -327,7 +327,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
}
pte_index += i;
} else {
- hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID |
HPTE_V_ABSENT)) {
/* Lock the slot and check again */
@@ -344,7 +344,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
}
/* Save away the guest's idea of the second HPTE dword */
- rev = &kvm->arch.revmap[pte_index];
+ rev = &kvm->arch.hpt.rev[pte_index];
if (realmode)
rev = real_vmalloc_addr(rev);
if (rev) {
@@ -469,9 +469,9 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
if (kvm_is_radix(kvm))
return H_FUNCTION;
- if (pte_index >= kvm->arch.hpt_npte)
+ if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
return H_PARAMETER;
- hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
cpu_relax();
pte = orig_pte = be64_to_cpu(hpte[0]);
@@ -487,7 +487,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
return H_NOT_FOUND;
}
- rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+ rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
v = pte & ~HPTE_V_HVLOCK;
if (v & HPTE_V_VALID) {
hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
@@ -557,13 +557,13 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
break;
}
if (req != 1 || flags == 3 ||
- pte_index >= kvm->arch.hpt_npte) {
+ pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt)) {
/* parameter error */
args[j] = ((0xa0 | flags) << 56) + pte_index;
ret = H_PARAMETER;
break;
}
- hp = (__be64 *) (kvm->arch.hpt_virt + (pte_index << 4));
+ hp = (__be64 *) (kvm->arch.hpt.virt + (pte_index << 4));
/* to avoid deadlock, don't spin except for first */
if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) {
if (n)
@@ -600,7 +600,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
}
args[j] = ((0x80 | flags) << 56) + pte_index;
- rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+ rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
note_hpte_modification(kvm, rev);
if (!(hp0 & HPTE_V_VALID)) {
@@ -657,10 +657,10 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
if (kvm_is_radix(kvm))
return H_FUNCTION;
- if (pte_index >= kvm->arch.hpt_npte)
+ if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
return H_PARAMETER;
- hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
cpu_relax();
v = pte_v = be64_to_cpu(hpte[0]);
@@ -680,7 +680,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
/* Update guest view of 2nd HPTE dword */
mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N |
HPTE_R_KEY_HI | HPTE_R_KEY_LO;
- rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+ rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
if (rev) {
r = (rev->guest_rpte & ~mask) | bits;
rev->guest_rpte = r;
@@ -728,15 +728,15 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
if (kvm_is_radix(kvm))
return H_FUNCTION;
- if (pte_index >= kvm->arch.hpt_npte)
+ if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
return H_PARAMETER;
if (flags & H_READ_4) {
pte_index &= ~3;
n = 4;
}
- rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+ rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
for (i = 0; i < n; ++i, ++pte_index) {
- hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
r = be64_to_cpu(hpte[1]);
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
@@ -769,11 +769,11 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
if (kvm_is_radix(kvm))
return H_FUNCTION;
- if (pte_index >= kvm->arch.hpt_npte)
+ if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
return H_PARAMETER;
- rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
- hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+ rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
cpu_relax();
v = be64_to_cpu(hpte[0]);
@@ -817,11 +817,11 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
if (kvm_is_radix(kvm))
return H_FUNCTION;
- if (pte_index >= kvm->arch.hpt_npte)
+ if (pte_index >= kvmppc_hpt_npte(&kvm->arch.hpt))
return H_PARAMETER;
- rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
- hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+ rev = real_vmalloc_addr(&kvm->arch.hpt.rev[pte_index]);
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (pte_index << 4));
while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
cpu_relax();
v = be64_to_cpu(hpte[0]);
@@ -970,7 +970,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
somask = (1UL << 28) - 1;
vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT;
}
- hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask;
+ hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvmppc_hpt_mask(&kvm->arch.hpt);
avpn = slb_v & ~(somask >> 16); /* also includes B */
avpn |= (eaddr & somask) >> 16;
@@ -981,7 +981,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
val |= avpn;
for (;;) {
- hpte = (__be64 *)(kvm->arch.hpt_virt + (hash << 7));
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (hash << 7));
for (i = 0; i < 16; i += 2) {
/* Read the PTE racily */
@@ -1017,7 +1017,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
if (val & HPTE_V_SECONDARY)
break;
val |= HPTE_V_SECONDARY;
- hash = hash ^ kvm->arch.hpt_mask;
+ hash = hash ^ kvmppc_hpt_mask(&kvm->arch.hpt);
}
return -1;
}
@@ -1066,14 +1066,14 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
return status; /* there really was no HPTE */
return 0; /* for prot fault, HPTE disappeared */
}
- hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4));
+ hpte = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
v = orig_v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK;
r = be64_to_cpu(hpte[1]);
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
v = hpte_new_to_old_v(v, r);
r = hpte_new_to_old_r(r);
}
- rev = real_vmalloc_addr(&kvm->arch.revmap[index]);
+ rev = real_vmalloc_addr(&kvm->arch.hpt.rev[index]);
gr = rev->guest_rpte;
unlock_hpte(hpte, orig_v);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 29f43ed6d5eb..e78542d99cd6 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -35,7 +35,7 @@ int kvm_irq_bypass = 1;
EXPORT_SYMBOL(kvm_irq_bypass);
static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
- u32 new_irq);
+ u32 new_irq, bool check_resend);
static int xics_opal_set_server(unsigned int hw_irq, int server_cpu);
/* -- ICS routines -- */
@@ -44,20 +44,12 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics,
{
int i;
- arch_spin_lock(&ics->lock);
-
for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
struct ics_irq_state *state = &ics->irq_state[i];
-
- if (!state->resend)
- continue;
-
- arch_spin_unlock(&ics->lock);
- icp_rm_deliver_irq(xics, icp, state->number);
- arch_spin_lock(&ics->lock);
+ if (state->resend)
+ icp_rm_deliver_irq(xics, icp, state->number, true);
}
- arch_spin_unlock(&ics->lock);
}
/* -- ICP routines -- */
@@ -288,7 +280,7 @@ static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
}
static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
- u32 new_irq)
+ u32 new_irq, bool check_resend)
{
struct ics_irq_state *state;
struct kvmppc_ics *ics;
@@ -333,6 +325,10 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
}
}
+ if (check_resend)
+ if (!state->resend)
+ goto out;
+
/* Clear the resend bit of that interrupt */
state->resend = 0;
@@ -378,7 +374,9 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
*/
if (reject && reject != XICS_IPI) {
arch_spin_unlock(&ics->lock);
+ icp->n_reject++;
new_irq = reject;
+ check_resend = 0;
goto again;
}
} else {
@@ -386,10 +384,16 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
* We failed to deliver the interrupt we need to set the
* resend map bit and mark the ICS state as needing a resend
*/
- set_bit(ics->icsid, icp->resend_map);
state->resend = 1;
/*
+ * Make sure when checking resend, we don't miss the resend
+ * if resend_map bit is seen and cleared.
+ */
+ smp_wmb();
+ set_bit(ics->icsid, icp->resend_map);
+
+ /*
* If the need_resend flag got cleared in the ICP some time
* between icp_rm_try_to_deliver() atomic update and now, then
* we know it might have missed the resend_map bit. So we
@@ -397,7 +401,9 @@ static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
*/
smp_mb();
if (!icp->state.need_resend) {
+ state->resend = 0;
arch_spin_unlock(&ics->lock);
+ check_resend = 0;
goto again;
}
}
@@ -592,7 +598,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
/* Handle reject in real mode */
if (reject && reject != XICS_IPI) {
this_icp->n_reject++;
- icp_rm_deliver_irq(xics, icp, reject);
+ icp_rm_deliver_irq(xics, icp, reject, false);
}
/* Handle resends in real mode */
@@ -660,59 +666,45 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
*/
if (reject && reject != XICS_IPI) {
icp->n_reject++;
- icp_rm_deliver_irq(xics, icp, reject);
+ icp_rm_deliver_irq(xics, icp, reject, false);
}
bail:
return check_too_hard(xics, icp);
}
-int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq)
{
struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
struct kvmppc_icp *icp = vcpu->arch.icp;
struct kvmppc_ics *ics;
struct ics_irq_state *state;
- u32 irq = xirr & 0x00ffffff;
u16 src;
-
- if (!xics || !xics->real_mode)
- return H_TOO_HARD;
+ u32 pq_old, pq_new;
/*
- * ICP State: EOI
+ * ICS EOI handling: For LSI, if P bit is still set, we need to
+ * resend it.
*
- * Note: If EOI is incorrectly used by SW to lower the CPPR
- * value (ie more favored), we do not check for rejection of
- * a pending interrupt, this is a SW error and PAPR sepcifies
- * that we don't have to deal with it.
- *
- * The sending of an EOI to the ICS is handled after the
- * CPPR update
- *
- * ICP State: Down_CPPR which we handle
- * in a separate function as it's shared with H_CPPR.
+ * For MSI, we move Q bit into P (and clear Q). If it is set,
+ * resend it.
*/
- icp_rm_down_cppr(xics, icp, xirr >> 24);
- /* IPIs have no EOI */
- if (irq == XICS_IPI)
- goto bail;
- /*
- * EOI handling: If the interrupt is still asserted, we need to
- * resend it. We can take a lockless "peek" at the ICS state here.
- *
- * "Message" interrupts will never have "asserted" set
- */
ics = kvmppc_xics_find_ics(xics, irq, &src);
if (!ics)
goto bail;
+
state = &ics->irq_state[src];
- /* Still asserted, resend it */
- if (state->asserted) {
- icp->n_reject++;
- icp_rm_deliver_irq(xics, icp, irq);
- }
+ if (state->lsi)
+ pq_new = state->pq_state;
+ else
+ do {
+ pq_old = state->pq_state;
+ pq_new = pq_old >> 1;
+ } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+ if (pq_new & PQ_PRESENTED)
+ icp_rm_deliver_irq(xics, NULL, irq, false);
if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) {
icp->rm_action |= XICS_RM_NOTIFY_EOI;
@@ -733,10 +725,43 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
state->intr_cpu = -1;
}
}
+
bail:
return check_too_hard(xics, icp);
}
+int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+ struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+ struct kvmppc_icp *icp = vcpu->arch.icp;
+ u32 irq = xirr & 0x00ffffff;
+
+ if (!xics || !xics->real_mode)
+ return H_TOO_HARD;
+
+ /*
+ * ICP State: EOI
+ *
+ * Note: If EOI is incorrectly used by SW to lower the CPPR
+ * value (ie more favored), we do not check for rejection of
+ * a pending interrupt, this is a SW error and PAPR specifies
+ * that we don't have to deal with it.
+ *
+ * The sending of an EOI to the ICS is handled after the
+ * CPPR update
+ *
+ * ICP State: Down_CPPR which we handle
+ * in a separate function as it's shared with H_CPPR.
+ */
+ icp_rm_down_cppr(xics, icp, xirr >> 24);
+
+ /* IPIs have no EOI */
+ if (irq == XICS_IPI)
+ return check_too_hard(xics, icp);
+
+ return ics_rm_eoi(vcpu, irq);
+}
+
unsigned long eoi_rc;
static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
@@ -823,14 +848,33 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
{
struct kvmppc_xics *xics;
struct kvmppc_icp *icp;
+ struct kvmppc_ics *ics;
+ struct ics_irq_state *state;
u32 irq;
+ u16 src;
+ u32 pq_old, pq_new;
irq = irq_map->v_hwirq;
xics = vcpu->kvm->arch.xics;
icp = vcpu->arch.icp;
kvmppc_rm_handle_irq_desc(irq_map->desc);
- icp_rm_deliver_irq(xics, icp, irq);
+
+ ics = kvmppc_xics_find_ics(xics, irq, &src);
+ if (!ics)
+ return 2;
+
+ state = &ics->irq_state[src];
+
+ /* only MSIs register bypass producers, so it must be MSI here */
+ do {
+ pq_old = state->pq_state;
+ pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED;
+ } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+ /* Test P=1, Q=0, this is the only case where we present */
+ if (pq_new == PQ_PRESENTED)
+ icp_rm_deliver_irq(xics, icp, irq, false);
/* EOI the interrupt */
icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr,
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 1482961ceb4d..d4dfc0ca2a44 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -902,6 +902,69 @@ static void kvmppc_clear_debug(struct kvm_vcpu *vcpu)
}
}
+static int kvmppc_exit_pr_progint(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ unsigned int exit_nr)
+{
+ enum emulation_result er;
+ ulong flags;
+ u32 last_inst;
+ int emul, r;
+
+ /*
+ * shadow_srr1 only contains valid flags if we came here via a program
+ * exception. The other exceptions (emulation assist, FP unavailable,
+ * etc.) do not provide flags in SRR1, so use an illegal-instruction
+ * exception when injecting a program interrupt into the guest.
+ */
+ if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
+ flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+ else
+ flags = SRR1_PROGILL;
+
+ emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
+ if (emul != EMULATE_DONE)
+ return RESUME_GUEST;
+
+ if (kvmppc_get_msr(vcpu) & MSR_PR) {
+#ifdef EXIT_DEBUG
+ pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n",
+ kvmppc_get_pc(vcpu), last_inst);
+#endif
+ if ((last_inst & 0xff0007ff) != (INS_DCBZ & 0xfffffff7)) {
+ kvmppc_core_queue_program(vcpu, flags);
+ return RESUME_GUEST;
+ }
+ }
+
+ vcpu->stat.emulated_inst_exits++;
+ er = kvmppc_emulate_instruction(run, vcpu);
+ switch (er) {
+ case EMULATE_DONE:
+ r = RESUME_GUEST_NV;
+ break;
+ case EMULATE_AGAIN:
+ r = RESUME_GUEST;
+ break;
+ case EMULATE_FAIL:
+ pr_crit("%s: emulation at %lx failed (%08x)\n",
+ __func__, kvmppc_get_pc(vcpu), last_inst);
+ kvmppc_core_queue_program(vcpu, flags);
+ r = RESUME_GUEST;
+ break;
+ case EMULATE_DO_MMIO:
+ run->exit_reason = KVM_EXIT_MMIO;
+ r = RESUME_HOST_NV;
+ break;
+ case EMULATE_EXIT_USER:
+ r = RESUME_HOST_NV;
+ break;
+ default:
+ BUG();
+ }
+
+ return r;
+}
+
int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned int exit_nr)
{
@@ -1044,71 +1107,8 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
break;
case BOOK3S_INTERRUPT_PROGRAM:
case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
- {
- enum emulation_result er;
- ulong flags;
- u32 last_inst;
- int emul;
-
-program_interrupt:
- /*
- * shadow_srr1 only contains valid flags if we came here via
- * a program exception. The other exceptions (emulation assist,
- * FP unavailable, etc.) do not provide flags in SRR1, so use
- * an illegal-instruction exception when injecting a program
- * interrupt into the guest.
- */
- if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
- flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
- else
- flags = SRR1_PROGILL;
-
- emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
- if (emul != EMULATE_DONE) {
- r = RESUME_GUEST;
- break;
- }
-
- if (kvmppc_get_msr(vcpu) & MSR_PR) {
-#ifdef EXIT_DEBUG
- pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n",
- kvmppc_get_pc(vcpu), last_inst);
-#endif
- if ((last_inst & 0xff0007ff) !=
- (INS_DCBZ & 0xfffffff7)) {
- kvmppc_core_queue_program(vcpu, flags);
- r = RESUME_GUEST;
- break;
- }
- }
-
- vcpu->stat.emulated_inst_exits++;
- er = kvmppc_emulate_instruction(run, vcpu);
- switch (er) {
- case EMULATE_DONE:
- r = RESUME_GUEST_NV;
- break;
- case EMULATE_AGAIN:
- r = RESUME_GUEST;
- break;
- case EMULATE_FAIL:
- printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
- __func__, kvmppc_get_pc(vcpu), last_inst);
- kvmppc_core_queue_program(vcpu, flags);
- r = RESUME_GUEST;
- break;
- case EMULATE_DO_MMIO:
- run->exit_reason = KVM_EXIT_MMIO;
- r = RESUME_HOST_NV;
- break;
- case EMULATE_EXIT_USER:
- r = RESUME_HOST_NV;
- break;
- default:
- BUG();
- }
+ r = kvmppc_exit_pr_progint(run, vcpu, exit_nr);
break;
- }
case BOOK3S_INTERRUPT_SYSCALL:
{
u32 last_sc;
@@ -1185,7 +1185,7 @@ program_interrupt:
emul = kvmppc_get_last_inst(vcpu, INST_GENERIC,
&last_inst);
if (emul == EMULATE_DONE)
- goto program_interrupt;
+ r = kvmppc_exit_pr_progint(run, vcpu, exit_nr);
else
r = RESUME_GUEST;
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 20dff102a06f..e48803e2918d 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -63,7 +63,7 @@
/* -- ICS routines -- */
static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
- u32 new_irq);
+ u32 new_irq, bool check_resend);
/*
* Return value ideally indicates how the interrupt was handled, but no
@@ -75,6 +75,7 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
struct ics_irq_state *state;
struct kvmppc_ics *ics;
u16 src;
+ u32 pq_old, pq_new;
XICS_DBG("ics deliver %#x (level: %d)\n", irq, level);
@@ -87,25 +88,41 @@ static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level)
if (!state->exists)
return -EINVAL;
+ if (level == KVM_INTERRUPT_SET_LEVEL || level == KVM_INTERRUPT_SET)
+ level = 1;
+ else if (level == KVM_INTERRUPT_UNSET)
+ level = 0;
/*
- * We set state->asserted locklessly. This should be fine as
- * we are the only setter, thus concurrent access is undefined
- * to begin with.
+ * Take other values the same as 1, consistent with original code.
+ * maybe WARN here?
*/
- if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL)
- state->asserted = 1;
- else if (level == 0 || level == KVM_INTERRUPT_UNSET) {
- state->asserted = 0;
+
+ if (!state->lsi && level == 0) /* noop for MSI */
return 0;
- }
+
+ do {
+ pq_old = state->pq_state;
+ if (state->lsi) {
+ if (level) {
+ if (pq_old & PQ_PRESENTED)
+ /* Setting already set LSI ... */
+ return 0;
+
+ pq_new = PQ_PRESENTED;
+ } else
+ pq_new = 0;
+ } else
+ pq_new = ((pq_old << 1) & 3) | PQ_PRESENTED;
+ } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+ /* Test P=1, Q=0, this is the only case where we present */
+ if (pq_new == PQ_PRESENTED)
+ icp_deliver_irq(xics, NULL, irq, false);
/* Record which CPU this arrived on for passed-through interrupts */
if (state->host_irq)
state->intr_cpu = raw_smp_processor_id();
- /* Attempt delivery */
- icp_deliver_irq(xics, NULL, irq);
-
return 0;
}
@@ -114,29 +131,14 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
{
int i;
- unsigned long flags;
-
- local_irq_save(flags);
- arch_spin_lock(&ics->lock);
-
for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
struct ics_irq_state *state = &ics->irq_state[i];
-
- if (!state->resend)
- continue;
-
- XICS_DBG("resend %#x prio %#x\n", state->number,
- state->priority);
-
- arch_spin_unlock(&ics->lock);
- local_irq_restore(flags);
- icp_deliver_irq(xics, icp, state->number);
- local_irq_save(flags);
- arch_spin_lock(&ics->lock);
+ if (state->resend) {
+ XICS_DBG("resend %#x prio %#x\n", state->number,
+ state->priority);
+ icp_deliver_irq(xics, icp, state->number, true);
+ }
}
-
- arch_spin_unlock(&ics->lock);
- local_irq_restore(flags);
}
static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
@@ -155,6 +157,7 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
deliver = false;
if ((state->masked_pending || state->resend) && priority != MASKED) {
state->masked_pending = 0;
+ state->resend = 0;
deliver = true;
}
@@ -189,7 +192,7 @@ int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority)
state->masked_pending, state->resend);
if (write_xive(xics, ics, state, server, priority, priority))
- icp_deliver_irq(xics, icp, irq);
+ icp_deliver_irq(xics, icp, irq, false);
return 0;
}
@@ -242,7 +245,7 @@ int kvmppc_xics_int_on(struct kvm *kvm, u32 irq)
if (write_xive(xics, ics, state, state->server, state->saved_priority,
state->saved_priority))
- icp_deliver_irq(xics, icp, irq);
+ icp_deliver_irq(xics, icp, irq, false);
return 0;
}
@@ -376,7 +379,7 @@ static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
}
static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
- u32 new_irq)
+ u32 new_irq, bool check_resend)
{
struct ics_irq_state *state;
struct kvmppc_ics *ics;
@@ -422,6 +425,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
}
}
+ if (check_resend)
+ if (!state->resend)
+ goto out;
+
/* Clear the resend bit of that interrupt */
state->resend = 0;
@@ -470,6 +477,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
arch_spin_unlock(&ics->lock);
local_irq_restore(flags);
new_irq = reject;
+ check_resend = 0;
goto again;
}
} else {
@@ -477,10 +485,16 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
* We failed to deliver the interrupt we need to set the
* resend map bit and mark the ICS state as needing a resend
*/
- set_bit(ics->icsid, icp->resend_map);
state->resend = 1;
/*
+ * Make sure when checking resend, we don't miss the resend
+ * if resend_map bit is seen and cleared.
+ */
+ smp_wmb();
+ set_bit(ics->icsid, icp->resend_map);
+
+ /*
* If the need_resend flag got cleared in the ICP some time
* between icp_try_to_deliver() atomic update and now, then
* we know it might have missed the resend_map bit. So we
@@ -488,8 +502,10 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
*/
smp_mb();
if (!icp->state.need_resend) {
+ state->resend = 0;
arch_spin_unlock(&ics->lock);
local_irq_restore(flags);
+ check_resend = 0;
goto again;
}
}
@@ -681,7 +697,7 @@ static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
/* Handle reject */
if (reject && reject != XICS_IPI)
- icp_deliver_irq(xics, icp, reject);
+ icp_deliver_irq(xics, icp, reject, false);
/* Handle resend */
if (resend)
@@ -761,17 +777,54 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
* attempt (see comments in icp_deliver_irq).
*/
if (reject && reject != XICS_IPI)
- icp_deliver_irq(xics, icp, reject);
+ icp_deliver_irq(xics, icp, reject, false);
}
-static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+static int ics_eoi(struct kvm_vcpu *vcpu, u32 irq)
{
struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
struct kvmppc_icp *icp = vcpu->arch.icp;
struct kvmppc_ics *ics;
struct ics_irq_state *state;
- u32 irq = xirr & 0x00ffffff;
u16 src;
+ u32 pq_old, pq_new;
+
+ /*
+ * ICS EOI handling: For LSI, if P bit is still set, we need to
+ * resend it.
+ *
+ * For MSI, we move Q bit into P (and clear Q). If it is set,
+ * resend it.
+ */
+
+ ics = kvmppc_xics_find_ics(xics, irq, &src);
+ if (!ics) {
+ XICS_DBG("ios_eoi: IRQ 0x%06x not found !\n", irq);
+ return H_PARAMETER;
+ }
+ state = &ics->irq_state[src];
+
+ if (state->lsi)
+ pq_new = state->pq_state;
+ else
+ do {
+ pq_old = state->pq_state;
+ pq_new = pq_old >> 1;
+ } while (cmpxchg(&state->pq_state, pq_old, pq_new) != pq_old);
+
+ if (pq_new & PQ_PRESENTED)
+ icp_deliver_irq(xics, icp, irq, false);
+
+ kvm_notify_acked_irq(vcpu->kvm, 0, irq);
+
+ return H_SUCCESS;
+}
+
+static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
+{
+ struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
+ struct kvmppc_icp *icp = vcpu->arch.icp;
+ u32 irq = xirr & 0x00ffffff;
XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr);
@@ -794,26 +847,8 @@ static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
/* IPIs have no EOI */
if (irq == XICS_IPI)
return H_SUCCESS;
- /*
- * EOI handling: If the interrupt is still asserted, we need to
- * resend it. We can take a lockless "peek" at the ICS state here.
- *
- * "Message" interrupts will never have "asserted" set
- */
- ics = kvmppc_xics_find_ics(xics, irq, &src);
- if (!ics) {
- XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq);
- return H_PARAMETER;
- }
- state = &ics->irq_state[src];
- /* Still asserted, resend it */
- if (state->asserted)
- icp_deliver_irq(xics, icp, irq);
-
- kvm_notify_acked_irq(vcpu->kvm, 0, irq);
-
- return H_SUCCESS;
+ return ics_eoi(vcpu, irq);
}
int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
@@ -832,10 +867,6 @@ int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
icp->n_rm_check_resend++;
icp_check_resend(xics, icp->rm_resend_icp);
}
- if (icp->rm_action & XICS_RM_REJECT) {
- icp->n_rm_reject++;
- icp_deliver_irq(xics, icp, icp->rm_reject);
- }
if (icp->rm_action & XICS_RM_NOTIFY_EOI) {
icp->n_rm_notify_eoi++;
kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq);
@@ -920,7 +951,7 @@ static int xics_debug_show(struct seq_file *m, void *private)
int icsid, i;
unsigned long flags;
unsigned long t_rm_kick_vcpu, t_rm_check_resend;
- unsigned long t_rm_reject, t_rm_notify_eoi;
+ unsigned long t_rm_notify_eoi;
unsigned long t_reject, t_check_resend;
if (!kvm)
@@ -929,7 +960,6 @@ static int xics_debug_show(struct seq_file *m, void *private)
t_rm_kick_vcpu = 0;
t_rm_notify_eoi = 0;
t_rm_check_resend = 0;
- t_rm_reject = 0;
t_check_resend = 0;
t_reject = 0;
@@ -952,14 +982,13 @@ static int xics_debug_show(struct seq_file *m, void *private)
t_rm_kick_vcpu += icp->n_rm_kick_vcpu;
t_rm_notify_eoi += icp->n_rm_notify_eoi;
t_rm_check_resend += icp->n_rm_check_resend;
- t_rm_reject += icp->n_rm_reject;
t_check_resend += icp->n_check_resend;
t_reject += icp->n_reject;
}
- seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n",
+ seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu notify_eoi=%lu\n",
t_rm_kick_vcpu, t_rm_check_resend,
- t_rm_reject, t_rm_notify_eoi);
+ t_rm_notify_eoi);
seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n",
t_check_resend, t_reject);
for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
@@ -977,9 +1006,9 @@ static int xics_debug_show(struct seq_file *m, void *private)
for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
struct ics_irq_state *irq = &ics->irq_state[i];
- seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n",
+ seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x pq_state %d resend %d masked pending %d\n",
irq->number, irq->server, irq->priority,
- irq->saved_priority, irq->asserted,
+ irq->saved_priority, irq->pq_state,
irq->resend, irq->masked_pending);
}
@@ -1198,10 +1227,17 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
val |= prio << KVM_XICS_PRIORITY_SHIFT;
if (irqp->lsi) {
val |= KVM_XICS_LEVEL_SENSITIVE;
- if (irqp->asserted)
+ if (irqp->pq_state & PQ_PRESENTED)
val |= KVM_XICS_PENDING;
} else if (irqp->masked_pending || irqp->resend)
val |= KVM_XICS_PENDING;
+
+ if (irqp->pq_state & PQ_PRESENTED)
+ val |= KVM_XICS_PRESENTED;
+
+ if (irqp->pq_state & PQ_QUEUED)
+ val |= KVM_XICS_QUEUED;
+
ret = 0;
}
arch_spin_unlock(&ics->lock);
@@ -1253,18 +1289,20 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
irqp->resend = 0;
irqp->masked_pending = 0;
irqp->lsi = 0;
- irqp->asserted = 0;
- if (val & KVM_XICS_LEVEL_SENSITIVE) {
+ irqp->pq_state = 0;
+ if (val & KVM_XICS_LEVEL_SENSITIVE)
irqp->lsi = 1;
- if (val & KVM_XICS_PENDING)
- irqp->asserted = 1;
- }
+ /* If PENDING, set P in case P is not saved because of old code */
+ if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING)
+ irqp->pq_state |= PQ_PRESENTED;
+ if (val & KVM_XICS_QUEUED)
+ irqp->pq_state |= PQ_QUEUED;
irqp->exists = 1;
arch_spin_unlock(&ics->lock);
local_irq_restore(flags);
if (val & KVM_XICS_PENDING)
- icp_deliver_irq(xics, NULL, irqp->number);
+ icp_deliver_irq(xics, NULL, irqp->number, false);
return 0;
}
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
index 2a50320b55ca..ec5474cf70c6 100644
--- a/arch/powerpc/kvm/book3s_xics.h
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -31,16 +31,19 @@
/* Priority value to use for disabling an interrupt */
#define MASKED 0xff
+#define PQ_PRESENTED 1
+#define PQ_QUEUED 2
+
/* State for one irq source */
struct ics_irq_state {
u32 number;
u32 server;
+ u32 pq_state;
u8 priority;
u8 saved_priority;
u8 resend;
u8 masked_pending;
u8 lsi; /* level-sensitive interrupt */
- u8 asserted; /* Only for LSI */
u8 exists;
int intr_cpu;
u32 host_irq;
@@ -73,7 +76,6 @@ struct kvmppc_icp {
*/
#define XICS_RM_KICK_VCPU 0x1
#define XICS_RM_CHECK_RESEND 0x2
-#define XICS_RM_REJECT 0x4
#define XICS_RM_NOTIFY_EOI 0x8
u32 rm_action;
struct kvm_vcpu *rm_kick_target;
@@ -84,7 +86,6 @@ struct kvmppc_icp {
/* Counters for each reason we exited real mode */
unsigned long n_rm_kick_vcpu;
unsigned long n_rm_check_resend;
- unsigned long n_rm_reject;
unsigned long n_rm_notify_eoi;
/* Counters for handling ICP processing in real mode */
unsigned long n_check_resend;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 40a5b2d75ed1..2b38d824e9e5 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -511,6 +511,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ONE_REG:
case KVM_CAP_IOEVENTFD:
case KVM_CAP_DEVICE_CTRL:
+ case KVM_CAP_IMMEDIATE_EXIT:
r = 1;
break;
case KVM_CAP_PPC_PAIRED_SINGLES:
@@ -612,6 +613,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SPAPR_MULTITCE:
r = 1;
break;
+ case KVM_CAP_SPAPR_RESIZE_HPT:
+ /* Disable this on POWER9 until code handles new HPTE format */
+ r = !!hv_enabled && !cpu_has_feature(CPU_FTR_ARCH_300);
+ break;
#endif
case KVM_CAP_PPC_HTM:
r = cpu_has_feature(CPU_FTR_TM_COMP) &&
@@ -1114,7 +1119,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
#endif
}
- r = kvmppc_vcpu_run(run, vcpu);
+ if (run->immediate_exit)
+ r = -EINTR;
+ else
+ r = kvmppc_vcpu_run(run, vcpu);
if (vcpu->sigset_active)
sigprocmask(SIG_SETMASK, &sigsaved, NULL);
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 4aa8a7e2a1da..4492c9363178 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -373,7 +373,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu)
ipte_unlock_simple(vcpu);
}
-static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar,
+static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, u8 ar,
enum gacc_mode mode)
{
union alet alet;
@@ -465,7 +465,9 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar,
struct trans_exc_code_bits {
unsigned long addr : 52; /* Translation-exception Address */
unsigned long fsi : 2; /* Access Exception Fetch/Store Indication */
- unsigned long : 6;
+ unsigned long : 2;
+ unsigned long b56 : 1;
+ unsigned long : 3;
unsigned long b60 : 1;
unsigned long b61 : 1;
unsigned long as : 2; /* ASCE Identifier */
@@ -485,7 +487,7 @@ enum prot_type {
};
static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
- ar_t ar, enum gacc_mode mode, enum prot_type prot)
+ u8 ar, enum gacc_mode mode, enum prot_type prot)
{
struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
struct trans_exc_code_bits *tec;
@@ -497,14 +499,18 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
switch (code) {
case PGM_PROTECTION:
switch (prot) {
+ case PROT_TYPE_LA:
+ tec->b56 = 1;
+ break;
+ case PROT_TYPE_KEYC:
+ tec->b60 = 1;
+ break;
case PROT_TYPE_ALC:
tec->b60 = 1;
/* FALL THROUGH */
case PROT_TYPE_DAT:
tec->b61 = 1;
break;
- default: /* LA and KEYC set b61 to 0, other params undefined */
- return code;
}
/* FALL THROUGH */
case PGM_ASCE_TYPE:
@@ -539,7 +545,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
}
static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
- unsigned long ga, ar_t ar, enum gacc_mode mode)
+ unsigned long ga, u8 ar, enum gacc_mode mode)
{
int rc;
struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
@@ -771,7 +777,7 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
return 1;
}
-static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar,
+static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar,
unsigned long *pages, unsigned long nr_pages,
const union asce asce, enum gacc_mode mode)
{
@@ -803,7 +809,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar,
return 0;
}
-int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
+int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
unsigned long len, enum gacc_mode mode)
{
psw_t *psw = &vcpu->arch.sie_block->gpsw;
@@ -877,7 +883,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
* Note: The IPTE lock is not taken during this function, so the caller
* has to take care of this.
*/
-int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
+int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
unsigned long *gpa, enum gacc_mode mode)
{
psw_t *psw = &vcpu->arch.sie_block->gpsw;
@@ -910,7 +916,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
/**
* check_gva_range - test a range of guest virtual addresses for accessibility
*/
-int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
+int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
unsigned long length, enum gacc_mode mode)
{
unsigned long gpa;
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index 8756569ad938..7ce47fd36f28 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -162,11 +162,11 @@ enum gacc_mode {
};
int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
- ar_t ar, unsigned long *gpa, enum gacc_mode mode);
-int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
+ u8 ar, unsigned long *gpa, enum gacc_mode mode);
+int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, u8 ar,
unsigned long length, enum gacc_mode mode);
-int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
+int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
unsigned long len, enum gacc_mode mode);
int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
@@ -218,7 +218,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
* if data has been changed in guest space in case of an exception.
*/
static inline __must_check
-int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
+int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
unsigned long len)
{
return access_guest(vcpu, ga, ar, data, len, GACC_STORE);
@@ -238,7 +238,7 @@ int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
* data will be copied from guest space to kernel space.
*/
static inline __must_check
-int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
+int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, u8 ar, void *data,
unsigned long len)
{
return access_guest(vcpu, ga, ar, data, len, GACC_FETCH);
@@ -247,10 +247,11 @@ int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
/**
* read_guest_instr - copy instruction data from guest space to kernel space
* @vcpu: virtual cpu
+ * @ga: guest address
* @data: destination address in kernel space
* @len: number of bytes to copy
*
- * Copy @len bytes from the current psw address (guest space) to @data (kernel
+ * Copy @len bytes from the given address (guest space) to @data (kernel
* space).
*
* The behaviour of read_guest_instr is identical to read_guest, except that
@@ -258,10 +259,10 @@ int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
* address-space mode.
*/
static inline __must_check
-int read_guest_instr(struct kvm_vcpu *vcpu, void *data, unsigned long len)
+int read_guest_instr(struct kvm_vcpu *vcpu, unsigned long ga, void *data,
+ unsigned long len)
{
- return access_guest(vcpu, vcpu->arch.sie_block->gpsw.addr, 0, data, len,
- GACC_IFETCH);
+ return access_guest(vcpu, ga, 0, data, len, GACC_IFETCH);
}
/**
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index d7c6a7f53ced..23d9a4e12da1 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -388,14 +388,13 @@ void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu)
#define per_write_wp_event(code) \
(code & (PER_CODE_STORE | PER_CODE_STORE_REAL))
-static int debug_exit_required(struct kvm_vcpu *vcpu)
+static int debug_exit_required(struct kvm_vcpu *vcpu, u8 perc,
+ unsigned long peraddr)
{
- u8 perc = vcpu->arch.sie_block->perc;
struct kvm_debug_exit_arch *debug_exit = &vcpu->run->debug.arch;
struct kvm_hw_wp_info_arch *wp_info = NULL;
struct kvm_hw_bp_info_arch *bp_info = NULL;
unsigned long addr = vcpu->arch.sie_block->gpsw.addr;
- unsigned long peraddr = vcpu->arch.sie_block->peraddr;
if (guestdbg_hw_bp_enabled(vcpu)) {
if (per_write_wp_event(perc) &&
@@ -437,36 +436,118 @@ exit_required:
return 1;
}
+static int per_fetched_addr(struct kvm_vcpu *vcpu, unsigned long *addr)
+{
+ u8 exec_ilen = 0;
+ u16 opcode[3];
+ int rc;
+
+ if (vcpu->arch.sie_block->icptcode == ICPT_PROGI) {
+ /* PER address references the fetched or the execute instr */
+ *addr = vcpu->arch.sie_block->peraddr;
+ /*
+ * Manually detect if we have an EXECUTE instruction. As
+ * instructions are always 2 byte aligned we can read the
+ * first two bytes unconditionally
+ */
+ rc = read_guest_instr(vcpu, *addr, &opcode, 2);
+ if (rc)
+ return rc;
+ if (opcode[0] >> 8 == 0x44)
+ exec_ilen = 4;
+ if ((opcode[0] & 0xff0f) == 0xc600)
+ exec_ilen = 6;
+ } else {
+ /* instr was suppressed, calculate the responsible instr */
+ *addr = __rewind_psw(vcpu->arch.sie_block->gpsw,
+ kvm_s390_get_ilen(vcpu));
+ if (vcpu->arch.sie_block->icptstatus & 0x01) {
+ exec_ilen = (vcpu->arch.sie_block->icptstatus & 0x60) >> 4;
+ if (!exec_ilen)
+ exec_ilen = 4;
+ }
+ }
+
+ if (exec_ilen) {
+ /* read the complete EXECUTE instr to detect the fetched addr */
+ rc = read_guest_instr(vcpu, *addr, &opcode, exec_ilen);
+ if (rc)
+ return rc;
+ if (exec_ilen == 6) {
+ /* EXECUTE RELATIVE LONG - RIL-b format */
+ s32 rl = *((s32 *) (opcode + 1));
+
+ /* rl is a _signed_ 32 bit value specifying halfwords */
+ *addr += (u64)(s64) rl * 2;
+ } else {
+ /* EXECUTE - RX-a format */
+ u32 base = (opcode[1] & 0xf000) >> 12;
+ u32 disp = opcode[1] & 0x0fff;
+ u32 index = opcode[0] & 0x000f;
+
+ *addr = base ? vcpu->run->s.regs.gprs[base] : 0;
+ *addr += index ? vcpu->run->s.regs.gprs[index] : 0;
+ *addr += disp;
+ }
+ *addr = kvm_s390_logical_to_effective(vcpu, *addr);
+ }
+ return 0;
+}
+
#define guest_per_enabled(vcpu) \
(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER)
int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
{
+ const u64 cr10 = vcpu->arch.sie_block->gcr[10];
+ const u64 cr11 = vcpu->arch.sie_block->gcr[11];
const u8 ilen = kvm_s390_get_ilen(vcpu);
struct kvm_s390_pgm_info pgm_info = {
.code = PGM_PER,
.per_code = PER_CODE_IFETCH,
.per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen),
};
+ unsigned long fetched_addr;
+ int rc;
/*
* The PSW points to the next instruction, therefore the intercepted
* instruction generated a PER i-fetch event. PER address therefore
* points at the previous PSW address (could be an EXECUTE function).
*/
- return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
+ if (!guestdbg_enabled(vcpu))
+ return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
+
+ if (debug_exit_required(vcpu, pgm_info.per_code, pgm_info.per_address))
+ vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING;
+
+ if (!guest_per_enabled(vcpu) ||
+ !(vcpu->arch.sie_block->gcr[9] & PER_EVENT_IFETCH))
+ return 0;
+
+ rc = per_fetched_addr(vcpu, &fetched_addr);
+ if (rc < 0)
+ return rc;
+ if (rc)
+ /* instruction-fetching exceptions */
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ if (in_addr_range(fetched_addr, cr10, cr11))
+ return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
+ return 0;
}
-static void filter_guest_per_event(struct kvm_vcpu *vcpu)
+static int filter_guest_per_event(struct kvm_vcpu *vcpu)
{
const u8 perc = vcpu->arch.sie_block->perc;
- u64 peraddr = vcpu->arch.sie_block->peraddr;
u64 addr = vcpu->arch.sie_block->gpsw.addr;
u64 cr9 = vcpu->arch.sie_block->gcr[9];
u64 cr10 = vcpu->arch.sie_block->gcr[10];
u64 cr11 = vcpu->arch.sie_block->gcr[11];
/* filter all events, demanded by the guest */
u8 guest_perc = perc & (cr9 >> 24) & PER_CODE_MASK;
+ unsigned long fetched_addr;
+ int rc;
if (!guest_per_enabled(vcpu))
guest_perc = 0;
@@ -478,9 +559,17 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu)
guest_perc &= ~PER_CODE_BRANCH;
/* filter "instruction-fetching" events */
- if (guest_perc & PER_CODE_IFETCH &&
- !in_addr_range(peraddr, cr10, cr11))
- guest_perc &= ~PER_CODE_IFETCH;
+ if (guest_perc & PER_CODE_IFETCH) {
+ rc = per_fetched_addr(vcpu, &fetched_addr);
+ if (rc < 0)
+ return rc;
+ /*
+ * Don't inject an irq on exceptions. This would make handling
+ * on icpt code 8 very complex (as PSW was already rewound).
+ */
+ if (rc || !in_addr_range(fetched_addr, cr10, cr11))
+ guest_perc &= ~PER_CODE_IFETCH;
+ }
/* All other PER events will be given to the guest */
/* TODO: Check altered address/address space */
@@ -489,6 +578,7 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu)
if (!guest_perc)
vcpu->arch.sie_block->iprcc &= ~PGM_PER;
+ return 0;
}
#define pssec(vcpu) (vcpu->arch.sie_block->gcr[1] & _ASCE_SPACE_SWITCH)
@@ -496,14 +586,17 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu)
#define old_ssec(vcpu) ((vcpu->arch.sie_block->tecmc >> 31) & 0x1)
#define old_as_is_home(vcpu) !(vcpu->arch.sie_block->tecmc & 0xffff)
-void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu)
+int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu)
{
- int new_as;
+ int rc, new_as;
- if (debug_exit_required(vcpu))
+ if (debug_exit_required(vcpu, vcpu->arch.sie_block->perc,
+ vcpu->arch.sie_block->peraddr))
vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING;
- filter_guest_per_event(vcpu);
+ rc = filter_guest_per_event(vcpu);
+ if (rc)
+ return rc;
/*
* Only RP, SAC, SACF, PT, PTI, PR, PC instructions can trigger
@@ -532,4 +625,5 @@ void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu)
(pssec(vcpu) || old_ssec(vcpu)))
vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH;
}
+ return 0;
}
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 7a27eebab28a..59920f96ebc0 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -238,7 +238,9 @@ static int handle_prog(struct kvm_vcpu *vcpu)
vcpu->stat.exit_program_interruption++;
if (guestdbg_enabled(vcpu) && per_event(vcpu)) {
- kvm_s390_handle_per_event(vcpu);
+ rc = kvm_s390_handle_per_event(vcpu);
+ if (rc)
+ return rc;
/* the interrupt might have been filtered out completely */
if (vcpu->arch.sie_block->iprcc == 0)
return 0;
@@ -359,6 +361,9 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
static int handle_operexc(struct kvm_vcpu *vcpu)
{
+ psw_t oldpsw, newpsw;
+ int rc;
+
vcpu->stat.exit_operation_exception++;
trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa,
vcpu->arch.sie_block->ipb);
@@ -369,6 +374,24 @@ static int handle_operexc(struct kvm_vcpu *vcpu)
if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0)
return -EOPNOTSUPP;
+ rc = read_guest_lc(vcpu, __LC_PGM_NEW_PSW, &newpsw, sizeof(psw_t));
+ if (rc)
+ return rc;
+ /*
+ * Avoid endless loops of operation exceptions, if the pgm new
+ * PSW will cause a new operation exception.
+ * The heuristic checks if the pgm new psw is within 6 bytes before
+ * the faulting psw address (with same DAT, AS settings) and the
+ * new psw is not a wait psw and the fault was not triggered by
+ * problem state.
+ */
+ oldpsw = vcpu->arch.sie_block->gpsw;
+ if (oldpsw.addr - newpsw.addr <= 6 &&
+ !(newpsw.mask & PSW_MASK_WAIT) &&
+ !(oldpsw.mask & PSW_MASK_PSTATE) &&
+ (newpsw.mask & PSW_MASK_ASC) == (oldpsw.mask & PSW_MASK_ASC) &&
+ (newpsw.mask & PSW_MASK_DAT) == (oldpsw.mask & PSW_MASK_DAT))
+ return -EOPNOTSUPP;
return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
}
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index b604854df02c..f5694838234d 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -218,7 +218,7 @@ static void allow_cpu_feat(unsigned long nr)
static inline int plo_test_bit(unsigned char nr)
{
register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
- int cc = 3; /* subfunction not available */
+ int cc;
asm volatile(
/* Parameter registers are ignored for "test bit" */
@@ -371,6 +371,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_IRQCHIP:
case KVM_CAP_VM_ATTRIBUTES:
case KVM_CAP_MP_STATE:
+ case KVM_CAP_IMMEDIATE_EXIT:
case KVM_CAP_S390_INJECT_IRQ:
case KVM_CAP_S390_USER_SIGP:
case KVM_CAP_S390_USER_STSI:
@@ -443,6 +444,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
struct kvm_memory_slot *memslot;
int is_dirty = 0;
+ if (kvm_is_ucontrol(kvm))
+ return -EINVAL;
+
mutex_lock(&kvm->slots_lock);
r = -EINVAL;
@@ -506,6 +510,14 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
} else if (MACHINE_HAS_VX) {
set_kvm_facility(kvm->arch.model.fac_mask, 129);
set_kvm_facility(kvm->arch.model.fac_list, 129);
+ if (test_facility(134)) {
+ set_kvm_facility(kvm->arch.model.fac_mask, 134);
+ set_kvm_facility(kvm->arch.model.fac_list, 134);
+ }
+ if (test_facility(135)) {
+ set_kvm_facility(kvm->arch.model.fac_mask, 135);
+ set_kvm_facility(kvm->arch.model.fac_list, 135);
+ }
r = 0;
} else
r = -EINVAL;
@@ -822,6 +834,13 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
}
memcpy(kvm->arch.model.fac_list, proc->fac_list,
S390_ARCH_FAC_LIST_SIZE_BYTE);
+ VM_EVENT(kvm, 3, "SET: guest ibc: 0x%4.4x, guest cpuid: 0x%16.16llx",
+ kvm->arch.model.ibc,
+ kvm->arch.model.cpuid);
+ VM_EVENT(kvm, 3, "SET: guest faclist: 0x%16.16llx.%16.16llx.%16.16llx",
+ kvm->arch.model.fac_list[0],
+ kvm->arch.model.fac_list[1],
+ kvm->arch.model.fac_list[2]);
} else
ret = -EFAULT;
kfree(proc);
@@ -895,6 +914,13 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr)
proc->ibc = kvm->arch.model.ibc;
memcpy(&proc->fac_list, kvm->arch.model.fac_list,
S390_ARCH_FAC_LIST_SIZE_BYTE);
+ VM_EVENT(kvm, 3, "GET: guest ibc: 0x%4.4x, guest cpuid: 0x%16.16llx",
+ kvm->arch.model.ibc,
+ kvm->arch.model.cpuid);
+ VM_EVENT(kvm, 3, "GET: guest faclist: 0x%16.16llx.%16.16llx.%16.16llx",
+ kvm->arch.model.fac_list[0],
+ kvm->arch.model.fac_list[1],
+ kvm->arch.model.fac_list[2]);
if (copy_to_user((void __user *)attr->addr, proc, sizeof(*proc)))
ret = -EFAULT;
kfree(proc);
@@ -918,6 +944,17 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr)
S390_ARCH_FAC_LIST_SIZE_BYTE);
memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list,
sizeof(S390_lowcore.stfle_fac_list));
+ VM_EVENT(kvm, 3, "GET: host ibc: 0x%4.4x, host cpuid: 0x%16.16llx",
+ kvm->arch.model.ibc,
+ kvm->arch.model.cpuid);
+ VM_EVENT(kvm, 3, "GET: host facmask: 0x%16.16llx.%16.16llx.%16.16llx",
+ mach->fac_mask[0],
+ mach->fac_mask[1],
+ mach->fac_mask[2]);
+ VM_EVENT(kvm, 3, "GET: host faclist: 0x%16.16llx.%16.16llx.%16.16llx",
+ mach->fac_list[0],
+ mach->fac_list[1],
+ mach->fac_list[2]);
if (copy_to_user((void __user *)attr->addr, mach, sizeof(*mach)))
ret = -EFAULT;
kfree(mach);
@@ -1939,6 +1976,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi)
vcpu->arch.sie_block->ecb2 |= 0x08;
+ if (test_kvm_facility(vcpu->kvm, 130))
+ vcpu->arch.sie_block->ecb2 |= 0x20;
vcpu->arch.sie_block->eca = 0x1002000U;
if (sclp.has_cei)
vcpu->arch.sie_block->eca |= 0x80000000U;
@@ -2579,7 +2618,7 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
* to look up the current opcode to get the length of the instruction
* to be able to forward the PSW.
*/
- rc = read_guest_instr(vcpu, &opcode, 1);
+ rc = read_guest_instr(vcpu, vcpu->arch.sie_block->gpsw.addr, &opcode, 1);
ilen = insn_length(opcode);
if (rc < 0) {
return rc;
@@ -2761,6 +2800,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
int rc;
sigset_t sigsaved;
+ if (kvm_run->immediate_exit)
+ return -EINTR;
+
if (guestdbg_exit_pending(vcpu)) {
kvm_s390_prepare_debug_exit(vcpu);
return 0;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 3a4e97f1a9e6..af9fa91a0c91 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -86,9 +86,7 @@ static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix)
kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
}
-typedef u8 __bitwise ar_t;
-
-static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, ar_t *ar)
+static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, u8 *ar)
{
u32 base2 = vcpu->arch.sie_block->ipb >> 28;
u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
@@ -101,7 +99,7 @@ static inline u64 kvm_s390_get_base_disp_s(struct kvm_vcpu *vcpu, ar_t *ar)
static inline void kvm_s390_get_base_disp_sse(struct kvm_vcpu *vcpu,
u64 *address1, u64 *address2,
- ar_t *ar_b1, ar_t *ar_b2)
+ u8 *ar_b1, u8 *ar_b2)
{
u32 base1 = (vcpu->arch.sie_block->ipb & 0xf0000000) >> 28;
u32 disp1 = (vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16;
@@ -125,7 +123,7 @@ static inline void kvm_s390_get_regs_rre(struct kvm_vcpu *vcpu, int *r1, int *r2
*r2 = (vcpu->arch.sie_block->ipb & 0x000f0000) >> 16;
}
-static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, ar_t *ar)
+static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, u8 *ar)
{
u32 base2 = vcpu->arch.sie_block->ipb >> 28;
u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16) +
@@ -140,7 +138,7 @@ static inline u64 kvm_s390_get_base_disp_rsy(struct kvm_vcpu *vcpu, ar_t *ar)
return (base2 ? vcpu->run->s.regs.gprs[base2] : 0) + (long)(int)disp2;
}
-static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu, ar_t *ar)
+static inline u64 kvm_s390_get_base_disp_rs(struct kvm_vcpu *vcpu, u8 *ar)
{
u32 base2 = vcpu->arch.sie_block->ipb >> 28;
u32 disp2 = ((vcpu->arch.sie_block->ipb & 0x0fff0000) >> 16);
@@ -379,7 +377,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu);
void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu);
int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu);
-void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
/* support for Basic/Extended SCA handling */
static inline union ipte_control *kvm_s390_get_ipte_control(struct kvm *kvm)
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 794503516bd4..fb4b494cde9b 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -54,7 +54,7 @@ int kvm_s390_handle_aa(struct kvm_vcpu *vcpu)
static int handle_set_clock(struct kvm_vcpu *vcpu)
{
int rc;
- ar_t ar;
+ u8 ar;
u64 op2, val;
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
@@ -79,7 +79,7 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu)
u64 operand2;
u32 address;
int rc;
- ar_t ar;
+ u8 ar;
vcpu->stat.instruction_spx++;
@@ -117,7 +117,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu)
u64 operand2;
u32 address;
int rc;
- ar_t ar;
+ u8 ar;
vcpu->stat.instruction_stpx++;
@@ -147,7 +147,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
u16 vcpu_id = vcpu->vcpu_id;
u64 ga;
int rc;
- ar_t ar;
+ u8 ar;
vcpu->stat.instruction_stap++;
@@ -380,7 +380,7 @@ static int handle_tpi(struct kvm_vcpu *vcpu)
u32 tpi_data[3];
int rc;
u64 addr;
- ar_t ar;
+ u8 ar;
addr = kvm_s390_get_base_disp_s(vcpu, &ar);
if (addr & 3)
@@ -548,7 +548,7 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu)
psw_compat_t new_psw;
u64 addr;
int rc;
- ar_t ar;
+ u8 ar;
if (gpsw->mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
@@ -575,7 +575,7 @@ static int handle_lpswe(struct kvm_vcpu *vcpu)
psw_t new_psw;
u64 addr;
int rc;
- ar_t ar;
+ u8 ar;
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
@@ -597,7 +597,7 @@ static int handle_stidp(struct kvm_vcpu *vcpu)
u64 stidp_data = vcpu->kvm->arch.model.cpuid;
u64 operand2;
int rc;
- ar_t ar;
+ u8 ar;
vcpu->stat.instruction_stidp++;
@@ -644,7 +644,7 @@ static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem)
ASCEBC(mem->vm[0].cpi, 16);
}
-static void insert_stsi_usr_data(struct kvm_vcpu *vcpu, u64 addr, ar_t ar,
+static void insert_stsi_usr_data(struct kvm_vcpu *vcpu, u64 addr, u8 ar,
u8 fc, u8 sel1, u16 sel2)
{
vcpu->run->exit_reason = KVM_EXIT_S390_STSI;
@@ -663,7 +663,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu)
unsigned long mem = 0;
u64 operand2;
int rc = 0;
- ar_t ar;
+ u8 ar;
vcpu->stat.instruction_stsi++;
VCPU_EVENT(vcpu, 3, "STSI: fc: %u sel1: %u sel2: %u", fc, sel1, sel2);
@@ -970,7 +970,7 @@ int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu)
int reg, rc, nr_regs;
u32 ctl_array[16];
u64 ga;
- ar_t ar;
+ u8 ar;
vcpu->stat.instruction_lctl++;
@@ -1009,7 +1009,7 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu)
int reg, rc, nr_regs;
u32 ctl_array[16];
u64 ga;
- ar_t ar;
+ u8 ar;
vcpu->stat.instruction_stctl++;
@@ -1043,7 +1043,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu)
int reg, rc, nr_regs;
u64 ctl_array[16];
u64 ga;
- ar_t ar;
+ u8 ar;
vcpu->stat.instruction_lctlg++;
@@ -1081,7 +1081,7 @@ static int handle_stctg(struct kvm_vcpu *vcpu)
int reg, rc, nr_regs;
u64 ctl_array[16];
u64 ga;
- ar_t ar;
+ u8 ar;
vcpu->stat.instruction_stctg++;
@@ -1132,7 +1132,7 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
unsigned long hva, gpa;
int ret = 0, cc = 0;
bool writable;
- ar_t ar;
+ u8 ar;
vcpu->stat.instruction_tprot++;
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
index a9a9d974d9a4..38556e395915 100644
--- a/arch/s390/kvm/vsie.c
+++ b/arch/s390/kvm/vsie.c
@@ -324,6 +324,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
/* Run-time-Instrumentation */
if (test_kvm_facility(vcpu->kvm, 64))
scb_s->ecb3 |= scb_o->ecb3 & 0x01U;
+ /* Instruction Execution Prevention */
+ if (test_kvm_facility(vcpu->kvm, 130))
+ scb_s->ecb2 |= scb_o->ecb2 & 0x20U;
if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
scb_s->eca |= scb_o->eca & 0x00000001U;
if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index beb90f3993e6..b48dc5f1900b 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -744,7 +744,7 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
pgste_set_unlock(ptep, new);
pte_unmap_unlock(ptep, ptl);
- return 0;
+ return cc;
}
EXPORT_SYMBOL(reset_guest_reference_bit);
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index 8cc53b1e6d03..0cf802de52a1 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -80,6 +80,8 @@ static struct facility_def facility_defs[] = {
76, /* msa extension 3 */
77, /* msa extension 4 */
78, /* enhanced-DAT 2 */
+ 130, /* instruction-execution-protection */
+ 131, /* enhanced-SOP 2 and side-effect */
-1 /* END */
}
},
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 12080d87da3b..cb8f9149f6c8 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -177,16 +177,8 @@ static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
struct desc_struct *d = get_cpu_gdt_table(cpu);
tss_desc tss;
- /*
- * sizeof(unsigned long) coming from an extra "long" at the end
- * of the iobitmap. See tss_struct definition in processor.h
- *
- * -1? seg base+limit should be pointing to the address of the
- * last valid byte
- */
set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
- IO_BITMAP_OFFSET + IO_BITMAP_BYTES +
- sizeof(unsigned long) - 1);
+ __KERNEL_TSS_LIMIT);
write_gdt_entry(d, entry, &tss, DESC_TSS);
}
@@ -213,6 +205,54 @@ static inline void native_load_tr_desc(void)
asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
}
+static inline void force_reload_TR(void)
+{
+ struct desc_struct *d = get_cpu_gdt_table(smp_processor_id());
+ tss_desc tss;
+
+ memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc));
+
+ /*
+ * LTR requires an available TSS, and the TSS is currently
+ * busy. Make it be available so that LTR will work.
+ */
+ tss.type = DESC_TSS;
+ write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS);
+
+ load_TR_desc();
+}
+
+DECLARE_PER_CPU(bool, need_tr_refresh);
+
+static inline void refresh_TR(void)
+{
+ DEBUG_LOCKS_WARN_ON(preemptible());
+
+ if (unlikely(this_cpu_read(need_tr_refresh))) {
+ force_reload_TR();
+ this_cpu_write(need_tr_refresh, false);
+ }
+}
+
+/*
+ * If you do something evil that corrupts the cached TSS limit (I'm looking
+ * at you, VMX exits), call this function.
+ *
+ * The optimization here is that the TSS limit only matters for Linux if the
+ * IO bitmap is in use. If the TSS limit gets forced to its minimum value,
+ * everything works except that IO bitmap will be ignored and all CPL 3 IO
+ * instructions will #GP, which is exactly what we want for normal tasks.
+ */
+static inline void invalidate_tss_limit(void)
+{
+ DEBUG_LOCKS_WARN_ON(preemptible());
+
+ if (unlikely(test_thread_flag(TIF_IO_BITMAP)))
+ force_reload_TR();
+ else
+ this_cpu_write(need_tr_refresh, true);
+}
+
static inline void native_load_gdt(const struct desc_ptr *dtr)
{
asm volatile("lgdt %0"::"m" (*dtr));
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index e9cd7befcb76..3e8c287090e4 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -441,5 +441,6 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq);
void emulator_invalidate_register_cache(struct x86_emulate_ctxt *ctxt);
void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt);
+bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt);
#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a7066dc1a7e9..74ef58c8ff53 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -55,7 +55,6 @@
#define KVM_REQ_TRIPLE_FAULT 10
#define KVM_REQ_MMU_SYNC 11
#define KVM_REQ_CLOCK_UPDATE 12
-#define KVM_REQ_DEACTIVATE_FPU 13
#define KVM_REQ_EVENT 14
#define KVM_REQ_APF_HALT 15
#define KVM_REQ_STEAL_UPDATE 16
@@ -115,7 +114,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
#define KVM_PERMILLE_MMU_PAGES 20
#define KVM_MIN_ALLOC_MMU_PAGES 64
-#define KVM_MMU_HASH_SHIFT 10
+#define KVM_MMU_HASH_SHIFT 12
#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
#define KVM_MIN_FREE_MMU_PAGES 5
#define KVM_REFILL_PAGES 25
@@ -208,6 +207,13 @@ enum {
PFERR_WRITE_MASK | \
PFERR_PRESENT_MASK)
+/*
+ * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
+ * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting
+ * with the SVE bit in EPT PTEs.
+ */
+#define SPTE_SPECIAL_MASK (1ULL << 62)
+
/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
/*
@@ -668,6 +674,9 @@ struct kvm_vcpu_arch {
int pending_ioapic_eoi;
int pending_external_vector;
+
+ /* GPA available (AMD only) */
+ bool gpa_available;
};
struct kvm_lpage_info {
@@ -716,6 +725,12 @@ struct kvm_hv {
HV_REFERENCE_TSC_PAGE tsc_ref;
};
+enum kvm_irqchip_mode {
+ KVM_IRQCHIP_NONE,
+ KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */
+ KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */
+};
+
struct kvm_arch {
unsigned int n_used_mmu_pages;
unsigned int n_requested_mmu_pages;
@@ -788,7 +803,7 @@ struct kvm_arch {
u64 disabled_quirks;
- bool irqchip_split;
+ enum kvm_irqchip_mode irqchip_mode;
u8 nr_reserved_ioapic_pins;
bool disabled_lapic_found;
@@ -815,6 +830,7 @@ struct kvm_vm_stat {
ulong mmu_unsync;
ulong remote_tlb_flush;
ulong lpages;
+ ulong max_mmu_page_hash_collisions;
};
struct kvm_vcpu_stat {
@@ -844,6 +860,7 @@ struct kvm_vcpu_stat {
u64 hypercalls;
u64 irq_injections;
u64 nmi_injections;
+ u64 req_event;
};
struct x86_instruction_info;
@@ -918,8 +935,6 @@ struct kvm_x86_ops {
unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
u32 (*get_pkru)(struct kvm_vcpu *vcpu);
- void (*fpu_activate)(struct kvm_vcpu *vcpu);
- void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
void (*tlb_flush)(struct kvm_vcpu *vcpu);
@@ -951,7 +966,7 @@ struct kvm_x86_ops {
void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
- void (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
+ int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu);
int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
int (*get_tdp_level)(void);
u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
@@ -1050,7 +1065,8 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask);
+ u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
+ u64 acc_track_mask);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
diff --git a/arch/x86/include/asm/kvmclock.h b/arch/x86/include/asm/kvmclock.h
new file mode 100644
index 000000000000..f260bef63591
--- /dev/null
+++ b/arch/x86/include/asm/kvmclock.h
@@ -0,0 +1,6 @@
+#ifndef _ASM_X86_KVM_CLOCK_H
+#define _ASM_X86_KVM_CLOCK_H
+
+extern struct clocksource kvm_clock;
+
+#endif /* _ASM_X86_KVM_CLOCK_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 1eea6ca40694..f75fbfe550f2 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -673,7 +673,7 @@ static __always_inline void pv_kick(int cpu)
PVOP_VCALL1(pv_lock_ops.kick, cpu);
}
-static __always_inline bool pv_vcpu_is_preempted(int cpu)
+static __always_inline bool pv_vcpu_is_preempted(long cpu)
{
return PVOP_CALLEE1(bool, pv_lock_ops.vcpu_is_preempted, cpu);
}
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index e6cfe7ba2d65..f385eca5407a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -304,7 +304,7 @@ struct x86_hw_tss {
u16 reserved5;
u16 io_bitmap_base;
-} __attribute__((packed)) ____cacheline_aligned;
+} __attribute__((packed));
#endif
/*
@@ -342,6 +342,16 @@ struct tss_struct {
DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+/*
+ * sizeof(unsigned long) coming from an extra "long" at the end
+ * of the iobitmap.
+ *
+ * -1? seg base+limit should be pointing to the address of the
+ * last valid byte
+ */
+#define __KERNEL_TSS_LIMIT \
+ (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1)
+
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
#endif
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index c343ab52579f..48a706f641f2 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -34,7 +34,7 @@ static inline void queued_spin_unlock(struct qspinlock *lock)
}
#define vcpu_is_preempted vcpu_is_preempted
-static inline bool vcpu_is_preempted(int cpu)
+static inline bool vcpu_is_preempted(long cpu)
{
return pv_vcpu_is_preempted(cpu);
}
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 2b5b2d4b924e..cc54b7026567 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -467,8 +467,16 @@ enum vmcs_field {
#define VMX_EPT_WRITABLE_MASK 0x2ull
#define VMX_EPT_EXECUTABLE_MASK 0x4ull
#define VMX_EPT_IPAT_BIT (1ull << 6)
-#define VMX_EPT_ACCESS_BIT (1ull << 8)
-#define VMX_EPT_DIRTY_BIT (1ull << 9)
+#define VMX_EPT_ACCESS_BIT (1ull << 8)
+#define VMX_EPT_DIRTY_BIT (1ull << 9)
+#define VMX_EPT_RWX_MASK (VMX_EPT_READABLE_MASK | \
+ VMX_EPT_WRITABLE_MASK | \
+ VMX_EPT_EXECUTABLE_MASK)
+#define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT)
+
+/* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */
+#define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \
+ VMX_EPT_EXECUTABLE_MASK)
#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
@@ -500,6 +508,22 @@ struct vmx_msr_entry {
#define ENTRY_FAIL_VMCS_LINK_PTR 4
/*
+ * Exit Qualifications for EPT Violations
+ */
+#define EPT_VIOLATION_ACC_READ_BIT 0
+#define EPT_VIOLATION_ACC_WRITE_BIT 1
+#define EPT_VIOLATION_ACC_INSTR_BIT 2
+#define EPT_VIOLATION_READABLE_BIT 3
+#define EPT_VIOLATION_WRITABLE_BIT 4
+#define EPT_VIOLATION_EXECUTABLE_BIT 5
+#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT)
+#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT)
+#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT)
+#define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT)
+#define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT)
+#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT)
+
+/*
* VM-instruction error numbers
*/
enum vm_instruction_error_number {
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 1421a6585126..cff0bb6556f8 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -50,6 +50,15 @@ struct kvm_steal_time {
__u32 pad[11];
};
+#define KVM_CLOCK_PAIRING_WALLCLOCK 0
+struct kvm_clock_pairing {
+ __s64 sec;
+ __s64 nsec;
+ __u64 tsc;
+ __u32 flags;
+ __u32 pad[9];
+};
+
#define KVM_STEAL_ALIGNMENT_BITS 5
#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1)))
#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1)
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 210927ee2e74..99332f550c48 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -13,6 +13,10 @@ static char syscalls_ia32[] = {
#include <asm/syscalls_32.h>
};
+#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#include <asm/kvm_para.h>
+#endif
+
int main(void)
{
#ifdef CONFIG_PARAVIRT
@@ -22,6 +26,11 @@ int main(void)
BLANK();
#endif
+#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+ OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted);
+ BLANK();
+#endif
+
#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
ENTRY(bx);
ENTRY(cx);
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 589b3193f102..b01bc8517450 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -16,6 +16,7 @@
#include <linux/syscalls.h>
#include <linux/bitmap.h>
#include <asm/syscalls.h>
+#include <asm/desc.h>
/*
* this changes the io permissions bitmap in the current task.
@@ -45,6 +46,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
memset(bitmap, 0xff, IO_BITMAP_BYTES);
t->io_bitmap_ptr = bitmap;
set_thread_flag(TIF_IO_BITMAP);
+
+ preempt_disable();
+ refresh_TR();
+ preempt_enable();
}
/*
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 099fcba4981d..14f65a5f938e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -589,7 +589,8 @@ out:
local_irq_restore(flags);
}
-__visible bool __kvm_vcpu_is_preempted(int cpu)
+#ifdef CONFIG_X86_32
+__visible bool __kvm_vcpu_is_preempted(long cpu)
{
struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
@@ -597,6 +598,29 @@ __visible bool __kvm_vcpu_is_preempted(int cpu)
}
PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
+#else
+
+#include <asm/asm-offsets.h>
+
+extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
+
+/*
+ * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
+ * restoring to/from the stack.
+ */
+asm(
+".pushsection .text;"
+".global __raw_callee_save___kvm_vcpu_is_preempted;"
+".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
+"__raw_callee_save___kvm_vcpu_is_preempted:"
+"movq __per_cpu_offset(,%rdi,8), %rax;"
+"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
+"setne %al;"
+"ret;"
+".popsection");
+
+#endif
+
/*
* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
*/
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 542710b99f52..bae6ea6cfb94 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -28,6 +28,7 @@
#include <asm/x86_init.h>
#include <asm/reboot.h>
+#include <asm/kvmclock.h>
static int kvmclock __ro_after_init = 1;
static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
@@ -49,6 +50,7 @@ struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
{
return hv_clock;
}
+EXPORT_SYMBOL_GPL(pvclock_pvti_cpu0_va);
/*
* The wallclock is the time of day when we booted. Since then, some time may
@@ -174,13 +176,14 @@ bool kvm_check_and_clear_guest_paused(void)
return ret;
}
-static struct clocksource kvm_clock = {
+struct clocksource kvm_clock = {
.name = "kvm-clock",
.read = kvm_clock_get_cycles,
.rating = 400,
.mask = CLOCKSOURCE_MASK(64),
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
+EXPORT_SYMBOL_GPL(kvm_clock);
int kvm_register_clock(char *txt)
{
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 6259327f3454..8f2d1c9d43a8 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -20,7 +20,7 @@ bool pv_is_native_spin_unlock(void)
__raw_callee_save___native_queued_spin_unlock;
}
-__visible bool __native_vcpu_is_preempted(int cpu)
+__visible bool __native_vcpu_is_preempted(long cpu)
{
return false;
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b615a1113f58..7780efa635b9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -32,6 +32,7 @@
#include <asm/mce.h>
#include <asm/vm86.h>
#include <asm/switch_to.h>
+#include <asm/desc.h>
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -64,6 +65,9 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
};
EXPORT_PER_CPU_SYMBOL(cpu_tss);
+DEFINE_PER_CPU(bool, need_tr_refresh);
+EXPORT_PER_CPU_SYMBOL_GPL(need_tr_refresh);
+
/*
* this gets called so that we can store lazy state into memory and copy the
* current task into the new thread.
@@ -209,6 +213,12 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
*/
memcpy(tss->io_bitmap, next->io_bitmap_ptr,
max(prev->io_bitmap_max, next->io_bitmap_max));
+
+ /*
+ * Make sure that the TSS limit is correct for the CPU
+ * to notice the IO bitmap.
+ */
+ refresh_TR();
} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
/*
* Clear any possible leftover bits:
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index e85f6bd7b9d5..1d155cc56629 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -123,8 +123,6 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
- kvm_x86_ops->fpu_activate(vcpu);
-
/*
* The existing code assumes virtual address is 48-bit in the canonical
* address checks; exit if it is ever changed.
@@ -383,7 +381,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 7.0.ecx*/
const u32 kvm_cpuid_7_0_ecx_x86_features =
- F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/;
+ F(AVX512VBMI) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ);
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
@@ -861,12 +859,6 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
if (!best)
best = check_cpuid_limit(vcpu, function, index);
- /*
- * Perfmon not yet supported for L2 guest.
- */
- if (is_guest_mode(vcpu) && function == 0xa)
- best = NULL;
-
if (best) {
*eax = best->eax;
*ebx = best->ebx;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index cedbba0f3402..45c7306c8780 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -173,6 +173,7 @@
#define NearBranch ((u64)1 << 52) /* Near branches */
#define No16 ((u64)1 << 53) /* No 16 bit operand */
#define IncSP ((u64)1 << 54) /* SP is incremented before ModRM calc */
+#define TwoMemOp ((u64)1 << 55) /* Instruction has two memory operand */
#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
@@ -4298,7 +4299,7 @@ static const struct opcode group1[] = {
};
static const struct opcode group1A[] = {
- I(DstMem | SrcNone | Mov | Stack | IncSP, em_pop), N, N, N, N, N, N, N,
+ I(DstMem | SrcNone | Mov | Stack | IncSP | TwoMemOp, em_pop), N, N, N, N, N, N, N,
};
static const struct opcode group2[] = {
@@ -4336,7 +4337,7 @@ static const struct opcode group5[] = {
I(SrcMemFAddr | ImplicitOps, em_call_far),
I(SrcMem | NearBranch, em_jmp_abs),
I(SrcMemFAddr | ImplicitOps, em_jmp_far),
- I(SrcMem | Stack, em_push), D(Undefined),
+ I(SrcMem | Stack | TwoMemOp, em_push), D(Undefined),
};
static const struct opcode group6[] = {
@@ -4556,8 +4557,8 @@ static const struct opcode opcode_table[256] = {
/* 0xA0 - 0xA7 */
I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
- I2bv(SrcSI | DstDI | Mov | String, em_mov),
- F2bv(SrcSI | DstDI | String | NoWrite, em_cmp_r),
+ I2bv(SrcSI | DstDI | Mov | String | TwoMemOp, em_mov),
+ F2bv(SrcSI | DstDI | String | NoWrite | TwoMemOp, em_cmp_r),
/* 0xA8 - 0xAF */
F2bv(DstAcc | SrcImm | NoWrite, em_test),
I2bv(SrcAcc | DstDI | Mov | String, em_mov),
@@ -5671,3 +5672,14 @@ void emulator_writeback_register_cache(struct x86_emulate_ctxt *ctxt)
{
writeback_registers(ctxt);
}
+
+bool emulator_can_use_gpa(struct x86_emulate_ctxt *ctxt)
+{
+ if (ctxt->rep_prefix && (ctxt->d & String))
+ return false;
+
+ if (ctxt->d & TwoMemOp)
+ return false;
+
+ return true;
+}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 2ecd7dab4631..f701d4430727 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -305,13 +305,13 @@ static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
return -ENOENT;
memset(&irq, 0, sizeof(irq));
- irq.dest_id = kvm_apic_id(vcpu->arch.apic);
+ irq.shorthand = APIC_DEST_SELF;
irq.dest_mode = APIC_DEST_PHYSICAL;
irq.delivery_mode = APIC_DM_FIXED;
irq.vector = vector;
irq.level = 1;
- ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL);
+ ret = kvm_irq_delivery_to_apic(vcpu->kvm, vcpu->arch.apic, &irq, NULL);
trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret);
return ret;
}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 7cc2360f1848..73ea24d4f119 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -598,14 +598,14 @@ static const struct kvm_io_device_ops picdev_eclr_ops = {
.write = picdev_eclr_write,
};
-struct kvm_pic *kvm_create_pic(struct kvm *kvm)
+int kvm_pic_init(struct kvm *kvm)
{
struct kvm_pic *s;
int ret;
s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
if (!s)
- return NULL;
+ return -ENOMEM;
spin_lock_init(&s->lock);
s->kvm = kvm;
s->pics[0].elcr_mask = 0xf8;
@@ -635,7 +635,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
mutex_unlock(&kvm->slots_lock);
- return s;
+ kvm->arch.vpic = s;
+
+ return 0;
fail_unreg_1:
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave);
@@ -648,13 +650,17 @@ fail_unlock:
kfree(s);
- return NULL;
+ return ret;
}
-void kvm_destroy_pic(struct kvm_pic *vpic)
+void kvm_pic_destroy(struct kvm *kvm)
{
+ struct kvm_pic *vpic = kvm->arch.vpic;
+
kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master);
kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave);
kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr);
+
+ kvm->arch.vpic = NULL;
kfree(vpic);
}
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 035731eb3897..40d5b2cf6061 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -73,8 +73,8 @@ struct kvm_pic {
unsigned long irq_states[PIC_NUM_PINS];
};
-struct kvm_pic *kvm_create_pic(struct kvm *kvm);
-void kvm_destroy_pic(struct kvm_pic *vpic);
+int kvm_pic_init(struct kvm *kvm);
+void kvm_pic_destroy(struct kvm *kvm);
int kvm_pic_read_irq(struct kvm *kvm);
void kvm_pic_update_irq(struct kvm_pic *s);
@@ -93,18 +93,19 @@ static inline int pic_in_kernel(struct kvm *kvm)
static inline int irqchip_split(struct kvm *kvm)
{
- return kvm->arch.irqchip_split;
+ return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT;
}
-static inline int irqchip_in_kernel(struct kvm *kvm)
+static inline int irqchip_kernel(struct kvm *kvm)
{
- struct kvm_pic *vpic = pic_irqchip(kvm);
- bool ret;
+ return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL;
+}
- ret = (vpic != NULL);
- ret |= irqchip_split(kvm);
+static inline int irqchip_in_kernel(struct kvm *kvm)
+{
+ bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE;
- /* Read vpic before kvm->irq_routing. */
+ /* Matches with wmb after initializing kvm->irq_routing. */
smp_rmb();
return ret;
}
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 6c0191615f23..b96d3893f121 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -41,15 +41,6 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
bool line_status)
{
struct kvm_pic *pic = pic_irqchip(kvm);
-
- /*
- * XXX: rejecting pic routes when pic isn't in use would be better,
- * but the default routing table is installed while kvm->arch.vpic is
- * NULL and KVM_CREATE_IRQCHIP can race with KVM_IRQ_LINE.
- */
- if (!pic)
- return -1;
-
return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
}
@@ -58,10 +49,6 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
bool line_status)
{
struct kvm_ioapic *ioapic = kvm->arch.vioapic;
-
- if (!ioapic)
- return -1;
-
return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
line_status);
}
@@ -297,16 +284,20 @@ int kvm_set_routing_entry(struct kvm *kvm,
case KVM_IRQ_ROUTING_IRQCHIP:
delta = 0;
switch (ue->u.irqchip.irqchip) {
- case KVM_IRQCHIP_PIC_MASTER:
- e->set = kvm_set_pic_irq;
- max_pin = PIC_NUM_PINS;
- break;
case KVM_IRQCHIP_PIC_SLAVE:
+ delta = 8;
+ /* fall through */
+ case KVM_IRQCHIP_PIC_MASTER:
+ if (!pic_in_kernel(kvm))
+ goto out;
+
e->set = kvm_set_pic_irq;
max_pin = PIC_NUM_PINS;
- delta = 8;
break;
case KVM_IRQCHIP_IOAPIC:
+ if (!ioapic_in_kernel(kvm))
+ goto out;
+
max_pin = KVM_IOAPIC_NUM_PINS;
e->set = kvm_set_ioapic_irq;
break;
@@ -409,7 +400,7 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm)
void kvm_arch_post_irq_routing_update(struct kvm *kvm)
{
- if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm))
+ if (!irqchip_split(kvm))
return;
kvm_make_scan_ioapic_request(kvm);
}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 2f6ef5121a4c..bad6a25067bc 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -115,6 +115,16 @@ static inline int apic_enabled(struct kvm_lapic *apic)
(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
+static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
+{
+ return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
+}
+
+static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
+{
+ return apic->vcpu->vcpu_id;
+}
+
static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
switch (map->mode) {
@@ -159,13 +169,13 @@ static void recalculate_apic_map(struct kvm *kvm)
struct kvm_apic_map *new, *old = NULL;
struct kvm_vcpu *vcpu;
int i;
- u32 max_id = 255;
+ u32 max_id = 255; /* enough space for any xAPIC ID */
mutex_lock(&kvm->arch.apic_map_lock);
kvm_for_each_vcpu(i, vcpu, kvm)
if (kvm_apic_present(vcpu))
- max_id = max(max_id, kvm_apic_id(vcpu->arch.apic));
+ max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
new = kvm_kvzalloc(sizeof(struct kvm_apic_map) +
sizeof(struct kvm_lapic *) * ((u64)max_id + 1));
@@ -179,16 +189,28 @@ static void recalculate_apic_map(struct kvm *kvm)
struct kvm_lapic *apic = vcpu->arch.apic;
struct kvm_lapic **cluster;
u16 mask;
- u32 ldr, aid;
+ u32 ldr;
+ u8 xapic_id;
+ u32 x2apic_id;
if (!kvm_apic_present(vcpu))
continue;
- aid = kvm_apic_id(apic);
- ldr = kvm_lapic_get_reg(apic, APIC_LDR);
+ xapic_id = kvm_xapic_id(apic);
+ x2apic_id = kvm_x2apic_id(apic);
- if (aid <= new->max_apic_id)
- new->phys_map[aid] = apic;
+ /* Hotplug hack: see kvm_apic_match_physical_addr(), ... */
+ if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
+ x2apic_id <= new->max_apic_id)
+ new->phys_map[x2apic_id] = apic;
+ /*
+ * ... xAPIC ID of VCPUs with APIC ID > 0xff will wrap-around,
+ * prevent them from masking VCPUs with APIC ID <= 0xff.
+ */
+ if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])
+ new->phys_map[xapic_id] = apic;
+
+ ldr = kvm_lapic_get_reg(apic, APIC_LDR);
if (apic_x2apic_mode(apic)) {
new->mode |= KVM_APIC_MODE_X2APIC;
@@ -250,6 +272,8 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
{
u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
+ WARN_ON_ONCE(id != apic->vcpu->vcpu_id);
+
kvm_lapic_set_reg(apic, APIC_ID, id);
kvm_lapic_set_reg(apic, APIC_LDR, ldr);
recalculate_apic_map(apic->vcpu->kvm);
@@ -317,7 +341,7 @@ static int find_highest_vector(void *bitmap)
vec >= 0; vec -= APIC_VECTORS_PER_REG) {
reg = bitmap + REG_POS(vec);
if (*reg)
- return fls(*reg) - 1 + vec;
+ return __fls(*reg) + vec;
}
return -1;
@@ -337,27 +361,32 @@ static u8 count_vectors(void *bitmap)
return count;
}
-void __kvm_apic_update_irr(u32 *pir, void *regs)
+int __kvm_apic_update_irr(u32 *pir, void *regs)
{
- u32 i, pir_val;
+ u32 i, vec;
+ u32 pir_val, irr_val;
+ int max_irr = -1;
- for (i = 0; i <= 7; i++) {
+ for (i = vec = 0; i <= 7; i++, vec += 32) {
pir_val = READ_ONCE(pir[i]);
+ irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10));
if (pir_val) {
- pir_val = xchg(&pir[i], 0);
- *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val;
+ irr_val |= xchg(&pir[i], 0);
+ *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val;
}
+ if (irr_val)
+ max_irr = __fls(irr_val) + vec;
}
+
+ return max_irr;
}
EXPORT_SYMBOL_GPL(__kvm_apic_update_irr);
-void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
+int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- __kvm_apic_update_irr(pir, apic->regs);
-
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ return __kvm_apic_update_irr(pir, apic->regs);
}
EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
@@ -377,8 +406,6 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
if (!apic->irr_pending)
return -1;
- if (apic->vcpu->arch.apicv_active)
- kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
result = apic_search_irr(apic);
ASSERT(result == -1 || result >= 16);
@@ -392,9 +419,10 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
vcpu = apic->vcpu;
if (unlikely(vcpu->arch.apicv_active)) {
- /* try to update RVI */
+ /* need to update RVI */
apic_clear_vector(vec, apic->regs + APIC_IRR);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_x86_ops->hwapic_irr_update(vcpu,
+ apic_find_highest_irr(apic));
} else {
apic->irr_pending = false;
apic_clear_vector(vec, apic->regs + APIC_IRR);
@@ -484,6 +512,7 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
*/
return apic_find_highest_irr(vcpu->arch.apic);
}
+EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
int vector, int level, int trig_mode,
@@ -500,16 +529,14 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
{
-
- return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val,
- sizeof(val));
+ return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val,
+ sizeof(val));
}
static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val)
{
-
- return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val,
- sizeof(*val));
+ return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val,
+ sizeof(*val));
}
static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu)
@@ -546,7 +573,19 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
}
-static void apic_update_ppr(struct kvm_lapic *apic)
+static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
+{
+ int highest_irr;
+ if (kvm_x86_ops->sync_pir_to_irr && apic->vcpu->arch.apicv_active)
+ highest_irr = kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
+ else
+ highest_irr = apic_find_highest_irr(apic);
+ if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr)
+ return -1;
+ return highest_irr;
+}
+
+static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr)
{
u32 tpr, isrv, ppr, old_ppr;
int isr;
@@ -564,13 +603,28 @@ static void apic_update_ppr(struct kvm_lapic *apic)
apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
apic, ppr, isr, isrv);
- if (old_ppr != ppr) {
+ *new_ppr = ppr;
+ if (old_ppr != ppr)
kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr);
- if (ppr < old_ppr)
- kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
- }
+
+ return ppr < old_ppr;
+}
+
+static void apic_update_ppr(struct kvm_lapic *apic)
+{
+ u32 ppr;
+
+ if (__apic_update_ppr(apic, &ppr) &&
+ apic_has_interrupt_for_ppr(apic, ppr) != -1)
+ kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
}
+void kvm_apic_update_ppr(struct kvm_vcpu *vcpu)
+{
+ apic_update_ppr(vcpu->arch.apic);
+}
+EXPORT_SYMBOL_GPL(kvm_apic_update_ppr);
+
static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
{
kvm_lapic_set_reg(apic, APIC_TASKPRI, tpr);
@@ -579,10 +633,8 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
{
- if (apic_x2apic_mode(apic))
- return mda == X2APIC_BROADCAST;
-
- return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST;
+ return mda == (apic_x2apic_mode(apic) ?
+ X2APIC_BROADCAST : APIC_BROADCAST);
}
static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
@@ -591,9 +643,18 @@ static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
return true;
if (apic_x2apic_mode(apic))
- return mda == kvm_apic_id(apic);
+ return mda == kvm_x2apic_id(apic);
- return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic));
+ /*
+ * Hotplug hack: Make LAPIC in xAPIC mode also accept interrupts as if
+ * it were in x2APIC mode. Hotplugged VCPUs start in xAPIC mode and
+ * this allows unique addressing of VCPUs with APIC ID over 0xff.
+ * The 0xff condition is needed because writeable xAPIC ID.
+ */
+ if (kvm_x2apic_id(apic) > 0xff && mda == kvm_x2apic_id(apic))
+ return true;
+
+ return mda == kvm_xapic_id(apic);
}
static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
@@ -610,7 +671,6 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
&& (logical_id & mda & 0xffff) != 0;
logical_id = GET_APIC_LOGICAL_ID(logical_id);
- mda = GET_APIC_DEST_FIELD(mda);
switch (kvm_lapic_get_reg(apic, APIC_DFR)) {
case APIC_DFR_FLAT:
@@ -627,9 +687,9 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
/* The KVM local APIC implementation has two quirks:
*
- * - the xAPIC MDA stores the destination at bits 24-31, while this
- * is not true of struct kvm_lapic_irq's dest_id field. This is
- * just a quirk in the API and is not problematic.
+ * - Real hardware delivers interrupts destined to x2APIC ID > 0xff to LAPICs
+ * in xAPIC mode if the "destination & 0xff" matches its xAPIC ID.
+ * KVM doesn't do that aliasing.
*
* - in-kernel IOAPIC messages have to be delivered directly to
* x2APIC, because the kernel does not support interrupt remapping.
@@ -645,13 +705,12 @@ static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
struct kvm_lapic *source, struct kvm_lapic *target)
{
bool ipi = source != NULL;
- bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
- !ipi && dest_id == APIC_BROADCAST && x2apic_mda)
+ !ipi && dest_id == APIC_BROADCAST && apic_x2apic_mode(target))
return X2APIC_BROADCAST;
- return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
+ return dest_id;
}
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
@@ -1907,9 +1966,9 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.apic_arb_prio = 0;
vcpu->arch.apic_attention = 0;
- apic_debug("%s: vcpu=%p, id=%d, base_msr="
+ apic_debug("%s: vcpu=%p, id=0x%x, base_msr="
"0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
- vcpu, kvm_apic_id(apic),
+ vcpu, kvm_lapic_get_reg(apic, APIC_ID),
vcpu->arch.apic_base, apic->base_address);
}
@@ -2021,17 +2080,13 @@ nomem:
int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- int highest_irr;
+ u32 ppr;
if (!apic_enabled(apic))
return -1;
- apic_update_ppr(apic);
- highest_irr = apic_find_highest_irr(apic);
- if ((highest_irr == -1) ||
- ((highest_irr & 0xF0) <= kvm_lapic_get_reg(apic, APIC_PROCPRI)))
- return -1;
- return highest_irr;
+ __apic_update_ppr(apic, &ppr);
+ return apic_has_interrupt_for_ppr(apic, ppr);
}
int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
@@ -2067,6 +2122,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
{
int vector = kvm_apic_has_interrupt(vcpu);
struct kvm_lapic *apic = vcpu->arch.apic;
+ u32 ppr;
if (vector == -1)
return -1;
@@ -2078,13 +2134,23 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
* because the process would deliver it through the IDT.
*/
- apic_set_isr(vector, apic);
- apic_update_ppr(apic);
apic_clear_irr(vector, apic);
-
if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) {
- apic_clear_isr(vector, apic);
+ /*
+ * For auto-EOI interrupts, there might be another pending
+ * interrupt above PPR, so check whether to raise another
+ * KVM_REQ_EVENT.
+ */
apic_update_ppr(apic);
+ } else {
+ /*
+ * For normal interrupts, PPR has been raised and there cannot
+ * be a higher-priority pending interrupt---except if there was
+ * a concurrent interrupt injection, but that would have
+ * triggered KVM_REQ_EVENT already.
+ */
+ apic_set_isr(vector, apic);
+ __apic_update_ppr(apic, &ppr);
}
return vector;
@@ -2145,8 +2211,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
1 : count_vectors(apic->regs + APIC_ISR);
apic->highest_isr_cache = -1;
if (vcpu->arch.apicv_active) {
- if (kvm_x86_ops->apicv_post_state_restore)
- kvm_x86_ops->apicv_post_state_restore(vcpu);
+ kvm_x86_ops->apicv_post_state_restore(vcpu);
kvm_x86_ops->hwapic_irr_update(vcpu,
apic_find_highest_irr(apic));
kvm_x86_ops->hwapic_isr_update(vcpu,
@@ -2220,8 +2285,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention))
return;
- if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
- sizeof(u32)))
+ if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32)))
return;
apic_set_tpr(vcpu->arch.apic, data & 0xff);
@@ -2273,14 +2338,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu)
max_isr = 0;
data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24);
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data,
- sizeof(u32));
+ kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data,
+ sizeof(u32));
}
int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr)
{
if (vapic_addr) {
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ if (kvm_vcpu_gfn_to_hva_cache_init(vcpu,
&vcpu->arch.apic->vapic_cache,
vapic_addr, sizeof(u32)))
return -EINVAL;
@@ -2374,7 +2439,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
vcpu->arch.pv_eoi.msr_val = data;
if (!pv_eoi_enabled(vcpu))
return 0;
- return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data,
+ return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data,
addr, sizeof(u8));
}
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index ff8039d61672..bcbe811f3b97 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -71,8 +71,9 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len,
bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, unsigned int dest, int dest_mode);
-void __kvm_apic_update_irr(u32 *pir, void *regs);
-void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
+int __kvm_apic_update_irr(u32 *pir, void *regs);
+int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
+void kvm_apic_update_ppr(struct kvm_vcpu *vcpu);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
struct dest_map *dest_map);
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
@@ -203,17 +204,6 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
-static inline u32 kvm_apic_id(struct kvm_lapic *apic)
-{
- /* To avoid a race between apic_base and following APIC_ID update when
- * switching to x2apic_mode, the x2apic mode returns initial x2apic id.
- */
- if (apic_x2apic_mode(apic))
- return apic->vcpu->vcpu_id;
-
- return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
-}
-
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
void wait_lapic_expire(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7012de4a1fed..2fd7586aad4d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -37,6 +37,8 @@
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+#include <linux/hash.h>
+#include <linux/kern_levels.h>
#include <asm/page.h>
#include <asm/cmpxchg.h>
@@ -129,6 +131,10 @@ module_param(dbg, bool, 0644);
#define ACC_USER_MASK PT_USER_MASK
#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
+/* The mask for the R/X bits in EPT PTEs */
+#define PT64_EPT_READABLE_MASK 0x1ull
+#define PT64_EPT_EXECUTABLE_MASK 0x4ull
+
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
@@ -178,15 +184,40 @@ static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
static u64 __read_mostly shadow_present_mask;
+/*
+ * The mask/value to distinguish a PTE that has been marked not-present for
+ * access tracking purposes.
+ * The mask would be either 0 if access tracking is disabled, or
+ * SPTE_SPECIAL_MASK|VMX_EPT_RWX_MASK if access tracking is enabled.
+ */
+static u64 __read_mostly shadow_acc_track_mask;
+static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
+
+/*
+ * The mask/shift to use for saving the original R/X bits when marking the PTE
+ * as not-present for access tracking purposes. We do not save the W bit as the
+ * PTEs being access tracked also need to be dirty tracked, so the W bit will be
+ * restored only when a write is attempted to the page.
+ */
+static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
+ PT64_EPT_EXECUTABLE_MASK;
+static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
+
static void mmu_spte_set(u64 *sptep, u64 spte);
static void mmu_free_roots(struct kvm_vcpu *vcpu);
void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
{
- shadow_mmio_mask = mmio_mask;
+ shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+static inline bool is_access_track_spte(u64 spte)
+{
+ /* Always false if shadow_acc_track_mask is zero. */
+ return (spte & shadow_acc_track_mask) == shadow_acc_track_value;
+}
+
/*
* the low bit of the generation number is always presumed to be zero.
* This disables mmio caching during memslot updates. The concept is
@@ -284,17 +315,35 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
}
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask)
+ u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
+ u64 acc_track_mask)
{
+ if (acc_track_mask != 0)
+ acc_track_mask |= SPTE_SPECIAL_MASK;
+
shadow_user_mask = user_mask;
shadow_accessed_mask = accessed_mask;
shadow_dirty_mask = dirty_mask;
shadow_nx_mask = nx_mask;
shadow_x_mask = x_mask;
shadow_present_mask = p_mask;
+ shadow_acc_track_mask = acc_track_mask;
+ WARN_ON(shadow_accessed_mask != 0 && shadow_acc_track_mask != 0);
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
+void kvm_mmu_clear_all_pte_masks(void)
+{
+ shadow_user_mask = 0;
+ shadow_accessed_mask = 0;
+ shadow_dirty_mask = 0;
+ shadow_nx_mask = 0;
+ shadow_x_mask = 0;
+ shadow_mmio_mask = 0;
+ shadow_present_mask = 0;
+ shadow_acc_track_mask = 0;
+}
+
static int is_cpuid_PSE36(void)
{
return 1;
@@ -307,7 +356,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
static int is_shadow_present_pte(u64 pte)
{
- return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte);
+ return (pte != 0) && !is_mmio_spte(pte);
}
static int is_large_pte(u64 pte)
@@ -324,6 +373,11 @@ static int is_last_spte(u64 pte, int level)
return 0;
}
+static bool is_executable_pte(u64 spte)
+{
+ return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+}
+
static kvm_pfn_t spte_to_pfn(u64 pte)
{
return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -473,7 +527,7 @@ retry:
}
#endif
-static bool spte_is_locklessly_modifiable(u64 spte)
+static bool spte_can_locklessly_be_made_writable(u64 spte)
{
return (spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE)) ==
(SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE);
@@ -481,36 +535,38 @@ static bool spte_is_locklessly_modifiable(u64 spte)
static bool spte_has_volatile_bits(u64 spte)
{
+ if (!is_shadow_present_pte(spte))
+ return false;
+
/*
* Always atomically update spte if it can be updated
* out of mmu-lock, it can ensure dirty bit is not lost,
* also, it can help us to get a stable is_writable_pte()
* to ensure tlb flush is not missed.
*/
- if (spte_is_locklessly_modifiable(spte))
+ if (spte_can_locklessly_be_made_writable(spte) ||
+ is_access_track_spte(spte))
return true;
- if (!shadow_accessed_mask)
- return false;
-
- if (!is_shadow_present_pte(spte))
- return false;
-
- if ((spte & shadow_accessed_mask) &&
- (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
- return false;
+ if (shadow_accessed_mask) {
+ if ((spte & shadow_accessed_mask) == 0 ||
+ (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
+ return true;
+ }
- return true;
+ return false;
}
-static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
+static bool is_accessed_spte(u64 spte)
{
- return (old_spte & bit_mask) && !(new_spte & bit_mask);
+ return shadow_accessed_mask ? spte & shadow_accessed_mask
+ : !is_access_track_spte(spte);
}
-static bool spte_is_bit_changed(u64 old_spte, u64 new_spte, u64 bit_mask)
+static bool is_dirty_spte(u64 spte)
{
- return (old_spte & bit_mask) != (new_spte & bit_mask);
+ return shadow_dirty_mask ? spte & shadow_dirty_mask
+ : spte & PT_WRITABLE_MASK;
}
/* Rules for using mmu_spte_set:
@@ -525,25 +581,19 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
__set_spte(sptep, new_spte);
}
-/* Rules for using mmu_spte_update:
- * Update the state bits, it means the mapped pfn is not changed.
- *
- * Whenever we overwrite a writable spte with a read-only one we
- * should flush remote TLBs. Otherwise rmap_write_protect
- * will find a read-only spte, even though the writable spte
- * might be cached on a CPU's TLB, the return value indicates this
- * case.
+/*
+ * Update the SPTE (excluding the PFN), but do not track changes in its
+ * accessed/dirty status.
*/
-static bool mmu_spte_update(u64 *sptep, u64 new_spte)
+static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
{
u64 old_spte = *sptep;
- bool ret = false;
WARN_ON(!is_shadow_present_pte(new_spte));
if (!is_shadow_present_pte(old_spte)) {
mmu_spte_set(sptep, new_spte);
- return ret;
+ return old_spte;
}
if (!spte_has_volatile_bits(old_spte))
@@ -551,45 +601,62 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
+ WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
+
+ return old_spte;
+}
+
+/* Rules for using mmu_spte_update:
+ * Update the state bits, it means the mapped pfn is not changed.
+ *
+ * Whenever we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB, the return value indicates this
+ * case.
+ *
+ * Returns true if the TLB needs to be flushed
+ */
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
+{
+ bool flush = false;
+ u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
+
+ if (!is_shadow_present_pte(old_spte))
+ return false;
+
/*
* For the spte updated out of mmu-lock is safe, since
* we always atomically update it, see the comments in
* spte_has_volatile_bits().
*/
- if (spte_is_locklessly_modifiable(old_spte) &&
+ if (spte_can_locklessly_be_made_writable(old_spte) &&
!is_writable_pte(new_spte))
- ret = true;
-
- if (!shadow_accessed_mask) {
- /*
- * We don't set page dirty when dropping non-writable spte.
- * So do it now if the new spte is becoming non-writable.
- */
- if (ret)
- kvm_set_pfn_dirty(spte_to_pfn(old_spte));
- return ret;
- }
+ flush = true;
/*
- * Flush TLB when accessed/dirty bits are changed in the page tables,
+ * Flush TLB when accessed/dirty states are changed in the page tables,
* to guarantee consistency between TLB and page tables.
*/
- if (spte_is_bit_changed(old_spte, new_spte,
- shadow_accessed_mask | shadow_dirty_mask))
- ret = true;
- if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
+ if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
+ flush = true;
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
- if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
+ }
+
+ if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
+ flush = true;
kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+ }
- return ret;
+ return flush;
}
/*
* Rules for using mmu_spte_clear_track_bits:
* It sets the sptep from present to nonpresent, and track the
* state bits, it is used to clear the last level sptep.
+ * Returns non-zero if the PTE was previously valid.
*/
static int mmu_spte_clear_track_bits(u64 *sptep)
{
@@ -613,11 +680,12 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
*/
WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
- if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+ if (is_accessed_spte(old_spte))
kvm_set_pfn_accessed(pfn);
- if (old_spte & (shadow_dirty_mask ? shadow_dirty_mask :
- PT_WRITABLE_MASK))
+
+ if (is_dirty_spte(old_spte))
kvm_set_pfn_dirty(pfn);
+
return 1;
}
@@ -636,6 +704,78 @@ static u64 mmu_spte_get_lockless(u64 *sptep)
return __get_spte_lockless(sptep);
}
+static u64 mark_spte_for_access_track(u64 spte)
+{
+ if (shadow_accessed_mask != 0)
+ return spte & ~shadow_accessed_mask;
+
+ if (shadow_acc_track_mask == 0 || is_access_track_spte(spte))
+ return spte;
+
+ /*
+ * Making an Access Tracking PTE will result in removal of write access
+ * from the PTE. So, verify that we will be able to restore the write
+ * access in the fast page fault path later on.
+ */
+ WARN_ONCE((spte & PT_WRITABLE_MASK) &&
+ !spte_can_locklessly_be_made_writable(spte),
+ "kvm: Writable SPTE is not locklessly dirty-trackable\n");
+
+ WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
+ shadow_acc_track_saved_bits_shift),
+ "kvm: Access Tracking saved bit locations are not zero\n");
+
+ spte |= (spte & shadow_acc_track_saved_bits_mask) <<
+ shadow_acc_track_saved_bits_shift;
+ spte &= ~shadow_acc_track_mask;
+ spte |= shadow_acc_track_value;
+
+ return spte;
+}
+
+/* Restore an acc-track PTE back to a regular PTE */
+static u64 restore_acc_track_spte(u64 spte)
+{
+ u64 new_spte = spte;
+ u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
+ & shadow_acc_track_saved_bits_mask;
+
+ WARN_ON_ONCE(!is_access_track_spte(spte));
+
+ new_spte &= ~shadow_acc_track_mask;
+ new_spte &= ~(shadow_acc_track_saved_bits_mask <<
+ shadow_acc_track_saved_bits_shift);
+ new_spte |= saved_bits;
+
+ return new_spte;
+}
+
+/* Returns the Accessed status of the PTE and resets it at the same time. */
+static bool mmu_spte_age(u64 *sptep)
+{
+ u64 spte = mmu_spte_get_lockless(sptep);
+
+ if (!is_accessed_spte(spte))
+ return false;
+
+ if (shadow_accessed_mask) {
+ clear_bit((ffs(shadow_accessed_mask) - 1),
+ (unsigned long *)sptep);
+ } else {
+ /*
+ * Capture the dirty status of the page, so that it doesn't get
+ * lost when the SPTE is marked for access tracking.
+ */
+ if (is_writable_pte(spte))
+ kvm_set_pfn_dirty(spte_to_pfn(spte));
+
+ spte = mark_spte_for_access_track(spte);
+ mmu_spte_update_no_track(sptep, spte);
+ }
+
+ return true;
+}
+
static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
{
/*
@@ -1212,7 +1352,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
u64 spte = *sptep;
if (!is_writable_pte(spte) &&
- !(pt_protect && spte_is_locklessly_modifiable(spte)))
+ !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
return false;
rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
@@ -1420,7 +1560,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
restart:
for_each_rmap_spte(rmap_head, &iter, sptep) {
rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
- sptep, *sptep, gfn, level);
+ sptep, *sptep, gfn, level);
need_flush = 1;
@@ -1433,7 +1573,8 @@ restart:
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~SPTE_HOST_WRITEABLE;
- new_spte &= ~shadow_accessed_mask;
+
+ new_spte = mark_spte_for_access_track(new_spte);
mmu_spte_clear_track_bits(sptep);
mmu_spte_set(sptep, new_spte);
@@ -1595,15 +1736,8 @@ static int kvm_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
struct rmap_iterator uninitialized_var(iter);
int young = 0;
- BUG_ON(!shadow_accessed_mask);
-
- for_each_rmap_spte(rmap_head, &iter, sptep) {
- if (*sptep & shadow_accessed_mask) {
- young = 1;
- clear_bit((ffs(shadow_accessed_mask) - 1),
- (unsigned long *)sptep);
- }
- }
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ young |= mmu_spte_age(sptep);
trace_kvm_age_page(gfn, level, slot, young);
return young;
@@ -1615,24 +1749,20 @@ static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head,
{
u64 *sptep;
struct rmap_iterator iter;
- int young = 0;
/*
- * If there's no access bit in the secondary pte set by the
- * hardware it's up to gup-fast/gup to set the access bit in
- * the primary pte or in the page structure.
+ * If there's no access bit in the secondary pte set by the hardware and
+ * fast access tracking is also not enabled, it's up to gup-fast/gup to
+ * set the access bit in the primary pte or in the page structure.
*/
- if (!shadow_accessed_mask)
+ if (!shadow_accessed_mask && !shadow_acc_track_mask)
goto out;
- for_each_rmap_spte(rmap_head, &iter, sptep) {
- if (*sptep & shadow_accessed_mask) {
- young = 1;
- break;
- }
- }
+ for_each_rmap_spte(rmap_head, &iter, sptep)
+ if (is_accessed_spte(*sptep))
+ return 1;
out:
- return young;
+ return 0;
}
#define RMAP_RECYCLE_THRESHOLD 1000
@@ -1660,7 +1790,7 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
* This has some overhead, but not as much as the cost of swapping
* out actively used pages or breaking up actively used hugepages.
*/
- if (!shadow_accessed_mask)
+ if (!shadow_accessed_mask && !shadow_acc_track_mask)
return kvm_handle_hva_range(kvm, start, end, 0,
kvm_unmap_rmapp);
@@ -1713,7 +1843,7 @@ static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
- return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
+ return hash_64(gfn, KVM_MMU_HASH_SHIFT);
}
static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
@@ -1904,17 +2034,17 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
* since it has been deleted from active_mmu_pages but still can be found
* at hast list.
*
- * for_each_gfn_valid_sp() has skipped that kind of pages.
+ * for_each_valid_sp() has skipped that kind of pages.
*/
-#define for_each_gfn_valid_sp(_kvm, _sp, _gfn) \
+#define for_each_valid_sp(_kvm, _sp, _gfn) \
hlist_for_each_entry(_sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
- if ((_sp)->gfn != (_gfn) || is_obsolete_sp((_kvm), (_sp)) \
- || (_sp)->role.invalid) {} else
+ if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) { \
+ } else
#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
- for_each_gfn_valid_sp(_kvm, _sp, _gfn) \
- if ((_sp)->role.direct) {} else
+ for_each_valid_sp(_kvm, _sp, _gfn) \
+ if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
/* @sp->gfn should be write-protected at the call site */
static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
@@ -2116,6 +2246,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp;
bool need_sync = false;
bool flush = false;
+ int collisions = 0;
LIST_HEAD(invalid_list);
role = vcpu->arch.mmu.base_role;
@@ -2130,7 +2261,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
- for_each_gfn_valid_sp(vcpu->kvm, sp, gfn) {
+ for_each_valid_sp(vcpu->kvm, sp, gfn) {
+ if (sp->gfn != gfn) {
+ collisions++;
+ continue;
+ }
+
if (!need_sync && sp->unsync)
need_sync = true;
@@ -2153,7 +2289,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
__clear_sp_write_flooding_count(sp);
trace_kvm_mmu_get_page(sp, false);
- return sp;
+ goto out;
}
++vcpu->kvm->stat.mmu_cache_miss;
@@ -2183,6 +2319,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
trace_kvm_mmu_get_page(sp, true);
kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
+out:
+ if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
+ vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
return sp;
}
@@ -2583,6 +2722,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= shadow_dirty_mask;
}
+ if (speculative)
+ spte = mark_spte_for_access_track(spte);
+
set_pte:
if (mmu_spte_update(sptep, spte))
kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2636,7 +2778,7 @@ static bool mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
is_large_pte(*sptep)? "2MB" : "4kB",
- *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
+ *sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
*sptep, sptep);
if (!was_rmapped && is_large_pte(*sptep))
++vcpu->kvm->stat.lpages;
@@ -2869,33 +3011,43 @@ static bool page_fault_can_be_fast(u32 error_code)
if (unlikely(error_code & PFERR_RSVD_MASK))
return false;
+ /* See if the page fault is due to an NX violation */
+ if (unlikely(((error_code & (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))
+ == (PFERR_FETCH_MASK | PFERR_PRESENT_MASK))))
+ return false;
+
/*
- * #PF can be fast only if the shadow page table is present and it
- * is caused by write-protect, that means we just need change the
- * W bit of the spte which can be done out of mmu-lock.
+ * #PF can be fast if:
+ * 1. The shadow page table entry is not present, which could mean that
+ * the fault is potentially caused by access tracking (if enabled).
+ * 2. The shadow page table entry is present and the fault
+ * is caused by write-protect, that means we just need change the W
+ * bit of the spte which can be done out of mmu-lock.
+ *
+ * However, if access tracking is disabled we know that a non-present
+ * page must be a genuine page fault where we have to create a new SPTE.
+ * So, if access tracking is disabled, we return true only for write
+ * accesses to a present page.
*/
- if (!(error_code & PFERR_PRESENT_MASK) ||
- !(error_code & PFERR_WRITE_MASK))
- return false;
- return true;
+ return shadow_acc_track_mask != 0 ||
+ ((error_code & (PFERR_WRITE_MASK | PFERR_PRESENT_MASK))
+ == (PFERR_WRITE_MASK | PFERR_PRESENT_MASK));
}
+/*
+ * Returns true if the SPTE was fixed successfully. Otherwise,
+ * someone else modified the SPTE from its original value.
+ */
static bool
fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *sptep, u64 spte)
+ u64 *sptep, u64 old_spte, u64 new_spte)
{
gfn_t gfn;
WARN_ON(!sp->role.direct);
/*
- * The gfn of direct spte is stable since it is calculated
- * by sp->gfn.
- */
- gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
-
- /*
* Theoretically we could also set dirty bit (and flush TLB) here in
* order to eliminate unnecessary PML logging. See comments in
* set_spte. But fast_page_fault is very unlikely to happen with PML
@@ -2907,12 +3059,33 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
*
* Compare with set_spte where instead shadow_dirty_mask is set.
*/
- if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+ if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
+ return false;
+
+ if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
+ /*
+ * The gfn of direct spte is stable since it is
+ * calculated by sp->gfn.
+ */
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
return true;
}
+static bool is_access_allowed(u32 fault_err_code, u64 spte)
+{
+ if (fault_err_code & PFERR_FETCH_MASK)
+ return is_executable_pte(spte);
+
+ if (fault_err_code & PFERR_WRITE_MASK)
+ return is_writable_pte(spte);
+
+ /* Fault was on Read access */
+ return spte & PT_PRESENT_MASK;
+}
+
/*
* Return value:
* - true: let the vcpu to access on the same address again.
@@ -2923,8 +3096,9 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
- bool ret = false;
+ bool fault_handled = false;
u64 spte = 0ull;
+ uint retry_count = 0;
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return false;
@@ -2933,66 +3107,93 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
return false;
walk_shadow_page_lockless_begin(vcpu);
- for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
- if (!is_shadow_present_pte(spte) || iterator.level < level)
+
+ do {
+ u64 new_spte;
+
+ for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+ if (!is_shadow_present_pte(spte) ||
+ iterator.level < level)
+ break;
+
+ sp = page_header(__pa(iterator.sptep));
+ if (!is_last_spte(spte, sp->role.level))
break;
- /*
- * If the mapping has been changed, let the vcpu fault on the
- * same address again.
- */
- if (!is_shadow_present_pte(spte)) {
- ret = true;
- goto exit;
- }
+ /*
+ * Check whether the memory access that caused the fault would
+ * still cause it if it were to be performed right now. If not,
+ * then this is a spurious fault caused by TLB lazily flushed,
+ * or some other CPU has already fixed the PTE after the
+ * current CPU took the fault.
+ *
+ * Need not check the access of upper level table entries since
+ * they are always ACC_ALL.
+ */
+ if (is_access_allowed(error_code, spte)) {
+ fault_handled = true;
+ break;
+ }
- sp = page_header(__pa(iterator.sptep));
- if (!is_last_spte(spte, sp->role.level))
- goto exit;
+ new_spte = spte;
- /*
- * Check if it is a spurious fault caused by TLB lazily flushed.
- *
- * Need not check the access of upper level table entries since
- * they are always ACC_ALL.
- */
- if (is_writable_pte(spte)) {
- ret = true;
- goto exit;
- }
+ if (is_access_track_spte(spte))
+ new_spte = restore_acc_track_spte(new_spte);
- /*
- * Currently, to simplify the code, only the spte write-protected
- * by dirty-log can be fast fixed.
- */
- if (!spte_is_locklessly_modifiable(spte))
- goto exit;
+ /*
+ * Currently, to simplify the code, write-protection can
+ * be removed in the fast path only if the SPTE was
+ * write-protected for dirty-logging or access tracking.
+ */
+ if ((error_code & PFERR_WRITE_MASK) &&
+ spte_can_locklessly_be_made_writable(spte))
+ {
+ new_spte |= PT_WRITABLE_MASK;
- /*
- * Do not fix write-permission on the large spte since we only dirty
- * the first page into the dirty-bitmap in fast_pf_fix_direct_spte()
- * that means other pages are missed if its slot is dirty-logged.
- *
- * Instead, we let the slow page fault path create a normal spte to
- * fix the access.
- *
- * See the comments in kvm_arch_commit_memory_region().
- */
- if (sp->role.level > PT_PAGE_TABLE_LEVEL)
- goto exit;
+ /*
+ * Do not fix write-permission on the large spte. Since
+ * we only dirty the first page into the dirty-bitmap in
+ * fast_pf_fix_direct_spte(), other pages are missed
+ * if its slot has dirty logging enabled.
+ *
+ * Instead, we let the slow page fault path create a
+ * normal spte to fix the access.
+ *
+ * See the comments in kvm_arch_commit_memory_region().
+ */
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ break;
+ }
+
+ /* Verify that the fault can be handled in the fast path */
+ if (new_spte == spte ||
+ !is_access_allowed(error_code, new_spte))
+ break;
+
+ /*
+ * Currently, fast page fault only works for direct mapping
+ * since the gfn is not stable for indirect shadow page. See
+ * Documentation/virtual/kvm/locking.txt to get more detail.
+ */
+ fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
+ iterator.sptep, spte,
+ new_spte);
+ if (fault_handled)
+ break;
+
+ if (++retry_count > 4) {
+ printk_once(KERN_WARNING
+ "kvm: Fast #PF retrying more than 4 times.\n");
+ break;
+ }
+
+ } while (true);
- /*
- * Currently, fast page fault only works for direct mapping since
- * the gfn is not stable for indirect shadow page.
- * See Documentation/virtual/kvm/locking.txt to get more detail.
- */
- ret = fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte);
-exit:
trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
- spte, ret);
+ spte, fault_handled);
walk_shadow_page_lockless_end(vcpu);
- return ret;
+ return fault_handled;
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
@@ -5063,6 +5264,8 @@ static void mmu_destroy_caches(void)
int kvm_mmu_module_init(void)
{
+ kvm_mmu_clear_all_pte_masks();
+
pte_list_desc_cache = kmem_cache_create("pte_list_desc",
sizeof(struct pte_list_desc),
0, 0, NULL);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 08a4d3ab3455..d1efe2c62b3f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -971,8 +971,8 @@ static void svm_disable_lbrv(struct vcpu_svm *svm)
* a particular vCPU.
*/
#define SVM_VM_DATA_HASH_BITS 8
-DECLARE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
-static spinlock_t svm_vm_data_hash_lock;
+static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS);
+static DEFINE_SPINLOCK(svm_vm_data_hash_lock);
/* Note:
* This function is called from IOMMU driver to notify
@@ -1077,8 +1077,6 @@ static __init int svm_hardware_setup(void)
} else {
pr_info("AVIC enabled\n");
- hash_init(svm_vm_data_hash);
- spin_lock_init(&svm_vm_data_hash_lock);
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
}
}
@@ -1159,7 +1157,6 @@ static void init_vmcb(struct vcpu_svm *svm)
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
- svm->vcpu.fpu_active = 1;
svm->vcpu.arch.hflags = 0;
set_cr_intercept(svm, INTERCEPT_CR0_READ);
@@ -1901,15 +1898,12 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
ulong gcr0 = svm->vcpu.arch.cr0;
u64 *hcr0 = &svm->vmcb->save.cr0;
- if (!svm->vcpu.fpu_active)
- *hcr0 |= SVM_CR0_SELECTIVE_MASK;
- else
- *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
- | (gcr0 & SVM_CR0_SELECTIVE_MASK);
+ *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
+ | (gcr0 & SVM_CR0_SELECTIVE_MASK);
mark_dirty(svm->vmcb, VMCB_CR);
- if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
+ if (gcr0 == *hcr0) {
clr_cr_intercept(svm, INTERCEPT_CR0_READ);
clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
} else {
@@ -1940,8 +1934,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (!npt_enabled)
cr0 |= X86_CR0_PG | X86_CR0_WP;
- if (!vcpu->fpu_active)
- cr0 |= X86_CR0_TS;
/*
* re-enable caching here because the QEMU bios
* does not do it - this results in some delay at
@@ -2160,22 +2152,6 @@ static int ac_interception(struct vcpu_svm *svm)
return 1;
}
-static void svm_fpu_activate(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- clr_exception_intercept(svm, NM_VECTOR);
-
- svm->vcpu.fpu_active = 1;
- update_cr0_intercept(svm);
-}
-
-static int nm_interception(struct vcpu_svm *svm)
-{
- svm_fpu_activate(&svm->vcpu);
- return 1;
-}
-
static bool is_erratum_383(void)
{
int err, i;
@@ -2573,9 +2549,6 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
if (!npt_enabled && svm->apf_reason == 0)
return NESTED_EXIT_HOST;
break;
- case SVM_EXIT_EXCP_BASE + NM_VECTOR:
- nm_interception(svm);
- break;
default:
break;
}
@@ -4020,7 +3993,6 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
[SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
[SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
- [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
[SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
[SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception,
[SVM_EXIT_INTR] = intr_interception,
@@ -4182,6 +4154,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
+ vcpu->arch.gpa_available = (exit_code == SVM_EXIT_NPF);
+
if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
vcpu->arch.cr0 = svm->vmcb->save.cr0;
if (npt_enabled)
@@ -4357,11 +4331,6 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
return;
}
-static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
-{
- return;
-}
-
static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
{
kvm_lapic_set_irr(vec, vcpu->arch.apic);
@@ -5077,14 +5046,6 @@ static bool svm_has_wbinvd_exit(void)
return true;
}
-static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- set_exception_intercept(svm, NM_VECTOR);
- update_cr0_intercept(svm);
-}
-
#define PRE_EX(exit) { .exit_code = (exit), \
.stage = X86_ICPT_PRE_EXCEPT, }
#define POST_EX(exit) { .exit_code = (exit), \
@@ -5345,9 +5306,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.get_pkru = svm_get_pkru,
- .fpu_activate = svm_fpu_activate,
- .fpu_deactivate = svm_fpu_deactivate,
-
.tlb_flush = svm_flush_tlb,
.run = svm_vcpu_run,
@@ -5371,7 +5329,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.get_enable_apicv = svm_get_enable_apicv,
.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = svm_load_eoi_exitmap,
- .sync_pir_to_irr = svm_sync_pir_to_irr,
.hwapic_irr_update = svm_hwapic_irr_update,
.hwapic_isr_update = svm_hwapic_isr_update,
.apicv_post_state_restore = avic_post_state_restore,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a236decb81e4..ef4ba71dbb66 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1856,7 +1856,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
u32 eb;
eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
- (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR);
+ (1u << DB_VECTOR) | (1u << AC_VECTOR);
if ((vcpu->guest_debug &
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
(KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
@@ -1865,8 +1865,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
eb = ~0;
if (enable_ept)
eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
- if (vcpu->fpu_active)
- eb &= ~(1u << NM_VECTOR);
/* When we are running a nested L2 guest and L1 specified for it a
* certain exception bitmap, we must trap the same exceptions and pass
@@ -1992,19 +1990,6 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
m->host[i].value = host_val;
}
-static void reload_tss(void)
-{
- /*
- * VT restores TR but not its size. Useless.
- */
- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
- struct desc_struct *descs;
-
- descs = (void *)gdt->address;
- descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
- load_TR_desc();
-}
-
static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
{
u64 guest_efer = vmx->vcpu.arch.efer;
@@ -2059,41 +2044,36 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
}
}
+#ifdef CONFIG_X86_32
+/*
+ * On 32-bit kernels, VM exits still load the FS and GS bases from the
+ * VMCS rather than the segment table. KVM uses this helper to figure
+ * out the current bases to poke them into the VMCS before entry.
+ */
static unsigned long segment_base(u16 selector)
{
struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
struct desc_struct *d;
- unsigned long table_base;
+ struct desc_struct *table;
unsigned long v;
- if (!(selector & ~3))
+ if (!(selector & ~SEGMENT_RPL_MASK))
return 0;
- table_base = gdt->address;
+ table = (struct desc_struct *)gdt->address;
- if (selector & 4) { /* from ldt */
+ if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
u16 ldt_selector = kvm_read_ldt();
- if (!(ldt_selector & ~3))
+ if (!(ldt_selector & ~SEGMENT_RPL_MASK))
return 0;
- table_base = segment_base(ldt_selector);
+ table = (struct desc_struct *)segment_base(ldt_selector);
}
- d = (struct desc_struct *)(table_base + (selector & ~7));
- v = get_desc_base(d);
-#ifdef CONFIG_X86_64
- if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
- v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32;
-#endif
+ v = get_desc_base(&table[selector >> 3]);
return v;
}
-
-static inline unsigned long kvm_read_tr_base(void)
-{
- u16 tr;
- asm("str %0" : "=g"(tr));
- return segment_base(tr);
-}
+#endif
static void vmx_save_host_state(struct kvm_vcpu *vcpu)
{
@@ -2179,7 +2159,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
loadsegment(es, vmx->host_state.es_sel);
}
#endif
- reload_tss();
+ invalidate_tss_limit();
#ifdef CONFIG_X86_64
wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
#endif
@@ -2294,10 +2274,19 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
/*
* Linux uses per-cpu TSS and GDT, so set these when switching
- * processors.
+ * processors. See 22.2.4.
*/
- vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
- vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */
+ vmcs_writel(HOST_TR_BASE,
+ (unsigned long)this_cpu_ptr(&cpu_tss));
+ vmcs_writel(HOST_GDTR_BASE, gdt->address);
+
+ /*
+ * VM exits change the host TR limit to 0x67 after a VM
+ * exit. This is okay, since 0x67 covers everything except
+ * the IO bitmap and have have code to handle the IO bitmap
+ * being lost after a VM exit.
+ */
+ BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
@@ -2340,25 +2329,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
}
}
-static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
-{
- ulong cr0;
-
- if (vcpu->fpu_active)
- return;
- vcpu->fpu_active = 1;
- cr0 = vmcs_readl(GUEST_CR0);
- cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
- cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
- vmcs_writel(GUEST_CR0, cr0);
- update_exception_bitmap(vcpu);
- vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
- if (is_guest_mode(vcpu))
- vcpu->arch.cr0_guest_owned_bits &=
- ~get_vmcs12(vcpu)->cr0_guest_host_mask;
- vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
-}
-
static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
/*
@@ -2377,33 +2347,6 @@ static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
(fields->cr4_read_shadow & fields->cr4_guest_host_mask);
}
-static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
-{
- /* Note that there is no vcpu->fpu_active = 0 here. The caller must
- * set this *before* calling this function.
- */
- vmx_decache_cr0_guest_bits(vcpu);
- vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
- update_exception_bitmap(vcpu);
- vcpu->arch.cr0_guest_owned_bits = 0;
- vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
- if (is_guest_mode(vcpu)) {
- /*
- * L1's specified read shadow might not contain the TS bit,
- * so now that we turned on shadowing of this bit, we need to
- * set this bit of the shadow. Like in nested_vmx_run we need
- * nested_read_cr0(vmcs12), but vmcs12->guest_cr0 is not yet
- * up-to-date here because we just decached cr0.TS (and we'll
- * only update vmcs12->guest_cr0 on nested exit).
- */
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- vmcs12->guest_cr0 = (vmcs12->guest_cr0 & ~X86_CR0_TS) |
- (vcpu->arch.cr0 & X86_CR0_TS);
- vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));
- } else
- vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
-}
-
static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
unsigned long rflags, save_rflags;
@@ -3962,7 +3905,7 @@ static void fix_rmode_seg(int seg, struct kvm_segment *save)
}
vmcs_write16(sf->selector, var.selector);
- vmcs_write32(sf->base, var.base);
+ vmcs_writel(sf->base, var.base);
vmcs_write32(sf->limit, var.limit);
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(&var));
}
@@ -4232,9 +4175,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
if (enable_ept)
ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
- if (!vcpu->fpu_active)
- hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
-
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0, hw_cr0);
vcpu->arch.cr0 = cr0;
@@ -4953,7 +4893,7 @@ static bool vmx_get_enable_apicv(void)
return enable_apicv;
}
-static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
+static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
@@ -4964,19 +4904,15 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
vmx->nested.pi_pending) {
vmx->nested.pi_pending = false;
if (!pi_test_and_clear_on(vmx->nested.pi_desc))
- return 0;
+ return;
max_irr = find_last_bit(
(unsigned long *)vmx->nested.pi_desc->pir, 256);
if (max_irr == 256)
- return 0;
+ return;
vapic_page = kmap(vmx->nested.virtual_apic_page);
- if (!vapic_page) {
- WARN_ON(1);
- return -ENOMEM;
- }
__kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
kunmap(vmx->nested.virtual_apic_page);
@@ -4987,7 +4923,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
vmcs_write16(GUEST_INTR_STATUS, status);
}
}
- return 0;
}
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
@@ -5056,26 +4991,12 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
if (pi_test_and_set_pir(vector, &vmx->pi_desc))
return;
- r = pi_test_and_set_on(&vmx->pi_desc);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu))
- kvm_vcpu_kick(vcpu);
-}
-
-static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (!pi_test_on(&vmx->pi_desc))
+ /* If a previous notification has sent the IPI, nothing to do. */
+ if (pi_test_and_set_on(&vmx->pi_desc))
return;
- pi_clear_on(&vmx->pi_desc);
- /*
- * IOMMU can write to PIR.ON, so the barrier matters even on UP.
- * But on x86 this is just a compiler barrier anyway.
- */
- smp_mb__after_atomic();
- kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
+ if (!kvm_vcpu_trigger_posted_interrupt(vcpu))
+ kvm_vcpu_kick(vcpu);
}
/*
@@ -5236,10 +5157,8 @@ static void ept_set_mmio_spte_mask(void)
/*
* EPT Misconfigurations can be generated if the value of bits 2:0
* of an EPT paging-structure entry is 110b (write/execute).
- * Also, magic bits (0x3ull << 62) is set to quickly identify mmio
- * spte.
*/
- kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
+ kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE);
}
#define VMX_XSS_EXIT_BITMAP 0
@@ -5342,7 +5261,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
/* 22.2.1, 20.8.1 */
vm_entry_controls_init(vmx, vmcs_config.vmentry_ctrl);
- vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
+ vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
+
set_cr4_guest_host_mask(vmx);
if (vmx_xsaves_supported())
@@ -5446,7 +5367,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vmx_set_cr0(vcpu, cr0); /* enter rmode */
vmx_set_cr4(vcpu, 0);
vmx_set_efer(vcpu, 0);
- vmx_fpu_activate(vcpu);
+
update_exception_bitmap(vcpu);
vpid_sync_context(vmx->vpid);
@@ -5480,26 +5401,20 @@ static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
- u32 cpu_based_vm_exec_control;
-
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_VIRTUAL_INTR_PENDING);
}
static void enable_nmi_window(struct kvm_vcpu *vcpu)
{
- u32 cpu_based_vm_exec_control;
-
if (!cpu_has_virtual_nmis() ||
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
enable_irq_window(vcpu);
return;
}
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ vmcs_set_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_VIRTUAL_NMI_PENDING);
}
static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -5725,11 +5640,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
if (is_nmi(intr_info))
return 1; /* already handled by vmx_vcpu_run() */
- if (is_no_device(intr_info)) {
- vmx_fpu_activate(vcpu);
- return 1;
- }
-
if (is_invalid_opcode(intr_info)) {
if (is_guest_mode(vcpu)) {
kvm_queue_exception(vcpu, UD_VECTOR);
@@ -5919,22 +5829,6 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val)
return kvm_set_cr4(vcpu, val);
}
-/* called to set cr0 as appropriate for clts instruction exit. */
-static void handle_clts(struct kvm_vcpu *vcpu)
-{
- if (is_guest_mode(vcpu)) {
- /*
- * We get here when L2 did CLTS, and L1 didn't shadow CR0.TS
- * but we did (!fpu_active). We need to keep GUEST_CR0.TS on,
- * just pretend it's off (also in arch.cr0 for fpu_activate).
- */
- vmcs_writel(CR0_READ_SHADOW,
- vmcs_readl(CR0_READ_SHADOW) & ~X86_CR0_TS);
- vcpu->arch.cr0 &= ~X86_CR0_TS;
- } else
- vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
-}
-
static int handle_cr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification, val;
@@ -5980,9 +5874,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
}
break;
case 2: /* clts */
- handle_clts(vcpu);
+ WARN_ONCE(1, "Guest should always own CR0.TS");
+ vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
- vmx_fpu_activate(vcpu);
return kvm_skip_emulated_instruction(vcpu);
case 1: /*mov from cr*/
switch (cr) {
@@ -6152,18 +6046,14 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_apic_update_ppr(vcpu);
return 1;
}
static int handle_interrupt_window(struct kvm_vcpu *vcpu)
{
- u32 cpu_based_vm_exec_control;
-
- /* clear pending irq */
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_VIRTUAL_INTR_PENDING);
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -6374,15 +6264,22 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
- /* it is a read fault? */
- error_code = (exit_qualification << 2) & PFERR_USER_MASK;
- /* it is a write fault? */
- error_code |= exit_qualification & PFERR_WRITE_MASK;
- /* It is a fetch fault? */
- error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
- /* ept page table is present? */
- error_code |= (exit_qualification & 0x38) != 0;
-
+ /* Is it a read fault? */
+ error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
+ ? PFERR_USER_MASK : 0;
+ /* Is it a write fault? */
+ error_code |= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
+ ? PFERR_WRITE_MASK : 0;
+ /* Is it a fetch fault? */
+ error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
+ ? PFERR_FETCH_MASK : 0;
+ /* ept page table entry is present? */
+ error_code |= (exit_qualification &
+ (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
+ EPT_VIOLATION_EXECUTABLE))
+ ? PFERR_PRESENT_MASK : 0;
+
+ vcpu->arch.gpa_available = true;
vcpu->arch.exit_qualification = exit_qualification;
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
@@ -6400,6 +6297,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
}
ret = handle_mmio_page_fault(vcpu, gpa, true);
+ vcpu->arch.gpa_available = true;
if (likely(ret == RET_MMIO_PF_EMULATE))
return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
EMULATE_DONE;
@@ -6421,12 +6319,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
- u32 cpu_based_vm_exec_control;
-
- /* clear pending NMI */
- cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
- cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
- vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_VIRTUAL_NMI_PENDING);
++vcpu->stat.nmi_window_exits;
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -6572,6 +6466,19 @@ static void wakeup_handler(void)
spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
}
+void vmx_enable_tdp(void)
+{
+ kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
+ enable_ept_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull,
+ enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
+ 0ull, VMX_EPT_EXECUTABLE_MASK,
+ cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
+ enable_ept_ad_bits ? 0ull : VMX_EPT_RWX_MASK);
+
+ ept_set_mmio_spte_mask();
+ kvm_enable_tdp();
+}
+
static __init int hardware_setup(void)
{
int r = -ENOMEM, i, msr;
@@ -6651,8 +6558,10 @@ static __init int hardware_setup(void)
if (!cpu_has_vmx_ple())
ple_gap = 0;
- if (!cpu_has_vmx_apicv())
+ if (!cpu_has_vmx_apicv()) {
enable_apicv = 0;
+ kvm_x86_ops->sync_pir_to_irr = NULL;
+ }
if (cpu_has_vmx_tsc_scaling()) {
kvm_has_tsc_control = true;
@@ -6697,16 +6606,9 @@ static __init int hardware_setup(void)
/* SELF-IPI */
vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
- if (enable_ept) {
- kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
- (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
- (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
- 0ull, VMX_EPT_EXECUTABLE_MASK,
- cpu_has_vmx_ept_execute_only() ?
- 0ull : VMX_EPT_READABLE_MASK);
- ept_set_mmio_spte_mask();
- kvm_enable_tdp();
- } else
+ if (enable_ept)
+ vmx_enable_tdp();
+ else
kvm_disable_tdp();
update_ple_window_actual_max();
@@ -7085,13 +6987,18 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
}
page = nested_get_page(vcpu, vmptr);
- if (page == NULL ||
- *(u32 *)kmap(page) != VMCS12_REVISION) {
+ if (page == NULL) {
nested_vmx_failInvalid(vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+ }
+ if (*(u32 *)kmap(page) != VMCS12_REVISION) {
kunmap(page);
+ nested_release_page_clean(page);
+ nested_vmx_failInvalid(vcpu);
return kvm_skip_emulated_instruction(vcpu);
}
kunmap(page);
+ nested_release_page_clean(page);
vmx->nested.vmxon_ptr = vmptr;
break;
case EXIT_REASON_VMCLEAR:
@@ -7129,6 +7036,53 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
return 0;
}
+static int enter_vmx_operation(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs *shadow_vmcs;
+
+ if (cpu_has_vmx_msr_bitmap()) {
+ vmx->nested.msr_bitmap =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx->nested.msr_bitmap)
+ goto out_msr_bitmap;
+ }
+
+ vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+ if (!vmx->nested.cached_vmcs12)
+ goto out_cached_vmcs12;
+
+ if (enable_shadow_vmcs) {
+ shadow_vmcs = alloc_vmcs();
+ if (!shadow_vmcs)
+ goto out_shadow_vmcs;
+ /* mark vmcs as shadow */
+ shadow_vmcs->revision_id |= (1u << 31);
+ /* init shadow vmcs */
+ vmcs_clear(shadow_vmcs);
+ vmx->vmcs01.shadow_vmcs = shadow_vmcs;
+ }
+
+ INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
+ vmx->nested.vmcs02_num = 0;
+
+ hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_REL_PINNED);
+ vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
+
+ vmx->nested.vmxon = true;
+ return 0;
+
+out_shadow_vmcs:
+ kfree(vmx->nested.cached_vmcs12);
+
+out_cached_vmcs12:
+ free_page((unsigned long)vmx->nested.msr_bitmap);
+
+out_msr_bitmap:
+ return -ENOMEM;
+}
+
/*
* Emulate the VMXON instruction.
* Currently, we just remember that VMX is active, and do not save or even
@@ -7139,9 +7093,9 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason,
*/
static int handle_vmon(struct kvm_vcpu *vcpu)
{
+ int ret;
struct kvm_segment cs;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct vmcs *shadow_vmcs;
const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
| FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
@@ -7168,9 +7122,6 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return 1;
}
- if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
- return 1;
-
if (vmx->nested.vmxon) {
nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
return kvm_skip_emulated_instruction(vcpu);
@@ -7182,48 +7133,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return 1;
}
- if (cpu_has_vmx_msr_bitmap()) {
- vmx->nested.msr_bitmap =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx->nested.msr_bitmap)
- goto out_msr_bitmap;
- }
-
- vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
- if (!vmx->nested.cached_vmcs12)
- goto out_cached_vmcs12;
-
- if (enable_shadow_vmcs) {
- shadow_vmcs = alloc_vmcs();
- if (!shadow_vmcs)
- goto out_shadow_vmcs;
- /* mark vmcs as shadow */
- shadow_vmcs->revision_id |= (1u << 31);
- /* init shadow vmcs */
- vmcs_clear(shadow_vmcs);
- vmx->vmcs01.shadow_vmcs = shadow_vmcs;
- }
-
- INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
- vmx->nested.vmcs02_num = 0;
-
- hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL_PINNED);
- vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
-
- vmx->nested.vmxon = true;
+ if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMON, NULL))
+ return 1;
+
+ ret = enter_vmx_operation(vcpu);
+ if (ret)
+ return ret;
nested_vmx_succeed(vcpu);
return kvm_skip_emulated_instruction(vcpu);
-
-out_shadow_vmcs:
- kfree(vmx->nested.cached_vmcs12);
-
-out_cached_vmcs12:
- free_page((unsigned long)vmx->nested.msr_bitmap);
-
-out_msr_bitmap:
- return -ENOMEM;
}
/*
@@ -7672,6 +7590,18 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return kvm_skip_emulated_instruction(vcpu);
}
+static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr)
+{
+ vmx->nested.current_vmptr = vmptr;
+ if (enable_shadow_vmcs) {
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_SHADOW_VMCS);
+ vmcs_write64(VMCS_LINK_POINTER,
+ __pa(vmx->vmcs01.shadow_vmcs));
+ vmx->nested.sync_shadow_vmcs = true;
+ }
+}
+
/* Emulate the VMPTRLD instruction */
static int handle_vmptrld(struct kvm_vcpu *vcpu)
{
@@ -7702,7 +7632,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
}
nested_release_vmcs12(vmx);
- vmx->nested.current_vmptr = vmptr;
vmx->nested.current_vmcs12 = new_vmcs12;
vmx->nested.current_vmcs12_page = page;
/*
@@ -7711,14 +7640,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
*/
memcpy(vmx->nested.cached_vmcs12,
vmx->nested.current_vmcs12, VMCS12_SIZE);
-
- if (enable_shadow_vmcs) {
- vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_SHADOW_VMCS);
- vmcs_write64(VMCS_LINK_POINTER,
- __pa(vmx->vmcs01.shadow_vmcs));
- vmx->nested.sync_shadow_vmcs = true;
- }
+ set_current_vmptr(vmx, vmptr);
}
nested_vmx_succeed(vcpu);
@@ -8191,8 +8113,6 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
case EXIT_REASON_TASK_SWITCH:
return true;
case EXIT_REASON_CPUID:
- if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa)
- return false;
return true;
case EXIT_REASON_HLT:
return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
@@ -8350,7 +8270,7 @@ static void kvm_flush_pml_buffers(struct kvm *kvm)
static void vmx_dump_sel(char *name, uint32_t sel)
{
pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
- name, vmcs_read32(sel),
+ name, vmcs_read16(sel),
vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
@@ -8514,6 +8434,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
u32 vectoring_info = vmx->idt_vectoring_info;
trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
+ vcpu->arch.gpa_available = false;
/*
* Flush logged GPAs PML buffer, this will make dirty_bitmap more
@@ -8732,6 +8653,27 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
}
}
+static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ int max_irr;
+
+ WARN_ON(!vcpu->arch.apicv_active);
+ if (pi_test_on(&vmx->pi_desc)) {
+ pi_clear_on(&vmx->pi_desc);
+ /*
+ * IOMMU can write to PIR.ON, so the barrier matters even on UP.
+ * But on x86 this is just a compiler barrier anyway.
+ */
+ smp_mb__after_atomic();
+ max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
+ } else {
+ max_irr = kvm_lapic_find_highest_irr(vcpu);
+ }
+ vmx_hwapic_irr_update(vcpu, max_irr);
+ return max_irr;
+}
+
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
if (!kvm_vcpu_apicv_active(vcpu))
@@ -8743,6 +8685,14 @@ static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
vmcs_write64(EOI_EXIT_BITMAP3, eoi_exit_bitmap[3]);
}
+static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ pi_clear_on(&vmx->pi_desc);
+ memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
+}
+
static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
@@ -9588,17 +9538,16 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
kvm_inject_page_fault(vcpu, fault);
}
-static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
+static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12);
+
+static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- int maxphyaddr = cpuid_maxphyaddr(vcpu);
+ u64 hpa;
if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
- if (!PAGE_ALIGNED(vmcs12->apic_access_addr) ||
- vmcs12->apic_access_addr >> maxphyaddr)
- return false;
-
/*
* Translate L1 physical address to host physical
* address for vmcs02. Keep the page pinned, so this
@@ -9609,59 +9558,80 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
nested_release_page(vmx->nested.apic_access_page);
vmx->nested.apic_access_page =
nested_get_page(vcpu, vmcs12->apic_access_addr);
+ /*
+ * If translation failed, no matter: This feature asks
+ * to exit when accessing the given address, and if it
+ * can never be accessed, this feature won't do
+ * anything anyway.
+ */
+ if (vmx->nested.apic_access_page) {
+ hpa = page_to_phys(vmx->nested.apic_access_page);
+ vmcs_write64(APIC_ACCESS_ADDR, hpa);
+ } else {
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ }
+ } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
+ cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
+ vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ kvm_vcpu_reload_apic_access_page(vcpu);
}
if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
- if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) ||
- vmcs12->virtual_apic_page_addr >> maxphyaddr)
- return false;
-
if (vmx->nested.virtual_apic_page) /* shouldn't happen */
nested_release_page(vmx->nested.virtual_apic_page);
vmx->nested.virtual_apic_page =
nested_get_page(vcpu, vmcs12->virtual_apic_page_addr);
/*
- * Failing the vm entry is _not_ what the processor does
- * but it's basically the only possibility we have.
- * We could still enter the guest if CR8 load exits are
- * enabled, CR8 store exits are enabled, and virtualize APIC
- * access is disabled; in this case the processor would never
- * use the TPR shadow and we could simply clear the bit from
- * the execution control. But such a configuration is useless,
- * so let's keep the code simple.
+ * If translation failed, VM entry will fail because
+ * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull.
+ * Failing the vm entry is _not_ what the processor
+ * does but it's basically the only possibility we
+ * have. We could still enter the guest if CR8 load
+ * exits are enabled, CR8 store exits are enabled, and
+ * virtualize APIC access is disabled; in this case
+ * the processor would never use the TPR shadow and we
+ * could simply clear the bit from the execution
+ * control. But such a configuration is useless, so
+ * let's keep the code simple.
*/
- if (!vmx->nested.virtual_apic_page)
- return false;
+ if (vmx->nested.virtual_apic_page) {
+ hpa = page_to_phys(vmx->nested.virtual_apic_page);
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa);
+ }
}
if (nested_cpu_has_posted_intr(vmcs12)) {
- if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) ||
- vmcs12->posted_intr_desc_addr >> maxphyaddr)
- return false;
-
if (vmx->nested.pi_desc_page) { /* shouldn't happen */
kunmap(vmx->nested.pi_desc_page);
nested_release_page(vmx->nested.pi_desc_page);
}
vmx->nested.pi_desc_page =
nested_get_page(vcpu, vmcs12->posted_intr_desc_addr);
- if (!vmx->nested.pi_desc_page)
- return false;
-
vmx->nested.pi_desc =
(struct pi_desc *)kmap(vmx->nested.pi_desc_page);
if (!vmx->nested.pi_desc) {
nested_release_page_clean(vmx->nested.pi_desc_page);
- return false;
+ return;
}
vmx->nested.pi_desc =
(struct pi_desc *)((void *)vmx->nested.pi_desc +
(unsigned long)(vmcs12->posted_intr_desc_addr &
(PAGE_SIZE - 1)));
+ vmcs_write64(POSTED_INTR_DESC_ADDR,
+ page_to_phys(vmx->nested.pi_desc_page) +
+ (unsigned long)(vmcs12->posted_intr_desc_addr &
+ (PAGE_SIZE - 1)));
}
-
- return true;
+ if (cpu_has_vmx_msr_bitmap() &&
+ nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) &&
+ nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
+ ;
+ else
+ vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL,
+ CPU_BASED_USE_MSR_BITMAPS);
}
static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
@@ -9730,11 +9700,6 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
return false;
}
msr_bitmap_l1 = (unsigned long *)kmap(page);
- if (!msr_bitmap_l1) {
- nested_release_page_clean(page);
- WARN_ON(1);
- return false;
- }
memset(msr_bitmap_l0, 0xff, PAGE_SIZE);
@@ -9982,7 +9947,7 @@ static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
* is assigned to entry_failure_code on failure.
*/
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
- unsigned long *entry_failure_code)
+ u32 *entry_failure_code)
{
if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
if (!nested_cr3_valid(vcpu, cr3)) {
@@ -10022,7 +9987,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
* is assigned to entry_failure_code on failure.
*/
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
- unsigned long *entry_failure_code)
+ bool from_vmentry, u32 *entry_failure_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 exec_control;
@@ -10065,21 +10030,26 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
- if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) {
+ if (from_vmentry &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
} else {
kvm_set_dr(vcpu, 7, vcpu->arch.dr7);
vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl);
}
- vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
- vmcs12->vm_entry_intr_info_field);
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
- vmcs12->vm_entry_exception_error_code);
- vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
- vmcs12->vm_entry_instruction_len);
- vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
- vmcs12->guest_interruptibility_info);
+ if (from_vmentry) {
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ vmcs12->vm_entry_intr_info_field);
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vmcs12->vm_entry_exception_error_code);
+ vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+ vmcs12->vm_entry_instruction_len);
+ vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
+ vmcs12->guest_interruptibility_info);
+ } else {
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
+ }
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
vmx_set_rflags(vcpu, vmcs12->guest_rflags);
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
@@ -10108,12 +10078,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
vmx->nested.pi_pending = false;
vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR);
- vmcs_write64(POSTED_INTR_DESC_ADDR,
- page_to_phys(vmx->nested.pi_desc_page) +
- (unsigned long)(vmcs12->posted_intr_desc_addr &
- (PAGE_SIZE - 1)));
- } else
+ } else {
exec_control &= ~PIN_BASED_POSTED_INTR;
+ }
vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control);
@@ -10158,26 +10125,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;
- if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
- /*
- * If translation failed, no matter: This feature asks
- * to exit when accessing the given address, and if it
- * can never be accessed, this feature won't do
- * anything anyway.
- */
- if (!vmx->nested.apic_access_page)
- exec_control &=
- ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- else
- vmcs_write64(APIC_ACCESS_ADDR,
- page_to_phys(vmx->nested.apic_access_page));
- } else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
- cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
- exec_control |=
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- kvm_vcpu_reload_apic_access_page(vcpu);
- }
-
if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
vmcs_write64(EOI_EXIT_BITMAP0,
vmcs12->eoi_exit_bitmap0);
@@ -10192,6 +10139,15 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
}
nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0;
+
+ /*
+ * Write an illegal value to APIC_ACCESS_ADDR. Later,
+ * nested_get_vmcs12_pages will either fix it up or
+ * remove the VM execution control.
+ */
+ if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
+ vmcs_write64(APIC_ACCESS_ADDR, -1ull);
+
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}
@@ -10228,19 +10184,16 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
exec_control &= ~CPU_BASED_TPR_SHADOW;
exec_control |= vmcs12->cpu_based_vm_exec_control;
+ /*
+ * Write an illegal value to VIRTUAL_APIC_PAGE_ADDR. Later, if
+ * nested_get_vmcs12_pages can't fix it up, the illegal value
+ * will result in a VM entry failure.
+ */
if (exec_control & CPU_BASED_TPR_SHADOW) {
- vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
- page_to_phys(vmx->nested.virtual_apic_page));
+ vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, -1ull);
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
}
- if (cpu_has_vmx_msr_bitmap() &&
- exec_control & CPU_BASED_USE_MSR_BITMAPS &&
- nested_vmx_merge_msr_bitmap(vcpu, vmcs12))
- ; /* MSR_BITMAP will be set by following vmx_set_efer. */
- else
- exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
-
/*
* Merging of IO bitmap not currently supported.
* Rather, exit every time.
@@ -10272,16 +10225,18 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
~VM_ENTRY_IA32E_MODE) |
(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));
- if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) {
+ if (from_vmentry &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)) {
vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
vcpu->arch.pat = vmcs12->guest_ia32_pat;
- } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
+ } else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
-
+ }
set_cr4_guest_host_mask(vmx);
- if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
+ if (from_vmentry &&
+ vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)
vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs);
if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
@@ -10320,8 +10275,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
}
/*
- * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
- * TS bit (for lazy fpu) and bits which we consider mandatory enabled.
+ * This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
+ * bits which we consider mandatory enabled.
* The CR0_READ_SHADOW is what L2 should have expected to read given
* the specifications by L1; It's not enough to take
* vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
@@ -10333,7 +10288,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmx_set_cr4(vcpu, vmcs12->guest_cr4);
vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));
- if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
+ if (from_vmentry &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER))
vcpu->arch.efer = vmcs12->guest_ia32_efer;
else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
vcpu->arch.efer |= (EFER_LMA | EFER_LME);
@@ -10367,73 +10323,22 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
return 0;
}
-/*
- * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
- * for running an L2 nested guest.
- */
-static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
- struct vmcs12 *vmcs12;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- int cpu;
- struct loaded_vmcs *vmcs02;
- bool ia32e;
- u32 msr_entry_idx;
- unsigned long exit_qualification;
-
- if (!nested_vmx_check_permission(vcpu))
- return 1;
-
- if (!nested_vmx_check_vmcs12(vcpu))
- goto out;
-
- vmcs12 = get_vmcs12(vcpu);
-
- if (enable_shadow_vmcs)
- copy_shadow_to_vmcs12(vmx);
-
- /*
- * The nested entry process starts with enforcing various prerequisites
- * on vmcs12 as required by the Intel SDM, and act appropriately when
- * they fail: As the SDM explains, some conditions should cause the
- * instruction to fail, while others will cause the instruction to seem
- * to succeed, but return an EXIT_REASON_INVALID_STATE.
- * To speed up the normal (success) code path, we should avoid checking
- * for misconfigurations which will anyway be caught by the processor
- * when using the merged vmcs02.
- */
- if (vmcs12->launch_state == launch) {
- nested_vmx_failValid(vcpu,
- launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
- : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
- goto out;
- }
if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE &&
- vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) {
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- goto out;
- }
+ vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT)
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
- if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- goto out;
- }
+ if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
- if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) {
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- goto out;
- }
+ if (nested_vmx_check_apicv_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
- if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) {
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- goto out;
- }
-
- if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) {
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- goto out;
- }
+ if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12))
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
vmx->nested.nested_vmx_procbased_ctls_low,
@@ -10450,28 +10355,30 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
!vmx_control_verify(vmcs12->vm_entry_controls,
vmx->nested.nested_vmx_entry_ctls_low,
vmx->nested.nested_vmx_entry_ctls_high))
- {
- nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
- goto out;
- }
+ return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) ||
!nested_host_cr4_valid(vcpu, vmcs12->host_cr4) ||
- !nested_cr3_valid(vcpu, vmcs12->host_cr3)) {
- nested_vmx_failValid(vcpu,
- VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
- goto out;
- }
+ !nested_cr3_valid(vcpu, vmcs12->host_cr3))
+ return VMXERR_ENTRY_INVALID_HOST_STATE_FIELD;
+
+ return 0;
+}
+
+static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ u32 *exit_qual)
+{
+ bool ia32e;
+
+ *exit_qual = ENTRY_FAIL_DEFAULT;
if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) ||
- !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) {
- nested_vmx_entry_failure(vcpu, vmcs12,
- EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+ !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
return 1;
- }
- if (vmcs12->vmcs_link_pointer != -1ull) {
- nested_vmx_entry_failure(vcpu, vmcs12,
- EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR);
+
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) &&
+ vmcs12->vmcs_link_pointer != -1ull) {
+ *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
return 1;
}
@@ -10484,16 +10391,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* to bit 8 (LME) if bit 31 in the CR0 field (corresponding to
* CR0.PG) is 1.
*/
- if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) {
+ if (to_vmx(vcpu)->nested.nested_run_pending &&
+ (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)) {
ia32e = (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) != 0;
if (!kvm_valid_efer(vcpu, vmcs12->guest_ia32_efer) ||
ia32e != !!(vmcs12->guest_ia32_efer & EFER_LMA) ||
((vmcs12->guest_cr0 & X86_CR0_PG) &&
- ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) {
- nested_vmx_entry_failure(vcpu, vmcs12,
- EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+ ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME)))
return 1;
- }
}
/*
@@ -10507,28 +10412,26 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
VM_EXIT_HOST_ADDR_SPACE_SIZE) != 0;
if (!kvm_valid_efer(vcpu, vmcs12->host_ia32_efer) ||
ia32e != !!(vmcs12->host_ia32_efer & EFER_LMA) ||
- ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) {
- nested_vmx_entry_failure(vcpu, vmcs12,
- EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT);
+ ia32e != !!(vmcs12->host_ia32_efer & EFER_LME))
return 1;
- }
}
- /*
- * We're finally done with prerequisite checking, and can start with
- * the nested entry.
- */
+ return 0;
+}
+
+static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct loaded_vmcs *vmcs02;
+ int cpu;
+ u32 msr_entry_idx;
+ u32 exit_qual;
vmcs02 = nested_get_current_vmcs02(vmx);
if (!vmcs02)
return -ENOMEM;
- /*
- * After this point, the trap flag no longer triggers a singlestep trap
- * on the vm entry instructions. Don't call
- * kvm_skip_emulated_instruction.
- */
- skip_emulated_instruction(vcpu);
enter_guest_mode(vcpu);
if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
@@ -10543,14 +10446,16 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmx_segment_cache_clear(vmx);
- if (prepare_vmcs02(vcpu, vmcs12, &exit_qualification)) {
+ if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
leave_guest_mode(vcpu);
vmx_load_vmcs01(vcpu);
nested_vmx_entry_failure(vcpu, vmcs12,
- EXIT_REASON_INVALID_STATE, exit_qualification);
+ EXIT_REASON_INVALID_STATE, exit_qual);
return 1;
}
+ nested_get_vmcs12_pages(vcpu, vmcs12);
+
msr_entry_idx = nested_vmx_load_msr(vcpu,
vmcs12->vm_entry_msr_load_addr,
vmcs12->vm_entry_msr_load_count);
@@ -10564,17 +10469,90 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmcs12->launch_state = 1;
- if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
- return kvm_vcpu_halt(vcpu);
-
- vmx->nested.nested_run_pending = 1;
-
/*
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
* we are no longer running L1, and VMLAUNCH/VMRESUME has not yet
* returned as far as L1 is concerned. It will only return (and set
* the success flag) when L2 exits (see nested_vmx_vmexit()).
*/
+ return 0;
+}
+
+/*
+ * nested_vmx_run() handles a nested entry, i.e., a VMLAUNCH or VMRESUME on L1
+ * for running an L2 nested guest.
+ */
+static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
+{
+ struct vmcs12 *vmcs12;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 exit_qual;
+ int ret;
+
+ if (!nested_vmx_check_permission(vcpu))
+ return 1;
+
+ if (!nested_vmx_check_vmcs12(vcpu))
+ goto out;
+
+ vmcs12 = get_vmcs12(vcpu);
+
+ if (enable_shadow_vmcs)
+ copy_shadow_to_vmcs12(vmx);
+
+ /*
+ * The nested entry process starts with enforcing various prerequisites
+ * on vmcs12 as required by the Intel SDM, and act appropriately when
+ * they fail: As the SDM explains, some conditions should cause the
+ * instruction to fail, while others will cause the instruction to seem
+ * to succeed, but return an EXIT_REASON_INVALID_STATE.
+ * To speed up the normal (success) code path, we should avoid checking
+ * for misconfigurations which will anyway be caught by the processor
+ * when using the merged vmcs02.
+ */
+ if (vmcs12->launch_state == launch) {
+ nested_vmx_failValid(vcpu,
+ launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
+ : VMXERR_VMRESUME_NONLAUNCHED_VMCS);
+ goto out;
+ }
+
+ ret = check_vmentry_prereqs(vcpu, vmcs12);
+ if (ret) {
+ nested_vmx_failValid(vcpu, ret);
+ goto out;
+ }
+
+ /*
+ * After this point, the trap flag no longer triggers a singlestep trap
+ * on the vm entry instructions; don't call kvm_skip_emulated_instruction.
+ * This is not 100% correct; for performance reasons, we delegate most
+ * of the checks on host state to the processor. If those fail,
+ * the singlestep trap is missed.
+ */
+ skip_emulated_instruction(vcpu);
+
+ ret = check_vmentry_postreqs(vcpu, vmcs12, &exit_qual);
+ if (ret) {
+ nested_vmx_entry_failure(vcpu, vmcs12,
+ EXIT_REASON_INVALID_STATE, exit_qual);
+ return 1;
+ }
+
+ /*
+ * We're finally done with prerequisite checking, and can start with
+ * the nested entry.
+ */
+
+ ret = enter_vmx_non_root_mode(vcpu, true);
+ if (ret)
+ return ret;
+
+ if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
+ return kvm_vcpu_halt(vcpu);
+
+ vmx->nested.nested_run_pending = 1;
+
return 1;
out:
@@ -10696,7 +10674,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
return 0;
}
- return vmx_complete_nested_posted_interrupt(vcpu);
+ vmx_complete_nested_posted_interrupt(vcpu);
+ return 0;
}
static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
@@ -10714,21 +10693,13 @@ static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu)
}
/*
- * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
- * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
- * and this function updates it to reflect the changes to the guest state while
- * L2 was running (and perhaps made some exits which were handled directly by L0
- * without going back to L1), and to reflect the exit reason.
- * Note that we do not have to copy here all VMCS fields, just those that
- * could have changed by the L2 guest or the exit - i.e., the guest-state and
- * exit-information fields only. Other fields are modified by L1 with VMWRITE,
- * which already writes to vmcs12 directly.
+ * Update the guest state fields of vmcs12 to reflect changes that
+ * occurred while L2 was running. (The "IA-32e mode guest" bit of the
+ * VM-entry controls is also updated, since this is really a guest
+ * state bit.)
*/
-static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
- u32 exit_reason, u32 exit_intr_info,
- unsigned long exit_qualification)
+static void sync_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
- /* update guest state fields: */
vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12);
vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12);
@@ -10834,6 +10805,25 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
if (nested_cpu_has_xsaves(vmcs12))
vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
+}
+
+/*
+ * prepare_vmcs12 is part of what we need to do when the nested L2 guest exits
+ * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 (vmcs12),
+ * and this function updates it to reflect the changes to the guest state while
+ * L2 was running (and perhaps made some exits which were handled directly by L0
+ * without going back to L1), and to reflect the exit reason.
+ * Note that we do not have to copy here all VMCS fields, just those that
+ * could have changed by the L2 guest or the exit - i.e., the guest-state and
+ * exit-information fields only. Other fields are modified by L1 with VMWRITE,
+ * which already writes to vmcs12 directly.
+ */
+static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
+ u32 exit_reason, u32 exit_intr_info,
+ unsigned long exit_qualification)
+{
+ /* update guest state fields: */
+ sync_vmcs12(vcpu, vmcs12);
/* update exit information fields: */
@@ -10884,7 +10874,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
struct kvm_segment seg;
- unsigned long entry_failure_code;
+ u32 entry_failure_code;
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -10899,24 +10889,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
vmx_set_rflags(vcpu, X86_EFLAGS_FIXED);
/*
* Note that calling vmx_set_cr0 is important, even if cr0 hasn't
- * actually changed, because it depends on the current state of
- * fpu_active (which may have changed).
- * Note that vmx_set_cr0 refers to efer set above.
+ * actually changed, because vmx_set_cr0 refers to efer set above.
+ *
+ * CR0_GUEST_HOST_MASK is already set in the original vmcs01
+ * (KVM doesn't change it);
*/
+ vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
vmx_set_cr0(vcpu, vmcs12->host_cr0);
- /*
- * If we did fpu_activate()/fpu_deactivate() during L2's run, we need
- * to apply the same changes to L1's vmcs. We just set cr0 correctly,
- * but we also need to update cr0_guest_host_mask and exception_bitmap.
- */
- update_exception_bitmap(vcpu);
- vcpu->arch.cr0_guest_owned_bits = (vcpu->fpu_active ? X86_CR0_TS : 0);
- vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
- /*
- * Note that CR4_GUEST_HOST_MASK is already set in the original vmcs01
- * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask();
- */
+ /* Same as above - no reason to call set_cr4_guest_host_mask(). */
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
kvm_set_cr4(vcpu, vmcs12->host_cr4);
@@ -11545,9 +11526,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.get_pkru = vmx_get_pkru,
- .fpu_activate = vmx_fpu_activate,
- .fpu_deactivate = vmx_fpu_deactivate,
-
.tlb_flush = vmx_flush_tlb,
.run = vmx_vcpu_run,
@@ -11572,6 +11550,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.get_enable_apicv = vmx_get_enable_apicv,
.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
.load_eoi_exitmap = vmx_load_eoi_exitmap,
+ .apicv_post_state_restore = vmx_apicv_post_state_restore,
.hwapic_irr_update = vmx_hwapic_irr_update,
.hwapic_isr_update = vmx_hwapic_isr_update,
.sync_pir_to_irr = vmx_sync_pir_to_irr,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e52c9088660f..b2a4b11274b0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -180,6 +180,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
{ "irq_injections", VCPU_STAT(irq_injections) },
{ "nmi_injections", VCPU_STAT(nmi_injections) },
+ { "req_event", VCPU_STAT(req_event) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -190,6 +191,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ "largepages", VM_STAT(lpages) },
+ { "max_mmu_page_hash_collisions",
+ VM_STAT(max_mmu_page_hash_collisions) },
{ NULL }
};
@@ -1139,6 +1142,7 @@ struct pvclock_gtod_data {
u64 boot_ns;
u64 nsec_base;
+ u64 wall_time_sec;
};
static struct pvclock_gtod_data pvclock_gtod_data;
@@ -1162,6 +1166,8 @@ static void update_pvclock_gtod(struct timekeeper *tk)
vdata->boot_ns = boot_ns;
vdata->nsec_base = tk->tkr_mono.xtime_nsec;
+ vdata->wall_time_sec = tk->xtime_sec;
+
write_seqcount_end(&vdata->seq);
}
#endif
@@ -1623,6 +1629,28 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now)
return mode;
}
+static int do_realtime(struct timespec *ts, u64 *cycle_now)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ unsigned long seq;
+ int mode;
+ u64 ns;
+
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ mode = gtod->clock.vclock_mode;
+ ts->tv_sec = gtod->wall_time_sec;
+ ns = gtod->nsec_base;
+ ns += vgettsc(cycle_now);
+ ns >>= gtod->clock.shift;
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+
+ ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
+ ts->tv_nsec = ns;
+
+ return mode;
+}
+
/* returns true if host is using tsc clocksource */
static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
{
@@ -1632,6 +1660,17 @@ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now)
return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC;
}
+
+/* returns true if host is using tsc clocksource */
+static bool kvm_get_walltime_and_clockread(struct timespec *ts,
+ u64 *cycle_now)
+{
+ /* checked again under seqlock below */
+ if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
+ return false;
+
+ return do_realtime(ts, cycle_now) == VCLOCK_TSC;
+}
#endif
/*
@@ -1772,7 +1811,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
struct kvm_vcpu_arch *vcpu = &v->arch;
struct pvclock_vcpu_time_info guest_hv_clock;
- if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+ if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time,
&guest_hv_clock, sizeof(guest_hv_clock))))
return;
@@ -1793,9 +1832,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
vcpu->hv_clock.version = guest_hv_clock.version + 1;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
+ kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
smp_wmb();
@@ -1809,16 +1848,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock));
+ kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock));
smp_wmb();
vcpu->hv_clock.version++;
- kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
- &vcpu->hv_clock,
- sizeof(vcpu->hv_clock.version));
+ kvm_vcpu_write_guest_cached(v, &vcpu->pv_time,
+ &vcpu->hv_clock,
+ sizeof(vcpu->hv_clock.version));
}
static int kvm_guest_time_update(struct kvm_vcpu *v)
@@ -2051,7 +2090,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
+ if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa,
sizeof(u32)))
return 1;
@@ -2070,7 +2109,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
return;
@@ -2081,7 +2120,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
vcpu->arch.st.steal.version += 1;
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
smp_wmb();
@@ -2090,14 +2129,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
smp_wmb();
vcpu->arch.st.steal.version += 1;
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime,
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
}
@@ -2202,7 +2241,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (!(data & 1))
break;
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ if (kvm_vcpu_gfn_to_hva_cache_init(vcpu,
&vcpu->arch.pv_time, data & ~1ULL,
sizeof(struct pvclock_vcpu_time_info)))
vcpu->arch.pv_time_enabled = false;
@@ -2223,7 +2262,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & KVM_STEAL_RESERVED_MASK)
return 1;
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
+ if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime,
data & KVM_STEAL_VALID_BITS,
sizeof(struct kvm_steal_time)))
return 1;
@@ -2633,6 +2672,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_DISABLE_QUIRKS:
case KVM_CAP_SET_BOOT_CPU_ID:
case KVM_CAP_SPLIT_IRQCHIP:
+ case KVM_CAP_IMMEDIATE_EXIT:
#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
case KVM_CAP_ASSIGN_DEV_IRQ:
case KVM_CAP_PCI_2_3:
@@ -2836,7 +2876,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
vcpu->arch.st.steal.preempted = 1;
- kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
+ kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime,
&vcpu->arch.st.steal.preempted,
offsetof(struct kvm_steal_time, preempted),
sizeof(vcpu->arch.st.steal.preempted));
@@ -2870,7 +2910,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- if (vcpu->arch.apicv_active)
+ if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
kvm_x86_ops->sync_pir_to_irr(vcpu);
return kvm_apic_get_state(vcpu, s);
@@ -3897,7 +3937,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
goto split_irqchip_unlock;
/* Pairs with irqchip_in_kernel. */
smp_wmb();
- kvm->arch.irqchip_split = true;
+ kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
r = 0;
split_irqchip_unlock:
@@ -3960,40 +4000,41 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
break;
case KVM_CREATE_IRQCHIP: {
- struct kvm_pic *vpic;
-
mutex_lock(&kvm->lock);
+
r = -EEXIST;
- if (kvm->arch.vpic)
+ if (irqchip_in_kernel(kvm))
goto create_irqchip_unlock;
+
r = -EINVAL;
if (kvm->created_vcpus)
goto create_irqchip_unlock;
- r = -ENOMEM;
- vpic = kvm_create_pic(kvm);
- if (vpic) {
- r = kvm_ioapic_init(kvm);
- if (r) {
- mutex_lock(&kvm->slots_lock);
- kvm_destroy_pic(vpic);
- mutex_unlock(&kvm->slots_lock);
- goto create_irqchip_unlock;
- }
- } else
+
+ r = kvm_pic_init(kvm);
+ if (r)
+ goto create_irqchip_unlock;
+
+ r = kvm_ioapic_init(kvm);
+ if (r) {
+ mutex_lock(&kvm->slots_lock);
+ kvm_pic_destroy(kvm);
+ mutex_unlock(&kvm->slots_lock);
goto create_irqchip_unlock;
+ }
+
r = kvm_setup_default_irq_routing(kvm);
if (r) {
mutex_lock(&kvm->slots_lock);
mutex_lock(&kvm->irq_lock);
kvm_ioapic_destroy(kvm);
- kvm_destroy_pic(vpic);
+ kvm_pic_destroy(kvm);
mutex_unlock(&kvm->irq_lock);
mutex_unlock(&kvm->slots_lock);
goto create_irqchip_unlock;
}
- /* Write kvm->irq_routing before kvm->arch.vpic. */
+ /* Write kvm->irq_routing before enabling irqchip_in_kernel. */
smp_wmb();
- kvm->arch.vpic = vpic;
+ kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
create_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
@@ -4029,7 +4070,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
r = -ENXIO;
- if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
+ if (!irqchip_kernel(kvm))
goto get_irqchip_out;
r = kvm_vm_ioctl_get_irqchip(kvm, chip);
if (r)
@@ -4053,7 +4094,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
r = -ENXIO;
- if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
+ if (!irqchip_kernel(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
if (r)
@@ -4462,6 +4503,21 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
+static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+ gpa_t gpa, bool write)
+{
+ /* For APIC access vmexit */
+ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ return 1;
+
+ if (vcpu_match_mmio_gpa(vcpu, gpa)) {
+ trace_vcpu_match_mmio(gva, gpa, write, true);
+ return 1;
+ }
+
+ return 0;
+}
+
static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
gpa_t *gpa, struct x86_exception *exception,
bool write)
@@ -4488,16 +4544,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
if (*gpa == UNMAPPED_GVA)
return -1;
- /* For APIC access vmexit */
- if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
- return 1;
-
- if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
- trace_vcpu_match_mmio(gva, *gpa, write, true);
- return 1;
- }
-
- return 0;
+ return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
}
int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -4594,6 +4641,22 @@ static int emulator_read_write_onepage(unsigned long addr, void *val,
int handled, ret;
bool write = ops->write;
struct kvm_mmio_fragment *frag;
+ struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+
+ /*
+ * If the exit was due to a NPF we may already have a GPA.
+ * If the GPA is present, use it to avoid the GVA to GPA table walk.
+ * Note, this cannot be used on string operations since string
+ * operation using rep will only have the initial GPA from the NPF
+ * occurred.
+ */
+ if (vcpu->arch.gpa_available &&
+ emulator_can_use_gpa(ctxt) &&
+ vcpu_is_mmio_gpa(vcpu, addr, exception->address, write) &&
+ (addr & ~PAGE_MASK) == (exception->address & ~PAGE_MASK)) {
+ gpa = exception->address;
+ goto mmio;
+ }
ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
@@ -5610,6 +5673,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
}
restart:
+ /* Save the faulting GPA (cr2) in the address field */
+ ctxt->exception.address = cr2;
+
r = x86_emulate_insn(ctxt);
if (r == EMULATION_INTERCEPTED)
@@ -5924,9 +5990,6 @@ static void kvm_set_mmio_spte_mask(void)
/* Mask the reserved physical address bits. */
mask = rsvd_bits(maxphyaddr, 51);
- /* Bit 62 is always reserved for 32bit host. */
- mask |= 0x3ull << 62;
-
/* Set the present bit. */
mask |= 1ull;
@@ -6025,7 +6088,7 @@ int kvm_arch_init(void *opaque)
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0,
- PT_PRESENT_MASK);
+ PT_PRESENT_MASK, 0);
kvm_timer_init();
perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -6087,6 +6150,35 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_emulate_halt);
+#ifdef CONFIG_X86_64
+static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
+ unsigned long clock_type)
+{
+ struct kvm_clock_pairing clock_pairing;
+ struct timespec ts;
+ u64 cycle;
+ int ret;
+
+ if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
+ return -KVM_EOPNOTSUPP;
+
+ if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
+ return -KVM_EOPNOTSUPP;
+
+ clock_pairing.sec = ts.tv_sec;
+ clock_pairing.nsec = ts.tv_nsec;
+ clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
+ clock_pairing.flags = 0;
+
+ ret = 0;
+ if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
+ sizeof(struct kvm_clock_pairing)))
+ ret = -KVM_EFAULT;
+
+ return ret;
+}
+#endif
+
/*
* kvm_pv_kick_cpu_op: Kick a vcpu.
*
@@ -6151,6 +6243,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
ret = 0;
break;
+#ifdef CONFIG_X86_64
+ case KVM_HC_CLOCK_PAIRING:
+ ret = kvm_pv_clock_pairing(vcpu, a0, a1);
+ break;
+#endif
default:
ret = -KVM_ENOSYS;
break;
@@ -6564,7 +6661,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
- if (vcpu->arch.apicv_active)
+ if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
kvm_x86_ops->sync_pir_to_irr(vcpu);
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
@@ -6655,10 +6752,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
r = 0;
goto out;
}
- if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
- vcpu->fpu_active = 0;
- kvm_x86_ops->fpu_deactivate(vcpu);
- }
if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
/* Page is swapped out. Do synthetic halt */
vcpu->arch.apf.halted = true;
@@ -6718,21 +6811,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_hv_process_stimers(vcpu);
}
- /*
- * KVM_REQ_EVENT is not set when posted interrupts are set by
- * VT-d hardware, so we have to update RVI unconditionally.
- */
- if (kvm_lapic_enabled(vcpu)) {
- /*
- * Update architecture specific hints for APIC
- * virtual interrupt delivery.
- */
- if (vcpu->arch.apicv_active)
- kvm_x86_ops->hwapic_irr_update(vcpu,
- kvm_lapic_find_highest_irr(vcpu));
- }
-
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
+ ++vcpu->stat.req_event;
kvm_apic_accept_events(vcpu);
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
r = 1;
@@ -6773,22 +6853,40 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
preempt_disable();
kvm_x86_ops->prepare_guest_switch(vcpu);
- if (vcpu->fpu_active)
- kvm_load_guest_fpu(vcpu);
+ kvm_load_guest_fpu(vcpu);
+
+ /*
+ * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
+ * IPI are then delayed after guest entry, which ensures that they
+ * result in virtual interrupt delivery.
+ */
+ local_irq_disable();
vcpu->mode = IN_GUEST_MODE;
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
/*
- * We should set ->mode before check ->requests,
- * Please see the comment in kvm_make_all_cpus_request.
- * This also orders the write to mode from any reads
- * to the page tables done while the VCPU is running.
- * Please see the comment in kvm_flush_remote_tlbs.
+ * 1) We should set ->mode before checking ->requests. Please see
+ * the comment in kvm_make_all_cpus_request.
+ *
+ * 2) For APICv, we should set ->mode before checking PIR.ON. This
+ * pairs with the memory barrier implicit in pi_test_and_set_on
+ * (see vmx_deliver_posted_interrupt).
+ *
+ * 3) This also orders the write to mode from any reads to the page
+ * tables done while the VCPU is running. Please see the comment
+ * in kvm_flush_remote_tlbs.
*/
smp_mb__after_srcu_read_unlock();
- local_irq_disable();
+ /*
+ * This handles the case where a posted interrupt was
+ * notified with kvm_vcpu_kick.
+ */
+ if (kvm_lapic_enabled(vcpu)) {
+ if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active)
+ kvm_x86_ops->sync_pir_to_irr(vcpu);
+ }
if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
|| need_resched() || signal_pending(current)) {
@@ -6927,6 +7025,9 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
+ if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
+ kvm_x86_ops->check_nested_events(vcpu, false);
+
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
}
@@ -7098,7 +7199,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
} else
WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
- r = vcpu_run(vcpu);
+ if (kvm_run->immediate_exit)
+ r = -EINTR;
+ else
+ r = vcpu_run(vcpu);
out:
post_kvm_run_save(vcpu);
@@ -8293,9 +8397,6 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
- kvm_x86_ops->check_nested_events(vcpu, false);
-
return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
}
@@ -8432,9 +8533,8 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
{
-
- return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
- sizeof(val));
+ return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val,
+ sizeof(val));
}
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,