From 1ad9f23873a4ee837643be5a29c05e405ec54e18 Mon Sep 17 00:00:00 2001 From: "pingfank@linux.vnet.ibm.com" Date: Tue, 15 Apr 2014 16:33:40 +0800 Subject: KVM: PPC: Book3S: HV: make _PAGE_NUMA take effect Numa fault is a method which help to achieve auto numa balancing. When such a page fault takes place, the page fault handler will check whether the page is placed correctly. If not, migration should be involved to cut down the distance between the cpu and pages. A pte with _PAGE_NUMA help to implement numa fault. It means not to allow the MMU to access the page directly. So a page fault is triggered and numa fault handler gets the opportunity to run checker. As for the access of MMU, we need special handling for the powernv's guest. When we mark a pte with _PAGE_NUMA, we already call mmu_notifier to invalidate it in guest's htab, but when we tried to re-insert them, we firstly try to map it in real-mode. Only after this fails, we fallback to virt mode, and most of important, we run numa fault handler in virt mode. This patch guards the way of real-mode to ensure that if a pte is marked with _PAGE_NUMA, it will NOT be mapped in real mode, instead, it will be mapped in virt mode and have the opportunity to be checked with placement. Signed-off-by: Liu Ping Fan Reviewed-by: Aneesh Kumar K.V Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 1d6c56ad5b60..8fcc36306a02 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -234,7 +234,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, pte_size = psize; pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size); - if (pte_present(pte)) { + if (pte_present(pte) && !pte_numa(pte)) { if (writing && !pte_write(pte)) /* make the actual HPTE be read-only */ ptel = hpte_make_readonly(ptel); -- cgit v1.2.3 From 0a8eccefcb3423b45dc0a5a527b4799520e2bc94 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 14 Apr 2014 08:56:26 +1000 Subject: KVM: PPC: Book3S HV: Add missing code for transaction reclaim on guest exit Testing by Michael Neuling revealed that commit e4e38121507a ("KVM: PPC: Book3S HV: Add transactional memory support") is missing the code that saves away the checkpointed state of the guest when switching to the host. This adds that code, which was in earlier versions of the patch but went missing somehow. Reported-by: Michael Neuling Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 104 ++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index ffbb871c2bd8..0ff470170626 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1312,6 +1312,110 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) mr r3, r9 bl kvmppc_save_fp +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + b 2f +END_FTR_SECTION_IFCLR(CPU_FTR_TM) + /* Turn on TM. */ + mfmsr r8 + li r0, 1 + rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG + mtmsrd r8 + + ld r5, VCPU_MSR(r9) + rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 + beq 1f /* TM not active in guest. */ + + li r3, TM_CAUSE_KVM_RESCHED + + /* Clear the MSR RI since r1, r13 are all going to be foobar. */ + li r5, 0 + mtmsrd r5, 1 + + /* All GPRs are volatile at this point. */ + TRECLAIM(R3) + + /* Temporarily store r13 and r9 so we have some regs to play with */ + SET_SCRATCH0(r13) + GET_PACA(r13) + std r9, PACATMSCRATCH(r13) + ld r9, HSTATE_KVM_VCPU(r13) + + /* Get a few more GPRs free. */ + std r29, VCPU_GPRS_TM(29)(r9) + std r30, VCPU_GPRS_TM(30)(r9) + std r31, VCPU_GPRS_TM(31)(r9) + + /* Save away PPR and DSCR soon so don't run with user values. */ + mfspr r31, SPRN_PPR + HMT_MEDIUM + mfspr r30, SPRN_DSCR + ld r29, HSTATE_DSCR(r13) + mtspr SPRN_DSCR, r29 + + /* Save all but r9, r13 & r29-r31 */ + reg = 0 + .rept 29 + .if (reg != 9) && (reg != 13) + std reg, VCPU_GPRS_TM(reg)(r9) + .endif + reg = reg + 1 + .endr + /* ... now save r13 */ + GET_SCRATCH0(r4) + std r4, VCPU_GPRS_TM(13)(r9) + /* ... and save r9 */ + ld r4, PACATMSCRATCH(r13) + std r4, VCPU_GPRS_TM(9)(r9) + + /* Reload stack pointer and TOC. */ + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATOC(r13) + + /* Set MSR RI now we have r1 and r13 back. */ + li r5, MSR_RI + mtmsrd r5, 1 + + /* Save away checkpinted SPRs. */ + std r31, VCPU_PPR_TM(r9) + std r30, VCPU_DSCR_TM(r9) + mflr r5 + mfcr r6 + mfctr r7 + mfspr r8, SPRN_AMR + mfspr r10, SPRN_TAR + std r5, VCPU_LR_TM(r9) + stw r6, VCPU_CR_TM(r9) + std r7, VCPU_CTR_TM(r9) + std r8, VCPU_AMR_TM(r9) + std r10, VCPU_TAR_TM(r9) + + /* Restore r12 as trap number. */ + lwz r12, VCPU_TRAP(r9) + + /* Save FP/VSX. */ + addi r3, r9, VCPU_FPRS_TM + bl .store_fp_state + addi r3, r9, VCPU_VRS_TM + bl .store_vr_state + mfspr r6, SPRN_VRSAVE + stw r6, VCPU_VRSAVE_TM(r9) +1: + /* + * We need to save these SPRs after the treclaim so that the software + * error code is recorded correctly in the TEXASR. Also the user may + * change these outside of a transaction, so they must always be + * context switched. + */ + mfspr r5, SPRN_TFHAR + mfspr r6, SPRN_TFIAR + mfspr r7, SPRN_TEXASR + std r5, VCPU_TFHAR(r9) + std r6, VCPU_TFIAR(r9) + std r7, VCPU_TEXASR(r9) +2: +#endif + /* Increment yield count if they have a VPA */ ld r8, VCPU_VPA(r9) /* do they have a VPA? */ cmpdi r8, 0 -- cgit v1.2.3 From ab78475c76bd8c54375d8a778200c59314973d30 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sun, 6 Apr 2014 23:31:48 +0200 Subject: KVM: PPC: Book3S: ifdef on CONFIG_KVM_BOOK3S_32_HANDLER for 32bit The book3s_32 target can get built as module which means we don't see the config define for it in code. Instead, check on the bool define CONFIG_KVM_BOOK3S_32_HANDLER whenever we want to know whether we're building for a book3s_32 host. This fixes running book3s_32 kvm as a module for me. Signed-off-by: Alexander Graf Reviewed-by: Aneesh Kumar K.V --- arch/powerpc/kvm/book3s.c | 6 +++--- arch/powerpc/kvm/book3s_pr.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 94e597e6f15c..7af190a266b3 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -886,7 +886,7 @@ static int kvmppc_book3s_init(void) r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (r) return r; -#ifdef CONFIG_KVM_BOOK3S_32 +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER r = kvmppc_book3s_init_pr(); #endif return r; @@ -895,7 +895,7 @@ static int kvmppc_book3s_init(void) static void kvmppc_book3s_exit(void) { -#ifdef CONFIG_KVM_BOOK3S_32 +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER kvmppc_book3s_exit_pr(); #endif kvm_exit(); @@ -905,7 +905,7 @@ module_init(kvmppc_book3s_init); module_exit(kvmppc_book3s_exit); /* On 32bit this is our one and only kernel module */ -#ifdef CONFIG_KVM_BOOK3S_32 +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER MODULE_ALIAS_MISCDEV(KVM_MINOR); MODULE_ALIAS("devname:kvm"); #endif diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index c5c052a9729c..02f1defd8bb9 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1153,7 +1153,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, goto free_vcpu; vcpu->arch.book3s = vcpu_book3s; -#ifdef CONFIG_KVM_BOOK3S_32 +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER vcpu->arch.shadow_vcpu = kzalloc(sizeof(*vcpu->arch.shadow_vcpu), GFP_KERNEL); if (!vcpu->arch.shadow_vcpu) @@ -1198,7 +1198,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, uninit_vcpu: kvm_vcpu_uninit(vcpu); free_shadow_vcpu: -#ifdef CONFIG_KVM_BOOK3S_32 +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER kfree(vcpu->arch.shadow_vcpu); free_vcpu3s: #endif @@ -1215,7 +1215,7 @@ static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); kvm_vcpu_uninit(vcpu); -#ifdef CONFIG_KVM_BOOK3S_32 +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER kfree(vcpu->arch.shadow_vcpu); #endif vfree(vcpu_book3s); -- cgit v1.2.3 From b18db0b80867931f4e3a844400a3c22a4fd2ff57 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 29 Apr 2014 12:17:26 +0200 Subject: KVM guest: Make pv trampoline code executable Our PV guest patching code assembles chunks of instructions on the fly when it encounters more complicated instructions to hijack. These instructions need to live in a section that we don't mark as non-executable, as otherwise we fault when jumping there. Right now we put it into the .bss section where it automatically gets marked as non-executable. Add a check to the NX setting function to ensure that we leave these particular pages executable. Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/sections.h | 11 +++++++++++ arch/powerpc/kernel/kvm.c | 2 +- arch/powerpc/mm/hash_utils_64.c | 4 ++++ 3 files changed, 16 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index d0e784e0ff48..521790330672 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -39,6 +39,17 @@ static inline int overlaps_kernel_text(unsigned long start, unsigned long end) (unsigned long)_stext < end; } +static inline int overlaps_kvm_tmp(unsigned long start, unsigned long end) +{ +#ifdef CONFIG_KVM_GUEST + extern char kvm_tmp[]; + return start < (unsigned long)kvm_tmp && + (unsigned long)&kvm_tmp[1024 * 1024] < end; +#else + return 0; +#endif +} + #undef dereference_function_descriptor static inline void *dereference_function_descriptor(void *ptr) { diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c index 6a0175297b0d..dd8695f6cb6d 100644 --- a/arch/powerpc/kernel/kvm.c +++ b/arch/powerpc/kernel/kvm.c @@ -74,7 +74,7 @@ #define KVM_INST_MTSRIN 0x7c0001e4 static bool kvm_patching_worked = true; -static char kvm_tmp[1024 * 1024]; +char kvm_tmp[1024 * 1024]; static int kvm_tmp_index; static inline void kvm_patch_ins(u32 *inst, u32 new_inst) diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index d766d6ee33fe..06ba83b036d3 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -207,6 +207,10 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend, if (overlaps_kernel_text(vaddr, vaddr + step)) tprot &= ~HPTE_R_N; + /* Make kvm guest trampolines executable */ + if (overlaps_kvm_tmp(vaddr, vaddr + step)) + tprot &= ~HPTE_R_N; + /* * If relocatable, check if it overlaps interrupt vectors that * are copied down to real 0. For relocatable kernel -- cgit v1.2.3 From 696dfd95ba9838327a7013e5988ff3ba60dcc8c8 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 7 May 2014 11:20:54 +0200 Subject: KVM: vmx: disable APIC virtualization in nested guests While running a nested guest, we should disable APIC virtualization controls (virtualized APIC register accesses, virtual interrupt delivery and posted interrupts), because we do not expose them to the nested guest. Reported-by: Hu Yaohui Suggested-by: Abel Gordon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 33e8c028842f..138ceffc6377 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7778,7 +7778,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) exec_control = vmcs12->pin_based_vm_exec_control; exec_control |= vmcs_config.pin_based_exec_ctrl; - exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER; + exec_control &= ~(PIN_BASED_VMX_PREEMPTION_TIMER | + PIN_BASED_POSTED_INTR); vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, exec_control); vmx->nested.preemption_timer_expired = false; @@ -7815,7 +7816,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (!vmx->rdtscp_enabled) exec_control &= ~SECONDARY_EXEC_RDTSCP; /* Take the following fields only from vmcs12 */ - exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | + SECONDARY_EXEC_APIC_REGISTER_VIRT); if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; -- cgit v1.2.3 From 16a9602158861687c78b6de6dc6a79e6e8a9136f Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 14 May 2014 12:43:24 -0300 Subject: KVM: x86: disable master clock if TSC is reset during suspend Updating system_time from the kernel clock once master clock has been enabled can result in time backwards event, in case kernel clock frequency is lower than TSC frequency. Disable master clock in case it is necessary to update it from the resume path. Signed-off-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8b8fc0b792ba..84a2d4152a63 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -106,6 +106,8 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); static u32 tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); +static bool backwards_tsc_observed = false; + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -1486,7 +1488,8 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_kernel_ns, &ka->master_cycle_now); - ka->use_master_clock = host_tsc_clocksource & vcpus_matched; + ka->use_master_clock = host_tsc_clocksource && vcpus_matched + && !backwards_tsc_observed; if (ka->use_master_clock) atomic_set(&kvm_guest_has_master_clock, 1); @@ -6945,6 +6948,7 @@ int kvm_arch_hardware_enable(void *garbage) */ if (backwards_tsc) { u64 delta_cyc = max_tsc - local_tsc; + backwards_tsc_observed = true; list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { vcpu->arch.tsc_offset_adjustment += delta_cyc; -- cgit v1.2.3 From ebc3226202d5956a5963185222982d435378b899 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Fri, 9 May 2014 15:00:46 +0200 Subject: KVM: s390: announce irqfd capability s390 has acquired irqfd support with commit "KVM: s390: irq routing for adapter interrupts" (84223598778ba08041f4297fda485df83414d57e) but failed to announce it. Let's fix that. Signed-off-by: Cornelia Huck Acked-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- Documentation/virtual/kvm/api.txt | 2 +- arch/s390/kvm/kvm-s390.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index a9380ba54c8e..b4f53653c106 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2126,7 +2126,7 @@ into the hash PTE second double word). 4.75 KVM_IRQFD Capability: KVM_CAP_IRQFD -Architectures: x86 +Architectures: x86 s390 Type: vm ioctl Parameters: struct kvm_irqfd (in) Returns: 0 on success, -1 on error diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b3ecb8f5b6ce..9ae6664ff08c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -158,6 +158,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_ONE_REG: case KVM_CAP_ENABLE_CAP: case KVM_CAP_S390_CSS_SUPPORT: + case KVM_CAP_IRQFD: case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: case KVM_CAP_ENABLE_CAP_VM: -- cgit v1.2.3