diff options
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r-- | arch/x86/kvm/x86.c | 275 |
1 files changed, 188 insertions, 87 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d153be8929a6..b2a4b11274b0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -180,6 +180,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, { "irq_injections", VCPU_STAT(irq_injections) }, { "nmi_injections", VCPU_STAT(nmi_injections) }, + { "req_event", VCPU_STAT(req_event) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, @@ -190,6 +191,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, + { "max_mmu_page_hash_collisions", + VM_STAT(max_mmu_page_hash_collisions) }, { NULL } }; @@ -1139,6 +1142,7 @@ struct pvclock_gtod_data { u64 boot_ns; u64 nsec_base; + u64 wall_time_sec; }; static struct pvclock_gtod_data pvclock_gtod_data; @@ -1162,6 +1166,8 @@ static void update_pvclock_gtod(struct timekeeper *tk) vdata->boot_ns = boot_ns; vdata->nsec_base = tk->tkr_mono.xtime_nsec; + vdata->wall_time_sec = tk->xtime_sec; + write_seqcount_end(&vdata->seq); } #endif @@ -1623,6 +1629,28 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now) return mode; } +static int do_realtime(struct timespec *ts, u64 *cycle_now) +{ + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + unsigned long seq; + int mode; + u64 ns; + + do { + seq = read_seqcount_begin(&gtod->seq); + mode = gtod->clock.vclock_mode; + ts->tv_sec = gtod->wall_time_sec; + ns = gtod->nsec_base; + ns += vgettsc(cycle_now); + ns >>= gtod->clock.shift; + } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + /* returns true if host is using tsc clocksource */ static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) { @@ -1632,6 +1660,17 @@ static
bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; } + +/* returns true if host is using tsc clocksource */ +static bool kvm_get_walltime_and_clockread(struct timespec *ts, + u64 *cycle_now) +{ + /* checked again under seqlock below */ + if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + return false; + + return do_realtime(ts, cycle_now) == VCLOCK_TSC; +} #endif /* @@ -1772,7 +1811,7 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info guest_hv_clock; - if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, + if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time, &guest_hv_clock, sizeof(guest_hv_clock)))) return; @@ -1793,9 +1832,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); smp_wmb(); @@ -1809,16 +1848,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); smp_wmb(); vcpu->hv_clock.version++; - kvm_write_guest_cached(v->kvm, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); } static int kvm_guest_time_update(struct kvm_vcpu *v) @@ -2051,7 +2090,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, + if 
(kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa, sizeof(u32))) return 1; @@ -2070,7 +2109,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; @@ -2081,7 +2120,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.version += 1; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); @@ -2090,14 +2129,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); vcpu->arch.st.steal.version += 1; - kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, + kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); } @@ -2202,7 +2241,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) vcpu->arch.pv_time_enabled = false; @@ -2223,7 +2262,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & KVM_STEAL_RESERVED_MASK) return 1; - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, + if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime, data & KVM_STEAL_VALID_BITS, sizeof(struct kvm_steal_time))) return 1; @@ -2633,6 +2672,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
ext) case KVM_CAP_DISABLE_QUIRKS: case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: + case KVM_CAP_IMMEDIATE_EXIT: #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT case KVM_CAP_ASSIGN_DEV_IRQ: case KVM_CAP_PCI_2_3: @@ -2836,7 +2876,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.preempted = 1; - kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, + kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime, &vcpu->arch.st.steal.preempted, offsetof(struct kvm_steal_time, preempted), sizeof(vcpu->arch.st.steal.preempted)); @@ -2870,7 +2910,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - if (vcpu->arch.apicv_active) + if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); return kvm_apic_get_state(vcpu, s); @@ -3182,6 +3222,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) memcpy(dest, xsave, XSAVE_HDR_OFFSET); /* Set XSTATE_BV */ + xstate_bv &= vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FPSSE; *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv; /* @@ -3896,7 +3937,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, goto split_irqchip_unlock; /* Pairs with irqchip_in_kernel. 
*/ smp_wmb(); - kvm->arch.irqchip_split = true; + kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; r = 0; split_irqchip_unlock: @@ -3959,40 +4000,41 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; case KVM_CREATE_IRQCHIP: { - struct kvm_pic *vpic; - mutex_lock(&kvm->lock); + r = -EEXIST; - if (kvm->arch.vpic) + if (irqchip_in_kernel(kvm)) goto create_irqchip_unlock; + r = -EINVAL; if (kvm->created_vcpus) goto create_irqchip_unlock; - r = -ENOMEM; - vpic = kvm_create_pic(kvm); - if (vpic) { - r = kvm_ioapic_init(kvm); - if (r) { - mutex_lock(&kvm->slots_lock); - kvm_destroy_pic(vpic); - mutex_unlock(&kvm->slots_lock); - goto create_irqchip_unlock; - } - } else + + r = kvm_pic_init(kvm); + if (r) + goto create_irqchip_unlock; + + r = kvm_ioapic_init(kvm); + if (r) { + mutex_lock(&kvm->slots_lock); + kvm_pic_destroy(kvm); + mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; + } + r = kvm_setup_default_irq_routing(kvm); if (r) { mutex_lock(&kvm->slots_lock); mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); - kvm_destroy_pic(vpic); + kvm_pic_destroy(kvm); mutex_unlock(&kvm->irq_lock); mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } - /* Write kvm->irq_routing before kvm->arch.vpic. */ + /* Write kvm->irq_routing before enabling irqchip_in_kernel. 
*/ smp_wmb(); - kvm->arch.vpic = vpic; + kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; create_irqchip_unlock: mutex_unlock(&kvm->lock); break; @@ -4028,7 +4070,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_kernel(kvm)) goto get_irqchip_out; r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) @@ -4052,7 +4094,7 @@ long kvm_arch_vm_ioctl(struct file *filp, } r = -ENXIO; - if (!irqchip_in_kernel(kvm) || irqchip_split(kvm)) + if (!irqchip_kernel(kvm)) goto set_irqchip_out; r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) @@ -4461,6 +4503,21 @@ out: } EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); +static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva, + gpa_t gpa, bool write) +{ + /* For APIC access vmexit */ + if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) + return 1; + + if (vcpu_match_mmio_gpa(vcpu, gpa)) { + trace_vcpu_match_mmio(gva, gpa, write, true); + return 1; + } + + return 0; +} + static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) @@ -4487,16 +4544,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, if (*gpa == UNMAPPED_GVA) return -1; - /* For APIC access vmexit */ - if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) - return 1; - - if (vcpu_match_mmio_gpa(vcpu, *gpa)) { - trace_vcpu_match_mmio(gva, *gpa, write, true); - return 1; - } - - return 0; + return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write); } int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -4593,6 +4641,22 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, int handled, ret; bool write = ops->write; struct kvm_mmio_fragment *frag; + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + + /* + * If the exit was due to a NPF we may already have a GPA. + * If the GPA is present, use it to avoid the GVA to GPA table walk. 
+ * Note, this cannot be used on string operations since string + * operation using rep will only have the initial GPA from the NPF + * occurred. + */ + if (vcpu->arch.gpa_available && + emulator_can_use_gpa(ctxt) && + vcpu_is_mmio_gpa(vcpu, addr, exception->address, write) && + (addr & ~PAGE_MASK) == (exception->address & ~PAGE_MASK)) { + gpa = exception->address; + goto mmio; + } ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -5609,6 +5673,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, } restart: + /* Save the faulting GPA (cr2) in the address field */ + ctxt->exception.address = cr2; + r = x86_emulate_insn(ctxt); if (r == EMULATION_INTERCEPTED) @@ -5923,9 +5990,6 @@ static void kvm_set_mmio_spte_mask(void) /* Mask the reserved physical address bits. */ mask = rsvd_bits(maxphyaddr, 51); - /* Bit 62 is always reserved for 32bit host. */ - mask |= 0x3ull << 62; - /* Set the present bit. */ mask |= 1ull; @@ -6024,7 +6088,7 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0, - PT_PRESENT_MASK); + PT_PRESENT_MASK, 0); kvm_timer_init(); perf_register_guest_info_callbacks(&kvm_guest_cbs); @@ -6086,6 +6150,35 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +#ifdef CONFIG_X86_64 +static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, + unsigned long clock_type) +{ + struct kvm_clock_pairing clock_pairing; + struct timespec ts; + u64 cycle; + int ret; + + if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) + return -KVM_EOPNOTSUPP; + + if (kvm_get_walltime_and_clockread(&ts, &cycle) == false) + return -KVM_EOPNOTSUPP; + + clock_pairing.sec = ts.tv_sec; + clock_pairing.nsec = ts.tv_nsec; + clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle); + clock_pairing.flags = 0; + + ret = 0; + if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing, + sizeof(struct kvm_clock_pairing))) + ret = -KVM_EFAULT; + + return ret; +} +#endif + /* * 
kvm_pv_kick_cpu_op: Kick a vcpu. * @@ -6150,6 +6243,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1); ret = 0; break; +#ifdef CONFIG_X86_64 + case KVM_HC_CLOCK_PAIRING: + ret = kvm_pv_clock_pairing(vcpu, a0, a1); + break; +#endif default: ret = -KVM_ENOSYS; break; @@ -6563,7 +6661,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); else { - if (vcpu->arch.apicv_active) + if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } @@ -6654,10 +6752,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 0; goto out; } - if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { - vcpu->fpu_active = 0; - kvm_x86_ops->fpu_deactivate(vcpu); - } if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { /* Page is swapped out. Do synthetic halt */ vcpu->arch.apf.halted = true; @@ -6717,21 +6811,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_hv_process_stimers(vcpu); } - /* - * KVM_REQ_EVENT is not set when posted interrupts are set by - * VT-d hardware, so we have to update RVI unconditionally. - */ - if (kvm_lapic_enabled(vcpu)) { - /* - * Update architecture specific hints for APIC - * virtual interrupt delivery. - */ - if (vcpu->arch.apicv_active) - kvm_x86_ops->hwapic_irr_update(vcpu, - kvm_lapic_find_highest_irr(vcpu)); - } - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { + ++vcpu->stat.req_event; kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { r = 1; @@ -6772,22 +6853,40 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); - if (vcpu->fpu_active) - kvm_load_guest_fpu(vcpu); + kvm_load_guest_fpu(vcpu); + + /* + * Disable IRQs before setting IN_GUEST_MODE. 
Posted interrupt + * IPI are then delayed after guest entry, which ensures that they + * result in virtual interrupt delivery. + */ + local_irq_disable(); vcpu->mode = IN_GUEST_MODE; srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); /* - * We should set ->mode before check ->requests, - * Please see the comment in kvm_make_all_cpus_request. - * This also orders the write to mode from any reads - * to the page tables done while the VCPU is running. - * Please see the comment in kvm_flush_remote_tlbs. + * 1) We should set ->mode before checking ->requests. Please see + * the comment in kvm_make_all_cpus_request. + * + * 2) For APICv, we should set ->mode before checking PIR.ON. This + * pairs with the memory barrier implicit in pi_test_and_set_on + * (see vmx_deliver_posted_interrupt). + * + * 3) This also orders the write to mode from any reads to the page + * tables done while the VCPU is running. Please see the comment + * in kvm_flush_remote_tlbs. */ smp_mb__after_srcu_read_unlock(); - local_irq_disable(); + /* + * This handles the case where a posted interrupt was + * notified with kvm_vcpu_kick. 
+ */ + if (kvm_lapic_enabled(vcpu)) { + if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) + kvm_x86_ops->sync_pir_to_irr(vcpu); + } if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests || need_resched() || signal_pending(current)) { @@ -6926,6 +7025,9 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { + if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) + kvm_x86_ops->check_nested_events(vcpu, false); + return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); } @@ -7097,7 +7199,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } else WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed); - r = vcpu_run(vcpu); + if (kvm_run->immediate_exit) + r = -EINTR; + else + r = vcpu_run(vcpu); out: post_kvm_run_save(vcpu); @@ -8292,9 +8397,6 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) - kvm_x86_ops->check_nested_events(vcpu, false); - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); } @@ -8431,9 +8533,8 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) { - - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, - sizeof(val)); + return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val, + sizeof(val)); } void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, |