summaryrefslogtreecommitdiffstats
path: root/arch/x86/kvm/x86.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm/x86.c')
-rw-r--r--arch/x86/kvm/x86.c635
1 files changed, 527 insertions, 108 deletions
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1eefebe5d727..c243b81e3c74 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -46,6 +46,8 @@
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/pci.h>
+#include <linux/timekeeper_internal.h>
+#include <linux/pvclock_gtod.h>
#include <trace/events/kvm.h>
#define CREATE_TRACE_POINTS
@@ -118,7 +120,7 @@ struct kvm_shared_msrs {
};
static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
-static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
+static struct kvm_shared_msrs __percpu *shared_msrs;
struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "pf_fixed", VCPU_STAT(pf_fixed) },
@@ -158,7 +160,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
u64 __read_mostly host_xcr0;
-int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
+
+static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);
static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
@@ -187,10 +191,10 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
static void shared_msr_update(unsigned slot, u32 msr)
{
- struct kvm_shared_msrs *smsr;
u64 value;
+ unsigned int cpu = smp_processor_id();
+ struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
- smsr = &__get_cpu_var(shared_msrs);
/* only read, and nobody should modify it at this time,
* so don't need lock */
if (slot >= shared_msrs_global.nr) {
@@ -222,7 +226,8 @@ static void kvm_shared_msr_cpu_online(void)
void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
- struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+ unsigned int cpu = smp_processor_id();
+ struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
if (((value ^ smsr->values[slot].curr) & mask) == 0)
return;
@@ -238,7 +243,8 @@ EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
static void drop_user_return_notifiers(void *ignore)
{
- struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
+ unsigned int cpu = smp_processor_id();
+ struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
if (smsr->registered)
kvm_on_user_return(&smsr->urn);
@@ -633,7 +639,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
}
if (is_long_mode(vcpu)) {
- if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) {
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
return 1;
} else
@@ -827,6 +833,7 @@ static u32 msrs_to_save[] = {
static unsigned num_msrs_to_save;
static const u32 emulated_msrs[] = {
+ MSR_IA32_TSC_ADJUST,
MSR_IA32_TSCDEADLINE,
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
@@ -886,9 +893,9 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
- return kvm_x86_ops->set_msr(vcpu, msr_index, data);
+ return kvm_x86_ops->set_msr(vcpu, msr);
}
/*
@@ -896,8 +903,62 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
*/
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
- return kvm_set_msr(vcpu, index, *data);
+ struct msr_data msr;
+
+ msr.data = *data;
+ msr.index = index;
+ msr.host_initiated = true;
+ return kvm_set_msr(vcpu, &msr);
+}
+
+#ifdef CONFIG_X86_64
+struct pvclock_gtod_data {
+ seqcount_t seq;
+
+ struct { /* extract of a clocksource struct */
+ int vclock_mode;
+ cycle_t cycle_last;
+ cycle_t mask;
+ u32 mult;
+ u32 shift;
+ } clock;
+
+ /* open coded 'struct timespec' */
+ u64 monotonic_time_snsec;
+ time_t monotonic_time_sec;
+};
+
+static struct pvclock_gtod_data pvclock_gtod_data;
+
+static void update_pvclock_gtod(struct timekeeper *tk)
+{
+ struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
+
+ write_seqcount_begin(&vdata->seq);
+
+ /* copy pvclock gtod data */
+ vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode;
+ vdata->clock.cycle_last = tk->clock->cycle_last;
+ vdata->clock.mask = tk->clock->mask;
+ vdata->clock.mult = tk->mult;
+ vdata->clock.shift = tk->shift;
+
+ vdata->monotonic_time_sec = tk->xtime_sec
+ + tk->wall_to_monotonic.tv_sec;
+ vdata->monotonic_time_snsec = tk->xtime_nsec
+ + (tk->wall_to_monotonic.tv_nsec
+ << tk->shift);
+ while (vdata->monotonic_time_snsec >=
+ (((u64)NSEC_PER_SEC) << tk->shift)) {
+ vdata->monotonic_time_snsec -=
+ ((u64)NSEC_PER_SEC) << tk->shift;
+ vdata->monotonic_time_sec++;
+ }
+
+ write_seqcount_end(&vdata->seq);
}
+#endif
+
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
@@ -995,6 +1056,10 @@ static inline u64 get_kernel_ns(void)
return timespec_to_ns(&ts);
}
+#ifdef CONFIG_X86_64
+static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
+#endif
+
static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
unsigned long max_tsc_khz;
@@ -1046,12 +1111,47 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
return tsc;
}
-void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
+void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_X86_64
+ bool vcpus_matched;
+ bool do_request = false;
+ struct kvm_arch *ka = &vcpu->kvm->arch;
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+ vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+ atomic_read(&vcpu->kvm->online_vcpus));
+
+ if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC)
+ if (!ka->use_master_clock)
+ do_request = 1;
+
+ if (!vcpus_matched && ka->use_master_clock)
+ do_request = 1;
+
+ if (do_request)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+ trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
+ atomic_read(&vcpu->kvm->online_vcpus),
+ ka->use_master_clock, gtod->clock.vclock_mode);
+#endif
+}
+
+static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
+{
+ u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
+ vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
+}
+
+void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
s64 usdiff;
+ bool matched;
+ u64 data = msr->data;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
@@ -1094,6 +1194,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
}
+ matched = true;
} else {
/*
* We split periods of matched TSC writes into generations.
@@ -1108,6 +1209,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
kvm->arch.cur_tsc_nsec = ns;
kvm->arch.cur_tsc_write = data;
kvm->arch.cur_tsc_offset = offset;
+ matched = false;
pr_debug("kvm: new tsc generation %u, clock %llu\n",
kvm->arch.cur_tsc_generation, data);
}
@@ -1129,26 +1231,195 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
+ if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
+ update_ia32_tsc_adjust_msr(vcpu, offset);
kvm_x86_ops->write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+
+ spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
+ if (matched)
+ kvm->arch.nr_vcpus_matched_tsc++;
+ else
+ kvm->arch.nr_vcpus_matched_tsc = 0;
+
+ kvm_track_tsc_matching(vcpu);
+ spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
}
EXPORT_SYMBOL_GPL(kvm_write_tsc);
+#ifdef CONFIG_X86_64
+
+static cycle_t read_tsc(void)
+{
+ cycle_t ret;
+ u64 last;
+
+ /*
+ * Empirically, a fence (of type that depends on the CPU)
+ * before rdtsc is enough to ensure that rdtsc is ordered
+ * with respect to loads. The various CPU manuals are unclear
+ * as to whether rdtsc can be reordered with later loads,
+ * but no one has ever seen it happen.
+ */
+ rdtsc_barrier();
+ ret = (cycle_t)vget_cycles();
+
+ last = pvclock_gtod_data.clock.cycle_last;
+
+ if (likely(ret >= last))
+ return ret;
+
+ /*
+ * GCC likes to generate cmov here, but this branch is extremely
+ * predictable (it's just a funciton of time and the likely is
+ * very likely) and there's a data dependence, so force GCC
+ * to generate a branch instead. I don't barrier() because
+ * we don't actually need a barrier, and if this function
+ * ever gets inlined it will generate worse code.
+ */
+ asm volatile ("");
+ return last;
+}
+
+static inline u64 vgettsc(cycle_t *cycle_now)
+{
+ long v;
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+ *cycle_now = read_tsc();
+
+ v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask;
+ return v * gtod->clock.mult;
+}
+
+static int do_monotonic(struct timespec *ts, cycle_t *cycle_now)
+{
+ unsigned long seq;
+ u64 ns;
+ int mode;
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+
+ ts->tv_nsec = 0;
+ do {
+ seq = read_seqcount_begin(&gtod->seq);
+ mode = gtod->clock.vclock_mode;
+ ts->tv_sec = gtod->monotonic_time_sec;
+ ns = gtod->monotonic_time_snsec;
+ ns += vgettsc(cycle_now);
+ ns >>= gtod->clock.shift;
+ } while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
+ timespec_add_ns(ts, ns);
+
+ return mode;
+}
+
+/* returns true if host is using tsc clocksource */
+static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now)
+{
+ struct timespec ts;
+
+ /* checked again under seqlock below */
+ if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC)
+ return false;
+
+ if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC)
+ return false;
+
+ monotonic_to_bootbased(&ts);
+ *kernel_ns = timespec_to_ns(&ts);
+
+ return true;
+}
+#endif
+
+/*
+ *
+ * Assuming a stable TSC across physical CPUS, and a stable TSC
+ * across virtual CPUs, the following condition is possible.
+ * Each numbered line represents an event visible to both
+ * CPUs at the next numbered event.
+ *
+ * "timespecX" represents host monotonic time. "tscX" represents
+ * RDTSC value.
+ *
+ * VCPU0 on CPU0 | VCPU1 on CPU1
+ *
+ * 1. read timespec0,tsc0
+ * 2. | timespec1 = timespec0 + N
+ * | tsc1 = tsc0 + M
+ * 3. transition to guest | transition to guest
+ * 4. ret0 = timespec0 + (rdtsc - tsc0) |
+ * 5. | ret1 = timespec1 + (rdtsc - tsc1)
+ * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
+ *
+ * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
+ *
+ * - ret0 < ret1
+ * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
+ * ...
+ * - 0 < N - M => M < N
+ *
+ * That is, when timespec0 != timespec1, M < N. Unfortunately that is not
+ * always the case (the difference between two distinct xtime instances
+ * might be smaller then the difference between corresponding TSC reads,
+ * when updating guest vcpus pvclock areas).
+ *
+ * To avoid that problem, do not allow visibility of distinct
+ * system_timestamp/tsc_timestamp values simultaneously: use a master
+ * copy of host monotonic time values. Update that master copy
+ * in lockstep.
+ *
+ * Rely on synchronization of host TSCs and guest TSCs for monotonicity.
+ *
+ */
+
+static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+ struct kvm_arch *ka = &kvm->arch;
+ int vclock_mode;
+ bool host_tsc_clocksource, vcpus_matched;
+
+ vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
+ atomic_read(&kvm->online_vcpus));
+
+ /*
+ * If the host uses TSC clock, then passthrough TSC as stable
+ * to the guest.
+ */
+ host_tsc_clocksource = kvm_get_time_and_clockread(
+ &ka->master_kernel_ns,
+ &ka->master_cycle_now);
+
+ ka->use_master_clock = host_tsc_clocksource & vcpus_matched;
+
+ if (ka->use_master_clock)
+ atomic_set(&kvm_guest_has_master_clock, 1);
+
+ vclock_mode = pvclock_gtod_data.clock.vclock_mode;
+ trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
+ vcpus_matched);
+#endif
+}
+
static int kvm_guest_time_update(struct kvm_vcpu *v)
{
- unsigned long flags;
+ unsigned long flags, this_tsc_khz;
struct kvm_vcpu_arch *vcpu = &v->arch;
+ struct kvm_arch *ka = &v->kvm->arch;
void *shared_kaddr;
- unsigned long this_tsc_khz;
s64 kernel_ns, max_kernel_ns;
- u64 tsc_timestamp;
+ u64 tsc_timestamp, host_tsc;
+ struct pvclock_vcpu_time_info *guest_hv_clock;
u8 pvclock_flags;
+ bool use_master_clock;
+
+ kernel_ns = 0;
+ host_tsc = 0;
/* Keep irq disabled to prevent changes to the clock */
local_irq_save(flags);
- tsc_timestamp = kvm_x86_ops->read_l1_tsc(v);
- kernel_ns = get_kernel_ns();
this_tsc_khz = __get_cpu_var(cpu_tsc_khz);
if (unlikely(this_tsc_khz == 0)) {
local_irq_restore(flags);
@@ -1157,6 +1428,24 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
}
/*
+ * If the host uses TSC clock, then passthrough TSC as stable
+ * to the guest.
+ */
+ spin_lock(&ka->pvclock_gtod_sync_lock);
+ use_master_clock = ka->use_master_clock;
+ if (use_master_clock) {
+ host_tsc = ka->master_cycle_now;
+ kernel_ns = ka->master_kernel_ns;
+ }
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+ if (!use_master_clock) {
+ host_tsc = native_read_tsc();
+ kernel_ns = get_kernel_ns();
+ }
+
+ tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc);
+
+ /*
* We may have to catch up the TSC to match elapsed wall clock
* time for two reasons, even if kvmclock is used.
* 1) CPU could have been running below the maximum TSC rate
@@ -1217,23 +1506,20 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hw_tsc_khz = this_tsc_khz;
}
- if (max_kernel_ns > kernel_ns)
- kernel_ns = max_kernel_ns;
-
+ /* with a master <monotonic time, tsc value> tuple,
+ * pvclock clock reads always increase at the (scaled) rate
+ * of guest TSC - no need to deal with sampling errors.
+ */
+ if (!use_master_clock) {
+ if (max_kernel_ns > kernel_ns)
+ kernel_ns = max_kernel_ns;
+ }
/* With all the info we got, fill in the values */
vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_kernel_ns = kernel_ns;
vcpu->last_guest_tsc = tsc_timestamp;
- pvclock_flags = 0;
- if (vcpu->pvclock_set_guest_stopped_request) {
- pvclock_flags |= PVCLOCK_GUEST_STOPPED;
- vcpu->pvclock_set_guest_stopped_request = false;
- }
-
- vcpu->hv_clock.flags = pvclock_flags;
-
/*
* The interface expects us to write an even number signaling that the
* update is finished. Since the guest won't see the intermediate
@@ -1243,6 +1529,22 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
shared_kaddr = kmap_atomic(vcpu->time_page);
+ guest_hv_clock = shared_kaddr + vcpu->time_offset;
+
+ /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+ pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
+
+ if (vcpu->pvclock_set_guest_stopped_request) {
+ pvclock_flags |= PVCLOCK_GUEST_STOPPED;
+ vcpu->pvclock_set_guest_stopped_request = false;
+ }
+
+ /* If the host uses TSC clocksource, then it is stable */
+ if (use_master_clock)
+ pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
+
+ vcpu->hv_clock.flags = pvclock_flags;
+
memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock,
sizeof(vcpu->hv_clock));
@@ -1572,9 +1874,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
}
-int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
bool pr = false;
+ u32 msr = msr_info->index;
+ u64 data = msr_info->data;
switch (msr) {
case MSR_EFER:
@@ -1625,6 +1929,15 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
case MSR_IA32_TSCDEADLINE:
kvm_set_lapic_tscdeadline_msr(vcpu, data);
break;
+ case MSR_IA32_TSC_ADJUST:
+ if (guest_cpuid_has_tsc_adjust(vcpu)) {
+ if (!msr_info->host_initiated) {
+ u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
+ kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
+ }
+ vcpu->arch.ia32_tsc_adjust_msr = data;
+ }
+ break;
case MSR_IA32_MISC_ENABLE:
vcpu->arch.ia32_misc_enable_msr = data;
break;
@@ -1984,6 +2297,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_IA32_TSCDEADLINE:
data = kvm_get_lapic_tscdeadline_msr(vcpu);
break;
+ case MSR_IA32_TSC_ADJUST:
+ data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
+ break;
case MSR_IA32_MISC_ENABLE:
data = vcpu->arch.ia32_misc_enable_msr;
break;
@@ -2342,7 +2658,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
kvm_x86_ops->write_tsc_offset(vcpu, offset);
vcpu->arch.tsc_catchup = 1;
}
- kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
+ /*
+ * On a host with synchronized TSC, there is no need to update
+ * kvmclock on vcpu->cpu migration
+ */
+ if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1)
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
if (vcpu->cpu != cpu)
kvm_migrate_timers(vcpu);
vcpu->cpu = cpu;
@@ -2691,15 +3012,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (!vcpu->arch.apic)
goto out;
u.lapic = memdup_user(argp, sizeof(*u.lapic));
- if (IS_ERR(u.lapic)) {
- r = PTR_ERR(u.lapic);
- goto out;
- }
+ if (IS_ERR(u.lapic))
+ return PTR_ERR(u.lapic);
r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_INTERRUPT: {
@@ -2709,16 +3025,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (copy_from_user(&irq, argp, sizeof irq))
goto out;
r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_NMI: {
r = kvm_vcpu_ioctl_nmi(vcpu);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_SET_CPUID: {
@@ -2729,8 +3039,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
goto out;
r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
- if (r)
- goto out;
break;
}
case KVM_SET_CPUID2: {
@@ -2742,8 +3050,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
goto out;
r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
cpuid_arg->entries);
- if (r)
- goto out;
break;
}
case KVM_GET_CPUID2: {
@@ -2875,10 +3181,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
case KVM_SET_XSAVE: {
u.xsave = memdup_user(argp, sizeof(*u.xsave));
- if (IS_ERR(u.xsave)) {
- r = PTR_ERR(u.xsave);
- goto out;
- }
+ if (IS_ERR(u.xsave))
+ return PTR_ERR(u.xsave);
r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
break;
@@ -2900,10 +3204,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
}
case KVM_SET_XCRS: {
u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
- if (IS_ERR(u.xcrs)) {
- r = PTR_ERR(u.xcrs);
- goto out;
- }
+ if (IS_ERR(u.xcrs))
+ return PTR_ERR(u.xcrs);
r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
break;
@@ -2951,7 +3253,7 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
int ret;
if (addr > (unsigned int)(-3 * PAGE_SIZE))
- return -1;
+ return -EINVAL;
ret = kvm_x86_ops->set_tss_addr(kvm, addr);
return ret;
}
@@ -3212,8 +3514,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
switch (ioctl) {
case KVM_SET_TSS_ADDR:
r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
- if (r < 0)
- goto out;
break;
case KVM_SET_IDENTITY_MAP_ADDR: {
u64 ident_addr;
@@ -3222,14 +3522,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (copy_from_user(&ident_addr, argp, sizeof ident_addr))
goto out;
r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
- if (r < 0)
- goto out;
break;
}
case KVM_SET_NR_MMU_PAGES:
r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
- if (r)
- goto out;
break;
case KVM_GET_NR_MMU_PAGES:
r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
@@ -3320,8 +3616,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
get_irqchip_out:
kfree(chip);
- if (r)
- goto out;
break;
}
case KVM_SET_IRQCHIP: {
@@ -3343,8 +3637,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
r = 0;
set_irqchip_out:
kfree(chip);
- if (r)
- goto out;
break;
}
case KVM_GET_PIT: {
@@ -3371,9 +3663,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (!kvm->arch.vpit)
goto out;
r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_GET_PIT2: {
@@ -3397,9 +3686,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (!kvm->arch.vpit)
goto out;
r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_REINJECT_CONTROL: {
@@ -3408,9 +3694,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (copy_from_user(&control, argp, sizeof(control)))
goto out;
r = kvm_vm_ioctl_reinject(kvm, &control);
- if (r)
- goto out;
- r = 0;
break;
}
case KVM_XEN_HVM_CONFIG: {
@@ -3779,7 +4062,7 @@ static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
{
struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];
- memcpy(vcpu->run->mmio.data, frag->data, frag->len);
+ memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
return X86EMUL_CONTINUE;
}
@@ -3832,18 +4115,11 @@ mmio:
bytes -= handled;
val += handled;
- while (bytes) {
- unsigned now = min(bytes, 8U);
-
- frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
- frag->gpa = gpa;
- frag->data = val;
- frag->len = now;
-
- gpa += now;
- val += now;
- bytes -= now;
- }
+ WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
+ frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
+ frag->gpa = gpa;
+ frag->data = val;
+ frag->len = bytes;
return X86EMUL_CONTINUE;
}
@@ -3890,7 +4166,7 @@ int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
vcpu->mmio_needed = 1;
vcpu->mmio_cur_fragment = 0;
- vcpu->run->mmio.len = vcpu->mmio_fragments[0].len;
+ vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
vcpu->run->exit_reason = KVM_EXIT_MMIO;
vcpu->run->mmio.phys_addr = gpa;
@@ -4280,7 +4556,12 @@ static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 data)
{
- return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
+ struct msr_data msr;
+
+ msr.data = data;
+ msr.index = msr_index;
+ msr.host_initiated = false;
+ return kvm_set_msr(emul_to_vcpu(ctxt), &msr);
}
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -4502,7 +4783,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
* instruction -> ...
*/
pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
- if (!is_error_pfn(pfn)) {
+ if (!is_error_noslot_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
return true;
}
@@ -4888,6 +5169,50 @@ static void kvm_set_mmio_spte_mask(void)
kvm_mmu_set_mmio_spte_mask(mask);
}
+#ifdef CONFIG_X86_64
+static void pvclock_gtod_update_fn(struct work_struct *work)
+{
+ struct kvm *kvm;
+
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ raw_spin_lock(&kvm_lock);
+ list_for_each_entry(kvm, &vm_list, vm_list)
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests);
+ atomic_set(&kvm_guest_has_master_clock, 0);
+ raw_spin_unlock(&kvm_lock);
+}
+
+static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
+
+/*
+ * Notification about pvclock gtod data update.
+ */
+static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
+ void *priv)
+{
+ struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
+ struct timekeeper *tk = priv;
+
+ update_pvclock_gtod(tk);
+
+ /* disable master clock if host does not trust, or does not
+ * use, TSC clocksource
+ */
+ if (gtod->clock.vclock_mode != VCLOCK_TSC &&
+ atomic_read(&kvm_guest_has_master_clock) != 0)
+ queue_work(system_long_wq, &pvclock_gtod_work);
+
+ return 0;
+}
+
+static struct notifier_block pvclock_gtod_notifier = {
+ .notifier_call = pvclock_gtod_notify,
+};
+#endif
+
int kvm_arch_init(void *opaque)
{
int r;
@@ -4910,9 +5235,16 @@ int kvm_arch_init(void *opaque)
goto out;
}
+ r = -ENOMEM;
+ shared_msrs = alloc_percpu(struct kvm_shared_msrs);
+ if (!shared_msrs) {
+ printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
+ goto out;
+ }
+
r = kvm_mmu_module_init();
if (r)
- goto out;
+ goto out_free_percpu;
kvm_set_mmio_spte_mask();
kvm_init_msr_list();
@@ -4929,8 +5261,14 @@ int kvm_arch_init(void *opaque)
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
kvm_lapic_init();
+#ifdef CONFIG_X86_64
+ pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
+#endif
+
return 0;
+out_free_percpu:
+ free_percpu(shared_msrs);
out:
return r;
}
@@ -4943,8 +5281,12 @@ void kvm_arch_exit(void)
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
+#ifdef CONFIG_X86_64
+ pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+#endif
kvm_x86_ops = NULL;
kvm_mmu_module_exit();
+ free_percpu(shared_msrs);
}
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
@@ -5066,7 +5408,7 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
-int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
+static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
char instruction[3];
@@ -5242,6 +5584,29 @@ static void process_nmi(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
+static void kvm_gen_update_masterclock(struct kvm *kvm)
+{
+#ifdef CONFIG_X86_64
+ int i;
+ struct kvm_vcpu *vcpu;
+ struct kvm_arch *ka = &kvm->arch;
+
+ spin_lock(&ka->pvclock_gtod_sync_lock);
+ kvm_make_mclock_inprogress_request(kvm);
+ /* no guest entries from this point */
+ pvclock_update_vm_gtod_copy(kvm);
+
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests);
+
+ /* guest entries allowed */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests);
+
+ spin_unlock(&ka->pvclock_gtod_sync_lock);
+#endif
+}
+
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
int r;
@@ -5254,6 +5619,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_mmu_unload(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
__kvm_migrate_timers(vcpu);
+ if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
+ kvm_gen_update_masterclock(vcpu->kvm);
if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
r = kvm_guest_time_update(vcpu);
if (unlikely(r))
@@ -5369,7 +5736,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (hw_breakpoint_active())
hw_breakpoint_restore();
- vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu);
+ vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
+ native_read_tsc());
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
@@ -5426,7 +5794,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
pr_debug("vcpu %d received sipi with vector # %x\n",
vcpu->vcpu_id, vcpu->arch.sipi_vector);
kvm_lapic_reset(vcpu);
- r = kvm_arch_vcpu_reset(vcpu);
+ r = kvm_vcpu_reset(vcpu);
if (r)
return r;
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
@@ -5522,28 +5890,44 @@ static int complete_emulated_pio(struct kvm_vcpu *vcpu)
*
* read:
* for each fragment
- * write gpa, len
- * exit
- * copy data
+ * for each mmio piece in the fragment
+ * write gpa, len
+ * exit
+ * copy data
* execute insn
*
* write:
* for each fragment
- * write gpa, len
- * copy data
- * exit
+ * for each mmio piece in the fragment
+ * write gpa, len
+ * copy data
+ * exit
*/
static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
struct kvm_mmio_fragment *frag;
+ unsigned len;
BUG_ON(!vcpu->mmio_needed);
/* Complete previous fragment */
- frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment++];
+ frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
+ len = min(8u, frag->len);
if (!vcpu->mmio_is_write)
- memcpy(frag->data, run->mmio.data, frag->len);
+ memcpy(frag->data, run->mmio.data, len);
+
+ if (frag->len <= 8) {
+ /* Switch to the next fragment. */
+ frag++;
+ vcpu->mmio_cur_fragment++;
+ } else {
+ /* Go forward to the next mmio piece. */
+ frag->data += len;
+ frag->gpa += len;
+ frag->len -= len;
+ }
+
if (vcpu->mmio_cur_fragment == vcpu->mmio_nr_fragments) {
vcpu->mmio_needed = 0;
if (vcpu->mmio_is_write)
@@ -5551,13 +5935,12 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
vcpu->mmio_read_completed = 1;
return complete_emulated_io(vcpu);
}
- /* Initiate next fragment */
- ++frag;
+
run->exit_reason = KVM_EXIT_MMIO;
run->mmio.phys_addr = frag->gpa;
if (vcpu->mmio_is_write)
- memcpy(run->mmio.data, frag->data, frag->len);
- run->mmio.len = frag->len;
+ memcpy(run->mmio.data, frag->data, min(8u, frag->len));
+ run->mmio.len = min(8u, frag->len);
run->mmio.is_write = vcpu->mmio_is_write;
vcpu->arch.complete_userspace_io = complete_emulated_mmio;
return 0;
@@ -5773,6 +6156,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
int pending_vec, max_bits, idx;
struct desc_ptr dt;
+ if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE))
+ return -EINVAL;
+
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
kvm_x86_ops->set_idt(vcpu, &dt);
@@ -6036,7 +6422,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
r = vcpu_load(vcpu);
if (r)
return r;
- r = kvm_arch_vcpu_reset(vcpu);
+ r = kvm_vcpu_reset(vcpu);
if (r == 0)
r = kvm_mmu_setup(vcpu);
vcpu_put(vcpu);
@@ -6044,6 +6430,23 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
return r;
}
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+ int r;
+ struct msr_data msr;
+
+ r = vcpu_load(vcpu);
+ if (r)
+ return r;
+ msr.data = 0x0;
+ msr.index = MSR_IA32_TSC;
+ msr.host_initiated = true;
+ kvm_write_tsc(vcpu, &msr);
+ vcpu_put(vcpu);
+
+ return r;
+}
+
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
int r;
@@ -6058,7 +6461,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
kvm_x86_ops->vcpu_free(vcpu);
}
-int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
+static int kvm_vcpu_reset(struct kvm_vcpu *vcpu)
{
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
@@ -6081,6 +6484,10 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_pmu_reset(vcpu);
+ memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
+ vcpu->arch.regs_avail = ~0;
+ vcpu->arch.regs_dirty = ~0;
+
return kvm_x86_ops->vcpu_reset(vcpu);
}
@@ -6157,6 +6564,8 @@ int kvm_arch_hardware_enable(void *garbage)
kvm_for_each_vcpu(i, vcpu, kvm) {
vcpu->arch.tsc_offset_adjustment += delta_cyc;
vcpu->arch.last_host_tsc = local_tsc;
+ set_bit(KVM_REQ_MASTERCLOCK_UPDATE,
+ &vcpu->requests);
}
/*
@@ -6247,10 +6656,17 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
goto fail_free_mce_banks;
+ r = fx_init(vcpu);
+ if (r)
+ goto fail_free_wbinvd_dirty_mask;
+
+ vcpu->arch.ia32_tsc_adjust_msr = 0x0;
kvm_async_pf_hash_reset(vcpu);
kvm_pmu_init(vcpu);
return 0;
+fail_free_wbinvd_dirty_mask:
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
fail_free_mce_banks:
kfree(vcpu->arch.mce_banks);
fail_free_lapic:
@@ -6294,6 +6710,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
raw_spin_lock_init(&kvm->arch.tsc_write_lock);
mutex_init(&kvm->arch.apic_map_lock);
+ spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
+
+ pvclock_update_vm_gtod_copy(kvm);
return 0;
}