diff options
-rw-r--r-- | Documentation/virtual/kvm/api.txt | 287 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_emulate.h | 32 | ||||
-rw-r--r-- | arch/x86/include/asm/kvm_host.h | 14 | ||||
-rw-r--r-- | arch/x86/include/uapi/asm/kvm.h | 3 | ||||
-rw-r--r-- | arch/x86/kvm/cpuid.h | 8 | ||||
-rw-r--r-- | arch/x86/kvm/emulate.c | 405 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.c | 4 | ||||
-rw-r--r-- | arch/x86/kvm/svm.c | 57 | ||||
-rw-r--r-- | arch/x86/kvm/trace.h | 6 | ||||
-rw-r--r-- | arch/x86/kvm/vmx.c | 125 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 121 |
11 files changed, 626 insertions, 436 deletions
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index f1979c77ac1c..68cda1fc3d52 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -297,6 +297,15 @@ struct kvm_regs { __u64 rip, rflags; }; +/* mips */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 gpr[32]; + __u64 hi; + __u64 lo; + __u64 pc; +}; + 4.12 KVM_SET_REGS @@ -378,7 +387,7 @@ struct kvm_translation { 4.16 KVM_INTERRUPT Capability: basic -Architectures: x86, ppc +Architectures: x86, ppc, mips Type: vcpu ioctl Parameters: struct kvm_interrupt (in) Returns: 0 on success, -1 on error @@ -423,6 +432,11 @@ c) KVM_INTERRUPT_SET_LEVEL Note that any value for 'irq' other than the ones stated above is invalid and incurs unexpected behavior. +MIPS: + +Queues an external interrupt to be injected into the virtual CPU. A negative +interrupt number dequeues the interrupt. + 4.17 KVM_DEBUG_GUEST @@ -512,7 +526,7 @@ struct kvm_cpuid { 4.21 KVM_SET_SIGNAL_MASK Capability: basic -Architectures: x86 +Architectures: all Type: vcpu ioctl Parameters: struct kvm_signal_mask (in) Returns: 0 on success, -1 on error @@ -1783,122 +1797,151 @@ and architecture specific registers. Each have their own range of operation and their own constants and width. To keep track of the implemented registers, find a list below: - Arch | Register | Width (bits) - | | - PPC | KVM_REG_PPC_HIOR | 64 - PPC | KVM_REG_PPC_IAC1 | 64 - PPC | KVM_REG_PPC_IAC2 | 64 - PPC | KVM_REG_PPC_IAC3 | 64 - PPC | KVM_REG_PPC_IAC4 | 64 - PPC | KVM_REG_PPC_DAC1 | 64 - PPC | KVM_REG_PPC_DAC2 | 64 - PPC | KVM_REG_PPC_DABR | 64 - PPC | KVM_REG_PPC_DSCR | 64 - PPC | KVM_REG_PPC_PURR | 64 - PPC | KVM_REG_PPC_SPURR | 64 - PPC | KVM_REG_PPC_DAR | 64 - PPC | KVM_REG_PPC_DSISR | 32 - PPC | KVM_REG_PPC_AMR | 64 - PPC | KVM_REG_PPC_UAMOR | 64 - PPC | KVM_REG_PPC_MMCR0 | 64 - PPC | KVM_REG_PPC_MMCR1 | 64 - PPC | KVM_REG_PPC_MMCRA | 64 - PPC | KVM_REG_PPC_MMCR2 | 64 - PPC | KVM_REG_PPC_MMCRS | 64 - PPC | KVM_REG_PPC_SIAR | 64 - PPC | KVM_REG_PPC_SDAR | 64 - PPC | KVM_REG_PPC_SIER | 64 - PPC | KVM_REG_PPC_PMC1 | 32 - PPC | KVM_REG_PPC_PMC2 | 32 - PPC | KVM_REG_PPC_PMC3 | 32 - PPC | KVM_REG_PPC_PMC4 | 32 - PPC | KVM_REG_PPC_PMC5 | 32 - PPC | KVM_REG_PPC_PMC6 | 32 - PPC | KVM_REG_PPC_PMC7 | 32 - PPC | KVM_REG_PPC_PMC8 | 32 - PPC | KVM_REG_PPC_FPR0 | 64 + Arch | Register | Width (bits) + | | + PPC | KVM_REG_PPC_HIOR | 64 + PPC | KVM_REG_PPC_IAC1 | 64 + PPC | KVM_REG_PPC_IAC2 | 64 + PPC | KVM_REG_PPC_IAC3 | 64 + PPC | KVM_REG_PPC_IAC4 | 64 + PPC | KVM_REG_PPC_DAC1 | 64 + PPC | KVM_REG_PPC_DAC2 | 64 + PPC | KVM_REG_PPC_DABR | 64 + PPC | KVM_REG_PPC_DSCR | 64 + PPC | KVM_REG_PPC_PURR | 64 + PPC | KVM_REG_PPC_SPURR | 64 + PPC | KVM_REG_PPC_DAR | 64 + PPC | KVM_REG_PPC_DSISR | 32 + PPC | KVM_REG_PPC_AMR | 64 + PPC | KVM_REG_PPC_UAMOR | 64 + PPC | KVM_REG_PPC_MMCR0 | 64 + PPC | KVM_REG_PPC_MMCR1 | 64 + PPC | KVM_REG_PPC_MMCRA | 64 + PPC | KVM_REG_PPC_MMCR2 | 64 + PPC | KVM_REG_PPC_MMCRS | 64 + PPC | KVM_REG_PPC_SIAR | 64 + PPC | KVM_REG_PPC_SDAR | 64 + PPC | KVM_REG_PPC_SIER | 64 + PPC | KVM_REG_PPC_PMC1 | 32 + PPC | KVM_REG_PPC_PMC2 | 32 + PPC | KVM_REG_PPC_PMC3 | 32 + PPC | KVM_REG_PPC_PMC4 | 32 + PPC | KVM_REG_PPC_PMC5 | 32 + PPC | KVM_REG_PPC_PMC6 | 32 + PPC | KVM_REG_PPC_PMC7 | 32 + PPC | KVM_REG_PPC_PMC8 | 32 + PPC | KVM_REG_PPC_FPR0 | 64 ... - PPC | KVM_REG_PPC_FPR31 | 64 - PPC | KVM_REG_PPC_VR0 | 128 + PPC | KVM_REG_PPC_FPR31 | 64 + PPC | KVM_REG_PPC_VR0 | 128 ... - PPC | KVM_REG_PPC_VR31 | 128 - PPC | KVM_REG_PPC_VSR0 | 128 + PPC | KVM_REG_PPC_VR31 | 128 + PPC | KVM_REG_PPC_VSR0 | 128 ... - PPC | KVM_REG_PPC_VSR31 | 128 - PPC | KVM_REG_PPC_FPSCR | 64 - PPC | KVM_REG_PPC_VSCR | 32 - PPC | KVM_REG_PPC_VPA_ADDR | 64 - PPC | KVM_REG_PPC_VPA_SLB | 128 - PPC | KVM_REG_PPC_VPA_DTL | 128 - PPC | KVM_REG_PPC_EPCR | 32 - PPC | KVM_REG_PPC_EPR | 32 - PPC | KVM_REG_PPC_TCR | 32 - PPC | KVM_REG_PPC_TSR | 32 - PPC | KVM_REG_PPC_OR_TSR | 32 - PPC | KVM_REG_PPC_CLEAR_TSR | 32 - PPC | KVM_REG_PPC_MAS0 | 32 - PPC | KVM_REG_PPC_MAS1 | 32 - PPC | KVM_REG_PPC_MAS2 | 64 - PPC | KVM_REG_PPC_MAS7_3 | 64 - PPC | KVM_REG_PPC_MAS4 | 32 - PPC | KVM_REG_PPC_MAS6 | 32 - PPC | KVM_REG_PPC_MMUCFG | 32 - PPC | KVM_REG_PPC_TLB0CFG | 32 - PPC | KVM_REG_PPC_TLB1CFG | 32 - PPC | KVM_REG_PPC_TLB2CFG | 32 - PPC | KVM_REG_PPC_TLB3CFG | 32 - PPC | KVM_REG_PPC_TLB0PS | 32 - PPC | KVM_REG_PPC_TLB1PS | 32 - PPC | KVM_REG_PPC_TLB2PS | 32 - PPC | KVM_REG_PPC_TLB3PS | 32 - PPC | KVM_REG_PPC_EPTCFG | 32 - PPC | KVM_REG_PPC_ICP_STATE | 64 - PPC | KVM_REG_PPC_TB_OFFSET | 64 - PPC | KVM_REG_PPC_SPMC1 | 32 - PPC | KVM_REG_PPC_SPMC2 | 32 - PPC | KVM_REG_PPC_IAMR | 64 - PPC | KVM_REG_PPC_TFHAR | 64 - PPC | KVM_REG_PPC_TFIAR | 64 - PPC | KVM_REG_PPC_TEXASR | 64 - PPC | KVM_REG_PPC_FSCR | 64 - PPC | KVM_REG_PPC_PSPB | 32 - PPC | KVM_REG_PPC_EBBHR | 64 - PPC | KVM_REG_PPC_EBBRR | 64 - PPC | KVM_REG_PPC_BESCR | 64 - PPC | KVM_REG_PPC_TAR | 64 - PPC | KVM_REG_PPC_DPDES | 64 - PPC | KVM_REG_PPC_DAWR | 64 - PPC | KVM_REG_PPC_DAWRX | 64 - PPC | KVM_REG_PPC_CIABR | 64 - PPC | KVM_REG_PPC_IC | 64 - PPC | KVM_REG_PPC_VTB | 64 - PPC | KVM_REG_PPC_CSIGR | 64 - PPC | KVM_REG_PPC_TACR | 64 - PPC | KVM_REG_PPC_TCSCR | 64 - PPC | KVM_REG_PPC_PID | 64 - PPC | KVM_REG_PPC_ACOP | 64 - PPC | KVM_REG_PPC_VRSAVE | 32 - PPC | KVM_REG_PPC_LPCR | 64 - PPC | KVM_REG_PPC_PPR | 64 - PPC | KVM_REG_PPC_ARCH_COMPAT 32 - PPC | KVM_REG_PPC_DABRX | 32 - PPC | KVM_REG_PPC_WORT | 64 - PPC | KVM_REG_PPC_TM_GPR0 | 64 + PPC | KVM_REG_PPC_VSR31 | 128 + PPC | KVM_REG_PPC_FPSCR | 64 + PPC | KVM_REG_PPC_VSCR | 32 + PPC | KVM_REG_PPC_VPA_ADDR | 64 + PPC | KVM_REG_PPC_VPA_SLB | 128 + PPC | KVM_REG_PPC_VPA_DTL | 128 + PPC | KVM_REG_PPC_EPCR | 32 + PPC | KVM_REG_PPC_EPR | 32 + PPC | KVM_REG_PPC_TCR | 32 + PPC | KVM_REG_PPC_TSR | 32 + PPC | KVM_REG_PPC_OR_TSR | 32 + PPC | KVM_REG_PPC_CLEAR_TSR | 32 + PPC | KVM_REG_PPC_MAS0 | 32 + PPC | KVM_REG_PPC_MAS1 | 32 + PPC | KVM_REG_PPC_MAS2 | 64 + PPC | KVM_REG_PPC_MAS7_3 | 64 + PPC | KVM_REG_PPC_MAS4 | 32 + PPC | KVM_REG_PPC_MAS6 | 32 + PPC | KVM_REG_PPC_MMUCFG | 32 + PPC | KVM_REG_PPC_TLB0CFG | 32 + PPC | KVM_REG_PPC_TLB1CFG | 32 + PPC | KVM_REG_PPC_TLB2CFG | 32 + PPC | KVM_REG_PPC_TLB3CFG | 32 + PPC | KVM_REG_PPC_TLB0PS | 32 + PPC | KVM_REG_PPC_TLB1PS | 32 + PPC | KVM_REG_PPC_TLB2PS | 32 + PPC | KVM_REG_PPC_TLB3PS | 32 + PPC | KVM_REG_PPC_EPTCFG | 32 + PPC | KVM_REG_PPC_ICP_STATE | 64 + PPC | KVM_REG_PPC_TB_OFFSET | 64 + PPC | KVM_REG_PPC_SPMC1 | 32 + PPC | KVM_REG_PPC_SPMC2 | 32 + PPC | KVM_REG_PPC_IAMR | 64 + PPC | KVM_REG_PPC_TFHAR | 64 + PPC | KVM_REG_PPC_TFIAR | 64 + PPC | KVM_REG_PPC_TEXASR | 64 + PPC | KVM_REG_PPC_FSCR | 64 + PPC | KVM_REG_PPC_PSPB | 32 + PPC | KVM_REG_PPC_EBBHR | 64 + PPC | KVM_REG_PPC_EBBRR | 64 + PPC | KVM_REG_PPC_BESCR | 64 + PPC | KVM_REG_PPC_TAR | 64 + PPC | KVM_REG_PPC_DPDES | 64 + PPC | KVM_REG_PPC_DAWR | 64 + PPC | KVM_REG_PPC_DAWRX | 64 + PPC | KVM_REG_PPC_CIABR | 64 + PPC | KVM_REG_PPC_IC | 64 + PPC | KVM_REG_PPC_VTB | 64 + PPC | KVM_REG_PPC_CSIGR | 64 + PPC | KVM_REG_PPC_TACR | 64 + PPC | KVM_REG_PPC_TCSCR | 64 + PPC | KVM_REG_PPC_PID | 64 + PPC | KVM_REG_PPC_ACOP | 64 + PPC | KVM_REG_PPC_VRSAVE | 32 + PPC | KVM_REG_PPC_LPCR | 64 + PPC | KVM_REG_PPC_PPR | 64 + PPC | KVM_REG_PPC_ARCH_COMPAT | 32 + PPC | KVM_REG_PPC_DABRX | 32 + PPC | KVM_REG_PPC_WORT | 64 + PPC | KVM_REG_PPC_TM_GPR0 | 64 ... - PPC | KVM_REG_PPC_TM_GPR31 | 64 - PPC | KVM_REG_PPC_TM_VSR0 | 128 + PPC | KVM_REG_PPC_TM_GPR31 | 64 + PPC | KVM_REG_PPC_TM_VSR0 | 128 ... - PPC | KVM_REG_PPC_TM_VSR63 | 128 - PPC | KVM_REG_PPC_TM_CR | 64 - PPC | KVM_REG_PPC_TM_LR | 64 - PPC | KVM_REG_PPC_TM_CTR | 64 - PPC | KVM_REG_PPC_TM_FPSCR | 64 - PPC | KVM_REG_PPC_TM_AMR | 64 - PPC | KVM_REG_PPC_TM_PPR | 64 - PPC | KVM_REG_PPC_TM_VRSAVE | 64 - PPC | KVM_REG_PPC_TM_VSCR | 32 - PPC | KVM_REG_PPC_TM_DSCR | 64 - PPC | KVM_REG_PPC_TM_TAR | 64 + PPC | KVM_REG_PPC_TM_VSR63 | 128 + PPC | KVM_REG_PPC_TM_CR | 64 + PPC | KVM_REG_PPC_TM_LR | 64 + PPC | KVM_REG_PPC_TM_CTR | 64 + PPC | KVM_REG_PPC_TM_FPSCR | 64 + PPC | KVM_REG_PPC_TM_AMR | 64 + PPC | KVM_REG_PPC_TM_PPR | 64 + PPC | KVM_REG_PPC_TM_VRSAVE | 64 + PPC | KVM_REG_PPC_TM_VSCR | 32 + PPC | KVM_REG_PPC_TM_DSCR | 64 + PPC | KVM_REG_PPC_TM_TAR | 64 + | | + MIPS | KVM_REG_MIPS_R0 | 64 + ... + MIPS | KVM_REG_MIPS_R31 | 64 + MIPS | KVM_REG_MIPS_HI | 64 + MIPS | KVM_REG_MIPS_LO | 64 + MIPS | KVM_REG_MIPS_PC | 64 + MIPS | KVM_REG_MIPS_CP0_INDEX | 32 + MIPS | KVM_REG_MIPS_CP0_CONTEXT | 64 + MIPS | KVM_REG_MIPS_CP0_USERLOCAL | 64 + MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32 + MIPS | KVM_REG_MIPS_CP0_WIRED | 32 + MIPS | KVM_REG_MIPS_CP0_HWRENA | 32 + MIPS | KVM_REG_MIPS_CP0_BADVADDR | 64 + MIPS | KVM_REG_MIPS_CP0_COUNT | 32 + MIPS | KVM_REG_MIPS_CP0_ENTRYHI | 64 + MIPS | KVM_REG_MIPS_CP0_COMPARE | 32 + MIPS | KVM_REG_MIPS_CP0_STATUS | 32 + MIPS | KVM_REG_MIPS_CP0_CAUSE | 32 + MIPS | KVM_REG_MIPS_CP0_EPC | 64 + MIPS | KVM_REG_MIPS_CP0_CONFIG | 32 + MIPS | KVM_REG_MIPS_CP0_CONFIG1 | 32 + MIPS | KVM_REG_MIPS_CP0_CONFIG2 | 32 + MIPS | KVM_REG_MIPS_CP0_CONFIG3 | 32 + MIPS | KVM_REG_MIPS_CP0_CONFIG7 | 32 + MIPS | KVM_REG_MIPS_CP0_ERROREPC | 64 + MIPS | KVM_REG_MIPS_COUNT_CTL | 64 + MIPS | KVM_REG_MIPS_COUNT_RESUME | 64 + MIPS | KVM_REG_MIPS_COUNT_HZ | 64 ARM registers are mapped using the lower 32 bits. The upper 16 of that is the register group type, or coprocessor number: @@ -1937,6 +1980,22 @@ arm64 CCSIDR registers are demultiplexed by CSSELR value: arm64 system registers have the following id bit patterns: 0x6030 0000 0013 <op0:2> <op1:3> <crn:4> <crm:4> <op2:3> + +MIPS registers are mapped using the lower 32 bits. The upper 16 of that is +the register group type: + +MIPS core registers (see above) have the following id bit patterns: + 0x7030 0000 0000 <reg:16> + +MIPS CP0 registers (see KVM_REG_MIPS_CP0_* above) have the following id bit +patterns depending on whether they're 32-bit or 64-bit registers: + 0x7020 0000 0001 00 <reg:5> <sel:3> (32-bit) + 0x7030 0000 0001 00 <reg:5> <sel:3> (64-bit) + +MIPS KVM control registers (see above) have the following id bit patterns: + 0x7030 0000 0002 <reg:16> + + 4.69 KVM_GET_ONE_REG Capability: KVM_CAP_ONE_REG @@ -2424,7 +2483,7 @@ in VCPU matching underlying host. 4.84 KVM_GET_REG_LIST Capability: basic -Architectures: arm, arm64 +Architectures: arm, arm64, mips Type: vcpu ioctl Parameters: struct kvm_reg_list (in/out) Returns: 0 on success; -1 on error diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index ffa2671a7f2f..eb181178fe0b 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -37,6 +37,7 @@ struct x86_instruction_info { u8 modrm_reg; /* index of register used */ u8 modrm_rm; /* rm part of modrm */ u64 src_val; /* value of source operand */ + u64 dst_val; /* value of destination operand */ u8 src_bytes; /* size of source operand */ u8 dst_bytes; /* size of destination operand */ u8 ad_bytes; /* size of src/dst address */ @@ -232,7 +233,7 @@ struct operand { union { unsigned long val; u64 val64; - char valptr[sizeof(unsigned long) + 2]; + char valptr[sizeof(sse128_t)]; sse128_t vec_val; u64 mm_val; void *data; @@ -241,8 +242,8 @@ struct operand { struct fetch_cache { u8 data[15]; - unsigned long start; - unsigned long end; + u8 *ptr; + u8 *end; }; struct read_cache { @@ -287,30 +288,36 @@ struct x86_emulate_ctxt { u8 opcode_len; u8 b; u8 intercept; - u8 lock_prefix; - u8 rep_prefix; u8 op_bytes; u8 ad_bytes; - u8 rex_prefix; struct operand src; struct operand src2; struct operand dst; - bool has_seg_override; - u8 seg_override; - u64 d; int (*execute)(struct x86_emulate_ctxt *ctxt); int (*check_perm)(struct x86_emulate_ctxt *ctxt); + /* + * The following six fields are cleared together, + * the rest are initialized unconditionally in x86_decode_insn + * or elsewhere + */ + bool rip_relative; + u8 rex_prefix; + u8 lock_prefix; + u8 rep_prefix; + /* bitmaps of registers in _regs[] that can be read */ + u32 regs_valid; + /* bitmaps of registers in _regs[] that have been written */ + u32 regs_dirty; /* modrm */ u8 modrm; u8 modrm_mod; u8 modrm_reg; u8 modrm_rm; u8 modrm_seg; - bool rip_relative; + u8 seg_override; + u64 d; unsigned long _eip; struct operand memop; - u32 regs_valid; /* bitmaps of registers in _regs[] that can be read */ - u32 regs_dirty; /* bitmaps of registers in _regs[] that have been written */ /* Fields above regs are cleared together. */ unsigned long _regs[NR_VCPU_REGS]; struct operand *memopp; @@ -408,6 +415,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_OK 0 #define EMULATION_RESTART 1 #define EMULATION_INTERCEPTED 2 +void init_decode_cache(struct x86_emulate_ctxt *ctxt); int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int idt_index, int reason, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 63e020be3da7..a84eaf7ba33f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -152,14 +152,16 @@ enum { #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) -#define DR6_FIXED_1 0xffff0ff0 -#define DR6_VOLATILE 0x0000e00f +#define DR6_RTM (1 << 16) +#define DR6_FIXED_1 0xfffe0ff0 +#define DR6_INIT 0xffff0ff0 +#define DR6_VOLATILE 0x0001e00f #define DR7_BP_EN_MASK 0x000000ff #define DR7_GE (1 << 9) #define DR7_GD (1 << 13) #define DR7_FIXED_1 0x00000400 -#define DR7_VOLATILE 0xffff23ff +#define DR7_VOLATILE 0xffff2bff /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 @@ -448,7 +450,7 @@ struct kvm_vcpu_arch { u64 tsc_offset_adjustment; u64 this_tsc_nsec; u64 this_tsc_write; - u8 this_tsc_generation; + u64 this_tsc_generation; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; @@ -591,7 +593,7 @@ struct kvm_arch { u64 cur_tsc_nsec; u64 cur_tsc_write; u64 cur_tsc_offset; - u8 cur_tsc_generation; + u64 cur_tsc_generation; int nr_vcpus_matched_tsc; spinlock_t pvclock_gtod_sync_lock; @@ -717,7 +719,7 @@ struct kvm_x86_ops { int (*handle_exit)(struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); - u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); + u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); void (*set_irq)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index d3a87780c70b..d7dcef58aefa 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -23,7 +23,10 @@ #define GP_VECTOR 13 #define PF_VECTOR 14 #define MF_VECTOR 16 +#define AC_VECTOR 17 #define MC_VECTOR 18 +#define XM_VECTOR 19 +#define VE_VECTOR 20 /* Select x86 specific features in <linux/kvm.h> */ #define __KVM_HAVE_PIT diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index f9087315e0cd..a5380590ab0e 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -95,4 +95,12 @@ static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); return best && (best->edx & bit(X86_FEATURE_GBPAGES)); } + +static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_RTM)); +} #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 84dc4ba0364d..8d415563e05b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -162,6 +162,10 @@ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ #define NoMod ((u64)1 << 47) /* Mod field is ignored */ +#define Intercept ((u64)1 << 48) /* Has valid intercept field */ +#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ +#define NoBigReal ((u64)1 << 50) /* No big real mode */ +#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -426,6 +430,7 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, .modrm_reg = ctxt->modrm_reg, .modrm_rm = ctxt->modrm_rm, .src_val = ctxt->src.val64, + .dst_val = ctxt->dst.val64, .src_bytes = ctxt->src.bytes, .dst_bytes = ctxt->dst.bytes, .ad_bytes = ctxt->ad_bytes, @@ -511,12 +516,6 @@ static u32 desc_limit_scaled(struct desc_struct *desc) return desc->g ? (limit << 12) | 0xfff : limit; } -static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg) -{ - ctxt->has_seg_override = true; - ctxt->seg_override = seg; -} - static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) { if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) @@ -525,14 +524,6 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) return ctxt->ops->get_cached_segment_base(ctxt, seg); } -static unsigned seg_override(struct x86_emulate_ctxt *ctxt) -{ - if (!ctxt->has_seg_override) - return 0; - - return ctxt->seg_override; -} - static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, u32 error, bool valid) { @@ -651,7 +642,12 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, if (!fetch && (desc.type & 8) && !(desc.type & 2)) goto bad; lim = desc_limit_scaled(&desc); - if ((desc.type & 8) || !(desc.type & 4)) { + if ((ctxt->mode == X86EMUL_MODE_REAL) && !fetch && + (ctxt->d & NoBigReal)) { + /* la is between zero and 0xffff */ + if (la > 0xffff || (u32)(la + size - 1) > 0xffff) + goto bad; + } else if ((desc.type & 8) || !(desc.type & 4)) { /* expand-up segment */ if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) goto bad; @@ -716,68 +712,71 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, } /* - * Fetch the next byte of the instruction being emulated which is pointed to - * by ctxt->_eip, then increment ctxt->_eip. - * - * Also prefetch the remaining bytes of the instruction without crossing page + * Prefetch the remaining bytes of the instruction without crossing page * boundary if they are not in fetch_cache yet. */ -static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest) +static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) { - struct fetch_cache *fc = &ctxt->fetch; int rc; - int size, cur_size; - - if (ctxt->_eip == fc->end) { - unsigned long linear; - struct segmented_address addr = { .seg = VCPU_SREG_CS, - .ea = ctxt->_eip }; - cur_size = fc->end - fc->start; - size = min(15UL - cur_size, - PAGE_SIZE - offset_in_page(ctxt->_eip)); - rc = __linearize(ctxt, addr, size, false, true, &linear); - if (unlikely(rc != X86EMUL_CONTINUE)) - return rc; - rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size, - size, &ctxt->exception); - if (unlikely(rc != X86EMUL_CONTINUE)) - return rc; - fc->end += size; - } - *dest = fc->data[ctxt->_eip - fc->start]; - ctxt->_eip++; - return X86EMUL_CONTINUE; -} + unsigned size; + unsigned long linear; + int cur_size = ctxt->fetch.end - ctxt->fetch.data; + struct segmented_address addr = { .seg = VCPU_SREG_CS, + .ea = ctxt->eip + cur_size }; + + size = 15UL ^ cur_size; + rc = __linearize(ctxt, addr, size, false, true, &linear); + if (unlikely(rc != X86EMUL_CONTINUE)) + return rc; -static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, - void *dest, unsigned size) -{ - int rc; + size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear)); - /* x86 instructions are limited to 15 bytes. */ - if (unlikely(ctxt->_eip + size - ctxt->eip > 15)) + /* + * One instruction can only straddle two pages, + * and one has been loaded at the beginning of + * x86_decode_insn. So, if not enough bytes + * still, we must have hit the 15-byte boundary. + */ + if (unlikely(size < op_size)) return X86EMUL_UNHANDLEABLE; - while (size--) { - rc = do_insn_fetch_byte(ctxt, dest++); - if (rc != X86EMUL_CONTINUE) - return rc; - } + rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end, + size, &ctxt->exception); + if (unlikely(rc != X86EMUL_CONTINUE)) + return rc; + ctxt->fetch.end += size; return X86EMUL_CONTINUE; } +static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, + unsigned size) +{ + if (unlikely(ctxt->fetch.end - ctxt->fetch.ptr < size)) + return __do_insn_fetch_bytes(ctxt, size); + else + return X86EMUL_CONTINUE; +} + /* Fetch next part of the instruction being emulated. */ #define insn_fetch(_type, _ctxt) \ -({ unsigned long _x; \ - rc = do_insn_fetch(_ctxt, &_x, sizeof(_type)); \ +({ _type _x; \ + \ + rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ - (_type)_x; \ + ctxt->_eip += sizeof(_type); \ + _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \ + ctxt->fetch.ptr += sizeof(_type); \ + _x; \ }) #define insn_fetch_arr(_arr, _size, _ctxt) \ -({ rc = do_insn_fetch(_ctxt, _arr, (_size)); \ +({ \ + rc = do_insn_fetch_bytes(_ctxt, _size); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ + ctxt->_eip += (_size); \ + memcpy(_arr, ctxt->fetch.ptr, _size); \ + ctxt->fetch.ptr += (_size); \ }) /* @@ -1063,19 +1062,17 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, struct operand *op) { u8 sib; - int index_reg = 0, base_reg = 0, scale; + int index_reg, base_reg, scale; int rc = X86EMUL_CONTINUE; ulong modrm_ea = 0; - if (ctxt->rex_prefix) { - ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1; /* REX.R */ - index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */ - ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ - } + ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */ + index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */ + base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */ - ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; + ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; - ctxt->modrm_rm |= (ctxt->modrm & 0x07); + ctxt->modrm_rm = base_reg | (ctxt->modrm & 0x07); ctxt->modrm_seg = VCPU_SREG_DS; if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) { @@ -1190,6 +1187,9 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } } op->addr.mem.ea = modrm_ea; + if (ctxt->ad_bytes != 8) + ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea; + done: return rc; } @@ -1585,34 +1585,28 @@ static void write_register_operand(struct operand *op) static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) { - int rc; - switch (op->type) { case OP_REG: write_register_operand(op); break; case OP_MEM: if (ctxt->lock_prefix) - rc = segmented_cmpxchg(ctxt, + return segmented_cmpxchg(ctxt, + op->addr.mem, + &op->orig_val, + &op->val, + op->bytes); + else + return segmented_write(ctxt, op->addr.mem, - &op->orig_val, &op->val, op->bytes); - else - rc = segmented_write(ctxt, - op->addr.mem, - &op->val, - op->bytes); - if (rc != X86EMUL_CONTINUE) - return rc; break; case OP_MEM_STR: - rc = segmented_write(ctxt, - op->addr.mem, - op->data, - op->bytes * op->count); - if (rc != X86EMUL_CONTINUE) - return rc; + return segmented_write(ctxt, + op->addr.mem, + op->data, + op->bytes * op->count); break; case OP_XMM: write_sse_reg(ctxt, &op->vec_val, op->addr.xmm); @@ -1681,7 +1675,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, return rc; change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF - | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; + | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_AC | EFLG_ID; switch(ctxt->mode) { case X86EMUL_MODE_PROT64: @@ -2217,7 +2211,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip; if (efer & EFER_LMA) { #ifdef CONFIG_X86_64 - *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags & ~EFLG_RF; + *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags; ops->get_msr(ctxt, ctxt->mode == X86EMUL_MODE_PROT64 ? @@ -2225,14 +2219,14 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ctxt->_eip = msr_data; ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); - ctxt->eflags &= ~(msr_data | EFLG_RF); + ctxt->eflags &= ~msr_data; #endif } else { /* legacy mode */ ops->get_msr(ctxt, MSR_STAR, &msr_data); ctxt->_eip = (u32)msr_data; - ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + ctxt->eflags &= ~(EFLG_VM | EFLG_IF); } return X86EMUL_CONTINUE; @@ -2281,7 +2275,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) break; } - ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + ctxt->eflags &= ~(EFLG_VM | EFLG_IF); cs_sel = (u16)msr_data; cs_sel &= ~SELECTOR_RPL_MASK; ss_sel = cs_sel + 8; @@ -2987,7 +2981,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt) static int em_mov(struct x86_emulate_ctxt *ctxt) { - memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes); + memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr)); return X86EMUL_CONTINUE; } @@ -3545,9 +3539,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) } #define D(_y) { .flags = (_y) } -#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } -#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ - .check_perm = (_p) } +#define DI(_y, _i) { .flags = (_y)|Intercept, .intercept = x86_intercept_##_i } +#define DIP(_y, _i, _p) { .flags = (_y)|Intercept|CheckPerm, \ + .intercept = x86_intercept_##_i, .check_perm = (_p) } #define N D(NotImpl) #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } @@ -3556,10 +3550,10 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } #define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) } #define II(_f, _e, _i) \ - { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } + { .flags = (_f)|Intercept, .u.execute = (_e), .intercept = x86_intercept_##_i } #define IIP(_f, _e, _i, _p) \ - { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \ - .check_perm = (_p) } + { .flags = (_f)|Intercept|CheckPerm, .u.execute = (_e), \ + .intercept = x86_intercept_##_i, .check_perm = (_p) } #define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) } #define D2bv(_f) D((_f) | ByteOp), D(_f) @@ -4174,7 +4168,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI)); - op->addr.mem.seg = seg_override(ctxt); + op->addr.mem.seg = ctxt->seg_override; op->val = 0; op->count = 1; break; @@ -4185,7 +4179,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, register_address(ctxt, reg_read(ctxt, VCPU_REGS_RBX) + (reg_read(ctxt, VCPU_REGS_RAX) & 0xff)); - op->addr.mem.seg = seg_override(ctxt); + op->addr.mem.seg = ctxt->seg_override; op->val = 0; break; case OpImmFAddr: @@ -4232,16 +4226,22 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) int mode = ctxt->mode; int def_op_bytes, def_ad_bytes, goffset, simd_prefix; bool op_prefix = false; + bool has_seg_override = false; struct opcode opcode; ctxt->memop.type = OP_NONE; ctxt->memopp = NULL; ctxt->_eip = ctxt->eip; - ctxt->fetch.start = ctxt->_eip; - ctxt->fetch.end = ctxt->fetch.start + insn_len; + ctxt->fetch.ptr = ctxt->fetch.data; + ctxt->fetch.end = ctxt->fetch.data + insn_len; ctxt->opcode_len = 1; if (insn_len > 0) memcpy(ctxt->fetch.data, insn, insn_len); + else { + rc = __do_insn_fetch_bytes(ctxt, 1); + if (rc != X86EMUL_CONTINUE) + return rc; + } switch (mode) { case X86EMUL_MODE_REAL: @@ -4285,11 +4285,13 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) case 0x2e: /* CS override */ case 0x36: /* SS override */ case 0x3e: /* DS override */ - set_seg_override(ctxt, (ctxt->b >> 3) & 3); + has_seg_override = true; + ctxt->seg_override = (ctxt->b >> 3) & 3; break; case 0x64: /* FS override */ case 0x65: /* GS override */ - set_seg_override(ctxt, ctxt->b & 7); + has_seg_override = true; + ctxt->seg_override = ctxt->b & 7; break; case 0x40 ... 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) @@ -4387,49 +4389,59 @@ done_prefixes: ctxt->d |= opcode.flags; } - ctxt->execute = opcode.u.execute; - ctxt->check_perm = opcode.check_perm; - ctxt->intercept = opcode.intercept; - /* Unrecognised? */ - if (ctxt->d == 0 || (ctxt->d & NotImpl)) + if (ctxt->d == 0) return EMULATION_FAILED; - if (!(ctxt->d & EmulateOnUD) && ctxt->ud) - return EMULATION_FAILED; + ctxt->execute = opcode.u.execute; - if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) - ctxt->op_bytes = 8; + if (unlikely(ctxt->d & + (NotImpl|EmulateOnUD|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm))) { + /* + * These are copied unconditionally here, and checked unconditionally + * in x86_emulate_insn. + */ + ctxt->check_perm = opcode.check_perm; + ctxt->intercept = opcode.intercept; + + if (ctxt->d & NotImpl) + return EMULATION_FAILED; + + if (!(ctxt->d & EmulateOnUD) && ctxt->ud) + return EMULATION_FAILED; - if (ctxt->d & Op3264) { - if (mode == X86EMUL_MODE_PROT64) + if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) ctxt->op_bytes = 8; - else - ctxt->op_bytes = 4; - } - if (ctxt->d & Sse) - ctxt->op_bytes = 16; - else if (ctxt->d & Mmx) - ctxt->op_bytes = 8; + if (ctxt->d & Op3264) { + if (mode == X86EMUL_MODE_PROT64) + ctxt->op_bytes = 8; + else + ctxt->op_bytes = 4; + } + + if (ctxt->d & Sse) + ctxt->op_bytes = 16; + else if (ctxt->d & Mmx) + ctxt->op_bytes = 8; + } /* ModRM and SIB bytes. */ if (ctxt->d & ModRM) { rc = decode_modrm(ctxt, &ctxt->memop); - if (!ctxt->has_seg_override) - set_seg_override(ctxt, ctxt->modrm_seg); + if (!has_seg_override) { + has_seg_override = true; + ctxt->seg_override = ctxt->modrm_seg; + } } else if (ctxt->d & MemAbs) rc = decode_abs(ctxt, &ctxt->memop); if (rc != X86EMUL_CONTINUE) goto done; - if (!ctxt->has_seg_override) - set_seg_override(ctxt, VCPU_SREG_DS); - - ctxt->memop.addr.mem.seg = seg_override(ctxt); + if (!has_seg_override) + ctxt->seg_override = VCPU_SREG_DS; - if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8) - ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea; + ctxt->memop.addr.mem.seg = ctxt->seg_override; /* * Decode and fetch the source operand: register, memory @@ -4451,7 +4463,7 @@ done_prefixes: rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); done: - if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative) + if (ctxt->rip_relative) ctxt->memopp->addr.mem.ea += ctxt->_eip; return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; @@ -4526,6 +4538,16 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) return X86EMUL_CONTINUE; } +void init_decode_cache(struct x86_emulate_ctxt *ctxt) +{ + memset(&ctxt->rip_relative, 0, + (void *)&ctxt->modrm - (void *)&ctxt->rip_relative); + + ctxt->io_read.pos = 0; + ctxt->io_read.end = 0; + ctxt->mem_read.end = 0; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { const struct x86_emulate_ops *ops = ctxt->ops; @@ -4534,12 +4556,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) ctxt->mem_read.pos = 0; - if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || - (ctxt->d & Undefined)) { - rc = emulate_ud(ctxt); - goto done; - } - /* LOCK prefix is allowed only with some instructions */ if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) { rc = emulate_ud(ctxt); @@ -4551,69 +4567,82 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) - || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { - rc = emulate_ud(ctxt); - goto done; - } - - if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { - rc = emulate_nm(ctxt); - goto done; - } + if (unlikely(ctxt->d & + (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { + if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || + (ctxt->d & Undefined)) { + rc = emulate_ud(ctxt); + goto done; + } - if (ctxt->d & Mmx) { - rc = flush_pending_x87_faults(ctxt); - if (rc != X86EMUL_CONTINUE) + if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) + || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { + rc = emulate_ud(ctxt); goto done; - /* - * Now that we know the fpu is exception safe, we can fetch - * operands from it. - */ - fetch_possible_mmx_operand(ctxt, &ctxt->src); - fetch_possible_mmx_operand(ctxt, &ctxt->src2); - if (!(ctxt->d & Mov)) - fetch_possible_mmx_operand(ctxt, &ctxt->dst); - } + } - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { - rc = emulator_check_intercept(ctxt, ctxt->intercept, - X86_ICPT_PRE_EXCEPT); - if (rc != X86EMUL_CONTINUE) + if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { + rc = emulate_nm(ctxt); goto done; - } + } - /* Privileged instruction can be executed only in CPL=0 */ - if ((ctxt->d & Priv) && ops->cpl(ctxt)) { - rc = emulate_gp(ctxt, 0); - goto done; - } + if (ctxt->d & Mmx) { + rc = flush_pending_x87_faults(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + /* + * Now that we know the fpu is exception safe, we can fetch + * operands from it. + */ + fetch_possible_mmx_operand(ctxt, &ctxt->src); + fetch_possible_mmx_operand(ctxt, &ctxt->src2); + if (!(ctxt->d & Mov)) + fetch_possible_mmx_operand(ctxt, &ctxt->dst); + } - /* Instruction can only be executed in protected mode */ - if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { - rc = emulate_ud(ctxt); - goto done; - } + if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + rc = emulator_check_intercept(ctxt, ctxt->intercept, + X86_ICPT_PRE_EXCEPT); + if (rc != X86EMUL_CONTINUE) + goto done; + } - /* Do instruction specific permission checks */ - if (ctxt->check_perm) { - rc = ctxt->check_perm(ctxt); - if (rc != X86EMUL_CONTINUE) + /* Privileged instruction can be executed only in CPL=0 */ + if ((ctxt->d & Priv) && ops->cpl(ctxt)) { + if (ctxt->d & PrivUD) + rc = emulate_ud(ctxt); + else + rc = emulate_gp(ctxt, 0); goto done; - } + } - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { - rc = emulator_check_intercept(ctxt, ctxt->intercept, - X86_ICPT_POST_EXCEPT); - if (rc != X86EMUL_CONTINUE) + /* Instruction can only be executed in protected mode */ + if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { + rc = emulate_ud(ctxt); goto done; - } + } - if (ctxt->rep_prefix && (ctxt->d & String)) { - /* All REP prefixes have the same first termination condition */ - if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { - ctxt->eip = ctxt->_eip; - goto done; + /* Do instruction specific permission checks */ + if (ctxt->d & CheckPerm) { + rc = ctxt->check_perm(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + } + + if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { + rc = emulator_check_intercept(ctxt, ctxt->intercept, + X86_ICPT_POST_EXCEPT); + if (rc != X86EMUL_CONTINUE) + goto done; + } + + if (ctxt->rep_prefix && (ctxt->d & String)) { + /* All REP prefixes have the same first termination condition */ + if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { + ctxt->eip = ctxt->_eip; + ctxt->eflags &= ~EFLG_RF; + goto done; + } } } @@ -4647,13 +4676,15 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { + if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) goto done; } + ctxt->eflags &= ~EFLG_RF; + if (ctxt->execute) { if (ctxt->d & Fastop) { void (*fop)(struct fastop *) = (void *)ctxt->execute; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 006911858174..3855103f71fd 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1451,7 +1451,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; - apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" + apic_debug("%s: vcpu=%p, id=%d, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, vcpu, kvm_apic_id(apic), vcpu->arch.apic_base, apic->base_address); @@ -1895,7 +1895,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; - pr_debug("vcpu %d received sipi with vector # %x\n", + apic_debug("vcpu %d received sipi with vector # %x\n", vcpu->vcpu_id, sipi_vector); kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b5e994ad0135..ddf742768ecf 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -486,14 +486,14 @@ static int is_external_interrupt(u32 info) return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); } -static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u32 ret = 0; if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) - ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; - return ret & mask; + ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; + return ret; } static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) @@ -1415,7 +1415,16 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; - var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; + + /* + * AMD CPUs circa 2014 track the G bit for all segments except CS. + * However, the SVM spec states that the G bit is not observed by the + * CPU, and some VMware virtual CPUs drop the G bit for all segments. + * So let's synthesize a legal G bit for all segments, this helps + * running KVM nested. It also helps cross-vendor migration, because + * Intel's vmentry has a check on the 'G' bit. + */ + var->g = s->limit > 0xfffff; /* * AMD's VMCB does not have an explicit unusable field, so emulate it @@ -1424,14 +1433,6 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->unusable = !var->present || (var->type == 0); switch (seg) { - case VCPU_SREG_CS: - /* - * SVM always stores 0 for the 'G' bit in the CS selector in - * the VMCB on a VMEXIT. This hurts cross-vendor migration: - * Intel's VMENTRY has a check on the 'G' bit. - */ - var->g = s->limit > 0xfffff; - break; case VCPU_SREG_TR: /* * Work around a bug where the busy flag in the tr selector @@ -2116,22 +2117,27 @@ static void nested_svm_unmap(struct page *page) static int nested_svm_intercept_ioio(struct vcpu_svm *svm) { - unsigned port; - u8 val, bit; + unsigned port, size, iopm_len; + u16 val, mask; + u8 start_bit; u64 gpa; if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT))) return NESTED_EXIT_HOST; port = svm->vmcb->control.exit_info_1 >> 16; + size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >> + SVM_IOIO_SIZE_SHIFT; gpa = svm->nested.vmcb_iopm + (port / 8); - bit = port % 8; - val = 0; + start_bit = port % 8; + iopm_len = (start_bit + size > 8) ? 2 : 1; + mask = (0xf >> (4 - size)) << start_bit; + val = 0; - if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1)) - val &= (1 << bit); + if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len)) + return NESTED_EXIT_DONE; - return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; + return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) @@ -4205,7 +4211,8 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, if (info->intercept == x86_intercept_cr_write) icpt_info.exit_code += info->modrm_reg; - if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0) + if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 || + info->intercept == x86_intercept_clts) break; intercept = svm->nested.intercept; @@ -4250,14 +4257,14 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, u64 exit_info; u32 bytes; - exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16; - if (info->intercept == x86_intercept_in || info->intercept == x86_intercept_ins) { - exit_info |= SVM_IOIO_TYPE_MASK; - bytes = info->src_bytes; - } else { + exit_info = ((info->src_val & 0xffff) << 16) | + SVM_IOIO_TYPE_MASK; bytes = info->dst_bytes; + } else { + exit_info = (info->dst_val & 0xffff) << 16; + bytes = info->src_bytes; } if (info->intercept == x86_intercept_outs || diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 33574c95220d..e850a7d332be 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -721,10 +721,10 @@ TRACE_EVENT(kvm_emulate_insn, ), TP_fast_assign( - __entry->rip = vcpu->arch.emulate_ctxt.fetch.start; __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); - __entry->len = vcpu->arch.emulate_ctxt._eip - - vcpu->arch.emulate_ctxt.fetch.start; + __entry->len = vcpu->arch.emulate_ctxt.fetch.ptr + - vcpu->arch.emulate_ctxt.fetch.data; + __entry->rip = vcpu->arch.emulate_ctxt._eip - __entry->len; memcpy(__entry->insn, vcpu->arch.emulate_ctxt.fetch.data, 15); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8748c2e19ed6..fd24f68378a7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1943,7 +1943,7 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); } -static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); int ret = 0; @@ -1953,7 +1953,7 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) if (interruptibility & GUEST_INTR_STATE_MOV_SS) ret |= KVM_X86_SHADOW_INT_MOV_SS; - return ret & mask; + return ret; } static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) @@ -3672,7 +3672,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); out: - vmx->emulation_required |= emulation_required(vcpu); + vmx->emulation_required = emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -4892,7 +4892,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= dr6; + vcpu->arch.dr6 |= dr6 | DR6_RTM; if (!(dr6 & ~DR6_RESERVED)) /* icebp */ skip_emulated_instruction(vcpu); @@ -5151,7 +5151,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) return 0; } else { vcpu->arch.dr7 &= ~DR7_GD; - vcpu->arch.dr6 |= DR6_BD; + vcpu->arch.dr6 |= DR6_BD | DR6_RTM; vmcs_writel(GUEST_DR7, vcpu->arch.dr7); kvm_queue_exception(vcpu, DB_VECTOR); return 1; @@ -5640,7 +5640,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; - while (!guest_state_valid(vcpu) && count-- != 0) { + while (vmx->emulation_required && count-- != 0) { if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); @@ -5674,7 +5674,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) schedule(); } - vmx->emulation_required = emulation_required(vcpu); out: return ret; } @@ -5773,22 +5772,27 @@ static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) /* * Free all VMCSs saved for this vcpu, except the one pointed by - * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one - * currently used, if running L2), and vmcs01 when running L2. + * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs + * must be &vmx->vmcs01. */ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) { struct vmcs02_list *item, *n; + + WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { - if (vmx->loaded_vmcs != &item->vmcs02) - free_loaded_vmcs(&item->vmcs02); + /* + * Something will leak if the above WARN triggers. Better than + * a use-after-free. + */ + if (vmx->loaded_vmcs == &item->vmcs02) + continue; + + free_loaded_vmcs(&item->vmcs02); list_del(&item->list); kfree(item); + vmx->nested.vmcs02_num--; } - vmx->nested.vmcs02_num = 0; - - if (vmx->loaded_vmcs != &vmx->vmcs01) - free_loaded_vmcs(&vmx->vmcs01); } /* @@ -6105,20 +6109,27 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { u32 exec_control; + if (vmx->nested.current_vmptr == -1ull) + return; + + /* current_vmptr and current_vmcs12 are always set/reset together */ + if (WARN_ON(vmx->nested.current_vmcs12 == NULL)) + return; + if (enable_shadow_vmcs) { - if (vmx->nested.current_vmcs12 != NULL) { - /* copy to memory all shadowed fields in case - they were modified */ - copy_shadow_to_vmcs12(vmx); - vmx->nested.sync_shadow_vmcs = false; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); - vmcs_write64(VMCS_LINK_POINTER, -1ull); - } + /* copy to memory all shadowed fields in case + they were modified */ + copy_shadow_to_vmcs12(vmx); + vmx->nested.sync_shadow_vmcs = false; + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_write64(VMCS_LINK_POINTER, -1ull); } kunmap(vmx->nested.current_vmcs12_page); nested_release_page(vmx->nested.current_vmcs12_page); + vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmcs12 = NULL; } /* @@ -6129,12 +6140,9 @@ static void free_nested(struct vcpu_vmx *vmx) { if (!vmx->nested.vmxon) return; + vmx->nested.vmxon = false; - if (vmx->nested.current_vmptr != -1ull) { - nested_release_vmcs12(vmx); - vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; - } + nested_release_vmcs12(vmx); if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); /* Unpin physical memory we referred to in current vmcs02 */ @@ -6171,11 +6179,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) return 1; - if (vmptr == vmx->nested.current_vmptr) { + if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vmx); - vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; - } page = nested_get_page(vcpu, vmptr); if (page == NULL) { @@ -6517,9 +6522,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } - if (vmx->nested.current_vmptr != -1ull) - nested_release_vmcs12(vmx); + nested_release_vmcs12(vmx); vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; @@ -7131,7 +7135,26 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) if (max_irr == -1) return; - vmx_set_rvi(max_irr); + /* + * If a vmexit is needed, vmx_check_nested_events handles it. + */ + if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) + return; + + if (!is_guest_mode(vcpu)) { + vmx_set_rvi(max_irr); + return; + } + + /* + * Fall back to pre-APICv interrupt injection since L2 + * is run without virtual interrupt delivery. + */ + if (!kvm_event_needs_reinjection(vcpu) && + vmx_interrupt_allowed(vcpu)) { + kvm_queue_interrupt(vcpu, max_irr, false); + vmx_inject_irq(vcpu); + } } static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) @@ -7539,13 +7562,31 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_complete_interrupts(vmx); } +static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; + + if (vmx->loaded_vmcs == &vmx->vmcs01) + return; + + cpu = get_cpu(); + vmx->loaded_vmcs = &vmx->vmcs01; + vmx_vcpu_put(vcpu); + vmx_vcpu_load(vcpu, cpu); + vcpu->cpu = cpu; + put_cpu(); +} + static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); free_vpid(vmx); - free_loaded_vmcs(vmx->loaded_vmcs); + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); free_nested(vmx); + free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); @@ -8703,7 +8744,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, unsigned long exit_qualification) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int cpu; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); /* trying to cancel vmlaunch/vmresume is a bug */ @@ -8728,12 +8768,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs12->vm_exit_intr_error_code, KVM_ISA_VMX); - cpu = get_cpu(); - vmx->loaded_vmcs = &vmx->vmcs01; - vmx_vcpu_put(vcpu); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - put_cpu(); + vmx_load_vmcs01(vcpu); vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5a8691b0ed76..08c48a3c7c3a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -87,6 +87,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); +static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -758,6 +759,15 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } +static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) +{ + u64 fixed = DR6_FIXED_1; + + if (!guest_cpuid_has_rtm(vcpu)) + fixed |= DR6_RTM; + return fixed; +} + static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { switch (dr) { @@ -773,7 +783,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) case 6: if (val & 0xffffffff00000000ULL) return -1; /* #GP */ - vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; + vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); kvm_update_dr6(vcpu); break; case 5: @@ -1215,6 +1225,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) unsigned long flags; s64 usdiff; bool matched; + bool already_matched; u64 data = msr->data; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); @@ -1279,6 +1290,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } matched = true; + already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); } else { /* * We split periods of matched TSC writes into generations. @@ -1294,7 +1306,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm->arch.cur_tsc_write = data; kvm->arch.cur_tsc_offset = offset; matched = false; - pr_debug("kvm: new tsc generation %u, clock %llu\n", + pr_debug("kvm: new tsc generation %llu, clock %llu\n", kvm->arch.cur_tsc_generation, data); } @@ -1319,10 +1331,11 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); spin_lock(&kvm->arch.pvclock_gtod_sync_lock); - if (matched) - kvm->arch.nr_vcpus_matched_tsc++; - else + if (!matched) { kvm->arch.nr_vcpus_matched_tsc = 0; + } else if (!already_matched) { + kvm->arch.nr_vcpus_matched_tsc++; + } kvm_track_tsc_matching(vcpu); spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); @@ -2032,6 +2045,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~(u64)0x40; /* ignore flush filter disable */ data &= ~(u64)0x100; /* ignore ignne emulation enable */ data &= ~(u64)0x8; /* ignore TLB cache disable */ + data &= ~(u64)0x40000; /* ignore Mc status write enable */ if (data != 0) { vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", data); @@ -2974,9 +2988,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; events->interrupt.soft = 0; - events->interrupt.shadow = - kvm_x86_ops->get_interrupt_shadow(vcpu, - KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); + events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending != 0; @@ -4082,7 +4094,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); + ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data, + offset, toread); if (ret < 0) { r = X86EMUL_IO_NEEDED; goto out; @@ -4103,10 +4116,24 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + unsigned offset; + int ret; - return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, - access | PFERR_FETCH_MASK, - exception); + /* Inline kvm_read_guest_virt_helper for speed. */ + gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK, + exception); + if (unlikely(gpa == UNMAPPED_GVA)) + return X86EMUL_PROPAGATE_FAULT; + + offset = addr & (PAGE_SIZE-1); + if (WARN_ON(offset + bytes > PAGE_SIZE)) + bytes = (unsigned)PAGE_SIZE - offset; + ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val, + offset, bytes); + if (unlikely(ret < 0)) + return X86EMUL_IO_NEEDED; + + return X86EMUL_CONTINUE; } int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, @@ -4856,7 +4883,7 @@ static const struct x86_emulate_ops emulate_ops = { static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); /* * an sti; sti; sequence only disable interrupts for the first * instruction. So, if the last instruction, be it emulated or @@ -4864,8 +4891,13 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) * means that the last instruction is an sti. We should not * leave the flag on in this case. The same goes for mov ss */ - if (!(int_shadow & mask)) + if (int_shadow & mask) + mask = 0; + if (unlikely(int_shadow || mask)) { kvm_x86_ops->set_interrupt_shadow(vcpu, mask); + if (!mask) + kvm_make_request(KVM_REQ_EVENT, vcpu); + } } static void inject_emulated_exception(struct kvm_vcpu *vcpu) @@ -4880,19 +4912,6 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception.vector); } -static void init_decode_cache(struct x86_emulate_ctxt *ctxt) -{ - memset(&ctxt->opcode_len, 0, - (void *)&ctxt->_regs - (void *)&ctxt->opcode_len); - - ctxt->fetch.start = 0; - ctxt->fetch.end = 0; - ctxt->io_read.pos = 0; - ctxt->io_read.end = 0; - ctxt->mem_read.pos = 0; - ctxt->mem_read.end = 0; -} - static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; @@ -5091,23 +5110,22 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r) +static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r) { struct kvm_run *kvm_run = vcpu->run; /* - * Use the "raw" value to see if TF was passed to the processor. - * Note that the new value of the flags has not been saved yet. + * rflags is the old, "raw" value of the flags. The new value has + * not been saved yet. * * This is correct even for TF set by the guest, because "the * processor will not generate this exception after the instruction * that sets the TF flag". */ - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); - if (unlikely(rflags & X86_EFLAGS_TF)) { if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1; + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | + DR6_RTM; kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; @@ -5120,7 +5138,7 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r) * cleared by the processor". */ vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= DR6_BS; + vcpu->arch.dr6 |= DR6_BS | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); } } @@ -5139,7 +5157,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) vcpu->arch.eff_db); if (dr6 != 0) { - kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM; kvm_run->debug.arch.pc = kvm_rip_read(vcpu) + get_segment_base(vcpu, VCPU_SREG_CS); @@ -5150,14 +5168,15 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) } } - if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) { + if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) && + !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) { dr6 = kvm_vcpu_check_hw_bp(eip, 0, vcpu->arch.dr7, vcpu->arch.db); if (dr6 != 0) { vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= dr6; + vcpu->arch.dr6 |= dr6 | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); *r = EMULATE_DONE; return true; @@ -5221,6 +5240,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (emulation_type & EMULTYPE_SKIP) { kvm_rip_write(vcpu, ctxt->_eip); + if (ctxt->eflags & X86_EFLAGS_RF) + kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); return EMULATE_DONE; } @@ -5271,13 +5292,22 @@ restart: r = EMULATE_DONE; if (writeback) { + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); - kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); if (r == EMULATE_DONE) - kvm_vcpu_check_singlestep(vcpu, &r); - kvm_set_rflags(vcpu, ctxt->eflags); + kvm_vcpu_check_singlestep(vcpu, rflags, &r); + __kvm_set_rflags(vcpu, ctxt->eflags); + + /* + * For STI, interrupts are shadowed; so KVM_REQ_EVENT will + * do nothing, and it will be requested again as soon as + * the shadow expires. But we still need to check here, + * because POPF has no interrupt shadow. + */ + if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF)) + kvm_make_request(KVM_REQ_EVENT, vcpu); } else vcpu->arch.emulate_regs_need_sync_to_vcpu = true; @@ -6842,9 +6872,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu) atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; + kvm_clear_interrupt_queue(vcpu); + kvm_clear_exception_queue(vcpu); memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); - vcpu->arch.dr6 = DR6_FIXED_1; + vcpu->arch.dr6 = DR6_INIT; kvm_update_dr6(vcpu); vcpu->arch.dr7 = DR7_FIXED_1; kvm_update_dr7(vcpu); @@ -7400,12 +7432,17 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_rflags); -void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; kvm_x86_ops->set_rflags(vcpu, rflags); +} + +void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + __kvm_set_rflags(vcpu, rflags); kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_set_rflags); |