diff options
author | Michael Ellerman <mpe@ellerman.id.au> | 2022-03-08 03:11:46 +0100 |
---|---|---|
committer | Michael Ellerman <mpe@ellerman.id.au> | 2022-03-08 03:13:08 +0100 |
commit | 04a5b0ee97046293431685404ef7352917f1e243 (patch) | |
tree | a08d6eb6bb8b37dce0fceb472fc3dd506ce1d295 /arch | |
parent | KVM: PPC: Book3S HV P9: Fix "lost kick" race (diff) | |
parent | KVM: PPC: reserve capability 210 for KVM_CAP_PPC_AIL_MODE_3 (diff) | |
download | linux-04a5b0ee97046293431685404ef7352917f1e243.tar.xz linux-04a5b0ee97046293431685404ef7352917f1e243.zip |
Merge branch 'kvm-ppc-cap-210' of https://git.kernel.org/pub/scm/virt/kvm/kvm into topic/ppc-kvm
Merge this branch from the KVM tree to bring in a new KVM capability
KVM_CAP_PPC_AIL_MODE_3.
It also brings in some unrelated KVM changes as well as v5.17-rc3.
Diffstat (limited to 'arch')
35 files changed, 487 insertions, 307 deletions
diff --git a/arch/arm/crypto/blake2s-shash.c b/arch/arm/crypto/blake2s-shash.c index 17c1c3bfe2f5..763c73beea2d 100644 --- a/arch/arm/crypto/blake2s-shash.c +++ b/arch/arm/crypto/blake2s-shash.c @@ -13,12 +13,12 @@ static int crypto_blake2s_update_arm(struct shash_desc *desc, const u8 *in, unsigned int inlen) { - return crypto_blake2s_update(desc, in, inlen, blake2s_compress); + return crypto_blake2s_update(desc, in, inlen, false); } static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out) { - return crypto_blake2s_final(desc, out, blake2s_compress); + return crypto_blake2s_final(desc, out, false); } #define BLAKE2S_ALG(name, driver_name, digest_size) \ diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f2b5a4abef21..cbcd42decb2a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -680,6 +680,22 @@ config ARM64_ERRATUM_2051678 If unsure, say Y. +config ARM64_ERRATUM_2077057 + bool "Cortex-A510: 2077057: workaround software-step corrupting SPSR_EL2" + help + This option adds the workaround for ARM Cortex-A510 erratum 2077057. + Affected Cortex-A510 may corrupt SPSR_EL2 when the a step exception is + expected, but a Pointer Authentication trap is taken instead. The + erratum causes SPSR_EL1 to be copied to SPSR_EL2, which could allow + EL1 to cause a return to EL2 with a guest controlled ELR_EL2. + + This can only happen when EL2 is stepping EL1. + + When these conditions occur, the SPSR_EL2 value is unchanged from the + previous guest entry, and can be restored from the in-memory copy. + + If unsure, say Y. + config ARM64_ERRATUM_2119858 bool "Cortex-A710/X2: 2119858: workaround TRBE overwriting trace data in FILL mode" default y diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 066098198c24..b217941713a8 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -600,6 +600,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = { CAP_MIDR_RANGE_LIST(trbe_write_out_of_range_cpus), }, #endif +#ifdef CONFIG_ARM64_ERRATUM_2077057 + { + .desc = "ARM erratum 2077057", + .capability = ARM64_WORKAROUND_2077057, + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2), + }, +#endif #ifdef CONFIG_ARM64_ERRATUM_2064142 { .desc = "ARM erratum 2064142", diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a4a0063df456..ecc5958e27fe 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -797,6 +797,24 @@ static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret) xfer_to_guest_mode_work_pending(); } +/* + * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while + * the vCPU is running. + * + * This must be noinstr as instrumentation may make use of RCU, and this is not + * safe during the EQS. + */ +static int noinstr kvm_arm_vcpu_enter_exit(struct kvm_vcpu *vcpu) +{ + int ret; + + guest_state_enter_irqoff(); + ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu); + guest_state_exit_irqoff(); + + return ret; +} + /** * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code * @vcpu: The VCPU pointer @@ -881,9 +899,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * Enter the guest */ trace_kvm_entry(*vcpu_pc(vcpu)); - guest_enter_irqoff(); + guest_timing_enter_irqoff(); - ret = kvm_call_hyp_ret(__kvm_vcpu_run, vcpu); + ret = kvm_arm_vcpu_enter_exit(vcpu); vcpu->mode = OUTSIDE_GUEST_MODE; vcpu->stat.exits++; @@ -918,26 +936,23 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_arch_vcpu_ctxsync_fp(vcpu); /* - * We may have taken a host interrupt in HYP mode (ie - * while executing the guest). This interrupt is still - * pending, as we haven't serviced it yet! + * We must ensure that any pending interrupts are taken before + * we exit guest timing so that timer ticks are accounted as + * guest time. Transiently unmask interrupts so that any + * pending interrupts are taken. * - * We're now back in SVC mode, with interrupts - * disabled. Enabling the interrupts now will have - * the effect of taking the interrupt again, in SVC - * mode this time. + * Per ARM DDI 0487G.b section D1.13.4, an ISB (or other + * context synchronization event) is necessary to ensure that + * pending interrupts are taken. */ local_irq_enable(); + isb(); + local_irq_disable(); + + guest_timing_exit_irqoff(); + + local_irq_enable(); - /* - * We do local_irq_enable() before calling guest_exit() so - * that if a timer interrupt hits while running the guest we - * account that tick as being spent in the guest. We enable - * preemption after calling guest_exit() so that if we get - * preempted we make sure ticks after that is not counted as - * guest time. - */ - guest_exit(); trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); /* Exit types that need handling before we can be preempted */ diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index fd2dd26caf91..e3140abd2e2e 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -228,6 +228,14 @@ int handle_exit(struct kvm_vcpu *vcpu, int exception_index) { struct kvm_run *run = vcpu->run; + if (ARM_SERROR_PENDING(exception_index)) { + /* + * The SError is handled by handle_exit_early(). If the guest + * survives it will re-execute the original instruction. + */ + return 1; + } + exception_index = ARM_EXCEPTION_CODE(exception_index); switch (exception_index) { diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 58e14f8ead23..701cfb964905 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -402,6 +402,24 @@ static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code) return false; } +static inline void synchronize_vcpu_pstate(struct kvm_vcpu *vcpu, u64 *exit_code) +{ + /* + * Check for the conditions of Cortex-A510's #2077057. When these occur + * SPSR_EL2 can't be trusted, but isn't needed either as it is + * unchanged from the value in vcpu_gp_regs(vcpu)->pstate. + * Are we single-stepping the guest, and took a PAC exception from the + * active-not-pending state? + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_2077057) && + vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && + *vcpu_cpsr(vcpu) & DBG_SPSR_SS && + ESR_ELx_EC(read_sysreg_el2(SYS_ESR)) == ESR_ELx_EC_PAC) + write_sysreg_el2(*vcpu_cpsr(vcpu), SYS_SPSR); + + vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR); +} + /* * Return true when we were able to fixup the guest exit and should return to * the guest, false when we should restore the host state and return to the @@ -413,7 +431,7 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) * Save PSTATE early so that we can evaluate the vcpu mode * early on. */ - vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR); + synchronize_vcpu_pstate(vcpu, exit_code); /* * Check whether we want to repaint the state one way or @@ -424,7 +442,8 @@ static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code) if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR); - if (ARM_SERROR_PENDING(*exit_code)) { + if (ARM_SERROR_PENDING(*exit_code) && + ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ) { u8 esr_ec = kvm_vcpu_trap_get_class(vcpu); /* diff --git a/arch/arm64/kvm/vgic/vgic-mmio.c b/arch/arm64/kvm/vgic/vgic-mmio.c index 7068da080799..49837d3a3ef5 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio.c +++ b/arch/arm64/kvm/vgic/vgic-mmio.c @@ -248,6 +248,8 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu, IRQCHIP_STATE_PENDING, &val); WARN_RATELIMIT(err, "IRQ %d", irq->host_irq); + } else if (vgic_irq_is_mapped_level(irq)) { + val = vgic_get_phys_line_level(irq); } else { val = irq_is_pending(irq); } diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index e7719e8f18de..9c65b1e25a96 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -55,9 +55,10 @@ WORKAROUND_1418040 WORKAROUND_1463225 WORKAROUND_1508412 WORKAROUND_1542419 -WORKAROUND_2064142 -WORKAROUND_2038923 WORKAROUND_1902691 +WORKAROUND_2038923 +WORKAROUND_2064142 +WORKAROUND_2077057 WORKAROUND_TRBE_OVERWRITE_FILL_MODE WORKAROUND_TSB_FLUSH_FAILURE WORKAROUND_TRBE_WRITE_OUT_OF_RANGE diff --git a/arch/mips/cavium-octeon/octeon-memcpy.S b/arch/mips/cavium-octeon/octeon-memcpy.S index 0a515cde1c18..25860fba6218 100644 --- a/arch/mips/cavium-octeon/octeon-memcpy.S +++ b/arch/mips/cavium-octeon/octeon-memcpy.S @@ -74,7 +74,7 @@ #define EXC(inst_reg,addr,handler) \ 9: inst_reg, addr; \ .section __ex_table,"a"; \ - PTR 9b, handler; \ + PTR_WD 9b, handler; \ .previous /* diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index e59cb6246f76..a25e0b73ee70 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -414,6 +414,24 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, return -ENOIOCTLCMD; } +/* + * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while + * the vCPU is running. + * + * This must be noinstr as instrumentation may make use of RCU, and this is not + * safe during the EQS. + */ +static int noinstr kvm_mips_vcpu_enter_exit(struct kvm_vcpu *vcpu) +{ + int ret; + + guest_state_enter_irqoff(); + ret = kvm_mips_callbacks->vcpu_run(vcpu); + guest_state_exit_irqoff(); + + return ret; +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { int r = -EINTR; @@ -434,7 +452,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) lose_fpu(1); local_irq_disable(); - guest_enter_irqoff(); + guest_timing_enter_irqoff(); trace_kvm_enter(vcpu); /* @@ -445,10 +463,23 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); - r = kvm_mips_callbacks->vcpu_run(vcpu); + r = kvm_mips_vcpu_enter_exit(vcpu); + + /* + * We must ensure that any pending interrupts are taken before + * we exit guest timing so that timer ticks are accounted as + * guest time. Transiently unmask interrupts so that any + * pending interrupts are taken. + * + * TODO: is there a barrier which ensures that pending interrupts are + * recognised? Currently this just hopes that the CPU takes any pending + * interrupts between the enable and disable. + */ + local_irq_enable(); + local_irq_disable(); trace_kvm_out(vcpu); - guest_exit_irqoff(); + guest_timing_exit_irqoff(); local_irq_enable(); out: @@ -1168,7 +1199,7 @@ static void kvm_mips_set_c0_status(void) /* * Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV) */ -int kvm_mips_handle_exit(struct kvm_vcpu *vcpu) +static int __kvm_mips_handle_exit(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u32 cause = vcpu->arch.host_cp0_cause; @@ -1357,6 +1388,17 @@ int kvm_mips_handle_exit(struct kvm_vcpu *vcpu) return ret; } +int noinstr kvm_mips_handle_exit(struct kvm_vcpu *vcpu) +{ + int ret; + + guest_state_exit_irqoff(); + ret = __kvm_mips_handle_exit(vcpu); + guest_state_enter_irqoff(); + + return ret; +} + /* Enable FPU for guest and restore context */ void kvm_own_fpu(struct kvm_vcpu *vcpu) { diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 4adca5abbc72..c706f5890a05 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -458,8 +458,8 @@ void kvm_vz_acquire_htimer(struct kvm_vcpu *vcpu) /** * _kvm_vz_save_htimer() - Switch to software emulation of guest timer. * @vcpu: Virtual CPU. - * @compare: Pointer to write compare value to. - * @cause: Pointer to write cause value to. + * @out_compare: Pointer to write compare value to. + * @out_cause: Pointer to write cause value to. * * Save VZ guest timer state and switch to software emulation of guest CP0 * timer. The hard timer must already be in use, so preemption should be @@ -1541,11 +1541,14 @@ static int kvm_trap_vz_handle_guest_exit(struct kvm_vcpu *vcpu) } /** - * kvm_trap_vz_handle_cop_unusuable() - Guest used unusable coprocessor. + * kvm_trap_vz_handle_cop_unusable() - Guest used unusable coprocessor. * @vcpu: Virtual CPU context. * * Handle when the guest attempts to use a coprocessor which hasn't been allowed * by the root context. + * + * Return: value indicating whether to resume the host or the guest + * (RESUME_HOST or RESUME_GUEST) */ static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu) { @@ -1592,6 +1595,9 @@ static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu) * * Handle when the guest attempts to use MSA when it is disabled in the root * context. + * + * Return: value indicating whether to resume the host or the guest + * (RESUME_HOST or RESUME_GUEST) */ static int kvm_trap_vz_handle_msa_disabled(struct kvm_vcpu *vcpu) { diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 0c5239e05721..624166004e36 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -90,6 +90,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *cntx; + struct kvm_vcpu_csr *reset_csr = &vcpu->arch.guest_reset_csr; /* Mark this VCPU never ran */ vcpu->arch.ran_atleast_once = false; @@ -106,6 +107,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) cntx->hstatus |= HSTATUS_SPVP; cntx->hstatus |= HSTATUS_SPV; + /* By default, make CY, TM, and IR counters accessible in VU mode */ + reset_csr->scounteren = 0x7; + /* Setup VCPU timer */ kvm_riscv_vcpu_timer_init(vcpu); @@ -699,6 +703,20 @@ static void kvm_riscv_update_hvip(struct kvm_vcpu *vcpu) csr_write(CSR_HVIP, csr->hvip); } +/* + * Actually run the vCPU, entering an RCU extended quiescent state (EQS) while + * the vCPU is running. + * + * This must be noinstr as instrumentation may make use of RCU, and this is not + * safe during the EQS. + */ +static void noinstr kvm_riscv_vcpu_enter_exit(struct kvm_vcpu *vcpu) +{ + guest_state_enter_irqoff(); + __kvm_riscv_switch_to(&vcpu->arch); + guest_state_exit_irqoff(); +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { int ret; @@ -790,9 +808,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) continue; } - guest_enter_irqoff(); + guest_timing_enter_irqoff(); - __kvm_riscv_switch_to(&vcpu->arch); + kvm_riscv_vcpu_enter_exit(vcpu); vcpu->mode = OUTSIDE_GUEST_MODE; vcpu->stat.exits++; @@ -812,25 +830,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_riscv_vcpu_sync_interrupts(vcpu); /* - * We may have taken a host interrupt in VS/VU-mode (i.e. - * while executing the guest). This interrupt is still - * pending, as we haven't serviced it yet! + * We must ensure that any pending interrupts are taken before + * we exit guest timing so that timer ticks are accounted as + * guest time. Transiently unmask interrupts so that any + * pending interrupts are taken. * - * We're now back in HS-mode with interrupts disabled - * so enabling the interrupts now will have the effect - * of taking the interrupt again, in HS-mode this time. + * There's no barrier which ensures that pending interrupts are + * recognised, so we just hope that the CPU takes any pending + * interrupts between the enable and disable. */ local_irq_enable(); + local_irq_disable(); - /* - * We do local_irq_enable() before calling guest_exit() so - * that if a timer interrupt hits while running the guest - * we account that tick as being spent in the guest. We - * enable preemption after calling guest_exit() so that if - * we get preempted we make sure ticks after that is not - * counted as guest time. - */ - guest_exit(); + guest_timing_exit_irqoff(); + + local_irq_enable(); preempt_enable(); diff --git a/arch/riscv/kvm/vcpu_sbi_base.c b/arch/riscv/kvm/vcpu_sbi_base.c index 4ecf377f483b..48f431091cdb 100644 --- a/arch/riscv/kvm/vcpu_sbi_base.c +++ b/arch/riscv/kvm/vcpu_sbi_base.c @@ -9,6 +9,7 @@ #include <linux/errno.h> #include <linux/err.h> #include <linux/kvm_host.h> +#include <linux/version.h> #include <asm/csr.h> #include <asm/sbi.h> #include <asm/kvm_vcpu_timer.h> @@ -32,7 +33,7 @@ static int kvm_sbi_ext_base_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, *out_val = KVM_SBI_IMPID; break; case SBI_EXT_BASE_GET_IMP_VERSION: - *out_val = 0; + *out_val = LINUX_VERSION_CODE; break; case SBI_EXT_BASE_PROBE_EXT: if ((cp->a0 >= SBI_EXT_EXPERIMENTAL_START && diff --git a/arch/x86/crypto/blake2s-shash.c b/arch/x86/crypto/blake2s-shash.c index f9e2fecdb761..59ae28abe35c 100644 --- a/arch/x86/crypto/blake2s-shash.c +++ b/arch/x86/crypto/blake2s-shash.c @@ -18,12 +18,12 @@ static int crypto_blake2s_update_x86(struct shash_desc *desc, const u8 *in, unsigned int inlen) { - return crypto_blake2s_update(desc, in, inlen, blake2s_compress); + return crypto_blake2s_update(desc, in, inlen, false); } static int crypto_blake2s_final_x86(struct shash_desc *desc, u8 *out) { - return crypto_blake2s_final(desc, out, blake2s_compress); + return crypto_blake2s_final(desc, out, false); } #define BLAKE2S_ALG(name, driver_name, digest_size) \ diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index c91434056c29..a3c7ca876aeb 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4703,6 +4703,19 @@ static __initconst const struct x86_pmu intel_pmu = { .lbr_read = intel_pmu_lbr_read_64, .lbr_save = intel_pmu_lbr_save, .lbr_restore = intel_pmu_lbr_restore, + + /* + * SMM has access to all 4 rings and while traditionally SMM code only + * ran in CPL0, 2021-era firmware is starting to make use of CPL3 in SMM. + * + * Since the EVENTSEL.{USR,OS} CPL filtering makes no distinction + * between SMM or not, this results in what should be pure userspace + * counters including SMM data. + * + * This is a clear privilege issue, therefore globally disable + * counting SMM by default. + */ + .attr_freeze_on_smi = 1, }; static __init void intel_clovertown_quirk(void) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 7f406c14715f..2d33bba9a144 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -897,8 +897,9 @@ static void pt_handle_status(struct pt *pt) * means we are already losing data; need to let the decoder * know. */ - if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) || - buf->output_off == pt_buffer_region_size(buf)) { + if (!buf->single && + (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries) || + buf->output_off == pt_buffer_region_size(buf))) { perf_aux_output_flag(&pt->handle, PERF_AUX_FLAG_TRUNCATED); advance++; diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index 631d5040b31e..d39e0de06be2 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -82,7 +82,7 @@ KVM_X86_OP_NULL(guest_apic_has_interrupt) KVM_X86_OP(load_eoi_exitmap) KVM_X86_OP(set_virtual_apic_mode) KVM_X86_OP_NULL(set_apic_access_page_addr) -KVM_X86_OP(deliver_posted_interrupt) +KVM_X86_OP(deliver_interrupt) KVM_X86_OP_NULL(sync_pir_to_irr) KVM_X86_OP(set_tss_addr) KVM_X86_OP(set_identity_map_addr) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 6e7c545bc7ee..6dcccb304775 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1410,7 +1410,8 @@ struct kvm_x86_ops { void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu); - int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); + void (*deliver_interrupt)(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3faf0f97edb1..a4a39c3e0f19 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -476,6 +476,7 @@ #define MSR_AMD64_ICIBSEXTDCTL 0xc001103c #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +#define MSR_AMD64_SVM_AVIC_DOORBELL 0xc001011b #define MSR_AMD64_VM_PAGE_FLUSH 0xc001011e #define MSR_AMD64_SEV_ES_GHCB 0xc0010130 #define MSR_AMD64_SEV 0xc0010131 diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index b00dbc5fac2b..bb2fb78523ce 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -220,6 +220,42 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_NESTED_CTL_SEV_ENABLE BIT(1) #define SVM_NESTED_CTL_SEV_ES_ENABLE BIT(2) + +/* AVIC */ +#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) +#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 +#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) + +#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL) +#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) +#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) +#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) +#define AVIC_PHYSICAL_ID_TABLE_SIZE_MASK (0xFF) + +#define AVIC_DOORBELL_PHYSICAL_ID_MASK (0xFF) + +#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 +#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 +#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF + +enum avic_ipi_failure_cause { + AVIC_IPI_FAILURE_INVALID_INT_TYPE, + AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, + AVIC_IPI_FAILURE_INVALID_TARGET, + AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, +}; + + +/* + * 0xff is broadcast, so the max index allowed for physical APIC ID + * table is 0xfe. APIC IDs above 0xff are reserved. + */ +#define AVIC_MAX_PHYSICAL_ID_COUNT 0xff + +#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) +#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL + + struct vmcb_seg { u16 selector; u16 attrib; diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h index 1bf2ad34188a..16f548a661cf 100644 --- a/arch/x86/include/asm/xen/hypervisor.h +++ b/arch/x86/include/asm/xen/hypervisor.h @@ -43,20 +43,6 @@ static inline uint32_t xen_cpuid_base(void) return hypervisor_cpuid_base("XenVMMXenVMM", 2); } -#ifdef CONFIG_XEN -extern bool __init xen_hvm_need_lapic(void); - -static inline bool __init xen_x2apic_para_available(void) -{ - return xen_hvm_need_lapic(); -} -#else -static inline bool __init xen_x2apic_para_available(void) -{ - return (xen_cpuid_base() != 0); -} -#endif - struct pci_dev; #ifdef CONFIG_XEN_PV_DOM0 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 28be02adc669..494d4d351859 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -554,12 +554,13 @@ void kvm_set_cpu_caps(void) ); kvm_cpu_cap_mask(CPUID_7_0_EBX, - F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | F(INVPCID) | F(RTM) | 0 /*MPX*/ | F(RDSEED) | - F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | - F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/ - ); + F(FSGSBASE) | F(SGX) | F(BMI1) | F(HLE) | F(AVX2) | + F(FDP_EXCPTN_ONLY) | F(SMEP) | F(BMI2) | F(ERMS) | F(INVPCID) | + F(RTM) | F(ZERO_FCS_FDS) | 0 /*MPX*/ | F(AVX512F) | + F(AVX512DQ) | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | + F(CLFLUSHOPT) | F(CLWB) | 0 /*INTEL_PT*/ | F(AVX512PF) | + F(AVX512ER) | F(AVX512CD) | F(SHA_NI) | F(AVX512BW) | + F(AVX512VL)); kvm_cpu_cap_mask(CPUID_7_ECX, F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) | diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4662469240bc..9322e6340a74 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1096,14 +1096,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic->regs + APIC_TMR); } - if (static_call(kvm_x86_deliver_posted_interrupt)(vcpu, vector)) { - kvm_lapic_set_irr(vector, apic); - kvm_make_request(KVM_REQ_EVENT, vcpu); - kvm_vcpu_kick(vcpu); - } else { - trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, - trig_mode, vector); - } + static_call(kvm_x86_deliver_interrupt)(apic, delivery_mode, + trig_mode, vector); break; case APIC_DM_REMRD: @@ -2312,7 +2306,12 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) apic->irr_pending = true; apic->isr_count = 1; } else { - apic->irr_pending = (apic_search_irr(apic) != -1); + /* + * Don't clear irr_pending, searching the IRR can race with + * updates from the CPU as APICv is still active from hardware's + * perspective. The flag will be cleared as appropriate when + * KVM injects the interrupt. + */ apic->isr_count = count_vectors(apic->regs + APIC_ISR); } } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index f614f95acc6b..b1a02993782b 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -95,7 +95,7 @@ static void kvm_perf_overflow(struct perf_event *perf_event, } static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, - unsigned config, bool exclude_user, + u64 config, bool exclude_user, bool exclude_kernel, bool intr, bool in_tx, bool in_tx_cp) { @@ -181,7 +181,8 @@ static int cmp_u64(const void *a, const void *b) void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) { - unsigned config, type = PERF_TYPE_RAW; + u64 config; + u32 type = PERF_TYPE_RAW; struct kvm *kvm = pmc->vcpu->kvm; struct kvm_pmu_event_filter *filter; bool allow_event = true; @@ -220,7 +221,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) } if (type == PERF_TYPE_RAW) - config = eventsel & X86_RAW_EVENT_MASK; + config = eventsel & AMD64_RAW_EVENT_MASK; if (pmc->current_config == eventsel && pmc_resume_counter(pmc)) return; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 90364d02f22a..fb3e20791338 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -27,20 +27,6 @@ #include "irq.h" #include "svm.h" -#define SVM_AVIC_DOORBELL 0xc001011b - -#define AVIC_HPA_MASK ~((0xFFFULL << 52) | 0xFFF) - -/* - * 0xff is broadcast, so the max index allowed for physical APIC ID - * table is 0xfe. APIC IDs above 0xff are reserved. - */ -#define AVIC_MAX_PHYSICAL_ID_COUNT 255 - -#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 -#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 -#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF - /* AVIC GATAG is encoded using VM and VCPU IDs */ #define AVIC_VCPU_ID_BITS 8 #define AVIC_VCPU_ID_MASK ((1 << AVIC_VCPU_ID_BITS) - 1) @@ -73,12 +59,6 @@ struct amd_svm_iommu_ir { void *data; /* Storing pointer to struct amd_ir_data */ }; -enum avic_ipi_failure_cause { - AVIC_IPI_FAILURE_INVALID_INT_TYPE, - AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, - AVIC_IPI_FAILURE_INVALID_TARGET, - AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, -}; /* Note: * This function is called from IOMMU driver to notify @@ -289,6 +269,22 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } +void avic_ring_doorbell(struct kvm_vcpu *vcpu) +{ + /* + * Note, the vCPU could get migrated to a different pCPU at any point, + * which could result in signalling the wrong/previous pCPU. But if + * that happens the vCPU is guaranteed to do a VMRUN (after being + * migrated) and thus will process pending interrupts, i.e. a doorbell + * is not needed (and the spurious one is harmless). + */ + int cpu = READ_ONCE(vcpu->cpu); + + if (cpu != get_cpu()) + wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); + put_cpu(); +} + static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, u32 icrl, u32 icrh) { @@ -304,8 +300,13 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, kvm_for_each_vcpu(i, vcpu, kvm) { if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, GET_APIC_DEST_FIELD(icrh), - icrl & APIC_DEST_MASK)) - kvm_vcpu_wake_up(vcpu); + icrl & APIC_DEST_MASK)) { + vcpu->arch.apic->irr_pending = true; + svm_complete_interrupt_delivery(vcpu, + icrl & APIC_MODE_MASK, + icrl & APIC_INT_LEVELTRIG, + icrl & APIC_VECTOR_MASK); + } } } @@ -345,8 +346,6 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh); break; case AVIC_IPI_FAILURE_INVALID_TARGET: - WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n", - index, vcpu->vcpu_id, icrh, icrl); break; case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: WARN_ONCE(1, "Invalid backing page\n"); @@ -669,52 +668,6 @@ void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) return; } -int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) -{ - if (!vcpu->arch.apicv_active) - return -1; - - kvm_lapic_set_irr(vec, vcpu->arch.apic); - - /* - * Pairs with the smp_mb_*() after setting vcpu->guest_mode in - * vcpu_enter_guest() to ensure the write to the vIRR is ordered before - * the read of guest_mode, which guarantees that either VMRUN will see - * and process the new vIRR entry, or that the below code will signal - * the doorbell if the vCPU is already running in the guest. - */ - smp_mb__after_atomic(); - - /* - * Signal the doorbell to tell hardware to inject the IRQ if the vCPU - * is in the guest. If the vCPU is not in the guest, hardware will - * automatically process AVIC interrupts at VMRUN. - */ - if (vcpu->mode == IN_GUEST_MODE) { - int cpu = READ_ONCE(vcpu->cpu); - - /* - * Note, the vCPU could get migrated to a different pCPU at any - * point, which could result in signalling the wrong/previous - * pCPU. But if that happens the vCPU is guaranteed to do a - * VMRUN (after being migrated) and thus will process pending - * interrupts, i.e. a doorbell is not needed (and the spurious - * one is harmless). - */ - if (cpu != get_cpu()) - wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); - put_cpu(); - } else { - /* - * Wake the vCPU if it was blocking. KVM will then detect the - * pending IRQ when checking if the vCPU has a wake event. - */ - kvm_vcpu_wake_up(vcpu); - } - - return 0; -} - bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu) { return false; diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 1218b5a342fc..39d280e7e80e 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -1457,18 +1457,6 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, !__nested_vmcb_check_save(vcpu, &save_cached)) goto out_free; - /* - * While the nested guest CR3 is already checked and set by - * KVM_SET_SREGS, it was set when nested state was yet loaded, - * thus MMU might not be initialized correctly. - * Set it again to fix this. - */ - - ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, - nested_npt_enabled(svm), false); - if (WARN_ON_ONCE(ret)) - goto out_free; - /* * All checks done, we can enter guest mode. Userspace provides @@ -1494,6 +1482,20 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, svm_switch_vmcb(svm, &svm->nested.vmcb02); nested_vmcb02_prepare_control(svm); + + /* + * While the nested guest CR3 is already checked and set by + * KVM_SET_SREGS, it was set when nested state was yet loaded, + * thus MMU might not be initialized correctly. + * Set it again to fix this. + */ + + ret = nested_svm_load_cr3(&svm->vcpu, vcpu->arch.cr3, + nested_npt_enabled(svm), false); + if (WARN_ON_ONCE(ret)) + goto out_free; + + kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu); ret = 0; out_free: diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 6d97629655e3..821edf664e7a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1585,6 +1585,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); u64 hcr0 = cr0; + bool old_paging = is_paging(vcpu); #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME && !vcpu->arch.guest_state_protected) { @@ -1601,8 +1602,11 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) #endif vcpu->arch.cr0 = cr0; - if (!npt_enabled) + if (!npt_enabled) { hcr0 |= X86_CR0_PG | X86_CR0_WP; + if (old_paging != is_paging(vcpu)) + svm_set_cr4(vcpu, kvm_read_cr4(vcpu)); + } /* * re-enable caching here because the QEMU bios @@ -1646,8 +1650,12 @@ void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) svm_flush_tlb(vcpu); vcpu->arch.cr4 = cr4; - if (!npt_enabled) + if (!npt_enabled) { cr4 |= X86_CR4_PAE; + + if (!is_paging(vcpu)) + cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); + } cr4 |= host_cr4_mce; to_svm(vcpu)->vmcb->save.cr4 = cr4; vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); @@ -3291,6 +3299,55 @@ static void svm_set_irq(struct kvm_vcpu *vcpu) SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } +void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, + int trig_mode, int vector) +{ + /* + * vcpu->arch.apicv_active must be read after vcpu->mode. + * Pairs with smp_store_release in vcpu_enter_guest. + */ + bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE); + + if (!READ_ONCE(vcpu->arch.apicv_active)) { + /* Process the interrupt via inject_pending_event */ + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + return; + } + + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); + if (in_guest_mode) { + /* + * Signal the doorbell to tell hardware to inject the IRQ. If + * the vCPU exits the guest before the doorbell chimes, hardware + * will automatically process AVIC interrupts at the next VMRUN. + */ + avic_ring_doorbell(vcpu); + } else { + /* + * Wake the vCPU if it was blocking. KVM will then detect the + * pending IRQ when checking if the vCPU has a wake event. + */ + kvm_vcpu_wake_up(vcpu); + } +} + +static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + kvm_lapic_set_irr(vector, apic); + + /* + * Pairs with the smp_mb_*() after setting vcpu->guest_mode in + * vcpu_enter_guest() to ensure the write to the vIRR is ordered before + * the read of guest_mode. This guarantees that either VMRUN will see + * and process the new vIRR entry, or that svm_complete_interrupt_delivery + * will signal the doorbell if the CPU has already entered the guest. + */ + smp_mb__after_atomic(); + svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector); +} + static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3338,11 +3395,13 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (svm->nested.nested_run_pending) return -EBUSY; + if (svm_nmi_blocked(vcpu)) + return 0; + /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) return -EBUSY; - - return !svm_nmi_blocked(vcpu); + return 1; } static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) @@ -3394,9 +3453,13 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); + if (svm->nested.nested_run_pending) return -EBUSY; + if (svm_interrupt_blocked(vcpu)) + return 0; + /* * An IRQ must not be injected into L2 if it's supposed to VM-Exit, * e.g. if the IRQ arrived asynchronously after checking nested events. @@ -3404,7 +3467,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm)) return -EBUSY; - return !svm_interrupt_blocked(vcpu); + return 1; } static void svm_enable_irq_window(struct kvm_vcpu *vcpu) @@ -3615,7 +3678,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); unsigned long vmcb_pa = svm->current_vmcb->pa; - kvm_guest_enter_irqoff(); + guest_state_enter_irqoff(); if (sev_es_guest(vcpu->kvm)) { __svm_sev_es_vcpu_run(vmcb_pa); @@ -3635,7 +3698,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) vmload(__sme_page_pa(sd->save_area)); } - kvm_guest_exit_irqoff(); + guest_state_exit_irqoff(); } static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) @@ -4135,11 +4198,14 @@ static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) if (svm->nested.nested_run_pending) return -EBUSY; + if (svm_smi_blocked(vcpu)) + return 0; + /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm)) return -EBUSY; - return !svm_smi_blocked(vcpu); + return 1; } static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate) @@ -4233,11 +4299,18 @@ static int svm_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) * Enter the nested guest now */ + vmcb_mark_all_dirty(svm->vmcb01.ptr); + vmcb12 = map.hva; nested_copy_vmcb_control_to_cache(svm, &vmcb12->control); nested_copy_vmcb_save_to_cache(svm, &vmcb12->save); ret = enter_svm_guest_mode(vcpu, vmcb12_gpa, vmcb12, false); + if (ret) + goto unmap_save; + + svm->nested.nested_run_pending = 1; + unmap_save: kvm_vcpu_unmap(vcpu, &map_save, true); unmap_map: @@ -4545,7 +4618,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .pmu_ops = &amd_pmu_ops, .nested_ops = &svm_nested_ops, - .deliver_posted_interrupt = svm_deliver_avic_intr, + .deliver_interrupt = svm_deliver_interrupt, .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, .update_pi_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, @@ -4622,6 +4695,7 @@ static __init void svm_set_cpu_caps(void) /* CPUID 0x80000001 and 0x8000000A (SVM features) */ if (nested) { kvm_cpu_cap_set(X86_FEATURE_SVM); + kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN); if (nrips) kvm_cpu_cap_set(X86_FEATURE_NRIPS); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 73525353e424..fa98d6844728 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -489,6 +489,8 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code); void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr, int read, int write); +void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, + int trig_mode, int vec); /* nested.c */ @@ -556,17 +558,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; /* avic.c */ -#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) -#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 -#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) - -#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL) -#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) -#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) -#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) - -#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL - int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); @@ -583,12 +574,12 @@ bool svm_check_apicv_inhibit_reasons(ulong bit); void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); -int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec); bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu); int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void avic_vcpu_blocking(struct kvm_vcpu *vcpu); void avic_vcpu_unblocking(struct kvm_vcpu *vcpu); +void avic_ring_doorbell(struct kvm_vcpu *vcpu); /* sev.c */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index aca3ae2a02f3..efda5e4d6247 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4041,6 +4041,21 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector) return 0; } +static void vmx_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, + int trig_mode, int vector) +{ + struct kvm_vcpu *vcpu = apic->vcpu; + + if (vmx_deliver_posted_interrupt(vcpu, vector)) { + kvm_lapic_set_irr(vector, apic); + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + } else { + trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, + trig_mode, vector); + } +} + /* * Set up the vmcs's constant host-state fields, i.e., host-state fields that * will not change in the lifetime of the guest. @@ -6754,7 +6769,7 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx) { - kvm_guest_enter_irqoff(); + guest_state_enter_irqoff(); /* L1D Flush includes CPU buffer clear to mitigate MDS */ if (static_branch_unlikely(&vmx_l1d_should_flush)) @@ -6770,7 +6785,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, vcpu->arch.cr2 = native_read_cr2(); - kvm_guest_exit_irqoff(); + guest_state_exit_irqoff(); } static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) @@ -7644,6 +7659,7 @@ static int vmx_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (ret) return ret; + vmx->nested.nested_run_pending = 1; vmx->nested.smm.guest_mode = false; } return 0; @@ -7768,7 +7784,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .hwapic_isr_update = vmx_hwapic_isr_update, .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, .sync_pir_to_irr = vmx_sync_pir_to_irr, - .deliver_posted_interrupt = vmx_deliver_posted_interrupt, + .deliver_interrupt = vmx_deliver_interrupt, .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt, .set_tss_addr = vmx_set_tss_addr, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 74b53a16f38a..641044db415d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -90,6 +90,8 @@ u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); +#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) + #define emul_to_vcpu(ctxt) \ ((struct kvm_vcpu *)(ctxt)->vcpu) @@ -4340,7 +4342,7 @@ static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr) void __user *uaddr = (void __user*)(unsigned long)attr->addr; if ((u64)(unsigned long)uaddr != attr->addr) - return ERR_PTR(-EFAULT); + return ERR_PTR_USR(-EFAULT); return uaddr; } @@ -9981,7 +9983,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * result in virtual interrupt delivery. */ local_irq_disable(); - vcpu->mode = IN_GUEST_MODE; + + /* Store vcpu->apicv_active before vcpu->mode. */ + smp_store_release(&vcpu->mode, IN_GUEST_MODE); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); @@ -10041,6 +10045,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(0, 7); } + guest_timing_enter_irqoff(); + for (;;) { /* * Assert that vCPU vs. VM APICv state is consistent. An APICv @@ -10125,7 +10131,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * of accounting via context tracking, but the loss of accuracy is * acceptable for all known use cases. */ - vtime_account_guest_exit(); + guest_timing_exit_irqoff(); if (lapic_in_kernel(vcpu)) { s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; @@ -11639,8 +11645,6 @@ void kvm_arch_sync_events(struct kvm *kvm) kvm_free_pit(kvm); } -#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e)) - /** * __x86_set_memory_region: Setup KVM internal memory slot * diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 635b75f9e145..767ec7f99516 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -10,51 +10,6 @@ void kvm_spurious_fault(void); -static __always_inline void kvm_guest_enter_irqoff(void) -{ - /* - * VMENTER enables interrupts (host state), but the kernel state is - * interrupts disabled when this is invoked. Also tell RCU about - * it. This is the same logic as for exit_to_user_mode(). - * - * This ensures that e.g. latency analysis on the host observes - * guest mode as interrupt enabled. - * - * guest_enter_irqoff() informs context tracking about the - * transition to guest mode and if enabled adjusts RCU state - * accordingly. - */ - instrumentation_begin(); - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - instrumentation_end(); - - guest_enter_irqoff(); - lockdep_hardirqs_on(CALLER_ADDR0); -} - -static __always_inline void kvm_guest_exit_irqoff(void) -{ - /* - * VMEXIT disables interrupts (host state), but tracing and lockdep - * have them in state 'on' as recorded before entering guest mode. - * Same as enter_from_user_mode(). - * - * context_tracking_guest_exit() restores host context and reinstates - * RCU if enabled and required. - * - * This needs to be done immediately after VM-Exit, before any code - * that might contain tracepoints or call out to the greater world, - * e.g. before x86_spec_ctrl_restore_host(). - */ - lockdep_hardirqs_off(CALLER_ADDR0); - context_tracking_guest_exit(); - - instrumentation_begin(); - trace_hardirqs_off_finish(); - instrumentation_end(); -} - #define KVM_NESTED_VMENTER_CONSISTENCY_CHECK(consistency_check) \ ({ \ bool failed = (consistency_check); \ diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index bad57535fad0..74be1fda58e3 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -133,32 +133,57 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state) void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) { struct kvm_vcpu_xen *vx = &v->arch.xen; + struct gfn_to_hva_cache *ghc = &vx->runstate_cache; + struct kvm_memslots *slots = kvm_memslots(v->kvm); + bool atomic = (state == RUNSTATE_runnable); uint64_t state_entry_time; - unsigned int offset; + int __user *user_state; + uint64_t __user *user_times; kvm_xen_update_runstate(v, state); if (!vx->runstate_set) return; - BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c); + if (unlikely(slots->generation != ghc->generation || kvm_is_error_hva(ghc->hva)) && + kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len)) + return; + + /* We made sure it fits in a single page */ + BUG_ON(!ghc->memslot); + + if (atomic) + pagefault_disable(); - offset = offsetof(struct compat_vcpu_runstate_info, state_entry_time); -#ifdef CONFIG_X86_64 /* - * The only difference is alignment of uint64_t in 32-bit. - * So the first field 'state' is accessed directly using - * offsetof() (where its offset happens to be zero), while the - * remaining fields which are all uint64_t, start at 'offset' - * which we tweak here by adding 4. + * The only difference between 32-bit and 64-bit versions of the + * runstate struct us the alignment of uint64_t in 32-bit, which + * means that the 64-bit version has an additional 4 bytes of + * padding after the first field 'state'. + * + * So we use 'int __user *user_state' to point to the state field, + * and 'uint64_t __user *user_times' for runstate_entry_time. So + * the actual array of time[] in each state starts at user_times[1]. */ + BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0); + BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0); + user_state = (int __user *)ghc->hva; + + BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c); + + user_times = (uint64_t __user *)(ghc->hva + + offsetof(struct compat_vcpu_runstate_info, + state_entry_time)); +#ifdef CONFIG_X86_64 BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) != offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4); BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) != offsetof(struct compat_vcpu_runstate_info, time) + 4); if (v->kvm->arch.xen.long_mode) - offset = offsetof(struct vcpu_runstate_info, state_entry_time); + user_times = (uint64_t __user *)(ghc->hva + + offsetof(struct vcpu_runstate_info, + state_entry_time)); #endif /* * First write the updated state_entry_time at the appropriate @@ -172,10 +197,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) != sizeof(state_entry_time)); - if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, - &state_entry_time, offset, - sizeof(state_entry_time))) - return; + if (__put_user(state_entry_time, user_times)) + goto out; smp_wmb(); /* @@ -189,11 +212,8 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) != sizeof(vx->current_runstate)); - if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, - &vx->current_runstate, - offsetof(struct vcpu_runstate_info, state), - sizeof(vx->current_runstate))) - return; + if (__put_user(vx->current_runstate, user_state)) + goto out; /* * Write the actual runstate times immediately after the @@ -208,24 +228,23 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state) BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) != sizeof(vx->runstate_times)); - if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, - &vx->runstate_times[0], - offset + sizeof(u64), - sizeof(vx->runstate_times))) - return; - + if (__copy_to_user(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times))) + goto out; smp_wmb(); /* * Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's * runstate_entry_time field. */ - state_entry_time &= ~XEN_RUNSTATE_UPDATE; - if (kvm_write_guest_offset_cached(v->kvm, &v->arch.xen.runstate_cache, - &state_entry_time, offset, - sizeof(state_entry_time))) - return; + __put_user(state_entry_time, user_times); + smp_wmb(); + + out: + mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT); + + if (atomic) + pagefault_enable(); } int __kvm_xen_has_interrupt(struct kvm_vcpu *v) @@ -443,6 +462,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } + /* It must fit within a single page */ + if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) { + r = -EINVAL; + break; + } + r = kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache, data->u.gpa, @@ -460,6 +485,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } + /* It must fit within a single page */ + if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct pvclock_vcpu_time_info) > PAGE_SIZE) { + r = -EINVAL; + break; + } + r = kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.xen.vcpu_time_info_cache, data->u.gpa, @@ -481,6 +512,12 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } + /* It must fit within a single page */ + if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_runstate_info) > PAGE_SIZE) { + r = -EINVAL; + break; + } + r = kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.xen.runstate_cache, data->u.gpa, diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 42300941ec29..6448c5071117 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -9,6 +9,7 @@ #include <xen/events.h> #include <xen/interface/memory.h> +#include <asm/apic.h> #include <asm/cpu.h> #include <asm/smp.h> #include <asm/io_apic.h> @@ -242,15 +243,9 @@ static __init int xen_parse_no_vector_callback(char *arg) } early_param("xen_no_vector_callback", xen_parse_no_vector_callback); -bool __init xen_hvm_need_lapic(void) +static __init bool xen_x2apic_available(void) { - if (xen_pv_domain()) - return false; - if (!xen_hvm_domain()) - return false; - if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback) - return false; - return true; + return x2apic_supported(); } static __init void xen_hvm_guest_late_init(void) @@ -312,7 +307,7 @@ struct hypervisor_x86 x86_hyper_xen_hvm __initdata = { .detect = xen_platform_hvm, .type = X86_HYPER_XEN_HVM, .init.init_platform = xen_hvm_guest_init, - .init.x2apic_available = xen_x2apic_para_available, + .init.x2apic_available = xen_x2apic_available, .init.init_mem_mapping = xen_hvm_init_mem_mapping, .init.guest_late_init = xen_hvm_guest_late_init, .runtime.pin_vcpu = xen_pin_vcpu, diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 5004feb16783..d47c3d176ae4 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -1341,10 +1341,6 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_acpi_sleep_register(); - /* Avoid searching for BIOS MP tables */ - x86_init.mpparse.find_smp_config = x86_init_noop; - x86_init.mpparse.get_smp_config = x86_init_uint_noop; - xen_boot_params_init_edd(); #ifdef CONFIG_ACPI diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 6a8f3b53ab83..4a6019238ee7 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -148,28 +148,12 @@ int xen_smp_intr_init_pv(unsigned int cpu) return rc; } -static void __init xen_fill_possible_map(void) -{ - int i, rc; - - if (xen_initial_domain()) - return; - - for (i = 0; i < nr_cpu_ids; i++) { - rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) { - num_processors++; - set_cpu_possible(i, true); - } - } -} - -static void __init xen_filter_cpu_maps(void) +static void __init _get_smp_config(unsigned int early) { int i, rc; unsigned int subtract = 0; - if (!xen_initial_domain()) + if (early) return; num_processors = 0; @@ -210,7 +194,6 @@ static void __init xen_pv_smp_prepare_boot_cpu(void) * sure the old memory can be recycled. */ make_lowmem_page_readwrite(xen_initial_gdt); - xen_filter_cpu_maps(); xen_setup_vcpu_info_placement(); /* @@ -476,5 +459,8 @@ static const struct smp_ops xen_smp_ops __initconst = { void __init xen_smp_init(void) { smp_ops = xen_smp_ops; - xen_fill_possible_map(); + + /* Avoid searching for BIOS MP tables */ + x86_init.mpparse.find_smp_config = x86_init_noop; + x86_init.mpparse.get_smp_config = _get_smp_config; } |