Diffstat (limited to 'arch')
141 files changed, 4603 insertions, 8744 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 03bbfc312fe7..66a04f6f4775 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -2090,5 +2090,3 @@ source "drivers/firmware/Kconfig" if CRYPTO source "arch/arm/crypto/Kconfig" endif - -source "arch/arm/kvm/Kconfig" diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 1fc32b611f8a..e1d13d779e08 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -278,7 +278,6 @@ core-$(CONFIG_FPE_NWFPE) += arch/arm/nwfpe/ core-$(CONFIG_FPE_FASTFPE) += $(patsubst $(srctree)/%,%,$(wildcard $(srctree)/arch/arm/fastfpe/)) core-$(CONFIG_VFP) += arch/arm/vfp/ core-$(CONFIG_XEN) += arch/arm/xen/ -core-$(CONFIG_KVM_ARM_HOST) += arch/arm/kvm/ core-$(CONFIG_VDSO) += arch/arm/vdso/ # If we have a machine-specific directory, then include it in the build. diff --git a/arch/arm/configs/axm55xx_defconfig b/arch/arm/configs/axm55xx_defconfig index 6ea7dafa4c9e..46075216ee6d 100644 --- a/arch/arm/configs/axm55xx_defconfig +++ b/arch/arm/configs/axm55xx_defconfig @@ -236,5 +236,3 @@ CONFIG_CRYPTO_GCM=y CONFIG_CRYPTO_XCBC=y CONFIG_CRYPTO_SHA256=y # CONFIG_CRYPTO_ANSI_CPRNG is not set -CONFIG_VIRTUALIZATION=y -CONFIG_KVM=y diff --git a/arch/arm/include/asm/arch_gicv3.h b/arch/arm/include/asm/arch_gicv3.h index c815477b4303..413abfb42989 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -38,71 +38,6 @@ #define ICC_AP1R2 __ICC_AP1Rx(2) #define ICC_AP1R3 __ICC_AP1Rx(3) -#define ICC_HSRE __ACCESS_CP15(c12, 4, c9, 5) - -#define ICH_VSEIR __ACCESS_CP15(c12, 4, c9, 4) -#define ICH_HCR __ACCESS_CP15(c12, 4, c11, 0) -#define ICH_VTR __ACCESS_CP15(c12, 4, c11, 1) -#define ICH_MISR __ACCESS_CP15(c12, 4, c11, 2) -#define ICH_EISR __ACCESS_CP15(c12, 4, c11, 3) -#define ICH_ELRSR __ACCESS_CP15(c12, 4, c11, 5) -#define ICH_VMCR __ACCESS_CP15(c12, 4, c11, 7) - -#define __LR0(x) __ACCESS_CP15(c12, 4, c12, x) -#define __LR8(x) __ACCESS_CP15(c12, 4, c13, x) - -#define ICH_LR0 __LR0(0) -#define ICH_LR1 __LR0(1) -#define ICH_LR2 __LR0(2) -#define ICH_LR3 __LR0(3) -#define ICH_LR4 __LR0(4) -#define ICH_LR5 __LR0(5) -#define ICH_LR6 __LR0(6) -#define ICH_LR7 __LR0(7) -#define ICH_LR8 __LR8(0) -#define ICH_LR9 __LR8(1) -#define ICH_LR10 __LR8(2) -#define ICH_LR11 __LR8(3) -#define ICH_LR12 __LR8(4) -#define ICH_LR13 __LR8(5) -#define ICH_LR14 __LR8(6) -#define ICH_LR15 __LR8(7) - -/* LR top half */ -#define __LRC0(x) __ACCESS_CP15(c12, 4, c14, x) -#define __LRC8(x) __ACCESS_CP15(c12, 4, c15, x) - -#define ICH_LRC0 __LRC0(0) -#define ICH_LRC1 __LRC0(1) -#define ICH_LRC2 __LRC0(2) -#define ICH_LRC3 __LRC0(3) -#define ICH_LRC4 __LRC0(4) -#define ICH_LRC5 __LRC0(5) -#define ICH_LRC6 __LRC0(6) -#define ICH_LRC7 __LRC0(7) -#define ICH_LRC8 __LRC8(0) -#define ICH_LRC9 __LRC8(1) -#define ICH_LRC10 __LRC8(2) -#define ICH_LRC11 __LRC8(3) -#define ICH_LRC12 __LRC8(4) -#define ICH_LRC13 __LRC8(5) -#define ICH_LRC14 __LRC8(6) -#define ICH_LRC15 __LRC8(7) - -#define __ICH_AP0Rx(x) __ACCESS_CP15(c12, 4, c8, x) -#define ICH_AP0R0 __ICH_AP0Rx(0) -#define ICH_AP0R1 __ICH_AP0Rx(1) -#define ICH_AP0R2 __ICH_AP0Rx(2) -#define ICH_AP0R3 __ICH_AP0Rx(3) - -#define __ICH_AP1Rx(x) __ACCESS_CP15(c12, 4, c9, x) -#define ICH_AP1R0 __ICH_AP1Rx(0) -#define ICH_AP1R1 __ICH_AP1Rx(1) -#define ICH_AP1R2 __ICH_AP1Rx(2) -#define ICH_AP1R3 __ICH_AP1Rx(3) - -/* A32-to-A64 mappings used by VGIC save/restore */ - #define CPUIF_MAP(a32, a64) \ static inline void write_ ## a64(u32 val) \ { \ @@ -113,21 +48,6 @@ static inline u32 read_ ## a64(void) \ return read_sysreg(a32); \ } \ 
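As an aside (not part of the patch): the `CPUIF_MAP()` helper kept at the end of the hunk above generates 32-bit CP15 accessors under the AArch64 register name, which is what lets the GICv3 code be shared with arm64. A minimal standalone sketch of that token-pasting pattern, with a plain variable and stub `read_sysreg()`/`write_sysreg()` macros standing in for the real CP15 accessors so it can be compiled anywhere:

```c
/*
 * Illustration only: the CPUIF_MAP()/read_gicreg() pattern from
 * arch_gicv3.h, with the CP15 accessors replaced by a plain variable
 * so the expansion can be compiled and run on any host.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;

static u32 fake_icc_pmr;				/* stand-in for the real ICC_PMR register */
#define write_sysreg(v, r)	(fake_icc_pmr = (v))	/* real version: MCR to encoding "r" */
#define read_sysreg(r)		(fake_icc_pmr)		/* real version: MRC from encoding "r" */

#define CPUIF_MAP(a32, a64)				\
static inline void write_ ## a64(u32 val)		\
{							\
	write_sysreg(val, a32);				\
}							\
static inline u32 read_ ## a64(void)			\
{							\
	return read_sysreg(a32);			\
}

CPUIF_MAP(ICC_PMR, ICC_PMR_EL1)	/* emits write_ICC_PMR_EL1() and read_ICC_PMR_EL1() */

#define read_gicreg(r)		read_##r()
#define write_gicreg(v, r)	write_##r(v)

int main(void)
{
	write_gicreg(0xf0, ICC_PMR_EL1);	/* resolves to write_ICC_PMR_EL1(0xf0) */
	printf("ICC_PMR_EL1 = %#x\n", (unsigned)read_gicreg(ICC_PMR_EL1));
	return 0;
}
```

The `read_gicreg()`/`write_gicreg()` wrappers at the end of the same hunk simply forward to whichever accessor pair `CPUIF_MAP()` generated for that register name.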
-#define CPUIF_MAP_LO_HI(a32lo, a32hi, a64) \ -static inline void write_ ## a64(u64 val) \ -{ \ - write_sysreg(lower_32_bits(val), a32lo);\ - write_sysreg(upper_32_bits(val), a32hi);\ -} \ -static inline u64 read_ ## a64(void) \ -{ \ - u64 val = read_sysreg(a32lo); \ - \ - val |= (u64)read_sysreg(a32hi) << 32; \ - \ - return val; \ -} - CPUIF_MAP(ICC_PMR, ICC_PMR_EL1) CPUIF_MAP(ICC_AP0R0, ICC_AP0R0_EL1) CPUIF_MAP(ICC_AP0R1, ICC_AP0R1_EL1) @@ -138,40 +58,6 @@ CPUIF_MAP(ICC_AP1R1, ICC_AP1R1_EL1) CPUIF_MAP(ICC_AP1R2, ICC_AP1R2_EL1) CPUIF_MAP(ICC_AP1R3, ICC_AP1R3_EL1) -CPUIF_MAP(ICH_HCR, ICH_HCR_EL2) -CPUIF_MAP(ICH_VTR, ICH_VTR_EL2) -CPUIF_MAP(ICH_MISR, ICH_MISR_EL2) -CPUIF_MAP(ICH_EISR, ICH_EISR_EL2) -CPUIF_MAP(ICH_ELRSR, ICH_ELRSR_EL2) -CPUIF_MAP(ICH_VMCR, ICH_VMCR_EL2) -CPUIF_MAP(ICH_AP0R3, ICH_AP0R3_EL2) -CPUIF_MAP(ICH_AP0R2, ICH_AP0R2_EL2) -CPUIF_MAP(ICH_AP0R1, ICH_AP0R1_EL2) -CPUIF_MAP(ICH_AP0R0, ICH_AP0R0_EL2) -CPUIF_MAP(ICH_AP1R3, ICH_AP1R3_EL2) -CPUIF_MAP(ICH_AP1R2, ICH_AP1R2_EL2) -CPUIF_MAP(ICH_AP1R1, ICH_AP1R1_EL2) -CPUIF_MAP(ICH_AP1R0, ICH_AP1R0_EL2) -CPUIF_MAP(ICC_HSRE, ICC_SRE_EL2) -CPUIF_MAP(ICC_SRE, ICC_SRE_EL1) - -CPUIF_MAP_LO_HI(ICH_LR15, ICH_LRC15, ICH_LR15_EL2) -CPUIF_MAP_LO_HI(ICH_LR14, ICH_LRC14, ICH_LR14_EL2) -CPUIF_MAP_LO_HI(ICH_LR13, ICH_LRC13, ICH_LR13_EL2) -CPUIF_MAP_LO_HI(ICH_LR12, ICH_LRC12, ICH_LR12_EL2) -CPUIF_MAP_LO_HI(ICH_LR11, ICH_LRC11, ICH_LR11_EL2) -CPUIF_MAP_LO_HI(ICH_LR10, ICH_LRC10, ICH_LR10_EL2) -CPUIF_MAP_LO_HI(ICH_LR9, ICH_LRC9, ICH_LR9_EL2) -CPUIF_MAP_LO_HI(ICH_LR8, ICH_LRC8, ICH_LR8_EL2) -CPUIF_MAP_LO_HI(ICH_LR7, ICH_LRC7, ICH_LR7_EL2) -CPUIF_MAP_LO_HI(ICH_LR6, ICH_LRC6, ICH_LR6_EL2) -CPUIF_MAP_LO_HI(ICH_LR5, ICH_LRC5, ICH_LR5_EL2) -CPUIF_MAP_LO_HI(ICH_LR4, ICH_LRC4, ICH_LR4_EL2) -CPUIF_MAP_LO_HI(ICH_LR3, ICH_LRC3, ICH_LR3_EL2) -CPUIF_MAP_LO_HI(ICH_LR2, ICH_LRC2, ICH_LR2_EL2) -CPUIF_MAP_LO_HI(ICH_LR1, ICH_LRC1, ICH_LR1_EL2) -CPUIF_MAP_LO_HI(ICH_LR0, ICH_LRC0, ICH_LR0_EL2) - #define read_gicreg(r) read_##r() #define write_gicreg(v, r) write_##r(v) diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h deleted file mode 100644 index 9c04bd810d07..000000000000 --- a/arch/arm/include/asm/kvm_arm.h +++ /dev/null @@ -1,239 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - */ - -#ifndef __ARM_KVM_ARM_H__ -#define __ARM_KVM_ARM_H__ - -#include <linux/const.h> -#include <linux/types.h> - -/* Hyp Configuration Register (HCR) bits */ -#define HCR_TGE (1 << 27) -#define HCR_TVM (1 << 26) -#define HCR_TTLB (1 << 25) -#define HCR_TPU (1 << 24) -#define HCR_TPC (1 << 23) -#define HCR_TSW (1 << 22) -#define HCR_TAC (1 << 21) -#define HCR_TIDCP (1 << 20) -#define HCR_TSC (1 << 19) -#define HCR_TID3 (1 << 18) -#define HCR_TID2 (1 << 17) -#define HCR_TID1 (1 << 16) -#define HCR_TID0 (1 << 15) -#define HCR_TWE (1 << 14) -#define HCR_TWI (1 << 13) -#define HCR_DC (1 << 12) -#define HCR_BSU (3 << 10) -#define HCR_BSU_IS (1 << 10) -#define HCR_FB (1 << 9) -#define HCR_VA (1 << 8) -#define HCR_VI (1 << 7) -#define HCR_VF (1 << 6) -#define HCR_AMO (1 << 5) -#define HCR_IMO (1 << 4) -#define HCR_FMO (1 << 3) -#define HCR_PTW (1 << 2) -#define HCR_SWIO (1 << 1) -#define HCR_VM 1 - -/* - * The bits we set in HCR: - * TAC: Trap ACTLR - * TSC: Trap SMC - * TVM: Trap VM ops (until MMU and caches are on) - * TSW: Trap cache operations by set/way - * TWI: Trap WFI - * TWE: Trap WFE - * TIDCP: Trap L2CTLR/L2ECTLR - * 
BSU_IS: Upgrade barriers to the inner shareable domain - * FB: Force broadcast of all maintainance operations - * AMO: Override CPSR.A and enable signaling with VA - * IMO: Override CPSR.I and enable signaling with VI - * FMO: Override CPSR.F and enable signaling with VF - * SWIO: Turn set/way invalidates into set/way clean+invalidate - */ -#define HCR_GUEST_MASK (HCR_TSC | HCR_TSW | HCR_TWI | HCR_VM | HCR_BSU_IS | \ - HCR_FB | HCR_TAC | HCR_AMO | HCR_IMO | HCR_FMO | \ - HCR_TVM | HCR_TWE | HCR_SWIO | HCR_TIDCP) - -/* System Control Register (SCTLR) bits */ -#define SCTLR_TE (1 << 30) -#define SCTLR_EE (1 << 25) -#define SCTLR_V (1 << 13) - -/* Hyp System Control Register (HSCTLR) bits */ -#define HSCTLR_TE (1 << 30) -#define HSCTLR_EE (1 << 25) -#define HSCTLR_FI (1 << 21) -#define HSCTLR_WXN (1 << 19) -#define HSCTLR_I (1 << 12) -#define HSCTLR_C (1 << 2) -#define HSCTLR_A (1 << 1) -#define HSCTLR_M 1 -#define HSCTLR_MASK (HSCTLR_M | HSCTLR_A | HSCTLR_C | HSCTLR_I | \ - HSCTLR_WXN | HSCTLR_FI | HSCTLR_EE | HSCTLR_TE) - -/* TTBCR and HTCR Registers bits */ -#define TTBCR_EAE (1 << 31) -#define TTBCR_IMP (1 << 30) -#define TTBCR_SH1 (3 << 28) -#define TTBCR_ORGN1 (3 << 26) -#define TTBCR_IRGN1 (3 << 24) -#define TTBCR_EPD1 (1 << 23) -#define TTBCR_A1 (1 << 22) -#define TTBCR_T1SZ (7 << 16) -#define TTBCR_SH0 (3 << 12) -#define TTBCR_ORGN0 (3 << 10) -#define TTBCR_IRGN0 (3 << 8) -#define TTBCR_EPD0 (1 << 7) -#define TTBCR_T0SZ (7 << 0) -#define HTCR_MASK (TTBCR_T0SZ | TTBCR_IRGN0 | TTBCR_ORGN0 | TTBCR_SH0) - -/* Hyp System Trap Register */ -#define HSTR_T(x) (1 << x) -#define HSTR_TTEE (1 << 16) -#define HSTR_TJDBX (1 << 17) - -/* Hyp Coprocessor Trap Register */ -#define HCPTR_TCP(x) (1 << x) -#define HCPTR_TCP_MASK (0x3fff) -#define HCPTR_TASE (1 << 15) -#define HCPTR_TTA (1 << 20) -#define HCPTR_TCPAC (1 << 31) - -/* Hyp Debug Configuration Register bits */ -#define HDCR_TDRA (1 << 11) -#define HDCR_TDOSA (1 << 10) -#define HDCR_TDA (1 << 9) -#define HDCR_TDE (1 << 8) -#define HDCR_HPME (1 << 7) -#define HDCR_TPM (1 << 6) -#define HDCR_TPMCR (1 << 5) -#define HDCR_HPMN_MASK (0x1F) - -/* - * The architecture supports 40-bit IPA as input to the 2nd stage translations - * and PTRS_PER_S2_PGD becomes 1024, because each entry covers 1GB of address - * space. 
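As an aside (not part of the patch): the comment above is easier to follow with the numbers worked out. A minimal sketch, assuming the LPAE `pgd_t` is an 8-byte descriptor, of the sizing implied by the `KVM_PHYS_SHIFT` and `PTRS_PER_S2_PGD` constants defined just below:

```c
/*
 * Illustration only: the stage-2 top-level table sizing implied by
 * KVM_PHYS_SHIFT and PTRS_PER_S2_PGD (each entry maps 2^30 bytes = 1 GiB,
 * so a 40-bit IPA space needs 2^(40-30) = 1024 entries).
 * pgd_t is assumed to be an 8-byte LPAE descriptor.
 */
#include <stdio.h>

#define KVM_PHYS_SHIFT	40				/* 40-bit IPA: 1 TiB of guest physical space */
#define PTRS_PER_S2_PGD	(1ULL << (KVM_PHYS_SHIFT - 30))	/* 1024 entries, 1 GiB each */

int main(void)
{
	unsigned long long entries = PTRS_PER_S2_PGD;
	unsigned long long pgd_bytes = entries * 8;	/* assumed sizeof(pgd_t) == 8 */

	printf("%llu entries -> %llu-byte stage-2 PGD (%llu x 4 KiB pages)\n",
	       entries, pgd_bytes, pgd_bytes / 4096);	/* 1024 -> 8192 bytes (2 pages) */
	return 0;
}
```

The same product shows up later in the removed kvm_mmu.h as `stage2_pgd_size(kvm)`, defined as `PTRS_PER_S2_PGD * sizeof(pgd_t)`.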
- */ -#define KVM_PHYS_SHIFT (40) - -#define PTRS_PER_S2_PGD (_AC(1, ULL) << (KVM_PHYS_SHIFT - 30)) - -/* Virtualization Translation Control Register (VTCR) bits */ -#define VTCR_SH0 (3 << 12) -#define VTCR_ORGN0 (3 << 10) -#define VTCR_IRGN0 (3 << 8) -#define VTCR_SL0 (3 << 6) -#define VTCR_S (1 << 4) -#define VTCR_T0SZ (0xf) -#define VTCR_MASK (VTCR_SH0 | VTCR_ORGN0 | VTCR_IRGN0 | VTCR_SL0 | \ - VTCR_S | VTCR_T0SZ) -#define VTCR_HTCR_SH (VTCR_SH0 | VTCR_ORGN0 | VTCR_IRGN0) -#define VTCR_SL_L2 (0 << 6) /* Starting-level: 2 */ -#define VTCR_SL_L1 (1 << 6) /* Starting-level: 1 */ -#define KVM_VTCR_SL0 VTCR_SL_L1 -/* stage-2 input address range defined as 2^(32-T0SZ) */ -#define KVM_T0SZ (32 - KVM_PHYS_SHIFT) -#define KVM_VTCR_T0SZ (KVM_T0SZ & VTCR_T0SZ) -#define KVM_VTCR_S ((KVM_VTCR_T0SZ << 1) & VTCR_S) - -/* Virtualization Translation Table Base Register (VTTBR) bits */ -#if KVM_VTCR_SL0 == VTCR_SL_L2 /* see ARM DDI 0406C: B4-1720 */ -#define VTTBR_X (14 - KVM_T0SZ) -#else -#define VTTBR_X (5 - KVM_T0SZ) -#endif -#define VTTBR_CNP_BIT _AC(1, UL) -#define VTTBR_BADDR_MASK (((_AC(1, ULL) << (40 - VTTBR_X)) - 1) << VTTBR_X) -#define VTTBR_VMID_SHIFT _AC(48, ULL) -#define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT) - -/* Hyp Syndrome Register (HSR) bits */ -#define HSR_EC_SHIFT (26) -#define HSR_EC (_AC(0x3f, UL) << HSR_EC_SHIFT) -#define HSR_IL (_AC(1, UL) << 25) -#define HSR_ISS (HSR_IL - 1) -#define HSR_ISV_SHIFT (24) -#define HSR_ISV (_AC(1, UL) << HSR_ISV_SHIFT) -#define HSR_SRT_SHIFT (16) -#define HSR_SRT_MASK (0xf << HSR_SRT_SHIFT) -#define HSR_CM (1 << 8) -#define HSR_FSC (0x3f) -#define HSR_FSC_TYPE (0x3c) -#define HSR_SSE (1 << 21) -#define HSR_WNR (1 << 6) -#define HSR_CV_SHIFT (24) -#define HSR_CV (_AC(1, UL) << HSR_CV_SHIFT) -#define HSR_COND_SHIFT (20) -#define HSR_COND (_AC(0xf, UL) << HSR_COND_SHIFT) - -#define FSC_FAULT (0x04) -#define FSC_ACCESS (0x08) -#define FSC_PERM (0x0c) -#define FSC_SEA (0x10) -#define FSC_SEA_TTW0 (0x14) -#define FSC_SEA_TTW1 (0x15) -#define FSC_SEA_TTW2 (0x16) -#define FSC_SEA_TTW3 (0x17) -#define FSC_SECC (0x18) -#define FSC_SECC_TTW0 (0x1c) -#define FSC_SECC_TTW1 (0x1d) -#define FSC_SECC_TTW2 (0x1e) -#define FSC_SECC_TTW3 (0x1f) - -/* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ -#define HPFAR_MASK (~0xf) - -#define HSR_EC_UNKNOWN (0x00) -#define HSR_EC_WFI (0x01) -#define HSR_EC_CP15_32 (0x03) -#define HSR_EC_CP15_64 (0x04) -#define HSR_EC_CP14_MR (0x05) -#define HSR_EC_CP14_LS (0x06) -#define HSR_EC_CP_0_13 (0x07) -#define HSR_EC_CP10_ID (0x08) -#define HSR_EC_JAZELLE (0x09) -#define HSR_EC_BXJ (0x0A) -#define HSR_EC_CP14_64 (0x0C) -#define HSR_EC_SVC_HYP (0x11) -#define HSR_EC_HVC (0x12) -#define HSR_EC_SMC (0x13) -#define HSR_EC_IABT (0x20) -#define HSR_EC_IABT_HYP (0x21) -#define HSR_EC_DABT (0x24) -#define HSR_EC_DABT_HYP (0x25) -#define HSR_EC_MAX (0x3f) - -#define HSR_WFI_IS_WFE (_AC(1, UL) << 0) - -#define HSR_HVC_IMM_MASK ((_AC(1, UL) << 16) - 1) - -#define HSR_DABT_S1PTW (_AC(1, UL) << 7) -#define HSR_DABT_CM (_AC(1, UL) << 8) - -#define kvm_arm_exception_type \ - {0, "RESET" }, \ - {1, "UNDEFINED" }, \ - {2, "SOFTWARE" }, \ - {3, "PREF_ABORT" }, \ - {4, "DATA_ABORT" }, \ - {5, "IRQ" }, \ - {6, "FIQ" }, \ - {7, "HVC" } - -#define HSRECN(x) { HSR_EC_##x, #x } - -#define kvm_arm_exception_class \ - HSRECN(UNKNOWN), HSRECN(WFI), HSRECN(CP15_32), HSRECN(CP15_64), \ - HSRECN(CP14_MR), HSRECN(CP14_LS), HSRECN(CP_0_13), HSRECN(CP10_ID), \ - HSRECN(JAZELLE), HSRECN(BXJ), HSRECN(CP14_64), HSRECN(SVC_HYP), \ - 
HSRECN(HVC), HSRECN(SMC), HSRECN(IABT), HSRECN(IABT_HYP), \ - HSRECN(DABT), HSRECN(DABT_HYP) - - -#endif /* __ARM_KVM_ARM_H__ */ diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h deleted file mode 100644 index f615830f9f57..000000000000 --- a/arch/arm/include/asm/kvm_asm.h +++ /dev/null @@ -1,77 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - */ - -#ifndef __ARM_KVM_ASM_H__ -#define __ARM_KVM_ASM_H__ - -#include <asm/virt.h> - -#define ARM_EXIT_WITH_ABORT_BIT 31 -#define ARM_EXCEPTION_CODE(x) ((x) & ~(1U << ARM_EXIT_WITH_ABORT_BIT)) -#define ARM_EXCEPTION_IS_TRAP(x) \ - (ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_PREF_ABORT || \ - ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_DATA_ABORT || \ - ARM_EXCEPTION_CODE((x)) == ARM_EXCEPTION_HVC) -#define ARM_ABORT_PENDING(x) !!((x) & (1U << ARM_EXIT_WITH_ABORT_BIT)) - -#define ARM_EXCEPTION_RESET 0 -#define ARM_EXCEPTION_UNDEFINED 1 -#define ARM_EXCEPTION_SOFTWARE 2 -#define ARM_EXCEPTION_PREF_ABORT 3 -#define ARM_EXCEPTION_DATA_ABORT 4 -#define ARM_EXCEPTION_IRQ 5 -#define ARM_EXCEPTION_FIQ 6 -#define ARM_EXCEPTION_HVC 7 -#define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR -/* - * The rr_lo_hi macro swaps a pair of registers depending on - * current endianness. It is used in conjunction with ldrd and strd - * instructions that load/store a 64-bit value from/to memory to/from - * a pair of registers which are used with the mrrc and mcrr instructions. - * If used with the ldrd/strd instructions, the a1 parameter is the first - * source/destination register and the a2 parameter is the second - * source/destination register. Note that the ldrd/strd instructions - * already swap the bytes within the words correctly according to the - * endianness setting, but the order of the registers need to be effectively - * swapped when used with the mrrc/mcrr instructions. 
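As an aside (not part of the patch): the explanation above is about register order, not byte order. A portable sketch (illustration only; `cntvoff` is just a made-up value) of why the first register of an `ldrd`-style pair holds the low half on a little-endian kernel but the high half under BE8, which is what `rr_lo_hi()` compensates for when the pair is handed to `mcrr`/`mrrc`:

```c
/*
 * Illustration only: ldrd/strd fill a register pair in address order, while
 * mcrr/mrrc want an explicit (low, high) pair. Which word sits at the lower
 * address depends on data endianness, hence the rr_lo_hi() swap on BE8.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint64_t cntvoff = 0x1122334455667788ULL;	/* made-up 64-bit CP15 value */
	uint32_t pair[2];				/* what ldrd produces: word at the lower
							 * address in pair[0], next word in pair[1] */

	memcpy(pair, &cntvoff, sizeof(pair));

	/*
	 * Little-endian: pair[0] == 0x55667788 (low half), usable as the mcrr
	 * "low" register directly.  Big-endian (BE8): pair[0] == 0x11223344
	 * (high half), so the register names must be swapped -- rr_lo_hi().
	 */
	printf("first word = %#x, second word = %#x\n",
	       (unsigned)pair[0], (unsigned)pair[1]);
	return 0;
}
```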
- */ -#ifdef CONFIG_CPU_ENDIAN_BE8 -#define rr_lo_hi(a1, a2) a2, a1 -#else -#define rr_lo_hi(a1, a2) a1, a2 -#endif - -#define kvm_ksym_ref(kva) (kva) - -#ifndef __ASSEMBLY__ -struct kvm; -struct kvm_vcpu; - -extern char __kvm_hyp_init[]; -extern char __kvm_hyp_init_end[]; - -extern void __kvm_flush_vm_context(void); -extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa); -extern void __kvm_tlb_flush_vmid(struct kvm *kvm); -extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu); - -extern void __kvm_timer_set_cntvoff(u32 cntvoff_low, u32 cntvoff_high); - -/* no VHE on 32-bit :( */ -static inline int kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) { BUG(); return 0; } - -extern int __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu); - -extern void __init_stage2_translation(void); - -extern u64 __vgic_v3_get_ich_vtr_el2(void); -extern u64 __vgic_v3_read_vmcr(void); -extern void __vgic_v3_write_vmcr(u32 vmcr); -extern void __vgic_v3_init_lrs(void); - -#endif - -#endif /* __ARM_KVM_ASM_H__ */ diff --git a/arch/arm/include/asm/kvm_coproc.h b/arch/arm/include/asm/kvm_coproc.h deleted file mode 100644 index a23826117dd6..000000000000 --- a/arch/arm/include/asm/kvm_coproc.h +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012 Rusty Russell IBM Corporation - */ - -#ifndef __ARM_KVM_COPROC_H__ -#define __ARM_KVM_COPROC_H__ -#include <linux/kvm_host.h> - -void kvm_reset_coprocs(struct kvm_vcpu *vcpu); - -struct kvm_coproc_target_table { - unsigned target; - const struct coproc_reg *table; - size_t num; -}; -void kvm_register_target_coproc_table(struct kvm_coproc_target_table *table); - -int kvm_handle_cp10_id(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_cp_0_13_access(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_cp14_32(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_cp14_64(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run); - -unsigned long kvm_arm_num_guest_msrs(struct kvm_vcpu *vcpu); -int kvm_arm_copy_msrindices(struct kvm_vcpu *vcpu, u64 __user *uindices); -void kvm_coproc_table_init(void); - -struct kvm_one_reg; -int kvm_arm_copy_coproc_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); -int kvm_arm_coproc_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); -int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); -unsigned long kvm_arm_num_coproc_regs(struct kvm_vcpu *vcpu); -#endif /* __ARM_KVM_COPROC_H__ */ diff --git a/arch/arm/include/asm/kvm_emulate.h b/arch/arm/include/asm/kvm_emulate.h deleted file mode 100644 index 3944305e81df..000000000000 --- a/arch/arm/include/asm/kvm_emulate.h +++ /dev/null @@ -1,372 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - */ - -#ifndef __ARM_KVM_EMULATE_H__ -#define __ARM_KVM_EMULATE_H__ - -#include <linux/kvm_host.h> -#include <asm/kvm_asm.h> -#include <asm/kvm_arm.h> -#include <asm/cputype.h> - -/* arm64 compatibility macros */ -#define PSR_AA32_MODE_FIQ FIQ_MODE -#define PSR_AA32_MODE_SVC SVC_MODE -#define PSR_AA32_MODE_ABT ABT_MODE -#define PSR_AA32_MODE_UND UND_MODE -#define PSR_AA32_T_BIT PSR_T_BIT -#define PSR_AA32_F_BIT PSR_F_BIT -#define PSR_AA32_I_BIT PSR_I_BIT 
-#define PSR_AA32_A_BIT PSR_A_BIT -#define PSR_AA32_E_BIT PSR_E_BIT -#define PSR_AA32_IT_MASK PSR_IT_MASK -#define PSR_AA32_GE_MASK 0x000f0000 -#define PSR_AA32_DIT_BIT 0x00200000 -#define PSR_AA32_PAN_BIT 0x00400000 -#define PSR_AA32_SSBS_BIT 0x00800000 -#define PSR_AA32_Q_BIT PSR_Q_BIT -#define PSR_AA32_V_BIT PSR_V_BIT -#define PSR_AA32_C_BIT PSR_C_BIT -#define PSR_AA32_Z_BIT PSR_Z_BIT -#define PSR_AA32_N_BIT PSR_N_BIT - -unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num); - -static inline unsigned long *vcpu_reg32(struct kvm_vcpu *vcpu, u8 reg_num) -{ - return vcpu_reg(vcpu, reg_num); -} - -unsigned long *__vcpu_spsr(struct kvm_vcpu *vcpu); - -static inline unsigned long vpcu_read_spsr(struct kvm_vcpu *vcpu) -{ - return *__vcpu_spsr(vcpu); -} - -static inline void vcpu_write_spsr(struct kvm_vcpu *vcpu, unsigned long v) -{ - *__vcpu_spsr(vcpu) = v; -} - -static inline unsigned long host_spsr_to_spsr32(unsigned long spsr) -{ - return spsr; -} - -static inline unsigned long vcpu_get_reg(struct kvm_vcpu *vcpu, - u8 reg_num) -{ - return *vcpu_reg(vcpu, reg_num); -} - -static inline void vcpu_set_reg(struct kvm_vcpu *vcpu, u8 reg_num, - unsigned long val) -{ - *vcpu_reg(vcpu, reg_num) = val; -} - -bool kvm_condition_valid32(const struct kvm_vcpu *vcpu); -void kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr); -void kvm_inject_undef32(struct kvm_vcpu *vcpu); -void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr); -void kvm_inject_pabt32(struct kvm_vcpu *vcpu, unsigned long addr); -void kvm_inject_vabt(struct kvm_vcpu *vcpu); - -static inline void kvm_inject_undefined(struct kvm_vcpu *vcpu) -{ - kvm_inject_undef32(vcpu); -} - -static inline void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr) -{ - kvm_inject_dabt32(vcpu, addr); -} - -static inline void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr) -{ - kvm_inject_pabt32(vcpu, addr); -} - -static inline bool kvm_condition_valid(const struct kvm_vcpu *vcpu) -{ - return kvm_condition_valid32(vcpu); -} - -static inline void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr) -{ - kvm_skip_instr32(vcpu, is_wide_instr); -} - -static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) -{ - vcpu->arch.hcr = HCR_GUEST_MASK; -} - -static inline unsigned long *vcpu_hcr(const struct kvm_vcpu *vcpu) -{ - return (unsigned long *)&vcpu->arch.hcr; -} - -static inline void vcpu_clear_wfx_traps(struct kvm_vcpu *vcpu) -{ - vcpu->arch.hcr &= ~HCR_TWE; -} - -static inline void vcpu_set_wfx_traps(struct kvm_vcpu *vcpu) -{ - vcpu->arch.hcr |= HCR_TWE; -} - -static inline bool vcpu_mode_is_32bit(const struct kvm_vcpu *vcpu) -{ - return true; -} - -static inline unsigned long *vcpu_pc(struct kvm_vcpu *vcpu) -{ - return &vcpu->arch.ctxt.gp_regs.usr_regs.ARM_pc; -} - -static inline unsigned long *vcpu_cpsr(const struct kvm_vcpu *vcpu) -{ - return (unsigned long *)&vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr; -} - -static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu) -{ - *vcpu_cpsr(vcpu) |= PSR_T_BIT; -} - -static inline bool mode_has_spsr(struct kvm_vcpu *vcpu) -{ - unsigned long cpsr_mode = vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr & MODE_MASK; - return (cpsr_mode > USR_MODE && cpsr_mode < SYSTEM_MODE); -} - -static inline bool vcpu_mode_priv(struct kvm_vcpu *vcpu) -{ - unsigned long cpsr_mode = vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr & MODE_MASK; - return cpsr_mode > USR_MODE; -} - -static inline u32 kvm_vcpu_get_hsr(const struct kvm_vcpu *vcpu) -{ - return vcpu->arch.fault.hsr; -} - -static inline int 
kvm_vcpu_get_condition(const struct kvm_vcpu *vcpu) -{ - u32 hsr = kvm_vcpu_get_hsr(vcpu); - - if (hsr & HSR_CV) - return (hsr & HSR_COND) >> HSR_COND_SHIFT; - - return -1; -} - -static inline unsigned long kvm_vcpu_get_hfar(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.fault.hxfar; -} - -static inline phys_addr_t kvm_vcpu_get_fault_ipa(struct kvm_vcpu *vcpu) -{ - return ((phys_addr_t)vcpu->arch.fault.hpfar & HPFAR_MASK) << 8; -} - -static inline bool kvm_vcpu_dabt_isvalid(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_get_hsr(vcpu) & HSR_ISV; -} - -static inline unsigned long kvm_vcpu_dabt_iss_nisv_sanitized(const struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_get_hsr(vcpu) & (HSR_CM | HSR_WNR | HSR_FSC); -} - -static inline bool kvm_vcpu_dabt_iswrite(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_get_hsr(vcpu) & HSR_WNR; -} - -static inline bool kvm_vcpu_dabt_issext(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_get_hsr(vcpu) & HSR_SSE; -} - -static inline bool kvm_vcpu_dabt_issf(const struct kvm_vcpu *vcpu) -{ - return false; -} - -static inline int kvm_vcpu_dabt_get_rd(struct kvm_vcpu *vcpu) -{ - return (kvm_vcpu_get_hsr(vcpu) & HSR_SRT_MASK) >> HSR_SRT_SHIFT; -} - -static inline bool kvm_vcpu_dabt_iss1tw(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_get_hsr(vcpu) & HSR_DABT_S1PTW; -} - -static inline bool kvm_vcpu_dabt_is_cm(struct kvm_vcpu *vcpu) -{ - return !!(kvm_vcpu_get_hsr(vcpu) & HSR_DABT_CM); -} - -/* Get Access Size from a data abort */ -static inline unsigned int kvm_vcpu_dabt_get_as(struct kvm_vcpu *vcpu) -{ - switch ((kvm_vcpu_get_hsr(vcpu) >> 22) & 0x3) { - case 0: - return 1; - case 1: - return 2; - case 2: - return 4; - default: - kvm_err("Hardware is weird: SAS 0b11 is reserved\n"); - return 4; - } -} - -/* This one is not specific to Data Abort */ -static inline bool kvm_vcpu_trap_il_is32bit(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_get_hsr(vcpu) & HSR_IL; -} - -static inline u8 kvm_vcpu_trap_get_class(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_get_hsr(vcpu) >> HSR_EC_SHIFT; -} - -static inline bool kvm_vcpu_trap_is_iabt(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_trap_get_class(vcpu) == HSR_EC_IABT; -} - -static inline u8 kvm_vcpu_trap_get_fault(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_get_hsr(vcpu) & HSR_FSC; -} - -static inline u8 kvm_vcpu_trap_get_fault_type(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_get_hsr(vcpu) & HSR_FSC_TYPE; -} - -static inline bool kvm_vcpu_dabt_isextabt(struct kvm_vcpu *vcpu) -{ - switch (kvm_vcpu_trap_get_fault(vcpu)) { - case FSC_SEA: - case FSC_SEA_TTW0: - case FSC_SEA_TTW1: - case FSC_SEA_TTW2: - case FSC_SEA_TTW3: - case FSC_SECC: - case FSC_SECC_TTW0: - case FSC_SECC_TTW1: - case FSC_SECC_TTW2: - case FSC_SECC_TTW3: - return true; - default: - return false; - } -} - -static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) -{ - if (kvm_vcpu_trap_is_iabt(vcpu)) - return false; - - return kvm_vcpu_dabt_iswrite(vcpu); -} - -static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK; -} - -static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) -{ - return vcpu_cp15(vcpu, c0_MPIDR) & MPIDR_HWID_BITMASK; -} - -static inline bool kvm_arm_get_vcpu_workaround_2_flag(struct kvm_vcpu *vcpu) -{ - return false; -} - -static inline void kvm_arm_set_vcpu_workaround_2_flag(struct kvm_vcpu *vcpu, - bool flag) -{ -} - -static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu) -{ - *vcpu_cpsr(vcpu) |= PSR_E_BIT; -} - -static inline bool kvm_vcpu_is_be(struct kvm_vcpu *vcpu) -{ - 
return !!(*vcpu_cpsr(vcpu) & PSR_E_BIT); -} - -static inline unsigned long vcpu_data_guest_to_host(struct kvm_vcpu *vcpu, - unsigned long data, - unsigned int len) -{ - if (kvm_vcpu_is_be(vcpu)) { - switch (len) { - case 1: - return data & 0xff; - case 2: - return be16_to_cpu(data & 0xffff); - default: - return be32_to_cpu(data); - } - } else { - switch (len) { - case 1: - return data & 0xff; - case 2: - return le16_to_cpu(data & 0xffff); - default: - return le32_to_cpu(data); - } - } -} - -static inline unsigned long vcpu_data_host_to_guest(struct kvm_vcpu *vcpu, - unsigned long data, - unsigned int len) -{ - if (kvm_vcpu_is_be(vcpu)) { - switch (len) { - case 1: - return data & 0xff; - case 2: - return cpu_to_be16(data & 0xffff); - default: - return cpu_to_be32(data); - } - } else { - switch (len) { - case 1: - return data & 0xff; - case 2: - return cpu_to_le16(data & 0xffff); - default: - return cpu_to_le32(data); - } - } -} - -static inline void vcpu_ptrauth_setup_lazy(struct kvm_vcpu *vcpu) {} - -#endif /* __ARM_KVM_EMULATE_H__ */ diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h deleted file mode 100644 index a827b4d60d38..000000000000 --- a/arch/arm/include/asm/kvm_host.h +++ /dev/null @@ -1,456 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - */ - -#ifndef __ARM_KVM_HOST_H__ -#define __ARM_KVM_HOST_H__ - -#include <linux/arm-smccc.h> -#include <linux/errno.h> -#include <linux/types.h> -#include <linux/kvm_types.h> -#include <asm/cputype.h> -#include <asm/kvm.h> -#include <asm/kvm_asm.h> -#include <asm/fpstate.h> -#include <kvm/arm_arch_timer.h> - -#define __KVM_HAVE_ARCH_INTC_INITIALIZED - -#define KVM_USER_MEM_SLOTS 32 -#define KVM_HAVE_ONE_REG -#define KVM_HALT_POLL_NS_DEFAULT 500000 - -#define KVM_VCPU_MAX_FEATURES 2 - -#include <kvm/arm_vgic.h> - - -#ifdef CONFIG_ARM_GIC_V3 -#define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS -#else -#define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS -#endif - -#define KVM_REQ_SLEEP \ - KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) -#define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1) -#define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2) -#define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3) - -DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); - -static inline int kvm_arm_init_sve(void) { return 0; } - -u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); -int __attribute_const__ kvm_target_cpu(void); -int kvm_reset_vcpu(struct kvm_vcpu *vcpu); -void kvm_reset_coprocs(struct kvm_vcpu *vcpu); - -struct kvm_vmid { - /* The VMID generation used for the virt. memory system */ - u64 vmid_gen; - u32 vmid; -}; - -struct kvm_arch { - /* The last vcpu id that ran on each physical CPU */ - int __percpu *last_vcpu_ran; - - /* - * Anything that is not used directly from assembly code goes - * here. - */ - - /* The VMID generation used for the virt. memory system */ - struct kvm_vmid vmid; - - /* Stage-2 page table */ - pgd_t *pgd; - phys_addr_t pgd_phys; - - /* Interrupt controller */ - struct vgic_dist vgic; - int max_vcpus; - - /* Mandated version of PSCI */ - u32 psci_version; - - /* - * If we encounter a data abort without valid instruction syndrome - * information, report this to user space. User space can (and - * should) opt in to this feature if KVM_CAP_ARM_NISV_TO_USER is - * supported. 
- */ - bool return_nisv_io_abort_to_user; -}; - -#define KVM_NR_MEM_OBJS 40 - -/* - * We don't want allocation failures within the mmu code, so we preallocate - * enough memory for a single page fault in a cache. - */ -struct kvm_mmu_memory_cache { - int nobjs; - void *objects[KVM_NR_MEM_OBJS]; -}; - -struct kvm_vcpu_fault_info { - u32 hsr; /* Hyp Syndrome Register */ - u32 hxfar; /* Hyp Data/Inst. Fault Address Register */ - u32 hpfar; /* Hyp IPA Fault Address Register */ -}; - -/* - * 0 is reserved as an invalid value. - * Order should be kept in sync with the save/restore code. - */ -enum vcpu_sysreg { - __INVALID_SYSREG__, - c0_MPIDR, /* MultiProcessor ID Register */ - c0_CSSELR, /* Cache Size Selection Register */ - c1_SCTLR, /* System Control Register */ - c1_ACTLR, /* Auxiliary Control Register */ - c1_CPACR, /* Coprocessor Access Control */ - c2_TTBR0, /* Translation Table Base Register 0 */ - c2_TTBR0_high, /* TTBR0 top 32 bits */ - c2_TTBR1, /* Translation Table Base Register 1 */ - c2_TTBR1_high, /* TTBR1 top 32 bits */ - c2_TTBCR, /* Translation Table Base Control R. */ - c3_DACR, /* Domain Access Control Register */ - c5_DFSR, /* Data Fault Status Register */ - c5_IFSR, /* Instruction Fault Status Register */ - c5_ADFSR, /* Auxilary Data Fault Status R */ - c5_AIFSR, /* Auxilary Instrunction Fault Status R */ - c6_DFAR, /* Data Fault Address Register */ - c6_IFAR, /* Instruction Fault Address Register */ - c7_PAR, /* Physical Address Register */ - c7_PAR_high, /* PAR top 32 bits */ - c9_L2CTLR, /* Cortex A15/A7 L2 Control Register */ - c10_PRRR, /* Primary Region Remap Register */ - c10_NMRR, /* Normal Memory Remap Register */ - c12_VBAR, /* Vector Base Address Register */ - c13_CID, /* Context ID Register */ - c13_TID_URW, /* Thread ID, User R/W */ - c13_TID_URO, /* Thread ID, User R/O */ - c13_TID_PRIV, /* Thread ID, Privileged */ - c14_CNTKCTL, /* Timer Control Register (PL1) */ - c10_AMAIR0, /* Auxilary Memory Attribute Indirection Reg0 */ - c10_AMAIR1, /* Auxilary Memory Attribute Indirection Reg1 */ - NR_CP15_REGS /* Number of regs (incl. invalid) */ -}; - -struct kvm_cpu_context { - struct kvm_regs gp_regs; - struct vfp_hard_struct vfp; - u32 cp15[NR_CP15_REGS]; -}; - -struct kvm_host_data { - struct kvm_cpu_context host_ctxt; -}; - -typedef struct kvm_host_data kvm_host_data_t; - -static inline void kvm_init_host_cpu_context(struct kvm_cpu_context *cpu_ctxt) -{ - /* The host's MPIDR is immutable, so let's set it up at boot time */ - cpu_ctxt->cp15[c0_MPIDR] = read_cpuid_mpidr(); -} - -struct vcpu_reset_state { - unsigned long pc; - unsigned long r0; - bool be; - bool reset; -}; - -struct kvm_vcpu_arch { - struct kvm_cpu_context ctxt; - - int target; /* Processor target */ - DECLARE_BITMAP(features, KVM_VCPU_MAX_FEATURES); - - /* The CPU type we expose to the VM */ - u32 midr; - - /* HYP trapping configuration */ - u32 hcr; - - /* Exception Information */ - struct kvm_vcpu_fault_info fault; - - /* Host FP context */ - struct kvm_cpu_context *host_cpu_context; - - /* VGIC state */ - struct vgic_cpu vgic_cpu; - struct arch_timer_cpu timer_cpu; - - /* - * Anything that is not used directly from assembly code goes - * here. 
- */ - - /* vcpu power-off state */ - bool power_off; - - /* Don't run the guest (internal implementation need) */ - bool pause; - - /* Cache some mmu pages needed inside spinlock regions */ - struct kvm_mmu_memory_cache mmu_page_cache; - - struct vcpu_reset_state reset_state; - - /* Detect first run of a vcpu */ - bool has_run_once; -}; - -struct kvm_vm_stat { - ulong remote_tlb_flush; -}; - -struct kvm_vcpu_stat { - u64 halt_successful_poll; - u64 halt_attempted_poll; - u64 halt_poll_invalid; - u64 halt_wakeup; - u64 hvc_exit_stat; - u64 wfe_exit_stat; - u64 wfi_exit_stat; - u64 mmio_exit_user; - u64 mmio_exit_kernel; - u64 exits; -}; - -#define vcpu_cp15(v,r) (v)->arch.ctxt.cp15[r] - -int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init); -unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); -int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); -int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); -int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); - -unsigned long __kvm_call_hyp(void *hypfn, ...); - -/* - * The has_vhe() part doesn't get emitted, but is used for type-checking. - */ -#define kvm_call_hyp(f, ...) \ - do { \ - if (has_vhe()) { \ - f(__VA_ARGS__); \ - } else { \ - __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__); \ - } \ - } while(0) - -#define kvm_call_hyp_ret(f, ...) \ - ({ \ - typeof(f(__VA_ARGS__)) ret; \ - \ - if (has_vhe()) { \ - ret = f(__VA_ARGS__); \ - } else { \ - ret = __kvm_call_hyp(kvm_ksym_ref(f), \ - ##__VA_ARGS__); \ - } \ - \ - ret; \ - }) - -void force_vm_exit(const cpumask_t *mask); -int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events); - -int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events); - -#define KVM_ARCH_WANT_MMU_NOTIFIER -int kvm_unmap_hva_range(struct kvm *kvm, - unsigned long start, unsigned long end); -int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); - -unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); -int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); -int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end); -int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); - -void kvm_arm_halt_guest(struct kvm *kvm); -void kvm_arm_resume_guest(struct kvm *kvm); - -int kvm_arm_copy_coproc_indices(struct kvm_vcpu *vcpu, u64 __user *uindices); -unsigned long kvm_arm_num_coproc_regs(struct kvm_vcpu *vcpu); -int kvm_arm_coproc_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); -int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *); - -int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, - int exception_index); - -static inline void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run, - int exception_index) {} - -/* MMIO helpers */ -void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); -unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); - -int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); -int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, - phys_addr_t fault_ipa); - -static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, - unsigned long hyp_stack_ptr, - unsigned long vector_ptr) -{ - /* - * Call initialization code, and switch to the full blown HYP - * code. The init code doesn't need to preserve these - * registers as r0-r3 are already callee saved according to - * the AAPCS. 
- * Note that we slightly misuse the prototype by casting the - * stack pointer to a void *. - - * The PGDs are always passed as the third argument, in order - * to be passed into r2-r3 to the init code (yes, this is - * compliant with the PCS!). - */ - - __kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr); -} - -static inline void __cpu_init_stage2(void) -{ - kvm_call_hyp(__init_stage2_translation); -} - -static inline int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext) -{ - return 0; -} - -int kvm_perf_init(void); -int kvm_perf_teardown(void); - -static inline long kvm_hypercall_pv_features(struct kvm_vcpu *vcpu) -{ - return SMCCC_RET_NOT_SUPPORTED; -} - -static inline gpa_t kvm_init_stolen_time(struct kvm_vcpu *vcpu) -{ - return GPA_INVALID; -} - -static inline void kvm_update_stolen_time(struct kvm_vcpu *vcpu) -{ -} - -static inline void kvm_arm_pvtime_vcpu_init(struct kvm_vcpu_arch *vcpu_arch) -{ -} - -static inline bool kvm_arm_is_pvtime_enabled(struct kvm_vcpu_arch *vcpu_arch) -{ - return false; -} - -void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); - -struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr); - -static inline bool kvm_arch_requires_vhe(void) { return false; } -static inline void kvm_arch_hardware_unsetup(void) {} -static inline void kvm_arch_sync_events(struct kvm *kvm) {} -static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} -static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} -static inline void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu) {} - -static inline void kvm_arm_init_debug(void) {} -static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} -static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {} -static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {} - -int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr); -int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr); -int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr); - -/* - * VFP/NEON switching is all done by the hyp switch code, so no need to - * coordinate with host context handling for this state: - */ -static inline void kvm_arch_vcpu_load_fp(struct kvm_vcpu *vcpu) {} -static inline void kvm_arch_vcpu_ctxsync_fp(struct kvm_vcpu *vcpu) {} -static inline void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) {} - -static inline void kvm_vcpu_pmu_restore_guest(struct kvm_vcpu *vcpu) {} -static inline void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu) {} - -#define KVM_BP_HARDEN_UNKNOWN -1 -#define KVM_BP_HARDEN_WA_NEEDED 0 -#define KVM_BP_HARDEN_NOT_REQUIRED 1 - -static inline int kvm_arm_harden_branch_predictor(void) -{ - switch(read_cpuid_part()) { -#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR - case ARM_CPU_PART_BRAHMA_B15: - case ARM_CPU_PART_CORTEX_A12: - case ARM_CPU_PART_CORTEX_A15: - case ARM_CPU_PART_CORTEX_A17: - return KVM_BP_HARDEN_WA_NEEDED; -#endif - case ARM_CPU_PART_CORTEX_A7: - return KVM_BP_HARDEN_NOT_REQUIRED; - default: - return KVM_BP_HARDEN_UNKNOWN; - } -} - -#define KVM_SSBD_UNKNOWN -1 -#define KVM_SSBD_FORCE_DISABLE 0 -#define KVM_SSBD_KERNEL 1 -#define KVM_SSBD_FORCE_ENABLE 2 -#define KVM_SSBD_MITIGATED 3 - -static inline int kvm_arm_have_ssbd(void) -{ - /* No way to detect it yet, pretend it is not there. 
*/ - return KVM_SSBD_UNKNOWN; -} - -static inline void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu) {} -static inline void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) {} - -#define __KVM_HAVE_ARCH_VM_ALLOC -struct kvm *kvm_arch_alloc_vm(void); -void kvm_arch_free_vm(struct kvm *kvm); - -static inline int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) -{ - /* - * On 32bit ARM, VMs get a static 40bit IPA stage2 setup, - * so any non-zero value used as type is illegal. - */ - if (type) - return -EINVAL; - return 0; -} - -static inline int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature) -{ - return -EINVAL; -} - -static inline bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu) -{ - return true; -} - -#endif /* __ARM_KVM_HOST_H__ */ diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h deleted file mode 100644 index 3c1b55ecc578..000000000000 --- a/arch/arm/include/asm/kvm_hyp.h +++ /dev/null @@ -1,127 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2015 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - */ - -#ifndef __ARM_KVM_HYP_H__ -#define __ARM_KVM_HYP_H__ - -#include <linux/compiler.h> -#include <linux/kvm_host.h> -#include <asm/cp15.h> -#include <asm/kvm_arm.h> -#include <asm/vfp.h> - -#define __hyp_text __section(.hyp.text) notrace - -#define __ACCESS_VFP(CRn) \ - "mrc", "mcr", __stringify(p10, 7, %0, CRn, cr0, 0), u32 - -#define write_special(v, r) \ - asm volatile("msr " __stringify(r) ", %0" : : "r" (v)) -#define read_special(r) ({ \ - u32 __val; \ - asm volatile("mrs %0, " __stringify(r) : "=r" (__val)); \ - __val; \ -}) - -#define TTBR0 __ACCESS_CP15_64(0, c2) -#define TTBR1 __ACCESS_CP15_64(1, c2) -#define VTTBR __ACCESS_CP15_64(6, c2) -#define PAR __ACCESS_CP15_64(0, c7) -#define CNTP_CVAL __ACCESS_CP15_64(2, c14) -#define CNTV_CVAL __ACCESS_CP15_64(3, c14) -#define CNTVOFF __ACCESS_CP15_64(4, c14) - -#define MIDR __ACCESS_CP15(c0, 0, c0, 0) -#define CSSELR __ACCESS_CP15(c0, 2, c0, 0) -#define VPIDR __ACCESS_CP15(c0, 4, c0, 0) -#define VMPIDR __ACCESS_CP15(c0, 4, c0, 5) -#define SCTLR __ACCESS_CP15(c1, 0, c0, 0) -#define CPACR __ACCESS_CP15(c1, 0, c0, 2) -#define HCR __ACCESS_CP15(c1, 4, c1, 0) -#define HDCR __ACCESS_CP15(c1, 4, c1, 1) -#define HCPTR __ACCESS_CP15(c1, 4, c1, 2) -#define HSTR __ACCESS_CP15(c1, 4, c1, 3) -#define TTBCR __ACCESS_CP15(c2, 0, c0, 2) -#define HTCR __ACCESS_CP15(c2, 4, c0, 2) -#define VTCR __ACCESS_CP15(c2, 4, c1, 2) -#define DACR __ACCESS_CP15(c3, 0, c0, 0) -#define DFSR __ACCESS_CP15(c5, 0, c0, 0) -#define IFSR __ACCESS_CP15(c5, 0, c0, 1) -#define ADFSR __ACCESS_CP15(c5, 0, c1, 0) -#define AIFSR __ACCESS_CP15(c5, 0, c1, 1) -#define HSR __ACCESS_CP15(c5, 4, c2, 0) -#define DFAR __ACCESS_CP15(c6, 0, c0, 0) -#define IFAR __ACCESS_CP15(c6, 0, c0, 2) -#define HDFAR __ACCESS_CP15(c6, 4, c0, 0) -#define HIFAR __ACCESS_CP15(c6, 4, c0, 2) -#define HPFAR __ACCESS_CP15(c6, 4, c0, 4) -#define ICIALLUIS __ACCESS_CP15(c7, 0, c1, 0) -#define BPIALLIS __ACCESS_CP15(c7, 0, c1, 6) -#define ICIMVAU __ACCESS_CP15(c7, 0, c5, 1) -#define ATS1CPR __ACCESS_CP15(c7, 0, c8, 0) -#define TLBIALLIS __ACCESS_CP15(c8, 0, c3, 0) -#define TLBIALL __ACCESS_CP15(c8, 0, c7, 0) -#define TLBIALLNSNHIS __ACCESS_CP15(c8, 4, c3, 4) -#define PRRR __ACCESS_CP15(c10, 0, c2, 0) -#define NMRR __ACCESS_CP15(c10, 0, c2, 1) -#define AMAIR0 __ACCESS_CP15(c10, 0, c3, 0) -#define AMAIR1 __ACCESS_CP15(c10, 0, c3, 1) -#define VBAR __ACCESS_CP15(c12, 0, c0, 0) -#define CID __ACCESS_CP15(c13, 0, c0, 1) -#define TID_URW 
__ACCESS_CP15(c13, 0, c0, 2) -#define TID_URO __ACCESS_CP15(c13, 0, c0, 3) -#define TID_PRIV __ACCESS_CP15(c13, 0, c0, 4) -#define HTPIDR __ACCESS_CP15(c13, 4, c0, 2) -#define CNTKCTL __ACCESS_CP15(c14, 0, c1, 0) -#define CNTP_CTL __ACCESS_CP15(c14, 0, c2, 1) -#define CNTV_CTL __ACCESS_CP15(c14, 0, c3, 1) -#define CNTHCTL __ACCESS_CP15(c14, 4, c1, 0) - -#define VFP_FPEXC __ACCESS_VFP(FPEXC) - -/* AArch64 compatibility macros, only for the timer so far */ -#define read_sysreg_el0(r) read_sysreg(r##_EL0) -#define write_sysreg_el0(v, r) write_sysreg(v, r##_EL0) - -#define SYS_CNTP_CTL_EL0 CNTP_CTL -#define SYS_CNTP_CVAL_EL0 CNTP_CVAL -#define SYS_CNTV_CTL_EL0 CNTV_CTL -#define SYS_CNTV_CVAL_EL0 CNTV_CVAL - -#define cntvoff_el2 CNTVOFF -#define cnthctl_el2 CNTHCTL - -void __timer_enable_traps(struct kvm_vcpu *vcpu); -void __timer_disable_traps(struct kvm_vcpu *vcpu); - -void __vgic_v2_save_state(struct kvm_vcpu *vcpu); -void __vgic_v2_restore_state(struct kvm_vcpu *vcpu); - -void __sysreg_save_state(struct kvm_cpu_context *ctxt); -void __sysreg_restore_state(struct kvm_cpu_context *ctxt); - -void __vgic_v3_save_state(struct kvm_vcpu *vcpu); -void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); -void __vgic_v3_activate_traps(struct kvm_vcpu *vcpu); -void __vgic_v3_deactivate_traps(struct kvm_vcpu *vcpu); -void __vgic_v3_save_aprs(struct kvm_vcpu *vcpu); -void __vgic_v3_restore_aprs(struct kvm_vcpu *vcpu); - -asmlinkage void __vfp_save_state(struct vfp_hard_struct *vfp); -asmlinkage void __vfp_restore_state(struct vfp_hard_struct *vfp); -static inline bool __vfp_enabled(void) -{ - return !(read_sysreg(HCPTR) & (HCPTR_TCP(11) | HCPTR_TCP(10))); -} - -void __hyp_text __banked_save_state(struct kvm_cpu_context *ctxt); -void __hyp_text __banked_restore_state(struct kvm_cpu_context *ctxt); - -asmlinkage int __guest_enter(struct kvm_vcpu *vcpu, - struct kvm_cpu_context *host); -asmlinkage int __hyp_do_panic(const char *, int, u32); - -#endif /* __ARM_KVM_HYP_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h deleted file mode 100644 index 0d84d50bf9ba..000000000000 --- a/arch/arm/include/asm/kvm_mmu.h +++ /dev/null @@ -1,435 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - */ - -#ifndef __ARM_KVM_MMU_H__ -#define __ARM_KVM_MMU_H__ - -#include <asm/memory.h> -#include <asm/page.h> - -/* - * We directly use the kernel VA for the HYP, as we can directly share - * the mapping (HTTBR "covers" TTBR1). 
- */ -#define kern_hyp_va(kva) (kva) - -/* Contrary to arm64, there is no need to generate a PC-relative address */ -#define hyp_symbol_addr(s) \ - ({ \ - typeof(s) *addr = &(s); \ - addr; \ - }) - -#ifndef __ASSEMBLY__ - -#include <linux/highmem.h> -#include <asm/cacheflush.h> -#include <asm/cputype.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_hyp.h> -#include <asm/pgalloc.h> -#include <asm/stage2_pgtable.h> - -/* Ensure compatibility with arm64 */ -#define VA_BITS 32 - -#define kvm_phys_shift(kvm) KVM_PHYS_SHIFT -#define kvm_phys_size(kvm) (1ULL << kvm_phys_shift(kvm)) -#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - 1ULL) -#define kvm_vttbr_baddr_mask(kvm) VTTBR_BADDR_MASK - -#define stage2_pgd_size(kvm) (PTRS_PER_S2_PGD * sizeof(pgd_t)) - -int create_hyp_mappings(void *from, void *to, pgprot_t prot); -int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size, - void __iomem **kaddr, - void __iomem **haddr); -int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, - void **haddr); -void free_hyp_pgds(void); - -void stage2_unmap_vm(struct kvm *kvm); -int kvm_alloc_stage2_pgd(struct kvm *kvm); -void kvm_free_stage2_pgd(struct kvm *kvm); -int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, - phys_addr_t pa, unsigned long size, bool writable); - -int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run); - -void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); - -phys_addr_t kvm_mmu_get_httbr(void); -phys_addr_t kvm_get_idmap_vector(void); -int kvm_mmu_init(void); -void kvm_clear_hyp_idmap(void); - -#define kvm_mk_pmd(ptep) __pmd(__pa(ptep) | PMD_TYPE_TABLE) -#define kvm_mk_pud(pmdp) __pud(__pa(pmdp) | PMD_TYPE_TABLE) -#define kvm_mk_pgd(pudp) ({ BUILD_BUG(); 0; }) - -#define kvm_pfn_pte(pfn, prot) pfn_pte(pfn, prot) -#define kvm_pfn_pmd(pfn, prot) pfn_pmd(pfn, prot) -#define kvm_pfn_pud(pfn, prot) (__pud(0)) - -#define kvm_pud_pfn(pud) ({ WARN_ON(1); 0; }) - - -#define kvm_pmd_mkhuge(pmd) pmd_mkhuge(pmd) -/* No support for pud hugepages */ -#define kvm_pud_mkhuge(pud) ( {WARN_ON(1); pud; }) - -/* - * The following kvm_*pud*() functions are provided strictly to allow - * sharing code with arm64. They should never be called in practice. 
- */ -static inline void kvm_set_s2pud_readonly(pud_t *pud) -{ - WARN_ON(1); -} - -static inline bool kvm_s2pud_readonly(pud_t *pud) -{ - WARN_ON(1); - return false; -} - -static inline void kvm_set_pud(pud_t *pud, pud_t new_pud) -{ - WARN_ON(1); -} - -static inline pud_t kvm_s2pud_mkwrite(pud_t pud) -{ - WARN_ON(1); - return pud; -} - -static inline pud_t kvm_s2pud_mkexec(pud_t pud) -{ - WARN_ON(1); - return pud; -} - -static inline bool kvm_s2pud_exec(pud_t *pud) -{ - WARN_ON(1); - return false; -} - -static inline pud_t kvm_s2pud_mkyoung(pud_t pud) -{ - BUG(); - return pud; -} - -static inline bool kvm_s2pud_young(pud_t pud) -{ - WARN_ON(1); - return false; -} - -static inline pte_t kvm_s2pte_mkwrite(pte_t pte) -{ - pte_val(pte) |= L_PTE_S2_RDWR; - return pte; -} - -static inline pmd_t kvm_s2pmd_mkwrite(pmd_t pmd) -{ - pmd_val(pmd) |= L_PMD_S2_RDWR; - return pmd; -} - -static inline pte_t kvm_s2pte_mkexec(pte_t pte) -{ - pte_val(pte) &= ~L_PTE_XN; - return pte; -} - -static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd) -{ - pmd_val(pmd) &= ~PMD_SECT_XN; - return pmd; -} - -static inline void kvm_set_s2pte_readonly(pte_t *pte) -{ - pte_val(*pte) = (pte_val(*pte) & ~L_PTE_S2_RDWR) | L_PTE_S2_RDONLY; -} - -static inline bool kvm_s2pte_readonly(pte_t *pte) -{ - return (pte_val(*pte) & L_PTE_S2_RDWR) == L_PTE_S2_RDONLY; -} - -static inline bool kvm_s2pte_exec(pte_t *pte) -{ - return !(pte_val(*pte) & L_PTE_XN); -} - -static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) -{ - pmd_val(*pmd) = (pmd_val(*pmd) & ~L_PMD_S2_RDWR) | L_PMD_S2_RDONLY; -} - -static inline bool kvm_s2pmd_readonly(pmd_t *pmd) -{ - return (pmd_val(*pmd) & L_PMD_S2_RDWR) == L_PMD_S2_RDONLY; -} - -static inline bool kvm_s2pmd_exec(pmd_t *pmd) -{ - return !(pmd_val(*pmd) & PMD_SECT_XN); -} - -static inline bool kvm_page_empty(void *ptr) -{ - struct page *ptr_page = virt_to_page(ptr); - return page_count(ptr_page) == 1; -} - -#define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) -#define kvm_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) -#define kvm_pud_table_empty(kvm, pudp) false - -#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) -#define hyp_pmd_table_empty(pmdp) kvm_page_empty(pmdp) -#define hyp_pud_table_empty(pudp) false - -struct kvm; - -#define kvm_flush_dcache_to_poc(a,l) __cpuc_flush_dcache_area((a), (l)) - -static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu) -{ - return (vcpu_cp15(vcpu, c1_SCTLR) & 0b101) == 0b101; -} - -static inline void __clean_dcache_guest_page(kvm_pfn_t pfn, unsigned long size) -{ - /* - * Clean the dcache to the Point of Coherency. - * - * We need to do this through a kernel mapping (using the - * user-space mapping has proved to be the wrong - * solution). For that, we need to kmap one page at a time, - * and iterate over the range. - */ - - VM_BUG_ON(size & ~PAGE_MASK); - - while (size) { - void *va = kmap_atomic_pfn(pfn); - - kvm_flush_dcache_to_poc(va, PAGE_SIZE); - - size -= PAGE_SIZE; - pfn++; - - kunmap_atomic(va); - } -} - -static inline void __invalidate_icache_guest_page(kvm_pfn_t pfn, - unsigned long size) -{ - u32 iclsz; - - /* - * If we are going to insert an instruction page and the icache is - * either VIPT or PIPT, there is a potential problem where the host - * (or another VM) may have used the same page as this guest, and we - * read incorrect data from the icache. 
If we're using a PIPT cache, - * we can invalidate just that page, but if we are using a VIPT cache - * we need to invalidate the entire icache - damn shame - as written - * in the ARM ARM (DDI 0406C.b - Page B3-1393). - * - * VIVT caches are tagged using both the ASID and the VMID and doesn't - * need any kind of flushing (DDI 0406C.b - Page B3-1392). - */ - - VM_BUG_ON(size & ~PAGE_MASK); - - if (icache_is_vivt_asid_tagged()) - return; - - if (!icache_is_pipt()) { - /* any kind of VIPT cache */ - __flush_icache_all(); - return; - } - - /* - * CTR IminLine contains Log2 of the number of words in the - * cache line, so we can get the number of words as - * 2 << (IminLine - 1). To get the number of bytes, we - * multiply by 4 (the number of bytes in a 32-bit word), and - * get 4 << (IminLine). - */ - iclsz = 4 << (read_cpuid(CPUID_CACHETYPE) & 0xf); - - while (size) { - void *va = kmap_atomic_pfn(pfn); - void *end = va + PAGE_SIZE; - void *addr = va; - - do { - write_sysreg(addr, ICIMVAU); - addr += iclsz; - } while (addr < end); - - dsb(ishst); - isb(); - - size -= PAGE_SIZE; - pfn++; - - kunmap_atomic(va); - } - - /* Check if we need to invalidate the BTB */ - if ((read_cpuid_ext(CPUID_EXT_MMFR1) >> 28) != 4) { - write_sysreg(0, BPIALLIS); - dsb(ishst); - isb(); - } -} - -static inline void __kvm_flush_dcache_pte(pte_t pte) -{ - void *va = kmap_atomic(pte_page(pte)); - - kvm_flush_dcache_to_poc(va, PAGE_SIZE); - - kunmap_atomic(va); -} - -static inline void __kvm_flush_dcache_pmd(pmd_t pmd) -{ - unsigned long size = PMD_SIZE; - kvm_pfn_t pfn = pmd_pfn(pmd); - - while (size) { - void *va = kmap_atomic_pfn(pfn); - - kvm_flush_dcache_to_poc(va, PAGE_SIZE); - - pfn++; - size -= PAGE_SIZE; - - kunmap_atomic(va); - } -} - -static inline void __kvm_flush_dcache_pud(pud_t pud) -{ -} - -#define kvm_virt_to_phys(x) virt_to_idmap((unsigned long)(x)) - -void kvm_set_way_flush(struct kvm_vcpu *vcpu); -void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled); - -static inline bool __kvm_cpu_uses_extended_idmap(void) -{ - return false; -} - -static inline unsigned long __kvm_idmap_ptrs_per_pgd(void) -{ - return PTRS_PER_PGD; -} - -static inline void __kvm_extend_hypmap(pgd_t *boot_hyp_pgd, - pgd_t *hyp_pgd, - pgd_t *merged_hyp_pgd, - unsigned long hyp_idmap_start) { } - -static inline unsigned int kvm_get_vmid_bits(void) -{ - return 8; -} - -/* - * We are not in the kvm->srcu critical section most of the time, so we take - * the SRCU read lock here. Since we copy the data from the user page, we - * can immediately drop the lock again. 
- */ -static inline int kvm_read_guest_lock(struct kvm *kvm, - gpa_t gpa, void *data, unsigned long len) -{ - int srcu_idx = srcu_read_lock(&kvm->srcu); - int ret = kvm_read_guest(kvm, gpa, data, len); - - srcu_read_unlock(&kvm->srcu, srcu_idx); - - return ret; -} - -static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, - const void *data, unsigned long len) -{ - int srcu_idx = srcu_read_lock(&kvm->srcu); - int ret = kvm_write_guest(kvm, gpa, data, len); - - srcu_read_unlock(&kvm->srcu, srcu_idx); - - return ret; -} - -static inline void *kvm_get_hyp_vector(void) -{ - switch(read_cpuid_part()) { -#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR - case ARM_CPU_PART_CORTEX_A12: - case ARM_CPU_PART_CORTEX_A17: - { - extern char __kvm_hyp_vector_bp_inv[]; - return kvm_ksym_ref(__kvm_hyp_vector_bp_inv); - } - - case ARM_CPU_PART_BRAHMA_B15: - case ARM_CPU_PART_CORTEX_A15: - { - extern char __kvm_hyp_vector_ic_inv[]; - return kvm_ksym_ref(__kvm_hyp_vector_ic_inv); - } -#endif - default: - { - extern char __kvm_hyp_vector[]; - return kvm_ksym_ref(__kvm_hyp_vector); - } - } -} - -static inline int kvm_map_vectors(void) -{ - return 0; -} - -static inline int hyp_map_aux_data(void) -{ - return 0; -} - -#define kvm_phys_to_vttbr(addr) (addr) - -static inline void kvm_set_ipa_limit(void) {} - -static __always_inline u64 kvm_get_vttbr(struct kvm *kvm) -{ - struct kvm_vmid *vmid = &kvm->arch.vmid; - u64 vmid_field, baddr; - - baddr = kvm->arch.pgd_phys; - vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT; - return kvm_phys_to_vttbr(baddr) | vmid_field; -} - -#endif /* !__ASSEMBLY__ */ - -#endif /* __ARM_KVM_MMU_H__ */ diff --git a/arch/arm/include/asm/kvm_ras.h b/arch/arm/include/asm/kvm_ras.h deleted file mode 100644 index e9577292dfe4..000000000000 --- a/arch/arm/include/asm/kvm_ras.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2018 - Arm Ltd */ - -#ifndef __ARM_KVM_RAS_H__ -#define __ARM_KVM_RAS_H__ - -#include <linux/types.h> - -static inline int kvm_handle_guest_sea(phys_addr_t addr, unsigned int esr) -{ - return -1; -} - -#endif /* __ARM_KVM_RAS_H__ */ diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index ad55ab068dbf..36805f94939e 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -104,26 +104,6 @@ */ #define L_PGD_SWAPPER (_AT(pgdval_t, 1) << 55) /* swapper_pg_dir entry */ -/* - * 2nd stage PTE definitions for LPAE. - */ -#define L_PTE_S2_MT_UNCACHED (_AT(pteval_t, 0x0) << 2) /* strongly ordered */ -#define L_PTE_S2_MT_WRITETHROUGH (_AT(pteval_t, 0xa) << 2) /* normal inner write-through */ -#define L_PTE_S2_MT_WRITEBACK (_AT(pteval_t, 0xf) << 2) /* normal inner write-back */ -#define L_PTE_S2_MT_DEV_SHARED (_AT(pteval_t, 0x1) << 2) /* device */ -#define L_PTE_S2_MT_MASK (_AT(pteval_t, 0xf) << 2) - -#define L_PTE_S2_RDONLY (_AT(pteval_t, 1) << 6) /* HAP[1] */ -#define L_PTE_S2_RDWR (_AT(pteval_t, 3) << 6) /* HAP[2:1] */ - -#define L_PMD_S2_RDONLY (_AT(pmdval_t, 1) << 6) /* HAP[1] */ -#define L_PMD_S2_RDWR (_AT(pmdval_t, 3) << 6) /* HAP[2:1] */ - -/* - * Hyp-mode PL2 PTE definitions for LPAE. 
- */ -#define L_PTE_HYP L_PTE_USER - #ifndef __ASSEMBLY__ #define pud_none(pud) (!pud_val(pud)) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index eabcb48a7840..0483cf413315 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -80,9 +80,6 @@ extern void __pgd_error(const char *file, int line, pgd_t); extern pgprot_t pgprot_user; extern pgprot_t pgprot_kernel; -extern pgprot_t pgprot_hyp_device; -extern pgprot_t pgprot_s2; -extern pgprot_t pgprot_s2_device; #define _MOD_PROT(p, b) __pgprot(pgprot_val(p) | (b)) @@ -95,12 +92,6 @@ extern pgprot_t pgprot_s2_device; #define PAGE_READONLY_EXEC _MOD_PROT(pgprot_user, L_PTE_USER | L_PTE_RDONLY) #define PAGE_KERNEL _MOD_PROT(pgprot_kernel, L_PTE_XN) #define PAGE_KERNEL_EXEC pgprot_kernel -#define PAGE_HYP _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_XN) -#define PAGE_HYP_EXEC _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY) -#define PAGE_HYP_RO _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY | L_PTE_XN) -#define PAGE_HYP_DEVICE _MOD_PROT(pgprot_hyp_device, L_PTE_HYP) -#define PAGE_S2 _MOD_PROT(pgprot_s2, L_PTE_S2_RDONLY | L_PTE_XN) -#define PAGE_S2_DEVICE _MOD_PROT(pgprot_s2_device, L_PTE_S2_RDONLY | L_PTE_XN) #define __PAGE_NONE __pgprot(_L_PTE_DEFAULT | L_PTE_RDONLY | L_PTE_XN | L_PTE_NONE) #define __PAGE_SHARED __pgprot(_L_PTE_DEFAULT | L_PTE_USER | L_PTE_XN) diff --git a/arch/arm/include/asm/sections.h b/arch/arm/include/asm/sections.h index 4ceb4f757d4d..700b8bcdf9bd 100644 --- a/arch/arm/include/asm/sections.h +++ b/arch/arm/include/asm/sections.h @@ -10,8 +10,6 @@ extern char __idmap_text_start[]; extern char __idmap_text_end[]; extern char __entry_text_start[]; extern char __entry_text_end[]; -extern char __hyp_idmap_text_start[]; -extern char __hyp_idmap_text_end[]; static inline bool in_entry_text(unsigned long addr) { @@ -22,9 +20,7 @@ static inline bool in_entry_text(unsigned long addr) static inline bool in_idmap_text(unsigned long addr) { void *a = (void *)addr; - return memory_contains(__idmap_text_start, __idmap_text_end, a, 1) || - memory_contains(__hyp_idmap_text_start, __hyp_idmap_text_end, - a, 1); + return memory_contains(__idmap_text_start, __idmap_text_end, a, 1); } #endif /* _ASM_ARM_SECTIONS_H */ diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h deleted file mode 100644 index aaceec7855ec..000000000000 --- a/arch/arm/include/asm/stage2_pgtable.h +++ /dev/null @@ -1,75 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2016 - ARM Ltd - * - * stage2 page table helpers - */ - -#ifndef __ARM_S2_PGTABLE_H_ -#define __ARM_S2_PGTABLE_H_ - -/* - * kvm_mmu_cache_min_pages() is the number of pages required - * to install a stage-2 translation. We pre-allocate the entry - * level table at VM creation. Since we have a 3 level page-table, - * we need only two pages to add a new mapping. 
- */ -#define kvm_mmu_cache_min_pages(kvm) 2 - -#define stage2_pgd_none(kvm, pgd) pgd_none(pgd) -#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd) -#define stage2_pgd_present(kvm, pgd) pgd_present(pgd) -#define stage2_pgd_populate(kvm, pgd, pud) pgd_populate(NULL, pgd, pud) -#define stage2_pud_offset(kvm, pgd, address) pud_offset(pgd, address) -#define stage2_pud_free(kvm, pud) do { } while (0) - -#define stage2_pud_none(kvm, pud) pud_none(pud) -#define stage2_pud_clear(kvm, pud) pud_clear(pud) -#define stage2_pud_present(kvm, pud) pud_present(pud) -#define stage2_pud_populate(kvm, pud, pmd) pud_populate(NULL, pud, pmd) -#define stage2_pmd_offset(kvm, pud, address) pmd_offset(pud, address) -#define stage2_pmd_free(kvm, pmd) free_page((unsigned long)pmd) - -#define stage2_pud_huge(kvm, pud) pud_huge(pud) - -/* Open coded p*d_addr_end that can deal with 64bit addresses */ -static inline phys_addr_t -stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK; - - return (boundary - 1 < end - 1) ? boundary : end; -} - -#define stage2_pud_addr_end(kvm, addr, end) (end) - -static inline phys_addr_t -stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) -{ - phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK; - - return (boundary - 1 < end - 1) ? boundary : end; -} - -#define stage2_pgd_index(kvm, addr) pgd_index(addr) - -#define stage2_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) -#define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) -#define stage2_pud_table_empty(kvm, pudp) false - -static inline bool kvm_stage2_has_pud(struct kvm *kvm) -{ - return false; -} - -#define S2_PMD_MASK PMD_MASK -#define S2_PMD_SIZE PMD_SIZE -#define S2_PUD_MASK PUD_MASK -#define S2_PUD_SIZE PUD_SIZE - -static inline bool kvm_stage2_has_pmd(struct kvm *kvm) -{ - return true; -} - -#endif /* __ARM_S2_PGTABLE_H_ */ diff --git a/arch/arm/include/asm/virt.h b/arch/arm/include/asm/virt.h index 17c26ccd126d..dd9697b2bde8 100644 --- a/arch/arm/include/asm/virt.h +++ b/arch/arm/include/asm/virt.h @@ -39,8 +39,6 @@ static inline void sync_boot_mode(void) sync_cache_r(&__boot_cpu_mode); } -void __hyp_set_vectors(unsigned long phys_vector_base); -void __hyp_reset_vectors(void); #else #define __boot_cpu_mode (SVC_MODE) #define sync_boot_mode() @@ -67,18 +65,6 @@ static inline bool is_kernel_in_hyp_mode(void) return false; } -static inline bool has_vhe(void) -{ - return false; -} - -/* The section containing the hypervisor idmap text */ -extern char __hyp_idmap_text_start[]; -extern char __hyp_idmap_text_end[]; - -/* The section containing the hypervisor text */ -extern char __hyp_text_start[]; -extern char __hyp_text_end[]; #endif #else @@ -87,9 +73,6 @@ extern char __hyp_text_end[]; #define HVC_SET_VECTORS 0 #define HVC_SOFT_RESTART 1 -#define HVC_RESET_VECTORS 2 - -#define HVC_STUB_HCALL_NR 3 #endif /* __ASSEMBLY__ */ diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h deleted file mode 100644 index 03cd7c19a683..000000000000 --- a/arch/arm/include/uapi/asm/kvm.h +++ /dev/null @@ -1,314 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - -#ifndef __ARM_KVM_H__ -#define __ARM_KVM_H__ - -#include <linux/types.h> -#include <linux/psci.h> -#include <asm/ptrace.h> - -#define __KVM_HAVE_GUEST_DEBUG -#define __KVM_HAVE_IRQ_LINE -#define __KVM_HAVE_READONLY_MEM -#define __KVM_HAVE_VCPU_EVENTS - -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 - -#define KVM_REG_SIZE(id) \ - (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) - -/* Valid for svc_regs, abt_regs, und_regs, irq_regs in struct kvm_regs */ -#define KVM_ARM_SVC_sp svc_regs[0] -#define KVM_ARM_SVC_lr svc_regs[1] -#define KVM_ARM_SVC_spsr svc_regs[2] -#define KVM_ARM_ABT_sp abt_regs[0] -#define KVM_ARM_ABT_lr abt_regs[1] -#define KVM_ARM_ABT_spsr abt_regs[2] -#define KVM_ARM_UND_sp und_regs[0] -#define KVM_ARM_UND_lr und_regs[1] -#define KVM_ARM_UND_spsr und_regs[2] -#define KVM_ARM_IRQ_sp irq_regs[0] -#define KVM_ARM_IRQ_lr irq_regs[1] -#define KVM_ARM_IRQ_spsr irq_regs[2] - -/* Valid only for fiq_regs in struct kvm_regs */ -#define KVM_ARM_FIQ_r8 fiq_regs[0] -#define KVM_ARM_FIQ_r9 fiq_regs[1] -#define KVM_ARM_FIQ_r10 fiq_regs[2] -#define KVM_ARM_FIQ_fp fiq_regs[3] -#define KVM_ARM_FIQ_ip fiq_regs[4] -#define KVM_ARM_FIQ_sp fiq_regs[5] -#define KVM_ARM_FIQ_lr fiq_regs[6] -#define KVM_ARM_FIQ_spsr fiq_regs[7] - -struct kvm_regs { - struct pt_regs usr_regs; /* R0_usr - R14_usr, PC, CPSR */ - unsigned long svc_regs[3]; /* SP_svc, LR_svc, SPSR_svc */ - unsigned long abt_regs[3]; /* SP_abt, LR_abt, SPSR_abt */ - unsigned long und_regs[3]; /* SP_und, LR_und, SPSR_und */ - unsigned long irq_regs[3]; /* SP_irq, LR_irq, SPSR_irq */ - unsigned long fiq_regs[8]; /* R8_fiq - R14_fiq, SPSR_fiq */ -}; - -/* Supported Processor Types */ -#define KVM_ARM_TARGET_CORTEX_A15 0 -#define KVM_ARM_TARGET_CORTEX_A7 1 -#define KVM_ARM_NUM_TARGETS 2 - -/* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */ -#define KVM_ARM_DEVICE_TYPE_SHIFT 0 -#define KVM_ARM_DEVICE_TYPE_MASK (0xffff << KVM_ARM_DEVICE_TYPE_SHIFT) -#define KVM_ARM_DEVICE_ID_SHIFT 16 -#define KVM_ARM_DEVICE_ID_MASK (0xffff << KVM_ARM_DEVICE_ID_SHIFT) - -/* Supported device IDs */ -#define KVM_ARM_DEVICE_VGIC_V2 0 - -/* Supported VGIC address types */ -#define KVM_VGIC_V2_ADDR_TYPE_DIST 0 -#define KVM_VGIC_V2_ADDR_TYPE_CPU 1 - -#define KVM_VGIC_V2_DIST_SIZE 0x1000 -#define KVM_VGIC_V2_CPU_SIZE 0x2000 - -/* Supported VGICv3 address types */ -#define KVM_VGIC_V3_ADDR_TYPE_DIST 2 -#define KVM_VGIC_V3_ADDR_TYPE_REDIST 3 -#define KVM_VGIC_ITS_ADDR_TYPE 4 -#define KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION 5 - -#define KVM_VGIC_V3_DIST_SIZE SZ_64K -#define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K) -#define KVM_VGIC_V3_ITS_SIZE (2 * SZ_64K) - -#define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ -#define KVM_ARM_VCPU_PSCI_0_2 1 /* CPU uses PSCI v0.2 */ - -struct kvm_vcpu_init { - __u32 target; - __u32 features[7]; -}; - -struct kvm_sregs { -}; - -struct kvm_fpu { -}; - -struct kvm_guest_debug_arch { -}; - -struct kvm_debug_exit_arch { -}; - -struct kvm_sync_regs { - /* Used with KVM_CAP_ARM_USER_IRQ */ - __u64 device_irq_level; -}; - -struct kvm_arch_memory_slot { -}; - -/* for 
KVM_GET/SET_VCPU_EVENTS */ -struct kvm_vcpu_events { - struct { - __u8 serror_pending; - __u8 serror_has_esr; - __u8 ext_dabt_pending; - /* Align it to 8 bytes */ - __u8 pad[5]; - __u64 serror_esr; - } exception; - __u32 reserved[12]; -}; - -/* If you need to interpret the index values, here is the key: */ -#define KVM_REG_ARM_COPROC_MASK 0x000000000FFF0000 -#define KVM_REG_ARM_COPROC_SHIFT 16 -#define KVM_REG_ARM_32_OPC2_MASK 0x0000000000000007 -#define KVM_REG_ARM_32_OPC2_SHIFT 0 -#define KVM_REG_ARM_OPC1_MASK 0x0000000000000078 -#define KVM_REG_ARM_OPC1_SHIFT 3 -#define KVM_REG_ARM_CRM_MASK 0x0000000000000780 -#define KVM_REG_ARM_CRM_SHIFT 7 -#define KVM_REG_ARM_32_CRN_MASK 0x0000000000007800 -#define KVM_REG_ARM_32_CRN_SHIFT 11 -/* - * For KVM currently all guest registers are nonsecure, but we reserve a bit - * in the encoding to distinguish secure from nonsecure for AArch32 system - * registers that are banked by security. This is 1 for the secure banked - * register, and 0 for the nonsecure banked register or if the register is - * not banked by security. - */ -#define KVM_REG_ARM_SECURE_MASK 0x0000000010000000 -#define KVM_REG_ARM_SECURE_SHIFT 28 - -#define ARM_CP15_REG_SHIFT_MASK(x,n) \ - (((x) << KVM_REG_ARM_ ## n ## _SHIFT) & KVM_REG_ARM_ ## n ## _MASK) - -#define __ARM_CP15_REG(op1,crn,crm,op2) \ - (KVM_REG_ARM | (15 << KVM_REG_ARM_COPROC_SHIFT) | \ - ARM_CP15_REG_SHIFT_MASK(op1, OPC1) | \ - ARM_CP15_REG_SHIFT_MASK(crn, 32_CRN) | \ - ARM_CP15_REG_SHIFT_MASK(crm, CRM) | \ - ARM_CP15_REG_SHIFT_MASK(op2, 32_OPC2)) - -#define ARM_CP15_REG32(...) (__ARM_CP15_REG(__VA_ARGS__) | KVM_REG_SIZE_U32) - -#define __ARM_CP15_REG64(op1,crm) \ - (__ARM_CP15_REG(op1, 0, crm, 0) | KVM_REG_SIZE_U64) -#define ARM_CP15_REG64(...) __ARM_CP15_REG64(__VA_ARGS__) - -/* PL1 Physical Timer Registers */ -#define KVM_REG_ARM_PTIMER_CTL ARM_CP15_REG32(0, 14, 2, 1) -#define KVM_REG_ARM_PTIMER_CNT ARM_CP15_REG64(0, 14) -#define KVM_REG_ARM_PTIMER_CVAL ARM_CP15_REG64(2, 14) - -/* Virtual Timer Registers */ -#define KVM_REG_ARM_TIMER_CTL ARM_CP15_REG32(0, 14, 3, 1) -#define KVM_REG_ARM_TIMER_CNT ARM_CP15_REG64(1, 14) -#define KVM_REG_ARM_TIMER_CVAL ARM_CP15_REG64(3, 14) - -/* Normal registers are mapped as coprocessor 16. */ -#define KVM_REG_ARM_CORE (0x0010 << KVM_REG_ARM_COPROC_SHIFT) -#define KVM_REG_ARM_CORE_REG(name) (offsetof(struct kvm_regs, name) / 4) - -/* Some registers need more space to represent values. */ -#define KVM_REG_ARM_DEMUX (0x0011 << KVM_REG_ARM_COPROC_SHIFT) -#define KVM_REG_ARM_DEMUX_ID_MASK 0x000000000000FF00 -#define KVM_REG_ARM_DEMUX_ID_SHIFT 8 -#define KVM_REG_ARM_DEMUX_ID_CCSIDR (0x00 << KVM_REG_ARM_DEMUX_ID_SHIFT) -#define KVM_REG_ARM_DEMUX_VAL_MASK 0x00000000000000FF -#define KVM_REG_ARM_DEMUX_VAL_SHIFT 0 - -/* VFP registers: we could overload CP10 like ARM does, but that's ugly. 
*/ -#define KVM_REG_ARM_VFP (0x0012 << KVM_REG_ARM_COPROC_SHIFT) -#define KVM_REG_ARM_VFP_MASK 0x000000000000FFFF -#define KVM_REG_ARM_VFP_BASE_REG 0x0 -#define KVM_REG_ARM_VFP_FPSID 0x1000 -#define KVM_REG_ARM_VFP_FPSCR 0x1001 -#define KVM_REG_ARM_VFP_MVFR1 0x1006 -#define KVM_REG_ARM_VFP_MVFR0 0x1007 -#define KVM_REG_ARM_VFP_FPEXC 0x1008 -#define KVM_REG_ARM_VFP_FPINST 0x1009 -#define KVM_REG_ARM_VFP_FPINST2 0x100A - -/* KVM-as-firmware specific pseudo-registers */ -#define KVM_REG_ARM_FW (0x0014 << KVM_REG_ARM_COPROC_SHIFT) -#define KVM_REG_ARM_FW_REG(r) (KVM_REG_ARM | KVM_REG_SIZE_U64 | \ - KVM_REG_ARM_FW | ((r) & 0xffff)) -#define KVM_REG_ARM_PSCI_VERSION KVM_REG_ARM_FW_REG(0) -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1 KVM_REG_ARM_FW_REG(1) - /* Higher values mean better protection. */ -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL 0 -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL 1 -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED 2 -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2 KVM_REG_ARM_FW_REG(2) - /* Higher values mean better protection. */ -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL 0 -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN 1 -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL 2 -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED 3 -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED (1U << 4) - -/* Device Control API: ARM VGIC */ -#define KVM_DEV_ARM_VGIC_GRP_ADDR 0 -#define KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1 -#define KVM_DEV_ARM_VGIC_GRP_CPU_REGS 2 -#define KVM_DEV_ARM_VGIC_CPUID_SHIFT 32 -#define KVM_DEV_ARM_VGIC_CPUID_MASK (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT) -#define KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT 32 -#define KVM_DEV_ARM_VGIC_V3_MPIDR_MASK \ - (0xffffffffULL << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT) -#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0 -#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) -#define KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK (0xffff) -#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3 -#define KVM_DEV_ARM_VGIC_GRP_CTRL 4 -#define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5 -#define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6 -#define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7 -#define KVM_DEV_ARM_VGIC_GRP_ITS_REGS 8 -#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10 -#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \ - (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) -#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff -#define VGIC_LEVEL_INFO_LINE_LEVEL 0 - -/* Device Control API on vcpu fd */ -#define KVM_ARM_VCPU_PMU_V3_CTRL 0 -#define KVM_ARM_VCPU_PMU_V3_IRQ 0 -#define KVM_ARM_VCPU_PMU_V3_INIT 1 -#define KVM_ARM_VCPU_TIMER_CTRL 1 -#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 -#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 - -#define KVM_DEV_ARM_VGIC_CTRL_INIT 0 -#define KVM_DEV_ARM_ITS_SAVE_TABLES 1 -#define KVM_DEV_ARM_ITS_RESTORE_TABLES 2 -#define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3 -#define KVM_DEV_ARM_ITS_CTRL_RESET 4 - -/* KVM_IRQ_LINE irq field index values */ -#define KVM_ARM_IRQ_VCPU2_SHIFT 28 -#define KVM_ARM_IRQ_VCPU2_MASK 0xf -#define KVM_ARM_IRQ_TYPE_SHIFT 24 -#define KVM_ARM_IRQ_TYPE_MASK 0xf -#define KVM_ARM_IRQ_VCPU_SHIFT 16 -#define KVM_ARM_IRQ_VCPU_MASK 0xff -#define KVM_ARM_IRQ_NUM_SHIFT 0 -#define KVM_ARM_IRQ_NUM_MASK 0xffff - -/* irq_type field */ -#define KVM_ARM_IRQ_TYPE_CPU 0 -#define KVM_ARM_IRQ_TYPE_SPI 1 -#define KVM_ARM_IRQ_TYPE_PPI 2 - -/* out-of-kernel GIC cpu interrupt injection irq_number field */ -#define KVM_ARM_IRQ_CPU_IRQ 0 -#define KVM_ARM_IRQ_CPU_FIQ 1 - -/* - * This used to hold the 
highest supported SPI, but it is now obsolete - * and only here to provide source code level compatibility with older - * userland. The highest SPI number can be set via KVM_DEV_ARM_VGIC_GRP_NR_IRQS. - */ -#ifndef __KERNEL__ -#define KVM_ARM_IRQ_GIC_MAX 127 -#endif - -/* One single KVM irqchip, ie. the VGIC */ -#define KVM_NR_IRQCHIPS 1 - -/* PSCI interface */ -#define KVM_PSCI_FN_BASE 0x95c1ba5e -#define KVM_PSCI_FN(n) (KVM_PSCI_FN_BASE + (n)) - -#define KVM_PSCI_FN_CPU_SUSPEND KVM_PSCI_FN(0) -#define KVM_PSCI_FN_CPU_OFF KVM_PSCI_FN(1) -#define KVM_PSCI_FN_CPU_ON KVM_PSCI_FN(2) -#define KVM_PSCI_FN_MIGRATE KVM_PSCI_FN(3) - -#define KVM_PSCI_RET_SUCCESS PSCI_RET_SUCCESS -#define KVM_PSCI_RET_NI PSCI_RET_NOT_SUPPORTED -#define KVM_PSCI_RET_INVAL PSCI_RET_INVALID_PARAMS -#define KVM_PSCI_RET_DENIED PSCI_RET_DENIED - -#endif /* __ARM_KVM_H__ */ diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index c773b829ee8e..c036a4a2f8e2 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -11,9 +11,6 @@ #include <linux/sched.h> #include <linux/mm.h> #include <linux/dma-mapping.h> -#ifdef CONFIG_KVM_ARM_HOST -#include <linux/kvm_host.h> -#endif #include <asm/cacheflush.h> #include <asm/glue-df.h> #include <asm/glue-pf.h> @@ -167,14 +164,6 @@ int main(void) DEFINE(CACHE_WRITEBACK_ORDER, __CACHE_WRITEBACK_ORDER); DEFINE(CACHE_WRITEBACK_GRANULE, __CACHE_WRITEBACK_GRANULE); BLANK(); -#ifdef CONFIG_KVM_ARM_HOST - DEFINE(VCPU_GUEST_CTXT, offsetof(struct kvm_vcpu, arch.ctxt)); - DEFINE(VCPU_HOST_CTXT, offsetof(struct kvm_vcpu, arch.host_cpu_context)); - DEFINE(CPU_CTXT_VFP, offsetof(struct kvm_cpu_context, vfp)); - DEFINE(CPU_CTXT_GP_REGS, offsetof(struct kvm_cpu_context, gp_regs)); - DEFINE(GP_REGS_USR, offsetof(struct kvm_regs, usr_regs)); -#endif - BLANK(); #ifdef CONFIG_VDSO DEFINE(VDSO_DATA_SIZE, sizeof(union vdso_data_store)); #endif diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S index 6607fa817bba..26d8e03b1dd3 100644 --- a/arch/arm/kernel/hyp-stub.S +++ b/arch/arm/kernel/hyp-stub.S @@ -189,19 +189,19 @@ ARM_BE8(orr r7, r7, #(1 << 25)) @ HSCTLR.EE ENDPROC(__hyp_stub_install_secondary) __hyp_stub_do_trap: +#ifdef ZIMAGE teq r0, #HVC_SET_VECTORS bne 1f + /* Only the ZIMAGE stubs can change the HYP vectors */ mcr p15, 4, r1, c12, c0, 0 @ set HVBAR b __hyp_stub_exit +#endif 1: teq r0, #HVC_SOFT_RESTART - bne 1f + bne 2f bx r1 -1: teq r0, #HVC_RESET_VECTORS - beq __hyp_stub_exit - - ldr r0, =HVC_STUB_ERR +2: ldr r0, =HVC_STUB_ERR __ERET __hyp_stub_exit: @@ -210,26 +210,9 @@ __hyp_stub_exit: ENDPROC(__hyp_stub_do_trap) /* - * __hyp_set_vectors: Call this after boot to set the initial hypervisor - * vectors as part of hypervisor installation. On an SMP system, this should - * be called on each CPU. - * - * r0 must be the physical address of the new vector table (which must lie in - * the bottom 4GB of physical address space. - * - * r0 must be 32-byte aligned. - * - * Before calling this, you must check that the stub hypervisor is installed - * everywhere, by waiting for any secondary CPUs to be brought up and then - * checking that BOOT_CPU_MODE_HAVE_HYP(__boot_cpu_mode) is true. - * - * If not, there is a pre-existing hypervisor, some CPUs failed to boot, or - * something else went wrong... in such cases, trying to install a new - * hypervisor is unlikely to work as desired. 
- * - * When you call into your shiny new hypervisor, sp_hyp will contain junk, - * so you will need to set that to something sensible at the new hypervisor's - * initialisation entry point. + * __hyp_set_vectors is only used when ZIMAGE must bounce between HYP + * and SVC. For the kernel itself, the vectors are set once and for + * all by the stubs. */ ENTRY(__hyp_set_vectors) mov r1, r0 @@ -245,12 +228,6 @@ ENTRY(__hyp_soft_restart) ret lr ENDPROC(__hyp_soft_restart) -ENTRY(__hyp_reset_vectors) - mov r0, #HVC_RESET_VECTORS - __HVC(0) - ret lr -ENDPROC(__hyp_reset_vectors) - #ifndef ZIMAGE .align 2 .L__boot_cpu_mode_offset: diff --git a/arch/arm/kernel/vmlinux-xip.lds.S b/arch/arm/kernel/vmlinux-xip.lds.S index 21b8b271c80d..6d2be994ae58 100644 --- a/arch/arm/kernel/vmlinux-xip.lds.S +++ b/arch/arm/kernel/vmlinux-xip.lds.S @@ -162,14 +162,6 @@ SECTIONS ASSERT((__proc_info_end - __proc_info_begin), "missing CPU support") ASSERT((__arch_info_end - __arch_info_begin), "no machine record defined") -/* - * The HYP init code can't be more than a page long, - * and should not cross a page boundary. - * The above comment applies as well. - */ -ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & PAGE_MASK) <= PAGE_SIZE, - "HYP init code too big or misaligned") - #ifdef CONFIG_XIP_DEFLATED_DATA /* * The .bss is used as a stack area for __inflate_kernel_data() whose stack diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 319ccb10846a..88a720da443b 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -170,12 +170,4 @@ __start_rodata_section_aligned = ALIGN(__start_rodata, 1 << SECTION_SHIFT); ASSERT((__proc_info_end - __proc_info_begin), "missing CPU support") ASSERT((__arch_info_end - __arch_info_begin), "no machine record defined") -/* - * The HYP init code can't be more than a page long, - * and should not cross a page boundary. - * The above comment applies as well. - */ -ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & PAGE_MASK) <= PAGE_SIZE, - "HYP init code too big or misaligned") - #endif /* CONFIG_XIP_KERNEL */ diff --git a/arch/arm/kernel/vmlinux.lds.h b/arch/arm/kernel/vmlinux.lds.h index 8247bc15addc..381a8e105fa5 100644 --- a/arch/arm/kernel/vmlinux.lds.h +++ b/arch/arm/kernel/vmlinux.lds.h @@ -31,20 +31,11 @@ *(.proc.info.init) \ __proc_info_end = .; -#define HYPERVISOR_TEXT \ - __hyp_text_start = .; \ - *(.hyp.text) \ - __hyp_text_end = .; - #define IDMAP_TEXT \ ALIGN_FUNCTION(); \ __idmap_text_start = .; \ *(.idmap.text) \ __idmap_text_end = .; \ - . = ALIGN(PAGE_SIZE); \ - __hyp_idmap_text_start = .; \ - *(.hyp.idmap.text) \ - __hyp_idmap_text_end = .; #define ARM_DISCARD \ *(.ARM.exidx.exit.text) \ @@ -72,7 +63,6 @@ SCHED_TEXT \ CPUIDLE_TEXT \ LOCK_TEXT \ - HYPERVISOR_TEXT \ KPROBES_TEXT \ *(.gnu.warning) \ *(.glue_7) \ diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig deleted file mode 100644 index f591026347a5..000000000000 --- a/arch/arm/kvm/Kconfig +++ /dev/null @@ -1,59 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# KVM configuration -# - -source "virt/kvm/Kconfig" -source "virt/lib/Kconfig" - -menuconfig VIRTUALIZATION - bool "Virtualization" - ---help--- - Say Y here to get to see options for using your Linux host to run - other operating systems inside virtual machines (guests). - This option alone does not add any kernel code. - - If you say N, all options in this submenu will be skipped and - disabled. 
- -if VIRTUALIZATION - -config KVM - bool "Kernel-based Virtual Machine (KVM) support" - depends on MMU && OF - select PREEMPT_NOTIFIERS - select ARM_GIC - select ARM_GIC_V3 - select ARM_GIC_V3_ITS - select HAVE_KVM_CPU_RELAX_INTERCEPT - select HAVE_KVM_ARCH_TLB_FLUSH_ALL - select KVM_MMIO - select KVM_ARM_HOST - select KVM_GENERIC_DIRTYLOG_READ_PROTECT - select SRCU - select MMU_NOTIFIER - select KVM_VFIO - select HAVE_KVM_EVENTFD - select HAVE_KVM_IRQFD - select HAVE_KVM_IRQCHIP - select HAVE_KVM_IRQ_ROUTING - select HAVE_KVM_MSI - select IRQ_BYPASS_MANAGER - select HAVE_KVM_IRQ_BYPASS - depends on ARM_VIRT_EXT && ARM_LPAE && ARM_ARCH_TIMER - ---help--- - Support hosting virtualized guest machines. - - This module provides access to the hardware capabilities through - a character device node named /dev/kvm. - - If unsure, say N. - -config KVM_ARM_HOST - bool - ---help--- - Provides host support for ARM processors. - -source "drivers/vhost/Kconfig" - -endif # VIRTUALIZATION diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile deleted file mode 100644 index e442d82821df..000000000000 --- a/arch/arm/kvm/Makefile +++ /dev/null @@ -1,43 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for Kernel-based Virtual Machine module -# - -plus_virt := $(call as-instr,.arch_extension virt,+virt) -ifeq ($(plus_virt),+virt) - plus_virt_def := -DREQUIRES_VIRT=1 -endif - -KVM := ../../../virt/kvm - -ccflags-y += -I $(srctree)/$(src) -I $(srctree)/virt/kvm/arm/vgic -CFLAGS_$(KVM)/arm/arm.o := $(plus_virt_def) - -AFLAGS_init.o := -Wa,-march=armv7-a$(plus_virt) -AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt) - -kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o - -obj-$(CONFIG_KVM_ARM_HOST) += hyp/ - -obj-y += kvm-arm.o init.o interrupts.o -obj-y += handle_exit.o guest.o emulate.o reset.o -obj-y += coproc.o coproc_a15.o coproc_a7.o vgic-v3-coproc.o -obj-y += $(KVM)/arm/arm.o $(KVM)/arm/mmu.o $(KVM)/arm/mmio.o -obj-y += $(KVM)/arm/psci.o $(KVM)/arm/perf.o $(KVM)/arm/hypercalls.o -obj-y += $(KVM)/arm/aarch32.o - -obj-y += $(KVM)/arm/vgic/vgic.o -obj-y += $(KVM)/arm/vgic/vgic-init.o -obj-y += $(KVM)/arm/vgic/vgic-irqfd.o -obj-y += $(KVM)/arm/vgic/vgic-v2.o -obj-y += $(KVM)/arm/vgic/vgic-v3.o -obj-y += $(KVM)/arm/vgic/vgic-v4.o -obj-y += $(KVM)/arm/vgic/vgic-mmio.o -obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o -obj-y += $(KVM)/arm/vgic/vgic-mmio-v3.o -obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o -obj-y += $(KVM)/arm/vgic/vgic-its.o -obj-y += $(KVM)/arm/vgic/vgic-debug.o -obj-y += $(KVM)/irqchip.o -obj-y += $(KVM)/arm/arch_timer.o diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c deleted file mode 100644 index 07745ee022a1..000000000000 --- a/arch/arm/kvm/coproc.c +++ /dev/null @@ -1,1455 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Authors: Rusty Russell <rusty@rustcorp.com.au> - * Christoffer Dall <c.dall@virtualopensystems.com> - */ - -#include <linux/bsearch.h> -#include <linux/mm.h> -#include <linux/kvm_host.h> -#include <linux/uaccess.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_host.h> -#include <asm/kvm_emulate.h> -#include <asm/kvm_coproc.h> -#include <asm/kvm_mmu.h> -#include <asm/cacheflush.h> -#include <asm/cputype.h> -#include <trace/events/kvm.h> -#include <asm/vfp.h> -#include "../vfp/vfpinstr.h" - -#define CREATE_TRACE_POINTS -#include "trace.h" -#include "coproc.h" - - -/****************************************************************************** - * 
Co-processor emulation - *****************************************************************************/ - -static bool write_to_read_only(struct kvm_vcpu *vcpu, - const struct coproc_params *params) -{ - WARN_ONCE(1, "CP15 write to read-only register\n"); - print_cp_instr(params); - kvm_inject_undefined(vcpu); - return false; -} - -static bool read_from_write_only(struct kvm_vcpu *vcpu, - const struct coproc_params *params) -{ - WARN_ONCE(1, "CP15 read to write-only register\n"); - print_cp_instr(params); - kvm_inject_undefined(vcpu); - return false; -} - -/* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ -static u32 cache_levels; - -/* CSSELR values; used to index KVM_REG_ARM_DEMUX_ID_CCSIDR */ -#define CSSELR_MAX 12 - -/* - * kvm_vcpu_arch.cp15 holds cp15 registers as an array of u32, but some - * of cp15 registers can be viewed either as couple of two u32 registers - * or one u64 register. Current u64 register encoding is that least - * significant u32 word is followed by most significant u32 word. - */ -static inline void vcpu_cp15_reg64_set(struct kvm_vcpu *vcpu, - const struct coproc_reg *r, - u64 val) -{ - vcpu_cp15(vcpu, r->reg) = val & 0xffffffff; - vcpu_cp15(vcpu, r->reg + 1) = val >> 32; -} - -static inline u64 vcpu_cp15_reg64_get(struct kvm_vcpu *vcpu, - const struct coproc_reg *r) -{ - u64 val; - - val = vcpu_cp15(vcpu, r->reg + 1); - val = val << 32; - val = val | vcpu_cp15(vcpu, r->reg); - return val; -} - -int kvm_handle_cp10_id(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - kvm_inject_undefined(vcpu); - return 1; -} - -int kvm_handle_cp_0_13_access(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - /* - * We can get here, if the host has been built without VFPv3 support, - * but the guest attempted a floating point operation. - */ - kvm_inject_undefined(vcpu); - return 1; -} - -int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - kvm_inject_undefined(vcpu); - return 1; -} - -static void reset_mpidr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) -{ - /* - * Compute guest MPIDR. We build a virtual cluster out of the - * vcpu_id, but we read the 'U' bit from the underlying - * hardware directly. - */ - vcpu_cp15(vcpu, c0_MPIDR) = ((read_cpuid_mpidr() & MPIDR_SMP_BITMASK) | - ((vcpu->vcpu_id >> 2) << MPIDR_LEVEL_BITS) | - (vcpu->vcpu_id & 3)); -} - -/* TRM entries A7:4.3.31 A15:4.3.28 - RO WI */ -static bool access_actlr(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - if (p->is_write) - return ignore_write(vcpu, p); - - *vcpu_reg(vcpu, p->Rt1) = vcpu_cp15(vcpu, c1_ACTLR); - return true; -} - -/* TRM entries A7:4.3.56, A15:4.3.60 - R/O. 
*/ -static bool access_cbar(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - if (p->is_write) - return write_to_read_only(vcpu, p); - return read_zero(vcpu, p); -} - -/* TRM entries A7:4.3.49, A15:4.3.48 - R/O WI */ -static bool access_l2ctlr(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - if (p->is_write) - return ignore_write(vcpu, p); - - *vcpu_reg(vcpu, p->Rt1) = vcpu_cp15(vcpu, c9_L2CTLR); - return true; -} - -static void reset_l2ctlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) -{ - u32 l2ctlr, ncores; - - asm volatile("mrc p15, 1, %0, c9, c0, 2\n" : "=r" (l2ctlr)); - l2ctlr &= ~(3 << 24); - ncores = atomic_read(&vcpu->kvm->online_vcpus) - 1; - /* How many cores in the current cluster and the next ones */ - ncores -= (vcpu->vcpu_id & ~3); - /* Cap it to the maximum number of cores in a single cluster */ - ncores = min(ncores, 3U); - l2ctlr |= (ncores & 3) << 24; - - vcpu_cp15(vcpu, c9_L2CTLR) = l2ctlr; -} - -static void reset_actlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) -{ - u32 actlr; - - /* ACTLR contains SMP bit: make sure you create all cpus first! */ - asm volatile("mrc p15, 0, %0, c1, c0, 1\n" : "=r" (actlr)); - /* Make the SMP bit consistent with the guest configuration */ - if (atomic_read(&vcpu->kvm->online_vcpus) > 1) - actlr |= 1U << 6; - else - actlr &= ~(1U << 6); - - vcpu_cp15(vcpu, c1_ACTLR) = actlr; -} - -/* - * TRM entries: A7:4.3.50, A15:4.3.49 - * R/O WI (even if NSACR.NS_L2ERR, a write of 1 is ignored). - */ -static bool access_l2ectlr(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - if (p->is_write) - return ignore_write(vcpu, p); - - *vcpu_reg(vcpu, p->Rt1) = 0; - return true; -} - -/* - * See note at ARMv7 ARM B1.14.4 (TL;DR: S/W ops are not easily virtualized). - */ -static bool access_dcsw(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - if (!p->is_write) - return read_from_write_only(vcpu, p); - - kvm_set_way_flush(vcpu); - return true; -} - -/* - * Generic accessor for VM registers. Only called as long as HCR_TVM - * is set. If the guest enables the MMU, we stop trapping the VM - * sys_regs and leave it in complete control of the caches. - * - * Used by the cpu-specific code. - */ -bool access_vm_reg(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - bool was_enabled = vcpu_has_cache_enabled(vcpu); - - BUG_ON(!p->is_write); - - vcpu_cp15(vcpu, r->reg) = *vcpu_reg(vcpu, p->Rt1); - if (p->is_64bit) - vcpu_cp15(vcpu, r->reg + 1) = *vcpu_reg(vcpu, p->Rt2); - - kvm_toggle_cache(vcpu, was_enabled); - return true; -} - -static bool access_gic_sgi(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - u64 reg; - bool g1; - - if (!p->is_write) - return read_from_write_only(vcpu, p); - - reg = (u64)*vcpu_reg(vcpu, p->Rt2) << 32; - reg |= *vcpu_reg(vcpu, p->Rt1) ; - - /* - * In a system where GICD_CTLR.DS=1, a ICC_SGI0R access generates - * Group0 SGIs only, while ICC_SGI1R can generate either group, - * depending on the SGI configuration. ICC_ASGI1R is effectively - * equivalent to ICC_SGI0R, as there is no "alternative" secure - * group. 
- */ - switch (p->Op1) { - default: /* Keep GCC quiet */ - case 0: /* ICC_SGI1R */ - g1 = true; - break; - case 1: /* ICC_ASGI1R */ - case 2: /* ICC_SGI0R */ - g1 = false; - break; - } - - vgic_v3_dispatch_sgi(vcpu, reg, g1); - - return true; -} - -static bool access_gic_sre(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - if (p->is_write) - return ignore_write(vcpu, p); - - *vcpu_reg(vcpu, p->Rt1) = vcpu->arch.vgic_cpu.vgic_v3.vgic_sre; - - return true; -} - -static bool access_cntp_tval(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - u32 val; - - if (p->is_write) { - val = *vcpu_reg(vcpu, p->Rt1); - kvm_arm_timer_write_sysreg(vcpu, - TIMER_PTIMER, TIMER_REG_TVAL, val); - } else { - val = kvm_arm_timer_read_sysreg(vcpu, - TIMER_PTIMER, TIMER_REG_TVAL); - *vcpu_reg(vcpu, p->Rt1) = val; - } - - return true; -} - -static bool access_cntp_ctl(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - u32 val; - - if (p->is_write) { - val = *vcpu_reg(vcpu, p->Rt1); - kvm_arm_timer_write_sysreg(vcpu, - TIMER_PTIMER, TIMER_REG_CTL, val); - } else { - val = kvm_arm_timer_read_sysreg(vcpu, - TIMER_PTIMER, TIMER_REG_CTL); - *vcpu_reg(vcpu, p->Rt1) = val; - } - - return true; -} - -static bool access_cntp_cval(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - u64 val; - - if (p->is_write) { - val = (u64)*vcpu_reg(vcpu, p->Rt2) << 32; - val |= *vcpu_reg(vcpu, p->Rt1); - kvm_arm_timer_write_sysreg(vcpu, - TIMER_PTIMER, TIMER_REG_CVAL, val); - } else { - val = kvm_arm_timer_read_sysreg(vcpu, - TIMER_PTIMER, TIMER_REG_CVAL); - *vcpu_reg(vcpu, p->Rt1) = val; - *vcpu_reg(vcpu, p->Rt2) = val >> 32; - } - - return true; -} - -/* - * We could trap ID_DFR0 and tell the guest we don't support performance - * monitoring. Unfortunately the patch to make the kernel check ID_DFR0 was - * NAKed, so it will read the PMCR anyway. - * - * Therefore we tell the guest we have 0 counters. Unfortunately, we - * must always support PMCCNTR (the cycle counter): we just RAZ/WI for - * all PM registers, which doesn't crash the guest kernel at least. - */ -static bool trap_raz_wi(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r) -{ - if (p->is_write) - return ignore_write(vcpu, p); - else - return read_zero(vcpu, p); -} - -#define access_pmcr trap_raz_wi -#define access_pmcntenset trap_raz_wi -#define access_pmcntenclr trap_raz_wi -#define access_pmovsr trap_raz_wi -#define access_pmselr trap_raz_wi -#define access_pmceid0 trap_raz_wi -#define access_pmceid1 trap_raz_wi -#define access_pmccntr trap_raz_wi -#define access_pmxevtyper trap_raz_wi -#define access_pmxevcntr trap_raz_wi -#define access_pmuserenr trap_raz_wi -#define access_pmintenset trap_raz_wi -#define access_pmintenclr trap_raz_wi - -/* Architected CP15 registers. - * CRn denotes the primary register number, but is copied to the CRm in the - * user space API for 64-bit register access in line with the terminology used - * in the ARM ARM. - * Important: Must be sorted ascending by CRn, CRM, Op1, Op2 and with 64-bit - * registers preceding 32-bit ones. - */ -static const struct coproc_reg cp15_regs[] = { - /* MPIDR: we use VMPIDR for guest access. */ - { CRn( 0), CRm( 0), Op1( 0), Op2( 5), is32, - NULL, reset_mpidr, c0_MPIDR }, - - /* CSSELR: swapped by interrupt.S. 
*/ - { CRn( 0), CRm( 0), Op1( 2), Op2( 0), is32, - NULL, reset_unknown, c0_CSSELR }, - - /* ACTLR: trapped by HCR.TAC bit. */ - { CRn( 1), CRm( 0), Op1( 0), Op2( 1), is32, - access_actlr, reset_actlr, c1_ACTLR }, - - /* CPACR: swapped by interrupt.S. */ - { CRn( 1), CRm( 0), Op1( 0), Op2( 2), is32, - NULL, reset_val, c1_CPACR, 0x00000000 }, - - /* TTBR0/TTBR1/TTBCR: swapped by interrupt.S. */ - { CRm64( 2), Op1( 0), is64, access_vm_reg, reset_unknown64, c2_TTBR0 }, - { CRn(2), CRm( 0), Op1( 0), Op2( 0), is32, - access_vm_reg, reset_unknown, c2_TTBR0 }, - { CRn(2), CRm( 0), Op1( 0), Op2( 1), is32, - access_vm_reg, reset_unknown, c2_TTBR1 }, - { CRn( 2), CRm( 0), Op1( 0), Op2( 2), is32, - access_vm_reg, reset_val, c2_TTBCR, 0x00000000 }, - { CRm64( 2), Op1( 1), is64, access_vm_reg, reset_unknown64, c2_TTBR1 }, - - - /* DACR: swapped by interrupt.S. */ - { CRn( 3), CRm( 0), Op1( 0), Op2( 0), is32, - access_vm_reg, reset_unknown, c3_DACR }, - - /* DFSR/IFSR/ADFSR/AIFSR: swapped by interrupt.S. */ - { CRn( 5), CRm( 0), Op1( 0), Op2( 0), is32, - access_vm_reg, reset_unknown, c5_DFSR }, - { CRn( 5), CRm( 0), Op1( 0), Op2( 1), is32, - access_vm_reg, reset_unknown, c5_IFSR }, - { CRn( 5), CRm( 1), Op1( 0), Op2( 0), is32, - access_vm_reg, reset_unknown, c5_ADFSR }, - { CRn( 5), CRm( 1), Op1( 0), Op2( 1), is32, - access_vm_reg, reset_unknown, c5_AIFSR }, - - /* DFAR/IFAR: swapped by interrupt.S. */ - { CRn( 6), CRm( 0), Op1( 0), Op2( 0), is32, - access_vm_reg, reset_unknown, c6_DFAR }, - { CRn( 6), CRm( 0), Op1( 0), Op2( 2), is32, - access_vm_reg, reset_unknown, c6_IFAR }, - - /* PAR swapped by interrupt.S */ - { CRm64( 7), Op1( 0), is64, NULL, reset_unknown64, c7_PAR }, - - /* - * DC{C,I,CI}SW operations: - */ - { CRn( 7), CRm( 6), Op1( 0), Op2( 2), is32, access_dcsw}, - { CRn( 7), CRm(10), Op1( 0), Op2( 2), is32, access_dcsw}, - { CRn( 7), CRm(14), Op1( 0), Op2( 2), is32, access_dcsw}, - /* - * L2CTLR access (guest wants to know #CPUs). - */ - { CRn( 9), CRm( 0), Op1( 1), Op2( 2), is32, - access_l2ctlr, reset_l2ctlr, c9_L2CTLR }, - { CRn( 9), CRm( 0), Op1( 1), Op2( 3), is32, access_l2ectlr}, - - /* - * Dummy performance monitor implementation. - */ - { CRn( 9), CRm(12), Op1( 0), Op2( 0), is32, access_pmcr}, - { CRn( 9), CRm(12), Op1( 0), Op2( 1), is32, access_pmcntenset}, - { CRn( 9), CRm(12), Op1( 0), Op2( 2), is32, access_pmcntenclr}, - { CRn( 9), CRm(12), Op1( 0), Op2( 3), is32, access_pmovsr}, - { CRn( 9), CRm(12), Op1( 0), Op2( 5), is32, access_pmselr}, - { CRn( 9), CRm(12), Op1( 0), Op2( 6), is32, access_pmceid0}, - { CRn( 9), CRm(12), Op1( 0), Op2( 7), is32, access_pmceid1}, - { CRn( 9), CRm(13), Op1( 0), Op2( 0), is32, access_pmccntr}, - { CRn( 9), CRm(13), Op1( 0), Op2( 1), is32, access_pmxevtyper}, - { CRn( 9), CRm(13), Op1( 0), Op2( 2), is32, access_pmxevcntr}, - { CRn( 9), CRm(14), Op1( 0), Op2( 0), is32, access_pmuserenr}, - { CRn( 9), CRm(14), Op1( 0), Op2( 1), is32, access_pmintenset}, - { CRn( 9), CRm(14), Op1( 0), Op2( 2), is32, access_pmintenclr}, - - /* PRRR/NMRR (aka MAIR0/MAIR1): swapped by interrupt.S. */ - { CRn(10), CRm( 2), Op1( 0), Op2( 0), is32, - access_vm_reg, reset_unknown, c10_PRRR}, - { CRn(10), CRm( 2), Op1( 0), Op2( 1), is32, - access_vm_reg, reset_unknown, c10_NMRR}, - - /* AMAIR0/AMAIR1: swapped by interrupt.S. 
*/ - { CRn(10), CRm( 3), Op1( 0), Op2( 0), is32, - access_vm_reg, reset_unknown, c10_AMAIR0}, - { CRn(10), CRm( 3), Op1( 0), Op2( 1), is32, - access_vm_reg, reset_unknown, c10_AMAIR1}, - - /* ICC_SGI1R */ - { CRm64(12), Op1( 0), is64, access_gic_sgi}, - - /* VBAR: swapped by interrupt.S. */ - { CRn(12), CRm( 0), Op1( 0), Op2( 0), is32, - NULL, reset_val, c12_VBAR, 0x00000000 }, - - /* ICC_ASGI1R */ - { CRm64(12), Op1( 1), is64, access_gic_sgi}, - /* ICC_SGI0R */ - { CRm64(12), Op1( 2), is64, access_gic_sgi}, - /* ICC_SRE */ - { CRn(12), CRm(12), Op1( 0), Op2(5), is32, access_gic_sre }, - - /* CONTEXTIDR/TPIDRURW/TPIDRURO/TPIDRPRW: swapped by interrupt.S. */ - { CRn(13), CRm( 0), Op1( 0), Op2( 1), is32, - access_vm_reg, reset_val, c13_CID, 0x00000000 }, - { CRn(13), CRm( 0), Op1( 0), Op2( 2), is32, - NULL, reset_unknown, c13_TID_URW }, - { CRn(13), CRm( 0), Op1( 0), Op2( 3), is32, - NULL, reset_unknown, c13_TID_URO }, - { CRn(13), CRm( 0), Op1( 0), Op2( 4), is32, - NULL, reset_unknown, c13_TID_PRIV }, - - /* CNTP */ - { CRm64(14), Op1( 2), is64, access_cntp_cval}, - - /* CNTKCTL: swapped by interrupt.S. */ - { CRn(14), CRm( 1), Op1( 0), Op2( 0), is32, - NULL, reset_val, c14_CNTKCTL, 0x00000000 }, - - /* CNTP */ - { CRn(14), CRm( 2), Op1( 0), Op2( 0), is32, access_cntp_tval }, - { CRn(14), CRm( 2), Op1( 0), Op2( 1), is32, access_cntp_ctl }, - - /* The Configuration Base Address Register. */ - { CRn(15), CRm( 0), Op1( 4), Op2( 0), is32, access_cbar}, -}; - -static int check_reg_table(const struct coproc_reg *table, unsigned int n) -{ - unsigned int i; - - for (i = 1; i < n; i++) { - if (cmp_reg(&table[i-1], &table[i]) >= 0) { - kvm_err("reg table %p out of order (%d)\n", table, i - 1); - return 1; - } - } - - return 0; -} - -/* Target specific emulation tables */ -static struct kvm_coproc_target_table *target_tables[KVM_ARM_NUM_TARGETS]; - -void kvm_register_target_coproc_table(struct kvm_coproc_target_table *table) -{ - BUG_ON(check_reg_table(table->table, table->num)); - target_tables[table->target] = table; -} - -/* Get specific register table for this target. */ -static const struct coproc_reg *get_target_table(unsigned target, size_t *num) -{ - struct kvm_coproc_target_table *table; - - table = target_tables[target]; - *num = table->num; - return table->table; -} - -#define reg_to_match_value(x) \ - ({ \ - unsigned long val; \ - val = (x)->CRn << 11; \ - val |= (x)->CRm << 7; \ - val |= (x)->Op1 << 4; \ - val |= (x)->Op2 << 1; \ - val |= !(x)->is_64bit; \ - val; \ - }) - -static int match_reg(const void *key, const void *elt) -{ - const unsigned long pval = (unsigned long)key; - const struct coproc_reg *r = elt; - - return pval - reg_to_match_value(r); -} - -static const struct coproc_reg *find_reg(const struct coproc_params *params, - const struct coproc_reg table[], - unsigned int num) -{ - unsigned long pval = reg_to_match_value(params); - - return bsearch((void *)pval, table, num, sizeof(table[0]), match_reg); -} - -static int emulate_cp15(struct kvm_vcpu *vcpu, - const struct coproc_params *params) -{ - size_t num; - const struct coproc_reg *table, *r; - - trace_kvm_emulate_cp15_imp(params->Op1, params->Rt1, params->CRn, - params->CRm, params->Op2, params->is_write); - - table = get_target_table(vcpu->arch.target, &num); - - /* Search target-specific then generic table. */ - r = find_reg(params, table, num); - if (!r) - r = find_reg(params, cp15_regs, ARRAY_SIZE(cp15_regs)); - - if (likely(r)) { - /* If we don't have an accessor, we should never get here! 
*/ - BUG_ON(!r->access); - - if (likely(r->access(vcpu, params, r))) { - /* Skip instruction, since it was emulated */ - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - } - } else { - /* If access function fails, it should complain. */ - kvm_err("Unsupported guest CP15 access at: %08lx [%08lx]\n", - *vcpu_pc(vcpu), *vcpu_cpsr(vcpu)); - print_cp_instr(params); - kvm_inject_undefined(vcpu); - } - - return 1; -} - -static struct coproc_params decode_64bit_hsr(struct kvm_vcpu *vcpu) -{ - struct coproc_params params; - - params.CRn = (kvm_vcpu_get_hsr(vcpu) >> 1) & 0xf; - params.Rt1 = (kvm_vcpu_get_hsr(vcpu) >> 5) & 0xf; - params.is_write = ((kvm_vcpu_get_hsr(vcpu) & 1) == 0); - params.is_64bit = true; - - params.Op1 = (kvm_vcpu_get_hsr(vcpu) >> 16) & 0xf; - params.Op2 = 0; - params.Rt2 = (kvm_vcpu_get_hsr(vcpu) >> 10) & 0xf; - params.CRm = 0; - - return params; -} - -/** - * kvm_handle_cp15_64 -- handles a mrrc/mcrr trap on a guest CP15 access - * @vcpu: The VCPU pointer - * @run: The kvm_run struct - */ -int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - struct coproc_params params = decode_64bit_hsr(vcpu); - - return emulate_cp15(vcpu, ¶ms); -} - -/** - * kvm_handle_cp14_64 -- handles a mrrc/mcrr trap on a guest CP14 access - * @vcpu: The VCPU pointer - * @run: The kvm_run struct - */ -int kvm_handle_cp14_64(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - struct coproc_params params = decode_64bit_hsr(vcpu); - - /* raz_wi cp14 */ - trap_raz_wi(vcpu, ¶ms, NULL); - - /* handled */ - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - return 1; -} - -static void reset_coproc_regs(struct kvm_vcpu *vcpu, - const struct coproc_reg *table, size_t num, - unsigned long *bmap) -{ - unsigned long i; - - for (i = 0; i < num; i++) - if (table[i].reset) { - int reg = table[i].reg; - - table[i].reset(vcpu, &table[i]); - if (reg > 0 && reg < NR_CP15_REGS) { - set_bit(reg, bmap); - if (table[i].is_64bit) - set_bit(reg + 1, bmap); - } - } -} - -static struct coproc_params decode_32bit_hsr(struct kvm_vcpu *vcpu) -{ - struct coproc_params params; - - params.CRm = (kvm_vcpu_get_hsr(vcpu) >> 1) & 0xf; - params.Rt1 = (kvm_vcpu_get_hsr(vcpu) >> 5) & 0xf; - params.is_write = ((kvm_vcpu_get_hsr(vcpu) & 1) == 0); - params.is_64bit = false; - - params.CRn = (kvm_vcpu_get_hsr(vcpu) >> 10) & 0xf; - params.Op1 = (kvm_vcpu_get_hsr(vcpu) >> 14) & 0x7; - params.Op2 = (kvm_vcpu_get_hsr(vcpu) >> 17) & 0x7; - params.Rt2 = 0; - - return params; -} - -/** - * kvm_handle_cp15_32 -- handles a mrc/mcr trap on a guest CP15 access - * @vcpu: The VCPU pointer - * @run: The kvm_run struct - */ -int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - struct coproc_params params = decode_32bit_hsr(vcpu); - return emulate_cp15(vcpu, ¶ms); -} - -/** - * kvm_handle_cp14_32 -- handles a mrc/mcr trap on a guest CP14 access - * @vcpu: The VCPU pointer - * @run: The kvm_run struct - */ -int kvm_handle_cp14_32(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - struct coproc_params params = decode_32bit_hsr(vcpu); - - /* raz_wi cp14 */ - trap_raz_wi(vcpu, ¶ms, NULL); - - /* handled */ - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - return 1; -} - -/****************************************************************************** - * Userspace API - *****************************************************************************/ - -static bool index_to_params(u64 id, struct coproc_params *params) -{ - switch (id & KVM_REG_SIZE_MASK) { - case KVM_REG_SIZE_U32: - /* Any unused index bits means it's 
not valid. */ - if (id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK - | KVM_REG_ARM_COPROC_MASK - | KVM_REG_ARM_32_CRN_MASK - | KVM_REG_ARM_CRM_MASK - | KVM_REG_ARM_OPC1_MASK - | KVM_REG_ARM_32_OPC2_MASK)) - return false; - - params->is_64bit = false; - params->CRn = ((id & KVM_REG_ARM_32_CRN_MASK) - >> KVM_REG_ARM_32_CRN_SHIFT); - params->CRm = ((id & KVM_REG_ARM_CRM_MASK) - >> KVM_REG_ARM_CRM_SHIFT); - params->Op1 = ((id & KVM_REG_ARM_OPC1_MASK) - >> KVM_REG_ARM_OPC1_SHIFT); - params->Op2 = ((id & KVM_REG_ARM_32_OPC2_MASK) - >> KVM_REG_ARM_32_OPC2_SHIFT); - return true; - case KVM_REG_SIZE_U64: - /* Any unused index bits means it's not valid. */ - if (id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK - | KVM_REG_ARM_COPROC_MASK - | KVM_REG_ARM_CRM_MASK - | KVM_REG_ARM_OPC1_MASK)) - return false; - params->is_64bit = true; - /* CRm to CRn: see cp15_to_index for details */ - params->CRn = ((id & KVM_REG_ARM_CRM_MASK) - >> KVM_REG_ARM_CRM_SHIFT); - params->Op1 = ((id & KVM_REG_ARM_OPC1_MASK) - >> KVM_REG_ARM_OPC1_SHIFT); - params->Op2 = 0; - params->CRm = 0; - return true; - default: - return false; - } -} - -/* Decode an index value, and find the cp15 coproc_reg entry. */ -static const struct coproc_reg *index_to_coproc_reg(struct kvm_vcpu *vcpu, - u64 id) -{ - size_t num; - const struct coproc_reg *table, *r; - struct coproc_params params; - - /* We only do cp15 for now. */ - if ((id & KVM_REG_ARM_COPROC_MASK) >> KVM_REG_ARM_COPROC_SHIFT != 15) - return NULL; - - if (!index_to_params(id, ¶ms)) - return NULL; - - table = get_target_table(vcpu->arch.target, &num); - r = find_reg(¶ms, table, num); - if (!r) - r = find_reg(¶ms, cp15_regs, ARRAY_SIZE(cp15_regs)); - - /* Not saved in the cp15 array? */ - if (r && !r->reg) - r = NULL; - - return r; -} - -/* - * These are the invariant cp15 registers: we let the guest see the host - * versions of these, so they're part of the guest state. - * - * A future CPU may provide a mechanism to present different values to - * the guest, or a future kvm may trap them. - */ -/* Unfortunately, there's no register-argument for mrc, so generate. 
*/ -#define FUNCTION_FOR32(crn, crm, op1, op2, name) \ - static void get_##name(struct kvm_vcpu *v, \ - const struct coproc_reg *r) \ - { \ - u32 val; \ - \ - asm volatile("mrc p15, " __stringify(op1) \ - ", %0, c" __stringify(crn) \ - ", c" __stringify(crm) \ - ", " __stringify(op2) "\n" : "=r" (val)); \ - ((struct coproc_reg *)r)->val = val; \ - } - -FUNCTION_FOR32(0, 0, 0, 0, MIDR) -FUNCTION_FOR32(0, 0, 0, 1, CTR) -FUNCTION_FOR32(0, 0, 0, 2, TCMTR) -FUNCTION_FOR32(0, 0, 0, 3, TLBTR) -FUNCTION_FOR32(0, 0, 0, 6, REVIDR) -FUNCTION_FOR32(0, 1, 0, 0, ID_PFR0) -FUNCTION_FOR32(0, 1, 0, 1, ID_PFR1) -FUNCTION_FOR32(0, 1, 0, 2, ID_DFR0) -FUNCTION_FOR32(0, 1, 0, 3, ID_AFR0) -FUNCTION_FOR32(0, 1, 0, 4, ID_MMFR0) -FUNCTION_FOR32(0, 1, 0, 5, ID_MMFR1) -FUNCTION_FOR32(0, 1, 0, 6, ID_MMFR2) -FUNCTION_FOR32(0, 1, 0, 7, ID_MMFR3) -FUNCTION_FOR32(0, 2, 0, 0, ID_ISAR0) -FUNCTION_FOR32(0, 2, 0, 1, ID_ISAR1) -FUNCTION_FOR32(0, 2, 0, 2, ID_ISAR2) -FUNCTION_FOR32(0, 2, 0, 3, ID_ISAR3) -FUNCTION_FOR32(0, 2, 0, 4, ID_ISAR4) -FUNCTION_FOR32(0, 2, 0, 5, ID_ISAR5) -FUNCTION_FOR32(0, 0, 1, 1, CLIDR) -FUNCTION_FOR32(0, 0, 1, 7, AIDR) - -/* ->val is filled in by kvm_invariant_coproc_table_init() */ -static struct coproc_reg invariant_cp15[] = { - { CRn( 0), CRm( 0), Op1( 0), Op2( 0), is32, NULL, get_MIDR }, - { CRn( 0), CRm( 0), Op1( 0), Op2( 1), is32, NULL, get_CTR }, - { CRn( 0), CRm( 0), Op1( 0), Op2( 2), is32, NULL, get_TCMTR }, - { CRn( 0), CRm( 0), Op1( 0), Op2( 3), is32, NULL, get_TLBTR }, - { CRn( 0), CRm( 0), Op1( 0), Op2( 6), is32, NULL, get_REVIDR }, - - { CRn( 0), CRm( 0), Op1( 1), Op2( 1), is32, NULL, get_CLIDR }, - { CRn( 0), CRm( 0), Op1( 1), Op2( 7), is32, NULL, get_AIDR }, - - { CRn( 0), CRm( 1), Op1( 0), Op2( 0), is32, NULL, get_ID_PFR0 }, - { CRn( 0), CRm( 1), Op1( 0), Op2( 1), is32, NULL, get_ID_PFR1 }, - { CRn( 0), CRm( 1), Op1( 0), Op2( 2), is32, NULL, get_ID_DFR0 }, - { CRn( 0), CRm( 1), Op1( 0), Op2( 3), is32, NULL, get_ID_AFR0 }, - { CRn( 0), CRm( 1), Op1( 0), Op2( 4), is32, NULL, get_ID_MMFR0 }, - { CRn( 0), CRm( 1), Op1( 0), Op2( 5), is32, NULL, get_ID_MMFR1 }, - { CRn( 0), CRm( 1), Op1( 0), Op2( 6), is32, NULL, get_ID_MMFR2 }, - { CRn( 0), CRm( 1), Op1( 0), Op2( 7), is32, NULL, get_ID_MMFR3 }, - - { CRn( 0), CRm( 2), Op1( 0), Op2( 0), is32, NULL, get_ID_ISAR0 }, - { CRn( 0), CRm( 2), Op1( 0), Op2( 1), is32, NULL, get_ID_ISAR1 }, - { CRn( 0), CRm( 2), Op1( 0), Op2( 2), is32, NULL, get_ID_ISAR2 }, - { CRn( 0), CRm( 2), Op1( 0), Op2( 3), is32, NULL, get_ID_ISAR3 }, - { CRn( 0), CRm( 2), Op1( 0), Op2( 4), is32, NULL, get_ID_ISAR4 }, - { CRn( 0), CRm( 2), Op1( 0), Op2( 5), is32, NULL, get_ID_ISAR5 }, -}; - -/* - * Reads a register value from a userspace address to a kernel - * variable. Make sure that register size matches sizeof(*__val). - */ -static int reg_from_user(void *val, const void __user *uaddr, u64 id) -{ - if (copy_from_user(val, uaddr, KVM_REG_SIZE(id)) != 0) - return -EFAULT; - return 0; -} - -/* - * Writes a register value to a userspace address from a kernel variable. - * Make sure that register size matches sizeof(*__val). 
- */ -static int reg_to_user(void __user *uaddr, const void *val, u64 id) -{ - if (copy_to_user(uaddr, val, KVM_REG_SIZE(id)) != 0) - return -EFAULT; - return 0; -} - -static int get_invariant_cp15(u64 id, void __user *uaddr) -{ - struct coproc_params params; - const struct coproc_reg *r; - int ret; - - if (!index_to_params(id, ¶ms)) - return -ENOENT; - - r = find_reg(¶ms, invariant_cp15, ARRAY_SIZE(invariant_cp15)); - if (!r) - return -ENOENT; - - ret = -ENOENT; - if (KVM_REG_SIZE(id) == 4) { - u32 val = r->val; - - ret = reg_to_user(uaddr, &val, id); - } else if (KVM_REG_SIZE(id) == 8) { - ret = reg_to_user(uaddr, &r->val, id); - } - return ret; -} - -static int set_invariant_cp15(u64 id, void __user *uaddr) -{ - struct coproc_params params; - const struct coproc_reg *r; - int err; - u64 val; - - if (!index_to_params(id, ¶ms)) - return -ENOENT; - r = find_reg(¶ms, invariant_cp15, ARRAY_SIZE(invariant_cp15)); - if (!r) - return -ENOENT; - - err = -ENOENT; - if (KVM_REG_SIZE(id) == 4) { - u32 val32; - - err = reg_from_user(&val32, uaddr, id); - if (!err) - val = val32; - } else if (KVM_REG_SIZE(id) == 8) { - err = reg_from_user(&val, uaddr, id); - } - if (err) - return err; - - /* This is what we mean by invariant: you can't change it. */ - if (r->val != val) - return -EINVAL; - - return 0; -} - -static bool is_valid_cache(u32 val) -{ - u32 level, ctype; - - if (val >= CSSELR_MAX) - return false; - - /* Bottom bit is Instruction or Data bit. Next 3 bits are level. */ - level = (val >> 1); - ctype = (cache_levels >> (level * 3)) & 7; - - switch (ctype) { - case 0: /* No cache */ - return false; - case 1: /* Instruction cache only */ - return (val & 1); - case 2: /* Data cache only */ - case 4: /* Unified cache */ - return !(val & 1); - case 3: /* Separate instruction and data caches */ - return true; - default: /* Reserved: we can't know instruction or data. */ - return false; - } -} - -/* Which cache CCSIDR represents depends on CSSELR value. */ -static u32 get_ccsidr(u32 csselr) -{ - u32 ccsidr; - - /* Make sure noone else changes CSSELR during this! */ - local_irq_disable(); - /* Put value into CSSELR */ - asm volatile("mcr p15, 2, %0, c0, c0, 0" : : "r" (csselr)); - isb(); - /* Read result out of CCSIDR */ - asm volatile("mrc p15, 1, %0, c0, c0, 0" : "=r" (ccsidr)); - local_irq_enable(); - - return ccsidr; -} - -static int demux_c15_get(u64 id, void __user *uaddr) -{ - u32 val; - u32 __user *uval = uaddr; - - /* Fail if we have unknown bits set. */ - if (id & ~(KVM_REG_ARCH_MASK|KVM_REG_SIZE_MASK|KVM_REG_ARM_COPROC_MASK - | ((1 << KVM_REG_ARM_COPROC_SHIFT)-1))) - return -ENOENT; - - switch (id & KVM_REG_ARM_DEMUX_ID_MASK) { - case KVM_REG_ARM_DEMUX_ID_CCSIDR: - if (KVM_REG_SIZE(id) != 4) - return -ENOENT; - val = (id & KVM_REG_ARM_DEMUX_VAL_MASK) - >> KVM_REG_ARM_DEMUX_VAL_SHIFT; - if (!is_valid_cache(val)) - return -ENOENT; - - return put_user(get_ccsidr(val), uval); - default: - return -ENOENT; - } -} - -static int demux_c15_set(u64 id, void __user *uaddr) -{ - u32 val, newval; - u32 __user *uval = uaddr; - - /* Fail if we have unknown bits set. 
*/ - if (id & ~(KVM_REG_ARCH_MASK|KVM_REG_SIZE_MASK|KVM_REG_ARM_COPROC_MASK - | ((1 << KVM_REG_ARM_COPROC_SHIFT)-1))) - return -ENOENT; - - switch (id & KVM_REG_ARM_DEMUX_ID_MASK) { - case KVM_REG_ARM_DEMUX_ID_CCSIDR: - if (KVM_REG_SIZE(id) != 4) - return -ENOENT; - val = (id & KVM_REG_ARM_DEMUX_VAL_MASK) - >> KVM_REG_ARM_DEMUX_VAL_SHIFT; - if (!is_valid_cache(val)) - return -ENOENT; - - if (get_user(newval, uval)) - return -EFAULT; - - /* This is also invariant: you can't change it. */ - if (newval != get_ccsidr(val)) - return -EINVAL; - return 0; - default: - return -ENOENT; - } -} - -#ifdef CONFIG_VFPv3 -static const int vfp_sysregs[] = { KVM_REG_ARM_VFP_FPEXC, - KVM_REG_ARM_VFP_FPSCR, - KVM_REG_ARM_VFP_FPINST, - KVM_REG_ARM_VFP_FPINST2, - KVM_REG_ARM_VFP_MVFR0, - KVM_REG_ARM_VFP_MVFR1, - KVM_REG_ARM_VFP_FPSID }; - -static unsigned int num_fp_regs(void) -{ - if (((fmrx(MVFR0) & MVFR0_A_SIMD_MASK) >> MVFR0_A_SIMD_BIT) == 2) - return 32; - else - return 16; -} - -static unsigned int num_vfp_regs(void) -{ - /* Normal FP regs + control regs. */ - return num_fp_regs() + ARRAY_SIZE(vfp_sysregs); -} - -static int copy_vfp_regids(u64 __user *uindices) -{ - unsigned int i; - const u64 u32reg = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_VFP; - const u64 u64reg = KVM_REG_ARM | KVM_REG_SIZE_U64 | KVM_REG_ARM_VFP; - - for (i = 0; i < num_fp_regs(); i++) { - if (put_user((u64reg | KVM_REG_ARM_VFP_BASE_REG) + i, - uindices)) - return -EFAULT; - uindices++; - } - - for (i = 0; i < ARRAY_SIZE(vfp_sysregs); i++) { - if (put_user(u32reg | vfp_sysregs[i], uindices)) - return -EFAULT; - uindices++; - } - - return num_vfp_regs(); -} - -static int vfp_get_reg(const struct kvm_vcpu *vcpu, u64 id, void __user *uaddr) -{ - u32 vfpid = (id & KVM_REG_ARM_VFP_MASK); - u32 val; - - /* Fail if we have unknown bits set. */ - if (id & ~(KVM_REG_ARCH_MASK|KVM_REG_SIZE_MASK|KVM_REG_ARM_COPROC_MASK - | ((1 << KVM_REG_ARM_COPROC_SHIFT)-1))) - return -ENOENT; - - if (vfpid < num_fp_regs()) { - if (KVM_REG_SIZE(id) != 8) - return -ENOENT; - return reg_to_user(uaddr, &vcpu->arch.ctxt.vfp.fpregs[vfpid], - id); - } - - /* FP control registers are all 32 bit. */ - if (KVM_REG_SIZE(id) != 4) - return -ENOENT; - - switch (vfpid) { - case KVM_REG_ARM_VFP_FPEXC: - return reg_to_user(uaddr, &vcpu->arch.ctxt.vfp.fpexc, id); - case KVM_REG_ARM_VFP_FPSCR: - return reg_to_user(uaddr, &vcpu->arch.ctxt.vfp.fpscr, id); - case KVM_REG_ARM_VFP_FPINST: - return reg_to_user(uaddr, &vcpu->arch.ctxt.vfp.fpinst, id); - case KVM_REG_ARM_VFP_FPINST2: - return reg_to_user(uaddr, &vcpu->arch.ctxt.vfp.fpinst2, id); - case KVM_REG_ARM_VFP_MVFR0: - val = fmrx(MVFR0); - return reg_to_user(uaddr, &val, id); - case KVM_REG_ARM_VFP_MVFR1: - val = fmrx(MVFR1); - return reg_to_user(uaddr, &val, id); - case KVM_REG_ARM_VFP_FPSID: - val = fmrx(FPSID); - return reg_to_user(uaddr, &val, id); - default: - return -ENOENT; - } -} - -static int vfp_set_reg(struct kvm_vcpu *vcpu, u64 id, const void __user *uaddr) -{ - u32 vfpid = (id & KVM_REG_ARM_VFP_MASK); - u32 val; - - /* Fail if we have unknown bits set. */ - if (id & ~(KVM_REG_ARCH_MASK|KVM_REG_SIZE_MASK|KVM_REG_ARM_COPROC_MASK - | ((1 << KVM_REG_ARM_COPROC_SHIFT)-1))) - return -ENOENT; - - if (vfpid < num_fp_regs()) { - if (KVM_REG_SIZE(id) != 8) - return -ENOENT; - return reg_from_user(&vcpu->arch.ctxt.vfp.fpregs[vfpid], - uaddr, id); - } - - /* FP control registers are all 32 bit. 
*/ - if (KVM_REG_SIZE(id) != 4) - return -ENOENT; - - switch (vfpid) { - case KVM_REG_ARM_VFP_FPEXC: - return reg_from_user(&vcpu->arch.ctxt.vfp.fpexc, uaddr, id); - case KVM_REG_ARM_VFP_FPSCR: - return reg_from_user(&vcpu->arch.ctxt.vfp.fpscr, uaddr, id); - case KVM_REG_ARM_VFP_FPINST: - return reg_from_user(&vcpu->arch.ctxt.vfp.fpinst, uaddr, id); - case KVM_REG_ARM_VFP_FPINST2: - return reg_from_user(&vcpu->arch.ctxt.vfp.fpinst2, uaddr, id); - /* These are invariant. */ - case KVM_REG_ARM_VFP_MVFR0: - if (reg_from_user(&val, uaddr, id)) - return -EFAULT; - if (val != fmrx(MVFR0)) - return -EINVAL; - return 0; - case KVM_REG_ARM_VFP_MVFR1: - if (reg_from_user(&val, uaddr, id)) - return -EFAULT; - if (val != fmrx(MVFR1)) - return -EINVAL; - return 0; - case KVM_REG_ARM_VFP_FPSID: - if (reg_from_user(&val, uaddr, id)) - return -EFAULT; - if (val != fmrx(FPSID)) - return -EINVAL; - return 0; - default: - return -ENOENT; - } -} -#else /* !CONFIG_VFPv3 */ -static unsigned int num_vfp_regs(void) -{ - return 0; -} - -static int copy_vfp_regids(u64 __user *uindices) -{ - return 0; -} - -static int vfp_get_reg(const struct kvm_vcpu *vcpu, u64 id, void __user *uaddr) -{ - return -ENOENT; -} - -static int vfp_set_reg(struct kvm_vcpu *vcpu, u64 id, const void __user *uaddr) -{ - return -ENOENT; -} -#endif /* !CONFIG_VFPv3 */ - -int kvm_arm_coproc_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - const struct coproc_reg *r; - void __user *uaddr = (void __user *)(long)reg->addr; - int ret; - - if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) - return demux_c15_get(reg->id, uaddr); - - if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_VFP) - return vfp_get_reg(vcpu, reg->id, uaddr); - - r = index_to_coproc_reg(vcpu, reg->id); - if (!r) - return get_invariant_cp15(reg->id, uaddr); - - ret = -ENOENT; - if (KVM_REG_SIZE(reg->id) == 8) { - u64 val; - - val = vcpu_cp15_reg64_get(vcpu, r); - ret = reg_to_user(uaddr, &val, reg->id); - } else if (KVM_REG_SIZE(reg->id) == 4) { - ret = reg_to_user(uaddr, &vcpu_cp15(vcpu, r->reg), reg->id); - } - - return ret; -} - -int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - const struct coproc_reg *r; - void __user *uaddr = (void __user *)(long)reg->addr; - int ret; - - if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_DEMUX) - return demux_c15_set(reg->id, uaddr); - - if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_VFP) - return vfp_set_reg(vcpu, reg->id, uaddr); - - r = index_to_coproc_reg(vcpu, reg->id); - if (!r) - return set_invariant_cp15(reg->id, uaddr); - - ret = -ENOENT; - if (KVM_REG_SIZE(reg->id) == 8) { - u64 val; - - ret = reg_from_user(&val, uaddr, reg->id); - if (!ret) - vcpu_cp15_reg64_set(vcpu, r, val); - } else if (KVM_REG_SIZE(reg->id) == 4) { - ret = reg_from_user(&vcpu_cp15(vcpu, r->reg), uaddr, reg->id); - } - - return ret; -} - -static unsigned int num_demux_regs(void) -{ - unsigned int i, count = 0; - - for (i = 0; i < CSSELR_MAX; i++) - if (is_valid_cache(i)) - count++; - - return count; -} - -static int write_demux_regids(u64 __user *uindices) -{ - u64 val = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_DEMUX; - unsigned int i; - - val |= KVM_REG_ARM_DEMUX_ID_CCSIDR; - for (i = 0; i < CSSELR_MAX; i++) { - if (!is_valid_cache(i)) - continue; - if (put_user(val | i, uindices)) - return -EFAULT; - uindices++; - } - return 0; -} - -static u64 cp15_to_index(const struct coproc_reg *reg) -{ - u64 val = KVM_REG_ARM | (15 << KVM_REG_ARM_COPROC_SHIFT); - if (reg->is_64bit) { 
- val |= KVM_REG_SIZE_U64; - val |= (reg->Op1 << KVM_REG_ARM_OPC1_SHIFT); - /* - * CRn always denotes the primary coproc. reg. nr. for the - * in-kernel representation, but the user space API uses the - * CRm for the encoding, because it is modelled after the - * MRRC/MCRR instructions: see the ARM ARM rev. c page - * B3-1445 - */ - val |= (reg->CRn << KVM_REG_ARM_CRM_SHIFT); - } else { - val |= KVM_REG_SIZE_U32; - val |= (reg->Op1 << KVM_REG_ARM_OPC1_SHIFT); - val |= (reg->Op2 << KVM_REG_ARM_32_OPC2_SHIFT); - val |= (reg->CRm << KVM_REG_ARM_CRM_SHIFT); - val |= (reg->CRn << KVM_REG_ARM_32_CRN_SHIFT); - } - return val; -} - -static bool copy_reg_to_user(const struct coproc_reg *reg, u64 __user **uind) -{ - if (!*uind) - return true; - - if (put_user(cp15_to_index(reg), *uind)) - return false; - - (*uind)++; - return true; -} - -/* Assumed ordered tables, see kvm_coproc_table_init. */ -static int walk_cp15(struct kvm_vcpu *vcpu, u64 __user *uind) -{ - const struct coproc_reg *i1, *i2, *end1, *end2; - unsigned int total = 0; - size_t num; - - /* We check for duplicates here, to allow arch-specific overrides. */ - i1 = get_target_table(vcpu->arch.target, &num); - end1 = i1 + num; - i2 = cp15_regs; - end2 = cp15_regs + ARRAY_SIZE(cp15_regs); - - BUG_ON(i1 == end1 || i2 == end2); - - /* Walk carefully, as both tables may refer to the same register. */ - while (i1 || i2) { - int cmp = cmp_reg(i1, i2); - /* target-specific overrides generic entry. */ - if (cmp <= 0) { - /* Ignore registers we trap but don't save. */ - if (i1->reg) { - if (!copy_reg_to_user(i1, &uind)) - return -EFAULT; - total++; - } - } else { - /* Ignore registers we trap but don't save. */ - if (i2->reg) { - if (!copy_reg_to_user(i2, &uind)) - return -EFAULT; - total++; - } - } - - if (cmp <= 0 && ++i1 == end1) - i1 = NULL; - if (cmp >= 0 && ++i2 == end2) - i2 = NULL; - } - return total; -} - -unsigned long kvm_arm_num_coproc_regs(struct kvm_vcpu *vcpu) -{ - return ARRAY_SIZE(invariant_cp15) - + num_demux_regs() - + num_vfp_regs() - + walk_cp15(vcpu, (u64 __user *)NULL); -} - -int kvm_arm_copy_coproc_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) -{ - unsigned int i; - int err; - - /* Then give them all the invariant registers' indices. */ - for (i = 0; i < ARRAY_SIZE(invariant_cp15); i++) { - if (put_user(cp15_to_index(&invariant_cp15[i]), uindices)) - return -EFAULT; - uindices++; - } - - err = walk_cp15(vcpu, uindices); - if (err < 0) - return err; - uindices += err; - - err = copy_vfp_regids(uindices); - if (err < 0) - return err; - uindices += err; - - return write_demux_regids(uindices); -} - -void kvm_coproc_table_init(void) -{ - unsigned int i; - - /* Make sure tables are unique and in order. */ - BUG_ON(check_reg_table(cp15_regs, ARRAY_SIZE(cp15_regs))); - BUG_ON(check_reg_table(invariant_cp15, ARRAY_SIZE(invariant_cp15))); - - /* We abuse the reset function to overwrite the table itself. */ - for (i = 0; i < ARRAY_SIZE(invariant_cp15); i++) - invariant_cp15[i].reset(NULL, &invariant_cp15[i]); - - /* - * CLIDR format is awkward, so clean it up. See ARM B4.1.20: - * - * If software reads the Cache Type fields from Ctype1 - * upwards, once it has seen a value of 0b000, no caches - * exist at further-out levels of the hierarchy. So, for - * example, if Ctype3 is the first Cache Type field with a - * value of 0b000, the values of Ctype4 to Ctype7 must be - * ignored. 
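[Editorial aside] For reference, the same 32-bit packing viewed from user space; a sketch that assumes the KVM_REG_ARM_* shift macros from the 32-bit ARM uapi headers (the names match the ones used by cp15_to_index() above):

#include <linux/kvm.h>
#include <stdint.h>

/* Build the KVM_GET/SET_ONE_REG id of a 32-bit cp15 register. */
static uint64_t cp15_reg32_id(unsigned int crn, unsigned int crm,
			      unsigned int op1, unsigned int op2)
{
	return KVM_REG_ARM | KVM_REG_SIZE_U32 |
	       (15ULL << KVM_REG_ARM_COPROC_SHIFT) |
	       ((uint64_t)op1 << KVM_REG_ARM_OPC1_SHIFT) |
	       ((uint64_t)op2 << KVM_REG_ARM_32_OPC2_SHIFT) |
	       ((uint64_t)crm << KVM_REG_ARM_CRM_SHIFT) |
	       ((uint64_t)crn << KVM_REG_ARM_32_CRN_SHIFT);
}

/* Example: SCTLR is CRn=1, CRm=0, Op1=0, Op2=0 -> cp15_reg32_id(1, 0, 0, 0). */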
- */ - asm volatile("mrc p15, 1, %0, c0, c0, 1" : "=r" (cache_levels)); - for (i = 0; i < 7; i++) - if (((cache_levels >> (i*3)) & 7) == 0) - break; - /* Clear all higher bits. */ - cache_levels &= (1 << (i*3))-1; -} - -/** - * kvm_reset_coprocs - sets cp15 registers to reset value - * @vcpu: The VCPU pointer - * - * This function finds the right table above and sets the registers on the - * virtual CPU struct to their architecturally defined reset values. - */ -void kvm_reset_coprocs(struct kvm_vcpu *vcpu) -{ - size_t num; - const struct coproc_reg *table; - DECLARE_BITMAP(bmap, NR_CP15_REGS) = { 0, }; - - /* Generic chip reset first (so target could override). */ - reset_coproc_regs(vcpu, cp15_regs, ARRAY_SIZE(cp15_regs), bmap); - - table = get_target_table(vcpu->arch.target, &num); - reset_coproc_regs(vcpu, table, num, bmap); - - for (num = 1; num < NR_CP15_REGS; num++) - WARN(!test_bit(num, bmap), - "Didn't reset vcpu_cp15(vcpu, %zi)", num); -} diff --git a/arch/arm/kvm/coproc.h b/arch/arm/kvm/coproc.h deleted file mode 100644 index 637065b13012..000000000000 --- a/arch/arm/kvm/coproc.h +++ /dev/null @@ -1,130 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Authors: Christoffer Dall <c.dall@virtualopensystems.com> - */ - -#ifndef __ARM_KVM_COPROC_LOCAL_H__ -#define __ARM_KVM_COPROC_LOCAL_H__ - -struct coproc_params { - unsigned long CRn; - unsigned long CRm; - unsigned long Op1; - unsigned long Op2; - unsigned long Rt1; - unsigned long Rt2; - bool is_64bit; - bool is_write; -}; - -struct coproc_reg { - /* MRC/MCR/MRRC/MCRR instruction which accesses it. */ - unsigned long CRn; - unsigned long CRm; - unsigned long Op1; - unsigned long Op2; - - bool is_64bit; - - /* Trapped access from guest, if non-NULL. */ - bool (*access)(struct kvm_vcpu *, - const struct coproc_params *, - const struct coproc_reg *); - - /* Initialization for vcpu. */ - void (*reset)(struct kvm_vcpu *, const struct coproc_reg *); - - /* Index into vcpu_cp15(vcpu, ...), or 0 if we don't need to save it. */ - unsigned long reg; - - /* Value (usually reset value) */ - u64 val; -}; - -static inline void print_cp_instr(const struct coproc_params *p) -{ - /* Look, we even formatted it for you to paste into the table! */ - if (p->is_64bit) { - kvm_pr_unimpl(" { CRm64(%2lu), Op1(%2lu), is64, func_%s },\n", - p->CRn, p->Op1, p->is_write ? "write" : "read"); - } else { - kvm_pr_unimpl(" { CRn(%2lu), CRm(%2lu), Op1(%2lu), Op2(%2lu), is32," - " func_%s },\n", - p->CRn, p->CRm, p->Op1, p->Op2, - p->is_write ? 
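[Editorial aside] The CLIDR trimming above is plain bit arithmetic; here is a stand-alone sketch of the same scan, for illustration only:

#include <stdio.h>
#include <stdint.h>

/* Keep only the Ctype fields up to (not including) the first 0b000. */
static uint32_t trim_clidr(uint32_t clidr)
{
	unsigned int i;

	for (i = 0; i < 7; i++)
		if (((clidr >> (i * 3)) & 7) == 0)
			break;
	return clidr & ((1u << (i * 3)) - 1);
}

int main(void)
{
	/* Ctype1=0b011 (split I/D), Ctype2=0b100 (unified), Ctype3=0b000 */
	printf("%#x\n", trim_clidr(0x023));	/* prints 0x23 */
	return 0;
}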
"write" : "read"); - } -} - -static inline bool ignore_write(struct kvm_vcpu *vcpu, - const struct coproc_params *p) -{ - return true; -} - -static inline bool read_zero(struct kvm_vcpu *vcpu, - const struct coproc_params *p) -{ - *vcpu_reg(vcpu, p->Rt1) = 0; - return true; -} - -/* Reset functions */ -static inline void reset_unknown(struct kvm_vcpu *vcpu, - const struct coproc_reg *r) -{ - BUG_ON(!r->reg); - BUG_ON(r->reg >= ARRAY_SIZE(vcpu->arch.ctxt.cp15)); - vcpu_cp15(vcpu, r->reg) = 0xdecafbad; -} - -static inline void reset_val(struct kvm_vcpu *vcpu, const struct coproc_reg *r) -{ - BUG_ON(!r->reg); - BUG_ON(r->reg >= ARRAY_SIZE(vcpu->arch.ctxt.cp15)); - vcpu_cp15(vcpu, r->reg) = r->val; -} - -static inline void reset_unknown64(struct kvm_vcpu *vcpu, - const struct coproc_reg *r) -{ - BUG_ON(!r->reg); - BUG_ON(r->reg + 1 >= ARRAY_SIZE(vcpu->arch.ctxt.cp15)); - - vcpu_cp15(vcpu, r->reg) = 0xdecafbad; - vcpu_cp15(vcpu, r->reg+1) = 0xd0c0ffee; -} - -static inline int cmp_reg(const struct coproc_reg *i1, - const struct coproc_reg *i2) -{ - BUG_ON(i1 == i2); - if (!i1) - return 1; - else if (!i2) - return -1; - if (i1->CRn != i2->CRn) - return i1->CRn - i2->CRn; - if (i1->CRm != i2->CRm) - return i1->CRm - i2->CRm; - if (i1->Op1 != i2->Op1) - return i1->Op1 - i2->Op1; - if (i1->Op2 != i2->Op2) - return i1->Op2 - i2->Op2; - return i2->is_64bit - i1->is_64bit; -} - - -#define CRn(_x) .CRn = _x -#define CRm(_x) .CRm = _x -#define CRm64(_x) .CRn = _x, .CRm = 0 -#define Op1(_x) .Op1 = _x -#define Op2(_x) .Op2 = _x -#define is64 .is_64bit = true -#define is32 .is_64bit = false - -bool access_vm_reg(struct kvm_vcpu *vcpu, - const struct coproc_params *p, - const struct coproc_reg *r); - -#endif /* __ARM_KVM_COPROC_LOCAL_H__ */ diff --git a/arch/arm/kvm/coproc_a15.c b/arch/arm/kvm/coproc_a15.c deleted file mode 100644 index 36bf15421ae8..000000000000 --- a/arch/arm/kvm/coproc_a15.c +++ /dev/null @@ -1,39 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Authors: Rusty Russell <rusty@rustcorp.au> - * Christoffer Dall <c.dall@virtualopensystems.com> - */ -#include <linux/kvm_host.h> -#include <asm/kvm_coproc.h> -#include <asm/kvm_emulate.h> -#include <linux/init.h> - -#include "coproc.h" - -/* - * A15-specific CP15 registers. - * CRn denotes the primary register number, but is copied to the CRm in the - * user space API for 64-bit register access in line with the terminology used - * in the ARM ARM. - * Important: Must be sorted ascending by CRn, CRM, Op1, Op2 and with 64-bit - * registers preceding 32-bit ones. - */ -static const struct coproc_reg a15_regs[] = { - /* SCTLR: swapped by interrupt.S. 
*/ - { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32, - access_vm_reg, reset_val, c1_SCTLR, 0x00C50078 }, -}; - -static struct kvm_coproc_target_table a15_target_table = { - .target = KVM_ARM_TARGET_CORTEX_A15, - .table = a15_regs, - .num = ARRAY_SIZE(a15_regs), -}; - -static int __init coproc_a15_init(void) -{ - kvm_register_target_coproc_table(&a15_target_table); - return 0; -} -late_initcall(coproc_a15_init); diff --git a/arch/arm/kvm/coproc_a7.c b/arch/arm/kvm/coproc_a7.c deleted file mode 100644 index 40f643e1e05c..000000000000 --- a/arch/arm/kvm/coproc_a7.c +++ /dev/null @@ -1,42 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Copyright (C) 2013 - ARM Ltd - * - * Authors: Rusty Russell <rusty@rustcorp.au> - * Christoffer Dall <c.dall@virtualopensystems.com> - * Jonathan Austin <jonathan.austin@arm.com> - */ -#include <linux/kvm_host.h> -#include <asm/kvm_coproc.h> -#include <asm/kvm_emulate.h> -#include <linux/init.h> - -#include "coproc.h" - -/* - * Cortex-A7 specific CP15 registers. - * CRn denotes the primary register number, but is copied to the CRm in the - * user space API for 64-bit register access in line with the terminology used - * in the ARM ARM. - * Important: Must be sorted ascending by CRn, CRM, Op1, Op2 and with 64-bit - * registers preceding 32-bit ones. - */ -static const struct coproc_reg a7_regs[] = { - /* SCTLR: swapped by interrupt.S. */ - { CRn( 1), CRm( 0), Op1( 0), Op2( 0), is32, - access_vm_reg, reset_val, c1_SCTLR, 0x00C50878 }, -}; - -static struct kvm_coproc_target_table a7_target_table = { - .target = KVM_ARM_TARGET_CORTEX_A7, - .table = a7_regs, - .num = ARRAY_SIZE(a7_regs), -}; - -static int __init coproc_a7_init(void) -{ - kvm_register_target_coproc_table(&a7_target_table); - return 0; -} -late_initcall(coproc_a7_init); diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c deleted file mode 100644 index 29bb852140c5..000000000000 --- a/arch/arm/kvm/emulate.c +++ /dev/null @@ -1,166 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - */ - -#include <linux/mm.h> -#include <linux/kvm_host.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_emulate.h> -#include <asm/opcodes.h> -#include <trace/events/kvm.h> - -#include "trace.h" - -#define VCPU_NR_MODES 6 -#define VCPU_REG_OFFSET_USR 0 -#define VCPU_REG_OFFSET_FIQ 1 -#define VCPU_REG_OFFSET_IRQ 2 -#define VCPU_REG_OFFSET_SVC 3 -#define VCPU_REG_OFFSET_ABT 4 -#define VCPU_REG_OFFSET_UND 5 -#define REG_OFFSET(_reg) \ - (offsetof(struct kvm_regs, _reg) / sizeof(u32)) - -#define USR_REG_OFFSET(_num) REG_OFFSET(usr_regs.uregs[_num]) - -static const unsigned long vcpu_reg_offsets[VCPU_NR_MODES][15] = { - /* USR/SYS Registers */ - [VCPU_REG_OFFSET_USR] = { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), - USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), - USR_REG_OFFSET(12), USR_REG_OFFSET(13), USR_REG_OFFSET(14), - }, - - /* FIQ Registers */ - [VCPU_REG_OFFSET_FIQ] = { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), - REG_OFFSET(fiq_regs[0]), /* r8 */ - REG_OFFSET(fiq_regs[1]), /* r9 */ - REG_OFFSET(fiq_regs[2]), /* r10 */ - REG_OFFSET(fiq_regs[3]), /* r11 
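[Editorial aside] The entries in vcpu_reg_offsets[] are just word indices into struct kvm_regs; a sketch of the same arithmetic done from user space (only builds against the 32-bit ARM uapi headers, so the struct layout is an assumption of the host):

#include <linux/kvm.h>
#include <stddef.h>
#include <stdio.h>

#define REG_OFF(_reg) (offsetof(struct kvm_regs, _reg) / sizeof(__u32))

int main(void)
{
	/* In FIQ mode r8 is banked: it lives in fiq_regs[0], not uregs[8]. */
	printf("usr r8 word index: %zu\n", REG_OFF(usr_regs.uregs[8]));
	printf("fiq r8 word index: %zu\n", REG_OFF(fiq_regs[0]));
	return 0;
}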
*/ - REG_OFFSET(fiq_regs[4]), /* r12 */ - REG_OFFSET(fiq_regs[5]), /* r13 */ - REG_OFFSET(fiq_regs[6]), /* r14 */ - }, - - /* IRQ Registers */ - [VCPU_REG_OFFSET_IRQ] = { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), - USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), - USR_REG_OFFSET(12), - REG_OFFSET(irq_regs[0]), /* r13 */ - REG_OFFSET(irq_regs[1]), /* r14 */ - }, - - /* SVC Registers */ - [VCPU_REG_OFFSET_SVC] = { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), - USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), - USR_REG_OFFSET(12), - REG_OFFSET(svc_regs[0]), /* r13 */ - REG_OFFSET(svc_regs[1]), /* r14 */ - }, - - /* ABT Registers */ - [VCPU_REG_OFFSET_ABT] = { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), - USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), - USR_REG_OFFSET(12), - REG_OFFSET(abt_regs[0]), /* r13 */ - REG_OFFSET(abt_regs[1]), /* r14 */ - }, - - /* UND Registers */ - [VCPU_REG_OFFSET_UND] = { - USR_REG_OFFSET(0), USR_REG_OFFSET(1), USR_REG_OFFSET(2), - USR_REG_OFFSET(3), USR_REG_OFFSET(4), USR_REG_OFFSET(5), - USR_REG_OFFSET(6), USR_REG_OFFSET(7), USR_REG_OFFSET(8), - USR_REG_OFFSET(9), USR_REG_OFFSET(10), USR_REG_OFFSET(11), - USR_REG_OFFSET(12), - REG_OFFSET(und_regs[0]), /* r13 */ - REG_OFFSET(und_regs[1]), /* r14 */ - }, -}; - -/* - * Return a pointer to the register number valid in the current mode of - * the virtual CPU. - */ -unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num) -{ - unsigned long *reg_array = (unsigned long *)&vcpu->arch.ctxt.gp_regs; - unsigned long mode = *vcpu_cpsr(vcpu) & MODE_MASK; - - switch (mode) { - case USR_MODE...SVC_MODE: - mode &= ~MODE32_BIT; /* 0 ... 3 */ - break; - - case ABT_MODE: - mode = VCPU_REG_OFFSET_ABT; - break; - - case UND_MODE: - mode = VCPU_REG_OFFSET_UND; - break; - - case SYSTEM_MODE: - mode = VCPU_REG_OFFSET_USR; - break; - - default: - BUG(); - } - - return reg_array + vcpu_reg_offsets[mode][reg_num]; -} - -/* - * Return the SPSR for the current mode of the virtual CPU. - */ -unsigned long *__vcpu_spsr(struct kvm_vcpu *vcpu) -{ - unsigned long mode = *vcpu_cpsr(vcpu) & MODE_MASK; - switch (mode) { - case SVC_MODE: - return &vcpu->arch.ctxt.gp_regs.KVM_ARM_SVC_spsr; - case ABT_MODE: - return &vcpu->arch.ctxt.gp_regs.KVM_ARM_ABT_spsr; - case UND_MODE: - return &vcpu->arch.ctxt.gp_regs.KVM_ARM_UND_spsr; - case IRQ_MODE: - return &vcpu->arch.ctxt.gp_regs.KVM_ARM_IRQ_spsr; - case FIQ_MODE: - return &vcpu->arch.ctxt.gp_regs.KVM_ARM_FIQ_spsr; - default: - BUG(); - } -} - -/****************************************************************************** - * Inject exceptions into the guest - */ - -/** - * kvm_inject_vabt - inject an async abort / SError into the guest - * @vcpu: The VCPU to receive the exception - * - * It is assumed that this code is called from the VCPU thread and that the - * VCPU therefore is not currently executing guest code. 
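[Editorial aside] The mode decode in vcpu_reg() folds the CPSR mode bits onto the row indices of the table above; a stand-alone sketch using the architectural mode encodings (USR=0x10 ... SYS=0x1f, case ranges are the same GCC/Clang extension the original uses):

#include <stdio.h>

enum { OFF_USR, OFF_FIQ, OFF_IRQ, OFF_SVC, OFF_ABT, OFF_UND };

static int mode_to_row(unsigned int cpsr)
{
	unsigned int mode = cpsr & 0x1f;		/* MODE_MASK */

	switch (mode) {
	case 0x10 ... 0x13:	return mode & ~0x10;	/* USR/FIQ/IRQ/SVC -> 0..3 */
	case 0x17:		return OFF_ABT;
	case 0x1b:		return OFF_UND;
	case 0x1f:		return OFF_USR;		/* SYS shares the USR bank */
	default:		return -1;
	}
}

int main(void)
{
	printf("FIQ row: %d\n", mode_to_row(0x11));	/* prints 1 */
	return 0;
}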
- */ -void kvm_inject_vabt(struct kvm_vcpu *vcpu) -{ - *vcpu_hcr(vcpu) |= HCR_VA; -} diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c deleted file mode 100644 index 9f7ae0d8690f..000000000000 --- a/arch/arm/kvm/guest.c +++ /dev/null @@ -1,387 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - */ - -#include <linux/errno.h> -#include <linux/err.h> -#include <linux/kvm_host.h> -#include <linux/module.h> -#include <linux/vmalloc.h> -#include <linux/fs.h> -#include <kvm/arm_psci.h> -#include <asm/cputype.h> -#include <linux/uaccess.h> -#include <asm/kvm.h> -#include <asm/kvm_emulate.h> -#include <asm/kvm_coproc.h> - -#define VM_STAT(x) { #x, offsetof(struct kvm, stat.x), KVM_STAT_VM } -#define VCPU_STAT(x) { #x, offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU } - -struct kvm_stats_debugfs_item debugfs_entries[] = { - VCPU_STAT(halt_successful_poll), - VCPU_STAT(halt_attempted_poll), - VCPU_STAT(halt_poll_invalid), - VCPU_STAT(halt_wakeup), - VCPU_STAT(hvc_exit_stat), - VCPU_STAT(wfe_exit_stat), - VCPU_STAT(wfi_exit_stat), - VCPU_STAT(mmio_exit_user), - VCPU_STAT(mmio_exit_kernel), - VCPU_STAT(exits), - { NULL } -}; - -static u64 core_reg_offset_from_id(u64 id) -{ - return id & ~(KVM_REG_ARCH_MASK | KVM_REG_SIZE_MASK | KVM_REG_ARM_CORE); -} - -static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - u32 __user *uaddr = (u32 __user *)(long)reg->addr; - struct kvm_regs *regs = &vcpu->arch.ctxt.gp_regs; - u64 off; - - if (KVM_REG_SIZE(reg->id) != 4) - return -ENOENT; - - /* Our ID is an index into the kvm_regs struct. */ - off = core_reg_offset_from_id(reg->id); - if (off >= sizeof(*regs) / KVM_REG_SIZE(reg->id)) - return -ENOENT; - - return put_user(((u32 *)regs)[off], uaddr); -} - -static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - u32 __user *uaddr = (u32 __user *)(long)reg->addr; - struct kvm_regs *regs = &vcpu->arch.ctxt.gp_regs; - u64 off, val; - - if (KVM_REG_SIZE(reg->id) != 4) - return -ENOENT; - - /* Our ID is an index into the kvm_regs struct. 
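[Editorial aside] From user space the "index into kvm_regs" scheme looks like this; a minimal sketch (assumes a 32-bit ARM host and an existing vcpu_fd) that reads the guest's usr r0:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stddef.h>
#include <stdint.h>

static int read_guest_r0(int vcpu_fd, uint32_t *r0)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE |
			(offsetof(struct kvm_regs, usr_regs.uregs[0]) /
			 sizeof(uint32_t)),
		.addr = (uintptr_t)r0,
	};

	/* Served by get_core_reg() above. */
	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
}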
*/ - off = core_reg_offset_from_id(reg->id); - if (off >= sizeof(*regs) / KVM_REG_SIZE(reg->id)) - return -ENOENT; - - if (get_user(val, uaddr) != 0) - return -EFAULT; - - if (off == KVM_REG_ARM_CORE_REG(usr_regs.ARM_cpsr)) { - unsigned long mode = val & MODE_MASK; - switch (mode) { - case USR_MODE: - case FIQ_MODE: - case IRQ_MODE: - case SVC_MODE: - case ABT_MODE: - case UND_MODE: - break; - default: - return -EINVAL; - } - } - - ((u32 *)regs)[off] = val; - return 0; -} - -int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) -{ - return -EINVAL; -} - -int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) -{ - return -EINVAL; -} - -#define NUM_TIMER_REGS 3 - -static bool is_timer_reg(u64 index) -{ - switch (index) { - case KVM_REG_ARM_TIMER_CTL: - case KVM_REG_ARM_TIMER_CNT: - case KVM_REG_ARM_TIMER_CVAL: - return true; - } - return false; -} - -static int copy_timer_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) -{ - if (put_user(KVM_REG_ARM_TIMER_CTL, uindices)) - return -EFAULT; - uindices++; - if (put_user(KVM_REG_ARM_TIMER_CNT, uindices)) - return -EFAULT; - uindices++; - if (put_user(KVM_REG_ARM_TIMER_CVAL, uindices)) - return -EFAULT; - - return 0; -} - -static int set_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - void __user *uaddr = (void __user *)(long)reg->addr; - u64 val; - int ret; - - ret = copy_from_user(&val, uaddr, KVM_REG_SIZE(reg->id)); - if (ret != 0) - return -EFAULT; - - return kvm_arm_timer_set_reg(vcpu, reg->id, val); -} - -static int get_timer_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - void __user *uaddr = (void __user *)(long)reg->addr; - u64 val; - - val = kvm_arm_timer_get_reg(vcpu, reg->id); - return copy_to_user(uaddr, &val, KVM_REG_SIZE(reg->id)) ? -EFAULT : 0; -} - -static unsigned long num_core_regs(void) -{ - return sizeof(struct kvm_regs) / sizeof(u32); -} - -/** - * kvm_arm_num_regs - how many registers do we present via KVM_GET_ONE_REG - * - * This is for all registers. - */ -unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu) -{ - return num_core_regs() + kvm_arm_num_coproc_regs(vcpu) - + kvm_arm_get_fw_num_regs(vcpu) - + NUM_TIMER_REGS; -} - -/** - * kvm_arm_copy_reg_indices - get indices of all registers. - * - * We do core registers right here, then we append coproc regs. - */ -int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) -{ - unsigned int i; - const u64 core_reg = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE; - int ret; - - for (i = 0; i < sizeof(struct kvm_regs)/sizeof(u32); i++) { - if (put_user(core_reg | i, uindices)) - return -EFAULT; - uindices++; - } - - ret = kvm_arm_copy_fw_reg_indices(vcpu, uindices); - if (ret) - return ret; - uindices += kvm_arm_get_fw_num_regs(vcpu); - - ret = copy_timer_indices(vcpu, uindices); - if (ret) - return ret; - uindices += NUM_TIMER_REGS; - - return kvm_arm_copy_coproc_indices(vcpu, uindices); -} - -int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - /* We currently use nothing arch-specific in upper 32 bits */ - if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM >> 32) - return -EINVAL; - - /* Register group 16 means we want a core register. 
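[Editorial aside] All of the IDs written by kvm_arm_copy_reg_indices() (core, firmware, timer and coprocessor registers) reach user space through KVM_GET_REG_LIST; a sketch of the usual two-call pattern against an existing vcpu_fd:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdlib.h>
#include <errno.h>

static struct kvm_reg_list *get_reg_list(int vcpu_fd)
{
	struct kvm_reg_list probe = { .n = 0 }, *list;

	/* The first call fails with E2BIG but reports how many IDs exist. */
	if (ioctl(vcpu_fd, KVM_GET_REG_LIST, &probe) == 0 || errno != E2BIG)
		return NULL;

	list = malloc(sizeof(*list) + probe.n * sizeof(__u64));
	if (!list)
		return NULL;
	list->n = probe.n;

	if (ioctl(vcpu_fd, KVM_GET_REG_LIST, list)) {
		free(list);
		return NULL;
	}
	return list;		/* caller frees; list->reg[0..n-1] hold the IDs */
}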
*/ - if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_CORE) - return get_core_reg(vcpu, reg); - - if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_FW) - return kvm_arm_get_fw_reg(vcpu, reg); - - if (is_timer_reg(reg->id)) - return get_timer_reg(vcpu, reg); - - return kvm_arm_coproc_get_reg(vcpu, reg); -} - -int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) -{ - /* We currently use nothing arch-specific in upper 32 bits */ - if ((reg->id & ~KVM_REG_SIZE_MASK) >> 32 != KVM_REG_ARM >> 32) - return -EINVAL; - - /* Register group 16 means we set a core register. */ - if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_CORE) - return set_core_reg(vcpu, reg); - - if ((reg->id & KVM_REG_ARM_COPROC_MASK) == KVM_REG_ARM_FW) - return kvm_arm_set_fw_reg(vcpu, reg); - - if (is_timer_reg(reg->id)) - return set_timer_reg(vcpu, reg); - - return kvm_arm_coproc_set_reg(vcpu, reg); -} - -int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) -{ - return -EINVAL; -} - -int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, - struct kvm_sregs *sregs) -{ - return -EINVAL; -} - - -int __kvm_arm_vcpu_get_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - events->exception.serror_pending = !!(*vcpu_hcr(vcpu) & HCR_VA); - - /* - * We never return a pending ext_dabt here because we deliver it to - * the virtual CPU directly when setting the event and it's no longer - * 'pending' at this point. - */ - - return 0; -} - -int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, - struct kvm_vcpu_events *events) -{ - bool serror_pending = events->exception.serror_pending; - bool has_esr = events->exception.serror_has_esr; - bool ext_dabt_pending = events->exception.ext_dabt_pending; - - if (serror_pending && has_esr) - return -EINVAL; - else if (serror_pending) - kvm_inject_vabt(vcpu); - - if (ext_dabt_pending) - kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu)); - - return 0; -} - -int __attribute_const__ kvm_target_cpu(void) -{ - switch (read_cpuid_part()) { - case ARM_CPU_PART_CORTEX_A7: - return KVM_ARM_TARGET_CORTEX_A7; - case ARM_CPU_PART_CORTEX_A15: - return KVM_ARM_TARGET_CORTEX_A15; - default: - return -EINVAL; - } -} - -int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) -{ - int target = kvm_target_cpu(); - - if (target < 0) - return -ENODEV; - - memset(init, 0, sizeof(*init)); - - /* - * For now, we don't return any features. - * In future, we might use features to return target - * specific features available for the preferred - * target type. 
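[Editorial aside] The events path above is driven by the generic vcpu-events ioctl; a hedged user-space sketch (assumes the host advertises KVM_CAP_VCPU_EVENTS) that pends a virtual abort, which kvm_inject_vabt() then turns into HCR.VA:

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <string.h>

static int inject_serror(int vcpu_fd)
{
	struct kvm_vcpu_events ev;

	memset(&ev, 0, sizeof(ev));
	ev.exception.serror_pending = 1;
	/* serror_has_esr must stay 0: __kvm_arm_vcpu_set_events() rejects it. */

	return ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &ev);
}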
- */ - init->target = (__u32)target; - - return 0; -} - -int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) -{ - return -EINVAL; -} - -int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) -{ - return -EINVAL; -} - -int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, - struct kvm_translation *tr) -{ - return -EINVAL; -} - -int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, - struct kvm_guest_debug *dbg) -{ - return -EINVAL; -} - -int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - int ret; - - switch (attr->group) { - case KVM_ARM_VCPU_TIMER_CTRL: - ret = kvm_arm_timer_set_attr(vcpu, attr); - break; - default: - ret = -ENXIO; - break; - } - - return ret; -} - -int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - int ret; - - switch (attr->group) { - case KVM_ARM_VCPU_TIMER_CTRL: - ret = kvm_arm_timer_get_attr(vcpu, attr); - break; - default: - ret = -ENXIO; - break; - } - - return ret; -} - -int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, - struct kvm_device_attr *attr) -{ - int ret; - - switch (attr->group) { - case KVM_ARM_VCPU_TIMER_CTRL: - ret = kvm_arm_timer_has_attr(vcpu, attr); - break; - default: - ret = -ENXIO; - break; - } - - return ret; -} diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c deleted file mode 100644 index e58a89d2f13f..000000000000 --- a/arch/arm/kvm/handle_exit.c +++ /dev/null @@ -1,175 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - */ - -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <asm/kvm_emulate.h> -#include <asm/kvm_coproc.h> -#include <asm/kvm_mmu.h> -#include <kvm/arm_hypercalls.h> -#include <trace/events/kvm.h> - -#include "trace.h" - -typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *); - -static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - int ret; - - trace_kvm_hvc(*vcpu_pc(vcpu), *vcpu_reg(vcpu, 0), - kvm_vcpu_hvc_get_imm(vcpu)); - vcpu->stat.hvc_exit_stat++; - - ret = kvm_hvc_call_handler(vcpu); - if (ret < 0) { - vcpu_set_reg(vcpu, 0, ~0UL); - return 1; - } - - return ret; -} - -static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - /* - * "If an SMC instruction executed at Non-secure EL1 is - * trapped to EL2 because HCR_EL2.TSC is 1, the exception is a - * Trap exception, not a Secure Monitor Call exception [...]" - * - * We need to advance the PC after the trap, as it would - * otherwise return to the same address... - */ - vcpu_set_reg(vcpu, 0, ~0UL); - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - return 1; -} - -/** - * kvm_handle_wfx - handle a WFI or WFE instructions trapped in guests - * @vcpu: the vcpu pointer - * @run: the kvm_run structure pointer - * - * WFE: Yield the CPU and come back to this vcpu when the scheduler - * decides to. - * WFI: Simply call kvm_vcpu_block(), which will halt execution of - * world-switches and schedule other host processes until there is an - * incoming IRQ or FIQ to the VM. 
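[Editorial aside] kvm_vcpu_preferred_target() backs the usual VMM init flow; a sketch (vm_fd and vcpu_fd assumed to exist) asking the VM for its preferred target, Cortex-A7 or A15 here, and initialising the vcpu with it:

#include <linux/kvm.h>
#include <sys/ioctl.h>

static int init_vcpu(int vm_fd, int vcpu_fd)
{
	struct kvm_vcpu_init init;

	if (ioctl(vm_fd, KVM_ARM_PREFERRED_TARGET, &init))
		return -1;

	/* init.features[] is left all-zero, matching the kernel's memset. */
	return ioctl(vcpu_fd, KVM_ARM_VCPU_INIT, &init);
}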
- */ -static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - if (kvm_vcpu_get_hsr(vcpu) & HSR_WFI_IS_WFE) { - trace_kvm_wfx(*vcpu_pc(vcpu), true); - vcpu->stat.wfe_exit_stat++; - kvm_vcpu_on_spin(vcpu, vcpu_mode_priv(vcpu)); - } else { - trace_kvm_wfx(*vcpu_pc(vcpu), false); - vcpu->stat.wfi_exit_stat++; - kvm_vcpu_block(vcpu); - kvm_clear_request(KVM_REQ_UNHALT, vcpu); - } - - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - - return 1; -} - -static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - u32 hsr = kvm_vcpu_get_hsr(vcpu); - - kvm_pr_unimpl("Unknown exception class: hsr: %#08x\n", - hsr); - - kvm_inject_undefined(vcpu); - return 1; -} - -static exit_handle_fn arm_exit_handlers[] = { - [0 ... HSR_EC_MAX] = kvm_handle_unknown_ec, - [HSR_EC_WFI] = kvm_handle_wfx, - [HSR_EC_CP15_32] = kvm_handle_cp15_32, - [HSR_EC_CP15_64] = kvm_handle_cp15_64, - [HSR_EC_CP14_MR] = kvm_handle_cp14_32, - [HSR_EC_CP14_LS] = kvm_handle_cp14_load_store, - [HSR_EC_CP14_64] = kvm_handle_cp14_64, - [HSR_EC_CP_0_13] = kvm_handle_cp_0_13_access, - [HSR_EC_CP10_ID] = kvm_handle_cp10_id, - [HSR_EC_HVC] = handle_hvc, - [HSR_EC_SMC] = handle_smc, - [HSR_EC_IABT] = kvm_handle_guest_abort, - [HSR_EC_DABT] = kvm_handle_guest_abort, -}; - -static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) -{ - u8 hsr_ec = kvm_vcpu_trap_get_class(vcpu); - - return arm_exit_handlers[hsr_ec]; -} - -/* - * Return > 0 to return to guest, < 0 on error, 0 (and set exit_reason) on - * proper exit to userspace. - */ -int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, - int exception_index) -{ - exit_handle_fn exit_handler; - - if (ARM_ABORT_PENDING(exception_index)) { - u8 hsr_ec = kvm_vcpu_trap_get_class(vcpu); - - /* - * HVC/SMC already have an adjusted PC, which we need - * to correct in order to return to after having - * injected the abort. - */ - if (hsr_ec == HSR_EC_HVC || hsr_ec == HSR_EC_SMC) { - u32 adj = kvm_vcpu_trap_il_is32bit(vcpu) ? 4 : 2; - *vcpu_pc(vcpu) -= adj; - } - - kvm_inject_vabt(vcpu); - return 1; - } - - exception_index = ARM_EXCEPTION_CODE(exception_index); - - switch (exception_index) { - case ARM_EXCEPTION_IRQ: - return 1; - case ARM_EXCEPTION_HVC: - /* - * See ARM ARM B1.14.1: "Hyp traps on instructions - * that fail their condition code check" - */ - if (!kvm_condition_valid(vcpu)) { - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - return 1; - } - - exit_handler = kvm_get_exit_handler(vcpu); - - return exit_handler(vcpu, run); - case ARM_EXCEPTION_DATA_ABORT: - kvm_inject_vabt(vcpu); - return 1; - case ARM_EXCEPTION_HYP_GONE: - /* - * HYP has been reset to the hyp-stub. This happens - * when a guest is pre-empted by kvm_reboot()'s - * shutdown call. 
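[Editorial aside] The dispatch below indexes a handler table by the HSR exception class (bits [31:26]); a tiny stand-alone sketch of the same pattern, with hypothetical handlers and the same GCC range-designator trick for the default slot:

#include <stdio.h>

#define EC_SHIFT	26
#define EC_MAX		0x3f

typedef int (*handler_fn)(unsigned int hsr);

static int handle_unknown(unsigned int hsr) { (void)hsr; printf("unknown EC\n"); return 1; }
static int handle_wfx(unsigned int hsr)     { (void)hsr; printf("WFI/WFE\n");   return 1; }

static handler_fn handlers[EC_MAX + 1] = {
	[0 ... EC_MAX]	= handle_unknown,	/* fallback for unset slots */
	[0x01]		= handle_wfx,		/* HSR_EC_WFI */
};

int main(void)
{
	unsigned int hsr = 0x01 << EC_SHIFT;	/* a trapped WFI */

	return handlers[hsr >> EC_SHIFT](hsr) != 1;
}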
- */ - run->exit_reason = KVM_EXIT_FAIL_ENTRY; - return 0; - default: - kvm_pr_unimpl("Unsupported exception type: %d", - exception_index); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - return 0; - } -} diff --git a/arch/arm/kvm/hyp/Makefile b/arch/arm/kvm/hyp/Makefile deleted file mode 100644 index ba88b1eca93c..000000000000 --- a/arch/arm/kvm/hyp/Makefile +++ /dev/null @@ -1,34 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# -# Makefile for Kernel-based Virtual Machine module, HYP part -# - -ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING - -KVM=../../../../virt/kvm - -CFLAGS_ARMV7VE :=$(call cc-option, -march=armv7ve) - -obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o -obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o -obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/aarch32.o - -obj-$(CONFIG_KVM_ARM_HOST) += tlb.o -obj-$(CONFIG_KVM_ARM_HOST) += cp15-sr.o -obj-$(CONFIG_KVM_ARM_HOST) += vfp.o -obj-$(CONFIG_KVM_ARM_HOST) += banked-sr.o -CFLAGS_banked-sr.o += $(CFLAGS_ARMV7VE) - -obj-$(CONFIG_KVM_ARM_HOST) += entry.o -obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o -obj-$(CONFIG_KVM_ARM_HOST) += switch.o -CFLAGS_switch.o += $(CFLAGS_ARMV7VE) -obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o - -# KVM code is run at a different exception code with a different map, so -# compiler instrumentation that inserts callbacks or checks into the code may -# cause crashes. Just disable it. -GCOV_PROFILE := n -KASAN_SANITIZE := n -UBSAN_SANITIZE := n -KCOV_INSTRUMENT := n diff --git a/arch/arm/kvm/hyp/banked-sr.c b/arch/arm/kvm/hyp/banked-sr.c deleted file mode 100644 index c4632ed9e819..000000000000 --- a/arch/arm/kvm/hyp/banked-sr.c +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Original code: - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * Mostly rewritten in C by Marc Zyngier <marc.zyngier@arm.com> - */ - -#include <asm/kvm_hyp.h> - -/* - * gcc before 4.9 doesn't understand -march=armv7ve, so we have to - * trick the assembler. 
- */ -__asm__(".arch_extension virt"); - -void __hyp_text __banked_save_state(struct kvm_cpu_context *ctxt) -{ - ctxt->gp_regs.usr_regs.ARM_sp = read_special(SP_usr); - ctxt->gp_regs.usr_regs.ARM_pc = read_special(ELR_hyp); - ctxt->gp_regs.usr_regs.ARM_cpsr = read_special(SPSR); - ctxt->gp_regs.KVM_ARM_SVC_sp = read_special(SP_svc); - ctxt->gp_regs.KVM_ARM_SVC_lr = read_special(LR_svc); - ctxt->gp_regs.KVM_ARM_SVC_spsr = read_special(SPSR_svc); - ctxt->gp_regs.KVM_ARM_ABT_sp = read_special(SP_abt); - ctxt->gp_regs.KVM_ARM_ABT_lr = read_special(LR_abt); - ctxt->gp_regs.KVM_ARM_ABT_spsr = read_special(SPSR_abt); - ctxt->gp_regs.KVM_ARM_UND_sp = read_special(SP_und); - ctxt->gp_regs.KVM_ARM_UND_lr = read_special(LR_und); - ctxt->gp_regs.KVM_ARM_UND_spsr = read_special(SPSR_und); - ctxt->gp_regs.KVM_ARM_IRQ_sp = read_special(SP_irq); - ctxt->gp_regs.KVM_ARM_IRQ_lr = read_special(LR_irq); - ctxt->gp_regs.KVM_ARM_IRQ_spsr = read_special(SPSR_irq); - ctxt->gp_regs.KVM_ARM_FIQ_r8 = read_special(R8_fiq); - ctxt->gp_regs.KVM_ARM_FIQ_r9 = read_special(R9_fiq); - ctxt->gp_regs.KVM_ARM_FIQ_r10 = read_special(R10_fiq); - ctxt->gp_regs.KVM_ARM_FIQ_fp = read_special(R11_fiq); - ctxt->gp_regs.KVM_ARM_FIQ_ip = read_special(R12_fiq); - ctxt->gp_regs.KVM_ARM_FIQ_sp = read_special(SP_fiq); - ctxt->gp_regs.KVM_ARM_FIQ_lr = read_special(LR_fiq); - ctxt->gp_regs.KVM_ARM_FIQ_spsr = read_special(SPSR_fiq); -} - -void __hyp_text __banked_restore_state(struct kvm_cpu_context *ctxt) -{ - write_special(ctxt->gp_regs.usr_regs.ARM_sp, SP_usr); - write_special(ctxt->gp_regs.usr_regs.ARM_pc, ELR_hyp); - write_special(ctxt->gp_regs.usr_regs.ARM_cpsr, SPSR_cxsf); - write_special(ctxt->gp_regs.KVM_ARM_SVC_sp, SP_svc); - write_special(ctxt->gp_regs.KVM_ARM_SVC_lr, LR_svc); - write_special(ctxt->gp_regs.KVM_ARM_SVC_spsr, SPSR_svc); - write_special(ctxt->gp_regs.KVM_ARM_ABT_sp, SP_abt); - write_special(ctxt->gp_regs.KVM_ARM_ABT_lr, LR_abt); - write_special(ctxt->gp_regs.KVM_ARM_ABT_spsr, SPSR_abt); - write_special(ctxt->gp_regs.KVM_ARM_UND_sp, SP_und); - write_special(ctxt->gp_regs.KVM_ARM_UND_lr, LR_und); - write_special(ctxt->gp_regs.KVM_ARM_UND_spsr, SPSR_und); - write_special(ctxt->gp_regs.KVM_ARM_IRQ_sp, SP_irq); - write_special(ctxt->gp_regs.KVM_ARM_IRQ_lr, LR_irq); - write_special(ctxt->gp_regs.KVM_ARM_IRQ_spsr, SPSR_irq); - write_special(ctxt->gp_regs.KVM_ARM_FIQ_r8, R8_fiq); - write_special(ctxt->gp_regs.KVM_ARM_FIQ_r9, R9_fiq); - write_special(ctxt->gp_regs.KVM_ARM_FIQ_r10, R10_fiq); - write_special(ctxt->gp_regs.KVM_ARM_FIQ_fp, R11_fiq); - write_special(ctxt->gp_regs.KVM_ARM_FIQ_ip, R12_fiq); - write_special(ctxt->gp_regs.KVM_ARM_FIQ_sp, SP_fiq); - write_special(ctxt->gp_regs.KVM_ARM_FIQ_lr, LR_fiq); - write_special(ctxt->gp_regs.KVM_ARM_FIQ_spsr, SPSR_fiq); -} diff --git a/arch/arm/kvm/hyp/cp15-sr.c b/arch/arm/kvm/hyp/cp15-sr.c deleted file mode 100644 index e6923306f698..000000000000 --- a/arch/arm/kvm/hyp/cp15-sr.c +++ /dev/null @@ -1,72 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Original code: - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * Mostly rewritten in C by Marc Zyngier <marc.zyngier@arm.com> - */ - -#include <asm/kvm_hyp.h> - -static u64 *cp15_64(struct kvm_cpu_context *ctxt, int idx) -{ - return (u64 *)(ctxt->cp15 + idx); -} - -void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt) -{ - ctxt->cp15[c0_CSSELR] = read_sysreg(CSSELR); - ctxt->cp15[c1_SCTLR] = read_sysreg(SCTLR); - 
ctxt->cp15[c1_CPACR] = read_sysreg(CPACR); - *cp15_64(ctxt, c2_TTBR0) = read_sysreg(TTBR0); - *cp15_64(ctxt, c2_TTBR1) = read_sysreg(TTBR1); - ctxt->cp15[c2_TTBCR] = read_sysreg(TTBCR); - ctxt->cp15[c3_DACR] = read_sysreg(DACR); - ctxt->cp15[c5_DFSR] = read_sysreg(DFSR); - ctxt->cp15[c5_IFSR] = read_sysreg(IFSR); - ctxt->cp15[c5_ADFSR] = read_sysreg(ADFSR); - ctxt->cp15[c5_AIFSR] = read_sysreg(AIFSR); - ctxt->cp15[c6_DFAR] = read_sysreg(DFAR); - ctxt->cp15[c6_IFAR] = read_sysreg(IFAR); - *cp15_64(ctxt, c7_PAR) = read_sysreg(PAR); - ctxt->cp15[c10_PRRR] = read_sysreg(PRRR); - ctxt->cp15[c10_NMRR] = read_sysreg(NMRR); - ctxt->cp15[c10_AMAIR0] = read_sysreg(AMAIR0); - ctxt->cp15[c10_AMAIR1] = read_sysreg(AMAIR1); - ctxt->cp15[c12_VBAR] = read_sysreg(VBAR); - ctxt->cp15[c13_CID] = read_sysreg(CID); - ctxt->cp15[c13_TID_URW] = read_sysreg(TID_URW); - ctxt->cp15[c13_TID_URO] = read_sysreg(TID_URO); - ctxt->cp15[c13_TID_PRIV] = read_sysreg(TID_PRIV); - ctxt->cp15[c14_CNTKCTL] = read_sysreg(CNTKCTL); -} - -void __hyp_text __sysreg_restore_state(struct kvm_cpu_context *ctxt) -{ - write_sysreg(ctxt->cp15[c0_MPIDR], VMPIDR); - write_sysreg(ctxt->cp15[c0_CSSELR], CSSELR); - write_sysreg(ctxt->cp15[c1_SCTLR], SCTLR); - write_sysreg(ctxt->cp15[c1_CPACR], CPACR); - write_sysreg(*cp15_64(ctxt, c2_TTBR0), TTBR0); - write_sysreg(*cp15_64(ctxt, c2_TTBR1), TTBR1); - write_sysreg(ctxt->cp15[c2_TTBCR], TTBCR); - write_sysreg(ctxt->cp15[c3_DACR], DACR); - write_sysreg(ctxt->cp15[c5_DFSR], DFSR); - write_sysreg(ctxt->cp15[c5_IFSR], IFSR); - write_sysreg(ctxt->cp15[c5_ADFSR], ADFSR); - write_sysreg(ctxt->cp15[c5_AIFSR], AIFSR); - write_sysreg(ctxt->cp15[c6_DFAR], DFAR); - write_sysreg(ctxt->cp15[c6_IFAR], IFAR); - write_sysreg(*cp15_64(ctxt, c7_PAR), PAR); - write_sysreg(ctxt->cp15[c10_PRRR], PRRR); - write_sysreg(ctxt->cp15[c10_NMRR], NMRR); - write_sysreg(ctxt->cp15[c10_AMAIR0], AMAIR0); - write_sysreg(ctxt->cp15[c10_AMAIR1], AMAIR1); - write_sysreg(ctxt->cp15[c12_VBAR], VBAR); - write_sysreg(ctxt->cp15[c13_CID], CID); - write_sysreg(ctxt->cp15[c13_TID_URW], TID_URW); - write_sysreg(ctxt->cp15[c13_TID_URO], TID_URO); - write_sysreg(ctxt->cp15[c13_TID_PRIV], TID_PRIV); - write_sysreg(ctxt->cp15[c14_CNTKCTL], CNTKCTL); -} diff --git a/arch/arm/kvm/hyp/entry.S b/arch/arm/kvm/hyp/entry.S deleted file mode 100644 index 4bd1f6a74180..000000000000 --- a/arch/arm/kvm/hyp/entry.S +++ /dev/null @@ -1,121 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2016 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> -*/ - -#include <linux/linkage.h> -#include <asm/asm-offsets.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_asm.h> - - .arch_extension virt - - .text - .pushsection .hyp.text, "ax" - -#define USR_REGS_OFFSET (CPU_CTXT_GP_REGS + GP_REGS_USR) - -/* int __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host) */ -ENTRY(__guest_enter) - @ Save host registers - add r1, r1, #(USR_REGS_OFFSET + S_R4) - stm r1!, {r4-r12} - str lr, [r1, #4] @ Skip SP_usr (already saved) - - @ Restore guest registers - add r0, r0, #(VCPU_GUEST_CTXT + USR_REGS_OFFSET + S_R0) - ldr lr, [r0, #S_LR] - ldm r0, {r0-r12} - - clrex - eret -ENDPROC(__guest_enter) - -ENTRY(__guest_exit) - /* - * return convention: - * guest r0, r1, r2 saved on the stack - * r0: vcpu pointer - * r1: exception code - */ - - add r2, r0, #(VCPU_GUEST_CTXT + USR_REGS_OFFSET + S_R3) - stm r2!, {r3-r12} - str lr, [r2, #4] - add r2, r0, #(VCPU_GUEST_CTXT + USR_REGS_OFFSET + S_R0) - pop {r3, r4, r5} @ r0, r1, r2 - stm r2, {r3-r5} - - ldr 
r0, [r0, #VCPU_HOST_CTXT] - add r0, r0, #(USR_REGS_OFFSET + S_R4) - ldm r0!, {r4-r12} - ldr lr, [r0, #4] - - mov r0, r1 - mrs r1, SPSR - mrs r2, ELR_hyp - mrc p15, 4, r3, c5, c2, 0 @ HSR - - /* - * Force loads and stores to complete before unmasking aborts - * and forcing the delivery of the exception. This gives us a - * single instruction window, which the handler will try to - * match. - */ - dsb sy - cpsie a - - .global abort_guest_exit_start -abort_guest_exit_start: - - isb - - .global abort_guest_exit_end -abort_guest_exit_end: - - /* - * If we took an abort, r0[31] will be set, and cmp will set - * the N bit in PSTATE. - */ - cmp r0, #0 - msrmi SPSR_cxsf, r1 - msrmi ELR_hyp, r2 - mcrmi p15, 4, r3, c5, c2, 0 @ HSR - - bx lr -ENDPROC(__guest_exit) - -/* - * If VFPv3 support is not available, then we will not switch the VFP - * registers; however cp10 and cp11 accesses will still trap and fallback - * to the regular coprocessor emulation code, which currently will - * inject an undefined exception to the guest. - */ -#ifdef CONFIG_VFPv3 -ENTRY(__vfp_guest_restore) - push {r3, r4, lr} - - @ NEON/VFP used. Turn on VFP access. - mrc p15, 4, r1, c1, c1, 2 @ HCPTR - bic r1, r1, #(HCPTR_TCP(10) | HCPTR_TCP(11)) - mcr p15, 4, r1, c1, c1, 2 @ HCPTR - isb - - @ Switch VFP/NEON hardware state to the guest's - mov r4, r0 - ldr r0, [r0, #VCPU_HOST_CTXT] - add r0, r0, #CPU_CTXT_VFP - bl __vfp_save_state - add r0, r4, #(VCPU_GUEST_CTXT + CPU_CTXT_VFP) - bl __vfp_restore_state - - pop {r3, r4, lr} - pop {r0, r1, r2} - clrex - eret -ENDPROC(__vfp_guest_restore) -#endif - - .popsection - diff --git a/arch/arm/kvm/hyp/hyp-entry.S b/arch/arm/kvm/hyp/hyp-entry.S deleted file mode 100644 index fe3d7811a908..000000000000 --- a/arch/arm/kvm/hyp/hyp-entry.S +++ /dev/null @@ -1,295 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - */ - -#include <linux/arm-smccc.h> -#include <linux/linkage.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_asm.h> - - .arch_extension virt - - .text - .pushsection .hyp.text, "ax" - -.macro load_vcpu reg - mrc p15, 4, \reg, c13, c0, 2 @ HTPIDR -.endm - -/******************************************************************** - * Hypervisor exception vector and handlers - * - * - * The KVM/ARM Hypervisor ABI is defined as follows: - * - * Entry to Hyp mode from the host kernel will happen _only_ when an HVC - * instruction is issued since all traps are disabled when running the host - * kernel as per the Hyp-mode initialization at boot time. - * - * HVC instructions cause a trap to the vector page + offset 0x14 (see hyp_hvc - * below) when the HVC instruction is called from SVC mode (i.e. a guest or the - * host kernel) and they cause a trap to the vector page + offset 0x8 when HVC - * instructions are called from within Hyp-mode. - * - * Hyp-ABI: Calling HYP-mode functions from host (in SVC mode): - * Switching to Hyp mode is done through a simple HVC #0 instruction. The - * exception vector code will check that the HVC comes from VMID==0. - * - r0 contains a pointer to a HYP function - * - r1, r2, and r3 contain arguments to the above function. - * - The HYP function will be called with its arguments in r0, r1 and r2. - * On HYP function return, we return directly to SVC. 
- * - * Note that the above is used to execute code in Hyp-mode from a host-kernel - * point of view, and is a different concept from performing a world-switch and - * executing guest code SVC mode (with a VMID != 0). - */ - - .align 5 -__kvm_hyp_vector: - .global __kvm_hyp_vector - - @ Hyp-mode exception vector - W(b) hyp_reset - W(b) hyp_undef - W(b) hyp_svc - W(b) hyp_pabt - W(b) hyp_dabt - W(b) hyp_hvc - W(b) hyp_irq - W(b) hyp_fiq - -#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR - .align 5 -__kvm_hyp_vector_ic_inv: - .global __kvm_hyp_vector_ic_inv - - /* - * We encode the exception entry in the bottom 3 bits of - * SP, and we have to guarantee to be 8 bytes aligned. - */ - W(add) sp, sp, #1 /* Reset 7 */ - W(add) sp, sp, #1 /* Undef 6 */ - W(add) sp, sp, #1 /* Syscall 5 */ - W(add) sp, sp, #1 /* Prefetch abort 4 */ - W(add) sp, sp, #1 /* Data abort 3 */ - W(add) sp, sp, #1 /* HVC 2 */ - W(add) sp, sp, #1 /* IRQ 1 */ - W(nop) /* FIQ 0 */ - - mcr p15, 0, r0, c7, c5, 0 /* ICIALLU */ - isb - - b decode_vectors - - .align 5 -__kvm_hyp_vector_bp_inv: - .global __kvm_hyp_vector_bp_inv - - /* - * We encode the exception entry in the bottom 3 bits of - * SP, and we have to guarantee to be 8 bytes aligned. - */ - W(add) sp, sp, #1 /* Reset 7 */ - W(add) sp, sp, #1 /* Undef 6 */ - W(add) sp, sp, #1 /* Syscall 5 */ - W(add) sp, sp, #1 /* Prefetch abort 4 */ - W(add) sp, sp, #1 /* Data abort 3 */ - W(add) sp, sp, #1 /* HVC 2 */ - W(add) sp, sp, #1 /* IRQ 1 */ - W(nop) /* FIQ 0 */ - - mcr p15, 0, r0, c7, c5, 6 /* BPIALL */ - isb - -decode_vectors: - -#ifdef CONFIG_THUMB2_KERNEL - /* - * Yet another silly hack: Use VPIDR as a temp register. - * Thumb2 is really a pain, as SP cannot be used with most - * of the bitwise instructions. The vect_br macro ensures - * things gets cleaned-up. - */ - mcr p15, 4, r0, c0, c0, 0 /* VPIDR */ - mov r0, sp - and r0, r0, #7 - sub sp, sp, r0 - push {r1, r2} - mov r1, r0 - mrc p15, 4, r0, c0, c0, 0 /* VPIDR */ - mrc p15, 0, r2, c0, c0, 0 /* MIDR */ - mcr p15, 4, r2, c0, c0, 0 /* VPIDR */ -#endif - -.macro vect_br val, targ -ARM( eor sp, sp, #\val ) -ARM( tst sp, #7 ) -ARM( eorne sp, sp, #\val ) - -THUMB( cmp r1, #\val ) -THUMB( popeq {r1, r2} ) - - beq \targ -.endm - - vect_br 0, hyp_fiq - vect_br 1, hyp_irq - vect_br 2, hyp_hvc - vect_br 3, hyp_dabt - vect_br 4, hyp_pabt - vect_br 5, hyp_svc - vect_br 6, hyp_undef - vect_br 7, hyp_reset -#endif - -.macro invalid_vector label, cause - .align -\label: mov r0, #\cause - b __hyp_panic -.endm - - invalid_vector hyp_reset ARM_EXCEPTION_RESET - invalid_vector hyp_undef ARM_EXCEPTION_UNDEFINED - invalid_vector hyp_svc ARM_EXCEPTION_SOFTWARE - invalid_vector hyp_pabt ARM_EXCEPTION_PREF_ABORT - invalid_vector hyp_fiq ARM_EXCEPTION_FIQ - -ENTRY(__hyp_do_panic) - mrs lr, cpsr - bic lr, lr, #MODE_MASK - orr lr, lr, #SVC_MODE -THUMB( orr lr, lr, #PSR_T_BIT ) - msr spsr_cxsf, lr - ldr lr, =panic - msr ELR_hyp, lr - ldr lr, =__kvm_call_hyp - clrex - eret -ENDPROC(__hyp_do_panic) - -hyp_hvc: - /* - * Getting here is either because of a trap from a guest, - * or from executing HVC from the host kernel, which means - * "do something in Hyp mode". - */ - push {r0, r1, r2} - - @ Check syndrome register - mrc p15, 4, r1, c5, c2, 0 @ HSR - lsr r0, r1, #HSR_EC_SHIFT - cmp r0, #HSR_EC_HVC - bne guest_trap @ Not HVC instr. 
- - /* - * Let's check if the HVC came from VMID 0 and allow simple - * switch to Hyp mode - */ - mrrc p15, 6, r0, r2, c2 - lsr r2, r2, #16 - and r2, r2, #0xff - cmp r2, #0 - bne guest_hvc_trap @ Guest called HVC - - /* - * Getting here means host called HVC, we shift parameters and branch - * to Hyp function. - */ - pop {r0, r1, r2} - - /* - * Check if we have a kernel function, which is guaranteed to be - * bigger than the maximum hyp stub hypercall - */ - cmp r0, #HVC_STUB_HCALL_NR - bhs 1f - - /* - * Not a kernel function, treat it as a stub hypercall. - * Compute the physical address for __kvm_handle_stub_hvc - * (as the code lives in the idmaped page) and branch there. - * We hijack ip (r12) as a tmp register. - */ - push {r1} - ldr r1, =kimage_voffset - ldr r1, [r1] - ldr ip, =__kvm_handle_stub_hvc - sub ip, ip, r1 - pop {r1} - - bx ip - -1: - /* - * Pushing r2 here is just a way of keeping the stack aligned to - * 8 bytes on any path that can trigger a HYP exception. Here, - * we may well be about to jump into the guest, and the guest - * exit would otherwise be badly decoded by our fancy - * "decode-exception-without-a-branch" code... - */ - push {r2, lr} - - mov lr, r0 - mov r0, r1 - mov r1, r2 - mov r2, r3 - -THUMB( orr lr, #1) - blx lr @ Call the HYP function - - pop {r2, lr} - eret - -guest_hvc_trap: - movw r2, #:lower16:ARM_SMCCC_ARCH_WORKAROUND_1 - movt r2, #:upper16:ARM_SMCCC_ARCH_WORKAROUND_1 - ldr r0, [sp] @ Guest's r0 - teq r0, r2 - bne guest_trap - add sp, sp, #12 - @ Returns: - @ r0 = 0 - @ r1 = HSR value (perfectly predictable) - @ r2 = ARM_SMCCC_ARCH_WORKAROUND_1 - mov r0, #0 - eret - -guest_trap: - load_vcpu r0 @ Load VCPU pointer to r0 - -#ifdef CONFIG_VFPv3 - @ Check for a VFP access - lsr r1, r1, #HSR_EC_SHIFT - cmp r1, #HSR_EC_CP_0_13 - beq __vfp_guest_restore -#endif - - mov r1, #ARM_EXCEPTION_HVC - b __guest_exit - -hyp_irq: - push {r0, r1, r2} - mov r1, #ARM_EXCEPTION_IRQ - load_vcpu r0 @ Load VCPU pointer to r0 - b __guest_exit - -hyp_dabt: - push {r0, r1} - mrs r0, ELR_hyp - ldr r1, =abort_guest_exit_start -THUMB( add r1, r1, #1) - cmp r0, r1 - ldrne r1, =abort_guest_exit_end -THUMB( addne r1, r1, #1) - cmpne r0, r1 - pop {r0, r1} - bne __hyp_panic - - orr r0, r0, #(1 << ARM_EXIT_WITH_ABORT_BIT) - eret - - .ltorg - - .popsection diff --git a/arch/arm/kvm/hyp/s2-setup.c b/arch/arm/kvm/hyp/s2-setup.c deleted file mode 100644 index 5dfbea5adf65..000000000000 --- a/arch/arm/kvm/hyp/s2-setup.c +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2016 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - */ - -#include <linux/types.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_asm.h> -#include <asm/kvm_hyp.h> - -void __hyp_text __init_stage2_translation(void) -{ - u64 val; - - val = read_sysreg(VTCR) & ~VTCR_MASK; - - val |= read_sysreg(HTCR) & VTCR_HTCR_SH; - val |= KVM_VTCR_SL0 | KVM_VTCR_T0SZ | KVM_VTCR_S; - - write_sysreg(val, VTCR); -} diff --git a/arch/arm/kvm/hyp/switch.c b/arch/arm/kvm/hyp/switch.c deleted file mode 100644 index 1efeef3fd0ee..000000000000 --- a/arch/arm/kvm/hyp/switch.c +++ /dev/null @@ -1,242 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2015 - ARM Ltd - * Author: Marc Zyngier <marc.zyngier@arm.com> - */ -#include <linux/jump_label.h> - -#include <asm/kvm_asm.h> -#include <asm/kvm_hyp.h> -#include <asm/kvm_mmu.h> - -__asm__(".arch_extension virt"); - -/* - * Activate the traps, saving the host's fpexc register before - * overwriting it. 
We'll restore it on VM exit. - */ -static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu, u32 *fpexc_host) -{ - u32 val; - - /* - * We are about to set HCPTR.TCP10/11 to trap all floating point - * register accesses to HYP, however, the ARM ARM clearly states that - * traps are only taken to HYP if the operation would not otherwise - * trap to SVC. Therefore, always make sure that for 32-bit guests, - * we set FPEXC.EN to prevent traps to SVC, when setting the TCP bits. - */ - val = read_sysreg(VFP_FPEXC); - *fpexc_host = val; - if (!(val & FPEXC_EN)) { - write_sysreg(val | FPEXC_EN, VFP_FPEXC); - isb(); - } - - write_sysreg(vcpu->arch.hcr, HCR); - /* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */ - write_sysreg(HSTR_T(15), HSTR); - write_sysreg(HCPTR_TTA | HCPTR_TCP(10) | HCPTR_TCP(11), HCPTR); - val = read_sysreg(HDCR); - val |= HDCR_TPM | HDCR_TPMCR; /* trap performance monitors */ - val |= HDCR_TDRA | HDCR_TDOSA | HDCR_TDA; /* trap debug regs */ - write_sysreg(val, HDCR); -} - -static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu) -{ - u32 val; - - /* - * If we pended a virtual abort, preserve it until it gets - * cleared. See B1.9.9 (Virtual Abort exception) for details, - * but the crucial bit is the zeroing of HCR.VA in the - * pseudocode. - */ - if (vcpu->arch.hcr & HCR_VA) - vcpu->arch.hcr = read_sysreg(HCR); - - write_sysreg(0, HCR); - write_sysreg(0, HSTR); - val = read_sysreg(HDCR); - write_sysreg(val & ~(HDCR_TPM | HDCR_TPMCR), HDCR); - write_sysreg(0, HCPTR); -} - -static void __hyp_text __activate_vm(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = kern_hyp_va(vcpu->kvm); - write_sysreg(kvm_get_vttbr(kvm), VTTBR); - write_sysreg(vcpu->arch.midr, VPIDR); -} - -static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu) -{ - write_sysreg(0, VTTBR); - write_sysreg(read_sysreg(MIDR), VPIDR); -} - - -static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu) -{ - if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { - __vgic_v3_save_state(vcpu); - __vgic_v3_deactivate_traps(vcpu); - } -} - -static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu) -{ - if (static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) { - __vgic_v3_activate_traps(vcpu); - __vgic_v3_restore_state(vcpu); - } -} - -static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu) -{ - u32 hsr = read_sysreg(HSR); - u8 ec = hsr >> HSR_EC_SHIFT; - u32 hpfar, far; - - vcpu->arch.fault.hsr = hsr; - - if (ec == HSR_EC_IABT) - far = read_sysreg(HIFAR); - else if (ec == HSR_EC_DABT) - far = read_sysreg(HDFAR); - else - return true; - - /* - * B3.13.5 Reporting exceptions taken to the Non-secure PL2 mode: - * - * Abort on the stage 2 translation for a memory access from a - * Non-secure PL1 or PL0 mode: - * - * For any Access flag fault or Translation fault, and also for any - * Permission fault on the stage 2 translation of a memory access - * made as part of a translation table walk for a stage 1 translation, - * the HPFAR holds the IPA that caused the fault. Otherwise, the HPFAR - * is UNKNOWN. 
- */ - if (!(hsr & HSR_DABT_S1PTW) && (hsr & HSR_FSC_TYPE) == FSC_PERM) { - u64 par, tmp; - - par = read_sysreg(PAR); - write_sysreg(far, ATS1CPR); - isb(); - - tmp = read_sysreg(PAR); - write_sysreg(par, PAR); - - if (unlikely(tmp & 1)) - return false; /* Translation failed, back to guest */ - - hpfar = ((tmp >> 12) & ((1UL << 28) - 1)) << 4; - } else { - hpfar = read_sysreg(HPFAR); - } - - vcpu->arch.fault.hxfar = far; - vcpu->arch.fault.hpfar = hpfar; - return true; -} - -int __hyp_text __kvm_vcpu_run_nvhe(struct kvm_vcpu *vcpu) -{ - struct kvm_cpu_context *host_ctxt; - struct kvm_cpu_context *guest_ctxt; - bool fp_enabled; - u64 exit_code; - u32 fpexc; - - vcpu = kern_hyp_va(vcpu); - write_sysreg(vcpu, HTPIDR); - - host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context); - guest_ctxt = &vcpu->arch.ctxt; - - __sysreg_save_state(host_ctxt); - __banked_save_state(host_ctxt); - - __activate_traps(vcpu, &fpexc); - __activate_vm(vcpu); - - __vgic_restore_state(vcpu); - __timer_enable_traps(vcpu); - - __sysreg_restore_state(guest_ctxt); - __banked_restore_state(guest_ctxt); - - /* Jump in the fire! */ -again: - exit_code = __guest_enter(vcpu, host_ctxt); - /* And we're baaack! */ - - if (exit_code == ARM_EXCEPTION_HVC && !__populate_fault_info(vcpu)) - goto again; - - fp_enabled = __vfp_enabled(); - - __banked_save_state(guest_ctxt); - __sysreg_save_state(guest_ctxt); - __timer_disable_traps(vcpu); - - __vgic_save_state(vcpu); - - __deactivate_traps(vcpu); - __deactivate_vm(vcpu); - - __banked_restore_state(host_ctxt); - __sysreg_restore_state(host_ctxt); - - if (fp_enabled) { - __vfp_save_state(&guest_ctxt->vfp); - __vfp_restore_state(&host_ctxt->vfp); - } - - write_sysreg(fpexc, VFP_FPEXC); - - return exit_code; -} - -static const char * const __hyp_panic_string[] = { - [ARM_EXCEPTION_RESET] = "\nHYP panic: RST PC:%08x CPSR:%08x", - [ARM_EXCEPTION_UNDEFINED] = "\nHYP panic: UNDEF PC:%08x CPSR:%08x", - [ARM_EXCEPTION_SOFTWARE] = "\nHYP panic: SVC PC:%08x CPSR:%08x", - [ARM_EXCEPTION_PREF_ABORT] = "\nHYP panic: PABRT PC:%08x CPSR:%08x", - [ARM_EXCEPTION_DATA_ABORT] = "\nHYP panic: DABRT PC:%08x ADDR:%08x", - [ARM_EXCEPTION_IRQ] = "\nHYP panic: IRQ PC:%08x CPSR:%08x", - [ARM_EXCEPTION_FIQ] = "\nHYP panic: FIQ PC:%08x CPSR:%08x", - [ARM_EXCEPTION_HVC] = "\nHYP panic: HVC PC:%08x CPSR:%08x", -}; - -void __hyp_text __noreturn __hyp_panic(int cause) -{ - u32 elr = read_special(ELR_hyp); - u32 val; - - if (cause == ARM_EXCEPTION_DATA_ABORT) - val = read_sysreg(HDFAR); - else - val = read_special(SPSR); - - if (read_sysreg(VTTBR)) { - struct kvm_vcpu *vcpu; - struct kvm_cpu_context *host_ctxt; - - vcpu = (struct kvm_vcpu *)read_sysreg(HTPIDR); - host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context); - __timer_disable_traps(vcpu); - __deactivate_traps(vcpu); - __deactivate_vm(vcpu); - __banked_restore_state(host_ctxt); - __sysreg_restore_state(host_ctxt); - } - - /* Call panic for real */ - __hyp_do_panic(__hyp_panic_string[cause], elr, val); - - unreachable(); -} diff --git a/arch/arm/kvm/hyp/tlb.c b/arch/arm/kvm/hyp/tlb.c deleted file mode 100644 index 848f27bbad9d..000000000000 --- a/arch/arm/kvm/hyp/tlb.c +++ /dev/null @@ -1,68 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Original code: - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * Mostly rewritten in C by Marc Zyngier <marc.zyngier@arm.com> - */ - -#include <asm/kvm_hyp.h> -#include <asm/kvm_mmu.h> - -/** - * Flush per-VMID TLBs - * - * 
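[Editorial aside] The PAR-to-HPFAR repacking above moves the 28-bit PA[39:12] field from bit 12 of the LPAE-format PAR down to bit 4 of HPFAR; a stand-alone sketch of just that bit arithmetic:

#include <stdint.h>
#include <stdio.h>

static uint32_t par_to_hpfar(uint64_t par)
{
	return ((par >> 12) & ((1ULL << 28) - 1)) << 4;
}

int main(void)
{
	printf("%#x\n", (unsigned int)par_to_hpfar(0x87654000ULL));	/* prints 0x876540 */
	return 0;
}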
__kvm_tlb_flush_vmid(struct kvm *kvm); - * - * We rely on the hardware to broadcast the TLB invalidation to all CPUs - * inside the inner-shareable domain (which is the case for all v7 - * implementations). If we come across a non-IS SMP implementation, we'll - * have to use an IPI based mechanism. Until then, we stick to the simple - * hardware assisted version. - * - * As v7 does not support flushing per IPA, just nuke the whole TLB - * instead, ignoring the ipa value. - */ -void __hyp_text __kvm_tlb_flush_vmid(struct kvm *kvm) -{ - dsb(ishst); - - /* Switch to requested VMID */ - kvm = kern_hyp_va(kvm); - write_sysreg(kvm_get_vttbr(kvm), VTTBR); - isb(); - - write_sysreg(0, TLBIALLIS); - dsb(ish); - isb(); - - write_sysreg(0, VTTBR); -} - -void __hyp_text __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa) -{ - __kvm_tlb_flush_vmid(kvm); -} - -void __hyp_text __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = kern_hyp_va(kern_hyp_va(vcpu)->kvm); - - /* Switch to requested VMID */ - write_sysreg(kvm_get_vttbr(kvm), VTTBR); - isb(); - - write_sysreg(0, TLBIALL); - dsb(nsh); - isb(); - - write_sysreg(0, VTTBR); -} - -void __hyp_text __kvm_flush_vm_context(void) -{ - write_sysreg(0, TLBIALLNSNHIS); - write_sysreg(0, ICIALLUIS); - dsb(ish); -} diff --git a/arch/arm/kvm/hyp/vfp.S b/arch/arm/kvm/hyp/vfp.S deleted file mode 100644 index 675a52348d8d..000000000000 --- a/arch/arm/kvm/hyp/vfp.S +++ /dev/null @@ -1,57 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - */ - -#include <linux/linkage.h> -#include <asm/vfpmacros.h> - - .text - .pushsection .hyp.text, "ax" - -/* void __vfp_save_state(struct vfp_hard_struct *vfp); */ -ENTRY(__vfp_save_state) - push {r4, r5} - VFPFMRX r1, FPEXC - - @ Make sure *really* VFP is enabled so we can touch the registers. - orr r5, r1, #FPEXC_EN - tst r5, #FPEXC_EX @ Check for VFP Subarchitecture - bic r5, r5, #FPEXC_EX @ FPEXC_EX disable - VFPFMXR FPEXC, r5 - isb - - VFPFMRX r2, FPSCR - beq 1f - - @ If FPEXC_EX is 0, then FPINST/FPINST2 reads are upredictable, so - @ we only need to save them if FPEXC_EX is set. 
- VFPFMRX r3, FPINST - tst r5, #FPEXC_FP2V - VFPFMRX r4, FPINST2, ne @ vmrsne -1: - VFPFSTMIA r0, r5 @ Save VFP registers - stm r0, {r1-r4} @ Save FPEXC, FPSCR, FPINST, FPINST2 - pop {r4, r5} - bx lr -ENDPROC(__vfp_save_state) - -/* void __vfp_restore_state(struct vfp_hard_struct *vfp); - * Assume FPEXC_EN is on and FPEXC_EX is off */ -ENTRY(__vfp_restore_state) - VFPFLDMIA r0, r1 @ Load VFP registers - ldm r0, {r0-r3} @ Load FPEXC, FPSCR, FPINST, FPINST2 - - VFPFMXR FPSCR, r1 - tst r0, #FPEXC_EX @ Check for VFP Subarchitecture - beq 1f - VFPFMXR FPINST, r2 - tst r0, #FPEXC_FP2V - VFPFMXR FPINST2, r3, ne -1: - VFPFMXR FPEXC, r0 @ FPEXC (last, in case !EN) - bx lr -ENDPROC(__vfp_restore_state) - - .popsection diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S deleted file mode 100644 index 33e34b6d24b2..000000000000 --- a/arch/arm/kvm/init.S +++ /dev/null @@ -1,157 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - */ - -#include <linux/linkage.h> -#include <asm/assembler.h> -#include <asm/unified.h> -#include <asm/asm-offsets.h> -#include <asm/kvm_asm.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_mmu.h> -#include <asm/virt.h> - -/******************************************************************** - * Hypervisor initialization - * - should be called with: - * r0 = top of Hyp stack (kernel VA) - * r1 = pointer to hyp vectors - * r2,r3 = Hypervisor pgd pointer - * - * The init scenario is: - * - We jump in HYP with 3 parameters: runtime HYP pgd, runtime stack, - * runtime vectors - * - Invalidate TLBs - * - Set stack and vectors - * - Setup the page tables - * - Enable the MMU - * - Profit! (or eret, if you only care about the code). - * - * Another possibility is to get a HYP stub hypercall. - * We discriminate between the two by checking if r0 contains a value - * that is less than HVC_STUB_HCALL_NR. - */ - - .text - .pushsection .hyp.idmap.text,"ax" - .align 5 -__kvm_hyp_init: - .globl __kvm_hyp_init - - @ Hyp-mode exception vector - W(b) . - W(b) . - W(b) . - W(b) . - W(b) . - W(b) __do_hyp_init - W(b) . - W(b) . - -__do_hyp_init: - @ Check for a stub hypercall - cmp r0, #HVC_STUB_HCALL_NR - blo __kvm_handle_stub_hvc - - @ Set stack pointer - mov sp, r0 - - @ Set HVBAR to point to the HYP vectors - mcr p15, 4, r1, c12, c0, 0 @ HVBAR - - @ Set the HTTBR to point to the hypervisor PGD pointer passed - mcrr p15, 4, rr_lo_hi(r2, r3), c2 - - @ Set the HTCR and VTCR to the same shareability and cacheability - @ settings as the non-secure TTBCR and with T0SZ == 0. - mrc p15, 4, r0, c2, c0, 2 @ HTCR - ldr r2, =HTCR_MASK - bic r0, r0, r2 - mrc p15, 0, r1, c2, c0, 2 @ TTBCR - and r1, r1, #(HTCR_MASK & ~TTBCR_T0SZ) - orr r0, r0, r1 - mcr p15, 4, r0, c2, c0, 2 @ HTCR - - @ Use the same memory attributes for hyp. accesses as the kernel - @ (copy MAIRx ro HMAIRx). 
- mrc p15, 0, r0, c10, c2, 0 - mcr p15, 4, r0, c10, c2, 0 - mrc p15, 0, r0, c10, c2, 1 - mcr p15, 4, r0, c10, c2, 1 - - @ Invalidate the stale TLBs from Bootloader - mcr p15, 4, r0, c8, c7, 0 @ TLBIALLH - dsb ish - - @ Set the HSCTLR to: - @ - ARM/THUMB exceptions: Kernel config (Thumb-2 kernel) - @ - Endianness: Kernel config - @ - Fast Interrupt Features: Kernel config - @ - Write permission implies XN: disabled - @ - Instruction cache: enabled - @ - Data/Unified cache: enabled - @ - MMU: enabled (this code must be run from an identity mapping) - mrc p15, 4, r0, c1, c0, 0 @ HSCR - ldr r2, =HSCTLR_MASK - bic r0, r0, r2 - mrc p15, 0, r1, c1, c0, 0 @ SCTLR - ldr r2, =(HSCTLR_EE | HSCTLR_FI | HSCTLR_I | HSCTLR_C) - and r1, r1, r2 - ARM( ldr r2, =(HSCTLR_M) ) - THUMB( ldr r2, =(HSCTLR_M | HSCTLR_TE) ) - orr r1, r1, r2 - orr r0, r0, r1 - mcr p15, 4, r0, c1, c0, 0 @ HSCR - isb - - eret - -ENTRY(__kvm_handle_stub_hvc) - cmp r0, #HVC_SOFT_RESTART - bne 1f - - /* The target is expected in r1 */ - msr ELR_hyp, r1 - mrs r0, cpsr - bic r0, r0, #MODE_MASK - orr r0, r0, #HYP_MODE -THUMB( orr r0, r0, #PSR_T_BIT ) - msr spsr_cxsf, r0 - b reset - -1: cmp r0, #HVC_RESET_VECTORS - bne 1f - -reset: - /* We're now in idmap, disable MMU */ - mrc p15, 4, r1, c1, c0, 0 @ HSCTLR - ldr r0, =(HSCTLR_M | HSCTLR_A | HSCTLR_C | HSCTLR_I) - bic r1, r1, r0 - mcr p15, 4, r1, c1, c0, 0 @ HSCTLR - - /* - * Install stub vectors, using ardb's VA->PA trick. - */ -0: adr r0, 0b @ PA(0) - movw r1, #:lower16:__hyp_stub_vectors - 0b @ VA(stub) - VA(0) - movt r1, #:upper16:__hyp_stub_vectors - 0b - add r1, r1, r0 @ PA(stub) - mcr p15, 4, r1, c12, c0, 0 @ HVBAR - b exit - -1: ldr r0, =HVC_STUB_ERR - eret - -exit: - mov r0, #0 - eret -ENDPROC(__kvm_handle_stub_hvc) - - .ltorg - - .globl __kvm_hyp_init_end -__kvm_hyp_init_end: - - .popsection diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S deleted file mode 100644 index 064f4f118ca7..000000000000 --- a/arch/arm/kvm/interrupts.S +++ /dev/null @@ -1,36 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - */ - -#include <linux/linkage.h> - - .text - -/******************************************************************** - * Call function in Hyp mode - * - * - * unsigned long kvm_call_hyp(void *hypfn, ...); - * - * This is not really a variadic function in the classic C-way and care must - * be taken when calling this to ensure parameters are passed in registers - * only, since the stack will change between the caller and the callee. - * - * Call the function with the first argument containing a pointer to the - * function you wish to call in Hyp mode, and subsequent arguments will be - * passed as r0, r1, and r2 (a maximum of 3 arguments in addition to the - * function pointer can be passed). The function being called must be mapped - * in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c). Return values are - * passed in r0 (strictly 32bit). 
- * - * The calling convention follows the standard AAPCS: - * r0 - r3: caller save - * r12: caller save - * rest: callee save - */ -ENTRY(__kvm_call_hyp) - hvc #0 - bx lr -ENDPROC(__kvm_call_hyp) diff --git a/arch/arm/kvm/irq.h b/arch/arm/kvm/irq.h deleted file mode 100644 index 0d257de42c10..000000000000 --- a/arch/arm/kvm/irq.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * irq.h: in kernel interrupt controller related definitions - * Copyright (c) 2016 Red Hat, Inc. - * - * This header is included by irqchip.c. However, on ARM, interrupt - * controller declarations are located in include/kvm/arm_vgic.h since - * they are mostly shared between arm and arm64. - */ - -#ifndef __IRQ_H -#define __IRQ_H - -#include <kvm/arm_vgic.h> - -#endif diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c deleted file mode 100644 index eb4174f6ebbd..000000000000 --- a/arch/arm/kvm/reset.c +++ /dev/null @@ -1,86 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - */ -#include <linux/compiler.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/kvm_host.h> -#include <linux/kvm.h> - -#include <asm/unified.h> -#include <asm/ptrace.h> -#include <asm/cputype.h> -#include <asm/kvm_arm.h> -#include <asm/kvm_coproc.h> -#include <asm/kvm_emulate.h> - -#include <kvm/arm_arch_timer.h> - -/****************************************************************************** - * Cortex-A15 and Cortex-A7 Reset Values - */ - -static struct kvm_regs cortexa_regs_reset = { - .usr_regs.ARM_cpsr = SVC_MODE | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT, -}; - - -/******************************************************************************* - * Exported reset function - */ - -/** - * kvm_reset_vcpu - sets core registers and cp15 registers to reset value - * @vcpu: The VCPU pointer - * - * This function finds the right table above and sets the registers on the - * virtual CPU struct to their architecturally defined reset values. - */ -int kvm_reset_vcpu(struct kvm_vcpu *vcpu) -{ - struct kvm_regs *reset_regs; - - switch (vcpu->arch.target) { - case KVM_ARM_TARGET_CORTEX_A7: - case KVM_ARM_TARGET_CORTEX_A15: - reset_regs = &cortexa_regs_reset; - vcpu->arch.midr = read_cpuid_id(); - break; - default: - return -ENODEV; - } - - /* Reset core registers */ - memcpy(&vcpu->arch.ctxt.gp_regs, reset_regs, sizeof(vcpu->arch.ctxt.gp_regs)); - - /* Reset CP15 registers */ - kvm_reset_coprocs(vcpu); - - /* - * Additional reset state handling that PSCI may have imposed on us. - * Must be done after all the sys_reg reset. 
- */ - if (READ_ONCE(vcpu->arch.reset_state.reset)) { - unsigned long target_pc = vcpu->arch.reset_state.pc; - - /* Gracefully handle Thumb2 entry point */ - if (target_pc & 1) { - target_pc &= ~1UL; - vcpu_set_thumb(vcpu); - } - - /* Propagate caller endianness */ - if (vcpu->arch.reset_state.be) - kvm_vcpu_set_be(vcpu); - - *vcpu_pc(vcpu) = target_pc; - vcpu_set_reg(vcpu, 0, vcpu->arch.reset_state.r0); - - vcpu->arch.reset_state.reset = false; - } - - /* Reset arch_timer context */ - return kvm_timer_vcpu_reset(vcpu); -} diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h deleted file mode 100644 index 69a9d62a0ac6..000000000000 --- a/arch/arm/kvm/trace.h +++ /dev/null @@ -1,86 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#if !defined(_TRACE_ARM_KVM_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_ARM_KVM_H - -#include <linux/tracepoint.h> - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kvm - -/* Architecturally implementation defined CP15 register access */ -TRACE_EVENT(kvm_emulate_cp15_imp, - TP_PROTO(unsigned long Op1, unsigned long Rt1, unsigned long CRn, - unsigned long CRm, unsigned long Op2, bool is_write), - TP_ARGS(Op1, Rt1, CRn, CRm, Op2, is_write), - - TP_STRUCT__entry( - __field( unsigned int, Op1 ) - __field( unsigned int, Rt1 ) - __field( unsigned int, CRn ) - __field( unsigned int, CRm ) - __field( unsigned int, Op2 ) - __field( bool, is_write ) - ), - - TP_fast_assign( - __entry->is_write = is_write; - __entry->Op1 = Op1; - __entry->Rt1 = Rt1; - __entry->CRn = CRn; - __entry->CRm = CRm; - __entry->Op2 = Op2; - ), - - TP_printk("Implementation defined CP15: %s\tp15, %u, r%u, c%u, c%u, %u", - (__entry->is_write) ? "mcr" : "mrc", - __entry->Op1, __entry->Rt1, __entry->CRn, - __entry->CRm, __entry->Op2) -); - -TRACE_EVENT(kvm_wfx, - TP_PROTO(unsigned long vcpu_pc, bool is_wfe), - TP_ARGS(vcpu_pc, is_wfe), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_pc ) - __field( bool, is_wfe ) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - __entry->is_wfe = is_wfe; - ), - - TP_printk("guest executed wf%c at: 0x%08lx", - __entry->is_wfe ? 'e' : 'i', __entry->vcpu_pc) -); - -TRACE_EVENT(kvm_hvc, - TP_PROTO(unsigned long vcpu_pc, unsigned long r0, unsigned long imm), - TP_ARGS(vcpu_pc, r0, imm), - - TP_STRUCT__entry( - __field( unsigned long, vcpu_pc ) - __field( unsigned long, r0 ) - __field( unsigned long, imm ) - ), - - TP_fast_assign( - __entry->vcpu_pc = vcpu_pc; - __entry->r0 = r0; - __entry->imm = imm; - ), - - TP_printk("HVC at 0x%08lx (r0: 0x%08lx, imm: 0x%lx", - __entry->vcpu_pc, __entry->r0, __entry->imm) -); - -#endif /* _TRACE_ARM_KVM_H */ - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . 
-#undef TRACE_INCLUDE_FILE -#define TRACE_INCLUDE_FILE trace - -/* This part must be outside protection */ -#include <trace/define_trace.h> diff --git a/arch/arm/kvm/vgic-v3-coproc.c b/arch/arm/kvm/vgic-v3-coproc.c deleted file mode 100644 index ed3b2e4759ce..000000000000 --- a/arch/arm/kvm/vgic-v3-coproc.c +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * VGIC system registers handling functions for AArch32 mode - */ - -#include <linux/kvm.h> -#include <linux/kvm_host.h> -#include <asm/kvm_emulate.h> -#include "vgic.h" - -int vgic_v3_has_cpu_sysregs_attr(struct kvm_vcpu *vcpu, bool is_write, u64 id, - u64 *reg) -{ - /* - * TODO: Implement for AArch32 - */ - return -ENXIO; -} - -int vgic_v3_cpu_sysregs_uaccess(struct kvm_vcpu *vcpu, bool is_write, u64 id, - u64 *reg) -{ - /* - * TODO: Implement for AArch32 - */ - return -ENXIO; -} diff --git a/arch/arm/mach-exynos/Kconfig b/arch/arm/mach-exynos/Kconfig index cbbe03e96de8..76838255b5fa 100644 --- a/arch/arm/mach-exynos/Kconfig +++ b/arch/arm/mach-exynos/Kconfig @@ -21,7 +21,7 @@ menuconfig ARCH_EXYNOS select EXYNOS_SROM select EXYNOS_PM_DOMAINS if PM_GENERIC_DOMAINS select GPIOLIB - select HAVE_ARM_ARCH_TIMER if ARCH_EXYNOS5 && VIRTUALIZATION + select HAVE_ARM_ARCH_TIMER if ARCH_EXYNOS5 select HAVE_ARM_SCU if SMP select HAVE_S3C2410_I2C if I2C select HAVE_S3C2410_WATCHDOG if WATCHDOG diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 5d0d0f86e790..69a337df619f 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -63,9 +63,6 @@ static unsigned int cachepolicy __initdata = CPOLICY_WRITEBACK; static unsigned int ecc_mask __initdata = 0; pgprot_t pgprot_user; pgprot_t pgprot_kernel; -pgprot_t pgprot_hyp_device; -pgprot_t pgprot_s2; -pgprot_t pgprot_s2_device; EXPORT_SYMBOL(pgprot_user); EXPORT_SYMBOL(pgprot_kernel); @@ -75,15 +72,8 @@ struct cachepolicy { unsigned int cr_mask; pmdval_t pmd; pteval_t pte; - pteval_t pte_s2; }; -#ifdef CONFIG_ARM_LPAE -#define s2_policy(policy) policy -#else -#define s2_policy(policy) 0 -#endif - unsigned long kimage_voffset __ro_after_init; static struct cachepolicy cache_policies[] __initdata = { @@ -92,31 +82,26 @@ static struct cachepolicy cache_policies[] __initdata = { .cr_mask = CR_W|CR_C, .pmd = PMD_SECT_UNCACHED, .pte = L_PTE_MT_UNCACHED, - .pte_s2 = s2_policy(L_PTE_S2_MT_UNCACHED), }, { .policy = "buffered", .cr_mask = CR_C, .pmd = PMD_SECT_BUFFERED, .pte = L_PTE_MT_BUFFERABLE, - .pte_s2 = s2_policy(L_PTE_S2_MT_UNCACHED), }, { .policy = "writethrough", .cr_mask = 0, .pmd = PMD_SECT_WT, .pte = L_PTE_MT_WRITETHROUGH, - .pte_s2 = s2_policy(L_PTE_S2_MT_WRITETHROUGH), }, { .policy = "writeback", .cr_mask = 0, .pmd = PMD_SECT_WB, .pte = L_PTE_MT_WRITEBACK, - .pte_s2 = s2_policy(L_PTE_S2_MT_WRITEBACK), }, { .policy = "writealloc", .cr_mask = 0, .pmd = PMD_SECT_WBWA, .pte = L_PTE_MT_WRITEALLOC, - .pte_s2 = s2_policy(L_PTE_S2_MT_WRITEBACK), } }; @@ -246,9 +231,6 @@ static struct mem_type mem_types[] __ro_after_init = { [MT_DEVICE] = { /* Strongly ordered / ARMv6 shared device */ .prot_pte = PROT_PTE_DEVICE | L_PTE_MT_DEV_SHARED | L_PTE_SHARED, - .prot_pte_s2 = s2_policy(PROT_PTE_S2_DEVICE) | - s2_policy(L_PTE_S2_MT_DEV_SHARED) | - L_PTE_SHARED, .prot_l1 = PMD_TYPE_TABLE, .prot_sect = PROT_SECT_DEVICE | PMD_SECT_S, .domain = DOMAIN_IO, @@ -434,7 +416,6 @@ static void __init build_mem_type_table(void) struct cachepolicy *cp; unsigned int cr = get_cr(); pteval_t user_pgprot, kern_pgprot, vecs_pgprot; - pteval_t hyp_device_pgprot, s2_pgprot, s2_device_pgprot; int cpu_arch = 
cpu_architecture(); int i; @@ -558,9 +539,6 @@ static void __init build_mem_type_table(void) */ cp = &cache_policies[cachepolicy]; vecs_pgprot = kern_pgprot = user_pgprot = cp->pte; - s2_pgprot = cp->pte_s2; - hyp_device_pgprot = mem_types[MT_DEVICE].prot_pte; - s2_device_pgprot = mem_types[MT_DEVICE].prot_pte_s2; #ifndef CONFIG_ARM_LPAE /* @@ -604,7 +582,6 @@ static void __init build_mem_type_table(void) user_pgprot |= L_PTE_SHARED; kern_pgprot |= L_PTE_SHARED; vecs_pgprot |= L_PTE_SHARED; - s2_pgprot |= L_PTE_SHARED; mem_types[MT_DEVICE_WC].prot_sect |= PMD_SECT_S; mem_types[MT_DEVICE_WC].prot_pte |= L_PTE_SHARED; mem_types[MT_DEVICE_CACHED].prot_sect |= PMD_SECT_S; @@ -666,9 +643,6 @@ static void __init build_mem_type_table(void) pgprot_user = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | user_pgprot); pgprot_kernel = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY | kern_pgprot); - pgprot_s2 = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | s2_pgprot); - pgprot_s2_device = __pgprot(s2_device_pgprot); - pgprot_hyp_device = __pgprot(hyp_device_pgprot); mem_types[MT_LOW_VECTORS].prot_l1 |= ecc_mask; mem_types[MT_HIGH_VECTORS].prot_l1 |= ecc_mask; diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index f658dda12364..a30b4eec7cb4 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -89,7 +89,8 @@ static inline unsigned long *vcpu_hcr(struct kvm_vcpu *vcpu) static inline void vcpu_clear_wfx_traps(struct kvm_vcpu *vcpu) { vcpu->arch.hcr_el2 &= ~HCR_TWE; - if (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count)) + if (atomic_read(&vcpu->arch.vgic_cpu.vgic_v3.its_vpe.vlpi_count) || + vcpu->kvm->arch.vgic.nassgireq) vcpu->arch.hcr_el2 &= ~HCR_TWI; else vcpu->arch.hcr_el2 |= HCR_TWI; diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 57fd46acd058..32c8a675e5a4 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -44,6 +44,7 @@ #define KVM_REQ_IRQ_PENDING KVM_ARCH_REQ(1) #define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(2) #define KVM_REQ_RECORD_STEAL KVM_ARCH_REQ(3) +#define KVM_REQ_RELOAD_GICv4 KVM_ARCH_REQ(4) DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 525010504f9d..e329a36b2bee 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -11,7 +11,6 @@ #include <linux/kvm_host.h> #include <asm/fpsimd.h> #include <asm/kvm_asm.h> -#include <asm/kvm_host.h> #include <asm/kvm_mmu.h> #include <asm/sysreg.h> diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 2bd92301d32f..23ebe51410f0 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -25,7 +25,6 @@ #include <asm/kvm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_coproc.h> -#include <asm/kvm_host.h> #include <asm/sigcontext.h> #include "trace.h" diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index eaa05c3c7235..8a1e81a400e0 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -17,7 +17,6 @@ #include <asm/kprobes.h> #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> -#include <asm/kvm_host.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #include <asm/fpsimd.h> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 090f46d3add1..51db934702b6 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -22,7 +22,6 @@ #include <asm/kvm_arm.h> #include <asm/kvm_coproc.h> #include <asm/kvm_emulate.h> 
-#include <asm/kvm_host.h> #include <asm/kvm_hyp.h> #include <asm/kvm_mmu.h> #include <asm/perf_event.h> diff --git a/arch/arm64/kvm/sys_regs_generic_v8.c b/arch/arm64/kvm/sys_regs_generic_v8.c index 2b4a3e2d1b89..9cb6b4c8355a 100644 --- a/arch/arm64/kvm/sys_regs_generic_v8.c +++ b/arch/arm64/kvm/sys_regs_generic_v8.c @@ -12,7 +12,6 @@ #include <asm/cputype.h> #include <asm/kvm_arm.h> #include <asm/kvm_asm.h> -#include <asm/kvm_host.h> #include <asm/kvm_emulate.h> #include <asm/kvm_coproc.h> #include <asm/sysreg.h> diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 41204a49cf95..2c343c346b79 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -1133,7 +1133,7 @@ extern unsigned long kvm_mips_get_ramsize(struct kvm *kvm); static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, - struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} + struct kvm_memory_slot *slot) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 71244bf87c3a..8f05dd0a0f4e 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -118,12 +118,12 @@ void kvm_arch_hardware_disable(void) kvm_mips_callbacks->hardware_disable(); } -int kvm_arch_hardware_setup(void) +int kvm_arch_hardware_setup(void *opaque) { return 0; } -int kvm_arch_check_processor_compat(void) +int kvm_arch_check_processor_compat(void *opaque) { return 0; } @@ -188,12 +188,6 @@ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, return -ENOIOCTLCMD; } -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) -{ - return 0; -} - void kvm_arch_flush_shadow_all(struct kvm *kvm) { /* Flush whole GPA */ @@ -230,7 +224,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old, + struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { @@ -984,69 +978,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, return r; } -/** - * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot - * @kvm: kvm instance - * @log: slot id and address to which we copy the log - * - * Steps 1-4 below provide general overview of dirty page logging. See - * kvm_get_dirty_log_protect() function description for additional details. - * - * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we - * always flush the TLB (step 4) even if previous step failed and the dirty - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API - * does not preclude user space subsequent dirty log read. Flushing TLB ensures - * writes will be marked dirty for next log read. - * - * 1. Take a snapshot of the bit and clear it if needed. - * 2. Write protect the corresponding page. - * 3. Copy the snapshot to the userspace. - * 4. Flush TLB's if needed. 
- */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - bool flush = false; - int r; - - mutex_lock(&kvm->slots_lock); - - r = kvm_get_dirty_log_protect(kvm, log, &flush); - - if (flush) { - slots = kvm_memslots(kvm); - memslot = id_to_memslot(slots, log->slot); - /* Let implementation handle TLB/GVA invalidation */ - kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot); - } - - mutex_unlock(&kvm->slots_lock); - return r; } -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, + struct kvm_memory_slot *memslot) { - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - bool flush = false; - int r; - - mutex_lock(&kvm->slots_lock); - - r = kvm_clear_dirty_log_protect(kvm, log, &flush); - - if (flush) { - slots = kvm_memslots(kvm); - memslot = id_to_memslot(slots, log->slot); - - /* Let implementation handle TLB/GVA invalidation */ - kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot); - } - - mutex_unlock(&kvm->slots_lock); - return r; + /* Let implementation handle TLB/GVA invalidation */ + kvm_mips_callbacks->flush_shadow_memslot(kvm, memslot); } long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 635fb154b33f..a3633560493b 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -150,4 +150,7 @@ #define KVM_INST_FETCH_FAILED -1 +/* Extract PO and XOP opcode fields */ +#define PO_XOP_OPCODE_MASK 0xfc0007fe + #endif /* __POWERPC_KVM_ASM_H__ */ diff --git a/arch/powerpc/include/asm/kvm_book3s_uvmem.h b/arch/powerpc/include/asm/kvm_book3s_uvmem.h index 5a9834e0e2d1..9cb7d8be2366 100644 --- a/arch/powerpc/include/asm/kvm_book3s_uvmem.h +++ b/arch/powerpc/include/asm/kvm_book3s_uvmem.h @@ -5,6 +5,7 @@ #ifdef CONFIG_PPC_UV int kvmppc_uvmem_init(void); void kvmppc_uvmem_free(void); +bool kvmppc_uvmem_available(void); int kvmppc_uvmem_slot_init(struct kvm *kvm, const struct kvm_memory_slot *slot); void kvmppc_uvmem_slot_free(struct kvm *kvm, const struct kvm_memory_slot *slot); @@ -30,6 +31,11 @@ static inline int kvmppc_uvmem_init(void) static inline void kvmppc_uvmem_free(void) { } +static inline bool kvmppc_uvmem_available(void) +{ + return false; +} + static inline int kvmppc_uvmem_slot_init(struct kvm *kvm, const struct kvm_memory_slot *slot) { diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 6e8b8ffd06ad..f99b4333dfba 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -303,6 +303,7 @@ struct kvm_arch { u8 radix; u8 fwnmi_enabled; u8 secure_guest; + u8 svm_enabled; bool threads_indep; bool nested_enable; pgd_t *pgtable; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index bc2494e5710a..94f5a32acaf1 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -107,8 +107,6 @@ extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, unsigned int gtlb_idx); extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode); extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid); -extern void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu); -extern int kvmppc_mmu_init(struct kvm_vcpu 
*vcpu); extern int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr); extern int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr); extern gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int gtlb_index, @@ -200,14 +198,11 @@ extern void kvm_free_hpt_cma(struct page *page, unsigned long nr_pages); extern int kvmppc_core_init_vm(struct kvm *kvm); extern void kvmppc_core_destroy_vm(struct kvm *kvm); extern void kvmppc_core_free_memslot(struct kvm *kvm, - struct kvm_memory_slot *free, - struct kvm_memory_slot *dont); -extern int kvmppc_core_create_memslot(struct kvm *kvm, - struct kvm_memory_slot *slot, - unsigned long npages); + struct kvm_memory_slot *slot); extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - const struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change); extern void kvmppc_core_commit_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, @@ -280,7 +275,8 @@ struct kvmppc_ops { void (*flush_memslot)(struct kvm *kvm, struct kvm_memory_slot *memslot); int (*prepare_memory_region)(struct kvm *kvm, struct kvm_memory_slot *memslot, - const struct kvm_userspace_memory_region *mem); + const struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change); void (*commit_memory_region)(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, @@ -291,11 +287,7 @@ struct kvmppc_ops { int (*age_hva)(struct kvm *kvm, unsigned long start, unsigned long end); int (*test_age_hva)(struct kvm *kvm, unsigned long hva); void (*set_spte_hva)(struct kvm *kvm, unsigned long hva, pte_t pte); - void (*mmu_destroy)(struct kvm_vcpu *vcpu); - void (*free_memslot)(struct kvm_memory_slot *free, - struct kvm_memory_slot *dont); - int (*create_memslot)(struct kvm_memory_slot *slot, - unsigned long npages); + void (*free_memslot)(struct kvm_memory_slot *slot); int (*init_vm)(struct kvm *kvm); void (*destroy_vm)(struct kvm *kvm); int (*get_smmu_info)(struct kvm *kvm, struct kvm_ppc_smmu_info *info); @@ -321,6 +313,7 @@ struct kvmppc_ops { int size); int (*store_to_eaddr)(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr, int size); + int (*enable_svm)(struct kvm *kvm); int (*svm_off)(struct kvm *kvm); }; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index d07a8e12fa15..5690a1f9b976 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -799,21 +799,19 @@ int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) return vcpu->kvm->arch.kvm_ops->check_requests(vcpu); } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { - return kvm->arch.kvm_ops->get_dirty_log(kvm, log); + } -void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - kvm->arch.kvm_ops->free_memslot(free, dont); + return kvm->arch.kvm_ops->get_dirty_log(kvm, log); } -int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) +void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { - return kvm->arch.kvm_ops->create_memslot(slot, npages); + kvm->arch.kvm_ops->free_memslot(slot); } void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) @@ -823,9 +821,11 @@ 
void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - const struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) { - return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem); + return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem, + change); } void kvmppc_core_commit_memory_region(struct kvm *kvm, @@ -858,11 +858,6 @@ int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) return 0; } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) -{ - vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu); -} - int kvmppc_core_init_vm(struct kvm *kvm) { diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h index 3a4613985949..eae259ee49af 100644 --- a/arch/powerpc/kvm/book3s.h +++ b/arch/powerpc/kvm/book3s.h @@ -16,6 +16,7 @@ extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva); extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte); +extern int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu); extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu); extern int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance); diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c index f21e73492ce3..3fbd570f9c1e 100644 --- a/arch/powerpc/kvm/book3s_32_mmu.c +++ b/arch/powerpc/kvm/book3s_32_mmu.c @@ -234,7 +234,7 @@ static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, case 2: case 6: pte->may_write = true; - /* fall through */ + fallthrough; case 3: case 5: case 7: diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c index d4cb3bcf41b6..e8e7b2c530d1 100644 --- a/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -356,7 +356,7 @@ void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu) /* From mm/mmu_context_hash32.c */ #define CTX_TO_VSID(c, id) ((((c) * (897 * 16)) + (id * 0x111)) & 0xffffff) -int kvmppc_mmu_init(struct kvm_vcpu *vcpu) +int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int err; diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 599133256a95..26b8b27a3755 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -311,7 +311,7 @@ do_second: case 2: case 6: gpte->may_write = true; - /* fall through */ + fallthrough; case 3: case 5: case 7: diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index 044dd49eeb9d..e452158a18d7 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -384,7 +384,7 @@ void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu) __destroy_context(to_book3s(vcpu)->context_id[0]); } -int kvmppc_mmu_init(struct kvm_vcpu *vcpu) +int kvmppc_mmu_init_pr(struct kvm_vcpu *vcpu) { struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); int err; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 6c372f5c61b6..3aecec890d6f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -485,18 +485,18 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, __be64 *hptep; unsigned long mmu_seq, psize, pte_size; unsigned long gpa_base, 
gfn_base; - unsigned long gpa, gfn, hva, pfn; + unsigned long gpa, gfn, hva, pfn, hpa; struct kvm_memory_slot *memslot; unsigned long *rmap; struct revmap_entry *rev; - struct page *page, *pages[1]; - long index, ret, npages; + struct page *page; + long index, ret; bool is_ci; - unsigned int writing, write_ok; - struct vm_area_struct *vma; + bool writing, write_ok; + unsigned int shift; unsigned long rcbits; long mmio_update; - struct mm_struct *mm; + pte_t pte, *ptep; if (kvm_is_radix(kvm)) return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr); @@ -570,59 +570,62 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, smp_rmb(); ret = -EFAULT; - is_ci = false; - pfn = 0; page = NULL; - mm = kvm->mm; - pte_size = PAGE_SIZE; writing = (dsisr & DSISR_ISSTORE) != 0; /* If writing != 0, then the HPTE must allow writing, if we get here */ write_ok = writing; hva = gfn_to_hva_memslot(memslot, gfn); - npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages); - if (npages < 1) { - /* Check if it's an I/O mapping */ - down_read(&mm->mmap_sem); - vma = find_vma(mm, hva); - if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end && - (vma->vm_flags & VM_PFNMAP)) { - pfn = vma->vm_pgoff + - ((hva - vma->vm_start) >> PAGE_SHIFT); - pte_size = psize; - is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot)))); - write_ok = vma->vm_flags & VM_WRITE; - } - up_read(&mm->mmap_sem); - if (!pfn) - goto out_put; + + /* + * Do a fast check first, since __gfn_to_pfn_memslot doesn't + * do it with !atomic && !async, which is how we call it. + * We always ask for write permission since the common case + * is that the page is writable. + */ + if (__get_user_pages_fast(hva, 1, 1, &page) == 1) { + write_ok = true; } else { - page = pages[0]; - pfn = page_to_pfn(page); - if (PageHuge(page)) { - page = compound_head(page); - pte_size <<= compound_order(page); - } - /* if the guest wants write access, see if that is OK */ - if (!writing && hpte_is_writable(r)) { - pte_t *ptep, pte; - unsigned long flags; - /* - * We need to protect against page table destruction - * hugepage split and collapse. - */ - local_irq_save(flags); - ptep = find_current_mm_pte(mm->pgd, hva, NULL, NULL); - if (ptep) { - pte = kvmppc_read_update_linux_pte(ptep, 1); - if (__pte_write(pte)) - write_ok = 1; - } - local_irq_restore(flags); + /* Call KVM generic code to do the slow-path check */ + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, + writing, &write_ok); + if (is_error_noslot_pfn(pfn)) + return -EFAULT; + page = NULL; + if (pfn_valid(pfn)) { + page = pfn_to_page(pfn); + if (PageReserved(page)) + page = NULL; } } + /* + * Read the PTE from the process' radix tree and use that + * so we get the shift and attribute bits. + */ + local_irq_disable(); + ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift); + /* + * If the PTE disappeared temporarily due to a THP + * collapse, just return and let the guest try again. + */ + if (!ptep) { + local_irq_enable(); + if (page) + put_page(page); + return RESUME_GUEST; + } + pte = *ptep; + local_irq_enable(); + hpa = pte_pfn(pte) << PAGE_SHIFT; + pte_size = PAGE_SIZE; + if (shift) + pte_size = 1ul << shift; + is_ci = pte_ci(pte); + if (psize > pte_size) goto out_put; + if (pte_size > psize) + hpa |= hva & (pte_size - psize); /* Check WIMG vs. 
the actual page we're accessing */ if (!hpte_cache_flags_ok(r, is_ci)) { @@ -636,14 +639,13 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, } /* - * Set the HPTE to point to pfn. - * Since the pfn is at PAGE_SIZE granularity, make sure we + * Set the HPTE to point to hpa. + * Since the hpa is at PAGE_SIZE granularity, make sure we * don't mask out lower-order bits if psize < PAGE_SIZE. */ if (psize < PAGE_SIZE) psize = PAGE_SIZE; - r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) | - ((pfn << PAGE_SHIFT) & ~(psize - 1)); + r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) | hpa; if (hpte_is_writable(r) && !write_ok) r = hpte_make_readonly(r); ret = RESUME_GUEST; @@ -708,20 +710,13 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, asm volatile("ptesync" : : : "memory"); preempt_enable(); if (page && hpte_is_writable(r)) - SetPageDirty(page); + set_page_dirty_lock(page); out_put: trace_kvm_page_fault_exit(vcpu, hpte, ret); - if (page) { - /* - * We drop pages[0] here, not page because page might - * have been set to the head page of a compound, but - * we have to drop the reference on the correct tail - * page to match the get inside gup() - */ - put_page(pages[0]); - } + if (page) + put_page(page); return ret; out_unlock: diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 803940d79b73..134fbc1f029f 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -425,7 +425,7 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full, unsigned int lpid) { if (full) { - memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE); + memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE); } else { pte_t *p = pte; unsigned long it; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index ee6c103bb7d5..50555ad1db93 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -27,7 +27,6 @@ #include <asm/hvcall.h> #include <asm/synch.h> #include <asm/ppc-opcode.h> -#include <asm/kvm_host.h> #include <asm/udbg.h> #include <asm/iommu.h> #include <asm/tce.h> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index ab6eeb8e753e..6fcaf1fa8e02 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -24,7 +24,6 @@ #include <asm/hvcall.h> #include <asm/synch.h> #include <asm/ppc-opcode.h> -#include <asm/kvm_host.h> #include <asm/udbg.h> #include <asm/iommu.h> #include <asm/tce.h> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 2cefd071b848..fa6e4fc7d0e4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -72,7 +72,6 @@ #include <asm/xics.h> #include <asm/xive.h> #include <asm/hw_breakpoint.h> -#include <asm/kvm_host.h> #include <asm/kvm_book3s_uvmem.h> #include <asm/ultravisor.h> @@ -1074,25 +1073,35 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) kvmppc_get_gpr(vcpu, 6)); break; case H_SVM_PAGE_IN: - ret = kvmppc_h_svm_page_in(vcpu->kvm, - kvmppc_get_gpr(vcpu, 4), - kvmppc_get_gpr(vcpu, 5), - kvmppc_get_gpr(vcpu, 6)); + ret = H_UNSUPPORTED; + if (kvmppc_get_srr1(vcpu) & MSR_S) + ret = kvmppc_h_svm_page_in(vcpu->kvm, + kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6)); break; case H_SVM_PAGE_OUT: - ret = kvmppc_h_svm_page_out(vcpu->kvm, - kvmppc_get_gpr(vcpu, 4), - kvmppc_get_gpr(vcpu, 5), - kvmppc_get_gpr(vcpu, 6)); + ret = 
H_UNSUPPORTED; + if (kvmppc_get_srr1(vcpu) & MSR_S) + ret = kvmppc_h_svm_page_out(vcpu->kvm, + kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6)); break; case H_SVM_INIT_START: - ret = kvmppc_h_svm_init_start(vcpu->kvm); + ret = H_UNSUPPORTED; + if (kvmppc_get_srr1(vcpu) & MSR_S) + ret = kvmppc_h_svm_init_start(vcpu->kvm); break; case H_SVM_INIT_DONE: - ret = kvmppc_h_svm_init_done(vcpu->kvm); + ret = H_UNSUPPORTED; + if (kvmppc_get_srr1(vcpu) & MSR_S) + ret = kvmppc_h_svm_init_done(vcpu->kvm); break; case H_SVM_INIT_ABORT: - ret = kvmppc_h_svm_init_abort(vcpu->kvm); + ret = H_UNSUPPORTED; + if (kvmppc_get_srr1(vcpu) & MSR_S) + ret = kvmppc_h_svm_init_abort(vcpu->kvm); break; default: @@ -3616,6 +3625,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested && kvmppc_get_gpr(vcpu, 3) == H_CEDE) { kvmppc_nested_cede(vcpu); + kvmppc_set_gpr(vcpu, 3, 0); trap = 0; } } else { @@ -4400,7 +4410,7 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, slots = kvm_memslots(kvm); memslot = id_to_memslot(slots, log->slot); r = -ENOENT; - if (!memslot->dirty_bitmap) + if (!memslot || !memslot->dirty_bitmap) goto out; /* @@ -4447,29 +4457,26 @@ out: return r; } -static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *slot) { - if (!dont || free->arch.rmap != dont->arch.rmap) { - vfree(free->arch.rmap); - free->arch.rmap = NULL; - } + vfree(slot->arch.rmap); + slot->arch.rmap = NULL; } -static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, - unsigned long npages) +static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm, + struct kvm_memory_slot *slot, + const struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) { - slot->arch.rmap = vzalloc(array_size(npages, sizeof(*slot->arch.rmap))); - if (!slot->arch.rmap) - return -ENOMEM; + unsigned long npages = mem->memory_size >> PAGE_SHIFT; - return 0; -} + if (change == KVM_MR_CREATE) { + slot->arch.rmap = vzalloc(array_size(npages, + sizeof(*slot->arch.rmap))); + if (!slot->arch.rmap) + return -ENOMEM; + } -static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm, - struct kvm_memory_slot *memslot, - const struct kvm_userspace_memory_region *mem) -{ return 0; } @@ -4558,11 +4565,6 @@ void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask) } } -static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu) -{ - return; -} - void kvmppc_setup_partition_table(struct kvm *kvm) { unsigned long dw0, dw1; @@ -5427,6 +5429,21 @@ static void unpin_vpa_reset(struct kvm *kvm, struct kvmppc_vpa *vpa) } /* + * Enable a guest to become a secure VM, or test whether + * that could be enabled. + * Called when the KVM_CAP_PPC_SECURE_GUEST capability is + * tested (kvm == NULL) or enabled (kvm != NULL). 
+ */ +static int kvmhv_enable_svm(struct kvm *kvm) +{ + if (!kvmppc_uvmem_available()) + return -EINVAL; + if (kvm) + kvm->arch.svm_enabled = 1; + return 0; +} + +/* * IOCTL handler to turn off secure mode of guest * * - Release all device pages @@ -5526,9 +5543,7 @@ static struct kvmppc_ops kvm_ops_hv = { .age_hva = kvm_age_hva_hv, .test_age_hva = kvm_test_age_hva_hv, .set_spte_hva = kvm_set_spte_hva_hv, - .mmu_destroy = kvmppc_mmu_destroy_hv, .free_memslot = kvmppc_core_free_memslot_hv, - .create_memslot = kvmppc_core_create_memslot_hv, .init_vm = kvmppc_core_init_vm_hv, .destroy_vm = kvmppc_core_destroy_vm_hv, .get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv, @@ -5548,6 +5563,7 @@ static struct kvmppc_ops kvm_ops_hv = { .enable_nested = kvmhv_enable_nested, .load_from_eaddr = kvmhv_load_from_eaddr, .store_to_eaddr = kvmhv_store_to_eaddr, + .enable_svm = kvmhv_enable_svm, .svm_off = kvmhv_svm_off, }; diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c index 0db937497169..cc90b8b82329 100644 --- a/arch/powerpc/kvm/book3s_hv_tm.c +++ b/arch/powerpc/kvm/book3s_hv_tm.c @@ -3,6 +3,8 @@ * Copyright 2017 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kvm_host.h> #include <asm/kvm_ppc.h> @@ -44,7 +46,18 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) u64 newmsr, bescr; int ra, rs; - switch (instr & 0xfc0007ff) { + /* + * rfid, rfebb, and mtmsrd encode bit 31 = 0 since it's a reserved bit + * in these instructions, so masking bit 31 out doesn't change these + * instructions. For treclaim., tsr., and trechkpt. instructions if bit + * 31 = 0 then they are per ISA invalid forms, however P9 UM, in section + * 4.6.10 Book II Invalid Forms, informs specifically that ignoring bit + * 31 is an acceptable way to handle these invalid forms that have + * bit 31 = 0. Moreover, for emulation purposes both forms (w/ and wo/ + * bit 31 set) can generate a softpatch interrupt. Hence both forms + * are handled below for these instructions so they behave the same way. + */ + switch (instr & PO_XOP_OPCODE_MASK) { case PPC_INST_RFID: /* XXX do we need to check for PR=0 here? */ newmsr = vcpu->arch.shregs.srr1; @@ -105,7 +118,8 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) vcpu->arch.shregs.msr = newmsr; return RESUME_GUEST; - case PPC_INST_TSR: + /* ignore bit 31, see comment above */ + case (PPC_INST_TSR & PO_XOP_OPCODE_MASK): /* check for PR=1 and arch 2.06 bit set in PCR */ if ((msr & MSR_PR) && (vcpu->arch.vcore->pcr & PCR_ARCH_206)) { /* generate an illegal instruction interrupt */ @@ -140,7 +154,8 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) vcpu->arch.shregs.msr = msr; return RESUME_GUEST; - case PPC_INST_TRECLAIM: + /* ignore bit 31, see comment above */ + case (PPC_INST_TRECLAIM & PO_XOP_OPCODE_MASK): /* check for TM disabled in the HFSCR or MSR */ if (!(vcpu->arch.hfscr & HFSCR_TM)) { /* generate an illegal instruction interrupt */ @@ -176,7 +191,8 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) vcpu->arch.shregs.msr &= ~MSR_TS_MASK; return RESUME_GUEST; - case PPC_INST_TRECHKPT: + /* ignore bit 31, see comment above */ + case (PPC_INST_TRECHKPT & PO_XOP_OPCODE_MASK): /* XXX do we need to check for PR=0 here? */ /* check for TM disabled in the HFSCR or MSR */ if (!(vcpu->arch.hfscr & HFSCR_TM)) { @@ -208,6 +224,8 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) } /* What should we do here? 
We didn't recognize the instruction */ - WARN_ON_ONCE(1); + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + pr_warn_ratelimited("Unrecognized TM-related instruction %#x for emulation", instr); + return RESUME_GUEST; } diff --git a/arch/powerpc/kvm/book3s_hv_tm_builtin.c b/arch/powerpc/kvm/book3s_hv_tm_builtin.c index 217246279dfa..fad931f224ef 100644 --- a/arch/powerpc/kvm/book3s_hv_tm_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_tm_builtin.c @@ -23,7 +23,18 @@ int kvmhv_p9_tm_emulation_early(struct kvm_vcpu *vcpu) u64 newmsr, msr, bescr; int rs; - switch (instr & 0xfc0007ff) { + /* + * rfid, rfebb, and mtmsrd encode bit 31 = 0 since it's a reserved bit + * in these instructions, so masking bit 31 out doesn't change these + * instructions. For the tsr. instruction if bit 31 = 0 then it is per + * ISA an invalid form, however P9 UM, in section 4.6.10 Book II Invalid + * Forms, informs specifically that ignoring bit 31 is an acceptable way + * to handle TM-related invalid forms that have bit 31 = 0. Moreover, + * for emulation purposes both forms (w/ and wo/ bit 31 set) can + * generate a softpatch interrupt. Hence both forms are handled below + * for tsr. to make them behave the same way. + */ + switch (instr & PO_XOP_OPCODE_MASK) { case PPC_INST_RFID: /* XXX do we need to check for PR=0 here? */ newmsr = vcpu->arch.shregs.srr1; @@ -73,7 +84,8 @@ int kvmhv_p9_tm_emulation_early(struct kvm_vcpu *vcpu) vcpu->arch.shregs.msr = newmsr; return 1; - case PPC_INST_TSR: + /* ignore bit 31, see comment above */ + case (PPC_INST_TSR & PO_XOP_OPCODE_MASK): /* we know the MSR has the TS field = S (0b01) here */ msr = vcpu->arch.shregs.msr; /* check for PR=1 and arch 2.06 bit set in PCR */ diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index f44f6b27950f..76d05c71fb1f 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -113,6 +113,15 @@ struct kvmppc_uvmem_page_pvt { bool skip_page_out; }; +bool kvmppc_uvmem_available(void) +{ + /* + * If kvmppc_uvmem_bitmap != NULL, then there is an ultravisor + * and our data structures have been initialized successfully. 
+ */ + return !!kvmppc_uvmem_bitmap; +} + int kvmppc_uvmem_slot_init(struct kvm *kvm, const struct kvm_memory_slot *slot) { struct kvmppc_uvmem_slot *p; @@ -209,6 +218,8 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm) int ret = H_SUCCESS; int srcu_idx; + kvm->arch.secure_guest = KVMPPC_SECURE_INIT_START; + if (!kvmppc_uvmem_bitmap) return H_UNSUPPORTED; @@ -216,6 +227,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm) if (!kvm_is_radix(kvm)) return H_UNSUPPORTED; + /* NAK the transition to secure if not enabled */ + if (!kvm->arch.svm_enabled) + return H_AUTHORITY; + srcu_idx = srcu_read_lock(&kvm->srcu); slots = kvm_memslots(kvm); kvm_for_each_memslot(memslot, slots) { @@ -233,7 +248,6 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm) goto out; } } - kvm->arch.secure_guest |= KVMPPC_SECURE_INIT_START; out: srcu_read_unlock(&kvm->srcu, srcu_idx); return ret; @@ -809,6 +823,9 @@ out: void kvmppc_uvmem_free(void) { + if (!kvmppc_uvmem_bitmap) + return; + memunmap_pages(&kvmppc_uvmem_pgmap); release_mem_region(kvmppc_uvmem_pgmap.res.start, resource_size(&kvmppc_uvmem_pgmap.res)); diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index db3a87319642..a0f6813f4560 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -740,7 +740,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) && ((pte.raddr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)) pte.raddr &= ~SPLIT_HACK_MASK; - /* fall through */ + fallthrough; case MSR_IR: vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); @@ -1795,7 +1795,7 @@ static int kvmppc_core_vcpu_create_pr(struct kvm_vcpu *vcpu) vcpu->arch.shadow_msr = MSR_USER64 & ~MSR_LE; - err = kvmppc_mmu_init(vcpu); + err = kvmppc_mmu_init_pr(vcpu); if (err < 0) goto free_shared_page; @@ -1885,7 +1885,6 @@ out: static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm, struct kvm_dirty_log *log) { - struct kvm_memslots *slots; struct kvm_memory_slot *memslot; struct kvm_vcpu *vcpu; ulong ga, ga_end; @@ -1895,15 +1894,12 @@ static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm, mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log(kvm, log, &is_dirty); + r = kvm_get_dirty_log(kvm, log, &is_dirty, &memslot); if (r) goto out; /* If nothing is dirty, don't bother messing with page tables. 
*/ if (is_dirty) { - slots = kvm_memslots(kvm); - memslot = id_to_memslot(slots, log->slot); - ga = memslot->base_gfn << PAGE_SHIFT; ga_end = ga + (memslot->npages << PAGE_SHIFT); @@ -1928,7 +1924,8 @@ static void kvmppc_core_flush_memslot_pr(struct kvm *kvm, static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm, struct kvm_memory_slot *memslot, - const struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) { return 0; } @@ -1942,19 +1939,11 @@ static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm, return; } -static void kvmppc_core_free_memslot_pr(struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +static void kvmppc_core_free_memslot_pr(struct kvm_memory_slot *slot) { return; } -static int kvmppc_core_create_memslot_pr(struct kvm_memory_slot *slot, - unsigned long npages) -{ - return 0; -} - - #ifdef CONFIG_PPC64 static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm, struct kvm_ppc_smmu_info *info) @@ -2098,9 +2087,7 @@ static struct kvmppc_ops kvm_ops_pr = { .age_hva = kvm_age_hva_pr, .test_age_hva = kvm_test_age_hva_pr, .set_spte_hva = kvm_set_spte_hva_pr, - .mmu_destroy = kvmppc_mmu_destroy_pr, .free_memslot = kvmppc_core_free_memslot_pr, - .create_memslot = kvmppc_core_create_memslot_pr, .init_vm = kvmppc_core_init_vm_pr, .destroy_vm = kvmppc_core_destroy_vm_pr, .get_smmu_info = kvm_vm_ioctl_get_smmu_info_pr, diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 7b27604adadf..6c18ea88fd25 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -421,11 +421,11 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, case BOOKE_IRQPRIO_DATA_STORAGE: case BOOKE_IRQPRIO_ALIGNMENT: update_dear = true; - /* fall through */ + fallthrough; case BOOKE_IRQPRIO_INST_STORAGE: case BOOKE_IRQPRIO_PROGRAM: update_esr = true; - /* fall through */ + fallthrough; case BOOKE_IRQPRIO_ITLB_MISS: case BOOKE_IRQPRIO_SYSCALL: case BOOKE_IRQPRIO_FP_UNAVAIL: @@ -459,7 +459,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, case BOOKE_IRQPRIO_DECREMENTER: case BOOKE_IRQPRIO_FIT: keep_irq = true; - /* fall through */ + fallthrough; case BOOKE_IRQPRIO_EXTERNAL: case BOOKE_IRQPRIO_DBELL: allowed = vcpu->arch.shared->msr & MSR_EE; @@ -1766,25 +1766,24 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, return r; } -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { - return -ENOTSUPP; + } -void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { + return -ENOTSUPP; } -int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) +void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { - return 0; } int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - const struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) { return 0; } @@ -2074,11 +2073,6 @@ void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu) kvmppc_clear_dbsr(); } -void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) -{ - vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu); -} - int kvmppc_core_init_vm(struct kvm *kvm) { return kvm->arch.kvm_ops->init_vm(kvm); diff --git a/arch/powerpc/kvm/booke.h 
b/arch/powerpc/kvm/booke.h index 9d3169fbce55..65b4d337d337 100644 --- a/arch/powerpc/kvm/booke.h +++ b/arch/powerpc/kvm/booke.h @@ -94,7 +94,6 @@ enum int_class { void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type); -extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu); extern int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance); @@ -102,7 +101,6 @@ extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong spr_val); extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val); -extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu); extern int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance); diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index f2b4feaff6d2..7e8b69015d20 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -490,7 +490,6 @@ static struct kvmppc_ops kvm_ops_e500 = { .vcpu_put = kvmppc_core_vcpu_put_e500, .vcpu_create = kvmppc_core_vcpu_create_e500, .vcpu_free = kvmppc_core_vcpu_free_e500, - .mmu_destroy = kvmppc_mmu_destroy_e500, .init_vm = kvmppc_core_init_vm_e500, .destroy_vm = kvmppc_core_destroy_vm_e500, .emulate_op = kvmppc_core_emulate_op_e500, diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c index 2d910b87e441..e131fbecdcc4 100644 --- a/arch/powerpc/kvm/e500_mmu.c +++ b/arch/powerpc/kvm/e500_mmu.c @@ -533,10 +533,6 @@ gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, return get_tlb_raddr(gtlbe) | (eaddr & pgmask); } -void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu) -{ -} - /*****************************************/ static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c index e6b06cb2b92c..1c189b5aadcc 100644 --- a/arch/powerpc/kvm/e500mc.c +++ b/arch/powerpc/kvm/e500mc.c @@ -376,7 +376,6 @@ static struct kvmppc_ops kvm_ops_e500mc = { .vcpu_put = kvmppc_core_vcpu_put_e500mc, .vcpu_create = kvmppc_core_vcpu_create_e500mc, .vcpu_free = kvmppc_core_vcpu_free_e500mc, - .mmu_destroy = kvmppc_mmu_destroy_e500, .init_vm = kvmppc_core_init_vm_e500mc, .destroy_vm = kvmppc_core_destroy_vm_e500mc, .emulate_op = kvmppc_core_emulate_op_e500, diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c index fe312c160d97..23e9c2bd9f27 100644 --- a/arch/powerpc/kvm/mpic.c +++ b/arch/powerpc/kvm/mpic.c @@ -32,7 +32,6 @@ #include <linux/uaccess.h> #include <asm/mpic.h> #include <asm/kvm_para.h> -#include <asm/kvm_host.h> #include <asm/kvm_ppc.h> #include <kvm/iodev.h> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 302e9dccdd6d..e15166b0a16d 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -32,7 +32,6 @@ #include <asm/plpar_wrappers.h> #endif #include <asm/ultravisor.h> -#include <asm/kvm_host.h> #include "timing.h" #include "irq.h" @@ -416,12 +415,12 @@ int kvm_arch_hardware_enable(void) return 0; } -int kvm_arch_hardware_setup(void) +int kvm_arch_hardware_setup(void *opaque) { return 0; } -int kvm_arch_check_processor_compat(void) +int kvm_arch_check_processor_compat(void *opaque) { return kvmppc_core_check_processor_compat(); } @@ -525,7 +524,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = 1; break; case KVM_CAP_PPC_GUEST_DEBUG_SSTEP: - /* fall through */ case KVM_CAP_PPC_PAIRED_SINGLES: case KVM_CAP_PPC_OSI: case KVM_CAP_PPC_GET_PVINFO: @@ -671,6 +669,12 @@ int 
kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) (hv_enabled && cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)); break; #endif +#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) + case KVM_CAP_PPC_SECURE_GUEST: + r = hv_enabled && kvmppc_hv_ops->enable_svm && + !kvmppc_hv_ops->enable_svm(NULL); + break; +#endif default: r = 0; break; @@ -685,16 +689,9 @@ long kvm_arch_dev_ioctl(struct file *filp, return -EINVAL; } -void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) -{ - kvmppc_core_free_memslot(kvm, free, dont); -} - -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { - return kvmppc_core_create_memslot(kvm, slot, npages); + kvmppc_core_free_memslot(kvm, slot); } int kvm_arch_prepare_memory_region(struct kvm *kvm, @@ -702,12 +699,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { - return kvmppc_core_prepare_memory_region(kvm, memslot, mem); + return kvmppc_core_prepare_memory_region(kvm, memslot, mem, change); } void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old, + struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { @@ -2176,6 +2173,14 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = kvm->arch.kvm_ops->enable_nested(kvm); break; #endif +#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) + case KVM_CAP_PPC_SECURE_GUEST: + r = -EINVAL; + if (!is_kvmppc_hv_enabled(kvm) || !kvm->arch.kvm_ops->enable_svm) + break; + r = kvm->arch.kvm_ops->enable_svm(kvm); + break; +#endif default: r = -EINVAL; break; diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h index ace65f9fed30..feef7885ba82 100644 --- a/arch/powerpc/kvm/timing.h +++ b/arch/powerpc/kvm/timing.h @@ -10,7 +10,6 @@ #define __POWERPC_KVM_EXITTIMING_H__ #include <linux/kvm_host.h> -#include <asm/kvm_host.h> #ifdef CONFIG_KVM_EXIT_TIMING void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu); diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index 0ff9261c915e..45b33b83de08 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -37,7 +37,7 @@ CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o obj-y += version.o pgm_check_info.o ctype.o text_dma.o -obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) += uv.o +obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o obj-$(CONFIG_RELOCATABLE) += machine_kexec_reloc.o obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o targets := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c index 3f501159ee9f..8fde561f1d07 100644 --- a/arch/s390/boot/uv.c +++ b/arch/s390/boot/uv.c @@ -3,7 +3,13 @@ #include <asm/facility.h> #include <asm/sections.h> +/* will be used in arch/s390/kernel/uv.c */ +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST int __bootdata_preserved(prot_virt_guest); +#endif +#if IS_ENABLED(CONFIG_KVM) +struct uv_info __bootdata_preserved(uv_info); +#endif void uv_query_info(void) { @@ -19,7 +25,21 @@ void uv_query_info(void) if (uv_call(0, (uint64_t)&uvcb) && uvcb.header.rc != 0x100) return; + if (IS_ENABLED(CONFIG_KVM)) { + 
memcpy(uv_info.inst_calls_list, uvcb.inst_calls_list, sizeof(uv_info.inst_calls_list)); + uv_info.uv_base_stor_len = uvcb.uv_base_stor_len; + uv_info.guest_base_stor_len = uvcb.conf_base_phys_stor_len; + uv_info.guest_virt_base_stor_len = uvcb.conf_base_virt_stor_len; + uv_info.guest_virt_var_stor_len = uvcb.conf_virt_var_stor_len; + uv_info.guest_cpu_stor_len = uvcb.cpu_stor_len; + uv_info.max_sec_stor_addr = ALIGN(uvcb.max_guest_stor_addr, PAGE_SIZE); + uv_info.max_num_sec_conf = uvcb.max_num_sec_conf; + uv_info.max_guest_cpus = uvcb.max_guest_cpus; + } + +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST if (test_bit_inv(BIT_UVC_CMD_SET_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list) && test_bit_inv(BIT_UVC_CMD_REMOVE_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list)) prot_virt_guest = 1; +#endif } diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 37f96b6f0e61..a816fb4734b8 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -9,6 +9,7 @@ #ifndef _ASM_S390_GMAP_H #define _ASM_S390_GMAP_H +#include <linux/radix-tree.h> #include <linux/refcount.h> /* Generic bits for GMAP notification on DAT table entry changes. */ @@ -31,6 +32,7 @@ * @table: pointer to the page directory * @asce: address space control element for gmap page table * @pfault_enabled: defines if pfaults are applicable for the guest + * @guest_handle: protected virtual machine handle for the ultravisor * @host_to_rmap: radix tree with gmap_rmap lists * @children: list of shadow gmap structures * @pt_list: list of all page tables used in the shadow guest address space @@ -54,6 +56,8 @@ struct gmap { unsigned long asce_end; void *private; bool pfault_enabled; + /* only set for protected virtual machines */ + unsigned long guest_handle; /* Additional data for shadow guest address spaces */ struct radix_tree_root host_to_rmap; struct list_head children; @@ -144,4 +148,6 @@ int gmap_mprotect_notify(struct gmap *, unsigned long start, void gmap_sync_dirty_log_pmd(struct gmap *gmap, unsigned long dirty_bitmap[4], unsigned long gaddr, unsigned long vmaddr); +int gmap_mark_unmergeable(void); +void s390_reset_acc(struct mm_struct *mm); #endif /* _ASM_S390_GMAP_H */ diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 1726224e7772..d6bcd34f3ec3 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -127,6 +127,12 @@ struct mcck_volatile_info { #define CR14_INITIAL_MASK (CR14_UNUSED_32 | CR14_UNUSED_33 | \ CR14_EXTERNAL_DAMAGE_SUBMASK) +#define SIDAD_SIZE_MASK 0xff +#define sida_origin(sie_block) \ + ((sie_block)->sidad & PAGE_MASK) +#define sida_size(sie_block) \ + ((((sie_block)->sidad & SIDAD_SIZE_MASK) + 1) * PAGE_SIZE) + #define CPUSTAT_STOPPED 0x80000000 #define CPUSTAT_WAIT 0x10000000 #define CPUSTAT_ECALL_PEND 0x08000000 @@ -160,7 +166,13 @@ struct kvm_s390_sie_block { __u8 reserved08[4]; /* 0x0008 */ #define PROG_IN_SIE (1<<0) __u32 prog0c; /* 0x000c */ - __u8 reserved10[16]; /* 0x0010 */ + union { + __u8 reserved10[16]; /* 0x0010 */ + struct { + __u64 pv_handle_cpu; + __u64 pv_handle_config; + }; + }; #define PROG_BLOCK_SIE (1<<0) #define PROG_REQUEST (1<<1) atomic_t prog20; /* 0x0020 */ @@ -209,10 +221,23 @@ struct kvm_s390_sie_block { #define ICPT_PARTEXEC 0x38 #define ICPT_IOINST 0x40 #define ICPT_KSS 0x5c +#define ICPT_MCHKREQ 0x60 +#define ICPT_INT_ENABLE 0x64 +#define ICPT_PV_INSTR 0x68 +#define ICPT_PV_NOTIFY 0x6c +#define ICPT_PV_PREF 0x70 __u8 icptcode; /* 0x0050 */ __u8 icptstatus; /* 0x0051 
*/ __u16 ihcpu; /* 0x0052 */ - __u8 reserved54[2]; /* 0x0054 */ + __u8 reserved54; /* 0x0054 */ +#define IICTL_CODE_NONE 0x00 +#define IICTL_CODE_MCHK 0x01 +#define IICTL_CODE_EXT 0x02 +#define IICTL_CODE_IO 0x03 +#define IICTL_CODE_RESTART 0x04 +#define IICTL_CODE_SPECIFICATION 0x10 +#define IICTL_CODE_OPERAND 0x11 + __u8 iictl; /* 0x0055 */ __u16 ipa; /* 0x0056 */ __u32 ipb; /* 0x0058 */ __u32 scaoh; /* 0x005c */ @@ -233,7 +258,7 @@ struct kvm_s390_sie_block { #define ECB3_RI 0x01 __u8 ecb3; /* 0x0063 */ __u32 scaol; /* 0x0064 */ - __u8 reserved68; /* 0x0068 */ + __u8 sdf; /* 0x0068 */ __u8 epdx; /* 0x0069 */ __u8 reserved6a[2]; /* 0x006a */ __u32 todpr; /* 0x006c */ @@ -249,31 +274,58 @@ struct kvm_s390_sie_block { #define HPID_KVM 0x4 #define HPID_VSIE 0x5 __u8 hpid; /* 0x00b8 */ - __u8 reservedb9[11]; /* 0x00b9 */ - __u16 extcpuaddr; /* 0x00c4 */ - __u16 eic; /* 0x00c6 */ + __u8 reservedb9[7]; /* 0x00b9 */ + union { + struct { + __u32 eiparams; /* 0x00c0 */ + __u16 extcpuaddr; /* 0x00c4 */ + __u16 eic; /* 0x00c6 */ + }; + __u64 mcic; /* 0x00c0 */ + } __packed; __u32 reservedc8; /* 0x00c8 */ - __u16 pgmilc; /* 0x00cc */ - __u16 iprcc; /* 0x00ce */ - __u32 dxc; /* 0x00d0 */ - __u16 mcn; /* 0x00d4 */ - __u8 perc; /* 0x00d6 */ - __u8 peratmid; /* 0x00d7 */ + union { + struct { + __u16 pgmilc; /* 0x00cc */ + __u16 iprcc; /* 0x00ce */ + }; + __u32 edc; /* 0x00cc */ + } __packed; + union { + struct { + __u32 dxc; /* 0x00d0 */ + __u16 mcn; /* 0x00d4 */ + __u8 perc; /* 0x00d6 */ + __u8 peratmid; /* 0x00d7 */ + }; + __u64 faddr; /* 0x00d0 */ + } __packed; __u64 peraddr; /* 0x00d8 */ __u8 eai; /* 0x00e0 */ __u8 peraid; /* 0x00e1 */ __u8 oai; /* 0x00e2 */ __u8 armid; /* 0x00e3 */ __u8 reservede4[4]; /* 0x00e4 */ - __u64 tecmc; /* 0x00e8 */ - __u8 reservedf0[12]; /* 0x00f0 */ + union { + __u64 tecmc; /* 0x00e8 */ + struct { + __u16 subchannel_id; /* 0x00e8 */ + __u16 subchannel_nr; /* 0x00ea */ + __u32 io_int_parm; /* 0x00ec */ + __u32 io_int_word; /* 0x00f0 */ + }; + } __packed; + __u8 reservedf4[8]; /* 0x00f4 */ #define CRYCB_FORMAT_MASK 0x00000003 #define CRYCB_FORMAT0 0x00000000 #define CRYCB_FORMAT1 0x00000001 #define CRYCB_FORMAT2 0x00000003 __u32 crycbd; /* 0x00fc */ __u64 gcr[16]; /* 0x0100 */ - __u64 gbea; /* 0x0180 */ + union { + __u64 gbea; /* 0x0180 */ + __u64 sidad; + }; __u8 reserved188[8]; /* 0x0188 */ __u64 sdnxo; /* 0x0190 */ __u8 reserved198[8]; /* 0x0198 */ @@ -292,7 +344,7 @@ struct kvm_s390_sie_block { __u64 itdba; /* 0x01e8 */ __u64 riccbd; /* 0x01f0 */ __u64 gvrd; /* 0x01f8 */ -} __attribute__((packed)); +} __packed __aligned(512); struct kvm_s390_itdb { __u8 data[256]; @@ -301,7 +353,9 @@ struct kvm_s390_itdb { struct sie_page { struct kvm_s390_sie_block sie_block; struct mcck_volatile_info mcck_info; /* 0x0200 */ - __u8 reserved218[1000]; /* 0x0218 */ + __u8 reserved218[360]; /* 0x0218 */ + __u64 pv_grregs[16]; /* 0x0380 */ + __u8 reserved400[512]; /* 0x0400 */ struct kvm_s390_itdb itdb; /* 0x0600 */ __u8 reserved700[2304]; /* 0x0700 */ }; @@ -476,6 +530,7 @@ enum irq_types { IRQ_PEND_PFAULT_INIT, IRQ_PEND_EXT_HOST, IRQ_PEND_EXT_SERVICE, + IRQ_PEND_EXT_SERVICE_EV, IRQ_PEND_EXT_TIMING, IRQ_PEND_EXT_CPU_TIMER, IRQ_PEND_EXT_CLOCK_COMP, @@ -520,6 +575,7 @@ enum irq_types { (1UL << IRQ_PEND_EXT_TIMING) | \ (1UL << IRQ_PEND_EXT_HOST) | \ (1UL << IRQ_PEND_EXT_SERVICE) | \ + (1UL << IRQ_PEND_EXT_SERVICE_EV) | \ (1UL << IRQ_PEND_VIRTIO) | \ (1UL << IRQ_PEND_PFAULT_INIT) | \ (1UL << IRQ_PEND_PFAULT_DONE)) @@ -536,6 +592,13 @@ enum irq_types { #define IRQ_PEND_MCHK_MASK ((1UL << 
IRQ_PEND_MCHK_REP) | \ (1UL << IRQ_PEND_MCHK_EX)) +#define IRQ_PEND_EXT_II_MASK ((1UL << IRQ_PEND_EXT_CPU_TIMER) | \ + (1UL << IRQ_PEND_EXT_CLOCK_COMP) | \ + (1UL << IRQ_PEND_EXT_EMERGENCY) | \ + (1UL << IRQ_PEND_EXT_EXTERNAL) | \ + (1UL << IRQ_PEND_EXT_SERVICE) | \ + (1UL << IRQ_PEND_EXT_SERVICE_EV)) + struct kvm_s390_interrupt_info { struct list_head list; u64 type; @@ -594,6 +657,7 @@ struct kvm_s390_local_interrupt { struct kvm_s390_float_interrupt { unsigned long pending_irqs; + unsigned long masked_irqs; spinlock_t lock; struct list_head lists[FIRQ_LIST_COUNT]; int counters[FIRQ_MAX_COUNT]; @@ -645,6 +709,11 @@ struct kvm_guestdbg_info_arch { unsigned long last_bp; }; +struct kvm_s390_pv_vcpu { + u64 handle; + unsigned long stor_base; +}; + struct kvm_vcpu_arch { struct kvm_s390_sie_block *sie_block; /* if vsie is active, currently executed shadow sie control block */ @@ -673,6 +742,7 @@ struct kvm_vcpu_arch { __u64 cputm_start; bool gs_enabled; bool skey_enabled; + struct kvm_s390_pv_vcpu pv; }; struct kvm_vm_stat { @@ -701,9 +771,6 @@ struct s390_io_adapter { bool masked; bool swap; bool suppressible; - struct rw_semaphore maps_lock; - struct list_head maps; - atomic_t nr_maps; }; #define MAX_S390_IO_ADAPTERS ((MAX_ISC + 1) * 8) @@ -846,6 +913,13 @@ struct kvm_s390_gisa_interrupt { DECLARE_BITMAP(kicked_mask, KVM_MAX_VCPUS); }; +struct kvm_s390_pv { + u64 handle; + u64 guest_len; + unsigned long stor_base; + void *stor_var; +}; + struct kvm_arch{ void *sca; int use_esca; @@ -881,6 +955,7 @@ struct kvm_arch{ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct kvm_s390_gisa_interrupt gisa_int; + struct kvm_s390_pv pv; }; #define KVM_HVA_ERR_BAD (-1UL) @@ -921,7 +996,7 @@ static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, - struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {} + struct kvm_memory_slot *slot) {} static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {} static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {} static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index bcfb6371086f..e21b618ad432 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -16,6 +16,8 @@ typedef struct { unsigned long asce; unsigned long asce_limit; unsigned long vdso_base; + /* The mmu context belongs to a secure guest. */ + atomic_t is_protected; /* * The following bitfields need a down_write on the mm * semaphore when they are written to. 
As they are only diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 8d04e6f3f796..afa836014076 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -23,6 +23,7 @@ static inline int init_new_context(struct task_struct *tsk, INIT_LIST_HEAD(&mm->context.gmap_list); cpumask_clear(&mm->context.cpu_attach_mask); atomic_set(&mm->context.flush_count, 0); + atomic_set(&mm->context.is_protected, 0); mm->context.gmap_asce = 0; mm->context.flush_mm = 0; mm->context.compat_mm = test_thread_flag(TIF_31BIT); diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 1019efd85b9d..62440a82731a 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -153,6 +153,11 @@ static inline int devmem_is_allowed(unsigned long pfn) #define HAVE_ARCH_FREE_PAGE #define HAVE_ARCH_ALLOC_PAGE +#if IS_ENABLED(CONFIG_PGSTE) +int arch_make_page_accessible(struct page *page); +#define HAVE_ARCH_MAKE_PAGE_ACCESSIBLE +#endif + #endif /* !__ASSEMBLY__ */ #define __PAGE_OFFSET 0x0UL diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 6d7c3b7e9281..6076c8c912d2 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -19,6 +19,7 @@ #include <linux/atomic.h> #include <asm/bug.h> #include <asm/page.h> +#include <asm/uv.h> extern pgd_t swapper_pg_dir[]; extern void paging_init(void); @@ -520,6 +521,15 @@ static inline int mm_has_pgste(struct mm_struct *mm) return 0; } +static inline int mm_is_protected(struct mm_struct *mm) +{ +#ifdef CONFIG_PGSTE + if (unlikely(atomic_read(&mm->context.is_protected))) + return 1; +#endif + return 0; +} + static inline int mm_alloc_pgste(struct mm_struct *mm) { #ifdef CONFIG_PGSTE @@ -1067,7 +1077,12 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); + pte_t res; + + res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); + if (mm_is_protected(mm) && pte_present(res)) + uv_convert_from_secure(pte_val(res) & PAGE_MASK); + return res; } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION @@ -1079,7 +1094,12 @@ void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, static inline pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - return ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID)); + pte_t res; + + res = ptep_xchg_direct(vma->vm_mm, addr, ptep, __pte(_PAGE_INVALID)); + if (mm_is_protected(vma->vm_mm) && pte_present(res)) + uv_convert_from_secure(pte_val(res) & PAGE_MASK); + return res; } /* @@ -1094,12 +1114,17 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full) { + pte_t res; + if (full) { - pte_t pte = *ptep; + res = *ptep; *ptep = __pte(_PAGE_INVALID); - return pte; + } else { + res = ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); } - return ptep_xchg_lazy(mm, addr, ptep, __pte(_PAGE_INVALID)); + if (mm_is_protected(mm) && pte_present(res)) + uv_convert_from_secure(pte_val(res) & PAGE_MASK); + return res; } #define __HAVE_ARCH_PTEP_SET_WRPROTECT diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index 4093a2856929..cff4b4c99b75 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -14,23 +14,62 @@ #include <linux/types.h> #include <linux/errno.h> #include 
<linux/bug.h> +#include <linux/sched.h> #include <asm/page.h> +#include <asm/gmap.h> #define UVC_RC_EXECUTED 0x0001 #define UVC_RC_INV_CMD 0x0002 #define UVC_RC_INV_STATE 0x0003 #define UVC_RC_INV_LEN 0x0005 #define UVC_RC_NO_RESUME 0x0007 +#define UVC_RC_NEED_DESTROY 0x8000 #define UVC_CMD_QUI 0x0001 +#define UVC_CMD_INIT_UV 0x000f +#define UVC_CMD_CREATE_SEC_CONF 0x0100 +#define UVC_CMD_DESTROY_SEC_CONF 0x0101 +#define UVC_CMD_CREATE_SEC_CPU 0x0120 +#define UVC_CMD_DESTROY_SEC_CPU 0x0121 +#define UVC_CMD_CONV_TO_SEC_STOR 0x0200 +#define UVC_CMD_CONV_FROM_SEC_STOR 0x0201 +#define UVC_CMD_SET_SEC_CONF_PARAMS 0x0300 +#define UVC_CMD_UNPACK_IMG 0x0301 +#define UVC_CMD_VERIFY_IMG 0x0302 +#define UVC_CMD_CPU_RESET 0x0310 +#define UVC_CMD_CPU_RESET_INITIAL 0x0311 +#define UVC_CMD_PREPARE_RESET 0x0320 +#define UVC_CMD_CPU_RESET_CLEAR 0x0321 +#define UVC_CMD_CPU_SET_STATE 0x0330 +#define UVC_CMD_SET_UNSHARE_ALL 0x0340 +#define UVC_CMD_PIN_PAGE_SHARED 0x0341 +#define UVC_CMD_UNPIN_PAGE_SHARED 0x0342 #define UVC_CMD_SET_SHARED_ACCESS 0x1000 #define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001 /* Bits in installed uv calls */ enum uv_cmds_inst { BIT_UVC_CMD_QUI = 0, + BIT_UVC_CMD_INIT_UV = 1, + BIT_UVC_CMD_CREATE_SEC_CONF = 2, + BIT_UVC_CMD_DESTROY_SEC_CONF = 3, + BIT_UVC_CMD_CREATE_SEC_CPU = 4, + BIT_UVC_CMD_DESTROY_SEC_CPU = 5, + BIT_UVC_CMD_CONV_TO_SEC_STOR = 6, + BIT_UVC_CMD_CONV_FROM_SEC_STOR = 7, BIT_UVC_CMD_SET_SHARED_ACCESS = 8, BIT_UVC_CMD_REMOVE_SHARED_ACCESS = 9, + BIT_UVC_CMD_SET_SEC_PARMS = 11, + BIT_UVC_CMD_UNPACK_IMG = 13, + BIT_UVC_CMD_VERIFY_IMG = 14, + BIT_UVC_CMD_CPU_RESET = 15, + BIT_UVC_CMD_CPU_RESET_INITIAL = 16, + BIT_UVC_CMD_CPU_SET_STATE = 17, + BIT_UVC_CMD_PREPARE_RESET = 18, + BIT_UVC_CMD_CPU_PERFORM_CLEAR_RESET = 19, + BIT_UVC_CMD_UNSHARE_ALL = 20, + BIT_UVC_CMD_PIN_PAGE_SHARED = 21, + BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22, }; struct uv_cb_header { @@ -40,13 +79,127 @@ struct uv_cb_header { u16 rrc; /* Return Reason Code */ } __packed __aligned(8); +/* Query Ultravisor Information */ struct uv_cb_qui { struct uv_cb_header header; u64 reserved08; u64 inst_calls_list[4]; - u64 reserved30[15]; + u64 reserved30[2]; + u64 uv_base_stor_len; + u64 reserved48; + u64 conf_base_phys_stor_len; + u64 conf_base_virt_stor_len; + u64 conf_virt_var_stor_len; + u64 cpu_stor_len; + u32 reserved70[3]; + u32 max_num_sec_conf; + u64 max_guest_stor_addr; + u8 reserved88[158 - 136]; + u16 max_guest_cpus; + u8 reserveda0[200 - 160]; } __packed __aligned(8); +/* Initialize Ultravisor */ +struct uv_cb_init { + struct uv_cb_header header; + u64 reserved08[2]; + u64 stor_origin; + u64 stor_len; + u64 reserved28[4]; +} __packed __aligned(8); + +/* Create Guest Configuration */ +struct uv_cb_cgc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 conf_base_stor_origin; + u64 conf_virt_stor_origin; + u64 reserved30; + u64 guest_stor_origin; + u64 guest_stor_len; + u64 guest_sca; + u64 guest_asce; + u64 reserved58[5]; +} __packed __aligned(8); + +/* Create Secure CPU */ +struct uv_cb_csc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 cpu_handle; + u64 guest_handle; + u64 stor_origin; + u8 reserved30[6]; + u16 num; + u64 state_origin; + u64 reserved40[4]; +} __packed __aligned(8); + +/* Convert to Secure */ +struct uv_cb_cts { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 gaddr; +} __packed __aligned(8); + +/* Convert from Secure / Pin Page Shared */ +struct uv_cb_cfs { + struct uv_cb_header header; + u64 reserved08[2]; + u64 paddr; +} __packed 
__aligned(8); + +/* Set Secure Config Parameter */ +struct uv_cb_ssc { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 sec_header_origin; + u32 sec_header_len; + u32 reserved2c; + u64 reserved30[4]; +} __packed __aligned(8); + +/* Unpack */ +struct uv_cb_unp { + struct uv_cb_header header; + u64 reserved08[2]; + u64 guest_handle; + u64 gaddr; + u64 tweak[2]; + u64 reserved38[3]; +} __packed __aligned(8); + +#define PV_CPU_STATE_OPR 1 +#define PV_CPU_STATE_STP 2 +#define PV_CPU_STATE_CHKSTP 3 +#define PV_CPU_STATE_OPR_LOAD 5 + +struct uv_cb_cpu_set_state { + struct uv_cb_header header; + u64 reserved08[2]; + u64 cpu_handle; + u8 reserved20[7]; + u8 state; + u64 reserved28[5]; +}; + +/* + * A common UV call struct for calls that take no payload + * Examples: + * Destroy cpu/config + * Verify + */ +struct uv_cb_nodata { + struct uv_cb_header header; + u64 reserved08[2]; + u64 handle; + u64 reserved20[4]; +} __packed __aligned(8); + +/* Set Shared Access */ struct uv_cb_share { struct uv_cb_header header; u64 reserved08[3]; @@ -54,21 +207,76 @@ struct uv_cb_share { u64 reserved28; } __packed __aligned(8); -static inline int uv_call(unsigned long r1, unsigned long r2) +static inline int __uv_call(unsigned long r1, unsigned long r2) { int cc; asm volatile( - "0: .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n" - " brc 3,0b\n" - " ipm %[cc]\n" - " srl %[cc],28\n" + " .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n" + " ipm %[cc]\n" + " srl %[cc],28\n" : [cc] "=d" (cc) : [r1] "a" (r1), [r2] "a" (r2) : "memory", "cc"); return cc; } +static inline int uv_call(unsigned long r1, unsigned long r2) +{ + int cc; + + do { + cc = __uv_call(r1, r2); + } while (cc > 1); + return cc; +} + +/* Low level uv_call that avoids stalls for long running busy conditions */ +static inline int uv_call_sched(unsigned long r1, unsigned long r2) +{ + int cc; + + do { + cc = __uv_call(r1, r2); + cond_resched(); + } while (cc > 1); + return cc; +} + +/* + * special variant of uv_call that only transports the cpu or guest + * handle and the command, like destroy or verify. + */ +static inline int uv_cmd_nodata(u64 handle, u16 cmd, u16 *rc, u16 *rrc) +{ + struct uv_cb_nodata uvcb = { + .header.cmd = cmd, + .header.len = sizeof(uvcb), + .handle = handle, + }; + int cc; + + WARN(!handle, "No handle provided to Ultravisor call cmd %x\n", cmd); + cc = uv_call_sched(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + return cc ? 
-EINVAL : 0; +} + +struct uv_info { + unsigned long inst_calls_list[4]; + unsigned long uv_base_stor_len; + unsigned long guest_base_stor_len; + unsigned long guest_virt_base_stor_len; + unsigned long guest_virt_var_stor_len; + unsigned long guest_cpu_stor_len; + unsigned long max_sec_stor_addr; + unsigned int max_num_sec_conf; + unsigned short max_guest_cpus; +}; + +extern struct uv_info uv_info; + #ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST extern int prot_virt_guest; @@ -121,11 +329,40 @@ static inline int uv_remove_shared(unsigned long addr) return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS); } -void uv_query_info(void); #else #define is_prot_virt_guest() 0 static inline int uv_set_shared(unsigned long addr) { return 0; } static inline int uv_remove_shared(unsigned long addr) { return 0; } +#endif + +#if IS_ENABLED(CONFIG_KVM) +extern int prot_virt_host; + +static inline int is_prot_virt_host(void) +{ + return prot_virt_host; +} + +int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb); +int uv_convert_from_secure(unsigned long paddr); +int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr); + +void setup_uv(void); +void adjust_to_uv_max(unsigned long *vmax); +#else +#define is_prot_virt_host() 0 +static inline void setup_uv(void) {} +static inline void adjust_to_uv_max(unsigned long *vmax) {} + +static inline int uv_convert_from_secure(unsigned long paddr) +{ + return 0; +} +#endif + +#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) +void uv_query_info(void); +#else static inline void uv_query_info(void) {} #endif diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 578a6fa82ea4..33d4de233c5b 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -78,6 +78,7 @@ obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_diag.o obj-$(CONFIG_TRACEPOINTS) += trace.o +obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o # vdso obj-y += vdso64/ diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index 1d3927e01a5f..faca269d5f27 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -24,6 +24,8 @@ asmlinkage void do_syscall_trace_exit(struct pt_regs *regs); void do_protection_exception(struct pt_regs *regs); void do_dat_exception(struct pt_regs *regs); +void do_secure_storage_access(struct pt_regs *regs); +void do_non_secure_storage_access(struct pt_regs *regs); void addressing_exception(struct pt_regs *regs); void data_exception(struct pt_regs *regs); diff --git a/arch/s390/kernel/pgm_check.S b/arch/s390/kernel/pgm_check.S index eee3a482195a..2c27907a5ffc 100644 --- a/arch/s390/kernel/pgm_check.S +++ b/arch/s390/kernel/pgm_check.S @@ -78,8 +78,8 @@ PGM_CHECK(do_dat_exception) /* 39 */ PGM_CHECK(do_dat_exception) /* 3a */ PGM_CHECK(do_dat_exception) /* 3b */ PGM_CHECK_DEFAULT /* 3c */ -PGM_CHECK_DEFAULT /* 3d */ -PGM_CHECK_DEFAULT /* 3e */ +PGM_CHECK(do_secure_storage_access) /* 3d */ +PGM_CHECK(do_non_secure_storage_access) /* 3e */ PGM_CHECK_DEFAULT /* 3f */ PGM_CHECK(monitor_event_exception) /* 40 */ PGM_CHECK_DEFAULT /* 41 */ diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index b2c2f75860e8..1423090a2259 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -92,10 +92,6 @@ char elf_platform[ELF_PLATFORM_SIZE]; unsigned long int_hwcap = 0; -#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST -int __bootdata_preserved(prot_virt_guest); -#endif - int 
__bootdata(noexec_disabled); int __bootdata(memory_end_set); unsigned long __bootdata(memory_end); @@ -564,6 +560,9 @@ static void __init setup_memory_end(void) vmax = _REGION1_SIZE; /* 4-level kernel page table */ } + if (is_prot_virt_host()) + adjust_to_uv_max(&vmax); + /* module area is at the end of the kernel address space. */ MODULES_END = vmax; MODULES_VADDR = MODULES_END - MODULES_LEN; @@ -1138,6 +1137,8 @@ void __init setup_arch(char **cmdline_p) */ memblock_trim_memory(1UL << (MAX_ORDER - 1 + PAGE_SHIFT)); + if (is_prot_virt_host()) + setup_uv(); setup_memory_end(); setup_memory(); dma_contiguous_reserve(memory_end); diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c new file mode 100644 index 000000000000..c86d654351d1 --- /dev/null +++ b/arch/s390/kernel/uv.c @@ -0,0 +1,414 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Ultravisor functions and initialization + * + * Copyright IBM Corp. 2019, 2020 + */ +#define KMSG_COMPONENT "prot_virt" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/sizes.h> +#include <linux/bitmap.h> +#include <linux/memblock.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <asm/facility.h> +#include <asm/sections.h> +#include <asm/uv.h> + +/* the bootdata_preserved fields come from ones in arch/s390/boot/uv.c */ +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST +int __bootdata_preserved(prot_virt_guest); +#endif + +#if IS_ENABLED(CONFIG_KVM) +int prot_virt_host; +EXPORT_SYMBOL(prot_virt_host); +struct uv_info __bootdata_preserved(uv_info); +EXPORT_SYMBOL(uv_info); + +static int __init prot_virt_setup(char *val) +{ + bool enabled; + int rc; + + rc = kstrtobool(val, &enabled); + if (!rc && enabled) + prot_virt_host = 1; + + if (is_prot_virt_guest() && prot_virt_host) { + prot_virt_host = 0; + pr_warn("Protected virtualization not available in protected guests."); + } + + if (prot_virt_host && !test_facility(158)) { + prot_virt_host = 0; + pr_warn("Protected virtualization not supported by the hardware."); + } + + return rc; +} +early_param("prot_virt", prot_virt_setup); + +static int __init uv_init(unsigned long stor_base, unsigned long stor_len) +{ + struct uv_cb_init uvcb = { + .header.cmd = UVC_CMD_INIT_UV, + .header.len = sizeof(uvcb), + .stor_origin = stor_base, + .stor_len = stor_len, + }; + + if (uv_call(0, (uint64_t)&uvcb)) { + pr_err("Ultravisor init failed with rc: 0x%x rrc: 0%x\n", + uvcb.header.rc, uvcb.header.rrc); + return -1; + } + return 0; +} + +void __init setup_uv(void) +{ + unsigned long uv_stor_base; + + uv_stor_base = (unsigned long)memblock_alloc_try_nid( + uv_info.uv_base_stor_len, SZ_1M, SZ_2G, + MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); + if (!uv_stor_base) { + pr_warn("Failed to reserve %lu bytes for ultravisor base storage\n", + uv_info.uv_base_stor_len); + goto fail; + } + + if (uv_init(uv_stor_base, uv_info.uv_base_stor_len)) { + memblock_free(uv_stor_base, uv_info.uv_base_stor_len); + goto fail; + } + + pr_info("Reserving %luMB as ultravisor base storage\n", + uv_info.uv_base_stor_len >> 20); + return; +fail: + pr_info("Disabling support for protected virtualization"); + prot_virt_host = 0; +} + +void adjust_to_uv_max(unsigned long *vmax) +{ + *vmax = min_t(unsigned long, *vmax, uv_info.max_sec_stor_addr); +} + +/* + * Requests the Ultravisor to pin the page in the shared state. This will + * cause an intercept when the guest attempts to unshare the pinned page. 
+ */ +static int uv_pin_shared(unsigned long paddr) +{ + struct uv_cb_cfs uvcb = { + .header.cmd = UVC_CMD_PIN_PAGE_SHARED, + .header.len = sizeof(uvcb), + .paddr = paddr, + }; + + if (uv_call(0, (u64)&uvcb)) + return -EINVAL; + return 0; +} + +/* + * Requests the Ultravisor to encrypt a guest page and make it + * accessible to the host for paging (export). + * + * @paddr: Absolute host address of page to be exported + */ +int uv_convert_from_secure(unsigned long paddr) +{ + struct uv_cb_cfs uvcb = { + .header.cmd = UVC_CMD_CONV_FROM_SEC_STOR, + .header.len = sizeof(uvcb), + .paddr = paddr + }; + + if (uv_call(0, (u64)&uvcb)) + return -EINVAL; + return 0; +} + +/* + * Calculate the expected ref_count for a page that would otherwise have no + * further pins. This was cribbed from similar functions in other places in + * the kernel, but with some slight modifications. We know that a secure + * page can not be a huge page for example. + */ +static int expected_page_refs(struct page *page) +{ + int res; + + res = page_mapcount(page); + if (PageSwapCache(page)) { + res++; + } else if (page_mapping(page)) { + res++; + if (page_has_private(page)) + res++; + } + return res; +} + +static int make_secure_pte(pte_t *ptep, unsigned long addr, + struct page *exp_page, struct uv_cb_header *uvcb) +{ + pte_t entry = READ_ONCE(*ptep); + struct page *page; + int expected, rc = 0; + + if (!pte_present(entry)) + return -ENXIO; + if (pte_val(entry) & _PAGE_INVALID) + return -ENXIO; + + page = pte_page(entry); + if (page != exp_page) + return -ENXIO; + if (PageWriteback(page)) + return -EAGAIN; + expected = expected_page_refs(page); + if (!page_ref_freeze(page, expected)) + return -EBUSY; + set_bit(PG_arch_1, &page->flags); + rc = uv_call(0, (u64)uvcb); + page_ref_unfreeze(page, expected); + /* Return -ENXIO if the page was not mapped, -EINVAL otherwise */ + if (rc) + rc = uvcb->rc == 0x10a ? -ENXIO : -EINVAL; + return rc; +} + +/* + * Requests the Ultravisor to make a page accessible to a guest. + * If it's brought in the first time, it will be cleared. If + * it has been exported before, it will be decrypted and integrity + * checked. + */ +int gmap_make_secure(struct gmap *gmap, unsigned long gaddr, void *uvcb) +{ + struct vm_area_struct *vma; + bool local_drain = false; + spinlock_t *ptelock; + unsigned long uaddr; + struct page *page; + pte_t *ptep; + int rc; + +again: + rc = -EFAULT; + down_read(&gmap->mm->mmap_sem); + + uaddr = __gmap_translate(gmap, gaddr); + if (IS_ERR_VALUE(uaddr)) + goto out; + vma = find_vma(gmap->mm, uaddr); + if (!vma) + goto out; + /* + * Secure pages cannot be huge and userspace should not combine both. + * In case userspace does it anyway this will result in an -EFAULT for + * the unpack. The guest is thus never reaching secure mode. If + * userspace is playing dirty tricks with mapping huge pages later + * on, this will result in a segmentation fault. + */ + if (is_vm_hugetlb_page(vma)) + goto out; + + rc = -ENXIO; + page = follow_page(vma, uaddr, FOLL_WRITE); + if (IS_ERR_OR_NULL(page)) + goto out; + + lock_page(page); + ptep = get_locked_pte(gmap->mm, uaddr, &ptelock); + rc = make_secure_pte(ptep, uaddr, page, uvcb); + pte_unmap_unlock(ptep, ptelock); + unlock_page(page); +out: + up_read(&gmap->mm->mmap_sem); + + if (rc == -EAGAIN) { + wait_on_page_writeback(page); + } else if (rc == -EBUSY) { + /* + * If we have tried a local drain and the page refcount + * still does not match our expected safe value, try with a + * system wide drain.
This is needed if the pagevecs holding + * the page are on a different CPU. + */ + if (local_drain) { + lru_add_drain_all(); + /* We give up here, and let the caller try again */ + return -EAGAIN; + } + /* + * We are here if the page refcount does not match the + * expected safe value. The main culprits are usually + * pagevecs. With lru_add_drain() we drain the pagevecs + * on the local CPU so that hopefully the refcount will + * reach the expected safe value. + */ + lru_add_drain(); + local_drain = true; + /* And now we try again immediately after draining */ + goto again; + } else if (rc == -ENXIO) { + if (gmap_fault(gmap, gaddr, FAULT_FLAG_WRITE)) + return -EFAULT; + return -EAGAIN; + } + return rc; +} +EXPORT_SYMBOL_GPL(gmap_make_secure); + +int gmap_convert_to_secure(struct gmap *gmap, unsigned long gaddr) +{ + struct uv_cb_cts uvcb = { + .header.cmd = UVC_CMD_CONV_TO_SEC_STOR, + .header.len = sizeof(uvcb), + .guest_handle = gmap->guest_handle, + .gaddr = gaddr, + }; + + return gmap_make_secure(gmap, gaddr, &uvcb); +} +EXPORT_SYMBOL_GPL(gmap_convert_to_secure); + +/* + * To be called with the page locked or with an extra reference! This will + * prevent gmap_make_secure from touching the page concurrently. Having 2 + * parallel make_page_accessible calls is fine, as the UV calls will become a + * no-op if the page is already exported. + */ +int arch_make_page_accessible(struct page *page) +{ + int rc = 0; + + /* Hugepage cannot be protected, so nothing to do */ + if (PageHuge(page)) + return 0; + + /* + * PG_arch_1 is used in 3 places: + * 1. for kernel page tables during early boot + * 2. for storage keys of huge pages and KVM + * 3. As an indication that this page might be secure. This can + * overindicate, e.g. we set the bit before calling + * convert_to_secure. + * As secure pages are never huge, all 3 variants can co-exist.
+ */ + if (!test_bit(PG_arch_1, &page->flags)) + return 0; + + rc = uv_pin_shared(page_to_phys(page)); + if (!rc) { + clear_bit(PG_arch_1, &page->flags); + return 0; + } + + rc = uv_convert_from_secure(page_to_phys(page)); + if (!rc) { + clear_bit(PG_arch_1, &page->flags); + return 0; + } + + return rc; +} +EXPORT_SYMBOL_GPL(arch_make_page_accessible); + +#endif + +#if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) +static ssize_t uv_query_facilities(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return snprintf(page, PAGE_SIZE, "%lx\n%lx\n%lx\n%lx\n", + uv_info.inst_calls_list[0], + uv_info.inst_calls_list[1], + uv_info.inst_calls_list[2], + uv_info.inst_calls_list[3]); +} + +static struct kobj_attribute uv_query_facilities_attr = + __ATTR(facilities, 0444, uv_query_facilities, NULL); + +static ssize_t uv_query_max_guest_cpus(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", + uv_info.max_guest_cpus); +} + +static struct kobj_attribute uv_query_max_guest_cpus_attr = + __ATTR(max_cpus, 0444, uv_query_max_guest_cpus, NULL); + +static ssize_t uv_query_max_guest_vms(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", + uv_info.max_num_sec_conf); +} + +static struct kobj_attribute uv_query_max_guest_vms_attr = + __ATTR(max_guests, 0444, uv_query_max_guest_vms, NULL); + +static ssize_t uv_query_max_guest_addr(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return snprintf(page, PAGE_SIZE, "%lx\n", + uv_info.max_sec_stor_addr); +} + +static struct kobj_attribute uv_query_max_guest_addr_attr = + __ATTR(max_address, 0444, uv_query_max_guest_addr, NULL); + +static struct attribute *uv_query_attrs[] = { + &uv_query_facilities_attr.attr, + &uv_query_max_guest_cpus_attr.attr, + &uv_query_max_guest_vms_attr.attr, + &uv_query_max_guest_addr_attr.attr, + NULL, +}; + +static struct attribute_group uv_query_attr_group = { + .attrs = uv_query_attrs, +}; + +static struct kset *uv_query_kset; +static struct kobject *uv_kobj; + +static int __init uv_info_init(void) +{ + int rc = -ENOMEM; + + if (!test_facility(158)) + return 0; + + uv_kobj = kobject_create_and_add("uv", firmware_kobj); + if (!uv_kobj) + return -ENOMEM; + + uv_query_kset = kset_create_and_add("query", NULL, uv_kobj); + if (!uv_query_kset) + goto out_kobj; + + rc = sysfs_create_group(&uv_query_kset->kobj, &uv_query_attr_group); + if (!rc) + return 0; + + kset_unregister(uv_query_kset); +out_kobj: + kobject_del(uv_kobj); + kobject_put(uv_kobj); + return rc; +} +device_initcall(uv_info_init); +#endif diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile index 05ee90a5ea08..12decca22e7c 100644 --- a/arch/s390/kvm/Makefile +++ b/arch/s390/kvm/Makefile @@ -9,6 +9,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqch ccflags-y := -Ivirt/kvm -Iarch/s390/kvm kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o -kvm-objs += diag.o gaccess.o guestdbg.o vsie.o +kvm-objs += diag.o gaccess.o guestdbg.o vsie.o pv.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 3fb54ec2cf3e..563429dece03 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -2,7 +2,7 @@ /* * handling diagnose instructions * - * Copyright IBM Corp. 2008, 2011 + * Copyright IBM Corp. 
2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -201,6 +201,10 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } + /* + * no need to check the return value of vcpu_stop as it can only have + * an error for protvirt, but protvirt means user cpu state + */ if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) kvm_s390_vcpu_stop(vcpu); vcpu->run->s390_reset_flags |= KVM_S390_RESET_SUBSYSTEM; diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index 07d30ffcfa41..47a67a958107 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -505,7 +505,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, switch (prot) { case PROT_TYPE_IEP: tec->b61 = 1; - /* FALL THROUGH */ + fallthrough; case PROT_TYPE_LA: tec->b56 = 1; break; @@ -514,12 +514,12 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, break; case PROT_TYPE_ALC: tec->b60 = 1; - /* FALL THROUGH */ + fallthrough; case PROT_TYPE_DAT: tec->b61 = 1; break; } - /* FALL THROUGH */ + fallthrough; case PGM_ASCE_TYPE: case PGM_PAGE_TRANSLATION: case PGM_REGION_FIRST_TRANS: @@ -534,7 +534,7 @@ static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva, tec->addr = gva >> PAGE_SHIFT; tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH; tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as; - /* FALL THROUGH */ + fallthrough; case PGM_ALEN_TRANSLATION: case PGM_ALE_SEQUENCE: case PGM_ASTE_VALIDITY: @@ -677,7 +677,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, dat_protection |= rfte.p; ptr = rfte.rto * PAGE_SIZE + vaddr.rsx * 8; } - /* fallthrough */ + fallthrough; case ASCE_TYPE_REGION2: { union region2_table_entry rste; @@ -695,7 +695,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, dat_protection |= rste.p; ptr = rste.rto * PAGE_SIZE + vaddr.rtx * 8; } - /* fallthrough */ + fallthrough; case ASCE_TYPE_REGION3: { union region3_table_entry rtte; @@ -723,7 +723,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva, dat_protection |= rtte.fc0.p; ptr = rtte.fc0.sto * PAGE_SIZE + vaddr.sx * 8; } - /* fallthrough */ + fallthrough; case ASCE_TYPE_SEGMENT: { union segment_table_entry ste; @@ -1050,7 +1050,8 @@ shadow_r2t: rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake); if (rc) return rc; - } /* fallthrough */ + } + fallthrough; case ASCE_TYPE_REGION2: { union region2_table_entry rste; @@ -1076,7 +1077,8 @@ shadow_r3t: rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake); if (rc) return rc; - } /* fallthrough */ + } + fallthrough; case ASCE_TYPE_REGION3: { union region3_table_entry rtte; @@ -1111,7 +1113,8 @@ shadow_sgt: rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake); if (rc) return rc; - } /* fallthrough */ + } + fallthrough; case ASCE_TYPE_SEGMENT: { union segment_table_entry ste; diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index a389fa85cca2..e7a7c499a73f 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -2,7 +2,7 @@ /* * in-kernel handling for sie intercepts * - * Copyright IBM Corp. 2008, 2014 + * Copyright IBM Corp. 
2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -12,10 +12,10 @@ #include <linux/errno.h> #include <linux/pagemap.h> -#include <asm/kvm_host.h> #include <asm/asm-offsets.h> #include <asm/irq.h> #include <asm/sysinfo.h> +#include <asm/uv.h> #include "kvm-s390.h" #include "gaccess.h" @@ -79,6 +79,10 @@ static int handle_stop(struct kvm_vcpu *vcpu) return rc; } + /* + * no need to check the return value of vcpu_stop as it can only have + * an error for protvirt, but protvirt means user cpu state + */ if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) kvm_s390_vcpu_stop(vcpu); return -EOPNOTSUPP; @@ -231,6 +235,13 @@ static int handle_prog(struct kvm_vcpu *vcpu) vcpu->stat.exit_program_interruption++; + /* + * Intercept 8 indicates a loop of specification exceptions + * for protected guests. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return -EOPNOTSUPP; + if (guestdbg_enabled(vcpu) && per_event(vcpu)) { rc = kvm_s390_handle_per_event(vcpu); if (rc) @@ -384,7 +395,7 @@ int handle_sthyi(struct kvm_vcpu *vcpu) goto out; } - if (addr & ~PAGE_MASK) + if (!kvm_s390_pv_cpu_is_protected(vcpu) && (addr & ~PAGE_MASK)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); sctns = (void *)get_zeroed_page(GFP_KERNEL); @@ -395,10 +406,15 @@ int handle_sthyi(struct kvm_vcpu *vcpu) out: if (!cc) { - r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); - if (r) { - free_page((unsigned long)sctns); - return kvm_s390_inject_prog_cond(vcpu, r); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy((void *)(sida_origin(vcpu->arch.sie_block)), + sctns, PAGE_SIZE); + } else { + r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE); + if (r) { + free_page((unsigned long)sctns); + return kvm_s390_inject_prog_cond(vcpu, r); + } } } @@ -444,6 +460,77 @@ static int handle_operexc(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); } +static int handle_pv_spx(struct kvm_vcpu *vcpu) +{ + u32 pref = *(u32 *)vcpu->arch.sie_block->sidad; + + kvm_s390_set_prefix(vcpu, pref); + trace_kvm_s390_handle_prefix(vcpu, 1, pref); + return 0; +} + +static int handle_pv_sclp(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + + spin_lock(&fi->lock); + /* + * 2 cases: + * a: an sccb answering interrupt was already pending or in flight. + * As the sccb value is not known we can simply set some value to + * trigger delivery of a saved SCCB. UV will then use its saved + * copy of the SCCB value. + * b: an error SCCB interrupt needs to be injected so we also inject + * a fake SCCB address. Firmware will use the proper one. + * This makes sure, that both errors and real sccb returns will only + * be delivered after a notification intercept (instruction has + * finished) but not after others. 
+ */ + fi->srv_signal.ext_params |= 0x43000; + set_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs); + clear_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs); + spin_unlock(&fi->lock); + return 0; +} + +static int handle_pv_uvc(struct kvm_vcpu *vcpu) +{ + struct uv_cb_share *guest_uvcb = (void *)vcpu->arch.sie_block->sidad; + struct uv_cb_cts uvcb = { + .header.cmd = UVC_CMD_UNPIN_PAGE_SHARED, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(vcpu->kvm), + .gaddr = guest_uvcb->paddr, + }; + int rc; + + if (guest_uvcb->header.cmd != UVC_CMD_REMOVE_SHARED_ACCESS) { + WARN_ONCE(1, "Unexpected notification intercept for UVC 0x%x\n", + guest_uvcb->header.cmd); + return 0; + } + rc = gmap_make_secure(vcpu->arch.gmap, uvcb.gaddr, &uvcb); + /* + * If the unpin did not succeed, the guest will exit again for the UVC + * and we will retry the unpin. + */ + if (rc == -EINVAL) + return 0; + return rc; +} + +static int handle_pv_notification(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.sie_block->ipa == 0xb210) + return handle_pv_spx(vcpu); + if (vcpu->arch.sie_block->ipa == 0xb220) + return handle_pv_sclp(vcpu); + if (vcpu->arch.sie_block->ipa == 0xb9a4) + return handle_pv_uvc(vcpu); + + return handle_instruction(vcpu); +} + int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) { int rc, per_rc = 0; @@ -480,6 +567,28 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) case ICPT_KSS: rc = kvm_s390_skey_check_enable(vcpu); break; + case ICPT_MCHKREQ: + case ICPT_INT_ENABLE: + /* + * PSW bit 13 or a CR (0, 6, 14) changed and we might + * now be able to deliver interrupts. The pre-run code + * will take care of this. + */ + rc = 0; + break; + case ICPT_PV_INSTR: + rc = handle_instruction(vcpu); + break; + case ICPT_PV_NOTIFY: + rc = handle_pv_notification(vcpu); + break; + case ICPT_PV_PREF: + rc = 0; + gmap_convert_to_secure(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu)); + gmap_convert_to_secure(vcpu->arch.gmap, + kvm_s390_get_prefix(vcpu) + PAGE_SIZE); + break; default: return -EOPNOTSUPP; } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index c06c89d370a7..8191106bf7b9 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -2,7 +2,7 @@ /* * handling kvm guest interrupts * - * Copyright IBM Corp. 2008, 2015 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> */ @@ -324,8 +324,11 @@ static inline int gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gisc) static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu) { - return vcpu->kvm->arch.float_int.pending_irqs | - vcpu->arch.local_int.pending_irqs; + unsigned long pending = vcpu->kvm->arch.float_int.pending_irqs | + vcpu->arch.local_int.pending_irqs; + + pending &= ~vcpu->kvm->arch.float_int.masked_irqs; + return pending; } static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) @@ -383,10 +386,18 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) __clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &active_mask); if (!(vcpu->arch.sie_block->gcr[0] & CR0_CPU_TIMER_SUBMASK)) __clear_bit(IRQ_PEND_EXT_CPU_TIMER, &active_mask); - if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) + if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) { __clear_bit(IRQ_PEND_EXT_SERVICE, &active_mask); + __clear_bit(IRQ_PEND_EXT_SERVICE_EV, &active_mask); + } if (psw_mchk_disabled(vcpu)) active_mask &= ~IRQ_PEND_MCHK_MASK; + /* PV guest cpus can have a single interruption injected at a time. 
*/ + if (kvm_s390_pv_cpu_is_protected(vcpu) && + vcpu->arch.sie_block->iictl != IICTL_CODE_NONE) + active_mask &= ~(IRQ_PEND_EXT_II_MASK | + IRQ_PEND_IO_MASK | + IRQ_PEND_MCHK_MASK); /* * Check both floating and local interrupt's cr14 because * bit IRQ_PEND_MCHK_REP could be set in both cases. @@ -479,19 +490,23 @@ static void set_intercept_indicators(struct kvm_vcpu *vcpu) static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - int rc; + int rc = 0; vcpu->stat.deliver_cputm++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER, 0, 0); - - rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER, - (u16 *)__LC_EXT_INT_CODE); - rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); - rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_CPU_TIMER; + } else { + rc = put_guest_lc(vcpu, EXT_IRQ_CPU_TIMER, + (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } clear_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs); return rc ? -EFAULT : 0; } @@ -499,19 +514,23 @@ static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu) static int __must_check __deliver_ckc(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - int rc; + int rc = 0; vcpu->stat.deliver_ckc++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP, 0, 0); - - rc = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP, - (u16 __user *)__LC_EXT_INT_CODE); - rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); - rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_CLK_COMP; + } else { + rc = put_guest_lc(vcpu, EXT_IRQ_CLK_COMP, + (u16 __user *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } clear_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs); return rc ? -EFAULT : 0; } @@ -553,6 +572,20 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, union mci mci; int rc; + /* + * All other possible payload for a machine check (e.g. the register + * contents in the save area) will be handled by the ultravisor, as + * the hypervisor does not have the needed information for
+ */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_MCHK; + vcpu->arch.sie_block->mcic = mchk->mcic; + vcpu->arch.sie_block->faddr = mchk->failing_storage_address; + vcpu->arch.sie_block->edc = mchk->ext_damage_code; + return 0; + } + mci.val = mchk->mcic; /* take care of lazy register loading */ save_fpu_regs(); @@ -696,17 +729,21 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) static int __must_check __deliver_restart(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - int rc; + int rc = 0; VCPU_EVENT(vcpu, 3, "%s", "deliver: cpu restart"); vcpu->stat.deliver_restart_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0); - rc = write_guest_lc(vcpu, - offsetof(struct lowcore, restart_old_psw), - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw), - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_RESTART; + } else { + rc = write_guest_lc(vcpu, + offsetof(struct lowcore, restart_old_psw), + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, offsetof(struct lowcore, restart_psw), + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + } clear_bit(IRQ_PEND_RESTART, &li->pending_irqs); return rc ? -EFAULT : 0; } @@ -748,6 +785,12 @@ static int __must_check __deliver_emergency_signal(struct kvm_vcpu *vcpu) vcpu->stat.deliver_emergency_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY, cpu_addr, 0); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_EMERGENCY_SIG; + vcpu->arch.sie_block->extcpuaddr = cpu_addr; + return 0; + } rc = put_guest_lc(vcpu, EXT_IRQ_EMERGENCY_SIG, (u16 *)__LC_EXT_INT_CODE); @@ -776,6 +819,12 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL, extcall.code, 0); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_EXTERNAL_CALL; + vcpu->arch.sie_block->extcpuaddr = extcall.code; + return 0; + } rc = put_guest_lc(vcpu, EXT_IRQ_EXTERNAL_CALL, (u16 *)__LC_EXT_INT_CODE); @@ -787,6 +836,21 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu) return rc ? 
-EFAULT : 0; } +static int __deliver_prog_pv(struct kvm_vcpu *vcpu, u16 code) +{ + switch (code) { + case PGM_SPECIFICATION: + vcpu->arch.sie_block->iictl = IICTL_CODE_SPECIFICATION; + break; + case PGM_OPERAND: + vcpu->arch.sie_block->iictl = IICTL_CODE_OPERAND; + break; + default: + return -EINVAL; + } + return 0; +} + static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; @@ -807,6 +871,10 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, pgm_info.code, 0); + /* PER is handled by the ultravisor */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return __deliver_prog_pv(vcpu, pgm_info.code & ~PGM_PER); + switch (pgm_info.code & ~PGM_PER) { case PGM_AFX_TRANSLATION: case PGM_ASX_TRANSLATION: @@ -818,7 +886,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) case PGM_PRIMARY_AUTHORITY: case PGM_SECONDARY_AUTHORITY: nullifying = true; - /* fall through */ + fallthrough; case PGM_SPACE_SWITCH: rc = put_guest_lc(vcpu, pgm_info.trans_exc_code, (u64 *)__LC_TRANS_EXC_CODE); @@ -902,20 +970,49 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) return rc ? -EFAULT : 0; } +#define SCCB_MASK 0xFFFFFFF8 +#define SCCB_EVENT_PENDING 0x3 + +static int write_sclp(struct kvm_vcpu *vcpu, u32 parm) +{ + int rc; + + if (kvm_s390_pv_cpu_get_handle(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_EXT; + vcpu->arch.sie_block->eic = EXT_IRQ_SERVICE_SIG; + vcpu->arch.sie_block->eiparams = parm; + return 0; + } + + rc = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE); + rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); + rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, + &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); + rc |= put_guest_lc(vcpu, parm, + (u32 *)__LC_EXT_PARAMS); + + return rc ? -EFAULT : 0; +} + static int __must_check __deliver_service(struct kvm_vcpu *vcpu) { struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; struct kvm_s390_ext_info ext; - int rc = 0; spin_lock(&fi->lock); - if (!(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) { + if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs) || + !(test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs))) { spin_unlock(&fi->lock); return 0; } ext = fi->srv_signal; memset(&fi->srv_signal, 0, sizeof(ext)); clear_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs); + clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + if (kvm_s390_pv_cpu_is_protected(vcpu)) + set_bit(IRQ_PEND_EXT_SERVICE, &fi->masked_irqs); spin_unlock(&fi->lock); VCPU_EVENT(vcpu, 4, "deliver: sclp parameter 0x%x", @@ -924,16 +1021,31 @@ static int __must_check __deliver_service(struct kvm_vcpu *vcpu) trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE, ext.ext_params, 0); - rc = put_guest_lc(vcpu, EXT_IRQ_SERVICE_SIG, (u16 *)__LC_EXT_INT_CODE); - rc |= put_guest_lc(vcpu, 0, (u16 *)__LC_EXT_CPU_ADDR); - rc |= write_guest_lc(vcpu, __LC_EXT_OLD_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= read_guest_lc(vcpu, __LC_EXT_NEW_PSW, - &vcpu->arch.sie_block->gpsw, sizeof(psw_t)); - rc |= put_guest_lc(vcpu, ext.ext_params, - (u32 *)__LC_EXT_PARAMS); + return write_sclp(vcpu, ext.ext_params); +} - return rc ? 
-EFAULT : 0; +static int __must_check __deliver_service_ev(struct kvm_vcpu *vcpu) +{ + struct kvm_s390_float_interrupt *fi = &vcpu->kvm->arch.float_int; + struct kvm_s390_ext_info ext; + + spin_lock(&fi->lock); + if (!(test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs))) { + spin_unlock(&fi->lock); + return 0; + } + ext = fi->srv_signal; + /* only clear the event bit */ + fi->srv_signal.ext_params &= ~SCCB_EVENT_PENDING; + clear_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + spin_unlock(&fi->lock); + + VCPU_EVENT(vcpu, 4, "%s", "deliver: sclp parameter event"); + vcpu->stat.deliver_service_signal++; + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE, + ext.ext_params, 0); + + return write_sclp(vcpu, SCCB_EVENT_PENDING); } static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu) @@ -1028,6 +1140,15 @@ static int __do_deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_io_info *io) { int rc; + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->iictl = IICTL_CODE_IO; + vcpu->arch.sie_block->subchannel_id = io->subchannel_id; + vcpu->arch.sie_block->subchannel_nr = io->subchannel_nr; + vcpu->arch.sie_block->io_int_parm = io->io_int_parm; + vcpu->arch.sie_block->io_int_word = io->io_int_word; + return 0; + } + rc = put_guest_lc(vcpu, io->subchannel_id, (u16 *)__LC_SUBCHANNEL_ID); rc |= put_guest_lc(vcpu, io->subchannel_nr, (u16 *)__LC_SUBCHANNEL_NR); rc |= put_guest_lc(vcpu, io->io_int_parm, (u32 *)__LC_IO_INT_PARM); @@ -1329,6 +1450,9 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) case IRQ_PEND_EXT_SERVICE: rc = __deliver_service(vcpu); break; + case IRQ_PEND_EXT_SERVICE_EV: + rc = __deliver_service_ev(vcpu); + break; case IRQ_PEND_PFAULT_DONE: rc = __deliver_pfault_done(vcpu); break; @@ -1421,7 +1545,7 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) if (kvm_get_vcpu_by_id(vcpu->kvm, src_id) == NULL) return -EINVAL; - if (sclp.has_sigpif) + if (sclp.has_sigpif && !kvm_s390_pv_cpu_get_handle(vcpu)) return sca_inject_ext_call(vcpu, src_id); if (test_and_set_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs)) @@ -1681,9 +1805,6 @@ out: return inti; } -#define SCCB_MASK 0xFFFFFFF8 -#define SCCB_EVENT_PENDING 0x3 - static int __inject_service(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) { @@ -1692,6 +1813,11 @@ static int __inject_service(struct kvm *kvm, kvm->stat.inject_service_signal++; spin_lock(&fi->lock); fi->srv_signal.ext_params |= inti->ext.ext_params & SCCB_EVENT_PENDING; + + /* We always allow events, track them separately from the sccb ints */ + if (fi->srv_signal.ext_params & SCCB_EVENT_PENDING) + set_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs); + /* * Early versions of the QEMU s390 bios will inject several * service interrupts after another without handling a @@ -1773,7 +1899,14 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti) kvm->stat.inject_io++; isc = int_word_to_isc(inti->io.io_int_word); - if (gi->origin && inti->type & KVM_S390_INT_IO_AI_MASK) { + /* + * Do not make use of gisa in protected mode. We do not use the lock + * checking variant as this is just a performance optimization and we + * do not hold the lock here. This is ok as the code will pick + * interrupts from both "lists" for delivery. 
+ */ + if (!kvm_s390_pv_get_handle(kvm) && + gi->origin && inti->type & KVM_S390_INT_IO_AI_MASK) { VM_EVENT(kvm, 4, "%s isc %1u", "inject: I/O (AI/gisa)", isc); gisa_set_ipm_gisc(gi->origin, isc); kfree(inti); @@ -1834,7 +1967,8 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: if (!(type & KVM_S390_INT_IO_AI_MASK && - kvm->arch.gisa_int.origin)) + kvm->arch.gisa_int.origin) || + kvm_s390_pv_cpu_get_handle(dst_vcpu)) kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT); break; default: @@ -2080,6 +2214,10 @@ void kvm_s390_clear_float_irqs(struct kvm *kvm) struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; int i; + mutex_lock(&kvm->lock); + if (!kvm_s390_pv_is_protected(kvm)) + fi->masked_irqs = 0; + mutex_unlock(&kvm->lock); spin_lock(&fi->lock); fi->pending_irqs = 0; memset(&fi->srv_signal, 0, sizeof(fi->srv_signal)); @@ -2146,7 +2284,8 @@ static int get_all_floating_irqs(struct kvm *kvm, u8 __user *usrbuf, u64 len) n++; } } - if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs)) { + if (test_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs) || + test_bit(IRQ_PEND_EXT_SERVICE_EV, &fi->pending_irqs)) { if (n == max_irqs) { /* signal userspace to try again */ ret = -ENOMEM; @@ -2327,9 +2466,6 @@ static int register_io_adapter(struct kvm_device *dev, if (!adapter) return -ENOMEM; - INIT_LIST_HEAD(&adapter->maps); - init_rwsem(&adapter->maps_lock); - atomic_set(&adapter->nr_maps, 0); adapter->id = adapter_info.id; adapter->isc = adapter_info.isc; adapter->maskable = adapter_info.maskable; @@ -2354,87 +2490,12 @@ int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked) return ret; } -static int kvm_s390_adapter_map(struct kvm *kvm, unsigned int id, __u64 addr) -{ - struct s390_io_adapter *adapter = get_io_adapter(kvm, id); - struct s390_map_info *map; - int ret; - - if (!adapter || !addr) - return -EINVAL; - - map = kzalloc(sizeof(*map), GFP_KERNEL); - if (!map) { - ret = -ENOMEM; - goto out; - } - INIT_LIST_HEAD(&map->list); - map->guest_addr = addr; - map->addr = gmap_translate(kvm->arch.gmap, addr); - if (map->addr == -EFAULT) { - ret = -EFAULT; - goto out; - } - ret = get_user_pages_fast(map->addr, 1, FOLL_WRITE, &map->page); - if (ret < 0) - goto out; - BUG_ON(ret != 1); - down_write(&adapter->maps_lock); - if (atomic_inc_return(&adapter->nr_maps) < MAX_S390_ADAPTER_MAPS) { - list_add_tail(&map->list, &adapter->maps); - ret = 0; - } else { - put_page(map->page); - ret = -EINVAL; - } - up_write(&adapter->maps_lock); -out: - if (ret) - kfree(map); - return ret; -} - -static int kvm_s390_adapter_unmap(struct kvm *kvm, unsigned int id, __u64 addr) -{ - struct s390_io_adapter *adapter = get_io_adapter(kvm, id); - struct s390_map_info *map, *tmp; - int found = 0; - - if (!adapter || !addr) - return -EINVAL; - - down_write(&adapter->maps_lock); - list_for_each_entry_safe(map, tmp, &adapter->maps, list) { - if (map->guest_addr == addr) { - found = 1; - atomic_dec(&adapter->nr_maps); - list_del(&map->list); - put_page(map->page); - kfree(map); - break; - } - } - up_write(&adapter->maps_lock); - - return found ? 
0 : -EINVAL; -} - void kvm_s390_destroy_adapters(struct kvm *kvm) { int i; - struct s390_map_info *map, *tmp; - for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) { - if (!kvm->arch.adapters[i]) - continue; - list_for_each_entry_safe(map, tmp, - &kvm->arch.adapters[i]->maps, list) { - list_del(&map->list); - put_page(map->page); - kfree(map); - } + for (i = 0; i < MAX_S390_IO_ADAPTERS; i++) kfree(kvm->arch.adapters[i]); - } } static int modify_io_adapter(struct kvm_device *dev, @@ -2456,11 +2517,14 @@ static int modify_io_adapter(struct kvm_device *dev, if (ret > 0) ret = 0; break; + /* + * The following operations are no longer needed and therefore no-ops. + * The gpa to hva translation is done when an IRQ route is set up. The + * set_irq code uses get_user_pages_remote() to do the actual write. + */ case KVM_S390_IO_ADAPTER_MAP: - ret = kvm_s390_adapter_map(dev->kvm, req.id, req.addr); - break; case KVM_S390_IO_ADAPTER_UNMAP: - ret = kvm_s390_adapter_unmap(dev->kvm, req.id, req.addr); + ret = 0; break; default: ret = -EINVAL; @@ -2699,19 +2763,15 @@ static unsigned long get_ind_bit(__u64 addr, unsigned long bit_nr, bool swap) return swap ? (bit ^ (BITS_PER_LONG - 1)) : bit; } -static struct s390_map_info *get_map_info(struct s390_io_adapter *adapter, - u64 addr) +static struct page *get_map_page(struct kvm *kvm, u64 uaddr) { - struct s390_map_info *map; + struct page *page = NULL; - if (!adapter) - return NULL; - - list_for_each_entry(map, &adapter->maps, list) { - if (map->guest_addr == addr) - return map; - } - return NULL; + down_read(&kvm->mm->mmap_sem); + get_user_pages_remote(NULL, kvm->mm, uaddr, 1, FOLL_WRITE, + &page, NULL, NULL); + up_read(&kvm->mm->mmap_sem); + return page; } static int adapter_indicators_set(struct kvm *kvm, @@ -2720,30 +2780,35 @@ static int adapter_indicators_set(struct kvm *kvm, { unsigned long bit; int summary_set, idx; - struct s390_map_info *info; + struct page *ind_page, *summary_page; void *map; - info = get_map_info(adapter, adapter_int->ind_addr); - if (!info) + ind_page = get_map_page(kvm, adapter_int->ind_addr); + if (!ind_page) return -1; - map = page_address(info->page); - bit = get_ind_bit(info->addr, adapter_int->ind_offset, adapter->swap); - set_bit(bit, map); - idx = srcu_read_lock(&kvm->srcu); - mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT); - set_page_dirty_lock(info->page); - info = get_map_info(adapter, adapter_int->summary_addr); - if (!info) { - srcu_read_unlock(&kvm->srcu, idx); + summary_page = get_map_page(kvm, adapter_int->summary_addr); + if (!summary_page) { + put_page(ind_page); return -1; } - map = page_address(info->page); - bit = get_ind_bit(info->addr, adapter_int->summary_offset, - adapter->swap); + + idx = srcu_read_lock(&kvm->srcu); + map = page_address(ind_page); + bit = get_ind_bit(adapter_int->ind_addr, + adapter_int->ind_offset, adapter->swap); + set_bit(bit, map); + mark_page_dirty(kvm, adapter_int->ind_addr >> PAGE_SHIFT); + set_page_dirty_lock(ind_page); + map = page_address(summary_page); + bit = get_ind_bit(adapter_int->summary_addr, + adapter_int->summary_offset, adapter->swap); summary_set = test_and_set_bit(bit, map); - mark_page_dirty(kvm, info->guest_addr >> PAGE_SHIFT); - set_page_dirty_lock(info->page); + mark_page_dirty(kvm, adapter_int->summary_addr >> PAGE_SHIFT); + set_page_dirty_lock(summary_page); srcu_read_unlock(&kvm->srcu, idx); + + put_page(ind_page); + put_page(summary_page); return summary_set ? 
0 : 1; } @@ -2765,9 +2830,7 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, adapter = get_io_adapter(kvm, e->adapter.adapter_id); if (!adapter) return -1; - down_read(&adapter->maps_lock); ret = adapter_indicators_set(kvm, adapter, &e->adapter); - up_read(&adapter->maps_lock); if ((ret > 0) && !adapter->masked) { ret = kvm_s390_inject_airq(kvm, adapter); if (ret == 0) @@ -2818,23 +2881,27 @@ int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - int ret; + u64 uaddr; switch (ue->type) { + /* we store the userspace addresses instead of the guest addresses */ case KVM_IRQ_ROUTING_S390_ADAPTER: e->set = set_adapter_int; - e->adapter.summary_addr = ue->u.adapter.summary_addr; - e->adapter.ind_addr = ue->u.adapter.ind_addr; + uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.summary_addr); + if (uaddr == -EFAULT) + return -EFAULT; + e->adapter.summary_addr = uaddr; + uaddr = gmap_translate(kvm->arch.gmap, ue->u.adapter.ind_addr); + if (uaddr == -EFAULT) + return -EFAULT; + e->adapter.ind_addr = uaddr; e->adapter.summary_offset = ue->u.adapter.summary_offset; e->adapter.ind_offset = ue->u.adapter.ind_offset; e->adapter.adapter_id = ue->u.adapter.adapter_id; - ret = 0; - break; + return 0; default: - ret = -EINVAL; + return -EINVAL; } - - return ret; } int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index c2e6d4ba4e23..19a81024fe16 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2,7 +2,7 @@ /* * hosting IBM Z kernel virtual machines (s390x) * - * Copyright IBM Corp. 2008, 2018 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -44,6 +44,7 @@ #include <asm/cpacf.h> #include <asm/timex.h> #include <asm/ap.h> +#include <asm/uv.h> #include "kvm-s390.h" #include "gaccess.h" @@ -184,6 +185,11 @@ static u8 halt_poll_max_steal = 10; module_param(halt_poll_max_steal, byte, 0644); MODULE_PARM_DESC(halt_poll_max_steal, "Maximum percentage of steal time to allow polling"); +/* if set to true, the GISA will be initialized and used if available */ +static bool use_gisa = true; +module_param(use_gisa, bool, 0644); +MODULE_PARM_DESC(use_gisa, "Use the GISA if the host supports it."); + /* * For now we handle at most 16 double words as this is what the s390 base * kernel handles and stores in the prefix page. 
If we ever need to go beyond @@ -220,6 +226,7 @@ static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc; static struct gmap_notifier gmap_notifier; static struct gmap_notifier vsie_gmap_notifier; debug_info_t *kvm_s390_dbf; +debug_info_t *kvm_s390_dbf_uv; /* Section: not file related */ int kvm_arch_hardware_enable(void) @@ -228,13 +235,15 @@ int kvm_arch_hardware_enable(void) return 0; } -int kvm_arch_check_processor_compat(void) +int kvm_arch_check_processor_compat(void *opaque) { return 0; } +/* forward declarations */ static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); +static int sca_switch_to_extended(struct kvm *kvm); static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta) { @@ -293,7 +302,7 @@ static struct notifier_block kvm_clock_notifier = { .notifier_call = kvm_clock_sync, }; -int kvm_arch_hardware_setup(void) +int kvm_arch_hardware_setup(void *opaque) { gmap_notifier.notifier_call = kvm_gmap_notifier; gmap_register_pte_notifier(&gmap_notifier); @@ -460,7 +469,12 @@ int kvm_arch_init(void *opaque) if (!kvm_s390_dbf) return -ENOMEM; - if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view)) + kvm_s390_dbf_uv = debug_register("kvm-uv", 32, 1, 7 * sizeof(long)); + if (!kvm_s390_dbf_uv) + goto out; + + if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view) || + debug_register_view(kvm_s390_dbf_uv, &debug_sprintf_view)) goto out; kvm_s390_cpu_feat_init(); @@ -487,6 +501,7 @@ void kvm_arch_exit(void) { kvm_s390_gib_destroy(); debug_unregister(kvm_s390_dbf); + debug_unregister(kvm_s390_dbf_uv); } /* Section: device related */ @@ -564,14 +579,16 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_BPB: r = test_facility(82); break; + case KVM_CAP_S390_PROTECTED: + r = is_prot_virt_host(); + break; default: r = 0; } return r; } -static void kvm_s390_sync_dirty_log(struct kvm *kvm, - struct kvm_memory_slot *memslot) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { int i; gfn_t cur_gfn, last_gfn; @@ -612,9 +629,8 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, { int r; unsigned long n; - struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - int is_dirty = 0; + int is_dirty; if (kvm_is_ucontrol(kvm)) return -EINVAL; @@ -625,14 +641,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, if (log->slot >= KVM_USER_MEM_SLOTS) goto out; - slots = kvm_memslots(kvm); - memslot = id_to_memslot(slots, log->slot); - r = -ENOENT; - if (!memslot->dirty_bitmap) - goto out; - - kvm_s390_sync_dirty_log(kvm, memslot); - r = kvm_get_dirty_log(kvm, log, &is_dirty); + r = kvm_get_dirty_log(kvm, log, &is_dirty, &memslot); if (r) goto out; @@ -1993,6 +2002,9 @@ static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args, struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *ms; + if (unlikely(!slots->used_slots)) + return 0; + cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn); ms = gfn_to_memslot(kvm, cur_gfn); args->count = 0; @@ -2158,6 +2170,194 @@ out: return r; } +static int kvm_s390_cpus_from_pv(struct kvm *kvm, u16 *rcp, u16 *rrcp) +{ + struct kvm_vcpu *vcpu; + u16 rc, rrc; + int ret = 0; + int i; + + /* + * We ignore failures and try to destroy as many CPUs as possible. + * At the same time we must not free the assigned resources when + * this fails, as the ultravisor has still access to that memory. + * So kvm_s390_pv_destroy_cpu can leave a "wanted" memory leak + * behind. 
+ * We want to return the first failure rc and rrc, though. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + if (kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc) && !ret) { + *rcp = rc; + *rrcp = rrc; + ret = -EIO; + } + mutex_unlock(&vcpu->mutex); + } + return ret; +} + +static int kvm_s390_cpus_to_pv(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + int i, r = 0; + u16 dummy; + + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_lock(&vcpu->mutex); + r = kvm_s390_pv_create_cpu(vcpu, rc, rrc); + mutex_unlock(&vcpu->mutex); + if (r) + break; + } + if (r) + kvm_s390_cpus_from_pv(kvm, &dummy, &dummy); + return r; +} + +static int kvm_s390_handle_pv(struct kvm *kvm, struct kvm_pv_cmd *cmd) +{ + int r = 0; + u16 dummy; + void __user *argp = (void __user *)cmd->data; + + switch (cmd->cmd) { + case KVM_PV_ENABLE: { + r = -EINVAL; + if (kvm_s390_pv_is_protected(kvm)) + break; + + /* + * FMT 4 SIE needs esca. As we never switch back to bsca from + * esca, we need no cleanup in the error cases below + */ + r = sca_switch_to_extended(kvm); + if (r) + break; + + down_write(¤t->mm->mmap_sem); + r = gmap_mark_unmergeable(); + up_write(¤t->mm->mmap_sem); + if (r) + break; + + r = kvm_s390_pv_init_vm(kvm, &cmd->rc, &cmd->rrc); + if (r) + break; + + r = kvm_s390_cpus_to_pv(kvm, &cmd->rc, &cmd->rrc); + if (r) + kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); + + /* we need to block service interrupts from now on */ + set_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + } + case KVM_PV_DISABLE: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = kvm_s390_cpus_from_pv(kvm, &cmd->rc, &cmd->rrc); + /* + * If a CPU could not be destroyed, destroy VM will also fail. + * There is no point in trying to destroy it. Instead return + * the rc and rrc from the first CPU that failed destroying. 
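For orientation, the command flow that kvm_s390_handle_pv() implements in this hunk is driven from userspace through the new KVM_S390_PV_COMMAND VM ioctl. The sketch below is hypothetical glue code, not part of this patch: pv_cmd(), launch_protected(), the file descriptors and the header/image addresses are made-up placeholders with minimal error handling; it only strings the commands together in the order the handler expects (enable, set the SE header, unpack, verify).

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int pv_cmd(int vm_fd, __u32 cmd, void *data)
{
	/* flags must be 0; rc/rrc are filled in by the kernel on failure */
	struct kvm_pv_cmd pv = { .cmd = cmd, .data = (__u64)(unsigned long)data };
	int r = ioctl(vm_fd, KVM_S390_PV_COMMAND, &pv);

	if (r)
		fprintf(stderr, "PV cmd %u: rc 0x%x rrc 0x%x\n", cmd, pv.rc, pv.rrc);
	return r;
}

/* hdr_uaddr/hdr_len: SE header in userspace memory; img_gaddr/img_len: encrypted image in guest memory */
static int launch_protected(int kvm_fd, int vm_fd, void *hdr_uaddr, __u64 hdr_len,
			    __u64 img_gaddr, __u64 img_len, __u64 tweak)
{
	struct kvm_s390_pv_sec_parm parm = {
		.origin = (__u64)(unsigned long)hdr_uaddr,
		.length = hdr_len,
	};
	struct kvm_s390_pv_unp unp = {
		.addr = img_gaddr, .size = img_len, .tweak = tweak,
	};

	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_S390_PROTECTED) <= 0)
		return -1;				/* host cannot run protected guests */
	if (pv_cmd(vm_fd, KVM_PV_ENABLE, NULL))		/* donate memory, convert VM and vCPUs */
		return -1;
	if (pv_cmd(vm_fd, KVM_PV_SET_SEC_PARMS, &parm))	/* hand the SE header to the UV */
		return -1;
	if (pv_cmd(vm_fd, KVM_PV_UNPACK, &unp))		/* unpack/decrypt the boot image */
		return -1;
	return pv_cmd(vm_fd, KVM_PV_VERIFY, NULL);	/* let the UV verify the image */
}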
+ */ + if (r) + break; + r = kvm_s390_pv_deinit_vm(kvm, &cmd->rc, &cmd->rrc); + + /* no need to block service interrupts any more */ + clear_bit(IRQ_PEND_EXT_SERVICE, &kvm->arch.float_int.masked_irqs); + break; + } + case KVM_PV_SET_SEC_PARMS: { + struct kvm_s390_pv_sec_parm parms = {}; + void *hdr; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&parms, argp, sizeof(parms))) + break; + + /* Currently restricted to 8KB */ + r = -EINVAL; + if (parms.length > PAGE_SIZE * 2) + break; + + r = -ENOMEM; + hdr = vmalloc(parms.length); + if (!hdr) + break; + + r = -EFAULT; + if (!copy_from_user(hdr, (void __user *)parms.origin, + parms.length)) + r = kvm_s390_pv_set_sec_parms(kvm, hdr, parms.length, + &cmd->rc, &cmd->rrc); + + vfree(hdr); + break; + } + case KVM_PV_UNPACK: { + struct kvm_s390_pv_unp unp = {}; + + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = -EFAULT; + if (copy_from_user(&unp, argp, sizeof(unp))) + break; + + r = kvm_s390_pv_unpack(kvm, unp.addr, unp.size, unp.tweak, + &cmd->rc, &cmd->rrc); + break; + } + case KVM_PV_VERIFY: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_VERIFY_IMG, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT VERIFY: rc %x rrc %x", cmd->rc, + cmd->rrc); + break; + } + case KVM_PV_PREP_RESET: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_PREPARE_RESET, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT PREP RESET: rc %x rrc %x", + cmd->rc, cmd->rrc); + break; + } + case KVM_PV_UNSHARE_ALL: { + r = -EINVAL; + if (!kvm_s390_pv_is_protected(kvm)) + break; + + r = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_SET_UNSHARE_ALL, &cmd->rc, &cmd->rrc); + KVM_UV_EVENT(kvm, 3, "PROTVIRT UNSHARE: rc %x rrc %x", + cmd->rc, cmd->rrc); + break; + } + default: + r = -ENOTTY; + } + return r; +} + long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -2255,6 +2455,33 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_unlock(&kvm->slots_lock); break; } + case KVM_S390_PV_COMMAND: { + struct kvm_pv_cmd args; + + /* protvirt means user sigp */ + kvm->arch.user_cpu_state_ctrl = 1; + r = 0; + if (!is_prot_virt_host()) { + r = -EINVAL; + break; + } + if (copy_from_user(&args, argp, sizeof(args))) { + r = -EFAULT; + break; + } + if (args.flags) { + r = -EINVAL; + break; + } + mutex_lock(&kvm->lock); + r = kvm_s390_handle_pv(kvm, &args); + mutex_unlock(&kvm->lock); + if (copy_to_user(argp, &args, sizeof(args))) { + r = -EFAULT; + break; + } + break; + } default: r = -ENOTTY; } @@ -2504,7 +2731,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.use_skf = sclp.has_skey; spin_lock_init(&kvm->arch.start_stop_lock); kvm_s390_vsie_init(kvm); - kvm_s390_gisa_init(kvm); + if (use_gisa) + kvm_s390_gisa_init(kvm); KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid); return 0; @@ -2518,6 +2746,8 @@ out_err: void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { + u16 rc, rrc; + VCPU_EVENT(vcpu, 3, "%s", "free cpu"); trace_kvm_s390_destroy_vcpu(vcpu->vcpu_id); kvm_s390_clear_local_irqs(vcpu); @@ -2530,6 +2760,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); + /* We can not hold the vcpu mutex here, we are already dying */ + if (kvm_s390_pv_cpu_get_handle(vcpu)) + kvm_s390_pv_destroy_cpu(vcpu, &rc, &rrc); free_page((unsigned 
long)(vcpu->arch.sie_block)); } @@ -2551,10 +2784,20 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + u16 rc, rrc; + kvm_free_vcpus(kvm); sca_dispose(kvm); - debug_unregister(kvm->arch.dbf); kvm_s390_gisa_destroy(kvm); + /* + * We are already at the end of life and kvm->lock is not taken. + * This is ok as the file descriptor is closed by now and nobody + * can mess with the pv state. To avoid lockdep_assert_held from + * complaining we do not use kvm_s390_pv_is_protected. + */ + if (kvm_s390_pv_get_handle(kvm)) + kvm_s390_pv_deinit_vm(kvm, &rc, &rrc); + debug_unregister(kvm->arch.dbf); free_page((unsigned long)kvm->arch.sie_page2); if (!kvm_is_ucontrol(kvm)) gmap_remove(kvm->arch.gmap); @@ -2650,6 +2893,9 @@ static int sca_switch_to_extended(struct kvm *kvm) unsigned int vcpu_idx; u32 scaol, scaoh; + if (kvm->arch.use_esca) + return 0; + new_sca = alloc_pages_exact(sizeof(*new_sca), GFP_KERNEL|__GFP_ZERO); if (!new_sca) return -ENOMEM; @@ -2901,6 +3147,7 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu) static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) { int rc = 0; + u16 uvrc, uvrrc; atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | CPUSTAT_SM | @@ -2968,6 +3215,14 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) kvm_s390_vcpu_crypto_setup(vcpu); + mutex_lock(&vcpu->kvm->lock); + if (kvm_s390_pv_is_protected(vcpu->kvm)) { + rc = kvm_s390_pv_create_cpu(vcpu, &uvrc, &uvrrc); + if (rc) + kvm_s390_vcpu_unsetup_cmma(vcpu); + } + mutex_unlock(&vcpu->kvm->lock); + return rc; } @@ -3277,7 +3532,6 @@ static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) kvm_s390_set_prefix(vcpu, 0); kvm_s390_set_cpu_timer(vcpu, 0); vcpu->arch.sie_block->ckc = 0; - vcpu->arch.sie_block->todpr = 0; memset(vcpu->arch.sie_block->gcr, 0, sizeof(vcpu->arch.sie_block->gcr)); vcpu->arch.sie_block->gcr[0] = CR0_INITIAL_MASK; vcpu->arch.sie_block->gcr[14] = CR14_INITIAL_MASK; @@ -3295,9 +3549,17 @@ static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) vcpu->run->s.regs.pp = 0; vcpu->run->s.regs.gbea = 1; vcpu->run->s.regs.fpc = 0; - vcpu->arch.sie_block->gbea = 1; - vcpu->arch.sie_block->pp = 0; - vcpu->arch.sie_block->fpf &= ~FPF_BPBC; + /* + * Do not reset these registers in the protected case, as some of + * them are overlayed and they are not accessible in this case + * anyway. 
+ */ + if (!kvm_s390_pv_cpu_is_protected(vcpu)) { + vcpu->arch.sie_block->gbea = 1; + vcpu->arch.sie_block->pp = 0; + vcpu->arch.sie_block->fpf &= ~FPF_BPBC; + vcpu->arch.sie_block->todpr = 0; + } } static void kvm_arch_vcpu_ioctl_clear_reset(struct kvm_vcpu *vcpu) @@ -3487,14 +3749,20 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, switch (mp_state->mp_state) { case KVM_MP_STATE_STOPPED: - kvm_s390_vcpu_stop(vcpu); + rc = kvm_s390_vcpu_stop(vcpu); break; case KVM_MP_STATE_OPERATING: - kvm_s390_vcpu_start(vcpu); + rc = kvm_s390_vcpu_start(vcpu); break; case KVM_MP_STATE_LOAD: + if (!kvm_s390_pv_cpu_is_protected(vcpu)) { + rc = -ENXIO; + break; + } + rc = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR_LOAD); + break; case KVM_MP_STATE_CHECK_STOP: - /* fall through - CHECK_STOP and LOAD are not supported yet */ + fallthrough; /* CHECK_STOP and LOAD are not supported yet */ default: rc = -ENXIO; } @@ -3844,9 +4112,11 @@ static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason) return vcpu_post_run_fault_in_sie(vcpu); } +#define PSW_INT_MASK (PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_MCHECK) static int __vcpu_run(struct kvm_vcpu *vcpu) { int rc, exit_reason; + struct sie_page *sie_page = (struct sie_page *)vcpu->arch.sie_block; /* * We try to hold kvm->srcu during most of vcpu_run (except when run- @@ -3868,8 +4138,28 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) guest_enter_irqoff(); __disable_cpu_timer_accounting(vcpu); local_irq_enable(); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(sie_page->pv_grregs, + vcpu->run->s.regs.gprs, + sizeof(sie_page->pv_grregs)); + } exit_reason = sie64a(vcpu->arch.sie_block, vcpu->run->s.regs.gprs); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy(vcpu->run->s.regs.gprs, + sie_page->pv_grregs, + sizeof(sie_page->pv_grregs)); + /* + * We're not allowed to inject interrupts on intercepts + * that leave the guest state in an "in-between" state + * where the next SIE entry will do a continuation. + * Fence interrupts in our "internal" PSW. 
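One user-visible consequence of the mp_state change earlier in this file: a VMM can now put a protected vCPU into the load state through the ordinary KVM_SET_MP_STATE ioctl, which the kernel translates into PV_CPU_STATE_OPR_LOAD. A minimal, hypothetical sketch (vcpu_fd is an assumed vCPU file descriptor, not something this patch adds):

	struct kvm_mp_state mp = { .mp_state = KVM_MP_STATE_LOAD };

	/* only accepted for protected vCPUs; plain guests still get -ENXIO */
	if (ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp))
		perror("KVM_SET_MP_STATE");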
+ */ + if (vcpu->arch.sie_block->icptcode == ICPT_PV_INSTR || + vcpu->arch.sie_block->icptcode == ICPT_PV_PREF) { + vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK; + } + } local_irq_disable(); __enable_cpu_timer_accounting(vcpu); guest_exit_irqoff(); @@ -3883,7 +4173,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) return rc; } -static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void sync_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct runtime_instr_cb *riccb; struct gs_cb *gscb; @@ -3892,16 +4182,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) gscb = (struct gs_cb *) &kvm_run->s.regs.gscb; vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask; vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr; - if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) - kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); - if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) { - memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128); - /* some control register changes require a tlb flush */ - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); - } if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) { - kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm); - vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc; vcpu->arch.sie_block->todpr = kvm_run->s.regs.todpr; vcpu->arch.sie_block->pp = kvm_run->s.regs.pp; vcpu->arch.sie_block->gbea = kvm_run->s.regs.gbea; @@ -3942,6 +4223,36 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.sie_block->fpf &= ~FPF_BPBC; vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? FPF_BPBC : 0; } + if (MACHINE_HAS_GS) { + preempt_disable(); + __ctl_set_bit(2, 4); + if (current->thread.gs_cb) { + vcpu->arch.host_gscb = current->thread.gs_cb; + save_gs_cb(vcpu->arch.host_gscb); + } + if (vcpu->arch.gs_enabled) { + current->thread.gs_cb = (struct gs_cb *) + &vcpu->run->s.regs.gscb; + restore_gs_cb(current->thread.gs_cb); + } + preempt_enable(); + } + /* SIE will load etoken directly from SDNX and therefore kvm_run */ +} + +static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) + kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); + if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) { + memcpy(&vcpu->arch.sie_block->gcr, &kvm_run->s.regs.crs, 128); + /* some control register changes require a tlb flush */ + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) { + kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm); + vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc; + } save_access_regs(vcpu->arch.host_acrs); restore_access_regs(vcpu->run->s.regs.acrs); /* save host (userspace) fprs/vrs */ @@ -3956,23 +4267,47 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (test_fp_ctl(current->thread.fpu.fpc)) /* User space provided an invalid FPC, let's clear it */ current->thread.fpu.fpc = 0; + + /* Sync fmt2 only data */ + if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) { + sync_regs_fmt2(vcpu, kvm_run); + } else { + /* + * In several places we have to modify our internal view to + * not do things that are disallowed by the ultravisor. For + * example we must not inject interrupts after specific exits + * (e.g. 112 prefix page not secure). We do this by turning + * off the machine check, external and I/O interrupt bits + * of our PSW copy. To avoid getting validity intercepts, we + * do only accept the condition code from userspace. 
+ */ + vcpu->arch.sie_block->gpsw.mask &= ~PSW_MASK_CC; + vcpu->arch.sie_block->gpsw.mask |= kvm_run->psw_mask & + PSW_MASK_CC; + } + + kvm_run->kvm_dirty_regs = 0; +} + +static void store_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr; + kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; + kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; + kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; if (MACHINE_HAS_GS) { - preempt_disable(); __ctl_set_bit(2, 4); - if (current->thread.gs_cb) { - vcpu->arch.host_gscb = current->thread.gs_cb; - save_gs_cb(vcpu->arch.host_gscb); - } - if (vcpu->arch.gs_enabled) { - current->thread.gs_cb = (struct gs_cb *) - &vcpu->run->s.regs.gscb; - restore_gs_cb(current->thread.gs_cb); - } + if (vcpu->arch.gs_enabled) + save_gs_cb(current->thread.gs_cb); + preempt_disable(); + current->thread.gs_cb = vcpu->arch.host_gscb; + restore_gs_cb(vcpu->arch.host_gscb); preempt_enable(); + if (!vcpu->arch.host_gscb) + __ctl_clear_bit(2, 4); + vcpu->arch.host_gscb = NULL; } - /* SIE will load etoken directly from SDNX and therefore kvm_run */ - - kvm_run->kvm_dirty_regs = 0; + /* SIE will save etoken directly into SDNX and therefore kvm_run */ } static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -3983,13 +4318,9 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128); kvm_run->s.regs.cputm = kvm_s390_get_cpu_timer(vcpu); kvm_run->s.regs.ckc = vcpu->arch.sie_block->ckc; - kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr; - kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; - kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; kvm_run->s.regs.pft = vcpu->arch.pfault_token; kvm_run->s.regs.pfs = vcpu->arch.pfault_select; kvm_run->s.regs.pfc = vcpu->arch.pfault_compare; - kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; save_access_regs(vcpu->run->s.regs.acrs); restore_access_regs(vcpu->arch.host_acrs); /* Save guest register state */ @@ -3998,19 +4329,8 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* Restore will be done lazily at return */ current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc; current->thread.fpu.regs = vcpu->arch.host_fpregs.regs; - if (MACHINE_HAS_GS) { - __ctl_set_bit(2, 4); - if (vcpu->arch.gs_enabled) - save_gs_cb(current->thread.gs_cb); - preempt_disable(); - current->thread.gs_cb = vcpu->arch.host_gscb; - restore_gs_cb(vcpu->arch.host_gscb); - preempt_enable(); - if (!vcpu->arch.host_gscb) - __ctl_clear_bit(2, 4); - vcpu->arch.host_gscb = NULL; - } - /* SIE will save etoken directly into SDNX and therefore kvm_run */ + if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) + store_regs_fmt2(vcpu, kvm_run); } int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -4034,6 +4354,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_sigset_activate(vcpu); + /* + * no need to check the return value of vcpu_start as it can only have + * an error for protvirt, but protvirt means user cpu state + */ if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) { kvm_s390_vcpu_start(vcpu); } else if (is_vcpu_stopped(vcpu)) { @@ -4171,18 +4495,27 @@ static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu) kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu); } -void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) +int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) { - int i, online_vcpus, started_vcpus = 0; 
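A little further down in this file, KVM_S390_MEM_OP is split into a mem/SIDA dispatcher so that userspace can read and write the Secure Instruction Data Area of a protected vCPU. As a rough illustration only (vcpu_fd and the 256-byte buffer are assumptions, not part of the patch):

	__u8 buf[256];
	struct kvm_s390_mem_op op = {
		.op = KVM_S390_MEMOP_SIDA_READ,		/* or KVM_S390_MEMOP_SIDA_WRITE */
		.buf = (__u64)(unsigned long)buf,
		.size = sizeof(buf),			/* flags must be 0, size non-zero */
		.sida_offset = 0,
	};

	if (ioctl(vcpu_fd, KVM_S390_MEM_OP, &op))
		perror("KVM_S390_MEM_OP");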
+ int i, online_vcpus, r = 0, started_vcpus = 0; if (!is_vcpu_stopped(vcpu)) - return; + return 0; trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 1); /* Only one cpu at a time may enter/leave the STOPPED state. */ spin_lock(&vcpu->kvm->arch.start_stop_lock); online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); + /* Let's tell the UV that we want to change into the operating state */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_OPR); + if (r) { + spin_unlock(&vcpu->kvm->arch.start_stop_lock); + return r; + } + } + for (i = 0; i < online_vcpus; i++) { if (!is_vcpu_stopped(vcpu->kvm->vcpus[i])) started_vcpus++; @@ -4202,27 +4535,43 @@ void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu) kvm_s390_clear_cpuflags(vcpu, CPUSTAT_STOPPED); /* + * The real PSW might have changed due to a RESTART interpreted by the + * ultravisor. We block all interrupts and let the next sie exit + * refresh our view. + */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) + vcpu->arch.sie_block->gpsw.mask &= ~PSW_INT_MASK; + /* * Another VCPU might have used IBS while we were offline. * Let's play safe and flush the VCPU at startup. */ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); spin_unlock(&vcpu->kvm->arch.start_stop_lock); - return; + return 0; } -void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) +int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) { - int i, online_vcpus, started_vcpus = 0; + int i, online_vcpus, r = 0, started_vcpus = 0; struct kvm_vcpu *started_vcpu = NULL; if (is_vcpu_stopped(vcpu)) - return; + return 0; trace_kvm_s390_vcpu_start_stop(vcpu->vcpu_id, 0); /* Only one cpu at a time may enter/leave the STOPPED state. */ spin_lock(&vcpu->kvm->arch.start_stop_lock); online_vcpus = atomic_read(&vcpu->kvm->online_vcpus); + /* Let's tell the UV that we want to change into the stopped state */ + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = kvm_s390_pv_set_cpu_state(vcpu, PV_CPU_STATE_STP); + if (r) { + spin_unlock(&vcpu->kvm->arch.start_stop_lock); + return r; + } + } + /* SIGP STOP and SIGP STOP AND STORE STATUS has been fully processed */ kvm_s390_clear_stop_irq(vcpu); @@ -4245,7 +4594,7 @@ void kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu) } spin_unlock(&vcpu->kvm->arch.start_stop_lock); - return; + return 0; } static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, @@ -4272,12 +4621,40 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return r; } +static long kvm_s390_guest_sida_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) +{ + void __user *uaddr = (void __user *)mop->buf; + int r = 0; + + if (mop->flags || !mop->size) + return -EINVAL; + if (mop->size + mop->sida_offset < mop->size) + return -EINVAL; + if (mop->size + mop->sida_offset > sida_size(vcpu->arch.sie_block)) + return -E2BIG; + + switch (mop->op) { + case KVM_S390_MEMOP_SIDA_READ: + if (copy_to_user(uaddr, (void *)(sida_origin(vcpu->arch.sie_block) + + mop->sida_offset), mop->size)) + r = -EFAULT; + + break; + case KVM_S390_MEMOP_SIDA_WRITE: + if (copy_from_user((void *)(sida_origin(vcpu->arch.sie_block) + + mop->sida_offset), uaddr, mop->size)) + r = -EFAULT; + break; + } + return r; +} static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, struct kvm_s390_mem_op *mop) { void __user *uaddr = (void __user *)mop->buf; void *tmpbuf = NULL; - int r, srcu_idx; + int r = 0; const u64 supported_flags = KVM_S390_MEMOP_F_INJECT_EXCEPTION | KVM_S390_MEMOP_F_CHECK_ONLY; @@ -4287,14 +4664,15 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, if (mop->size > MEM_OP_MAX_SIZE) return 
-E2BIG; + if (kvm_s390_pv_cpu_is_protected(vcpu)) + return -EINVAL; + if (!(mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY)) { tmpbuf = vmalloc(mop->size); if (!tmpbuf) return -ENOMEM; } - srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - switch (mop->op) { case KVM_S390_MEMOP_LOGICAL_READ: if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) { @@ -4320,12 +4698,8 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, } r = write_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size); break; - default: - r = -EINVAL; } - srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); - if (r > 0 && (mop->flags & KVM_S390_MEMOP_F_INJECT_EXCEPTION) != 0) kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm); @@ -4333,6 +4707,31 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu, return r; } +static long kvm_s390_guest_memsida_op(struct kvm_vcpu *vcpu, + struct kvm_s390_mem_op *mop) +{ + int r, srcu_idx; + + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + + switch (mop->op) { + case KVM_S390_MEMOP_LOGICAL_READ: + case KVM_S390_MEMOP_LOGICAL_WRITE: + r = kvm_s390_guest_mem_op(vcpu, mop); + break; + case KVM_S390_MEMOP_SIDA_READ: + case KVM_S390_MEMOP_SIDA_WRITE: + /* we are locked against sida going away by the vcpu->mutex */ + r = kvm_s390_guest_sida_op(vcpu, mop); + break; + default: + r = -EINVAL; + } + + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + return r; +} + long kvm_arch_vcpu_async_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -4368,6 +4767,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int idx; long r; + u16 rc, rrc; vcpu_load(vcpu); @@ -4389,18 +4789,40 @@ long kvm_arch_vcpu_ioctl(struct file *filp, case KVM_S390_CLEAR_RESET: r = 0; kvm_arch_vcpu_ioctl_clear_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET_CLEAR, &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET CLEAR VCPU: rc %x rrc %x", + rc, rrc); + } break; case KVM_S390_INITIAL_RESET: r = 0; kvm_arch_vcpu_ioctl_initial_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET_INITIAL, + &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET INITIAL VCPU: rc %x rrc %x", + rc, rrc); + } break; case KVM_S390_NORMAL_RESET: r = 0; kvm_arch_vcpu_ioctl_normal_reset(vcpu); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + r = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_CPU_RESET, &rc, &rrc); + VCPU_EVENT(vcpu, 3, "PROTVIRT RESET NORMAL VCPU: rc %x rrc %x", + rc, rrc); + } break; case KVM_SET_ONE_REG: case KVM_GET_ONE_REG: { struct kvm_one_reg reg; + r = -EINVAL; + if (kvm_s390_pv_cpu_is_protected(vcpu)) + break; r = -EFAULT; if (copy_from_user(®, argp, sizeof(reg))) break; @@ -4463,7 +4885,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_s390_mem_op mem_op; if (copy_from_user(&mem_op, argp, sizeof(mem_op)) == 0) - r = kvm_s390_guest_mem_op(vcpu, &mem_op); + r = kvm_s390_guest_memsida_op(vcpu, &mem_op); else r = -EFAULT; break; @@ -4523,12 +4945,6 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } -int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, - unsigned long npages) -{ - return 0; -} - /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, @@ -4549,12 +4965,15 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if (mem->guest_phys_addr + mem->memory_size > kvm->arch.mem_limit) return 
-EINVAL; + /* When we are protected, we should not change the memory slots */ + if (kvm_s390_pv_get_handle(kvm)) + return -EINVAL; return 0; } void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old, + struct kvm_memory_slot *old, const struct kvm_memory_slot *new, enum kvm_mr_change change) { @@ -4570,7 +4989,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, old->npages * PAGE_SIZE); if (rc) break; - /* FALLTHROUGH */ + fallthrough; case KVM_MR_CREATE: rc = gmap_map_segment(kvm->arch.gmap, mem->userspace_addr, mem->guest_phys_addr, mem->memory_size); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 6d9448dbd052..79dcd647b378 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -2,7 +2,7 @@ /* * definition for kvm on s390 * - * Copyright IBM Corp. 2008, 2009 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -15,6 +15,7 @@ #include <linux/hrtimer.h> #include <linux/kvm.h> #include <linux/kvm_host.h> +#include <linux/lockdep.h> #include <asm/facility.h> #include <asm/processor.h> #include <asm/sclp.h> @@ -25,6 +26,17 @@ #define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1)) extern debug_info_t *kvm_s390_dbf; +extern debug_info_t *kvm_s390_dbf_uv; + +#define KVM_UV_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ +do { \ + debug_sprintf_event((d_kvm)->arch.dbf, d_loglevel, d_string "\n", \ + d_args); \ + debug_sprintf_event(kvm_s390_dbf_uv, d_loglevel, \ + "%d: " d_string "\n", (d_kvm)->userspace_pid, \ + d_args); \ +} while (0) + #define KVM_EVENT(d_loglevel, d_string, d_args...)\ do { \ debug_sprintf_event(kvm_s390_dbf, d_loglevel, d_string "\n", \ @@ -196,6 +208,39 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm) return kvm->arch.user_cpu_state_ctrl != 0; } +/* implemented in pv.c */ +int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc); +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, + u16 *rrc); +int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, + unsigned long tweak, u16 *rc, u16 *rrc); +int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state); + +static inline u64 kvm_s390_pv_get_handle(struct kvm *kvm) +{ + return kvm->arch.pv.handle; +} + +static inline u64 kvm_s390_pv_cpu_get_handle(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pv.handle; +} + +static inline bool kvm_s390_pv_is_protected(struct kvm *kvm) +{ + lockdep_assert_held(&kvm->lock); + return !!kvm_s390_pv_get_handle(kvm); +} + +static inline bool kvm_s390_pv_cpu_is_protected(struct kvm_vcpu *vcpu) +{ + lockdep_assert_held(&vcpu->mutex); + return !!kvm_s390_pv_cpu_get_handle(vcpu); +} + /* implemented in interrupt.c */ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu); @@ -286,8 +331,8 @@ void kvm_s390_set_tod_clock(struct kvm *kvm, long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr); -void kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); -void kvm_s390_vcpu_stop(struct kvm_vcpu 
*vcpu); +int kvm_s390_vcpu_start(struct kvm_vcpu *vcpu); +int kvm_s390_vcpu_stop(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_block(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unblock(struct kvm_vcpu *vcpu); bool kvm_s390_vcpu_sie_inhibited(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index ed52ffa8d5d4..69a824f9ef0b 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -2,7 +2,7 @@ /* * handling privileged instructions * - * Copyright IBM Corp. 2008, 2018 + * Copyright IBM Corp. 2008, 2020 * * Author(s): Carsten Otte <cotte@de.ibm.com> * Christian Borntraeger <borntraeger@de.ibm.com> @@ -872,7 +872,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) operand2 = kvm_s390_get_base_disp_s(vcpu, &ar); - if (operand2 & 0xfff) + if (!kvm_s390_pv_cpu_is_protected(vcpu) && (operand2 & 0xfff)) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); switch (fc) { @@ -893,8 +893,13 @@ static int handle_stsi(struct kvm_vcpu *vcpu) handle_stsi_3_2_2(vcpu, (void *) mem); break; } - - rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE); + if (kvm_s390_pv_cpu_is_protected(vcpu)) { + memcpy((void *)sida_origin(vcpu->arch.sie_block), (void *)mem, + PAGE_SIZE); + rc = 0; + } else { + rc = write_guest(vcpu, operand2, ar, (void *)mem, PAGE_SIZE); + } if (rc) { rc = kvm_s390_inject_prog_cond(vcpu, rc); goto out; diff --git a/arch/s390/kvm/pv.c b/arch/s390/kvm/pv.c new file mode 100644 index 000000000000..63e330109b63 --- /dev/null +++ b/arch/s390/kvm/pv.c @@ -0,0 +1,303 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Hosting Protected Virtual Machines + * + * Copyright IBM Corp. 2019, 2020 + * Author(s): Janosch Frank <frankja@linux.ibm.com> + */ +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/pagemap.h> +#include <linux/sched/signal.h> +#include <asm/pgalloc.h> +#include <asm/gmap.h> +#include <asm/uv.h> +#include <asm/mman.h> +#include "kvm-s390.h" + +int kvm_s390_pv_destroy_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) +{ + int cc = 0; + + if (kvm_s390_pv_cpu_get_handle(vcpu)) { + cc = uv_cmd_nodata(kvm_s390_pv_cpu_get_handle(vcpu), + UVC_CMD_DESTROY_SEC_CPU, rc, rrc); + + KVM_UV_EVENT(vcpu->kvm, 3, + "PROTVIRT DESTROY VCPU %d: rc %x rrc %x", + vcpu->vcpu_id, *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy cpu failed rc %x rrc %x", + *rc, *rrc); + } + /* Intended memory leak for something that should never happen. */ + if (!cc) + free_pages(vcpu->arch.pv.stor_base, + get_order(uv_info.guest_cpu_stor_len)); + + free_page(sida_origin(vcpu->arch.sie_block)); + vcpu->arch.sie_block->pv_handle_cpu = 0; + vcpu->arch.sie_block->pv_handle_config = 0; + memset(&vcpu->arch.pv, 0, sizeof(vcpu->arch.pv)); + vcpu->arch.sie_block->sdf = 0; + /* + * The sidad field (for sdf == 2) is now the gbea field (for sdf == 0). + * Use the reset value of gbea to avoid leaking the kernel pointer of + * the just freed sida. + */ + vcpu->arch.sie_block->gbea = 1; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + + return cc ? 
EIO : 0; +} + +int kvm_s390_pv_create_cpu(struct kvm_vcpu *vcpu, u16 *rc, u16 *rrc) +{ + struct uv_cb_csc uvcb = { + .header.cmd = UVC_CMD_CREATE_SEC_CPU, + .header.len = sizeof(uvcb), + }; + int cc; + + if (kvm_s390_pv_cpu_get_handle(vcpu)) + return -EINVAL; + + vcpu->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, + get_order(uv_info.guest_cpu_stor_len)); + if (!vcpu->arch.pv.stor_base) + return -ENOMEM; + + /* Input */ + uvcb.guest_handle = kvm_s390_pv_get_handle(vcpu->kvm); + uvcb.num = vcpu->arch.sie_block->icpua; + uvcb.state_origin = (u64)vcpu->arch.sie_block; + uvcb.stor_origin = (u64)vcpu->arch.pv.stor_base; + + /* Alloc Secure Instruction Data Area Designation */ + vcpu->arch.sie_block->sidad = __get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!vcpu->arch.sie_block->sidad) { + free_pages(vcpu->arch.pv.stor_base, + get_order(uv_info.guest_cpu_stor_len)); + return -ENOMEM; + } + + cc = uv_call(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(vcpu->kvm, 3, + "PROTVIRT CREATE VCPU: cpu %d handle %llx rc %x rrc %x", + vcpu->vcpu_id, uvcb.cpu_handle, uvcb.header.rc, + uvcb.header.rrc); + + if (cc) { + u16 dummy; + + kvm_s390_pv_destroy_cpu(vcpu, &dummy, &dummy); + return -EIO; + } + + /* Output */ + vcpu->arch.pv.handle = uvcb.cpu_handle; + vcpu->arch.sie_block->pv_handle_cpu = uvcb.cpu_handle; + vcpu->arch.sie_block->pv_handle_config = kvm_s390_pv_get_handle(vcpu->kvm); + vcpu->arch.sie_block->sdf = 2; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + return 0; +} + +/* only free resources when the destroy was successful */ +static void kvm_s390_pv_dealloc_vm(struct kvm *kvm) +{ + vfree(kvm->arch.pv.stor_var); + free_pages(kvm->arch.pv.stor_base, + get_order(uv_info.guest_base_stor_len)); + memset(&kvm->arch.pv, 0, sizeof(kvm->arch.pv)); +} + +static int kvm_s390_pv_alloc_vm(struct kvm *kvm) +{ + unsigned long base = uv_info.guest_base_stor_len; + unsigned long virt = uv_info.guest_virt_var_stor_len; + unsigned long npages = 0, vlen = 0; + struct kvm_memory_slot *memslot; + + kvm->arch.pv.stor_var = NULL; + kvm->arch.pv.stor_base = __get_free_pages(GFP_KERNEL, get_order(base)); + if (!kvm->arch.pv.stor_base) + return -ENOMEM; + + /* + * Calculate current guest storage for allocation of the + * variable storage, which is based on the length in MB. 
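To make the sizing a few lines below concrete: HPAGE_SIZE is 1 MiB on s390, so npages * PAGE_SIZE / HPAGE_SIZE is simply the guest size in MiB. For a single 4 GiB memslot and an assumed (made-up) guest_virt_var_stor_len of 512 bytes per MiB, the variable storage works out to about 2 MiB plus the fixed per-VM part reported by the ultravisor:

	/* illustrative numbers only -- the real lengths come from the UV query */
	vlen = ALIGN(512 * ((4UL << 30) / HPAGE_SIZE), PAGE_SIZE)	/* 512 B * 4096 MiB = 2 MiB */
	       + uv_info.guest_virt_base_stor_len;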
+ * + * Slots are sorted by GFN + */ + mutex_lock(&kvm->slots_lock); + memslot = kvm_memslots(kvm)->memslots; + npages = memslot->base_gfn + memslot->npages; + mutex_unlock(&kvm->slots_lock); + + kvm->arch.pv.guest_len = npages * PAGE_SIZE; + + /* Allocate variable storage */ + vlen = ALIGN(virt * ((npages * PAGE_SIZE) / HPAGE_SIZE), PAGE_SIZE); + vlen += uv_info.guest_virt_base_stor_len; + kvm->arch.pv.stor_var = vzalloc(vlen); + if (!kvm->arch.pv.stor_var) + goto out_err; + return 0; + +out_err: + kvm_s390_pv_dealloc_vm(kvm); + return -ENOMEM; +} + +/* this should not fail, but if it does, we must not free the donated memory */ +int kvm_s390_pv_deinit_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + int cc; + + /* make all pages accessible before destroying the guest */ + s390_reset_acc(kvm->mm); + + cc = uv_cmd_nodata(kvm_s390_pv_get_handle(kvm), + UVC_CMD_DESTROY_SEC_CONF, rc, rrc); + WRITE_ONCE(kvm->arch.gmap->guest_handle, 0); + atomic_set(&kvm->mm->context.is_protected, 0); + KVM_UV_EVENT(kvm, 3, "PROTVIRT DESTROY VM: rc %x rrc %x", *rc, *rrc); + WARN_ONCE(cc, "protvirt destroy vm failed rc %x rrc %x", *rc, *rrc); + /* Intended memory leak on "impossible" error */ + if (!cc) + kvm_s390_pv_dealloc_vm(kvm); + return cc ? -EIO : 0; +} + +int kvm_s390_pv_init_vm(struct kvm *kvm, u16 *rc, u16 *rrc) +{ + struct uv_cb_cgc uvcb = { + .header.cmd = UVC_CMD_CREATE_SEC_CONF, + .header.len = sizeof(uvcb) + }; + int cc, ret; + u16 dummy; + + ret = kvm_s390_pv_alloc_vm(kvm); + if (ret) + return ret; + + /* Inputs */ + uvcb.guest_stor_origin = 0; /* MSO is 0 for KVM */ + uvcb.guest_stor_len = kvm->arch.pv.guest_len; + uvcb.guest_asce = kvm->arch.gmap->asce; + uvcb.guest_sca = (unsigned long)kvm->arch.sca; + uvcb.conf_base_stor_origin = (u64)kvm->arch.pv.stor_base; + uvcb.conf_virt_stor_origin = (u64)kvm->arch.pv.stor_var; + + cc = uv_call(0, (u64)&uvcb); + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT CREATE VM: handle %llx len %llx rc %x rrc %x", + uvcb.guest_handle, uvcb.guest_stor_len, *rc, *rrc); + + /* Outputs */ + kvm->arch.pv.handle = uvcb.guest_handle; + + if (cc) { + if (uvcb.header.rc & UVC_RC_NEED_DESTROY) + kvm_s390_pv_deinit_vm(kvm, &dummy, &dummy); + else + kvm_s390_pv_dealloc_vm(kvm); + return -EIO; + } + kvm->arch.gmap->guest_handle = uvcb.guest_handle; + atomic_set(&kvm->mm->context.is_protected, 1); + return 0; +} + +int kvm_s390_pv_set_sec_parms(struct kvm *kvm, void *hdr, u64 length, u16 *rc, + u16 *rrc) +{ + struct uv_cb_ssc uvcb = { + .header.cmd = UVC_CMD_SET_SEC_CONF_PARAMS, + .header.len = sizeof(uvcb), + .sec_header_origin = (u64)hdr, + .sec_header_len = length, + .guest_handle = kvm_s390_pv_get_handle(kvm), + }; + int cc = uv_call(0, (u64)&uvcb); + + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM SET PARMS: rc %x rrc %x", + *rc, *rrc); + return cc ?
-EINVAL : 0; +} + +static int unpack_one(struct kvm *kvm, unsigned long addr, u64 tweak, + u64 offset, u16 *rc, u16 *rrc) +{ + struct uv_cb_unp uvcb = { + .header.cmd = UVC_CMD_UNPACK_IMG, + .header.len = sizeof(uvcb), + .guest_handle = kvm_s390_pv_get_handle(kvm), + .gaddr = addr, + .tweak[0] = tweak, + .tweak[1] = offset, + }; + int ret = gmap_make_secure(kvm->arch.gmap, addr, &uvcb); + + *rc = uvcb.header.rc; + *rrc = uvcb.header.rrc; + + if (ret && ret != -EAGAIN) + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: failed addr %llx with rc %x rrc %x", + uvcb.gaddr, *rc, *rrc); + return ret; +} + +int kvm_s390_pv_unpack(struct kvm *kvm, unsigned long addr, unsigned long size, + unsigned long tweak, u16 *rc, u16 *rrc) +{ + u64 offset = 0; + int ret = 0; + + if (addr & ~PAGE_MASK || !size || size & ~PAGE_MASK) + return -EINVAL; + + KVM_UV_EVENT(kvm, 3, "PROTVIRT VM UNPACK: start addr %lx size %lx", + addr, size); + + while (offset < size) { + ret = unpack_one(kvm, addr, tweak, offset, rc, rrc); + if (ret == -EAGAIN) { + cond_resched(); + if (fatal_signal_pending(current)) + break; + continue; + } + if (ret) + break; + addr += PAGE_SIZE; + offset += PAGE_SIZE; + } + if (!ret) + KVM_UV_EVENT(kvm, 3, "%s", "PROTVIRT VM UNPACK: successful"); + return ret; +} + +int kvm_s390_pv_set_cpu_state(struct kvm_vcpu *vcpu, u8 state) +{ + struct uv_cb_cpu_set_state uvcb = { + .header.cmd = UVC_CMD_CPU_SET_STATE, + .header.len = sizeof(uvcb), + .cpu_handle = kvm_s390_pv_cpu_get_handle(vcpu), + .state = state, + }; + int cc; + + cc = uv_call(0, (u64)&uvcb); + KVM_UV_EVENT(vcpu->kvm, 3, "PROTVIRT SET CPU %d STATE %d rc %x rrc %x", + vcpu->vcpu_id, state, uvcb.header.rc, uvcb.header.rrc); + if (cc) + return -EINVAL; + return 0; +} diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index aeccdb30899a..09bf7f2121ac 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -38,6 +38,7 @@ #include <asm/irq.h> #include <asm/mmu_context.h> #include <asm/facility.h> +#include <asm/uv.h> #include "../kernel/entry.h" #define __FAIL_ADDR_MASK -4096L @@ -812,3 +813,80 @@ out_extint: early_initcall(pfault_irq_init); #endif /* CONFIG_PFAULT */ + +#if IS_ENABLED(CONFIG_PGSTE) +void do_secure_storage_access(struct pt_regs *regs) +{ + unsigned long addr = regs->int_parm_long & __FAIL_ADDR_MASK; + struct vm_area_struct *vma; + struct mm_struct *mm; + struct page *page; + int rc; + + switch (get_fault_type(regs)) { + case USER_FAULT: + mm = current->mm; + down_read(&mm->mmap_sem); + vma = find_vma(mm, addr); + if (!vma) { + up_read(&mm->mmap_sem); + do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); + break; + } + page = follow_page(vma, addr, FOLL_WRITE | FOLL_GET); + if (IS_ERR_OR_NULL(page)) { + up_read(&mm->mmap_sem); + break; + } + if (arch_make_page_accessible(page)) + send_sig(SIGSEGV, current, 0); + put_page(page); + up_read(&mm->mmap_sem); + break; + case KERNEL_FAULT: + page = phys_to_page(addr); + if (unlikely(!try_get_page(page))) + break; + rc = arch_make_page_accessible(page); + put_page(page); + if (rc) + BUG(); + break; + case VDSO_FAULT: + /* fallthrough */ + case GMAP_FAULT: + /* fallthrough */ + default: + do_fault_error(regs, VM_READ | VM_WRITE, VM_FAULT_BADMAP); + WARN_ON_ONCE(1); + } +} +NOKPROBE_SYMBOL(do_secure_storage_access); + +void do_non_secure_storage_access(struct pt_regs *regs) +{ + unsigned long gaddr = regs->int_parm_long & __FAIL_ADDR_MASK; + struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; + + if (get_fault_type(regs) != GMAP_FAULT) { + do_fault_error(regs, VM_READ | 
VM_WRITE, VM_FAULT_BADMAP); + WARN_ON_ONCE(1); + return; + } + + if (gmap_convert_to_secure(gmap, gaddr) == -EINVAL) + send_sig(SIGSEGV, current, 0); +} +NOKPROBE_SYMBOL(do_non_secure_storage_access); + +#else +void do_secure_storage_access(struct pt_regs *regs) +{ + default_trap_handler(regs); +} + +void do_non_secure_storage_access(struct pt_regs *regs) +{ + default_trap_handler(regs); +} +#endif diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index edcdca97e85e..2fbece47ef6f 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -804,7 +804,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, if (*table & _REGION_ENTRY_INVALID) return NULL; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + fallthrough; case _ASCE_TYPE_REGION2: table += (gaddr & _REGION2_INDEX) >> _REGION2_SHIFT; if (level == 3) @@ -812,7 +812,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, if (*table & _REGION_ENTRY_INVALID) return NULL; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + fallthrough; case _ASCE_TYPE_REGION3: table += (gaddr & _REGION3_INDEX) >> _REGION3_SHIFT; if (level == 2) @@ -820,7 +820,7 @@ static inline unsigned long *gmap_table_walk(struct gmap *gmap, if (*table & _REGION_ENTRY_INVALID) return NULL; table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN); - /* Fallthrough */ + fallthrough; case _ASCE_TYPE_SEGMENT: table += (gaddr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT; if (level == 1) @@ -2548,6 +2548,23 @@ int s390_enable_sie(void) } EXPORT_SYMBOL_GPL(s390_enable_sie); +int gmap_mark_unmergeable(void) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int ret; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, + MADV_UNMERGEABLE, &vma->vm_flags); + if (ret) + return ret; + } + mm->def_flags &= ~VM_MERGEABLE; + return 0; +} +EXPORT_SYMBOL_GPL(gmap_mark_unmergeable); + /* * Enable storage key handling from now on and initialize the storage * keys with the default key. 
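gmap_mark_unmergeable(), added in this file, is essentially the in-kernel way of opting the whole guest address space out of KSM before memory is handed to the ultravisor; it additionally clears VM_MERGEABLE from mm->def_flags so later mappings stay unmergeable. The per-mapping userspace equivalent would look roughly like this (ram_ptr/ram_size are placeholders for one guest RAM mapping; madvise(2) comes from <sys/mman.h>):

	if (madvise(ram_ptr, ram_size, MADV_UNMERGEABLE))
		perror("madvise(MADV_UNMERGEABLE)");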
@@ -2593,7 +2610,6 @@ static const struct mm_walk_ops enable_skey_walk_ops = { int s390_enable_skey(void) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; int rc = 0; down_write(&mm->mmap_sem); @@ -2601,16 +2617,11 @@ int s390_enable_skey(void) goto out_up; mm->context.uses_skeys = 1; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (ksm_madvise(vma, vma->vm_start, vma->vm_end, - MADV_UNMERGEABLE, &vma->vm_flags)) { - mm->context.uses_skeys = 0; - rc = -ENOMEM; - goto out_up; - } + rc = gmap_mark_unmergeable(); + if (rc) { + mm->context.uses_skeys = 0; + goto out_up; } - mm->def_flags &= ~VM_MERGEABLE; - walk_page_range(mm, 0, TASK_SIZE, &enable_skey_walk_ops, NULL); out_up: @@ -2640,3 +2651,38 @@ void s390_reset_cmma(struct mm_struct *mm) up_write(&mm->mmap_sem); } EXPORT_SYMBOL_GPL(s390_reset_cmma); + +/* + * make inaccessible pages accessible again + */ +static int __s390_reset_acc(pte_t *ptep, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) + WARN_ON_ONCE(uv_convert_from_secure(pte_val(pte) & PAGE_MASK)); + return 0; +} + +static const struct mm_walk_ops reset_acc_walk_ops = { + .pte_entry = __s390_reset_acc, +}; + +#include <linux/sched/mm.h> +void s390_reset_acc(struct mm_struct *mm) +{ + /* + * we might be called during + * reset: we walk the pages and clear + * close of all kvm file descriptors: we walk the pages and clear + * exit of process on fd closure: vma already gone, do nothing + */ + if (!mmget_not_zero(mm)) + return; + down_read(&mm->mmap_sem); + walk_page_range(mm, 0, TASK_SIZE, &reset_acc_walk_ops, NULL); + up_read(&mm->mmap_sem); + mmput(mm); +} +EXPORT_SYMBOL_GPL(s390_reset_acc); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 98959e8cd448..42a2d0d3984a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -49,13 +49,16 @@ #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS +#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ + KVM_DIRTY_LOG_INITIALLY_SET) + /* x86-specific vcpu->requests bit members */ #define KVM_REQ_MIGRATE_TIMER KVM_ARCH_REQ(0) #define KVM_REQ_REPORT_TPR_ACCESS KVM_ARCH_REQ(1) #define KVM_REQ_TRIPLE_FAULT KVM_ARCH_REQ(2) #define KVM_REQ_MMU_SYNC KVM_ARCH_REQ(3) #define KVM_REQ_CLOCK_UPDATE KVM_ARCH_REQ(4) -#define KVM_REQ_LOAD_CR3 KVM_ARCH_REQ(5) +#define KVM_REQ_LOAD_MMU_PGD KVM_ARCH_REQ(5) #define KVM_REQ_EVENT KVM_ARCH_REQ(6) #define KVM_REQ_APF_HALT KVM_ARCH_REQ(7) #define KVM_REQ_STEAL_UPDATE KVM_ARCH_REQ(8) @@ -182,7 +185,10 @@ enum exit_fastpath_completion { EXIT_FASTPATH_SKIP_EMUL_INS, }; -#include <asm/kvm_emulate.h> +struct x86_emulate_ctxt; +struct x86_exception; +enum x86_intercept; +enum x86_intercept_stage; #define KVM_NR_MEM_OBJS 40 @@ -297,7 +303,6 @@ union kvm_mmu_extended_role { unsigned int cr4_pke:1; unsigned int cr4_smap:1; unsigned int cr4_smep:1; - unsigned int cr4_la57:1; unsigned int maxphyaddr:6; }; }; @@ -382,8 +387,7 @@ struct kvm_mmu_root_info { * current mmu mode. 
*/ struct kvm_mmu { - void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); - unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); + unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu); u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 err, bool prefault); @@ -678,7 +682,7 @@ struct kvm_vcpu_arch { /* emulate context */ - struct x86_emulate_ctxt emulate_ctxt; + struct x86_emulate_ctxt *emulate_ctxt; bool emulate_regs_need_sync_to_vcpu; bool emulate_regs_need_sync_from_vcpu; int (*complete_userspace_io)(struct kvm_vcpu *vcpu); @@ -808,10 +812,6 @@ struct kvm_vcpu_arch { int pending_ioapic_eoi; int pending_external_vector; - /* GPA available */ - bool gpa_available; - gpa_t gpa_val; - /* be preempted when it's in kernel-mode(cpl=0) */ bool preempted_in_kernel; @@ -890,6 +890,7 @@ enum kvm_irqchip_mode { #define APICV_INHIBIT_REASON_NESTED 2 #define APICV_INHIBIT_REASON_IRQWIN 3 #define APICV_INHIBIT_REASON_PIT_REINJ 4 +#define APICV_INHIBIT_REASON_X2APIC 5 struct kvm_arch { unsigned long n_used_mmu_pages; @@ -920,6 +921,7 @@ struct kvm_arch { atomic_t vapics_in_nmi_mode; struct mutex apic_map_lock; struct kvm_apic_map *apic_map; + bool apic_map_dirty; bool apic_access_page_done; unsigned long apicv_inhibit_reasons; @@ -1052,19 +1054,14 @@ static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) } struct kvm_x86_ops { - int (*cpu_has_kvm_support)(void); /* __init */ - int (*disabled_by_bios)(void); /* __init */ int (*hardware_enable)(void); void (*hardware_disable)(void); - int (*check_processor_compatibility)(void);/* __init */ - int (*hardware_setup)(void); /* __init */ - void (*hardware_unsetup)(void); /* __exit */ + void (*hardware_unsetup)(void); bool (*cpu_has_accelerated_tpr)(void); bool (*has_emulated_msr)(int index); void (*cpuid_update)(struct kvm_vcpu *vcpu); - struct kvm *(*vm_alloc)(void); - void (*vm_free)(struct kvm *); + unsigned int vm_size; int (*vm_init)(struct kvm *kvm); void (*vm_destroy)(struct kvm *kvm); @@ -1090,7 +1087,6 @@ struct kvm_x86_ops { void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); - void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); @@ -1153,13 +1149,8 @@ struct kvm_x86_ops { int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); int (*get_tdp_level)(struct kvm_vcpu *vcpu); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); - int (*get_lpage_level)(void); - bool (*rdtscp_supported)(void); - bool (*invpcid_supported)(void); - - void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); - void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); + void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long cr3); bool (*has_wbinvd_exit)(void); @@ -1171,16 +1162,12 @@ struct kvm_x86_ops { int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, - enum x86_intercept_stage stage); + enum x86_intercept_stage stage, + struct x86_exception *exception); void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion *exit_fastpath); - bool (*mpx_supported)(void); - bool (*xsaves_supported)(void); - bool (*umip_emulated)(void); - bool (*pt_supported)(void); - bool (*pku_supported)(void); - int 
(*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); + int (*check_nested_events)(struct kvm_vcpu *vcpu); void (*request_immediate_exit)(struct kvm_vcpu *vcpu); void (*sched_in)(struct kvm_vcpu *kvm, int cpu); @@ -1269,6 +1256,15 @@ struct kvm_x86_ops { int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); }; +struct kvm_x86_init_ops { + int (*cpu_has_kvm_support)(void); + int (*disabled_by_bios)(void); + int (*check_processor_compatibility)(void); + int (*hardware_setup)(void); + + struct kvm_x86_ops *runtime_ops; +}; + struct kvm_arch_async_pf { u32 token; gfn_t gfn; @@ -1276,25 +1272,24 @@ struct kvm_arch_async_pf { bool direct_map; }; -extern struct kvm_x86_ops *kvm_x86_ops; +extern u64 __read_mostly host_efer; + +extern struct kvm_x86_ops kvm_x86_ops; extern struct kmem_cache *x86_fpu_cache; #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { - return kvm_x86_ops->vm_alloc(); -} - -static inline void kvm_arch_free_vm(struct kvm *kvm) -{ - return kvm_x86_ops->vm_free(kvm); + return __vmalloc(kvm_x86_ops.vm_size, + GFP_KERNEL_ACCOUNT | __GFP_ZERO, PAGE_KERNEL); } +void kvm_arch_free_vm(struct kvm *kvm); #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm) { - if (kvm_x86_ops->tlb_remote_flush && - !kvm_x86_ops->tlb_remote_flush(kvm)) + if (kvm_x86_ops.tlb_remote_flush && + !kvm_x86_ops.tlb_remote_flush(kvm)) return 0; else return -ENOTSUPP; @@ -1313,7 +1308,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot); + struct kvm_memory_slot *memslot, + int start_level); void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, @@ -1379,10 +1375,11 @@ extern u64 kvm_mce_cap_supported; * * EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to * decode the instruction length. For use *only* by - * kvm_x86_ops->skip_emulated_instruction() implementations. + * kvm_x86_ops.skip_emulated_instruction() implementations. * - * EMULTYPE_ALLOW_RETRY - Set when the emulator should resume the guest to - * retry native execution under certain conditions. + * EMULTYPE_ALLOW_RETRY_PF - Set when the emulator should resume the guest to + * retry native execution under certain conditions. + * Can only be set in conjunction with EMULTYPE_PF. * * EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was * triggered by KVM's magic "force emulation" prefix, @@ -1395,13 +1392,18 @@ extern u64 kvm_mce_cap_supported; * backdoor emulation, which is opt in via module param. * VMware backdoor emulation handles select instructions * and reinjects the #GP for all other cases. + * + * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which + * case the CR2/GPA value passed on the stack is valid.
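A standalone sketch of the pairing rule described above: EMULTYPE_ALLOW_RETRY_PF is only meaningful together with EMULTYPE_PF. The flag values are copied from this hunk; emultype_is_valid() is a hypothetical illustration, not a KVM helper.

#include <stdio.h>

#define EMULTYPE_ALLOW_RETRY_PF (1 << 3)
#define EMULTYPE_PF             (1 << 6)

/* Hypothetical checker: ALLOW_RETRY_PF may only appear together with PF. */
static int emultype_is_valid(int emulation_type)
{
        return !(emulation_type & EMULTYPE_ALLOW_RETRY_PF) ||
               (emulation_type & EMULTYPE_PF);
}

int main(void)
{
        printf("%d\n", emultype_is_valid(EMULTYPE_PF | EMULTYPE_ALLOW_RETRY_PF)); /* 1 */
        printf("%d\n", emultype_is_valid(EMULTYPE_ALLOW_RETRY_PF));               /* 0 */
        return 0;
}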
*/ #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) -#define EMULTYPE_ALLOW_RETRY (1 << 3) +#define EMULTYPE_ALLOW_RETRY_PF (1 << 3) #define EMULTYPE_TRAP_UD_FORCED (1 << 4) #define EMULTYPE_VMWARE_GP (1 << 5) +#define EMULTYPE_PF (1 << 6) + int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, void *insn, int insn_len); @@ -1414,8 +1416,6 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu); -struct x86_emulate_ctxt; - int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in); int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); @@ -1512,8 +1512,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); -void kvm_enable_tdp(void); -void kvm_disable_tdp(void); +void kvm_configure_mmu(bool enable_tdp, int tdp_page_level); static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) @@ -1670,14 +1669,14 @@ static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { - if (kvm_x86_ops->vcpu_blocking) - kvm_x86_ops->vcpu_blocking(vcpu); + if (kvm_x86_ops.vcpu_blocking) + kvm_x86_ops.vcpu_blocking(vcpu); } static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { - if (kvm_x86_ops->vcpu_unblocking) - kvm_x86_ops->vcpu_unblocking(vcpu); + if (kvm_x86_ops.vcpu_unblocking) + kvm_x86_ops.vcpu_unblocking(vcpu); } static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index 172f9749dbb2..87bd6025d91d 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -49,8 +49,7 @@ struct kvm_page_track_notifier_node { void kvm_page_track_init(struct kvm *kvm); void kvm_page_track_cleanup(struct kvm *kvm); -void kvm_page_track_free_memslot(struct kvm_memory_slot *free, - struct kvm_memory_slot *dont); +void kvm_page_track_free_memslot(struct kvm_memory_slot *slot); int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 8521af3fef27..5e090d1f03f8 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -500,6 +500,18 @@ enum vmcs_field { VMX_EPT_EXECUTABLE_MASK) #define VMX_EPT_MT_MASK (7ull << VMX_EPT_MT_EPTE_SHIFT) +static inline u8 vmx_eptp_page_walk_level(u64 eptp) +{ + u64 encoded_level = eptp & VMX_EPTP_PWL_MASK; + + if (encoded_level == VMX_EPTP_PWL_5) + return 5; + + /* @eptp must be pre-validated by the caller. 
*/ + WARN_ON_ONCE(encoded_level != VMX_EPTP_PWL_4); + return 4; +} + /* The mask to use to trigger an EPT Misconfiguration in order to track MMIO */ #define VMX_EPT_MISCONFIG_WX_VALUE (VMX_EPT_WRITABLE_MASK | \ VMX_EPT_EXECUTABLE_MASK) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b1c469446b07..901cd1fdecd9 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -24,6 +24,13 @@ #include "trace.h" #include "pmu.h" +/* + * Unlike "struct cpuinfo_x86.x86_capability", kvm_cpu_caps doesn't need to be + * aligned to sizeof(unsigned long) because it's not accessed via bitops. + */ +u32 kvm_cpu_caps[NCAPINTS] __read_mostly; +EXPORT_SYMBOL_GPL(kvm_cpu_caps); + static u32 xstate_required_size(u64 xstate_bv, bool compacted) { int feature_bit = 0; @@ -45,23 +52,6 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) return ret; } -bool kvm_mpx_supported(void) -{ - return ((host_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) - && kvm_x86_ops->mpx_supported()); -} -EXPORT_SYMBOL_GPL(kvm_mpx_supported); - -u64 kvm_supported_xcr0(void) -{ - u64 xcr0 = KVM_SUPPORTED_XCR0 & host_xcr0; - - if (!kvm_mpx_supported()) - xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); - - return xcr0; -} - #define F feature_bit int kvm_update_cpuid(struct kvm_vcpu *vcpu) @@ -74,32 +64,24 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) return 0; /* Update OSXSAVE bit */ - if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) { - best->ecx &= ~F(OSXSAVE); - if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) - best->ecx |= F(OSXSAVE); - } + if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) + cpuid_entry_change(best, X86_FEATURE_OSXSAVE, + kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)); - best->edx &= ~F(APIC); - if (vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE) - best->edx |= F(APIC); + cpuid_entry_change(best, X86_FEATURE_APIC, + vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); if (apic) { - if (best->ecx & F(TSC_DEADLINE_TIMER)) + if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; else apic->lapic_timer.timer_mode_mask = 1 << 17; } best = kvm_find_cpuid_entry(vcpu, 7, 0); - if (best) { - /* Update OSPKE bit */ - if (boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) { - best->ecx &= ~F(OSPKE); - if (kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) - best->ecx |= F(OSPKE); - } - } + if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) + cpuid_entry_change(best, X86_FEATURE_OSPKE, + kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); best = kvm_find_cpuid_entry(vcpu, 0xD, 0); if (!best) { @@ -107,14 +89,14 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; } else { vcpu->arch.guest_supported_xcr0 = - (best->eax | ((u64)best->edx << 32)) & - kvm_supported_xcr0(); + (best->eax | ((u64)best->edx << 32)) & supported_xcr0; vcpu->arch.guest_xstate_size = best->ebx = xstate_required_size(vcpu->arch.xcr0, false); } best = kvm_find_cpuid_entry(vcpu, 0xD, 1); - if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) + if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || + cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); /* @@ -136,12 +118,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) { best = kvm_find_cpuid_entry(vcpu, 0x1, 0); - if (best) { - if (vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT) - best->ecx |= F(MWAIT); - else 
- best->ecx &= ~F(MWAIT); - } + if (best) + cpuid_entry_change(best, X86_FEATURE_MWAIT, + vcpu->arch.ia32_misc_enable_msr & + MSR_IA32_MISC_ENABLE_MWAIT); } /* Update physical-address width */ @@ -154,10 +134,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) static int is_efer_nx(void) { - unsigned long long efer = 0; - - rdmsrl_safe(MSR_EFER, &efer); - return efer & EFER_NX; + return host_efer & EFER_NX; } static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) @@ -173,8 +150,8 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu) break; } } - if (entry && (entry->edx & F(NX)) && !is_efer_nx()) { - entry->edx &= ~F(NX); + if (entry && cpuid_entry_has(entry, X86_FEATURE_NX) && !is_efer_nx()) { + cpuid_entry_clear(entry, X86_FEATURE_NX); printk(KERN_INFO "kvm: guest NX capability removed\n"); } } @@ -232,7 +209,7 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_nent = cpuid->nent; cpuid_fix_nx_cap(vcpu); kvm_apic_set_version(vcpu); - kvm_x86_ops->cpuid_update(vcpu); + kvm_x86_ops.cpuid_update(vcpu); r = kvm_update_cpuid(vcpu); out: @@ -255,7 +232,7 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, goto out; vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); - kvm_x86_ops->cpuid_update(vcpu); + kvm_x86_ops.cpuid_update(vcpu); r = kvm_update_cpuid(vcpu); out: return r; @@ -281,15 +258,189 @@ out: return r; } -static __always_inline void cpuid_mask(u32 *word, int wordnum) +static __always_inline void kvm_cpu_cap_mask(enum cpuid_leafs leaf, u32 mask) { - reverse_cpuid_check(wordnum); - *word &= boot_cpu_data.x86_capability[wordnum]; + const struct cpuid_reg cpuid = x86_feature_cpuid(leaf * 32); + struct kvm_cpuid_entry2 entry; + + reverse_cpuid_check(leaf); + kvm_cpu_caps[leaf] &= mask; + + cpuid_count(cpuid.function, cpuid.index, + &entry.eax, &entry.ebx, &entry.ecx, &entry.edx); + + kvm_cpu_caps[leaf] &= *__cpuid_entry_get_reg(&entry, cpuid.reg); +} + +void kvm_set_cpu_caps(void) +{ + unsigned int f_nx = is_efer_nx() ? F(NX) : 0; +#ifdef CONFIG_X86_64 + unsigned int f_gbpages = F(GBPAGES); + unsigned int f_lm = F(LM); +#else + unsigned int f_gbpages = 0; + unsigned int f_lm = 0; +#endif + + BUILD_BUG_ON(sizeof(kvm_cpu_caps) > + sizeof(boot_cpu_data.x86_capability)); + + memcpy(&kvm_cpu_caps, &boot_cpu_data.x86_capability, + sizeof(kvm_cpu_caps)); + + kvm_cpu_cap_mask(CPUID_1_ECX, + /* + * NOTE: MONITOR (and MWAIT) are emulated as NOP, but *not* + * advertised to guests via CPUID! + */ + F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | + 0 /* DS-CPL, VMX, SMX, EST */ | + 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | + F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | + F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | + F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | + 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | + F(F16C) | F(RDRAND) + ); + /* KVM emulates x2apic in software irrespective of host support. 
*/ + kvm_cpu_cap_set(X86_FEATURE_X2APIC); + + kvm_cpu_cap_mask(CPUID_1_EDX, + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) | + 0 /* Reserved, DS, ACPI */ | F(MMX) | + F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | + 0 /* HTT, TM, Reserved, PBE */ + ); + + kvm_cpu_cap_mask(CPUID_7_0_EBX, + F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | + F(BMI2) | F(ERMS) | 0 /*INVPCID*/ | F(RTM) | 0 /*MPX*/ | F(RDSEED) | + F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | + F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | + F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | 0 /*INTEL_PT*/ + ); + + kvm_cpu_cap_mask(CPUID_7_ECX, + F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) | + F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | + F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/ + ); + /* Set LA57 based on hardware capability. */ + if (cpuid_ecx(7) & F(LA57)) + kvm_cpu_cap_set(X86_FEATURE_LA57); + + kvm_cpu_cap_mask(CPUID_7_EDX, + F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | + F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | + F(MD_CLEAR) | F(AVX512_VP2INTERSECT) | F(FSRM) + ); + + /* TSC_ADJUST and ARCH_CAPABILITIES are emulated in software. */ + kvm_cpu_cap_set(X86_FEATURE_TSC_ADJUST); + kvm_cpu_cap_set(X86_FEATURE_ARCH_CAPABILITIES); + + if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS)) + kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL); + if (boot_cpu_has(X86_FEATURE_STIBP)) + kvm_cpu_cap_set(X86_FEATURE_INTEL_STIBP); + if (boot_cpu_has(X86_FEATURE_AMD_SSBD)) + kvm_cpu_cap_set(X86_FEATURE_SPEC_CTRL_SSBD); + + kvm_cpu_cap_mask(CPUID_7_1_EAX, + F(AVX512_BF16) + ); + + kvm_cpu_cap_mask(CPUID_D_1_EAX, + F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | F(XSAVES) + ); + + kvm_cpu_cap_mask(CPUID_8000_0001_ECX, + F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | + F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | + F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | + F(TOPOEXT) | F(PERFCTR_CORE) + ); + + kvm_cpu_cap_mask(CPUID_8000_0001_EDX, + F(FPU) | F(VME) | F(DE) | F(PSE) | + F(TSC) | F(MSR) | F(PAE) | F(MCE) | + F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | + F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | + F(PAT) | F(PSE36) | 0 /* Reserved */ | + f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | + F(FXSR) | F(FXSR_OPT) | f_gbpages | F(RDTSCP) | + 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW) + ); + + if (!tdp_enabled && IS_ENABLED(CONFIG_X86_64)) + kvm_cpu_cap_set(X86_FEATURE_GBPAGES); + + kvm_cpu_cap_mask(CPUID_8000_0008_EBX, + F(CLZERO) | F(XSAVEERPTR) | + F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | + F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON) + ); + + /* + * AMD has separate bits for each SPEC_CTRL bit. + * arch/x86/kernel/cpu/bugs.c is kind enough to + * record that in cpufeatures so use them. 
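The kvm_cpu_cap_mask() calls above all follow one pattern: a software policy mask ANDed with what the host CPU actually reports. A minimal user-space sketch of that intersection, assuming GCC/Clang's <cpuid.h> and an illustrative FSGSBASE/SMEP/SMAP policy for CPUID.7.0:EBX:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx;
        /* Illustrative policy mask for CPUID.7.0:EBX: FSGSBASE (0), SMEP (7), SMAP (20). */
        unsigned int allowed = (1u << 0) | (1u << 7) | (1u << 20);

        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
                return 1;

        /* Same shape as kvm_cpu_cap_mask(): policy ANDed with host support. */
        printf("advertised CPUID.7.0:EBX = %#x\n", allowed & ebx);
        return 0;
}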
+ */ + if (boot_cpu_has(X86_FEATURE_IBPB)) + kvm_cpu_cap_set(X86_FEATURE_AMD_IBPB); + if (boot_cpu_has(X86_FEATURE_IBRS)) + kvm_cpu_cap_set(X86_FEATURE_AMD_IBRS); + if (boot_cpu_has(X86_FEATURE_STIBP)) + kvm_cpu_cap_set(X86_FEATURE_AMD_STIBP); + if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) + kvm_cpu_cap_set(X86_FEATURE_AMD_SSBD); + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + kvm_cpu_cap_set(X86_FEATURE_AMD_SSB_NO); + /* + * The preference is to use SPEC CTRL MSR instead of the + * VIRT_SPEC MSR. + */ + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) && + !boot_cpu_has(X86_FEATURE_AMD_SSBD)) + kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); + + /* + * Hide all SVM features by default, SVM will set the cap bits for + * features it emulates and/or exposes for L1. + */ + kvm_cpu_cap_mask(CPUID_8000_000A_EDX, 0); + + kvm_cpu_cap_mask(CPUID_C000_0001_EDX, + F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | + F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | + F(PMM) | F(PMM_EN) + ); } +EXPORT_SYMBOL_GPL(kvm_set_cpu_caps); -static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function, - u32 index) +struct kvm_cpuid_array { + struct kvm_cpuid_entry2 *entries; + const int maxnent; + int nent; +}; + +static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, + u32 function, u32 index) { + struct kvm_cpuid_entry2 *entry; + + if (array->nent >= array->maxnent) + return NULL; + + entry = &array->entries[array->nent++]; + entry->function = function; entry->index = index; entry->flags = 0; @@ -298,9 +449,6 @@ static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function, &entry->eax, &entry->ebx, &entry->ecx, &entry->edx); switch (function) { - case 2: - entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC; - break; case 4: case 7: case 0xb: @@ -316,11 +464,18 @@ static void do_host_cpuid(struct kvm_cpuid_entry2 *entry, u32 function, entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; break; } + + return entry; } -static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, - u32 func, int *nent, int maxnent) +static int __do_cpuid_func_emulated(struct kvm_cpuid_array *array, u32 func) { + struct kvm_cpuid_entry2 *entry; + + if (array->nent >= array->maxnent) + return -E2BIG; + + entry = &array->entries[array->nent]; entry->function = func; entry->index = 0; entry->flags = 0; @@ -328,17 +483,17 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, switch (func) { case 0: entry->eax = 7; - ++*nent; + ++array->nent; break; case 1: entry->ecx = F(MOVBE); - ++*nent; + ++array->nent; break; case 7: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; entry->eax = 0; entry->ecx = F(RDPID); - ++*nent; + ++array->nent; default: break; } @@ -346,223 +501,60 @@ static int __do_cpuid_func_emulated(struct kvm_cpuid_entry2 *entry, return 0; } -static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) +static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) { - unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; - unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; - unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; - unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0; - unsigned f_la57; - unsigned f_pku = kvm_x86_ops->pku_supported() ? 
F(PKU) : 0; - - /* cpuid 7.0.ebx */ - const u32 kvm_cpuid_7_0_ebx_x86_features = - F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | - F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | - F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | - F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; - - /* cpuid 7.0.ecx*/ - const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) | - F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | - F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/; - - /* cpuid 7.0.edx*/ - const u32 kvm_cpuid_7_0_edx_x86_features = - F(AVX512_4VNNIW) | F(AVX512_4FMAPS) | F(SPEC_CTRL) | - F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | - F(MD_CLEAR); - - /* cpuid 7.1.eax */ - const u32 kvm_cpuid_7_1_eax_x86_features = - F(AVX512_BF16); - - switch (index) { - case 0: - entry->eax = min(entry->eax, 1u); - entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; - cpuid_mask(&entry->ebx, CPUID_7_0_EBX); - /* TSC_ADJUST is emulated */ - entry->ebx |= F(TSC_ADJUST); - - entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; - f_la57 = entry->ecx & F(LA57); - cpuid_mask(&entry->ecx, CPUID_7_ECX); - /* Set LA57 based on hardware capability. */ - entry->ecx |= f_la57; - entry->ecx |= f_umip; - entry->ecx |= f_pku; - /* PKU is not yet implemented for shadow paging. */ - if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) - entry->ecx &= ~F(PKU); - - entry->edx &= kvm_cpuid_7_0_edx_x86_features; - cpuid_mask(&entry->edx, CPUID_7_EDX); - if (boot_cpu_has(X86_FEATURE_IBPB) && boot_cpu_has(X86_FEATURE_IBRS)) - entry->edx |= F(SPEC_CTRL); - if (boot_cpu_has(X86_FEATURE_STIBP)) - entry->edx |= F(INTEL_STIBP); - if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) - entry->edx |= F(SPEC_CTRL_SSBD); - /* - * We emulate ARCH_CAPABILITIES in software even - * if the host doesn't support it. - */ - entry->edx |= F(ARCH_CAPABILITIES); - break; - case 1: - entry->eax &= kvm_cpuid_7_1_eax_x86_features; - entry->ebx = 0; - entry->ecx = 0; - entry->edx = 0; - break; - default: - WARN_ON_ONCE(1); - entry->eax = 0; - entry->ebx = 0; - entry->ecx = 0; - entry->edx = 0; - break; - } -} - -static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, - int *nent, int maxnent) -{ - int r; - unsigned f_nx = is_efer_nx() ? F(NX) : 0; -#ifdef CONFIG_X86_64 - unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) - ? F(GBPAGES) : 0; - unsigned f_lm = F(LM); -#else - unsigned f_gbpages = 0; - unsigned f_lm = 0; -#endif - unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0; - unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; - unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? 
F(INTEL_PT) : 0; - - /* cpuid 1.edx */ - const u32 kvm_cpuid_1_edx_x86_features = - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLUSH) | - 0 /* Reserved, DS, ACPI */ | F(MMX) | - F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) | - 0 /* HTT, TM, Reserved, PBE */; - /* cpuid 0x80000001.edx */ - const u32 kvm_cpuid_8000_0001_edx_x86_features = - F(FPU) | F(VME) | F(DE) | F(PSE) | - F(TSC) | F(MSR) | F(PAE) | F(MCE) | - F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) | - F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | - F(PAT) | F(PSE36) | 0 /* Reserved */ | - f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | - F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp | - 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); - /* cpuid 1.ecx */ - const u32 kvm_cpuid_1_ecx_x86_features = - /* NOTE: MONITOR (and MWAIT) are emulated as NOP, - * but *not* advertised to guests via CPUID ! */ - F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | - 0 /* DS-CPL, VMX, SMX, EST */ | - 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | - F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | - F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | - F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | - F(F16C) | F(RDRAND); - /* cpuid 0x80000001.ecx */ - const u32 kvm_cpuid_8000_0001_ecx_x86_features = - F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | - F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | - 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | - F(TOPOEXT) | F(PERFCTR_CORE); - - /* cpuid 0x80000008.ebx */ - const u32 kvm_cpuid_8000_0008_ebx_x86_features = - F(CLZERO) | F(XSAVEERPTR) | - F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) | - F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON); - - /* cpuid 0xC0000001.edx */ - const u32 kvm_cpuid_C000_0001_edx_x86_features = - F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | - F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | - F(PMM) | F(PMM_EN); - - /* cpuid 0xD.1.eax */ - const u32 kvm_cpuid_D_1_eax_x86_features = - F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves; + struct kvm_cpuid_entry2 *entry; + int r, i, max_idx; /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); r = -E2BIG; - if (WARN_ON(*nent >= maxnent)) + entry = do_host_cpuid(array, function, 0); + if (!entry) goto out; - do_host_cpuid(entry, function, 0); - ++*nent; - switch (function) { case 0: /* Limited to the highest leaf implemented in KVM. */ entry->eax = min(entry->eax, 0x1fU); break; case 1: - entry->edx &= kvm_cpuid_1_edx_x86_features; - cpuid_mask(&entry->edx, CPUID_1_EDX); - entry->ecx &= kvm_cpuid_1_ecx_x86_features; - cpuid_mask(&entry->ecx, CPUID_1_ECX); - /* we support x2apic emulation even if host does not support - * it since we emulate x2apic in software */ - entry->ecx |= F(X2APIC); + cpuid_entry_override(entry, CPUID_1_EDX); + cpuid_entry_override(entry, CPUID_1_ECX); break; - /* function 2 entries are STATEFUL. That is, repeated cpuid commands - * may return different values. 
This forces us to get_cpu() before - * issuing the first command, and also to emulate this annoying behavior - * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */ - case 2: { - int t, times = entry->eax & 0xff; - - entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - for (t = 1; t < times; ++t) { - if (*nent >= maxnent) - goto out; - - do_host_cpuid(&entry[t], function, 0); - ++*nent; - } + case 2: + /* + * On ancient CPUs, function 2 entries are STATEFUL. That is, + * CPUID(function=2, index=0) may return different results each + * time, with the least-significant byte in EAX enumerating the + * number of times software should do CPUID(2, 0). + * + * Modern CPUs, i.e. every CPU KVM has *ever* run on are less + * idiotic. Intel's SDM states that EAX & 0xff "will always + * return 01H. Software should ignore this value and not + * interpret it as an informational descriptor", while AMD's + * APM states that CPUID(2) is reserved. + * + * WARN if a frankenstein CPU that supports virtualization and + * a stateful CPUID.0x2 is encountered. + */ + WARN_ON_ONCE((entry->eax & 0xff) > 1); break; - } /* functions 4 and 0x8000001d have additional index. */ case 4: - case 0x8000001d: { - int i, cache_type; - - /* read more entries until cache_type is zero */ - for (i = 1; ; ++i) { - if (*nent >= maxnent) + case 0x8000001d: + /* + * Read entries until the cache type in the previous entry is + * zero, i.e. indicates an invalid entry. + */ + for (i = 1; entry->eax & 0x1f; ++i) { + entry = do_host_cpuid(array, function, i); + if (!entry) goto out; - - cache_type = entry[i - 1].eax & 0x1f; - if (!cache_type) - break; - do_host_cpuid(&entry[i], function, i); - ++*nent; } break; - } case 6: /* Thermal management */ entry->eax = 0x4; /* allow ARAT */ entry->ebx = 0; @@ -570,22 +562,24 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, entry->edx = 0; break; /* function 7 has additional index. */ - case 7: { - int i; - - for (i = 0; ; ) { - do_cpuid_7_mask(&entry[i], i); - if (i == entry->eax) - break; - if (*nent >= maxnent) + case 7: + entry->eax = min(entry->eax, 1u); + cpuid_entry_override(entry, CPUID_7_0_EBX); + cpuid_entry_override(entry, CPUID_7_ECX); + cpuid_entry_override(entry, CPUID_7_EDX); + + /* KVM only supports 0x7.0 and 0x7.1, capped above via min(). */ + if (entry->eax == 1) { + entry = do_host_cpuid(array, function, 1); + if (!entry) goto out; - ++i; - do_host_cpuid(&entry[i], function, i); - ++*nent; + cpuid_entry_override(entry, CPUID_7_1_EAX); + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; } break; - } case 9: break; case 0xa: { /* Architectural Performance Monitoring */ @@ -622,79 +616,81 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, * thus they can be handled by common code. */ case 0x1f: - case 0xb: { - int i; - + case 0xb: /* - * We filled in entry[0] for CPUID(EAX=<function>, - * ECX=00H) above. If its level type (ECX[15:8]) is - * zero, then the leaf is unimplemented, and we're - * done. Otherwise, continue to populate entries - * until the level type (ECX[15:8]) of the previously - * added entry is zero. + * Populate entries until the level type (ECX[15:8]) of the + * previous entry is zero. Note, CPUID EAX.{0x1f,0xb}.0 is + * the starting entry, filled by the primary do_host_cpuid(). 
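The termination rule described above (stop once a sub-leaf reports level type 0 in ECX[15:8]) can be tried from user space; a minimal sketch assuming GCC/Clang's <cpuid.h>:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx, i;

        for (i = 0; ; i++) {
                if (!__get_cpuid_count(0xb, i, &eax, &ebx, &ecx, &edx))
                        break;
                printf("sub-leaf %u: level type %u, logical CPUs %u\n",
                       i, (ecx >> 8) & 0xff, ebx & 0xffff);
                if (!(ecx & 0xff00))    /* level type 0: list is exhausted */
                        break;
        }
        return 0;
}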
*/ - for (i = 1; entry[i - 1].ecx & 0xff00; ++i) { - if (*nent >= maxnent) + for (i = 1; entry->ecx & 0xff00; ++i) { + entry = do_host_cpuid(array, function, i); + if (!entry) goto out; - - do_host_cpuid(&entry[i], function, i); - ++*nent; } break; - } - case 0xd: { - int idx, i; - u64 supported = kvm_supported_xcr0(); - - entry->eax &= supported; - entry->ebx = xstate_required_size(supported, false); + case 0xd: + entry->eax &= supported_xcr0; + entry->ebx = xstate_required_size(supported_xcr0, false); entry->ecx = entry->ebx; - entry->edx &= supported >> 32; - if (!supported) + entry->edx &= supported_xcr0 >> 32; + if (!supported_xcr0) break; - for (idx = 1, i = 1; idx < 64; ++idx) { - u64 mask = ((u64)1 << idx); - if (*nent >= maxnent) + entry = do_host_cpuid(array, function, 1); + if (!entry) + goto out; + + cpuid_entry_override(entry, CPUID_D_1_EAX); + if (entry->eax & (F(XSAVES)|F(XSAVEC))) + entry->ebx = xstate_required_size(supported_xcr0 | supported_xss, + true); + else { + WARN_ON_ONCE(supported_xss != 0); + entry->ebx = 0; + } + entry->ecx &= supported_xss; + entry->edx &= supported_xss >> 32; + + for (i = 2; i < 64; ++i) { + bool s_state; + if (supported_xcr0 & BIT_ULL(i)) + s_state = false; + else if (supported_xss & BIT_ULL(i)) + s_state = true; + else + continue; + + entry = do_host_cpuid(array, function, i); + if (!entry) goto out; - do_host_cpuid(&entry[i], function, idx); - if (idx == 1) { - entry[i].eax &= kvm_cpuid_D_1_eax_x86_features; - cpuid_mask(&entry[i].eax, CPUID_D_1_EAX); - entry[i].ebx = 0; - if (entry[i].eax & (F(XSAVES)|F(XSAVEC))) - entry[i].ebx = - xstate_required_size(supported, - true); - } else { - if (entry[i].eax == 0 || !(supported & mask)) - continue; - if (WARN_ON_ONCE(entry[i].ecx & 1)) - continue; + /* + * The supported check above should have filtered out + * invalid sub-leafs. Only valid sub-leafs should + * reach this point, and they should have a non-zero + * save state size. Furthermore, check whether the + * processor agrees with supported_xcr0/supported_xss + * on whether this is an XCR0- or IA32_XSS-managed area. 
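The XCR0-versus-IA32_XSS classification mentioned above comes straight from CPUID.0xD.n:ECX[0]; a minimal user-space sketch, again assuming GCC/Clang's <cpuid.h>:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax, ebx, ecx, edx, i;

        for (i = 2; i < 64; i++) {
                if (!__get_cpuid_count(0xd, i, &eax, &ebx, &ecx, &edx))
                        break;
                if (!eax)       /* zero size: component not supported */
                        continue;
                /* ECX bit 0 is set for IA32_XSS (supervisor) components. */
                printf("xstate component %2u: %4u bytes, %s-managed\n",
                       i, eax, (ecx & 1) ? "XSS" : "XCR0");
        }
        return 0;
}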
+ */ + if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) { + --array->nent; + continue; } - entry[i].ecx = 0; - entry[i].edx = 0; - ++*nent; - ++i; + entry->edx = 0; } break; - } /* Intel PT */ - case 0x14: { - int t, times = entry->eax; - - if (!f_intel_pt) + case 0x14: + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; + } - for (t = 1; t <= times; ++t) { - if (*nent >= maxnent) + for (i = 1, max_idx = entry->eax; i <= max_idx; ++i) { + if (!do_host_cpuid(array, function, i)) goto out; - do_host_cpuid(&entry[t], function, t); - ++*nent; } break; - } case KVM_CPUID_SIGNATURE: { static const char signature[12] = "KVMKVMKVM\0\0"; const u32 *sigptr = (const u32 *)signature; @@ -729,10 +725,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, 0x8000001f); break; case 0x80000001: - entry->edx &= kvm_cpuid_8000_0001_edx_x86_features; - cpuid_mask(&entry->edx, CPUID_8000_0001_EDX); - entry->ecx &= kvm_cpuid_8000_0001_ecx_x86_features; - cpuid_mask(&entry->ecx, CPUID_8000_0001_ECX); + cpuid_entry_override(entry, CPUID_8000_0001_EDX); + cpuid_entry_override(entry, CPUID_8000_0001_ECX); break; case 0x80000007: /* Advanced power management */ /* invariant TSC is CPUID.80000007H:EDX[8] */ @@ -750,33 +744,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; - entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; - cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); - /* - * AMD has separate bits for each SPEC_CTRL bit. - * arch/x86/kernel/cpu/bugs.c is kind enough to - * record that in cpufeatures so use them. - */ - if (boot_cpu_has(X86_FEATURE_IBPB)) - entry->ebx |= F(AMD_IBPB); - if (boot_cpu_has(X86_FEATURE_IBRS)) - entry->ebx |= F(AMD_IBRS); - if (boot_cpu_has(X86_FEATURE_STIBP)) - entry->ebx |= F(AMD_STIBP); - if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) - entry->ebx |= F(AMD_SSBD); - if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - entry->ebx |= F(AMD_SSB_NO); - /* - * The preference is to use SPEC CTRL MSR instead of the - * VIRT_SPEC MSR. 
- */ - if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) && - !boot_cpu_has(X86_FEATURE_AMD_SSBD)) - entry->ebx |= F(VIRT_SSBD); + cpuid_entry_override(entry, CPUID_8000_0008_EBX); break; } + case 0x8000000A: + if (!kvm_cpu_cap_has(X86_FEATURE_SVM)) { + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; + break; + } + entry->eax = 1; /* SVM revision 1 */ + entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper + ASID emulation to nested SVM */ + entry->ecx = 0; /* Reserved */ + cpuid_entry_override(entry, CPUID_8000_000A_EDX); + break; case 0x80000019: entry->ecx = entry->edx = 0; break; @@ -794,8 +775,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = min(entry->eax, 0xC0000004); break; case 0xC0000001: - entry->edx &= kvm_cpuid_C000_0001_edx_x86_features; - cpuid_mask(&entry->edx, CPUID_C000_0001_EDX); + cpuid_entry_override(entry, CPUID_C000_0001_EDX); break; case 3: /* Processor serial number */ case 5: /* MONITOR/MWAIT */ @@ -807,8 +787,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function, break; } - kvm_x86_ops->set_supported_cpuid(function, entry); - r = 0; out: @@ -817,26 +795,39 @@ out: return r; } -static int do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 func, - int *nent, int maxnent, unsigned int type) +static int do_cpuid_func(struct kvm_cpuid_array *array, u32 func, + unsigned int type) { - if (*nent >= maxnent) - return -E2BIG; - if (type == KVM_GET_EMULATED_CPUID) - return __do_cpuid_func_emulated(entry, func, nent, maxnent); + return __do_cpuid_func_emulated(array, func); - return __do_cpuid_func(entry, func, nent, maxnent); + return __do_cpuid_func(array, func); } -struct kvm_cpuid_param { - u32 func; - bool (*qualifier)(const struct kvm_cpuid_param *param); -}; +#define CENTAUR_CPUID_SIGNATURE 0xC0000000 -static bool is_centaur_cpu(const struct kvm_cpuid_param *param) +static int get_cpuid_func(struct kvm_cpuid_array *array, u32 func, + unsigned int type) { - return boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR; + u32 limit; + int r; + + if (func == CENTAUR_CPUID_SIGNATURE && + boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) + return 0; + + r = do_cpuid_func(array, func, type); + if (r) + return r; + + limit = array->entries[array->nent - 1].eax; + for (func = func + 1; func <= limit; ++func) { + r = do_cpuid_func(array, func, type); + if (r) + break; + } + + return r; } static bool sanity_check_entries(struct kvm_cpuid_entry2 __user *entries, @@ -870,157 +861,145 @@ int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type) { - struct kvm_cpuid_entry2 *cpuid_entries; - int limit, nent = 0, r = -E2BIG, i; - u32 func; - static const struct kvm_cpuid_param param[] = { - { .func = 0 }, - { .func = 0x80000000 }, - { .func = 0xC0000000, .qualifier = is_centaur_cpu }, - { .func = KVM_CPUID_SIGNATURE }, + static const u32 funcs[] = { + 0, 0x80000000, CENTAUR_CPUID_SIGNATURE, KVM_CPUID_SIGNATURE, + }; + + struct kvm_cpuid_array array = { + .nent = 0, + .maxnent = cpuid->nent, }; + int r, i; if (cpuid->nent < 1) - goto out; + return -E2BIG; if (cpuid->nent > KVM_MAX_CPUID_ENTRIES) cpuid->nent = KVM_MAX_CPUID_ENTRIES; if (sanity_check_entries(entries, cpuid->nent, type)) return -EINVAL; - r = -ENOMEM; - cpuid_entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2), + array.entries = vzalloc(array_size(sizeof(struct kvm_cpuid_entry2), cpuid->nent)); - if (!cpuid_entries) - goto out; - - r = 0; - for (i = 0; i < ARRAY_SIZE(param); i++) { - 
const struct kvm_cpuid_param *ent = &param[i]; - - if (ent->qualifier && !ent->qualifier(ent)) - continue; - - r = do_cpuid_func(&cpuid_entries[nent], ent->func, - &nent, cpuid->nent, type); - - if (r) - goto out_free; - - limit = cpuid_entries[nent - 1].eax; - for (func = ent->func + 1; func <= limit && nent < cpuid->nent && r == 0; ++func) - r = do_cpuid_func(&cpuid_entries[nent], func, - &nent, cpuid->nent, type); + if (!array.entries) + return -ENOMEM; + for (i = 0; i < ARRAY_SIZE(funcs); i++) { + r = get_cpuid_func(&array, funcs[i], type); if (r) goto out_free; } + cpuid->nent = array.nent; - r = -EFAULT; - if (copy_to_user(entries, cpuid_entries, - nent * sizeof(struct kvm_cpuid_entry2))) - goto out_free; - cpuid->nent = nent; - r = 0; + if (copy_to_user(entries, array.entries, + array.nent * sizeof(struct kvm_cpuid_entry2))) + r = -EFAULT; out_free: - vfree(cpuid_entries); -out: + vfree(array.entries); return r; } -static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) -{ - struct kvm_cpuid_entry2 *e = &vcpu->arch.cpuid_entries[i]; - struct kvm_cpuid_entry2 *ej; - int j = i; - int nent = vcpu->arch.cpuid_nent; - - e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT; - /* when no next entry is found, the current entry[i] is reselected */ - do { - j = (j + 1) % nent; - ej = &vcpu->arch.cpuid_entries[j]; - } while (ej->function != e->function); - - ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT; - - return j; -} - -/* find an entry with matching function, matching index (if needed), and that - * should be read next (if it's stateful) */ -static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e, - u32 function, u32 index) -{ - if (e->function != function) - return 0; - if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index) - return 0; - if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) && - !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT)) - return 0; - return 1; -} - struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index) { + struct kvm_cpuid_entry2 *e; int i; - struct kvm_cpuid_entry2 *best = NULL; for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { - struct kvm_cpuid_entry2 *e; - e = &vcpu->arch.cpuid_entries[i]; - if (is_matching_cpuid_entry(e, function, index)) { - if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) - move_to_next_stateful_cpuid_entry(vcpu, i); - best = e; - break; - } + + if (e->function == function && (e->index == index || + !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX))) + return e; } - return best; + return NULL; } EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); /* - * If the basic or extended CPUID leaf requested is higher than the - * maximum supported basic or extended leaf, respectively, then it is - * out of range. + * Intel CPUID semantics treats any query for an out-of-range leaf as if the + * highest basic leaf (i.e. CPUID.0H:EAX) were requested. AMD CPUID semantics + * returns all zeroes for any undefined leaf, whether or not the leaf is in + * range. Centaur/VIA follows Intel semantics. + * + * A leaf is considered out-of-range if its function is higher than the maximum + * supported leaf of its associated class or if its associated class does not + * exist. + * + * There are three primary classes to be considered, with their respective + * ranges described as "<base> - <top>[,<base2> - <top2>] inclusive. A primary + * class exists if a guest CPUID entry for its <base> leaf exists. For a given + * class, CPUID.<base>.EAX contains the max supported leaf for the class. 
+ * + * - Basic: 0x00000000 - 0x3fffffff, 0x50000000 - 0x7fffffff + * - Hypervisor: 0x40000000 - 0x4fffffff + * - Extended: 0x80000000 - 0xbfffffff + * - Centaur: 0xc0000000 - 0xcfffffff + * + * The Hypervisor class is further subdivided into sub-classes that each act as + * their own independent class associated with a 0x100 byte range. E.g. if Qemu + * is advertising support for both HyperV and KVM, the resulting Hypervisor + * CPUID sub-classes are: + * + * - HyperV: 0x40000000 - 0x400000ff + * - KVM: 0x40000100 - 0x400001ff */ -static bool cpuid_function_in_range(struct kvm_vcpu *vcpu, u32 function) +static struct kvm_cpuid_entry2 * +get_out_of_range_cpuid_entry(struct kvm_vcpu *vcpu, u32 *fn_ptr, u32 index) { - struct kvm_cpuid_entry2 *max; + struct kvm_cpuid_entry2 *basic, *class; + u32 function = *fn_ptr; + + basic = kvm_find_cpuid_entry(vcpu, 0, 0); + if (!basic) + return NULL; + + if (is_guest_vendor_amd(basic->ebx, basic->ecx, basic->edx) || + is_guest_vendor_hygon(basic->ebx, basic->ecx, basic->edx)) + return NULL; - max = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); - return max && function <= max->eax; + if (function >= 0x40000000 && function <= 0x4fffffff) + class = kvm_find_cpuid_entry(vcpu, function & 0xffffff00, 0); + else if (function >= 0xc0000000) + class = kvm_find_cpuid_entry(vcpu, 0xc0000000, 0); + else + class = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + + if (class && function <= class->eax) + return NULL; + + /* + * Leaf specific adjustments are also applied when redirecting to the + * max basic entry, e.g. if the max basic leaf is 0xb but there is no + * entry for CPUID.0xb.index (see below), then the output value for EDX + * needs to be pulled from CPUID.0xb.1. + */ + *fn_ptr = basic->eax; + + /* + * The class does not exist or the requested function is out of range; + * the effective CPUID entry is the max basic leaf. Note, the index of + * the original requested leaf is observed! + */ + return kvm_find_cpuid_entry(vcpu, basic->eax, index); } bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, - u32 *ecx, u32 *edx, bool check_limit) + u32 *ecx, u32 *edx, bool exact_only) { - u32 function = *eax, index = *ecx; + u32 orig_function = *eax, function = *eax, index = *ecx; struct kvm_cpuid_entry2 *entry; - struct kvm_cpuid_entry2 *max; - bool found; + bool exact, used_max_basic = false; entry = kvm_find_cpuid_entry(vcpu, function, index); - found = entry; - /* - * Intel CPUID semantics treats any query for an out-of-range - * leaf as if the highest basic leaf (i.e. CPUID.0H:EAX) were - * requested. AMD CPUID semantics returns all zeroes for any - * undefined leaf, whether or not the leaf is in range. 
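For reference, the class selection performed by get_out_of_range_cpuid_entry() above reduces to picking a base leaf per range; a standalone sketch with made-up sample leaves (not part of the patch):

#include <stdio.h>

/* Mirrors the class lookup order used by get_out_of_range_cpuid_entry(). */
static unsigned int cpuid_class_base(unsigned int function)
{
        if (function >= 0x40000000 && function <= 0x4fffffff)
                return function & 0xffffff00;   /* hypervisor sub-class */
        if (function >= 0xc0000000)
                return 0xc0000000;              /* Centaur */
        return function & 0x80000000;           /* basic or extended */
}

int main(void)
{
        unsigned int samples[] = { 0x7, 0x40000083, 0x40000130, 0x8000001d, 0xc0000001 };
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("%#010x -> class base %#010x\n",
                       samples[i], cpuid_class_base(samples[i]));
        return 0;
}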
- */ - if (!entry && check_limit && !guest_cpuid_is_amd(vcpu) && - !cpuid_function_in_range(vcpu, function)) { - max = kvm_find_cpuid_entry(vcpu, 0, 0); - if (max) { - function = max->eax; - entry = kvm_find_cpuid_entry(vcpu, function, index); - } + exact = !!entry; + + if (!entry && !exact_only) { + entry = get_out_of_range_cpuid_entry(vcpu, &function, index); + used_max_basic = !!entry; } + if (entry) { *eax = entry->eax; *ebx = entry->ebx; @@ -1049,8 +1028,9 @@ bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, } } } - trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, found); - return found; + trace_kvm_cpuid(orig_function, index, *eax, *ebx, *ecx, *edx, exact, + used_max_basic); + return exact; } EXPORT_SYMBOL_GPL(kvm_cpuid); @@ -1063,7 +1043,7 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) eax = kvm_rax_read(vcpu); ecx = kvm_rcx_read(vcpu); - kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, true); + kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx, false); kvm_rax_write(vcpu, eax); kvm_rbx_write(vcpu, ebx); kvm_rcx_write(vcpu, ecx); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 7366c618aa04..63a70f6a3df3 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -6,8 +6,10 @@ #include <asm/cpu.h> #include <asm/processor.h> +extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; +void kvm_set_cpu_caps(void); + int kvm_update_cpuid(struct kvm_vcpu *vcpu); -bool kvm_mpx_supported(void); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, @@ -23,7 +25,7 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, - u32 *ecx, u32 *edx, bool check_limit); + u32 *ecx, u32 *edx, bool exact_only); int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); @@ -64,7 +66,7 @@ static const struct cpuid_reg reverse_cpuid[] = { * and can't be used by KVM to query/control guest capabilities. And obviously * the leaf being queried must have an entry in the lookup table. 
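The reverse_cpuid machinery discussed above keys everything off the usual word/bit split of an X86_FEATURE_* number; a trivial standalone sketch with a hypothetical feature number:

#include <stdio.h>

int main(void)
{
        /* Hypothetical feature number: word 7 of x86_capability, bit 20. */
        unsigned int x86_feature = 7 * 32 + 20;

        /* The same arithmetic used by __feature_bit() and x86_feature_cpuid(). */
        printf("leaf (word) %u, mask %#x\n",
               x86_feature / 32, 1u << (x86_feature % 32));
        return 0;
}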
*/ -static __always_inline void reverse_cpuid_check(unsigned x86_leaf) +static __always_inline void reverse_cpuid_check(unsigned int x86_leaf) { BUILD_BUG_ON(x86_leaf == CPUID_LNX_1); BUILD_BUG_ON(x86_leaf == CPUID_LNX_2); @@ -88,24 +90,18 @@ static __always_inline u32 __feature_bit(int x86_feature) #define feature_bit(name) __feature_bit(X86_FEATURE_##name) -static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) +static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned int x86_feature) { - unsigned x86_leaf = x86_feature / 32; + unsigned int x86_leaf = x86_feature / 32; reverse_cpuid_check(x86_leaf); return reverse_cpuid[x86_leaf]; } -static __always_inline int *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsigned x86_feature) +static __always_inline u32 *__cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, + u32 reg) { - struct kvm_cpuid_entry2 *entry; - const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); - - entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index); - if (!entry) - return NULL; - - switch (cpuid.reg) { + switch (reg) { case CPUID_EAX: return &entry->eax; case CPUID_EBX: @@ -120,9 +116,86 @@ static __always_inline int *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsi } } -static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_feature) +static __always_inline u32 *cpuid_entry_get_reg(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); + + return __cpuid_entry_get_reg(entry, cpuid.reg); +} + +static __always_inline u32 cpuid_entry_get(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + u32 *reg = cpuid_entry_get_reg(entry, x86_feature); + + return *reg & __feature_bit(x86_feature); +} + +static __always_inline bool cpuid_entry_has(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + return cpuid_entry_get(entry, x86_feature); +} + +static __always_inline void cpuid_entry_clear(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + u32 *reg = cpuid_entry_get_reg(entry, x86_feature); + + *reg &= ~__feature_bit(x86_feature); +} + +static __always_inline void cpuid_entry_set(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature) +{ + u32 *reg = cpuid_entry_get_reg(entry, x86_feature); + + *reg |= __feature_bit(x86_feature); +} + +static __always_inline void cpuid_entry_change(struct kvm_cpuid_entry2 *entry, + unsigned int x86_feature, + bool set) +{ + u32 *reg = cpuid_entry_get_reg(entry, x86_feature); + + /* + * Open coded instead of using cpuid_entry_{clear,set}() to coerce the + * compiler into using CMOV instead of Jcc when possible. 
+ */ + if (set) + *reg |= __feature_bit(x86_feature); + else + *reg &= ~__feature_bit(x86_feature); +} + +static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry, + enum cpuid_leafs leaf) +{ + u32 *reg = cpuid_entry_get_reg(entry, leaf * 32); + + BUILD_BUG_ON(leaf >= ARRAY_SIZE(kvm_cpu_caps)); + *reg = kvm_cpu_caps[leaf]; +} + +static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, + unsigned int x86_feature) { - int *reg; + const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); + struct kvm_cpuid_entry2 *entry; + + entry = kvm_find_cpuid_entry(vcpu, cpuid.function, cpuid.index); + if (!entry) + return NULL; + + return __cpuid_entry_get_reg(entry, cpuid.reg); +} + +static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, + unsigned int x86_feature) +{ + u32 *reg; reg = guest_cpuid_get_register(vcpu, x86_feature); if (!reg) @@ -131,21 +204,24 @@ static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_ return *reg & __feature_bit(x86_feature); } -static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned x86_feature) +static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, + unsigned int x86_feature) { - int *reg; + u32 *reg; reg = guest_cpuid_get_register(vcpu, x86_feature); if (reg) *reg &= ~__feature_bit(x86_feature); } -static inline bool guest_cpuid_is_amd(struct kvm_vcpu *vcpu) +static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry(vcpu, 0, 0); - return best && best->ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx; + return best && + (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) || + is_guest_vendor_hygon(best->ebx, best->ecx, best->edx)); } static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) @@ -192,4 +268,39 @@ static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu) MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; } +static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) +{ + unsigned int x86_leaf = x86_feature / 32; + + reverse_cpuid_check(x86_leaf); + kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); +} + +static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) +{ + unsigned int x86_leaf = x86_feature / 32; + + reverse_cpuid_check(x86_leaf); + kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); +} + +static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature) +{ + unsigned int x86_leaf = x86_feature / 32; + + reverse_cpuid_check(x86_leaf); + return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature); +} + +static __always_inline bool kvm_cpu_cap_has(unsigned int x86_feature) +{ + return !!kvm_cpu_cap_get(x86_feature); +} + +static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature) +{ + if (boot_cpu_has(x86_feature)) + kvm_cpu_cap_set(x86_feature); +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index bc00642e5d3b..bddaba9c68dd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -20,7 +20,7 @@ #include <linux/kvm_host.h> #include "kvm_cache_regs.h" -#include <asm/kvm_emulate.h> +#include "kvm_emulate.h" #include <linux/stringify.h> #include <asm/fpu/api.h> #include <asm/debugreg.h> @@ -665,6 +665,17 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg); } +static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt) +{ + return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 
57 : 48; +} + +static inline bool emul_is_noncanonical_address(u64 la, + struct x86_emulate_ctxt *ctxt) +{ + return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la; +} + /* * x86 defines three classes of vector instructions: explicitly * aligned, explicitly unaligned, and the rest, which change behaviour @@ -2711,10 +2722,8 @@ static bool vendor_intel(struct x86_emulate_ctxt *ctxt) u32 eax, ebx, ecx, edx; eax = ecx = 0; - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); - return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx - && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx - && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true); + return is_guest_vendor_intel(ebx, ecx, edx); } static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) @@ -2731,36 +2740,18 @@ static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) eax = 0x00000000; ecx = 0x00000000; - ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); + ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true); /* - * Intel ("GenuineIntel") - * remark: Intel CPUs only support "syscall" in 64bit - * longmode. Also an 64bit guest with a - * 32bit compat-app running will #UD !! While this - * behaviour can be fixed (by emulating) into AMD - * response - CPUs of AMD can't behave like Intel. + * remark: Intel CPUs only support "syscall" in 64bit longmode. Also a + * 64bit guest with a 32bit compat-app running will #UD !! While this + * behaviour can be fixed (by emulating) into AMD response - CPUs of + * AMD can't behave like Intel. */ - if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && - ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && - edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) + if (is_guest_vendor_intel(ebx, ecx, edx)) return false; - /* AMD ("AuthenticAMD") */ - if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && - ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && - edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) - return true; - - /* AMD ("AMDisbetter!") */ - if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && - ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && - edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) - return true; - - /* Hygon ("HygonGenuine") */ - if (ebx == X86EMUL_CPUID_VENDOR_HygonGenuine_ebx && - ecx == X86EMUL_CPUID_VENDOR_HygonGenuine_ecx && - edx == X86EMUL_CPUID_VENDOR_HygonGenuine_edx) + if (is_guest_vendor_amd(ebx, ecx, edx) || + is_guest_vendor_hygon(ebx, ecx, edx)) return true; /* @@ -3980,7 +3971,7 @@ static int em_cpuid(struct x86_emulate_ctxt *ctxt) eax = reg_read(ctxt, VCPU_REGS_RAX); ecx = reg_read(ctxt, VCPU_REGS_RCX); - ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, true); + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); *reg_write(ctxt, VCPU_REGS_RAX) = eax; *reg_write(ctxt, VCPU_REGS_RBX) = ebx; *reg_write(ctxt, VCPU_REGS_RCX) = ecx; @@ -4250,7 +4241,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt) eax = 0x80000008; ecx = 0; if (ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, - &edx, false)) + &edx, true)) maxphyaddr = eax & 0xff; else maxphyaddr = 36; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a86fda7a1d03..bcefa9d4e57e 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1022,7 +1022,7 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) return 1; - kvm_x86_ops->patch_hypercall(vcpu, instructions); + kvm_x86_ops.patch_hypercall(vcpu, instructions); ((unsigned char *)instructions)[3] = 0xc3; /* 
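/*
 * Illustrative sketch (not part of the patch): emul_is_noncanonical_address()
 * above treats a linear address as canonical when bits [63:vaddr_bits-1] are
 * a sign extension of bit (vaddr_bits - 1) -- 48 bits normally, 57 when
 * CR4.LA57 is set.  A stand-alone version, assuming arithmetic right shift
 * of signed integers, as the kernel's get_canonical() helper also does.
 */
#include <stdbool.h>
#include <stdint.h>

static inline uint64_t canonicalize(uint64_t la, unsigned int vaddr_bits)
{
        return (uint64_t)((int64_t)(la << (64 - vaddr_bits)) >>
                          (64 - vaddr_bits));
}

static inline bool is_noncanonical(uint64_t la, unsigned int vaddr_bits)
{
        return canonicalize(la, vaddr_bits) != la;
}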
ret */ if (__copy_to_user((void __user *)addr, instructions, 4)) return 1; @@ -1607,7 +1607,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) * hypercall generates UD from non zero cpl and real mode * per HYPER-V spec */ - if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { + if (kvm_x86_ops.get_cpl(vcpu) != 0 || !is_protmode(vcpu)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } @@ -1800,8 +1800,8 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, }; int i, nent = ARRAY_SIZE(cpuid_entries); - if (kvm_x86_ops->nested_get_evmcs_version) - evmcs_ver = kvm_x86_ops->nested_get_evmcs_version(vcpu); + if (kvm_x86_ops.nested_get_evmcs_version) + evmcs_ver = kvm_x86_ops.nested_get_evmcs_version(vcpu); /* Skip NESTED_FEATURES if eVMCS is not supported */ if (!evmcs_ver) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index b24c606ac04b..febca334c320 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -367,7 +367,7 @@ static void pit_load_count(struct kvm_pit *pit, int channel, u32 val) { struct kvm_kpit_state *ps = &pit->pit_state; - pr_debug("load_count val is %d, channel is %d\n", val, channel); + pr_debug("load_count val is %u, channel is %d\n", val, channel); /* * The largest possible initial count is 0; this is equivalent diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 58767020de41..62558b9bdda7 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -68,7 +68,7 @@ static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, int reg) return 0; if (!kvm_register_is_available(vcpu, reg)) - kvm_x86_ops->cache_reg(vcpu, reg); + kvm_x86_ops.cache_reg(vcpu, reg); return vcpu->arch.regs[reg]; } @@ -108,7 +108,7 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) might_sleep(); /* on svm */ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR)) - kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); + kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_PDPTR); return vcpu->arch.walk_mmu->pdptrs[index]; } @@ -117,7 +117,7 @@ static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; if (tmask & vcpu->arch.cr0_guest_owned_bits) - kvm_x86_ops->decache_cr0_guest_bits(vcpu); + kvm_x86_ops.decache_cr0_guest_bits(vcpu); return vcpu->arch.cr0 & mask; } @@ -130,14 +130,14 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; if (tmask & vcpu->arch.cr4_guest_owned_bits) - kvm_x86_ops->decache_cr4_guest_bits(vcpu); + kvm_x86_ops.decache_cr4_guest_bits(vcpu); return vcpu->arch.cr4 & mask; } static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu) { if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) - kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_CR3); + kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR3); return vcpu->arch.cr3; } diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h index c06e8353efd3..43c93ffa76ed 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/kvm/kvm_emulate.h @@ -221,7 +221,7 @@ struct x86_emulate_ops { enum x86_intercept_stage stage); bool (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, - u32 *ecx, u32 *edx, bool check_limit); + u32 *ecx, u32 *edx, bool exact_only); bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt); bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt); @@ -301,6 +301,7 @@ struct fastop; typedef void 
(*fastop_t)(struct fastop *); struct x86_emulate_ctxt { + void *vcpu; const struct x86_emulate_ops *ops; /* Register state before/after emulation. */ @@ -319,6 +320,10 @@ struct x86_emulate_ctxt { bool have_exception; struct x86_exception exception; + /* GPA available */ + bool gpa_available; + gpa_t gpa_val; + /* * decode cache */ @@ -329,9 +334,6 @@ struct x86_emulate_ctxt { u8 intercept; u8 op_bytes; u8 ad_bytes; - struct operand src; - struct operand src2; - struct operand dst; union { int (*execute)(struct x86_emulate_ctxt *ctxt); fastop_t fop; @@ -359,6 +361,11 @@ struct x86_emulate_ctxt { u8 seg_override; u64 d; unsigned long _eip; + + /* Here begins the usercopy section. */ + struct operand src; + struct operand src2; + struct operand dst; struct operand memop; unsigned long _regs[NR_VCPU_REGS]; struct operand *memopp; @@ -388,6 +395,34 @@ struct x86_emulate_ctxt { #define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e #define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69 +#define X86EMUL_CPUID_VENDOR_CentaurHauls_ebx 0x746e6543 +#define X86EMUL_CPUID_VENDOR_CentaurHauls_ecx 0x736c7561 +#define X86EMUL_CPUID_VENDOR_CentaurHauls_edx 0x48727561 + +static inline bool is_guest_vendor_intel(u32 ebx, u32 ecx, u32 edx) +{ + return ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && + ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && + edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; +} + +static inline bool is_guest_vendor_amd(u32 ebx, u32 ecx, u32 edx) +{ + return (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && + ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && + edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) || + (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && + ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && + edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx); +} + +static inline bool is_guest_vendor_hygon(u32 ebx, u32 ecx, u32 edx) +{ + return ebx == X86EMUL_CPUID_VENDOR_HygonGenuine_ebx && + ecx == X86EMUL_CPUID_VENDOR_HygonGenuine_ecx && + edx == X86EMUL_CPUID_VENDOR_HygonGenuine_edx; +} + enum x86_intercept_stage { X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */ X86_ICPT_PRE_EXCEPT, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 7356a56e6282..ca80daf8f878 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -164,14 +164,28 @@ static void kvm_apic_map_free(struct rcu_head *rcu) kvfree(map); } -static void recalculate_apic_map(struct kvm *kvm) +void kvm_recalculate_apic_map(struct kvm *kvm) { struct kvm_apic_map *new, *old = NULL; struct kvm_vcpu *vcpu; int i; u32 max_id = 255; /* enough space for any xAPIC ID */ + if (!kvm->arch.apic_map_dirty) { + /* + * Read kvm->arch.apic_map_dirty before + * kvm->arch.apic_map + */ + smp_rmb(); + return; + } + mutex_lock(&kvm->arch.apic_map_lock); + if (!kvm->arch.apic_map_dirty) { + /* Someone else has updated the map. 
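/*
 * Illustrative sketch (not part of the patch): the X86EMUL_CPUID_VENDOR_*
 * constants compared by is_guest_vendor_*() above are simply the 12-byte
 * vendor string that CPUID leaf 0 returns packed into EBX, EDX, ECX in that
 * order.  On a little-endian host, "GenuineIntel" yields ebx 0x756e6547,
 * edx 0x49656e69, ecx 0x6c65746e -- the values used in the helpers.  A tiny
 * demo program (main() and printf() are only for the demonstration):
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        const char vendor[12] = "GenuineIntel";
        uint32_t ebx, edx, ecx;

        memcpy(&ebx, vendor + 0, 4);    /* "Genu" */
        memcpy(&edx, vendor + 4, 4);    /* "ineI" */
        memcpy(&ecx, vendor + 8, 4);    /* "ntel" */
        printf("ebx=%#x edx=%#x ecx=%#x\n", ebx, edx, ecx);
        return 0;
}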
*/ + mutex_unlock(&kvm->arch.apic_map_lock); + return; + } kvm_for_each_vcpu(i, vcpu, kvm) if (kvm_apic_present(vcpu)) @@ -236,6 +250,12 @@ out: old = rcu_dereference_protected(kvm->arch.apic_map, lockdep_is_held(&kvm->arch.apic_map_lock)); rcu_assign_pointer(kvm->arch.apic_map, new); + /* + * Write kvm->arch.apic_map before + * clearing apic->apic_map_dirty + */ + smp_wmb(); + kvm->arch.apic_map_dirty = false; mutex_unlock(&kvm->arch.apic_map_lock); if (old) @@ -257,20 +277,20 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val) else static_key_slow_inc(&apic_sw_disabled.key); - recalculate_apic_map(apic->vcpu->kvm); + apic->vcpu->kvm->arch.apic_map_dirty = true; } } static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id) { kvm_lapic_set_reg(apic, APIC_ID, id << 24); - recalculate_apic_map(apic->vcpu->kvm); + apic->vcpu->kvm->arch.apic_map_dirty = true; } static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id) { kvm_lapic_set_reg(apic, APIC_LDR, id); - recalculate_apic_map(apic->vcpu->kvm); + apic->vcpu->kvm->arch.apic_map_dirty = true; } static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) @@ -286,7 +306,7 @@ static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id) kvm_lapic_set_reg(apic, APIC_ID, id); kvm_lapic_set_reg(apic, APIC_LDR, ldr); - recalculate_apic_map(apic->vcpu->kvm); + apic->vcpu->kvm->arch.apic_map_dirty = true; } static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) @@ -294,11 +314,6 @@ static inline int apic_lvt_enabled(struct kvm_lapic *apic, int lvt_type) return !(kvm_lapic_get_reg(apic, lvt_type) & APIC_LVT_MASKED); } -static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) -{ - return kvm_lapic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; -} - static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) { return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT; @@ -448,7 +463,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic) if (unlikely(vcpu->arch.apicv_active)) { /* need to update RVI */ kvm_lapic_clear_vector(vec, apic->regs + APIC_IRR); - kvm_x86_ops->hwapic_irr_update(vcpu, + kvm_x86_ops.hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); } else { apic->irr_pending = false; @@ -473,7 +488,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * just set SVI. */ if (unlikely(vcpu->arch.apicv_active)) - kvm_x86_ops->hwapic_isr_update(vcpu, vec); + kvm_x86_ops.hwapic_isr_update(vcpu, vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -521,7 +536,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * and must be left alone. 
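/*
 * Illustrative sketch (not part of the patch): kvm_recalculate_apic_map()
 * above turns map rebuilding into a lazy, double-checked operation -- writers
 * merely set apic_map_dirty, the next caller rebuilds under the mutex,
 * re-checks the flag, and publishes the new map before clearing the flag
 * (the smp_wmb()/smp_rmb() pair).  A user-space analogue using C11
 * acquire/release in place of the explicit barriers; all names here
 * (apic_map, the rebuild callback) are invented for the sketch, and freeing
 * of the old map is omitted.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct apic_map;                        /* opaque for the sketch */

static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic(struct apic_map *) cur_map;
static atomic_bool map_dirty;

static void mark_map_dirty(void)
{
        atomic_store_explicit(&map_dirty, true, memory_order_release);
}

static void recalculate_map(struct apic_map *(*rebuild)(void))
{
        /* Fast path: seeing the flag clear also makes the latest map visible. */
        if (!atomic_load_explicit(&map_dirty, memory_order_acquire))
                return;

        pthread_mutex_lock(&map_lock);
        if (atomic_load_explicit(&map_dirty, memory_order_acquire)) {
                struct apic_map *new_map = rebuild();

                atomic_store_explicit(&cur_map, new_map, memory_order_release);
                /* Publish the map before clearing the flag. */
                atomic_store_explicit(&map_dirty, false, memory_order_release);
        }
        pthread_mutex_unlock(&map_lock);
}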
*/ if (unlikely(vcpu->arch.apicv_active)) - kvm_x86_ops->hwapic_isr_update(vcpu, + kvm_x86_ops.hwapic_isr_update(vcpu, apic_find_highest_isr(apic)); else { --apic->isr_count; @@ -659,7 +674,7 @@ static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) { int highest_irr; if (apic->vcpu->arch.apicv_active) - highest_irr = kvm_x86_ops->sync_pir_to_irr(apic->vcpu); + highest_irr = kvm_x86_ops.sync_pir_to_irr(apic->vcpu); else highest_irr = apic_find_highest_irr(apic); if (highest_irr == -1 || (highest_irr & 0xF0) <= ppr) @@ -1048,7 +1063,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, apic->regs + APIC_TMR); } - if (kvm_x86_ops->deliver_posted_interrupt(vcpu, vector)) { + if (kvm_x86_ops.deliver_posted_interrupt(vcpu, vector)) { kvm_lapic_set_irr(vector, apic); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); @@ -1226,7 +1241,7 @@ void kvm_apic_set_eoi_accelerated(struct kvm_vcpu *vcpu, int vector) } EXPORT_SYMBOL_GPL(kvm_apic_set_eoi_accelerated); -static void apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) +void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high) { struct kvm_lapic_irq irq; @@ -1737,7 +1752,7 @@ static void cancel_hv_timer(struct kvm_lapic *apic) { WARN_ON(preemptible()); WARN_ON(!apic->lapic_timer.hv_timer_in_use); - kvm_x86_ops->cancel_hv_timer(apic->vcpu); + kvm_x86_ops.cancel_hv_timer(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; } @@ -1748,13 +1763,13 @@ static bool start_hv_timer(struct kvm_lapic *apic) bool expired; WARN_ON(preemptible()); - if (!kvm_x86_ops->set_hv_timer) + if (!kvm_x86_ops.set_hv_timer) return false; if (!ktimer->tscdeadline) return false; - if (kvm_x86_ops->set_hv_timer(vcpu, ktimer->tscdeadline, &expired)) + if (kvm_x86_ops.set_hv_timer(vcpu, ktimer->tscdeadline, &expired)) return false; ktimer->hv_timer_in_use = true; @@ -1917,7 +1932,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_DFR: if (!apic_x2apic_mode(apic)) { kvm_lapic_set_reg(apic, APIC_DFR, val | 0x0FFFFFFF); - recalculate_apic_map(apic->vcpu->kvm); + apic->vcpu->kvm->arch.apic_map_dirty = true; } else ret = 1; break; @@ -1946,7 +1961,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_ICR: /* No delay here, so we always clear the pending bit */ val &= ~(1 << 12); - apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2)); + kvm_apic_send_ipi(apic, val, kvm_lapic_get_reg(apic, APIC_ICR2)); kvm_lapic_set_reg(apic, APIC_ICR, val); break; @@ -2023,6 +2038,8 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; } + kvm_recalculate_apic_map(apic->vcpu->kvm); + return ret; } EXPORT_SYMBOL_GPL(kvm_lapic_reg_write); @@ -2171,7 +2188,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) static_key_slow_dec_deferred(&apic_hw_disabled); } else { static_key_slow_inc(&apic_hw_disabled.key); - recalculate_apic_map(vcpu->kvm); + vcpu->kvm->arch.apic_map_dirty = true; } } @@ -2179,7 +2196,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) kvm_apic_set_x2apic_id(apic, vcpu->vcpu_id); if ((old_value ^ value) & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)) - kvm_x86_ops->set_virtual_apic_mode(vcpu); + kvm_x86_ops.set_virtual_apic_mode(vcpu); apic->base_address = apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_BASE; @@ -2212,6 +2229,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) if (!apic) return; + vcpu->kvm->arch.apic_map_dirty = false; /* Stop the timer in case it's a reset to an 
active apic */ hrtimer_cancel(&apic->lapic_timer.timer); @@ -2256,13 +2274,15 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); if (vcpu->arch.apicv_active) { - kvm_x86_ops->apicv_post_state_restore(vcpu); - kvm_x86_ops->hwapic_irr_update(vcpu, -1); - kvm_x86_ops->hwapic_isr_update(vcpu, -1); + kvm_x86_ops.apicv_post_state_restore(vcpu); + kvm_x86_ops.hwapic_irr_update(vcpu, -1); + kvm_x86_ops.hwapic_isr_update(vcpu, -1); } vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; + + kvm_recalculate_apic_map(vcpu->kvm); } /* @@ -2484,17 +2504,18 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) struct kvm_lapic *apic = vcpu->arch.apic; int r; - kvm_lapic_set_base(vcpu, vcpu->arch.apic_base); /* set SPIV separately to get count of SW disabled APICs right */ apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV))); r = kvm_apic_state_fixup(vcpu, s, true); - if (r) + if (r) { + kvm_recalculate_apic_map(vcpu->kvm); return r; + } memcpy(vcpu->arch.apic->regs, s->regs, sizeof(*s)); - recalculate_apic_map(vcpu->kvm); + kvm_recalculate_apic_map(vcpu->kvm); kvm_apic_set_version(vcpu); apic_update_ppr(apic); @@ -2506,10 +2527,10 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) kvm_apic_update_apicv(vcpu); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { - kvm_x86_ops->apicv_post_state_restore(vcpu); - kvm_x86_ops->hwapic_irr_update(vcpu, + kvm_x86_ops.apicv_post_state_restore(vcpu); + kvm_x86_ops.hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); - kvm_x86_ops->hwapic_isr_update(vcpu, + kvm_x86_ops.hwapic_isr_update(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index ec6fbfe325cf..40ed6ed22751 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -78,6 +78,7 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); +void kvm_recalculate_apic_map(struct kvm *kvm); void kvm_apic_set_version(struct kvm_vcpu *vcpu); int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val); int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, @@ -95,6 +96,7 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu); bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map); +void kvm_apic_send_ipi(struct kvm_lapic *apic, u32 icr_low, u32 icr_high); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index a647601c9e1c..8a3b1bce722a 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -95,11 +95,11 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu)); } -static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu) +static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) { if (VALID_PAGE(vcpu->arch.mmu->root_hpa)) - vcpu->arch.mmu->set_cr3(vcpu, vcpu->arch.mmu->root_hpa | - kvm_get_active_pcid(vcpu)); + kvm_x86_ops.load_mmu_pgd(vcpu, vcpu->arch.mmu->root_hpa | + kvm_get_active_pcid(vcpu)); } int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, @@ -170,8 +170,8 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct 
kvm_mmu *mmu, unsigned pte_access, unsigned pte_pkey, unsigned pfec) { - int cpl = kvm_x86_ops->get_cpl(vcpu); - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + int cpl = kvm_x86_ops.get_cpl(vcpu); + unsigned long rflags = kvm_x86_ops.get_rflags(vcpu); /* * If CPL < 3, SMAP prevention are disabled if EFLAGS.AC = 1. diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 87e9ba27ada1..8071952e9cf2 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -19,6 +19,7 @@ #include "mmu.h" #include "x86.h" #include "kvm_cache_regs.h" +#include "kvm_emulate.h" #include "cpuid.h" #include <linux/kvm_host.h> @@ -86,6 +87,8 @@ __MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint"); */ bool tdp_enabled = false; +static int max_page_level __read_mostly; + enum { AUDIT_PRE_PAGE_FAULT, AUDIT_POST_PAGE_FAULT, @@ -215,17 +218,6 @@ struct kvm_shadow_walk_iterator { unsigned index; }; -static const union kvm_mmu_page_role mmu_base_role_mask = { - .cr0_wp = 1, - .gpte_is_8_bytes = 1, - .nxe = 1, - .smep_andnot_wp = 1, - .smap_andnot_wp = 1, - .smm = 1, - .guest_mode = 1, - .ad_disabled = 1, -}; - #define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \ for (shadow_walk_init_using_root(&(_walker), (_vcpu), \ (_root), (_addr)); \ @@ -313,7 +305,7 @@ kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu); static inline bool kvm_available_flush_tlb_with_range(void) { - return kvm_x86_ops->tlb_remote_flush_with_range; + return kvm_x86_ops.tlb_remote_flush_with_range; } static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, @@ -321,8 +313,8 @@ static void kvm_flush_remote_tlbs_with_range(struct kvm *kvm, { int ret = -ENOTSUPP; - if (range && kvm_x86_ops->tlb_remote_flush_with_range) - ret = kvm_x86_ops->tlb_remote_flush_with_range(kvm, range); + if (range && kvm_x86_ops.tlb_remote_flush_with_range) + ret = kvm_x86_ops.tlb_remote_flush_with_range(kvm, range); if (ret) kvm_flush_remote_tlbs(kvm); @@ -1650,7 +1642,7 @@ static bool spte_set_dirty(u64 *sptep) rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep); /* - * Similar to the !kvm_x86_ops->slot_disable_log_dirty case, + * Similar to the !kvm_x86_ops.slot_disable_log_dirty case, * do not bother adding back write access to pages marked * SPTE_AD_WRPROT_ONLY_MASK. 
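/*
 * Illustrative sketch (not part of the patch): the mechanical conversion of
 * "kvm_x86_ops->foo()" to "kvm_x86_ops.foo()" throughout this series replaces
 * a global pointer to the vendor callback table with a by-value struct,
 * presumably filled in once during hardware setup (the copy itself is not
 * visible in these hunks), so each call avoids one pointer load; optional
 * hooks keep their NULL checks, as in the TLB-range helpers above.  A minimal
 * analogue with invented names:
 */
#include <stddef.h>

struct backend_ops {
        void (*flush_all)(void);
        int  (*flush_range)(unsigned long start, unsigned long end); /* optional */
};

static struct backend_ops backend_ops;  /* by value, no extra indirection */

static void backend_init(const struct backend_ops *vendor)
{
        backend_ops = *vendor;          /* one-time copy of the callback table */
}

static void backend_flush(unsigned long start, unsigned long end)
{
        if (backend_ops.flush_range &&
            backend_ops.flush_range(start, end) == 0)
                return;
        backend_ops.flush_all();        /* fall back to a full flush */
}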
*/ @@ -1739,8 +1731,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { - if (kvm_x86_ops->enable_log_dirty_pt_masked) - kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset, + if (kvm_x86_ops.enable_log_dirty_pt_masked) + kvm_x86_ops.enable_log_dirty_pt_masked(kvm, slot, gfn_offset, mask); else kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); @@ -1755,8 +1747,8 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, */ int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu) { - if (kvm_x86_ops->write_log_dirty) - return kvm_x86_ops->write_log_dirty(vcpu); + if (kvm_x86_ops.write_log_dirty) + return kvm_x86_ops.write_log_dirty(vcpu); return 0; } @@ -3044,7 +3036,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (level > PT_PAGE_TABLE_LEVEL) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) - spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, + spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, kvm_is_mmio_pfn(pfn)); if (host_writable) @@ -3292,7 +3284,7 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return PT_PAGE_TABLE_LEVEL; - max_level = min(max_level, kvm_x86_ops->get_lpage_level()); + max_level = min(max_level, max_page_level); for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { linfo = lpage_info_slot(gfn, slot, max_level); if (!linfo->disallow_lpage) @@ -3568,8 +3560,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * write-protected for dirty-logging or access tracking. */ if ((error_code & PFERR_WRITE_MASK) && - spte_can_locklessly_be_made_writable(spte)) - { + spte_can_locklessly_be_made_writable(spte)) { new_spte |= PT_WRITABLE_MASK; /* @@ -3731,7 +3722,9 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->pae_root); } else BUG(); - vcpu->arch.mmu->root_cr3 = vcpu->arch.mmu->get_cr3(vcpu); + + /* root_cr3 is ignored for direct MMUs. */ + vcpu->arch.mmu->root_cr3 = 0; return 0; } @@ -3743,7 +3736,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) gfn_t root_gfn, root_cr3; int i; - root_cr3 = vcpu->arch.mmu->get_cr3(vcpu); + root_cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu); root_gfn = root_cr3 >> PAGE_SHIFT; if (mmu_check_root(vcpu, root_gfn)) @@ -4080,7 +4073,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id; arch.gfn = gfn; arch.direct_map = vcpu->arch.mmu->direct_map; - arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu); + arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu); return kvm_setup_async_pf(vcpu, cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); @@ -4252,6 +4245,14 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->nx = false; } +static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t cr3, + union kvm_mmu_page_role role) +{ + return (role.direct || cr3 == root->cr3) && + VALID_PAGE(root->hpa) && page_header(root->hpa) && + role.word == page_header(root->hpa)->role.word; +} + /* * Find out if a previously cached root matching the new CR3/role is available. * The current root is also inserted into the cache. 
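/*
 * Illustrative sketch (not part of the patch): is_root_usable() above
 * centralizes the test for reusing a cached MMU root -- the root must still
 * be a valid page, must have been built with the same MMU role, and, unless
 * the MMU is direct, must belong to the same guest CR3.  A simplified
 * stand-alone version with invented types; the real code also checks the
 * shadow page header behind the HPA.
 */
#include <stdbool.h>
#include <stdint.h>

struct cached_root {
        uint64_t cr3;
        uint64_t hpa;   /* 0 here stands in for INVALID_PAGE */
        uint32_t role;
};

static bool root_usable(const struct cached_root *root, uint64_t new_cr3,
                        uint32_t new_role, bool direct)
{
        return (direct || root->cr3 == new_cr3) &&
               root->hpa != 0 &&
               root->role == new_role;
}

static int find_usable_prev_root(const struct cached_root *prev, int nr_prev,
                                 uint64_t new_cr3, uint32_t new_role,
                                 bool direct)
{
        for (int i = 0; i < nr_prev; i++)
                if (root_usable(&prev[i], new_cr3, new_role, direct))
                        return i;
        return -1;      /* caller must allocate a fresh root */
}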
@@ -4270,12 +4271,13 @@ static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3, root.cr3 = mmu->root_cr3; root.hpa = mmu->root_hpa; + if (is_root_usable(&root, new_cr3, new_role)) + return true; + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { swap(root, mmu->prev_roots[i]); - if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) && - page_header(root.hpa) != NULL && - new_role.word == page_header(root.hpa)->role.word) + if (is_root_usable(&root, new_cr3, new_role)) break; } @@ -4309,7 +4311,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, * accompanied by KVM_REQ_MMU_RELOAD, which will free * the root set here and allocate a new one. */ - kvm_make_request(KVM_REQ_LOAD_CR3, vcpu); + kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); if (!skip_tlb_flush) { kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); @@ -4508,7 +4510,8 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, cpuid_maxphyaddr(vcpu), context->root_level, context->nx, guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES), - is_pse(vcpu), guest_cpuid_is_amd(vcpu)); + is_pse(vcpu), + guest_cpuid_is_amd_or_hygon(vcpu)); } static void @@ -4874,7 +4877,6 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu) ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); ext.cr4_pse = !!is_pse(vcpu); ext.cr4_pke = !!kvm_read_cr4_bits(vcpu, X86_CR4_PKE); - ext.cr4_la57 = !!kvm_read_cr4_bits(vcpu, X86_CR4_LA57); ext.maxphyaddr = cpuid_maxphyaddr(vcpu); ext.valid = 1; @@ -4907,7 +4909,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); role.base.ad_disabled = (shadow_accessed_mask == 0); - role.base.level = kvm_x86_ops->get_tdp_level(vcpu); + role.base.level = kvm_x86_ops.get_tdp_level(vcpu); role.base.direct = true; role.base.gpte_is_8_bytes = true; @@ -4920,7 +4922,6 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) union kvm_mmu_role new_role = kvm_calc_tdp_mmu_root_page_role(vcpu, false); - new_role.base.word &= mmu_base_role_mask.word; if (new_role.as_u64 == context->mmu_role.as_u64) return; @@ -4929,10 +4930,9 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; context->update_pte = nonpaging_update_pte; - context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu); + context->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu); context->direct_map = true; - context->set_cr3 = kvm_x86_ops->set_tdp_cr3; - context->get_cr3 = get_cr3; + context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; @@ -4992,7 +4992,6 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) union kvm_mmu_role new_role = kvm_calc_shadow_mmu_root_page_role(vcpu, false); - new_role.base.word &= mmu_base_role_mask.word; if (new_role.as_u64 == context->mmu_role.as_u64) return; @@ -5012,14 +5011,14 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); static union kvm_mmu_role kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, - bool execonly) + bool execonly, u8 level) { union kvm_mmu_role role = {0}; /* SMM flag is inherited from root_mmu */ role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm; - role.base.level = PT64_ROOT_4LEVEL; + role.base.level = level; role.base.gpte_is_8_bytes = true; role.base.direct = false; role.base.ad_disabled = !accessed_dirty; @@ -5043,17 +5042,17 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu 
*vcpu, bool execonly, bool accessed_dirty, gpa_t new_eptp) { struct kvm_mmu *context = vcpu->arch.mmu; + u8 level = vmx_eptp_page_walk_level(new_eptp); union kvm_mmu_role new_role = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, - execonly); + execonly, level); __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false); - new_role.base.word &= mmu_base_role_mask.word; if (new_role.as_u64 == context->mmu_role.as_u64) return; - context->shadow_root_level = PT64_ROOT_4LEVEL; + context->shadow_root_level = level; context->nx = true; context->ept_ad = accessed_dirty; @@ -5062,7 +5061,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, context->sync_page = ept_sync_page; context->invlpg = ept_invlpg; context->update_pte = ept_update_pte; - context->root_level = PT64_ROOT_4LEVEL; + context->root_level = level; context->direct_map = false; context->mmu_role.as_u64 = new_role.as_u64; @@ -5079,8 +5078,7 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu) struct kvm_mmu *context = vcpu->arch.mmu; kvm_init_shadow_mmu(vcpu); - context->set_cr3 = kvm_x86_ops->set_cr3; - context->get_cr3 = get_cr3; + context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; } @@ -5090,12 +5088,11 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) union kvm_mmu_role new_role = kvm_calc_mmu_role_common(vcpu, false); struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; - new_role.base.word &= mmu_base_role_mask.word; if (new_role.as_u64 == g_context->mmu_role.as_u64) return; g_context->mmu_role.as_u64 = new_role.as_u64; - g_context->get_cr3 = get_cr3; + g_context->get_guest_pgd = get_cr3; g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; @@ -5185,8 +5182,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) kvm_mmu_sync_roots(vcpu); if (r) goto out; - kvm_mmu_load_cr3(vcpu); - kvm_x86_ops->tlb_flush(vcpu, true); + kvm_mmu_load_pgd(vcpu); + kvm_x86_ops.tlb_flush(vcpu, true); out: return r; } @@ -5329,6 +5326,22 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) return spte; } +/* + * Ignore various flags when determining if a SPTE can be immediately + * overwritten for the current MMU. + * - level: explicitly checked in mmu_pte_write_new_pte(), and will never + * match the current MMU role, as MMU's level tracks the root level. 
+ * - access: updated based on the new guest PTE + * - quadrant: handled by get_written_sptes() + * - invalid: always false (loop only walks valid shadow pages) + */ +static const union kvm_mmu_page_role role_ign = { + .level = 0xf, + .access = 0x7, + .quadrant = 0x3, + .invalid = 0x1, +}; + static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes, struct kvm_page_track_notifier_node *node) @@ -5384,8 +5397,8 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, entry = *spte; mmu_page_zap_pte(vcpu->kvm, sp, spte); if (gentry && - !((sp->role.word ^ base_role) - & mmu_base_role_mask.word) && rmap_can_add(vcpu)) + !((sp->role.word ^ base_role) & ~role_ign.word) && + rmap_can_add(vcpu)) mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); if (need_remote_flush(entry, *spte)) remote_flush = true; @@ -5416,18 +5429,12 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len) { - int r, emulation_type = 0; + int r, emulation_type = EMULTYPE_PF; bool direct = vcpu->arch.mmu->direct_map; if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa))) return RET_PF_RETRY; - /* With shadow page tables, fault_address contains a GVA or nGPA. */ - if (vcpu->arch.mmu->direct_map) { - vcpu->arch.gpa_available = true; - vcpu->arch.gpa_val = cr2_or_gpa; - } - r = RET_PF_INVALID; if (unlikely(error_code & PFERR_RSVD_MASK)) { r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct); @@ -5471,7 +5478,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, * for L1 isn't going to magically fix whatever issue cause L2 to fail. */ if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu)) - emulation_type = EMULTYPE_ALLOW_RETRY; + emulation_type |= EMULTYPE_ALLOW_RETRY_PF; emulate: /* * On AMD platforms, under certain conditions insn_len may be zero on #NPF. @@ -5481,7 +5488,7 @@ emulate: * guest, with the exception of AMD Erratum 1096 which is unrecoverable. */ if (unlikely(insn && !insn_len)) { - if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu)) + if (!kvm_x86_ops.need_emulation_on_page_fault(vcpu)) return 1; } @@ -5516,7 +5523,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) if (VALID_PAGE(mmu->prev_roots[i].hpa)) mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); - kvm_x86_ops->tlb_flush_gva(vcpu, gva); + kvm_x86_ops.tlb_flush_gva(vcpu, gva); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); @@ -5541,7 +5548,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } if (tlb_flush) - kvm_x86_ops->tlb_flush_gva(vcpu, gva); + kvm_x86_ops.tlb_flush_gva(vcpu, gva); ++vcpu->stat.invlpg; @@ -5553,18 +5560,25 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); -void kvm_enable_tdp(void) +void kvm_configure_mmu(bool enable_tdp, int tdp_page_level) { - tdp_enabled = true; -} -EXPORT_SYMBOL_GPL(kvm_enable_tdp); + tdp_enabled = enable_tdp; -void kvm_disable_tdp(void) -{ - tdp_enabled = false; + /* + * max_page_level reflects the capabilities of KVM's MMU irrespective + * of kernel support, e.g. KVM may be capable of using 1GB pages when + * the kernel is not. But, KVM never creates a page size greater than + * what is used by the kernel for any given HVA, i.e. the kernel's + * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust(). 
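/*
 * Illustrative sketch (not part of the patch): the role_ign mask above is
 * used as "!((sp->role.word ^ base_role) & ~role_ign.word)" -- XOR the two
 * packed role words so only differing bits remain, then mask off the fields
 * that are allowed to differ; a zero result means the roles match for this
 * purpose.  Generic form of the idiom:
 */
#include <stdbool.h>
#include <stdint.h>

static inline bool roles_match(uint32_t a, uint32_t b, uint32_t ignore_mask)
{
        return ((a ^ b) & ~ignore_mask) == 0;
}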
+ */ + if (tdp_enabled) + max_page_level = tdp_page_level; + else if (boot_cpu_has(X86_FEATURE_GBPAGES)) + max_page_level = PT_PDPE_LEVEL; + else + max_page_level = PT_DIRECTORY_LEVEL; } -EXPORT_SYMBOL_GPL(kvm_disable_tdp); - +EXPORT_SYMBOL_GPL(kvm_configure_mmu); /* The return value indicates if tlb flush on all vcpus is needed. */ typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head); @@ -5658,7 +5672,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can * skip allocating the PDP table. */ - if (tdp_enabled && kvm_x86_ops->get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) + if (tdp_enabled && kvm_x86_ops.get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) return 0; page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); @@ -5860,23 +5874,17 @@ static bool slot_rmap_write_protect(struct kvm *kvm, } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot) + struct kvm_memory_slot *memslot, + int start_level) { bool flush; spin_lock(&kvm->mmu_lock); - flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect, - false); + flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, + start_level, PT_MAX_HUGEPAGE_LEVEL, false); spin_unlock(&kvm->mmu_lock); /* - * kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log() - * which do tlb flush out of mmu-lock should be serialized by - * kvm->slots_lock otherwise tlb flush would be missed. - */ - lockdep_assert_held(&kvm->slots_lock); - - /* * We can flush all the TLBs out of the mmu lock without TLB * corruption since we just change the spte from writable to * readonly so that we only need to care the case of changing @@ -5888,8 +5896,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, * on PT_WRITABLE_MASK anymore. */ if (flush) - kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, - memslot->npages); + kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, @@ -5941,6 +5948,21 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, spin_unlock(&kvm->mmu_lock); } +void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + /* + * All current use cases for flushing the TLBs for a specific memslot + * are related to dirty logging, and do the TLB flush out of mmu_lock. + * The interaction between the various operations on memslot must be + * serialized by slots_locks to ensure the TLB flush from one operation + * is observed by any other operation on the same memslot. + */ + lockdep_assert_held(&kvm->slots_lock); + kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, + memslot->npages); +} + void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) { @@ -5950,8 +5972,6 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); spin_unlock(&kvm->mmu_lock); - lockdep_assert_held(&kvm->slots_lock); - /* * It's also safe to flush TLBs out of mmu lock here as currently this * function is only used for dirty logging, in which case flushing TLB @@ -5959,8 +5979,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, * dirty_bitmap. 
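/*
 * Illustrative sketch (not part of the patch): kvm_configure_mmu() above
 * replaces kvm_enable_tdp()/kvm_disable_tdp() and also records the MMU's
 * page-size ceiling -- with TDP the vendor-supplied level wins, with shadow
 * paging it is 1GB pages only if the host CPU has gbpages, otherwise 2MB.
 * Boiled down, with level constants mirroring the PT_*_LEVEL names:
 */
#include <stdbool.h>

enum { PAGE_TABLE_LEVEL = 1, DIRECTORY_LEVEL = 2, PDPE_LEVEL = 3 };

static int pick_max_page_level(bool tdp_enabled, int tdp_page_level,
                               bool host_has_gbpages)
{
        if (tdp_enabled)
                return tdp_page_level;
        return host_has_gbpages ? PDPE_LEVEL : DIRECTORY_LEVEL;
}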
*/ if (flush) - kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, - memslot->npages); + kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); @@ -5974,12 +5993,8 @@ void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, false); spin_unlock(&kvm->mmu_lock); - /* see kvm_mmu_slot_remove_write_access */ - lockdep_assert_held(&kvm->slots_lock); - if (flush) - kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, - memslot->npages); + kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); @@ -5992,12 +6007,8 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm, flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); spin_unlock(&kvm->mmu_lock); - lockdep_assert_held(&kvm->slots_lock); - - /* see kvm_mmu_slot_leaf_clear_dirty */ if (flush) - kvm_flush_remote_tlbs_with_address(kvm, memslot->base_gfn, - memslot->npages); + kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty); diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 3521e2d176f2..ddc1ec3bdacd 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -14,22 +14,18 @@ #include <linux/kvm_host.h> #include <linux/rculist.h> -#include <asm/kvm_host.h> #include <asm/kvm_page_track.h> #include "mmu.h" -void kvm_page_track_free_memslot(struct kvm_memory_slot *free, - struct kvm_memory_slot *dont) +void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) { int i; - for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) - if (!dont || free->arch.gfn_track[i] != - dont->arch.gfn_track[i]) { - kvfree(free->arch.gfn_track[i]); - free->arch.gfn_track[i] = NULL; - } + for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { + kvfree(slot->arch.gfn_track[i]); + slot->arch.gfn_track[i] = NULL; + } } int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, @@ -48,7 +44,7 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, return 0; track_free: - kvm_page_track_free_memslot(slot, NULL); + kvm_page_track_free_memslot(slot); return -ENOMEM; } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 21a3320f166a..9bdf9b7d9a96 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -66,7 +66,7 @@ #define PT_GUEST_ACCESSED_SHIFT 8 #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad) #define CMPXCHG cmpxchg64 - #define PT_MAX_FULL_LEVELS 4 + #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL #else #error Invalid PTTYPE value #endif @@ -333,7 +333,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: walker->level = mmu->root_level; - pte = mmu->get_cr3(vcpu); + pte = mmu->get_guest_pgd(vcpu); have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); #if PTTYPE == 64 diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index bcc6a73d6628..a5078841bdac 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -111,7 +111,7 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, .config = config, }; - attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); + attr.sample_period = get_sample_period(pmc, pmc->counter); if (in_tx) attr.config |= HSW_IN_TX; @@ -158,7 +158,7 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) /* recalibrate sample period and check if it's accepted by perf core */ if (perf_event_period(pmc->perf_event, - (-pmc->counter) & pmc_bitmask(pmc))) + get_sample_period(pmc, pmc->counter))) return 
false; /* reuse perf_event to serve as pmc_reprogram_counter() does*/ @@ -211,7 +211,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) ARCH_PERFMON_EVENTSEL_CMASK | HSW_IN_TX | HSW_IN_TX_CHECKPOINTED))) { - config = kvm_x86_ops->pmu_ops->find_arch_event(pmc_to_pmu(pmc), + config = kvm_x86_ops.pmu_ops->find_arch_event(pmc_to_pmu(pmc), event_select, unit_mask); if (config != PERF_COUNT_HW_MAX) @@ -265,7 +265,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx) pmc->current_config = (u64)ctrl; pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE, - kvm_x86_ops->pmu_ops->find_fixed_event(idx), + kvm_x86_ops.pmu_ops->find_fixed_event(idx), !(en_field & 0x2), /* exclude user */ !(en_field & 0x1), /* exclude kernel */ pmi, false, false); @@ -274,7 +274,7 @@ EXPORT_SYMBOL_GPL(reprogram_fixed_counter); void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx) { - struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx); + struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx); if (!pmc) return; @@ -296,7 +296,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) int bit; for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) { - struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, bit); + struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, bit); if (unlikely(!pmc || !pmc->perf_event)) { clear_bit(bit, pmu->reprogram_pmi); @@ -318,7 +318,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) /* check if idx is a valid index to access PMU */ int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { - return kvm_x86_ops->pmu_ops->is_valid_rdpmc_ecx(vcpu, idx); + return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx); } bool is_vmware_backdoor_pmc(u32 pmc_idx) @@ -368,7 +368,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (is_vmware_backdoor_pmc(idx)) return kvm_pmu_rdpmc_vmware(vcpu, idx, data); - pmc = kvm_x86_ops->pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask); + pmc = kvm_x86_ops.pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask); if (!pmc) return 1; @@ -384,14 +384,14 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { - return kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, msr) || - kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, msr); + return kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr) || + kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, msr); } static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); - struct kvm_pmc *pmc = kvm_x86_ops->pmu_ops->msr_idx_to_pmc(vcpu, msr); + struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr); if (pmc) __set_bit(pmc->idx, pmu->pmc_in_use); @@ -399,13 +399,13 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data) { - return kvm_x86_ops->pmu_ops->get_msr(vcpu, msr, data); + return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr, data); } int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); - return kvm_x86_ops->pmu_ops->set_msr(vcpu, msr_info); + return kvm_x86_ops.pmu_ops->set_msr(vcpu, msr_info); } /* refresh PMU settings. 
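/*
 * Illustrative sketch (not part of the patch): get_sample_period() above
 * converts a guest PMC value into a perf sample period -- the counter counts
 * up and fires on overflow, so the period is the distance to the wrap point
 * truncated to the counter width, and a zero result (counter exactly at the
 * wrap) is bumped to one full wrap so perf never sees a period of 0.
 * Stand-alone form, with the counter width expressed as a bit mask:
 */
#include <stdint.h>

static uint64_t sample_period(uint64_t counter, uint64_t counter_mask)
{
        uint64_t period = (0 - counter) & counter_mask;

        return period ? period : counter_mask + 1;
}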
This function generally is called when underlying @@ -414,7 +414,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ void kvm_pmu_refresh(struct kvm_vcpu *vcpu) { - kvm_x86_ops->pmu_ops->refresh(vcpu); + kvm_x86_ops.pmu_ops->refresh(vcpu); } void kvm_pmu_reset(struct kvm_vcpu *vcpu) @@ -422,7 +422,7 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); irq_work_sync(&pmu->irq_work); - kvm_x86_ops->pmu_ops->reset(vcpu); + kvm_x86_ops.pmu_ops->reset(vcpu); } void kvm_pmu_init(struct kvm_vcpu *vcpu) @@ -430,7 +430,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu) struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); memset(pmu, 0, sizeof(*pmu)); - kvm_x86_ops->pmu_ops->init(vcpu); + kvm_x86_ops.pmu_ops->init(vcpu); init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn); pmu->event_count = 0; pmu->need_cleanup = false; @@ -462,7 +462,7 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu) pmu->pmc_in_use, X86_PMC_IDX_MAX); for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) { - pmc = kvm_x86_ops->pmu_ops->pmc_idx_to_pmc(pmu, i); + pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i); if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc)) pmc_stop_counter(pmc); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 13332984b6d5..a6c78a797cb1 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -88,7 +88,7 @@ static inline bool pmc_is_fixed(struct kvm_pmc *pmc) static inline bool pmc_is_enabled(struct kvm_pmc *pmc) { - return kvm_x86_ops->pmu_ops->pmc_is_enabled(pmc); + return kvm_x86_ops.pmu_ops->pmc_is_enabled(pmc); } static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu, @@ -129,6 +129,15 @@ static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) return NULL; } +static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value) +{ + u64 sample_period = (-counter_value) & pmc_bitmask(pmc); + + if (!sample_period) + sample_period = pmc_bitmask(pmc) + 1; + return sample_period; +} + void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel); void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int fixed_idx); void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 216364cb65a3..851e9cc79930 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -522,10 +522,31 @@ static void recalc_intercepts(struct vcpu_svm *svm) h = &svm->nested.hsave->control; g = &svm->nested; - c->intercept_cr = h->intercept_cr | g->intercept_cr; - c->intercept_dr = h->intercept_dr | g->intercept_dr; - c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions; - c->intercept = h->intercept | g->intercept; + c->intercept_cr = h->intercept_cr; + c->intercept_dr = h->intercept_dr; + c->intercept_exceptions = h->intercept_exceptions; + c->intercept = h->intercept; + + if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { + /* We only want the cr8 intercept bits of L1 */ + c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ); + c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE); + + /* + * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not + * affect any interrupt we may want to inject; therefore, + * interrupt window vmexits are irrelevant to L0. 
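/*
 * Illustrative sketch (not part of the patch): recalc_intercepts() above now
 * starts from L1's intercept bits, strips VMMCALL unconditionally, strips the
 * CR8 and VINTR intercepts only while L2 runs with a virtual interrupt
 * pending, and only then ORs in L2's own intercepts so nothing the nested
 * guest asked for can be lost.  The shape of that merge, with invented
 * parameter names:
 */
#include <stdbool.h>
#include <stdint.h>

static uint64_t merge_intercepts(uint64_t l1_bits, uint64_t l2_bits,
                                 uint64_t strip_always,
                                 uint64_t strip_if_vintr, bool vintr_active)
{
        uint64_t merged = l1_bits & ~strip_always;

        if (vintr_active)
                merged &= ~strip_if_vintr;
        return merged | l2_bits;
}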
+ */ + c->intercept &= ~(1ULL << INTERCEPT_VINTR); + } + + /* We don't want to see VMMCALLs from a nested guest */ + c->intercept &= ~(1ULL << INTERCEPT_VMMCALL); + + c->intercept_cr |= g->intercept_cr; + c->intercept_dr |= g->intercept_dr; + c->intercept_exceptions |= g->intercept_exceptions; + c->intercept |= g->intercept; } static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm) @@ -630,6 +651,11 @@ static inline void clr_intercept(struct vcpu_svm *svm, int bit) recalc_intercepts(svm); } +static inline bool is_intercept(struct vcpu_svm *svm, int bit) +{ + return (svm->vmcb->control.intercept & (1ULL << bit)) != 0; +} + static inline bool vgif_enabled(struct vcpu_svm *svm) { return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK); @@ -1209,6 +1235,7 @@ static int avic_ga_log_notifier(u32 ga_tag) u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); + trace_kvm_avic_ga_log(vm_id, vcpu_id); spin_lock_irqsave(&svm_vm_data_hash_lock, flags); hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) { @@ -1370,6 +1397,29 @@ static void svm_hardware_teardown(void) iopm_base = 0; } +static __init void svm_set_cpu_caps(void) +{ + kvm_set_cpu_caps(); + + supported_xss = 0; + + /* CPUID 0x80000001 and 0x8000000A (SVM features) */ + if (nested) { + kvm_cpu_cap_set(X86_FEATURE_SVM); + + if (nrips) + kvm_cpu_cap_set(X86_FEATURE_NRIPS); + + if (npt_enabled) + kvm_cpu_cap_set(X86_FEATURE_NPT); + } + + /* CPUID 0x80000008 */ + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || + boot_cpu_has(X86_FEATURE_AMD_SSBD)) + kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); +} + static __init int svm_hardware_setup(void) { int cpu; @@ -1388,6 +1438,8 @@ static __init int svm_hardware_setup(void) init_msrpm_offsets(); + supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR); + if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); @@ -1435,16 +1487,11 @@ static __init int svm_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; - if (npt_enabled && !npt) { - printk(KERN_INFO "kvm: Nested Paging disabled\n"); + if (npt_enabled && !npt) npt_enabled = false; - } - if (npt_enabled) { - printk(KERN_INFO "kvm: Nested Paging enabled\n"); - kvm_enable_tdp(); - } else - kvm_disable_tdp(); + kvm_configure_mmu(npt_enabled, PT_PDPE_LEVEL); + pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? 
"en" : "dis"); if (nrips) { if (!boot_cpu_has(X86_FEATURE_NRIPS)) @@ -1480,6 +1527,8 @@ static __init int svm_hardware_setup(void) pr_info("Virtual GIF supported\n"); } + svm_set_cpu_caps(); + return 0; err: @@ -1939,19 +1988,6 @@ static void __unregister_enc_region_locked(struct kvm *kvm, kfree(region); } -static struct kvm *svm_vm_alloc(void) -{ - struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm), - GFP_KERNEL_ACCOUNT | __GFP_ZERO, - PAGE_KERNEL); - return &kvm_svm->kvm; -} - -static void svm_vm_free(struct kvm *kvm) -{ - vfree(to_kvm_svm(kvm)); -} - static void sev_vm_destroy(struct kvm *kvm) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; @@ -2186,7 +2222,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) } init_vmcb(svm); - kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true); + kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false); kvm_rdx_write(vcpu, eax); if (kvm_vcpu_apicv_active(vcpu) && !init_event) @@ -2420,14 +2456,38 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) } } +static inline void svm_enable_vintr(struct vcpu_svm *svm) +{ + struct vmcb_control_area *control; + + /* The following fields are ignored when AVIC is enabled */ + WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu)); + + /* + * This is just a dummy VINTR to actually cause a vmexit to happen. + * Actual injection of virtual interrupts happens through EVENTINJ. + */ + control = &svm->vmcb->control; + control->int_vector = 0x0; + control->int_ctl &= ~V_INTR_PRIO_MASK; + control->int_ctl |= V_IRQ_MASK | + ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); + mark_dirty(svm->vmcb, VMCB_INTR); +} + static void svm_set_vintr(struct vcpu_svm *svm) { set_intercept(svm, INTERCEPT_VINTR); + if (is_intercept(svm, INTERCEPT_VINTR)) + svm_enable_vintr(svm); } static void svm_clear_vintr(struct vcpu_svm *svm) { clr_intercept(svm, INTERCEPT_VINTR); + + svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; + mark_dirty(svm->vmcb, VMCB_INTR); } static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) @@ -2983,15 +3043,6 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) return pdpte; } -static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, - unsigned long root) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - svm->vmcb->control.nested_cr3 = __sme_set(root); - mark_dirty(svm->vmcb, VMCB_NPT); -} - static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, struct x86_exception *fault) { @@ -3027,8 +3078,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu = &vcpu->arch.guest_mmu; kvm_init_shadow_mmu(vcpu); - vcpu->arch.mmu->set_cr3 = nested_svm_set_tdp_cr3; - vcpu->arch.mmu->get_cr3 = nested_svm_get_tdp_cr3; + vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu); @@ -3089,43 +3139,36 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, return vmexit; } -/* This function returns true if it is save to enable the irq window */ -static inline bool nested_svm_intr(struct vcpu_svm *svm) +static void nested_svm_intr(struct vcpu_svm *svm) { - if (!is_guest_mode(&svm->vcpu)) - return true; - - if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) - return true; - - if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) - return false; - - /* - * if vmexit was already requested (by intercepted exception - * for instance) do not overwrite it with "external 
interrupt" - * vmexit. - */ - if (svm->nested.exit_required) - return false; - svm->vmcb->control.exit_code = SVM_EXIT_INTR; svm->vmcb->control.exit_info_1 = 0; svm->vmcb->control.exit_info_2 = 0; - if (svm->nested.intercept & 1ULL) { - /* - * The #vmexit can't be emulated here directly because this - * code path runs with irqs and preemption disabled. A - * #vmexit emulation might sleep. Only signal request for - * the #vmexit here. - */ - svm->nested.exit_required = true; - trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); - return false; + /* nested_svm_vmexit this gets called afterwards from handle_exit */ + svm->nested.exit_required = true; + trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip); +} + +static bool nested_exit_on_intr(struct vcpu_svm *svm) +{ + return (svm->nested.intercept & 1ULL); +} + +static int svm_check_nested_events(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + bool block_nested_events = + kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required; + + if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(svm)) { + if (block_nested_events) + return -EBUSY; + nested_svm_intr(svm); + return 0; } - return true; + return 0; } /* This function returns true if it is save to enable the nmi window */ @@ -3244,9 +3287,6 @@ static int nested_svm_exit_special(struct vcpu_svm *svm) return NESTED_EXIT_CONTINUE; } -/* - * If this function returns true, this #vmexit was already handled - */ static int nested_svm_intercept(struct vcpu_svm *svm) { u32 exit_code = svm->vmcb->control.exit_code; @@ -3521,6 +3561,9 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm) static bool nested_vmcb_checks(struct vmcb *vmcb) { + if ((vmcb->save.efer & EFER_SVME) == 0) + return false; + if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0) return false; @@ -3537,6 +3580,10 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, struct vmcb *nested_vmcb, struct kvm_host_map *map) { + bool evaluate_pending_interrupts = + is_intercept(svm, INTERCEPT_VINTR) || + is_intercept(svm, INTERCEPT_IRET); + if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) svm->vcpu.arch.hflags |= HF_HIF_MASK; else @@ -3596,15 +3643,6 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, else svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; - if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { - /* We only want the cr8 intercept bits of the guest */ - clr_cr_intercept(svm, INTERCEPT_CR8_READ); - clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); - } - - /* We don't want to see VMMCALLs from a nested guest */ - clr_intercept(svm, INTERCEPT_VMMCALL); - svm->vcpu.arch.tsc_offset += nested_vmcb->control.tsc_offset; svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset; @@ -3632,7 +3670,21 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm->nested.vmcb = vmcb_gpa; + /* + * If L1 had a pending IRQ/NMI before executing VMRUN, + * which wasn't delivered because it was disallowed (e.g. + * interrupts disabled), L0 needs to evaluate if this pending + * event should cause an exit from L2 to L1 or be delivered + * directly to L2. + * + * Usually this would be handled by the processor noticing an + * IRQ/NMI window request. However, VMRUN can unblock interrupts + * by implicitly setting GIF, so force L0 to perform pending event + * evaluation by requesting a KVM_REQ_EVENT. 
+ */ enable_gif(svm); + if (unlikely(evaluate_pending_interrupts)) + kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); mark_all_dirty(svm->vmcb); } @@ -3834,11 +3886,8 @@ static int clgi_interception(struct vcpu_svm *svm) disable_gif(svm); /* After a CLGI no interrupts should come */ - if (!kvm_vcpu_apicv_active(&svm->vcpu)) { + if (!kvm_vcpu_apicv_active(&svm->vcpu)) svm_clear_vintr(svm); - svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; - mark_dirty(svm->vmcb, VMCB_INTR); - } return ret; } @@ -5124,19 +5173,6 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) ++vcpu->stat.nmi_injections; } -static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) -{ - struct vmcb_control_area *control; - - /* The following fields are ignored when AVIC is enabled */ - control = &svm->vmcb->control; - control->int_vector = irq; - control->int_ctl &= ~V_INTR_PRIO_MASK; - control->int_ctl |= V_IRQ_MASK | - ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); - mark_dirty(svm->vmcb, VMCB_INTR); -} - static void svm_set_irq(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -5525,18 +5561,15 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - int ret; if (!gif_set(svm) || (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) return 0; - ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF); - - if (is_guest_mode(vcpu)) - return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); - - return ret; + if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)) + return !!(svm->vcpu.arch.hflags & HF_HIF_MASK); + else + return !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF); } static void enable_irq_window(struct kvm_vcpu *vcpu) @@ -5551,7 +5584,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) * enabled, the STGI interception will not occur. Enable the irq * window under the assumption that the hardware will set the GIF. */ - if ((vgif_enabled(svm) || gif_set(svm)) && nested_svm_intr(svm)) { + if (vgif_enabled(svm) || gif_set(svm)) { /* * IRQ window is not needed when AVIC is enabled, * unless we have pending ExtINT since it cannot be injected @@ -5560,7 +5593,6 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) */ svm_toggle_avic_for_irq_window(vcpu, false); svm_set_vintr(svm); - svm_inject_irq(svm, 0x0); } } @@ -5946,24 +5978,30 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) } STACK_FRAME_NON_STANDARD(svm_vcpu_run); -static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) +static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); + bool update_guest_cr3 = true; + unsigned long cr3; - svm->vmcb->save.cr3 = __sme_set(root); - mark_dirty(svm->vmcb, VMCB_CR); -} - -static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) -{ - struct vcpu_svm *svm = to_svm(vcpu); + cr3 = __sme_set(root); + if (npt_enabled) { + svm->vmcb->control.nested_cr3 = cr3; + mark_dirty(svm->vmcb, VMCB_NPT); - svm->vmcb->control.nested_cr3 = __sme_set(root); - mark_dirty(svm->vmcb, VMCB_NPT); + /* Loading L2's CR3 is handled by enter_svm_guest_mode. */ + if (is_guest_mode(vcpu)) + update_guest_cr3 = false; + else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) + cr3 = vcpu->arch.cr3; + else /* CR3 is already up-to-date. 
*/ + update_guest_cr3 = false; + } - /* Also sync guest cr3 here in case we live migrate */ - svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); - mark_dirty(svm->vmcb, VMCB_CR); + if (update_guest_cr3) { + svm->vmcb->save.cr3 = cr3; + mark_dirty(svm->vmcb, VMCB_CR); + } } static int is_disabled(void) @@ -6025,12 +6063,19 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) boot_cpu_has(X86_FEATURE_XSAVES); /* Update nrips enabled cache */ - svm->nrips_enabled = !!guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); + svm->nrips_enabled = kvm_cpu_cap_has(X86_FEATURE_NRIPS) && + guest_cpuid_has(&svm->vcpu, X86_FEATURE_NRIPS); if (!kvm_vcpu_apicv_active(vcpu)) return; - guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC); + /* + * AVIC does not work with an x2APIC mode guest. If the X2APIC feature + * is exposed to the guest, disable AVIC. + */ + if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC)) + kvm_request_apicv_update(vcpu->kvm, false, + APICV_INHIBIT_REASON_X2APIC); /* * Currently, AVIC does not work with nested virtualization. @@ -6041,88 +6086,11 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu) APICV_INHIBIT_REASON_NESTED); } -#define F feature_bit - -static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) -{ - switch (func) { - case 0x1: - if (avic) - entry->ecx &= ~F(X2APIC); - break; - case 0x80000001: - if (nested) - entry->ecx |= (1 << 2); /* Set SVM bit */ - break; - case 0x80000008: - if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || - boot_cpu_has(X86_FEATURE_AMD_SSBD)) - entry->ebx |= F(VIRT_SSBD); - break; - case 0x8000000A: - entry->eax = 1; /* SVM revision 1 */ - entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper - ASID emulation to nested SVM */ - entry->ecx = 0; /* Reserved */ - entry->edx = 0; /* Per default do not support any - additional features */ - - /* Support next_rip if host supports it */ - if (boot_cpu_has(X86_FEATURE_NRIPS)) - entry->edx |= F(NRIPS); - - /* Support NPT for the guest if enabled */ - if (npt_enabled) - entry->edx |= F(NPT); - - } -} - -static int svm_get_lpage_level(void) -{ - return PT_PDPE_LEVEL; -} - -static bool svm_rdtscp_supported(void) -{ - return boot_cpu_has(X86_FEATURE_RDTSCP); -} - -static bool svm_invpcid_supported(void) -{ - return false; -} - -static bool svm_mpx_supported(void) -{ - return false; -} - -static bool svm_xsaves_supported(void) -{ - return boot_cpu_has(X86_FEATURE_XSAVES); -} - -static bool svm_umip_emulated(void) -{ - return false; -} - -static bool svm_pt_supported(void) -{ - return false; -} - static bool svm_has_wbinvd_exit(void) { return true; } -static bool svm_pku_supported(void) -{ - return false; -} - #define PRE_EX(exit) { .exit_code = (exit), \ .stage = X86_ICPT_PRE_EXCEPT, } #define POST_EX(exit) { .exit_code = (exit), \ @@ -6189,7 +6157,8 @@ static const struct __x86_intercept { static int svm_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, - enum x86_intercept_stage stage) + enum x86_intercept_stage stage, + struct x86_exception *exception) { struct vcpu_svm *svm = to_svm(vcpu); int vmexit, ret = X86EMUL_CONTINUE; @@ -7371,7 +7340,7 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) * TODO: Last condition latch INIT signals on vCPU when * vCPU is in guest-mode and vmcb12 defines intercept on INIT. * To properly emulate the INIT intercept, SVM should implement - * kvm_x86_ops->check_nested_events() and call nested_svm_vmexit() + * kvm_x86_ops.check_nested_events() and call nested_svm_vmexit() * there if an INIT signal is pending. 
*/ return !gif_set(svm) || @@ -7384,7 +7353,8 @@ static bool svm_check_apicv_inhibit_reasons(ulong bit) BIT(APICV_INHIBIT_REASON_HYPERV) | BIT(APICV_INHIBIT_REASON_NESTED) | BIT(APICV_INHIBIT_REASON_IRQWIN) | - BIT(APICV_INHIBIT_REASON_PIT_REINJ); + BIT(APICV_INHIBIT_REASON_PIT_REINJ) | + BIT(APICV_INHIBIT_REASON_X2APIC); return supported & BIT(bit); } @@ -7394,12 +7364,8 @@ static void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate) avic_update_access_page(kvm, activate); } -static struct kvm_x86_ops svm_x86_ops __ro_after_init = { - .cpu_has_kvm_support = has_svm, - .disabled_by_bios = is_disabled, - .hardware_setup = svm_hardware_setup, +static struct kvm_x86_ops svm_x86_ops __initdata = { .hardware_unsetup = svm_hardware_teardown, - .check_processor_compatibility = svm_check_processor_compat, .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, @@ -7409,8 +7375,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .vcpu_free = svm_free_vcpu, .vcpu_reset = svm_vcpu_reset, - .vm_alloc = svm_vm_alloc, - .vm_free = svm_vm_free, + .vm_size = sizeof(struct kvm_svm), .vm_init = svm_vm_init, .vm_destroy = svm_vm_destroy, @@ -7432,7 +7397,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, - .set_cr3 = svm_set_cr3, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, .get_idt = svm_get_idt, @@ -7485,26 +7449,14 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .get_exit_info = svm_get_exit_info, - .get_lpage_level = svm_get_lpage_level, - .cpuid_update = svm_cpuid_update, - .rdtscp_supported = svm_rdtscp_supported, - .invpcid_supported = svm_invpcid_supported, - .mpx_supported = svm_mpx_supported, - .xsaves_supported = svm_xsaves_supported, - .umip_emulated = svm_umip_emulated, - .pt_supported = svm_pt_supported, - .pku_supported = svm_pku_supported, - - .set_supported_cpuid = svm_set_supported_cpuid, - .has_wbinvd_exit = svm_has_wbinvd_exit, .read_l1_tsc_offset = svm_read_l1_tsc_offset, .write_l1_tsc_offset = svm_write_l1_tsc_offset, - .set_tdp_cr3 = set_tdp_cr3, + .load_mmu_pgd = svm_load_mmu_pgd, .check_intercept = svm_check_intercept, .handle_exit_irqoff = svm_handle_exit_irqoff, @@ -7534,11 +7486,22 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, .apic_init_signal_blocked = svm_apic_init_signal_blocked, + + .check_nested_events = svm_check_nested_events, +}; + +static struct kvm_x86_init_ops svm_init_ops __initdata = { + .cpu_has_kvm_support = has_svm, + .disabled_by_bios = is_disabled, + .hardware_setup = svm_hardware_setup, + .check_processor_compatibility = svm_check_processor_compat, + + .runtime_ops = &svm_x86_ops, }; static int __init svm_init(void) { - return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), + return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm), THIS_MODULE); } diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index cef5a344fedb..249062f24b94 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -151,32 +151,38 @@ TRACE_EVENT(kvm_fast_mmio, * Tracepoint for cpuid. 
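The svm.c hunks just above split the setup-time callbacks out of kvm_x86_ops into a new kvm_x86_init_ops that carries a pointer to the runtime ops. The rough shape implied by the svm_init_ops initializer is sketched below; the structure definition itself lives in kvm_host.h, outside this diff, so the field list is inferred rather than quoted:

struct kvm_x86_init_ops {
	int (*cpu_has_kvm_support)(void);
	int (*disabled_by_bios)(void);
	int (*hardware_setup)(void);
	int (*check_processor_compatibility)(void);

	struct kvm_x86_ops *runtime_ops;
};

/*
 * kvm_init() is presumably expected to copy *runtime_ops into a global
 * struct kvm_x86_ops instance, which is why call sites elsewhere in this
 * diff switch from kvm_x86_ops->foo() to kvm_x86_ops.foo().
 */
static int __init svm_init(void)
{
	return kvm_init(&svm_init_ops, sizeof(struct vcpu_svm),
			__alignof__(struct vcpu_svm), THIS_MODULE);
}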
*/ TRACE_EVENT(kvm_cpuid, - TP_PROTO(unsigned int function, unsigned long rax, unsigned long rbx, - unsigned long rcx, unsigned long rdx, bool found), - TP_ARGS(function, rax, rbx, rcx, rdx, found), + TP_PROTO(unsigned int function, unsigned int index, unsigned long rax, + unsigned long rbx, unsigned long rcx, unsigned long rdx, + bool found, bool used_max_basic), + TP_ARGS(function, index, rax, rbx, rcx, rdx, found, used_max_basic), TP_STRUCT__entry( __field( unsigned int, function ) + __field( unsigned int, index ) __field( unsigned long, rax ) __field( unsigned long, rbx ) __field( unsigned long, rcx ) __field( unsigned long, rdx ) __field( bool, found ) + __field( bool, used_max_basic ) ), TP_fast_assign( __entry->function = function; + __entry->index = index; __entry->rax = rax; __entry->rbx = rbx; __entry->rcx = rcx; __entry->rdx = rdx; __entry->found = found; + __entry->used_max_basic = used_max_basic; ), - TP_printk("func %x rax %lx rbx %lx rcx %lx rdx %lx, cpuid entry %s", - __entry->function, __entry->rax, + TP_printk("func %x idx %x rax %lx rbx %lx rcx %lx rdx %lx, cpuid entry %s%s", + __entry->function, __entry->index, __entry->rax, __entry->rbx, __entry->rcx, __entry->rdx, - __entry->found ? "found" : "not found") + __entry->found ? "found" : "not found", + __entry->used_max_basic ? ", used max basic" : "") ); #define AREG(x) { APIC_##x, "APIC_" #x } @@ -240,7 +246,7 @@ TRACE_EVENT(kvm_exit, __entry->guest_rip = kvm_rip_read(vcpu); __entry->isa = isa; __entry->vcpu_id = vcpu->vcpu_id; - kvm_x86_ops->get_exit_info(vcpu, &__entry->info1, + kvm_x86_ops.get_exit_info(vcpu, &__entry->info1, &__entry->info2); ), @@ -744,14 +750,14 @@ TRACE_EVENT(kvm_emulate_insn, ), TP_fast_assign( - __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); - __entry->len = vcpu->arch.emulate_ctxt.fetch.ptr - - vcpu->arch.emulate_ctxt.fetch.data; - __entry->rip = vcpu->arch.emulate_ctxt._eip - __entry->len; + __entry->csbase = kvm_x86_ops.get_segment_base(vcpu, VCPU_SREG_CS); + __entry->len = vcpu->arch.emulate_ctxt->fetch.ptr + - vcpu->arch.emulate_ctxt->fetch.data; + __entry->rip = vcpu->arch.emulate_ctxt->_eip - __entry->len; memcpy(__entry->insn, - vcpu->arch.emulate_ctxt.fetch.data, + vcpu->arch.emulate_ctxt->fetch.data, 15); - __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt.mode); + __entry->flags = kei_decode_mode(vcpu->arch.emulate_ctxt->mode); __entry->failed = failed; ), @@ -1367,6 +1373,24 @@ TRACE_EVENT(kvm_avic_unaccelerated_access, __entry->vec) ); +TRACE_EVENT(kvm_avic_ga_log, + TP_PROTO(u32 vmid, u32 vcpuid), + TP_ARGS(vmid, vcpuid), + + TP_STRUCT__entry( + __field(u32, vmid) + __field(u32, vcpuid) + ), + + TP_fast_assign( + __entry->vmid = vmid; + __entry->vcpuid = vcpuid; + ), + + TP_printk("vmid=%u, vcpuid=%u", + __entry->vmid, __entry->vcpuid) +); + TRACE_EVENT(kvm_hv_timer_state, TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use), TP_ARGS(vcpu_id, hv_timer_in_use), diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index f486e2606247..8903475f751e 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -101,7 +101,7 @@ static inline bool cpu_has_load_perf_global_ctrl(void) (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); } -static inline bool vmx_mpx_supported(void) +static inline bool cpu_has_vmx_mpx(void) { return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) && (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS); @@ -146,11 +146,6 @@ static inline bool 
vmx_umip_emulated(void) SECONDARY_EXEC_DESC; } -static inline bool vmx_pku_supported(void) -{ - return boot_cpu_has(X86_FEATURE_PKU); -} - static inline bool cpu_has_vmx_rdtscp(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -354,4 +349,22 @@ static inline bool cpu_has_vmx_intel_pt(void) (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL); } +/* + * Processor Trace can operate in one of three modes: + * a. system-wide: trace both host/guest and output to host buffer + * b. host-only: only trace host and output to host buffer + * c. host-guest: trace host and guest simultaneously and output to their + * respective buffer + * + * KVM currently only supports (a) and (c). + */ +static inline bool vmx_pt_mode_is_system(void) +{ + return pt_mode == PT_MODE_SYSTEM; +} +static inline bool vmx_pt_mode_is_host_guest(void) +{ + return pt_mode == PT_MODE_HOST_GUEST; +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 6de47f2569c9..e5f7a7ebf27d 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -198,6 +198,13 @@ static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {} static inline void evmcs_touch_msr_bitmap(void) {} #endif /* IS_ENABLED(CONFIG_HYPERV) */ +enum nested_evmptrld_status { + EVMPTRLD_DISABLED, + EVMPTRLD_SUCCEEDED, + EVMPTRLD_VMFAIL, + EVMPTRLD_ERROR, +}; + bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa); uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu); int nested_enable_evmcs(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 9750e590c89d..de232306561a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -353,9 +353,8 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) to_vmx(vcpu)->nested.msrs.ept_caps & VMX_EPT_EXECUTE_ONLY_BIT, nested_ept_ad_enabled(vcpu), - nested_ept_get_cr3(vcpu)); - vcpu->arch.mmu->set_cr3 = vmx_set_cr3; - vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3; + nested_ept_get_eptp(vcpu)); + vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp; vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault; vcpu->arch.mmu->get_pdptr = kvm_pdptr_read; @@ -1910,18 +1909,18 @@ static int copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx) * This is an equivalent of the nested hypervisor executing the vmptrld * instruction. 
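The new nested_evmptrld_status enum above lets nested_vmx_handle_enlightened_vmptrld() distinguish "enlightened VMCS not in use" from genuine mapping or revision failures. How a caller reacts is shown by the nested_vmx_run() hunk further down in this diff; condensed here with comments added, as an illustrative excerpt rather than new code:

	enum nested_evmptrld_status evmptrld_status;

	evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
	if (evmptrld_status == EVMPTRLD_ERROR) {
		/* Mapping the eVMCS page failed: reflect a #UD to the guest. */
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	} else if (evmptrld_status == EVMPTRLD_VMFAIL) {
		/* Bad revision id: fail the VMLAUNCH/VMRESUME instruction. */
		return nested_vmx_failInvalid(vcpu);
	}
	/* EVMPTRLD_DISABLED / EVMPTRLD_SUCCEEDED: continue as before. */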
*/ -static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, - bool from_launch) +static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld( + struct kvm_vcpu *vcpu, bool from_launch) { struct vcpu_vmx *vmx = to_vmx(vcpu); bool evmcs_gpa_changed = false; u64 evmcs_gpa; if (likely(!vmx->nested.enlightened_vmcs_enabled)) - return 1; + return EVMPTRLD_DISABLED; if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) - return 1; + return EVMPTRLD_DISABLED; if (unlikely(!vmx->nested.hv_evmcs || evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) { @@ -1932,7 +1931,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa), &vmx->nested.hv_evmcs_map)) - return 0; + return EVMPTRLD_ERROR; vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva; @@ -1961,7 +1960,7 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) && (vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) { nested_release_evmcs(vcpu); - return 0; + return EVMPTRLD_VMFAIL; } vmx->nested.dirty_vmcs12 = true; @@ -1990,21 +1989,13 @@ static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu, vmx->nested.hv_evmcs->hv_clean_fields &= ~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL; - return 1; + return EVMPTRLD_SUCCEEDED; } void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * hv_evmcs may end up being not mapped after migration (when - * L2 was running), map it here to make sure vmcs12 changes are - * properly reflected. - */ - if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) - nested_vmx_handle_enlightened_vmptrld(vcpu, false); - if (vmx->nested.hv_evmcs) { copy_vmcs12_to_enlightened(vmx); /* All fields are clean */ @@ -2475,9 +2466,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * If L1 use EPT, then L0 needs to execute INVEPT on * EPTP02 instead of EPTP01. Therefore, delay TLB * flush until vmcs02->eptp is fully updated by - * KVM_REQ_LOAD_CR3. Note that this assumes + * KVM_REQ_LOAD_MMU_PGD. Note that this assumes * KVM_REQ_TLB_FLUSH is evaluated after - * KVM_REQ_LOAD_CR3 in vcpu_enter_guest(). + * KVM_REQ_LOAD_MMU_PGD in vcpu_enter_guest(). */ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); } @@ -2522,7 +2513,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, /* * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12 * on nested VM-Exit, which can occur without actually running L2 and - * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with + * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the * transition to HLT instead of running L2. 
*/ @@ -2564,13 +2555,13 @@ static int nested_vmx_check_nmi_controls(struct vmcs12 *vmcs12) return 0; } -static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) +static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp) { struct vcpu_vmx *vmx = to_vmx(vcpu); int maxphyaddr = cpuid_maxphyaddr(vcpu); /* Check for memory type validity */ - switch (address & VMX_EPTP_MT_MASK) { + switch (new_eptp & VMX_EPTP_MT_MASK) { case VMX_EPTP_MT_UC: if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT))) return false; @@ -2583,16 +2574,26 @@ static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address) return false; } - /* only 4 levels page-walk length are valid */ - if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4)) + /* Page-walk levels validity. */ + switch (new_eptp & VMX_EPTP_PWL_MASK) { + case VMX_EPTP_PWL_5: + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT))) + return false; + break; + case VMX_EPTP_PWL_4: + if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT))) + return false; + break; + default: return false; + } /* Reserved bits should not be set */ - if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f))) + if (CC(new_eptp >> maxphyaddr || ((new_eptp >> 7) & 0x1f))) return false; /* AD, if set, should be supported */ - if (address & VMX_EPTP_AD_ENABLE_BIT) { + if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) { if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT))) return false; } @@ -2641,7 +2642,7 @@ static int nested_check_vm_execution_controls(struct kvm_vcpu *vcpu, return -EINVAL; if (nested_cpu_has_ept(vmcs12) && - CC(!valid_ept_address(vcpu, vmcs12->ept_pointer))) + CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer))) return -EINVAL; if (nested_cpu_has_vmfunc(vmcs12)) { @@ -2961,7 +2962,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu) /* * Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS, * which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to - * be written (by preparve_vmcs02()) before the "real" VMEnter, i.e. + * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e. * there is no need to preserve other bits or save/restore the field. */ vmcs_writel(GUEST_RFLAGS, 0); @@ -3053,6 +3054,27 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) struct page *page; u64 hpa; + /* + * hv_evmcs may end up being not mapped after migration (when + * L2 was running), map it here to make sure vmcs12 changes are + * properly reflected. 
+ */ + if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) { + enum nested_evmptrld_status evmptrld_status = + nested_vmx_handle_enlightened_vmptrld(vcpu, false); + + if (evmptrld_status == EVMPTRLD_VMFAIL || + evmptrld_status == EVMPTRLD_ERROR) { + pr_debug_ratelimited("%s: enlightened vmptrld failed\n", + __func__); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = + KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return false; + } + } + if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { /* * Translate L1 physical address to host physical @@ -3316,12 +3338,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) enum nvmx_vmentry_status status; struct vcpu_vmx *vmx = to_vmx(vcpu); u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu); + enum nested_evmptrld_status evmptrld_status; if (!nested_vmx_check_permission(vcpu)) return 1; - if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch)) + evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch); + if (evmptrld_status == EVMPTRLD_ERROR) { + kvm_queue_exception(vcpu, UD_VECTOR); return 1; + } else if (evmptrld_status == EVMPTRLD_VMFAIL) { + return nested_vmx_failInvalid(vcpu); + } if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull) return nested_vmx_failInvalid(vcpu); @@ -3499,7 +3527,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu, } -static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) +void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); gfn_t gfn; @@ -3604,7 +3632,7 @@ static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) vcpu->arch.exception.payload); } -static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) +static int vmx_check_nested_events(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qual; @@ -3680,8 +3708,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } - if ((kvm_cpu_has_interrupt(vcpu) || external_intr) && - nested_exit_on_intr(vcpu)) { + if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(vcpu)) { if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0); @@ -4024,7 +4051,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * * If vmcs12 uses EPT, we need to execute this flush on EPTP01 * and therefore we request the TLB flush to happen only after VMCS EPTP - * has been set by KVM_REQ_LOAD_CR3. + * has been set by KVM_REQ_LOAD_MMU_PGD. */ if (enable_vpid && (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) { @@ -4329,17 +4356,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; if (likely(!vmx->fail)) { - /* - * TODO: SDM says that with acknowledge interrupt on - * exit, bit 31 of the VM-exit interrupt information - * (valid interrupt) is always set to 1 on - * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't - * need kvm_cpu_has_interrupt(). See the commit - * message for details. 
- */ - if (nested_exit_intr_ack_set(vcpu) && - exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && - kvm_cpu_has_interrupt(vcpu)) { + if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT && + nested_exit_intr_ack_set(vcpu)) { int irq = kvm_cpu_get_interrupt(vcpu); WARN_ON(irq < 0); vmcs12->vm_exit_intr_info = irq | @@ -4383,7 +4401,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, * Decode the memory-address operand of a vmx instruction, as recorded on an * exit caused by such an instruction (run by a guest hypervisor). * On success, returns 0. When the operand is invalid, returns 1 and throws - * #UD or #GP. + * #UD, #GP, or #SS. */ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret) @@ -4424,7 +4442,7 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, if (base_is_valid) off += kvm_register_read(vcpu, base_reg); if (index_is_valid) - off += kvm_register_read(vcpu, index_reg)<<scaling; + off += kvm_register_read(vcpu, index_reg) << scaling; vmx_get_segment(vcpu, &s, seg_reg); /* @@ -4517,7 +4535,7 @@ void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu) return; vmx = to_vmx(vcpu); - if (kvm_x86_ops->pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { + if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) { vmx->nested.msrs.entry_ctls_high |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL; vmx->nested.msrs.exit_ctls_high |= @@ -4603,7 +4621,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) vmx->nested.vmcs02_initialized = false; vmx->nested.vmxon = true; - if (pt_mode == PT_MODE_HOST_GUEST) { + if (vmx_pt_mode_is_host_guest()) { vmx->pt_desc.guest.ctl = 0; pt_update_intercept_for_msr(vmx); } @@ -5235,7 +5253,7 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { u32 index = kvm_rcx_read(vcpu); - u64 address; + u64 new_eptp; bool accessed_dirty; struct kvm_mmu *mmu = vcpu->arch.walk_mmu; @@ -5248,23 +5266,23 @@ static int nested_vmx_eptp_switching(struct kvm_vcpu *vcpu, if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT, - &address, index * 8, 8)) + &new_eptp, index * 8, 8)) return 1; - accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT); + accessed_dirty = !!(new_eptp & VMX_EPTP_AD_ENABLE_BIT); /* * If the (L2) guest does a vmfunc to the currently * active ept pointer, we don't have to do anything else */ - if (vmcs12->ept_pointer != address) { - if (!valid_ept_address(vcpu, address)) + if (vmcs12->ept_pointer != new_eptp) { + if (!nested_vmx_check_eptp(vcpu, new_eptp)) return 1; kvm_mmu_unload(vcpu); mmu->ept_ad = accessed_dirty; mmu->mmu_role.base.ad_disabled = !accessed_dirty; - vmcs12->ept_pointer = address; + vmcs12->ept_pointer = new_eptp; /* * TODO: Check what's the correct approach in case * mmu reload fails. Currently, we just let the next @@ -5525,8 +5543,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - if (vmx->nested.nested_run_pending) - return false; + WARN_ON_ONCE(vmx->nested.nested_run_pending); if (unlikely(vmx->fail)) { trace_kvm_nested_vmenter_failed( @@ -5535,19 +5552,6 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) return true; } - /* - * The host physical addresses of some pages of guest memory - * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC - * Page). 
The CPU may write to these pages via their host - * physical address while L2 is running, bypassing any - * address-translation-based dirty tracking (e.g. EPT write - * protection). - * - * Mark them dirty on every exit from L2 to prevent them from - * getting out of sync with dirty tracking. - */ - nested_mark_vmcs12_pages_dirty(vcpu); - trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, vmcs_readl(EXIT_QUALIFICATION), vmx->idt_vectoring_info, @@ -5628,7 +5632,7 @@ bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) case EXIT_REASON_MWAIT_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING); case EXIT_REASON_MONITOR_TRAP_FLAG: - return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG); + return nested_cpu_has_mtf(vmcs12); case EXIT_REASON_MONITOR_INSTRUCTION: return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING); case EXIT_REASON_PAUSE_INSTRUCTION: @@ -5905,10 +5909,12 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa); } else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) { /* - * Sync eVMCS upon entry as we may not have - * HV_X64_MSR_VP_ASSIST_PAGE set up yet. + * nested_vmx_handle_enlightened_vmptrld() cannot be called + * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be + * restored yet. EVMCS will be mapped from + * nested_get_vmcs12_pages(). */ - vmx->nested.need_vmcs12_to_shadow_sync = true; + kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu); } else { return -EINVAL; } @@ -6130,11 +6136,13 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) /* nested EPT: emulate EPT also to L1 */ msrs->secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; - msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT | - VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; - if (cpu_has_vmx_ept_execute_only()) - msrs->ept_caps |= - VMX_EPT_EXECUTE_ONLY_BIT; + msrs->ept_caps = + VMX_EPT_PAGE_WALK_4_BIT | + VMX_EPT_PAGE_WALK_5_BIT | + VMX_EPTP_WB_BIT | + VMX_EPT_INVEPT_BIT | + VMX_EPT_EXECUTE_ONLY_BIT; + msrs->ept_caps &= ept_caps; msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | @@ -6233,7 +6241,8 @@ void nested_vmx_hardware_unsetup(void) } } -__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) +__init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops, + int (*exit_handlers[])(struct kvm_vcpu *)) { int i; @@ -6269,12 +6278,12 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; - kvm_x86_ops->check_nested_events = vmx_check_nested_events; - kvm_x86_ops->get_nested_state = vmx_get_nested_state; - kvm_x86_ops->set_nested_state = vmx_set_nested_state; - kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages; - kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs; - kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version; + ops->check_nested_events = vmx_check_nested_events; + ops->get_nested_state = vmx_get_nested_state; + ops->set_nested_state = vmx_set_nested_state; + ops->get_vmcs12_pages = nested_get_vmcs12_pages; + ops->nested_enable_evmcs = nested_enable_evmcs; + ops->nested_get_evmcs_version = nested_get_evmcs_version; return 0; } diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 9aeda46f473e..ac56aefa49e3 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -19,7 +19,8 @@ enum nvmx_vmentry_status { void 
vmx_leave_nested(struct kvm_vcpu *vcpu); void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps); void nested_vmx_hardware_unsetup(void); -__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)); +__init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops, + int (*exit_handlers[])(struct kvm_vcpu *)); void nested_vmx_set_vmcs_shadowing_bitmap(void); void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu); enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, @@ -33,6 +34,7 @@ int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata); int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, u32 vmx_instruction_info, bool wr, int len, gva_t *ret); void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu); +void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu); bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port, int size); @@ -60,7 +62,7 @@ static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu) vmx->nested.hv_evmcs; } -static inline unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) +static inline unsigned long nested_ept_get_eptp(struct kvm_vcpu *vcpu) { /* return the page table to be shadowed - in our case, EPT12 */ return get_vmcs12(vcpu)->ept_pointer; @@ -68,7 +70,7 @@ static inline unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu) { - return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT; + return nested_ept_get_eptp(vcpu) & VMX_EPTP_AD_ENABLE_BIT; } /* diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h index 45eaedee2ac0..19717d0a1100 100644 --- a/arch/x86/kvm/vmx/ops.h +++ b/arch/x86/kvm/vmx/ops.h @@ -13,6 +13,8 @@ #define __ex(x) __kvm_handle_fault_on_reboot(x) asmlinkage void vmread_error(unsigned long field, bool fault); +__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, + bool fault); void vmwrite_error(unsigned long field, unsigned long value); void vmclear_error(struct vmcs *vmcs, u64 phys_addr); void vmptrld_error(struct vmcs *vmcs, u64 phys_addr); @@ -70,15 +72,28 @@ static __always_inline unsigned long __vmcs_readl(unsigned long field) asm volatile("1: vmread %2, %1\n\t" ".byte 0x3e\n\t" /* branch taken hint */ "ja 3f\n\t" - "mov %2, %%" _ASM_ARG1 "\n\t" - "xor %%" _ASM_ARG2 ", %%" _ASM_ARG2 "\n\t" - "2: call vmread_error\n\t" - "xor %k1, %k1\n\t" + + /* + * VMREAD failed. Push '0' for @fault, push the failing + * @field, and bounce through the trampoline to preserve + * volatile registers. + */ + "push $0\n\t" + "push %2\n\t" + "2:call vmread_error_trampoline\n\t" + + /* + * Unwind the stack. Note, the trampoline zeros out the + * memory for @fault so that the result is '0' on error. + */ + "pop %2\n\t" + "pop %1\n\t" "3:\n\t" + /* VMREAD faulted. As above, except push '1' for @fault. 
*/ ".pushsection .fixup, \"ax\"\n\t" - "4: mov %2, %%" _ASM_ARG1 "\n\t" - "mov $1, %%" _ASM_ARG2 "\n\t" + "4: push $1\n\t" + "push %2\n\t" "jmp 2b\n\t" ".popsection\n\t" _ASM_EXTABLE(1b, 4b) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index fd21cdb10b79..7c857737b438 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -263,9 +263,15 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); + if (pmc->perf_event) + perf_event_period(pmc->perf_event, + get_sample_period(pmc, data)); return 0; } else if ((pmc = get_fixed_pmc(pmu, msr))) { pmc->counter += data - pmc_read_counter(pmc); + if (pmc->perf_event) + perf_event_period(pmc->perf_event, + get_sample_period(pmc, data)); return 0; } else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) { if (data == pmc->eventsel) @@ -329,7 +335,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); - if (kvm_x86_ops->pt_supported()) + if (vmx_pt_mode_is_host_guest()) pmu->global_ovf_ctrl_mask &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 81ada2ce99e7..9651ba388ba9 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -135,12 +135,12 @@ SYM_FUNC_START(__vmx_vcpu_run) cmpb $0, %bl /* Load guest registers. Don't clobber flags. */ - mov VCPU_RBX(%_ASM_AX), %_ASM_BX mov VCPU_RCX(%_ASM_AX), %_ASM_CX mov VCPU_RDX(%_ASM_AX), %_ASM_DX + mov VCPU_RBX(%_ASM_AX), %_ASM_BX + mov VCPU_RBP(%_ASM_AX), %_ASM_BP mov VCPU_RSI(%_ASM_AX), %_ASM_SI mov VCPU_RDI(%_ASM_AX), %_ASM_DI - mov VCPU_RBP(%_ASM_AX), %_ASM_BP #ifdef CONFIG_X86_64 mov VCPU_R8 (%_ASM_AX), %r8 mov VCPU_R9 (%_ASM_AX), %r9 @@ -168,12 +168,12 @@ SYM_FUNC_START(__vmx_vcpu_run) /* Save all guest registers, including RAX from the stack */ __ASM_SIZE(pop) VCPU_RAX(%_ASM_AX) - mov %_ASM_BX, VCPU_RBX(%_ASM_AX) mov %_ASM_CX, VCPU_RCX(%_ASM_AX) mov %_ASM_DX, VCPU_RDX(%_ASM_AX) + mov %_ASM_BX, VCPU_RBX(%_ASM_AX) + mov %_ASM_BP, VCPU_RBP(%_ASM_AX) mov %_ASM_SI, VCPU_RSI(%_ASM_AX) mov %_ASM_DI, VCPU_RDI(%_ASM_AX) - mov %_ASM_BP, VCPU_RBP(%_ASM_AX) #ifdef CONFIG_X86_64 mov %r8, VCPU_R8 (%_ASM_AX) mov %r9, VCPU_R9 (%_ASM_AX) @@ -197,12 +197,12 @@ SYM_FUNC_START(__vmx_vcpu_run) * free. RSP and RAX are exempt as RSP is restored by hardware during * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail. */ -1: xor %ebx, %ebx - xor %ecx, %ecx +1: xor %ecx, %ecx xor %edx, %edx + xor %ebx, %ebx + xor %ebp, %ebp xor %esi, %esi xor %edi, %edi - xor %ebp, %ebp #ifdef CONFIG_X86_64 xor %r8d, %r8d xor %r9d, %r9d @@ -234,3 +234,61 @@ SYM_FUNC_START(__vmx_vcpu_run) 2: mov $1, %eax jmp 1b SYM_FUNC_END(__vmx_vcpu_run) + +/** + * vmread_error_trampoline - Trampoline from inline asm to vmread_error() + * @field: VMCS field encoding that failed + * @fault: %true if the VMREAD faulted, %false if it failed + + * Save and restore volatile registers across a call to vmread_error(). Note, + * all parameters are passed on the stack. 
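For reference, the calling convention that the inline asm in ops.h (earlier in this diff) and this trampoline agree on can be reconstructed from the pushes in __vmcs_readl() and the WORD_SIZE offsets below; this is an illustrative summary, not patch content:

/*
 * __attribute__((regparm(0))) forces both arguments onto the stack even on
 * 32-bit builds, so the layout on entry is the same everywhere:
 *
 *   sp + 2*WORD_SIZE : @fault (0 from the VM-fail path, 1 from the fixup path)
 *   sp + 1*WORD_SIZE : @field
 *   sp + 0           : return address into the inline asm
 *
 * After "push %_ASM_BP; mov %_ASM_SP, %_ASM_BP" these become the
 * 2*WORD_SIZE(%_ASM_BP) and 3*WORD_SIZE(%_ASM_BP) loads used below, and
 * zeroing 3*WORD_SIZE(%_ASM_BP) is what makes the final "pop %1" in
 * __vmcs_readl() produce 0 on error.
 */
__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
							  bool fault);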
+ */ +SYM_FUNC_START(vmread_error_trampoline) + push %_ASM_BP + mov %_ASM_SP, %_ASM_BP + + push %_ASM_AX + push %_ASM_CX + push %_ASM_DX +#ifdef CONFIG_X86_64 + push %rdi + push %rsi + push %r8 + push %r9 + push %r10 + push %r11 +#endif +#ifdef CONFIG_X86_64 + /* Load @field and @fault to arg1 and arg2 respectively. */ + mov 3*WORD_SIZE(%rbp), %_ASM_ARG2 + mov 2*WORD_SIZE(%rbp), %_ASM_ARG1 +#else + /* Parameters are passed on the stack for 32-bit (see asmlinkage). */ + push 3*WORD_SIZE(%ebp) + push 2*WORD_SIZE(%ebp) +#endif + + call vmread_error + +#ifndef CONFIG_X86_64 + add $8, %esp +#endif + + /* Zero out @fault, which will be popped into the result register. */ + _ASM_MOV $0, 3*WORD_SIZE(%_ASM_BP) + +#ifdef CONFIG_X86_64 + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rsi + pop %rdi +#endif + pop %_ASM_DX + pop %_ASM_CX + pop %_ASM_AX + pop %_ASM_BP + + ret +SYM_FUNC_END(vmread_error_trampoline) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 458e684dfbdc..91749f1254e8 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -437,7 +437,6 @@ static const struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; -u64 host_efer; static unsigned long host_idt_base; /* @@ -658,53 +657,16 @@ static int vmx_set_guest_msr(struct vcpu_vmx *vmx, struct shared_msr_entry *msr, return ret; } -void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs) -{ - vmcs_clear(loaded_vmcs->vmcs); - if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) - vmcs_clear(loaded_vmcs->shadow_vmcs); - loaded_vmcs->cpu = -1; - loaded_vmcs->launched = 0; -} - #ifdef CONFIG_KEXEC_CORE -/* - * This bitmap is used to indicate whether the vmclear - * operation is enabled on all cpus. All disabled by - * default. - */ -static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; - -static inline void crash_enable_local_vmclear(int cpu) -{ - cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static inline void crash_disable_local_vmclear(int cpu) -{ - cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - -static inline int crash_local_vmclear_enabled(int cpu) -{ - return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); -} - static void crash_vmclear_local_loaded_vmcss(void) { int cpu = raw_smp_processor_id(); struct loaded_vmcs *v; - if (!crash_local_vmclear_enabled(cpu)) - return; - list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), loaded_vmcss_on_cpu_link) vmcs_clear(v->vmcs); } -#else -static inline void crash_enable_local_vmclear(int cpu) { } -static inline void crash_disable_local_vmclear(int cpu) { } #endif /* CONFIG_KEXEC_CORE */ static void __loaded_vmcs_clear(void *arg) @@ -716,19 +678,24 @@ static void __loaded_vmcs_clear(void *arg) return; /* vcpu migration can race with cpu offline */ if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) per_cpu(current_vmcs, cpu) = NULL; - crash_disable_local_vmclear(cpu); + + vmcs_clear(loaded_vmcs->vmcs); + if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched) + vmcs_clear(loaded_vmcs->shadow_vmcs); + list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); /* - * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link - * is before setting loaded_vmcs->vcpu to -1 which is done in - * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist - * then adds the vmcs into percpu list before it is deleted. 
+ * Ensure all writes to loaded_vmcs, including deleting it from its + * current percpu list, complete before setting loaded_vmcs->vcpu to + * -1, otherwise a different cpu can see vcpu == -1 first and add + * loaded_vmcs to its percpu list before it's deleted from this cpu's + * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs(). */ smp_wmb(); - loaded_vmcs_init(loaded_vmcs); - crash_enable_local_vmclear(cpu); + loaded_vmcs->cpu = -1; + loaded_vmcs->launched = 0; } void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) @@ -812,7 +779,7 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu) if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; if (enable_ept) - eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ + eb &= ~(1u << PF_VECTOR); /* When we are running a nested L2 guest and L1 specified for it a * certain exception bitmap, we must trap the same exceptions and pass @@ -1063,7 +1030,7 @@ static unsigned long segment_base(u16 selector) static inline bool pt_can_write_msr(struct vcpu_vmx *vmx) { - return (pt_mode == PT_MODE_HOST_GUEST) && + return vmx_pt_mode_is_host_guest() && !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN); } @@ -1097,7 +1064,7 @@ static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range) static void pt_guest_enter(struct vcpu_vmx *vmx) { - if (pt_mode == PT_MODE_SYSTEM) + if (vmx_pt_mode_is_system()) return; /* @@ -1114,7 +1081,7 @@ static void pt_guest_enter(struct vcpu_vmx *vmx) static void pt_guest_exit(struct vcpu_vmx *vmx) { - if (pt_mode == PT_MODE_SYSTEM) + if (vmx_pt_mode_is_system()) return; if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) { @@ -1347,18 +1314,17 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) if (!already_loaded) { loaded_vmcs_clear(vmx->loaded_vmcs); local_irq_disable(); - crash_disable_local_vmclear(cpu); /* - * Read loaded_vmcs->cpu should be before fetching - * loaded_vmcs->loaded_vmcss_on_cpu_link. - * See the comments in __loaded_vmcs_clear(). + * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to + * this cpu's percpu list, otherwise it may not yet be deleted + * from its previous cpu's percpu list. Pairs with the + * smb_wmb() in __loaded_vmcs_clear(). */ smp_rmb(); list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, &per_cpu(loaded_vmcss_on_cpu, cpu)); - crash_enable_local_vmclear(cpu); local_irq_enable(); } @@ -1691,16 +1657,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu) vmx_clear_hlt(vcpu); } -static bool vmx_rdtscp_supported(void) -{ - return cpu_has_vmx_rdtscp(); -} - -static bool vmx_invpcid_supported(void) -{ - return cpu_has_vmx_invpcid(); -} - /* * Swap MSR entry in host/guest MSR entry array. 
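The reworked barrier comments in __loaded_vmcs_clear() and vmx_vcpu_load_vmcs() in the hunks above describe a publish/consume pairing. Distilled into two annotated fragments (illustrative; lifted and simplified from the patch context rather than new logic):

	/* Writer side, __loaded_vmcs_clear() on the old cpu: */
	list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
	smp_wmb();		/* order the list_del ...                    */
	loaded_vmcs->cpu = -1;	/* ... before advertising "no longer loaded" */

	/* Reader side, vmx_vcpu_load_vmcs() on the new cpu: */
	smp_rmb();		/* the earlier read of loaded_vmcs->cpu must  */
				/* not be reordered past the list_add below  */
	list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
		 &per_cpu(loaded_vmcss_on_cpu, cpu));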
*/ @@ -1908,24 +1864,24 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) &msr_info->data); break; case MSR_IA32_RTIT_CTL: - if (pt_mode != PT_MODE_HOST_GUEST) + if (!vmx_pt_mode_is_host_guest()) return 1; msr_info->data = vmx->pt_desc.guest.ctl; break; case MSR_IA32_RTIT_STATUS: - if (pt_mode != PT_MODE_HOST_GUEST) + if (!vmx_pt_mode_is_host_guest()) return 1; msr_info->data = vmx->pt_desc.guest.status; break; case MSR_IA32_RTIT_CR3_MATCH: - if ((pt_mode != PT_MODE_HOST_GUEST) || + if (!vmx_pt_mode_is_host_guest() || !intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_cr3_filtering)) return 1; msr_info->data = vmx->pt_desc.guest.cr3_match; break; case MSR_IA32_RTIT_OUTPUT_BASE: - if ((pt_mode != PT_MODE_HOST_GUEST) || + if (!vmx_pt_mode_is_host_guest() || (!intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output) && !intel_pt_validate_cap(vmx->pt_desc.caps, @@ -1934,7 +1890,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmx->pt_desc.guest.output_base; break; case MSR_IA32_RTIT_OUTPUT_MASK: - if ((pt_mode != PT_MODE_HOST_GUEST) || + if (!vmx_pt_mode_is_host_guest() || (!intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_topa_output) && !intel_pt_validate_cap(vmx->pt_desc.caps, @@ -1944,7 +1900,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: index = msr_info->index - MSR_IA32_RTIT_ADDR0_A; - if ((pt_mode != PT_MODE_HOST_GUEST) || + if (!vmx_pt_mode_is_host_guest() || (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps, PT_CAP_num_address_ranges))) return 1; @@ -2150,7 +2106,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_RTIT_CTL: - if ((pt_mode != PT_MODE_HOST_GUEST) || + if (!vmx_pt_mode_is_host_guest() || vmx_rtit_ctl_check(vcpu, data) || vmx->nested.vmxon) return 1; @@ -2266,18 +2222,33 @@ static __init int vmx_disabled_by_bios(void) !boot_cpu_has(X86_FEATURE_VMX); } -static void kvm_cpu_vmxon(u64 addr) +static int kvm_cpu_vmxon(u64 vmxon_pointer) { + u64 msr; + cr4_set_bits(X86_CR4_VMXE); intel_pt_handle_vmx(1); - asm volatile ("vmxon %0" : : "m"(addr)); + asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t" + _ASM_EXTABLE(1b, %l[fault]) + : : [vmxon_pointer] "m"(vmxon_pointer) + : : fault); + return 0; + +fault: + WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n", + rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr); + intel_pt_handle_vmx(0); + cr4_clear_bits(X86_CR4_VMXE); + + return -EFAULT; } static int hardware_enable(void) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); + int r; if (cr4_read_shadow() & X86_CR4_VMXE) return -EBUSY; @@ -2294,18 +2265,10 @@ static int hardware_enable(void) INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu)); spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu)); - /* - * Now we can enable the vmclear operation in kdump - * since the loaded_vmcss_on_cpu list on this cpu - * has been initialized. - * - * Though the cpu is not in VMX operation now, there - * is no problem to enable the vmclear operation - * for the loaded_vmcss_on_cpu list is empty! 
- */ - crash_enable_local_vmclear(cpu); + r = kvm_cpu_vmxon(phys_addr); + if (r) + return r; - kvm_cpu_vmxon(phys_addr); if (enable_ept) ept_sync_global(); @@ -2617,9 +2580,12 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) if (!loaded_vmcs->vmcs) return -ENOMEM; + vmcs_clear(loaded_vmcs->vmcs); + loaded_vmcs->shadow_vmcs = NULL; loaded_vmcs->hv_timer_soft_disabled = false; - loaded_vmcs_init(loaded_vmcs); + loaded_vmcs->cpu = -1; + loaded_vmcs->launched = 0; if (cpu_has_vmx_msr_bitmap()) { loaded_vmcs->msr_bitmap = (unsigned long *) @@ -3001,9 +2967,8 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static int get_ept_level(struct kvm_vcpu *vcpu) { - /* Nested EPT currently only supports 4-level walks. */ if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu))) - return 4; + return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu)); if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) return 5; return 4; @@ -3023,7 +2988,7 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) return eptp; } -void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; @@ -3035,7 +3000,7 @@ void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) eptp = construct_eptp(vcpu, cr3); vmcs_write64(EPT_POINTER, eptp); - if (kvm_x86_ops->tlb_remote_flush) { + if (kvm_x86_ops.tlb_remote_flush) { spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock); to_vmx(vcpu)->ept_pointer = eptp; to_kvm_vmx(kvm)->ept_pointers_match @@ -4040,7 +4005,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; - if (pt_mode == PT_MODE_SYSTEM) + if (vmx_pt_mode_is_system()) exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX); if (!cpu_need_virtualize_apic_accesses(vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; @@ -4095,7 +4060,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } } - if (vmx_rdtscp_supported()) { + if (cpu_has_vmx_rdtscp()) { bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP); if (!rdtscp_enabled) exec_control &= ~SECONDARY_EXEC_RDTSCP; @@ -4110,7 +4075,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) } } - if (vmx_invpcid_supported()) { + if (cpu_has_vmx_invpcid()) { /* Exposing INVPCID only when PCID is exposed */ bool invpcid_enabled = guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) && @@ -4281,7 +4246,7 @@ static void init_vmcs(struct vcpu_vmx *vmx) if (cpu_has_vmx_encls_vmexit()) vmcs_write64(ENCLS_EXITING_BITMAP, -1ull); - if (pt_mode == PT_MODE_HOST_GUEST) { + if (vmx_pt_mode_is_host_guest()) { memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc)); /* Bit[6~0] are forced to 1, writes are ignored. 
*/ vmx->pt_desc.guest.output_mask = 0x7F; @@ -4509,8 +4474,13 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { - return (!to_vmx(vcpu)->nested.nested_run_pending && - vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && + if (to_vmx(vcpu)->nested.nested_run_pending) + return false; + + if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) + return true; + + return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } @@ -4566,7 +4536,6 @@ static bool rmode_exception(struct kvm_vcpu *vcpu, int vec) case GP_VECTOR: case MF_VECTOR: return true; - break; } return false; } @@ -5343,7 +5312,6 @@ static void vmx_enable_tdp(void) VMX_EPT_RWX_MASK, 0ull); ept_set_mmio_spte_mask(); - kvm_enable_tdp(); } /* @@ -5876,8 +5844,23 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, if (vmx->emulation_required) return handle_invalid_guest_state(vcpu); - if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason)) - return nested_vmx_reflect_vmexit(vcpu, exit_reason); + if (is_guest_mode(vcpu)) { + /* + * The host physical addresses of some pages of guest memory + * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC + * Page). The CPU may write to these pages via their host + * physical address while L2 is running, bypassing any + * address-translation-based dirty tracking (e.g. EPT write + * protection). + * + * Mark them dirty on every exit from L2 to prevent them from + * getting out of sync with dirty tracking. + */ + nested_mark_vmcs12_pages_dirty(vcpu); + + if (nested_vmx_exit_reflected(vcpu, exit_reason)) + return nested_vmx_reflect_vmexit(vcpu, exit_reason); + } if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { dump_vmcs(); @@ -6237,15 +6220,13 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); /* if exit due to PF check for async PF */ - if (is_page_fault(vmx->exit_intr_info)) + if (is_page_fault(vmx->exit_intr_info)) { vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); - /* Handle machine checks before interrupts are enabled */ - if (is_machine_check(vmx->exit_intr_info)) + } else if (is_machine_check(vmx->exit_intr_info)) { kvm_machine_check(); - /* We need to handle NMIs before interrupts are enabled */ - if (is_nmi(vmx->exit_intr_info)) { + } else if (is_nmi(vmx->exit_intr_info)) { kvm_before_interrupt(&vmx->vcpu); asm("int $2"); kvm_after_interrupt(&vmx->vcpu); @@ -6331,11 +6312,6 @@ static bool vmx_has_emulated_msr(int index) } } -static bool vmx_pt_supported(void) -{ - return pt_mode == PT_MODE_HOST_GUEST; -} - static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -6581,7 +6557,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) pt_guest_enter(vmx); - atomic_switch_perf_msrs(vmx); + if (vcpu_to_pmu(vcpu)->version) + atomic_switch_perf_msrs(vmx); atomic_switch_umwait_control_msr(vmx); if (enable_preemption_timer) @@ -6698,20 +6675,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_complete_interrupts(vmx); } -static struct kvm *vmx_vm_alloc(void) -{ - struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx), - GFP_KERNEL_ACCOUNT | __GFP_ZERO, - PAGE_KERNEL); - return &kvm_vmx->kvm; -} - -static void vmx_vm_free(struct kvm *kvm) -{ - kfree(kvm->arch.hyperv.hv_pa_pg); - vfree(to_kvm_vmx(kvm)); -} - static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6914,17 +6877,24 @@ 
static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) u8 cache; u64 ipat = 0; - /* For VT-d and EPT combination - * 1. MMIO: always map as UC - * 2. EPT with VT-d: - * a. VT-d without snooping control feature: can't guarantee the - * result, try to trust guest. - * b. VT-d with snooping control feature: snooping control feature of - * VT-d engine can guarantee the cache correctness. Just set it - * to WB to keep consistent with host. So the same as item 3. - * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep - * consistent with host MTRR + /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in + * memory aliases with conflicting memory types and sometimes MCEs. + * We have to be careful as to what are honored and when. + * + * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to + * UC. The effective memory type is UC or WC depending on guest PAT. + * This was historically the source of MCEs and we want to be + * conservative. + * + * When there is no need to deal with noncoherent DMA (e.g., no VT-d + * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The + * EPT memory type is set to WB. The effective memory type is forced + * WB. + * + * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The + * EPT memory type is used to emulate guest CD/MTRR. */ + if (is_mmio) { cache = MTRR_TYPE_UNCACHABLE; goto exit; @@ -6951,15 +6921,6 @@ exit: return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; } -static int vmx_get_lpage_level(void) -{ - if (enable_ept && !cpu_has_vmx_ept_1g_page()) - return PT_DIRECTORY_LEVEL; - else - /* For shadow and EPT supported 1GB page */ - return PT_PDPE_LEVEL; -} - static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) { /* @@ -7150,10 +7111,37 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) } } -static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) +static __init void vmx_set_cpu_caps(void) { - if (func == 1 && nested) - entry->ecx |= feature_bit(VMX); + kvm_set_cpu_caps(); + + /* CPUID 0x1 */ + if (nested) + kvm_cpu_cap_set(X86_FEATURE_VMX); + + /* CPUID 0x7 */ + if (kvm_mpx_supported()) + kvm_cpu_cap_check_and_set(X86_FEATURE_MPX); + if (cpu_has_vmx_invpcid()) + kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID); + if (vmx_pt_mode_is_host_guest()) + kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT); + + /* PKU is not yet implemented for shadow paging. 
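vmx_set_cpu_caps() here, like svm_set_cpu_caps() earlier in the diff, adjusts the common capability mask after kvm_set_cpu_caps() using the new kvm_cpu_cap_* helpers. Their definitions live in cpuid.h and are not part of this diff; the assumed semantics of the convenience wrapper used in these hunks are sketched below for illustration only:

/*
 * Assumed, not quoted from the patch: kvm_cpu_cap_set()/kvm_cpu_cap_clear()
 * toggle a bit in KVM's own copy of the CPUID feature words, and the
 * check-and-set wrapper gates the bit on host support:
 */
static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature)
{
	/* Advertise the feature to guests only if the host CPU has it. */
	if (boot_cpu_has(x86_feature))
		kvm_cpu_cap_set(x86_feature);
}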
*/ + if (enable_ept && boot_cpu_has(X86_FEATURE_OSPKE)) + kvm_cpu_cap_check_and_set(X86_FEATURE_PKU); + + if (vmx_umip_emulated()) + kvm_cpu_cap_set(X86_FEATURE_UMIP); + + /* CPUID 0xD.1 */ + supported_xss = 0; + if (!vmx_xsaves_supported()) + kvm_cpu_cap_clear(X86_FEATURE_XSAVES); + + /* CPUID 0x80000001 */ + if (!cpu_has_vmx_rdtscp()) + kvm_cpu_cap_clear(X86_FEATURE_RDTSCP); } static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu) @@ -7197,10 +7185,10 @@ static int vmx_check_intercept_io(struct kvm_vcpu *vcpu, static int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, - enum x86_intercept_stage stage) + enum x86_intercept_stage stage, + struct x86_exception *exception) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; switch (info->intercept) { /* @@ -7209,8 +7197,8 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, */ case x86_intercept_rdtscp: if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { - ctxt->exception.vector = UD_VECTOR; - ctxt->exception.error_code_valid = false; + exception->vector = UD_VECTOR; + exception->error_code_valid = false; return X86EMUL_PROPAGATE_FAULT; } break; @@ -7321,7 +7309,8 @@ static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) static void vmx_slot_enable_log_dirty(struct kvm *kvm, struct kvm_memory_slot *slot) { - kvm_mmu_slot_leaf_clear_dirty(kvm, slot); + if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) + kvm_mmu_slot_leaf_clear_dirty(kvm, slot); kvm_mmu_slot_largepage_remove_write_access(kvm, slot); } @@ -7504,7 +7493,7 @@ static void pi_post_block(struct kvm_vcpu *vcpu) static void vmx_post_block(struct kvm_vcpu *vcpu) { - if (kvm_x86_ops->set_hv_timer) + if (kvm_x86_ops.set_hv_timer) kvm_lapic_switch_to_hv_timer(vcpu); pi_post_block(vcpu); @@ -7671,13 +7660,164 @@ static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.vmxon; } +static void hardware_unsetup(void) +{ + if (nested) + nested_vmx_hardware_unsetup(); + + free_kvm_area(); +} + +static bool vmx_check_apicv_inhibit_reasons(ulong bit) +{ + ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | + BIT(APICV_INHIBIT_REASON_HYPERV); + + return supported & BIT(bit); +} + +static struct kvm_x86_ops vmx_x86_ops __initdata = { + .hardware_unsetup = hardware_unsetup, + + .hardware_enable = hardware_enable, + .hardware_disable = hardware_disable, + .cpu_has_accelerated_tpr = report_flexpriority, + .has_emulated_msr = vmx_has_emulated_msr, + + .vm_size = sizeof(struct kvm_vmx), + .vm_init = vmx_vm_init, + + .vcpu_create = vmx_create_vcpu, + .vcpu_free = vmx_free_vcpu, + .vcpu_reset = vmx_vcpu_reset, + + .prepare_guest_switch = vmx_prepare_switch_to_guest, + .vcpu_load = vmx_vcpu_load, + .vcpu_put = vmx_vcpu_put, + + .update_bp_intercept = update_exception_bitmap, + .get_msr_feature = vmx_get_msr_feature, + .get_msr = vmx_get_msr, + .set_msr = vmx_set_msr, + .get_segment_base = vmx_get_segment_base, + .get_segment = vmx_get_segment, + .set_segment = vmx_set_segment, + .get_cpl = vmx_get_cpl, + .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, + .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, + .set_cr0 = vmx_set_cr0, + .set_cr4 = vmx_set_cr4, + .set_efer = vmx_set_efer, + .get_idt = vmx_get_idt, + .set_idt = vmx_set_idt, + .get_gdt = vmx_get_gdt, + .set_gdt = vmx_set_gdt, + .get_dr6 = vmx_get_dr6, + .set_dr6 = vmx_set_dr6, + .set_dr7 = vmx_set_dr7, + .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, + 
.cache_reg = vmx_cache_reg, + .get_rflags = vmx_get_rflags, + .set_rflags = vmx_set_rflags, + + .tlb_flush = vmx_flush_tlb, + .tlb_flush_gva = vmx_flush_tlb_gva, + + .run = vmx_vcpu_run, + .handle_exit = vmx_handle_exit, + .skip_emulated_instruction = vmx_skip_emulated_instruction, + .update_emulated_instruction = vmx_update_emulated_instruction, + .set_interrupt_shadow = vmx_set_interrupt_shadow, + .get_interrupt_shadow = vmx_get_interrupt_shadow, + .patch_hypercall = vmx_patch_hypercall, + .set_irq = vmx_inject_irq, + .set_nmi = vmx_inject_nmi, + .queue_exception = vmx_queue_exception, + .cancel_injection = vmx_cancel_injection, + .interrupt_allowed = vmx_interrupt_allowed, + .nmi_allowed = vmx_nmi_allowed, + .get_nmi_mask = vmx_get_nmi_mask, + .set_nmi_mask = vmx_set_nmi_mask, + .enable_nmi_window = enable_nmi_window, + .enable_irq_window = enable_irq_window, + .update_cr8_intercept = update_cr8_intercept, + .set_virtual_apic_mode = vmx_set_virtual_apic_mode, + .set_apic_access_page_addr = vmx_set_apic_access_page_addr, + .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, + .load_eoi_exitmap = vmx_load_eoi_exitmap, + .apicv_post_state_restore = vmx_apicv_post_state_restore, + .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons, + .hwapic_irr_update = vmx_hwapic_irr_update, + .hwapic_isr_update = vmx_hwapic_isr_update, + .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, + .sync_pir_to_irr = vmx_sync_pir_to_irr, + .deliver_posted_interrupt = vmx_deliver_posted_interrupt, + .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt, + + .set_tss_addr = vmx_set_tss_addr, + .set_identity_map_addr = vmx_set_identity_map_addr, + .get_tdp_level = get_ept_level, + .get_mt_mask = vmx_get_mt_mask, + + .get_exit_info = vmx_get_exit_info, + + .cpuid_update = vmx_cpuid_update, + + .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + + .read_l1_tsc_offset = vmx_read_l1_tsc_offset, + .write_l1_tsc_offset = vmx_write_l1_tsc_offset, + + .load_mmu_pgd = vmx_load_mmu_pgd, + + .check_intercept = vmx_check_intercept, + .handle_exit_irqoff = vmx_handle_exit_irqoff, + + .request_immediate_exit = vmx_request_immediate_exit, + + .sched_in = vmx_sched_in, + + .slot_enable_log_dirty = vmx_slot_enable_log_dirty, + .slot_disable_log_dirty = vmx_slot_disable_log_dirty, + .flush_log_dirty = vmx_flush_log_dirty, + .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, + .write_log_dirty = vmx_write_pml_buffer, + + .pre_block = vmx_pre_block, + .post_block = vmx_post_block, + + .pmu_ops = &intel_pmu_ops, + + .update_pi_irte = vmx_update_pi_irte, + +#ifdef CONFIG_X86_64 + .set_hv_timer = vmx_set_hv_timer, + .cancel_hv_timer = vmx_cancel_hv_timer, +#endif + + .setup_mce = vmx_setup_mce, + + .smi_allowed = vmx_smi_allowed, + .pre_enter_smm = vmx_pre_enter_smm, + .pre_leave_smm = vmx_pre_leave_smm, + .enable_smi_window = enable_smi_window, + + .check_nested_events = NULL, + .get_nested_state = NULL, + .set_nested_state = NULL, + .get_vmcs12_pages = NULL, + .nested_enable_evmcs = NULL, + .nested_get_evmcs_version = NULL, + .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, + .apic_init_signal_blocked = vmx_apic_init_signal_blocked, +}; + static __init int hardware_setup(void) { unsigned long host_bndcfgs; struct desc_ptr dt; - int r, i; - - rdmsrl_safe(MSR_EFER, &host_efer); + int r, i, ept_lpage_level; store_idt(&dt); host_idt_base = dt.address; @@ -7696,6 +7836,10 @@ static __init int hardware_setup(void) WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost"); } + if 
(!cpu_has_vmx_mpx()) + supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | + XFEATURE_MASK_BNDCSR); + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; @@ -7724,19 +7868,16 @@ static __init int hardware_setup(void) * using the APIC_ACCESS_ADDR VMCS field. */ if (!flexpriority_enabled) - kvm_x86_ops->set_apic_access_page_addr = NULL; + vmx_x86_ops.set_apic_access_page_addr = NULL; if (!cpu_has_vmx_tpr_shadow()) - kvm_x86_ops->update_cr8_intercept = NULL; - - if (enable_ept && !cpu_has_vmx_ept_2m_page()) - kvm_disable_largepages(); + vmx_x86_ops.update_cr8_intercept = NULL; #if IS_ENABLED(CONFIG_HYPERV) if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH && enable_ept) { - kvm_x86_ops->tlb_remote_flush = hv_remote_flush_tlb; - kvm_x86_ops->tlb_remote_flush_with_range = + vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb; + vmx_x86_ops.tlb_remote_flush_with_range = hv_remote_flush_tlb_with_range; } #endif @@ -7751,7 +7892,7 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_apicv()) { enable_apicv = 0; - kvm_x86_ops->sync_pir_to_irr = NULL; + vmx_x86_ops.sync_pir_to_irr = NULL; } if (cpu_has_vmx_tsc_scaling()) { @@ -7764,8 +7905,16 @@ static __init int hardware_setup(void) if (enable_ept) vmx_enable_tdp(); + + if (!enable_ept) + ept_lpage_level = 0; + else if (cpu_has_vmx_ept_1g_page()) + ept_lpage_level = PT_PDPE_LEVEL; + else if (cpu_has_vmx_ept_2m_page()) + ept_lpage_level = PT_DIRECTORY_LEVEL; else - kvm_disable_tdp(); + ept_lpage_level = PT_PAGE_TABLE_LEVEL; + kvm_configure_mmu(enable_ept, ept_lpage_level); /* * Only enable PML when hardware supports PML feature, and both EPT @@ -7775,10 +7924,10 @@ static __init int hardware_setup(void) enable_pml = 0; if (!enable_pml) { - kvm_x86_ops->slot_enable_log_dirty = NULL; - kvm_x86_ops->slot_disable_log_dirty = NULL; - kvm_x86_ops->flush_log_dirty = NULL; - kvm_x86_ops->enable_log_dirty_pt_masked = NULL; + vmx_x86_ops.slot_enable_log_dirty = NULL; + vmx_x86_ops.slot_disable_log_dirty = NULL; + vmx_x86_ops.flush_log_dirty = NULL; + vmx_x86_ops.enable_log_dirty_pt_masked = NULL; } if (!cpu_has_vmx_preemption_timer()) @@ -7806,9 +7955,9 @@ static __init int hardware_setup(void) } if (!enable_preemption_timer) { - kvm_x86_ops->set_hv_timer = NULL; - kvm_x86_ops->cancel_hv_timer = NULL; - kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit; + vmx_x86_ops.set_hv_timer = NULL; + vmx_x86_ops.cancel_hv_timer = NULL; + vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit; } kvm_set_posted_intr_wakeup_handler(wakeup_handler); @@ -7824,185 +7973,27 @@ static __init int hardware_setup(void) nested_vmx_setup_ctls_msrs(&vmcs_config.nested, vmx_capability.ept); - r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers); + r = nested_vmx_hardware_setup(&vmx_x86_ops, + kvm_vmx_exit_handlers); if (r) return r; } + vmx_set_cpu_caps(); + r = alloc_kvm_area(); if (r) nested_vmx_hardware_unsetup(); return r; } -static __exit void hardware_unsetup(void) -{ - if (nested) - nested_vmx_hardware_unsetup(); - - free_kvm_area(); -} - -static bool vmx_check_apicv_inhibit_reasons(ulong bit) -{ - ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | - BIT(APICV_INHIBIT_REASON_HYPERV); - - return supported & BIT(bit); -} - -static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { +static struct kvm_x86_init_ops vmx_init_ops __initdata = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, - .hardware_setup = 
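The hardware_setup() changes above replace the old get_lpage_level() callback and kvm_disable_largepages() with a single ept_lpage_level value handed to kvm_configure_mmu(). A standalone sketch of that selection order, assuming the two booleans mirror the cpu_has_vmx_ept_1g_page()/cpu_has_vmx_ept_2m_page() probes and using illustrative level numbers:

#include <stdbool.h>
#include <stdio.h>

/* Illustrative page-table levels; the kernel uses PT_PAGE_TABLE_LEVEL,
 * PT_DIRECTORY_LEVEL and PT_PDPE_LEVEL for 4K, 2M and 1G mappings. */
enum { LVL_NONE = 0, LVL_4K = 1, LVL_2M = 2, LVL_1G = 3 };

static int pick_ept_lpage_level(bool enable_ept, bool has_1g, bool has_2m)
{
	if (!enable_ept)
		return LVL_NONE;   /* level is irrelevant when EPT is off */
	if (has_1g)
		return LVL_1G;
	if (has_2m)
		return LVL_2M;
	return LVL_4K;
}

int main(void)
{
	printf("ept off    -> %d\n", pick_ept_lpage_level(false, true, true));
	printf("1G capable -> %d\n", pick_ept_lpage_level(true, true, true));
	printf("2M only    -> %d\n", pick_ept_lpage_level(true, false, true));
	printf("4K only    -> %d\n", pick_ept_lpage_level(true, false, false));
	return 0;
}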
hardware_setup, - .hardware_unsetup = hardware_unsetup, .check_processor_compatibility = vmx_check_processor_compat, - .hardware_enable = hardware_enable, - .hardware_disable = hardware_disable, - .cpu_has_accelerated_tpr = report_flexpriority, - .has_emulated_msr = vmx_has_emulated_msr, - - .vm_init = vmx_vm_init, - .vm_alloc = vmx_vm_alloc, - .vm_free = vmx_vm_free, - - .vcpu_create = vmx_create_vcpu, - .vcpu_free = vmx_free_vcpu, - .vcpu_reset = vmx_vcpu_reset, - - .prepare_guest_switch = vmx_prepare_switch_to_guest, - .vcpu_load = vmx_vcpu_load, - .vcpu_put = vmx_vcpu_put, - - .update_bp_intercept = update_exception_bitmap, - .get_msr_feature = vmx_get_msr_feature, - .get_msr = vmx_get_msr, - .set_msr = vmx_set_msr, - .get_segment_base = vmx_get_segment_base, - .get_segment = vmx_get_segment, - .set_segment = vmx_set_segment, - .get_cpl = vmx_get_cpl, - .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, - .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, - .set_cr0 = vmx_set_cr0, - .set_cr3 = vmx_set_cr3, - .set_cr4 = vmx_set_cr4, - .set_efer = vmx_set_efer, - .get_idt = vmx_get_idt, - .set_idt = vmx_set_idt, - .get_gdt = vmx_get_gdt, - .set_gdt = vmx_set_gdt, - .get_dr6 = vmx_get_dr6, - .set_dr6 = vmx_set_dr6, - .set_dr7 = vmx_set_dr7, - .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, - .cache_reg = vmx_cache_reg, - .get_rflags = vmx_get_rflags, - .set_rflags = vmx_set_rflags, - - .tlb_flush = vmx_flush_tlb, - .tlb_flush_gva = vmx_flush_tlb_gva, - - .run = vmx_vcpu_run, - .handle_exit = vmx_handle_exit, - .skip_emulated_instruction = vmx_skip_emulated_instruction, - .update_emulated_instruction = vmx_update_emulated_instruction, - .set_interrupt_shadow = vmx_set_interrupt_shadow, - .get_interrupt_shadow = vmx_get_interrupt_shadow, - .patch_hypercall = vmx_patch_hypercall, - .set_irq = vmx_inject_irq, - .set_nmi = vmx_inject_nmi, - .queue_exception = vmx_queue_exception, - .cancel_injection = vmx_cancel_injection, - .interrupt_allowed = vmx_interrupt_allowed, - .nmi_allowed = vmx_nmi_allowed, - .get_nmi_mask = vmx_get_nmi_mask, - .set_nmi_mask = vmx_set_nmi_mask, - .enable_nmi_window = enable_nmi_window, - .enable_irq_window = enable_irq_window, - .update_cr8_intercept = update_cr8_intercept, - .set_virtual_apic_mode = vmx_set_virtual_apic_mode, - .set_apic_access_page_addr = vmx_set_apic_access_page_addr, - .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl, - .load_eoi_exitmap = vmx_load_eoi_exitmap, - .apicv_post_state_restore = vmx_apicv_post_state_restore, - .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons, - .hwapic_irr_update = vmx_hwapic_irr_update, - .hwapic_isr_update = vmx_hwapic_isr_update, - .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt, - .sync_pir_to_irr = vmx_sync_pir_to_irr, - .deliver_posted_interrupt = vmx_deliver_posted_interrupt, - .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt, - - .set_tss_addr = vmx_set_tss_addr, - .set_identity_map_addr = vmx_set_identity_map_addr, - .get_tdp_level = get_ept_level, - .get_mt_mask = vmx_get_mt_mask, - - .get_exit_info = vmx_get_exit_info, - - .get_lpage_level = vmx_get_lpage_level, - - .cpuid_update = vmx_cpuid_update, - - .rdtscp_supported = vmx_rdtscp_supported, - .invpcid_supported = vmx_invpcid_supported, - - .set_supported_cpuid = vmx_set_supported_cpuid, - - .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - - .read_l1_tsc_offset = vmx_read_l1_tsc_offset, - .write_l1_tsc_offset = vmx_write_l1_tsc_offset, - - .set_tdp_cr3 = 
vmx_set_cr3, - - .check_intercept = vmx_check_intercept, - .handle_exit_irqoff = vmx_handle_exit_irqoff, - .mpx_supported = vmx_mpx_supported, - .xsaves_supported = vmx_xsaves_supported, - .umip_emulated = vmx_umip_emulated, - .pt_supported = vmx_pt_supported, - .pku_supported = vmx_pku_supported, - - .request_immediate_exit = vmx_request_immediate_exit, - - .sched_in = vmx_sched_in, - - .slot_enable_log_dirty = vmx_slot_enable_log_dirty, - .slot_disable_log_dirty = vmx_slot_disable_log_dirty, - .flush_log_dirty = vmx_flush_log_dirty, - .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, - .write_log_dirty = vmx_write_pml_buffer, - - .pre_block = vmx_pre_block, - .post_block = vmx_post_block, - - .pmu_ops = &intel_pmu_ops, - - .update_pi_irte = vmx_update_pi_irte, - -#ifdef CONFIG_X86_64 - .set_hv_timer = vmx_set_hv_timer, - .cancel_hv_timer = vmx_cancel_hv_timer, -#endif - - .setup_mce = vmx_setup_mce, - - .smi_allowed = vmx_smi_allowed, - .pre_enter_smm = vmx_pre_enter_smm, - .pre_leave_smm = vmx_pre_leave_smm, - .enable_smi_window = enable_smi_window, + .hardware_setup = hardware_setup, - .check_nested_events = NULL, - .get_nested_state = NULL, - .set_nested_state = NULL, - .get_vmcs12_pages = NULL, - .nested_enable_evmcs = NULL, - .nested_get_evmcs_version = NULL, - .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, - .apic_init_signal_blocked = vmx_apic_init_signal_blocked, + .runtime_ops = &vmx_x86_ops, }; static void vmx_cleanup_l1d_flush(void) @@ -8089,7 +8080,7 @@ static int __init vmx_init(void) } #endif - r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), + r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) return r; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 0695ea177e22..aab9df55336e 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -12,7 +12,6 @@ #include "vmcs.h" extern const u32 vmx_msr_index[]; -extern u64 host_efer; #define MSR_TYPE_R 1 #define MSR_TYPE_W 2 @@ -333,9 +332,9 @@ u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu); void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask); void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); +void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -450,7 +449,7 @@ static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx) static inline u32 vmx_vmentry_ctrl(void) { u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; - if (pt_mode == PT_MODE_SYSTEM) + if (vmx_pt_mode_is_system()) vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | VM_ENTRY_LOAD_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ @@ -461,7 +460,7 @@ static inline u32 vmx_vmentry_ctrl(void) static inline u32 vmx_vmexit_ctrl(void) { u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; - if (pt_mode == PT_MODE_SYSTEM) + if (vmx_pt_mode_is_system()) vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | VM_EXIT_CLEAR_IA32_RTIT_CTL); /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ @@ -491,7 +490,6 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct 
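The hunks above split the old pointer-published vmx_x86_ops into an __initdata kvm_x86_init_ops (setup-time hooks plus a runtime_ops pointer) and a runtime table that common code keeps by value, which is why call sites move from kvm_x86_ops->foo() to kvm_x86_ops.foo(). A minimal standalone model of that pattern; the structure and function names here are invented, and the note about where the copy happens reflects my reading of the series rather than code shown in this hunk.

#include <stdio.h>
#include <string.h>

/* Standalone model of the init-ops/runtime-ops split shown above.
 * Names are illustrative; they are not the real KVM structures. */
struct model_runtime_ops {
	int (*hardware_enable)(void);
};

struct model_init_ops {
	int (*hardware_setup)(void);
	struct model_runtime_ops *runtime_ops;
};

/* The "core" keeps a by-value copy, so callers dereference a struct
 * member rather than chasing a module-provided pointer. */
static struct model_runtime_ops core_ops;

static int vendor_hw_enable(void) { puts("enable"); return 0; }
static int vendor_hw_setup(void)  { puts("setup");  return 0; }

static struct model_runtime_ops vendor_runtime = {
	.hardware_enable = vendor_hw_enable,
};

static struct model_init_ops vendor_init = {
	.hardware_setup = vendor_hw_setup,
	.runtime_ops    = &vendor_runtime,
};

int main(void)
{
	vendor_init.hardware_setup();
	/* In the real series the copy is made during arch hardware setup. */
	memcpy(&core_ops, vendor_init.runtime_ops, sizeof(core_ops));
	return core_ops.hardware_enable();
}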
vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); -void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs); void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs); static inline struct vmcs *alloc_vmcs(bool shadow) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bf8564d73fc3..b8124b562dea 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -22,6 +22,7 @@ #include "i8254.h" #include "tss.h" #include "kvm_cache_regs.h" +#include "kvm_emulate.h" #include "x86.h" #include "cpuid.h" #include "pmu.h" @@ -81,7 +82,7 @@ u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P; EXPORT_SYMBOL_GPL(kvm_mce_cap_supported); #define emul_to_vcpu(ctxt) \ - container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) + ((struct kvm_vcpu *)(ctxt)->vcpu) /* EFER defaults: * - enable syscall per default because its emulated by KVM @@ -109,7 +110,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); static void store_regs(struct kvm_vcpu *vcpu); static int sync_regs(struct kvm_vcpu *vcpu); -struct kvm_x86_ops *kvm_x86_ops __read_mostly; +struct kvm_x86_ops kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); static bool __read_mostly ignore_msrs = 0; @@ -180,7 +181,17 @@ struct kvm_shared_msrs { static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; static struct kvm_shared_msrs __percpu *shared_msrs; +#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \ + | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \ + | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \ + | XFEATURE_MASK_PKRU) + +u64 __read_mostly host_efer; +EXPORT_SYMBOL_GPL(host_efer); + static u64 __read_mostly host_xss; +u64 __read_mostly supported_xss; +EXPORT_SYMBOL_GPL(supported_xss); struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, @@ -226,10 +237,25 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { }; u64 __read_mostly host_xcr0; +u64 __read_mostly supported_xcr0; +EXPORT_SYMBOL_GPL(supported_xcr0); struct kmem_cache *x86_fpu_cache; EXPORT_SYMBOL_GPL(x86_fpu_cache); +static struct kmem_cache *x86_emulator_cache; + +static struct kmem_cache *kvm_alloc_emulator_cache(void) +{ + unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src); + unsigned int size = sizeof(struct x86_emulate_ctxt); + + return kmem_cache_create_usercopy("x86_emulator", size, + __alignof__(struct x86_emulate_ctxt), + SLAB_ACCOUNT, useroffset, + size - useroffset, NULL); +} + static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) @@ -350,6 +376,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } kvm_lapic_set_base(vcpu, msr_info->data); + kvm_recalculate_apic_map(vcpu->kvm); return 0; } EXPORT_SYMBOL_GPL(kvm_set_apic_base); @@ -619,7 +646,7 @@ EXPORT_SYMBOL_GPL(kvm_requeue_exception_e); */ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl) { - if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl) + if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl) return true; kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return false; @@ -760,7 +787,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!is_pae(vcpu)) return 1; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); if (cs_l) return 1; } else @@ -773,7 +800,7 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (!(cr0 & X86_CR0_PG) && 
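A small but pervasive change above is emul_to_vcpu(): the emulate context is no longer embedded in the vcpu and recovered via container_of(), but allocated separately (from the new x86_emulator_cache) and given an explicit vcpu back-pointer. A standalone sketch contrasting the two lookups with toy structures, not the KVM definitions:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy structures standing in for the KVM ones; not the real definitions. */
struct old_vcpu {
	int id;
	struct old_ctxt { int dummy; } ctxt;   /* embedded context */
};

/* Old style: recover the containing vcpu from the embedded member. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct new_vcpu;
struct new_ctxt { struct new_vcpu *vcpu; };    /* explicit back-pointer */
struct new_vcpu { int id; struct new_ctxt *ctxt; };

int main(void)
{
	struct old_vcpu o = { .id = 1 };
	struct old_vcpu *found = container_of(&o.ctxt, struct old_vcpu, ctxt);

	struct new_vcpu n = { .id = 2 };
	struct new_ctxt *c = malloc(sizeof(*c));   /* cf. kmem_cache_zalloc() */
	if (!c)
		return 1;
	c->vcpu = &n;
	n.ctxt = c;

	printf("container_of -> %d, back-pointer -> %d\n", found->id, c->vcpu->id);
	free(c);
	return 0;
}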
kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) return 1; - kvm_x86_ops->set_cr0(vcpu, cr0); + kvm_x86_ops.set_cr0(vcpu, cr0); if ((cr0 ^ old_cr0) & X86_CR0_PG) { kvm_clear_async_pf_completion_queue(vcpu); @@ -869,7 +896,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - if (kvm_x86_ops->get_cpl(vcpu) != 0 || + if (kvm_x86_ops.get_cpl(vcpu) != 0 || __kvm_set_xcr(vcpu, index, xcr)) { kvm_inject_gp(vcpu, 0); return 1; @@ -903,10 +930,10 @@ static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c) { u64 reserved_bits = __cr4_reserved_bits(cpu_has, c); - if (cpuid_ecx(0x7) & feature_bit(LA57)) + if (kvm_cpu_cap_has(X86_FEATURE_LA57)) reserved_bits &= ~X86_CR4_LA57; - if (kvm_x86_ops->umip_emulated()) + if (kvm_cpu_cap_has(X86_FEATURE_UMIP)) reserved_bits &= ~X86_CR4_UMIP; return reserved_bits; @@ -950,7 +977,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; } - if (kvm_x86_ops->set_cr4(vcpu, cr4)) + if (kvm_x86_ops.set_cr4(vcpu, cr4)) return 1; if (((cr4 ^ old_cr4) & pdptr_bits) || @@ -1034,7 +1061,7 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) static void kvm_update_dr6(struct kvm_vcpu *vcpu) { if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6); + kvm_x86_ops.set_dr6(vcpu, vcpu->arch.dr6); } static void kvm_update_dr7(struct kvm_vcpu *vcpu) @@ -1045,7 +1072,7 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu) dr7 = vcpu->arch.guest_debug_dr7; else dr7 = vcpu->arch.dr7; - kvm_x86_ops->set_dr7(vcpu, dr7); + kvm_x86_ops.set_dr7(vcpu, dr7); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED; if (dr7 & DR7_BP_EN_MASK) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; @@ -1115,7 +1142,7 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) *val = vcpu->arch.dr6; else - *val = kvm_x86_ops->get_dr6(vcpu); + *val = kvm_x86_ops.get_dr6(vcpu); break; case 5: /* fall through */ @@ -1350,7 +1377,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) rdmsrl_safe(msr->index, &msr->data); break; default: - if (kvm_x86_ops->get_msr_feature(msr)) + if (kvm_x86_ops.get_msr_feature(msr)) return 1; } return 0; @@ -1418,7 +1445,7 @@ static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) efer &= ~EFER_LMA; efer |= vcpu->arch.efer & EFER_LMA; - kvm_x86_ops->set_efer(vcpu, efer); + kvm_x86_ops.set_efer(vcpu, efer); /* Update reserved bits */ if ((efer ^ old_efer) & EFER_NX) @@ -1474,7 +1501,7 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, msr.index = index; msr.host_initiated = host_initiated; - return kvm_x86_ops->set_msr(vcpu, &msr); + return kvm_x86_ops.set_msr(vcpu, &msr); } /* @@ -1492,7 +1519,7 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, msr.index = index; msr.host_initiated = host_initiated; - ret = kvm_x86_ops->get_msr(vcpu, &msr); + ret = kvm_x86_ops.get_msr(vcpu, &msr); if (!ret) *data = msr.data; return ret; @@ -1561,8 +1588,12 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && ((data & APIC_MODE_MASK) == APIC_DM_FIXED)) { + data &= ~(1 << 12); + kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32)); kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32)); - return kvm_lapic_reg_write(vcpu->arch.apic, APIC_ICR, (u32)data); + kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data); + 
trace_kvm_apic_write(APIC_ICR, (u32)data); + return 0; } return 1; @@ -1571,11 +1602,12 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u32 msr = kvm_rcx_read(vcpu); - u64 data = kvm_read_edx_eax(vcpu); + u64 data; int ret = 0; switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): + data = kvm_read_edx_eax(vcpu); ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); break; default: @@ -1876,7 +1908,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) { - u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); + u64 curr_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } @@ -1918,7 +1950,7 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); + u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); return tsc_offset + kvm_scale_tsc(vcpu, host_tsc); } @@ -1926,7 +1958,7 @@ EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { - vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset); + vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset); } static inline bool kvm_check_tsc_unstable(void) @@ -2050,7 +2082,7 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc); static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { - u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu); + u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment); } @@ -2525,7 +2557,7 @@ static void kvmclock_sync_fn(struct work_struct *work) static bool can_set_mci_status(struct kvm_vcpu *vcpu) { /* McStatusWrEn enabled? */ - if (guest_cpuid_is_amd(vcpu)) + if (guest_cpuid_is_amd_or_hygon(vcpu)) return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); return false; @@ -2647,7 +2679,7 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) { ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa); + kvm_x86_ops.tlb_flush(vcpu, invalidate_gpa); } static void record_steal_time(struct kvm_vcpu *vcpu) @@ -2800,12 +2832,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) return 1; /* - * We do support PT if kvm_x86_ops->pt_supported(), but we do - * not support IA32_XSS[bit 8]. Guests will have to use - * RDMSR/WRMSR rather than XSAVES/XRSTORS to save/restore PT - * MSRs. + * KVM supports exposing PT to the guest, but does not support + * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than + * XSAVES/XRSTORS to save/restore PT MSRs. */ - if (data != 0) + if (data & ~supported_xss) return 1; vcpu->arch.ia32_xss = data; break; @@ -3079,7 +3110,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case APIC_BASE_MSR ... 
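handle_fastpath_set_x2apic_icr_irqoff() above now clears bit 12 (the ICR delivery-status/busy flag) before storing ICR, and the fast path is taken only for fixed-mode, physical-destination IPIs. A standalone sketch of just the checks visible in this hunk, with the architectural ICR bit positions spelled out as plain masks instead of the kernel's APIC_* macros; the real function may apply further conditions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Architectural x2APIC ICR fields, written as explicit masks. */
#define ICR_DELIVERY_MODE_MASK 0x700ULL  /* bits 10:8, 0 = fixed */
#define ICR_DEST_MODE_LOGICAL  0x800ULL  /* bit 11, 0 = physical */
#define ICR_BUSY               0x1000ULL /* bit 12, delivery status */

/* Returns true and sanitizes *icr when the write qualifies for the
 * fast path modelled on the hunk above; otherwise leaves it alone. */
static bool fastpath_icr_ok(uint64_t *icr)
{
	if ((*icr & ICR_DEST_MODE_LOGICAL) || (*icr & ICR_DELIVERY_MODE_MASK))
		return false;          /* logical dest or non-fixed mode: slow path */
	*icr &= ~ICR_BUSY;             /* never store the busy bit */
	return true;
}

int main(void)
{
	uint64_t fixed_phys = 0x1030;  /* vector 0x30, busy bit set */
	uint64_t nmi        = 0x400;   /* delivery mode = NMI */

	printf("fixed/physical: %d (icr now %#llx)\n",
	       fastpath_icr_ok(&fixed_phys), (unsigned long long)fixed_phys);
	printf("NMI:            %d\n", fastpath_icr_ok(&nmi));
	return 0;
}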
APIC_BASE_MSR + 0x3ff: return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data); - break; case MSR_IA32_TSCDEADLINE: msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); break; @@ -3162,7 +3192,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return kvm_hv_get_msr_common(vcpu, msr_info->index, &msr_info->data, msr_info->host_initiated); - break; case MSR_IA32_BBL_CR_CTL3: /* This legacy MSR exists but isn't fully documented in current * silicon. It is however accessed by winxp in very narrow @@ -3367,10 +3396,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * fringe case that is not enabled except via specific settings * of the module parameters. */ - r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE); + r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE); break; case KVM_CAP_VAPIC: - r = !kvm_x86_ops->cpu_has_accelerated_tpr(); + r = !kvm_x86_ops.cpu_has_accelerated_tpr(); break; case KVM_CAP_NR_VCPUS: r = KVM_SOFT_MAX_VCPUS; @@ -3397,14 +3426,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_X2APIC_API_VALID_FLAGS; break; case KVM_CAP_NESTED_STATE: - r = kvm_x86_ops->get_nested_state ? - kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0; + r = kvm_x86_ops.get_nested_state ? + kvm_x86_ops.get_nested_state(NULL, NULL, 0) : 0; break; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: - r = kvm_x86_ops->enable_direct_tlbflush != NULL; + r = kvm_x86_ops.enable_direct_tlbflush != NULL; break; case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - r = kvm_x86_ops->nested_enable_evmcs != NULL; + r = kvm_x86_ops.nested_enable_evmcs != NULL; break; default: break; @@ -3466,7 +3495,7 @@ long kvm_arch_dev_ioctl(struct file *filp, r = 0; break; } - case KVM_X86_GET_MCE_CAP_SUPPORTED: { + case KVM_X86_GET_MCE_CAP_SUPPORTED: r = -EFAULT; if (copy_to_user(argp, &kvm_mce_cap_supported, sizeof(kvm_mce_cap_supported))) @@ -3498,9 +3527,9 @@ long kvm_arch_dev_ioctl(struct file *filp, case KVM_GET_MSRS: r = msr_io(NULL, argp, do_get_msr_feature, 1); break; - } default: r = -EINVAL; + break; } out: return r; @@ -3520,14 +3549,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { /* Address WBINVD may be executed by guest */ if (need_emulate_wbinvd(vcpu)) { - if (kvm_x86_ops->has_wbinvd_exit()) + if (kvm_x86_ops.has_wbinvd_exit()) cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); else if (vcpu->cpu != -1 && vcpu->cpu != cpu) smp_call_function_single(vcpu->cpu, wbinvd_ipi, NULL, 1); } - kvm_x86_ops->vcpu_load(vcpu, cpu); + kvm_x86_ops.vcpu_load(vcpu, cpu); /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { @@ -3594,7 +3623,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) int idx; if (vcpu->preempted) - vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu); + vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu); /* * Disable page faults because we're in atomic context here. 
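Several capability checks above (KVM_CAP_NESTED_STATE, KVM_CAP_HYPERV_DIRECT_TLBFLUSH, KVM_CAP_HYPERV_ENLIGHTENED_VMCS) share one pattern: an optional callback that may be NULL, with the capability reported from its presence. A generic standalone sketch of that pattern; the names are invented and do not correspond to real KVM symbols:

#include <stdio.h>

/* Generic model of "capability = optional callback present"; the names
 * here are invented and do not correspond to real KVM symbols. */
struct model_ops {
	int (*get_nested_state)(void);    /* optional */
	int (*enable_direct_flush)(void); /* optional */
};

static int model_check_extension(const struct model_ops *ops, int cap)
{
	switch (cap) {
	case 1: /* nested state: report required buffer size, or 0 */
		return ops->get_nested_state ? ops->get_nested_state() : 0;
	case 2: /* direct TLB flush: report simple presence */
		return ops->enable_direct_flush != NULL;
	default:
		return 0;
	}
}

static int fake_nested_state(void) { return 128; /* pretend: state size */ }

int main(void)
{
	struct model_ops with    = { .get_nested_state = fake_nested_state };
	struct model_ops without = { 0 };

	printf("cap1 with/without: %d/%d\n",
	       model_check_extension(&with, 1), model_check_extension(&without, 1));
	printf("cap2 with/without: %d/%d\n",
	       model_check_extension(&with, 2), model_check_extension(&without, 2));
	return 0;
}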
@@ -3613,7 +3642,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) kvm_steal_time_set_preempted(vcpu); srcu_read_unlock(&vcpu->kvm->srcu, idx); pagefault_enable(); - kvm_x86_ops->vcpu_put(vcpu); + kvm_x86_ops.vcpu_put(vcpu); vcpu->arch.last_host_tsc = rdtsc(); /* * If userspace has set any breakpoints or watchpoints, dr6 is restored @@ -3627,7 +3656,7 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { if (vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); + kvm_x86_ops.sync_pir_to_irr(vcpu); return kvm_apic_get_state(vcpu, s); } @@ -3735,7 +3764,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; - kvm_x86_ops->setup_mce(vcpu); + kvm_x86_ops.setup_mce(vcpu); out: return r; } @@ -3839,11 +3868,11 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; events->interrupt.soft = 0; - events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); + events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending != 0; - events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); + events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu); events->nmi.pad = 0; events->sipi_vector = 0; /* never valid when reporting to user space */ @@ -3910,13 +3939,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) - kvm_x86_ops->set_interrupt_shadow(vcpu, + kvm_x86_ops.set_interrupt_shadow(vcpu, events->interrupt.shadow); vcpu->arch.nmi_injected = events->nmi.injected; if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) vcpu->arch.nmi_pending = events->nmi.pending; - kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); + kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked); if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && lapic_in_kernel(vcpu)) @@ -4103,8 +4132,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, * CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility * with old userspace. 
*/ - if (xstate_bv & ~kvm_supported_xcr0() || - mxcsr & ~mxcsr_feature_mask) + if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask) return -EINVAL; load_xsave(vcpu, (u8 *)guest_xsave->region); } else { @@ -4191,9 +4219,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_hv_activate_synic(vcpu, cap->cap == KVM_CAP_HYPERV_SYNIC2); case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - if (!kvm_x86_ops->nested_enable_evmcs) + if (!kvm_x86_ops.nested_enable_evmcs) return -ENOTTY; - r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version); + r = kvm_x86_ops.nested_enable_evmcs(vcpu, &vmcs_version); if (!r) { user_ptr = (void __user *)(uintptr_t)cap->args[0]; if (copy_to_user(user_ptr, &vmcs_version, @@ -4202,10 +4230,10 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, } return r; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: - if (!kvm_x86_ops->enable_direct_tlbflush) + if (!kvm_x86_ops.enable_direct_tlbflush) return -ENOTTY; - return kvm_x86_ops->enable_direct_tlbflush(vcpu); + return kvm_x86_ops.enable_direct_tlbflush(vcpu); default: return -EINVAL; @@ -4508,7 +4536,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u32 user_data_size; r = -EINVAL; - if (!kvm_x86_ops->get_nested_state) + if (!kvm_x86_ops.get_nested_state) break; BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); @@ -4516,7 +4544,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (get_user(user_data_size, &user_kvm_nested_state->size)) break; - r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state, + r = kvm_x86_ops.get_nested_state(vcpu, user_kvm_nested_state, user_data_size); if (r < 0) break; @@ -4538,7 +4566,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, int idx; r = -EINVAL; - if (!kvm_x86_ops->set_nested_state) + if (!kvm_x86_ops.set_nested_state) break; r = -EFAULT; @@ -4560,7 +4588,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + r = kvm_x86_ops.set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } @@ -4604,14 +4632,14 @@ static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) if (addr > (unsigned int)(-3 * PAGE_SIZE)) return -EINVAL; - ret = kvm_x86_ops->set_tss_addr(kvm, addr); + ret = kvm_x86_ops.set_tss_addr(kvm, addr); return ret; } static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) { - return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr); + return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr); } static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, @@ -4763,77 +4791,13 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm, return 0; } -/** - * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot - * @kvm: kvm instance - * @log: slot id and address to which we copy the log - * - * Steps 1-4 below provide general overview of dirty page logging. See - * kvm_get_dirty_log_protect() function description for additional details. - * - * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we - * always flush the TLB (step 4) even if previous step failed and the dirty - * bitmap may be corrupt. Regardless of previous outcome the KVM logging API - * does not preclude user space subsequent dirty log read. Flushing TLB ensures - * writes will be marked dirty for next log read. - * - * 1. Take a snapshot of the bit and clear it if needed. - * 2. Write protect the corresponding page. - * 3. 
Copy the snapshot to the userspace. - * 4. Flush TLB's if needed. - */ -int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) { - bool flush = false; - int r; - - mutex_lock(&kvm->slots_lock); - /* * Flush potentially hardware-cached dirty pages to dirty_bitmap. */ - if (kvm_x86_ops->flush_log_dirty) - kvm_x86_ops->flush_log_dirty(kvm); - - r = kvm_get_dirty_log_protect(kvm, log, &flush); - - /* - * All the TLBs can be flushed out of mmu lock, see the comments in - * kvm_mmu_slot_remove_write_access(). - */ - lockdep_assert_held(&kvm->slots_lock); - if (flush) - kvm_flush_remote_tlbs(kvm); - - mutex_unlock(&kvm->slots_lock); - return r; -} - -int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log) -{ - bool flush = false; - int r; - - mutex_lock(&kvm->slots_lock); - - /* - * Flush potentially hardware-cached dirty pages to dirty_bitmap. - */ - if (kvm_x86_ops->flush_log_dirty) - kvm_x86_ops->flush_log_dirty(kvm); - - r = kvm_clear_dirty_log_protect(kvm, log, &flush); - - /* - * All the TLBs can be flushed out of mmu lock, see the comments in - * kvm_mmu_slot_remove_write_access(). - */ - lockdep_assert_held(&kvm->slots_lock); - if (flush) - kvm_flush_remote_tlbs(kvm); - - mutex_unlock(&kvm->slots_lock); - return r; + if (kvm_x86_ops.flush_log_dirty) + kvm_x86_ops.flush_log_dirty(kvm); } int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, @@ -5186,8 +5150,8 @@ set_identity_unlock: } case KVM_MEMORY_ENCRYPT_OP: { r = -ENOTTY; - if (kvm_x86_ops->mem_enc_op) - r = kvm_x86_ops->mem_enc_op(kvm, argp); + if (kvm_x86_ops.mem_enc_op) + r = kvm_x86_ops.mem_enc_op(kvm, argp); break; } case KVM_MEMORY_ENCRYPT_REG_REGION: { @@ -5198,8 +5162,8 @@ set_identity_unlock: goto out; r = -ENOTTY; - if (kvm_x86_ops->mem_enc_reg_region) - r = kvm_x86_ops->mem_enc_reg_region(kvm, ®ion); + if (kvm_x86_ops.mem_enc_reg_region) + r = kvm_x86_ops.mem_enc_reg_region(kvm, ®ion); break; } case KVM_MEMORY_ENCRYPT_UNREG_REGION: { @@ -5210,8 +5174,8 @@ set_identity_unlock: goto out; r = -ENOTTY; - if (kvm_x86_ops->mem_enc_unreg_region) - r = kvm_x86_ops->mem_enc_unreg_region(kvm, ®ion); + if (kvm_x86_ops.mem_enc_unreg_region) + r = kvm_x86_ops.mem_enc_unreg_region(kvm, ®ion); break; } case KVM_HYPERV_EVENTFD: { @@ -5262,28 +5226,28 @@ static void kvm_init_msr_list(void) continue; break; case MSR_TSC_AUX: - if (!kvm_x86_ops->rdtscp_supported()) + if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) continue; break; case MSR_IA32_RTIT_CTL: case MSR_IA32_RTIT_STATUS: - if (!kvm_x86_ops->pt_supported()) + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) continue; break; case MSR_IA32_RTIT_CR3_MATCH: - if (!kvm_x86_ops->pt_supported() || + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering)) continue; break; case MSR_IA32_RTIT_OUTPUT_BASE: case MSR_IA32_RTIT_OUTPUT_MASK: - if (!kvm_x86_ops->pt_supported() || + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || (!intel_pt_validate_hw_cap(PT_CAP_topa_output) && !intel_pt_validate_hw_cap(PT_CAP_single_range_output))) continue; break; case MSR_IA32_RTIT_ADDR0_A ... 
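The kvm_vm_ioctl_get_dirty_log()/kvm_vm_ioctl_clear_dirty_log() bodies removed above listed the generic flow that now lives in common code, leaving kvm_arch_sync_dirty_log() as the arch hook: snapshot and clear the dirty bits, write-protect the pages, copy the snapshot to userspace, then flush TLBs if anything was protected. A standalone model of steps 1, 2 and 4 over a plain bitmap (the userspace copy is omitted):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct model_slot {
	uint32_t dirty;           /* one dirty bit per page */
	uint32_t write_protected; /* one write-protect bit per page */
};

/* Steps from the removed comment: snapshot+clear, write-protect,
 * (copy to userspace - omitted here), flush if anything changed. */
static bool get_dirty_log(struct model_slot *slot, uint32_t *snapshot)
{
	bool flush = false;

	*snapshot = slot->dirty;                    /* 1. take a snapshot ... */
	if (slot->dirty) {
		slot->dirty = 0;                    /*    ... and clear it */
		slot->write_protected |= *snapshot; /* 2. write-protect those pages */
		flush = true;                       /* 4. caller must flush TLBs */
	}
	return flush;
}

int main(void)
{
	struct model_slot slot = { .dirty = 0x0005 };
	uint32_t snap;
	bool flush = get_dirty_log(&slot, &snap);

	printf("snapshot=%#x dirty=%#x wp=%#x flush=%d\n",
	       snap, slot.dirty, slot.write_protected, flush);
	return 0;
}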
MSR_IA32_RTIT_ADDR3_B: { - if (!kvm_x86_ops->pt_supported() || + if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >= intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2) continue; @@ -5306,7 +5270,7 @@ static void kvm_init_msr_list(void) } for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { - if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i])) + if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i])) continue; emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; @@ -5369,13 +5333,13 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) static void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - kvm_x86_ops->set_segment(vcpu, var, seg); + kvm_x86_ops.set_segment(vcpu, var, seg); } void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - kvm_x86_ops->get_segment(vcpu, var, seg); + kvm_x86_ops.get_segment(vcpu, var, seg); } gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, @@ -5395,14 +5359,14 @@ gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_FETCH_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } @@ -5410,7 +5374,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; access |= PFERR_WRITE_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } @@ -5459,7 +5423,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, struct x86_exception *exception) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; unsigned offset; int ret; @@ -5484,7 +5448,7 @@ int kvm_read_guest_virt(struct kvm_vcpu *vcpu, gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { - u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? 
PFERR_USER_MASK : 0; /* * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED @@ -5505,7 +5469,7 @@ static int emulator_read_std(struct x86_emulate_ctxt *ctxt, struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = 0; - if (!system && kvm_x86_ops->get_cpl(vcpu) == 3) + if (!system && kvm_x86_ops.get_cpl(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); @@ -5558,7 +5522,7 @@ static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *v struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = PFERR_WRITE_MASK; - if (!system && kvm_x86_ops->get_cpl(vcpu) == 3) + if (!system && kvm_x86_ops.get_cpl(vcpu) == 3) access |= PFERR_USER_MASK; return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, @@ -5621,7 +5585,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, gpa_t *gpa, struct x86_exception *exception, bool write) { - u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) + u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0) | (write ? PFERR_WRITE_MASK : 0); /* @@ -5740,7 +5704,7 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, int handled, ret; bool write = ops->write; struct kvm_mmio_fragment *frag; - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; /* * If the exit was due to a NPF we may already have a GPA. @@ -5749,10 +5713,9 @@ static int emulator_read_write_onepage(unsigned long addr, void *val, * operation using rep will only have the initial GPA from the NPF * occurred. */ - if (vcpu->arch.gpa_available && - emulator_can_use_gpa(ctxt) && - (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) { - gpa = vcpu->arch.gpa_val; + if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) && + (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) { + gpa = ctxt->gpa_val; ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write); } else { ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); @@ -5972,11 +5935,9 @@ static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, return 0; } -static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, void *val, - unsigned int count) +static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, + unsigned short port, void *val, unsigned int count) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int ret; if (vcpu->arch.pio.count) @@ -5996,20 +5957,33 @@ data_avail: return 0; } -static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, - int size, unsigned short port, - const void *val, unsigned int count) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count); + +} +static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, + unsigned short port, const void *val, + unsigned int count) +{ memcpy(vcpu->arch.pio_data, val, size * count); trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data); return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false); } +static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, + const void *val, unsigned int count) +{ + return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count); +} + static unsigned long get_segment_base(struct kvm_vcpu *vcpu, 
int seg) { - return kvm_x86_ops->get_segment_base(vcpu, seg); + return kvm_x86_ops.get_segment_base(vcpu, seg); } static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) @@ -6022,7 +5996,7 @@ static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; - if (kvm_x86_ops->has_wbinvd_exit()) { + if (kvm_x86_ops.has_wbinvd_exit()) { int cpu = get_cpu(); cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); @@ -6127,27 +6101,27 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { - return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); + return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt)); } static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt); + kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt); } static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt); + kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt); } static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt); + kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt); } static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt); + kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt); } static unsigned long emulator_get_cached_segment_base( @@ -6269,13 +6243,15 @@ static int emulator_intercept(struct x86_emulate_ctxt *ctxt, struct x86_instruction_info *info, enum x86_intercept_stage stage) { - return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); + return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage, + &ctxt->exception); } static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, - u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit) + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, + bool exact_only) { - return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit); + return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only); } static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt) @@ -6305,7 +6281,7 @@ static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulon static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) { - kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); + kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked); } static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) @@ -6321,7 +6297,7 @@ static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_fla static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, const char *smstate) { - return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smstate); + return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate); } static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) @@ -6383,7 +6359,7 @@ static const struct x86_emulate_ops emulate_ops = { static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); + u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu); /* * an sti; sti; sequence only disable interrupts for the first * instruction. 
So, if the last instruction, be it emulated or @@ -6394,7 +6370,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) if (int_shadow & mask) mask = 0; if (unlikely(int_shadow || mask)) { - kvm_x86_ops->set_interrupt_shadow(vcpu, mask); + kvm_x86_ops.set_interrupt_shadow(vcpu, mask); if (!mask) kvm_make_request(KVM_REQ_EVENT, vcpu); } @@ -6402,7 +6378,7 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) static bool inject_emulated_exception(struct kvm_vcpu *vcpu) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; if (ctxt->exception.vector == PF_VECTOR) return kvm_propagate_fault(vcpu, &ctxt->exception); @@ -6414,13 +6390,31 @@ static bool inject_emulated_exception(struct kvm_vcpu *vcpu) return false; } +static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) +{ + struct x86_emulate_ctxt *ctxt; + + ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT); + if (!ctxt) { + pr_err("kvm: failed to allocate vcpu's emulator\n"); + return NULL; + } + + ctxt->vcpu = vcpu; + ctxt->ops = &emulate_ops; + vcpu->arch.emulate_ctxt = ctxt; + + return ctxt; +} + static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int cs_db, cs_l; - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + ctxt->gpa_available = false; ctxt->eflags = kvm_get_rflags(vcpu); ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; @@ -6440,7 +6434,7 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); @@ -6479,7 +6473,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) kvm_queue_exception(vcpu, UD_VECTOR); - if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) { + if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; @@ -6496,10 +6490,11 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, gpa_t gpa = cr2_or_gpa; kvm_pfn_t pfn; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu))) + if (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) return false; if (!vcpu->arch.mmu->direct_map) { @@ -6587,10 +6582,11 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt, */ vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; - if (!(emulation_type & EMULTYPE_ALLOW_RETRY)) + if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) return false; - if (WARN_ON_ONCE(is_guest_mode(vcpu))) + if (WARN_ON_ONCE(is_guest_mode(vcpu)) || + WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) return false; if (x86_page_table_writing_insn(ctxt)) @@ -6658,10 +6654,10 @@ static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) { - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + unsigned long rflags = kvm_x86_ops.get_rflags(vcpu); int r; - r = kvm_x86_ops->skip_emulated_instruction(vcpu); + r = 
kvm_x86_ops.skip_emulated_instruction(vcpu); if (unlikely(!r)) return 0; @@ -6753,7 +6749,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len) { int r; - struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; bool writeback = true; bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable; @@ -6843,8 +6839,19 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } restart: - /* Save the faulting GPA (cr2) in the address field */ - ctxt->exception.address = cr2_or_gpa; + if (emulation_type & EMULTYPE_PF) { + /* Save the faulting GPA (cr2) in the address field */ + ctxt->exception.address = cr2_or_gpa; + + /* With shadow page tables, cr2 contains a GVA or nGPA. */ + if (vcpu->arch.mmu->direct_map) { + ctxt->gpa_available = true; + ctxt->gpa_val = cr2_or_gpa; + } + } else { + /* Sanitize the address out of an abundance of paranoia. */ + ctxt->exception.address = 0; + } r = x86_emulate_insn(ctxt); @@ -6885,7 +6892,7 @@ restart: r = 1; if (writeback) { - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + unsigned long rflags = kvm_x86_ops.get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; if (!ctxt->have_exception || @@ -6893,8 +6900,8 @@ restart: kvm_rip_write(vcpu, ctxt->eip); if (r && ctxt->tf) r = kvm_vcpu_do_singlestep(vcpu); - if (kvm_x86_ops->update_emulated_instruction) - kvm_x86_ops->update_emulated_instruction(vcpu); + if (kvm_x86_ops.update_emulated_instruction) + kvm_x86_ops.update_emulated_instruction(vcpu); __kvm_set_rflags(vcpu, ctxt->eflags); } @@ -6945,8 +6952,8 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) { unsigned long val = kvm_rax_read(vcpu); - int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, - size, port, &val, 1); + int ret = emulator_pio_out(vcpu, size, port, &val, 1); + if (ret) return ret; @@ -6982,11 +6989,10 @@ static int complete_fast_pio_in(struct kvm_vcpu *vcpu) val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0; /* - * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform + * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform * the copy and tracing */ - emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size, - vcpu->arch.pio.port, &val, 1); + emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1); kvm_rax_write(vcpu, val); return kvm_skip_emulated_instruction(vcpu); @@ -7001,8 +7007,7 @@ static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, /* For size less than 4 we merge, else we zero extend */ val = (size < 4) ? 
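In x86_emulate_instruction() above, the pre-fetched GPA and the exception address are now trusted only when the caller passed EMULTYPE_PF, i.e. when emulation was triggered by a page fault; otherwise the address is zeroed defensively. A standalone sketch of that gating; the flag values are illustrative and differ from the kernel's EMULTYPE_* constants:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative flag values; the kernel's EMULTYPE_* constants differ. */
#define MODEL_EMULTYPE_PF             (1u << 0)
#define MODEL_EMULTYPE_ALLOW_RETRY_PF (1u << 1)

struct model_ctxt {
	bool     gpa_available;
	uint64_t gpa_val;
	uint64_t exception_address;
};

static void model_setup_fault_info(struct model_ctxt *ctxt, unsigned int type,
				   uint64_t cr2_or_gpa, bool direct_map)
{
	if (type & MODEL_EMULTYPE_PF) {
		ctxt->exception_address = cr2_or_gpa;
		if (direct_map) {           /* with TDP, cr2 already holds a GPA */
			ctxt->gpa_available = true;
			ctxt->gpa_val = cr2_or_gpa;
		}
	} else {
		/* Not a page fault: sanitize out of an abundance of paranoia. */
		ctxt->exception_address = 0;
	}
	/* Retrying on a page fault only makes sense for page faults. */
	if ((type & MODEL_EMULTYPE_ALLOW_RETRY_PF) && !(type & MODEL_EMULTYPE_PF))
		fprintf(stderr, "warn: ALLOW_RETRY_PF without PF\n");
}

int main(void)
{
	struct model_ctxt ctxt = { 0 };

	model_setup_fault_info(&ctxt, MODEL_EMULTYPE_PF, 0x1234, true);
	printf("pf:    addr=%#llx gpa_avail=%d\n",
	       (unsigned long long)ctxt.exception_address, ctxt.gpa_available);

	model_setup_fault_info(&ctxt, 0, 0x1234, true);
	printf("no pf: addr=%#llx\n", (unsigned long long)ctxt.exception_address);
	return 0;
}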
kvm_rax_read(vcpu) : 0;
 
-	ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
-				       &val, 1);
+	ret = emulator_pio_in(vcpu, size, port, &val, 1);
 	if (ret) {
 		kvm_rax_write(vcpu, val);
 		return ret;
@@ -7225,7 +7230,7 @@ static int kvm_is_user_mode(void)
 	int user_mode = 3;
 
 	if (__this_cpu_read(current_vcpu))
-		user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
+		user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu));
 
 	return user_mode != 0;
 }
@@ -7302,10 +7307,10 @@ static struct notifier_block pvclock_gtod_notifier = {
 
 int kvm_arch_init(void *opaque)
 {
+	struct kvm_x86_init_ops *ops = opaque;
 	int r;
-	struct kvm_x86_ops *ops = opaque;
 
-	if (kvm_x86_ops) {
+	if (kvm_x86_ops.hardware_enable) {
 		printk(KERN_ERR "kvm: already loaded the other module\n");
 		r = -EEXIST;
 		goto out;
@@ -7342,18 +7347,22 @@ int kvm_arch_init(void *opaque)
 		goto out;
 	}
 
+	x86_emulator_cache = kvm_alloc_emulator_cache();
+	if (!x86_emulator_cache) {
+		pr_err("kvm: failed to allocate cache for x86 emulator\n");
+		goto out_free_x86_fpu_cache;
+	}
+
 	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
 	if (!shared_msrs) {
 		printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
-		goto out_free_x86_fpu_cache;
+		goto out_free_x86_emulator_cache;
 	}
 
 	r = kvm_mmu_module_init();
 	if (r)
 		goto out_free_percpu;
 
-	kvm_x86_ops = ops;
-
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
 			PT_DIRTY_MASK, PT64_NX_MASK, 0,
 			PT_PRESENT_MASK, 0, sme_me_mask);
@@ -7361,8 +7370,10 @@ int kvm_arch_init(void *opaque)
 
 	perf_register_guest_info_callbacks(&kvm_guest_cbs);
 
-	if (boot_cpu_has(X86_FEATURE_XSAVE))
+	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
 		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+		supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
+	}
 
 	kvm_lapic_init();
 	if (pi_inject_timer == -1)
@@ -7378,6 +7389,8 @@ int kvm_arch_init(void *opaque)
 
 out_free_percpu:
 	free_percpu(shared_msrs);
+out_free_x86_emulator_cache:
+	kmem_cache_destroy(x86_emulator_cache);
 out_free_x86_fpu_cache:
 	kmem_cache_destroy(x86_fpu_cache);
 out:
@@ -7400,7 +7413,7 @@ void kvm_arch_exit(void)
 #ifdef CONFIG_X86_64
 	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
 #endif
-	kvm_x86_ops = NULL;
+	kvm_x86_ops.hardware_enable = NULL;
 	kvm_mmu_module_exit();
 	free_percpu(shared_msrs);
 	kmem_cache_destroy(x86_fpu_cache);
@@ -7538,7 +7551,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 		a3 &= 0xFFFFFFFF;
 	}
 
-	if (kvm_x86_ops->get_cpl(vcpu) != 0) {
+	if (kvm_x86_ops.get_cpl(vcpu) != 0) {
 		ret = -KVM_EPERM;
 		goto out;
 	}
@@ -7584,7 +7597,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
 	char instruction[3];
 	unsigned long rip = kvm_rip_read(vcpu);
 
-	kvm_x86_ops->patch_hypercall(vcpu, instruction);
+	kvm_x86_ops.patch_hypercall(vcpu, instruction);
 
 	return emulator_write_emulated(ctxt, rip, instruction, 3,
 		&ctxt->exception);
@@ -7613,7 +7626,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 {
 	int max_irr, tpr;
 
-	if (!kvm_x86_ops->update_cr8_intercept)
+	if (!kvm_x86_ops.update_cr8_intercept)
 		return;
 
 	if (!lapic_in_kernel(vcpu))
@@ -7632,17 +7645,17 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 
 	tpr = kvm_lapic_get_cr8(vcpu);
 
-	kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
+	kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr);
 }
 
-static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
+static int inject_pending_event(struct kvm_vcpu *vcpu)
 {
 	int r;
 
 	/* try to reinject previous events if any */
 	if (vcpu->arch.exception.injected)
-		kvm_x86_ops->queue_exception(vcpu);
+		kvm_x86_ops.queue_exception(vcpu);
 	/*
 	 * Do not inject an NMI or interrupt if there is a pending
 	 * exception.  Exceptions and interrupts are recognized at
@@ -7659,9 +7672,9 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 	 */
 	else if (!vcpu->arch.exception.pending) {
 		if (vcpu->arch.nmi_injected)
-			kvm_x86_ops->set_nmi(vcpu);
+			kvm_x86_ops.set_nmi(vcpu);
 		else if (vcpu->arch.interrupt.injected)
-			kvm_x86_ops->set_irq(vcpu);
+			kvm_x86_ops.set_irq(vcpu);
 	}
 
 	/*
@@ -7670,8 +7683,8 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 	 * from L2 to L1 due to pending L1 events which require exit
 	 * from L2 to L1.
	 */
-	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
-		r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+	if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) {
+		r = kvm_x86_ops.check_nested_events(vcpu);
 		if (r != 0)
 			return r;
 	}
@@ -7708,7 +7721,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 		}
 	}
 
-		kvm_x86_ops->queue_exception(vcpu);
+		kvm_x86_ops.queue_exception(vcpu);
 	}
 
 	/* Don't consider new event if we re-injected an event */
@@ -7716,14 +7729,14 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 		return 0;
 
 	if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
-	    kvm_x86_ops->smi_allowed(vcpu)) {
+	    kvm_x86_ops.smi_allowed(vcpu)) {
 		vcpu->arch.smi_pending = false;
 		++vcpu->arch.smi_count;
 		enter_smm(vcpu);
-	} else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+	} else if (vcpu->arch.nmi_pending && kvm_x86_ops.nmi_allowed(vcpu)) {
 		--vcpu->arch.nmi_pending;
 		vcpu->arch.nmi_injected = true;
-		kvm_x86_ops->set_nmi(vcpu);
+		kvm_x86_ops.set_nmi(vcpu);
 	} else if (kvm_cpu_has_injectable_intr(vcpu)) {
 		/*
 		 * Because interrupts can be injected asynchronously, we are
@@ -7732,15 +7745,15 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 		 * proposal and current concerns.  Perhaps we should be setting
 		 * KVM_REQ_EVENT only on certain events and not unconditionally?
 		 */
-		if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
-			r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+		if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events) {
+			r = kvm_x86_ops.check_nested_events(vcpu);
 			if (r != 0)
 				return r;
 		}
-		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
+		if (kvm_x86_ops.interrupt_allowed(vcpu)) {
 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
 					    false);
-			kvm_x86_ops->set_irq(vcpu);
+			kvm_x86_ops.set_irq(vcpu);
 		}
 	}
 
@@ -7756,7 +7769,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
 	 * If an NMI is already in progress, limit further NMIs to just one.
 	 * Otherwise, allow two (and we'll inject the first one immediately).
 	 */
-	if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
+	if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
 		limit = 1;
 
 	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
@@ -7846,11 +7859,11 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
 	put_smstate(u32, buf, 0x7f7c, seg.limit);
 	put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
 
-	kvm_x86_ops->get_gdt(vcpu, &dt);
+	kvm_x86_ops.get_gdt(vcpu, &dt);
 	put_smstate(u32, buf, 0x7f74, dt.address);
 	put_smstate(u32, buf, 0x7f70, dt.size);
 
-	kvm_x86_ops->get_idt(vcpu, &dt);
+	kvm_x86_ops.get_idt(vcpu, &dt);
 	put_smstate(u32, buf, 0x7f58, dt.address);
 	put_smstate(u32, buf, 0x7f54, dt.size);
@@ -7900,7 +7913,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
 	put_smstate(u32, buf, 0x7e94, seg.limit);
 	put_smstate(u64, buf, 0x7e98, seg.base);
 
-	kvm_x86_ops->get_idt(vcpu, &dt);
+	kvm_x86_ops.get_idt(vcpu, &dt);
 	put_smstate(u32, buf, 0x7e84, dt.size);
 	put_smstate(u64, buf, 0x7e88, dt.address);
@@ -7910,7 +7923,7 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
 	put_smstate(u32, buf, 0x7e74, seg.limit);
 	put_smstate(u64, buf, 0x7e78, seg.base);
 
-	kvm_x86_ops->get_gdt(vcpu, &dt);
+	kvm_x86_ops.get_gdt(vcpu, &dt);
 	put_smstate(u32, buf, 0x7e64, dt.size);
 	put_smstate(u64, buf, 0x7e68, dt.address);
@@ -7940,28 +7953,28 @@ static void enter_smm(struct kvm_vcpu *vcpu)
 	 * vCPU state (e.g. leave guest mode) after we've saved the state into
 	 * the SMM state-save area.
 	 */
-	kvm_x86_ops->pre_enter_smm(vcpu, buf);
+	kvm_x86_ops.pre_enter_smm(vcpu, buf);
 
 	vcpu->arch.hflags |= HF_SMM_MASK;
 	kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
 
-	if (kvm_x86_ops->get_nmi_mask(vcpu))
+	if (kvm_x86_ops.get_nmi_mask(vcpu))
 		vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
 	else
-		kvm_x86_ops->set_nmi_mask(vcpu, true);
+		kvm_x86_ops.set_nmi_mask(vcpu, true);
 
 	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
 	kvm_rip_write(vcpu, 0x8000);
 
 	cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
-	kvm_x86_ops->set_cr0(vcpu, cr0);
+	kvm_x86_ops.set_cr0(vcpu, cr0);
 	vcpu->arch.cr0 = cr0;
 
-	kvm_x86_ops->set_cr4(vcpu, 0);
+	kvm_x86_ops.set_cr4(vcpu, 0);
 
 	/* Undocumented: IDT limit is set to zero on entry to SMM.  */
 	dt.address = dt.size = 0;
-	kvm_x86_ops->set_idt(vcpu, &dt);
+	kvm_x86_ops.set_idt(vcpu, &dt);
 
 	__kvm_set_dr(vcpu, 7, DR7_FIXED_1);
@@ -7992,7 +8005,7 @@ static void enter_smm(struct kvm_vcpu *vcpu)
 
 #ifdef CONFIG_X86_64
 	if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
-		kvm_x86_ops->set_efer(vcpu, 0);
+		kvm_x86_ops.set_efer(vcpu, 0);
 #endif
 
 	kvm_update_cpuid(vcpu);
@@ -8030,7 +8043,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
 	kvm_apic_update_apicv(vcpu);
-	kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
+	kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
 
@@ -8043,23 +8056,30 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
 */
void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
{
-	if (!kvm_x86_ops->check_apicv_inhibit_reasons ||
-	    !kvm_x86_ops->check_apicv_inhibit_reasons(bit))
+	unsigned long old, new, expected;
+
+	if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
+	    !kvm_x86_ops.check_apicv_inhibit_reasons(bit))
 		return;
 
-	if (activate) {
-		if (!test_and_clear_bit(bit, &kvm->arch.apicv_inhibit_reasons) ||
-		    !kvm_apicv_activated(kvm))
-			return;
-	} else {
-		if (test_and_set_bit(bit, &kvm->arch.apicv_inhibit_reasons) ||
-		    kvm_apicv_activated(kvm))
-			return;
-	}
+	old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
+	do {
+		expected = new = old;
+		if (activate)
+			__clear_bit(bit, &new);
+		else
+			__set_bit(bit, &new);
+		if (new == old)
+			break;
+		old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
+	} while (old != expected);
+
+	if (!!old == !!new)
+		return;
 
 	trace_kvm_apicv_update_request(activate, bit);
-	if (kvm_x86_ops->pre_update_apicv_exec_ctrl)
-		kvm_x86_ops->pre_update_apicv_exec_ctrl(kvm, activate);
+	if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
+		kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate);
 	kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE);
}
EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
@@ -8075,7 +8095,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 		kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
 	else {
 		if (vcpu->arch.apicv_active)
-			kvm_x86_ops->sync_pir_to_irr(vcpu);
+			kvm_x86_ops.sync_pir_to_irr(vcpu);
 		if (ioapic_in_kernel(vcpu->kvm))
 			kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
 	}
@@ -8095,7 +8115,7 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
 
 	bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
 		  vcpu_to_synic(vcpu)->vec_bitmap, 256);
-	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+	kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
 }
 
 int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
@@ -8122,13 +8142,13 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 	if (!lapic_in_kernel(vcpu))
 		return;
 
-	if (!kvm_x86_ops->set_apic_access_page_addr)
+	if (!kvm_x86_ops.set_apic_access_page_addr)
 		return;
 
 	page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
 	if (is_error_page(page))
 		return;
-	kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
+	kvm_x86_ops.set_apic_access_page_addr(vcpu, page_to_phys(page));
 
 	/*
 	 * Do not pin apic access page in memory, the MMU notifier
@@ -8160,7 +8180,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (kvm_request_pending(vcpu)) {
 		if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
-			if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) {
+			if (unlikely(!kvm_x86_ops.get_vmcs12_pages(vcpu))) {
 				r = 0;
 				goto out;
 			}
@@ -8180,8 +8200,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		}
 		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
 			kvm_mmu_sync_roots(vcpu);
-		if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
-			kvm_mmu_load_cr3(vcpu);
+		if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
+			kvm_mmu_load_pgd(vcpu);
 		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
 			kvm_vcpu_flush_tlb(vcpu, true);
 		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
@@ -8266,7 +8286,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			goto out;
 		}
 
-		if (inject_pending_event(vcpu, req_int_win) != 0)
+		if (inject_pending_event(vcpu) != 0)
 			req_immediate_exit = true;
 		else {
 			/* Enable SMI/NMI/IRQ window open exits if needed.
@@ -8284,12 +8304,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			 *    SMI.
 			 */
 			if (vcpu->arch.smi_pending && !is_smm(vcpu))
-				if (!kvm_x86_ops->enable_smi_window(vcpu))
+				if (!kvm_x86_ops.enable_smi_window(vcpu))
 					req_immediate_exit = true;
 			if (vcpu->arch.nmi_pending)
-				kvm_x86_ops->enable_nmi_window(vcpu);
+				kvm_x86_ops.enable_nmi_window(vcpu);
 			if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
-				kvm_x86_ops->enable_irq_window(vcpu);
+				kvm_x86_ops.enable_irq_window(vcpu);
 			WARN_ON(vcpu->arch.exception.pending);
 		}
@@ -8306,7 +8326,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	preempt_disable();
 
-	kvm_x86_ops->prepare_guest_switch(vcpu);
+	kvm_x86_ops.prepare_guest_switch(vcpu);
 
 	/*
 	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt
@@ -8337,7 +8357,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	 * notified with kvm_vcpu_kick.
 	 */
 	if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
-		kvm_x86_ops->sync_pir_to_irr(vcpu);
+		kvm_x86_ops.sync_pir_to_irr(vcpu);
 
 	if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
 	    || need_resched() || signal_pending(current)) {
@@ -8352,7 +8372,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 
 	if (req_immediate_exit) {
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
-		kvm_x86_ops->request_immediate_exit(vcpu);
+		kvm_x86_ops.request_immediate_exit(vcpu);
 	}
 
 	trace_kvm_entry(vcpu->vcpu_id);
@@ -8372,7 +8392,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
 	}
 
-	kvm_x86_ops->run(vcpu);
+	kvm_x86_ops.run(vcpu);
 
 	/*
 	 * Do this here before restoring debug registers on the host.  And
@@ -8382,7 +8402,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	 */
 	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
 		WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
-		kvm_x86_ops->sync_dirty_debug_regs(vcpu);
+		kvm_x86_ops.sync_dirty_debug_regs(vcpu);
 		kvm_update_dr0123(vcpu);
 		kvm_update_dr6(vcpu);
 		kvm_update_dr7(vcpu);
@@ -8404,7 +8424,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
 
-	kvm_x86_ops->handle_exit_irqoff(vcpu, &exit_fastpath);
+	kvm_x86_ops.handle_exit_irqoff(vcpu, &exit_fastpath);
 
 	/*
 	 * Consume any pending interrupts, including the possible source of
@@ -8447,12 +8467,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.apic_attention)
 		kvm_lapic_sync_from_vapic(vcpu);
 
-	vcpu->arch.gpa_available = false;
-	r = kvm_x86_ops->handle_exit(vcpu, exit_fastpath);
+	r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath);
 	return r;
 
 cancel_injection:
-	kvm_x86_ops->cancel_injection(vcpu);
+	kvm_x86_ops.cancel_injection(vcpu);
 	if (unlikely(vcpu->arch.apic_attention))
 		kvm_lapic_sync_from_vapic(vcpu);
 out:
@@ -8462,13 +8481,13 @@ out:
 static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 {
 	if (!kvm_arch_vcpu_runnable(vcpu) &&
-	    (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
+	    (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) {
 		srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 		kvm_vcpu_block(vcpu);
 		vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 
-		if (kvm_x86_ops->post_block)
-			kvm_x86_ops->post_block(vcpu);
+		if (kvm_x86_ops.post_block)
+			kvm_x86_ops.post_block(vcpu);
 
 		if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
 			return 1;
@@ -8488,15 +8507,14 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 		break;
 	default:
 		return -EINTR;
-		break;
 	}
 	return 1;
 }
 
 static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
 {
-	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
-		kvm_x86_ops->check_nested_events(vcpu, false);
+	if (is_guest_mode(vcpu) && kvm_x86_ops.check_nested_events)
+		kvm_x86_ops.check_nested_events(vcpu);
 
 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 		!vcpu->arch.apf.halted);
@@ -8652,7 +8670,7 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 
 	kvm_save_current_fpu(vcpu->arch.user_fpu);
 
-	/* PKRU is separately restored in kvm_x86_ops->run.  */
+	/* PKRU is separately restored in kvm_x86_ops.run. */
 	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
 				~XFEATURE_MASK_PKRU);
@@ -8757,7 +8775,7 @@ static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 		 * that usually, but some bad designed PV devices (vmware
 		 * backdoor interface) need this to work
 		 */
-		emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
+		emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
 		vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
 	}
 	regs->rax = kvm_rax_read(vcpu);
@@ -8855,10 +8873,10 @@ static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
 	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
 
-	kvm_x86_ops->get_idt(vcpu, &dt);
+	kvm_x86_ops.get_idt(vcpu, &dt);
 	sregs->idt.limit = dt.size;
 	sregs->idt.base = dt.address;
-	kvm_x86_ops->get_gdt(vcpu, &dt);
+	kvm_x86_ops.get_gdt(vcpu, &dt);
 	sregs->gdt.limit = dt.size;
 	sregs->gdt.base = dt.address;
@@ -8943,7 +8961,7 @@ out:
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
 		    int reason, bool has_error_code, u32 error_code)
 {
-	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+	struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
 	int ret;
 
 	init_emulate_ctxt(vcpu);
@@ -9005,10 +9023,10 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 
 	dt.size = sregs->idt.limit;
 	dt.address = sregs->idt.base;
-	kvm_x86_ops->set_idt(vcpu, &dt);
+	kvm_x86_ops.set_idt(vcpu, &dt);
 	dt.size = sregs->gdt.limit;
 	dt.address = sregs->gdt.base;
-	kvm_x86_ops->set_gdt(vcpu, &dt);
+	kvm_x86_ops.set_gdt(vcpu, &dt);
 
 	vcpu->arch.cr2 = sregs->cr2;
 	mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
@@ -9018,16 +9036,16 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	kvm_set_cr8(vcpu, sregs->cr8);
 
 	mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
-	kvm_x86_ops->set_efer(vcpu, sregs->efer);
+	kvm_x86_ops.set_efer(vcpu, sregs->efer);
 
 	mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
-	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
+	kvm_x86_ops.set_cr0(vcpu, sregs->cr0);
 	vcpu->arch.cr0 = sregs->cr0;
 
 	mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
 	cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
 				(X86_CR4_OSXSAVE | X86_CR4_PKE));
-	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
+	kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
 	if (cpuid_update_needed)
 		kvm_update_cpuid(vcpu);
@@ -9133,7 +9151,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 	 */
 	kvm_set_rflags(vcpu, rflags);
 
-	kvm_x86_ops->update_bp_intercept(vcpu);
+	kvm_x86_ops.update_bp_intercept(vcpu);
 
 	r = 0;
@@ -9275,7 +9293,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 	struct page *page;
 	int r;
 
-	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
 	if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	else
@@ -9313,11 +9330,14 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 				GFP_KERNEL_ACCOUNT))
 		goto fail_free_mce_banks;
 
+	if (!alloc_emulate_ctxt(vcpu))
+		goto free_wbinvd_dirty_mask;
+
 	vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
 						GFP_KERNEL_ACCOUNT);
 	if (!vcpu->arch.user_fpu) {
 		pr_err("kvm: failed to allocate userspace's fpu\n");
-		goto free_wbinvd_dirty_mask;
+		goto free_emulate_ctxt;
 	}
 
 	vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
@@ -9342,7 +9362,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
 
 	kvm_hv_vcpu_init(vcpu);
 
-	r = kvm_x86_ops->vcpu_create(vcpu);
+	r = kvm_x86_ops.vcpu_create(vcpu);
 	if (r)
 		goto free_guest_fpu;
 
@@ -9359,6 +9379,8 @@ free_guest_fpu:
 	kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
 free_user_fpu:
 	kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+free_emulate_ctxt:
+	kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
 free_wbinvd_dirty_mask:
 	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
 fail_free_mce_banks:
@@ -9393,11 +9415,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 
 	mutex_unlock(&vcpu->mutex);
 
-	if (!kvmclock_periodic_sync)
-		return;
-
-	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
-					KVMCLOCK_SYNC_PERIOD);
+	if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
+		schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+						KVMCLOCK_SYNC_PERIOD);
 }
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
@@ -9409,8 +9429,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 	kvmclock_reset(vcpu);
 
-	kvm_x86_ops->vcpu_free(vcpu);
+	kvm_x86_ops.vcpu_free(vcpu);
 
+	kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
 	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
 	kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
 	kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
@@ -9496,7 +9517,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 	vcpu->arch.ia32_xss = 0;
 
-	kvm_x86_ops->vcpu_reset(vcpu, init_event);
+	kvm_x86_ops.vcpu_reset(vcpu, init_event);
 }
 
 void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -9521,7 +9542,7 @@ int kvm_arch_hardware_enable(void)
 	bool stable, backwards_tsc = false;
 
 	kvm_shared_msr_cpu_online();
-	ret = kvm_x86_ops->hardware_enable();
+	ret = kvm_x86_ops.hardware_enable();
 	if (ret != 0)
 		return ret;
 
@@ -9603,18 +9624,29 @@ int kvm_arch_hardware_enable(void)
 
 void kvm_arch_hardware_disable(void)
 {
-	kvm_x86_ops->hardware_disable();
+	kvm_x86_ops.hardware_disable();
 	drop_user_return_notifiers();
 }
 
-int kvm_arch_hardware_setup(void)
+int kvm_arch_hardware_setup(void *opaque)
 {
+	struct kvm_x86_init_ops *ops = opaque;
 	int r;
 
-	r = kvm_x86_ops->hardware_setup();
+	rdmsrl_safe(MSR_EFER, &host_efer);
+
+	if (boot_cpu_has(X86_FEATURE_XSAVES))
+		rdmsrl(MSR_IA32_XSS, host_xss);
+
+	r = ops->hardware_setup();
 	if (r != 0)
 		return r;
 
+	memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+
+	if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
+		supported_xss = 0;
+
 	cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
 
 	if (kvm_has_tsc_control) {
@@ -9631,28 +9663,26 @@ int kvm_arch_hardware_setup(void)
 		kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
 	}
 
-	if (boot_cpu_has(X86_FEATURE_XSAVES))
-		rdmsrl(MSR_IA32_XSS, host_xss);
-
 	kvm_init_msr_list();
 	return 0;
 }
 
 void kvm_arch_hardware_unsetup(void)
 {
-	kvm_x86_ops->hardware_unsetup();
+	kvm_x86_ops.hardware_unsetup();
 }
 
-int kvm_arch_check_processor_compat(void)
+int kvm_arch_check_processor_compat(void *opaque)
 {
 	struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+	struct kvm_x86_init_ops *ops = opaque;
 
 	WARN_ON(!irqs_disabled());
 
 	if (kvm_host_cr4_reserved_bits(c) != cr4_reserved_bits)
 		return -EIO;
 
-	return kvm_x86_ops->check_processor_compatibility();
+	return ops->check_processor_compatibility();
 }
 
 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
@@ -9678,9 +9708,16 @@ void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
 		pmu->need_cleanup = true;
 		kvm_make_request(KVM_REQ_PMU, vcpu);
 	}
-	kvm_x86_ops->sched_in(vcpu, cpu);
+	kvm_x86_ops.sched_in(vcpu, cpu);
+}
+
+void kvm_arch_free_vm(struct kvm *kvm)
+{
+	kfree(kvm->arch.hyperv.hv_pa_pg);
+	vfree(kvm);
 }
 
+
 int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 {
 	if (type)
@@ -9715,7 +9752,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	kvm_page_track_init(kvm);
 	kvm_mmu_init_vm(kvm);
 
-	return kvm_x86_ops->vm_init(kvm);
+	return kvm_x86_ops.vm_init(kvm);
 }
 
 int kvm_arch_post_init_vm(struct kvm *kvm)
@@ -9763,9 +9800,9 @@ void kvm_arch_sync_events(struct kvm *kvm)
 int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 {
 	int i, r;
-	unsigned long hva;
+	unsigned long hva, uninitialized_var(old_npages);
 	struct kvm_memslots *slots = kvm_memslots(kvm);
-	struct kvm_memory_slot *slot, old;
+	struct kvm_memory_slot *slot;
 
 	/* Called with kvm->slots_lock held.  */
 	if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
@@ -9773,7 +9810,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 
 	slot = id_to_memslot(slots, id);
 	if (size) {
-		if (slot->npages)
+		if (slot && slot->npages)
 			return -EEXIST;
 
 		/*
@@ -9785,13 +9822,18 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 		if (IS_ERR((void *)hva))
 			return PTR_ERR((void *)hva);
 	} else {
-		if (!slot->npages)
+		if (!slot || !slot->npages)
 			return 0;
 
-		hva = 0;
+		/*
+		 * Stuff a non-canonical value to catch use-after-delete.  This
+		 * ends up being 0 on 32-bit KVM, but there's no better
+		 * alternative.
+		 */
+		hva = (unsigned long)(0xdeadull << 48);
+		old_npages = slot->npages;
 	}
 
-	old = *slot;
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 		struct kvm_userspace_memory_region m;
@@ -9806,7 +9848,7 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
 	}
 
 	if (!size)
-		vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
+		vm_munmap(hva, old_npages * PAGE_SIZE);
 
 	return 0;
 }
@@ -9833,8 +9875,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 		__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
 		mutex_unlock(&kvm->slots_lock);
 	}
-	if (kvm_x86_ops->vm_destroy)
-		kvm_x86_ops->vm_destroy(kvm);
+	if (kvm_x86_ops.vm_destroy)
+		kvm_x86_ops.vm_destroy(kvm);
 	kvm_pic_destroy(kvm);
 	kvm_ioapic_destroy(kvm);
 	kvm_free_vcpus(kvm);
@@ -9845,34 +9887,36 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_hv_destroy_vm(kvm);
 }
 
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
-			   struct kvm_memory_slot *dont)
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
 {
 	int i;
 
 	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
-		if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
-			kvfree(free->arch.rmap[i]);
-			free->arch.rmap[i] = NULL;
-		}
+		kvfree(slot->arch.rmap[i]);
+		slot->arch.rmap[i] = NULL;
+
 		if (i == 0)
 			continue;
 
-		if (!dont || free->arch.lpage_info[i - 1] !=
-			     dont->arch.lpage_info[i - 1]) {
-			kvfree(free->arch.lpage_info[i - 1]);
-			free->arch.lpage_info[i - 1] = NULL;
-		}
+		kvfree(slot->arch.lpage_info[i - 1]);
+		slot->arch.lpage_info[i - 1] = NULL;
 	}
 
-	kvm_page_track_free_memslot(free, dont);
+	kvm_page_track_free_memslot(slot);
 }
 
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
-			    unsigned long npages)
+static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
+				      unsigned long npages)
 {
 	int i;
 
+	/*
+	 * Clear out the previous array pointers for the KVM_MR_MOVE case. The
+	 * old arrays will be freed by __kvm_set_memory_region() if installing
+	 * the new memslot is successful.
	 */
+	memset(&slot->arch, 0, sizeof(slot->arch));
+
 	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 		struct kvm_lpage_info *linfo;
 		unsigned long ugfn;
@@ -9903,11 +9947,9 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 		ugfn = slot->userspace_addr >> PAGE_SHIFT;
 		/*
 		 * If the gfn and userspace address are not aligned wrt each
-		 * other, or if explicitly asked to, disable large page
-		 * support for this slot
+		 * other, disable large page support for this slot.
 		 */
-		if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
-		    !kvm_largepages_enabled()) {
+		if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
 			unsigned long j;
 
 			for (j = 0; j < lpages; ++j)
@@ -9954,6 +9996,9 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				const struct kvm_userspace_memory_region *mem,
 				enum kvm_mr_change change)
 {
+	if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
+		return kvm_alloc_memslot_metadata(memslot,
+						  mem->memory_size >> PAGE_SHIFT);
 	return 0;
 }
 
@@ -9962,14 +10007,14 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 {
 	/* Still write protect RO slot */
 	if (new->flags & KVM_MEM_READONLY) {
-		kvm_mmu_slot_remove_write_access(kvm, new);
+		kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL);
 		return;
 	}
 
 	/*
 	 * Call kvm_x86_ops dirty logging hooks when they are valid.
 	 *
-	 * kvm_x86_ops->slot_disable_log_dirty is called when:
+	 * kvm_x86_ops.slot_disable_log_dirty is called when:
 	 *
 	 *  - KVM_MR_CREATE with dirty logging is disabled
 	 *  - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
@@ -9981,7 +10026,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 	 * any additional overhead from PML when guest is running with dirty
 	 * logging disabled for memory slots.
 	 *
-	 * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
+	 * kvm_x86_ops.slot_enable_log_dirty is called when switching new slot
 	 * to dirty logging mode.
 	 *
 	 * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
@@ -9997,19 +10042,32 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
 	 * See the comments in fast_page_fault().
 	 */
 	if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
-		if (kvm_x86_ops->slot_enable_log_dirty)
-			kvm_x86_ops->slot_enable_log_dirty(kvm, new);
-		else
-			kvm_mmu_slot_remove_write_access(kvm, new);
+		if (kvm_x86_ops.slot_enable_log_dirty) {
+			kvm_x86_ops.slot_enable_log_dirty(kvm, new);
+		} else {
+			int level =
+				kvm_dirty_log_manual_protect_and_init_set(kvm) ?
+				PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL;
+
+			/*
+			 * If we're with initial-all-set, we don't need
+			 * to write protect any small page because
+			 * they're reported as dirty already.  However
+			 * we still need to write-protect huge pages
+			 * so that the page split can happen lazily on
+			 * the first write to the huge page.
+			 */
+			kvm_mmu_slot_remove_write_access(kvm, new, level);
+		}
 	} else {
-		if (kvm_x86_ops->slot_disable_log_dirty)
-			kvm_x86_ops->slot_disable_log_dirty(kvm, new);
+		if (kvm_x86_ops.slot_disable_log_dirty)
+			kvm_x86_ops.slot_disable_log_dirty(kvm, new);
 	}
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				const struct kvm_userspace_memory_region *mem,
-				const struct kvm_memory_slot *old,
+				struct kvm_memory_slot *old,
 				const struct kvm_memory_slot *new,
 				enum kvm_mr_change change)
 {
@@ -10051,6 +10109,10 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	 */
 	if (change != KVM_MR_DELETE)
 		kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
+
+	/* Free the arrays associated with the old memslot. */
+	if (change == KVM_MR_MOVE)
+		kvm_arch_free_memslot(kvm, old);
 }
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -10067,8 +10129,8 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
 {
 	return (is_guest_mode(vcpu) &&
-			kvm_x86_ops->guest_apic_has_interrupt &&
-			kvm_x86_ops->guest_apic_has_interrupt(vcpu));
+			kvm_x86_ops.guest_apic_has_interrupt &&
+			kvm_x86_ops.guest_apic_has_interrupt(vcpu));
 }
 
 static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
@@ -10087,7 +10149,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 
 	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
 	    (vcpu->arch.nmi_pending &&
-	     kvm_x86_ops->nmi_allowed(vcpu)))
+	     kvm_x86_ops.nmi_allowed(vcpu)))
 		return true;
 
 	if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
@@ -10120,7 +10182,7 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
 		kvm_test_request(KVM_REQ_EVENT, vcpu))
 		return true;
 
-	if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu))
+	if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu))
 		return true;
 
 	return false;
@@ -10138,7 +10200,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
 
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-	return kvm_x86_ops->interrupt_allowed(vcpu);
+	return kvm_x86_ops.interrupt_allowed(vcpu);
 }
 
 unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
@@ -10160,7 +10222,7 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
 {
 	unsigned long rflags;
 
-	rflags = kvm_x86_ops->get_rflags(vcpu);
+	rflags = kvm_x86_ops.get_rflags(vcpu);
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		rflags &= ~X86_EFLAGS_TF;
 	return rflags;
@@ -10172,7 +10234,7 @@ static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
 	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
 		rflags |= X86_EFLAGS_TF;
-	kvm_x86_ops->set_rflags(vcpu, rflags);
+	kvm_x86_ops.set_rflags(vcpu, rflags);
 }
 
 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
@@ -10195,7 +10257,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 		return;
 
 	if (!vcpu->arch.mmu->direct_map &&
-	      work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
+	      work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
 		return;
 
 	kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
@@ -10283,7 +10345,7 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
 
 	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
 	    (vcpu->arch.apf.send_user_only &&
-	     kvm_x86_ops->get_cpl(vcpu) == 0))
+	     kvm_x86_ops.get_cpl(vcpu) == 0))
 		return false;
 
 	return true;
@@ -10303,7 +10365,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
 	 * If interrupts are off we cannot even use an artificial
 	 * halt state.
	 */
-	return kvm_x86_ops->interrupt_allowed(vcpu);
+	return kvm_x86_ops.interrupt_allowed(vcpu);
 }
 
 void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
@@ -10432,7 +10494,7 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
 
 	irqfd->producer = prod;
 
-	return kvm_x86_ops->update_pi_irte(irqfd->kvm,
+	return kvm_x86_ops.update_pi_irte(irqfd->kvm,
 					   prod->irq, irqfd->gsi, 1);
 }
 
@@ -10452,7 +10514,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 	 * when the irq is masked/disabled or the consumer side (KVM
 	 * int this case doesn't want to receive the interrupts.
	*/
-	ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+	ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
 	if (ret)
 		printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
 		       " fails: %d\n", irqfd->consumer.token, ret);
@@ -10461,7 +10523,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
 int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
 				   uint32_t guest_irq, bool set)
 {
-	return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
+	return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set);
 }
 
 bool kvm_vector_hashing_enabled(void)
@@ -10518,4 +10580,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 3624665acee4..b968acc0516f 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -5,6 +5,7 @@
 #include <linux/kvm_host.h>
 #include <asm/pvclock.h>
 #include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
 
 #define KVM_DEFAULT_PLE_GAP		128
 #define KVM_VMX_DEFAULT_PLE_WINDOW	4096
@@ -96,7 +97,7 @@ static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu)
 	if (!is_long_mode(vcpu))
 		return false;
-	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+	kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 	return cs_l;
 }
 
@@ -149,11 +150,6 @@ static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
 	return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
 }
 
-static inline u8 ctxt_virt_addr_bits(struct x86_emulate_ctxt *ctxt)
-{
-	return (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_LA57) ? 57 : 48;
-}
-
 static inline u64 get_canonical(u64 la, u8 vaddr_bits)
 {
 	return ((int64_t)la << (64 - vaddr_bits)) >> (64 - vaddr_bits);
@@ -164,12 +160,6 @@ static inline bool is_noncanonical_address(u64 la, struct kvm_vcpu *vcpu)
 	return get_canonical(la, vcpu_virt_addr_bits(vcpu)) != la;
 }
 
-static inline bool emul_is_noncanonical_address(u64 la,
-						struct x86_emulate_ctxt *ctxt)
-{
-	return get_canonical(la, ctxt_virt_addr_bits(ctxt)) != la;
-}
-
 static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
 					gva_t gva, gfn_t gfn, unsigned access)
 {
@@ -247,7 +237,7 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
 
 static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
 {
-	return is_smm(vcpu) || kvm_x86_ops->apic_init_signal_blocked(vcpu);
+	return is_smm(vcpu) || kvm_x86_ops.apic_init_signal_blocked(vcpu);
 }
 
 void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
@@ -280,13 +270,15 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
 			    int emulation_type, void *insn, int insn_len);
 enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu);
 
-#define KVM_SUPPORTED_XCR0     (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
-				| XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
-				| XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
-				| XFEATURE_MASK_PKRU)
 extern u64 host_xcr0;
+extern u64 supported_xcr0;
+extern u64 supported_xss;
 
-extern u64 kvm_supported_xcr0(void);
+static inline bool kvm_mpx_supported(void)
+{
+	return (supported_xcr0 & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR))
+		== (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
+}
 
 extern unsigned int min_timer_period_us;