author		Linus Torvalds <torvalds@linux-foundation.org>	2022-05-26 23:20:14 +0200
committer	Linus Torvalds <torvalds@linux-foundation.org>	2022-05-26 23:20:14 +0200
commit		bf9095424d027e942e1d1ee74977e17b7df8e455
tree		57659cf68b7df09005bc5ada4d315d66472cebf3 /arch/arm64/kvm/arm.c
parent		Merge tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel...
parent		KVM: x86: Fix the intel_pt PMI handling wrongly considered from guest
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
Pull kvm updates from Paolo Bonzini:
"S390:
- ultravisor communication device driver
- fix TEID on terminating storage key ops
RISC-V:
- Added Sv57x4 support for G-stage page table
- Added range based local HFENCE functions
- Added remote HFENCE functions based on VCPU requests
- Added ISA extension registers in ONE_REG interface
- Updated KVM RISC-V maintainers entry to cover selftests support
ARM:
- Add support for the ARMv8.6 WFxT extension
- Guard pages for the EL2 stacks
- Trap and emulate AArch32 ID registers to hide unsupported features
- Ability to select and save/restore the set of hypercalls exposed to
the guest
- Support for PSCI-initiated suspend in collaboration with userspace
- GICv3 register-based LPI invalidation support
- Move host PMU event merging into the vcpu data structure
- GICv3 ITS save/restore fixes
- The usual set of small-scale cleanups and fixes
x86:
- New ioctls to get/set TSC frequency for a whole VM
- Allow userspace to opt out of hypercall patching
- Only do MSR filtering for MSRs accessed by rdmsr/wrmsr
AMD SEV improvements:
- Add KVM_EXIT_SHUTDOWN metadata for SEV-ES
- V_TSC_AUX support
Nested virtualization improvements for AMD:
- Support for "nested nested" optimizations (nested vVMLOAD/VMSAVE,
nested vGIF)
- Allow AVIC to co-exist with a nested guest running
- Fixes for LBR virtualization when a nested guest is running, and
nested LBR virtualization support
- PAUSE filtering for nested hypervisors
Guest support:
- Decoupling of vcpu_is_preempted from PV spinlocks"
* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (199 commits)
KVM: x86: Fix the intel_pt PMI handling wrongly considered from guest
KVM: selftests: x86: Sync the new name of the test case to .gitignore
Documentation: kvm: reorder ARM-specific section about KVM_SYSTEM_EVENT_SUSPEND
x86, kvm: use correct GFP flags for preemption disabled
KVM: LAPIC: Drop pending LAPIC timer injection when canceling the timer
x86/kvm: Alloc dummy async #PF token outside of raw spinlock
KVM: x86: avoid calling x86 emulator without a decoded instruction
KVM: SVM: Use kzalloc for sev ioctl interfaces to prevent kernel data leak
x86/fpu: KVM: Set the base guest FPU uABI size to sizeof(struct kvm_xsave)
s390/uv_uapi: depend on CONFIG_S390
KVM: selftests: x86: Fix test failure on arch lbr capable platforms
KVM: LAPIC: Trace LAPIC timer expiration on every vmentry
KVM: s390: selftest: Test suppression indication on key prot exception
KVM: s390: Don't indicate suppression on dirtying, failing memop
selftests: drivers/s390x: Add uvdevice tests
drivers/s390/char: Add Ultravisor io device
MAINTAINERS: Update KVM RISC-V entry to cover selftests support
RISC-V: KVM: Introduce ISA extension register
RISC-V: KVM: Cleanup stale TLB entries when host CPU changes
RISC-V: KVM: Add remote HFENCE functions based on VCPU requests
...
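The ARM item "Support for PSCI-initiated suspend in collaboration with userspace" is wired up through a new VM capability, KVM_CAP_ARM_SYSTEM_SUSPEND, which shows up in the arch/arm64/kvm/arm.c diff below. As a rough userspace sketch (not part of this series; the enable_system_suspend()/handle_system_event() helpers and the vm_fd/run variables are made up for illustration), opting in and catching the resulting exit could look like this:

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Illustrative only: opt in to PSCI SYSTEM_SUSPEND exits on a VM fd. */
static int enable_system_suspend(int vm_fd)
{
	struct kvm_enable_cap cap = {
		.cap = KVM_CAP_ARM_SYSTEM_SUSPEND,
	};

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}

/* With the capability enabled, a guest PSCI SYSTEM_SUSPEND call surfaces as
 * a system event exit instead of being handled entirely in the kernel. */
static void handle_system_event(struct kvm_run *run)
{
	if (run->exit_reason == KVM_EXIT_SYSTEM_EVENT &&
	    run->system_event.type == KVM_SYSTEM_EVENT_SUSPEND) {
		/* model the suspend, then make the vCPU runnable again */
	}
}

When the suspended vCPU later has a wakeup event pending, kvm_vcpu_suspend() in the diff below reports it back to userspace as a KVM_SYSTEM_EVENT_WAKEUP exit.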
Diffstat (limited to 'arch/arm64/kvm/arm.c')
-rw-r--r--	arch/arm64/kvm/arm.c	164
1 file changed, 134 insertions(+), 30 deletions(-)
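Most of the churn in this file comes from replacing the ad-hoc vcpu->arch.power_off flag with a cached struct kvm_mp_state and teaching arm64 about KVM_MP_STATE_SUSPENDED, as seen in kvm_arch_vcpu_ioctl_get_mpstate()/kvm_arch_vcpu_ioctl_set_mpstate() below. A hedged sketch of driving that state from userspace through the existing KVM_GET_MP_STATE/KVM_SET_MP_STATE vCPU ioctls (the resume_vcpu() helper and vcpu_fd are hypothetical):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Illustrative only: kick a stopped or suspended vCPU back to RUNNABLE. */
static int resume_vcpu(int vcpu_fd)
{
	struct kvm_mp_state mp;

	if (ioctl(vcpu_fd, KVM_GET_MP_STATE, &mp) < 0)
		return -1;

	if (mp.mp_state == KVM_MP_STATE_STOPPED ||
	    mp.mp_state == KVM_MP_STATE_SUSPENDED) {
		mp.mp_state = KVM_MP_STATE_RUNNABLE;
		return ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp);
	}

	return 0;
}

This mirrors kvm_arch_vcpu_ioctl_set_mpstate() in the diff: RUNNABLE stores the state directly, STOPPED calls kvm_arm_vcpu_power_off(), and SUSPENDED calls kvm_arm_vcpu_suspend().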
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index cedc3ba2c098..400bb0fe2745 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -97,6 +97,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 		}
 		mutex_unlock(&kvm->lock);
 		break;
+	case KVM_CAP_ARM_SYSTEM_SUSPEND:
+		r = 0;
+		set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags);
+		break;
 	default:
 		r = -EINVAL;
 		break;
@@ -153,9 +157,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	kvm_vgic_early_init(kvm);
 
 	/* The maximum number of VCPUs is limited by the host's GIC model */
-	kvm->arch.max_vcpus = kvm_arm_default_max_vcpus();
+	kvm->max_vcpus = kvm_arm_default_max_vcpus();
 
 	set_default_spectre(kvm);
+	kvm_arm_init_hypercalls(kvm);
 
 	return ret;
 out_free_stage2_pgd:
@@ -210,6 +215,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_SET_GUEST_DEBUG:
 	case KVM_CAP_VCPU_ATTRIBUTES:
 	case KVM_CAP_PTP_KVM:
+	case KVM_CAP_ARM_SYSTEM_SUSPEND:
 		r = 1;
 		break;
 	case KVM_CAP_SET_GUEST_DEBUG2:
@@ -230,7 +236,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_MAX_VCPUS:
 	case KVM_CAP_MAX_VCPU_ID:
 		if (kvm)
-			r = kvm->arch.max_vcpus;
+			r = kvm->max_vcpus;
 		else
 			r = kvm_arm_default_max_vcpus();
 		break;
@@ -306,7 +312,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
 	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
 		return -EBUSY;
 
-	if (id >= kvm->arch.max_vcpus)
+	if (id >= kvm->max_vcpus)
 		return -EINVAL;
 
 	return 0;
@@ -356,11 +362,6 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_arm_vcpu_destroy(vcpu);
 }
 
-int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
-{
-	return kvm_timer_is_pending(vcpu);
-}
-
 void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
 {
 
@@ -432,20 +433,34 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	vcpu->cpu = -1;
 }
 
-static void vcpu_power_off(struct kvm_vcpu *vcpu)
+void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.power_off = true;
+	vcpu->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED;
 	kvm_make_request(KVM_REQ_SLEEP, vcpu);
 	kvm_vcpu_kick(vcpu);
 }
 
+bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_STOPPED;
+}
+
+static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.mp_state.mp_state = KVM_MP_STATE_SUSPENDED;
+	kvm_make_request(KVM_REQ_SUSPEND, vcpu);
+	kvm_vcpu_kick(vcpu);
+}
+
+static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_SUSPENDED;
+}
+
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
-	if (vcpu->arch.power_off)
-		mp_state->mp_state = KVM_MP_STATE_STOPPED;
-	else
-		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+	*mp_state = vcpu->arch.mp_state;
 
 	return 0;
 }
@@ -457,10 +472,13 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 
 	switch (mp_state->mp_state) {
 	case KVM_MP_STATE_RUNNABLE:
-		vcpu->arch.power_off = false;
+		vcpu->arch.mp_state = *mp_state;
 		break;
 	case KVM_MP_STATE_STOPPED:
-		vcpu_power_off(vcpu);
+		kvm_arm_vcpu_power_off(vcpu);
+		break;
+	case KVM_MP_STATE_SUSPENDED:
+		kvm_arm_vcpu_suspend(vcpu);
 		break;
 	default:
 		ret = -EINVAL;
@@ -480,7 +498,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
 	bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
 	return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
-		&& !v->arch.power_off && !v->arch.pause);
+		&& !kvm_arm_vcpu_stopped(v) && !v->arch.pause);
 }
 
 bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
@@ -592,15 +610,15 @@ void kvm_arm_resume_guest(struct kvm *kvm)
 	}
 }
 
-static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
+static void kvm_vcpu_sleep(struct kvm_vcpu *vcpu)
 {
 	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
 
 	rcuwait_wait_event(wait,
-			   (!vcpu->arch.power_off) &&(!vcpu->arch.pause),
+			   (!kvm_arm_vcpu_stopped(vcpu)) && (!vcpu->arch.pause),
 			   TASK_INTERRUPTIBLE);
 
-	if (vcpu->arch.power_off || vcpu->arch.pause) {
+	if (kvm_arm_vcpu_stopped(vcpu) || vcpu->arch.pause) {
 		/* Awaken to handle a signal, request we sleep again later. */
 		kvm_make_request(KVM_REQ_SLEEP, vcpu);
 	}
@@ -639,6 +657,7 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
 	preempt_enable();
 
 	kvm_vcpu_halt(vcpu);
+	vcpu->arch.flags &= ~KVM_ARM64_WFIT;
 	kvm_clear_request(KVM_REQ_UNHALT, vcpu);
 
 	preempt_disable();
@@ -646,11 +665,53 @@ void kvm_vcpu_wfi(struct kvm_vcpu *vcpu)
 	preempt_enable();
 }
 
-static void check_vcpu_requests(struct kvm_vcpu *vcpu)
+static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu)
+{
+	if (!kvm_arm_vcpu_suspended(vcpu))
+		return 1;
+
+	kvm_vcpu_wfi(vcpu);
+
+	/*
+	 * The suspend state is sticky; we do not leave it until userspace
+	 * explicitly marks the vCPU as runnable. Request that we suspend again
+	 * later.
+	 */
+	kvm_make_request(KVM_REQ_SUSPEND, vcpu);
+
+	/*
+	 * Check to make sure the vCPU is actually runnable. If so, exit to
+	 * userspace informing it of the wakeup condition.
+	 */
+	if (kvm_arch_vcpu_runnable(vcpu)) {
+		memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
+		vcpu->run->system_event.type = KVM_SYSTEM_EVENT_WAKEUP;
+		vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+		return 0;
+	}
+
+	/*
+	 * Otherwise, we were unblocked to process a different event, such as a
+	 * pending signal. Return 1 and allow kvm_arch_vcpu_ioctl_run() to
+	 * process the event.
+	 */
+	return 1;
+}
+
+/**
+ * check_vcpu_requests - check and handle pending vCPU requests
+ * @vcpu:	the VCPU pointer
+ *
+ * Return: 1 if we should enter the guest
+ *	   0 if we should exit to userspace
+ *	   < 0 if we should exit to userspace, where the return value indicates
+ *	   an error
+ */
+static int check_vcpu_requests(struct kvm_vcpu *vcpu)
 {
 	if (kvm_request_pending(vcpu)) {
 		if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
-			vcpu_req_sleep(vcpu);
+			kvm_vcpu_sleep(vcpu);
 
 		if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
 			kvm_reset_vcpu(vcpu);
@@ -675,7 +736,12 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
 		if (kvm_check_request(KVM_REQ_RELOAD_PMU, vcpu))
 			kvm_pmu_handle_pmcr(vcpu,
 					    __vcpu_sys_reg(vcpu, PMCR_EL0));
+
+		if (kvm_check_request(KVM_REQ_SUSPEND, vcpu))
+			return kvm_vcpu_suspend(vcpu);
 	}
+
+	return 1;
 }
 
 static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu)
@@ -792,7 +858,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 		if (!ret)
 			ret = 1;
 
-		check_vcpu_requests(vcpu);
+		if (ret > 0)
+			ret = check_vcpu_requests(vcpu);
 
 		/*
 		 * Preparing the interrupts to be injected also
@@ -816,6 +883,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 
 		kvm_vgic_flush_hwstate(vcpu);
 
+		kvm_pmu_update_vcpu_events(vcpu);
+
 		/*
 		 * Ensure we set mode to IN_GUEST_MODE after we disable
 		 * interrupts and before the final VCPU requests check.
@@ -1125,9 +1194,9 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
 	 * Handle the "start in power-off" case.
 	 */
 	if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
-		vcpu_power_off(vcpu);
+		kvm_arm_vcpu_power_off(vcpu);
 	else
-		vcpu->arch.power_off = false;
+		vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE;
 
 	return 0;
 }
@@ -1485,7 +1554,6 @@ static void cpu_prepare_hyp_mode(int cpu)
 	tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET;
 	params->tcr_el2 = tcr;
 
-	params->stack_hyp_va = kern_hyp_va(per_cpu(kvm_arm_hyp_stack_page, cpu) + PAGE_SIZE);
 	params->pgd_pa = kvm_mmu_get_httbr();
 	if (is_protected_kvm_enabled())
 		params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS;
@@ -1763,8 +1831,6 @@ static int init_subsystems(void)
 
 	kvm_register_perf_callbacks(NULL);
 
-	kvm_sys_reg_table_init();
-
 out:
 	if (err || !is_protected_kvm_enabled())
 		on_each_cpu(_kvm_arch_hardware_disable, NULL, 1);
@@ -1935,14 +2001,46 @@ static int init_hyp_mode(void)
 	 * Map the Hyp stack pages
 	 */
	for_each_possible_cpu(cpu) {
+		struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu);
 		char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
-		err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
-					  PAGE_HYP);
+		unsigned long hyp_addr;
+
+		/*
+		 * Allocate a contiguous HYP private VA range for the stack
+		 * and guard page. The allocation is also aligned based on
+		 * the order of its size.
+		 */
+		err = hyp_alloc_private_va_range(PAGE_SIZE * 2, &hyp_addr);
+		if (err) {
+			kvm_err("Cannot allocate hyp stack guard page\n");
+			goto out_err;
+		}
+		/*
+		 * Since the stack grows downwards, map the stack to the page
+		 * at the higher address and leave the lower guard page
+		 * unbacked.
+		 *
+		 * Any valid stack address now has the PAGE_SHIFT bit as 1
+		 * and addresses corresponding to the guard page have the
+		 * PAGE_SHIFT bit as 0 - this is used for overflow detection.
+		 */
+		err = __create_hyp_mappings(hyp_addr + PAGE_SIZE, PAGE_SIZE,
+					    __pa(stack_page), PAGE_HYP);
 
 		if (err) {
 			kvm_err("Cannot map hyp stack\n");
 			goto out_err;
 		}
+
+		/*
+		 * Save the stack PA in nvhe_init_params. This will be needed
+		 * to recreate the stack mapping in protected nVHE mode.
+		 * __hyp_pa() won't do the right thing there, since the stack
+		 * has been mapped in the flexible private VA space.
+		 */
+		params->stack_pa = __pa(stack_page);
+
+		params->stack_hyp_va = hyp_addr + (2 * PAGE_SIZE);
 	}
 
 	for_each_possible_cpu(cpu) {
@@ -2091,6 +2189,12 @@ int kvm_arch_init(void *opaque)
 		return -ENODEV;
 	}
 
+	err = kvm_sys_reg_table_init();
+	if (err) {
+		kvm_info("Error initializing system register tables");
+		return err;
+	}
+
 	in_hyp_mode = is_kernel_in_hyp_mode();
 
 	if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) ||
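The init_hyp_mode() hunk above places each per-CPU hyp stack one page above an unbacked guard page in a private VA range aligned to twice the page size, so a stack overflow can be detected from the stack pointer alone. A kernel-style sketch of the invariant the in-diff comment describes (the helper name is illustrative; the hypervisor's real check lives in the EL2 entry code, not here):

/*
 * hyp_addr is aligned to 2 * PAGE_SIZE, so:
 *   [hyp_addr, hyp_addr + PAGE_SIZE)               guard page, PAGE_SHIFT bit == 0
 *   [hyp_addr + PAGE_SIZE, hyp_addr + 2*PAGE_SIZE) stack page, PAGE_SHIFT bit == 1
 * A stack pointer that has run into the guard page therefore has the
 * PAGE_SHIFT bit clear.
 */
static inline bool hyp_sp_overflowed(unsigned long sp)
{
	return !(sp & BIT(PAGE_SHIFT));
}

Note that params->stack_hyp_va is set to hyp_addr + 2 * PAGE_SIZE, the end of the stack page, since the stack grows downwards.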