diff options
Diffstat (limited to 'arch/arm64')
-rw-r--r-- | arch/arm64/include/asm/kvm_asm.h | 1 | ||||
-rw-r--r-- | arch/arm64/include/asm/kvm_host.h | 7 | ||||
-rw-r--r-- | arch/arm64/include/asm/kvm_mmu.h | 3 | ||||
-rw-r--r-- | arch/arm64/include/asm/kvm_nested.h | 4 | ||||
-rw-r--r-- | arch/arm64/kernel/asm-offsets.c | 1 | ||||
-rw-r--r-- | arch/arm64/kernel/signal.c | 92 | ||||
-rw-r--r-- | arch/arm64/kvm/arm.c | 5 | ||||
-rw-r--r-- | arch/arm64/kvm/hyp/nvhe/hyp-init.S | 52 | ||||
-rw-r--r-- | arch/arm64/kvm/hypercalls.c | 12 | ||||
-rw-r--r-- | arch/arm64/kvm/mmu.c | 15 | ||||
-rw-r--r-- | arch/arm64/kvm/nested.c | 53 | ||||
-rw-r--r-- | arch/arm64/kvm/sys_regs.c | 77 | ||||
-rw-r--r-- | arch/arm64/kvm/vgic/vgic-init.c | 41 | ||||
-rw-r--r-- | arch/arm64/kvm/vgic/vgic-kvm-device.c | 7 | ||||
-rw-r--r-- | arch/arm64/net/bpf_jit_comp.c | 12 |
15 files changed, 307 insertions, 75 deletions
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index b36a3b6cc011..67afac659231 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -178,6 +178,7 @@ struct kvm_nvhe_init_params { unsigned long hcr_el2; unsigned long vttbr; unsigned long vtcr; + unsigned long tmp; }; /* diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 94cff508874b..bf64fed9820e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -51,6 +51,7 @@ #define KVM_REQ_RELOAD_PMU KVM_ARCH_REQ(5) #define KVM_REQ_SUSPEND KVM_ARCH_REQ(6) #define KVM_REQ_RESYNC_PMU_EL0 KVM_ARCH_REQ(7) +#define KVM_REQ_NESTED_S2_UNMAP KVM_ARCH_REQ(8) #define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \ KVM_DIRTY_LOG_INITIALLY_SET) @@ -212,6 +213,12 @@ struct kvm_s2_mmu { bool nested_stage2_enabled; /* + * true when this MMU needs to be unmapped before being used for a new + * purpose. + */ + bool pending_unmap; + + /* * 0: Nobody is currently using this, check vttbr for validity * >0: Somebody is actively using this. */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index cd4087fbda9a..66d93e320ec8 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -166,7 +166,8 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, int create_hyp_stack(phys_addr_t phys_addr, unsigned long *haddr); void __init free_hyp_pgds(void); -void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size); +void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, + u64 size, bool may_block); void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end); void kvm_stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end); diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h index e8bc6d67aba2..233e65522716 100644 --- a/arch/arm64/include/asm/kvm_nested.h +++ b/arch/arm64/include/asm/kvm_nested.h @@ -78,6 +78,8 @@ extern void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid, extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu); extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu); +extern void check_nested_vcpu_requests(struct kvm_vcpu *vcpu); + struct kvm_s2_trans { phys_addr_t output; unsigned long block_size; @@ -124,7 +126,7 @@ extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans); extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2); extern void kvm_nested_s2_wp(struct kvm *kvm); -extern void kvm_nested_s2_unmap(struct kvm *kvm); +extern void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block); extern void kvm_nested_s2_flush(struct kvm *kvm); unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val); diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 27de1dddb0ab..b21dd24b8efc 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -146,6 +146,7 @@ int main(void) DEFINE(NVHE_INIT_HCR_EL2, offsetof(struct kvm_nvhe_init_params, hcr_el2)); DEFINE(NVHE_INIT_VTTBR, offsetof(struct kvm_nvhe_init_params, vttbr)); DEFINE(NVHE_INIT_VTCR, offsetof(struct kvm_nvhe_init_params, vtcr)); + DEFINE(NVHE_INIT_TMP, offsetof(struct kvm_nvhe_init_params, tmp)); #endif #ifdef CONFIG_CPU_PM DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp)); diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 561986947530..c7d311d8b92a 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -19,6 +19,7 @@ #include <linux/ratelimit.h> #include <linux/rseq.h> #include <linux/syscalls.h> +#include <linux/pkeys.h> #include <asm/daifflags.h> #include <asm/debug-monitors.h> @@ -66,10 +67,63 @@ struct rt_sigframe_user_layout { unsigned long end_offset; }; +/* + * Holds any EL0-controlled state that influences unprivileged memory accesses. + * This includes both accesses done in userspace and uaccess done in the kernel. + * + * This state needs to be carefully managed to ensure that it doesn't cause + * uaccess to fail when setting up the signal frame, and the signal handler + * itself also expects a well-defined state when entered. + */ +struct user_access_state { + u64 por_el0; +}; + #define BASE_SIGFRAME_SIZE round_up(sizeof(struct rt_sigframe), 16) #define TERMINATOR_SIZE round_up(sizeof(struct _aarch64_ctx), 16) #define EXTRA_CONTEXT_SIZE round_up(sizeof(struct extra_context), 16) +/* + * Save the user access state into ua_state and reset it to disable any + * restrictions. + */ +static void save_reset_user_access_state(struct user_access_state *ua_state) +{ + if (system_supports_poe()) { + u64 por_enable_all = 0; + + for (int pkey = 0; pkey < arch_max_pkey(); pkey++) + por_enable_all |= POE_RXW << (pkey * POR_BITS_PER_PKEY); + + ua_state->por_el0 = read_sysreg_s(SYS_POR_EL0); + write_sysreg_s(por_enable_all, SYS_POR_EL0); + /* Ensure that any subsequent uaccess observes the updated value */ + isb(); + } +} + +/* + * Set the user access state for invoking the signal handler. + * + * No uaccess should be done after that function is called. + */ +static void set_handler_user_access_state(void) +{ + if (system_supports_poe()) + write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); +} + +/* + * Restore the user access state to the values saved in ua_state. + * + * No uaccess should be done after that function is called. + */ +static void restore_user_access_state(const struct user_access_state *ua_state) +{ + if (system_supports_poe()) + write_sysreg_s(ua_state->por_el0, SYS_POR_EL0); +} + static void init_user_layout(struct rt_sigframe_user_layout *user) { const size_t reserved_size = @@ -261,18 +315,20 @@ static int restore_fpmr_context(struct user_ctxs *user) return err; } -static int preserve_poe_context(struct poe_context __user *ctx) +static int preserve_poe_context(struct poe_context __user *ctx, + const struct user_access_state *ua_state) { int err = 0; __put_user_error(POE_MAGIC, &ctx->head.magic, err); __put_user_error(sizeof(*ctx), &ctx->head.size, err); - __put_user_error(read_sysreg_s(SYS_POR_EL0), &ctx->por_el0, err); + __put_user_error(ua_state->por_el0, &ctx->por_el0, err); return err; } -static int restore_poe_context(struct user_ctxs *user) +static int restore_poe_context(struct user_ctxs *user, + struct user_access_state *ua_state) { u64 por_el0; int err = 0; @@ -282,7 +338,7 @@ static int restore_poe_context(struct user_ctxs *user) __get_user_error(por_el0, &(user->poe->por_el0), err); if (!err) - write_sysreg_s(por_el0, SYS_POR_EL0); + ua_state->por_el0 = por_el0; return err; } @@ -850,7 +906,8 @@ invalid: } static int restore_sigframe(struct pt_regs *regs, - struct rt_sigframe __user *sf) + struct rt_sigframe __user *sf, + struct user_access_state *ua_state) { sigset_t set; int i, err; @@ -899,7 +956,7 @@ static int restore_sigframe(struct pt_regs *regs, err = restore_zt_context(&user); if (err == 0 && system_supports_poe() && user.poe) - err = restore_poe_context(&user); + err = restore_poe_context(&user, ua_state); return err; } @@ -908,6 +965,7 @@ SYSCALL_DEFINE0(rt_sigreturn) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; + struct user_access_state ua_state; /* Always make any pending restarted system calls return -EINTR */ current->restart_block.fn = do_no_restart_syscall; @@ -924,12 +982,14 @@ SYSCALL_DEFINE0(rt_sigreturn) if (!access_ok(frame, sizeof (*frame))) goto badframe; - if (restore_sigframe(regs, frame)) + if (restore_sigframe(regs, frame, &ua_state)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) goto badframe; + restore_user_access_state(&ua_state); + return regs->regs[0]; badframe: @@ -1035,7 +1095,8 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, } static int setup_sigframe(struct rt_sigframe_user_layout *user, - struct pt_regs *regs, sigset_t *set) + struct pt_regs *regs, sigset_t *set, + const struct user_access_state *ua_state) { int i, err = 0; struct rt_sigframe __user *sf = user->sigframe; @@ -1097,10 +1158,9 @@ static int setup_sigframe(struct rt_sigframe_user_layout *user, struct poe_context __user *poe_ctx = apply_user_offset(user, user->poe_offset); - err |= preserve_poe_context(poe_ctx); + err |= preserve_poe_context(poe_ctx, ua_state); } - /* ZA state if present */ if (system_supports_sme() && err == 0 && user->za_offset) { struct za_context __user *za_ctx = @@ -1237,9 +1297,6 @@ static void setup_return(struct pt_regs *regs, struct k_sigaction *ka, sme_smstop(); } - if (system_supports_poe()) - write_sysreg_s(POR_EL0_INIT, SYS_POR_EL0); - if (ka->sa.sa_flags & SA_RESTORER) sigtramp = ka->sa.sa_restorer; else @@ -1253,6 +1310,7 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, { struct rt_sigframe_user_layout user; struct rt_sigframe __user *frame; + struct user_access_state ua_state; int err = 0; fpsimd_signal_preserve_current_state(); @@ -1260,13 +1318,14 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, if (get_sigframe(&user, ksig, regs)) return 1; + save_reset_user_access_state(&ua_state); frame = user.sigframe; __put_user_error(0, &frame->uc.uc_flags, err); __put_user_error(NULL, &frame->uc.uc_link, err); err |= __save_altstack(&frame->uc.uc_stack, regs->sp); - err |= setup_sigframe(&user, regs, set); + err |= setup_sigframe(&user, regs, set, &ua_state); if (err == 0) { setup_return(regs, &ksig->ka, &user, usig); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { @@ -1276,6 +1335,11 @@ static int setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, } } + if (err == 0) + set_handler_user_access_state(); + else + restore_user_access_state(&ua_state); + return err; } diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a0d01c46e408..48cafb65d6ac 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -997,6 +997,9 @@ static int kvm_vcpu_suspend(struct kvm_vcpu *vcpu) static int check_vcpu_requests(struct kvm_vcpu *vcpu) { if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) + return -EIO; + if (kvm_check_request(KVM_REQ_SLEEP, vcpu)) kvm_vcpu_sleep(vcpu); @@ -1031,6 +1034,8 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu) if (kvm_dirty_ring_check_request(vcpu)) return 0; + + check_nested_vcpu_requests(vcpu); } return 1; diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index 401af1835be6..fc1866226067 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -24,28 +24,25 @@ .align 11 SYM_CODE_START(__kvm_hyp_init) - ventry __invalid // Synchronous EL2t - ventry __invalid // IRQ EL2t - ventry __invalid // FIQ EL2t - ventry __invalid // Error EL2t + ventry . // Synchronous EL2t + ventry . // IRQ EL2t + ventry . // FIQ EL2t + ventry . // Error EL2t - ventry __invalid // Synchronous EL2h - ventry __invalid // IRQ EL2h - ventry __invalid // FIQ EL2h - ventry __invalid // Error EL2h + ventry . // Synchronous EL2h + ventry . // IRQ EL2h + ventry . // FIQ EL2h + ventry . // Error EL2h ventry __do_hyp_init // Synchronous 64-bit EL1 - ventry __invalid // IRQ 64-bit EL1 - ventry __invalid // FIQ 64-bit EL1 - ventry __invalid // Error 64-bit EL1 + ventry . // IRQ 64-bit EL1 + ventry . // FIQ 64-bit EL1 + ventry . // Error 64-bit EL1 - ventry __invalid // Synchronous 32-bit EL1 - ventry __invalid // IRQ 32-bit EL1 - ventry __invalid // FIQ 32-bit EL1 - ventry __invalid // Error 32-bit EL1 - -__invalid: - b . + ventry . // Synchronous 32-bit EL1 + ventry . // IRQ 32-bit EL1 + ventry . // FIQ 32-bit EL1 + ventry . // Error 32-bit EL1 /* * Only uses x0..x3 so as to not clobber callee-saved SMCCC registers. @@ -76,6 +73,13 @@ __do_hyp_init: eret SYM_CODE_END(__kvm_hyp_init) +SYM_CODE_START_LOCAL(__kvm_init_el2_state) + /* Initialize EL2 CPU state to sane values. */ + init_el2_state // Clobbers x0..x2 + finalise_el2_state + ret +SYM_CODE_END(__kvm_init_el2_state) + /* * Initialize the hypervisor in EL2. * @@ -102,9 +106,12 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init) // TPIDR_EL2 is used to preserve x0 across the macro maze... isb msr tpidr_el2, x0 - init_el2_state - finalise_el2_state + str lr, [x0, #NVHE_INIT_TMP] + + bl __kvm_init_el2_state + mrs x0, tpidr_el2 + ldr lr, [x0, #NVHE_INIT_TMP] 1: ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] @@ -199,9 +206,8 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) 2: msr SPsel, #1 // We want to use SP_EL{1,2} - /* Initialize EL2 CPU state to sane values. */ - init_el2_state // Clobbers x0..x2 - finalise_el2_state + bl __kvm_init_el2_state + __init_el2_nvhe_prepare_eret /* Enable MMU, set vectors and stack. */ diff --git a/arch/arm64/kvm/hypercalls.c b/arch/arm64/kvm/hypercalls.c index 5763d979d8ca..ee6573befb81 100644 --- a/arch/arm64/kvm/hypercalls.c +++ b/arch/arm64/kvm/hypercalls.c @@ -317,7 +317,7 @@ int kvm_smccc_call_handler(struct kvm_vcpu *vcpu) * to the guest, and hide SSBS so that the * guest stays protected. */ - if (cpus_have_final_cap(ARM64_SSBS)) + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP)) break; fallthrough; case SPECTRE_UNAFFECTED: @@ -428,7 +428,7 @@ int kvm_arm_copy_fw_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) * Convert the workaround level into an easy-to-compare number, where higher * values mean better protection. */ -static int get_kernel_wa_level(u64 regid) +static int get_kernel_wa_level(struct kvm_vcpu *vcpu, u64 regid) { switch (regid) { case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: @@ -449,7 +449,7 @@ static int get_kernel_wa_level(u64 regid) * don't have any FW mitigation if SSBS is there at * all times. */ - if (cpus_have_final_cap(ARM64_SSBS)) + if (kvm_has_feat(vcpu->kvm, ID_AA64PFR1_EL1, SSBS, IMP)) return KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL; fallthrough; case SPECTRE_UNAFFECTED: @@ -486,7 +486,7 @@ int kvm_arm_get_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1: case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2: case KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_3: - val = get_kernel_wa_level(reg->id) & KVM_REG_FEATURE_LEVEL_MASK; + val = get_kernel_wa_level(vcpu, reg->id) & KVM_REG_FEATURE_LEVEL_MASK; break; case KVM_REG_ARM_STD_BMAP: val = READ_ONCE(smccc_feat->std_bmap); @@ -588,7 +588,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) if (val & ~KVM_REG_FEATURE_LEVEL_MASK) return -EINVAL; - if (get_kernel_wa_level(reg->id) < val) + if (get_kernel_wa_level(vcpu, reg->id) < val) return -EINVAL; return 0; @@ -624,7 +624,7 @@ int kvm_arm_set_fw_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) * We can deal with NOT_AVAIL on NOT_REQUIRED, but not the * other way around. */ - if (get_kernel_wa_level(reg->id) < wa_level) + if (get_kernel_wa_level(vcpu, reg->id) < wa_level) return -EINVAL; return 0; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a509b63bd4dd..0f7658aefa1a 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -328,9 +328,10 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 may_block)); } -void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size) +void kvm_stage2_unmap_range(struct kvm_s2_mmu *mmu, phys_addr_t start, + u64 size, bool may_block) { - __unmap_stage2_range(mmu, start, size, true); + __unmap_stage2_range(mmu, start, size, may_block); } void kvm_stage2_flush_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) @@ -1015,7 +1016,7 @@ static void stage2_unmap_memslot(struct kvm *kvm, if (!(vma->vm_flags & VM_PFNMAP)) { gpa_t gpa = addr + (vm_start - memslot->userspace_addr); - kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start); + kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, vm_end - vm_start, true); } hva = vm_end; } while (hva < reg_end); @@ -1042,7 +1043,7 @@ void stage2_unmap_vm(struct kvm *kvm) kvm_for_each_memslot(memslot, bkt, slots) stage2_unmap_memslot(kvm, memslot); - kvm_nested_s2_unmap(kvm); + kvm_nested_s2_unmap(kvm, true); write_unlock(&kvm->mmu_lock); mmap_read_unlock(current->mm); @@ -1912,7 +1913,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) (range->end - range->start) << PAGE_SHIFT, range->may_block); - kvm_nested_s2_unmap(kvm); + kvm_nested_s2_unmap(kvm, range->may_block); return false; } @@ -2179,8 +2180,8 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, phys_addr_t size = slot->npages << PAGE_SHIFT; write_lock(&kvm->mmu_lock); - kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size); - kvm_nested_s2_unmap(kvm); + kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true); + kvm_nested_s2_unmap(kvm, true); write_unlock(&kvm->mmu_lock); } diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index f9e30dd34c7a..c4b17d90fc49 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -632,9 +632,9 @@ static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu) /* Set the scene for the next search */ kvm->arch.nested_mmus_next = (i + 1) % kvm->arch.nested_mmus_size; - /* Clear the old state */ + /* Make sure we don't forget to do the laundry */ if (kvm_s2_mmu_valid(s2_mmu)) - kvm_stage2_unmap_range(s2_mmu, 0, kvm_phys_size(s2_mmu)); + s2_mmu->pending_unmap = true; /* * The virtual VMID (modulo CnP) will be used as a key when matching @@ -650,6 +650,16 @@ static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu) out: atomic_inc(&s2_mmu->refcnt); + + /* + * Set the vCPU request to perform an unmap, even if the pending unmap + * originates from another vCPU. This guarantees that the MMU has been + * completely unmapped before any vCPU actually uses it, and allows + * multiple vCPUs to lend a hand with completing the unmap. + */ + if (s2_mmu->pending_unmap) + kvm_make_request(KVM_REQ_NESTED_S2_UNMAP, vcpu); + return s2_mmu; } @@ -663,6 +673,13 @@ void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu) void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu) { + /* + * The vCPU kept its reference on the MMU after the last put, keep + * rolling with it. + */ + if (vcpu->arch.hw_mmu) + return; + if (is_hyp_ctxt(vcpu)) { vcpu->arch.hw_mmu = &vcpu->kvm->arch.mmu; } else { @@ -674,10 +691,18 @@ void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu) void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu) { - if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu)) { + /* + * Keep a reference on the associated stage-2 MMU if the vCPU is + * scheduling out and not in WFI emulation, suggesting it is likely to + * reuse the MMU sometime soon. + */ + if (vcpu->scheduled_out && !vcpu_get_flag(vcpu, IN_WFI)) + return; + + if (kvm_is_nested_s2_mmu(vcpu->kvm, vcpu->arch.hw_mmu)) atomic_dec(&vcpu->arch.hw_mmu->refcnt); - vcpu->arch.hw_mmu = NULL; - } + + vcpu->arch.hw_mmu = NULL; } /* @@ -730,7 +755,7 @@ void kvm_nested_s2_wp(struct kvm *kvm) } } -void kvm_nested_s2_unmap(struct kvm *kvm) +void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block) { int i; @@ -740,7 +765,7 @@ void kvm_nested_s2_unmap(struct kvm *kvm) struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i]; if (kvm_s2_mmu_valid(mmu)) - kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu)); + kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block); } } @@ -1184,3 +1209,17 @@ int kvm_init_nv_sysregs(struct kvm *kvm) return 0; } + +void check_nested_vcpu_requests(struct kvm_vcpu *vcpu) +{ + if (kvm_check_request(KVM_REQ_NESTED_S2_UNMAP, vcpu)) { + struct kvm_s2_mmu *mmu = vcpu->arch.hw_mmu; + + write_lock(&vcpu->kvm->mmu_lock); + if (mmu->pending_unmap) { + kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), true); + mmu->pending_unmap = false; + } + write_unlock(&vcpu->kvm->mmu_lock); + } +} diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index dad88e31f953..ff8c4e1b847e 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1527,6 +1527,14 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_SME); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_RNDR_trap); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_NMI); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE_frac); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_GCS); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_THE); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTEX); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_DF2); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_PFAR); break; case SYS_ID_AA64PFR2_EL1: /* We only expose FPMR */ @@ -1550,7 +1558,8 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; break; case SYS_ID_AA64MMFR3_EL1: - val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE; + val &= ID_AA64MMFR3_EL1_TCRX | ID_AA64MMFR3_EL1_S1POE | + ID_AA64MMFR3_EL1_S1PIE; break; case SYS_ID_MMFR4_EL1: val &= ~ARM64_FEATURE_MASK(ID_MMFR4_EL1_CCIDX); @@ -1985,7 +1994,7 @@ static u64 reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) * one cache line. */ if (kvm_has_mte(vcpu->kvm)) - clidr |= 2 << CLIDR_TTYPE_SHIFT(loc); + clidr |= 2ULL << CLIDR_TTYPE_SHIFT(loc); __vcpu_sys_reg(vcpu, r->reg) = clidr; @@ -2376,7 +2385,19 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64PFR0_EL1_RAS | ID_AA64PFR0_EL1_AdvSIMD | ID_AA64PFR0_EL1_FP), }, - ID_SANITISED(ID_AA64PFR1_EL1), + ID_WRITABLE(ID_AA64PFR1_EL1, ~(ID_AA64PFR1_EL1_PFAR | + ID_AA64PFR1_EL1_DF2 | + ID_AA64PFR1_EL1_MTEX | + ID_AA64PFR1_EL1_THE | + ID_AA64PFR1_EL1_GCS | + ID_AA64PFR1_EL1_MTE_frac | + ID_AA64PFR1_EL1_NMI | + ID_AA64PFR1_EL1_RNDR_trap | + ID_AA64PFR1_EL1_SME | + ID_AA64PFR1_EL1_RES0 | + ID_AA64PFR1_EL1_MPAM_frac | + ID_AA64PFR1_EL1_RAS_frac | + ID_AA64PFR1_EL1_MTE)), ID_WRITABLE(ID_AA64PFR2_EL1, ID_AA64PFR2_EL1_FPMR), ID_UNALLOCATED(4,3), ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0), @@ -2390,7 +2411,21 @@ static const struct sys_reg_desc sys_reg_descs[] = { .get_user = get_id_reg, .set_user = set_id_aa64dfr0_el1, .reset = read_sanitised_id_aa64dfr0_el1, - .val = ID_AA64DFR0_EL1_PMUVer_MASK | + /* + * Prior to FEAT_Debugv8.9, the architecture defines context-aware + * breakpoints (CTX_CMPs) as the highest numbered breakpoints (BRPs). + * KVM does not trap + emulate the breakpoint registers, and as such + * cannot support a layout that misaligns with the underlying hardware. + * While it may be possible to describe a subset that aligns with + * hardware, just prevent changes to BRPs and CTX_CMPs altogether for + * simplicity. + * + * See DDI0487K.a, section D2.8.3 Breakpoint types and linking + * of breakpoints for more details. + */ + .val = ID_AA64DFR0_EL1_DoubleLock_MASK | + ID_AA64DFR0_EL1_WRPs_MASK | + ID_AA64DFR0_EL1_PMUVer_MASK | ID_AA64DFR0_EL1_DebugVer_MASK, }, ID_SANITISED(ID_AA64DFR1_EL1), ID_UNALLOCATED(5,2), @@ -2433,6 +2468,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_AA64MMFR2_EL1_NV | ID_AA64MMFR2_EL1_CCIDX)), ID_WRITABLE(ID_AA64MMFR3_EL1, (ID_AA64MMFR3_EL1_TCRX | + ID_AA64MMFR3_EL1_S1PIE | ID_AA64MMFR3_EL1_S1POE)), ID_SANITISED(ID_AA64MMFR4_EL1), ID_UNALLOCATED(7,5), @@ -2903,7 +2939,7 @@ static bool handle_alle1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, * Drop all shadow S2s, resulting in S1/S2 TLBIs for each of the * corresponding VMIDs. */ - kvm_nested_s2_unmap(vcpu->kvm); + kvm_nested_s2_unmap(vcpu->kvm, true); write_unlock(&vcpu->kvm->mmu_lock); @@ -2955,7 +2991,30 @@ union tlbi_info { static void s2_mmu_unmap_range(struct kvm_s2_mmu *mmu, const union tlbi_info *info) { - kvm_stage2_unmap_range(mmu, info->range.start, info->range.size); + /* + * The unmap operation is allowed to drop the MMU lock and block, which + * means that @mmu could be used for a different context than the one + * currently being invalidated. + * + * This behavior is still safe, as: + * + * 1) The vCPU(s) that recycled the MMU are responsible for invalidating + * the entire MMU before reusing it, which still honors the intent + * of a TLBI. + * + * 2) Until the guest TLBI instruction is 'retired' (i.e. increment PC + * and ERET to the guest), other vCPUs are allowed to use stale + * translations. + * + * 3) Accidentally unmapping an unrelated MMU context is nonfatal, and + * at worst may cause more aborts for shadow stage-2 fills. + * + * Dropping the MMU lock also implies that shadow stage-2 fills could + * happen behind the back of the TLBI. This is still safe, though, as + * the L1 needs to put its stage-2 in a consistent state before doing + * the TLBI. + */ + kvm_stage2_unmap_range(mmu, info->range.start, info->range.size, true); } static bool handle_vmalls12e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -3050,7 +3109,11 @@ static void s2_mmu_unmap_ipa(struct kvm_s2_mmu *mmu, max_size = compute_tlb_inval_range(mmu, info->ipa.addr); base_addr &= ~(max_size - 1); - kvm_stage2_unmap_range(mmu, base_addr, max_size); + /* + * See comment in s2_mmu_unmap_range() for why this is allowed to + * reschedule. + */ + kvm_stage2_unmap_range(mmu, base_addr, max_size, true); } static bool handle_ipas2e1is(struct kvm_vcpu *vcpu, struct sys_reg_params *p, diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c index e7c53e8af3d1..48c952563e85 100644 --- a/arch/arm64/kvm/vgic/vgic-init.c +++ b/arch/arm64/kvm/vgic/vgic-init.c @@ -417,8 +417,28 @@ static void __kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) kfree(vgic_cpu->private_irqs); vgic_cpu->private_irqs = NULL; - if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { + /* + * If this vCPU is being destroyed because of a failed creation + * then unregister the redistributor to avoid leaving behind a + * dangling pointer to the vCPU struct. + * + * vCPUs that have been successfully created (i.e. added to + * kvm->vcpu_array) get unregistered in kvm_vgic_destroy(), as + * this function gets called while holding kvm->arch.config_lock + * in the VM teardown path and would otherwise introduce a lock + * inversion w.r.t. kvm->srcu. + * + * vCPUs that failed creation are torn down outside of the + * kvm->arch.config_lock and do not get unregistered in + * kvm_vgic_destroy(), meaning it is both safe and necessary to + * do so here. + */ + if (kvm_get_vcpu_by_id(vcpu->kvm, vcpu->vcpu_id) != vcpu) + vgic_unregister_redist_iodev(vcpu); + vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; + } } void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -524,22 +544,31 @@ int kvm_vgic_map_resources(struct kvm *kvm) if (ret) goto out; - dist->ready = true; dist_base = dist->vgic_dist_base; mutex_unlock(&kvm->arch.config_lock); ret = vgic_register_dist_iodev(kvm, dist_base, type); - if (ret) + if (ret) { kvm_err("Unable to register VGIC dist MMIO regions\n"); + goto out_slots; + } + /* + * kvm_io_bus_register_dev() guarantees all readers see the new MMIO + * registration before returning through synchronize_srcu(), which also + * implies a full memory barrier. As such, marking the distributor as + * 'ready' here is guaranteed to be ordered after all vCPUs having seen + * a completely configured distributor. + */ + dist->ready = true; goto out_slots; out: mutex_unlock(&kvm->arch.config_lock); out_slots: - mutex_unlock(&kvm->slots_lock); - if (ret) - kvm_vgic_destroy(kvm); + kvm_vm_dead(kvm); + + mutex_unlock(&kvm->slots_lock); return ret; } diff --git a/arch/arm64/kvm/vgic/vgic-kvm-device.c b/arch/arm64/kvm/vgic/vgic-kvm-device.c index 1d26bb5b02f4..5f4f57aaa23e 100644 --- a/arch/arm64/kvm/vgic/vgic-kvm-device.c +++ b/arch/arm64/kvm/vgic/vgic-kvm-device.c @@ -236,7 +236,12 @@ static int vgic_set_common_attr(struct kvm_device *dev, mutex_lock(&dev->kvm->arch.config_lock); - if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_spis) + /* + * Either userspace has already configured NR_IRQS or + * the vgic has already been initialized and vgic_init() + * supplied a default amount of SPIs. + */ + if (dev->kvm->arch.vgic.nr_spis) ret = -EBUSY; else dev->kvm->arch.vgic.nr_spis = diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 8bbd0b20136a..5db82bfc9dc1 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -2220,7 +2220,11 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, emit(A64_STR64I(A64_R(20), A64_SP, regs_off + 8), ctx); if (flags & BPF_TRAMP_F_CALL_ORIG) { - emit_a64_mov_i64(A64_R(0), (const u64)im, ctx); + /* for the first pass, assume the worst case */ + if (!ctx->image) + ctx->idx += 4; + else + emit_a64_mov_i64(A64_R(0), (const u64)im, ctx); emit_call((const u64)__bpf_tramp_enter, ctx); } @@ -2264,7 +2268,11 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, if (flags & BPF_TRAMP_F_CALL_ORIG) { im->ip_epilogue = ctx->ro_image + ctx->idx; - emit_a64_mov_i64(A64_R(0), (const u64)im, ctx); + /* for the first pass, assume the worst case */ + if (!ctx->image) + ctx->idx += 4; + else + emit_a64_mov_i64(A64_R(0), (const u64)im, ctx); emit_call((const u64)__bpf_tramp_exit, ctx); } |