diff options
Diffstat (limited to 'arch')
-rw-r--r-- | arch/x86/kernel/callthunks.c | 4 | ||||
-rw-r--r-- | arch/x86/kernel/kprobes/core.c | 10 | ||||
-rw-r--r-- | arch/x86/kernel/kprobes/opt.c | 28 | ||||
-rw-r--r-- | arch/x86/kvm/hyperv.c | 63 | ||||
-rw-r--r-- | arch/x86/kvm/irq_comm.c | 5 | ||||
-rw-r--r-- | arch/x86/kvm/lapic.h | 4 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/spte.h | 2 | ||||
-rw-r--r-- | arch/x86/kvm/mmu/tdp_mmu.c | 25 | ||||
-rw-r--r-- | arch/x86/kvm/pmu.c | 3 | ||||
-rw-r--r-- | arch/x86/kvm/pmu.h | 3 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/nested.c | 20 | ||||
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c | 7 | ||||
-rw-r--r-- | arch/x86/kvm/x86.c | 3 | ||||
-rw-r--r-- | arch/x86/kvm/xen.c | 144 |
14 files changed, 181 insertions, 140 deletions
diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index 7d2c75ec9a8c..ffea98f9064b 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -119,7 +119,7 @@ static bool is_coretext(const struct core_text *ct, void *addr) return within_module_coretext(addr); } -static __init_or_module bool skip_addr(void *dest) +static bool skip_addr(void *dest) { if (dest == error_entry) return true; @@ -181,7 +181,7 @@ static const u8 nops[] = { 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, 0x90, }; -static __init_or_module void *patch_dest(void *dest, bool direct) +static void *patch_dest(void *dest, bool direct) { unsigned int tsize = SKL_TMPL_SIZE; u8 *pad = dest - tsize; diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 66299682b6b7..b36f3c367cb2 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -37,6 +37,7 @@ #include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> +#include <linux/kgdb.h> #include <linux/ftrace.h> #include <linux/kasan.h> #include <linux/moduleloader.h> @@ -281,12 +282,15 @@ static int can_probe(unsigned long paddr) if (ret < 0) return 0; +#ifdef CONFIG_KGDB /* - * Another debugging subsystem might insert this breakpoint. - * In that case, we can't recover it. + * If there is a dynamically installed kgdb sw breakpoint, + * this function should not be probed. */ - if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE && + kgdb_has_hit_break(addr)) return 0; +#endif addr += insn.length; } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index e6b8c5362b94..e57e07b0edb6 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -15,6 +15,7 @@ #include <linux/extable.h> #include <linux/kdebug.h> #include <linux/kallsyms.h> +#include <linux/kgdb.h> #include <linux/ftrace.h> #include <linux/objtool.h> #include <linux/pgtable.h> @@ -279,19 +280,6 @@ static int insn_is_indirect_jump(struct insn *insn) return ret; } -static bool is_padding_int3(unsigned long addr, unsigned long eaddr) -{ - unsigned char ops; - - for (; addr < eaddr; addr++) { - if (get_kernel_nofault(ops, (void *)addr) < 0 || - ops != INT3_INSN_OPCODE) - return false; - } - - return true; -} - /* Decode whole function to ensure any instructions don't jump into target */ static int can_optimize(unsigned long paddr) { @@ -334,15 +322,15 @@ static int can_optimize(unsigned long paddr) ret = insn_decode_kernel(&insn, (void *)recovered_insn); if (ret < 0) return 0; - +#ifdef CONFIG_KGDB /* - * In the case of detecting unknown breakpoint, this could be - * a padding INT3 between functions. Let's check that all the - * rest of the bytes are also INT3. + * If there is a dynamically installed kgdb sw breakpoint, + * this function should not be probed. */ - if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) - return is_padding_int3(addr, paddr - offset + size) ? 1 : 0; - + if (insn.opcode.bytes[0] == INT3_INSN_OPCODE && + kgdb_has_hit_break(addr)) + return 0; +#endif /* Recover address */ insn.kaddr = (void *)addr; insn.next_byte = (void *)(addr + insn.length); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 2c7f2a26421e..e8296942a868 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1769,6 +1769,7 @@ static bool hv_is_vp_in_sparse_set(u32 vp_id, u64 valid_bank_mask, u64 sparse_ba } struct kvm_hv_hcall { + /* Hypercall input data */ u64 param; u64 ingpa; u64 outgpa; @@ -1779,12 +1780,21 @@ struct kvm_hv_hcall { bool fast; bool rep; sse128_t xmm[HV_HYPERCALL_MAX_XMM_REGISTERS]; + + /* + * Current read offset when KVM reads hypercall input data gradually, + * either offset in bytes from 'ingpa' for regular hypercalls or the + * number of already consumed 'XMM halves' for 'fast' hypercalls. + */ + union { + gpa_t data_offset; + int consumed_xmm_halves; + }; }; static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc, - u16 orig_cnt, u16 cnt_cap, u64 *data, - int consumed_xmm_halves, gpa_t offset) + u16 orig_cnt, u16 cnt_cap, u64 *data) { /* * Preserve the original count when ignoring entries via a "cap", KVM @@ -1799,11 +1809,11 @@ static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc, * Each XMM holds two sparse banks, but do not count halves that * have already been consumed for hypercall parameters. */ - if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - consumed_xmm_halves) + if (orig_cnt > 2 * HV_HYPERCALL_MAX_XMM_REGISTERS - hc->consumed_xmm_halves) return HV_STATUS_INVALID_HYPERCALL_INPUT; for (i = 0; i < cnt; i++) { - j = i + consumed_xmm_halves; + j = i + hc->consumed_xmm_halves; if (j % 2) data[i] = sse128_hi(hc->xmm[j / 2]); else @@ -1812,27 +1822,24 @@ static int kvm_hv_get_hc_data(struct kvm *kvm, struct kvm_hv_hcall *hc, return 0; } - return kvm_read_guest(kvm, hc->ingpa + offset, data, + return kvm_read_guest(kvm, hc->ingpa + hc->data_offset, data, cnt * sizeof(*data)); } static u64 kvm_get_sparse_vp_set(struct kvm *kvm, struct kvm_hv_hcall *hc, - u64 *sparse_banks, int consumed_xmm_halves, - gpa_t offset) + u64 *sparse_banks) { if (hc->var_cnt > HV_MAX_SPARSE_VCPU_BANKS) return -EINVAL; /* Cap var_cnt to ignore banks that cannot contain a legal VP index. */ return kvm_hv_get_hc_data(kvm, hc, hc->var_cnt, KVM_HV_MAX_SPARSE_VCPU_SET_BITS, - sparse_banks, consumed_xmm_halves, offset); + sparse_banks); } -static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[], - int consumed_xmm_halves, gpa_t offset) +static int kvm_hv_get_tlb_flush_entries(struct kvm *kvm, struct kvm_hv_hcall *hc, u64 entries[]) { - return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt, - entries, consumed_xmm_halves, offset); + return kvm_hv_get_hc_data(kvm, hc, hc->rep_cnt, hc->rep_cnt, entries); } static void hv_tlb_flush_enqueue(struct kvm_vcpu *vcpu, @@ -1926,8 +1933,6 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) struct kvm_vcpu *v; unsigned long i; bool all_cpus; - int consumed_xmm_halves = 0; - gpa_t data_offset; /* * The Hyper-V TLFS doesn't allow more than HV_MAX_SPARSE_VCPU_BANKS @@ -1955,12 +1960,12 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) flush.address_space = hc->ingpa; flush.flags = hc->outgpa; flush.processor_mask = sse128_lo(hc->xmm[0]); - consumed_xmm_halves = 1; + hc->consumed_xmm_halves = 1; } else { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush, sizeof(flush)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; - data_offset = sizeof(flush); + hc->data_offset = sizeof(flush); } trace_kvm_hv_flush_tlb(flush.processor_mask, @@ -1985,12 +1990,12 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) flush_ex.flags = hc->outgpa; memcpy(&flush_ex.hv_vp_set, &hc->xmm[0], sizeof(hc->xmm[0])); - consumed_xmm_halves = 2; + hc->consumed_xmm_halves = 2; } else { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &flush_ex, sizeof(flush_ex)))) return HV_STATUS_INVALID_HYPERCALL_INPUT; - data_offset = sizeof(flush_ex); + hc->data_offset = sizeof(flush_ex); } trace_kvm_hv_flush_tlb_ex(flush_ex.hv_vp_set.valid_bank_mask, @@ -2009,8 +2014,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) if (!hc->var_cnt) goto ret_success; - if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks, - consumed_xmm_halves, data_offset)) + if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks)) return HV_STATUS_INVALID_HYPERCALL_INPUT; } @@ -2021,8 +2025,10 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) * consumed_xmm_halves to make sure TLB flush entries are read * from the correct offset. */ - data_offset += hc->var_cnt * sizeof(sparse_banks[0]); - consumed_xmm_halves += hc->var_cnt; + if (hc->fast) + hc->consumed_xmm_halves += hc->var_cnt; + else + hc->data_offset += hc->var_cnt * sizeof(sparse_banks[0]); } if (hc->code == HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE || @@ -2030,8 +2036,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) hc->rep_cnt > ARRAY_SIZE(__tlb_flush_entries)) { tlb_flush_entries = NULL; } else { - if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries, - consumed_xmm_halves, data_offset)) + if (kvm_hv_get_tlb_flush_entries(kvm, hc, __tlb_flush_entries)) return HV_STATUS_INVALID_HYPERCALL_INPUT; tlb_flush_entries = __tlb_flush_entries; } @@ -2180,9 +2185,13 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) if (!hc->var_cnt) goto ret_success; - if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks, 1, - offsetof(struct hv_send_ipi_ex, - vp_set.bank_contents))) + if (!hc->fast) + hc->data_offset = offsetof(struct hv_send_ipi_ex, + vp_set.bank_contents); + else + hc->consumed_xmm_halves = 1; + + if (kvm_get_sparse_vp_set(kvm, hc, sparse_banks)) return HV_STATUS_INVALID_HYPERCALL_INPUT; } diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 0687162c4f22..3742d9adacfc 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -426,8 +426,9 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, kvm_set_msi_irq(vcpu->kvm, entry, &irq); if (irq.trig_mode && - kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, - irq.dest_id, irq.dest_mode)) + (kvm_apic_match_dest(vcpu, NULL, APIC_DEST_NOSHORT, + irq.dest_id, irq.dest_mode) || + kvm_apic_pending_eoi(vcpu, irq.vector))) __set_bit(irq.vector, ioapic_handled_vectors); } } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 28e3769066e2..58c3242fcc7a 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -188,11 +188,11 @@ static inline bool lapic_in_kernel(struct kvm_vcpu *vcpu) extern struct static_key_false_deferred apic_hw_disabled; -static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic) +static inline bool kvm_apic_hw_enabled(struct kvm_lapic *apic) { if (static_branch_unlikely(&apic_hw_disabled.key)) return apic->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; - return MSR_IA32_APICBASE_ENABLE; + return true; } extern struct static_key_false_deferred apic_sw_disabled; diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index 1f03701b943a..6f54dc9409c9 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -363,7 +363,7 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, * A shadow-present leaf SPTE may be non-writable for 4 possible reasons: * * 1. To intercept writes for dirty logging. KVM write-protects huge pages - * so that they can be split be split down into the dirty logging + * so that they can be split down into the dirty logging * granularity (4KiB) whenever the guest writes to them. KVM also * write-protects 4KiB pages so that writes can be recorded in the dirty log * (e.g. if not using PML). SPTEs are write-protected for dirty logging diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 771210ce5181..d6df38d371a0 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -1074,7 +1074,9 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int ret = RET_PF_FIXED; bool wrprot = false; - WARN_ON(sp->role.level != fault->goal_level); + if (WARN_ON_ONCE(sp->role.level != fault->goal_level)) + return RET_PF_RETRY; + if (unlikely(!fault->slot)) new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_ALL); else @@ -1173,9 +1175,6 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, iter.old_spte, iter.level); - if (iter.level == fault->goal_level) - break; - /* * If SPTE has been frozen by another thread, just give up and * retry, avoiding unnecessary page table allocation and free. @@ -1183,6 +1182,9 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (is_removed_spte(iter.old_spte)) goto retry; + if (iter.level == fault->goal_level) + goto map_target_level; + /* Step down into the lower level page table if it exists. */ if (is_shadow_present_pte(iter.old_spte) && !is_large_pte(iter.old_spte)) @@ -1203,8 +1205,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) r = tdp_mmu_link_sp(kvm, &iter, sp, true); /* - * Also force the guest to retry the access if the upper level SPTEs - * aren't in place. + * Force the guest to retry if installing an upper level SPTE + * failed, e.g. because a different task modified the SPTE. */ if (r) { tdp_mmu_free_sp(sp); @@ -1214,11 +1216,20 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) if (fault->huge_page_disallowed && fault->req_level >= iter.level) { spin_lock(&kvm->arch.tdp_mmu_pages_lock); - track_possible_nx_huge_page(kvm, sp); + if (sp->nx_huge_page_disallowed) + track_possible_nx_huge_page(kvm, sp); spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } } + /* + * The walk aborted before reaching the target level, e.g. because the + * iterator detected an upper level SPTE was frozen during traversal. + */ + WARN_ON_ONCE(iter.level == fault->goal_level); + goto retry; + +map_target_level: ret = tdp_mmu_map_handle_target_level(vcpu, fault, &iter); retry: diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 684393c22105..eb594620dd75 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -238,7 +238,8 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) return false; /* recalibrate sample period and check if it's accepted by perf core */ - if (perf_event_period(pmc->perf_event, + if (is_sampling_event(pmc->perf_event) && + perf_event_period(pmc->perf_event, get_sample_period(pmc, pmc->counter))) return false; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 85ff3c0588ba..cdb91009701d 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -140,7 +140,8 @@ static inline u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value) static inline void pmc_update_sample_period(struct kvm_pmc *pmc) { - if (!pmc->perf_event || pmc->is_paused) + if (!pmc->perf_event || pmc->is_paused || + !is_sampling_event(pmc->perf_event)) return; perf_event_period(pmc->perf_event, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b6f4411b613e..d93c715cda6a 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5296,10 +5296,19 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vcpu); - kvm_vcpu_write_guest(vcpu, - vmptr + offsetof(struct vmcs12, - launch_state), - &zero, sizeof(zero)); + /* + * Silently ignore memory errors on VMCLEAR, Intel's pseudocode + * for VMCLEAR includes a "ensure that data for VMCS referenced + * by the operand is in memory" clause that guards writes to + * memory, i.e. doing nothing for I/O is architecturally valid. + * + * FIXME: Suppress failures if and only if no memslot is found, + * i.e. exit to userspace if __copy_to_user() fails. + */ + (void)kvm_vcpu_write_guest(vcpu, + vmptr + offsetof(struct vmcs12, + launch_state), + &zero, sizeof(zero)); } else if (vmx->nested.hv_evmcs && vmptr == vmx->nested.hv_evmcs_vmptr) { nested_release_evmcs(vcpu); } @@ -6873,7 +6882,8 @@ void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps) SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_RDSEED_EXITING | SECONDARY_EXEC_XSAVES | - SECONDARY_EXEC_TSC_SCALING; + SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; /* * We can emulate "VMCS shadowing," even if the hardware diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index fe5615fd8295..fc9008dbed33 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4459,6 +4459,13 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, * controls for features that are/aren't exposed to the guest. */ if (nested) { + /* + * All features that can be added or removed to VMX MSRs must + * be supported in the first place for nested virtualization. + */ + if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control))) + enabled = false; + if (enabled) vmx->nested.msrs.secondary_ctls_high |= control; else diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 312aea1854ae..da4bbd043a7b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13132,6 +13132,9 @@ int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r, struct x86_exception *e) { if (r == X86EMUL_PROPAGATE_FAULT) { + if (KVM_BUG_ON(!e, vcpu->kvm)) + return -EIO; + kvm_inject_emulated_page_fault(vcpu, e); return 1; } diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index d7af40240248..2e29bdc2949c 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -41,7 +41,7 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) int ret = 0; int idx = srcu_read_lock(&kvm->srcu); - if (gfn == GPA_INVALID) { + if (gfn == KVM_XEN_INVALID_GFN) { kvm_gpc_deactivate(gpc); goto out; } @@ -659,7 +659,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) if (kvm->arch.xen.shinfo_cache.active) data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); else - data->u.shared_info.gfn = GPA_INVALID; + data->u.shared_info.gfn = KVM_XEN_INVALID_GFN; r = 0; break; @@ -705,7 +705,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) BUILD_BUG_ON(offsetof(struct vcpu_info, time) != offsetof(struct compat_vcpu_info, time)); - if (data->u.gpa == GPA_INVALID) { + if (data->u.gpa == KVM_XEN_INVALID_GPA) { kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache); r = 0; break; @@ -719,7 +719,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO: - if (data->u.gpa == GPA_INVALID) { + if (data->u.gpa == KVM_XEN_INVALID_GPA) { kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache); r = 0; break; @@ -739,7 +739,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) r = -EOPNOTSUPP; break; } - if (data->u.gpa == GPA_INVALID) { + if (data->u.gpa == KVM_XEN_INVALID_GPA) { r = 0; deactivate_out: kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache); @@ -937,7 +937,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) if (vcpu->arch.xen.vcpu_info_cache.active) data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa; else - data->u.gpa = GPA_INVALID; + data->u.gpa = KVM_XEN_INVALID_GPA; r = 0; break; @@ -945,7 +945,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) if (vcpu->arch.xen.vcpu_time_info_cache.active) data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa; else - data->u.gpa = GPA_INVALID; + data->u.gpa = KVM_XEN_INVALID_GPA; r = 0; break; @@ -1069,6 +1069,7 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64 : kvm->arch.xen_hvm_config.blob_size_32; u8 *page; + int ret; if (page_num >= blob_size) return 1; @@ -1079,10 +1080,10 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) if (IS_ERR(page)) return PTR_ERR(page); - if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) { - kfree(page); + ret = kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE); + kfree(page); + if (ret) return 1; - } } return 0; } @@ -1183,30 +1184,22 @@ static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports, static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, u64 param, u64 *r) { - int idx, i; struct sched_poll sched_poll; evtchn_port_t port, *ports; - gpa_t gpa; + struct x86_exception e; + int i; if (!lapic_in_kernel(vcpu) || !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND)) return false; - idx = srcu_read_lock(&vcpu->kvm->srcu); - gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); - srcu_read_unlock(&vcpu->kvm->srcu, idx); - if (!gpa) { - *r = -EFAULT; - return true; - } - if (IS_ENABLED(CONFIG_64BIT) && !longmode) { struct compat_sched_poll sp32; /* Sanity check that the compat struct definition is correct */ BUILD_BUG_ON(sizeof(sp32) != 16); - if (kvm_vcpu_read_guest(vcpu, gpa, &sp32, sizeof(sp32))) { + if (kvm_read_guest_virt(vcpu, param, &sp32, sizeof(sp32), &e)) { *r = -EFAULT; return true; } @@ -1220,8 +1213,8 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, sched_poll.nr_ports = sp32.nr_ports; sched_poll.timeout = sp32.timeout; } else { - if (kvm_vcpu_read_guest(vcpu, gpa, &sched_poll, - sizeof(sched_poll))) { + if (kvm_read_guest_virt(vcpu, param, &sched_poll, + sizeof(sched_poll), &e)) { *r = -EFAULT; return true; } @@ -1243,18 +1236,13 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode, } else ports = &port; + if (kvm_read_guest_virt(vcpu, (gva_t)sched_poll.ports, ports, + sched_poll.nr_ports * sizeof(*ports), &e)) { + *r = -EFAULT; + return true; + } + for (i = 0; i < sched_poll.nr_ports; i++) { - idx = srcu_read_lock(&vcpu->kvm->srcu); - gpa = kvm_mmu_gva_to_gpa_system(vcpu, - (gva_t)(sched_poll.ports + i), - NULL); - srcu_read_unlock(&vcpu->kvm->srcu, idx); - - if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, - &ports[i], sizeof(port))) { - *r = -EFAULT; - goto out; - } if (ports[i] >= max_evtchn_port(vcpu->kvm)) { *r = -EINVAL; goto out; @@ -1330,9 +1318,8 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, int vcpu_id, u64 param, u64 *r) { struct vcpu_set_singleshot_timer oneshot; + struct x86_exception e; s64 delta; - gpa_t gpa; - int idx; if (!kvm_xen_timer_enabled(vcpu)) return false; @@ -1343,9 +1330,6 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, *r = -EINVAL; return true; } - idx = srcu_read_lock(&vcpu->kvm->srcu); - gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); - srcu_read_unlock(&vcpu->kvm->srcu, idx); /* * The only difference for 32-bit compat is the 4 bytes of @@ -1363,9 +1347,8 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) != sizeof_field(struct vcpu_set_singleshot_timer, flags)); - if (!gpa || - kvm_vcpu_read_guest(vcpu, gpa, &oneshot, longmode ? sizeof(oneshot) : - sizeof(struct compat_vcpu_set_singleshot_timer))) { + if (kvm_read_guest_virt(vcpu, param, &oneshot, longmode ? sizeof(oneshot) : + sizeof(struct compat_vcpu_set_singleshot_timer), &e)) { *r = -EFAULT; return true; } @@ -1825,20 +1808,20 @@ static int kvm_xen_eventfd_update(struct kvm *kvm, { u32 port = data->u.evtchn.send_port; struct evtchnfd *evtchnfd; + int ret; - if (!port || port >= max_evtchn_port(kvm)) - return -EINVAL; - + /* Protect writes to evtchnfd as well as the idr lookup. */ mutex_lock(&kvm->lock); evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port); - mutex_unlock(&kvm->lock); + ret = -ENOENT; if (!evtchnfd) - return -ENOENT; + goto out_unlock; /* For an UPDATE, nothing may change except the priority/vcpu */ + ret = -EINVAL; if (evtchnfd->type != data->u.evtchn.type) - return -EINVAL; + goto out_unlock; /* * Port cannot change, and if it's zero that was an eventfd @@ -1846,20 +1829,21 @@ static int kvm_xen_eventfd_update(struct kvm *kvm, */ if (!evtchnfd->deliver.port.port || evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port) - return -EINVAL; + goto out_unlock; /* We only support 2 level event channels for now */ if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) - return -EINVAL; + goto out_unlock; - mutex_lock(&kvm->lock); evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) { evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu; evtchnfd->deliver.port.vcpu_idx = -1; } + ret = 0; +out_unlock: mutex_unlock(&kvm->lock); - return 0; + return ret; } /* @@ -1871,12 +1855,9 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm, { u32 port = data->u.evtchn.send_port; struct eventfd_ctx *eventfd = NULL; - struct evtchnfd *evtchnfd = NULL; + struct evtchnfd *evtchnfd; int ret = -EINVAL; - if (!port || port >= max_evtchn_port(kvm)) - return -EINVAL; - evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL); if (!evtchnfd) return -ENOMEM; @@ -1952,8 +1933,7 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port) if (!evtchnfd) return -ENOENT; - if (kvm) - synchronize_srcu(&kvm->srcu); + synchronize_srcu(&kvm->srcu); if (!evtchnfd->deliver.port.port) eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); kfree(evtchnfd); @@ -1962,18 +1942,42 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port) static int kvm_xen_eventfd_reset(struct kvm *kvm) { - struct evtchnfd *evtchnfd; + struct evtchnfd *evtchnfd, **all_evtchnfds; int i; + int n = 0; mutex_lock(&kvm->lock); + + /* + * Because synchronize_srcu() cannot be called inside the + * critical section, first collect all the evtchnfd objects + * in an array as they are removed from evtchn_ports. + */ + idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) + n++; + + all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL); + if (!all_evtchnfds) { + mutex_unlock(&kvm->lock); + return -ENOMEM; + } + + n = 0; idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { + all_evtchnfds[n++] = evtchnfd; idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port); - synchronize_srcu(&kvm->srcu); + } + mutex_unlock(&kvm->lock); + + synchronize_srcu(&kvm->srcu); + + while (n--) { + evtchnfd = all_evtchnfds[n]; if (!evtchnfd->deliver.port.port) eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx); kfree(evtchnfd); } - mutex_unlock(&kvm->lock); + kfree(all_evtchnfds); return 0; } @@ -2002,20 +2006,22 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r) { struct evtchnfd *evtchnfd; struct evtchn_send send; - gpa_t gpa; - int idx; - - idx = srcu_read_lock(&vcpu->kvm->srcu); - gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + struct x86_exception e; - if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &send, sizeof(send))) { + /* Sanity check: this structure is the same for 32-bit and 64-bit */ + BUILD_BUG_ON(sizeof(send) != 4); + if (kvm_read_guest_virt(vcpu, param, &send, sizeof(send), &e)) { *r = -EFAULT; return true; } - /* The evtchn_ports idr is protected by vcpu->kvm->srcu */ + /* + * evtchnfd is protected by kvm->srcu; the idr lookup instead + * is protected by RCU. + */ + rcu_read_lock(); evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port); + rcu_read_unlock(); if (!evtchnfd) return false; |