From c14d08c5adb25f397638be3d8dd7f4738fb38272 Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Wed, 26 Apr 2023 17:23:19 +0000 Subject: KVM: arm64: Rename free_removed to free_unlinked Normalize on referring to tables outside of an active paging structure as 'unlinked'. A subsequent change to KVM will add support for building page tables that are not part of an active paging structure. The existing 'removed_table' terminology is quite clunky when applied in this context. Signed-off-by: Ricardo Koller Reviewed-by: Oliver Upton Reviewed-by: Shaoqin Huang Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20230426172330.1439644-2-ricarkol@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_pgtable.h | 8 ++++---- arch/arm64/kvm/hyp/nvhe/mem_protect.c | 6 +++--- arch/arm64/kvm/hyp/pgtable.c | 6 +++--- arch/arm64/kvm/mmu.c | 10 +++++----- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 4cd6762bda80..26a4293726c1 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -104,7 +104,7 @@ static inline bool kvm_level_supports_block_mapping(u32 level) * allocation is physically contiguous. * @free_pages_exact: Free an exact number of memory pages previously * allocated by zalloc_pages_exact. - * @free_removed_table: Free a removed paging structure by unlinking and + * @free_unlinked_table: Free an unlinked paging structure by unlinking and * dropping references. * @get_page: Increment the refcount on a page. * @put_page: Decrement the refcount on a page. When the @@ -124,7 +124,7 @@ struct kvm_pgtable_mm_ops { void* (*zalloc_page)(void *arg); void* (*zalloc_pages_exact)(size_t size); void (*free_pages_exact)(void *addr, size_t size); - void (*free_removed_table)(void *addr, u32 level); + void (*free_unlinked_table)(void *addr, u32 level); void (*get_page)(void *addr); void (*put_page)(void *addr); int (*page_count)(void *addr); @@ -440,7 +440,7 @@ int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); /** - * kvm_pgtable_stage2_free_removed() - Free a removed stage-2 paging structure. + * kvm_pgtable_stage2_free_unlinked() - Free an unlinked stage-2 paging structure. * @mm_ops: Memory management callbacks. * @pgtable: Unlinked stage-2 paging structure to be freed. * @level: Level of the stage-2 paging structure to be freed. @@ -448,7 +448,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); * The page-table is assumed to be unreachable by any hardware walkers prior to * freeing and therefore no TLB invalidation is performed. */ -void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level); +void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level); /** * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table. diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 2e9ec4a2a4a3..d35e75b13ffe 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -91,9 +91,9 @@ static void host_s2_put_page(void *addr) hyp_put_page(&host_s2_pool, addr); } -static void host_s2_free_removed_table(void *addr, u32 level) +static void host_s2_free_unlinked_table(void *addr, u32 level) { - kvm_pgtable_stage2_free_removed(&host_mmu.mm_ops, addr, level); + kvm_pgtable_stage2_free_unlinked(&host_mmu.mm_ops, addr, level); } static int prepare_s2_pool(void *pgt_pool_base) @@ -110,7 +110,7 @@ static int prepare_s2_pool(void *pgt_pool_base) host_mmu.mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_pages_exact = host_s2_zalloc_pages_exact, .zalloc_page = host_s2_zalloc_page, - .free_removed_table = host_s2_free_removed_table, + .free_unlinked_table = host_s2_free_unlinked_table, .phys_to_virt = hyp_phys_to_virt, .virt_to_phys = hyp_virt_to_phys, .page_count = hyp_page_count, diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 3d61bd3e591d..a3246d6cddec 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -860,7 +860,7 @@ static int stage2_map_walk_table_pre(const struct kvm_pgtable_visit_ctx *ctx, if (ret) return ret; - mm_ops->free_removed_table(childp, ctx->level); + mm_ops->free_unlinked_table(childp, ctx->level); return 0; } @@ -905,7 +905,7 @@ static int stage2_map_walk_leaf(const struct kvm_pgtable_visit_ctx *ctx, * The TABLE_PRE callback runs for table entries on the way down, looking * for table entries which we could conceivably replace with a block entry * for this mapping. If it finds one it replaces the entry and calls - * kvm_pgtable_mm_ops::free_removed_table() to tear down the detached table. + * kvm_pgtable_mm_ops::free_unlinked_table() to tear down the detached table. * * Otherwise, the LEAF callback performs the mapping at the existing leaves * instead. @@ -1276,7 +1276,7 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) pgt->pgd = NULL; } -void kvm_pgtable_stage2_free_removed(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) +void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level) { kvm_pteref_t ptep = (kvm_pteref_t)pgtable; struct kvm_pgtable_walker walker = { diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 3b9d4d24c361..a0d3c773af99 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -131,21 +131,21 @@ static void kvm_s2_free_pages_exact(void *virt, size_t size) static struct kvm_pgtable_mm_ops kvm_s2_mm_ops; -static void stage2_free_removed_table_rcu_cb(struct rcu_head *head) +static void stage2_free_unlinked_table_rcu_cb(struct rcu_head *head) { struct page *page = container_of(head, struct page, rcu_head); void *pgtable = page_to_virt(page); u32 level = page_private(page); - kvm_pgtable_stage2_free_removed(&kvm_s2_mm_ops, pgtable, level); + kvm_pgtable_stage2_free_unlinked(&kvm_s2_mm_ops, pgtable, level); } -static void stage2_free_removed_table(void *addr, u32 level) +static void stage2_free_unlinked_table(void *addr, u32 level) { struct page *page = virt_to_page(addr); set_page_private(page, (unsigned long)level); - call_rcu(&page->rcu_head, stage2_free_removed_table_rcu_cb); + call_rcu(&page->rcu_head, stage2_free_unlinked_table_rcu_cb); } static void kvm_host_get_page(void *addr) @@ -701,7 +701,7 @@ static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .zalloc_page = stage2_memcache_zalloc_page, .zalloc_pages_exact = kvm_s2_zalloc_pages_exact, .free_pages_exact = kvm_s2_free_pages_exact, - .free_removed_table = stage2_free_removed_table, + .free_unlinked_table = stage2_free_unlinked_table, .get_page = kvm_host_get_page, .put_page = kvm_s2_put_page, .page_count = kvm_host_page_count, -- cgit v1.2.3 From 02f10845f435fbda4aa2385d4c3a9730c4a5c75a Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Wed, 26 Apr 2023 17:23:20 +0000 Subject: KVM: arm64: Add KVM_PGTABLE_WALK flags for skipping CMOs and BBM TLBIs Add two flags to kvm_pgtable_visit_ctx, KVM_PGTABLE_WALK_SKIP_BBM_TLBI and KVM_PGTABLE_WALK_SKIP_CMO, to indicate that the walk should not perform TLB invalidations (TLBIs) in break-before-make (BBM) nor cache maintenance operations (CMO). This will be used by a future commit to create unlinked tables not accessible to the HW page-table walker. Signed-off-by: Ricardo Koller Reviewed-by: Shaoqin Huang Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20230426172330.1439644-3-ricarkol@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_pgtable.h | 8 ++++++++ arch/arm64/kvm/hyp/pgtable.c | 37 +++++++++++++++++++++++++----------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 26a4293726c1..3f2d43ba2b62 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -195,6 +195,12 @@ typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, * with other software walkers. * @KVM_PGTABLE_WALK_HANDLE_FAULT: Indicates the page-table walk was * invoked from a fault handler. + * @KVM_PGTABLE_WALK_SKIP_BBM_TLBI: Visit and update table entries + * without Break-before-make's + * TLB invalidation. + * @KVM_PGTABLE_WALK_SKIP_CMO: Visit and update table entries + * without Cache maintenance + * operations required. */ enum kvm_pgtable_walk_flags { KVM_PGTABLE_WALK_LEAF = BIT(0), @@ -202,6 +208,8 @@ enum kvm_pgtable_walk_flags { KVM_PGTABLE_WALK_TABLE_POST = BIT(2), KVM_PGTABLE_WALK_SHARED = BIT(3), KVM_PGTABLE_WALK_HANDLE_FAULT = BIT(4), + KVM_PGTABLE_WALK_SKIP_BBM_TLBI = BIT(5), + KVM_PGTABLE_WALK_SKIP_CMO = BIT(6), }; struct kvm_pgtable_visit_ctx { diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index a3246d6cddec..633679ee3c49 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -62,6 +62,16 @@ struct kvm_pgtable_walk_data { u64 end; }; +static bool kvm_pgtable_walk_skip_bbm_tlbi(const struct kvm_pgtable_visit_ctx *ctx) +{ + return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_BBM_TLBI); +} + +static bool kvm_pgtable_walk_skip_cmo(const struct kvm_pgtable_visit_ctx *ctx) +{ + return unlikely(ctx->flags & KVM_PGTABLE_WALK_SKIP_CMO); +} + static bool kvm_phys_is_valid(u64 phys) { return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_EL1_PARANGE_MAX)); @@ -741,14 +751,17 @@ static bool stage2_try_break_pte(const struct kvm_pgtable_visit_ctx *ctx, if (!stage2_try_set_pte(ctx, KVM_INVALID_PTE_LOCKED)) return false; - /* - * Perform the appropriate TLB invalidation based on the evicted pte - * value (if any). - */ - if (kvm_pte_table(ctx->old, ctx->level)) - kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); - else if (kvm_pte_valid(ctx->old)) - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, ctx->addr, ctx->level); + if (!kvm_pgtable_walk_skip_bbm_tlbi(ctx)) { + /* + * Perform the appropriate TLB invalidation based on the + * evicted pte value (if any). + */ + if (kvm_pte_table(ctx->old, ctx->level)) + kvm_call_hyp(__kvm_tlb_flush_vmid, mmu); + else if (kvm_pte_valid(ctx->old)) + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, + ctx->addr, ctx->level); + } if (stage2_pte_is_counted(ctx->old)) mm_ops->put_page(ctx->ptep); @@ -832,11 +845,13 @@ static int stage2_map_walker_try_leaf(const struct kvm_pgtable_visit_ctx *ctx, return -EAGAIN; /* Perform CMOs before installation of the guest stage-2 PTE */ - if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new)) + if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->dcache_clean_inval_poc && + stage2_pte_cacheable(pgt, new)) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(new, mm_ops), - granule); + granule); - if (mm_ops->icache_inval_pou && stage2_pte_executable(new)) + if (!kvm_pgtable_walk_skip_cmo(ctx) && mm_ops->icache_inval_pou && + stage2_pte_executable(new)) mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); stage2_make_pte(ctx, new); -- cgit v1.2.3 From e7c05540c694b2f53a4d25e360c39984d521ccb1 Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Wed, 26 Apr 2023 17:23:21 +0000 Subject: KVM: arm64: Add helper for creating unlinked stage2 subtrees Add a stage2 helper, kvm_pgtable_stage2_create_unlinked(), for creating unlinked tables (which is the opposite of kvm_pgtable_stage2_free_unlinked()). Creating an unlinked table is useful for splitting level 1 and 2 entries into subtrees of PAGE_SIZE PTEs. For example, a level 1 entry can be split into PAGE_SIZE PTEs by first creating a fully populated tree, and then use it to replace the level 1 entry in a single step. This will be used in a subsequent commit for eager huge-page splitting (a dirty-logging optimization). Signed-off-by: Ricardo Koller Reviewed-by: Shaoqin Huang Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20230426172330.1439644-4-ricarkol@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_pgtable.h | 26 ++++++++++++++++++ arch/arm64/kvm/hyp/pgtable.c | 53 ++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 3f2d43ba2b62..c8e0e7d9303b 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -458,6 +458,32 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt); */ void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, u32 level); +/** + * kvm_pgtable_stage2_create_unlinked() - Create an unlinked stage-2 paging structure. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). + * @phys: Physical address of the memory to map. + * @level: Starting level of the stage-2 paging structure to be created. + * @prot: Permissions and attributes for the mapping. + * @mc: Cache of pre-allocated and zeroed memory from which to allocate + * page-table pages. + * @force_pte: Force mappings to PAGE_SIZE granularity. + * + * Returns an unlinked page-table tree. This new page-table tree is + * not reachable (i.e., it is unlinked) from the root pgd and it's + * therefore unreachableby the hardware page-table walker. No TLB + * invalidation or CMOs are performed. + * + * If device attributes are not explicitly requested in @prot, then the + * mapping will be normal, cacheable. + * + * Return: The fully populated (unlinked) stage-2 paging structure, or + * an ERR_PTR(error) on failure. + */ +kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, + u64 phys, u32 level, + enum kvm_pgtable_prot prot, + void *mc, bool force_pte); + /** * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table. * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 633679ee3c49..56edffc02bc6 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1222,6 +1222,59 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) return kvm_pgtable_walk(pgt, addr, size, &walker); } +kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, + u64 phys, u32 level, + enum kvm_pgtable_prot prot, + void *mc, bool force_pte) +{ + struct stage2_map_data map_data = { + .phys = phys, + .mmu = pgt->mmu, + .memcache = mc, + .force_pte = force_pte, + }; + struct kvm_pgtable_walker walker = { + .cb = stage2_map_walker, + .flags = KVM_PGTABLE_WALK_LEAF | + KVM_PGTABLE_WALK_SKIP_BBM_TLBI | + KVM_PGTABLE_WALK_SKIP_CMO, + .arg = &map_data, + }; + /* + * The input address (.addr) is irrelevant for walking an + * unlinked table. Construct an ambiguous IA range to map + * kvm_granule_size(level) worth of memory. + */ + struct kvm_pgtable_walk_data data = { + .walker = &walker, + .addr = 0, + .end = kvm_granule_size(level), + }; + struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; + kvm_pte_t *pgtable; + int ret; + + if (!IS_ALIGNED(phys, kvm_granule_size(level))) + return ERR_PTR(-EINVAL); + + ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); + if (ret) + return ERR_PTR(ret); + + pgtable = mm_ops->zalloc_page(mc); + if (!pgtable) + return ERR_PTR(-ENOMEM); + + ret = __kvm_pgtable_walk(&data, mm_ops, (kvm_pteref_t)pgtable, + level + 1); + if (ret) { + kvm_pgtable_stage2_free_unlinked(mm_ops, pgtable, level); + mm_ops->put_page(pgtable); + return ERR_PTR(ret); + } + + return pgtable; +} int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops, -- cgit v1.2.3 From 26f457142d7ee2da20a5b701862230e4961423d9 Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Wed, 26 Apr 2023 17:23:22 +0000 Subject: KVM: arm64: Export kvm_are_all_memslots_empty() Export kvm_are_all_memslots_empty(). This will be used by a future commit when checking before setting a capability. Signed-off-by: Ricardo Koller Reviewed-by: Shaoqin Huang Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20230426172330.1439644-5-ricarkol@google.com Signed-off-by: Oliver Upton --- include/linux/kvm_host.h | 2 ++ virt/kvm/kvm_main.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0e571e973bc2..7651069ada46 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -991,6 +991,8 @@ static inline bool kvm_memslots_empty(struct kvm_memslots *slots) return RB_EMPTY_ROOT(&slots->gfn_tree); } +bool kvm_are_all_memslots_empty(struct kvm *kvm); + #define kvm_for_each_memslot(memslot, bkt, slots) \ hash_for_each(slots->id_hash, bkt, memslot, id_node[slots->node_idx]) \ if (WARN_ON_ONCE(!memslot->npages)) { \ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cb5c13eee193..13aed654111a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4598,7 +4598,7 @@ int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm, return -EINVAL; } -static bool kvm_are_all_memslots_empty(struct kvm *kvm) +bool kvm_are_all_memslots_empty(struct kvm *kvm) { int i; @@ -4611,6 +4611,7 @@ static bool kvm_are_all_memslots_empty(struct kvm *kvm) return true; } +EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty); static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm, struct kvm_enable_cap *cap) -- cgit v1.2.3 From 2f440b72e852be428540579b5813ba2b8236578d Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Wed, 26 Apr 2023 17:23:23 +0000 Subject: KVM: arm64: Add KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE Add a capability for userspace to specify the eager split chunk size. The chunk size specifies how many pages to break at a time, using a single allocation. Bigger the chunk size, more pages need to be allocated ahead of time. Suggested-by: Oliver Upton Signed-off-by: Ricardo Koller Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20230426172330.1439644-6-ricarkol@google.com Signed-off-by: Oliver Upton --- Documentation/virt/kvm/api.rst | 27 +++++++++++++++++++++++++++ arch/arm64/include/asm/kvm_host.h | 15 +++++++++++++++ arch/arm64/include/asm/kvm_pgtable.h | 18 ++++++++++++++++++ arch/arm64/kvm/arm.c | 28 ++++++++++++++++++++++++++++ arch/arm64/kvm/mmu.c | 4 ++++ include/uapi/linux/kvm.h | 2 ++ 6 files changed, 94 insertions(+) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index add067793b90..656bd293c8f4 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8445,6 +8445,33 @@ structure. When getting the Modified Change Topology Report value, the attr->addr must point to a byte where the value will be stored or retrieved from. +8.40 KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE +--------------------------------------- + +:Capability: KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE +:Architectures: arm64 +:Type: vm +:Parameters: arg[0] is the new split chunk size. +:Returns: 0 on success, -EINVAL if any memslot was already created. + +This capability sets the chunk size used in Eager Page Splitting. + +Eager Page Splitting improves the performance of dirty-logging (used +in live migrations) when guest memory is backed by huge-pages. It +avoids splitting huge-pages (into PAGE_SIZE pages) on fault, by doing +it eagerly when enabling dirty logging (with the +KVM_MEM_LOG_DIRTY_PAGES flag for a memory region), or when using +KVM_CLEAR_DIRTY_LOG. + +The chunk size specifies how many pages to break at a time, using a +single allocation for each chunk. Bigger the chunk size, more pages +need to be allocated ahead of time. + +The chunk size needs to be a valid block size. The list of acceptable +block sizes is exposed in KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES as a +64-bit bitmap (each bit describing a block size). The default value is +0, to disable the eager page splitting. + 9. Known KVM API problems ========================= diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7e7e19ef6993..b743198450b3 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -159,6 +159,21 @@ struct kvm_s2_mmu { /* The last vcpu id that ran on each physical CPU */ int __percpu *last_vcpu_ran; +#define KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT 0 + /* + * Memory cache used to split + * KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE worth of huge pages. It + * is used to allocate stage2 page tables while splitting huge + * pages. The choice of KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE + * influences both the capacity of the split page cache, and + * how often KVM reschedules. Be wary of raising CHUNK_SIZE + * too high. + * + * Protected by kvm->slots_lock. + */ + struct kvm_mmu_memory_cache split_page_cache; + uint64_t split_page_chunk_size; + struct kvm_arch *arch; }; diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index c8e0e7d9303b..cbc6971e2cb4 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -92,6 +92,24 @@ static inline bool kvm_level_supports_block_mapping(u32 level) return level >= KVM_PGTABLE_MIN_BLOCK_LEVEL; } +static inline u32 kvm_supported_block_sizes(void) +{ + u32 level = KVM_PGTABLE_MIN_BLOCK_LEVEL; + u32 r = 0; + + for (; level < KVM_PGTABLE_MAX_LEVELS; level++) + r |= BIT(kvm_granule_shift(level)); + + return r; +} + +static inline bool kvm_is_block_size_supported(u64 size) +{ + bool is_power_of_two = IS_ALIGNED(size, size); + + return is_power_of_two && (size & kvm_supported_block_sizes()); +} + /** * struct kvm_pgtable_mm_ops - Memory management callbacks. * @zalloc_page: Allocate a single zeroed memory page. diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 14391826241c..c605626801c4 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -65,6 +65,7 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) { int r; + u64 new_cap; if (cap->flags) return -EINVAL; @@ -89,6 +90,24 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, r = 0; set_bit(KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED, &kvm->arch.flags); break; + case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: + new_cap = cap->args[0]; + + mutex_lock(&kvm->slots_lock); + /* + * To keep things simple, allow changing the chunk + * size only when no memory slots have been created. + */ + if (!kvm_are_all_memslots_empty(kvm)) { + r = -EINVAL; + } else if (new_cap && !kvm_is_block_size_supported(new_cap)) { + r = -EINVAL; + } else { + r = 0; + kvm->arch.mmu.split_page_chunk_size = new_cap; + } + mutex_unlock(&kvm->slots_lock); + break; default: r = -EINVAL; break; @@ -302,6 +321,15 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_ARM_PTRAUTH_GENERIC: r = system_has_full_ptr_auth(); break; + case KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE: + if (kvm) + r = kvm->arch.mmu.split_page_chunk_size; + else + r = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; + break; + case KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES: + r = kvm_supported_block_sizes(); + break; default: r = 0; } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a0d3c773af99..f2d30486f755 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -775,6 +775,10 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t for_each_possible_cpu(cpu) *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; + /* The eager page splitting is disabled by default */ + mmu->split_page_chunk_size = KVM_ARM_EAGER_SPLIT_CHUNK_SIZE_DEFAULT; + mmu->split_page_cache.gfp_zero = __GFP_ZERO; + mmu->pgt = pgt; mmu->pgd_phys = __pa(pgt->pgd); return 0; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 737318b1c1d9..44edee0211fb 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1190,6 +1190,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225 #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226 #define KVM_CAP_COUNTER_OFFSET 227 +#define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 +#define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 8f5a3eb7513fc4deae511ce91af1c2c23874a8a7 Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Wed, 26 Apr 2023 17:23:24 +0000 Subject: KVM: arm64: Add kvm_pgtable_stage2_split() Add a new stage2 function, kvm_pgtable_stage2_split(), for splitting a range of huge pages. This will be used for eager-splitting huge pages into PAGE_SIZE pages. The goal is to avoid having to split huge pages on write-protection faults, and instead use this function to do it ahead of time for large ranges (e.g., all guest memory in 1G chunks at a time). Signed-off-by: Ricardo Koller Reviewed-by: Shaoqin Huang Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20230426172330.1439644-7-ricarkol@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_pgtable.h | 19 +++++++ arch/arm64/kvm/hyp/pgtable.c | 103 +++++++++++++++++++++++++++++++++++ 2 files changed, 122 insertions(+) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index cbc6971e2cb4..850d65f705fa 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -671,6 +671,25 @@ bool kvm_pgtable_stage2_is_young(struct kvm_pgtable *pgt, u64 addr); */ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size); +/** + * kvm_pgtable_stage2_split() - Split a range of huge pages into leaf PTEs pointing + * to PAGE_SIZE guest pages. + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). + * @addr: Intermediate physical address from which to split. + * @size: Size of the range. + * @mc: Cache of pre-allocated and zeroed memory from which to allocate + * page-table pages. + * + * The function tries to split any level 1 or 2 entry that overlaps + * with the input range (given by @addr and @size). + * + * Return: 0 on success, negative error code on failure. Note that + * kvm_pgtable_stage2_split() is best effort: it tries to break as many + * blocks in the input range as allowed by @mc_capacity. + */ +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_mmu_memory_cache *mc); + /** * kvm_pgtable_walk() - Walk a page-table. * @pgt: Page-table structure initialised by kvm_pgtable_*_init(). diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 56edffc02bc6..8b03cd6fed12 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1276,6 +1276,109 @@ kvm_pte_t *kvm_pgtable_stage2_create_unlinked(struct kvm_pgtable *pgt, return pgtable; } +/* + * Get the number of page-tables needed to replace a block with a + * fully populated tree up to the PTE entries. Note that @level is + * interpreted as in "level @level entry". + */ +static int stage2_block_get_nr_page_tables(u32 level) +{ + switch (level) { + case 1: + return PTRS_PER_PTE + 1; + case 2: + return 1; + case 3: + return 0; + default: + WARN_ON_ONCE(level < KVM_PGTABLE_MIN_BLOCK_LEVEL || + level >= KVM_PGTABLE_MAX_LEVELS); + return -EINVAL; + }; +} + +static int stage2_split_walker(const struct kvm_pgtable_visit_ctx *ctx, + enum kvm_pgtable_walk_flags visit) +{ + struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops; + struct kvm_mmu_memory_cache *mc = ctx->arg; + struct kvm_s2_mmu *mmu; + kvm_pte_t pte = ctx->old, new, *childp; + enum kvm_pgtable_prot prot; + u32 level = ctx->level; + bool force_pte; + int nr_pages; + u64 phys; + + /* No huge-pages exist at the last level */ + if (level == KVM_PGTABLE_MAX_LEVELS - 1) + return 0; + + /* We only split valid block mappings */ + if (!kvm_pte_valid(pte)) + return 0; + + nr_pages = stage2_block_get_nr_page_tables(level); + if (nr_pages < 0) + return nr_pages; + + if (mc->nobjs >= nr_pages) { + /* Build a tree mapped down to the PTE granularity. */ + force_pte = true; + } else { + /* + * Don't force PTEs, so create_unlinked() below does + * not populate the tree up to the PTE level. The + * consequence is that the call will require a single + * page of level 2 entries at level 1, or a single + * page of PTEs at level 2. If we are at level 1, the + * PTEs will be created recursively. + */ + force_pte = false; + nr_pages = 1; + } + + if (mc->nobjs < nr_pages) + return -ENOMEM; + + mmu = container_of(mc, struct kvm_s2_mmu, split_page_cache); + phys = kvm_pte_to_phys(pte); + prot = kvm_pgtable_stage2_pte_prot(pte); + + childp = kvm_pgtable_stage2_create_unlinked(mmu->pgt, phys, + level, prot, mc, force_pte); + if (IS_ERR(childp)) + return PTR_ERR(childp); + + if (!stage2_try_break_pte(ctx, mmu)) { + kvm_pgtable_stage2_free_unlinked(mm_ops, childp, level); + mm_ops->put_page(childp); + return -EAGAIN; + } + + /* + * Note, the contents of the page table are guaranteed to be made + * visible before the new PTE is assigned because stage2_make_pte() + * writes the PTE using smp_store_release(). + */ + new = kvm_init_table_pte(childp, mm_ops); + stage2_make_pte(ctx, new); + dsb(ishst); + return 0; +} + +int kvm_pgtable_stage2_split(struct kvm_pgtable *pgt, u64 addr, u64 size, + struct kvm_mmu_memory_cache *mc) +{ + struct kvm_pgtable_walker walker = { + .cb = stage2_split_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = mc, + }; + + return kvm_pgtable_walk(pgt, addr, size, &walker); +} + int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops, enum kvm_pgtable_stage2_flags flags, -- cgit v1.2.3 From 6bd92b9d8b02a67d67b6d7351ad5c81321a02017 Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Wed, 26 Apr 2023 17:23:25 +0000 Subject: KVM: arm64: Refactor kvm_arch_commit_memory_region() Refactor kvm_arch_commit_memory_region() as a preparation for a future commit to look cleaner and more understandable. Also, it looks more like its x86 counterpart (in kvm_mmu_slot_apply_flags()). Signed-off-by: Ricardo Koller Reviewed-by: Shaoqin Huang Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20230426172330.1439644-8-ricarkol@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmu.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index f2d30486f755..8df15a174449 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1794,20 +1794,32 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, const struct kvm_memory_slot *new, enum kvm_mr_change change) { + bool log_dirty_pages = new && new->flags & KVM_MEM_LOG_DIRTY_PAGES; + /* * At this point memslot has been committed and there is an * allocated dirty_bitmap[], dirty pages will be tracked while the * memory slot is write protected. */ - if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) { + if (log_dirty_pages) { + + if (change == KVM_MR_DELETE) + return; + /* - * If we're with initial-all-set, we don't need to write - * protect any pages because they're all reported as dirty. - * Huge pages and normal pages will be write protect gradually. + * Pages are write-protected on either of these two + * cases: + * + * 1. with initial-all-set: gradually with CLEAR ioctls, */ - if (!kvm_dirty_log_manual_protect_and_init_set(kvm)) { - kvm_mmu_wp_memory_region(kvm, new->id); - } + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + return; + /* + * or + * 2. without initial-all-set: all in one shot when + * enabling dirty logging. + */ + kvm_mmu_wp_memory_region(kvm, new->id); } } -- cgit v1.2.3 From ce2b60223800c801b4b519c07aff3aa9c75c2b6d Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Wed, 26 Apr 2023 17:23:26 +0000 Subject: KVM: arm64: Add kvm_uninit_stage2_mmu() Add kvm_uninit_stage2_mmu() and move kvm_free_stage2_pgd() into it. A future commit will add some more things to do inside of kvm_uninit_stage2_mmu(). Signed-off-by: Ricardo Koller Reviewed-by: Shaoqin Huang Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20230426172330.1439644-9-ricarkol@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_mmu.h | 1 + arch/arm64/kvm/mmu.c | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 27e63c111f78..20c50e00496d 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -172,6 +172,7 @@ void __init free_hyp_pgds(void); void stage2_unmap_vm(struct kvm *kvm); int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type); +void kvm_uninit_stage2_mmu(struct kvm *kvm); void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu); int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t pa, unsigned long size, bool writable); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 8df15a174449..d3fb35a002f9 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -790,6 +790,11 @@ out_free_pgtable: return err; } +void kvm_uninit_stage2_mmu(struct kvm *kvm) +{ + kvm_free_stage2_pgd(&kvm->arch.mmu); +} + static void stage2_unmap_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) { @@ -1893,7 +1898,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) void kvm_arch_flush_shadow_all(struct kvm *kvm) { - kvm_free_stage2_pgd(&kvm->arch.mmu); + kvm_uninit_stage2_mmu(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, -- cgit v1.2.3 From e7bf7a490c68b0b64bc05aa0a4f09f6044037db1 Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Wed, 26 Apr 2023 17:23:27 +0000 Subject: KVM: arm64: Split huge pages when dirty logging is enabled Split huge pages eagerly when enabling dirty logging. The goal is to avoid doing it while faulting on write-protected pages, which negatively impacts guest performance. A memslot marked for dirty logging is split in 1GB pieces at a time. This is in order to release the mmu_lock and give other kernel threads the opportunity to run, and also in order to allocate enough pages to split a 1GB range worth of huge pages (or a single 1GB huge page). Note that these page allocations can fail, so eager page splitting is best-effort. This is not a correctness issue though, as huge pages can still be split on write-faults. Eager page splitting only takes effect when the huge page mapping has been existing in the stage-2 page table. Otherwise, the huge page will be mapped to multiple non-huge pages on page fault. The benefits of eager page splitting are the same as in x86, added with commit a3fe5dbda0a4 ("KVM: x86/mmu: Split huge pages mapped by the TDP MMU when dirty logging is enabled"). For example, when running dirty_log_perf_test with 64 virtual CPUs (Ampere Altra), 1GB per vCPU, 50% reads, and 2MB HugeTLB memory, the time it takes vCPUs to access all of their memory after dirty logging is enabled decreased by 44% from 2.58s to 1.42s. Signed-off-by: Ricardo Koller Reviewed-by: Shaoqin Huang Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20230426172330.1439644-10-ricarkol@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmu.c | 127 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 123 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index d3fb35a002f9..a36a01426b59 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -31,14 +31,21 @@ static phys_addr_t __ro_after_init hyp_idmap_vector; static unsigned long __ro_after_init io_map_base; -static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) +static phys_addr_t __stage2_range_addr_end(phys_addr_t addr, phys_addr_t end, + phys_addr_t size) { - phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); phys_addr_t boundary = ALIGN_DOWN(addr + size, size); return (boundary - 1 < end - 1) ? boundary : end; } +static phys_addr_t stage2_range_addr_end(phys_addr_t addr, phys_addr_t end) +{ + phys_addr_t size = kvm_granule_size(KVM_PGTABLE_MIN_BLOCK_LEVEL); + + return __stage2_range_addr_end(addr, end, size); +} + /* * Release kvm_mmu_lock periodically if the memory region is large. Otherwise, * we may see kernel panics with CONFIG_DETECT_HUNG_TASK, @@ -75,6 +82,79 @@ static int stage2_apply_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, #define stage2_apply_range_resched(mmu, addr, end, fn) \ stage2_apply_range(mmu, addr, end, fn, true) +/* + * Get the maximum number of page-tables pages needed to split a range + * of blocks into PAGE_SIZE PTEs. It assumes the range is already + * mapped at level 2, or at level 1 if allowed. + */ +static int kvm_mmu_split_nr_page_tables(u64 range) +{ + int n = 0; + + if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2) + n += DIV_ROUND_UP_ULL(range, PUD_SIZE); + n += DIV_ROUND_UP_ULL(range, PMD_SIZE); + return n; +} + +static bool need_split_memcache_topup_or_resched(struct kvm *kvm) +{ + struct kvm_mmu_memory_cache *cache; + u64 chunk_size, min; + + if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) + return true; + + chunk_size = kvm->arch.mmu.split_page_chunk_size; + min = kvm_mmu_split_nr_page_tables(chunk_size); + cache = &kvm->arch.mmu.split_page_cache; + return kvm_mmu_memory_cache_nr_free_objects(cache) < min; +} + +static int kvm_mmu_split_huge_pages(struct kvm *kvm, phys_addr_t addr, + phys_addr_t end) +{ + struct kvm_mmu_memory_cache *cache; + struct kvm_pgtable *pgt; + int ret, cache_capacity; + u64 next, chunk_size; + + lockdep_assert_held_write(&kvm->mmu_lock); + + chunk_size = kvm->arch.mmu.split_page_chunk_size; + cache_capacity = kvm_mmu_split_nr_page_tables(chunk_size); + + if (chunk_size == 0) + return 0; + + cache = &kvm->arch.mmu.split_page_cache; + + do { + if (need_split_memcache_topup_or_resched(kvm)) { + write_unlock(&kvm->mmu_lock); + cond_resched(); + /* Eager page splitting is best-effort. */ + ret = __kvm_mmu_topup_memory_cache(cache, + cache_capacity, + cache_capacity); + write_lock(&kvm->mmu_lock); + if (ret) + break; + } + + pgt = kvm->arch.mmu.pgt; + if (!pgt) + return -EINVAL; + + next = __stage2_range_addr_end(addr, end, chunk_size); + ret = kvm_pgtable_stage2_split(pgt, addr, next - addr, cache); + if (ret) + break; + } while (addr = next, addr != end); + + return ret; +} + static bool memslot_is_logging(struct kvm_memory_slot *memslot) { return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY); @@ -793,6 +873,7 @@ out_free_pgtable: void kvm_uninit_stage2_mmu(struct kvm *kvm) { kvm_free_stage2_pgd(&kvm->arch.mmu); + kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); } static void stage2_unmap_memslot(struct kvm *kvm, @@ -1019,6 +1100,34 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, stage2_wp_range(&kvm->arch.mmu, start, end); } +/** + * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE + * pages for memory slot + * @kvm: The KVM pointer + * @slot: The memory slot to split + * + * Acquires kvm->mmu_lock. Called with kvm->slots_lock mutex acquired, + * serializing operations for VM memory regions. + */ +static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + phys_addr_t start, end; + + lockdep_assert_held(&kvm->slots_lock); + + slots = kvm_memslots(kvm); + memslot = id_to_memslot(slots, slot); + + start = memslot->base_gfn << PAGE_SHIFT; + end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; + + write_lock(&kvm->mmu_lock); + kvm_mmu_split_huge_pages(kvm, start, end); + write_unlock(&kvm->mmu_lock); +} + /* * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected * dirty pages. @@ -1812,8 +1921,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, return; /* - * Pages are write-protected on either of these two - * cases: + * Huge and normal pages are write-protected and split + * on either of these two cases: * * 1. with initial-all-set: gradually with CLEAR ioctls, */ @@ -1825,6 +1934,16 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * enabling dirty logging. */ kvm_mmu_wp_memory_region(kvm, new->id); + kvm_mmu_split_memory_region(kvm, new->id); + } else { + /* + * Free any leftovers from the eager page splitting cache. Do + * this when deleting, moving, disabling dirty logging, or + * creating the memslot (a nop). Doing it for deletes makes + * sure we don't leak memory, and there's no need to keep the + * cache around for any of the other cases. + */ + kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache); } } -- cgit v1.2.3 From 3005f6f29447d4f397c2ba67119fdea222ee51d3 Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Wed, 26 Apr 2023 17:23:28 +0000 Subject: KVM: arm64: Open-code kvm_mmu_write_protect_pt_masked() Move the functionality of kvm_mmu_write_protect_pt_masked() into its caller, kvm_arch_mmu_enable_log_dirty_pt_masked(). This will be used in a subsequent commit in order to share some of the code in kvm_arch_mmu_enable_log_dirty_pt_masked(). Signed-off-by: Ricardo Koller Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20230426172330.1439644-11-ricarkol@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmu.c | 42 +++++++++++++++--------------------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index a36a01426b59..272558f54101 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1078,28 +1078,6 @@ static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) kvm_flush_remote_tlbs(kvm); } -/** - * kvm_mmu_write_protect_pt_masked() - write protect dirty pages - * @kvm: The KVM pointer - * @slot: The memory slot associated with mask - * @gfn_offset: The gfn offset in memory slot - * @mask: The mask of dirty pages at offset 'gfn_offset' in this memory - * slot to be write protected - * - * Walks bits set in mask write protects the associated pte's. Caller must - * acquire kvm_mmu_lock. - */ -static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, - struct kvm_memory_slot *slot, - gfn_t gfn_offset, unsigned long mask) -{ - phys_addr_t base_gfn = slot->base_gfn + gfn_offset; - phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; - phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; - - stage2_wp_range(&kvm->arch.mmu, start, end); -} - /** * kvm_mmu_split_memory_region() - split the stage 2 blocks into PAGE_SIZE * pages for memory slot @@ -1129,17 +1107,27 @@ static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) } /* - * kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected - * dirty pages. + * kvm_arch_mmu_enable_log_dirty_pt_masked() - enable dirty logging for selected pages. + * @kvm: The KVM pointer + * @slot: The memory slot associated with mask + * @gfn_offset: The gfn offset in memory slot + * @mask: The mask of pages at offset 'gfn_offset' in this memory + * slot to enable dirty logging on * - * It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to - * enable dirty logging for them. + * Writes protect selected pages to enable dirty logging for them. Caller must + * acquire kvm->mmu_lock. */ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn_offset, unsigned long mask) { - kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); + phys_addr_t base_gfn = slot->base_gfn + gfn_offset; + phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; + phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; + + lockdep_assert_held_write(&kvm->mmu_lock); + + stage2_wp_range(&kvm->arch.mmu, start, end); } static void kvm_send_hwpoison_signal(unsigned long address, short lsb) -- cgit v1.2.3 From 6acf51666d039857421b6f8a3660301c82649fa5 Mon Sep 17 00:00:00 2001 From: Ricardo Koller Date: Wed, 26 Apr 2023 17:23:29 +0000 Subject: KVM: arm64: Split huge pages during KVM_CLEAR_DIRTY_LOG This is the arm64 counterpart of commit cb00a70bd4b7 ("KVM: x86/mmu: Split huge pages mapped by the TDP MMU during KVM_CLEAR_DIRTY_LOG"), which has the benefit of splitting the cost of splitting a memslot across multiple ioctls. Split huge pages on the range specified using KVM_CLEAR_DIRTY_LOG. And do not split when enabling dirty logging if KVM_DIRTY_LOG_INITIALLY_SET is set. Signed-off-by: Ricardo Koller Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20230426172330.1439644-12-ricarkol@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmu.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 272558f54101..3386bd28d267 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1114,8 +1114,8 @@ static void kvm_mmu_split_memory_region(struct kvm *kvm, int slot) * @mask: The mask of pages at offset 'gfn_offset' in this memory * slot to enable dirty logging on * - * Writes protect selected pages to enable dirty logging for them. Caller must - * acquire kvm->mmu_lock. + * Writes protect selected pages to enable dirty logging, and then + * splits them to PAGE_SIZE. Caller must acquire kvm->mmu_lock. */ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, @@ -1128,6 +1128,17 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, lockdep_assert_held_write(&kvm->mmu_lock); stage2_wp_range(&kvm->arch.mmu, start, end); + + /* + * Eager-splitting is done when manual-protect is set. We + * also check for initially-all-set because we can avoid + * eager-splitting if initially-all-set is false. + * Initially-all-set equal false implies that huge-pages were + * already split when enabling dirty logging: no need to do it + * again. + */ + if (kvm_dirty_log_manual_protect_and_init_set(kvm)) + kvm_mmu_split_huge_pages(kvm, start, end); } static void kvm_send_hwpoison_signal(unsigned long address, short lsb) -- cgit v1.2.3 From a12ab1378a88b38f3af8b1a899b7d8526324ba6f Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 26 Apr 2023 17:23:30 +0000 Subject: KVM: arm64: Use local TLBI on permission relaxation Broadcast TLB invalidations (TLBIs) targeting the Inner Shareable Domain are usually less performant than their non-shareable variant. In particular, we observed some implementations that take millliseconds to complete parallel broadcasted TLBIs. It's safe to use non-shareable TLBIs when relaxing permissions on a PTE in the KVM case. According to the ARM ARM (0487I.a) section D8.13.1 "Using break-before-make when updating translation table entries", permission relaxation does not need break-before-make. Specifically, R_WHZWS states that these are the only changes that require a break-before-make sequence: changes of memory type (Shareability or Cacheability), address changes, or changing the block size. Signed-off-by: Marc Zyngier Signed-off-by: Ricardo Koller Reviewed-by: Gavin Shan Link: https://lore.kernel.org/r/20230426172330.1439644-13-ricarkol@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_asm.h | 4 +++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 10 ++++++++ arch/arm64/kvm/hyp/nvhe/tlb.c | 52 ++++++++++++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/pgtable.c | 2 +- arch/arm64/kvm/hyp/vhe/tlb.c | 32 +++++++++++++++++++++++ 5 files changed, 99 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 43c3bc0f9544..bb17b2ead4c7 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -68,6 +68,7 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run, __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context, __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa, + __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa_nsh, __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid, __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context, __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, @@ -225,6 +226,9 @@ extern void __kvm_flush_vm_context(void); extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu); extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa, int level); +extern void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, + phys_addr_t ipa, + int level); extern void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu); extern void __kvm_timer_set_cntvoff(u64 cntvoff); diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 728e01d4536b..c6bf1e49ca93 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -125,6 +125,15 @@ static void handle___kvm_tlb_flush_vmid_ipa(struct kvm_cpu_context *host_ctxt) __kvm_tlb_flush_vmid_ipa(kern_hyp_va(mmu), ipa, level); } +static void handle___kvm_tlb_flush_vmid_ipa_nsh(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); + DECLARE_REG(phys_addr_t, ipa, host_ctxt, 2); + DECLARE_REG(int, level, host_ctxt, 3); + + __kvm_tlb_flush_vmid_ipa_nsh(kern_hyp_va(mmu), ipa, level); +} + static void handle___kvm_tlb_flush_vmid(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm_s2_mmu *, mmu, host_ctxt, 1); @@ -315,6 +324,7 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_vcpu_run), HANDLE_FUNC(__kvm_flush_vm_context), HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa), + HANDLE_FUNC(__kvm_tlb_flush_vmid_ipa_nsh), HANDLE_FUNC(__kvm_tlb_flush_vmid), HANDLE_FUNC(__kvm_flush_cpu_context), HANDLE_FUNC(__kvm_timer_set_cntvoff), diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 978179133f4b..b9991bbd8e3f 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -130,6 +130,58 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, __tlb_switch_to_host(&cxt); } +void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, + phys_addr_t ipa, int level) +{ + struct tlb_inv_context cxt; + + /* Switch to requested VMID */ + __tlb_switch_to_guest(mmu, &cxt, true); + + /* + * We could do so much better if we had the VA as well. + * Instead, we invalidate Stage-2 for this IPA, and the + * whole of Stage-1. Weep... + */ + ipa >>= 12; + __tlbi_level(ipas2e1, ipa, level); + + /* + * We have to ensure completion of the invalidation at Stage-2, + * since a table walk on another CPU could refill a TLB with a + * complete (S1 + S2) walk based on the old Stage-2 mapping if + * the Stage-1 invalidation happened first. + */ + dsb(nsh); + __tlbi(vmalle1); + dsb(nsh); + isb(); + + /* + * If the host is running at EL1 and we have a VPIPT I-cache, + * then we must perform I-cache maintenance at EL2 in order for + * it to have an effect on the guest. Since the guest cannot hit + * I-cache lines allocated with a different VMID, we don't need + * to worry about junk out of guest reset (we nuke the I-cache on + * VMID rollover), but we do need to be careful when remapping + * executable pages for the same guest. This can happen when KSM + * takes a CoW fault on an executable page, copies the page into + * a page that was previously mapped in the guest and then needs + * to invalidate the guest view of the I-cache for that page + * from EL1. To solve this, we invalidate the entire I-cache when + * unmapping a page from a guest if we have a VPIPT I-cache but + * the host is running at EL1. As above, we could do better if + * we had the VA. + * + * The moral of this story is: if you have a VPIPT I-cache, then + * you should be running with VHE enabled. + */ + if (icache_is_vpipt()) + icache_inval_all_pou(); + + __tlb_switch_to_host(&cxt); +} + void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 8b03cd6fed12..2dd7e4a2da73 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -1189,7 +1189,7 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED); if (!ret) - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, pgt->mmu, addr, level); + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa_nsh, pgt->mmu, addr, level); return ret; } diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index 24cef9b87f9e..e69da550cdc5 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -111,6 +111,38 @@ void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, __tlb_switch_to_host(&cxt); } +void __kvm_tlb_flush_vmid_ipa_nsh(struct kvm_s2_mmu *mmu, + phys_addr_t ipa, int level) +{ + struct tlb_inv_context cxt; + + dsb(nshst); + + /* Switch to requested VMID */ + __tlb_switch_to_guest(mmu, &cxt); + + /* + * We could do so much better if we had the VA as well. + * Instead, we invalidate Stage-2 for this IPA, and the + * whole of Stage-1. Weep... + */ + ipa >>= 12; + __tlbi_level(ipas2e1, ipa, level); + + /* + * We have to ensure completion of the invalidation at Stage-2, + * since a table walk on another CPU could refill a TLB with a + * complete (S1 + S2) walk based on the old Stage-2 mapping if + * the Stage-1 invalidation happened first. + */ + dsb(nsh); + __tlbi(vmalle1); + dsb(nsh); + isb(); + + __tlb_switch_to_host(&cxt); +} + void __kvm_tlb_flush_vmid(struct kvm_s2_mmu *mmu) { struct tlb_inv_context cxt; -- cgit v1.2.3 From 14c3555f055dd0819381148bf5b569cc5ba9ddfb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 17 May 2023 22:23:39 +0200 Subject: arm64: kvm: avoid overflow in integer division The newly added kvm_mmu_split_nr_page_tables() function uses DIV_ROUND_DOWN_ULL() to divide 64-bit addresses, but this requires a 32-bit divisior, and PUD_SIZE may exceed that when 64KB pages are used: arch/arm64/kvm/mmu.c: In function 'kvm_mmu_split_nr_page_tables': include/linux/math.h:42:64: error: conversion from 'long unsigned int' to 'u32' {aka 'unsigned int'} changes value from '68719476736' to '0' [-Werror=overflow] 42 | DIV_ROUND_DOWN_ULL((unsigned long long)(ll) + (d) - 1, (d)) | ^~~ include/linux/math.h:39:47: note: in definition of macro 'DIV_ROUND_DOWN_ULL' 39 | #define DIV_ROUND_DOWN_ULL(ll, d) div_u64(ll, d) | ^ arch/arm64/kvm/mmu.c:95:22: note: in expansion of macro 'DIV_ROUND_UP_ULL' 95 | n += DIV_ROUND_UP_ULL(range, PUD_SIZE); | ^~~~~~~~~~~~~~~~ Since this code is only used on 64-bit targets, DIV_ROUND_UP() can deal with this more easily, as it already takes 64-bit arguments. Fixes: e7bf7a490c68 ("KVM: arm64: Split huge pages when dirty logging is enabled") Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20230517202352.793673-1-arnd@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 3386bd28d267..6db9ef288ec3 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -92,8 +92,8 @@ static int kvm_mmu_split_nr_page_tables(u64 range) int n = 0; if (KVM_PGTABLE_MIN_BLOCK_LEVEL < 2) - n += DIV_ROUND_UP_ULL(range, PUD_SIZE); - n += DIV_ROUND_UP_ULL(range, PMD_SIZE); + n += DIV_ROUND_UP(range, PUD_SIZE); + n += DIV_ROUND_UP(range, PMD_SIZE); return n; } -- cgit v1.2.3 From c876c3f182a5cc16711962efdd9bf56b9fb84317 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Mon, 15 May 2023 18:00:16 +0100 Subject: KVM: arm64: Relax trapping of CTR_EL0 when FEAT_EVT is available CTR_EL0 can often be used in userspace, and it would be nice if KVM didn't have to emulate it unnecessarily. While it isn't possible to trap the cache configuration registers independently from CTR_EL0 in the base ARMv8.0 architecture, FEAT_EVT allows these cache configuration registers (CCSIDR_EL1, CCSIDR2_EL1, CLIDR_EL1 and CSSELR_EL1) to be trapped independently by setting HCR_EL2.TID4. Switch to using TID4 instead of TID2 in the cases where FEAT_EVT is available *and* that KVM doesn't need to sanitise CTR_EL0 to paper over mismatched cache configurations. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230515170016.965378-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_arm.h | 3 ++- arch/arm64/include/asm/kvm_emulate.h | 6 ++++++ arch/arm64/kernel/cpufeature.c | 11 +++++++++++ arch/arm64/tools/cpucaps | 1 + 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index baef29fcbeee..fe25a88f98a5 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -18,6 +18,7 @@ #define HCR_ATA_SHIFT 56 #define HCR_ATA (UL(1) << HCR_ATA_SHIFT) #define HCR_AMVOFFEN (UL(1) << 51) +#define HCR_TID4 (UL(1) << 49) #define HCR_FIEN (UL(1) << 47) #define HCR_FWB (UL(1) << 46) #define HCR_API (UL(1) << 41) @@ -86,7 +87,7 @@ #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ HCR_BSU_IS | HCR_FB | HCR_TACR | \ HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \ - HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3 | HCR_TID2) + HCR_FMO | HCR_IMO | HCR_PTW | HCR_TID3) #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF) #define HCR_HOST_NVHE_FLAGS (HCR_RW | HCR_API | HCR_APK | HCR_ATA) #define HCR_HOST_NVHE_PROTECTED_FLAGS (HCR_HOST_NVHE_FLAGS | HCR_TSC) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index b31b32ecbe2d..35bffdec0214 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -95,6 +95,12 @@ static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) vcpu->arch.hcr_el2 |= HCR_TVM; } + if (cpus_have_final_cap(ARM64_HAS_EVT) && + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE)) + vcpu->arch.hcr_el2 |= HCR_TID4; + else + vcpu->arch.hcr_el2 |= HCR_TID2; + if (vcpu_el1_is_32bit(vcpu)) vcpu->arch.hcr_el2 &= ~HCR_RW; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 7d7128c65161..4a2ab3f366de 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -2641,6 +2641,17 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_dit, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, DIT, IMP) }, + { + .desc = "Enhanced Virtualization Traps", + .capability = ARM64_HAS_EVT, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .sys_reg = SYS_ID_AA64MMFR2_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64MMFR2_EL1_EVT_SHIFT, + .field_width = 4, + .min_field_value = ID_AA64MMFR2_EL1_EVT_IMP, + .matches = has_cpuid_feature, + }, {}, }; diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 40ba95472594..606d1184a5e9 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -25,6 +25,7 @@ HAS_E0PD HAS_ECV HAS_ECV_CNTPOFF HAS_EPAN +HAS_EVT HAS_GENERIC_AUTH HAS_GENERIC_AUTH_ARCH_QARMA3 HAS_GENERIC_AUTH_ARCH_QARMA5 -- cgit v1.2.3 From 76021e96d781e1fe8de02ebe52f3eb276716b6b0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Sat, 11 Feb 2023 01:07:19 +0000 Subject: KVM: Protect vcpu->pid dereference via debugfs with RCU Wrap the vcpu->pid dereference in the debugfs hook vcpu_get_pid() with proper RCU read (un)lock. Unlike the code in kvm_vcpu_ioctl(), vcpu_get_pid() is not a simple access; the pid pointer is passed to pid_nr() and fully dereferenced if the pointer is non-NULL. Failure to acquire RCU could result in use-after-free of the old pid if a different task invokes KVM_RUN and puts the last reference to the old vcpu->pid between vcpu_get_pid() reading the pointer and dereferencing it in pid_nr(). Fixes: e36de87d34a7 ("KVM: debugfs: expose pid of vcpu threads") Link: https://lore.kernel.org/r/20230211010719.982919-1-seanjc@google.com Signed-off-by: Sean Christopherson --- virt/kvm/kvm_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 479802a892d4..6a658f30af91 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3870,7 +3870,10 @@ static int create_vcpu_fd(struct kvm_vcpu *vcpu) static int vcpu_get_pid(void *data, u64 *val) { struct kvm_vcpu *vcpu = data; - *val = pid_nr(rcu_access_pointer(vcpu->pid)); + + rcu_read_lock(); + *val = pid_nr(rcu_dereference(vcpu->pid)); + rcu_read_unlock(); return 0; } -- cgit v1.2.3 From 12ced095956a517dc0c8c8f9264e29fd67124302 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 25 Apr 2023 13:39:32 +0200 Subject: KVM: x86/mmu: Add comment on try_cmpxchg64 usage in tdp_mmu_set_spte_atomic Commit aee98a6838d5 ("KVM: x86/mmu: Use try_cmpxchg64 in tdp_mmu_set_spte_atomic") removed the comment that iter->old_spte is updated when different logical CPU modifies the page table entry. Although this is what try_cmpxchg does implicitly, it won't hurt if this fact is explicitly mentioned in a restored comment. Cc: Paolo Bonzini Cc: Sean Christopherson Cc: David Matlack Signed-off-by: Uros Bizjak Link: https://lore.kernel.org/r/20230425113932.3148-1-ubizjak@gmail.com [sean: extend comment above try_cmpxchg64()] Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/tdp_mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 08340219c35a..512163d52194 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -592,7 +592,10 @@ static inline int tdp_mmu_set_spte_atomic(struct kvm *kvm, /* * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and - * does not hold the mmu_lock. + * does not hold the mmu_lock. On failure, i.e. if a different logical + * CPU modified the SPTE, try_cmpxchg64() updates iter->old_spte with + * the current value, so the caller operates on fresh data, e.g. if it + * retries tdp_mmu_set_spte_atomic() */ if (!try_cmpxchg64(sptep, &iter->old_spte, new_spte)) return -EBUSY; -- cgit v1.2.3 From 762b33eb90c9cc9227bf035cf2cf6f1458afecdb Mon Sep 17 00:00:00 2001 From: Like Xu Date: Tue, 23 May 2023 11:29:47 +0800 Subject: KVM: x86/mmu: Assert on @mmu in the __kvm_mmu_invalidate_addr() Add assertion to track that "mmu == vcpu->arch.mmu" is always true in the context of __kvm_mmu_invalidate_addr(). for_each_shadow_entry_using_root() and kvm_sync_spte() operate on vcpu->arch.mmu, but the only reason that doesn't cause explosions is because handle_invept() frees roots instead of doing a manual invalidation. As of now, there are no major roadblocks to switching INVEPT emulation over to use kvm_mmu_invalidate_addr(). Suggested-by: Sean Christopherson Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20230523032947.60041-1-likexu@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c8961f45e3b1..258f12235874 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5797,6 +5797,14 @@ static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu vcpu_clear_mmio_info(vcpu, addr); + /* + * Walking and synchronizing SPTEs both assume they are operating in + * the context of the current MMU, and would need to be reworked if + * this is ever used to sync the guest_mmu, e.g. to emulate INVEPT. + */ + if (WARN_ON_ONCE(mmu != vcpu->arch.mmu)) + return; + if (!VALID_PAGE(root_hpa)) return; -- cgit v1.2.3 From 334006b78ca84b7619d7dd313d5b6b39007e9528 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Mon, 10 Apr 2023 08:50:16 -0400 Subject: KVM: VMX: Use kvm_read_cr4() to get cr4 value Directly use vcpu->arch.cr4 is not recommended since it gets stale value if the cr4 is not available. Use kvm_read_cr4() instead to ensure correct value. Signed-off-by: Xiaoyao Li Link: https://lore.kernel.org/r/20230410125017.1305238-2-xiaoyao.li@intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 44fb619803b8..4335d4b9cb2e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3387,7 +3387,7 @@ static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - unsigned long old_cr4 = vcpu->arch.cr4; + unsigned long old_cr4 = kvm_read_cr4(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); /* * Pass through host's Machine Check Enable value to hw_cr4, which -- cgit v1.2.3 From 82dc11b82b001ca8024663b3e7990b59f600841f Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Mon, 10 Apr 2023 08:50:17 -0400 Subject: KVM: VMX: Move the comment of CR4.MCE handling right above the code Move the comment about keeping the hosts CR4.MCE loaded in hardware above the code that actually modifies the hardware CR4 value. No functional change indented. Signed-off-by: Xiaoyao Li Link: https://lore.kernel.org/r/20230410125017.1305238-3-xiaoyao.li@intel.com [sean: elaborate in changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4335d4b9cb2e..275542d24375 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3389,13 +3389,13 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long old_cr4 = kvm_read_cr4(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long hw_cr4; + /* * Pass through host's Machine Check Enable value to hw_cr4, which * is in force while we are in guest mode. Do not let guests control * this bit, even if host CR4.MCE == 0. */ - unsigned long hw_cr4; - hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE); if (is_unrestricted_guest(vcpu)) hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST; -- cgit v1.2.3 From 3243b93c16d90c2d63cf30655276ffdf5bb65bf7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Apr 2023 16:19:13 -0700 Subject: KVM: VMX: Treat UMIP as emulated if and only if the host doesn't have UMIP Advertise UMIP as emulated if and only if the host doesn't natively support UMIP, otherwise vmx_umip_emulated() is misleading when the host _does_ support UMIP. Of the four users of vmx_umip_emulated(), two already check for native support, and the logic in vmx_set_cpu_caps() is relevant if and only if UMIP isn't natively supported as UMIP is set in KVM's caps by kvm_set_cpu_caps() when UMIP is present in hardware. That leaves KVM's stuffing of X86_CR4_UMIP into the default cr4_fixed1 value enumerated for nested VMX. In that case, checking for (lack of) host support is actually a bug fix of sorts, as enumerating UMIP support based solely on descriptor table exiting works only because KVM doesn't sanity check MSR_IA32_VMX_CR4_FIXED1. E.g. if a (very theoretical) host supported UMIP in hardware but didn't allow UMIP+VMX, KVM would advertise UMIP but not actually emulate UMIP. Of course, KVM would explode long before it could run a nested VM on said theoretical CPU, as KVM doesn't modify host CR4 when enabling VMX, i.e. would load an "illegal" value into vmcs.HOST_CR4. Reported-by: Robert Hoo Link: https://lore.kernel.org/all/20230310125718.1442088-2-robert.hu@intel.com Link: https://lore.kernel.org/r/20230413231914.1482782-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/capabilities.h | 4 ++-- arch/x86/kvm/vmx/nested.c | 3 +-- arch/x86/kvm/vmx/vmx.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 45162c1bcd8f..d0abee35d7ba 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -152,8 +152,8 @@ static inline bool cpu_has_vmx_ept(void) static inline bool vmx_umip_emulated(void) { - return vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_DESC; + return !boot_cpu_has(X86_FEATURE_UMIP) && + (vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_DESC); } static inline bool cpu_has_vmx_rdtscp(void) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e35cf0bd0df9..368a43e3b40e 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2328,8 +2328,7 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0 * Preset *DT exiting when emulating UMIP, so that vmx_set_cr4() * will not have to rewrite the controls just for this bit. */ - if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated() && - (vmcs12->guest_cr4 & X86_CR4_UMIP)) + if (vmx_umip_emulated() && (vmcs12->guest_cr4 & X86_CR4_UMIP)) exec_control |= SECONDARY_EXEC_DESC; if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 275542d24375..98f966635c88 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3404,7 +3404,7 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) else hw_cr4 |= KVM_PMODE_VM_CR4_ALWAYS_ON; - if (!boot_cpu_has(X86_FEATURE_UMIP) && vmx_umip_emulated()) { + if (vmx_umip_emulated()) { if (cr4 & X86_CR4_UMIP) { secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC); hw_cr4 &= ~X86_CR4_UMIP; -- cgit v1.2.3 From 023cfa6fc200fc179dbf8e1857cc7140fa1466f9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 13 Apr 2023 16:19:14 -0700 Subject: KVM: VMX: Use proper accessor to read guest CR4 in handle_desc() Use kvm_is_cr4_bit_set() to read guest CR4.UMIP when sanity checking that a descriptor table VM-Exit occurs if and only if guest.CR4.UMIP=1. UMIP can't be guest-owned, i.e. using kvm_read_cr4_bits() to decache guest- owned bits isn't strictly necessary, but eliminating raw reads of vcpu->arch.cr4 is desirable as it makes it easy to visually audit KVM for correctness. Opportunistically add a compile-time assertion that UMIP isn't guest-owned as letting the guest own UMIP isn't compatible with emulation (or any CR4 bit that is emulated by KVM). Opportunistically change the WARN_ON() to a ONCE variant. When the WARN fires, it fires _a lot_, and spamming the kernel logs ends up doing more harm than whatever led to KVM's unnecessary emulation. Reported-by: Robert Hoo Link: https://lore.kernel.org/all/20230310125718.1442088-4-robert.hu@intel.com Link: https://lore.kernel.org/r/20230413231914.1482782-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 98f966635c88..7ecab2118106 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5402,7 +5402,13 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) static int handle_desc(struct kvm_vcpu *vcpu) { - WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); + /* + * UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this + * and other code needs to be updated if UMIP can be guest owned. + */ + BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP); + + WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP)); return kvm_emulate_instruction(vcpu, 0); } -- cgit v1.2.3 From b53d4a27234955810362f91be679afc12e4c3237 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Tue, 30 May 2023 15:08:45 +0000 Subject: KVM: arm64: Use BTI for nvhe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CONFIG_ARM64_BTI_KERNEL compiles the kernel to support ARMv8.5-BTI. However, the nvhe code doesn't make use of it as it doesn't map any pages with Guarded Page(GP) bit. kvm pgtable code is modified to map executable pages with GP bit if BTI is enabled for the kernel. At hyp init, SCTLR_EL2.BT is set to 1 to match EL1 configuration (SCTLR_EL1.BT1) set in bti_enable(). One difference between kernel and nvhe code, is that the kernel maps .text with GP while nvhe maps all the executable pages, this makes nvhe code need to deal with special initialization code coming from other executable sections (.idmap.text). For this we need to add bti instruction at the beginning of __kvm_handle_stub_hvc as it can be called by __host_hvc through branch instruction(br) and unlike SYM_FUNC_START, SYM_CODE_START doesn’t add bti instruction at the beginning, and it can’t be modified to add it as it is used with vector tables. Another solution which is more intrusive is to convert __kvm_handle_stub_hvc to a function and inject “bti jc” instead of “bti c” in SYM_FUNC_START Signed-off-by: Mostafa Saleh Link: https://lore.kernel.org/r/20230530150845.2856828-1-smostafa@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/sysreg.h | 1 + arch/arm64/kvm/hyp/nvhe/hyp-init.S | 12 ++++++++++++ arch/arm64/kvm/hyp/pgtable.c | 7 ++++++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index e72d9aaab6b1..204124ce86c4 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -558,6 +558,7 @@ (BIT(18)) | (BIT(22)) | (BIT(23)) | (BIT(28)) | \ (BIT(29))) +#define SCTLR_EL2_BT (BIT(36)) #ifdef CONFIG_CPU_BIG_ENDIAN #define ENDIAN_SET_EL2 SCTLR_ELx_EE #else diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index a6d67c2bb5ae..f3ee66aa2f9d 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -128,6 +128,13 @@ alternative_if ARM64_HAS_ADDRESS_AUTH SCTLR_ELx_ENDA | SCTLR_ELx_ENDB) orr x0, x0, x1 alternative_else_nop_endif + +#ifdef CONFIG_ARM64_BTI_KERNEL +alternative_if ARM64_BTI + orr x0, x0, #SCTLR_EL2_BT +alternative_else_nop_endif +#endif /* CONFIG_ARM64_BTI_KERNEL */ + msr sctlr_el2, x0 isb @@ -196,6 +203,11 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) SYM_CODE_END(__kvm_hyp_init_cpu) SYM_CODE_START(__kvm_handle_stub_hvc) + /* + * __kvm_handle_stub_hvc called from __host_hvc through branch instruction(br) so + * we need bti j at beginning. + */ + bti j cmp x0, #HVC_SOFT_RESTART b.ne 1f diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 3d61bd3e591d..a79a45fc4047 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -34,7 +34,7 @@ #define KVM_PTE_LEAF_ATTR_LO_S2_SH_IS 3 #define KVM_PTE_LEAF_ATTR_LO_S2_AF BIT(10) -#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) +#define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 50) #define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) @@ -42,6 +42,8 @@ #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) +#define KVM_PTE_LEAF_ATTR_HI_S1_GP BIT(50) + #define KVM_PTE_LEAF_ATTR_S2_PERMS (KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ KVM_PTE_LEAF_ATTR_HI_S2_XN) @@ -371,6 +373,9 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) if (device) return -EINVAL; + + if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) && system_supports_bti()) + attr |= KVM_PTE_LEAF_ATTR_HI_S1_GP; } else { attr |= KVM_PTE_LEAF_ATTR_HI_S1_XN; } -- cgit v1.2.3 From 5efde6d73d58ec1ba6e22cc0cbc89fbdb38e632c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 5 Apr 2023 17:17:24 -0700 Subject: KVM: selftests: Refactor stable TSC check to use TEST_REQUIRE() Refactor the nested TSC scaling test's check on a stable system TSC to use TEST_REQUIRE() to do the heavy lifting when the system doesn't have a stable TSC. Using a helper+TEST_REQUIRE() eliminates the need for gotos and a custom message. Cc: Hao Ge Cc: Vipin Sharma Link: https://lore.kernel.org/r/20230406001724.706668-1-seanjc@google.com Signed-off-by: Sean Christopherson --- .../kvm/x86_64/vmx_nested_tsc_scaling_test.c | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c index fa03c8d1ce4e..e710b6e7fb38 100644 --- a/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86_64/vmx_nested_tsc_scaling_test.c @@ -116,29 +116,21 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_DONE(); } -static void stable_tsc_check_supported(void) +static bool system_has_stable_tsc(void) { + bool tsc_is_stable; FILE *fp; char buf[4]; fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); if (fp == NULL) - goto skip_test; + return false; - if (fgets(buf, sizeof(buf), fp) == NULL) - goto close_fp; + tsc_is_stable = fgets(buf, sizeof(buf), fp) && + !strncmp(buf, "tsc", sizeof(buf)); - if (strncmp(buf, "tsc", sizeof(buf))) - goto close_fp; - - fclose(fp); - return; - -close_fp: fclose(fp); -skip_test: - print_skip("Kernel does not use TSC clocksource - assuming that host TSC is not stable"); - exit(KSFT_SKIP); + return tsc_is_stable; } int main(int argc, char *argv[]) @@ -156,7 +148,7 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL)); - stable_tsc_check_supported(); + TEST_REQUIRE(system_has_stable_tsc()); /* * We set L1's scale factor to be a random number from 2 to 10. -- cgit v1.2.3 From 56f413f2cd373d6ed7c4ecb2e0e3e740cc2fdc8c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 17 Apr 2023 18:53:22 +0100 Subject: KVM: selftests: Fix spelling mistake "miliseconds" -> "milliseconds" There is a spelling mistake in the help for the -p option. Fix it. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20230417175322.53249-1-colin.i.king@gmail.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c index 251794f83719..7f36c32fa760 100644 --- a/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c +++ b/tools/testing/selftests/kvm/x86_64/nx_huge_pages_test.c @@ -226,7 +226,7 @@ static void help(char *name) puts(""); printf("usage: %s [-h] [-p period_ms] [-t token]\n", name); puts(""); - printf(" -p: The NX reclaim period in miliseconds.\n"); + printf(" -p: The NX reclaim period in milliseconds.\n"); printf(" -t: The magic token to indicate environment setup is done.\n"); printf(" -r: The test has reboot permissions and can disable NX huge pages.\n"); puts(""); -- cgit v1.2.3 From ba125de35da5184c5325bef5c4c89f6928ce8875 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 27 Apr 2023 16:11:11 -0400 Subject: KVM: selftests: Setup vcpu_alias only for minor mode test This fixes two things: - Unbreaks MISSING mode test on anonymous memory type - Prefault alias mem before uffd thread creations, otherwise the uffd thread timing will be inaccurate when guest mem size is large, because it'll take prefault time into total time. Signed-off-by: Peter Xu Reviewed-by: James Houghton Link: https://lore.kernel.org/r/20230427201112.2164776-2-peterx@redhat.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/demand_paging_test.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 2439c4043fed..9c18686b4f63 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -128,6 +128,7 @@ static void prefault_mem(void *alias, uint64_t len) static void run_test(enum vm_guest_mode mode, void *arg) { + struct memstress_vcpu_args *vcpu_args; struct test_params *p = arg; struct uffd_desc **uffd_descs = NULL; struct timespec start; @@ -145,24 +146,24 @@ static void run_test(enum vm_guest_mode mode, void *arg) "Failed to allocate buffer for guest data pattern"); memset(guest_data_prototype, 0xAB, demand_paging_size); + if (p->uffd_mode == UFFDIO_REGISTER_MODE_MINOR) { + for (i = 0; i < nr_vcpus; i++) { + vcpu_args = &memstress_args.vcpu_args[i]; + prefault_mem(addr_gpa2alias(vm, vcpu_args->gpa), + vcpu_args->pages * memstress_args.guest_page_size); + } + } + if (p->uffd_mode) { uffd_descs = malloc(nr_vcpus * sizeof(struct uffd_desc *)); TEST_ASSERT(uffd_descs, "Memory allocation failed"); - for (i = 0; i < nr_vcpus; i++) { - struct memstress_vcpu_args *vcpu_args; void *vcpu_hva; - void *vcpu_alias; vcpu_args = &memstress_args.vcpu_args[i]; /* Cache the host addresses of the region */ vcpu_hva = addr_gpa2hva(vm, vcpu_args->gpa); - vcpu_alias = addr_gpa2alias(vm, vcpu_args->gpa); - - prefault_mem(vcpu_alias, - vcpu_args->pages * memstress_args.guest_page_size); - /* * Set up user fault fd to handle demand paging * requests. -- cgit v1.2.3 From 21912a653d7dc9b79f3b7e9884179d7b7d593448 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 27 Apr 2023 16:11:12 -0400 Subject: KVM: selftests: Allow dumping per-vcpu info for uffd threads There's one PER_VCPU_DEBUG in per-vcpu uffd threads but it's never hit. Trigger that when quit in normal ways (kick pollfd[1]), meanwhile fix the number of nanosec calculation. Signed-off-by: Peter Xu Reviewed-by: James Houghton Link: https://lore.kernel.org/r/20230427201112.2164776-3-peterx@redhat.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/userfaultfd_util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/userfaultfd_util.c b/tools/testing/selftests/kvm/lib/userfaultfd_util.c index 92cef20902f1..271f63891581 100644 --- a/tools/testing/selftests/kvm/lib/userfaultfd_util.c +++ b/tools/testing/selftests/kvm/lib/userfaultfd_util.c @@ -70,7 +70,7 @@ static void *uffd_handler_thread_fn(void *arg) r = read(pollfd[1].fd, &tmp_chr, 1); TEST_ASSERT(r == 1, "Error reading pipefd in UFFD thread\n"); - return NULL; + break; } if (!(pollfd[0].revents & POLLIN)) @@ -103,7 +103,7 @@ static void *uffd_handler_thread_fn(void *arg) ts_diff = timespec_elapsed(start); PER_VCPU_DEBUG("userfaulted %ld pages over %ld.%.9lds. (%f/sec)\n", pages, ts_diff.tv_sec, ts_diff.tv_nsec, - pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / 100000000.0)); + pages / ((double)ts_diff.tv_sec + (double)ts_diff.tv_nsec / NSEC_PER_SEC)); return NULL; } -- cgit v1.2.3 From c9d601548603c54919a3b1333c5b972252b4031d Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 7 Mar 2023 21:52:33 +0800 Subject: KVM: allow KVM_BUG/KVM_BUG_ON to handle 64-bit cond Current KVM_BUG and KVM_BUG_ON assume that 'cond' passed from callers is 32-bit as it casts 'cond' to the type of int. This will be wrong if 'cond' provided by a caller is 64-bit, e.g. an error code of 0xc0000d0300000000 will be converted to 0, which is not expected. Improves the implementation by using bool in KVM_BUG and KVM_BUG_ON. 'bool' is preferred to 'int' as __ret is essentially used as a boolean and coding-stytle.rst documents that use of bool is encouraged to improve readability and is often a better option than 'int' for storing boolean values. Fixes: 0b8f11737cff ("KVM: Add infrastructure and macro to mark VM as bugged") Signed-off-by: Wei Wang Reviewed-by: Mingwei Zhang Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20230307135233.54684-1-wei.w.wang@intel.com Signed-off-by: Sean Christopherson --- include/linux/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0e571e973bc2..2dc604af1ac1 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -849,7 +849,7 @@ static inline void kvm_vm_bugged(struct kvm *kvm) #define KVM_BUG(cond, kvm, fmt...) \ ({ \ - int __ret = (cond); \ + bool __ret = !!(cond); \ \ if (WARN_ONCE(__ret && !(kvm)->vm_bugged, fmt)) \ kvm_vm_bugged(kvm); \ @@ -858,7 +858,7 @@ static inline void kvm_vm_bugged(struct kvm *kvm) #define KVM_BUG_ON(cond, kvm) \ ({ \ - int __ret = (cond); \ + bool __ret = !!(cond); \ \ if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged)) \ kvm_vm_bugged(kvm); \ -- cgit v1.2.3 From 0d3518d2f8c3c432270e3eff56e28ae6c0cedfac Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Tue, 23 May 2023 16:56:35 +0000 Subject: KVM: SVM: Remove TSS reloading code after VMEXIT Remove the dedicated post-VMEXIT TSS reloading code now that KVM uses VMLOAD to load host segment state, which includes TSS state. Fixes: e79b91bb3c91 ("KVM: SVM: use vmsave/vmload for saving/restoring additional host state") Reported-by: Venkatesh Srinivas Suggested-by: Jim Mattson Signed-off-by: Mingwei Zhang Link: https://lore.kernel.org/r/20230523165635.4002711-1-mizhang@google.com [sean: massage changelog] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 24 ------------------------ arch/x86/kvm/svm/svm.h | 1 - 2 files changed, 25 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ca32389f3c36..616ea6f55ae5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -244,15 +244,6 @@ static u8 rsm_ins_bytes[] = "\x0f\xaa"; static unsigned long iopm_base; -struct kvm_ldttss_desc { - u16 limit0; - u16 base0; - unsigned base1:8, type:5, dpl:2, p:1; - unsigned limit1:4, zero0:3, g:1, base2:8; - u32 base3; - u32 zero1; -} __attribute__((packed)); - DEFINE_PER_CPU(struct svm_cpu_data, svm_data); /* @@ -588,7 +579,6 @@ static int svm_hardware_enable(void) struct svm_cpu_data *sd; uint64_t efer; - struct desc_struct *gdt; int me = raw_smp_processor_id(); rdmsrl(MSR_EFER, efer); @@ -601,9 +591,6 @@ static int svm_hardware_enable(void) sd->next_asid = sd->max_asid + 1; sd->min_asid = max_sev_asid + 1; - gdt = get_current_gdt_rw(); - sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); - wrmsrl(MSR_EFER, efer | EFER_SVME); wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa); @@ -3457,14 +3444,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) return svm_invoke_exit_handler(vcpu, exit_code); } -static void reload_tss(struct kvm_vcpu *vcpu) -{ - struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); - - sd->tss_desc->type = 9; /* available 32/64-bit TSS */ - load_TR_desc(); -} - static void pre_svm_run(struct kvm_vcpu *vcpu) { struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); @@ -4099,9 +4078,6 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted); - if (!sev_es_guest(vcpu->kvm)) - reload_tss(vcpu); - if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) x86_spec_ctrl_restore_host(svm->virt_spec_ctrl); diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index f44751dd8d5d..18af7e712a5a 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -303,7 +303,6 @@ struct svm_cpu_data { u32 max_asid; u32 next_asid; u32 min_asid; - struct kvm_ldttss_desc *tss_desc; struct page *save_area; unsigned long save_area_pa; -- cgit v1.2.3 From a33ba1bf0dc6082227d1ddf964632e07b53b8971 Mon Sep 17 00:00:00 2001 From: Wenyao Hai Date: Thu, 11 May 2023 16:33:44 -0700 Subject: KVM: VMX: Open code writing vCPU's PAT in VMX's MSR handler Open code setting "vcpu->arch.pat" in vmx_set_msr() instead of bouncing through kvm_set_msr_common() to get to the same code in kvm_mtrr_set_msr(). This aligns VMX with SVM, avoids hiding a very simple operation behind a relatively complicated function call (finding the PAT MSR case in kvm_set_msr_common() is non-trivial), and most importantly, makes it clear that not unwinding the VMCS updates if kvm_set_msr_common() isn't a bug (because kvm_set_msr_common() can never fail for PAT). Opportunistically set vcpu->arch.pat before updating the VMCS info so that a future patch can move the common bits (back) into kvm_set_msr_common() without a functional change. Note, MSR_IA32_CR_PAT is 0x277, and is very subtly handled by case 0x200 ... MSR_IA32_MC0_CTL2 - 1: in kvm_set_msr_common(). Cc: Kai Huang Signed-off-by: Wenyao Hai [sean: massage changelog, hoist setting vcpu->arch.pat up] Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230511233351.635053-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 44fb619803b8..33b8625d3541 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2290,16 +2290,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!kvm_pat_valid(data)) return 1; + vcpu->arch.pat = data; + if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) get_vmcs12(vcpu)->guest_ia32_pat = data; - if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { + if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) vmcs_write64(GUEST_IA32_PAT, data); - vcpu->arch.pat = data; - break; - } - ret = kvm_set_msr_common(vcpu, msr_info); break; case MSR_IA32_MCG_EXT_CTL: if ((!msr_info->host_initiated && -- cgit v1.2.3 From 7aeae027611ff27f13a32e19736c8fd06e41786c Mon Sep 17 00:00:00 2001 From: Ke Guo Date: Thu, 11 May 2023 16:33:45 -0700 Subject: KVM: SVM: Use kvm_pat_valid() directly instead of kvm_mtrr_valid() Use kvm_pat_valid() directly instead of bouncing through kvm_mtrr_valid(). The PAT is not an MTRR, and kvm_mtrr_valid() just redirects to kvm_pat_valid(), i.e. is exempt from KVM's "zap SPTEs" logic that's needed to honor guest MTRRs when the VM has a passthrough device with non-coherent DMA (KVM does NOT set "ignore guest PAT" in this case, and so enables hardware virtualization of the guest's PAT, i.e. doesn't need to manually emulate the PAT memtype). Signed-off-by: Ke Guo [sean: massage changelog] Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230511233351.635053-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ca32389f3c36..e1e17c7dc7d5 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2939,7 +2939,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_CR_PAT: - if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data)) + if (!kvm_pat_valid(data)) return 1; vcpu->arch.pat = data; svm->vmcb01.ptr->save.g_pat = data; -- cgit v1.2.3 From ebda79e5057778be1ad8ed072e4229894dfc66b7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 May 2023 16:33:46 -0700 Subject: KVM: x86: Add helper to query if variable MTRR MSR is base (versus mask) Add a helper to query whether a variable MTRR MSR is a base versus as mask MSR. Replace the unnecessarily complex math with a simple check on bit 0; base MSRs are even, mask MSRs are odd. Link: https://lore.kernel.org/r/20230511233351.635053-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mtrr.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 9fac1ec03463..f65ce4b3980f 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -25,6 +25,12 @@ #define IA32_MTRR_DEF_TYPE_FE (1ULL << 10) #define IA32_MTRR_DEF_TYPE_TYPE_MASK (0xff) +static bool is_mtrr_base_msr(unsigned int msr) +{ + /* MTRR base MSRs use even numbers, masks use odd numbers. */ + return !(msr & 0x1); +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -342,10 +348,9 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; struct kvm_mtrr_range *tmp, *cur; - int index, is_mtrr_mask; + int index; index = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * index; cur = &mtrr_state->var_ranges[index]; /* remove the entry if it's in the list. */ @@ -356,7 +361,7 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) * Set all illegal GPA bits in the mask, since those bits must * implicitly be 0. The bits are then cleared when reading them. */ - if (!is_mtrr_mask) + if (is_mtrr_base_msr(msr)) cur->base = data; else cur->mask = data | kvm_vcpu_reserved_gpa_bits_raw(vcpu); @@ -418,11 +423,8 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) else if (msr == MSR_IA32_CR_PAT) *pdata = vcpu->arch.pat; else { /* Variable MTRRs */ - int is_mtrr_mask; - index = (msr - 0x200) / 2; - is_mtrr_mask = msr - 0x200 - 2 * index; - if (!is_mtrr_mask) + if (is_mtrr_base_msr(msr)) *pdata = vcpu->arch.mtrr_state.var_ranges[index].base; else *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask; -- cgit v1.2.3 From 9ae38b4fb13597ce1821376d23958bbe4976c759 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 May 2023 16:33:47 -0700 Subject: KVM: x86: Add helper to get variable MTRR range from MSR index Add a helper to dedup the logic for retrieving a variable MTRR range structure given a variable MTRR MSR index. No functional change intended. Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230511233351.635053-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mtrr.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index f65ce4b3980f..59851dbebfea 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -31,6 +31,14 @@ static bool is_mtrr_base_msr(unsigned int msr) return !(msr & 0x1); } +static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu, + unsigned int msr) +{ + int index = (msr - 0x200) / 2; + + return &vcpu->arch.mtrr_state.var_ranges[index]; +} + static bool msr_mtrr_valid(unsigned msr) { switch (msr) { @@ -314,7 +322,6 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; gfn_t start, end; - int index; if (msr == MSR_IA32_CR_PAT || !tdp_enabled || !kvm_arch_has_noncoherent_dma(vcpu->kvm)) @@ -332,8 +339,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) end = ~0ULL; } else { /* variable range MTRRs. */ - index = (msr - 0x200) / 2; - var_mtrr_range(&mtrr_state->var_ranges[index], &start, &end); + var_mtrr_range(var_mtrr_msr_to_range(vcpu, msr), &start, &end); } kvm_zap_gfn_range(vcpu->kvm, gpa_to_gfn(start), gpa_to_gfn(end)); @@ -348,14 +354,12 @@ static void set_var_mtrr_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) { struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; struct kvm_mtrr_range *tmp, *cur; - int index; - index = (msr - 0x200) / 2; - cur = &mtrr_state->var_ranges[index]; + cur = var_mtrr_msr_to_range(vcpu, msr); /* remove the entry if it's in the list. */ if (var_mtrr_range_is_valid(cur)) - list_del(&mtrr_state->var_ranges[index].node); + list_del(&cur->node); /* * Set all illegal GPA bits in the mask, since those bits must @@ -423,11 +427,10 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) else if (msr == MSR_IA32_CR_PAT) *pdata = vcpu->arch.pat; else { /* Variable MTRRs */ - index = (msr - 0x200) / 2; if (is_mtrr_base_msr(msr)) - *pdata = vcpu->arch.mtrr_state.var_ranges[index].base; + *pdata = var_mtrr_msr_to_range(vcpu, msr)->base; else - *pdata = vcpu->arch.mtrr_state.var_ranges[index].mask; + *pdata = var_mtrr_msr_to_range(vcpu, msr)->mask; *pdata &= ~kvm_vcpu_reserved_gpa_bits_raw(vcpu); } -- cgit v1.2.3 From 34a83deac31cd9fdecef331578422095af2db4b0 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 May 2023 16:33:48 -0700 Subject: KVM: x86: Use MTRR macros to define possible MTRR MSR ranges Use the MTRR macros to identify the ranges of possible MTRR MSRs instead of bounding the ranges with a mismash of open coded values and unrelated MSR indices. Carving out the gap for the machine check MSRs in particular is confusing, as it's easy to incorrectly think the case statement handles MCE MSRs instead of skipping them. Drop the range-based funneling of MSRs between the end of the MCE MSRs and MTRR_DEF_TYPE, i.e. 0x2A0-0x2FF, and instead handle MTTR_DEF_TYPE as the one-off case that it is. Extract PAT (0x277) as well in anticipation of dropping PAT "handling" from the MTRR code. Keep the range-based handling for the variable+fixed MTRRs even though capturing unknown MSRs 0x214-0x24F is arguably "wrong". There is a gap in the fixed MTRRs, 0x260-0x267, i.e. the MTRR code needs to filter out unknown MSRs anyways, and using a single range generates marginally better code for the big switch statement. Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230511233351.635053-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mtrr.c | 7 ++++--- arch/x86/kvm/x86.c | 10 ++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index 59851dbebfea..dc213b940141 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -34,7 +34,7 @@ static bool is_mtrr_base_msr(unsigned int msr) static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu, unsigned int msr) { - int index = (msr - 0x200) / 2; + int index = (msr - MTRRphysBase_MSR(0)) / 2; return &vcpu->arch.mtrr_state.var_ranges[index]; } @@ -42,7 +42,7 @@ static struct kvm_mtrr_range *var_mtrr_msr_to_range(struct kvm_vcpu *vcpu, static bool msr_mtrr_valid(unsigned msr) { switch (msr) { - case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: + case MTRRphysBase_MSR(0) ... MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1): case MSR_MTRRfix64K_00000: case MSR_MTRRfix16K_80000: case MSR_MTRRfix16K_A0000: @@ -88,7 +88,8 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) } /* variable MTRRs */ - WARN_ON(!(msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR)); + WARN_ON(!(msr >= MTRRphysBase_MSR(0) && + msr <= MTRRphysMask_MSR(KVM_NR_VAR_MTRR - 1))); mask = kvm_vcpu_reserved_gpa_bits_raw(vcpu); if ((msr & 1) == 0) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c0778ca39650..8017e0eb1e67 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3702,8 +3702,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } break; - case 0x200 ... MSR_IA32_MC0_CTL2 - 1: - case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: + case MSR_IA32_CR_PAT: + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: return kvm_mtrr_set_msr(vcpu, msr, data); case MSR_IA32_APICBASE: return kvm_set_apic_base(vcpu, msr_info); @@ -4110,9 +4111,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset; break; } + case MSR_IA32_CR_PAT: case MSR_MTRRcap: - case 0x200 ... MSR_IA32_MC0_CTL2 - 1: - case MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) ... 0x2ff: + case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data); case 0xcd: /* fsb frequency */ msr_info->data = 3; -- cgit v1.2.3 From bc7fe2f0b7511ef6bb2765b6e1f923da449e00ef Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 May 2023 16:33:49 -0700 Subject: KVM: x86: Move PAT MSR handling out of mtrr.c Drop handling of MSR_IA32_CR_PAT from mtrr.c now that SVM and VMX handle writes without bouncing through kvm_set_msr_common(). PAT isn't truly an MTRR even though it affects memory types, and more importantly KVM enables hardware virtualization of guest PAT (by NOT setting "ignore guest PAT") when a guest has non-coherent DMA, i.e. KVM doesn't need to zap SPTEs when the guest PAT changes. The read path is and always has been trivial, i.e. burying it in the MTRR code does more harm than good. WARN and continue for the PAT case in kvm_set_msr_common(), as that code is _currently_ reached if and only if KVM is buggy. Defer cleaning up the lack of symmetry between the read and write paths to a future patch. Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230511233351.635053-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mtrr.c | 19 ++++++------------- arch/x86/kvm/x86.c | 13 +++++++++++++ 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index dc213b940141..cdbbb511f940 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -55,7 +55,6 @@ static bool msr_mtrr_valid(unsigned msr) case MSR_MTRRfix4K_F0000: case MSR_MTRRfix4K_F8000: case MSR_MTRRdefType: - case MSR_IA32_CR_PAT: return true; } return false; @@ -74,9 +73,7 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (!msr_mtrr_valid(msr)) return false; - if (msr == MSR_IA32_CR_PAT) { - return kvm_pat_valid(data); - } else if (msr == MSR_MTRRdefType) { + if (msr == MSR_MTRRdefType) { if (data & ~0xcff) return false; return valid_mtrr_type(data & 0xff); @@ -324,8 +321,7 @@ static void update_mtrr(struct kvm_vcpu *vcpu, u32 msr) struct kvm_mtrr *mtrr_state = &vcpu->arch.mtrr_state; gfn_t start, end; - if (msr == MSR_IA32_CR_PAT || !tdp_enabled || - !kvm_arch_has_noncoherent_dma(vcpu->kvm)) + if (!tdp_enabled || !kvm_arch_has_noncoherent_dma(vcpu->kvm)) return; if (!mtrr_is_enabled(mtrr_state) && msr != MSR_MTRRdefType) @@ -392,8 +388,6 @@ int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data) *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index] = data; else if (msr == MSR_MTRRdefType) vcpu->arch.mtrr_state.deftype = data; - else if (msr == MSR_IA32_CR_PAT) - vcpu->arch.pat = data; else set_var_mtrr_msr(vcpu, msr, data); @@ -421,13 +415,12 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return 1; index = fixed_msr_to_range_index(msr); - if (index >= 0) + if (index >= 0) { *pdata = *(u64 *)&vcpu->arch.mtrr_state.fixed_ranges[index]; - else if (msr == MSR_MTRRdefType) + } else if (msr == MSR_MTRRdefType) { *pdata = vcpu->arch.mtrr_state.deftype; - else if (msr == MSR_IA32_CR_PAT) - *pdata = vcpu->arch.pat; - else { /* Variable MTRRs */ + } else { + /* Variable MTRRs */ if (is_mtrr_base_msr(msr)) *pdata = var_mtrr_msr_to_range(vcpu, msr)->base; else diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8017e0eb1e67..902c9935979f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3703,6 +3703,17 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_CR_PAT: + /* + * Writes to PAT should be handled by vendor code as both SVM + * and VMX track the guest's PAT in the VMCB/VMCS. + */ + WARN_ON_ONCE(1); + + if (!kvm_pat_valid(data)) + return 1; + + vcpu->arch.pat = data; + break; case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: case MSR_MTRRdefType: return kvm_mtrr_set_msr(vcpu, msr, data); @@ -4112,6 +4123,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; } case MSR_IA32_CR_PAT: + msr_info->data = vcpu->arch.pat; + break; case MSR_MTRRcap: case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: case MSR_MTRRdefType: -- cgit v1.2.3 From 3a5f49078eb5106f9b918066908508c3044b5ada Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 May 2023 16:33:50 -0700 Subject: KVM: x86: Make kvm_mtrr_valid() static now that there are no external users Make kvm_mtrr_valid() local to mtrr.c now that it's not used to check the validity of a PAT MSR value. Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230511233351.635053-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mtrr.c | 3 +-- arch/x86/kvm/x86.h | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c index cdbbb511f940..3eb6e7f47e96 100644 --- a/arch/x86/kvm/mtrr.c +++ b/arch/x86/kvm/mtrr.c @@ -65,7 +65,7 @@ static bool valid_mtrr_type(unsigned t) return t < 8 && (1 << t) & 0x73; /* 0, 1, 4, 5, 6 */ } -bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) +static bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) { int i; u64 mask; @@ -100,7 +100,6 @@ bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data) return (data & mask) == 0; } -EXPORT_SYMBOL_GPL(kvm_mtrr_valid); static bool mtrr_is_enabled(struct kvm_mtrr *mtrr_state) { diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c544602d07a3..82e3dafc5453 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -309,7 +309,6 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu, void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu); u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); -bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, -- cgit v1.2.3 From dee321977a230802a5065af4ad4f4f5e8558a738 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 11 May 2023 16:33:51 -0700 Subject: KVM: x86: Move common handling of PAT MSR writes to kvm_set_msr_common() Move the common check-and-set handling of PAT MSR writes out of vendor code and into kvm_set_msr_common(). This aligns writes with reads, which are already handled in common code, i.e. makes the handling of reads and writes symmetrical in common code. Alternatively, the common handling in kvm_get_msr_common() could be moved to vendor code, but duplicating code is generally undesirable (even though the duplicatated code is trivial in this case), and guest writes to PAT should be rare, i.e. the overhead of the extra function call is a non-issue in practice. Suggested-by: Kai Huang Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230511233351.635053-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 7 ++++--- arch/x86/kvm/vmx/vmx.c | 7 +++---- arch/x86/kvm/x86.c | 6 ------ 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e1e17c7dc7d5..924609d63f8a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2939,9 +2939,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_CR_PAT: - if (!kvm_pat_valid(data)) - return 1; - vcpu->arch.pat = data; + ret = kvm_set_msr_common(vcpu, msr); + if (ret) + break; + svm->vmcb01.ptr->save.g_pat = data; if (is_guest_mode(vcpu)) nested_vmcb02_compute_g_pat(svm); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 33b8625d3541..2d9d155691a7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2287,10 +2287,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; goto find_uret_msr; case MSR_IA32_CR_PAT: - if (!kvm_pat_valid(data)) - return 1; - - vcpu->arch.pat = data; + ret = kvm_set_msr_common(vcpu, msr_info); + if (ret) + break; if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 902c9935979f..5ad55ef71433 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3703,12 +3703,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_CR_PAT: - /* - * Writes to PAT should be handled by vendor code as both SVM - * and VMX track the guest's PAT in the VMCB/VMCS. - */ - WARN_ON_ONCE(1); - if (!kvm_pat_valid(data)) return 1; -- cgit v1.2.3 From 0d42522bdee70b9197be63dd76c9f6297cd1e832 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Wed, 19 Apr 2023 10:19:25 +0800 Subject: KVM: x86: Fix poll command According to the hardware manual, when the Poll command is issued, the byte returned by the I/O read is 1 in Bit 7 when there is an interrupt, and the highest priority binary code in Bits 2:0. The current pic simulation code is not implemented strictly according to the above expression. Fix the implementation of pic_poll_read(), set Bit 7 when there is an interrupt. Signed-off-by: Jinliang Zheng Link: https://lore.kernel.org/r/20230419021924.1342184-1-alexjlzheng@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/i8259.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 4756bcb5724f..8dec646e764b 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -411,7 +411,10 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) pic_clear_isr(s, ret); if (addr1 >> 7 || ret != 2) pic_update_irq(s->pics_state); + /* Bit 7 is 1, means there's an interrupt */ + ret |= 0x80; } else { + /* Bit 7 is 0, means there's no interrupt */ ret = 0x07; pic_update_irq(s->pics_state); } -- cgit v1.2.3 From 70b0bc4c0a05cb68ffeeaba8c8340896b5ff6fd7 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 27 Mar 2023 19:54:57 +0200 Subject: KVM: Don't kfree(NULL) on kzalloc() failure in kvm_assign_ioeventfd_idx() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On kzalloc() failure, taking the `goto fail` path leads to kfree(NULL). Such no-op has no use. Move it out. Signed-off-by: Michal Luczaj Reviewed-by: Sean Christopherson Reviewed-by: Philippe Mathieu-Daudé Link: https://lore.kernel.org/r/20230327175457.735903-1-mhal@rbox.co Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index b0af834ffa95..7c42441425cf 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -889,9 +889,9 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm, unlock_fail: mutex_unlock(&kvm->slots_lock); + kfree(p); fail: - kfree(p); eventfd_ctx_put(eventfd); return ret; -- cgit v1.2.3 From 07b4b2f4047f600ca7974797900b7409081f826c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 12 Apr 2023 16:09:13 -0400 Subject: KVM: selftests: touch all pages of args on each memstress iteration Access the same memory addresses on each iteration of the memstress guest code. This ensures that the state of KVM's page tables is the same after every iteration, including the pages that host the guest page tables for args and vcpu_args. This difference is visible when running the proposed dirty_log_page_splitting_test[*] on AMD, or on Intel with pml=0 and eptad=0. The tests fail due to different semantics of dirty bits for page-table pages on AMD (and eptad=0) and Intel. Both AMD and Intel with eptad=0 treat page-table accesses as writes, therefore more pages are dropped before the repopulation phase when dirty logging is disabled. The "missing" page had been included in the population phase because it hosts the page tables for vcpu_args, but repopulation does not need it." Signed-off-by: Paolo Bonzini Reviewed-by: Vipin Sharma Link: https://lore.kernel.org/r/20230412200913.1570873-1-pbonzini@redhat.com [sean: add additional details in changelog] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/lib/memstress.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c index 5f1d3173c238..7d2f812e7c9a 100644 --- a/tools/testing/selftests/kvm/lib/memstress.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -64,6 +64,9 @@ void memstress_guest_code(uint32_t vcpu_idx) GUEST_ASSERT(vcpu_args->vcpu_idx == vcpu_idx); while (true) { + for (i = 0; i < sizeof(memstress_args); i += args->guest_page_size) + (void) *((volatile char *)args + i); + for (i = 0; i < pages; i++) { if (args->random_access) page = guest_random_u32(&rand_state) % pages; -- cgit v1.2.3 From de10b798055db5df93474cdfa5dbffc57169f458 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 31 Jan 2023 18:18:19 +0000 Subject: KVM: selftests: Move dirty logging functions to memstress.(c|h) Move some helper functions from dirty_log_perf_test.c to the memstress library so that they can be used in a future commit which tests page splitting during dirty logging. Reviewed-by: Vipin Sharma Signed-off-by: Ben Gardon Link: https://lore.kernel.org/r/20230131181820.179033-2-bgardon@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/dirty_log_perf_test.c | 84 ++--------------------- tools/testing/selftests/kvm/include/memstress.h | 8 +++ tools/testing/selftests/kvm/lib/memstress.c | 72 +++++++++++++++++++ 3 files changed, 87 insertions(+), 77 deletions(-) diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index e9d6d1aecf89..416719e20518 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -136,77 +136,6 @@ struct test_params { bool random_access; }; -static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable) -{ - int i; - - for (i = 0; i < slots; i++) { - int slot = MEMSTRESS_MEM_SLOT_INDEX + i; - int flags = enable ? KVM_MEM_LOG_DIRTY_PAGES : 0; - - vm_mem_region_set_flags(vm, slot, flags); - } -} - -static inline void enable_dirty_logging(struct kvm_vm *vm, int slots) -{ - toggle_dirty_logging(vm, slots, true); -} - -static inline void disable_dirty_logging(struct kvm_vm *vm, int slots) -{ - toggle_dirty_logging(vm, slots, false); -} - -static void get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots) -{ - int i; - - for (i = 0; i < slots; i++) { - int slot = MEMSTRESS_MEM_SLOT_INDEX + i; - - kvm_vm_get_dirty_log(vm, slot, bitmaps[i]); - } -} - -static void clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], - int slots, uint64_t pages_per_slot) -{ - int i; - - for (i = 0; i < slots; i++) { - int slot = MEMSTRESS_MEM_SLOT_INDEX + i; - - kvm_vm_clear_dirty_log(vm, slot, bitmaps[i], 0, pages_per_slot); - } -} - -static unsigned long **alloc_bitmaps(int slots, uint64_t pages_per_slot) -{ - unsigned long **bitmaps; - int i; - - bitmaps = malloc(slots * sizeof(bitmaps[0])); - TEST_ASSERT(bitmaps, "Failed to allocate bitmaps array."); - - for (i = 0; i < slots; i++) { - bitmaps[i] = bitmap_zalloc(pages_per_slot); - TEST_ASSERT(bitmaps[i], "Failed to allocate slot bitmap."); - } - - return bitmaps; -} - -static void free_bitmaps(unsigned long *bitmaps[], int slots) -{ - int i; - - for (i = 0; i < slots; i++) - free(bitmaps[i]); - - free(bitmaps); -} - static void run_test(enum vm_guest_mode mode, void *arg) { struct test_params *p = arg; @@ -236,7 +165,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) host_num_pages = vm_num_host_pages(mode, guest_num_pages); pages_per_slot = host_num_pages / p->slots; - bitmaps = alloc_bitmaps(p->slots, pages_per_slot); + bitmaps = memstress_alloc_bitmaps(p->slots, pages_per_slot); if (dirty_log_manual_caps) vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, @@ -277,7 +206,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Enable dirty logging */ clock_gettime(CLOCK_MONOTONIC, &start); - enable_dirty_logging(vm, p->slots); + memstress_enable_dirty_logging(vm, p->slots); ts_diff = timespec_elapsed(start); pr_info("Enabling dirty logging time: %ld.%.9lds\n\n", ts_diff.tv_sec, ts_diff.tv_nsec); @@ -306,7 +235,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) iteration, ts_diff.tv_sec, ts_diff.tv_nsec); clock_gettime(CLOCK_MONOTONIC, &start); - get_dirty_log(vm, bitmaps, p->slots); + memstress_get_dirty_log(vm, bitmaps, p->slots); ts_diff = timespec_elapsed(start); get_dirty_log_total = timespec_add(get_dirty_log_total, ts_diff); @@ -315,7 +244,8 @@ static void run_test(enum vm_guest_mode mode, void *arg) if (dirty_log_manual_caps) { clock_gettime(CLOCK_MONOTONIC, &start); - clear_dirty_log(vm, bitmaps, p->slots, pages_per_slot); + memstress_clear_dirty_log(vm, bitmaps, p->slots, + pages_per_slot); ts_diff = timespec_elapsed(start); clear_dirty_log_total = timespec_add(clear_dirty_log_total, ts_diff); @@ -334,7 +264,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Disable dirty logging */ clock_gettime(CLOCK_MONOTONIC, &start); - disable_dirty_logging(vm, p->slots); + memstress_disable_dirty_logging(vm, p->slots); ts_diff = timespec_elapsed(start); pr_info("Disabling dirty logging time: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); @@ -359,7 +289,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) clear_dirty_log_total.tv_nsec, avg.tv_sec, avg.tv_nsec); } - free_bitmaps(bitmaps, p->slots); + memstress_free_bitmaps(bitmaps, p->slots); arch_cleanup_vm(vm); memstress_destroy_vm(vm); } diff --git a/tools/testing/selftests/kvm/include/memstress.h b/tools/testing/selftests/kvm/include/memstress.h index 72e3e358ef7b..ce4e603050ea 100644 --- a/tools/testing/selftests/kvm/include/memstress.h +++ b/tools/testing/selftests/kvm/include/memstress.h @@ -72,4 +72,12 @@ void memstress_guest_code(uint32_t vcpu_id); uint64_t memstress_nested_pages(int nr_vcpus); void memstress_setup_nested(struct kvm_vm *vm, int nr_vcpus, struct kvm_vcpu *vcpus[]); +void memstress_enable_dirty_logging(struct kvm_vm *vm, int slots); +void memstress_disable_dirty_logging(struct kvm_vm *vm, int slots); +void memstress_get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots); +void memstress_clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], + int slots, uint64_t pages_per_slot); +unsigned long **memstress_alloc_bitmaps(int slots, uint64_t pages_per_slot); +void memstress_free_bitmaps(unsigned long *bitmaps[], int slots); + #endif /* SELFTEST_KVM_MEMSTRESS_H */ diff --git a/tools/testing/selftests/kvm/lib/memstress.c b/tools/testing/selftests/kvm/lib/memstress.c index 7d2f812e7c9a..df457452d146 100644 --- a/tools/testing/selftests/kvm/lib/memstress.c +++ b/tools/testing/selftests/kvm/lib/memstress.c @@ -5,6 +5,7 @@ #define _GNU_SOURCE #include +#include #include "kvm_util.h" #include "memstress.h" @@ -323,3 +324,74 @@ void memstress_join_vcpu_threads(int nr_vcpus) for (i = 0; i < nr_vcpus; i++) pthread_join(vcpu_threads[i].thread, NULL); } + +static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable) +{ + int i; + + for (i = 0; i < slots; i++) { + int slot = MEMSTRESS_MEM_SLOT_INDEX + i; + int flags = enable ? KVM_MEM_LOG_DIRTY_PAGES : 0; + + vm_mem_region_set_flags(vm, slot, flags); + } +} + +void memstress_enable_dirty_logging(struct kvm_vm *vm, int slots) +{ + toggle_dirty_logging(vm, slots, true); +} + +void memstress_disable_dirty_logging(struct kvm_vm *vm, int slots) +{ + toggle_dirty_logging(vm, slots, false); +} + +void memstress_get_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], int slots) +{ + int i; + + for (i = 0; i < slots; i++) { + int slot = MEMSTRESS_MEM_SLOT_INDEX + i; + + kvm_vm_get_dirty_log(vm, slot, bitmaps[i]); + } +} + +void memstress_clear_dirty_log(struct kvm_vm *vm, unsigned long *bitmaps[], + int slots, uint64_t pages_per_slot) +{ + int i; + + for (i = 0; i < slots; i++) { + int slot = MEMSTRESS_MEM_SLOT_INDEX + i; + + kvm_vm_clear_dirty_log(vm, slot, bitmaps[i], 0, pages_per_slot); + } +} + +unsigned long **memstress_alloc_bitmaps(int slots, uint64_t pages_per_slot) +{ + unsigned long **bitmaps; + int i; + + bitmaps = malloc(slots * sizeof(bitmaps[0])); + TEST_ASSERT(bitmaps, "Failed to allocate bitmaps array."); + + for (i = 0; i < slots; i++) { + bitmaps[i] = bitmap_zalloc(pages_per_slot); + TEST_ASSERT(bitmaps[i], "Failed to allocate slot bitmap."); + } + + return bitmaps; +} + +void memstress_free_bitmaps(unsigned long *bitmaps[], int slots) +{ + int i; + + for (i = 0; i < slots; i++) + free(bitmaps[i]); + + free(bitmaps); +} -- cgit v1.2.3 From dfa78a20cc879205b2c6239300dac09907ad3da1 Mon Sep 17 00:00:00 2001 From: Ben Gardon Date: Tue, 31 Jan 2023 18:18:20 +0000 Subject: KVM: selftests: Add dirty logging page splitting test Add a test for page splitting during dirty logging and for hugepage recovery after dirty logging. Page splitting represents non-trivial behavior, which is complicated by MANUAL_PROTECT mode, which causes pages to be split on the first clear, instead of when dirty logging is enabled. Add a test which makes assertions about page counts to help define the expected behavior of page splitting and to provide needed coverage of the behavior. This also helps ensure that a failure in eager page splitting is not covered up by splitting in the vCPU path. Tested by running the test on an Intel Haswell machine w/wo MANUAL_PROTECT. Reviewed-by: Vipin Sharma Signed-off-by: Ben Gardon Link: https://lore.kernel.org/r/20230131181820.179033-3-bgardon@google.com [sean: let the user run without hugetlb, as suggested by Paolo] Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile | 1 + .../kvm/x86_64/dirty_log_page_splitting_test.c | 259 +++++++++++++++++++++ 2 files changed, 260 insertions(+) create mode 100644 tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 7a5ff646e7e7..ee41ff0c5a86 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -61,6 +61,7 @@ TEST_PROGS_x86_64 += x86_64/nx_huge_pages_test.sh # Compiled test targets TEST_GEN_PROGS_x86_64 = x86_64/cpuid_test TEST_GEN_PROGS_x86_64 += x86_64/cr4_cpuid_sync_test +TEST_GEN_PROGS_x86_64 += x86_64/dirty_log_page_splitting_test TEST_GEN_PROGS_x86_64 += x86_64/get_msr_index_features TEST_GEN_PROGS_x86_64 += x86_64/exit_on_emulation_failure_test TEST_GEN_PROGS_x86_64 += x86_64/fix_hypercall_test diff --git a/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c new file mode 100644 index 000000000000..beb7e2c10211 --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/dirty_log_page_splitting_test.c @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KVM dirty logging page splitting test + * + * Based on dirty_log_perf.c + * + * Copyright (C) 2018, Red Hat, Inc. + * Copyright (C) 2023, Google, Inc. + */ + +#include +#include +#include +#include + +#include "kvm_util.h" +#include "test_util.h" +#include "memstress.h" +#include "guest_modes.h" + +#define VCPUS 2 +#define SLOTS 2 +#define ITERATIONS 2 + +static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE; + +static enum vm_mem_backing_src_type backing_src = VM_MEM_SRC_ANONYMOUS_HUGETLB; + +static u64 dirty_log_manual_caps; +static bool host_quit; +static int iteration; +static int vcpu_last_completed_iteration[KVM_MAX_VCPUS]; + +struct kvm_page_stats { + uint64_t pages_4k; + uint64_t pages_2m; + uint64_t pages_1g; + uint64_t hugepages; +}; + +static void get_page_stats(struct kvm_vm *vm, struct kvm_page_stats *stats, const char *stage) +{ + stats->pages_4k = vm_get_stat(vm, "pages_4k"); + stats->pages_2m = vm_get_stat(vm, "pages_2m"); + stats->pages_1g = vm_get_stat(vm, "pages_1g"); + stats->hugepages = stats->pages_2m + stats->pages_1g; + + pr_debug("\nPage stats after %s: 4K: %ld 2M: %ld 1G: %ld huge: %ld\n", + stage, stats->pages_4k, stats->pages_2m, stats->pages_1g, + stats->hugepages); +} + +static void run_vcpu_iteration(struct kvm_vm *vm) +{ + int i; + + iteration++; + for (i = 0; i < VCPUS; i++) { + while (READ_ONCE(vcpu_last_completed_iteration[i]) != + iteration) + ; + } +} + +static void vcpu_worker(struct memstress_vcpu_args *vcpu_args) +{ + struct kvm_vcpu *vcpu = vcpu_args->vcpu; + int vcpu_idx = vcpu_args->vcpu_idx; + + while (!READ_ONCE(host_quit)) { + int current_iteration = READ_ONCE(iteration); + + vcpu_run(vcpu); + + ASSERT_EQ(get_ucall(vcpu, NULL), UCALL_SYNC); + + vcpu_last_completed_iteration[vcpu_idx] = current_iteration; + + /* Wait for the start of the next iteration to be signaled. */ + while (current_iteration == READ_ONCE(iteration) && + READ_ONCE(iteration) >= 0 && + !READ_ONCE(host_quit)) + ; + } +} + +static void run_test(enum vm_guest_mode mode, void *unused) +{ + struct kvm_vm *vm; + unsigned long **bitmaps; + uint64_t guest_num_pages; + uint64_t host_num_pages; + uint64_t pages_per_slot; + int i; + uint64_t total_4k_pages; + struct kvm_page_stats stats_populated; + struct kvm_page_stats stats_dirty_logging_enabled; + struct kvm_page_stats stats_dirty_pass[ITERATIONS]; + struct kvm_page_stats stats_clear_pass[ITERATIONS]; + struct kvm_page_stats stats_dirty_logging_disabled; + struct kvm_page_stats stats_repopulated; + + vm = memstress_create_vm(mode, VCPUS, guest_percpu_mem_size, + SLOTS, backing_src, false); + + guest_num_pages = (VCPUS * guest_percpu_mem_size) >> vm->page_shift; + guest_num_pages = vm_adjust_num_guest_pages(mode, guest_num_pages); + host_num_pages = vm_num_host_pages(mode, guest_num_pages); + pages_per_slot = host_num_pages / SLOTS; + + bitmaps = memstress_alloc_bitmaps(SLOTS, pages_per_slot); + + if (dirty_log_manual_caps) + vm_enable_cap(vm, KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2, + dirty_log_manual_caps); + + /* Start the iterations */ + iteration = -1; + host_quit = false; + + for (i = 0; i < VCPUS; i++) + vcpu_last_completed_iteration[i] = -1; + + memstress_start_vcpu_threads(VCPUS, vcpu_worker); + + run_vcpu_iteration(vm); + get_page_stats(vm, &stats_populated, "populating memory"); + + /* Enable dirty logging */ + memstress_enable_dirty_logging(vm, SLOTS); + + get_page_stats(vm, &stats_dirty_logging_enabled, "enabling dirty logging"); + + while (iteration < ITERATIONS) { + run_vcpu_iteration(vm); + get_page_stats(vm, &stats_dirty_pass[iteration - 1], + "dirtying memory"); + + memstress_get_dirty_log(vm, bitmaps, SLOTS); + + if (dirty_log_manual_caps) { + memstress_clear_dirty_log(vm, bitmaps, SLOTS, pages_per_slot); + + get_page_stats(vm, &stats_clear_pass[iteration - 1], "clearing dirty log"); + } + } + + /* Disable dirty logging */ + memstress_disable_dirty_logging(vm, SLOTS); + + get_page_stats(vm, &stats_dirty_logging_disabled, "disabling dirty logging"); + + /* Run vCPUs again to fault pages back in. */ + run_vcpu_iteration(vm); + get_page_stats(vm, &stats_repopulated, "repopulating memory"); + + /* + * Tell the vCPU threads to quit. No need to manually check that vCPUs + * have stopped running after disabling dirty logging, the join will + * wait for them to exit. + */ + host_quit = true; + memstress_join_vcpu_threads(VCPUS); + + memstress_free_bitmaps(bitmaps, SLOTS); + memstress_destroy_vm(vm); + + /* Make assertions about the page counts. */ + total_4k_pages = stats_populated.pages_4k; + total_4k_pages += stats_populated.pages_2m * 512; + total_4k_pages += stats_populated.pages_1g * 512 * 512; + + /* + * Check that all huge pages were split. Since large pages can only + * exist in the data slot, and the vCPUs should have dirtied all pages + * in the data slot, there should be no huge pages left after splitting. + * Splitting happens at dirty log enable time without + * KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 and after the first clear pass + * with that capability. + */ + if (dirty_log_manual_caps) { + ASSERT_EQ(stats_clear_pass[0].hugepages, 0); + ASSERT_EQ(stats_clear_pass[0].pages_4k, total_4k_pages); + ASSERT_EQ(stats_dirty_logging_enabled.hugepages, stats_populated.hugepages); + } else { + ASSERT_EQ(stats_dirty_logging_enabled.hugepages, 0); + ASSERT_EQ(stats_dirty_logging_enabled.pages_4k, total_4k_pages); + } + + /* + * Once dirty logging is disabled and the vCPUs have touched all their + * memory again, the page counts should be the same as they were + * right after initial population of memory. + */ + ASSERT_EQ(stats_populated.pages_4k, stats_repopulated.pages_4k); + ASSERT_EQ(stats_populated.pages_2m, stats_repopulated.pages_2m); + ASSERT_EQ(stats_populated.pages_1g, stats_repopulated.pages_1g); +} + +static void help(char *name) +{ + puts(""); + printf("usage: %s [-h] [-b vcpu bytes] [-s mem type]\n", + name); + puts(""); + printf(" -b: specify the size of the memory region which should be\n" + " dirtied by each vCPU. e.g. 10M or 3G.\n" + " (default: 1G)\n"); + backing_src_help("-s"); + puts(""); +} + +int main(int argc, char *argv[]) +{ + int opt; + + TEST_REQUIRE(get_kvm_param_bool("eager_page_split")); + TEST_REQUIRE(get_kvm_param_bool("tdp_mmu")); + + while ((opt = getopt(argc, argv, "b:hs:")) != -1) { + switch (opt) { + case 'b': + guest_percpu_mem_size = parse_size(optarg); + break; + case 'h': + help(argv[0]); + exit(0); + case 's': + backing_src = parse_backing_src_type(optarg); + break; + default: + help(argv[0]); + exit(1); + } + } + + if (!is_backing_src_hugetlb(backing_src)) { + pr_info("This test will only work reliably with HugeTLB memory. " + "It can work with THP, but that is best effort.\n"); + } + + guest_modes_append_default(); + + dirty_log_manual_caps = 0; + for_each_guest_mode(run_test, NULL); + + dirty_log_manual_caps = + kvm_check_cap(KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2); + + if (dirty_log_manual_caps) { + dirty_log_manual_caps &= (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | + KVM_DIRTY_LOG_INITIALLY_SET); + for_each_guest_mode(run_test, NULL); + } else { + pr_info("Skipping testing with MANUAL_PROTECT as it is not supported"); + } + + return 0; +} -- cgit v1.2.3 From 33ab767c2628136c54c2e2160e4c536c7db9c6d0 Mon Sep 17 00:00:00 2001 From: Jinrong Liang Date: Tue, 11 Apr 2023 21:03:38 +0800 Subject: KVM: x86/pmu: Remove redundant check for MSR_IA32_DS_AREA set handler After commit 2de154f541fc ("KVM: x86/pmu: Provide "error" semantics for unsupported-but-known PMU MSRs"), the guest_cpuid_has(DS) check is not necessary any more since if the guest supports X86_FEATURE_DS, it never returns 1. And if the guest does not support this feature, the set_msr handler will get false from kvm_pmu_is_valid_msr() before reaching this point. Therefore, the check will not be true in all cases and can be safely removed, which also simplifies the code and improves its readability. Signed-off-by: Jinrong Liang Link: https://lore.kernel.org/r/20230411130338.8592-1-cloudliang@tencent.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/pmu_intel.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 741efe2c497b..84be32d9f365 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -444,8 +444,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_IA32_DS_AREA: - if (msr_info->host_initiated && data && !guest_cpuid_has(vcpu, X86_FEATURE_DS)) - return 1; if (is_noncanonical_address(data, vcpu)) return 1; -- cgit v1.2.3 From ab322c43cce97ff6d05439c9b72bf1513e3e1020 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 26 May 2023 14:03:39 -0700 Subject: KVM: x86: Update number of entries for KVM_GET_CPUID2 on success, not failure Update cpuid->nent if and only if kvm_vcpu_ioctl_get_cpuid2() succeeds. The sole caller copies @cpuid to userspace only on success, i.e. the existing code effectively does nothing. Arguably, KVM should report the number of entries when returning -E2BIG so that userspace doesn't have to guess the size, but all other similar KVM ioctls() don't report the size either, i.e. userspace is conditioned to guess. Suggested-by: Takahiro Itazuri Link: https://lore.kernel.org/all/20230410141820.57328-1-itazur@amazon.com Link: https://lore.kernel.org/r/20230526210340.2799158-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0c9660a07b23..241f554f1764 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -501,20 +501,15 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries) { - int r; - - r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) - goto out; - r = -EFAULT; + return -E2BIG; + if (copy_to_user(entries, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent * sizeof(struct kvm_cpuid_entry2))) - goto out; - return 0; + return -EFAULT; -out: cpuid->nent = vcpu->arch.cpuid_nent; - return r; + return 0; } /* Mask kvm_cpu_caps for @leaf with the raw CPUID capabilities of this CPU. */ -- cgit v1.2.3 From 2c76131319982f9c9410bc12127ac1df4e810b87 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 26 May 2023 14:03:40 -0700 Subject: KVM: selftests: Extend cpuid_test to verify KVM_GET_CPUID2 "nent" updates Verify that KVM reports the actual number of CPUID entries on success, but doesn't touch the userspace struct on failure (which for better or worse, is KVM's ABI). Link: https://lore.kernel.org/r/20230526210340.2799158-3-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86_64/cpuid_test.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tools/testing/selftests/kvm/x86_64/cpuid_test.c b/tools/testing/selftests/kvm/x86_64/cpuid_test.c index 2fc3ad9c887e..d3c3aa93f090 100644 --- a/tools/testing/selftests/kvm/x86_64/cpuid_test.c +++ b/tools/testing/selftests/kvm/x86_64/cpuid_test.c @@ -163,6 +163,25 @@ static void set_cpuid_after_run(struct kvm_vcpu *vcpu) ent->eax = eax; } +static void test_get_cpuid2(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid2 *cpuid = allocate_kvm_cpuid2(vcpu->cpuid->nent + 1); + int i, r; + + vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid); + TEST_ASSERT(cpuid->nent == vcpu->cpuid->nent, + "KVM didn't update nent on success, wanted %u, got %u\n", + vcpu->cpuid->nent, cpuid->nent); + + for (i = 0; i < vcpu->cpuid->nent; i++) { + cpuid->nent = i; + r = __vcpu_ioctl(vcpu, KVM_GET_CPUID2, cpuid); + TEST_ASSERT(r && errno == E2BIG, KVM_IOCTL_ERROR(KVM_GET_CPUID2, r)); + TEST_ASSERT(cpuid->nent == i, "KVM modified nent on failure"); + } + free(cpuid); +} + int main(void) { struct kvm_vcpu *vcpu; @@ -183,5 +202,7 @@ int main(void) set_cpuid_after_run(vcpu); + test_get_cpuid2(vcpu); + kvm_vm_free(vm); } -- cgit v1.2.3 From 331f229768160dceec5b5694ea32ecf66e2e2452 Mon Sep 17 00:00:00 2001 From: Jon Kohler Date: Wed, 31 May 2023 11:58:21 -0400 Subject: KVM: VMX: restore vmx_vmexit alignment Commit 8bd200d23ec4 ("KVM: VMX: Flatten __vmx_vcpu_run()") changed vmx_vmexit from SYM_FUNC_START to SYM_INNER_LABEL, accidentally removing 16 byte alignment as SYM_FUNC_START uses SYM_A_ALIGN and SYM_INNER_LABEL does not. Josh mentioned [1] this was unintentional. Fix by changing to SYM_INNER_LABEL_ALIGN instead. [1] https://lore.kernel.org/lkml/Y3adkSe%2FJ70PqUyt@p183 Fixes: 8bd200d23ec4 ("KVM: VMX: Flatten __vmx_vcpu_run()") Signed-off-by: Jon Kohler Suggested-by: Alexey Dobriyan CC: Josh Poimboeuf Acked-by: Josh Poimboeuf Reviewed-by: Jim Mattson Link: https://lore.kernel.org/r/20230531155821.80590-1-jon@nutanix.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmenter.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 631fd7da2bc3..07e927d4d099 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -187,7 +187,7 @@ SYM_FUNC_START(__vmx_vcpu_run) _ASM_EXTABLE(.Lvmresume, .Lfixup) _ASM_EXTABLE(.Lvmlaunch, .Lfixup) -SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) +SYM_INNER_LABEL_ALIGN(vmx_vmexit, SYM_L_GLOBAL) /* Restore unwind state from before the VMRESUME/VMLAUNCH. */ UNWIND_HINT_RESTORE -- cgit v1.2.3 From 048be5fea43deef7e96c0de5ba05515c5cbe28cb Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 23 May 2023 11:18:18 +0100 Subject: KVM: arm64: Block unsafe FF-A calls from the host When KVM is initialised in protected mode, we must take care to filter certain FFA calls from the host kernel so that the integrity of guest and hypervisor memory is maintained and is not made available to the secure world. As a first step, intercept and block all memory-related FF-A SMC calls from the host to EL3 and don't advertise any FF-A features. This puts the framework in place for handling them properly. Co-developed-by: Andrew Walbran Signed-off-by: Andrew Walbran Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20230523101828.7328-2-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/include/nvhe/ffa.h | 16 +++++ arch/arm64/kvm/hyp/nvhe/Makefile | 2 +- arch/arm64/kvm/hyp/nvhe/ffa.c | 119 ++++++++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/hyp-main.c | 3 + 4 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/kvm/hyp/include/nvhe/ffa.h create mode 100644 arch/arm64/kvm/hyp/nvhe/ffa.c diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h new file mode 100644 index 000000000000..fc09ec671e24 --- /dev/null +++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2022 - Google LLC + * Author: Andrew Walbran + */ +#ifndef __KVM_HYP_FFA_H +#define __KVM_HYP_FFA_H + +#include + +#define FFA_MIN_FUNC_NUM 0x60 +#define FFA_MAX_FUNC_NUM 0x7F + +bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt); + +#endif /* __KVM_HYP_FFA_H */ diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index 530347cdebe3..9ddc025e4b86 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -22,7 +22,7 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs)) hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ - cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o ffa.o hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c new file mode 100644 index 000000000000..302806928a64 --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * FF-A v1.0 proxy to filter out invalid memory-sharing SMC calls issued by + * the host. FF-A is a slightly more palatable abbreviation of "Arm Firmware + * Framework for Arm A-profile", which is specified by Arm in document + * number DEN0077. + * + * Copyright (C) 2022 - Google LLC + * Author: Andrew Walbran + * + * This driver hooks into the SMC trapping logic for the host and intercepts + * all calls falling within the FF-A range. Each call is either: + * + * - Forwarded on unmodified to the SPMD at EL3 + * - Rejected as "unsupported" + * - Accompanied by a host stage-2 page-table check/update and reissued + * + * Consequently, any attempts by the host to make guest memory pages + * accessible to the secure world using FF-A will be detected either here + * (in the case that the memory is already owned by the guest) or during + * donation to the guest (in the case that the memory was previously shared + * with the secure world). + * + * To allow the rolling-back of page-table updates and FF-A calls in the + * event of failure, operations involving the RXTX buffers are locked for + * the duration and are therefore serialised. + */ + +#include +#include +#include +#include + +static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) +{ + *res = (struct arm_smccc_res) { + .a0 = FFA_ERROR, + .a2 = ffa_errno, + }; +} + +static void ffa_set_retval(struct kvm_cpu_context *ctxt, + struct arm_smccc_res *res) +{ + cpu_reg(ctxt, 0) = res->a0; + cpu_reg(ctxt, 1) = res->a1; + cpu_reg(ctxt, 2) = res->a2; + cpu_reg(ctxt, 3) = res->a3; +} + +static bool is_ffa_call(u64 func_id) +{ + return ARM_SMCCC_IS_FAST_CALL(func_id) && + ARM_SMCCC_OWNER_NUM(func_id) == ARM_SMCCC_OWNER_STANDARD && + ARM_SMCCC_FUNC_NUM(func_id) >= FFA_MIN_FUNC_NUM && + ARM_SMCCC_FUNC_NUM(func_id) <= FFA_MAX_FUNC_NUM; +} + +/* + * Is a given FFA function supported, either by forwarding on directly + * or by handling at EL2? + */ +static bool ffa_call_supported(u64 func_id) +{ + switch (func_id) { + /* Unsupported memory management calls */ + case FFA_FN64_MEM_RETRIEVE_REQ: + case FFA_MEM_RETRIEVE_RESP: + case FFA_MEM_RELINQUISH: + case FFA_MEM_OP_PAUSE: + case FFA_MEM_OP_RESUME: + case FFA_MEM_FRAG_RX: + case FFA_FN64_MEM_DONATE: + /* Indirect message passing via RX/TX buffers */ + case FFA_MSG_SEND: + case FFA_MSG_POLL: + case FFA_MSG_WAIT: + /* 32-bit variants of 64-bit calls */ + case FFA_MSG_SEND_DIRECT_REQ: + case FFA_MSG_SEND_DIRECT_RESP: + case FFA_RXTX_MAP: + case FFA_MEM_DONATE: + case FFA_MEM_RETRIEVE_REQ: + /* Don't advertise any features just yet */ + case FFA_FEATURES: + return false; + } + + return true; +} + +bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) +{ + DECLARE_REG(u64, func_id, host_ctxt, 0); + struct arm_smccc_res res; + + /* + * There's no way we can tell what a non-standard SMC call might + * be up to. Ideally, we would terminate these here and return + * an error to the host, but sadly devices make use of custom + * firmware calls for things like power management, debugging, + * RNG access and crash reporting. + * + * Given that the architecture requires us to trust EL3 anyway, + * we forward unrecognised calls on under the assumption that + * the firmware doesn't expose a mechanism to access arbitrary + * non-secure memory. Short of a per-device table of SMCs, this + * is the best we can do. + */ + if (!is_ffa_call(func_id)) + return false; + + if (ffa_call_supported(func_id)) + return false; /* Pass through */ + + ffa_to_smccc_error(&res, FFA_RET_NOT_SUPPORTED); + ffa_set_retval(host_ctxt, &res); + return true; +} diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 728e01d4536b..223611e43279 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include @@ -373,6 +374,8 @@ static void handle_host_smc(struct kvm_cpu_context *host_ctxt) bool handled; handled = kvm_host_psci_handler(host_ctxt); + if (!handled) + handled = kvm_host_ffa_handler(host_ctxt); if (!handled) default_host_smc_handler(host_ctxt); -- cgit v1.2.3 From 12bdce4f41197a1a97ba1c711f77d557841e13d9 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 23 May 2023 11:18:19 +0100 Subject: KVM: arm64: Probe FF-A version and host/hyp partition ID during init Probe FF-A during pKVM initialisation so that we can detect any inconsistencies in the version or partition ID early on. Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20230523101828.7328-3-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 1 + arch/arm64/kvm/hyp/include/nvhe/ffa.h | 1 + arch/arm64/kvm/hyp/nvhe/ffa.c | 30 ++++++++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/setup.c | 5 +++++ 5 files changed, 38 insertions(+) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7e7e19ef6993..396e68cced9e 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -405,6 +405,7 @@ struct kvm_host_data { struct kvm_host_psci_config { /* PSCI version used by host. */ u32 version; + u32 smccc_version; /* Function IDs used by host if version is v0.1. */ struct psci_0_1_function_ids function_ids_0_1; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 14391826241c..41395693f3e2 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1910,6 +1910,7 @@ static bool __init init_psci_relay(void) } kvm_host_psci_config.version = psci_ops.get_version(); + kvm_host_psci_config.smccc_version = arm_smccc_get_version(); if (kvm_host_psci_config.version == PSCI_VERSION(0, 1)) { kvm_host_psci_config.function_ids_0_1 = get_psci_0_1_function_ids(); diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h index fc09ec671e24..5c9b92430ff3 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/ffa.h +++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h @@ -11,6 +11,7 @@ #define FFA_MIN_FUNC_NUM 0x60 #define FFA_MAX_FUNC_NUM 0x7F +int hyp_ffa_init(void); bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt); #endif /* __KVM_HYP_FFA_H */ diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 302806928a64..abdcaf98d9b0 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -31,6 +31,12 @@ #include #include +/* + * "ID value 0 must be returned at the Non-secure physical FF-A instance" + * We share this ID with the host. + */ +#define HOST_FFA_ID 0 + static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) { *res = (struct arm_smccc_res) { @@ -117,3 +123,27 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) ffa_set_retval(host_ctxt, &res); return true; } + +int hyp_ffa_init(void) +{ + struct arm_smccc_res res; + + if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2) + return 0; + + arm_smccc_1_1_smc(FFA_VERSION, FFA_VERSION_1_0, 0, 0, 0, 0, 0, 0, &res); + if (res.a0 == FFA_RET_NOT_SUPPORTED) + return 0; + + if (res.a0 != FFA_VERSION_1_0) + return -EOPNOTSUPP; + + arm_smccc_1_1_smc(FFA_ID_GET, 0, 0, 0, 0, 0, 0, 0, &res); + if (res.a0 != FFA_SUCCESS) + return -EOPNOTSUPP; + + if (res.a2 != HOST_FFA_ID) + return -EINVAL; + + return 0; +} diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 110f04627785..c4ca174a0592 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -314,6 +315,10 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; + ret = hyp_ffa_init(); + if (ret) + goto out; + pkvm_hyp_vm_table_init(vm_table_base); out: /* -- cgit v1.2.3 From bc3888a0f4e979ecf9dd8c33a84b8da8cc130790 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 23 May 2023 11:18:20 +0100 Subject: KVM: arm64: Allocate pages for hypervisor FF-A mailboxes The FF-A proxy code needs to allocate its own buffer pair for communication with EL3 and for forwarding calls from the host at EL1. Reserve a couple of pages for this purpose and use them to initialise the hypervisor's FF-A buffer structure. Co-developed-by: Andrew Walbran Signed-off-by: Andrew Walbran Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20230523101828.7328-4-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_pkvm.h | 8 ++++++++ arch/arm64/kvm/hyp/include/nvhe/ffa.h | 2 +- arch/arm64/kvm/hyp/nvhe/ffa.c | 24 +++++++++++++++++++++++- arch/arm64/kvm/hyp/nvhe/setup.c | 8 +++++++- arch/arm64/kvm/pkvm.c | 1 + 5 files changed, 40 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 01129b0d4c68..2b495ec59deb 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -106,4 +106,12 @@ static inline unsigned long host_s2_pgtable_pages(void) return res; } +#define KVM_FFA_MBOX_NR_PAGES 1 + +static inline unsigned long hyp_ffa_proxy_pages(void) +{ + /* A page each for the hypervisor's RX and TX mailboxes. */ + return 2 * KVM_FFA_MBOX_NR_PAGES; +} + #endif /* __ARM64_KVM_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/include/nvhe/ffa.h b/arch/arm64/kvm/hyp/include/nvhe/ffa.h index 5c9b92430ff3..1becb10ecd80 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/ffa.h +++ b/arch/arm64/kvm/hyp/include/nvhe/ffa.h @@ -11,7 +11,7 @@ #define FFA_MIN_FUNC_NUM 0x60 #define FFA_MAX_FUNC_NUM 0x7F -int hyp_ffa_init(void); +int hyp_ffa_init(void *pages); bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt); #endif /* __KVM_HYP_FFA_H */ diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index abdcaf98d9b0..c85e5d46a90d 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -28,8 +28,11 @@ #include #include +#include + #include #include +#include /* * "ID value 0 must be returned at the Non-secure physical FF-A instance" @@ -37,6 +40,19 @@ */ #define HOST_FFA_ID 0 +struct kvm_ffa_buffers { + hyp_spinlock_t lock; + void *tx; + void *rx; +}; + +/* + * Note that we don't currently lock these buffers explicitly, instead + * relying on the locking of the host FFA buffers as we only have one + * client. + */ +static struct kvm_ffa_buffers hyp_buffers; + static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) { *res = (struct arm_smccc_res) { @@ -124,7 +140,7 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) return true; } -int hyp_ffa_init(void) +int hyp_ffa_init(void *pages) { struct arm_smccc_res res; @@ -145,5 +161,11 @@ int hyp_ffa_init(void) if (res.a2 != HOST_FFA_ID) return -EINVAL; + hyp_buffers = (struct kvm_ffa_buffers) { + .lock = __HYP_SPIN_LOCK_UNLOCKED, + .tx = pages, + .rx = pages + (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE), + }; + return 0; } diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index c4ca174a0592..bb98630dfeaf 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -29,6 +29,7 @@ static void *vmemmap_base; static void *vm_table_base; static void *hyp_pgt_base; static void *host_s2_pgt_base; +static void *ffa_proxy_pages; static struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; static struct hyp_pool hpool; @@ -58,6 +59,11 @@ static int divide_memory_pool(void *virt, unsigned long size) if (!host_s2_pgt_base) return -ENOMEM; + nr_pages = hyp_ffa_proxy_pages(); + ffa_proxy_pages = hyp_early_alloc_contig(nr_pages); + if (!ffa_proxy_pages) + return -ENOMEM; + return 0; } @@ -315,7 +321,7 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; - ret = hyp_ffa_init(); + ret = hyp_ffa_init(ffa_proxy_pages); if (ret) goto out; diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c index 6e9ece1ebbe7..994a494703c3 100644 --- a/arch/arm64/kvm/pkvm.c +++ b/arch/arm64/kvm/pkvm.c @@ -78,6 +78,7 @@ void __init kvm_hyp_reserve(void) hyp_mem_pages += host_s2_pgtable_pages(); hyp_mem_pages += hyp_vm_table_pages(); hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE); + hyp_mem_pages += hyp_ffa_proxy_pages(); /* * Try to allocate a PMD-aligned region to reduce TLB pressure once -- cgit v1.2.3 From 9d0c6a9af9e38efa675e565bd181794deca1188a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 23 May 2023 11:18:21 +0100 Subject: KVM: arm64: Handle FFA_RXTX_MAP and FFA_RXTX_UNMAP calls from the host Handle FFA_RXTX_MAP and FFA_RXTX_UNMAP calls from the host by sharing the host's mailbox memory with the hypervisor and establishing a separate pair of mailboxes between the hypervisor and the SPMD at EL3. Co-developed-by: Andrew Walbran Signed-off-by: Andrew Walbran Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20230523101828.7328-5-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/ffa.c | 188 ++++++++++++++++++++++++++++++++++++++++++ include/linux/arm_ffa.h | 8 ++ 2 files changed, 196 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index c85e5d46a90d..0c020cd9f722 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -31,6 +31,8 @@ #include #include +#include +#include #include #include @@ -52,6 +54,7 @@ struct kvm_ffa_buffers { * client. */ static struct kvm_ffa_buffers hyp_buffers; +static struct kvm_ffa_buffers host_buffers; static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) { @@ -61,6 +64,15 @@ static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) }; } +static void ffa_to_smccc_res(struct arm_smccc_res *res, int ret) +{ + if (ret == FFA_RET_SUCCESS) { + *res = (struct arm_smccc_res) { .a0 = FFA_SUCCESS }; + } else { + ffa_to_smccc_error(res, ret); + } +} + static void ffa_set_retval(struct kvm_cpu_context *ctxt, struct arm_smccc_res *res) { @@ -78,6 +90,144 @@ static bool is_ffa_call(u64 func_id) ARM_SMCCC_FUNC_NUM(func_id) <= FFA_MAX_FUNC_NUM; } +static int ffa_map_hyp_buffers(u64 ffa_page_count) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_smc(FFA_FN64_RXTX_MAP, + hyp_virt_to_phys(hyp_buffers.tx), + hyp_virt_to_phys(hyp_buffers.rx), + ffa_page_count, + 0, 0, 0, 0, + &res); + + return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; +} + +static int ffa_unmap_hyp_buffers(void) +{ + struct arm_smccc_res res; + + arm_smccc_1_1_smc(FFA_RXTX_UNMAP, + HOST_FFA_ID, + 0, 0, 0, 0, 0, 0, + &res); + + return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; +} + +static void do_ffa_rxtx_map(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(phys_addr_t, tx, ctxt, 1); + DECLARE_REG(phys_addr_t, rx, ctxt, 2); + DECLARE_REG(u32, npages, ctxt, 3); + int ret = 0; + void *rx_virt, *tx_virt; + + if (npages != (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) / FFA_PAGE_SIZE) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + if (!PAGE_ALIGNED(tx) || !PAGE_ALIGNED(rx)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (host_buffers.tx) { + ret = FFA_RET_DENIED; + goto out_unlock; + } + + /* + * Map our hypervisor buffers into the SPMD before mapping and + * pinning the host buffers in our own address space. + */ + ret = ffa_map_hyp_buffers(npages); + if (ret) + goto out_unlock; + + ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(tx)); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unmap; + } + + ret = __pkvm_host_share_hyp(hyp_phys_to_pfn(rx)); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unshare_tx; + } + + tx_virt = hyp_phys_to_virt(tx); + ret = hyp_pin_shared_mem(tx_virt, tx_virt + 1); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unshare_rx; + } + + rx_virt = hyp_phys_to_virt(rx); + ret = hyp_pin_shared_mem(rx_virt, rx_virt + 1); + if (ret) { + ret = FFA_RET_INVALID_PARAMETERS; + goto err_unpin_tx; + } + + host_buffers.tx = tx_virt; + host_buffers.rx = rx_virt; + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + ffa_to_smccc_res(res, ret); + return; + +err_unpin_tx: + hyp_unpin_shared_mem(tx_virt, tx_virt + 1); +err_unshare_rx: + __pkvm_host_unshare_hyp(hyp_phys_to_pfn(rx)); +err_unshare_tx: + __pkvm_host_unshare_hyp(hyp_phys_to_pfn(tx)); +err_unmap: + ffa_unmap_hyp_buffers(); + goto out_unlock; +} + +static void do_ffa_rxtx_unmap(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, id, ctxt, 1); + int ret = 0; + + if (id != HOST_FFA_ID) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + hyp_unpin_shared_mem(host_buffers.tx, host_buffers.tx + 1); + WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.tx))); + host_buffers.tx = NULL; + + hyp_unpin_shared_mem(host_buffers.rx, host_buffers.rx + 1); + WARN_ON(__pkvm_host_unshare_hyp(hyp_virt_to_pfn(host_buffers.rx))); + host_buffers.rx = NULL; + + ffa_unmap_hyp_buffers(); + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + ffa_to_smccc_res(res, ret); +} + /* * Is a given FFA function supported, either by forwarding on directly * or by handling at EL2? @@ -132,10 +282,21 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) if (!is_ffa_call(func_id)) return false; + switch (func_id) { + /* Memory management */ + case FFA_FN64_RXTX_MAP: + do_ffa_rxtx_map(&res, host_ctxt); + goto out_handled; + case FFA_RXTX_UNMAP: + do_ffa_rxtx_unmap(&res, host_ctxt); + goto out_handled; + } + if (ffa_call_supported(func_id)) return false; /* Pass through */ ffa_to_smccc_error(&res, FFA_RET_NOT_SUPPORTED); +out_handled: ffa_set_retval(host_ctxt, &res); return true; } @@ -143,6 +304,7 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) int hyp_ffa_init(void *pages) { struct arm_smccc_res res; + size_t min_rxtx_sz; if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2) return 0; @@ -161,11 +323,37 @@ int hyp_ffa_init(void *pages) if (res.a2 != HOST_FFA_ID) return -EINVAL; + arm_smccc_1_1_smc(FFA_FEATURES, FFA_FN64_RXTX_MAP, + 0, 0, 0, 0, 0, 0, &res); + if (res.a0 != FFA_SUCCESS) + return -EOPNOTSUPP; + + switch (res.a2) { + case FFA_FEAT_RXTX_MIN_SZ_4K: + min_rxtx_sz = SZ_4K; + break; + case FFA_FEAT_RXTX_MIN_SZ_16K: + min_rxtx_sz = SZ_16K; + break; + case FFA_FEAT_RXTX_MIN_SZ_64K: + min_rxtx_sz = SZ_64K; + break; + default: + return -EINVAL; + } + + if (min_rxtx_sz > PAGE_SIZE) + return -EOPNOTSUPP; + hyp_buffers = (struct kvm_ffa_buffers) { .lock = __HYP_SPIN_LOCK_UNLOCKED, .tx = pages, .rx = pages + (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE), }; + host_buffers = (struct kvm_ffa_buffers) { + .lock = __HYP_SPIN_LOCK_UNLOCKED, + }; + return 0; } diff --git a/include/linux/arm_ffa.h b/include/linux/arm_ffa.h index c87aeecaa9b2..b9f81035eb41 100644 --- a/include/linux/arm_ffa.h +++ b/include/linux/arm_ffa.h @@ -94,6 +94,14 @@ */ #define FFA_PAGE_SIZE SZ_4K +/* + * Minimum buffer size/alignment encodings returned by an FFA_FEATURES + * query for FFA_RXTX_MAP. + */ +#define FFA_FEAT_RXTX_MIN_SZ_4K 0 +#define FFA_FEAT_RXTX_MIN_SZ_64K 1 +#define FFA_FEAT_RXTX_MIN_SZ_16K 2 + /* FFA Bus/Device/Driver related */ struct ffa_device { int vm_id; -- cgit v1.2.3 From f9112eade788439d721ca3032369fb4bf4c7e222 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 23 May 2023 11:18:22 +0100 Subject: KVM: arm64: Add FF-A helpers to share/unshare memory with secure world Extend pKVM's memory protection code so that we can update the host's stage-2 page-table to track pages shared with secure world by the host using FF-A and prevent those pages from being mapped into a guest. Co-developed-by: Andrew Walbran Signed-off-by: Andrew Walbran Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20230523101828.7328-6-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/include/nvhe/mem_protect.h | 3 ++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 68 +++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index b7bdbe63deed..0972faccc2af 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -57,6 +57,7 @@ extern struct host_mmu host_mmu; enum pkvm_component_id { PKVM_ID_HOST, PKVM_ID_HYP, + PKVM_ID_FFA, }; extern unsigned long hyp_nr_cpus; @@ -66,6 +67,8 @@ int __pkvm_host_share_hyp(u64 pfn); int __pkvm_host_unshare_hyp(u64 pfn); int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages); int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages); +int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages); +int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages); bool addr_is_memory(phys_addr_t phys); int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 2e9ec4a2a4a3..e327e94d0e40 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -842,6 +842,13 @@ static int check_share(struct pkvm_mem_share *share) case PKVM_ID_HYP: ret = hyp_ack_share(completer_addr, tx, share->completer_prot); break; + case PKVM_ID_FFA: + /* + * We only check the host; the secure side will check the other + * end when we forward the FFA call. + */ + ret = 0; + break; default: ret = -EINVAL; } @@ -870,6 +877,13 @@ static int __do_share(struct pkvm_mem_share *share) case PKVM_ID_HYP: ret = hyp_complete_share(completer_addr, tx, share->completer_prot); break; + case PKVM_ID_FFA: + /* + * We're not responsible for any secure page-tables, so there's + * nothing to do here. + */ + ret = 0; + break; default: ret = -EINVAL; } @@ -918,6 +932,10 @@ static int check_unshare(struct pkvm_mem_share *share) case PKVM_ID_HYP: ret = hyp_ack_unshare(completer_addr, tx); break; + case PKVM_ID_FFA: + /* See check_share() */ + ret = 0; + break; default: ret = -EINVAL; } @@ -946,6 +964,10 @@ static int __do_unshare(struct pkvm_mem_share *share) case PKVM_ID_HYP: ret = hyp_complete_unshare(completer_addr, tx); break; + case PKVM_ID_FFA: + /* See __do_share() */ + ret = 0; + break; default: ret = -EINVAL; } @@ -1235,3 +1257,49 @@ void hyp_unpin_shared_mem(void *from, void *to) hyp_unlock_component(); host_unlock_component(); } + +int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages) +{ + int ret; + struct pkvm_mem_share share = { + .tx = { + .nr_pages = nr_pages, + .initiator = { + .id = PKVM_ID_HOST, + .addr = hyp_pfn_to_phys(pfn), + }, + .completer = { + .id = PKVM_ID_FFA, + }, + }, + }; + + host_lock_component(); + ret = do_share(&share); + host_unlock_component(); + + return ret; +} + +int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages) +{ + int ret; + struct pkvm_mem_share share = { + .tx = { + .nr_pages = nr_pages, + .initiator = { + .id = PKVM_ID_HOST, + .addr = hyp_pfn_to_phys(pfn), + }, + .completer = { + .id = PKVM_ID_FFA, + }, + }, + }; + + host_lock_component(); + ret = do_unshare(&share); + host_unlock_component(); + + return ret; +} -- cgit v1.2.3 From 43609000177625b1c93c55a16a07aee5a4258260 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 23 May 2023 11:18:23 +0100 Subject: KVM: arm64: Handle FFA_MEM_SHARE calls from the host Intercept FFA_MEM_SHARE/FFA_FN64_MEM_SHARE calls from the host and transition the host stage-2 page-table entries from the OWNED state to the SHARED_OWNED state prior to forwarding the call onto EL3. Co-developed-by: Andrew Walbran Signed-off-by: Andrew Walbran Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20230523101828.7328-7-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/ffa.c | 155 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 0c020cd9f722..83c41c0c441d 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -116,6 +116,14 @@ static int ffa_unmap_hyp_buffers(void) return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; } +static void ffa_mem_share(struct arm_smccc_res *res, u32 len, u32 fraglen) +{ + arm_smccc_1_1_smc(FFA_FN64_MEM_SHARE, + len, fraglen, + 0, 0, 0, 0, 0, + res); +} + static void do_ffa_rxtx_map(struct arm_smccc_res *res, struct kvm_cpu_context *ctxt) { @@ -228,6 +236,149 @@ out: ffa_to_smccc_res(res, ret); } +static u32 __ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 i; + + for (i = 0; i < nranges; ++i) { + struct ffa_mem_region_addr_range *range = &ranges[i]; + u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE; + u64 pfn = hyp_phys_to_pfn(range->address); + + if (!PAGE_ALIGNED(sz)) + break; + + if (__pkvm_host_share_ffa(pfn, sz / PAGE_SIZE)) + break; + } + + return i; +} + +static u32 __ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 i; + + for (i = 0; i < nranges; ++i) { + struct ffa_mem_region_addr_range *range = &ranges[i]; + u64 sz = (u64)range->pg_cnt * FFA_PAGE_SIZE; + u64 pfn = hyp_phys_to_pfn(range->address); + + if (!PAGE_ALIGNED(sz)) + break; + + if (__pkvm_host_unshare_ffa(pfn, sz / PAGE_SIZE)) + break; + } + + return i; +} + +static int ffa_host_share_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 nshared = __ffa_host_share_ranges(ranges, nranges); + int ret = 0; + + if (nshared != nranges) { + WARN_ON(__ffa_host_unshare_ranges(ranges, nshared) != nshared); + ret = FFA_RET_DENIED; + } + + return ret; +} + +static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, + u32 nranges) +{ + u32 nunshared = __ffa_host_unshare_ranges(ranges, nranges); + int ret = 0; + + if (nunshared != nranges) { + WARN_ON(__ffa_host_share_ranges(ranges, nunshared) != nunshared); + ret = FFA_RET_DENIED; + } + + return ret; +} + +static void do_ffa_mem_share(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, len, ctxt, 1); + DECLARE_REG(u32, fraglen, ctxt, 2); + DECLARE_REG(u64, addr_mbz, ctxt, 3); + DECLARE_REG(u32, npages_mbz, ctxt, 4); + struct ffa_composite_mem_region *reg; + struct ffa_mem_region *buf; + int ret = 0; + u32 offset; + + if (addr_mbz || npages_mbz || fraglen > len || + fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + if (fraglen < len) { + ret = FFA_RET_ABORTED; + goto out; + } + + if (fraglen < sizeof(struct ffa_mem_region) + + sizeof(struct ffa_mem_region_attributes)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out; + } + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + buf = hyp_buffers.tx; + memcpy(buf, host_buffers.tx, fraglen); + + offset = buf->ep_mem_access[0].composite_off; + if (!offset || buf->ep_count != 1 || buf->sender_id != HOST_FFA_ID) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + if (fraglen < offset + sizeof(struct ffa_composite_mem_region)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + reg = (void *)buf + offset; + if (fraglen < offset + sizeof(struct ffa_composite_mem_region) + + reg->addr_range_cnt * + sizeof(struct ffa_mem_region_addr_range)) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + ret = ffa_host_share_ranges(reg->constituents, reg->addr_range_cnt); + if (ret) + goto out_unlock; + + ffa_mem_share(res, len, fraglen); + if (res->a0 != FFA_SUCCESS) { + WARN_ON(ffa_host_unshare_ranges(reg->constituents, + reg->addr_range_cnt)); + } + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + if (ret) + ffa_to_smccc_res(res, ret); + return; +} + /* * Is a given FFA function supported, either by forwarding on directly * or by handling at EL2? @@ -290,6 +441,10 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) case FFA_RXTX_UNMAP: do_ffa_rxtx_unmap(&res, host_ctxt); goto out_handled; + case FFA_MEM_SHARE: + case FFA_FN64_MEM_SHARE: + do_ffa_mem_share(&res, host_ctxt); + goto out_handled; } if (ffa_call_supported(func_id)) -- cgit v1.2.3 From 0e3bcb49c13567adb821c4251dcfd04ad7e1179f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 23 May 2023 11:18:24 +0100 Subject: KVM: arm64: Handle FFA_MEM_RECLAIM calls from the host Intecept FFA_MEM_RECLAIM calls from the host and transition the host stage-2 page-table entries from the SHARED_OWNED state back to the OWNED state once EL3 has confirmed that the secure mapping has been reclaimed. Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20230523101828.7328-8-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/ffa.c | 79 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 83c41c0c441d..550936f4b5da 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -124,6 +124,23 @@ static void ffa_mem_share(struct arm_smccc_res *res, u32 len, u32 fraglen) res); } +static void ffa_mem_reclaim(struct arm_smccc_res *res, u32 handle_lo, + u32 handle_hi, u32 flags) +{ + arm_smccc_1_1_smc(FFA_MEM_RECLAIM, + handle_lo, handle_hi, flags, + 0, 0, 0, 0, + res); +} + +static void ffa_retrieve_req(struct arm_smccc_res *res, u32 len) +{ + arm_smccc_1_1_smc(FFA_FN64_MEM_RETRIEVE_REQ, + len, len, + 0, 0, 0, 0, 0, + res); +} + static void do_ffa_rxtx_map(struct arm_smccc_res *res, struct kvm_cpu_context *ctxt) { @@ -379,6 +396,65 @@ out: return; } +static void do_ffa_mem_reclaim(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, handle_lo, ctxt, 1); + DECLARE_REG(u32, handle_hi, ctxt, 2); + DECLARE_REG(u32, flags, ctxt, 3); + struct ffa_composite_mem_region *reg; + struct ffa_mem_region *buf; + int ret = 0; + u32 offset; + u64 handle; + + handle = PACK_HANDLE(handle_lo, handle_hi); + + hyp_spin_lock(&host_buffers.lock); + + buf = hyp_buffers.tx; + *buf = (struct ffa_mem_region) { + .sender_id = HOST_FFA_ID, + .handle = handle, + }; + + ffa_retrieve_req(res, sizeof(*buf)); + buf = hyp_buffers.rx; + if (res->a0 != FFA_MEM_RETRIEVE_RESP) + goto out_unlock; + + /* Check for fragmentation */ + if (res->a1 != res->a2) { + ret = FFA_RET_ABORTED; + goto out_unlock; + } + + offset = buf->ep_mem_access[0].composite_off; + /* + * We can trust the SPMD to get this right, but let's at least + * check that we end up with something that doesn't look _completely_ + * bogus. + */ + if (WARN_ON(offset > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) { + ret = FFA_RET_ABORTED; + goto out_unlock; + } + + reg = (void *)buf + offset; + ffa_mem_reclaim(res, handle_lo, handle_hi, flags); + if (res->a0 != FFA_SUCCESS) + goto out_unlock; + + /* If the SPMD was happy, then we should be too. */ + WARN_ON(ffa_host_unshare_ranges(reg->constituents, + reg->addr_range_cnt)); +out_unlock: + hyp_spin_unlock(&host_buffers.lock); + + if (ret) + ffa_to_smccc_res(res, ret); +} + /* * Is a given FFA function supported, either by forwarding on directly * or by handling at EL2? @@ -445,6 +521,9 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) case FFA_FN64_MEM_SHARE: do_ffa_mem_share(&res, host_ctxt); goto out_handled; + case FFA_MEM_RECLAIM: + do_ffa_mem_reclaim(&res, host_ctxt); + goto out_handled; } if (ffa_call_supported(func_id)) -- cgit v1.2.3 From 634d90cf0ac6562f121a4acd4caec36d695d6aa2 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 23 May 2023 11:18:25 +0100 Subject: KVM: arm64: Handle FFA_MEM_LEND calls from the host Handle FFA_MEM_LEND calls from the host by treating them identically to FFA_MEM_SHARE calls for the purposes of the host stage-2 page-table, but forwarding on the original request to EL3. Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20230523101828.7328-9-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/ffa.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 550936f4b5da..d90ab1c9ec72 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -116,10 +116,10 @@ static int ffa_unmap_hyp_buffers(void) return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; } -static void ffa_mem_share(struct arm_smccc_res *res, u32 len, u32 fraglen) +static void ffa_mem_xfer(struct arm_smccc_res *res, u64 func_id, u32 len, + u32 fraglen) { - arm_smccc_1_1_smc(FFA_FN64_MEM_SHARE, - len, fraglen, + arm_smccc_1_1_smc(func_id, len, fraglen, 0, 0, 0, 0, 0, res); } @@ -321,8 +321,9 @@ static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, return ret; } -static void do_ffa_mem_share(struct arm_smccc_res *res, - struct kvm_cpu_context *ctxt) +static __always_inline void do_ffa_mem_xfer(const u64 func_id, + struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) { DECLARE_REG(u32, len, ctxt, 1); DECLARE_REG(u32, fraglen, ctxt, 2); @@ -333,6 +334,9 @@ static void do_ffa_mem_share(struct arm_smccc_res *res, int ret = 0; u32 offset; + BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE && + func_id != FFA_FN64_MEM_LEND); + if (addr_mbz || npages_mbz || fraglen > len || fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) { ret = FFA_RET_INVALID_PARAMETERS; @@ -382,7 +386,7 @@ static void do_ffa_mem_share(struct arm_smccc_res *res, if (ret) goto out_unlock; - ffa_mem_share(res, len, fraglen); + ffa_mem_xfer(res, func_id, len, fraglen); if (res->a0 != FFA_SUCCESS) { WARN_ON(ffa_host_unshare_ranges(reg->constituents, reg->addr_range_cnt)); @@ -519,11 +523,15 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) goto out_handled; case FFA_MEM_SHARE: case FFA_FN64_MEM_SHARE: - do_ffa_mem_share(&res, host_ctxt); + do_ffa_mem_xfer(FFA_FN64_MEM_SHARE, &res, host_ctxt); goto out_handled; case FFA_MEM_RECLAIM: do_ffa_mem_reclaim(&res, host_ctxt); goto out_handled; + case FFA_MEM_LEND: + case FFA_FN64_MEM_LEND: + do_ffa_mem_xfer(FFA_FN64_MEM_LEND, &res, host_ctxt); + goto out_handled; } if (ffa_call_supported(func_id)) -- cgit v1.2.3 From 20936cd11479709ccd5de2dc77ac8063cd9bfad8 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Tue, 23 May 2023 11:18:26 +0100 Subject: KVM: arm64: Handle FFA_FEATURES call from the host Filter out advertising unsupported features, and only advertise features and properties that are supported by the hypervisor proxy. Signed-off-by: Fuad Tabba Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20230523101828.7328-10-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/ffa.c | 45 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index d90ab1c9ec72..926314a9b9d7 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -64,15 +64,21 @@ static void ffa_to_smccc_error(struct arm_smccc_res *res, u64 ffa_errno) }; } -static void ffa_to_smccc_res(struct arm_smccc_res *res, int ret) +static void ffa_to_smccc_res_prop(struct arm_smccc_res *res, int ret, u64 prop) { if (ret == FFA_RET_SUCCESS) { - *res = (struct arm_smccc_res) { .a0 = FFA_SUCCESS }; + *res = (struct arm_smccc_res) { .a0 = FFA_SUCCESS, + .a2 = prop }; } else { ffa_to_smccc_error(res, ret); } } +static void ffa_to_smccc_res(struct arm_smccc_res *res, int ret) +{ + ffa_to_smccc_res_prop(res, ret, 0); +} + static void ffa_set_retval(struct kvm_cpu_context *ctxt, struct arm_smccc_res *res) { @@ -484,14 +490,41 @@ static bool ffa_call_supported(u64 func_id) case FFA_RXTX_MAP: case FFA_MEM_DONATE: case FFA_MEM_RETRIEVE_REQ: - /* Don't advertise any features just yet */ - case FFA_FEATURES: return false; } return true; } +static bool do_ffa_features(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, id, ctxt, 1); + u64 prop = 0; + int ret = 0; + + if (!ffa_call_supported(id)) { + ret = FFA_RET_NOT_SUPPORTED; + goto out_handled; + } + + switch (id) { + case FFA_MEM_SHARE: + case FFA_FN64_MEM_SHARE: + case FFA_MEM_LEND: + case FFA_FN64_MEM_LEND: + ret = FFA_RET_SUCCESS; + prop = 0; /* No support for dynamic buffers */ + goto out_handled; + default: + return false; + } + +out_handled: + ffa_to_smccc_res_prop(res, ret, prop); + return true; +} + bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(u64, func_id, host_ctxt, 0); @@ -514,6 +547,10 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) return false; switch (func_id) { + case FFA_FEATURES: + if (!do_ffa_features(&res, host_ctxt)) + return false; + goto out_handled; /* Memory management */ case FFA_FN64_RXTX_MAP: do_ffa_rxtx_map(&res, host_ctxt); -- cgit v1.2.3 From 0a9f15fd56742b785ba4d06e64976f1f700b807c Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Tue, 23 May 2023 11:18:27 +0100 Subject: KVM: arm64: pkvm: Add support for fragmented FF-A descriptors FF-A memory descriptors may need to be sent in fragments when they don't fit in the mailboxes. Doing so involves using the FRAG_TX and FRAG_RX primitives defined in the FF-A protocol. Add support in the pKVM FF-A relayer for fragmented descriptors by monitoring outgoing FRAG_TX transactions and by buffering large descriptors on the reclaim path. Co-developed-by: Andrew Walbran Signed-off-by: Andrew Walbran Signed-off-by: Quentin Perret Signed-off-by: Will Deacon Link: https://lore.kernel.org/r/20230523101828.7328-11-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_pkvm.h | 17 +++- arch/arm64/kvm/hyp/nvhe/ffa.c | 170 ++++++++++++++++++++++++++++++++------ 2 files changed, 162 insertions(+), 25 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h index 2b495ec59deb..e46250a02017 100644 --- a/arch/arm64/include/asm/kvm_pkvm.h +++ b/arch/arm64/include/asm/kvm_pkvm.h @@ -6,7 +6,9 @@ #ifndef __ARM64_KVM_PKVM_H__ #define __ARM64_KVM_PKVM_H__ +#include #include +#include #include /* Maximum number of VMs that can co-exist under pKVM. */ @@ -110,8 +112,19 @@ static inline unsigned long host_s2_pgtable_pages(void) static inline unsigned long hyp_ffa_proxy_pages(void) { - /* A page each for the hypervisor's RX and TX mailboxes. */ - return 2 * KVM_FFA_MBOX_NR_PAGES; + size_t desc_max; + + /* + * The hypervisor FFA proxy needs enough memory to buffer a fragmented + * descriptor returned from EL3 in response to a RETRIEVE_REQ call. + */ + desc_max = sizeof(struct ffa_mem_region) + + sizeof(struct ffa_mem_region_attributes) + + sizeof(struct ffa_composite_mem_region) + + SG_MAX_SEGMENTS * sizeof(struct ffa_mem_region_addr_range); + + /* Plus a page each for the hypervisor's RX and TX mailboxes. */ + return (2 * KVM_FFA_MBOX_NR_PAGES) + DIV_ROUND_UP(desc_max, PAGE_SIZE); } #endif /* __ARM64_KVM_PKVM_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c index 926314a9b9d7..58dcd92bf346 100644 --- a/arch/arm64/kvm/hyp/nvhe/ffa.c +++ b/arch/arm64/kvm/hyp/nvhe/ffa.c @@ -42,6 +42,18 @@ */ #define HOST_FFA_ID 0 +/* + * A buffer to hold the maximum descriptor size we can see from the host, + * which is required when the SPMD returns a fragmented FFA_MEM_RETRIEVE_RESP + * when resolving the handle on the reclaim path. + */ +struct kvm_ffa_descriptor_buffer { + void *buf; + size_t len; +}; + +static struct kvm_ffa_descriptor_buffer ffa_desc_buf; + struct kvm_ffa_buffers { hyp_spinlock_t lock; void *tx; @@ -122,6 +134,24 @@ static int ffa_unmap_hyp_buffers(void) return res.a0 == FFA_SUCCESS ? FFA_RET_SUCCESS : res.a2; } +static void ffa_mem_frag_tx(struct arm_smccc_res *res, u32 handle_lo, + u32 handle_hi, u32 fraglen, u32 endpoint_id) +{ + arm_smccc_1_1_smc(FFA_MEM_FRAG_TX, + handle_lo, handle_hi, fraglen, endpoint_id, + 0, 0, 0, + res); +} + +static void ffa_mem_frag_rx(struct arm_smccc_res *res, u32 handle_lo, + u32 handle_hi, u32 fragoff) +{ + arm_smccc_1_1_smc(FFA_MEM_FRAG_RX, + handle_lo, handle_hi, fragoff, HOST_FFA_ID, + 0, 0, 0, + res); +} + static void ffa_mem_xfer(struct arm_smccc_res *res, u64 func_id, u32 len, u32 fraglen) { @@ -327,6 +357,64 @@ static int ffa_host_unshare_ranges(struct ffa_mem_region_addr_range *ranges, return ret; } +static void do_ffa_mem_frag_tx(struct arm_smccc_res *res, + struct kvm_cpu_context *ctxt) +{ + DECLARE_REG(u32, handle_lo, ctxt, 1); + DECLARE_REG(u32, handle_hi, ctxt, 2); + DECLARE_REG(u32, fraglen, ctxt, 3); + DECLARE_REG(u32, endpoint_id, ctxt, 4); + struct ffa_mem_region_addr_range *buf; + int ret = FFA_RET_INVALID_PARAMETERS; + u32 nr_ranges; + + if (fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE) + goto out; + + if (fraglen % sizeof(*buf)) + goto out; + + hyp_spin_lock(&host_buffers.lock); + if (!host_buffers.tx) + goto out_unlock; + + buf = hyp_buffers.tx; + memcpy(buf, host_buffers.tx, fraglen); + nr_ranges = fraglen / sizeof(*buf); + + ret = ffa_host_share_ranges(buf, nr_ranges); + if (ret) { + /* + * We're effectively aborting the transaction, so we need + * to restore the global state back to what it was prior to + * transmission of the first fragment. + */ + ffa_mem_reclaim(res, handle_lo, handle_hi, 0); + WARN_ON(res->a0 != FFA_SUCCESS); + goto out_unlock; + } + + ffa_mem_frag_tx(res, handle_lo, handle_hi, fraglen, endpoint_id); + if (res->a0 != FFA_SUCCESS && res->a0 != FFA_MEM_FRAG_RX) + WARN_ON(ffa_host_unshare_ranges(buf, nr_ranges)); + +out_unlock: + hyp_spin_unlock(&host_buffers.lock); +out: + if (ret) + ffa_to_smccc_res(res, ret); + + /* + * If for any reason this did not succeed, we're in trouble as we have + * now lost the content of the previous fragments and we can't rollback + * the host stage-2 changes. The pages previously marked as shared will + * remain stuck in that state forever, hence preventing the host from + * sharing/donating them again and may possibly lead to subsequent + * failures, but this will not compromise confidentiality. + */ + return; +} + static __always_inline void do_ffa_mem_xfer(const u64 func_id, struct arm_smccc_res *res, struct kvm_cpu_context *ctxt) @@ -337,8 +425,8 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id, DECLARE_REG(u32, npages_mbz, ctxt, 4); struct ffa_composite_mem_region *reg; struct ffa_mem_region *buf; + u32 offset, nr_ranges; int ret = 0; - u32 offset; BUILD_BUG_ON(func_id != FFA_FN64_MEM_SHARE && func_id != FFA_FN64_MEM_LEND); @@ -349,11 +437,6 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id, goto out; } - if (fraglen < len) { - ret = FFA_RET_ABORTED; - goto out; - } - if (fraglen < sizeof(struct ffa_mem_region) + sizeof(struct ffa_mem_region_attributes)) { ret = FFA_RET_INVALID_PARAMETERS; @@ -381,21 +464,26 @@ static __always_inline void do_ffa_mem_xfer(const u64 func_id, } reg = (void *)buf + offset; - if (fraglen < offset + sizeof(struct ffa_composite_mem_region) + - reg->addr_range_cnt * - sizeof(struct ffa_mem_region_addr_range)) { + nr_ranges = ((void *)buf + fraglen) - (void *)reg->constituents; + if (nr_ranges % sizeof(reg->constituents[0])) { ret = FFA_RET_INVALID_PARAMETERS; goto out_unlock; } - ret = ffa_host_share_ranges(reg->constituents, reg->addr_range_cnt); + nr_ranges /= sizeof(reg->constituents[0]); + ret = ffa_host_share_ranges(reg->constituents, nr_ranges); if (ret) goto out_unlock; ffa_mem_xfer(res, func_id, len, fraglen); - if (res->a0 != FFA_SUCCESS) { - WARN_ON(ffa_host_unshare_ranges(reg->constituents, - reg->addr_range_cnt)); + if (fraglen != len) { + if (res->a0 != FFA_MEM_FRAG_RX) + goto err_unshare; + + if (res->a3 != fraglen) + goto err_unshare; + } else if (res->a0 != FFA_SUCCESS) { + goto err_unshare; } out_unlock: @@ -404,6 +492,10 @@ out: if (ret) ffa_to_smccc_res(res, ret); return; + +err_unshare: + WARN_ON(ffa_host_unshare_ranges(reg->constituents, nr_ranges)); + goto out_unlock; } static void do_ffa_mem_reclaim(struct arm_smccc_res *res, @@ -413,9 +505,9 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res, DECLARE_REG(u32, handle_hi, ctxt, 2); DECLARE_REG(u32, flags, ctxt, 3); struct ffa_composite_mem_region *reg; + u32 offset, len, fraglen, fragoff; struct ffa_mem_region *buf; int ret = 0; - u32 offset; u64 handle; handle = PACK_HANDLE(handle_lo, handle_hi); @@ -433,11 +525,8 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res, if (res->a0 != FFA_MEM_RETRIEVE_RESP) goto out_unlock; - /* Check for fragmentation */ - if (res->a1 != res->a2) { - ret = FFA_RET_ABORTED; - goto out_unlock; - } + len = res->a1; + fraglen = res->a2; offset = buf->ep_mem_access[0].composite_off; /* @@ -445,16 +534,36 @@ static void do_ffa_mem_reclaim(struct arm_smccc_res *res, * check that we end up with something that doesn't look _completely_ * bogus. */ - if (WARN_ON(offset > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) { + if (WARN_ON(offset > len || + fraglen > KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE)) { ret = FFA_RET_ABORTED; goto out_unlock; } - reg = (void *)buf + offset; + if (len > ffa_desc_buf.len) { + ret = FFA_RET_NO_MEMORY; + goto out_unlock; + } + + buf = ffa_desc_buf.buf; + memcpy(buf, hyp_buffers.rx, fraglen); + + for (fragoff = fraglen; fragoff < len; fragoff += fraglen) { + ffa_mem_frag_rx(res, handle_lo, handle_hi, fragoff); + if (res->a0 != FFA_MEM_FRAG_TX) { + ret = FFA_RET_INVALID_PARAMETERS; + goto out_unlock; + } + + fraglen = res->a3; + memcpy((void *)buf + fragoff, hyp_buffers.rx, fraglen); + } + ffa_mem_reclaim(res, handle_lo, handle_hi, flags); if (res->a0 != FFA_SUCCESS) goto out_unlock; + reg = (void *)buf + offset; /* If the SPMD was happy, then we should be too. */ WARN_ON(ffa_host_unshare_ranges(reg->constituents, reg->addr_range_cnt)); @@ -569,6 +678,9 @@ bool kvm_host_ffa_handler(struct kvm_cpu_context *host_ctxt) case FFA_FN64_MEM_LEND: do_ffa_mem_xfer(FFA_FN64_MEM_LEND, &res, host_ctxt); goto out_handled; + case FFA_MEM_FRAG_TX: + do_ffa_mem_frag_tx(&res, host_ctxt); + goto out_handled; } if (ffa_call_supported(func_id)) @@ -584,6 +696,7 @@ int hyp_ffa_init(void *pages) { struct arm_smccc_res res; size_t min_rxtx_sz; + void *tx, *rx; if (kvm_host_psci_config.smccc_version < ARM_SMCCC_VERSION_1_2) return 0; @@ -624,10 +737,21 @@ int hyp_ffa_init(void *pages) if (min_rxtx_sz > PAGE_SIZE) return -EOPNOTSUPP; + tx = pages; + pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; + rx = pages; + pages += KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE; + + ffa_desc_buf = (struct kvm_ffa_descriptor_buffer) { + .buf = pages, + .len = PAGE_SIZE * + (hyp_ffa_proxy_pages() - (2 * KVM_FFA_MBOX_NR_PAGES)), + }; + hyp_buffers = (struct kvm_ffa_buffers) { .lock = __HYP_SPIN_LOCK_UNLOCKED, - .tx = pages, - .rx = pages + (KVM_FFA_MBOX_NR_PAGES * PAGE_SIZE), + .tx = tx, + .rx = rx, }; host_buffers = (struct kvm_ffa_buffers) { -- cgit v1.2.3 From 5e50082c8c21cd32ef21f523c149939668954ab6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 5 Apr 2023 16:45:55 -0700 Subject: KVM: VMX: Inject #GP on ENCLS if vCPU has paging disabled (CR0.PG==0) Inject a #GP when emulating/forwarding a valid ENCLS leaf if the vCPU has paging disabled, e.g. if KVM is intercepting ECREATE to enforce additional restrictions. The pseudocode in the SDM lists all #GP triggers, including CR0.PG=0, as being checked after the ENLCS-exiting checks, i.e. the VM-Exit will occur before the CPU performs the CR0.PG check. Fixes: 70210c044b4e ("KVM: VMX: Add SGX ENCLS[ECREATE] handler to enforce CPUID restrictions") Cc: Binbin Wu Cc: Kai Huang Tested-by: Kai Huang Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230405234556.696927-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/sgx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 2261b684a7d4..137088a663ff 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -382,7 +382,7 @@ int handle_encls(struct kvm_vcpu *vcpu) if (!encls_leaf_enabled_in_guest(vcpu, leaf)) { kvm_queue_exception(vcpu, UD_VECTOR); - } else if (!sgx_enabled_in_guest_bios(vcpu)) { + } else if (!sgx_enabled_in_guest_bios(vcpu) || !is_paging(vcpu)) { kvm_inject_gp(vcpu, 0); } else { if (leaf == ECREATE) -- cgit v1.2.3 From c3a1e119a343a70a9f49689b8f18bb43f236d681 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 5 Apr 2023 16:45:56 -0700 Subject: KVM: VMX: Inject #GP, not #UD, if SGX2 ENCLS leafs are unsupported Per Intel's SDM, unsupported ENCLS leafs result in a #GP, not a #UD. SGX1 is a special snowflake as the SGX1 flag is used by the CPU as a "soft" disable, e.g. if software disables machine check reporting, i.e. having SGX but not SGX1 is effectively "SGX completely unsupported" and and thus #UDs. Fixes: 9798adbc04cf ("KVM: VMX: Frame in ENCLS handler for SGX virtualization") Cc: Binbin Wu Cc: Kai Huang Tested-by: Kai Huang Reviewed-by: Kai Huang Link: https://lore.kernel.org/r/20230405234556.696927-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/sgx.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 137088a663ff..3e822e582497 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -357,11 +357,12 @@ static int handle_encls_einit(struct kvm_vcpu *vcpu) static inline bool encls_leaf_enabled_in_guest(struct kvm_vcpu *vcpu, u32 leaf) { - if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX)) - return false; - + /* + * ENCLS generates a #UD if SGX1 isn't supported, i.e. this point will + * be reached if and only if the SGX1 leafs are enabled. + */ if (leaf >= ECREATE && leaf <= ETRACK) - return guest_cpuid_has(vcpu, X86_FEATURE_SGX1); + return true; if (leaf >= EAUG && leaf <= EMODT) return guest_cpuid_has(vcpu, X86_FEATURE_SGX2); @@ -380,9 +381,11 @@ int handle_encls(struct kvm_vcpu *vcpu) { u32 leaf = (u32)kvm_rax_read(vcpu); - if (!encls_leaf_enabled_in_guest(vcpu, leaf)) { + if (!enable_sgx || !guest_cpuid_has(vcpu, X86_FEATURE_SGX) || + !guest_cpuid_has(vcpu, X86_FEATURE_SGX1)) { kvm_queue_exception(vcpu, UD_VECTOR); - } else if (!sgx_enabled_in_guest_bios(vcpu) || !is_paging(vcpu)) { + } else if (!encls_leaf_enabled_in_guest(vcpu, leaf) || + !sgx_enabled_in_guest_bios(vcpu) || !is_paging(vcpu)) { kvm_inject_gp(vcpu, 0); } else { if (leaf == ECREATE) -- cgit v1.2.3 From 791a089861fced01dc6d8a67507d711839659fb8 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Jun 2023 18:19:20 -0700 Subject: KVM: SVM: Invoke trace_kvm_exit() for fastpath VM-Exits Move SVM's call to trace_kvm_exit() from the "slow" VM-Exit handler to svm_vcpu_run() so that KVM traces fastpath VM-Exits that re-enter the guest without bouncing through the slow path. This bug is benign in the current code base as KVM doesn't currently support any such exits on SVM. Fixes: a9ab13ff6e84 ("KVM: X86: Improve latency for single target IPI fastpath") Link: https://lore.kernel.org/r/20230602011920.787844-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 616ea6f55ae5..3a29273d070e 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3405,8 +3405,6 @@ static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; - trace_kvm_exit(vcpu, KVM_ISA_SVM); - /* SEV-ES guests must use the CR write traps to track CR registers. */ if (!sev_es_guest(vcpu->kvm)) { if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE)) @@ -4132,6 +4130,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) SVM_EXIT_EXCP_BASE + MC_VECTOR)) svm_handle_mce(vcpu); + trace_kvm_exit(vcpu, KVM_ISA_SVM); + svm_complete_interrupts(vcpu); if (is_guest_mode(vcpu)) -- cgit v1.2.3 From 22725266bdf95bddd01a23841097492489dfc9d9 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Thu, 18 May 2023 17:13:37 +0800 Subject: KVM: Fix comment for KVM_ENABLE_CAP Fix comment for vcpu ioctl version of KVM_ENABLE_CAP. KVM provides ioctl KVM_ENABLE_CAP to allow userspace to enable an extension which is not enabled by default. For vcpu ioctl version, it is available with the capability KVM_CAP_ENABLE_CAP. For vm ioctl version, it is available with the capability KVM_CAP_ENABLE_CAP_VM. Signed-off-by: Binbin Wu Link: https://lore.kernel.org/r/20230518091339.1102-2-binbin.wu@linux.intel.com Signed-off-by: Sean Christopherson --- include/uapi/linux/kvm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 737318b1c1d9..bddf2871db8f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1613,7 +1613,7 @@ struct kvm_s390_ucas_mapping { #define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs) #define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs) /* - * vcpu version available with KVM_ENABLE_CAP + * vcpu version available with KVM_CAP_ENABLE_CAP * vm version available with KVM_CAP_ENABLE_CAP_VM */ #define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap) -- cgit v1.2.3 From 06b66e050095db599f4f598ee627d1dc9c933018 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Thu, 18 May 2023 17:13:39 +0800 Subject: KVM: x86: Fix a typo in Documentation/virt/kvm/x86/mmu.rst L1 CR4.LA57 should be '0' instead of '1' when shadowing 5-level NPT for 4-level NPT L1 guest. Signed-off-by: Binbin Wu Link: https://lore.kernel.org/r/20230518091339.1102-4-binbin.wu@linux.intel.com Signed-off-by: Sean Christopherson --- Documentation/virt/kvm/x86/mmu.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/virt/kvm/x86/mmu.rst b/Documentation/virt/kvm/x86/mmu.rst index 8364afa228ec..26f62034b6f3 100644 --- a/Documentation/virt/kvm/x86/mmu.rst +++ b/Documentation/virt/kvm/x86/mmu.rst @@ -205,7 +205,7 @@ Shadow pages contain the following information: role.passthrough: The page is not backed by a guest page table, but its first entry points to one. This is set if NPT uses 5-level page tables (host - CR4.LA57=1) and is shadowing L1's 4-level NPT (L1 CR4.LA57=1). + CR4.LA57=1) and is shadowing L1's 4-level NPT (L1 CR4.LA57=0). gfn: Either the guest page table containing the translations shadowed by this page, or the base page frame for linear translations. See role.direct. -- cgit v1.2.3 From 78329084b86436b9a77cedd3e1331734c442d413 Mon Sep 17 00:00:00 2001 From: Ye Xingchen Date: Sat, 6 May 2023 17:10:30 +0800 Subject: RISC-V: KVM: use bitmap_zero() API bitmap_zero() is faster than bitmap_clear(), so use bitmap_zero() instead of bitmap_clear(). Signed-off-by: Ye Xingchen Reviewed-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/kvm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c index 0e5479600695..44bc324aeeb0 100644 --- a/arch/riscv/kvm/tlb.c +++ b/arch/riscv/kvm/tlb.c @@ -296,7 +296,7 @@ static void make_xfence_request(struct kvm *kvm, unsigned int actual_req = req; DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); - bitmap_clear(vcpu_mask, 0, KVM_MAX_VCPUS); + bitmap_zero(vcpu_mask, KVM_MAX_VCPUS); kvm_for_each_vcpu(i, vcpu, kvm) { if (hbase != -1UL) { if (vcpu->vcpu_id < hbase) -- cgit v1.2.3 From 19bff88e65b552f54c191e600498bd50ff47e1ef Mon Sep 17 00:00:00 2001 From: wchen Date: Sat, 20 May 2023 23:01:16 +0800 Subject: RISC-V: KVM: Redirect AMO load/store misaligned traps to guest The M-mode redirects an unhandled misaligned trap back to S-mode when not delegating it to VS-mode(hedeleg). However, KVM running in HS-mode terminates the VS-mode software when back from M-mode. The KVM should redirect the trap back to VS-mode, and let VS-mode trap handler decide the next step. Here is a way to handle misaligned traps in KVM, not only directing them to VS-mode or terminate it. Signed-off-by: wchen Reviewed-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/include/asm/csr.h | 2 ++ arch/riscv/kvm/vcpu_exit.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index b6acb7ed115f..917814a0f99e 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -82,7 +82,9 @@ #define EXC_INST_ACCESS 1 #define EXC_INST_ILLEGAL 2 #define EXC_BREAKPOINT 3 +#define EXC_LOAD_MISALIGNED 4 #define EXC_LOAD_ACCESS 5 +#define EXC_STORE_MISALIGNED 6 #define EXC_STORE_ACCESS 7 #define EXC_SYSCALL 8 #define EXC_HYPERVISOR_SYSCALL 9 diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c index 4ea101a73d8b..2415722c01b8 100644 --- a/arch/riscv/kvm/vcpu_exit.c +++ b/arch/riscv/kvm/vcpu_exit.c @@ -183,6 +183,8 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, run->exit_reason = KVM_EXIT_UNKNOWN; switch (trap->scause) { case EXC_INST_ILLEGAL: + case EXC_LOAD_MISALIGNED: + case EXC_STORE_MISALIGNED: if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) { kvm_riscv_vcpu_trap_redirect(vcpu, trap); ret = 1; -- cgit v1.2.3 From ae328dadf9852bc420880d48e341f528b7e0ce78 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 30 May 2023 19:50:22 +0200 Subject: RISC-V: KVM: Rename dis_idx to ext_idx Make the name of the extension_disabled[] index more general in order to expand its application. Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/kvm/vcpu_sbi.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index e52fde504433..6aa15f1b97d9 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -31,49 +31,49 @@ static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu = { #endif struct kvm_riscv_sbi_extension_entry { - enum KVM_RISCV_SBI_EXT_ID dis_idx; + enum KVM_RISCV_SBI_EXT_ID ext_idx; const struct kvm_vcpu_sbi_extension *ext_ptr; }; static const struct kvm_riscv_sbi_extension_entry sbi_ext[] = { { - .dis_idx = KVM_RISCV_SBI_EXT_V01, + .ext_idx = KVM_RISCV_SBI_EXT_V01, .ext_ptr = &vcpu_sbi_ext_v01, }, { - .dis_idx = KVM_RISCV_SBI_EXT_MAX, /* Can't be disabled */ + .ext_idx = KVM_RISCV_SBI_EXT_MAX, /* Can't be disabled */ .ext_ptr = &vcpu_sbi_ext_base, }, { - .dis_idx = KVM_RISCV_SBI_EXT_TIME, + .ext_idx = KVM_RISCV_SBI_EXT_TIME, .ext_ptr = &vcpu_sbi_ext_time, }, { - .dis_idx = KVM_RISCV_SBI_EXT_IPI, + .ext_idx = KVM_RISCV_SBI_EXT_IPI, .ext_ptr = &vcpu_sbi_ext_ipi, }, { - .dis_idx = KVM_RISCV_SBI_EXT_RFENCE, + .ext_idx = KVM_RISCV_SBI_EXT_RFENCE, .ext_ptr = &vcpu_sbi_ext_rfence, }, { - .dis_idx = KVM_RISCV_SBI_EXT_SRST, + .ext_idx = KVM_RISCV_SBI_EXT_SRST, .ext_ptr = &vcpu_sbi_ext_srst, }, { - .dis_idx = KVM_RISCV_SBI_EXT_HSM, + .ext_idx = KVM_RISCV_SBI_EXT_HSM, .ext_ptr = &vcpu_sbi_ext_hsm, }, { - .dis_idx = KVM_RISCV_SBI_EXT_PMU, + .ext_idx = KVM_RISCV_SBI_EXT_PMU, .ext_ptr = &vcpu_sbi_ext_pmu, }, { - .dis_idx = KVM_RISCV_SBI_EXT_EXPERIMENTAL, + .ext_idx = KVM_RISCV_SBI_EXT_EXPERIMENTAL, .ext_ptr = &vcpu_sbi_ext_experimental, }, { - .dis_idx = KVM_RISCV_SBI_EXT_VENDOR, + .ext_idx = KVM_RISCV_SBI_EXT_VENDOR, .ext_ptr = &vcpu_sbi_ext_vendor, }, }; @@ -147,7 +147,7 @@ static int riscv_vcpu_set_sbi_ext_single(struct kvm_vcpu *vcpu, return -EINVAL; for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) { - if (sbi_ext[i].dis_idx == reg_num) { + if (sbi_ext[i].ext_idx == reg_num) { sext = &sbi_ext[i]; break; } @@ -155,7 +155,7 @@ static int riscv_vcpu_set_sbi_ext_single(struct kvm_vcpu *vcpu, if (!sext) return -ENOENT; - scontext->extension_disabled[sext->dis_idx] = !reg_val; + scontext->extension_disabled[sext->ext_idx] = !reg_val; return 0; } @@ -172,7 +172,7 @@ static int riscv_vcpu_get_sbi_ext_single(struct kvm_vcpu *vcpu, return -EINVAL; for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) { - if (sbi_ext[i].dis_idx == reg_num) { + if (sbi_ext[i].ext_idx == reg_num) { sext = &sbi_ext[i]; break; } @@ -180,7 +180,7 @@ static int riscv_vcpu_get_sbi_ext_single(struct kvm_vcpu *vcpu, if (!sext) return -ENOENT; - *reg_val = !scontext->extension_disabled[sext->dis_idx]; + *reg_val = !scontext->extension_disabled[sext->ext_idx]; return 0; } @@ -315,8 +315,8 @@ const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext( sext = &sbi_ext[i]; if (sext->ext_ptr->extid_start <= extid && sext->ext_ptr->extid_end >= extid) { - if (sext->dis_idx < KVM_RISCV_SBI_EXT_MAX && - scontext->extension_disabled[sext->dis_idx]) + if (sext->ext_idx < KVM_RISCV_SBI_EXT_MAX && + scontext->extension_disabled[sext->ext_idx]) return NULL; return sbi_ext[i].ext_ptr; } -- cgit v1.2.3 From 9f9e3ebe111c7aabf18a4085648dcc58c568c1d6 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 30 May 2023 19:50:23 +0200 Subject: RISC-V: KVM: Convert extension_disabled[] to ext_status[] Change the boolean extension_disabled[] array to an array of enums, ext_status[]. For now, the enum only has two states, which correspond to the previous boolean states, so this patch has no intended functional change. The next patch will add another state, expanding the purpose of ext_status[]. Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_vcpu_sbi.h | 7 ++++++- arch/riscv/kvm/vcpu_sbi.c | 9 ++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h index 4278125a38a5..ad1588d6064b 100644 --- a/arch/riscv/include/asm/kvm_vcpu_sbi.h +++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h @@ -14,9 +14,14 @@ #define KVM_SBI_VERSION_MAJOR 1 #define KVM_SBI_VERSION_MINOR 0 +enum kvm_riscv_sbi_ext_status { + KVM_RISCV_SBI_EXT_AVAILABLE, + KVM_RISCV_SBI_EXT_UNAVAILABLE, +}; + struct kvm_vcpu_sbi_context { int return_handled; - bool extension_disabled[KVM_RISCV_SBI_EXT_MAX]; + enum kvm_riscv_sbi_ext_status ext_status[KVM_RISCV_SBI_EXT_MAX]; }; struct kvm_vcpu_sbi_return { diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index 6aa15f1b97d9..28e55ba023dc 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -155,7 +155,8 @@ static int riscv_vcpu_set_sbi_ext_single(struct kvm_vcpu *vcpu, if (!sext) return -ENOENT; - scontext->extension_disabled[sext->ext_idx] = !reg_val; + scontext->ext_status[sext->ext_idx] = reg_val ? + KVM_RISCV_SBI_EXT_AVAILABLE : KVM_RISCV_SBI_EXT_UNAVAILABLE; return 0; } @@ -180,7 +181,8 @@ static int riscv_vcpu_get_sbi_ext_single(struct kvm_vcpu *vcpu, if (!sext) return -ENOENT; - *reg_val = !scontext->extension_disabled[sext->ext_idx]; + *reg_val = scontext->ext_status[sext->ext_idx] == + KVM_RISCV_SBI_EXT_AVAILABLE; return 0; } @@ -316,7 +318,8 @@ const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext( if (sext->ext_ptr->extid_start <= extid && sext->ext_ptr->extid_end >= extid) { if (sext->ext_idx < KVM_RISCV_SBI_EXT_MAX && - scontext->extension_disabled[sext->ext_idx]) + scontext->ext_status[sext->ext_idx] == + KVM_RISCV_SBI_EXT_UNAVAILABLE) return NULL; return sbi_ext[i].ext_ptr; } -- cgit v1.2.3 From 95c99104cb42bea5a0874c362f284ae4b91289dd Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Tue, 30 May 2023 19:50:24 +0200 Subject: RISC-V: KVM: Probe for SBI extension status Rather than defaulting the status to available and allowing the user to set availability, default to uninitialized and only allow the user to set the status to unavailable. Then, when an extension is first used, ensure it is available by invoking its probe function, if it has one (an extension is assumed available if it doesn't have a probe function). Checking the status in kvm_vcpu_sbi_find_ext() ensures extension functions cannot be invoked when they're unavailable. Signed-off-by: Andrew Jones Reviewed-by: Anup Patel Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_vcpu_sbi.h | 1 + arch/riscv/kvm/vcpu_sbi.c | 51 ++++++++++++++++++++++++++--------- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h index ad1588d6064b..14ee03df252d 100644 --- a/arch/riscv/include/asm/kvm_vcpu_sbi.h +++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h @@ -15,6 +15,7 @@ #define KVM_SBI_VERSION_MINOR 0 enum kvm_riscv_sbi_ext_status { + KVM_RISCV_SBI_EXT_UNINITIALIZED, KVM_RISCV_SBI_EXT_AVAILABLE, KVM_RISCV_SBI_EXT_UNAVAILABLE, }; diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index 28e55ba023dc..b3e92c11e54f 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -155,8 +155,15 @@ static int riscv_vcpu_set_sbi_ext_single(struct kvm_vcpu *vcpu, if (!sext) return -ENOENT; - scontext->ext_status[sext->ext_idx] = reg_val ? - KVM_RISCV_SBI_EXT_AVAILABLE : KVM_RISCV_SBI_EXT_UNAVAILABLE; + /* + * We can't set the extension status to available here, since it may + * have a probe() function which needs to confirm availability first, + * but it may be too early to call that here. We can set the status to + * unavailable, though. + */ + if (!reg_val) + scontext->ext_status[sext->ext_idx] = + KVM_RISCV_SBI_EXT_UNAVAILABLE; return 0; } @@ -181,8 +188,15 @@ static int riscv_vcpu_get_sbi_ext_single(struct kvm_vcpu *vcpu, if (!sext) return -ENOENT; - *reg_val = scontext->ext_status[sext->ext_idx] == - KVM_RISCV_SBI_EXT_AVAILABLE; + /* + * If the extension status is still uninitialized, then we should probe + * to determine if it's available, but it may be too early to do that + * here. The best we can do is report that the extension has not been + * disabled, i.e. we return 1 when the extension is available and also + * when it only may be available. + */ + *reg_val = scontext->ext_status[sext->ext_idx] != + KVM_RISCV_SBI_EXT_UNAVAILABLE; return 0; } @@ -309,19 +323,32 @@ int kvm_riscv_vcpu_get_reg_sbi_ext(struct kvm_vcpu *vcpu, const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext( struct kvm_vcpu *vcpu, unsigned long extid) { - int i; - const struct kvm_riscv_sbi_extension_entry *sext; struct kvm_vcpu_sbi_context *scontext = &vcpu->arch.sbi_context; + const struct kvm_riscv_sbi_extension_entry *entry; + const struct kvm_vcpu_sbi_extension *ext; + int i; for (i = 0; i < ARRAY_SIZE(sbi_ext); i++) { - sext = &sbi_ext[i]; - if (sext->ext_ptr->extid_start <= extid && - sext->ext_ptr->extid_end >= extid) { - if (sext->ext_idx < KVM_RISCV_SBI_EXT_MAX && - scontext->ext_status[sext->ext_idx] == + entry = &sbi_ext[i]; + ext = entry->ext_ptr; + + if (ext->extid_start <= extid && ext->extid_end >= extid) { + if (entry->ext_idx >= KVM_RISCV_SBI_EXT_MAX || + scontext->ext_status[entry->ext_idx] == + KVM_RISCV_SBI_EXT_AVAILABLE) + return ext; + if (scontext->ext_status[entry->ext_idx] == KVM_RISCV_SBI_EXT_UNAVAILABLE) return NULL; - return sbi_ext[i].ext_ptr; + if (ext->probe && !ext->probe(vcpu)) { + scontext->ext_status[entry->ext_idx] = + KVM_RISCV_SBI_EXT_UNAVAILABLE; + return NULL; + } + + scontext->ext_status[entry->ext_idx] = + KVM_RISCV_SBI_EXT_AVAILABLE; + return ext; } } -- cgit v1.2.3 From 02f1b0b736606f9870595b3089d9c124f9da8be9 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Wed, 24 May 2023 14:16:32 +0800 Subject: KVM: x86: Correct the name for skipping VMENTER l1d flush There is no VMENTER_L1D_FLUSH_NESTED_VM. It should be ARCH_CAP_SKIP_VMENTRY_L1DFLUSH. Signed-off-by: Chao Gao Reviewed-by: Xiaoyao Li Link: https://lore.kernel.org/r/20230524061634.54141-3-chao.gao@intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5ad55ef71433..7c7be4815eaa 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1631,7 +1631,7 @@ static u64 kvm_get_arch_capabilities(void) * If we're doing cache flushes (either "always" or "cond") * we will do one whenever the guest does a vmlaunch/vmresume. * If an outer hypervisor is doing the cache flush for us - * (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that + * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that * capability to the guest too, and if EPT is disabled we're not * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will * require a nested hypervisor to do a flush of its own. -- cgit v1.2.3 From 6d1bc9754b04075d938b47cf7f7800814b8911a7 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Mon, 22 May 2023 18:12:48 +0200 Subject: KVM: SVM: enhance info printk's in SEV init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Let's print available ASID ranges for SEV/SEV-ES guests. This information can be useful for system administrator to debug if SEV/SEV-ES fails to enable. There are a few reasons. SEV: - NPT is disabled (module parameter) - CPU lacks some features (sev, decodeassists) - Maximum SEV ASID is 0 SEV-ES: - mmio_caching is disabled (module parameter) - CPU lacks sev_es feature - Minimum SEV ASID value is 1 (can be adjusted in BIOS/UEFI) Cc: Sean Christopherson Cc: Paolo Bonzini Cc: Stéphane Graber Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Suggested-by: Sean Christopherson Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20230522161249.800829-3-aleksandr.mikhalitsyn@canonical.com [sean: print '0' for min SEV-ES ASID if there are no available ASIDs] Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 69ae5e1b3120..d65578d8784d 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2219,7 +2219,6 @@ void __init sev_hardware_setup(void) if (misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)) goto out; - pr_info("SEV supported: %u ASIDs\n", sev_asid_count); sev_supported = true; /* SEV-ES support requested? */ @@ -2247,10 +2246,18 @@ void __init sev_hardware_setup(void) if (misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)) goto out; - pr_info("SEV-ES supported: %u ASIDs\n", sev_es_asid_count); sev_es_supported = true; out: + if (boot_cpu_has(X86_FEATURE_SEV)) + pr_info("SEV %s (ASIDs %u - %u)\n", + sev_supported ? "enabled" : "disabled", + min_sev_asid, max_sev_asid); + if (boot_cpu_has(X86_FEATURE_SEV_ES)) + pr_info("SEV-ES %s (ASIDs %u - %u)\n", + sev_es_supported ? "enabled" : "disabled", + min_sev_asid > 1 ? 1 : 0, min_sev_asid - 1); + sev_enabled = sev_supported; sev_es_enabled = sev_es_supported; #endif -- cgit v1.2.3 From 878940b33d7678e39a526ffe264ee025977dc67e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Jun 2023 18:15:16 -0700 Subject: KVM: VMX: Retry APIC-access page reload if invalidation is in-progress Re-request an APIC-access page reload if there is a relevant mmu_notifier invalidation in-progress when KVM retrieves the backing pfn, i.e. stall vCPUs until the backing pfn for the APIC-access page is "officially" stable. Relying on the primary MMU to not make changes after invoking ->invalidate_range() works, e.g. any additional changes to a PRESENT PTE would also trigger an ->invalidate_range(), but using ->invalidate_range() to fudge around KVM not honoring past and in-progress invalidations is a bit hacky. Honoring invalidations will allow using KVM's standard mmu_notifier hooks to detect APIC-access page reloads, which will in turn allow removing KVM's implementation of ->invalidate_range() (the APIC-access page case is a true one-off). Opportunistically add a comment to explain why doing nothing if a memslot isn't found is functionally correct. Suggested-by: Jason Gunthorpe Cc: Alistair Popple Cc: Robin Murphy Reviewed-by: Alistair Popple Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20230602011518.787006-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 50 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7ecab2118106..9ea4a5dfe62a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6714,7 +6714,12 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) { - struct page *page; + const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT; + struct kvm *kvm = vcpu->kvm; + struct kvm_memslots *slots = kvm_memslots(kvm); + struct kvm_memory_slot *slot; + unsigned long mmu_seq; + kvm_pfn_t pfn; /* Defer reload until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { @@ -6726,18 +6731,53 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) return; - page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) + /* + * Grab the memslot so that the hva lookup for the mmu_notifier retry + * is guaranteed to use the same memslot as the pfn lookup, i.e. rely + * on the pfn lookup's validation of the memslot to ensure a valid hva + * is used for the retry check. + */ + slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return; + + /* + * Ensure that the mmu_notifier sequence count is read before KVM + * retrieves the pfn from the primary MMU. Note, the memslot is + * protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb() + * in kvm_mmu_invalidate_end(). + */ + mmu_seq = kvm->mmu_invalidate_seq; + smp_rmb(); + + /* + * No need to retry if the memslot does not exist or is invalid. KVM + * controls the APIC-access page memslot, and only deletes the memslot + * if APICv is permanently inhibited, i.e. the memslot won't reappear. + */ + pfn = gfn_to_pfn_memslot(slot, gfn); + if (is_error_noslot_pfn(pfn)) return; - vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page)); + read_lock(&vcpu->kvm->mmu_lock); + if (mmu_invalidate_retry_hva(kvm, mmu_seq, + gfn_to_hva_memslot(slot, gfn))) { + kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); + read_unlock(&vcpu->kvm->mmu_lock); + goto out; + } + + vmcs_write64(APIC_ACCESS_ADDR, pfn_to_hpa(pfn)); + read_unlock(&vcpu->kvm->mmu_lock); + vmx_flush_tlb_current(vcpu); +out: /* * Do not pin apic access page in memory, the MMU notifier * will call us again if it is migrated or swapped out. */ - put_page(page); + kvm_release_pfn_clean(pfn); } static void vmx_hwapic_isr_update(int max_isr) -- cgit v1.2.3 From 0a8a5f2c8c266e9d94fb45f76a26cff135d0051c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Jun 2023 18:15:17 -0700 Subject: KVM: x86: Use standard mmu_notifier invalidate hooks for APIC access page Now that KVM honors past and in-progress mmu_notifier invalidations when reloading the APIC-access page, use KVM's "standard" invalidation hooks to trigger a reload and delete the one-off usage of invalidate_range(). Aside from eliminating one-off code in KVM, dropping KVM's use of invalidate_range() will allow common mmu_notifier to redefine the API to be more strictly focused on invalidating secondary TLBs that share the primary MMU's page tables. Suggested-by: Jason Gunthorpe Cc: Alistair Popple Cc: Robin Murphy Reviewed-by: Alistair Popple Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20230602011518.787006-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 3 +++ arch/x86/kvm/x86.c | 14 -------------- include/linux/kvm_host.h | 3 --- virt/kvm/kvm_main.c | 18 ------------------ 4 files changed, 3 insertions(+), 35 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c8961f45e3b1..01a11ce68e57 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1600,6 +1600,9 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) if (tdp_mmu_enabled) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); + if (range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) + kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); + return flush; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c0778ca39650..f962b7e3487e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10435,20 +10435,6 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors); } -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end) -{ - unsigned long apic_address; - - /* - * The physical address of apic access page is stored in the VMCS. - * Update it when it becomes invalid. - */ - apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (start <= apic_address && apic_address < end) - kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); -} - void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 0e571e973bc2..cb66f4100be7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2237,9 +2237,6 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, } #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end); - void kvm_arch_guest_memory_reclaimed(struct kvm *kvm); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 479802a892d4..f3c7c3c90161 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -154,11 +154,6 @@ static unsigned long long kvm_active_vms; static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask); -__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end) -{ -} - __weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) { } @@ -521,18 +516,6 @@ static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) return container_of(mn, struct kvm, mmu_notifier); } -static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - struct kvm *kvm = mmu_notifier_to_kvm(mn); - int idx; - - idx = srcu_read_lock(&kvm->srcu); - kvm_arch_mmu_notifier_invalidate_range(kvm, start, end); - srcu_read_unlock(&kvm->srcu, idx); -} - typedef bool (*hva_handler_t)(struct kvm *kvm, struct kvm_gfn_range *range); typedef void (*on_lock_fn_t)(struct kvm *kvm, unsigned long start, @@ -892,7 +875,6 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { - .invalidate_range = kvm_mmu_notifier_invalidate_range, .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, -- cgit v1.2.3 From 0a3869e14d4a5e1016aad6bc6c5b70f82bc0dbbe Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Jun 2023 18:15:18 -0700 Subject: KVM: x86/mmu: Trigger APIC-access page reload iff vendor code cares Request an APIC-access page reload when the backing page is migrated (or unmapped) if and only if vendor code actually plugs the backing pfn into structures that reside outside of KVM's MMU. This avoids kicking all vCPUs in the (hopefully infrequent) scenario where the backing page is migrated/invalidated. Unlike VMX's APICv, SVM's AVIC doesn't plug the backing pfn directly into the VMCB and so doesn't need a hook to invalidate an out-of-MMU "mapping". Reviewed-by: Paolo Bonzini Link: https://lore.kernel.org/r/20230602011518.787006-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 01a11ce68e57..beb507d82adf 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1600,7 +1600,8 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) if (tdp_mmu_enabled) flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush); - if (range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) + if (kvm_x86_ops.set_apic_access_page_addr && + range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); return flush; -- cgit v1.2.3 From 5f643e460ab1298a32b7d0db104bfcab9d6165c0 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 5 Jun 2023 13:44:19 +0200 Subject: KVM: Clean up kvm_vm_ioctl_create_vcpu() Since c9d601548603 ("KVM: allow KVM_BUG/KVM_BUG_ON to handle 64-bit cond") 'cond' is internally converted to boolean, so caller's explicit conversion from void* is unnecessary. Remove the double bang. Signed-off-by: Michal Luczaj Reviewed-by: Yuan Yao base-commit: 76a17bf03a268bc342e08c05d8ddbe607d294eb4 Link: https://lore.kernel.org/r/20230605114852.288964-1-mhal@rbox.co Signed-off-by: Sean Christopherson --- virt/kvm/kvm_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6a658f30af91..64dd940c549e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3975,7 +3975,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id) if (r < 0) goto kvm_put_xa_release; - if (KVM_BUG_ON(!!xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) { + if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, 0), kvm)) { r = -EINVAL; goto kvm_put_xa_release; } -- cgit v1.2.3 From 056b9919a16aedc560e6756a235ef5a62a68db05 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Jun 2023 18:05:50 -0700 Subject: KVM: x86: Use cpu_feature_enabled() for PKU instead of #ifdef Replace an #ifdef on CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS with a cpu_feature_enabled() check on X86_FEATURE_PKU. The macro magic of DISABLED_MASK_BIT_SET() means that cpu_feature_enabled() provides the same end result (no code generated) when PKU is disabled by Kconfig. No functional change intended. Cc: Jon Kohler Reviewed-by: Jon Kohler Link: https://lore.kernel.org/r/20230602010550.785722-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7c7be4815eaa..6f461afed800 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1017,13 +1017,11 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS - if (static_cpu_has(X86_FEATURE_PKU) && + if (cpu_feature_enabled(X86_FEATURE_PKU) && vcpu->arch.pkru != vcpu->arch.host_pkru && ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) write_pkru(vcpu->arch.pkru); -#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ } EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); @@ -1032,15 +1030,13 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) if (vcpu->arch.guest_state_protected) return; -#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS - if (static_cpu_has(X86_FEATURE_PKU) && + if (cpu_feature_enabled(X86_FEATURE_PKU) && ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) { vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vcpu->arch.host_pkru) write_pkru(vcpu->arch.host_pkru); } -#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) { -- cgit v1.2.3 From e12fa4b92a07f4274ffa02e581d059a7869dd50e Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 5 Jun 2023 22:01:21 +0200 Subject: KVM: x86: Clean up: remove redundant bool conversions As test_bit() returns bool, explicitly converting result to bool is unnecessary. Get rid of '!!'. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Michal Luczaj Link: https://lore.kernel.org/r/20230605200158.118109-1-mhal@rbox.co Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/x86.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 924609d63f8a..02183ebc0e16 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -752,7 +752,7 @@ static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) BUG_ON(offset == MSR_INVALID); - return !!test_bit(bit_write, &tmp); + return test_bit(bit_write, &tmp); } static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6f461afed800..355f3df8dbad 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1805,7 +1805,7 @@ bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) unsigned long *bitmap = ranges[i].bitmap; if ((index >= start) && (index < end) && (flags & type)) { - allowed = !!test_bit(index - start, bitmap); + allowed = test_bit(index - start, bitmap); break; } } -- cgit v1.2.3 From d4ec586c60ab978554245c58cf432df444c93b4e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 6 Jun 2023 17:12:26 -0700 Subject: KVM: selftests: Allow specify physical cpu list in demand paging test Mimic the dirty log test and allow the user to pin demand paging test tasks to physical CPUs. Put the help message into a general helper as suggested by Sean. Signed-off-by: Peter Xu [sean: rebase, tweak arg ordering, add "print" to helper, print program name] Link: https://lore.kernel.org/r/20230607001226.1398889-1-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/demand_paging_test.c | 15 +++++++++++++-- tools/testing/selftests/kvm/dirty_log_perf_test.c | 12 +----------- tools/testing/selftests/kvm/include/kvm_util_base.h | 1 + tools/testing/selftests/kvm/lib/kvm_util.c | 17 +++++++++++++++++ 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index 9c18686b4f63..09c116a82a84 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -208,10 +208,11 @@ static void help(char *name) { puts(""); printf("usage: %s [-h] [-m vm_mode] [-u uffd_mode] [-d uffd_delay_usec]\n" - " [-b memory] [-s type] [-v vcpus] [-o]\n", name); + " [-b memory] [-s type] [-v vcpus] [-c cpu_list] [-o]\n", name); guest_modes_help(); printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n" " UFFD registration mode: 'MISSING' or 'MINOR'.\n"); + kvm_print_vcpu_pinning_help(); printf(" -d: add a delay in usec to the User Fault\n" " FD handler to simulate demand paging\n" " overheads. Ignored without -u.\n"); @@ -229,6 +230,7 @@ static void help(char *name) int main(int argc, char *argv[]) { int max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); + const char *cpulist = NULL; struct test_params p = { .src_type = DEFAULT_VM_MEM_SRC, .partition_vcpu_memory_access = true, @@ -237,7 +239,7 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hm:u:d:b:s:v:o")) != -1) { + while ((opt = getopt(argc, argv, "hm:u:d:b:s:v:c:o")) != -1) { switch (opt) { case 'm': guest_modes_cmdline(optarg); @@ -264,6 +266,9 @@ int main(int argc, char *argv[]) TEST_ASSERT(nr_vcpus <= max_vcpus, "Invalid number of vcpus, must be between 1 and %d", max_vcpus); break; + case 'c': + cpulist = optarg; + break; case 'o': p.partition_vcpu_memory_access = false; break; @@ -279,6 +284,12 @@ int main(int argc, char *argv[]) TEST_FAIL("userfaultfd MINOR mode requires shared memory; pick a different -s"); } + if (cpulist) { + kvm_parse_vcpu_pinning(cpulist, memstress_args.vcpu_to_pcpu, + nr_vcpus); + memstress_args.pin_vcpus = true; + } + for_each_guest_mode(run_test, &p); return 0; diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 416719e20518..d374dbcf9a53 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -332,17 +332,7 @@ static void help(char *name) " so -w X means each page has an X%% chance of writing\n" " and a (100-X)%% chance of reading.\n" " (default: 100 i.e. all pages are written to.)\n"); - printf(" -c: Pin tasks to physical CPUs. Takes a list of comma separated\n" - " values (target pCPU), one for each vCPU, plus an optional\n" - " entry for the main application task (specified via entry\n" - " ). If used, entries must be provided for all\n" - " vCPUs, i.e. pinning vCPUs is all or nothing.\n\n" - " E.g. to create 3 vCPUs, pin vCPU0=>pCPU22, vCPU1=>pCPU23,\n" - " vCPU2=>pCPU24, and pin the application task to pCPU50:\n\n" - " ./dirty_log_perf_test -v 3 -c 22,23,24,50\n\n" - " To leave the application task unpinned, drop the final entry:\n\n" - " ./dirty_log_perf_test -v 3 -c 22,23,24\n\n" - " (default: no pinning)\n"); + kvm_print_vcpu_pinning_help(); puts(""); exit(0); } diff --git a/tools/testing/selftests/kvm/include/kvm_util_base.h b/tools/testing/selftests/kvm/include/kvm_util_base.h index a089c356f354..07732a157ccd 100644 --- a/tools/testing/selftests/kvm/include/kvm_util_base.h +++ b/tools/testing/selftests/kvm/include/kvm_util_base.h @@ -733,6 +733,7 @@ static inline struct kvm_vm *vm_create_with_one_vcpu(struct kvm_vcpu **vcpu, struct kvm_vcpu *vm_recreate_with_one_vcpu(struct kvm_vm *vm); void kvm_pin_this_task_to_pcpu(uint32_t pcpu); +void kvm_print_vcpu_pinning_help(void); void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], int nr_vcpus); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 298c4372fb1a..9741a7ff6380 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -494,6 +494,23 @@ static uint32_t parse_pcpu(const char *cpu_str, const cpu_set_t *allowed_mask) return pcpu; } +void kvm_print_vcpu_pinning_help(void) +{ + const char *name = program_invocation_name; + + printf(" -c: Pin tasks to physical CPUs. Takes a list of comma separated\n" + " values (target pCPU), one for each vCPU, plus an optional\n" + " entry for the main application task (specified via entry\n" + " ). If used, entries must be provided for all\n" + " vCPUs, i.e. pinning vCPUs is all or nothing.\n\n" + " E.g. to create 3 vCPUs, pin vCPU0=>pCPU22, vCPU1=>pCPU23,\n" + " vCPU2=>pCPU24, and pin the application task to pCPU50:\n\n" + " %s -v 3 -c 22,23,24,50\n\n" + " To leave the application task unpinned, drop the final entry:\n\n" + " %s -v 3 -c 22,23,24\n\n" + " (default: no pinning)\n", name, name); +} + void kvm_parse_vcpu_pinning(const char *pcpus_string, uint32_t vcpu_to_pcpu[], int nr_vcpus) { -- cgit v1.2.3 From 53550b89220beda42934a76a61313ecf308b2b03 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 2 Jun 2023 18:10:47 -0700 Subject: KVM: x86/pmu: Rename global_ovf_ctrl_mask to global_status_mask Rename global_ovf_ctrl_mask to global_status_mask to avoid confusion now that Intel has renamed GLOBAL_OVF_CTRL to GLOBAL_STATUS_RESET in PMU v4. GLOBAL_OVF_CTRL and GLOBAL_STATUS_RESET are the same MSR index, i.e. are just different names for the same thing, but the SDM provides different entries in the IA-32 Architectural MSRs table, which gets really confusing when looking at PMU v4 definitions since it *looks* like GLOBAL_STATUS has bits that don't exist in GLOBAL_OVF_CTRL, but in reality the bits are simply defined in the GLOBAL_STATUS_RESET entry. No functional change intended. Cc: Like Xu Link: https://lore.kernel.org/r/20230603011058.1038821-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/vmx/pmu_intel.c | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fb9d1f2d6136..28bd38303d70 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -523,7 +523,7 @@ struct kvm_pmu { u64 global_status; u64 counter_bitmask[2]; u64 global_ctrl_mask; - u64 global_ovf_ctrl_mask; + u64 global_status_mask; u64 reserved_bits; u64 raw_event_mask; struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 741efe2c497b..fb96cbfc9ae8 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -427,7 +427,11 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } break; case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - if (data & pmu->global_ovf_ctrl_mask) + /* + * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in + * GLOBAL_STATUS, and so the set of reserved bits is the same. + */ + if (data & pmu->global_status_mask) return 1; if (!msr_info->host_initiated) @@ -531,7 +535,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->reserved_bits = 0xffffffff00200000ull; pmu->raw_event_mask = X86_RAW_EVENT_MASK; pmu->global_ctrl_mask = ~0ull; - pmu->global_ovf_ctrl_mask = ~0ull; + pmu->global_status_mask = ~0ull; pmu->fixed_ctr_ctrl_mask = ~0ull; pmu->pebs_enable_mask = ~0ull; pmu->pebs_data_cfg_mask = ~0ull; @@ -585,11 +589,17 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) counter_mask = ~(((1ull << pmu->nr_arch_gp_counters) - 1) | (((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED)); pmu->global_ctrl_mask = counter_mask; - pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask + + /* + * GLOBAL_STATUS and GLOBAL_OVF_CONTROL (a.k.a. GLOBAL_STATUS_RESET) + * share reserved bit definitions. The kernel just happens to use + * OVF_CTRL for the names. + */ + pmu->global_status_mask = pmu->global_ctrl_mask & ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF | MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD); if (vmx_pt_mode_is_host_guest()) - pmu->global_ovf_ctrl_mask &= + pmu->global_status_mask &= ~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI; entry = kvm_find_cpuid_entry_index(vcpu, 7, 0); -- cgit v1.2.3 From 8de18543dfe34a6aac9deefb0256db2609cfa351 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:48 -0700 Subject: KVM: x86/pmu: Move reprogram_counters() to pmu.h Move reprogram_counters() out of Intel specific PMU code and into pmu.h so that it can be used to implement AMD PMU v2 support. No functional change intended. Suggested-by: Sean Christopherson Signed-off-by: Like Xu [sean: rewrite changelog] Link: https://lore.kernel.org/r/20230603011058.1038821-3-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.h | 12 ++++++++++++ arch/x86/kvm/vmx/pmu_intel.c | 12 ------------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 5c7bbf03b599..986563aeeef8 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -201,6 +201,18 @@ static inline void kvm_pmu_request_counter_reprogram(struct kvm_pmc *pmc) kvm_make_request(KVM_REQ_PMU, pmc->vcpu); } +static inline void reprogram_counters(struct kvm_pmu *pmu, u64 diff) +{ + int bit; + + if (!diff) + return; + + for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + set_bit(bit, pmu->reprogram_pmi); + kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); +} + void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index fb96cbfc9ae8..edcf8670eb4e 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -73,18 +73,6 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) } } -static void reprogram_counters(struct kvm_pmu *pmu, u64 diff) -{ - int bit; - - if (!diff) - return; - - for_each_set_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) - set_bit(bit, pmu->reprogram_pmi); - kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); -} - static bool intel_hw_event_available(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); -- cgit v1.2.3 From 30dab5c0b65ea580878f36d6ae7ff85f06813387 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:49 -0700 Subject: KVM: x86/pmu: Reject userspace attempts to set reserved GLOBAL_STATUS bits Reject userspace writes to MSR_CORE_PERF_GLOBAL_STATUS that attempt to set reserved bits. Allowing userspace to stuff reserved bits doesn't harm KVM itself, but it's architecturally wrong and the guest can't clear the unsupported bits, e.g. makes the guest's PMI handler very confused. Signed-off-by: Like Xu [sean: rewrite changelog to avoid use of #GP, rebase on name change] Link: https://lore.kernel.org/r/20230603011058.1038821-4-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/pmu_intel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index edcf8670eb4e..efd113f24c1b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -402,6 +402,9 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) return 1; /* RO MSR */ + if (data & pmu->global_status_mask) + return 1; + pmu->global_status = data; break; case MSR_CORE_PERF_GLOBAL_CTRL: -- cgit v1.2.3 From c85cdc1cc1ea27573ab15c76f13a06f12530aa54 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:50 -0700 Subject: KVM: x86/pmu: Move handling PERF_GLOBAL_CTRL and friends to common x86 Move the handling of GLOBAL_CTRL, GLOBAL_STATUS, and GLOBAL_OVF_CTRL, a.k.a. GLOBAL_STATUS_RESET, from Intel PMU code to generic x86 PMU code. AMD PerfMonV2 defines three registers that have the same semantics as Intel's variants, just with different names and indices. Conveniently, since KVM virtualizes GLOBAL_CTRL on Intel only for PMU v2 and above, and AMD's version shows up in v2, KVM can use common code for the existence check as well. Signed-off-by: Like Xu Co-developed-by: Sean Christopherson Link: https://lore.kernel.org/r/20230603011058.1038821-5-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 71 ++++++++++++++++++++++++++++++++++++++++++-- arch/x86/kvm/pmu.h | 14 +++++++++ arch/x86/kvm/vmx/nested.c | 4 +-- arch/x86/kvm/vmx/pmu_intel.c | 47 ++--------------------------- arch/x86/kvm/vmx/vmx.h | 12 -------- 5 files changed, 86 insertions(+), 62 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 1690d41c1830..c720cc186ab4 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -562,6 +562,14 @@ void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu) bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)); + default: + break; + } return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) || static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr); } @@ -577,13 +585,70 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr) int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u32 msr = msr_info->index; + + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + msr_info->data = pmu->global_status; + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + msr_info->data = pmu->global_ctrl; + break; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + msr_info->data = 0; + break; + default: + return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info); + } + + return 0; } int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { - kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); - return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + u32 msr = msr_info->index; + u64 data = msr_info->data; + u64 diff; + + switch (msr) { + case MSR_CORE_PERF_GLOBAL_STATUS: + if (!msr_info->host_initiated) + return 1; /* RO MSR */ + + if (data & pmu->global_status_mask) + return 1; + + pmu->global_status = data; + break; + case MSR_CORE_PERF_GLOBAL_CTRL: + if (!kvm_valid_perf_global_ctrl(pmu, data)) + return 1; + + if (pmu->global_ctrl != data) { + diff = pmu->global_ctrl ^ data; + pmu->global_ctrl = data; + reprogram_counters(pmu, diff); + } + break; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + /* + * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in + * GLOBAL_STATUS, and so the set of reserved bits is the same. + */ + if (data & pmu->global_status_mask) + return 1; + + if (!msr_info->host_initiated) + pmu->global_status &= ~data; + break; + default: + kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index); + return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info); + } + + return 0; } /* refresh PMU settings. This function generally is called when underlying diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 986563aeeef8..7c2c64142443 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -41,6 +41,20 @@ struct kvm_pmu_ops { void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); +static inline bool kvm_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) +{ + /* + * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is + * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is + * greater than zero. However, KVM only exposes and emulates the MSR + * to/for the guest if the guest PMU supports at least "Architectural + * Performance Monitoring Version 2". + * + * AMD's version of PERF_GLOBAL_CTRL conveniently shows up with v2. + */ + return pmu->version > 1; +} + static inline u64 pmc_bitmask(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index e35cf0bd0df9..ba2ed6d87364 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2649,7 +2649,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) && - intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && + kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu)) && WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->guest_ia32_perf_global_ctrl))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; @@ -4524,7 +4524,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, vcpu->arch.pat = vmcs12->host_ia32_pat; } if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) && - intel_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) + kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu))) WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL, vmcs12->host_ia32_perf_global_ctrl)); diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index efd113f24c1b..ff2f52d1e22f 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -100,7 +100,7 @@ static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) { struct kvm_pmu *pmu = pmc_to_pmu(pmc); - if (!intel_pmu_has_perf_global_ctrl(pmu)) + if (!kvm_pmu_has_perf_global_ctrl(pmu)) return true; return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); @@ -186,11 +186,7 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) switch (msr) { case MSR_CORE_PERF_FIXED_CTR_CTRL: - case MSR_CORE_PERF_GLOBAL_STATUS: - case MSR_CORE_PERF_GLOBAL_CTRL: - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - return intel_pmu_has_perf_global_ctrl(pmu); - break; + return kvm_pmu_has_perf_global_ctrl(pmu); case MSR_IA32_PEBS_ENABLE: ret = vcpu_get_perf_capabilities(vcpu) & PERF_CAP_PEBS_FORMAT; break; @@ -340,15 +336,6 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_FIXED_CTR_CTRL: msr_info->data = pmu->fixed_ctr_ctrl; break; - case MSR_CORE_PERF_GLOBAL_STATUS: - msr_info->data = pmu->global_status; - break; - case MSR_CORE_PERF_GLOBAL_CTRL: - msr_info->data = pmu->global_ctrl; - break; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - msr_info->data = 0; - break; case MSR_IA32_PEBS_ENABLE: msr_info->data = pmu->pebs_enable; break; @@ -398,36 +385,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (pmu->fixed_ctr_ctrl != data) reprogram_fixed_counters(pmu, data); break; - case MSR_CORE_PERF_GLOBAL_STATUS: - if (!msr_info->host_initiated) - return 1; /* RO MSR */ - - if (data & pmu->global_status_mask) - return 1; - - pmu->global_status = data; - break; - case MSR_CORE_PERF_GLOBAL_CTRL: - if (!kvm_valid_perf_global_ctrl(pmu, data)) - return 1; - - if (pmu->global_ctrl != data) { - diff = pmu->global_ctrl ^ data; - pmu->global_ctrl = data; - reprogram_counters(pmu, diff); - } - break; - case MSR_CORE_PERF_GLOBAL_OVF_CTRL: - /* - * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in - * GLOBAL_STATUS, and so the set of reserved bits is the same. - */ - if (data & pmu->global_status_mask) - return 1; - - if (!msr_info->host_initiated) - pmu->global_status &= ~data; - break; case MSR_IA32_PEBS_ENABLE: if (data & pmu->pebs_enable_mask) return 1; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 9e66531861cf..32384ba38499 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -93,18 +93,6 @@ union vmx_exit_reason { u32 full; }; -static inline bool intel_pmu_has_perf_global_ctrl(struct kvm_pmu *pmu) -{ - /* - * Architecturally, Intel's SDM states that IA32_PERF_GLOBAL_CTRL is - * supported if "CPUID.0AH: EAX[7:0] > 0", i.e. if the PMU version is - * greater than zero. However, KVM only exposes and emulates the MSR - * to/for the guest if the guest PMU supports at least "Architectural - * Performance Monitoring Version 2". - */ - return pmu->version > 1; -} - struct lbr_desc { /* Basic info about guest LBR records. */ struct x86_pmu_lbr records; -- cgit v1.2.3 From 13afa29ae489d9b7c1038179f1e2bec74873e471 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:51 -0700 Subject: KVM: x86/pmu: Provide Intel PMU's pmc_is_enabled() as generic x86 code Move the Intel PMU implementation of pmc_is_enabled() to common x86 code as pmc_is_globally_enabled(), and drop AMD's implementation. AMD PMU currently supports only v1, and thus not PERF_GLOBAL_CONTROL, thus the semantics for AMD are unchanged. And when support for AMD PMU v2 comes along, the common behavior will also Just Work. Signed-off-by: Like Xu Co-developed-by: Sean Christopherson Link: https://lore.kernel.org/r/20230603011058.1038821-6-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-pmu-ops.h | 1 - arch/x86/kvm/pmu.c | 5 ----- arch/x86/kvm/pmu.h | 16 +++++++++++++++- arch/x86/kvm/svm/pmu.c | 9 --------- arch/x86/kvm/vmx/pmu_intel.c | 14 +------------- 5 files changed, 16 insertions(+), 29 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h index c17e3e96fc1d..6c98f4bb4228 100644 --- a/arch/x86/include/asm/kvm-x86-pmu-ops.h +++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h @@ -13,7 +13,6 @@ BUILD_BUG_ON(1) * at the call sites. */ KVM_X86_PMU_OP(hw_event_available) -KVM_X86_PMU_OP(pmc_is_enabled) KVM_X86_PMU_OP(pmc_idx_to_pmc) KVM_X86_PMU_OP(rdpmc_ecx_to_pmc) KVM_X86_PMU_OP(msr_idx_to_pmc) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index c720cc186ab4..4315f46aabfb 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -93,11 +93,6 @@ void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops) #undef __KVM_X86_PMU_OP } -static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) -{ - return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc); -} - static void kvm_pmi_trigger_fn(struct irq_work *irq_work) { struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work); diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 7c2c64142443..f77bfc7ede42 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -20,7 +20,6 @@ struct kvm_pmu_ops { bool (*hw_event_available)(struct kvm_pmc *pmc); - bool (*pmc_is_enabled)(struct kvm_pmc *pmc); struct kvm_pmc *(*pmc_idx_to_pmc)(struct kvm_pmu *pmu, int pmc_idx); struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask); @@ -227,6 +226,21 @@ static inline void reprogram_counters(struct kvm_pmu *pmu, u64 diff) kvm_make_request(KVM_REQ_PMU, pmu_to_vcpu(pmu)); } +/* + * Check if a PMC is enabled by comparing it against global_ctrl bits. + * + * If the vPMU doesn't have global_ctrl MSR, all vPMCs are enabled. + */ +static inline bool pmc_is_globally_enabled(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = pmc_to_pmu(pmc); + + if (!kvm_pmu_has_perf_global_ctrl(pmu)) + return true; + + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); +} + void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 5fa939e411d8..70143275e0a7 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -78,14 +78,6 @@ static bool amd_hw_event_available(struct kvm_pmc *pmc) return true; } -/* check if a PMC is enabled by comparing it against global_ctrl bits. Because - * AMD CPU doesn't have global_ctrl MSR, all PMCs are enabled (return TRUE). - */ -static bool amd_pmc_is_enabled(struct kvm_pmc *pmc) -{ - return true; -} - static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -220,7 +212,6 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) struct kvm_pmu_ops amd_pmu_ops __initdata = { .hw_event_available = amd_hw_event_available, - .pmc_is_enabled = amd_pmc_is_enabled, .pmc_idx_to_pmc = amd_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = amd_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = amd_msr_idx_to_pmc, diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index ff2f52d1e22f..b2f279f934b1 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -95,17 +95,6 @@ static bool intel_hw_event_available(struct kvm_pmc *pmc) return true; } -/* check if a PMC is enabled by comparing it with globl_ctrl bits. */ -static bool intel_pmc_is_enabled(struct kvm_pmc *pmc) -{ - struct kvm_pmu *pmu = pmc_to_pmu(pmc); - - if (!kvm_pmu_has_perf_global_ctrl(pmu)) - return true; - - return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); -} - static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -759,7 +748,7 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) pmc = intel_pmc_idx_to_pmc(pmu, bit); if (!pmc || !pmc_speculative_in_use(pmc) || - !intel_pmc_is_enabled(pmc) || !pmc->perf_event) + !pmc_is_globally_enabled(pmc) || !pmc->perf_event) continue; /* @@ -774,7 +763,6 @@ void intel_pmu_cross_mapped_check(struct kvm_pmu *pmu) struct kvm_pmu_ops intel_pmu_ops __initdata = { .hw_event_available = intel_hw_event_available, - .pmc_is_enabled = intel_pmc_is_enabled, .pmc_idx_to_pmc = intel_pmc_idx_to_pmc, .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc, .msr_idx_to_pmc = intel_msr_idx_to_pmc, -- cgit v1.2.3 From 6593039d33c119968b9e2d85eaca853b391e8a55 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:52 -0700 Subject: KVM: x86: Explicitly zero cpuid "0xa" leaf when PMU is disabled Add an explicit !enable_pmu check as relying on kvm_pmu_cap to be zeroed isn't obvious. Although when !enable_pmu, KVM will have zero-padded kvm_pmu_cap to do subsequent CPUID leaf assignments. Suggested-by: Sean Christopherson Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20230603011058.1038821-7-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0c9660a07b23..61bc71882f07 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -948,7 +948,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) union cpuid10_eax eax; union cpuid10_edx edx; - if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { + if (!enable_pmu || !static_cpu_has(X86_FEATURE_ARCH_PERFMON)) { entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } -- cgit v1.2.3 From 6a08083f294cb623fbc882417b28c5646b01f4bf Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:53 -0700 Subject: KVM: x86/pmu: Disable vPMU if the minimum num of counters isn't met Disable PMU support when running on AMD and perf reports fewer than four general purpose counters. All AMD PMUs must define at least four counters due to AMD's legacy architecture hardcoding the number of counters without providing a way to enumerate the number of counters to software, e.g. from AMD's APM: The legacy architecture defines four performance counters (PerfCtrn) and corresponding event-select registers (PerfEvtSeln). Virtualizing fewer than four counters can lead to guest instability as software expects four counters to be available. Rather than bleed AMD details into the common code, just define a const unsigned int and provide a convenient location to document why Intel and AMD have different mins (in particular, AMD's lack of any way to enumerate less than four counters to the guest). Keep the minimum number of counters at Intel at one, even though old P6 and Core Solo/Duo processor effectively require a minimum of two counters. KVM can, and more importantly has up until this point, supported a vPMU so long as the CPU has at least one counter. Perf's support for P6/Core CPUs does require two counters, but perf will happily chug along with a single counter when running on a modern CPU. Cc: Jim Mattson Suggested-by: Sean Christopherson Signed-off-by: Like Xu [sean: set Intel min to '1', not '2'] Link: https://lore.kernel.org/r/20230603011058.1038821-8-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.h | 14 ++++++++++---- arch/x86/kvm/svm/pmu.c | 1 + arch/x86/kvm/vmx/pmu_intel.c | 1 + 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index f77bfc7ede42..7d9ba301c090 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -36,6 +36,7 @@ struct kvm_pmu_ops { const u64 EVENTSEL_EVENT; const int MAX_NR_GP_COUNTERS; + const int MIN_NR_GP_COUNTERS; }; void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops); @@ -174,6 +175,7 @@ extern struct x86_pmu_capability kvm_pmu_cap; static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) { bool is_intel = boot_cpu_data.x86_vendor == X86_VENDOR_INTEL; + int min_nr_gp_ctrs = pmu_ops->MIN_NR_GP_COUNTERS; /* * Hybrid PMUs don't play nice with virtualization without careful @@ -188,11 +190,15 @@ static inline void kvm_init_pmu_capability(const struct kvm_pmu_ops *pmu_ops) perf_get_x86_pmu_capability(&kvm_pmu_cap); /* - * For Intel, only support guest architectural pmu - * on a host with architectural pmu. + * WARN if perf did NOT disable hardware PMU if the number of + * architecturally required GP counters aren't present, i.e. if + * there are a non-zero number of counters, but fewer than what + * is architecturally required. */ - if ((is_intel && !kvm_pmu_cap.version) || - !kvm_pmu_cap.num_counters_gp) + if (!kvm_pmu_cap.num_counters_gp || + WARN_ON_ONCE(kvm_pmu_cap.num_counters_gp < min_nr_gp_ctrs)) + enable_pmu = false; + else if (is_intel && !kvm_pmu_cap.version) enable_pmu = false; } diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 70143275e0a7..e5c69062a909 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -224,4 +224,5 @@ struct kvm_pmu_ops amd_pmu_ops __initdata = { .reset = amd_pmu_reset, .EVENTSEL_EVENT = AMD64_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_AMD_PMC_MAX_GENERIC, + .MIN_NR_GP_COUNTERS = AMD64_NUM_COUNTERS, }; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index b2f279f934b1..30ec9ccdea47 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -777,4 +777,5 @@ struct kvm_pmu_ops intel_pmu_ops __initdata = { .cleanup = intel_pmu_cleanup, .EVENTSEL_EVENT = ARCH_PERFMON_EVENTSEL_EVENT, .MAX_NR_GP_COUNTERS = KVM_INTEL_PMC_MAX_GENERIC, + .MIN_NR_GP_COUNTERS = 1, }; -- cgit v1.2.3 From d338d8789e64b2d8ed2f82f9364c415d6efa118d Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:54 -0700 Subject: KVM: x86/pmu: Advertise PERFCTR_CORE iff the min nr of counters is met Enable and advertise PERFCTR_CORE if and only if the minimum number of required counters are available, i.e. if perf says there are less than six general purpose counters. Opportunistically, use kvm_cpu_cap_check_and_set() instead of open coding the check for host support. Suggested-by: Sean Christopherson Signed-off-by: Like Xu [sean: massage shortlog and changelog] Link: https://lore.kernel.org/r/20230603011058.1038821-9-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/svm.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index ca32389f3c36..d9669e3cc00a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5025,9 +5025,18 @@ static __init void svm_set_cpu_caps(void) boot_cpu_has(X86_FEATURE_AMD_SSBD)) kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); - /* AMD PMU PERFCTR_CORE CPUID */ - if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) - kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE); + if (enable_pmu) { + /* + * Enumerate support for PERFCTR_CORE if and only if KVM has + * access to enough counters to virtualize "core" support, + * otherwise limit vPMU support to the legacy number of counters. + */ + if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE) + kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS, + kvm_pmu_cap.num_counters_gp); + else + kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE); + } /* CPUID 0x8000001F (SME/SEV features) */ sev_set_cpu_caps(); -- cgit v1.2.3 From 1c2bf8a6b045a6ac4e75a7a07fde70db63e5a380 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:55 -0700 Subject: KVM: x86/pmu: Constrain the num of guest counters with kvm_pmu_cap Cap the number of general purpose counters enumerated on AMD to what KVM actually supports, i.e. don't allow userspace to coerce KVM into thinking there are more counters than actually exist, e.g. by enumerating X86_FEATURE_PERFCTR_CORE in guest CPUID when its not supported. Suggested-by: Sean Christopherson Signed-off-by: Like Xu [sean: massage changelog] Link: https://lore.kernel.org/r/20230603011058.1038821-10-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/pmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index e5c69062a909..c03958063a76 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -170,6 +170,9 @@ static void amd_pmu_refresh(struct kvm_vcpu *vcpu) else pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; + pmu->nr_arch_gp_counters = min_t(unsigned int, pmu->nr_arch_gp_counters, + kvm_pmu_cap.num_counters_gp); + pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; pmu->reserved_bits = 0xfffffff000280000ull; pmu->raw_event_mask = AMD64_RAW_EVENT_MASK; -- cgit v1.2.3 From fe8d76c1a6f04387953727ee3e63469956e7360d Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:56 -0700 Subject: KVM: x86/cpuid: Add a KVM-only leaf to redirect AMD PerfMonV2 flag Add a KVM-only leaf for AMD's PerfMonV2 to redirect the kernel's scattered version to its architectural location, e.g. so that KVM can query guest support via guest_cpuid_has(). Suggested-by: Sean Christopherson Signed-off-by: Like Xu [sean: massage changelog] Link: https://lore.kernel.org/r/20230603011058.1038821-11-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/reverse_cpuid.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kvm/reverse_cpuid.h b/arch/x86/kvm/reverse_cpuid.h index a5717282bb9c..56cbdb24400a 100644 --- a/arch/x86/kvm/reverse_cpuid.h +++ b/arch/x86/kvm/reverse_cpuid.h @@ -15,6 +15,7 @@ enum kvm_only_cpuid_leafs { CPUID_12_EAX = NCAPINTS, CPUID_7_1_EDX, CPUID_8000_0007_EDX, + CPUID_8000_0022_EAX, NR_KVM_CPU_CAPS, NKVMCAPINTS = NR_KVM_CPU_CAPS - NCAPINTS, @@ -47,6 +48,9 @@ enum kvm_only_cpuid_leafs { /* CPUID level 0x80000007 (EDX). */ #define KVM_X86_FEATURE_CONSTANT_TSC KVM_X86_FEATURE(CPUID_8000_0007_EDX, 8) +/* CPUID level 0x80000022 (EAX) */ +#define KVM_X86_FEATURE_PERFMON_V2 KVM_X86_FEATURE(CPUID_8000_0022_EAX, 0) + struct cpuid_reg { u32 function; u32 index; @@ -74,6 +78,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_1_EDX] = { 7, 1, CPUID_EDX}, [CPUID_8000_0007_EDX] = {0x80000007, 0, CPUID_EDX}, [CPUID_8000_0021_EAX] = {0x80000021, 0, CPUID_EAX}, + [CPUID_8000_0022_EAX] = {0x80000022, 0, CPUID_EAX}, }; /* @@ -108,6 +113,8 @@ static __always_inline u32 __feature_translate(int x86_feature) return KVM_X86_FEATURE_SGX_EDECCSSA; else if (x86_feature == X86_FEATURE_CONSTANT_TSC) return KVM_X86_FEATURE_CONSTANT_TSC; + else if (x86_feature == X86_FEATURE_PERFMON_V2) + return KVM_X86_FEATURE_PERFMON_V2; return x86_feature; } -- cgit v1.2.3 From 4a2771895ca63a055df815be5e307cce8e85308c Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:57 -0700 Subject: KVM: x86/svm/pmu: Add AMD PerfMonV2 support If AMD Performance Monitoring Version 2 (PerfMonV2) is detected by the guest, it can use a new scheme to manage the Core PMCs using the new global control and status registers. In addition to benefiting from the PerfMonV2 functionality in the same way as the host (higher precision), the guest also can reduce the number of vm-exits by lowering the total number of MSRs accesses. In terms of implementation details, amd_is_valid_msr() is resurrected since three newly added MSRs could not be mapped to one vPMC. The possibility of emulating PerfMonV2 on the mainframe has also been eliminated for reasons of precision. Co-developed-by: Sandipan Das Signed-off-by: Sandipan Das Signed-off-by: Like Xu [sean: drop "Based on the observed HW." comments] Link: https://lore.kernel.org/r/20230603011058.1038821-12-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/pmu.c | 18 ++++++++++++++++- arch/x86/kvm/svm/pmu.c | 55 +++++++++++++++++++++++++++++++++++++++++--------- arch/x86/kvm/x86.c | 10 +++++++++ 3 files changed, 72 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 4315f46aabfb..bf653df86112 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -585,11 +585,14 @@ int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr) { case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: msr_info->data = pmu->global_status; break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: case MSR_CORE_PERF_GLOBAL_CTRL: msr_info->data = pmu->global_ctrl; break; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = 0; break; @@ -607,16 +610,28 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) u64 data = msr_info->data; u64 diff; + /* + * Note, AMD ignores writes to reserved bits and read-only PMU MSRs, + * whereas Intel generates #GP on attempts to write reserved/RO MSRs. + */ switch (msr) { case MSR_CORE_PERF_GLOBAL_STATUS: if (!msr_info->host_initiated) return 1; /* RO MSR */ + fallthrough; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + /* Per PPR, Read-only MSR. Writes are ignored. */ + if (!msr_info->host_initiated) + break; if (data & pmu->global_status_mask) return 1; pmu->global_status = data; break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + data &= ~pmu->global_ctrl_mask; + fallthrough; case MSR_CORE_PERF_GLOBAL_CTRL: if (!kvm_valid_perf_global_ctrl(pmu, data)) return 1; @@ -634,7 +649,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ if (data & pmu->global_status_mask) return 1; - + fallthrough; + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: if (!msr_info->host_initiated) pmu->global_status &= ~data; break; diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index c03958063a76..cef5a3d0abd0 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -94,12 +94,6 @@ static struct kvm_pmc *amd_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return amd_pmc_idx_to_pmc(vcpu_to_pmu(vcpu), idx & ~(3u << 30)); } -static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) -{ - /* All MSRs refer to exactly one PMC, so msr_idx_to_pmc is enough. */ - return false; -} - static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -111,6 +105,29 @@ static struct kvm_pmc *amd_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr) return pmc; } +static bool amd_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + + switch (msr) { + case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3: + return pmu->version > 0; + case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5: + return guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE); + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + return pmu->version > 1; + default: + if (msr > MSR_F15H_PERF_CTR5 && + msr < MSR_F15H_PERF_CTL0 + 2 * pmu->nr_arch_gp_counters) + return pmu->version > 1; + break; + } + + return amd_msr_idx_to_pmc(vcpu, msr); +} + static int amd_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -164,23 +181,39 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void amd_pmu_refresh(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + union cpuid_0x80000022_ebx ebx; - if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + pmu->version = 1; + if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) { + pmu->version = 2; + /* + * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest + * CPUID entry is guaranteed to be non-NULL. + */ + BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 || + x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index); + ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx; + pmu->nr_arch_gp_counters = ebx.split.num_core_pmc; + } else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE; - else + } else { pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS; + } pmu->nr_arch_gp_counters = min_t(unsigned int, pmu->nr_arch_gp_counters, kvm_pmu_cap.num_counters_gp); + if (pmu->version > 1) { + pmu->global_ctrl_mask = ~((1ull << pmu->nr_arch_gp_counters) - 1); + pmu->global_status_mask = pmu->global_ctrl_mask; + } + pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1; pmu->reserved_bits = 0xfffffff000280000ull; pmu->raw_event_mask = AMD64_RAW_EVENT_MASK; - pmu->version = 1; /* not applicable to AMD; but clean them to prevent any fall out */ pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->nr_arch_fixed_counters = 0; - pmu->global_status = 0; bitmap_set(pmu->all_valid_pmc_idx, 0, pmu->nr_arch_gp_counters); } @@ -211,6 +244,8 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu) pmc_stop_counter(pmc); pmc->counter = pmc->prev_counter = pmc->eventsel = 0; } + + pmu->global_ctrl = pmu->global_status = 0; } struct kvm_pmu_ops amd_pmu_ops __initdata = { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c0778ca39650..abfba3cae0ba 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1483,6 +1483,10 @@ static const u32 msrs_to_save_pmu[] = { MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5, MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2, MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5, + + MSR_AMD64_PERF_CNTR_GLOBAL_CTL, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS, + MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR, }; static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) + @@ -7150,6 +7154,12 @@ static void kvm_probe_msr_to_save(u32 msr_index) kvm_pmu_cap.num_counters_fixed) return; break; + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: + if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) + return; + break; case MSR_IA32_XFD: case MSR_IA32_XFD_ERR: if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) -- cgit v1.2.3 From 94cdeebd82111d7b7da5bd4da053eed9e0f65d72 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 2 Jun 2023 18:10:58 -0700 Subject: KVM: x86/cpuid: Add AMD CPUID ExtPerfMonAndDbg leaf 0x80000022 CPUID leaf 0x80000022 i.e. ExtPerfMonAndDbg advertises some new performance monitoring features for AMD processors. Bit 0 of EAX indicates support for Performance Monitoring Version 2 (PerfMonV2) features. If found to be set during PMU initialization, the EBX bits of the same CPUID function can be used to determine the number of available PMCs for different PMU types. Expose the relevant bits via KVM_GET_SUPPORTED_CPUID so that guests can make use of the PerfMonV2 features. Co-developed-by: Sandipan Das Signed-off-by: Sandipan Das Signed-off-by: Like Xu Link: https://lore.kernel.org/r/20230603011058.1038821-13-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 28 +++++++++++++++++++++++++++- arch/x86/kvm/svm/svm.c | 4 ++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 61bc71882f07..0e5584f4acd7 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -734,6 +734,10 @@ void kvm_set_cpu_caps(void) F(NULL_SEL_CLR_BASE) | F(AUTOIBRS) | 0 /* PrefetchCtlMsr */ ); + kvm_cpu_cap_init_kvm_defined(CPUID_8000_0022_EAX, + F(PERFMON_V2) + ); + /* * Synthesize "LFENCE is serializing" into the AMD-defined entry in * KVM's supported CPUID if the feature is reported as supported by the @@ -1128,7 +1132,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = 0; break; case 0x80000000: - entry->eax = min(entry->eax, 0x80000021); + entry->eax = min(entry->eax, 0x80000022); /* * Serializing LFENCE is reported in a multitude of ways, and * NullSegClearsBase is not reported in CPUID on Zen2; help @@ -1233,6 +1237,28 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ebx = entry->ecx = entry->edx = 0; cpuid_entry_override(entry, CPUID_8000_0021_EAX); break; + /* AMD Extended Performance Monitoring and Debug */ + case 0x80000022: { + union cpuid_0x80000022_ebx ebx; + + entry->ecx = entry->edx = 0; + if (!enable_pmu || !kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) { + entry->eax = entry->ebx; + break; + } + + cpuid_entry_override(entry, CPUID_8000_0022_EAX); + + if (kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) + ebx.split.num_core_pmc = kvm_pmu_cap.num_counters_gp; + else if (kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) + ebx.split.num_core_pmc = AMD64_NUM_COUNTERS_CORE; + else + ebx.split.num_core_pmc = AMD64_NUM_COUNTERS; + + entry->ebx = ebx.full; + break; + } /*Add support for Centaur's CPUID instruction*/ case 0xC0000000: /*Just support up to 0xC0000004 now*/ diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d9669e3cc00a..ff48cdea1fbf 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -5036,6 +5036,10 @@ static __init void svm_set_cpu_caps(void) kvm_pmu_cap.num_counters_gp); else kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE); + + if (kvm_pmu_cap.version != 2 || + !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) + kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2); } /* CPUID 0x8000001F (SME/SEV features) */ -- cgit v1.2.3 From a7a2c72ae01483d3923b18ee18c8007de2bc5e20 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 9 Jun 2023 19:00:44 +0000 Subject: KVM: arm64: Separate out feature sanitisation and initialisation kvm_vcpu_set_target() iteratively sanitises and copies feature flags in one go. This is rather odd, especially considering the fact that bitmap accessors can do the heavy lifting. A subsequent change will make vCPU features VM-wide, and fitting that into the present implementation is just a chore. Rework the whole thing to use bitmap accessors to sanitise and copy flags. Link: https://lore.kernel.org/r/20230609190054.1542113-2-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 1 + arch/arm64/kvm/arm.c | 75 ++++++++++++++++++++++++--------------- 2 files changed, 48 insertions(+), 28 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 7e7e19ef6993..565cad211388 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -39,6 +39,7 @@ #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS #define KVM_VCPU_MAX_FEATURES 7 +#define KVM_VCPU_VALID_FEATURES (BIT(KVM_VCPU_MAX_FEATURES) - 1) #define KVM_REQ_SLEEP \ KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 14391826241c..d5298054a8ed 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1167,42 +1167,40 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level, return -EINVAL; } -static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, - const struct kvm_vcpu_init *init) +static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu, + const struct kvm_vcpu_init *init) { - unsigned int i, ret; - u32 phys_target = kvm_target_cpu(); + unsigned long features = init->features[0]; + int i; - if (init->target != phys_target) - return -EINVAL; + if (features & ~KVM_VCPU_VALID_FEATURES) + return -ENOENT; - /* - * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must - * use the same target. - */ - if (vcpu->arch.target != -1 && vcpu->arch.target != init->target) - return -EINVAL; + for (i = 1; i < ARRAY_SIZE(init->features); i++) { + if (init->features[i]) + return -ENOENT; + } - /* -ENOENT for unknown features, -EINVAL for invalid combinations. */ - for (i = 0; i < sizeof(init->features) * 8; i++) { - bool set = (init->features[i / 32] & (1 << (i % 32))); + return 0; +} - if (set && i >= KVM_VCPU_MAX_FEATURES) - return -ENOENT; +static bool kvm_vcpu_init_changed(struct kvm_vcpu *vcpu, + const struct kvm_vcpu_init *init) +{ + unsigned long features = init->features[0]; - /* - * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must - * use the same feature set. - */ - if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES && - test_bit(i, vcpu->arch.features) != set) - return -EINVAL; + return !bitmap_equal(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES) || + vcpu->arch.target != init->target; +} - if (set) - set_bit(i, vcpu->arch.features); - } +static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu, + const struct kvm_vcpu_init *init) +{ + unsigned long features = init->features[0]; + int ret; - vcpu->arch.target = phys_target; + vcpu->arch.target = init->target; + bitmap_copy(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES); /* Now we know what it is, we can reset it. */ ret = kvm_reset_vcpu(vcpu); @@ -1214,6 +1212,27 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, return ret; } +static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, + const struct kvm_vcpu_init *init) +{ + int ret; + + if (init->target != kvm_target_cpu()) + return -EINVAL; + + ret = kvm_vcpu_init_check_features(vcpu, init); + if (ret) + return ret; + + if (vcpu->arch.target == -1) + return __kvm_vcpu_set_target(vcpu, init); + + if (kvm_vcpu_init_changed(vcpu, init)) + return -EINVAL; + + return kvm_reset_vcpu(vcpu); +} + static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) { -- cgit v1.2.3 From e3c1c0cae31ec9ebfdffeaa2c86ddeba6cf5c74c Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 9 Jun 2023 19:00:45 +0000 Subject: KVM: arm64: Relax invariance of KVM_ARM_VCPU_POWER_OFF Allow the value of KVM_ARM_VCPU_POWER_OFF to differ between calls to KVM_ARM_VCPU_INIT. Userspace can already change the state of the vCPU through the KVM_SET_MP_STATE ioctl, so making the bit invariant seems needlessly restrictive. Link: https://lore.kernel.org/r/20230609190054.1542113-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index d5298054a8ed..a9c18f45df3f 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1236,8 +1236,19 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) { + bool power_off = false; int ret; + /* + * Treat the power-off vCPU feature as ephemeral. Clear the bit to avoid + * reflecting it in the finalized feature set, thus limiting its scope + * to a single KVM_ARM_VCPU_INIT call. + */ + if (init->features[0] & KVM_ARM_VCPU_POWER_OFF) { + init->features[0] &= ~KVM_ARM_VCPU_POWER_OFF; + power_off = true; + } + ret = kvm_vcpu_set_target(vcpu, init); if (ret) return ret; @@ -1266,7 +1277,7 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, */ spin_lock(&vcpu->arch.mp_state_lock); - if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features)) + if (power_off) __kvm_arm_vcpu_power_off(vcpu); else WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE); -- cgit v1.2.3 From 2251e9ff1573a266102f40e507f0b8dc5861f3e4 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 9 Jun 2023 19:00:46 +0000 Subject: KVM: arm64: Make vCPU feature flags consistent VM-wide To date KVM has allowed userspace to construct asymmetric VMs where particular features may only be supported on a subset of vCPUs. This wasn't really the intened usage pattern, and it is a total pain in the ass to keep working in the kernel. What's more, this is at odds with CPU features in host userspace, where asymmetric features are largely hidden or disabled. It's time to put an end to the whole game. Require all vCPUs in the VM to have the same feature set, rejecting deviants in the KVM_ARM_VCPU_INIT ioctl. Preserve some of the vestiges of per-vCPU feature flags in case we need to reinstate the old behavior for some limited configurations. Yes, this is a sign of cowardice around a user-visibile change. Hoist all of the 32-bit limitations into kvm_vcpu_init_check_features() to avoid nested attempts to acquire the config_lock, which won't end well. Link: https://lore.kernel.org/r/20230609190054.1542113-4-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_emulate.h | 7 +---- arch/arm64/include/asm/kvm_host.h | 22 ++++++-------- arch/arm64/kvm/arm.c | 31 ++++++++++++++++++- arch/arm64/kvm/reset.c | 58 ------------------------------------ 4 files changed, 40 insertions(+), 78 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index b31b32ecbe2d..b19c35129d1c 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -62,12 +62,7 @@ static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) #else static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) { - struct kvm *kvm = vcpu->kvm; - - WARN_ON_ONCE(!test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, - &kvm->arch.flags)); - - return test_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags); + return test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features); } #endif diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 565cad211388..2c8c4ace81ef 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -215,25 +215,21 @@ struct kvm_arch { #define KVM_ARCH_FLAG_MTE_ENABLED 1 /* At least one vCPU has ran in the VM */ #define KVM_ARCH_FLAG_HAS_RAN_ONCE 2 - /* - * The following two bits are used to indicate the guest's EL1 - * register width configuration. A value of KVM_ARCH_FLAG_EL1_32BIT - * bit is valid only when KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED is set. - * Otherwise, the guest's EL1 register width has not yet been - * determined yet. - */ -#define KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED 3 -#define KVM_ARCH_FLAG_EL1_32BIT 4 + /* The vCPU feature set for the VM is configured */ +#define KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED 3 /* PSCI SYSTEM_SUSPEND enabled for the guest */ -#define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 5 +#define KVM_ARCH_FLAG_SYSTEM_SUSPEND_ENABLED 4 /* VM counter offset */ -#define KVM_ARCH_FLAG_VM_COUNTER_OFFSET 6 +#define KVM_ARCH_FLAG_VM_COUNTER_OFFSET 5 /* Timer PPIs made immutable */ -#define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE 7 +#define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE 6 /* SMCCC filter initialized for the VM */ -#define KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED 8 +#define KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED 7 unsigned long flags; + /* VM-wide vCPU feature set */ + DECLARE_BITMAP(vcpu_features, KVM_VCPU_MAX_FEATURES); + /* * VM-wide PMU filter, implemented as a bitmap and big enough for * up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+). diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a9c18f45df3f..85c978ad1f27 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -170,6 +170,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) */ kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit(); + bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES); + return 0; err_free_cpumask: @@ -1181,6 +1183,20 @@ static int kvm_vcpu_init_check_features(struct kvm_vcpu *vcpu, return -ENOENT; } + if (!test_bit(KVM_ARM_VCPU_EL1_32BIT, &features)) + return 0; + + if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1)) + return -EINVAL; + + /* MTE is incompatible with AArch32 */ + if (kvm_has_mte(vcpu->kvm)) + return -EINVAL; + + /* NV is incompatible with AArch32 */ + if (test_bit(KVM_ARM_VCPU_HAS_EL2, &features)) + return -EINVAL; + return 0; } @@ -1197,7 +1213,14 @@ static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init) { unsigned long features = init->features[0]; - int ret; + struct kvm *kvm = vcpu->kvm; + int ret = -EINVAL; + + mutex_lock(&kvm->arch.config_lock); + + if (test_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags) && + !bitmap_equal(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES)) + goto out_unlock; vcpu->arch.target = init->target; bitmap_copy(vcpu->arch.features, &features, KVM_VCPU_MAX_FEATURES); @@ -1207,8 +1230,14 @@ static int __kvm_vcpu_set_target(struct kvm_vcpu *vcpu, if (ret) { vcpu->arch.target = -1; bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); + goto out_unlock; } + bitmap_copy(kvm->arch.vcpu_features, &features, KVM_VCPU_MAX_FEATURES); + set_bit(KVM_ARCH_FLAG_VCPU_FEATURES_CONFIGURED, &kvm->arch.flags); + +out_unlock: + mutex_unlock(&kvm->arch.config_lock); return ret; } diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index b5dee8e57e77..bc8556b6f459 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -186,57 +186,6 @@ static int kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) return 0; } -/** - * kvm_set_vm_width() - set the register width for the guest - * @vcpu: Pointer to the vcpu being configured - * - * Set both KVM_ARCH_FLAG_EL1_32BIT and KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED - * in the VM flags based on the vcpu's requested register width, the HW - * capabilities and other options (such as MTE). - * When REG_WIDTH_CONFIGURED is already set, the vcpu settings must be - * consistent with the value of the FLAG_EL1_32BIT bit in the flags. - * - * Return: 0 on success, negative error code on failure. - */ -static int kvm_set_vm_width(struct kvm_vcpu *vcpu) -{ - struct kvm *kvm = vcpu->kvm; - bool is32bit; - - is32bit = vcpu_has_feature(vcpu, KVM_ARM_VCPU_EL1_32BIT); - - lockdep_assert_held(&kvm->arch.config_lock); - - if (test_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags)) { - /* - * The guest's register width is already configured. - * Make sure that the vcpu is consistent with it. - */ - if (is32bit == test_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags)) - return 0; - - return -EINVAL; - } - - if (!cpus_have_const_cap(ARM64_HAS_32BIT_EL1) && is32bit) - return -EINVAL; - - /* MTE is incompatible with AArch32 */ - if (kvm_has_mte(kvm) && is32bit) - return -EINVAL; - - /* NV is incompatible with AArch32 */ - if (vcpu_has_nv(vcpu) && is32bit) - return -EINVAL; - - if (is32bit) - set_bit(KVM_ARCH_FLAG_EL1_32BIT, &kvm->arch.flags); - - set_bit(KVM_ARCH_FLAG_REG_WIDTH_CONFIGURED, &kvm->arch.flags); - - return 0; -} - /** * kvm_reset_vcpu - sets core registers and sys_regs to reset value * @vcpu: The VCPU pointer @@ -262,13 +211,6 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) bool loaded; u32 pstate; - mutex_lock(&vcpu->kvm->arch.config_lock); - ret = kvm_set_vm_width(vcpu); - mutex_unlock(&vcpu->kvm->arch.config_lock); - - if (ret) - return ret; - spin_lock(&vcpu->arch.mp_state_lock); reset_state = vcpu->arch.reset_state; vcpu->arch.reset_state.reset = false; -- cgit v1.2.3 From f90f9360c3d7fbb731732fbb9a1228f55d178e10 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 9 Jun 2023 19:00:47 +0000 Subject: KVM: arm64: Rewrite IMPDEF PMU version as NI KVM allows userspace to write an IMPDEF PMU version to the corresponding 32bit and 64bit ID register fields for the sake of backwards compatibility with kernels that lacked commit 3d0dba5764b9 ("KVM: arm64: PMU: Move the ID_AA64DFR0_EL1.PMUver limit to VM creation"). Plumbing that IMPDEF PMU version through to the gues is getting in the way of progress, and really doesn't any sense in the first place. Bite the bullet and reinterpret the IMPDEF PMU version as NI (0) for userspace writes. Additionally, spill the dirty details into a comment. Link: https://lore.kernel.org/r/20230609190054.1542113-5-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 5 +--- arch/arm64/kvm/arm.c | 2 +- arch/arm64/kvm/sys_regs.c | 56 ++++++++++++++++++++++++--------------- include/kvm/arm_pmu.h | 2 +- 4 files changed, 38 insertions(+), 27 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 2c8c4ace81ef..44da989435fc 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -241,10 +241,7 @@ struct kvm_arch { u8 pfr0_csv2; u8 pfr0_csv3; - struct { - u8 imp:4; - u8 unimp:4; - } dfr0_pmuver; + u8 dfr0_pmuver; /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 85c978ad1f27..695239e01618 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -168,7 +168,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) * Initialise the default PMUver before there is a chance to * create an actual PMU. */ - kvm->arch.dfr0_pmuver.imp = kvm_arm_pmu_get_pmuver_limit(); + kvm->arch.dfr0_pmuver = kvm_arm_pmu_get_pmuver_limit(); bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 71b12094d613..e1ea7e0da5cd 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1177,9 +1177,9 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu) { if (kvm_vcpu_has_pmu(vcpu)) - return vcpu->kvm->arch.dfr0_pmuver.imp; + return vcpu->kvm->arch.dfr0_pmuver; - return vcpu->kvm->arch.dfr0_pmuver.unimp; + return 0; } static u8 perfmon_to_pmuver(u8 perfmon) @@ -1384,18 +1384,35 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, bool valid_pmu; host_pmuver = kvm_arm_pmu_get_pmuver_limit(); + pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); + + /* + * Prior to commit 3d0dba5764b9 ("KVM: arm64: PMU: Move the + * ID_AA64DFR0_EL1.PMUver limit to VM creation"), KVM erroneously + * exposed an IMP_DEF PMU to userspace and the guest on systems w/ + * non-architectural PMUs. Of course, PMUv3 is the only game in town for + * PMU virtualization, so the IMP_DEF value was rather user-hostile. + * + * At minimum, we're on the hook to allow values that were given to + * userspace by KVM. Cover our tracks here and replace the IMP_DEF value + * with a more sensible NI. The value of an ID register changing under + * the nose of the guest is unfortunate, but is certainly no more + * surprising than an ill-guided PMU driver poking at impdef system + * registers that end in an UNDEF... + */ + if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) { + val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; + pmuver = 0; + } /* * Allow AA64DFR0_EL1.PMUver to be set from userspace as long - * as it doesn't promise more than what the HW gives us. We - * allow an IMPDEF PMU though, only if no PMU is supported - * (KVM backward compatibility handling). + * as it doesn't promise more than what the HW gives us. */ - pmuver = FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), val); - if ((pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF && pmuver > host_pmuver)) + if (pmuver > host_pmuver) return -EINVAL; - valid_pmu = (pmuver != 0 && pmuver != ID_AA64DFR0_EL1_PMUVer_IMP_DEF); + valid_pmu = pmuver; /* Make sure view register and PMU support do match */ if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) @@ -1407,11 +1424,7 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, if (val) return -EINVAL; - if (valid_pmu) - vcpu->kvm->arch.dfr0_pmuver.imp = pmuver; - else - vcpu->kvm->arch.dfr0_pmuver.unimp = pmuver; - + vcpu->kvm->arch.dfr0_pmuver = pmuver; return 0; } @@ -1423,6 +1436,12 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, bool valid_pmu; host_perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); + perfmon = SYS_FIELD_GET(ID_DFR0_EL1, PerfMon, val); + + if (perfmon == ID_DFR0_EL1_PerfMon_IMPDEF) { + val &= ~ID_DFR0_EL1_PerfMon_MASK; + perfmon = 0; + } /* * Allow DFR0_EL1.PerfMon to be set from userspace as long as @@ -1430,12 +1449,11 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, * AArch64 side (as everything is emulated with that), and * that this is a PMUv3. */ - perfmon = FIELD_GET(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon), val); - if ((perfmon != ID_DFR0_EL1_PerfMon_IMPDEF && perfmon > host_perfmon) || + if (perfmon > host_perfmon || (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3)) return -EINVAL; - valid_pmu = (perfmon != 0 && perfmon != ID_DFR0_EL1_PerfMon_IMPDEF); + valid_pmu = perfmon; /* Make sure view register and PMU support do match */ if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) @@ -1447,11 +1465,7 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, if (val) return -EINVAL; - if (valid_pmu) - vcpu->kvm->arch.dfr0_pmuver.imp = perfmon_to_pmuver(perfmon); - else - vcpu->kvm->arch.dfr0_pmuver.unimp = perfmon_to_pmuver(perfmon); - + vcpu->kvm->arch.dfr0_pmuver = perfmon_to_pmuver(perfmon); return 0; } diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 1a6a695ca67a..0be60d5eebd7 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -93,7 +93,7 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); * Evaluates as true when emulating PMUv3p5, and false otherwise. */ #define kvm_pmu_is_3p5(vcpu) \ - (vcpu->kvm->arch.dfr0_pmuver.imp >= ID_AA64DFR0_EL1_PMUVer_V3P5) + (vcpu->kvm->arch.dfr0_pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5) u8 kvm_arm_pmu_get_pmuver_limit(void); -- cgit v1.2.3 From d86cde6e335fa442c6b0866562140a2bef25402a Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Fri, 9 Jun 2023 19:00:48 +0000 Subject: KVM: arm64: Reuse fields of sys_reg_desc for idreg sys_reg_desc::{reset, val} are presently unused for ID register descriptors. Repurpose these fields to support user-configurable ID registers. Use the ::reset() function pointer to return the sanitised value of a given ID register, optionally with KVM-specific feature sanitisation. Additionally, keep a mask of writable register fields in ::val. Signed-off-by: Jing Zhang Link: https://lore.kernel.org/r/20230609190054.1542113-6-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 94 +++++++++++++++++++++++++++++++++++++---------- arch/arm64/kvm/sys_regs.h | 15 +++++--- 2 files changed, 84 insertions(+), 25 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index e1ea7e0da5cd..9d37f16cfb7b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -540,10 +540,11 @@ static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return 0; } -static void reset_bvr(struct kvm_vcpu *vcpu, +static u64 reset_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { vcpu->arch.vcpu_debug_state.dbg_bvr[rd->CRm] = rd->val; + return rd->val; } static bool trap_bcr(struct kvm_vcpu *vcpu, @@ -576,10 +577,11 @@ static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return 0; } -static void reset_bcr(struct kvm_vcpu *vcpu, +static u64 reset_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { vcpu->arch.vcpu_debug_state.dbg_bcr[rd->CRm] = rd->val; + return rd->val; } static bool trap_wvr(struct kvm_vcpu *vcpu, @@ -613,10 +615,11 @@ static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return 0; } -static void reset_wvr(struct kvm_vcpu *vcpu, +static u64 reset_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { vcpu->arch.vcpu_debug_state.dbg_wvr[rd->CRm] = rd->val; + return rd->val; } static bool trap_wcr(struct kvm_vcpu *vcpu, @@ -649,25 +652,28 @@ static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return 0; } -static void reset_wcr(struct kvm_vcpu *vcpu, +static u64 reset_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { vcpu->arch.vcpu_debug_state.dbg_wcr[rd->CRm] = rd->val; + return rd->val; } -static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 amair = read_sysreg(amair_el1); vcpu_write_sys_reg(vcpu, amair, AMAIR_EL1); + return amair; } -static void reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 actlr = read_sysreg(actlr_el1); vcpu_write_sys_reg(vcpu, actlr, ACTLR_EL1); + return actlr; } -static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 mpidr; @@ -681,7 +687,10 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) mpidr = (vcpu->vcpu_id & 0x0f) << MPIDR_LEVEL_SHIFT(0); mpidr |= ((vcpu->vcpu_id >> 4) & 0xff) << MPIDR_LEVEL_SHIFT(1); mpidr |= ((vcpu->vcpu_id >> 12) & 0xff) << MPIDR_LEVEL_SHIFT(2); - vcpu_write_sys_reg(vcpu, (1ULL << 31) | mpidr, MPIDR_EL1); + mpidr |= (1ULL << 31); + vcpu_write_sys_reg(vcpu, mpidr, MPIDR_EL1); + + return mpidr; } static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, @@ -693,13 +702,13 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } -static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX); /* No PMU available, any PMU reg may UNDEF... */ if (!kvm_arm_support_pmu_v3()) - return; + return 0; n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT; n &= ARMV8_PMU_PMCR_N_MASK; @@ -708,33 +717,41 @@ static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= mask; + + return __vcpu_sys_reg(vcpu, r->reg); } -static void reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0); + + return __vcpu_sys_reg(vcpu, r->reg); } -static void reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_EVTYPE_MASK; + + return __vcpu_sys_reg(vcpu, r->reg); } -static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { reset_unknown(vcpu, r); __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK; + + return __vcpu_sys_reg(vcpu, r->reg); } -static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 pmcr; /* No PMU available, PMCR_EL0 may UNDEF... */ if (!kvm_arm_support_pmu_v3()) - return; + return 0; /* Only preserve PMCR_EL0.N, and reset the rest to 0 */ pmcr = read_sysreg(pmcr_el0) & (ARMV8_PMU_PMCR_N_MASK << ARMV8_PMU_PMCR_N_SHIFT); @@ -742,6 +759,8 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) pmcr |= ARMV8_PMU_PMCR_LC; __vcpu_sys_reg(vcpu, r->reg) = pmcr; + + return __vcpu_sys_reg(vcpu, r->reg); } static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) @@ -1209,7 +1228,8 @@ static u8 pmuver_to_perfmon(u8 pmuver) } /* Read a sanitised cpufeature ID register by sys_reg_desc */ -static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r) +static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) { u32 id = reg_to_encoding(r); u64 val; @@ -1280,6 +1300,17 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r return val; } +static u64 kvm_read_sanitised_id_reg(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *r) +{ + return __kvm_read_sanitised_id_reg(vcpu, r); +} + +static u64 read_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + return __kvm_read_sanitised_id_reg(vcpu, r); +} + static unsigned int id_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { @@ -1530,7 +1561,7 @@ static bool access_clidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, * Fabricate a CLIDR_EL1 value instead of using the real value, which can vary * by the physical CPU which the vcpu currently resides in. */ -static void reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static u64 reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 ctr_el0 = read_sanitised_ftr_reg(SYS_CTR_EL0); u64 clidr; @@ -1578,6 +1609,8 @@ static void reset_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) clidr |= 2 << CLIDR_TTYPE_SHIFT(loc); __vcpu_sys_reg(vcpu, r->reg) = clidr; + + return __vcpu_sys_reg(vcpu, r->reg); } static int set_clidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, @@ -1677,6 +1710,17 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu, .visibility = elx2_visibility, \ } +/* + * Since reset() callback and field val are not used for idregs, they will be + * used for specific purposes for idregs. + * The reset() would return KVM sanitised register value. The value would be the + * same as the host kernel sanitised value if there is no KVM sanitisation. + * The val would be used as a mask indicating writable fields for the idreg. + * Only bits with 1 are writable from userspace. This mask might not be + * necessary in the future whenever all ID registers are enabled as writable + * from userspace. + */ + /* sys_reg_desc initialiser for known cpufeature ID registers */ #define ID_SANITISED(name) { \ SYS_DESC(SYS_##name), \ @@ -1684,6 +1728,8 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu, .get_user = get_id_reg, \ .set_user = set_id_reg, \ .visibility = id_visibility, \ + .reset = kvm_read_sanitised_id_reg, \ + .val = 0, \ } /* sys_reg_desc initialiser for known cpufeature ID registers */ @@ -1693,6 +1739,8 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu, .get_user = get_id_reg, \ .set_user = set_id_reg, \ .visibility = aa32_id_visibility, \ + .reset = kvm_read_sanitised_id_reg, \ + .val = 0, \ } /* @@ -1705,7 +1753,9 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu, .access = access_id_reg, \ .get_user = get_id_reg, \ .set_user = set_id_reg, \ - .visibility = raz_visibility \ + .visibility = raz_visibility, \ + .reset = kvm_read_sanitised_id_reg, \ + .val = 0, \ } /* @@ -1719,6 +1769,8 @@ static unsigned int elx2_visibility(const struct kvm_vcpu *vcpu, .get_user = get_id_reg, \ .set_user = set_id_reg, \ .visibility = raz_visibility, \ + .reset = kvm_read_sanitised_id_reg, \ + .val = 0, \ } static bool access_sp_el1(struct kvm_vcpu *vcpu, @@ -3055,19 +3107,21 @@ id_to_sys_reg_desc(struct kvm_vcpu *vcpu, u64 id, */ #define FUNCTION_INVARIANT(reg) \ - static void get_##reg(struct kvm_vcpu *v, \ + static u64 get_##reg(struct kvm_vcpu *v, \ const struct sys_reg_desc *r) \ { \ ((struct sys_reg_desc *)r)->val = read_sysreg(reg); \ + return ((struct sys_reg_desc *)r)->val; \ } FUNCTION_INVARIANT(midr_el1) FUNCTION_INVARIANT(revidr_el1) FUNCTION_INVARIANT(aidr_el1) -static void get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) +static u64 get_ctr_el0(struct kvm_vcpu *v, const struct sys_reg_desc *r) { ((struct sys_reg_desc *)r)->val = read_sanitised_ftr_reg(SYS_CTR_EL0); + return ((struct sys_reg_desc *)r)->val; } /* ->val is filled in by kvm_sys_reg_table_init() */ diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 6b11f2cc7146..63bca7521dfd 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -64,13 +64,16 @@ struct sys_reg_desc { struct sys_reg_params *, const struct sys_reg_desc *); - /* Initialization for vcpu. */ - void (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *); + /* + * Initialization for vcpu. Return initialized value, or KVM + * sanitized value for ID registers. + */ + u64 (*reset)(struct kvm_vcpu *, const struct sys_reg_desc *); /* Index into sys_reg[], or 0 if we don't need to save it. */ int reg; - /* Value (usually reset value) */ + /* Value (usually reset value), or write mask for idregs */ u64 val; /* Custom get/set_user functions, fallback to generic if NULL */ @@ -123,19 +126,21 @@ static inline bool read_zero(struct kvm_vcpu *vcpu, } /* Reset functions */ -static inline void reset_unknown(struct kvm_vcpu *vcpu, +static inline u64 reset_unknown(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { BUG_ON(!r->reg); BUG_ON(r->reg >= NR_SYS_REGS); __vcpu_sys_reg(vcpu, r->reg) = 0x1de7ec7edbadc0deULL; + return __vcpu_sys_reg(vcpu, r->reg); } -static inline void reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +static inline u64 reset_val(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { BUG_ON(!r->reg); BUG_ON(r->reg >= NR_SYS_REGS); __vcpu_sys_reg(vcpu, r->reg) = r->val; + return __vcpu_sys_reg(vcpu, r->reg); } static inline unsigned int sysreg_visibility(const struct kvm_vcpu *vcpu, -- cgit v1.2.3 From c4b9fd2ac035a55d1fd98322f4360c9d07530597 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Jun 2023 17:21:44 +0100 Subject: KVM: arm64: Drop is_kernel_in_hyp_mode() from __invalidate_icache_guest_page() It is pretty obvious that is_kernel_in_hyp_mode() doesn't make much sense in the hypervisor part of KVM, and should be reserved to the kernel side. However, mem_protect.c::invalidate_icache_guest_page() calls into __invalidate_icache_guest_page(), which uses is_kernel_in_hyp_mode(). Given that this is part of the pKVM side of the hypervisor, this helper can only return true. Nothing goes really bad, but __invalidate_icache_guest_page() could spell out what the actual check is: we cannot invalidate the cache if the i-cache is VPIPT and we're running at EL1. Drop the is_kernel_in_hyp_mode() check for an explicit check against CurrentEL being EL1 or not. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230609162200.2024064-2-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_mmu.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 27e63c111f78..c8113b931263 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -227,7 +227,8 @@ static inline void __invalidate_icache_guest_page(void *va, size_t size) if (icache_is_aliasing()) { /* any kind of VIPT cache */ icache_inval_all_pou(); - } else if (is_kernel_in_hyp_mode() || !icache_is_vpipt()) { + } else if (read_sysreg(CurrentEL) != CurrentEL_EL1 || + !icache_is_vpipt()) { /* PIPT or VPIPT at EL2 (see comment in __kvm_tlb_flush_vmid_ipa) */ icache_inval_pou((unsigned long)va, (unsigned long)va + size); } -- cgit v1.2.3 From 35230be87ec6147c20e7433ab9d41e2fd2664631 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Jun 2023 17:21:45 +0100 Subject: arm64: Prevent the use of is_kernel_in_hyp_mode() in hypervisor code Using is_kernel_in_hyp_mode() in hypervisor code is a pretty bad mistake. This helper only checks for CurrentEL being EL2, which is always true. Make the compilation fail if using the helper in hypervisor context Whilst we're at it, flag the helper as __always_inline, which it really should be. Signed-off-by: Marc Zyngier Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20230609162200.2024064-3-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/virt.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 4eb601e7de50..21e94068804d 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -110,8 +110,10 @@ static inline bool is_hyp_mode_mismatched(void) return __boot_cpu_mode[0] != __boot_cpu_mode[1]; } -static inline bool is_kernel_in_hyp_mode(void) +static __always_inline bool is_kernel_in_hyp_mode(void) { + BUILD_BUG_ON(__is_defined(__KVM_NVHE_HYPERVISOR__) || + __is_defined(__KVM_VHE_HYPERVISOR__)); return read_sysreg(CurrentEL) == CurrentEL_EL2; } -- cgit v1.2.3 From 0ddc312b7c73049d982d173fee4c5dabf1727ebb Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Jun 2023 17:21:46 +0100 Subject: arm64: Turn kaslr_feature_override into a generic SW feature override Disabling KASLR from the command line is implemented as a feature override. Repaint it slightly so that it can further be used as more generic infrastructure for SW override purposes. Signed-off-by: Marc Zyngier Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20230609162200.2024064-4-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/cpufeature.h | 4 ++++ arch/arm64/kernel/cpufeature.c | 2 ++ arch/arm64/kernel/idreg-override.c | 16 ++++++---------- arch/arm64/kernel/kaslr.c | 6 +++--- 4 files changed, 15 insertions(+), 13 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 6bf013fb110d..bc1009890180 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -15,6 +15,8 @@ #define MAX_CPU_FEATURES 128 #define cpu_feature(x) KERNEL_HWCAP_ ## x +#define ARM64_SW_FEATURE_OVERRIDE_NOKASLR 0 + #ifndef __ASSEMBLY__ #include @@ -925,6 +927,8 @@ extern struct arm64_ftr_override id_aa64smfr0_override; extern struct arm64_ftr_override id_aa64isar1_override; extern struct arm64_ftr_override id_aa64isar2_override; +extern struct arm64_ftr_override arm64_sw_feature_override; + u32 get_kvm_ipa_limit(void); void dump_cpu_features(void); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 7d7128c65161..2d2b7bb5fa0c 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -664,6 +664,8 @@ struct arm64_ftr_override __ro_after_init id_aa64smfr0_override; struct arm64_ftr_override __ro_after_init id_aa64isar1_override; struct arm64_ftr_override __ro_after_init id_aa64isar2_override; +struct arm64_ftr_override arm64_sw_feature_override; + static const struct __ftr_reg_entry { u32 sys_id; struct arm64_ftr_reg *reg; diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 370ab84fd06e..8c93b6198bf5 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -138,15 +138,11 @@ static const struct ftr_set_desc smfr0 __initconst = { }, }; -extern struct arm64_ftr_override kaslr_feature_override; - -static const struct ftr_set_desc kaslr __initconst = { - .name = "kaslr", -#ifdef CONFIG_RANDOMIZE_BASE - .override = &kaslr_feature_override, -#endif +static const struct ftr_set_desc sw_features __initconst = { + .name = "arm64_sw", + .override = &arm64_sw_feature_override, .fields = { - FIELD("disabled", 0, NULL), + FIELD("nokaslr", ARM64_SW_FEATURE_OVERRIDE_NOKASLR, NULL), {} }, }; @@ -158,7 +154,7 @@ static const struct ftr_set_desc * const regs[] __initconst = { &isar1, &isar2, &smfr0, - &kaslr, + &sw_features, }; static const struct { @@ -175,7 +171,7 @@ static const struct { "id_aa64isar1.api=0 id_aa64isar1.apa=0 " "id_aa64isar2.gpa3=0 id_aa64isar2.apa3=0" }, { "arm64.nomte", "id_aa64pfr1.mte=0" }, - { "nokaslr", "kaslr.disabled=1" }, + { "nokaslr", "arm64_sw.nokaslr=1" }, }; static int __init parse_nokaslr(char *unused) diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index e7477f21a4c9..5d4ce7f5f157 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -23,8 +23,6 @@ u64 __ro_after_init module_alloc_base; u16 __initdata memstart_offset_seed; -struct arm64_ftr_override kaslr_feature_override __initdata; - static int __init kaslr_init(void) { u64 module_range; @@ -36,7 +34,9 @@ static int __init kaslr_init(void) */ module_alloc_base = (u64)_etext - MODULES_VSIZE; - if (kaslr_feature_override.val & kaslr_feature_override.mask & 0xf) { + if (cpuid_feature_extract_unsigned_field(arm64_sw_feature_override.val & + arm64_sw_feature_override.mask, + ARM64_SW_FEATURE_OVERRIDE_NOKASLR)) { pr_info("KASLR disabled on command line\n"); return 0; } -- cgit v1.2.3 From e2d6c906f0ac69559da887c0a2c3c10070e746c5 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Jun 2023 17:21:47 +0100 Subject: arm64: Add KVM_HVHE capability and has_hvhe() predicate Expose a capability keying the hVHE feature as well as a new predicate testing it. Nothing is so far using it, and nothing is enabling it yet. Signed-off-by: Marc Zyngier Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20230609162200.2024064-5-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/cpufeature.h | 1 + arch/arm64/include/asm/virt.h | 8 ++++++++ arch/arm64/kernel/cpufeature.c | 19 +++++++++++++++++++ arch/arm64/tools/cpucaps | 1 + 4 files changed, 29 insertions(+) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index bc1009890180..3d4b547ae312 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -16,6 +16,7 @@ #define cpu_feature(x) KERNEL_HWCAP_ ## x #define ARM64_SW_FEATURE_OVERRIDE_NOKASLR 0 +#define ARM64_SW_FEATURE_OVERRIDE_HVHE 4 #ifndef __ASSEMBLY__ diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 21e94068804d..5227db7640c8 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -142,6 +142,14 @@ static __always_inline bool is_protected_kvm_enabled(void) return cpus_have_final_cap(ARM64_KVM_PROTECTED_MODE); } +static __always_inline bool has_hvhe(void) +{ + if (is_vhe_hyp_code()) + return false; + + return cpus_have_final_cap(ARM64_KVM_HVHE); +} + static inline bool is_hyp_nvhe(void) { return is_hyp_mode_available() && !is_kernel_in_hyp_mode(); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 2d2b7bb5fa0c..c743b1a5ae31 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1998,6 +1998,19 @@ static bool has_nested_virt_support(const struct arm64_cpu_capabilities *cap, return true; } +static bool hvhe_possible(const struct arm64_cpu_capabilities *entry, + int __unused) +{ + u64 val; + + val = read_sysreg(id_aa64mmfr1_el1); + if (!cpuid_feature_extract_unsigned_field(val, ID_AA64MMFR1_EL1_VH_SHIFT)) + return false; + + val = arm64_sw_feature_override.val & arm64_sw_feature_override.mask; + return cpuid_feature_extract_unsigned_field(val, ARM64_SW_FEATURE_OVERRIDE_HVHE); +} + #ifdef CONFIG_ARM64_PAN static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) { @@ -2643,6 +2656,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_dit, ARM64_CPUID_FIELDS(ID_AA64PFR0_EL1, DIT, IMP) }, + { + .desc = "VHE for hypervisor only", + .capability = ARM64_KVM_HVHE, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = hvhe_possible, + }, {}, }; diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 40ba95472594..3c23a55d7c2f 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -47,6 +47,7 @@ HAS_TLB_RANGE HAS_VIRT_HOST_EXTN HAS_WFXT HW_DBM +KVM_HVHE KVM_PROTECTED_MODE MISMATCHED_CACHE_TYPE MTE -- cgit v1.2.3 From 7a26e1f51e3c26a1b4a1f087e2056f1554365682 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Jun 2023 17:21:48 +0100 Subject: arm64: Don't enable VHE for the kernel if OVERRIDE_HVHE is set If the OVERRIDE_HVHE SW override is set (as a precursor of the KVM_HVHE capability), do not enable VHE for the kernel and drop to EL1 as if VHE was either disabled or unavailable. Further changes will enable VHE at EL2 only, with the kernel still running at EL1. Signed-off-by: Marc Zyngier Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20230609162200.2024064-6-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kernel/hyp-stub.S | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index 9439240c3fcf..5c71e1019545 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -82,7 +82,15 @@ SYM_CODE_START_LOCAL(__finalise_el2) tbnz x1, #0, 1f // Needs to be VHE capable, obviously - check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 2f 1f x1 x2 + check_override id_aa64mmfr1 ID_AA64MMFR1_EL1_VH_SHIFT 0f 1f x1 x2 + +0: // Check whether we only want the hypervisor to run VHE, not the kernel + adr_l x1, arm64_sw_feature_override + ldr x2, [x1, FTR_OVR_VAL_OFFSET] + ldr x1, [x1, FTR_OVR_MASK_OFFSET] + and x2, x2, x1 + ubfx x2, x2, #ARM64_SW_FEATURE_OVERRIDE_HVHE, #4 + cbz x2, 2f 1: mov_q x0, HVC_STUB_ERR eret -- cgit v1.2.3 From 9e7462bbe00d8431694720804a50b8f48d8ed0b0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Jun 2023 17:21:49 +0100 Subject: arm64: Allow EL1 physical timer access when running VHE To initialise the timer access from EL2 when HCR_EL2.E2H is set, we must make use the CNTHCTL_EL2 formap used is appropriate. This amounts to shifting the timer/counter enable bits by 10 to the left. Signed-off-by: Marc Zyngier Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20230609162200.2024064-7-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/el2_setup.h | 5 +++++ arch/arm64/kvm/hyp/nvhe/hyp-init.S | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 037724b19c5c..225bf1f2514d 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -34,6 +34,11 @@ */ .macro __init_el2_timers mov x0, #3 // Enable EL1 physical timers + mrs x1, hcr_el2 + and x1, x1, #HCR_E2H + cbz x1, .LnVHE_\@ + lsl x0, x0, #10 +.LnVHE_\@: msr cnthctl_el2, x0 msr cntvoff_el2, xzr // Clear virtual offset .endm diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index a6d67c2bb5ae..f9ee10e29497 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -95,6 +95,15 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init) ldr x1, [x0, #NVHE_INIT_HCR_EL2] msr hcr_el2, x1 + mov x2, #HCR_E2H + and x2, x1, x2 + cbz x2, 1f + + mrs x1, cnthctl_el2 + and x1, x1, #~(BIT(0) | BIT(1)) + orr x1, x1, #(BIT(10) | BIT(11)) + msr cnthctl_el2, x1 +1: ldr x1, [x0, #NVHE_INIT_VTTBR] msr vttbr_el2, x1 -- cgit v1.2.3 From 659803aef48b1a43bf47c6b105f5e3cee6a1501c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Jun 2023 17:21:50 +0100 Subject: arm64: Use CPACR_EL1 format to set CPTR_EL2 when E2H is set When HCR_EL2.E2H is set, the CPTR_EL2 register takes the CPACR_EL1 format. Yes, this is good fun. Hack the bits of startup code that assume E2H=0 while setting up CPTR_EL2 to make them grok the CPTR_EL1 format. Signed-off-by: Marc Zyngier Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20230609162200.2024064-8-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/el2_setup.h | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index 225bf1f2514d..bba508ffa12d 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -129,8 +129,15 @@ .endm /* Coprocessor traps */ -.macro __init_el2_nvhe_cptr +.macro __init_el2_cptr + mrs x1, hcr_el2 + and x1, x1, #HCR_E2H + cbz x1, .LnVHE_\@ + mov x0, #(CPACR_EL1_FPEN_EL1EN | CPACR_EL1_FPEN_EL0EN) + b .Lset_cptr_\@ +.LnVHE_\@: mov x0, #0x33ff +.Lset_cptr_\@: msr cptr_el2, x0 // Disable copro. traps to EL2 .endm @@ -196,7 +203,7 @@ __init_el2_gicv3 __init_el2_hstr __init_el2_nvhe_idregs - __init_el2_nvhe_cptr + __init_el2_cptr __init_el2_fgt __init_el2_nvhe_prepare_eret .endm @@ -244,7 +251,17 @@ .Linit_sve_\@: /* SVE register access */ mrs x0, cptr_el2 // Disable SVE traps + mrs x1, hcr_el2 + and x1, x1, #HCR_E2H + cbz x1, .Lcptr_nvhe_\@ + + // VHE case + orr x0, x0, #(CPACR_EL1_ZEN_EL1EN | CPACR_EL1_ZEN_EL0EN) + b .Lset_cptr_\@ + +.Lcptr_nvhe_\@: // nVHE case bic x0, x0, #CPTR_EL2_TZ +.Lset_cptr_\@: msr cptr_el2, x0 isb mov x1, #ZCR_ELx_LEN_MASK // SVE: Enable full vector -- cgit v1.2.3 From 57e784b4079e9499ea1bafd56a0d422252aa2401 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Jun 2023 17:21:51 +0100 Subject: KVM: arm64: Remove alternatives from sysreg accessors in VHE hypervisor context In the VHE hypervisor code, we should be using the remapped VHE accessors, no ifs, no buts. No need to generate any alternative. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230609162200.2024064-9-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_hyp.h | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index bdd9cf546d95..fea04eb25cb4 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -16,6 +16,23 @@ DECLARE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DECLARE_PER_CPU(unsigned long, kvm_hyp_vector); DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); +/* + * Unified accessors for registers that have a different encoding + * between VHE and non-VHE. They must be specified without their "ELx" + * encoding, but with the SYS_ prefix, as defined in asm/sysreg.h. + */ + +#if defined(__KVM_VHE_HYPERVISOR__) + +#define read_sysreg_el0(r) read_sysreg_s(r##_EL02) +#define write_sysreg_el0(v,r) write_sysreg_s(v, r##_EL02) +#define read_sysreg_el1(r) read_sysreg_s(r##_EL12) +#define write_sysreg_el1(v,r) write_sysreg_s(v, r##_EL12) +#define read_sysreg_el2(r) read_sysreg_s(r##_EL1) +#define write_sysreg_el2(v,r) write_sysreg_s(v, r##_EL1) + +#else // !__KVM_VHE_HYPERVISOR__ + #define read_sysreg_elx(r,nvh,vh) \ ({ \ u64 reg; \ @@ -35,12 +52,6 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); : : "rZ" (__val)); \ } while (0) -/* - * Unified accessors for registers that have a different encoding - * between VHE and non-VHE. They must be specified without their "ELx" - * encoding, but with the SYS_ prefix, as defined in asm/sysreg.h. - */ - #define read_sysreg_el0(r) read_sysreg_elx(r, _EL0, _EL02) #define write_sysreg_el0(v,r) write_sysreg_elx(v, r, _EL0, _EL02) #define read_sysreg_el1(r) read_sysreg_elx(r, _EL1, _EL12) @@ -48,6 +59,8 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); #define read_sysreg_el2(r) read_sysreg_elx(r, _EL2, _EL1) #define write_sysreg_el2(v,r) write_sysreg_elx(v, r, _EL2, _EL1) +#endif // __KVM_VHE_HYPERVISOR__ + /* * Without an __arch_swab32(), we fall back to ___constant_swab32(), but the * static inline can allow the compiler to out-of-line this. KVM always wants -- cgit v1.2.3 From 6f617d3aa643e4ed6929f5b582759f4a73804034 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Jun 2023 17:21:52 +0100 Subject: KVM: arm64: Key use of VHE instructions in nVHE code off ARM64_KVM_HVHE We can now start with the fun stuff: if we enable VHE *only* for the hypervisor, we need to generate the VHE instructions when accessing the system registers. For this, reporpose the alternative sequence to be keyed off ARM64_KVM_HVHE in the nVHE hypervisor code, and only there. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230609162200.2024064-10-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_hyp.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index fea04eb25cb4..b7238c72a04c 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -33,12 +33,18 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); #else // !__KVM_VHE_HYPERVISOR__ +#if defined(__KVM_NVHE_HYPERVISOR__) +#define VHE_ALT_KEY ARM64_KVM_HVHE +#else +#define VHE_ALT_KEY ARM64_HAS_VIRT_HOST_EXTN +#endif + #define read_sysreg_elx(r,nvh,vh) \ ({ \ u64 reg; \ - asm volatile(ALTERNATIVE(__mrs_s("%0", r##nvh), \ + asm volatile(ALTERNATIVE(__mrs_s("%0", r##nvh), \ __mrs_s("%0", r##vh), \ - ARM64_HAS_VIRT_HOST_EXTN) \ + VHE_ALT_KEY) \ : "=r" (reg)); \ reg; \ }) @@ -48,7 +54,7 @@ DECLARE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); u64 __val = (u64)(v); \ asm volatile(ALTERNATIVE(__msr_s(r##nvh, "%x0"), \ __msr_s(r##vh, "%x0"), \ - ARM64_HAS_VIRT_HOST_EXTN) \ + VHE_ALT_KEY) \ : : "rZ" (__val)); \ } while (0) -- cgit v1.2.3 From d0daf5a21e635057e87cc05d6bca012157aa3ab7 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Jun 2023 17:21:53 +0100 Subject: KVM: arm64: Force HCR_EL2.E2H when ARM64_KVM_HVHE is set Obviously, in order to be able to use VHE whilst at EL2, we need to set HCR_EL2.E2H. Do so when ARM64_KVM_HVHE is set. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230609162200.2024064-11-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 14391826241c..c12276dd2cf4 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1676,6 +1676,8 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS; else params->hcr_el2 = HCR_HOST_NVHE_FLAGS; + if (cpus_have_final_cap(ARM64_KVM_HVHE)) + params->hcr_el2 |= HCR_E2H; params->vttbr = params->vtcr = 0; /* -- cgit v1.2.3 From cff3b5cf96edbb9e7448bca6261871272922a0df Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Jun 2023 17:21:54 +0100 Subject: KVM: arm64: Disable TTBR1_EL2 when using ARM64_KVM_HVHE When using hVHE, we end-up with two TTBRs at EL2. That's great, but we're not quite ready for this just yet. Disable TTBR1_EL2 by setting TCR_EL2.EPD1 so that we only translate via TTBR0_EL2. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230609162200.2024064-12-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index c12276dd2cf4..35b32cb6faa5 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1666,7 +1666,13 @@ static void __init cpu_prepare_hyp_mode(int cpu, u32 hyp_va_bits) params->mair_el2 = read_sysreg(mair_el1); - tcr = (read_sysreg(tcr_el1) & TCR_EL2_MASK) | TCR_EL2_RES1; + tcr = read_sysreg(tcr_el1); + if (cpus_have_final_cap(ARM64_KVM_HVHE)) { + tcr |= TCR_EPD1_MASK; + } else { + tcr &= TCR_EL2_MASK; + tcr |= TCR_EL2_RES1; + } tcr &= ~TCR_T0SZ_MASK; tcr |= TCR_T0SZ(hyp_va_bits); params->tcr_el2 = tcr; -- cgit v1.2.3 From 6537565fd9b7f169cda025482f6de2646cf7b60a Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Jun 2023 17:21:55 +0100 Subject: KVM: arm64: Adjust EL2 stage-1 leaf AP bits when ARM64_KVM_HVHE is set El2 stage-1 page-table format is subtly (and annoyingly) different when HCR_EL2.E2H is set. Take the ARM64_KVM_HVHE configuration into account when setting the AP bits. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230609162200.2024064-13-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/pgtable.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 5282cb9ca4cf..21d0d0b1cf15 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -21,8 +21,10 @@ #define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) #define KVM_PTE_LEAF_ATTR_LO_S1_AP GENMASK(7, 6) -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO 3 -#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW 1 +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RO \ + ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 2 : 3; }) +#define KVM_PTE_LEAF_ATTR_LO_S1_AP_RW \ + ({ cpus_have_final_cap(ARM64_KVM_HVHE) ? 0 : 1; }) #define KVM_PTE_LEAF_ATTR_LO_S1_SH GENMASK(9, 8) #define KVM_PTE_LEAF_ATTR_LO_S1_SH_IS 3 #define KVM_PTE_LEAF_ATTR_LO_S1_AF BIT(10) -- cgit v1.2.3 From 75c76ab5a641b64e872b6e136b5edf8a72009cc6 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Jun 2023 17:21:56 +0100 Subject: KVM: arm64: Rework CPTR_EL2 programming for HVHE configuration Just like we repainted the early arm64 code, we need to update the CPTR_EL2 accesses that are taking place in the nVHE code when hVHE is used, making them look as if they were CPACR_EL1 accesses. Just like the VHE code. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230609162200.2024064-14-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_arm.h | 4 +--- arch/arm64/include/asm/kvm_emulate.h | 31 +++++++++++++++++++++++++++++++ arch/arm64/kvm/arm.c | 2 +- arch/arm64/kvm/fpsimd.c | 4 ++-- arch/arm64/kvm/hyp/include/hyp/switch.h | 2 +- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 6 +++++- arch/arm64/kvm/hyp/nvhe/pkvm.c | 24 ++++++++++++++++++------ arch/arm64/kvm/hyp/nvhe/switch.c | 28 ++++++++++++++++------------ arch/arm64/kvm/hyp/vhe/switch.c | 2 +- arch/arm64/kvm/sys_regs.c | 2 +- 10 files changed, 77 insertions(+), 28 deletions(-) diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index baef29fcbeee..e448f8f7fd7e 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -285,7 +285,6 @@ #define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT) #define CPTR_EL2_TZ (1 << 8) #define CPTR_NVHE_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 (nVHE) */ -#define CPTR_EL2_DEFAULT CPTR_NVHE_EL2_RES1 #define CPTR_NVHE_EL2_RES0 (GENMASK(63, 32) | \ GENMASK(29, 21) | \ GENMASK(19, 14) | \ @@ -347,8 +346,7 @@ ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \ ECN(BKPT32), ECN(VECTOR32), ECN(BRK64), ECN(ERET) -#define CPACR_EL1_DEFAULT (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN |\ - CPACR_EL1_ZEN_EL1EN) +#define CPACR_EL1_TTA (1 << 28) #define kvm_mode_names \ { PSR_MODE_EL0t, "EL0t" }, \ diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index b31b32ecbe2d..4d82e622240d 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -570,4 +570,35 @@ static inline bool vcpu_has_feature(struct kvm_vcpu *vcpu, int feature) return test_bit(feature, vcpu->arch.features); } +static __always_inline u64 kvm_get_reset_cptr_el2(struct kvm_vcpu *vcpu) +{ + u64 val; + + if (has_vhe()) { + val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN | + CPACR_EL1_ZEN_EL1EN); + } else if (has_hvhe()) { + val = (CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN); + } else { + val = CPTR_NVHE_EL2_RES1; + + if (vcpu_has_sve(vcpu) && + (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED)) + val |= CPTR_EL2_TZ; + if (cpus_have_final_cap(ARM64_SME)) + val &= ~CPTR_EL2_TSM; + } + + return val; +} + +static __always_inline void kvm_reset_cptr_el2(struct kvm_vcpu *vcpu) +{ + u64 val = kvm_get_reset_cptr_el2(vcpu); + + if (has_vhe() || has_hvhe()) + write_sysreg(val, cpacr_el1); + else + write_sysreg(val, cptr_el2); +} #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 35b32cb6faa5..01c975a0acc9 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1240,7 +1240,7 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, } vcpu_reset_hcr(vcpu); - vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT; + vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); /* * Handle the "start in power-off" case. diff --git a/arch/arm64/kvm/fpsimd.c b/arch/arm64/kvm/fpsimd.c index 4c9dcd8fc939..8c1d0d4853df 100644 --- a/arch/arm64/kvm/fpsimd.c +++ b/arch/arm64/kvm/fpsimd.c @@ -180,7 +180,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) /* * If we have VHE then the Hyp code will reset CPACR_EL1 to - * CPACR_EL1_DEFAULT and we need to reenable SME. + * the default value and we need to reenable SME. */ if (has_vhe() && system_supports_sme()) { /* Also restore EL0 state seen on entry */ @@ -210,7 +210,7 @@ void kvm_arch_vcpu_put_fp(struct kvm_vcpu *vcpu) /* * The FPSIMD/SVE state in the CPU has not been touched, and we * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been - * reset to CPACR_EL1_DEFAULT by the Hyp code, disabling SVE + * reset by kvm_reset_cptr_el2() in the Hyp code, disabling SVE * for EL0. To avoid spurious traps, restore the trap state * seen by kvm_arch_vcpu_load_fp(): */ diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index e78a08a72a3c..b5f53d3fc14c 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -192,7 +192,7 @@ static bool kvm_hyp_handle_fpsimd(struct kvm_vcpu *vcpu, u64 *exit_code) /* Valid trap. Switch the context: */ /* First disable enough traps to allow us to update the registers */ - if (has_vhe()) { + if (has_vhe() || has_hvhe()) { reg = CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN; if (sve_guest) reg |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 728e01d4536b..ce602f9e93eb 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -392,7 +392,11 @@ void handle_trap(struct kvm_cpu_context *host_ctxt) handle_host_smc(host_ctxt); break; case ESR_ELx_EC_SVE: - sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0); + if (has_hvhe()) + sysreg_clear_set(cpacr_el1, 0, (CPACR_EL1_ZEN_EL1EN | + CPACR_EL1_ZEN_EL0EN)); + else + sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0); isb(); sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); break; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index a06ece14a6d8..5d5ee735a7d9 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -27,6 +27,7 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) u64 hcr_set = HCR_RW; u64 hcr_clear = 0; u64 cptr_set = 0; + u64 cptr_clear = 0; /* Protected KVM does not support AArch32 guests. */ BUILD_BUG_ON(FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_EL0), @@ -57,12 +58,17 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) } /* Trap SVE */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) - cptr_set |= CPTR_EL2_TZ; + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), feature_ids)) { + if (has_hvhe()) + cptr_clear |= CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN; + else + cptr_set |= CPTR_EL2_TZ; + } vcpu->arch.hcr_el2 |= hcr_set; vcpu->arch.hcr_el2 &= ~hcr_clear; vcpu->arch.cptr_el2 |= cptr_set; + vcpu->arch.cptr_el2 &= ~cptr_clear; } /* @@ -120,8 +126,12 @@ static void pvm_init_traps_aa64dfr0(struct kvm_vcpu *vcpu) mdcr_set |= MDCR_EL2_TTRF; /* Trap Trace */ - if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) - cptr_set |= CPTR_EL2_TTA; + if (!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_TraceVer), feature_ids)) { + if (has_hvhe()) + cptr_set |= CPACR_EL1_TTA; + else + cptr_set |= CPTR_EL2_TTA; + } vcpu->arch.mdcr_el2 |= mdcr_set; vcpu->arch.mdcr_el2 &= ~mdcr_clear; @@ -176,8 +186,10 @@ static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) /* Clear res0 and set res1 bits to trap potential new features. */ vcpu->arch.hcr_el2 &= ~(HCR_RES0); vcpu->arch.mdcr_el2 &= ~(MDCR_EL2_RES0); - vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; - vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); + if (!has_hvhe()) { + vcpu->arch.cptr_el2 |= CPTR_NVHE_EL2_RES1; + vcpu->arch.cptr_el2 &= ~(CPTR_NVHE_EL2_RES0); + } } /* diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index 71fa16a0dc77..5fa0b1c9ee8d 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -44,13 +44,24 @@ static void __activate_traps(struct kvm_vcpu *vcpu) __activate_traps_common(vcpu); val = vcpu->arch.cptr_el2; - val |= CPTR_EL2_TTA | CPTR_EL2_TAM; + val |= CPTR_EL2_TAM; /* Same bit irrespective of E2H */ + val |= has_hvhe() ? CPACR_EL1_TTA : CPTR_EL2_TTA; + if (cpus_have_final_cap(ARM64_SME)) { + if (has_hvhe()) + val &= ~(CPACR_EL1_SMEN_EL1EN | CPACR_EL1_SMEN_EL0EN); + else + val |= CPTR_EL2_TSM; + } + if (!guest_owns_fp_regs(vcpu)) { - val |= CPTR_EL2_TFP | CPTR_EL2_TZ; + if (has_hvhe()) + val &= ~(CPACR_EL1_FPEN_EL0EN | CPACR_EL1_FPEN_EL1EN | + CPACR_EL1_ZEN_EL0EN | CPACR_EL1_ZEN_EL1EN); + else + val |= CPTR_EL2_TFP | CPTR_EL2_TZ; + __activate_traps_fpsimd32(vcpu); } - if (cpus_have_final_cap(ARM64_SME)) - val |= CPTR_EL2_TSM; write_sysreg(val, cptr_el2); write_sysreg(__this_cpu_read(kvm_hyp_vector), vbar_el2); @@ -73,7 +84,6 @@ static void __activate_traps(struct kvm_vcpu *vcpu) static void __deactivate_traps(struct kvm_vcpu *vcpu) { extern char __kvm_hyp_host_vector[]; - u64 cptr; ___deactivate_traps(vcpu); @@ -98,13 +108,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); - cptr = CPTR_EL2_DEFAULT; - if (vcpu_has_sve(vcpu) && (vcpu->arch.fp_state == FP_STATE_GUEST_OWNED)) - cptr |= CPTR_EL2_TZ; - if (cpus_have_final_cap(ARM64_SME)) - cptr &= ~CPTR_EL2_TSM; - - write_sysreg(cptr, cptr_el2); + kvm_reset_cptr_el2(vcpu); write_sysreg(__kvm_hyp_host_vector, vbar_el2); } diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index 3d868e84c7a0..034777a23b74 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -84,7 +84,7 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) */ asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); - write_sysreg(CPACR_EL1_DEFAULT, cpacr_el1); + kvm_reset_cptr_el2(vcpu); if (!arm64_kernel_unmapped_at_el0()) host_vectors = __this_cpu_read(this_cpu_vector); diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 71b12094d613..11ededa80737 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2180,7 +2180,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(ACTLR_EL2, access_rw, reset_val, 0), EL2_REG(HCR_EL2, access_rw, reset_val, 0), EL2_REG(MDCR_EL2, access_rw, reset_val, 0), - EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_EL2_DEFAULT ), + EL2_REG(CPTR_EL2, access_rw, reset_val, CPTR_NVHE_EL2_RES1), EL2_REG(HSTR_EL2, access_rw, reset_val, 0), EL2_REG(HACR_EL2, access_rw, reset_val, 0), -- cgit v1.2.3 From aca18585db4fd0ed0bd7420eddcdc39a535194fe Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Jun 2023 17:21:57 +0100 Subject: KVM: arm64: Program the timer traps with VHE layout in hVHE mode Just like the rest of the timer code, we need to shift the enable bits around when HCR_EL2.E2H is set, which is the case in hVHE mode. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230609162200.2024064-15-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/timer-sr.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/timer-sr.c b/arch/arm64/kvm/hyp/nvhe/timer-sr.c index b185ac0dbd47..3aaab20ae5b4 100644 --- a/arch/arm64/kvm/hyp/nvhe/timer-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/timer-sr.c @@ -17,21 +17,24 @@ void __kvm_timer_set_cntvoff(u64 cntvoff) } /* - * Should only be called on non-VHE systems. + * Should only be called on non-VHE or hVHE setups. * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). */ void __timer_disable_traps(struct kvm_vcpu *vcpu) { - u64 val; + u64 val, shift = 0; + + if (has_hvhe()) + shift = 10; /* Allow physical timer/counter access for the host */ val = read_sysreg(cnthctl_el2); - val |= CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN; + val |= (CNTHCTL_EL1PCTEN | CNTHCTL_EL1PCEN) << shift; write_sysreg(val, cnthctl_el2); } /* - * Should only be called on non-VHE systems. + * Should only be called on non-VHE or hVHE setups. * VHE systems use EL2 timers and configure EL1 timers in kvm_timer_init_vhe(). */ void __timer_enable_traps(struct kvm_vcpu *vcpu) @@ -50,5 +53,10 @@ void __timer_enable_traps(struct kvm_vcpu *vcpu) else clr |= CNTHCTL_EL1PCTEN; + if (has_hvhe()) { + clr <<= 10; + set <<= 10; + } + sysreg_clear_set(cnthctl_el2, clr, set); } -- cgit v1.2.3 From 38cba55008e5fab9181302ea5daf79e2070c9998 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Jun 2023 17:21:58 +0100 Subject: KVM: arm64: Force HCR_E2H in guest context when ARM64_KVM_HVHE is set Also make sure HCR_EL2.E2H is set when switching HCR_EL2 in guest context. Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230609162200.2024064-16-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_emulate.h | 2 +- arch/arm64/kvm/hyp/nvhe/pkvm.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 4d82e622240d..cf40d19a72f8 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -74,7 +74,7 @@ static __always_inline bool vcpu_el1_is_32bit(struct kvm_vcpu *vcpu) static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu) { vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; - if (is_kernel_in_hyp_mode()) + if (has_vhe() || has_hvhe()) vcpu->arch.hcr_el2 |= HCR_E2H; if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN)) { /* route synchronous external abort exceptions to EL2 */ diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 5d5ee735a7d9..8033ef353a5d 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -44,6 +44,9 @@ static void pvm_init_traps_aa64pfr0(struct kvm_vcpu *vcpu) BUILD_BUG_ON(!FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AdvSIMD), PVM_ID_AA64PFR0_ALLOW)); + if (has_hvhe()) + hcr_set |= HCR_E2H; + /* Trap RAS unless all current versions are supported */ if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_RAS), feature_ids) < ID_AA64PFR0_EL1_RAS_V1P1) { -- cgit v1.2.3 From ad744e8cb346743dd76425942910c7b75a782ed0 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 9 Jun 2023 17:21:59 +0100 Subject: arm64: Allow arm64_sw.hvhe on command line Add the arm64_sw.hvhe=1 option to force the use of the hVHE mode in the hypervisor code only. This enables the hVHE mode of operation when using KVM on VHE hardware. Signed-off-by: Marc Zyngier Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20230609162200.2024064-17-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kernel/idreg-override.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/arm64/kernel/idreg-override.c b/arch/arm64/kernel/idreg-override.c index 8c93b6198bf5..c553d30089e5 100644 --- a/arch/arm64/kernel/idreg-override.c +++ b/arch/arm64/kernel/idreg-override.c @@ -138,11 +138,22 @@ static const struct ftr_set_desc smfr0 __initconst = { }, }; +static bool __init hvhe_filter(u64 val) +{ + u64 mmfr1 = read_sysreg(id_aa64mmfr1_el1); + + return (val == 1 && + lower_32_bits(__boot_status) == BOOT_CPU_MODE_EL2 && + cpuid_feature_extract_unsigned_field(mmfr1, + ID_AA64MMFR1_EL1_VH_SHIFT)); +} + static const struct ftr_set_desc sw_features __initconst = { .name = "arm64_sw", .override = &arm64_sw_feature_override, .fields = { FIELD("nokaslr", ARM64_SW_FEATURE_OVERRIDE_NOKASLR, NULL), + FIELD("hvhe", ARM64_SW_FEATURE_OVERRIDE_HVHE, hvhe_filter), {} }, }; -- cgit v1.2.3 From 285cff4c0454340a4dc53f46e67f2cb1c293bd74 Mon Sep 17 00:00:00 2001 From: Nico Boehr Date: Fri, 24 Mar 2023 15:54:23 +0100 Subject: KVM: s390: fix KVM_S390_GET_CMMA_BITS for GFNs in memslot holes The KVM_S390_GET_CMMA_BITS ioctl may return incorrect values when userspace specifies a start_gfn outside of memslots. This can occur when a VM has multiple memslots with a hole in between: +-----+----------+--------+--------+ | ... | Slot N-1 | | Slot N | +-----+----------+--------+--------+ ^ ^ ^ ^ | | | | GFN A A+B | | A+B+C | A+B+C+D When userspace specifies a GFN in [A+B, A+B+C), it would expect to get the CMMA values of the first dirty page in Slot N. However, userspace may get a start_gfn of A+B+C+D with a count of 0, hence completely skipping over any dirty pages in slot N. The error is in kvm_s390_next_dirty_cmma(), which assumes gfn_to_memslot_approx() will return the memslot _below_ the specified GFN when the specified GFN lies outside a memslot. In reality it may return either the memslot below or above the specified GFN. When a memslot above the specified GFN is returned this happens: - ofs is calculated, but since the memslot's base_gfn is larger than the specified cur_gfn, ofs will underflow to a huge number. - ofs is passed to find_next_bit(). Since ofs will exceed the memslot's number of pages, the number of pages in the memslot is returned, completely skipping over all bits in the memslot userspace would be interested in. Fix this by resetting ofs to zero when a memslot _above_ cur_gfn is returned (cur_gfn < ms->base_gfn). Signed-off-by: Nico Boehr Reviewed-by: Claudio Imbrenda Fixes: afdad61615cc ("KVM: s390: Fix storage attributes migration with memory slots") Message-Id: <20230324145424.293889-2-nrb@linux.ibm.com> Signed-off-by: Claudio Imbrenda Signed-off-by: Janosch Frank --- arch/s390/kvm/kvm-s390.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 17b81659cdb2..670019696464 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2156,6 +2156,10 @@ static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots, ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); ofs = 0; } + + if (cur_gfn < ms->base_gfn) + ofs = 0; + ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs); while (ofs >= ms->npages && (mnode = rb_next(mnode))) { ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]); -- cgit v1.2.3 From 21e87daece5aed25f47f051af5fccbbd39164c88 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 13 Jun 2023 09:43:39 +0300 Subject: KVM: arm64: timers: Fix resource leaks in kvm_timer_hyp_init() Smatch detected this bug: arch/arm64/kvm/arch_timer.c:1425 kvm_timer_hyp_init() warn: missing unwind goto? There are two resources to be freed the vtimer and ptimer. The line that Smatch complains about should free the vtimer first before returning and then after that cleanup code should free the ptimer. I've added a out_free_ptimer_irq to free the ptimer and renamed the existing label to out_free_vtimer_irq. Fixes: 9e01dc76be6a ("KVM: arm/arm64: arch_timer: Assign the phys timer on VHE systems") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/72fffc35-7669-40b1-9d14-113c43269cf3@kili.mountain Signed-off-by: Oliver Upton --- arch/arm64/kvm/arch_timer.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c index 05b022be885b..0696732fa38c 100644 --- a/arch/arm64/kvm/arch_timer.c +++ b/arch/arm64/kvm/arch_timer.c @@ -1406,7 +1406,7 @@ int __init kvm_timer_hyp_init(bool has_gic) kvm_get_running_vcpus()); if (err) { kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); - goto out_free_irq; + goto out_free_vtimer_irq; } static_branch_enable(&has_gic_active_state); @@ -1422,7 +1422,7 @@ int __init kvm_timer_hyp_init(bool has_gic) if (err) { kvm_err("kvm_arch_timer: can't request ptimer interrupt %d (%d)\n", host_ptimer_irq, err); - return err; + goto out_free_vtimer_irq; } if (has_gic) { @@ -1430,7 +1430,7 @@ int __init kvm_timer_hyp_init(bool has_gic) kvm_get_running_vcpus()); if (err) { kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); - goto out_free_irq; + goto out_free_ptimer_irq; } } @@ -1439,11 +1439,15 @@ int __init kvm_timer_hyp_init(bool has_gic) kvm_err("kvm_arch_timer: invalid physical timer IRQ: %d\n", info->physical_irq); err = -ENODEV; - goto out_free_irq; + goto out_free_vtimer_irq; } return 0; -out_free_irq: + +out_free_ptimer_irq: + if (info->physical_irq > 0) + free_percpu_irq(host_ptimer_irq, kvm_get_running_vcpus()); +out_free_vtimer_irq: free_percpu_irq(host_vtimer_irq, kvm_get_running_vcpus()); return err; } -- cgit v1.2.3 From a30642570855faa1992f333ce5cb60351b0a37da Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 6 Jun 2023 17:46:36 -0700 Subject: KVM: x86: Update comments about MSR lists exposed to userspace Refresh comments about msrs_to_save, emulated_msrs, and msr_based_features to remove stale references left behind by commit 2374b7310b66 (KVM: x86/pmu: Use separate array for defining "PMU MSRs to save"), and to better reflect the current reality, e.g. emulated_msrs is no longer just for MSRs that are "kvm-specific". Reported-by: Binbin Wu Link: https://lore.kernel.org/r/20230607004636.1421424-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 355f3df8dbad..07b0f960de07 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1423,15 +1423,14 @@ int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu) EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc); /* - * List of msr numbers which we expose to userspace through KVM_GET_MSRS - * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. - * - * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) - * extract the supported MSRs from the related const lists. - * msrs_to_save is selected from the msrs_to_save_all to reflect the - * capabilities of the host cpu. This capabilities test skips MSRs that are - * kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs - * may depend on host virtualization features rather than host cpu features. + * The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features) track + * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS, + * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. msrs_to_save holds MSRs that + * require host support, i.e. should be probed via RDMSR. emulated_msrs holds + * MSRs that KVM emulates without strictly requiring host support. + * msr_based_features holds MSRs that enumerate features, i.e. are effectively + * CPUID leafs. Note, msr_based_features isn't mutually exclusive with + * msrs_to_save and emulated_msrs. */ static const u32 msrs_to_save_base[] = { @@ -1527,11 +1526,11 @@ static const u32 emulated_msrs_all[] = { MSR_IA32_UCODE_REV, /* - * The following list leaves out MSRs whose values are determined - * by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs. - * We always support the "true" VMX control MSRs, even if the host - * processor does not, so I am putting these registers here rather - * than in msrs_to_save_all. + * KVM always supports the "true" VMX control MSRs, even if the host + * does not. The VMX MSRs as a whole are considered "emulated" as KVM + * doesn't strictly require them to exist in the host (ignoring that + * KVM would refuse to load in the first place if the core set of MSRs + * aren't supported). */ MSR_IA32_VMX_BASIC, MSR_IA32_VMX_TRUE_PINBASED_CTLS, -- cgit v1.2.3 From 0b210faf337314e4bc88e796218bc70c72a51209 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 1 Jun 2023 17:58:59 -0700 Subject: KVM: x86/mmu: Add "never" option to allow sticky disabling of nx_huge_pages Add a "never" option to the nx_huge_pages module param to allow userspace to do a one-way hard disabling of the mitigation, and don't create the per-VM recovery threads when the mitigation is hard disabled. Letting userspace pinky swear that userspace doesn't want to enable NX mitigation (without reloading KVM) allows certain use cases to avoid the latency problems associated with spawning a kthread for each VM. E.g. in FaaS use cases, the guest kernel is trusted and the host may create 100+ VMs per logical CPU, which can result in 100ms+ latencies when a burst of VMs is created. Reported-by: Li RongQing Closes: https://lore.kernel.org/all/1679555884-32544-1-git-send-email-lirongqing@baidu.com Cc: Yong He Cc: Robert Hoo Cc: Kai Huang Reviewed-by: Robert Hoo Acked-by: Kai Huang Tested-by: Luiz Capitulino Reviewed-by: Li RongQing Link: https://lore.kernel.org/r/20230602005859.784190-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 41 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 258f12235874..a2d0d06a8768 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -58,6 +58,8 @@ extern bool itlb_multihit_kvm_mitigation; +static bool nx_hugepage_mitigation_hard_disabled; + int __read_mostly nx_huge_pages = -1; static uint __read_mostly nx_huge_pages_recovery_period_ms; #ifdef CONFIG_PREEMPT_RT @@ -67,12 +69,13 @@ static uint __read_mostly nx_huge_pages_recovery_ratio = 0; static uint __read_mostly nx_huge_pages_recovery_ratio = 60; #endif +static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp); static int set_nx_huge_pages(const char *val, const struct kernel_param *kp); static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops nx_huge_pages_ops = { .set = set_nx_huge_pages, - .get = param_get_bool, + .get = get_nx_huge_pages, }; static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = { @@ -6852,6 +6855,14 @@ static void mmu_destroy_caches(void) kmem_cache_destroy(mmu_page_header_cache); } +static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp) +{ + if (nx_hugepage_mitigation_hard_disabled) + return sprintf(buffer, "never\n"); + + return param_get_bool(buffer, kp); +} + static bool get_nx_auto_mode(void) { /* Return true when CPU has the bug, and mitigations are ON */ @@ -6868,15 +6879,29 @@ static int set_nx_huge_pages(const char *val, const struct kernel_param *kp) bool old_val = nx_huge_pages; bool new_val; + if (nx_hugepage_mitigation_hard_disabled) + return -EPERM; + /* In "auto" mode deploy workaround only if CPU has the bug. */ - if (sysfs_streq(val, "off")) + if (sysfs_streq(val, "off")) { new_val = 0; - else if (sysfs_streq(val, "force")) + } else if (sysfs_streq(val, "force")) { new_val = 1; - else if (sysfs_streq(val, "auto")) + } else if (sysfs_streq(val, "auto")) { new_val = get_nx_auto_mode(); - else if (kstrtobool(val, &new_val) < 0) + } else if (sysfs_streq(val, "never")) { + new_val = 0; + + mutex_lock(&kvm_lock); + if (!list_empty(&vm_list)) { + mutex_unlock(&kvm_lock); + return -EBUSY; + } + nx_hugepage_mitigation_hard_disabled = true; + mutex_unlock(&kvm_lock); + } else if (kstrtobool(val, &new_val) < 0) { return -EINVAL; + } __set_nx_huge_pages(new_val); @@ -7014,6 +7039,9 @@ static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel uint old_period, new_period; int err; + if (nx_hugepage_mitigation_hard_disabled) + return -EPERM; + was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period); err = param_set_uint(val, kp); @@ -7169,6 +7197,9 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) { int err; + if (nx_hugepage_mitigation_hard_disabled) + return 0; + err = kvm_vm_create_worker_thread(kvm, kvm_nx_huge_page_recovery_worker, 0, "kvm-nx-lpage-recovery", &kvm->arch.nx_huge_page_recovery_thread); -- cgit v1.2.3 From 106ed2cad9f7bd803bd31a18fe7a9219b077bf95 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 6 Jun 2023 17:44:49 -0700 Subject: KVM: SVM: WARN, but continue, if misc_cg_set_capacity() fails WARN and continue if misc_cg_set_capacity() fails, as the only scenario in which it can fail is if the specified resource is invalid, which should never happen when CONFIG_KVM_AMD_SEV=y. Deliberately not bailing "fixes" a theoretical bug where KVM would leak the ASID bitmaps on failure, which again can't happen. If the impossible should happen, the end result is effectively the same with respect to SEV and SEV-ES (they are unusable), while continuing on has the advantage of letting KVM load, i.e. userspace can still run non-SEV guests. Reported-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/r/20230607004449.1421131-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/svm/sev.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index d65578d8784d..07756b7348ae 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -2216,9 +2216,7 @@ void __init sev_hardware_setup(void) } sev_asid_count = max_sev_asid - min_sev_asid + 1; - if (misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)) - goto out; - + WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV, sev_asid_count)); sev_supported = true; /* SEV-ES support requested? */ @@ -2243,9 +2241,7 @@ void __init sev_hardware_setup(void) goto out; sev_es_asid_count = min_sev_asid - 1; - if (misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)) - goto out; - + WARN_ON_ONCE(misc_cg_set_capacity(MISC_CG_RES_SEV_ES, sev_es_asid_count)); sev_es_supported = true; out: -- cgit v1.2.3 From 5ea5ca3c2b4bf4090232e18cfc515dcb52f914a6 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 7 Feb 2023 20:37:12 +0800 Subject: KVM: destruct kvm_io_device while unregistering it from kvm_io_bus Current usage of kvm_io_device requires users to destruct it with an extra call of kvm_iodevice_destructor after the device gets unregistered from kvm_io_bus. This is not necessary and can cause errors if a user forgot to make the extra call. Simplify the usage by combining kvm_iodevice_destructor into kvm_io_bus_unregister_dev. This reduces LOCs a bit for users and can avoid the leakage of destructing the device explicitly. Signed-off-by: Wei Wang Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20230207123713.3905-2-wei.w.wang@intel.com Signed-off-by: Sean Christopherson --- include/kvm/iodev.h | 6 ------ virt/kvm/coalesced_mmio.c | 9 ++------- virt/kvm/eventfd.c | 1 - virt/kvm/kvm_main.c | 23 +++++++++++++++-------- 4 files changed, 17 insertions(+), 22 deletions(-) diff --git a/include/kvm/iodev.h b/include/kvm/iodev.h index d75fc4365746..56619e33251e 100644 --- a/include/kvm/iodev.h +++ b/include/kvm/iodev.h @@ -55,10 +55,4 @@ static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu, : -EOPNOTSUPP; } -static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) -{ - if (dev->ops->destructor) - dev->ops->destructor(dev); -} - #endif /* __KVM_IODEV_H__ */ diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c index 5ef88f5a0864..1b90acb6e3fe 100644 --- a/virt/kvm/coalesced_mmio.c +++ b/virt/kvm/coalesced_mmio.c @@ -186,15 +186,10 @@ int kvm_vm_ioctl_unregister_coalesced_mmio(struct kvm *kvm, coalesced_mmio_in_range(dev, zone->addr, zone->size)) { r = kvm_io_bus_unregister_dev(kvm, zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev); - - kvm_iodevice_destructor(&dev->dev); - /* * On failure, unregister destroys all devices on the - * bus _except_ the target device, i.e. coalesced_zones - * has been modified. Bail after destroying the target - * device, there's no need to restart the walk as there - * aren't any zones left. + * bus, including the target device. There's no need + * to restart the walk as there aren't any zones left. */ if (r) break; diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 7c42441425cf..4d47fffe03d9 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -931,7 +931,6 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, bus = kvm_get_bus(kvm, bus_idx); if (bus) bus->ioeventfd_count--; - ioeventfd_release(p); ret = 0; break; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 64dd940c549e..b8242607392a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5297,6 +5297,12 @@ static void hardware_disable_all(void) } #endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ +static void kvm_iodevice_destructor(struct kvm_io_device *dev) +{ + if (dev->ops->destructor) + dev->ops->destructor(dev); +} + static void kvm_io_bus_destroy(struct kvm_io_bus *bus) { int i; @@ -5520,7 +5526,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_io_device *dev) { - int i, j; + int i; struct kvm_io_bus *new_bus, *bus; lockdep_assert_held(&kvm->slots_lock); @@ -5550,18 +5556,19 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx, rcu_assign_pointer(kvm->buses[bus_idx], new_bus); synchronize_srcu_expedited(&kvm->srcu); - /* Destroy the old bus _after_ installing the (null) bus. */ + /* + * If NULL bus is installed, destroy the old bus, including all the + * attached devices. Otherwise, destroy the caller's device only. + */ if (!new_bus) { pr_err("kvm: failed to shrink bus, removing it completely\n"); - for (j = 0; j < bus->dev_count; j++) { - if (j == i) - continue; - kvm_iodevice_destructor(bus->range[j].dev); - } + kvm_io_bus_destroy(bus); + return -ENOMEM; } + kvm_iodevice_destructor(dev); kfree(bus); - return new_bus ? 0 : -ENOMEM; + return 0; } struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx, -- cgit v1.2.3 From cc77b95acf3c7d9a24204b0555fed2014f300fd5 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 7 Feb 2023 20:37:13 +0800 Subject: kvm/eventfd: use list_for_each_entry when deassign ioeventfd Simpify kvm_deassign_ioeventfd_idx to use list_for_each_entry as the loop just ends at the entry that's found and deleted. Note, coalesced_mmio_ops and ioeventfd_ops are the only instances of kvm_io_device_ops that implement a destructor, all other callers of kvm_io_bus_unregister_dev() are unaffected by this change. Suggested-by: Michal Luczaj Signed-off-by: Wei Wang Reviewed-by: Sean Christopherson Link: https://lore.kernel.org/r/20230207123713.3905-3-wei.w.wang@intel.com [sean: call out that only select users implement a destructor] Signed-off-by: Sean Christopherson --- virt/kvm/eventfd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 4d47fffe03d9..89912a17f5d5 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -901,7 +901,7 @@ static int kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, struct kvm_ioeventfd *args) { - struct _ioeventfd *p, *tmp; + struct _ioeventfd *p; struct eventfd_ctx *eventfd; struct kvm_io_bus *bus; int ret = -ENOENT; @@ -915,8 +915,7 @@ kvm_deassign_ioeventfd_idx(struct kvm *kvm, enum kvm_bus bus_idx, mutex_lock(&kvm->slots_lock); - list_for_each_entry_safe(p, tmp, &kvm->ioeventfds, list) { - + list_for_each_entry(p, &kvm->ioeventfds, list) { if (p->bus_idx != bus_idx || p->eventfd != eventfd || p->addr != args->addr || -- cgit v1.2.3 From 5ed19528db8ddcf0113d721f67a381be3e30c65a Mon Sep 17 00:00:00 2001 From: Yu Zhang Date: Thu, 1 Jun 2023 16:03:38 +0800 Subject: KVM: selftests: Add new CFLAGS to generate dependency files Add "-MD" in CFLAGS to generate dependency files. Currently, each time a header file is updated in KVM selftest, we will have to run "make clean && make" to rebuild the whole test suite. By adding new compiling flags and dependent rules in Makefile, we do not need to make clean && make each time a header file is updated. Signed-off-by: Yu Zhang Link: https://lore.kernel.org/r/20230601080338.212942-1-yu.c.zhang@linux.intel.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/Makefile | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index ee41ff0c5a86..de10581ea108 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -184,6 +184,8 @@ TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(ARCH_DIR)) TEST_GEN_PROGS_EXTENDED += $(TEST_GEN_PROGS_EXTENDED_$(ARCH_DIR)) LIBKVM += $(LIBKVM_$(ARCH_DIR)) +OVERRIDE_TARGETS = 1 + # lib.mak defines $(OUTPUT), prepends $(OUTPUT)/ to $(TEST_GEN_PROGS), and most # importantly defines, i.e. overwrites, $(CC) (unless `make -e` or `make CC=`, # which causes the environment variable to override the makefile). @@ -198,7 +200,7 @@ else LINUX_TOOL_ARCH_INCLUDE = $(top_srcdir)/tools/arch/$(ARCH)/include endif CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ - -Wno-gnu-variable-sized-type-not-at-end \ + -Wno-gnu-variable-sized-type-not-at-end -MD\ -fno-builtin-memcmp -fno-builtin-memcpy -fno-builtin-memset \ -fno-stack-protector -fno-PIE -I$(LINUX_TOOL_INCLUDE) \ -I$(LINUX_TOOL_ARCH_INCLUDE) -I$(LINUX_HDR_PATH) -Iinclude \ @@ -225,7 +227,18 @@ LIBKVM_S_OBJ := $(patsubst %.S, $(OUTPUT)/%.o, $(LIBKVM_S)) LIBKVM_STRING_OBJ := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBKVM_STRING)) LIBKVM_OBJS = $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ) $(LIBKVM_STRING_OBJ) -EXTRA_CLEAN += $(LIBKVM_OBJS) cscope.* +TEST_GEN_OBJ = $(patsubst %, %.o, $(TEST_GEN_PROGS)) +TEST_GEN_OBJ += $(patsubst %, %.o, $(TEST_GEN_PROGS_EXTENDED)) +TEST_DEP_FILES = $(patsubst %.o, %.d, $(TEST_GEN_OBJ)) +TEST_DEP_FILES += $(patsubst %.o, %.d, $(LIBKVM_OBJS)) +-include $(TEST_DEP_FILES) + +$(TEST_GEN_PROGS) $(TEST_GEN_PROGS_EXTENDED): %: %.o + $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $< $(LIBKVM_OBJS) $(LDLIBS) -o $@ +$(TEST_GEN_OBJ): $(OUTPUT)/%.o: %.c + $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ + +EXTRA_CLEAN += $(LIBKVM_OBJS) $(TEST_DEP_FILES) $(TEST_GEN_OBJ) cscope.* x := $(shell mkdir -p $(sort $(dir $(LIBKVM_C_OBJ) $(LIBKVM_S_OBJ)))) $(LIBKVM_C_OBJ): $(OUTPUT)/%.o: %.c -- cgit v1.2.3 From 8c15c2a0281087d19f62d7c2b5ab1f9e961b8d97 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Wed, 14 Jun 2023 12:25:59 +0000 Subject: KVM: arm64: Use different pointer authentication keys for pKVM When the use of pointer authentication is enabled in the kernel it applies to both the kernel itself as well as KVM's nVHE hypervisor. The same keys are used for both the kernel and the nVHE hypervisor, which is less than desirable for pKVM as the host is not trusted at runtime. Naturally, the fix is to use a different set of keys for the hypervisor when running in protected mode. Have the host generate a new set of keys for the hypervisor before deprivileging the kernel. While there might be other sources of random directly available at EL2, this keeps the implementation simple, and the host is trusted anyways until it is deprivileged. Since the host and hypervisor no longer share a set of pointer authentication keys, start context switching them on the host entry/exit path exactly as we do for guest entry/exit. There is no need to handle CPU migration as the nVHE code is not migratable in the first place. Signed-off-by: Mostafa Saleh Link: https://lore.kernel.org/r/20230614122600.2098901-1-smostafa@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 26 ++++++++++++++++++++++++++ arch/arm64/kvm/hyp/nvhe/host.S | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 14391826241c..dd03b52f035d 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -51,6 +51,8 @@ DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); +DECLARE_KVM_NVHE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); + static bool vgic_present; static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled); @@ -2067,6 +2069,26 @@ static int __init kvm_hyp_init_protection(u32 hyp_va_bits) return 0; } +static void pkvm_hyp_init_ptrauth(void) +{ + struct kvm_cpu_context *hyp_ctxt; + int cpu; + + for_each_possible_cpu(cpu) { + hyp_ctxt = per_cpu_ptr_nvhe_sym(kvm_hyp_ctxt, cpu); + hyp_ctxt->sys_regs[APIAKEYLO_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APIAKEYHI_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APIBKEYLO_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APIBKEYHI_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APDAKEYLO_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APDAKEYHI_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APDBKEYLO_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APDBKEYHI_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APGAKEYLO_EL1] = get_random_long(); + hyp_ctxt->sys_regs[APGAKEYHI_EL1] = get_random_long(); + } +} + /* Inits Hyp-mode on all online CPUs */ static int __init init_hyp_mode(void) { @@ -2228,6 +2250,10 @@ static int __init init_hyp_mode(void) kvm_hyp_init_symbols(); if (is_protected_kvm_enabled()) { + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) && + cpus_have_const_cap(ARM64_HAS_ADDRESS_AUTH)) + pkvm_hyp_init_ptrauth(); + init_cpu_logical_map(); if (!init_psci_relay()) { diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index b6c0188c4b35..c87c63133e10 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -10,6 +10,7 @@ #include #include #include +#include .text @@ -37,10 +38,43 @@ SYM_FUNC_START(__host_exit) /* Save the host context pointer in x29 across the function call */ mov x29, x0 + +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL +alternative_if_not ARM64_HAS_ADDRESS_AUTH +b __skip_pauth_save +alternative_else_nop_endif + +alternative_if ARM64_KVM_PROTECTED_MODE + /* Save kernel ptrauth keys. */ + add x18, x29, #CPU_APIAKEYLO_EL1 + ptrauth_save_state x18, x19, x20 + + /* Use hyp keys. */ + adr_this_cpu x18, kvm_hyp_ctxt, x19 + add x18, x18, #CPU_APIAKEYLO_EL1 + ptrauth_restore_state x18, x19, x20 + isb +alternative_else_nop_endif +__skip_pauth_save: +#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */ + bl handle_trap - /* Restore host regs x0-x17 */ __host_enter_restore_full: + /* Restore kernel keys. */ +#ifdef CONFIG_ARM64_PTR_AUTH_KERNEL +alternative_if_not ARM64_HAS_ADDRESS_AUTH +b __skip_pauth_restore +alternative_else_nop_endif + +alternative_if ARM64_KVM_PROTECTED_MODE + add x18, x29, #CPU_APIAKEYLO_EL1 + ptrauth_restore_state x18, x19, x20 +alternative_else_nop_endif +__skip_pauth_restore: +#endif /* CONFIG_ARM64_PTR_AUTH_KERNEL */ + + /* Restore host regs x0-x17 */ ldp x0, x1, [x29, #CPU_XREG_OFFSET(0)] ldp x2, x3, [x29, #CPU_XREG_OFFSET(2)] ldp x4, x5, [x29, #CPU_XREG_OFFSET(4)] -- cgit v1.2.3 From 1700f89cb99aae19e4f8ec38b1b59f3b7ae71b91 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 14 Jun 2023 16:51:29 +0100 Subject: KVM: arm64: Fix hVHE init on CPUs where HCR_EL2.E2H is not RES1 On CPUs where E2H is RES1, we very quickly set the scene for running EL2 with a VHE configuration, as we do not have any other choice. However, CPUs that conform to the current writing of the architecture start with E2H=0, and only later upgrade with E2H=1. This is all good, but nothing there is actually reconfiguring EL2 to be able to correctly run the kernel at EL1. Huhuh... The "obvious" solution is not to just reinitialise the timer controls like we do, but to really intitialise *everything* unconditionally. This requires a bit of surgery, and is a good opportunity to remove the macro that messes with SPSR_EL2 in init_el2_state. With that, hVHE now works correctly on my trusted A55 machine! Reported-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230614155129.2697388-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/el2_setup.h | 1 - arch/arm64/kernel/head.S | 2 ++ arch/arm64/kvm/hyp/nvhe/hyp-init.S | 19 ++++++++++++------- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/el2_setup.h b/arch/arm64/include/asm/el2_setup.h index bba508ffa12d..5a353f94e9cd 100644 --- a/arch/arm64/include/asm/el2_setup.h +++ b/arch/arm64/include/asm/el2_setup.h @@ -205,7 +205,6 @@ __init_el2_nvhe_idregs __init_el2_cptr __init_el2_fgt - __init_el2_nvhe_prepare_eret .endm #ifndef __KVM_NVHE_HYPERVISOR__ diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index e92caebff46a..23955050da73 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -603,6 +603,8 @@ SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) msr sctlr_el1, x1 mov x2, xzr 2: + __init_el2_nvhe_prepare_eret + mov w0, #BOOT_CPU_MODE_EL2 orr x0, x0, x2 eret diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-init.S b/arch/arm64/kvm/hyp/nvhe/hyp-init.S index f9ee10e29497..74ee77d9cfd0 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-init.S +++ b/arch/arm64/kvm/hyp/nvhe/hyp-init.S @@ -83,9 +83,6 @@ SYM_CODE_END(__kvm_hyp_init) * x0: struct kvm_nvhe_init_params PA */ SYM_CODE_START_LOCAL(___kvm_hyp_init) - ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] - msr tpidr_el2, x1 - ldr x1, [x0, #NVHE_INIT_STACK_HYP_VA] mov sp, x1 @@ -99,11 +96,18 @@ SYM_CODE_START_LOCAL(___kvm_hyp_init) and x2, x1, x2 cbz x2, 1f - mrs x1, cnthctl_el2 - and x1, x1, #~(BIT(0) | BIT(1)) - orr x1, x1, #(BIT(10) | BIT(11)) - msr cnthctl_el2, x1 + // hVHE: Replay the EL2 setup to account for the E2H bit + // TPIDR_EL2 is used to preserve x0 across the macro maze... + isb + msr tpidr_el2, x0 + init_el2_state + finalise_el2_state + mrs x0, tpidr_el2 + 1: + ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] + msr tpidr_el2, x1 + ldr x1, [x0, #NVHE_INIT_VTTBR] msr vttbr_el2, x1 @@ -193,6 +197,7 @@ SYM_CODE_START_LOCAL(__kvm_hyp_init_cpu) /* Initialize EL2 CPU state to sane values. */ init_el2_state // Clobbers x0..x2 finalise_el2_state + __init_el2_nvhe_prepare_eret /* Enable MMU, set vectors and stack. */ mov x0, x28 -- cgit v1.2.3 From 473341469042b39535ee800a19c45110ebc46346 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Fri, 9 Jun 2023 19:00:49 +0000 Subject: KVM: arm64: Save ID registers' sanitized value per guest Initialize the default ID register values upon the first call to KVM_ARM_VCPU_INIT. The vCPU feature flags are finalized at that point, so it is possible to determine the maximum feature set supported by a particular VM configuration. Do nothing with these values for now, as we need to rework the plumbing of what's already writable to be compatible with the generic infrastructure. Co-developed-by: Reiji Watanabe Signed-off-by: Reiji Watanabe Signed-off-by: Jing Zhang Link: https://lore.kernel.org/r/20230609190054.1542113-7-oliver.upton@linux.dev [Oliver: Hoist everything into KVM_ARM_VCPU_INIT time, so the features are final] Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 15 +++++++++++ arch/arm64/kvm/sys_regs.c | 56 ++++++++++++++++++++++++++++++++++++--- arch/arm64/kvm/sys_regs.h | 7 +++++ 3 files changed, 75 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 44da989435fc..39270bc29a3f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -225,6 +225,8 @@ struct kvm_arch { #define KVM_ARCH_FLAG_TIMER_PPIS_IMMUTABLE 6 /* SMCCC filter initialized for the VM */ #define KVM_ARCH_FLAG_SMCCC_FILTER_CONFIGURED 7 + /* Initial ID reg values loaded */ +#define KVM_ARCH_FLAG_ID_REGS_INITIALIZED 8 unsigned long flags; /* VM-wide vCPU feature set */ @@ -247,6 +249,19 @@ struct kvm_arch { struct kvm_smccc_features smccc_feat; struct maple_tree smccc_filter; + /* + * Emulated CPU ID registers per VM + * (Op0, Op1, CRn, CRm, Op2) of the ID registers to be saved in it + * is (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8. + * + * These emulated idregs are VM-wide, but accessed from the context of a vCPU. + * Atomic access to multiple idregs are guarded by kvm_arch.config_lock. + */ +#define IDREG_IDX(id) (((sys_reg_CRm(id) - 1) << 3) | sys_reg_Op2(id)) +#define IDREG(kvm, id) ((kvm)->arch.id_regs[IDREG_IDX(id)]) +#define KVM_ARM_ID_REG_NUM (IDREG_IDX(sys_reg(3, 0, 0, 7, 7)) + 1) + u64 id_regs[KVM_ARM_ID_REG_NUM]; + /* * For an untrusted host VM, 'pkvm.handle' is used to lookup * the associated pKVM instance in the hypervisor. diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 9d37f16cfb7b..3015c860deca 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1311,6 +1311,17 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r return __kvm_read_sanitised_id_reg(vcpu, r); } +/* + * Return true if the register's (Op0, Op1, CRn, CRm, Op2) is + * (3, 0, 0, crm, op2), where 1<=crm<8, 0<=op2<8. + */ +static inline bool is_id_reg(u32 id) +{ + return (sys_reg_Op0(id) == 3 && sys_reg_Op1(id) == 0 && + sys_reg_CRn(id) == 0 && sys_reg_CRm(id) >= 1 && + sys_reg_CRm(id) < 8); +} + static unsigned int id_visibility(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { @@ -2303,6 +2314,8 @@ static const struct sys_reg_desc sys_reg_descs[] = { EL2_REG(SP_EL2, NULL, reset_unknown, 0), }; +static const struct sys_reg_desc *first_idreg; + static bool trap_dbgdidr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) @@ -2993,6 +3006,28 @@ static bool emulate_sys_reg(struct kvm_vcpu *vcpu, return false; } +static void kvm_reset_id_regs(struct kvm_vcpu *vcpu) +{ + const struct sys_reg_desc *idreg = first_idreg; + u32 id = reg_to_encoding(idreg); + struct kvm *kvm = vcpu->kvm; + + if (test_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags)) + return; + + lockdep_assert_held(&kvm->arch.config_lock); + + /* Initialize all idregs */ + while (is_id_reg(id)) { + IDREG(kvm, id) = idreg->reset(vcpu, idreg); + + idreg++; + id = reg_to_encoding(idreg); + } + + set_bit(KVM_ARCH_FLAG_ID_REGS_INITIALIZED, &kvm->arch.flags); +} + /** * kvm_reset_sys_regs - sets system registers to reset value * @vcpu: The VCPU pointer @@ -3004,9 +3039,17 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu) { unsigned long i; - for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) - if (sys_reg_descs[i].reset) - sys_reg_descs[i].reset(vcpu, &sys_reg_descs[i]); + kvm_reset_id_regs(vcpu); + + for (i = 0; i < ARRAY_SIZE(sys_reg_descs); i++) { + const struct sys_reg_desc *r = &sys_reg_descs[i]; + + if (is_id_reg(reg_to_encoding(r))) + continue; + + if (r->reset) + r->reset(vcpu, r); + } } /** @@ -3413,6 +3456,7 @@ int kvm_arm_copy_sys_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices) int __init kvm_sys_reg_table_init(void) { + struct sys_reg_params params; bool valid = true; unsigned int i; @@ -3431,5 +3475,11 @@ int __init kvm_sys_reg_table_init(void) for (i = 0; i < ARRAY_SIZE(invariant_sys_regs); i++) invariant_sys_regs[i].reset(NULL, &invariant_sys_regs[i]); + /* Find the first idreg (SYS_ID_PFR0_EL1) in sys_reg_descs. */ + params = encoding_to_params(SYS_ID_PFR0_EL1); + first_idreg = find_reg(¶ms, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); + if (!first_idreg) + return -EINVAL; + return 0; } diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 63bca7521dfd..c65c129b3500 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -27,6 +27,13 @@ struct sys_reg_params { bool is_write; }; +#define encoding_to_params(reg) \ + ((struct sys_reg_params){ .Op0 = sys_reg_Op0(reg), \ + .Op1 = sys_reg_Op1(reg), \ + .CRn = sys_reg_CRn(reg), \ + .CRm = sys_reg_CRm(reg), \ + .Op2 = sys_reg_Op2(reg) }) + #define esr_sys64_to_params(esr) \ ((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3, \ .Op1 = ((esr) >> 14) & 0x7, \ -- cgit v1.2.3 From 2e8bf0cbd0589bae3a0466a3ed45f9cf9f3164eb Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Fri, 9 Jun 2023 19:00:50 +0000 Subject: KVM: arm64: Use arm64_ftr_bits to sanitise ID register writes Rather than reinventing the wheel in KVM to do ID register sanitisation we can rely on the work already done in the core kernel. Implement a generalized sanitisation of ID registers based on the combination of the arm64_ftr_bits definitions from the core kernel and (optionally) a set of KVM-specific overrides. This all amounts to absolutely nothing for now, but will be used in subsequent changes to realize user-configurable ID registers. Signed-off-by: Jing Zhang Link: https://lore.kernel.org/r/20230609190054.1542113-8-oliver.upton@linux.dev [Oliver: split off from monster patch, rewrote commit description, reworked RAZ handling, return EINVAL to userspace] Signed-off-by: Oliver Upton --- arch/arm64/include/asm/cpufeature.h | 1 + arch/arm64/kernel/cpufeature.c | 2 +- arch/arm64/kvm/sys_regs.c | 123 ++++++++++++++++++++++++++++++++++-- 3 files changed, 121 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 6bf013fb110d..dc769c2eb7a4 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -915,6 +915,7 @@ static inline unsigned int get_vmid_bits(u64 mmfr1) return 8; } +s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur); struct arm64_ftr_reg *get_arm64_ftr_reg(u32 sys_id); extern struct arm64_ftr_override id_aa64mmfr1_override; diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 7d7128c65161..3317a7b6deac 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -798,7 +798,7 @@ static u64 arm64_ftr_set_value(const struct arm64_ftr_bits *ftrp, s64 reg, return reg; } -static s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, +s64 arm64_ftr_safe_value(const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { s64 ret = 0; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 3015c860deca..9a46fb93603d 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1201,6 +1201,91 @@ static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu) return 0; } +static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, + s64 new, s64 cur) +{ + struct arm64_ftr_bits kvm_ftr = *ftrp; + + /* Some features have different safe value type in KVM than host features */ + switch (id) { + case SYS_ID_AA64DFR0_EL1: + if (kvm_ftr.shift == ID_AA64DFR0_EL1_PMUVer_SHIFT) + kvm_ftr.type = FTR_LOWER_SAFE; + break; + case SYS_ID_DFR0_EL1: + if (kvm_ftr.shift == ID_DFR0_EL1_PerfMon_SHIFT) + kvm_ftr.type = FTR_LOWER_SAFE; + break; + } + + return arm64_ftr_safe_value(&kvm_ftr, new, cur); +} + +/** + * arm64_check_features() - Check if a feature register value constitutes + * a subset of features indicated by the idreg's KVM sanitised limit. + * + * This function will check if each feature field of @val is the "safe" value + * against idreg's KVM sanitised limit return from reset() callback. + * If a field value in @val is the same as the one in limit, it is always + * considered the safe value regardless For register fields that are not in + * writable, only the value in limit is considered the safe value. + * + * Return: 0 if all the fields are safe. Otherwise, return negative errno. + */ +static int arm64_check_features(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd, + u64 val) +{ + const struct arm64_ftr_reg *ftr_reg; + const struct arm64_ftr_bits *ftrp = NULL; + u32 id = reg_to_encoding(rd); + u64 writable_mask = rd->val; + u64 limit = rd->reset(vcpu, rd); + u64 mask = 0; + + /* + * Hidden and unallocated ID registers may not have a corresponding + * struct arm64_ftr_reg. Of course, if the register is RAZ we know the + * only safe value is 0. + */ + if (sysreg_visible_as_raz(vcpu, rd)) + return val ? -E2BIG : 0; + + ftr_reg = get_arm64_ftr_reg(id); + if (!ftr_reg) + return -EINVAL; + + ftrp = ftr_reg->ftr_bits; + + for (; ftrp && ftrp->width; ftrp++) { + s64 f_val, f_lim, safe_val; + u64 ftr_mask; + + ftr_mask = arm64_ftr_mask(ftrp); + if ((ftr_mask & writable_mask) != ftr_mask) + continue; + + f_val = arm64_ftr_value(ftrp, val); + f_lim = arm64_ftr_value(ftrp, limit); + mask |= ftr_mask; + + if (f_val == f_lim) + safe_val = f_val; + else + safe_val = kvm_arm64_ftr_safe_value(id, ftrp, f_val, f_lim); + + if (safe_val != f_val) + return -E2BIG; + } + + /* For fields that are not writable, values in limit are the safe values. */ + if ((val & ~mask) != (limit & ~mask)) + return -E2BIG; + + return 0; +} + static u8 perfmon_to_pmuver(u8 perfmon) { switch (perfmon) { @@ -1528,11 +1613,41 @@ static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { - /* This is what we mean by invariant: you can't change it. */ - if (val != read_id_reg(vcpu, rd)) - return -EINVAL; + u32 id = reg_to_encoding(rd); + int ret; - return 0; + mutex_lock(&vcpu->kvm->arch.config_lock); + + /* + * Once the VM has started the ID registers are immutable. Reject any + * write that does not match the final register value. + */ + if (kvm_vm_has_ran_once(vcpu->kvm)) { + if (val != read_id_reg(vcpu, rd)) + ret = -EBUSY; + else + ret = 0; + + mutex_unlock(&vcpu->kvm->arch.config_lock); + return ret; + } + + ret = arm64_check_features(vcpu, rd, val); + if (!ret) + IDREG(vcpu->kvm, id) = val; + + mutex_unlock(&vcpu->kvm->arch.config_lock); + + /* + * arm64_check_features() returns -E2BIG to indicate the register's + * feature set is a superset of the maximally-allowed register value. + * While it would be nice to precisely describe this to userspace, the + * existing UAPI for KVM_SET_ONE_REG has it that invalid register + * writes return -EINVAL. + */ + if (ret == -E2BIG) + ret = -EINVAL; + return ret; } static int get_raz_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, -- cgit v1.2.3 From c118cead07a762592c9e67252064616efa4574fa Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Fri, 9 Jun 2023 19:00:51 +0000 Subject: KVM: arm64: Use generic sanitisation for ID_(AA64)DFR0_EL1 KVM allows userspace to specify a PMU version for the guest by writing to the corresponding ID registers. Currently the validation of these writes is done manuallly, but there's no reason we can't switch over to the generic sanitisation infrastructure. Start screening user writes through arm64_check_features() to prevent userspace from over-promising in terms of vPMU support. Leave the old masking in place for now, as we aren't completely ready to serve reads from the VM-wide values. Signed-off-by: Jing Zhang Link: https://lore.kernel.org/r/20230609190054.1542113-9-oliver.upton@linux.dev [Oliver: split off from monster patch, cleaned up handling of NI vPMU values, wrote commit description] Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 106 ++++++++++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 45 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 9a46fb93603d..8054d7714c18 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -42,6 +42,8 @@ */ static u64 sys_reg_to_index(const struct sys_reg_desc *reg); +static int set_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + u64 val); static bool read_from_write_only(struct kvm_vcpu *vcpu, struct sys_reg_params *params, @@ -1503,15 +1505,35 @@ static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, return 0; } +static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u64 val = read_sanitised_ftr_reg(SYS_ID_AA64DFR0_EL1); + + /* Limit debug to ARMv8.0 */ + val &= ~ID_AA64DFR0_EL1_DebugVer_MASK; + val |= SYS_FIELD_PREP_ENUM(ID_AA64DFR0_EL1, DebugVer, IMP); + + /* + * Only initialize the PMU version if the vCPU was configured with one. + */ + val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; + if (kvm_vcpu_has_pmu(vcpu)) + val |= SYS_FIELD_PREP(ID_AA64DFR0_EL1, PMUVer, + kvm_arm_pmu_get_pmuver_limit()); + + /* Hide SPE from guests */ + val &= ~ID_AA64DFR0_EL1_PMSVer_MASK; + + return val; +} + static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { - u8 pmuver, host_pmuver; - bool valid_pmu; - - host_pmuver = kvm_arm_pmu_get_pmuver_limit(); - pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); + u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); + int r; /* * Prior to commit 3d0dba5764b9 ("KVM: arm64: PMU: Move the @@ -1532,38 +1554,33 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, pmuver = 0; } - /* - * Allow AA64DFR0_EL1.PMUver to be set from userspace as long - * as it doesn't promise more than what the HW gives us. - */ - if (pmuver > host_pmuver) - return -EINVAL; + r = set_id_reg(vcpu, rd, val); + if (r) + return r; - valid_pmu = pmuver; + vcpu->kvm->arch.dfr0_pmuver = pmuver; + return 0; +} - /* Make sure view register and PMU support do match */ - if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) - return -EINVAL; +static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u8 perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); + u64 val = read_sanitised_ftr_reg(SYS_ID_DFR0_EL1); - /* We can only differ with PMUver, and anything else is an error */ - val ^= read_id_reg(vcpu, rd); - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); - if (val) - return -EINVAL; + val &= ~ID_DFR0_EL1_PerfMon_MASK; + if (kvm_vcpu_has_pmu(vcpu)) + val |= SYS_FIELD_PREP(ID_DFR0_EL1, PerfMon, perfmon); - vcpu->kvm->arch.dfr0_pmuver = pmuver; - return 0; + return val; } static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { - u8 perfmon, host_perfmon; - bool valid_pmu; - - host_perfmon = pmuver_to_perfmon(kvm_arm_pmu_get_pmuver_limit()); - perfmon = SYS_FIELD_GET(ID_DFR0_EL1, PerfMon, val); + u8 perfmon = SYS_FIELD_GET(ID_DFR0_EL1, PerfMon, val); + int r; if (perfmon == ID_DFR0_EL1_PerfMon_IMPDEF) { val &= ~ID_DFR0_EL1_PerfMon_MASK; @@ -1576,21 +1593,12 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, * AArch64 side (as everything is emulated with that), and * that this is a PMUv3. */ - if (perfmon > host_perfmon || - (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3)) - return -EINVAL; - - valid_pmu = perfmon; - - /* Make sure view register and PMU support do match */ - if (kvm_vcpu_has_pmu(vcpu) != valid_pmu) + if (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3) return -EINVAL; - /* We can only differ with PerfMon, and anything else is an error */ - val ^= read_id_reg(vcpu, rd); - val &= ~ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon); - if (val) - return -EINVAL; + r = set_id_reg(vcpu, rd, val); + if (r) + return r; vcpu->kvm->arch.dfr0_pmuver = perfmon_to_pmuver(perfmon); return 0; @@ -1998,9 +2006,13 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* CRm=1 */ AA32_ID_SANITISED(ID_PFR0_EL1), AA32_ID_SANITISED(ID_PFR1_EL1), - { SYS_DESC(SYS_ID_DFR0_EL1), .access = access_id_reg, - .get_user = get_id_reg, .set_user = set_id_dfr0_el1, - .visibility = aa32_id_visibility, }, + { SYS_DESC(SYS_ID_DFR0_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_dfr0_el1, + .visibility = aa32_id_visibility, + .reset = read_sanitised_id_dfr0_el1, + .val = ID_DFR0_EL1_PerfMon_MASK, }, ID_HIDDEN(ID_AFR0_EL1), AA32_ID_SANITISED(ID_MMFR0_EL1), AA32_ID_SANITISED(ID_MMFR1_EL1), @@ -2040,8 +2052,12 @@ static const struct sys_reg_desc sys_reg_descs[] = { ID_UNALLOCATED(4,7), /* CRm=5 */ - { SYS_DESC(SYS_ID_AA64DFR0_EL1), .access = access_id_reg, - .get_user = get_id_reg, .set_user = set_id_aa64dfr0_el1, }, + { SYS_DESC(SYS_ID_AA64DFR0_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_aa64dfr0_el1, + .reset = read_sanitised_id_aa64dfr0_el1, + .val = ID_AA64DFR0_EL1_PMUVer_MASK, }, ID_SANITISED(ID_AA64DFR1_EL1), ID_UNALLOCATED(5,2), ID_UNALLOCATED(5,3), -- cgit v1.2.3 From c39f5974d38f7250782c2fe52b2793cd00efaac0 Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Fri, 9 Jun 2023 19:00:52 +0000 Subject: KVM: arm64: Use generic sanitisation for ID_AA64PFR0_EL1 KVM allows userspace to write to the CSV2 and CSV3 fields of ID_AA64PFR0_EL1 so long as it doesn't over-promise on the Spectre/Meltdown mitigation state. Switch over to the new way of the world for screening user writes. Leave the old plumbing in place until we actually start handling ID register reads from the VM-wide values. Signed-off-by: Jing Zhang Link: https://lore.kernel.org/r/20230609190054.1542113-10-oliver.upton@linux.dev [Oliver: split from monster patch, added commit description] Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 66 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 8054d7714c18..285439ec9d18 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1470,34 +1470,54 @@ static unsigned int sve_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } +static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, + const struct sys_reg_desc *rd) +{ + u64 val = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + if (!vcpu_has_sve(vcpu)) + val &= ~ID_AA64PFR0_EL1_SVE_MASK; + + /* + * The default is to expose CSV2 == 1 if the HW isn't affected. + * Although this is a per-CPU feature, we make it global because + * asymmetric systems are just a nuisance. + * + * Userspace can override this as long as it doesn't promise + * the impossible. + */ + if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) { + val &= ~ID_AA64PFR0_EL1_CSV2_MASK; + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV2, IMP); + } + if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) { + val &= ~ID_AA64PFR0_EL1_CSV3_MASK; + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, CSV3, IMP); + } + + if (kvm_vgic_global_state.type == VGIC_V3) { + val &= ~ID_AA64PFR0_EL1_GIC_MASK; + val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP); + } + + val &= ~ID_AA64PFR0_EL1_AMU_MASK; + + return val; +} + static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 val) { u8 csv2, csv3; + int r; - /* - * Allow AA64PFR0_EL1.CSV2 to be set from userspace as long as - * it doesn't promise more than what is actually provided (the - * guest could otherwise be covered in ectoplasmic residue). - */ csv2 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV2_SHIFT); - if (csv2 > 1 || - (csv2 && arm64_get_spectre_v2_state() != SPECTRE_UNAFFECTED)) - return -EINVAL; - - /* Same thing for CSV3 */ csv3 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV3_SHIFT); - if (csv3 > 1 || - (csv3 && arm64_get_meltdown_state() != SPECTRE_UNAFFECTED)) - return -EINVAL; - /* We can only differ with CSV[23], and anything else is an error */ - val ^= read_id_reg(vcpu, rd); - val &= ~(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2) | - ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3)); - if (val) - return -EINVAL; + r = set_id_reg(vcpu, rd, val); + if (r) + return r; vcpu->kvm->arch.pfr0_csv2 = csv2; vcpu->kvm->arch.pfr0_csv3 = csv3; @@ -2041,8 +2061,12 @@ static const struct sys_reg_desc sys_reg_descs[] = { /* AArch64 ID registers */ /* CRm=4 */ - { SYS_DESC(SYS_ID_AA64PFR0_EL1), .access = access_id_reg, - .get_user = get_id_reg, .set_user = set_id_aa64pfr0_el1, }, + { SYS_DESC(SYS_ID_AA64PFR0_EL1), + .access = access_id_reg, + .get_user = get_id_reg, + .set_user = set_id_aa64pfr0_el1, + .reset = read_sanitised_id_aa64pfr0_el1, + .val = ID_AA64PFR0_EL1_CSV2_MASK | ID_AA64PFR0_EL1_CSV3_MASK, }, ID_SANITISED(ID_AA64PFR1_EL1), ID_UNALLOCATED(4,2), ID_UNALLOCATED(4,3), -- cgit v1.2.3 From 6db7af0d5b2b6073caf6a7f3364d8dd2005584d4 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 9 Jun 2023 19:00:53 +0000 Subject: KVM: arm64: Handle ID register reads using the VM-wide values Everything is in place now to use the generic ID register infrastructure. Use the VM-wide values to service ID register reads. The ID registers are invariant after the VM has started, so there is no need for locking in that case. This is rather desirable for VM live migration, as the needless lock contention could prolong the VM blackout period. Link: https://lore.kernel.org/r/20230609190054.1542113-11-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/sys_regs.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 285439ec9d18..7e8772508315 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1395,7 +1395,7 @@ static u64 kvm_read_sanitised_id_reg(struct kvm_vcpu *vcpu, static u64 read_id_reg(const struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { - return __kvm_read_sanitised_id_reg(vcpu, r); + return IDREG(vcpu->kvm, reg_to_encoding(r)); } /* @@ -1634,7 +1634,19 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, static int get_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, u64 *val) { + /* + * Avoid locking if the VM has already started, as the ID registers are + * guaranteed to be invariant at that point. + */ + if (kvm_vm_has_ran_once(vcpu->kvm)) { + *val = read_id_reg(vcpu, rd); + return 0; + } + + mutex_lock(&vcpu->kvm->arch.config_lock); *val = read_id_reg(vcpu, rd); + mutex_unlock(&vcpu->kvm->arch.config_lock); + return 0; } -- cgit v1.2.3 From 686672407e6eaf8c874d4a6bf315da798f281045 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 9 Jun 2023 19:00:54 +0000 Subject: KVM: arm64: Rip out the vestiges of the 'old' ID register scheme There's no longer a need for the baggage of the old scheme for handling configurable ID register fields. Rip it all out in favor of the generalized infrastructure. Link: https://lore.kernel.org/r/20230609190054.1542113-12-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_host.h | 4 -- arch/arm64/kvm/arm.c | 23 ---------- arch/arm64/kvm/sys_regs.c | 92 ++------------------------------------- include/kvm/arm_pmu.h | 8 +++- 4 files changed, 10 insertions(+), 117 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 39270bc29a3f..e2e9552f2808 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -241,10 +241,6 @@ struct kvm_arch { cpumask_var_t supported_cpus; - u8 pfr0_csv2; - u8 pfr0_csv3; - u8 dfr0_pmuver; - /* Hypercall features firmware registers' descriptor */ struct kvm_smccc_features smccc_feat; struct maple_tree smccc_filter; diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 695239e01618..80ca7ec269dc 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -102,22 +102,6 @@ static int kvm_arm_default_max_vcpus(void) return vgic_present ? kvm_vgic_get_max_vcpus() : KVM_MAX_VCPUS; } -static void set_default_spectre(struct kvm *kvm) -{ - /* - * The default is to expose CSV2 == 1 if the HW isn't affected. - * Although this is a per-CPU feature, we make it global because - * asymmetric systems are just a nuisance. - * - * Userspace can override this as long as it doesn't promise - * the impossible. - */ - if (arm64_get_spectre_v2_state() == SPECTRE_UNAFFECTED) - kvm->arch.pfr0_csv2 = 1; - if (arm64_get_meltdown_state() == SPECTRE_UNAFFECTED) - kvm->arch.pfr0_csv3 = 1; -} - /** * kvm_arch_init_vm - initializes a VM data structure * @kvm: pointer to the KVM struct @@ -161,15 +145,8 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) /* The maximum number of VCPUs is limited by the host's GIC model */ kvm->max_vcpus = kvm_arm_default_max_vcpus(); - set_default_spectre(kvm); kvm_arm_init_hypercalls(kvm); - /* - * Initialise the default PMUver before there is a chance to - * create an actual PMU. - */ - kvm->arch.dfr0_pmuver = kvm_arm_pmu_get_pmuver_limit(); - bitmap_zero(kvm->arch.vcpu_features, KVM_VCPU_MAX_FEATURES); return 0; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 7e8772508315..5aa0c977aaa3 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -1195,14 +1195,6 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } -static u8 vcpu_pmuver(const struct kvm_vcpu *vcpu) -{ - if (kvm_vcpu_has_pmu(vcpu)) - return vcpu->kvm->arch.dfr0_pmuver; - - return 0; -} - static s64 kvm_arm64_ftr_safe_value(u32 id, const struct arm64_ftr_bits *ftrp, s64 new, s64 cur) { @@ -1288,19 +1280,6 @@ static int arm64_check_features(struct kvm_vcpu *vcpu, return 0; } -static u8 perfmon_to_pmuver(u8 perfmon) -{ - switch (perfmon) { - case ID_DFR0_EL1_PerfMon_PMUv3: - return ID_AA64DFR0_EL1_PMUVer_IMP; - case ID_DFR0_EL1_PerfMon_IMPDEF: - return ID_AA64DFR0_EL1_PMUVer_IMP_DEF; - default: - /* Anything ARMv8.1+ and NI have the same value. For now. */ - return perfmon; - } -} - static u8 pmuver_to_perfmon(u8 pmuver) { switch (pmuver) { @@ -1327,19 +1306,6 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, val = read_sanitised_ftr_reg(id); switch (id) { - case SYS_ID_AA64PFR0_EL1: - if (!vcpu_has_sve(vcpu)) - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_AMU); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); - if (kvm_vgic_global_state.type == VGIC_V3) { - val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_GIC), 1); - } - break; case SYS_ID_AA64PFR1_EL1: if (!kvm_has_mte(vcpu->kvm)) val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_EL1_MTE); @@ -1360,22 +1326,6 @@ static u64 __kvm_read_sanitised_id_reg(const struct kvm_vcpu *vcpu, if (!cpus_have_final_cap(ARM64_HAS_WFXT)) val &= ~ARM64_FEATURE_MASK(ID_AA64ISAR2_EL1_WFxT); break; - case SYS_ID_AA64DFR0_EL1: - /* Limit debug to ARMv8.0 */ - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_DebugVer), 6); - /* Set PMUver to the required version */ - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), - vcpu_pmuver(vcpu)); - /* Hide SPE from guests */ - val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMSVer); - break; - case SYS_ID_DFR0_EL1: - val &= ~ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon); - val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_DFR0_EL1_PerfMon), - pmuver_to_perfmon(vcpu_pmuver(vcpu))); - break; case SYS_ID_AA64MMFR2_EL1: val &= ~ID_AA64MMFR2_EL1_CCIDX_MASK; break; @@ -1505,26 +1455,6 @@ static u64 read_sanitised_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, return val; } -static int set_id_aa64pfr0_el1(struct kvm_vcpu *vcpu, - const struct sys_reg_desc *rd, - u64 val) -{ - u8 csv2, csv3; - int r; - - csv2 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV2_SHIFT); - csv3 = cpuid_feature_extract_unsigned_field(val, ID_AA64PFR0_EL1_CSV3_SHIFT); - - r = set_id_reg(vcpu, rd, val); - if (r) - return r; - - vcpu->kvm->arch.pfr0_csv2 = csv2; - vcpu->kvm->arch.pfr0_csv3 = csv3; - - return 0; -} - static u64 read_sanitised_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd) { @@ -1553,7 +1483,6 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, u64 val) { u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); - int r; /* * Prior to commit 3d0dba5764b9 ("KVM: arm64: PMU: Move the @@ -1569,17 +1498,10 @@ static int set_id_aa64dfr0_el1(struct kvm_vcpu *vcpu, * surprising than an ill-guided PMU driver poking at impdef system * registers that end in an UNDEF... */ - if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) { + if (pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF) val &= ~ID_AA64DFR0_EL1_PMUVer_MASK; - pmuver = 0; - } - r = set_id_reg(vcpu, rd, val); - if (r) - return r; - - vcpu->kvm->arch.dfr0_pmuver = pmuver; - return 0; + return set_id_reg(vcpu, rd, val); } static u64 read_sanitised_id_dfr0_el1(struct kvm_vcpu *vcpu, @@ -1600,7 +1522,6 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, u64 val) { u8 perfmon = SYS_FIELD_GET(ID_DFR0_EL1, PerfMon, val); - int r; if (perfmon == ID_DFR0_EL1_PerfMon_IMPDEF) { val &= ~ID_DFR0_EL1_PerfMon_MASK; @@ -1616,12 +1537,7 @@ static int set_id_dfr0_el1(struct kvm_vcpu *vcpu, if (perfmon != 0 && perfmon < ID_DFR0_EL1_PerfMon_PMUv3) return -EINVAL; - r = set_id_reg(vcpu, rd, val); - if (r) - return r; - - vcpu->kvm->arch.dfr0_pmuver = perfmon_to_pmuver(perfmon); - return 0; + return set_id_reg(vcpu, rd, val); } /* @@ -2076,7 +1992,7 @@ static const struct sys_reg_desc sys_reg_descs[] = { { SYS_DESC(SYS_ID_AA64PFR0_EL1), .access = access_id_reg, .get_user = get_id_reg, - .set_user = set_id_aa64pfr0_el1, + .set_user = set_id_reg, .reset = read_sanitised_id_aa64pfr0_el1, .val = ID_AA64PFR0_EL1_CSV2_MASK | ID_AA64PFR0_EL1_CSV3_MASK, }, ID_SANITISED(ID_AA64PFR1_EL1), diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 0be60d5eebd7..847da6fc2713 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -92,8 +92,12 @@ void kvm_vcpu_pmu_restore_host(struct kvm_vcpu *vcpu); /* * Evaluates as true when emulating PMUv3p5, and false otherwise. */ -#define kvm_pmu_is_3p5(vcpu) \ - (vcpu->kvm->arch.dfr0_pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5) +#define kvm_pmu_is_3p5(vcpu) ({ \ + u64 val = IDREG(vcpu->kvm, SYS_ID_AA64DFR0_EL1); \ + u8 pmuver = SYS_FIELD_GET(ID_AA64DFR0_EL1, PMUVer, val); \ + \ + pmuver >= ID_AA64DFR0_EL1_PMUVer_V3P5; \ +}) u8 kvm_arm_pmu_get_pmuver_limit(void); -- cgit v1.2.3 From 6df696cd9bc1ceed0e92e36908f88bbd16d18255 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 9 Jun 2023 22:01:02 +0000 Subject: arm64: errata: Mitigate Ampere1 erratum AC03_CPU_38 at stage-2 AmpereOne has an erratum in its implementation of FEAT_HAFDBS that required disabling the feature on the design. This was done by reporting the feature as not implemented in the ID register, although the corresponding control bits were not actually RES0. This does not align well with the requirements of the architecture, which mandates these bits be RES0 if HAFDBS isn't implemented. The kernel's use of stage-1 is unaffected, as the HA and HD bits are only set if HAFDBS is detected in the ID register. KVM, on the other hand, relies on the RES0 behavior at stage-2 to use the same value for VTCR_EL2 on any cpu in the system. Mitigate the non-RES0 behavior by leaving VTCR_EL2.HA clear on affected systems. Cc: stable@vger.kernel.org Cc: D Scott Phillips Cc: Darren Hart Acked-by: D Scott Phillips Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20230609220104.1836988-2-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- Documentation/arm64/silicon-errata.rst | 3 +++ arch/arm64/Kconfig | 19 +++++++++++++++++++ arch/arm64/kernel/cpu_errata.c | 7 +++++++ arch/arm64/kvm/hyp/pgtable.c | 14 +++++++++++--- arch/arm64/tools/cpucaps | 1 + 5 files changed, 41 insertions(+), 3 deletions(-) diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst index 9e311bc43e05..cd46e2b20a81 100644 --- a/Documentation/arm64/silicon-errata.rst +++ b/Documentation/arm64/silicon-errata.rst @@ -52,6 +52,9 @@ stable kernels. | Allwinner | A64/R18 | UNKNOWN1 | SUN50I_ERRATUM_UNKNOWN1 | +----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ +| Ampere | AmpereOne | AC03_CPU_38 | AMPERE_ERRATUM_AC03_CPU_38 | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A510 | #2457168 | ARM64_ERRATUM_2457168 | +----------------+-----------------+-----------------+-----------------------------+ | ARM | Cortex-A510 | #2064142 | ARM64_ERRATUM_2064142 | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b1201d25a8a4..0987c637fbf2 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -406,6 +406,25 @@ menu "Kernel Features" menu "ARM errata workarounds via the alternatives framework" +config AMPERE_ERRATUM_AC03_CPU_38 + bool "AmpereOne: AC03_CPU_38: Certain bits in the Virtualization Translation Control Register and Translation Control Registers do not follow RES0 semantics" + default y + help + This option adds an alternative code sequence to work around Ampere + erratum AC03_CPU_38 on AmpereOne. + + The affected design reports FEAT_HAFDBS as not implemented in + ID_AA64MMFR1_EL1.HAFDBS, but (V)TCR_ELx.{HA,HD} are not RES0 + as required by the architecture. The unadvertised HAFDBS + implementation suffers from an additional erratum where hardware + A/D updates can occur after a PTE has been marked invalid. + + The workaround forces KVM to explicitly set VTCR_EL2.HA to 0, + which avoids enabling unadvertised hardware Access Flag management + at stage-2. + + If unsure, say Y. + config ARM64_WORKAROUND_CLEAN_CACHE bool diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 307faa2b4395..be66e94a21bd 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -729,6 +729,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = { MIDR_FIXED(MIDR_CPU_VAR_REV(1,1), BIT(25)), .cpu_enable = cpu_clear_bf16_from_user_emulation, }, +#endif +#ifdef CONFIG_AMPERE_ERRATUM_AC03_CPU_38 + { + .desc = "AmpereOne erratum AC03_CPU_38", + .capability = ARM64_WORKAROUND_AMPERE_AC03_CPU_38, + ERRATA_MIDR_ALL_VERSIONS(MIDR_AMPERE1), + }, #endif { } diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 5282cb9ca4cf..32d92ce4bae5 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -611,10 +611,18 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) #ifdef CONFIG_ARM64_HW_AFDBM /* * Enable the Hardware Access Flag management, unconditionally - * on all CPUs. The features is RES0 on CPUs without the support - * and must be ignored by the CPUs. + * on all CPUs. In systems that have asymmetric support for the feature + * this allows KVM to leverage hardware support on the subset of cores + * that implement the feature. + * + * The architecture requires VTCR_EL2.HA to be RES0 (thus ignored by + * hardware) on implementations that do not advertise support for the + * feature. As such, setting HA unconditionally is safe, unless you + * happen to be running on a design that has unadvertised support for + * HAFDBS. Here be dragons. */ - vtcr |= VTCR_EL2_HA; + if (!cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) + vtcr |= VTCR_EL2_HA; #endif /* CONFIG_ARM64_HW_AFDBM */ /* Set the vmid bits */ diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index 40ba95472594..9f9a2d6652eb 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -77,6 +77,7 @@ WORKAROUND_2077057 WORKAROUND_2457168 WORKAROUND_2645198 WORKAROUND_2658417 +WORKAROUND_AMPERE_AC03_CPU_38 WORKAROUND_TRBE_OVERWRITE_FILL_MODE WORKAROUND_TSB_FLUSH_FAILURE WORKAROUND_TRBE_WRITE_OUT_OF_RANGE -- cgit v1.2.3 From ce4a36225753a1a5f3641bff47ecd32fb394dd22 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 9 Jun 2023 22:01:03 +0000 Subject: KVM: arm64: Refactor HFGxTR configuration into separate helpers A subsequent change will need to flip more trap bits in HFGWTR_EL2. Make room for this by factoring out the programming of the HFGxTR registers into helpers and using locals to build the set/clear masks. Link: https://lore.kernel.org/r/20230609220104.1836988-3-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/include/hyp/switch.h | 60 +++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index e78a08a72a3c..901e47a49070 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -70,6 +70,44 @@ static inline void __activate_traps_fpsimd32(struct kvm_vcpu *vcpu) } } +static inline bool __hfgxtr_traps_required(void) +{ + if (cpus_have_final_cap(ARM64_SME)) + return true; + + return false; +} + +static inline void __activate_traps_hfgxtr(void) +{ + u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp; + + if (cpus_have_final_cap(ARM64_SME)) { + tmp = HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK; + + r_clr |= tmp; + w_clr |= tmp; + } + + sysreg_clear_set_s(SYS_HFGRTR_EL2, r_clr, r_set); + sysreg_clear_set_s(SYS_HFGWTR_EL2, w_clr, w_set); +} + +static inline void __deactivate_traps_hfgxtr(void) +{ + u64 r_clr = 0, w_clr = 0, r_set = 0, w_set = 0, tmp; + + if (cpus_have_final_cap(ARM64_SME)) { + tmp = HFGxTR_EL2_nSMPRI_EL1_MASK | HFGxTR_EL2_nTPIDR2_EL0_MASK; + + r_set |= tmp; + w_set |= tmp; + } + + sysreg_clear_set_s(SYS_HFGRTR_EL2, r_clr, r_set); + sysreg_clear_set_s(SYS_HFGWTR_EL2, w_clr, w_set); +} + static inline void __activate_traps_common(struct kvm_vcpu *vcpu) { /* Trap on AArch32 cp15 c15 (impdef sysregs) accesses (EL1 or EL0) */ @@ -89,16 +127,8 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2); write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); - if (cpus_have_final_cap(ARM64_SME)) { - sysreg_clear_set_s(SYS_HFGRTR_EL2, - HFGxTR_EL2_nSMPRI_EL1_MASK | - HFGxTR_EL2_nTPIDR2_EL0_MASK, - 0); - sysreg_clear_set_s(SYS_HFGWTR_EL2, - HFGxTR_EL2_nSMPRI_EL1_MASK | - HFGxTR_EL2_nTPIDR2_EL0_MASK, - 0); - } + if (__hfgxtr_traps_required()) + __activate_traps_hfgxtr(); } static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) @@ -109,14 +139,8 @@ static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) if (kvm_arm_support_pmu_v3()) write_sysreg(0, pmuserenr_el0); - if (cpus_have_final_cap(ARM64_SME)) { - sysreg_clear_set_s(SYS_HFGRTR_EL2, 0, - HFGxTR_EL2_nSMPRI_EL1_MASK | - HFGxTR_EL2_nTPIDR2_EL0_MASK); - sysreg_clear_set_s(SYS_HFGWTR_EL2, 0, - HFGxTR_EL2_nSMPRI_EL1_MASK | - HFGxTR_EL2_nTPIDR2_EL0_MASK); - } + if (__hfgxtr_traps_required()) + __deactivate_traps_hfgxtr(); } static inline void ___activate_traps(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 082fdfd13841fa4e38a8b073561d182e195d528c Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Fri, 9 Jun 2023 22:01:04 +0000 Subject: KVM: arm64: Prevent guests from enabling HA/HD on Ampere1 An erratum in the HAFDBS implementation in AmpereOne was addressed by clearing the feature in the ID register, with the expectation that software would not attempt to use the corresponding controls in TCR_EL1. The architecture, on the other hand, takes a much more pedantic stance on the subject, requiring the TCR bits behave as RES0. Take an extremely conservative stance on the issue and leverage the precise write trap afforded by FGT. Handle guest writes by clearing HA and HD before writing the intended value to the EL1 register alias. Link: https://lore.kernel.org/r/20230609220104.1836988-4-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/include/hyp/switch.h | 39 +++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 901e47a49070..04cc34dd4275 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -75,6 +75,9 @@ static inline bool __hfgxtr_traps_required(void) if (cpus_have_final_cap(ARM64_SME)) return true; + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) + return true; + return false; } @@ -89,6 +92,12 @@ static inline void __activate_traps_hfgxtr(void) w_clr |= tmp; } + /* + * Trap guest writes to TCR_EL1 to prevent it from enabling HA or HD. + */ + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) + w_set |= HFGxTR_EL2_TCR_EL1_MASK; + sysreg_clear_set_s(SYS_HFGRTR_EL2, r_clr, r_set); sysreg_clear_set_s(SYS_HFGWTR_EL2, w_clr, w_set); } @@ -104,6 +113,9 @@ static inline void __deactivate_traps_hfgxtr(void) w_set |= tmp; } + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38)) + w_clr |= HFGxTR_EL2_TCR_EL1_MASK; + sysreg_clear_set_s(SYS_HFGRTR_EL2, r_clr, r_set); sysreg_clear_set_s(SYS_HFGWTR_EL2, w_clr, w_set); } @@ -408,12 +420,39 @@ static bool kvm_hyp_handle_cntpct(struct kvm_vcpu *vcpu) return true; } +static bool handle_ampere1_tcr(struct kvm_vcpu *vcpu) +{ + u32 sysreg = esr_sys64_to_sysreg(kvm_vcpu_get_esr(vcpu)); + int rt = kvm_vcpu_sys_get_rt(vcpu); + u64 val = vcpu_get_reg(vcpu, rt); + + if (sysreg != SYS_TCR_EL1) + return false; + + /* + * Affected parts do not advertise support for hardware Access Flag / + * Dirty state management in ID_AA64MMFR1_EL1.HAFDBS, but the underlying + * control bits are still functional. The architecture requires these be + * RES0 on systems that do not implement FEAT_HAFDBS. + * + * Uphold the requirements of the architecture by masking guest writes + * to TCR_EL1.{HA,HD} here. + */ + val &= ~(TCR_HD | TCR_HA); + write_sysreg_el1(val, SYS_TCR); + return true; +} + static bool kvm_hyp_handle_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) { if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_TX2_219_TVM) && handle_tx2_tvm(vcpu)) return true; + if (cpus_have_final_cap(ARM64_WORKAROUND_AMPERE_AC03_CPU_38) && + handle_ampere1_tcr(vcpu)) + return true; + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && __vgic_v3_perform_cpuif_access(vcpu) == 1) return true; -- cgit v1.2.3 From e325ba2271849e25017743496bcec8d3a83cacb3 Mon Sep 17 00:00:00 2001 From: Nico Boehr Date: Fri, 24 Mar 2023 15:54:24 +0100 Subject: KVM: s390: selftests: add selftest for CMMA migration Add a selftest for CMMA migration on s390. The tests cover: - interaction of dirty tracking and migration mode, see my recent patch "KVM: s390: disable migration mode when dirty tracking is disabled" [1], - several invalid calls of KVM_S390_GET_CMMA_BITS, for example: invalid flags, CMMA support off, with/without peeking - ensure KVM_S390_GET_CMMA_BITS initally reports all pages as dirty, - ensure KVM_S390_GET_CMMA_BITS properly skips over holes in memslots, but also non-dirty pages Note that without the patch at [1] and the small fix in this series, the selftests will fail. [1] https://lore.kernel.org/all/20230127140532.230651-2-nrb@linux.ibm.com/ Signed-off-by: Nico Boehr Reviewed-by: Claudio Imbrenda Message-Id: <20230324145424.293889-3-nrb@linux.ibm.com> Signed-off-by: Claudio Imbrenda [frankja@linux.ibm.com: squashed 20230606150510.671301-1-nrb@linux.ibm.com / "KVM: s390: selftests: CMMA: don't run if CMMA not supported"] Signed-off-by: Janosch Frank --- tools/testing/selftests/kvm/Makefile | 1 + tools/testing/selftests/kvm/s390x/cmma_test.c | 700 ++++++++++++++++++++++++++ 2 files changed, 701 insertions(+) create mode 100644 tools/testing/selftests/kvm/s390x/cmma_test.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 7a5ff646e7e7..e0e5bf120326 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -163,6 +163,7 @@ TEST_GEN_PROGS_s390x = s390x/memop TEST_GEN_PROGS_s390x += s390x/resets TEST_GEN_PROGS_s390x += s390x/sync_regs_test TEST_GEN_PROGS_s390x += s390x/tprot +TEST_GEN_PROGS_s390x += s390x/cmma_test TEST_GEN_PROGS_s390x += demand_paging_test TEST_GEN_PROGS_s390x += dirty_log_test TEST_GEN_PROGS_s390x += kvm_create_max_vcpus diff --git a/tools/testing/selftests/kvm/s390x/cmma_test.c b/tools/testing/selftests/kvm/s390x/cmma_test.c new file mode 100644 index 000000000000..1d73e78e8fa7 --- /dev/null +++ b/tools/testing/selftests/kvm/s390x/cmma_test.c @@ -0,0 +1,700 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test for s390x CMMA migration + * + * Copyright IBM Corp. 2023 + * + * Authors: + * Nico Boehr + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "kselftest.h" + +#define MAIN_PAGE_COUNT 512 + +#define TEST_DATA_PAGE_COUNT 512 +#define TEST_DATA_MEMSLOT 1 +#define TEST_DATA_START_GFN 4096 + +#define TEST_DATA_TWO_PAGE_COUNT 256 +#define TEST_DATA_TWO_MEMSLOT 2 +#define TEST_DATA_TWO_START_GFN 8192 + +static char cmma_value_buf[MAIN_PAGE_COUNT + TEST_DATA_PAGE_COUNT]; + +/** + * Dirty CMMA attributes of exactly one page in the TEST_DATA memslot, + * so use_cmma goes on and the CMMA related ioctls do something. + */ +static void guest_do_one_essa(void) +{ + asm volatile( + /* load TEST_DATA_START_GFN into r1 */ + " llilf 1,%[start_gfn]\n" + /* calculate the address from the gfn */ + " sllg 1,1,12(0)\n" + /* set the first page in TEST_DATA memslot to STABLE */ + " .insn rrf,0xb9ab0000,2,1,1,0\n" + /* hypercall */ + " diag 0,0,0x501\n" + "0: j 0b" + : + : [start_gfn] "L"(TEST_DATA_START_GFN) + : "r1", "r2", "memory", "cc" + ); +} + +/** + * Touch CMMA attributes of all pages in TEST_DATA memslot. Set them to stable + * state. + */ +static void guest_dirty_test_data(void) +{ + asm volatile( + /* r1 = TEST_DATA_START_GFN */ + " xgr 1,1\n" + " llilf 1,%[start_gfn]\n" + /* r5 = TEST_DATA_PAGE_COUNT */ + " lghi 5,%[page_count]\n" + /* r5 += r1 */ + "2: agfr 5,1\n" + /* r2 = r1 << 12 */ + "1: sllg 2,1,12(0)\n" + /* essa(r4, r2, SET_STABLE) */ + " .insn rrf,0xb9ab0000,4,2,1,0\n" + /* i++ */ + " agfi 1,1\n" + /* if r1 < r5 goto 1 */ + " cgrjl 1,5,1b\n" + /* hypercall */ + " diag 0,0,0x501\n" + "0: j 0b" + : + : [start_gfn] "L"(TEST_DATA_START_GFN), + [page_count] "L"(TEST_DATA_PAGE_COUNT) + : + /* the counter in our loop over the pages */ + "r1", + /* the calculated page physical address */ + "r2", + /* ESSA output register */ + "r4", + /* last page */ + "r5", + "cc", "memory" + ); +} + +static struct kvm_vm *create_vm(void) +{ + return ____vm_create(VM_MODE_DEFAULT); +} + +static void create_main_memslot(struct kvm_vm *vm) +{ + int i; + + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, 0, 0, MAIN_PAGE_COUNT, 0); + /* set the array of memslots to zero like __vm_create does */ + for (i = 0; i < NR_MEM_REGIONS; i++) + vm->memslots[i] = 0; +} + +static void create_test_memslot(struct kvm_vm *vm) +{ + vm_userspace_mem_region_add(vm, + VM_MEM_SRC_ANONYMOUS, + TEST_DATA_START_GFN << vm->page_shift, + TEST_DATA_MEMSLOT, + TEST_DATA_PAGE_COUNT, + 0 + ); + vm->memslots[MEM_REGION_TEST_DATA] = TEST_DATA_MEMSLOT; +} + +static void create_memslots(struct kvm_vm *vm) +{ + /* + * Our VM has the following memory layout: + * +------+---------------------------+ + * | GFN | Memslot | + * +------+---------------------------+ + * | 0 | | + * | ... | MAIN (Code, Stack, ...) | + * | 511 | | + * +------+---------------------------+ + * | 4096 | | + * | ... | TEST_DATA | + * | 4607 | | + * +------+---------------------------+ + */ + create_main_memslot(vm); + create_test_memslot(vm); +} + +static void finish_vm_setup(struct kvm_vm *vm) +{ + struct userspace_mem_region *slot0; + + kvm_vm_elf_load(vm, program_invocation_name); + + slot0 = memslot2region(vm, 0); + ucall_init(vm, slot0->region.guest_phys_addr + slot0->region.memory_size); + + kvm_arch_vm_post_create(vm); +} + +static struct kvm_vm *create_vm_two_memslots(void) +{ + struct kvm_vm *vm; + + vm = create_vm(); + + create_memslots(vm); + + finish_vm_setup(vm); + + return vm; +} + +static void enable_cmma(struct kvm_vm *vm) +{ + int r; + + r = __kvm_device_attr_set(vm->fd, KVM_S390_VM_MEM_CTRL, KVM_S390_VM_MEM_ENABLE_CMMA, NULL); + TEST_ASSERT(!r, "enabling cmma failed r=%d errno=%d", r, errno); +} + +static void enable_dirty_tracking(struct kvm_vm *vm) +{ + vm_mem_region_set_flags(vm, 0, KVM_MEM_LOG_DIRTY_PAGES); + vm_mem_region_set_flags(vm, TEST_DATA_MEMSLOT, KVM_MEM_LOG_DIRTY_PAGES); +} + +static int __enable_migration_mode(struct kvm_vm *vm) +{ + return __kvm_device_attr_set(vm->fd, + KVM_S390_VM_MIGRATION, + KVM_S390_VM_MIGRATION_START, + NULL + ); +} + +static void enable_migration_mode(struct kvm_vm *vm) +{ + int r = __enable_migration_mode(vm); + + TEST_ASSERT(!r, "enabling migration mode failed r=%d errno=%d", r, errno); +} + +static bool is_migration_mode_on(struct kvm_vm *vm) +{ + u64 out; + int r; + + r = __kvm_device_attr_get(vm->fd, + KVM_S390_VM_MIGRATION, + KVM_S390_VM_MIGRATION_STATUS, + &out + ); + TEST_ASSERT(!r, "getting migration mode status failed r=%d errno=%d", r, errno); + return out; +} + +static int vm_get_cmma_bits(struct kvm_vm *vm, u64 flags, int *errno_out) +{ + struct kvm_s390_cmma_log args; + int rc; + + errno = 0; + + args = (struct kvm_s390_cmma_log){ + .start_gfn = 0, + .count = sizeof(cmma_value_buf), + .flags = flags, + .values = (__u64)&cmma_value_buf[0] + }; + rc = __vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); + + *errno_out = errno; + return rc; +} + +static void test_get_cmma_basic(void) +{ + struct kvm_vm *vm = create_vm_two_memslots(); + struct kvm_vcpu *vcpu; + int rc, errno_out; + + /* GET_CMMA_BITS without CMMA enabled should fail */ + rc = vm_get_cmma_bits(vm, 0, &errno_out); + ASSERT_EQ(rc, -1); + ASSERT_EQ(errno_out, ENXIO); + + enable_cmma(vm); + vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa); + + vcpu_run(vcpu); + + /* GET_CMMA_BITS without migration mode and without peeking should fail */ + rc = vm_get_cmma_bits(vm, 0, &errno_out); + ASSERT_EQ(rc, -1); + ASSERT_EQ(errno_out, EINVAL); + + /* GET_CMMA_BITS without migration mode and with peeking should work */ + rc = vm_get_cmma_bits(vm, KVM_S390_CMMA_PEEK, &errno_out); + ASSERT_EQ(rc, 0); + ASSERT_EQ(errno_out, 0); + + enable_dirty_tracking(vm); + enable_migration_mode(vm); + + /* GET_CMMA_BITS with invalid flags */ + rc = vm_get_cmma_bits(vm, 0xfeedc0fe, &errno_out); + ASSERT_EQ(rc, -1); + ASSERT_EQ(errno_out, EINVAL); + + kvm_vm_free(vm); +} + +static void assert_exit_was_hypercall(struct kvm_vcpu *vcpu) +{ + ASSERT_EQ(vcpu->run->exit_reason, 13); + ASSERT_EQ(vcpu->run->s390_sieic.icptcode, 4); + ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x8300); + ASSERT_EQ(vcpu->run->s390_sieic.ipb, 0x5010000); +} + +static void test_migration_mode(void) +{ + struct kvm_vm *vm = create_vm(); + struct kvm_vcpu *vcpu; + u64 orig_psw; + int rc; + + /* enabling migration mode on a VM without memory should fail */ + rc = __enable_migration_mode(vm); + ASSERT_EQ(rc, -1); + ASSERT_EQ(errno, EINVAL); + TEST_ASSERT(!is_migration_mode_on(vm), "migration mode should still be off"); + errno = 0; + + create_memslots(vm); + finish_vm_setup(vm); + + enable_cmma(vm); + vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa); + orig_psw = vcpu->run->psw_addr; + + /* + * Execute one essa instruction in the guest. Otherwise the guest will + * not have use_cmm enabled and GET_CMMA_BITS will return no pages. + */ + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + /* migration mode when memslots have dirty tracking off should fail */ + rc = __enable_migration_mode(vm); + ASSERT_EQ(rc, -1); + ASSERT_EQ(errno, EINVAL); + TEST_ASSERT(!is_migration_mode_on(vm), "migration mode should still be off"); + errno = 0; + + /* enable dirty tracking */ + enable_dirty_tracking(vm); + + /* enabling migration mode should work now */ + rc = __enable_migration_mode(vm); + ASSERT_EQ(rc, 0); + TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on"); + errno = 0; + + /* execute another ESSA instruction to see this goes fine */ + vcpu->run->psw_addr = orig_psw; + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + /* + * With migration mode on, create a new memslot with dirty tracking off. + * This should turn off migration mode. + */ + TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on"); + vm_userspace_mem_region_add(vm, + VM_MEM_SRC_ANONYMOUS, + TEST_DATA_TWO_START_GFN << vm->page_shift, + TEST_DATA_TWO_MEMSLOT, + TEST_DATA_TWO_PAGE_COUNT, + 0 + ); + TEST_ASSERT(!is_migration_mode_on(vm), + "creating memslot without dirty tracking turns off migration mode" + ); + + /* ESSA instructions should still execute fine */ + vcpu->run->psw_addr = orig_psw; + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + /* + * Turn on dirty tracking on the new memslot. + * It should be possible to turn migration mode back on again. + */ + vm_mem_region_set_flags(vm, TEST_DATA_TWO_MEMSLOT, KVM_MEM_LOG_DIRTY_PAGES); + rc = __enable_migration_mode(vm); + ASSERT_EQ(rc, 0); + TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on"); + errno = 0; + + /* + * Turn off dirty tracking again, this time with just a flag change. + * Again, migration mode should turn off. + */ + TEST_ASSERT(is_migration_mode_on(vm), "migration mode should be on"); + vm_mem_region_set_flags(vm, TEST_DATA_TWO_MEMSLOT, 0); + TEST_ASSERT(!is_migration_mode_on(vm), + "disabling dirty tracking should turn off migration mode" + ); + + /* ESSA instructions should still execute fine */ + vcpu->run->psw_addr = orig_psw; + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + kvm_vm_free(vm); +} + +/** + * Given a VM with the MAIN and TEST_DATA memslot, assert that both slots have + * CMMA attributes of all pages in both memslots and nothing more dirty. + * This has the useful side effect of ensuring nothing is CMMA dirty after this + * function. + */ +static void assert_all_slots_cmma_dirty(struct kvm_vm *vm) +{ + struct kvm_s390_cmma_log args; + + /* + * First iteration - everything should be dirty. + * Start at the main memslot... + */ + args = (struct kvm_s390_cmma_log){ + .start_gfn = 0, + .count = sizeof(cmma_value_buf), + .flags = 0, + .values = (__u64)&cmma_value_buf[0] + }; + memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); + vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); + ASSERT_EQ(args.count, MAIN_PAGE_COUNT); + ASSERT_EQ(args.remaining, TEST_DATA_PAGE_COUNT); + ASSERT_EQ(args.start_gfn, 0); + + /* ...and then - after a hole - the TEST_DATA memslot should follow */ + args = (struct kvm_s390_cmma_log){ + .start_gfn = MAIN_PAGE_COUNT, + .count = sizeof(cmma_value_buf), + .flags = 0, + .values = (__u64)&cmma_value_buf[0] + }; + memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); + vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); + ASSERT_EQ(args.count, TEST_DATA_PAGE_COUNT); + ASSERT_EQ(args.start_gfn, TEST_DATA_START_GFN); + ASSERT_EQ(args.remaining, 0); + + /* ...and nothing else should be there */ + args = (struct kvm_s390_cmma_log){ + .start_gfn = TEST_DATA_START_GFN + TEST_DATA_PAGE_COUNT, + .count = sizeof(cmma_value_buf), + .flags = 0, + .values = (__u64)&cmma_value_buf[0] + }; + memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); + vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); + ASSERT_EQ(args.count, 0); + ASSERT_EQ(args.start_gfn, 0); + ASSERT_EQ(args.remaining, 0); +} + +/** + * Given a VM, assert no pages are CMMA dirty. + */ +static void assert_no_pages_cmma_dirty(struct kvm_vm *vm) +{ + struct kvm_s390_cmma_log args; + + /* If we start from GFN 0 again, nothing should be dirty. */ + args = (struct kvm_s390_cmma_log){ + .start_gfn = 0, + .count = sizeof(cmma_value_buf), + .flags = 0, + .values = (__u64)&cmma_value_buf[0] + }; + memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); + vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, &args); + if (args.count || args.remaining || args.start_gfn) + TEST_FAIL("pages are still dirty start_gfn=0x%llx count=%u remaining=%llu", + args.start_gfn, + args.count, + args.remaining + ); +} + +static void test_get_inital_dirty(void) +{ + struct kvm_vm *vm = create_vm_two_memslots(); + struct kvm_vcpu *vcpu; + + enable_cmma(vm); + vcpu = vm_vcpu_add(vm, 1, guest_do_one_essa); + + /* + * Execute one essa instruction in the guest. Otherwise the guest will + * not have use_cmm enabled and GET_CMMA_BITS will return no pages. + */ + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + enable_dirty_tracking(vm); + enable_migration_mode(vm); + + assert_all_slots_cmma_dirty(vm); + + /* Start from the beginning again and make sure nothing else is dirty */ + assert_no_pages_cmma_dirty(vm); + + kvm_vm_free(vm); +} + +static void query_cmma_range(struct kvm_vm *vm, + u64 start_gfn, u64 gfn_count, + struct kvm_s390_cmma_log *res_out) +{ + *res_out = (struct kvm_s390_cmma_log){ + .start_gfn = start_gfn, + .count = gfn_count, + .flags = 0, + .values = (__u64)&cmma_value_buf[0] + }; + memset(cmma_value_buf, 0xff, sizeof(cmma_value_buf)); + vm_ioctl(vm, KVM_S390_GET_CMMA_BITS, res_out); +} + +/** + * Assert the given cmma_log struct that was executed by query_cmma_range() + * indicates the first dirty gfn is at first_dirty_gfn and contains exactly + * dirty_gfn_count CMMA values. + */ +static void assert_cmma_dirty(u64 first_dirty_gfn, + u64 dirty_gfn_count, + const struct kvm_s390_cmma_log *res) +{ + ASSERT_EQ(res->start_gfn, first_dirty_gfn); + ASSERT_EQ(res->count, dirty_gfn_count); + for (size_t i = 0; i < dirty_gfn_count; i++) + ASSERT_EQ(cmma_value_buf[0], 0x0); /* stable state */ + ASSERT_EQ(cmma_value_buf[dirty_gfn_count], 0xff); /* not touched */ +} + +static void test_get_skip_holes(void) +{ + size_t gfn_offset; + struct kvm_vm *vm = create_vm_two_memslots(); + struct kvm_s390_cmma_log log; + struct kvm_vcpu *vcpu; + u64 orig_psw; + + enable_cmma(vm); + vcpu = vm_vcpu_add(vm, 1, guest_dirty_test_data); + + orig_psw = vcpu->run->psw_addr; + + /* + * Execute some essa instructions in the guest. Otherwise the guest will + * not have use_cmm enabled and GET_CMMA_BITS will return no pages. + */ + vcpu_run(vcpu); + assert_exit_was_hypercall(vcpu); + + enable_dirty_tracking(vm); + enable_migration_mode(vm); + + /* un-dirty all pages */ + assert_all_slots_cmma_dirty(vm); + + /* Then, dirty just the TEST_DATA memslot */ + vcpu->run->psw_addr = orig_psw; + vcpu_run(vcpu); + + gfn_offset = TEST_DATA_START_GFN; + /** + * Query CMMA attributes of one page, starting at page 0. Since the + * main memslot was not touched by the VM, this should yield the first + * page of the TEST_DATA memslot. + * The dirty bitmap should now look like this: + * 0: not dirty + * [0x1, 0x200): dirty + */ + query_cmma_range(vm, 0, 1, &log); + assert_cmma_dirty(gfn_offset, 1, &log); + gfn_offset++; + + /** + * Query CMMA attributes of 32 (0x20) pages past the end of the TEST_DATA + * memslot. This should wrap back to the beginning of the TEST_DATA + * memslot, page 1. + * The dirty bitmap should now look like this: + * [0, 0x21): not dirty + * [0x21, 0x200): dirty + */ + query_cmma_range(vm, TEST_DATA_START_GFN + TEST_DATA_PAGE_COUNT, 0x20, &log); + assert_cmma_dirty(gfn_offset, 0x20, &log); + gfn_offset += 0x20; + + /* Skip 32 pages */ + gfn_offset += 0x20; + + /** + * After skipping 32 pages, query the next 32 (0x20) pages. + * The dirty bitmap should now look like this: + * [0, 0x21): not dirty + * [0x21, 0x41): dirty + * [0x41, 0x61): not dirty + * [0x61, 0x200): dirty + */ + query_cmma_range(vm, gfn_offset, 0x20, &log); + assert_cmma_dirty(gfn_offset, 0x20, &log); + gfn_offset += 0x20; + + /** + * Query 1 page from the beginning of the TEST_DATA memslot. This should + * yield page 0x21. + * The dirty bitmap should now look like this: + * [0, 0x22): not dirty + * [0x22, 0x41): dirty + * [0x41, 0x61): not dirty + * [0x61, 0x200): dirty + */ + query_cmma_range(vm, TEST_DATA_START_GFN, 1, &log); + assert_cmma_dirty(TEST_DATA_START_GFN + 0x21, 1, &log); + gfn_offset++; + + /** + * Query 15 (0xF) pages from page 0x23 in TEST_DATA memslot. + * This should yield pages [0x23, 0x33). + * The dirty bitmap should now look like this: + * [0, 0x22): not dirty + * 0x22: dirty + * [0x23, 0x33): not dirty + * [0x33, 0x41): dirty + * [0x41, 0x61): not dirty + * [0x61, 0x200): dirty + */ + gfn_offset = TEST_DATA_START_GFN + 0x23; + query_cmma_range(vm, gfn_offset, 15, &log); + assert_cmma_dirty(gfn_offset, 15, &log); + + /** + * Query 17 (0x11) pages from page 0x22 in TEST_DATA memslot. + * This should yield page [0x22, 0x33) + * The dirty bitmap should now look like this: + * [0, 0x33): not dirty + * [0x33, 0x41): dirty + * [0x41, 0x61): not dirty + * [0x61, 0x200): dirty + */ + gfn_offset = TEST_DATA_START_GFN + 0x22; + query_cmma_range(vm, gfn_offset, 17, &log); + assert_cmma_dirty(gfn_offset, 17, &log); + + /** + * Query 25 (0x19) pages from page 0x40 in TEST_DATA memslot. + * This should yield page 0x40 and nothing more, since there are more + * than 16 non-dirty pages after page 0x40. + * The dirty bitmap should now look like this: + * [0, 0x33): not dirty + * [0x33, 0x40): dirty + * [0x40, 0x61): not dirty + * [0x61, 0x200): dirty + */ + gfn_offset = TEST_DATA_START_GFN + 0x40; + query_cmma_range(vm, gfn_offset, 25, &log); + assert_cmma_dirty(gfn_offset, 1, &log); + + /** + * Query pages [0x33, 0x40). + * The dirty bitmap should now look like this: + * [0, 0x61): not dirty + * [0x61, 0x200): dirty + */ + gfn_offset = TEST_DATA_START_GFN + 0x33; + query_cmma_range(vm, gfn_offset, 0x40 - 0x33, &log); + assert_cmma_dirty(gfn_offset, 0x40 - 0x33, &log); + + /** + * Query the remaining pages [0x61, 0x200). + */ + gfn_offset = TEST_DATA_START_GFN; + query_cmma_range(vm, gfn_offset, TEST_DATA_PAGE_COUNT - 0x61, &log); + assert_cmma_dirty(TEST_DATA_START_GFN + 0x61, TEST_DATA_PAGE_COUNT - 0x61, &log); + + assert_no_pages_cmma_dirty(vm); +} + +struct testdef { + const char *name; + void (*test)(void); +} testlist[] = { + { "migration mode and dirty tracking", test_migration_mode }, + { "GET_CMMA_BITS: basic calls", test_get_cmma_basic }, + { "GET_CMMA_BITS: all pages are dirty initally", test_get_inital_dirty }, + { "GET_CMMA_BITS: holes are skipped", test_get_skip_holes }, +}; + +/** + * The kernel may support CMMA, but the machine may not (i.e. if running as + * guest-3). + * + * In this case, the CMMA capabilities are all there, but the CMMA-related + * ioctls fail. To find out whether the machine supports CMMA, create a + * temporary VM and then query the CMMA feature of the VM. + */ +static int machine_has_cmma(void) +{ + struct kvm_vm *vm = create_vm(); + int r; + + r = !__kvm_has_device_attr(vm->fd, KVM_S390_VM_MEM_CTRL, KVM_S390_VM_MEM_ENABLE_CMMA); + kvm_vm_free(vm); + + return r; +} + +int main(int argc, char *argv[]) +{ + int idx; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_SYNC_REGS)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_CMMA_MIGRATION)); + TEST_REQUIRE(machine_has_cmma()); + + ksft_print_header(); + + ksft_set_plan(ARRAY_SIZE(testlist)); + + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + testlist[idx].test(); + ksft_test_result_pass("%s\n", testlist[idx].name); + } + + ksft_finished(); /* Print results and exit() accordingly */ +} -- cgit v1.2.3 From 246be7d2720ea9a795b576067ecc5e5c7a1e7848 Mon Sep 17 00:00:00 2001 From: Pierre Morel Date: Wed, 10 May 2023 17:42:58 +0200 Subject: KVM: s390: vsie: fix the length of APCB bitmap bit_and() uses the count of bits as the woking length. Fix the previous implementation and effectively use the right bitmap size. Fixes: 19fd83a64718 ("KVM: s390: vsie: allow CRYCB FORMAT-1") Fixes: 56019f9aca22 ("KVM: s390: vsie: Allow CRYCB FORMAT-2") Signed-off-by: Pierre Morel Reviewed-by: Janosch Frank Link: https://lore.kernel.org/kvm/20230511094719.9691-1-pmorel@linux.ibm.com/ Signed-off-by: Janosch Frank --- arch/s390/kvm/vsie.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 8d6b765abf29..0333ee482eb8 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -177,7 +177,8 @@ static int setup_apcb00(struct kvm_vcpu *vcpu, unsigned long *apcb_s, sizeof(struct kvm_s390_apcb0))) return -EFAULT; - bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb0)); + bitmap_and(apcb_s, apcb_s, apcb_h, + BITS_PER_BYTE * sizeof(struct kvm_s390_apcb0)); return 0; } @@ -203,7 +204,8 @@ static int setup_apcb11(struct kvm_vcpu *vcpu, unsigned long *apcb_s, sizeof(struct kvm_s390_apcb1))) return -EFAULT; - bitmap_and(apcb_s, apcb_s, apcb_h, sizeof(struct kvm_s390_apcb1)); + bitmap_and(apcb_s, apcb_s, apcb_h, + BITS_PER_BYTE * sizeof(struct kvm_s390_apcb1)); return 0; } -- cgit v1.2.3 From 0bc380beb78aa352eadbc21d934dd9606fcee808 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 15 May 2023 10:42:34 +0200 Subject: KVM: s390/diag: fix racy access of physical cpu number in diag 9c handler We do check for target CPU == -1, but this might change at the time we are going to use it. Hold the physical target CPU in a local variable to avoid out-of-bound accesses to the cpu arrays. Cc: Pierre Morel Fixes: 87e28a15c42c ("KVM: s390: diag9c (directed yield) forwarding") Reported-by: Marc Hartmayer Reviewed-by: Nico Boehr Reviewed-by: Pierre Morel Signed-off-by: Christian Borntraeger Signed-off-by: Janosch Frank --- arch/s390/kvm/diag.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index 807fa9da1e72..3c65b8258ae6 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -166,6 +166,7 @@ static int diag9c_forwarding_overrun(void) static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) { struct kvm_vcpu *tcpu; + int tcpu_cpu; int tid; tid = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; @@ -181,14 +182,15 @@ static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) goto no_yield; /* target guest VCPU already running */ - if (READ_ONCE(tcpu->cpu) >= 0) { + tcpu_cpu = READ_ONCE(tcpu->cpu); + if (tcpu_cpu >= 0) { if (!diag9c_forwarding_hz || diag9c_forwarding_overrun()) goto no_yield; /* target host CPU already running */ - if (!vcpu_is_preempted(tcpu->cpu)) + if (!vcpu_is_preempted(tcpu_cpu)) goto no_yield; - smp_yield_cpu(tcpu->cpu); + smp_yield_cpu(tcpu_cpu); VCPU_EVENT(vcpu, 5, "diag time slice end directed to %d: yield forwarded", tid); -- cgit v1.2.3 From 4255ce017723238dddbb63ba06d760d73b57896a Mon Sep 17 00:00:00 2001 From: Steffen Eiden Date: Thu, 15 Jun 2023 12:05:27 +0200 Subject: s390/uv: Always export uv_info KVM needs the struct's values to be able to provide PV support. The uvdevice is currently guest only and will need the struct's values for call support checking and potential future expansions. As uv.c is only compiled with CONFIG_PGSTE or CONFIG_PROTECTED_VIRTUALIZATION_GUEST we don't need a second check in the code. Users of uv_info will need to fence for these two config options for the time being. Signed-off-by: Steffen Eiden Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20230615100533.3996107-2-seiden@linux.ibm.com Signed-off-by: Janosch Frank Message-Id: <20230615100533.3996107-2-seiden@linux.ibm.com> --- arch/s390/kernel/uv.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index cb2ee06df286..e320a382fa85 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -23,12 +23,20 @@ int __bootdata_preserved(prot_virt_guest); #endif +/* + * uv_info contains both host and guest information but it's currently only + * expected to be used within modules if it's the KVM module or for + * any PV guest module. + * + * The kernel itself will write these values once in uv_query_info() + * and then make some of them readable via a sysfs interface. + */ struct uv_info __bootdata_preserved(uv_info); +EXPORT_SYMBOL(uv_info); #if IS_ENABLED(CONFIG_KVM) int __bootdata_preserved(prot_virt_host); EXPORT_SYMBOL(prot_virt_host); -EXPORT_SYMBOL(uv_info); static int __init uv_init(phys_addr_t stor_base, unsigned long stor_len) { -- cgit v1.2.3 From ea9d97163523d299022fc78258eccc466d92102a Mon Sep 17 00:00:00 2001 From: Steffen Eiden Date: Thu, 15 Jun 2023 12:05:28 +0200 Subject: s390/uvdevice: Add info IOCTL Add an IOCTL that allows userspace to find out which IOCTLs the uvdevice supports without trial and error. Explicitly expose the IOCTL nr for the request types. Signed-off-by: Steffen Eiden Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20230615100533.3996107-3-seiden@linux.ibm.com Signed-off-by: Janosch Frank Message-Id: <20230615100533.3996107-3-seiden@linux.ibm.com> --- arch/s390/include/uapi/asm/uvdevice.h | 42 ++++++++++++++++++- drivers/s390/char/Kconfig | 2 +- drivers/s390/char/uvdevice.c | 77 +++++++++++++++++++++++++++++++---- 3 files changed, 112 insertions(+), 9 deletions(-) diff --git a/arch/s390/include/uapi/asm/uvdevice.h b/arch/s390/include/uapi/asm/uvdevice.h index 10a5ac918e02..9d9b684836c2 100644 --- a/arch/s390/include/uapi/asm/uvdevice.h +++ b/arch/s390/include/uapi/asm/uvdevice.h @@ -32,6 +32,33 @@ struct uvio_attest { __u16 reserved136; /* 0x0136 */ }; +/** + * uvio_uvdev_info - Information of supported functions + * @supp_uvio_cmds - supported IOCTLs by this device + * @supp_uv_cmds - supported UVCs corresponding to the IOCTL + * + * UVIO request to get information about supported request types by this + * uvdevice and the Ultravisor. Everything is output. Bits are in LSB0 + * ordering. If the bit is set in both, @supp_uvio_cmds and @supp_uv_cmds, the + * uvdevice and the Ultravisor support that call. + * + * Note that bit 0 (UVIO_IOCTL_UVDEV_INFO_NR) is always zero for `supp_uv_cmds` + * as there is no corresponding UV-call. + */ +struct uvio_uvdev_info { + /* + * If bit `n` is set, this device supports the IOCTL with nr `n`. + */ + __u64 supp_uvio_cmds; + /* + * If bit `n` is set, the Ultravisor(UV) supports the UV-call + * corresponding to the IOCTL with nr `n` in the calling contextx (host + * or guest). The value is only valid if the corresponding bit in + * @supp_uvio_cmds is set as well. + */ + __u64 supp_uv_cmds; +}; + /* * The following max values define an upper length for the IOCTL in/out buffers. * However, they do not represent the maximum the Ultravisor allows which is @@ -46,6 +73,19 @@ struct uvio_attest { #define UVIO_DEVICE_NAME "uv" #define UVIO_TYPE_UVC 'u' -#define UVIO_IOCTL_ATT _IOWR(UVIO_TYPE_UVC, 0x01, struct uvio_ioctl_cb) +enum UVIO_IOCTL_NR { + UVIO_IOCTL_UVDEV_INFO_NR = 0x00, + UVIO_IOCTL_ATT_NR, + /* must be the last entry */ + UVIO_IOCTL_NUM_IOCTLS +}; + +#define UVIO_IOCTL(nr) _IOWR(UVIO_TYPE_UVC, nr, struct uvio_ioctl_cb) +#define UVIO_IOCTL_UVDEV_INFO UVIO_IOCTL(UVIO_IOCTL_UVDEV_INFO_NR) +#define UVIO_IOCTL_ATT UVIO_IOCTL(UVIO_IOCTL_ATT_NR) + +#define UVIO_SUPP_CALL(nr) (1ULL << (nr)) +#define UVIO_SUPP_UDEV_INFO UVIO_SUPP_CALL(UVIO_IOCTL_UDEV_INFO_NR) +#define UVIO_SUPP_ATT UVIO_SUPP_CALL(UVIO_IOCTL_ATT_NR) #endif /* __S390_ASM_UVDEVICE_H */ diff --git a/drivers/s390/char/Kconfig b/drivers/s390/char/Kconfig index 80c4e5101c97..8a03af5ee5b3 100644 --- a/drivers/s390/char/Kconfig +++ b/drivers/s390/char/Kconfig @@ -96,7 +96,7 @@ config SCLP_OFB config S390_UV_UAPI def_tristate m prompt "Ultravisor userspace API" - depends on S390 + depends on S390 && (KVM || PROTECTED_VIRTUALIZATION_GUEST) help Selecting exposes parts of the UV interface to userspace by providing a misc character device at /dev/uv. diff --git a/drivers/s390/char/uvdevice.c b/drivers/s390/char/uvdevice.c index 1d40457c7b10..7d7866be389b 100644 --- a/drivers/s390/char/uvdevice.c +++ b/drivers/s390/char/uvdevice.c @@ -32,6 +32,52 @@ #include #include +#define BIT_UVIO_INTERNAL U32_MAX +/* Mapping from IOCTL-nr to UVC-bit */ +static const u32 ioctl_nr_to_uvc_bit[] __initconst = { + [UVIO_IOCTL_UVDEV_INFO_NR] = BIT_UVIO_INTERNAL, + [UVIO_IOCTL_ATT_NR] = BIT_UVC_CMD_RETR_ATTEST, +}; + +static_assert(ARRAY_SIZE(ioctl_nr_to_uvc_bit) == UVIO_IOCTL_NUM_IOCTLS); + +static struct uvio_uvdev_info uvdev_info = { + .supp_uvio_cmds = GENMASK_ULL(UVIO_IOCTL_NUM_IOCTLS - 1, 0), +}; + +static void __init set_supp_uv_cmds(unsigned long *supp_uv_cmds) +{ + int i; + + for (i = 0; i < UVIO_IOCTL_NUM_IOCTLS; i++) { + if (ioctl_nr_to_uvc_bit[i] == BIT_UVIO_INTERNAL) + continue; + if (!test_bit_inv(ioctl_nr_to_uvc_bit[i], uv_info.inst_calls_list)) + continue; + __set_bit(i, supp_uv_cmds); + } +} + +/** + * uvio_uvdev_info() - get information about the uvdevice + * + * @uv_ioctl: ioctl control block + * + * Lists all IOCTLs that are supported by this uvdevice + */ +static int uvio_uvdev_info(struct uvio_ioctl_cb *uv_ioctl) +{ + void __user *user_buf_arg = (void __user *)uv_ioctl->argument_addr; + + if (uv_ioctl->argument_len < sizeof(uvdev_info)) + return -EINVAL; + if (copy_to_user(user_buf_arg, &uvdev_info, sizeof(uvdev_info))) + return -EFAULT; + + uv_ioctl->uv_rc = UVC_RC_EXECUTED; + return 0; +} + static int uvio_build_uvcb_attest(struct uv_cb_attest *uvcb_attest, u8 *arcb, u8 *meas, u8 *add_data, struct uvio_attest *uvio_attest) { @@ -185,8 +231,19 @@ out: return ret; } -static int uvio_copy_and_check_ioctl(struct uvio_ioctl_cb *ioctl, void __user *argp) +static int uvio_copy_and_check_ioctl(struct uvio_ioctl_cb *ioctl, void __user *argp, + unsigned long cmd) { + u8 nr = _IOC_NR(cmd); + + if (_IOC_DIR(cmd) != (_IOC_READ | _IOC_WRITE)) + return -ENOIOCTLCMD; + if (_IOC_TYPE(cmd) != UVIO_TYPE_UVC) + return -ENOIOCTLCMD; + if (nr >= UVIO_IOCTL_NUM_IOCTLS) + return -ENOIOCTLCMD; + if (_IOC_SIZE(cmd) != sizeof(*ioctl)) + return -ENOIOCTLCMD; if (copy_from_user(ioctl, argp, sizeof(*ioctl))) return -EFAULT; if (ioctl->flags != 0) @@ -194,7 +251,7 @@ static int uvio_copy_and_check_ioctl(struct uvio_ioctl_cb *ioctl, void __user *a if (memchr_inv(ioctl->reserved14, 0, sizeof(ioctl->reserved14))) return -EINVAL; - return 0; + return nr; } /* @@ -205,12 +262,17 @@ static long uvio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) void __user *argp = (void __user *)arg; struct uvio_ioctl_cb uv_ioctl = { }; long ret; + int nr; + + nr = uvio_copy_and_check_ioctl(&uv_ioctl, argp, cmd); + if (nr < 0) + return nr; - switch (cmd) { - case UVIO_IOCTL_ATT: - ret = uvio_copy_and_check_ioctl(&uv_ioctl, argp); - if (ret) - return ret; + switch (nr) { + case UVIO_IOCTL_UVDEV_INFO_NR: + ret = uvio_uvdev_info(&uv_ioctl); + break; + case UVIO_IOCTL_ATT_NR: ret = uvio_attestation(&uv_ioctl); break; default: @@ -245,6 +307,7 @@ static void __exit uvio_dev_exit(void) static int __init uvio_dev_init(void) { + set_supp_uv_cmds((unsigned long *)&uvdev_info.supp_uv_cmds); return misc_register(&uvio_dev_miscdev); } -- cgit v1.2.3 From 44567ca21aaf6f60cb5dcde180b1f6aab9da33dd Mon Sep 17 00:00:00 2001 From: Steffen Eiden Date: Thu, 15 Jun 2023 12:05:29 +0200 Subject: s390/uvdevice: Add 'Add Secret' UVC Userspace can call the Add Secret Ultravisor Call using IOCTLs on the uvdevice. The Add Secret UV call sends an encrypted and cryptographically verified request to the Ultravisor. The request inserts a protected guest's secret into the Ultravisor for later use. The uvdevice is merely transporting the request from userspace to the Ultravisor. It's neither checking nor manipulating the request data. Signed-off-by: Steffen Eiden Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20230615100533.3996107-4-seiden@linux.ibm.com Signed-off-by: Janosch Frank Message-Id: <20230615100533.3996107-4-seiden@linux.ibm.com> --- arch/s390/include/asm/uv.h | 14 ++++++++ arch/s390/include/uapi/asm/uvdevice.h | 4 +++ drivers/s390/char/uvdevice.c | 63 +++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+) diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index 28a9ad57b6f1..1babc70ea5d4 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -58,6 +58,7 @@ #define UVC_CMD_SET_SHARED_ACCESS 0x1000 #define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001 #define UVC_CMD_RETR_ATTEST 0x1020 +#define UVC_CMD_ADD_SECRET 0x1031 /* Bits in installed uv calls */ enum uv_cmds_inst { @@ -88,6 +89,7 @@ enum uv_cmds_inst { BIT_UVC_CMD_DUMP_CPU = 26, BIT_UVC_CMD_DUMP_COMPLETE = 27, BIT_UVC_CMD_RETR_ATTEST = 28, + BIT_UVC_CMD_ADD_SECRET = 29, }; enum uv_feat_ind { @@ -292,6 +294,18 @@ struct uv_cb_dump_complete { u64 reserved30[5]; } __packed __aligned(8); +/* + * A common UV call struct for pv guests that contains a single address + * Examples: + * Add Secret + */ +struct uv_cb_guest_addr { + struct uv_cb_header header; + u64 reserved08[3]; + u64 addr; + u64 reserved28[4]; +} __packed __aligned(8); + static inline int __uv_call(unsigned long r1, unsigned long r2) { int cc; diff --git a/arch/s390/include/uapi/asm/uvdevice.h b/arch/s390/include/uapi/asm/uvdevice.h index 9d9b684836c2..e77410226598 100644 --- a/arch/s390/include/uapi/asm/uvdevice.h +++ b/arch/s390/include/uapi/asm/uvdevice.h @@ -69,6 +69,7 @@ struct uvio_uvdev_info { #define UVIO_ATT_ARCB_MAX_LEN 0x100000 #define UVIO_ATT_MEASUREMENT_MAX_LEN 0x8000 #define UVIO_ATT_ADDITIONAL_MAX_LEN 0x8000 +#define UVIO_ADD_SECRET_MAX_LEN 0x100000 #define UVIO_DEVICE_NAME "uv" #define UVIO_TYPE_UVC 'u' @@ -76,6 +77,7 @@ struct uvio_uvdev_info { enum UVIO_IOCTL_NR { UVIO_IOCTL_UVDEV_INFO_NR = 0x00, UVIO_IOCTL_ATT_NR, + UVIO_IOCTL_ADD_SECRET_NR, /* must be the last entry */ UVIO_IOCTL_NUM_IOCTLS }; @@ -83,9 +85,11 @@ enum UVIO_IOCTL_NR { #define UVIO_IOCTL(nr) _IOWR(UVIO_TYPE_UVC, nr, struct uvio_ioctl_cb) #define UVIO_IOCTL_UVDEV_INFO UVIO_IOCTL(UVIO_IOCTL_UVDEV_INFO_NR) #define UVIO_IOCTL_ATT UVIO_IOCTL(UVIO_IOCTL_ATT_NR) +#define UVIO_IOCTL_ADD_SECRET UVIO_IOCTL(UVIO_IOCTL_ADD_SECRET_NR) #define UVIO_SUPP_CALL(nr) (1ULL << (nr)) #define UVIO_SUPP_UDEV_INFO UVIO_SUPP_CALL(UVIO_IOCTL_UDEV_INFO_NR) #define UVIO_SUPP_ATT UVIO_SUPP_CALL(UVIO_IOCTL_ATT_NR) +#define UVIO_SUPP_ADD_SECRET UVIO_SUPP_CALL(UVIO_IOCTL_ADD_SECRET_NR) #endif /* __S390_ASM_UVDEVICE_H */ diff --git a/drivers/s390/char/uvdevice.c b/drivers/s390/char/uvdevice.c index 7d7866be389b..7221e987703a 100644 --- a/drivers/s390/char/uvdevice.c +++ b/drivers/s390/char/uvdevice.c @@ -37,6 +37,7 @@ static const u32 ioctl_nr_to_uvc_bit[] __initconst = { [UVIO_IOCTL_UVDEV_INFO_NR] = BIT_UVIO_INTERNAL, [UVIO_IOCTL_ATT_NR] = BIT_UVC_CMD_RETR_ATTEST, + [UVIO_IOCTL_ADD_SECRET_NR] = BIT_UVC_CMD_ADD_SECRET, }; static_assert(ARRAY_SIZE(ioctl_nr_to_uvc_bit) == UVIO_IOCTL_NUM_IOCTLS); @@ -231,6 +232,65 @@ out: return ret; } +/** uvio_add_secret() - perform an Add Secret UVC + * + * @uv_ioctl: ioctl control block + * + * uvio_add_secret() performs the Add Secret Ultravisor Call. + * + * The given userspace argument address and size are verified to be + * valid but every other check is made by the Ultravisor + * (UV). Therefore UV errors won't result in a negative return + * value. The request is then copied to kernelspace, the UV-call is + * performed and the results are copied back to userspace. + * + * The argument has to point to an Add Secret Request Control Block + * which is an encrypted and cryptographically verified request that + * inserts a protected guest's secrets into the Ultravisor for later + * use. + * + * If the Add Secret UV facility is not present, UV will return + * invalid command rc. This won't be fenced in the driver and does not + * result in a negative return value. + * + * Context: might sleep + * + * Return: 0 on success or a negative error code on error. + */ +static int uvio_add_secret(struct uvio_ioctl_cb *uv_ioctl) +{ + void __user *user_buf_arg = (void __user *)uv_ioctl->argument_addr; + struct uv_cb_guest_addr uvcb = { + .header.len = sizeof(uvcb), + .header.cmd = UVC_CMD_ADD_SECRET, + }; + void *asrcb = NULL; + int ret; + + if (uv_ioctl->argument_len > UVIO_ADD_SECRET_MAX_LEN) + return -EINVAL; + if (uv_ioctl->argument_len == 0) + return -EINVAL; + + asrcb = kvzalloc(uv_ioctl->argument_len, GFP_KERNEL); + if (!asrcb) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(asrcb, user_buf_arg, uv_ioctl->argument_len)) + goto out; + + ret = 0; + uvcb.addr = (u64)asrcb; + uv_call_sched(0, (u64)&uvcb); + uv_ioctl->uv_rc = uvcb.header.rc; + uv_ioctl->uv_rrc = uvcb.header.rrc; + +out: + kvfree(asrcb); + return ret; +} + static int uvio_copy_and_check_ioctl(struct uvio_ioctl_cb *ioctl, void __user *argp, unsigned long cmd) { @@ -275,6 +335,9 @@ static long uvio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case UVIO_IOCTL_ATT_NR: ret = uvio_attestation(&uv_ioctl); break; + case UVIO_IOCTL_ADD_SECRET_NR: + ret = uvio_add_secret(&uv_ioctl); + break; default: ret = -ENOIOCTLCMD; break; -- cgit v1.2.3 From b96b3ce2720145bd981988443288fbc4bdf37854 Mon Sep 17 00:00:00 2001 From: Steffen Eiden Date: Thu, 15 Jun 2023 12:05:30 +0200 Subject: s390/uvdevice: Add 'List Secrets' UVC Userspace can call the List Secrets Ultravisor Call using IOCTLs on the uvdevice. The List Secrets UV call lists the identifier of the secrets in the UV secret store. The uvdevice is merely transporting the request from userspace to Ultravisor. It's neither checking nor manipulating the request or response data. Signed-off-by: Steffen Eiden Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20230615100533.3996107-5-seiden@linux.ibm.com Signed-off-by: Janosch Frank Message-Id: <20230615100533.3996107-5-seiden@linux.ibm.com> --- arch/s390/include/asm/uv.h | 3 ++ arch/s390/include/uapi/asm/uvdevice.h | 4 +++ drivers/s390/char/uvdevice.c | 52 +++++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+) diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index 1babc70ea5d4..3739c8f6a129 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -59,6 +59,7 @@ #define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001 #define UVC_CMD_RETR_ATTEST 0x1020 #define UVC_CMD_ADD_SECRET 0x1031 +#define UVC_CMD_LIST_SECRETS 0x1033 /* Bits in installed uv calls */ enum uv_cmds_inst { @@ -90,6 +91,7 @@ enum uv_cmds_inst { BIT_UVC_CMD_DUMP_COMPLETE = 27, BIT_UVC_CMD_RETR_ATTEST = 28, BIT_UVC_CMD_ADD_SECRET = 29, + BIT_UVC_CMD_LIST_SECRETS = 30, }; enum uv_feat_ind { @@ -298,6 +300,7 @@ struct uv_cb_dump_complete { * A common UV call struct for pv guests that contains a single address * Examples: * Add Secret + * List Secrets */ struct uv_cb_guest_addr { struct uv_cb_header header; diff --git a/arch/s390/include/uapi/asm/uvdevice.h b/arch/s390/include/uapi/asm/uvdevice.h index e77410226598..76045da44868 100644 --- a/arch/s390/include/uapi/asm/uvdevice.h +++ b/arch/s390/include/uapi/asm/uvdevice.h @@ -70,6 +70,7 @@ struct uvio_uvdev_info { #define UVIO_ATT_MEASUREMENT_MAX_LEN 0x8000 #define UVIO_ATT_ADDITIONAL_MAX_LEN 0x8000 #define UVIO_ADD_SECRET_MAX_LEN 0x100000 +#define UVIO_LIST_SECRETS_LEN 0x1000 #define UVIO_DEVICE_NAME "uv" #define UVIO_TYPE_UVC 'u' @@ -78,6 +79,7 @@ enum UVIO_IOCTL_NR { UVIO_IOCTL_UVDEV_INFO_NR = 0x00, UVIO_IOCTL_ATT_NR, UVIO_IOCTL_ADD_SECRET_NR, + UVIO_IOCTL_LIST_SECRETS_NR, /* must be the last entry */ UVIO_IOCTL_NUM_IOCTLS }; @@ -86,10 +88,12 @@ enum UVIO_IOCTL_NR { #define UVIO_IOCTL_UVDEV_INFO UVIO_IOCTL(UVIO_IOCTL_UVDEV_INFO_NR) #define UVIO_IOCTL_ATT UVIO_IOCTL(UVIO_IOCTL_ATT_NR) #define UVIO_IOCTL_ADD_SECRET UVIO_IOCTL(UVIO_IOCTL_ADD_SECRET_NR) +#define UVIO_IOCTL_LIST_SECRETS UVIO_IOCTL(UVIO_IOCTL_LIST_SECRETS_NR) #define UVIO_SUPP_CALL(nr) (1ULL << (nr)) #define UVIO_SUPP_UDEV_INFO UVIO_SUPP_CALL(UVIO_IOCTL_UDEV_INFO_NR) #define UVIO_SUPP_ATT UVIO_SUPP_CALL(UVIO_IOCTL_ATT_NR) #define UVIO_SUPP_ADD_SECRET UVIO_SUPP_CALL(UVIO_IOCTL_ADD_SECRET_NR) +#define UVIO_SUPP_LIST_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LIST_SECRETS_NR) #endif /* __S390_ASM_UVDEVICE_H */ diff --git a/drivers/s390/char/uvdevice.c b/drivers/s390/char/uvdevice.c index 7221e987703a..70502c4ec290 100644 --- a/drivers/s390/char/uvdevice.c +++ b/drivers/s390/char/uvdevice.c @@ -38,6 +38,7 @@ static const u32 ioctl_nr_to_uvc_bit[] __initconst = { [UVIO_IOCTL_UVDEV_INFO_NR] = BIT_UVIO_INTERNAL, [UVIO_IOCTL_ATT_NR] = BIT_UVC_CMD_RETR_ATTEST, [UVIO_IOCTL_ADD_SECRET_NR] = BIT_UVC_CMD_ADD_SECRET, + [UVIO_IOCTL_LIST_SECRETS_NR] = BIT_UVC_CMD_LIST_SECRETS, }; static_assert(ARRAY_SIZE(ioctl_nr_to_uvc_bit) == UVIO_IOCTL_NUM_IOCTLS); @@ -291,6 +292,54 @@ out: return ret; } +/** uvio_list_secrets() - perform a List Secret UVC + * @uv_ioctl: ioctl control block + * + * uvio_list_secrets() performs the List Secret Ultravisor Call. It verifies + * that the given userspace argument address is valid and its size is sane. + * Every other check is made by the Ultravisor (UV) and won't result in a + * negative return value. It builds the request, performs the UV-call, and + * copies the result to userspace. + * + * The argument specifies the location for the result of the UV-Call. + * + * If the List Secrets UV facility is not present, UV will return invalid + * command rc. This won't be fenced in the driver and does not result in a + * negative return value. + * + * Context: might sleep + * + * Return: 0 on success or a negative error code on error. + */ +static int uvio_list_secrets(struct uvio_ioctl_cb *uv_ioctl) +{ + void __user *user_buf_arg = (void __user *)uv_ioctl->argument_addr; + struct uv_cb_guest_addr uvcb = { + .header.len = sizeof(uvcb), + .header.cmd = UVC_CMD_LIST_SECRETS, + }; + void *secrets = NULL; + int ret = 0; + + if (uv_ioctl->argument_len != UVIO_LIST_SECRETS_LEN) + return -EINVAL; + + secrets = kvzalloc(UVIO_LIST_SECRETS_LEN, GFP_KERNEL); + if (!secrets) + return -ENOMEM; + + uvcb.addr = (u64)secrets; + uv_call_sched(0, (u64)&uvcb); + uv_ioctl->uv_rc = uvcb.header.rc; + uv_ioctl->uv_rrc = uvcb.header.rrc; + + if (copy_to_user(user_buf_arg, secrets, UVIO_LIST_SECRETS_LEN)) + ret = -EFAULT; + + kvfree(secrets); + return ret; +} + static int uvio_copy_and_check_ioctl(struct uvio_ioctl_cb *ioctl, void __user *argp, unsigned long cmd) { @@ -338,6 +387,9 @@ static long uvio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case UVIO_IOCTL_ADD_SECRET_NR: ret = uvio_add_secret(&uv_ioctl); break; + case UVIO_IOCTL_LIST_SECRETS_NR: + ret = uvio_list_secrets(&uv_ioctl); + break; default: ret = -ENOIOCTLCMD; break; -- cgit v1.2.3 From 2d8a26acaf88a9174bcd44b607c5aa2ca9f5718f Mon Sep 17 00:00:00 2001 From: Steffen Eiden Date: Thu, 15 Jun 2023 12:05:31 +0200 Subject: s390/uvdevice: Add 'Lock Secret Store' UVC Userspace can call the Lock Secret Store Ultravisor Call using IOCTLs on the uvdevice. The Lock Secret Store UV call disables all additions of secrets for the future. The uvdevice is merely transporting the request from userspace to the Ultravisor. Signed-off-by: Steffen Eiden Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20230615100533.3996107-6-seiden@linux.ibm.com Signed-off-by: Janosch Frank Message-Id: <20230615100533.3996107-6-seiden@linux.ibm.com> --- arch/s390/include/asm/uv.h | 2 ++ arch/s390/include/uapi/asm/uvdevice.h | 3 +++ drivers/s390/char/uvdevice.c | 39 +++++++++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index 3739c8f6a129..3203ffbdde6b 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -60,6 +60,7 @@ #define UVC_CMD_RETR_ATTEST 0x1020 #define UVC_CMD_ADD_SECRET 0x1031 #define UVC_CMD_LIST_SECRETS 0x1033 +#define UVC_CMD_LOCK_SECRETS 0x1034 /* Bits in installed uv calls */ enum uv_cmds_inst { @@ -92,6 +93,7 @@ enum uv_cmds_inst { BIT_UVC_CMD_RETR_ATTEST = 28, BIT_UVC_CMD_ADD_SECRET = 29, BIT_UVC_CMD_LIST_SECRETS = 30, + BIT_UVC_CMD_LOCK_SECRETS = 31, }; enum uv_feat_ind { diff --git a/arch/s390/include/uapi/asm/uvdevice.h b/arch/s390/include/uapi/asm/uvdevice.h index 76045da44868..b9c2f14a6af3 100644 --- a/arch/s390/include/uapi/asm/uvdevice.h +++ b/arch/s390/include/uapi/asm/uvdevice.h @@ -80,6 +80,7 @@ enum UVIO_IOCTL_NR { UVIO_IOCTL_ATT_NR, UVIO_IOCTL_ADD_SECRET_NR, UVIO_IOCTL_LIST_SECRETS_NR, + UVIO_IOCTL_LOCK_SECRETS_NR, /* must be the last entry */ UVIO_IOCTL_NUM_IOCTLS }; @@ -89,11 +90,13 @@ enum UVIO_IOCTL_NR { #define UVIO_IOCTL_ATT UVIO_IOCTL(UVIO_IOCTL_ATT_NR) #define UVIO_IOCTL_ADD_SECRET UVIO_IOCTL(UVIO_IOCTL_ADD_SECRET_NR) #define UVIO_IOCTL_LIST_SECRETS UVIO_IOCTL(UVIO_IOCTL_LIST_SECRETS_NR) +#define UVIO_IOCTL_LOCK_SECRETS UVIO_IOCTL(UVIO_IOCTL_LOCK_SECRETS_NR) #define UVIO_SUPP_CALL(nr) (1ULL << (nr)) #define UVIO_SUPP_UDEV_INFO UVIO_SUPP_CALL(UVIO_IOCTL_UDEV_INFO_NR) #define UVIO_SUPP_ATT UVIO_SUPP_CALL(UVIO_IOCTL_ATT_NR) #define UVIO_SUPP_ADD_SECRET UVIO_SUPP_CALL(UVIO_IOCTL_ADD_SECRET_NR) #define UVIO_SUPP_LIST_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LIST_SECRETS_NR) +#define UVIO_SUPP_LOCK_SECRETS UVIO_SUPP_CALL(UVIO_IOCTL_LOCK_SECRETS_NR) #endif /* __S390_ASM_UVDEVICE_H */ diff --git a/drivers/s390/char/uvdevice.c b/drivers/s390/char/uvdevice.c index 70502c4ec290..144cd2e03590 100644 --- a/drivers/s390/char/uvdevice.c +++ b/drivers/s390/char/uvdevice.c @@ -39,6 +39,7 @@ static const u32 ioctl_nr_to_uvc_bit[] __initconst = { [UVIO_IOCTL_ATT_NR] = BIT_UVC_CMD_RETR_ATTEST, [UVIO_IOCTL_ADD_SECRET_NR] = BIT_UVC_CMD_ADD_SECRET, [UVIO_IOCTL_LIST_SECRETS_NR] = BIT_UVC_CMD_LIST_SECRETS, + [UVIO_IOCTL_LOCK_SECRETS_NR] = BIT_UVC_CMD_LOCK_SECRETS, }; static_assert(ARRAY_SIZE(ioctl_nr_to_uvc_bit) == UVIO_IOCTL_NUM_IOCTLS); @@ -340,6 +341,41 @@ static int uvio_list_secrets(struct uvio_ioctl_cb *uv_ioctl) return ret; } +/** uvio_lock_secrets() - perform a Lock Secret Store UVC + * @uv_ioctl: ioctl control block + * + * uvio_lock_secrets() performs the Lock Secret Store Ultravisor Call. It + * performs the UV-call and copies the return codes to the ioctl control block. + * After this call was dispatched successfully every following Add Secret UVC + * and Lock Secrets UVC will fail with return code 0x102. + * + * The argument address and size must be 0. + * + * If the Lock Secrets UV facility is not present, UV will return invalid + * command rc. This won't be fenced in the driver and does not result in a + * negative return value. + * + * Context: might sleep + * + * Return: 0 on success or a negative error code on error. + */ +static int uvio_lock_secrets(struct uvio_ioctl_cb *ioctl) +{ + struct uv_cb_nodata uvcb = { + .header.len = sizeof(uvcb), + .header.cmd = UVC_CMD_LOCK_SECRETS, + }; + + if (ioctl->argument_addr || ioctl->argument_len) + return -EINVAL; + + uv_call(0, (u64)&uvcb); + ioctl->uv_rc = uvcb.header.rc; + ioctl->uv_rrc = uvcb.header.rrc; + + return 0; +} + static int uvio_copy_and_check_ioctl(struct uvio_ioctl_cb *ioctl, void __user *argp, unsigned long cmd) { @@ -390,6 +426,9 @@ static long uvio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case UVIO_IOCTL_LIST_SECRETS_NR: ret = uvio_list_secrets(&uv_ioctl); break; + case UVIO_IOCTL_LOCK_SECRETS_NR: + ret = uvio_lock_secrets(&uv_ioctl); + break; default: ret = -ENOIOCTLCMD; break; -- cgit v1.2.3 From 78d3326e725e8a8b17b8fe1a6beaf8e0ea04de04 Mon Sep 17 00:00:00 2001 From: Steffen Eiden Date: Thu, 15 Jun 2023 12:05:32 +0200 Subject: s390/uv: replace scnprintf with sysfs_emit Replace scnprintf(page, PAGE_SIZE, ...) with the page size aware sysfs_emit(buf, ...) which adds some sanity checks. Signed-off-by: Steffen Eiden Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20230615100533.3996107-7-seiden@linux.ibm.com Signed-off-by: Janosch Frank Message-Id: <20230615100533.3996107-7-seiden@linux.ibm.com> --- arch/s390/kernel/uv.c | 58 +++++++++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 32 deletions(-) diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index e320a382fa85..6a23a13d0dfc 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -468,13 +468,13 @@ EXPORT_SYMBOL_GPL(arch_make_page_accessible); #if defined(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) || IS_ENABLED(CONFIG_KVM) static ssize_t uv_query_facilities(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%lx\n%lx\n%lx\n%lx\n", - uv_info.inst_calls_list[0], - uv_info.inst_calls_list[1], - uv_info.inst_calls_list[2], - uv_info.inst_calls_list[3]); + return sysfs_emit(buf, "%lx\n%lx\n%lx\n%lx\n", + uv_info.inst_calls_list[0], + uv_info.inst_calls_list[1], + uv_info.inst_calls_list[2], + uv_info.inst_calls_list[3]); } static struct kobj_attribute uv_query_facilities_attr = @@ -499,30 +499,27 @@ static struct kobj_attribute uv_query_supp_se_hdr_pcf_attr = __ATTR(supp_se_hdr_pcf, 0444, uv_query_supp_se_hdr_pcf, NULL); static ssize_t uv_query_dump_cpu_len(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%lx\n", - uv_info.guest_cpu_stor_len); + return sysfs_emit(buf, "%lx\n", uv_info.guest_cpu_stor_len); } static struct kobj_attribute uv_query_dump_cpu_len_attr = __ATTR(uv_query_dump_cpu_len, 0444, uv_query_dump_cpu_len, NULL); static ssize_t uv_query_dump_storage_state_len(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%lx\n", - uv_info.conf_dump_storage_state_len); + return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_storage_state_len); } static struct kobj_attribute uv_query_dump_storage_state_len_attr = __ATTR(dump_storage_state_len, 0444, uv_query_dump_storage_state_len, NULL); static ssize_t uv_query_dump_finalize_len(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%lx\n", - uv_info.conf_dump_finalize_len); + return sysfs_emit(buf, "%lx\n", uv_info.conf_dump_finalize_len); } static struct kobj_attribute uv_query_dump_finalize_len_attr = @@ -538,48 +535,45 @@ static struct kobj_attribute uv_query_feature_indications_attr = __ATTR(feature_indications, 0444, uv_query_feature_indications, NULL); static ssize_t uv_query_max_guest_cpus(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%d\n", - uv_info.max_guest_cpu_id + 1); + return sysfs_emit(buf, "%d\n", uv_info.max_guest_cpu_id + 1); } static struct kobj_attribute uv_query_max_guest_cpus_attr = __ATTR(max_cpus, 0444, uv_query_max_guest_cpus, NULL); static ssize_t uv_query_max_guest_vms(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%d\n", - uv_info.max_num_sec_conf); + return sysfs_emit(buf, "%d\n", uv_info.max_num_sec_conf); } static struct kobj_attribute uv_query_max_guest_vms_attr = __ATTR(max_guests, 0444, uv_query_max_guest_vms, NULL); static ssize_t uv_query_max_guest_addr(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%lx\n", - uv_info.max_sec_stor_addr); + return sysfs_emit(buf, "%lx\n", uv_info.max_sec_stor_addr); } static struct kobj_attribute uv_query_max_guest_addr_attr = __ATTR(max_address, 0444, uv_query_max_guest_addr, NULL); static ssize_t uv_query_supp_att_req_hdr_ver(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%lx\n", uv_info.supp_att_req_hdr_ver); + return sysfs_emit(buf, "%lx\n", uv_info.supp_att_req_hdr_ver); } static struct kobj_attribute uv_query_supp_att_req_hdr_ver_attr = __ATTR(supp_att_req_hdr_ver, 0444, uv_query_supp_att_req_hdr_ver, NULL); static ssize_t uv_query_supp_att_pflags(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { - return scnprintf(page, PAGE_SIZE, "%lx\n", uv_info.supp_att_pflags); + return sysfs_emit(buf, "%lx\n", uv_info.supp_att_pflags); } static struct kobj_attribute uv_query_supp_att_pflags_attr = @@ -606,18 +600,18 @@ static struct attribute_group uv_query_attr_group = { }; static ssize_t uv_is_prot_virt_guest(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { int val = 0; #ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST val = prot_virt_guest; #endif - return scnprintf(page, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } static ssize_t uv_is_prot_virt_host(struct kobject *kobj, - struct kobj_attribute *attr, char *page) + struct kobj_attribute *attr, char *buf) { int val = 0; @@ -625,7 +619,7 @@ static ssize_t uv_is_prot_virt_host(struct kobject *kobj, val = prot_virt_host; #endif - return scnprintf(page, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } static struct kobj_attribute uv_prot_virt_guest = -- cgit v1.2.3 From db54dfc9f71cd2df7afd1e88535ef6099cb0333e Mon Sep 17 00:00:00 2001 From: Steffen Eiden Date: Thu, 15 Jun 2023 12:05:33 +0200 Subject: s390/uv: Update query for secret-UVCs Update the query struct such that secret-UVC related information can be parsed. Add sysfs files for these new values. 'supp_add_secret_req_ver' notes the supported versions for the Add Secret UVC. Bit 0 indicates that version 0x100 is supported, bit 1 indicates 0x200, and so on. 'supp_add_secret_pcf' notes the supported plaintext flags for the Add Secret UVC. 'supp_secret_types' notes the supported types of secrets. Bit 0 indicates secret type 1, bit 1 indicates type 2, and so on. 'max_secrets' notes the maximum amount of secrets the secret store can store per pv guest. Signed-off-by: Steffen Eiden Reviewed-by: Janosch Frank Link: https://lore.kernel.org/r/20230615100533.3996107-8-seiden@linux.ibm.com Signed-off-by: Janosch Frank Message-Id: <20230615100533.3996107-8-seiden@linux.ibm.com> --- arch/s390/boot/uv.c | 4 ++++ arch/s390/include/asm/uv.h | 13 +++++++++++-- arch/s390/kernel/uv.c | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c index 0a077c0a2056..1e66d2cbb096 100644 --- a/arch/s390/boot/uv.c +++ b/arch/s390/boot/uv.c @@ -47,6 +47,10 @@ void uv_query_info(void) uv_info.conf_dump_finalize_len = uvcb.conf_dump_finalize_len; uv_info.supp_att_req_hdr_ver = uvcb.supp_att_req_hdr_ver; uv_info.supp_att_pflags = uvcb.supp_att_pflags; + uv_info.supp_add_secret_req_ver = uvcb.supp_add_secret_req_ver; + uv_info.supp_add_secret_pcf = uvcb.supp_add_secret_pcf; + uv_info.supp_secret_types = uvcb.supp_secret_types; + uv_info.max_secrets = uvcb.max_secrets; } #ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index 3203ffbdde6b..d6bb2f4f78d1 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -123,7 +123,7 @@ struct uv_cb_qui { u32 reserved70[3]; /* 0x0070 */ u32 max_num_sec_conf; /* 0x007c */ u64 max_guest_stor_addr; /* 0x0080 */ - u8 reserved88[158 - 136]; /* 0x0088 */ + u8 reserved88[0x9e - 0x88]; /* 0x0088 */ u16 max_guest_cpu_id; /* 0x009e */ u64 uv_feature_indications; /* 0x00a0 */ u64 reserveda8; /* 0x00a8 */ @@ -135,7 +135,12 @@ struct uv_cb_qui { u64 reservedd8; /* 0x00d8 */ u64 supp_att_req_hdr_ver; /* 0x00e0 */ u64 supp_att_pflags; /* 0x00e8 */ - u8 reservedf0[256 - 240]; /* 0x00f0 */ + u64 reservedf0; /* 0x00f0 */ + u64 supp_add_secret_req_ver; /* 0x00f8 */ + u64 supp_add_secret_pcf; /* 0x0100 */ + u64 supp_secret_types; /* 0x0180 */ + u16 max_secrets; /* 0x0110 */ + u8 reserved112[0x120 - 0x112]; /* 0x0112 */ } __packed __aligned(8); /* Initialize Ultravisor */ @@ -384,6 +389,10 @@ struct uv_info { unsigned long conf_dump_finalize_len; unsigned long supp_att_req_hdr_ver; unsigned long supp_att_pflags; + unsigned long supp_add_secret_req_ver; + unsigned long supp_add_secret_pcf; + unsigned long supp_secret_types; + unsigned short max_secrets; }; extern struct uv_info uv_info; diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 6a23a13d0dfc..273a0281a189 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -579,6 +579,42 @@ static ssize_t uv_query_supp_att_pflags(struct kobject *kobj, static struct kobj_attribute uv_query_supp_att_pflags_attr = __ATTR(supp_att_pflags, 0444, uv_query_supp_att_pflags, NULL); +static ssize_t uv_query_supp_add_secret_req_ver(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_req_ver); +} + +static struct kobj_attribute uv_query_supp_add_secret_req_ver_attr = + __ATTR(supp_add_secret_req_ver, 0444, uv_query_supp_add_secret_req_ver, NULL); + +static ssize_t uv_query_supp_add_secret_pcf(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_add_secret_pcf); +} + +static struct kobj_attribute uv_query_supp_add_secret_pcf_attr = + __ATTR(supp_add_secret_pcf, 0444, uv_query_supp_add_secret_pcf, NULL); + +static ssize_t uv_query_supp_secret_types(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.supp_secret_types); +} + +static struct kobj_attribute uv_query_supp_secret_types_attr = + __ATTR(supp_secret_types, 0444, uv_query_supp_secret_types, NULL); + +static ssize_t uv_query_max_secrets(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%d\n", uv_info.max_secrets); +} + +static struct kobj_attribute uv_query_max_secrets_attr = + __ATTR(max_secrets, 0444, uv_query_max_secrets, NULL); + static struct attribute *uv_query_attrs[] = { &uv_query_facilities_attr.attr, &uv_query_feature_indications_attr.attr, @@ -592,6 +628,10 @@ static struct attribute *uv_query_attrs[] = { &uv_query_dump_cpu_len_attr.attr, &uv_query_supp_att_req_hdr_ver_attr.attr, &uv_query_supp_att_pflags_attr.attr, + &uv_query_supp_add_secret_req_ver_attr.attr, + &uv_query_supp_add_secret_pcf_attr.attr, + &uv_query_supp_secret_types_attr.attr, + &uv_query_max_secrets_attr.attr, NULL, }; -- cgit v1.2.3 From fb1273635f8c38df18483ffcd038a5b792dfcc94 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 16 Jun 2023 18:02:33 +0300 Subject: KVM: x86: Remove PRIx* definitions as they are solely for user space In the Linux kernel we do not support PRI.64 specifiers. Moreover they seem not to be used anyway here. Drop them. Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/r/20230616150233.83813-1-andriy.shevchenko@linux.intel.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/lapic.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e542cf285b51..58ee570db395 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -51,11 +51,6 @@ #define mod_64(x, y) ((x) % (y)) #endif -#define PRId64 "d" -#define PRIx64 "llx" -#define PRIu64 "u" -#define PRIo64 "o" - /* 14 is the version for Xeon and Pentium 8.4.8*/ #define APIC_VERSION 0x14UL #define LAPIC_MMIO_LENGTH (1 << 12) -- cgit v1.2.3 From 77cf33c17154b7f8151429f6ba32049afc49f9c3 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 15 Jun 2023 13:03:44 +0530 Subject: RISC-V: KVM: Implement guest external interrupt line management The RISC-V host will have one guest external interrupt line for each VS-level IMSICs associated with a HART. The guest external interrupt lines are per-HART resources and hypervisor can use HGEIE, HGEIP, and HIE CSRs to manage these guest external interrupt lines. Signed-off-by: Anup Patel Reviewed-by: Andrew Jones Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_aia.h | 10 ++ arch/riscv/kvm/aia.c | 244 +++++++++++++++++++++++++++++++++++++++ arch/riscv/kvm/main.c | 3 +- arch/riscv/kvm/vcpu.c | 2 + 4 files changed, 258 insertions(+), 1 deletion(-) diff --git a/arch/riscv/include/asm/kvm_aia.h b/arch/riscv/include/asm/kvm_aia.h index 1de0717112e5..0938e0cadf80 100644 --- a/arch/riscv/include/asm/kvm_aia.h +++ b/arch/riscv/include/asm/kvm_aia.h @@ -44,10 +44,15 @@ struct kvm_vcpu_aia { #define irqchip_in_kernel(k) ((k)->arch.aia.in_kernel) +extern unsigned int kvm_riscv_aia_nr_hgei; DECLARE_STATIC_KEY_FALSE(kvm_riscv_aia_available); #define kvm_riscv_aia_available() \ static_branch_unlikely(&kvm_riscv_aia_available) +static inline void kvm_riscv_vcpu_aia_imsic_release(struct kvm_vcpu *vcpu) +{ +} + #define KVM_RISCV_AIA_IMSIC_TOPEI (ISELECT_MASK + 1) static inline int kvm_riscv_vcpu_aia_imsic_rmw(struct kvm_vcpu *vcpu, unsigned long isel, @@ -119,6 +124,11 @@ static inline void kvm_riscv_aia_destroy_vm(struct kvm *kvm) { } +int kvm_riscv_aia_alloc_hgei(int cpu, struct kvm_vcpu *owner, + void __iomem **hgei_va, phys_addr_t *hgei_pa); +void kvm_riscv_aia_free_hgei(int cpu, int hgei); +void kvm_riscv_aia_wakeon_hgei(struct kvm_vcpu *owner, bool enable); + void kvm_riscv_aia_enable(void); void kvm_riscv_aia_disable(void); int kvm_riscv_aia_init(void); diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c index 4f1286fc7f17..1cee75a8c883 100644 --- a/arch/riscv/kvm/aia.c +++ b/arch/riscv/kvm/aia.c @@ -8,11 +8,47 @@ */ #include +#include +#include +#include #include +#include +#include #include +struct aia_hgei_control { + raw_spinlock_t lock; + unsigned long free_bitmap; + struct kvm_vcpu *owners[BITS_PER_LONG]; +}; +static DEFINE_PER_CPU(struct aia_hgei_control, aia_hgei); +static int hgei_parent_irq; + +unsigned int kvm_riscv_aia_nr_hgei; DEFINE_STATIC_KEY_FALSE(kvm_riscv_aia_available); +static int aia_find_hgei(struct kvm_vcpu *owner) +{ + int i, hgei; + unsigned long flags; + struct aia_hgei_control *hgctrl = get_cpu_ptr(&aia_hgei); + + raw_spin_lock_irqsave(&hgctrl->lock, flags); + + hgei = -1; + for (i = 1; i <= kvm_riscv_aia_nr_hgei; i++) { + if (hgctrl->owners[i] == owner) { + hgei = i; + break; + } + } + + raw_spin_unlock_irqrestore(&hgctrl->lock, flags); + + put_cpu_ptr(&aia_hgei); + return hgei; +} + static void aia_set_hvictl(bool ext_irq_pending) { unsigned long hvictl; @@ -56,6 +92,7 @@ void kvm_riscv_vcpu_aia_sync_interrupts(struct kvm_vcpu *vcpu) bool kvm_riscv_vcpu_aia_has_interrupts(struct kvm_vcpu *vcpu, u64 mask) { + int hgei; unsigned long seip; if (!kvm_riscv_aia_available()) @@ -74,6 +111,10 @@ bool kvm_riscv_vcpu_aia_has_interrupts(struct kvm_vcpu *vcpu, u64 mask) if (!kvm_riscv_aia_initialized(vcpu->kvm) || !seip) return false; + hgei = aia_find_hgei(vcpu); + if (hgei > 0) + return !!(csr_read(CSR_HGEIP) & BIT(hgei)); + return false; } @@ -348,6 +389,143 @@ int kvm_riscv_vcpu_aia_rmw_ireg(struct kvm_vcpu *vcpu, unsigned int csr_num, return KVM_INSN_EXIT_TO_USER_SPACE; } +int kvm_riscv_aia_alloc_hgei(int cpu, struct kvm_vcpu *owner, + void __iomem **hgei_va, phys_addr_t *hgei_pa) +{ + int ret = -ENOENT; + unsigned long flags; + struct aia_hgei_control *hgctrl = per_cpu_ptr(&aia_hgei, cpu); + + if (!kvm_riscv_aia_available() || !hgctrl) + return -ENODEV; + + raw_spin_lock_irqsave(&hgctrl->lock, flags); + + if (hgctrl->free_bitmap) { + ret = __ffs(hgctrl->free_bitmap); + hgctrl->free_bitmap &= ~BIT(ret); + hgctrl->owners[ret] = owner; + } + + raw_spin_unlock_irqrestore(&hgctrl->lock, flags); + + /* TODO: To be updated later by AIA in-kernel irqchip support */ + if (hgei_va) + *hgei_va = NULL; + if (hgei_pa) + *hgei_pa = 0; + + return ret; +} + +void kvm_riscv_aia_free_hgei(int cpu, int hgei) +{ + unsigned long flags; + struct aia_hgei_control *hgctrl = per_cpu_ptr(&aia_hgei, cpu); + + if (!kvm_riscv_aia_available() || !hgctrl) + return; + + raw_spin_lock_irqsave(&hgctrl->lock, flags); + + if (hgei > 0 && hgei <= kvm_riscv_aia_nr_hgei) { + if (!(hgctrl->free_bitmap & BIT(hgei))) { + hgctrl->free_bitmap |= BIT(hgei); + hgctrl->owners[hgei] = NULL; + } + } + + raw_spin_unlock_irqrestore(&hgctrl->lock, flags); +} + +void kvm_riscv_aia_wakeon_hgei(struct kvm_vcpu *owner, bool enable) +{ + int hgei; + + if (!kvm_riscv_aia_available()) + return; + + hgei = aia_find_hgei(owner); + if (hgei > 0) { + if (enable) + csr_set(CSR_HGEIE, BIT(hgei)); + else + csr_clear(CSR_HGEIE, BIT(hgei)); + } +} + +static irqreturn_t hgei_interrupt(int irq, void *dev_id) +{ + int i; + unsigned long hgei_mask, flags; + struct aia_hgei_control *hgctrl = get_cpu_ptr(&aia_hgei); + + hgei_mask = csr_read(CSR_HGEIP) & csr_read(CSR_HGEIE); + csr_clear(CSR_HGEIE, hgei_mask); + + raw_spin_lock_irqsave(&hgctrl->lock, flags); + + for_each_set_bit(i, &hgei_mask, BITS_PER_LONG) { + if (hgctrl->owners[i]) + kvm_vcpu_kick(hgctrl->owners[i]); + } + + raw_spin_unlock_irqrestore(&hgctrl->lock, flags); + + put_cpu_ptr(&aia_hgei); + return IRQ_HANDLED; +} + +static int aia_hgei_init(void) +{ + int cpu, rc; + struct irq_domain *domain; + struct aia_hgei_control *hgctrl; + + /* Initialize per-CPU guest external interrupt line management */ + for_each_possible_cpu(cpu) { + hgctrl = per_cpu_ptr(&aia_hgei, cpu); + raw_spin_lock_init(&hgctrl->lock); + if (kvm_riscv_aia_nr_hgei) { + hgctrl->free_bitmap = + BIT(kvm_riscv_aia_nr_hgei + 1) - 1; + hgctrl->free_bitmap &= ~BIT(0); + } else + hgctrl->free_bitmap = 0; + } + + /* Find INTC irq domain */ + domain = irq_find_matching_fwnode(riscv_get_intc_hwnode(), + DOMAIN_BUS_ANY); + if (!domain) { + kvm_err("unable to find INTC domain\n"); + return -ENOENT; + } + + /* Map per-CPU SGEI interrupt from INTC domain */ + hgei_parent_irq = irq_create_mapping(domain, IRQ_S_GEXT); + if (!hgei_parent_irq) { + kvm_err("unable to map SGEI IRQ\n"); + return -ENOMEM; + } + + /* Request per-CPU SGEI interrupt */ + rc = request_percpu_irq(hgei_parent_irq, hgei_interrupt, + "riscv-kvm", &aia_hgei); + if (rc) { + kvm_err("failed to request SGEI IRQ\n"); + return rc; + } + + return 0; +} + +static void aia_hgei_exit(void) +{ + /* Free per-CPU SGEI interrupt */ + free_percpu_irq(hgei_parent_irq, &aia_hgei); +} + void kvm_riscv_aia_enable(void) { if (!kvm_riscv_aia_available()) @@ -362,21 +540,82 @@ void kvm_riscv_aia_enable(void) csr_write(CSR_HVIPRIO1H, 0x0); csr_write(CSR_HVIPRIO2H, 0x0); #endif + + /* Enable per-CPU SGEI interrupt */ + enable_percpu_irq(hgei_parent_irq, + irq_get_trigger_type(hgei_parent_irq)); + csr_set(CSR_HIE, BIT(IRQ_S_GEXT)); } void kvm_riscv_aia_disable(void) { + int i; + unsigned long flags; + struct kvm_vcpu *vcpu; + struct aia_hgei_control *hgctrl; + if (!kvm_riscv_aia_available()) return; + hgctrl = get_cpu_ptr(&aia_hgei); + + /* Disable per-CPU SGEI interrupt */ + csr_clear(CSR_HIE, BIT(IRQ_S_GEXT)); + disable_percpu_irq(hgei_parent_irq); aia_set_hvictl(false); + + raw_spin_lock_irqsave(&hgctrl->lock, flags); + + for (i = 0; i <= kvm_riscv_aia_nr_hgei; i++) { + vcpu = hgctrl->owners[i]; + if (!vcpu) + continue; + + /* + * We release hgctrl->lock before notifying IMSIC + * so that we don't have lock ordering issues. + */ + raw_spin_unlock_irqrestore(&hgctrl->lock, flags); + + /* Notify IMSIC */ + kvm_riscv_vcpu_aia_imsic_release(vcpu); + + /* + * Wakeup VCPU if it was blocked so that it can + * run on other HARTs + */ + if (csr_read(CSR_HGEIE) & BIT(i)) { + csr_clear(CSR_HGEIE, BIT(i)); + kvm_vcpu_kick(vcpu); + } + + raw_spin_lock_irqsave(&hgctrl->lock, flags); + } + + raw_spin_unlock_irqrestore(&hgctrl->lock, flags); + + put_cpu_ptr(&aia_hgei); } int kvm_riscv_aia_init(void) { + int rc; + if (!riscv_isa_extension_available(NULL, SxAIA)) return -ENODEV; + /* Figure-out number of bits in HGEIE */ + csr_write(CSR_HGEIE, -1UL); + kvm_riscv_aia_nr_hgei = fls_long(csr_read(CSR_HGEIE)); + csr_write(CSR_HGEIE, 0); + if (kvm_riscv_aia_nr_hgei) + kvm_riscv_aia_nr_hgei--; + + /* Initialize guest external interrupt line management */ + rc = aia_hgei_init(); + if (rc) + return rc; + /* Enable KVM AIA support */ static_branch_enable(&kvm_riscv_aia_available); @@ -385,4 +624,9 @@ int kvm_riscv_aia_init(void) void kvm_riscv_aia_exit(void) { + if (!kvm_riscv_aia_available()) + return; + + /* Cleanup the HGEI state */ + aia_hgei_exit(); } diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index a7112d583637..48ae0d4b3932 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -116,7 +116,8 @@ static int __init riscv_kvm_init(void) kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits()); if (kvm_riscv_aia_available()) - kvm_info("AIA available\n"); + kvm_info("AIA available with %d guest external interrupts\n", + kvm_riscv_aia_nr_hgei); rc = kvm_init(sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (rc) { diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 8bd9f2a8a0b9..2db62c6c0d3e 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -250,10 +250,12 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { + kvm_riscv_aia_wakeon_hgei(vcpu, true); } void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { + kvm_riscv_aia_wakeon_hgei(vcpu, false); } int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From f7fec5ecc9b6a6dedf6e38c1d3e7cd6557df5514 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 15 Jun 2023 13:03:45 +0530 Subject: RISC-V: KVM: Add IMSIC related defines We add IMSIC related defines in a separate header so that different parts of KVM code can share it. Once AIA drivers are merged will have a common IMSIC header shared by both KVM and IRQCHIP driver. Signed-off-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_aia_imsic.h | 38 ++++++++++++++++++++++++++++++++++ arch/riscv/kvm/aia.c | 3 +-- 2 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 arch/riscv/include/asm/kvm_aia_imsic.h diff --git a/arch/riscv/include/asm/kvm_aia_imsic.h b/arch/riscv/include/asm/kvm_aia_imsic.h new file mode 100644 index 000000000000..da5881d2bde0 --- /dev/null +++ b/arch/riscv/include/asm/kvm_aia_imsic.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + * Copyright (C) 2022 Ventana Micro Systems Inc. + */ +#ifndef __KVM_RISCV_AIA_IMSIC_H +#define __KVM_RISCV_AIA_IMSIC_H + +#include +#include + +#define IMSIC_MMIO_PAGE_SHIFT 12 +#define IMSIC_MMIO_PAGE_SZ (1UL << IMSIC_MMIO_PAGE_SHIFT) +#define IMSIC_MMIO_PAGE_LE 0x00 +#define IMSIC_MMIO_PAGE_BE 0x04 + +#define IMSIC_MIN_ID 63 +#define IMSIC_MAX_ID 2048 + +#define IMSIC_EIDELIVERY 0x70 + +#define IMSIC_EITHRESHOLD 0x72 + +#define IMSIC_EIP0 0x80 +#define IMSIC_EIP63 0xbf +#define IMSIC_EIPx_BITS 32 + +#define IMSIC_EIE0 0xc0 +#define IMSIC_EIE63 0xff +#define IMSIC_EIEx_BITS 32 + +#define IMSIC_FIRST IMSIC_EIDELIVERY +#define IMSIC_LAST IMSIC_EIE63 + +#define IMSIC_MMIO_SETIPNUM_LE 0x00 +#define IMSIC_MMIO_SETIPNUM_BE 0x04 + +#endif diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c index 1cee75a8c883..c78c06d99e39 100644 --- a/arch/riscv/kvm/aia.c +++ b/arch/riscv/kvm/aia.c @@ -15,6 +15,7 @@ #include #include #include +#include struct aia_hgei_control { raw_spinlock_t lock; @@ -364,8 +365,6 @@ static int aia_rmw_iprio(struct kvm_vcpu *vcpu, unsigned int isel, return KVM_INSN_CONTINUE_NEXT_SEPC; } -#define IMSIC_FIRST 0x70 -#define IMSIC_LAST 0xff int kvm_riscv_vcpu_aia_rmw_ireg(struct kvm_vcpu *vcpu, unsigned int csr_num, unsigned long *val, unsigned long new_val, unsigned long wr_mask) -- cgit v1.2.3 From cf55201c7516d1c69859f1157deab02223285f82 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 15 Jun 2023 13:03:46 +0530 Subject: RISC-V: KVM: Add APLIC related defines We add APLIC related defines in a separate header so that different parts of KVM code can share it. Once AIA drivers are merged will have a common APLIC header shared by both KVM and IRQCHIP driver. Signed-off-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_aia_aplic.h | 58 ++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 arch/riscv/include/asm/kvm_aia_aplic.h diff --git a/arch/riscv/include/asm/kvm_aia_aplic.h b/arch/riscv/include/asm/kvm_aia_aplic.h new file mode 100644 index 000000000000..6dd1a4809ec1 --- /dev/null +++ b/arch/riscv/include/asm/kvm_aia_aplic.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + * Copyright (C) 2022 Ventana Micro Systems Inc. + */ +#ifndef __KVM_RISCV_AIA_IMSIC_H +#define __KVM_RISCV_AIA_IMSIC_H + +#include + +#define APLIC_MAX_IDC BIT(14) +#define APLIC_MAX_SOURCE 1024 + +#define APLIC_DOMAINCFG 0x0000 +#define APLIC_DOMAINCFG_RDONLY 0x80000000 +#define APLIC_DOMAINCFG_IE BIT(8) +#define APLIC_DOMAINCFG_DM BIT(2) +#define APLIC_DOMAINCFG_BE BIT(0) + +#define APLIC_SOURCECFG_BASE 0x0004 +#define APLIC_SOURCECFG_D BIT(10) +#define APLIC_SOURCECFG_CHILDIDX_MASK 0x000003ff +#define APLIC_SOURCECFG_SM_MASK 0x00000007 +#define APLIC_SOURCECFG_SM_INACTIVE 0x0 +#define APLIC_SOURCECFG_SM_DETACH 0x1 +#define APLIC_SOURCECFG_SM_EDGE_RISE 0x4 +#define APLIC_SOURCECFG_SM_EDGE_FALL 0x5 +#define APLIC_SOURCECFG_SM_LEVEL_HIGH 0x6 +#define APLIC_SOURCECFG_SM_LEVEL_LOW 0x7 + +#define APLIC_IRQBITS_PER_REG 32 + +#define APLIC_SETIP_BASE 0x1c00 +#define APLIC_SETIPNUM 0x1cdc + +#define APLIC_CLRIP_BASE 0x1d00 +#define APLIC_CLRIPNUM 0x1ddc + +#define APLIC_SETIE_BASE 0x1e00 +#define APLIC_SETIENUM 0x1edc + +#define APLIC_CLRIE_BASE 0x1f00 +#define APLIC_CLRIENUM 0x1fdc + +#define APLIC_SETIPNUM_LE 0x2000 +#define APLIC_SETIPNUM_BE 0x2004 + +#define APLIC_GENMSI 0x3000 + +#define APLIC_TARGET_BASE 0x3004 +#define APLIC_TARGET_HART_IDX_SHIFT 18 +#define APLIC_TARGET_HART_IDX_MASK 0x3fff +#define APLIC_TARGET_GUEST_IDX_SHIFT 12 +#define APLIC_TARGET_GUEST_IDX_MASK 0x3f +#define APLIC_TARGET_IPRIO_MASK 0xff +#define APLIC_TARGET_EIID_MASK 0x7ff + +#endif -- cgit v1.2.3 From f0607e6215b2329bb8f3117b34ccfc6447e14f22 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 15 Jun 2023 13:03:47 +0530 Subject: RISC-V: KVM: Set kvm_riscv_aia_nr_hgei to zero We hard-code the kvm_riscv_aia_nr_hgei to zero until IMSIC HW guest file support is added in KVM RISC-V. Signed-off-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/kvm/aia.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c index c78c06d99e39..3f97575707eb 100644 --- a/arch/riscv/kvm/aia.c +++ b/arch/riscv/kvm/aia.c @@ -408,7 +408,7 @@ int kvm_riscv_aia_alloc_hgei(int cpu, struct kvm_vcpu *owner, raw_spin_unlock_irqrestore(&hgctrl->lock, flags); - /* TODO: To be updated later by AIA in-kernel irqchip support */ + /* TODO: To be updated later by AIA IMSIC HW guest file support */ if (hgei_va) *hgei_va = NULL; if (hgei_pa) @@ -610,6 +610,14 @@ int kvm_riscv_aia_init(void) if (kvm_riscv_aia_nr_hgei) kvm_riscv_aia_nr_hgei--; + /* + * Number of usable HGEI lines should be minimum of per-HART + * IMSIC guest files and number of bits in HGEIE + * + * TODO: To be updated later by AIA IMSIC HW guest file support + */ + kvm_riscv_aia_nr_hgei = 0; + /* Initialize guest external interrupt line management */ rc = aia_hgei_init(); if (rc) -- cgit v1.2.3 From 00f918f61c56a46d9e09ce21b54b8c21f496c753 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 15 Jun 2023 13:03:48 +0530 Subject: RISC-V: KVM: Skeletal in-kernel AIA irqchip support To incrementally implement in-kernel AIA irqchip support, we first add minimal skeletal support which only compiles but does not provide any functionality. Signed-off-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_aia.h | 20 +++++++ arch/riscv/include/asm/kvm_host.h | 4 ++ arch/riscv/include/uapi/asm/kvm.h | 4 ++ arch/riscv/kvm/Kconfig | 4 ++ arch/riscv/kvm/aia.c | 8 +++ arch/riscv/kvm/vm.c | 118 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 158 insertions(+) diff --git a/arch/riscv/include/asm/kvm_aia.h b/arch/riscv/include/asm/kvm_aia.h index 0938e0cadf80..3bc0a0e47a15 100644 --- a/arch/riscv/include/asm/kvm_aia.h +++ b/arch/riscv/include/asm/kvm_aia.h @@ -45,6 +45,7 @@ struct kvm_vcpu_aia { #define irqchip_in_kernel(k) ((k)->arch.aia.in_kernel) extern unsigned int kvm_riscv_aia_nr_hgei; +extern unsigned int kvm_riscv_aia_max_ids; DECLARE_STATIC_KEY_FALSE(kvm_riscv_aia_available); #define kvm_riscv_aia_available() \ static_branch_unlikely(&kvm_riscv_aia_available) @@ -116,6 +117,25 @@ static inline void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu) { } +static inline int kvm_riscv_aia_inject_msi_by_id(struct kvm *kvm, + u32 hart_index, + u32 guest_index, u32 iid) +{ + return 0; +} + +static inline int kvm_riscv_aia_inject_msi(struct kvm *kvm, + struct kvm_msi *msi) +{ + return 0; +} + +static inline int kvm_riscv_aia_inject_irq(struct kvm *kvm, + unsigned int irq, bool level) +{ + return 0; +} + static inline void kvm_riscv_aia_init_vm(struct kvm *kvm) { } diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index ee0acccb1d3b..871432586a63 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -27,6 +27,8 @@ #define KVM_VCPU_MAX_FEATURES 0 +#define KVM_IRQCHIP_NUM_PINS 1024 + #define KVM_REQ_SLEEP \ KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(1) @@ -318,6 +320,8 @@ int kvm_riscv_gstage_vmid_init(struct kvm *kvm); bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid); void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu); +int kvm_riscv_setup_default_irq_routing(struct kvm *kvm, u32 lines); + void __kvm_riscv_unpriv_trap(void); unsigned long kvm_riscv_vcpu_unpriv_read(struct kvm_vcpu *vcpu, diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index f92790c9481a..332d4a274891 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -15,6 +15,7 @@ #include #include +#define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_READONLY_MEM #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 @@ -203,6 +204,9 @@ enum KVM_RISCV_SBI_EXT_ID { #define KVM_REG_RISCV_SBI_MULTI_REG_LAST \ KVM_REG_RISCV_SBI_MULTI_REG(KVM_RISCV_SBI_EXT_MAX - 1) +/* One single KVM irqchip, ie. the AIA */ +#define KVM_NR_IRQCHIPS 1 + #endif #endif /* __LINUX_KVM_RISCV_H */ diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig index 28891e583259..dfc237d7875b 100644 --- a/arch/riscv/kvm/Kconfig +++ b/arch/riscv/kvm/Kconfig @@ -21,6 +21,10 @@ config KVM tristate "Kernel-based Virtual Machine (KVM) support (EXPERIMENTAL)" depends on RISCV_SBI && MMU select HAVE_KVM_EVENTFD + select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQFD + select HAVE_KVM_IRQ_ROUTING + select HAVE_KVM_MSI select HAVE_KVM_VCPU_ASYNC_IOCTL select KVM_GENERIC_DIRTYLOG_READ_PROTECT select KVM_GENERIC_HARDWARE_ENABLING diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c index 3f97575707eb..18c442c15ff2 100644 --- a/arch/riscv/kvm/aia.c +++ b/arch/riscv/kvm/aia.c @@ -26,6 +26,7 @@ static DEFINE_PER_CPU(struct aia_hgei_control, aia_hgei); static int hgei_parent_irq; unsigned int kvm_riscv_aia_nr_hgei; +unsigned int kvm_riscv_aia_max_ids; DEFINE_STATIC_KEY_FALSE(kvm_riscv_aia_available); static int aia_find_hgei(struct kvm_vcpu *owner) @@ -618,6 +619,13 @@ int kvm_riscv_aia_init(void) */ kvm_riscv_aia_nr_hgei = 0; + /* + * Find number of guest MSI IDs + * + * TODO: To be updated later by AIA IMSIC HW guest file support + */ + kvm_riscv_aia_max_ids = IMSIC_MAX_ID; + /* Initialize guest external interrupt line management */ rc = aia_hgei_init(); if (rc) diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index 6ef15f78e80f..7e2b50c692c1 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -55,11 +55,129 @@ void kvm_arch_destroy_vm(struct kvm *kvm) kvm_riscv_aia_destroy_vm(kvm); } +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irql, + bool line_status) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + return kvm_riscv_aia_inject_irq(kvm, irql->irq, irql->level); +} + +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, + int level, bool line_status) +{ + struct kvm_msi msi; + + if (!level) + return -1; + + msi.address_lo = e->msi.address_lo; + msi.address_hi = e->msi.address_hi; + msi.data = e->msi.data; + msi.flags = e->msi.flags; + msi.devid = e->msi.devid; + + return kvm_riscv_aia_inject_msi(kvm, &msi); +} + +static int kvm_riscv_set_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, + int level, bool line_status) +{ + return kvm_riscv_aia_inject_irq(kvm, e->irqchip.pin, level); +} + +int kvm_riscv_setup_default_irq_routing(struct kvm *kvm, u32 lines) +{ + struct kvm_irq_routing_entry *ents; + int i, rc; + + ents = kcalloc(lines, sizeof(*ents), GFP_KERNEL); + if (!ents) + return -ENOMEM; + + for (i = 0; i < lines; i++) { + ents[i].gsi = i; + ents[i].type = KVM_IRQ_ROUTING_IRQCHIP; + ents[i].u.irqchip.irqchip = 0; + ents[i].u.irqchip.pin = i; + } + rc = kvm_set_irq_routing(kvm, ents, lines, 0); + kfree(ents); + + return rc; +} + +bool kvm_arch_can_set_irq_routing(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} + +int kvm_set_routing_entry(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int r = -EINVAL; + + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + e->set = kvm_riscv_set_irq; + e->irqchip.irqchip = ue->u.irqchip.irqchip; + e->irqchip.pin = ue->u.irqchip.pin; + if ((e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) || + (e->irqchip.irqchip >= KVM_NR_IRQCHIPS)) + goto out; + break; + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + e->msi.flags = ue->flags; + e->msi.devid = ue->u.msi.devid; + break; + default: + goto out; + } + r = 0; +out: + return r; +} + +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + if (!level) + return -EWOULDBLOCK; + + switch (e->type) { + case KVM_IRQ_ROUTING_MSI: + return kvm_set_msi(e, kvm, irq_source_id, level, line_status); + + case KVM_IRQ_ROUTING_IRQCHIP: + return kvm_riscv_set_irq(e, kvm, irq_source_id, + level, line_status); + } + + return -EWOULDBLOCK; +} + +bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} + int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) { int r; switch (ext) { + case KVM_CAP_IRQCHIP: + r = kvm_riscv_aia_available(); + break; case KVM_CAP_IOEVENTFD: case KVM_CAP_DEVICE_CTRL: case KVM_CAP_USER_MEMORY: -- cgit v1.2.3 From 89d01306e34d6ace24e9708cb443df0e53c06ce0 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 15 Jun 2023 13:03:49 +0530 Subject: RISC-V: KVM: Implement device interface for AIA irqchip We implement KVM device interface for in-kernel AIA irqchip so that user-space can use KVM device ioctls to create, configure, and destroy in-kernel AIA irqchip. Signed-off-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_aia.h | 132 +++++--- arch/riscv/include/uapi/asm/kvm.h | 45 +++ arch/riscv/kvm/Makefile | 1 + arch/riscv/kvm/aia.c | 11 + arch/riscv/kvm/aia_device.c | 623 ++++++++++++++++++++++++++++++++++++++ include/uapi/linux/kvm.h | 2 + 6 files changed, 772 insertions(+), 42 deletions(-) create mode 100644 arch/riscv/kvm/aia_device.c diff --git a/arch/riscv/include/asm/kvm_aia.h b/arch/riscv/include/asm/kvm_aia.h index 3bc0a0e47a15..a1281ebc9b92 100644 --- a/arch/riscv/include/asm/kvm_aia.h +++ b/arch/riscv/include/asm/kvm_aia.h @@ -20,6 +20,33 @@ struct kvm_aia { /* In-kernel irqchip initialized */ bool initialized; + + /* Virtualization mode (Emulation, HW Accelerated, or Auto) */ + u32 mode; + + /* Number of MSIs */ + u32 nr_ids; + + /* Number of wired IRQs */ + u32 nr_sources; + + /* Number of group bits in IMSIC address */ + u32 nr_group_bits; + + /* Position of group bits in IMSIC address */ + u32 nr_group_shift; + + /* Number of hart bits in IMSIC address */ + u32 nr_hart_bits; + + /* Number of guest bits in IMSIC address */ + u32 nr_guest_bits; + + /* Guest physical address of APLIC */ + gpa_t aplic_addr; + + /* Internal state of APLIC */ + void *aplic_state; }; struct kvm_vcpu_aia_csr { @@ -38,8 +65,19 @@ struct kvm_vcpu_aia { /* CPU AIA CSR context upon Guest VCPU reset */ struct kvm_vcpu_aia_csr guest_reset_csr; + + /* Guest physical address of IMSIC for this VCPU */ + gpa_t imsic_addr; + + /* HART index of IMSIC extacted from guest physical address */ + u32 hart_index; + + /* Internal state of IMSIC for this VCPU */ + void *imsic_state; }; +#define KVM_RISCV_AIA_UNDEF_ADDR (-1) + #define kvm_riscv_aia_initialized(k) ((k)->arch.aia.initialized) #define irqchip_in_kernel(k) ((k)->arch.aia.in_kernel) @@ -50,10 +88,17 @@ DECLARE_STATIC_KEY_FALSE(kvm_riscv_aia_available); #define kvm_riscv_aia_available() \ static_branch_unlikely(&kvm_riscv_aia_available) +extern struct kvm_device_ops kvm_riscv_aia_device_ops; + static inline void kvm_riscv_vcpu_aia_imsic_release(struct kvm_vcpu *vcpu) { } +static inline int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu) +{ + return 1; +} + #define KVM_RISCV_AIA_IMSIC_TOPEI (ISELECT_MASK + 1) static inline int kvm_riscv_vcpu_aia_imsic_rmw(struct kvm_vcpu *vcpu, unsigned long isel, @@ -64,6 +109,41 @@ static inline int kvm_riscv_vcpu_aia_imsic_rmw(struct kvm_vcpu *vcpu, return 0; } +static inline void kvm_riscv_vcpu_aia_imsic_reset(struct kvm_vcpu *vcpu) +{ +} + +static inline int kvm_riscv_vcpu_aia_imsic_inject(struct kvm_vcpu *vcpu, + u32 guest_index, u32 offset, + u32 iid) +{ + return 0; +} + +static inline int kvm_riscv_vcpu_aia_imsic_init(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static inline void kvm_riscv_vcpu_aia_imsic_cleanup(struct kvm_vcpu *vcpu) +{ +} + +static inline int kvm_riscv_aia_aplic_inject(struct kvm *kvm, + u32 source, bool level) +{ + return 0; +} + +static inline int kvm_riscv_aia_aplic_init(struct kvm *kvm) +{ + return 0; +} + +static inline void kvm_riscv_aia_aplic_cleanup(struct kvm *kvm) +{ +} + #ifdef CONFIG_32BIT void kvm_riscv_vcpu_aia_flush_interrupts(struct kvm_vcpu *vcpu); void kvm_riscv_vcpu_aia_sync_interrupts(struct kvm_vcpu *vcpu); @@ -99,50 +179,18 @@ int kvm_riscv_vcpu_aia_rmw_ireg(struct kvm_vcpu *vcpu, unsigned int csr_num, { .base = CSR_SIREG, .count = 1, .func = kvm_riscv_vcpu_aia_rmw_ireg }, \ { .base = CSR_STOPEI, .count = 1, .func = kvm_riscv_vcpu_aia_rmw_topei }, -static inline int kvm_riscv_vcpu_aia_update(struct kvm_vcpu *vcpu) -{ - return 1; -} - -static inline void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu) -{ -} - -static inline int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu) -{ - return 0; -} - -static inline void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu) -{ -} - -static inline int kvm_riscv_aia_inject_msi_by_id(struct kvm *kvm, - u32 hart_index, - u32 guest_index, u32 iid) -{ - return 0; -} - -static inline int kvm_riscv_aia_inject_msi(struct kvm *kvm, - struct kvm_msi *msi) -{ - return 0; -} +int kvm_riscv_vcpu_aia_update(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu); +int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu); -static inline int kvm_riscv_aia_inject_irq(struct kvm *kvm, - unsigned int irq, bool level) -{ - return 0; -} +int kvm_riscv_aia_inject_msi_by_id(struct kvm *kvm, u32 hart_index, + u32 guest_index, u32 iid); +int kvm_riscv_aia_inject_msi(struct kvm *kvm, struct kvm_msi *msi); +int kvm_riscv_aia_inject_irq(struct kvm *kvm, unsigned int irq, bool level); -static inline void kvm_riscv_aia_init_vm(struct kvm *kvm) -{ -} - -static inline void kvm_riscv_aia_destroy_vm(struct kvm *kvm) -{ -} +void kvm_riscv_aia_init_vm(struct kvm *kvm); +void kvm_riscv_aia_destroy_vm(struct kvm *kvm); int kvm_riscv_aia_alloc_hgei(int cpu, struct kvm_vcpu *owner, void __iomem **hgei_va, phys_addr_t *hgei_pa); diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index 332d4a274891..047c8fc5bd71 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -204,6 +204,51 @@ enum KVM_RISCV_SBI_EXT_ID { #define KVM_REG_RISCV_SBI_MULTI_REG_LAST \ KVM_REG_RISCV_SBI_MULTI_REG(KVM_RISCV_SBI_EXT_MAX - 1) +/* Device Control API: RISC-V AIA */ +#define KVM_DEV_RISCV_APLIC_ALIGN 0x1000 +#define KVM_DEV_RISCV_APLIC_SIZE 0x4000 +#define KVM_DEV_RISCV_APLIC_MAX_HARTS 0x4000 +#define KVM_DEV_RISCV_IMSIC_ALIGN 0x1000 +#define KVM_DEV_RISCV_IMSIC_SIZE 0x1000 + +#define KVM_DEV_RISCV_AIA_GRP_CONFIG 0 +#define KVM_DEV_RISCV_AIA_CONFIG_MODE 0 +#define KVM_DEV_RISCV_AIA_CONFIG_IDS 1 +#define KVM_DEV_RISCV_AIA_CONFIG_SRCS 2 +#define KVM_DEV_RISCV_AIA_CONFIG_GROUP_BITS 3 +#define KVM_DEV_RISCV_AIA_CONFIG_GROUP_SHIFT 4 +#define KVM_DEV_RISCV_AIA_CONFIG_HART_BITS 5 +#define KVM_DEV_RISCV_AIA_CONFIG_GUEST_BITS 6 + +/* + * Modes of RISC-V AIA device: + * 1) EMUL (aka Emulation): Trap-n-emulate IMSIC + * 2) HWACCEL (aka HW Acceleration): Virtualize IMSIC using IMSIC guest files + * 3) AUTO (aka Automatic): Virtualize IMSIC using IMSIC guest files whenever + * available otherwise fallback to trap-n-emulation + */ +#define KVM_DEV_RISCV_AIA_MODE_EMUL 0 +#define KVM_DEV_RISCV_AIA_MODE_HWACCEL 1 +#define KVM_DEV_RISCV_AIA_MODE_AUTO 2 + +#define KVM_DEV_RISCV_AIA_IDS_MIN 63 +#define KVM_DEV_RISCV_AIA_IDS_MAX 2048 +#define KVM_DEV_RISCV_AIA_SRCS_MAX 1024 +#define KVM_DEV_RISCV_AIA_GROUP_BITS_MAX 8 +#define KVM_DEV_RISCV_AIA_GROUP_SHIFT_MIN 24 +#define KVM_DEV_RISCV_AIA_GROUP_SHIFT_MAX 56 +#define KVM_DEV_RISCV_AIA_HART_BITS_MAX 16 +#define KVM_DEV_RISCV_AIA_GUEST_BITS_MAX 8 + +#define KVM_DEV_RISCV_AIA_GRP_ADDR 1 +#define KVM_DEV_RISCV_AIA_ADDR_APLIC 0 +#define KVM_DEV_RISCV_AIA_ADDR_IMSIC(__vcpu) (1 + (__vcpu)) +#define KVM_DEV_RISCV_AIA_ADDR_MAX \ + (1 + KVM_DEV_RISCV_APLIC_MAX_HARTS) + +#define KVM_DEV_RISCV_AIA_GRP_CTRL 2 +#define KVM_DEV_RISCV_AIA_CTRL_INIT 0 + /* One single KVM irqchip, ie. the AIA */ #define KVM_NR_IRQCHIPS 1 diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile index 8031b8912a0d..dd69ebe098bd 100644 --- a/arch/riscv/kvm/Makefile +++ b/arch/riscv/kvm/Makefile @@ -27,3 +27,4 @@ kvm-y += vcpu_sbi_hsm.o kvm-y += vcpu_timer.o kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_pmu.o vcpu_sbi_pmu.o kvm-y += aia.o +kvm-y += aia_device.o diff --git a/arch/riscv/kvm/aia.c b/arch/riscv/kvm/aia.c index 18c442c15ff2..585a3b42c52c 100644 --- a/arch/riscv/kvm/aia.c +++ b/arch/riscv/kvm/aia.c @@ -631,6 +631,14 @@ int kvm_riscv_aia_init(void) if (rc) return rc; + /* Register device operations */ + rc = kvm_register_device_ops(&kvm_riscv_aia_device_ops, + KVM_DEV_TYPE_RISCV_AIA); + if (rc) { + aia_hgei_exit(); + return rc; + } + /* Enable KVM AIA support */ static_branch_enable(&kvm_riscv_aia_available); @@ -642,6 +650,9 @@ void kvm_riscv_aia_exit(void) if (!kvm_riscv_aia_available()) return; + /* Unregister device operations */ + kvm_unregister_device_ops(KVM_DEV_TYPE_RISCV_AIA); + /* Cleanup the HGEI state */ aia_hgei_exit(); } diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c new file mode 100644 index 000000000000..7ab555121872 --- /dev/null +++ b/arch/riscv/kvm/aia_device.c @@ -0,0 +1,623 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + * Copyright (C) 2022 Ventana Micro Systems Inc. + * + * Authors: + * Anup Patel + */ + +#include +#include +#include +#include + +static void unlock_vcpus(struct kvm *kvm, int vcpu_lock_idx) +{ + struct kvm_vcpu *tmp_vcpu; + + for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) { + tmp_vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx); + mutex_unlock(&tmp_vcpu->mutex); + } +} + +static void unlock_all_vcpus(struct kvm *kvm) +{ + unlock_vcpus(kvm, atomic_read(&kvm->online_vcpus) - 1); +} + +static bool lock_all_vcpus(struct kvm *kvm) +{ + struct kvm_vcpu *tmp_vcpu; + unsigned long c; + + kvm_for_each_vcpu(c, tmp_vcpu, kvm) { + if (!mutex_trylock(&tmp_vcpu->mutex)) { + unlock_vcpus(kvm, c - 1); + return false; + } + } + + return true; +} + +static int aia_create(struct kvm_device *dev, u32 type) +{ + int ret; + unsigned long i; + struct kvm *kvm = dev->kvm; + struct kvm_vcpu *vcpu; + + if (irqchip_in_kernel(kvm)) + return -EEXIST; + + ret = -EBUSY; + if (!lock_all_vcpus(kvm)) + return ret; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.ran_atleast_once) + goto out_unlock; + } + ret = 0; + + kvm->arch.aia.in_kernel = true; + +out_unlock: + unlock_all_vcpus(kvm); + return ret; +} + +static void aia_destroy(struct kvm_device *dev) +{ + kfree(dev); +} + +static int aia_config(struct kvm *kvm, unsigned long type, + u32 *nr, bool write) +{ + struct kvm_aia *aia = &kvm->arch.aia; + + /* Writes can only be done before irqchip is initialized */ + if (write && kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + switch (type) { + case KVM_DEV_RISCV_AIA_CONFIG_MODE: + if (write) { + switch (*nr) { + case KVM_DEV_RISCV_AIA_MODE_EMUL: + break; + case KVM_DEV_RISCV_AIA_MODE_HWACCEL: + case KVM_DEV_RISCV_AIA_MODE_AUTO: + /* + * HW Acceleration and Auto modes only + * supported on host with non-zero guest + * external interrupts (i.e. non-zero + * VS-level IMSIC pages). + */ + if (!kvm_riscv_aia_nr_hgei) + return -EINVAL; + break; + default: + return -EINVAL; + }; + aia->mode = *nr; + } else + *nr = aia->mode; + break; + case KVM_DEV_RISCV_AIA_CONFIG_IDS: + if (write) { + if ((*nr < KVM_DEV_RISCV_AIA_IDS_MIN) || + (*nr >= KVM_DEV_RISCV_AIA_IDS_MAX) || + ((*nr & KVM_DEV_RISCV_AIA_IDS_MIN) != + KVM_DEV_RISCV_AIA_IDS_MIN) || + (kvm_riscv_aia_max_ids <= *nr)) + return -EINVAL; + aia->nr_ids = *nr; + } else + *nr = aia->nr_ids; + break; + case KVM_DEV_RISCV_AIA_CONFIG_SRCS: + if (write) { + if ((*nr >= KVM_DEV_RISCV_AIA_SRCS_MAX) || + (*nr >= kvm_riscv_aia_max_ids)) + return -EINVAL; + aia->nr_sources = *nr; + } else + *nr = aia->nr_sources; + break; + case KVM_DEV_RISCV_AIA_CONFIG_GROUP_BITS: + if (write) { + if (*nr >= KVM_DEV_RISCV_AIA_GROUP_BITS_MAX) + return -EINVAL; + aia->nr_group_bits = *nr; + } else + *nr = aia->nr_group_bits; + break; + case KVM_DEV_RISCV_AIA_CONFIG_GROUP_SHIFT: + if (write) { + if ((*nr < KVM_DEV_RISCV_AIA_GROUP_SHIFT_MIN) || + (*nr >= KVM_DEV_RISCV_AIA_GROUP_SHIFT_MAX)) + return -EINVAL; + aia->nr_group_shift = *nr; + } else + *nr = aia->nr_group_shift; + break; + case KVM_DEV_RISCV_AIA_CONFIG_HART_BITS: + if (write) { + if (*nr >= KVM_DEV_RISCV_AIA_HART_BITS_MAX) + return -EINVAL; + aia->nr_hart_bits = *nr; + } else + *nr = aia->nr_hart_bits; + break; + case KVM_DEV_RISCV_AIA_CONFIG_GUEST_BITS: + if (write) { + if (*nr >= KVM_DEV_RISCV_AIA_GUEST_BITS_MAX) + return -EINVAL; + aia->nr_guest_bits = *nr; + } else + *nr = aia->nr_guest_bits; + break; + default: + return -ENXIO; + }; + + return 0; +} + +static int aia_aplic_addr(struct kvm *kvm, u64 *addr, bool write) +{ + struct kvm_aia *aia = &kvm->arch.aia; + + if (write) { + /* Writes can only be done before irqchip is initialized */ + if (kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + if (*addr & (KVM_DEV_RISCV_APLIC_ALIGN - 1)) + return -EINVAL; + + aia->aplic_addr = *addr; + } else + *addr = aia->aplic_addr; + + return 0; +} + +static int aia_imsic_addr(struct kvm *kvm, u64 *addr, + unsigned long vcpu_idx, bool write) +{ + struct kvm_vcpu *vcpu; + struct kvm_vcpu_aia *vcpu_aia; + + vcpu = kvm_get_vcpu(kvm, vcpu_idx); + if (!vcpu) + return -EINVAL; + vcpu_aia = &vcpu->arch.aia_context; + + if (write) { + /* Writes can only be done before irqchip is initialized */ + if (kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + if (*addr & (KVM_DEV_RISCV_IMSIC_ALIGN - 1)) + return -EINVAL; + } + + mutex_lock(&vcpu->mutex); + if (write) + vcpu_aia->imsic_addr = *addr; + else + *addr = vcpu_aia->imsic_addr; + mutex_unlock(&vcpu->mutex); + + return 0; +} + +static gpa_t aia_imsic_ppn(struct kvm_aia *aia, gpa_t addr) +{ + u32 h, l; + gpa_t mask = 0; + + h = aia->nr_hart_bits + aia->nr_guest_bits + + IMSIC_MMIO_PAGE_SHIFT - 1; + mask = GENMASK_ULL(h, 0); + + if (aia->nr_group_bits) { + h = aia->nr_group_bits + aia->nr_group_shift - 1; + l = aia->nr_group_shift; + mask |= GENMASK_ULL(h, l); + } + + return (addr & ~mask) >> IMSIC_MMIO_PAGE_SHIFT; +} + +static u32 aia_imsic_hart_index(struct kvm_aia *aia, gpa_t addr) +{ + u32 hart, group = 0; + + hart = (addr >> (aia->nr_guest_bits + IMSIC_MMIO_PAGE_SHIFT)) & + GENMASK_ULL(aia->nr_hart_bits - 1, 0); + if (aia->nr_group_bits) + group = (addr >> aia->nr_group_shift) & + GENMASK_ULL(aia->nr_group_bits - 1, 0); + + return (group << aia->nr_hart_bits) | hart; +} + +static int aia_init(struct kvm *kvm) +{ + int ret, i; + unsigned long idx; + struct kvm_vcpu *vcpu; + struct kvm_vcpu_aia *vaia; + struct kvm_aia *aia = &kvm->arch.aia; + gpa_t base_ppn = KVM_RISCV_AIA_UNDEF_ADDR; + + /* Irqchip can be initialized only once */ + if (kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + /* We might be in the middle of creating a VCPU? */ + if (kvm->created_vcpus != atomic_read(&kvm->online_vcpus)) + return -EBUSY; + + /* Number of sources should be less than or equals number of IDs */ + if (aia->nr_ids < aia->nr_sources) + return -EINVAL; + + /* APLIC base is required for non-zero number of sources */ + if (aia->nr_sources && aia->aplic_addr == KVM_RISCV_AIA_UNDEF_ADDR) + return -EINVAL; + + /* Initialize APLIC */ + ret = kvm_riscv_aia_aplic_init(kvm); + if (ret) + return ret; + + /* Iterate over each VCPU */ + kvm_for_each_vcpu(idx, vcpu, kvm) { + vaia = &vcpu->arch.aia_context; + + /* IMSIC base is required */ + if (vaia->imsic_addr == KVM_RISCV_AIA_UNDEF_ADDR) { + ret = -EINVAL; + goto fail_cleanup_imsics; + } + + /* All IMSICs should have matching base PPN */ + if (base_ppn == KVM_RISCV_AIA_UNDEF_ADDR) + base_ppn = aia_imsic_ppn(aia, vaia->imsic_addr); + if (base_ppn != aia_imsic_ppn(aia, vaia->imsic_addr)) { + ret = -EINVAL; + goto fail_cleanup_imsics; + } + + /* Update HART index of the IMSIC based on IMSIC base */ + vaia->hart_index = aia_imsic_hart_index(aia, + vaia->imsic_addr); + + /* Initialize IMSIC for this VCPU */ + ret = kvm_riscv_vcpu_aia_imsic_init(vcpu); + if (ret) + goto fail_cleanup_imsics; + } + + /* Set the initialized flag */ + kvm->arch.aia.initialized = true; + + return 0; + +fail_cleanup_imsics: + for (i = idx - 1; i >= 0; i--) { + vcpu = kvm_get_vcpu(kvm, i); + if (!vcpu) + continue; + kvm_riscv_vcpu_aia_imsic_cleanup(vcpu); + } + kvm_riscv_aia_aplic_cleanup(kvm); + return ret; +} + +static int aia_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + u32 nr; + u64 addr; + int nr_vcpus, r = -ENXIO; + unsigned long type = (unsigned long)attr->attr; + void __user *uaddr = (void __user *)(long)attr->addr; + + switch (attr->group) { + case KVM_DEV_RISCV_AIA_GRP_CONFIG: + if (copy_from_user(&nr, uaddr, sizeof(nr))) + return -EFAULT; + + mutex_lock(&dev->kvm->lock); + r = aia_config(dev->kvm, type, &nr, true); + mutex_unlock(&dev->kvm->lock); + + break; + + case KVM_DEV_RISCV_AIA_GRP_ADDR: + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + nr_vcpus = atomic_read(&dev->kvm->online_vcpus); + mutex_lock(&dev->kvm->lock); + if (type == KVM_DEV_RISCV_AIA_ADDR_APLIC) + r = aia_aplic_addr(dev->kvm, &addr, true); + else if (type < KVM_DEV_RISCV_AIA_ADDR_IMSIC(nr_vcpus)) + r = aia_imsic_addr(dev->kvm, &addr, + type - KVM_DEV_RISCV_AIA_ADDR_IMSIC(0), true); + mutex_unlock(&dev->kvm->lock); + + break; + + case KVM_DEV_RISCV_AIA_GRP_CTRL: + switch (type) { + case KVM_DEV_RISCV_AIA_CTRL_INIT: + mutex_lock(&dev->kvm->lock); + r = aia_init(dev->kvm); + mutex_unlock(&dev->kvm->lock); + break; + } + + break; + } + + return r; +} + +static int aia_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + u32 nr; + u64 addr; + int nr_vcpus, r = -ENXIO; + void __user *uaddr = (void __user *)(long)attr->addr; + unsigned long type = (unsigned long)attr->attr; + + switch (attr->group) { + case KVM_DEV_RISCV_AIA_GRP_CONFIG: + if (copy_from_user(&nr, uaddr, sizeof(nr))) + return -EFAULT; + + mutex_lock(&dev->kvm->lock); + r = aia_config(dev->kvm, type, &nr, false); + mutex_unlock(&dev->kvm->lock); + if (r) + return r; + + if (copy_to_user(uaddr, &nr, sizeof(nr))) + return -EFAULT; + + break; + case KVM_DEV_RISCV_AIA_GRP_ADDR: + if (copy_from_user(&addr, uaddr, sizeof(addr))) + return -EFAULT; + + nr_vcpus = atomic_read(&dev->kvm->online_vcpus); + mutex_lock(&dev->kvm->lock); + if (type == KVM_DEV_RISCV_AIA_ADDR_APLIC) + r = aia_aplic_addr(dev->kvm, &addr, false); + else if (type < KVM_DEV_RISCV_AIA_ADDR_IMSIC(nr_vcpus)) + r = aia_imsic_addr(dev->kvm, &addr, + type - KVM_DEV_RISCV_AIA_ADDR_IMSIC(0), false); + mutex_unlock(&dev->kvm->lock); + if (r) + return r; + + if (copy_to_user(uaddr, &addr, sizeof(addr))) + return -EFAULT; + + break; + } + + return r; +} + +static int aia_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + int nr_vcpus; + + switch (attr->group) { + case KVM_DEV_RISCV_AIA_GRP_CONFIG: + switch (attr->attr) { + case KVM_DEV_RISCV_AIA_CONFIG_MODE: + case KVM_DEV_RISCV_AIA_CONFIG_IDS: + case KVM_DEV_RISCV_AIA_CONFIG_SRCS: + case KVM_DEV_RISCV_AIA_CONFIG_GROUP_BITS: + case KVM_DEV_RISCV_AIA_CONFIG_GROUP_SHIFT: + case KVM_DEV_RISCV_AIA_CONFIG_HART_BITS: + case KVM_DEV_RISCV_AIA_CONFIG_GUEST_BITS: + return 0; + } + break; + case KVM_DEV_RISCV_AIA_GRP_ADDR: + nr_vcpus = atomic_read(&dev->kvm->online_vcpus); + if (attr->attr == KVM_DEV_RISCV_AIA_ADDR_APLIC) + return 0; + else if (attr->attr < KVM_DEV_RISCV_AIA_ADDR_IMSIC(nr_vcpus)) + return 0; + break; + case KVM_DEV_RISCV_AIA_GRP_CTRL: + switch (attr->attr) { + case KVM_DEV_RISCV_AIA_CTRL_INIT: + return 0; + } + break; + } + + return -ENXIO; +} + +struct kvm_device_ops kvm_riscv_aia_device_ops = { + .name = "kvm-riscv-aia", + .create = aia_create, + .destroy = aia_destroy, + .set_attr = aia_set_attr, + .get_attr = aia_get_attr, + .has_attr = aia_has_attr, +}; + +int kvm_riscv_vcpu_aia_update(struct kvm_vcpu *vcpu) +{ + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(vcpu->kvm)) + return 1; + + /* Update the IMSIC HW state before entering guest mode */ + return kvm_riscv_vcpu_aia_imsic_update(vcpu); +} + +void kvm_riscv_vcpu_aia_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_aia_csr *csr = &vcpu->arch.aia_context.guest_csr; + struct kvm_vcpu_aia_csr *reset_csr = + &vcpu->arch.aia_context.guest_reset_csr; + + if (!kvm_riscv_aia_available()) + return; + memcpy(csr, reset_csr, sizeof(*csr)); + + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(vcpu->kvm)) + return; + + /* Reset the IMSIC context */ + kvm_riscv_vcpu_aia_imsic_reset(vcpu); +} + +int kvm_riscv_vcpu_aia_init(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_aia *vaia = &vcpu->arch.aia_context; + + if (!kvm_riscv_aia_available()) + return 0; + + /* + * We don't do any memory allocations over here because these + * will be done after AIA device is initialized by the user-space. + * + * Refer, aia_init() implementation for more details. + */ + + /* Initialize default values in AIA vcpu context */ + vaia->imsic_addr = KVM_RISCV_AIA_UNDEF_ADDR; + vaia->hart_index = vcpu->vcpu_idx; + + return 0; +} + +void kvm_riscv_vcpu_aia_deinit(struct kvm_vcpu *vcpu) +{ + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(vcpu->kvm)) + return; + + /* Cleanup IMSIC context */ + kvm_riscv_vcpu_aia_imsic_cleanup(vcpu); +} + +int kvm_riscv_aia_inject_msi_by_id(struct kvm *kvm, u32 hart_index, + u32 guest_index, u32 iid) +{ + unsigned long idx; + struct kvm_vcpu *vcpu; + + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + /* Inject MSI to matching VCPU */ + kvm_for_each_vcpu(idx, vcpu, kvm) { + if (vcpu->arch.aia_context.hart_index == hart_index) + return kvm_riscv_vcpu_aia_imsic_inject(vcpu, + guest_index, + 0, iid); + } + + return 0; +} + +int kvm_riscv_aia_inject_msi(struct kvm *kvm, struct kvm_msi *msi) +{ + gpa_t tppn, ippn; + unsigned long idx; + struct kvm_vcpu *vcpu; + u32 g, toff, iid = msi->data; + struct kvm_aia *aia = &kvm->arch.aia; + gpa_t target = (((gpa_t)msi->address_hi) << 32) | msi->address_lo; + + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + /* Convert target address to target PPN */ + tppn = target >> IMSIC_MMIO_PAGE_SHIFT; + + /* Extract and clear Guest ID from target PPN */ + g = tppn & (BIT(aia->nr_guest_bits) - 1); + tppn &= ~((gpa_t)(BIT(aia->nr_guest_bits) - 1)); + + /* Inject MSI to matching VCPU */ + kvm_for_each_vcpu(idx, vcpu, kvm) { + ippn = vcpu->arch.aia_context.imsic_addr >> + IMSIC_MMIO_PAGE_SHIFT; + if (ippn == tppn) { + toff = target & (IMSIC_MMIO_PAGE_SZ - 1); + return kvm_riscv_vcpu_aia_imsic_inject(vcpu, g, + toff, iid); + } + } + + return 0; +} + +int kvm_riscv_aia_inject_irq(struct kvm *kvm, unsigned int irq, bool level) +{ + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(kvm)) + return -EBUSY; + + /* Inject interrupt level change in APLIC */ + return kvm_riscv_aia_aplic_inject(kvm, irq, level); +} + +void kvm_riscv_aia_init_vm(struct kvm *kvm) +{ + struct kvm_aia *aia = &kvm->arch.aia; + + if (!kvm_riscv_aia_available()) + return; + + /* + * We don't do any memory allocations over here because these + * will be done after AIA device is initialized by the user-space. + * + * Refer, aia_init() implementation for more details. + */ + + /* Initialize default values in AIA global context */ + aia->mode = (kvm_riscv_aia_nr_hgei) ? + KVM_DEV_RISCV_AIA_MODE_AUTO : KVM_DEV_RISCV_AIA_MODE_EMUL; + aia->nr_ids = kvm_riscv_aia_max_ids - 1; + aia->nr_sources = 0; + aia->nr_group_bits = 0; + aia->nr_group_shift = KVM_DEV_RISCV_AIA_GROUP_SHIFT_MIN; + aia->nr_hart_bits = 0; + aia->nr_guest_bits = 0; + aia->aplic_addr = KVM_RISCV_AIA_UNDEF_ADDR; +} + +void kvm_riscv_aia_destroy_vm(struct kvm *kvm) +{ + /* Proceed only if AIA was initialized successfully */ + if (!kvm_riscv_aia_initialized(kvm)) + return; + + /* Cleanup APLIC context */ + kvm_riscv_aia_aplic_cleanup(kvm); +} diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 737318b1c1d9..27ccd07898e1 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1442,6 +1442,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_XIVE KVM_DEV_TYPE_XIVE KVM_DEV_TYPE_ARM_PV_TIME, #define KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_ARM_PV_TIME + KVM_DEV_TYPE_RISCV_AIA, +#define KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_MAX, }; -- cgit v1.2.3 From 74967aa208e257d0c26a7dd7dd93a8902b2203e4 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 15 Jun 2023 13:03:50 +0530 Subject: RISC-V: KVM: Add in-kernel emulation of AIA APLIC There is no virtualization support in AIA APLIC so we add in-kernel emulation of AIA APLIC which only supports MSI-mode (i.e. wired interrupts forwarded to AIA IMSIC as MSIs). Signed-off-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_aia.h | 17 +- arch/riscv/kvm/Makefile | 1 + arch/riscv/kvm/aia_aplic.c | 576 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 580 insertions(+), 14 deletions(-) create mode 100644 arch/riscv/kvm/aia_aplic.c diff --git a/arch/riscv/include/asm/kvm_aia.h b/arch/riscv/include/asm/kvm_aia.h index a1281ebc9b92..f6bd8523395f 100644 --- a/arch/riscv/include/asm/kvm_aia.h +++ b/arch/riscv/include/asm/kvm_aia.h @@ -129,20 +129,9 @@ static inline void kvm_riscv_vcpu_aia_imsic_cleanup(struct kvm_vcpu *vcpu) { } -static inline int kvm_riscv_aia_aplic_inject(struct kvm *kvm, - u32 source, bool level) -{ - return 0; -} - -static inline int kvm_riscv_aia_aplic_init(struct kvm *kvm) -{ - return 0; -} - -static inline void kvm_riscv_aia_aplic_cleanup(struct kvm *kvm) -{ -} +int kvm_riscv_aia_aplic_inject(struct kvm *kvm, u32 source, bool level); +int kvm_riscv_aia_aplic_init(struct kvm *kvm); +void kvm_riscv_aia_aplic_cleanup(struct kvm *kvm); #ifdef CONFIG_32BIT void kvm_riscv_vcpu_aia_flush_interrupts(struct kvm_vcpu *vcpu); diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile index dd69ebe098bd..94c43702c765 100644 --- a/arch/riscv/kvm/Makefile +++ b/arch/riscv/kvm/Makefile @@ -28,3 +28,4 @@ kvm-y += vcpu_timer.o kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_pmu.o vcpu_sbi_pmu.o kvm-y += aia.o kvm-y += aia_device.o +kvm-y += aia_aplic.o diff --git a/arch/riscv/kvm/aia_aplic.c b/arch/riscv/kvm/aia_aplic.c new file mode 100644 index 000000000000..eecd8f4abe21 --- /dev/null +++ b/arch/riscv/kvm/aia_aplic.c @@ -0,0 +1,576 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + * Copyright (C) 2022 Ventana Micro Systems Inc. + * + * Authors: + * Anup Patel + */ + +#include +#include +#include +#include +#include +#include + +struct aplic_irq { + raw_spinlock_t lock; + u32 sourcecfg; + u32 state; +#define APLIC_IRQ_STATE_PENDING BIT(0) +#define APLIC_IRQ_STATE_ENABLED BIT(1) +#define APLIC_IRQ_STATE_ENPEND (APLIC_IRQ_STATE_PENDING | \ + APLIC_IRQ_STATE_ENABLED) +#define APLIC_IRQ_STATE_INPUT BIT(8) + u32 target; +}; + +struct aplic { + struct kvm_io_device iodev; + + u32 domaincfg; + u32 genmsi; + + u32 nr_irqs; + u32 nr_words; + struct aplic_irq *irqs; +}; + +static u32 aplic_read_sourcecfg(struct aplic *aplic, u32 irq) +{ + u32 ret; + unsigned long flags; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return 0; + irqd = &aplic->irqs[irq]; + + raw_spin_lock_irqsave(&irqd->lock, flags); + ret = irqd->sourcecfg; + raw_spin_unlock_irqrestore(&irqd->lock, flags); + + return ret; +} + +static void aplic_write_sourcecfg(struct aplic *aplic, u32 irq, u32 val) +{ + unsigned long flags; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return; + irqd = &aplic->irqs[irq]; + + if (val & APLIC_SOURCECFG_D) + val = 0; + else + val &= APLIC_SOURCECFG_SM_MASK; + + raw_spin_lock_irqsave(&irqd->lock, flags); + irqd->sourcecfg = val; + raw_spin_unlock_irqrestore(&irqd->lock, flags); +} + +static u32 aplic_read_target(struct aplic *aplic, u32 irq) +{ + u32 ret; + unsigned long flags; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return 0; + irqd = &aplic->irqs[irq]; + + raw_spin_lock_irqsave(&irqd->lock, flags); + ret = irqd->target; + raw_spin_unlock_irqrestore(&irqd->lock, flags); + + return ret; +} + +static void aplic_write_target(struct aplic *aplic, u32 irq, u32 val) +{ + unsigned long flags; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return; + irqd = &aplic->irqs[irq]; + + val &= APLIC_TARGET_EIID_MASK | + (APLIC_TARGET_HART_IDX_MASK << APLIC_TARGET_HART_IDX_SHIFT) | + (APLIC_TARGET_GUEST_IDX_MASK << APLIC_TARGET_GUEST_IDX_SHIFT); + + raw_spin_lock_irqsave(&irqd->lock, flags); + irqd->target = val; + raw_spin_unlock_irqrestore(&irqd->lock, flags); +} + +static bool aplic_read_pending(struct aplic *aplic, u32 irq) +{ + bool ret; + unsigned long flags; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return false; + irqd = &aplic->irqs[irq]; + + raw_spin_lock_irqsave(&irqd->lock, flags); + ret = (irqd->state & APLIC_IRQ_STATE_PENDING) ? true : false; + raw_spin_unlock_irqrestore(&irqd->lock, flags); + + return ret; +} + +static void aplic_write_pending(struct aplic *aplic, u32 irq, bool pending) +{ + unsigned long flags, sm; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return; + irqd = &aplic->irqs[irq]; + + raw_spin_lock_irqsave(&irqd->lock, flags); + + sm = irqd->sourcecfg & APLIC_SOURCECFG_SM_MASK; + if (!pending && + ((sm == APLIC_SOURCECFG_SM_LEVEL_HIGH) || + (sm == APLIC_SOURCECFG_SM_LEVEL_LOW))) + goto skip_write_pending; + + if (pending) + irqd->state |= APLIC_IRQ_STATE_PENDING; + else + irqd->state &= ~APLIC_IRQ_STATE_PENDING; + +skip_write_pending: + raw_spin_unlock_irqrestore(&irqd->lock, flags); +} + +static bool aplic_read_enabled(struct aplic *aplic, u32 irq) +{ + bool ret; + unsigned long flags; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return false; + irqd = &aplic->irqs[irq]; + + raw_spin_lock_irqsave(&irqd->lock, flags); + ret = (irqd->state & APLIC_IRQ_STATE_ENABLED) ? true : false; + raw_spin_unlock_irqrestore(&irqd->lock, flags); + + return ret; +} + +static void aplic_write_enabled(struct aplic *aplic, u32 irq, bool enabled) +{ + unsigned long flags; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return; + irqd = &aplic->irqs[irq]; + + raw_spin_lock_irqsave(&irqd->lock, flags); + if (enabled) + irqd->state |= APLIC_IRQ_STATE_ENABLED; + else + irqd->state &= ~APLIC_IRQ_STATE_ENABLED; + raw_spin_unlock_irqrestore(&irqd->lock, flags); +} + +static bool aplic_read_input(struct aplic *aplic, u32 irq) +{ + bool ret; + unsigned long flags; + struct aplic_irq *irqd; + + if (!irq || aplic->nr_irqs <= irq) + return false; + irqd = &aplic->irqs[irq]; + + raw_spin_lock_irqsave(&irqd->lock, flags); + ret = (irqd->state & APLIC_IRQ_STATE_INPUT) ? true : false; + raw_spin_unlock_irqrestore(&irqd->lock, flags); + + return ret; +} + +static void aplic_inject_msi(struct kvm *kvm, u32 irq, u32 target) +{ + u32 hart_idx, guest_idx, eiid; + + hart_idx = target >> APLIC_TARGET_HART_IDX_SHIFT; + hart_idx &= APLIC_TARGET_HART_IDX_MASK; + guest_idx = target >> APLIC_TARGET_GUEST_IDX_SHIFT; + guest_idx &= APLIC_TARGET_GUEST_IDX_MASK; + eiid = target & APLIC_TARGET_EIID_MASK; + kvm_riscv_aia_inject_msi_by_id(kvm, hart_idx, guest_idx, eiid); +} + +static void aplic_update_irq_range(struct kvm *kvm, u32 first, u32 last) +{ + bool inject; + u32 irq, target; + unsigned long flags; + struct aplic_irq *irqd; + struct aplic *aplic = kvm->arch.aia.aplic_state; + + if (!(aplic->domaincfg & APLIC_DOMAINCFG_IE)) + return; + + for (irq = first; irq <= last; irq++) { + if (!irq || aplic->nr_irqs <= irq) + continue; + irqd = &aplic->irqs[irq]; + + raw_spin_lock_irqsave(&irqd->lock, flags); + + inject = false; + target = irqd->target; + if ((irqd->state & APLIC_IRQ_STATE_ENPEND) == + APLIC_IRQ_STATE_ENPEND) { + irqd->state &= ~APLIC_IRQ_STATE_PENDING; + inject = true; + } + + raw_spin_unlock_irqrestore(&irqd->lock, flags); + + if (inject) + aplic_inject_msi(kvm, irq, target); + } +} + +int kvm_riscv_aia_aplic_inject(struct kvm *kvm, u32 source, bool level) +{ + u32 target; + bool inject = false, ie; + unsigned long flags; + struct aplic_irq *irqd; + struct aplic *aplic = kvm->arch.aia.aplic_state; + + if (!aplic || !source || (aplic->nr_irqs <= source)) + return -ENODEV; + irqd = &aplic->irqs[source]; + ie = (aplic->domaincfg & APLIC_DOMAINCFG_IE) ? true : false; + + raw_spin_lock_irqsave(&irqd->lock, flags); + + if (irqd->sourcecfg & APLIC_SOURCECFG_D) + goto skip_unlock; + + switch (irqd->sourcecfg & APLIC_SOURCECFG_SM_MASK) { + case APLIC_SOURCECFG_SM_EDGE_RISE: + if (level && !(irqd->state & APLIC_IRQ_STATE_INPUT) && + !(irqd->state & APLIC_IRQ_STATE_PENDING)) + irqd->state |= APLIC_IRQ_STATE_PENDING; + break; + case APLIC_SOURCECFG_SM_EDGE_FALL: + if (!level && (irqd->state & APLIC_IRQ_STATE_INPUT) && + !(irqd->state & APLIC_IRQ_STATE_PENDING)) + irqd->state |= APLIC_IRQ_STATE_PENDING; + break; + case APLIC_SOURCECFG_SM_LEVEL_HIGH: + if (level && !(irqd->state & APLIC_IRQ_STATE_PENDING)) + irqd->state |= APLIC_IRQ_STATE_PENDING; + break; + case APLIC_SOURCECFG_SM_LEVEL_LOW: + if (!level && !(irqd->state & APLIC_IRQ_STATE_PENDING)) + irqd->state |= APLIC_IRQ_STATE_PENDING; + break; + } + + if (level) + irqd->state |= APLIC_IRQ_STATE_INPUT; + else + irqd->state &= ~APLIC_IRQ_STATE_INPUT; + + target = irqd->target; + if (ie && ((irqd->state & APLIC_IRQ_STATE_ENPEND) == + APLIC_IRQ_STATE_ENPEND)) { + irqd->state &= ~APLIC_IRQ_STATE_PENDING; + inject = true; + } + +skip_unlock: + raw_spin_unlock_irqrestore(&irqd->lock, flags); + + if (inject) + aplic_inject_msi(kvm, source, target); + + return 0; +} + +static u32 aplic_read_input_word(struct aplic *aplic, u32 word) +{ + u32 i, ret = 0; + + for (i = 0; i < 32; i++) + ret |= aplic_read_input(aplic, word * 32 + i) ? BIT(i) : 0; + + return ret; +} + +static u32 aplic_read_pending_word(struct aplic *aplic, u32 word) +{ + u32 i, ret = 0; + + for (i = 0; i < 32; i++) + ret |= aplic_read_pending(aplic, word * 32 + i) ? BIT(i) : 0; + + return ret; +} + +static void aplic_write_pending_word(struct aplic *aplic, u32 word, + u32 val, bool pending) +{ + u32 i; + + for (i = 0; i < 32; i++) { + if (val & BIT(i)) + aplic_write_pending(aplic, word * 32 + i, pending); + } +} + +static u32 aplic_read_enabled_word(struct aplic *aplic, u32 word) +{ + u32 i, ret = 0; + + for (i = 0; i < 32; i++) + ret |= aplic_read_enabled(aplic, word * 32 + i) ? BIT(i) : 0; + + return ret; +} + +static void aplic_write_enabled_word(struct aplic *aplic, u32 word, + u32 val, bool enabled) +{ + u32 i; + + for (i = 0; i < 32; i++) { + if (val & BIT(i)) + aplic_write_enabled(aplic, word * 32 + i, enabled); + } +} + +static int aplic_mmio_read_offset(struct kvm *kvm, gpa_t off, u32 *val32) +{ + u32 i; + struct aplic *aplic = kvm->arch.aia.aplic_state; + + if ((off & 0x3) != 0) + return -EOPNOTSUPP; + + if (off == APLIC_DOMAINCFG) { + *val32 = APLIC_DOMAINCFG_RDONLY | + aplic->domaincfg | APLIC_DOMAINCFG_DM; + } else if ((off >= APLIC_SOURCECFG_BASE) && + (off < (APLIC_SOURCECFG_BASE + (aplic->nr_irqs - 1) * 4))) { + i = ((off - APLIC_SOURCECFG_BASE) >> 2) + 1; + *val32 = aplic_read_sourcecfg(aplic, i); + } else if ((off >= APLIC_SETIP_BASE) && + (off < (APLIC_SETIP_BASE + aplic->nr_words * 4))) { + i = (off - APLIC_SETIP_BASE) >> 2; + *val32 = aplic_read_pending_word(aplic, i); + } else if (off == APLIC_SETIPNUM) { + *val32 = 0; + } else if ((off >= APLIC_CLRIP_BASE) && + (off < (APLIC_CLRIP_BASE + aplic->nr_words * 4))) { + i = (off - APLIC_CLRIP_BASE) >> 2; + *val32 = aplic_read_input_word(aplic, i); + } else if (off == APLIC_CLRIPNUM) { + *val32 = 0; + } else if ((off >= APLIC_SETIE_BASE) && + (off < (APLIC_SETIE_BASE + aplic->nr_words * 4))) { + i = (off - APLIC_SETIE_BASE) >> 2; + *val32 = aplic_read_enabled_word(aplic, i); + } else if (off == APLIC_SETIENUM) { + *val32 = 0; + } else if ((off >= APLIC_CLRIE_BASE) && + (off < (APLIC_CLRIE_BASE + aplic->nr_words * 4))) { + *val32 = 0; + } else if (off == APLIC_CLRIENUM) { + *val32 = 0; + } else if (off == APLIC_SETIPNUM_LE) { + *val32 = 0; + } else if (off == APLIC_SETIPNUM_BE) { + *val32 = 0; + } else if (off == APLIC_GENMSI) { + *val32 = aplic->genmsi; + } else if ((off >= APLIC_TARGET_BASE) && + (off < (APLIC_TARGET_BASE + (aplic->nr_irqs - 1) * 4))) { + i = ((off - APLIC_TARGET_BASE) >> 2) + 1; + *val32 = aplic_read_target(aplic, i); + } else + return -ENODEV; + + return 0; +} + +static int aplic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + if (len != 4) + return -EOPNOTSUPP; + + return aplic_mmio_read_offset(vcpu->kvm, + addr - vcpu->kvm->arch.aia.aplic_addr, + val); +} + +static int aplic_mmio_write_offset(struct kvm *kvm, gpa_t off, u32 val32) +{ + u32 i; + struct aplic *aplic = kvm->arch.aia.aplic_state; + + if ((off & 0x3) != 0) + return -EOPNOTSUPP; + + if (off == APLIC_DOMAINCFG) { + /* Only IE bit writeable */ + aplic->domaincfg = val32 & APLIC_DOMAINCFG_IE; + } else if ((off >= APLIC_SOURCECFG_BASE) && + (off < (APLIC_SOURCECFG_BASE + (aplic->nr_irqs - 1) * 4))) { + i = ((off - APLIC_SOURCECFG_BASE) >> 2) + 1; + aplic_write_sourcecfg(aplic, i, val32); + } else if ((off >= APLIC_SETIP_BASE) && + (off < (APLIC_SETIP_BASE + aplic->nr_words * 4))) { + i = (off - APLIC_SETIP_BASE) >> 2; + aplic_write_pending_word(aplic, i, val32, true); + } else if (off == APLIC_SETIPNUM) { + aplic_write_pending(aplic, val32, true); + } else if ((off >= APLIC_CLRIP_BASE) && + (off < (APLIC_CLRIP_BASE + aplic->nr_words * 4))) { + i = (off - APLIC_CLRIP_BASE) >> 2; + aplic_write_pending_word(aplic, i, val32, false); + } else if (off == APLIC_CLRIPNUM) { + aplic_write_pending(aplic, val32, false); + } else if ((off >= APLIC_SETIE_BASE) && + (off < (APLIC_SETIE_BASE + aplic->nr_words * 4))) { + i = (off - APLIC_SETIE_BASE) >> 2; + aplic_write_enabled_word(aplic, i, val32, true); + } else if (off == APLIC_SETIENUM) { + aplic_write_enabled(aplic, val32, true); + } else if ((off >= APLIC_CLRIE_BASE) && + (off < (APLIC_CLRIE_BASE + aplic->nr_words * 4))) { + i = (off - APLIC_CLRIE_BASE) >> 2; + aplic_write_enabled_word(aplic, i, val32, false); + } else if (off == APLIC_CLRIENUM) { + aplic_write_enabled(aplic, val32, false); + } else if (off == APLIC_SETIPNUM_LE) { + aplic_write_pending(aplic, val32, true); + } else if (off == APLIC_SETIPNUM_BE) { + aplic_write_pending(aplic, __swab32(val32), true); + } else if (off == APLIC_GENMSI) { + aplic->genmsi = val32 & ~(APLIC_TARGET_GUEST_IDX_MASK << + APLIC_TARGET_GUEST_IDX_SHIFT); + kvm_riscv_aia_inject_msi_by_id(kvm, + val32 >> APLIC_TARGET_HART_IDX_SHIFT, 0, + val32 & APLIC_TARGET_EIID_MASK); + } else if ((off >= APLIC_TARGET_BASE) && + (off < (APLIC_TARGET_BASE + (aplic->nr_irqs - 1) * 4))) { + i = ((off - APLIC_TARGET_BASE) >> 2) + 1; + aplic_write_target(aplic, i, val32); + } else + return -ENODEV; + + aplic_update_irq_range(kvm, 1, aplic->nr_irqs - 1); + + return 0; +} + +static int aplic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + if (len != 4) + return -EOPNOTSUPP; + + return aplic_mmio_write_offset(vcpu->kvm, + addr - vcpu->kvm->arch.aia.aplic_addr, + *((const u32 *)val)); +} + +static struct kvm_io_device_ops aplic_iodoev_ops = { + .read = aplic_mmio_read, + .write = aplic_mmio_write, +}; + +int kvm_riscv_aia_aplic_init(struct kvm *kvm) +{ + int i, ret = 0; + struct aplic *aplic; + + /* Do nothing if we have zero sources */ + if (!kvm->arch.aia.nr_sources) + return 0; + + /* Allocate APLIC global state */ + aplic = kzalloc(sizeof(*aplic), GFP_KERNEL); + if (!aplic) + return -ENOMEM; + kvm->arch.aia.aplic_state = aplic; + + /* Setup APLIC IRQs */ + aplic->nr_irqs = kvm->arch.aia.nr_sources + 1; + aplic->nr_words = DIV_ROUND_UP(aplic->nr_irqs, 32); + aplic->irqs = kcalloc(aplic->nr_irqs, + sizeof(*aplic->irqs), GFP_KERNEL); + if (!aplic->irqs) { + ret = -ENOMEM; + goto fail_free_aplic; + } + for (i = 0; i < aplic->nr_irqs; i++) + raw_spin_lock_init(&aplic->irqs[i].lock); + + /* Setup IO device */ + kvm_iodevice_init(&aplic->iodev, &aplic_iodoev_ops); + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, + kvm->arch.aia.aplic_addr, + KVM_DEV_RISCV_APLIC_SIZE, + &aplic->iodev); + mutex_unlock(&kvm->slots_lock); + if (ret) + goto fail_free_aplic_irqs; + + /* Setup default IRQ routing */ + ret = kvm_riscv_setup_default_irq_routing(kvm, aplic->nr_irqs); + if (ret) + goto fail_unreg_iodev; + + return 0; + +fail_unreg_iodev: + mutex_lock(&kvm->slots_lock); + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &aplic->iodev); + mutex_unlock(&kvm->slots_lock); +fail_free_aplic_irqs: + kfree(aplic->irqs); +fail_free_aplic: + kvm->arch.aia.aplic_state = NULL; + kfree(aplic); + return ret; +} + +void kvm_riscv_aia_aplic_cleanup(struct kvm *kvm) +{ + struct aplic *aplic = kvm->arch.aia.aplic_state; + + if (!aplic) + return; + + mutex_lock(&kvm->slots_lock); + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &aplic->iodev); + mutex_unlock(&kvm->slots_lock); + + kfree(aplic->irqs); + + kvm->arch.aia.aplic_state = NULL; + kfree(aplic); +} -- cgit v1.2.3 From 289a007b98b06d9ce4be24e2dda43f6821687e70 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 15 Jun 2023 13:03:51 +0530 Subject: RISC-V: KVM: Expose APLIC registers as attributes of AIA irqchip We expose APLIC registers as KVM device attributes of the in-kernel AIA irqchip device. This will allow KVM user-space to save/restore APLIC state using KVM device ioctls(). Signed-off-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_aia.h | 3 +++ arch/riscv/include/uapi/asm/kvm.h | 6 ++++++ arch/riscv/kvm/aia_aplic.c | 43 +++++++++++++++++++++++++++++++++++++++ arch/riscv/kvm/aia_device.c | 25 +++++++++++++++++++++++ 4 files changed, 77 insertions(+) diff --git a/arch/riscv/include/asm/kvm_aia.h b/arch/riscv/include/asm/kvm_aia.h index f6bd8523395f..ba939c0054aa 100644 --- a/arch/riscv/include/asm/kvm_aia.h +++ b/arch/riscv/include/asm/kvm_aia.h @@ -129,6 +129,9 @@ static inline void kvm_riscv_vcpu_aia_imsic_cleanup(struct kvm_vcpu *vcpu) { } +int kvm_riscv_aia_aplic_set_attr(struct kvm *kvm, unsigned long type, u32 v); +int kvm_riscv_aia_aplic_get_attr(struct kvm *kvm, unsigned long type, u32 *v); +int kvm_riscv_aia_aplic_has_attr(struct kvm *kvm, unsigned long type); int kvm_riscv_aia_aplic_inject(struct kvm *kvm, u32 source, bool level); int kvm_riscv_aia_aplic_init(struct kvm *kvm); void kvm_riscv_aia_aplic_cleanup(struct kvm *kvm); diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index 047c8fc5bd71..9ed822fc5589 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -249,6 +249,12 @@ enum KVM_RISCV_SBI_EXT_ID { #define KVM_DEV_RISCV_AIA_GRP_CTRL 2 #define KVM_DEV_RISCV_AIA_CTRL_INIT 0 +/* + * The device attribute type contains the memory mapped offset of the + * APLIC register (range 0x0000-0x3FFF) and it must be 4-byte aligned. + */ +#define KVM_DEV_RISCV_AIA_GRP_APLIC 3 + /* One single KVM irqchip, ie. the AIA */ #define KVM_NR_IRQCHIPS 1 diff --git a/arch/riscv/kvm/aia_aplic.c b/arch/riscv/kvm/aia_aplic.c index eecd8f4abe21..39e72aa016a4 100644 --- a/arch/riscv/kvm/aia_aplic.c +++ b/arch/riscv/kvm/aia_aplic.c @@ -501,6 +501,49 @@ static struct kvm_io_device_ops aplic_iodoev_ops = { .write = aplic_mmio_write, }; +int kvm_riscv_aia_aplic_set_attr(struct kvm *kvm, unsigned long type, u32 v) +{ + int rc; + + if (!kvm->arch.aia.aplic_state) + return -ENODEV; + + rc = aplic_mmio_write_offset(kvm, type, v); + if (rc) + return rc; + + return 0; +} + +int kvm_riscv_aia_aplic_get_attr(struct kvm *kvm, unsigned long type, u32 *v) +{ + int rc; + + if (!kvm->arch.aia.aplic_state) + return -ENODEV; + + rc = aplic_mmio_read_offset(kvm, type, v); + if (rc) + return rc; + + return 0; +} + +int kvm_riscv_aia_aplic_has_attr(struct kvm *kvm, unsigned long type) +{ + int rc; + u32 val; + + if (!kvm->arch.aia.aplic_state) + return -ENODEV; + + rc = aplic_mmio_read_offset(kvm, type, &val); + if (rc) + return rc; + + return 0; +} + int kvm_riscv_aia_aplic_init(struct kvm *kvm) { int i, ret = 0; diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c index 7ab555121872..c649ad6e8e0a 100644 --- a/arch/riscv/kvm/aia_device.c +++ b/arch/riscv/kvm/aia_device.c @@ -365,6 +365,15 @@ static int aia_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) break; } + break; + case KVM_DEV_RISCV_AIA_GRP_APLIC: + if (copy_from_user(&nr, uaddr, sizeof(nr))) + return -EFAULT; + + mutex_lock(&dev->kvm->lock); + r = kvm_riscv_aia_aplic_set_attr(dev->kvm, type, nr); + mutex_unlock(&dev->kvm->lock); + break; } @@ -412,6 +421,20 @@ static int aia_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) if (copy_to_user(uaddr, &addr, sizeof(addr))) return -EFAULT; + break; + case KVM_DEV_RISCV_AIA_GRP_APLIC: + if (copy_from_user(&nr, uaddr, sizeof(nr))) + return -EFAULT; + + mutex_lock(&dev->kvm->lock); + r = kvm_riscv_aia_aplic_get_attr(dev->kvm, type, &nr); + mutex_unlock(&dev->kvm->lock); + if (r) + return r; + + if (copy_to_user(uaddr, &nr, sizeof(nr))) + return -EFAULT; + break; } @@ -448,6 +471,8 @@ static int aia_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) return 0; } break; + case KVM_DEV_RISCV_AIA_GRP_APLIC: + return kvm_riscv_aia_aplic_has_attr(dev->kvm, attr->attr); } return -ENXIO; -- cgit v1.2.3 From db8b7e97d6137a28b2bfc07e591d54da268f0c36 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 15 Jun 2023 13:03:52 +0530 Subject: RISC-V: KVM: Add in-kernel virtualization of AIA IMSIC We can have AIA IMSIC support for both HS-level and VS-level but the VS-level IMSICs are optional. We use the VS-level IMSICs for Guest/VM whenever available otherwise we fallback to software emulation of AIA IMSIC. This patch adds in-kernel virtualization of AIA IMSIC. Signed-off-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_aia.h | 46 +- arch/riscv/kvm/Makefile | 1 + arch/riscv/kvm/aia_imsic.c | 914 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 925 insertions(+), 36 deletions(-) create mode 100644 arch/riscv/kvm/aia_imsic.c diff --git a/arch/riscv/include/asm/kvm_aia.h b/arch/riscv/include/asm/kvm_aia.h index ba939c0054aa..a4f6ebf90e31 100644 --- a/arch/riscv/include/asm/kvm_aia.h +++ b/arch/riscv/include/asm/kvm_aia.h @@ -90,44 +90,18 @@ DECLARE_STATIC_KEY_FALSE(kvm_riscv_aia_available); extern struct kvm_device_ops kvm_riscv_aia_device_ops; -static inline void kvm_riscv_vcpu_aia_imsic_release(struct kvm_vcpu *vcpu) -{ -} - -static inline int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu) -{ - return 1; -} +void kvm_riscv_vcpu_aia_imsic_release(struct kvm_vcpu *vcpu); +int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu); #define KVM_RISCV_AIA_IMSIC_TOPEI (ISELECT_MASK + 1) -static inline int kvm_riscv_vcpu_aia_imsic_rmw(struct kvm_vcpu *vcpu, - unsigned long isel, - unsigned long *val, - unsigned long new_val, - unsigned long wr_mask) -{ - return 0; -} - -static inline void kvm_riscv_vcpu_aia_imsic_reset(struct kvm_vcpu *vcpu) -{ -} - -static inline int kvm_riscv_vcpu_aia_imsic_inject(struct kvm_vcpu *vcpu, - u32 guest_index, u32 offset, - u32 iid) -{ - return 0; -} - -static inline int kvm_riscv_vcpu_aia_imsic_init(struct kvm_vcpu *vcpu) -{ - return 0; -} - -static inline void kvm_riscv_vcpu_aia_imsic_cleanup(struct kvm_vcpu *vcpu) -{ -} +int kvm_riscv_vcpu_aia_imsic_rmw(struct kvm_vcpu *vcpu, unsigned long isel, + unsigned long *val, unsigned long new_val, + unsigned long wr_mask); +void kvm_riscv_vcpu_aia_imsic_reset(struct kvm_vcpu *vcpu); +int kvm_riscv_vcpu_aia_imsic_inject(struct kvm_vcpu *vcpu, + u32 guest_index, u32 offset, u32 iid); +int kvm_riscv_vcpu_aia_imsic_init(struct kvm_vcpu *vcpu); +void kvm_riscv_vcpu_aia_imsic_cleanup(struct kvm_vcpu *vcpu); int kvm_riscv_aia_aplic_set_attr(struct kvm *kvm, unsigned long type, u32 v); int kvm_riscv_aia_aplic_get_attr(struct kvm *kvm, unsigned long type, u32 *v); diff --git a/arch/riscv/kvm/Makefile b/arch/riscv/kvm/Makefile index 94c43702c765..c1d1356387ff 100644 --- a/arch/riscv/kvm/Makefile +++ b/arch/riscv/kvm/Makefile @@ -29,3 +29,4 @@ kvm-$(CONFIG_RISCV_PMU_SBI) += vcpu_pmu.o vcpu_sbi_pmu.o kvm-y += aia.o kvm-y += aia_device.o kvm-y += aia_aplic.o +kvm-y += aia_imsic.o diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c new file mode 100644 index 000000000000..0e6ecf067757 --- /dev/null +++ b/arch/riscv/kvm/aia_imsic.c @@ -0,0 +1,914 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + * Copyright (C) 2022 Ventana Micro Systems Inc. + * + * Authors: + * Anup Patel + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IMSIC_MAX_EIX (IMSIC_MAX_ID / BITS_PER_TYPE(u64)) + +struct imsic_mrif_eix { + unsigned long eip[BITS_PER_TYPE(u64) / BITS_PER_LONG]; + unsigned long eie[BITS_PER_TYPE(u64) / BITS_PER_LONG]; +}; + +struct imsic_mrif { + struct imsic_mrif_eix eix[IMSIC_MAX_EIX]; + unsigned long eithreshold; + unsigned long eidelivery; +}; + +struct imsic { + struct kvm_io_device iodev; + + u32 nr_msis; + u32 nr_eix; + u32 nr_hw_eix; + + /* + * At any point in time, the register state is in + * one of the following places: + * + * 1) Hardware: IMSIC VS-file (vsfile_cpu >= 0) + * 2) Software: IMSIC SW-file (vsfile_cpu < 0) + */ + + /* IMSIC VS-file */ + rwlock_t vsfile_lock; + int vsfile_cpu; + int vsfile_hgei; + void __iomem *vsfile_va; + phys_addr_t vsfile_pa; + + /* IMSIC SW-file */ + struct imsic_mrif *swfile; + phys_addr_t swfile_pa; +}; + +#define imsic_vs_csr_read(__c) \ +({ \ + unsigned long __r; \ + csr_write(CSR_VSISELECT, __c); \ + __r = csr_read(CSR_VSIREG); \ + __r; \ +}) + +#define imsic_read_switchcase(__ireg) \ + case __ireg: \ + return imsic_vs_csr_read(__ireg); +#define imsic_read_switchcase_2(__ireg) \ + imsic_read_switchcase(__ireg + 0) \ + imsic_read_switchcase(__ireg + 1) +#define imsic_read_switchcase_4(__ireg) \ + imsic_read_switchcase_2(__ireg + 0) \ + imsic_read_switchcase_2(__ireg + 2) +#define imsic_read_switchcase_8(__ireg) \ + imsic_read_switchcase_4(__ireg + 0) \ + imsic_read_switchcase_4(__ireg + 4) +#define imsic_read_switchcase_16(__ireg) \ + imsic_read_switchcase_8(__ireg + 0) \ + imsic_read_switchcase_8(__ireg + 8) +#define imsic_read_switchcase_32(__ireg) \ + imsic_read_switchcase_16(__ireg + 0) \ + imsic_read_switchcase_16(__ireg + 16) +#define imsic_read_switchcase_64(__ireg) \ + imsic_read_switchcase_32(__ireg + 0) \ + imsic_read_switchcase_32(__ireg + 32) + +static unsigned long imsic_eix_read(int ireg) +{ + switch (ireg) { + imsic_read_switchcase_64(IMSIC_EIP0) + imsic_read_switchcase_64(IMSIC_EIE0) + }; + + return 0; +} + +#define imsic_vs_csr_swap(__c, __v) \ +({ \ + unsigned long __r; \ + csr_write(CSR_VSISELECT, __c); \ + __r = csr_swap(CSR_VSIREG, __v); \ + __r; \ +}) + +#define imsic_swap_switchcase(__ireg, __v) \ + case __ireg: \ + return imsic_vs_csr_swap(__ireg, __v); +#define imsic_swap_switchcase_2(__ireg, __v) \ + imsic_swap_switchcase(__ireg + 0, __v) \ + imsic_swap_switchcase(__ireg + 1, __v) +#define imsic_swap_switchcase_4(__ireg, __v) \ + imsic_swap_switchcase_2(__ireg + 0, __v) \ + imsic_swap_switchcase_2(__ireg + 2, __v) +#define imsic_swap_switchcase_8(__ireg, __v) \ + imsic_swap_switchcase_4(__ireg + 0, __v) \ + imsic_swap_switchcase_4(__ireg + 4, __v) +#define imsic_swap_switchcase_16(__ireg, __v) \ + imsic_swap_switchcase_8(__ireg + 0, __v) \ + imsic_swap_switchcase_8(__ireg + 8, __v) +#define imsic_swap_switchcase_32(__ireg, __v) \ + imsic_swap_switchcase_16(__ireg + 0, __v) \ + imsic_swap_switchcase_16(__ireg + 16, __v) +#define imsic_swap_switchcase_64(__ireg, __v) \ + imsic_swap_switchcase_32(__ireg + 0, __v) \ + imsic_swap_switchcase_32(__ireg + 32, __v) + +static unsigned long imsic_eix_swap(int ireg, unsigned long val) +{ + switch (ireg) { + imsic_swap_switchcase_64(IMSIC_EIP0, val) + imsic_swap_switchcase_64(IMSIC_EIE0, val) + }; + + return 0; +} + +#define imsic_vs_csr_write(__c, __v) \ +do { \ + csr_write(CSR_VSISELECT, __c); \ + csr_write(CSR_VSIREG, __v); \ +} while (0) + +#define imsic_write_switchcase(__ireg, __v) \ + case __ireg: \ + imsic_vs_csr_write(__ireg, __v); \ + break; +#define imsic_write_switchcase_2(__ireg, __v) \ + imsic_write_switchcase(__ireg + 0, __v) \ + imsic_write_switchcase(__ireg + 1, __v) +#define imsic_write_switchcase_4(__ireg, __v) \ + imsic_write_switchcase_2(__ireg + 0, __v) \ + imsic_write_switchcase_2(__ireg + 2, __v) +#define imsic_write_switchcase_8(__ireg, __v) \ + imsic_write_switchcase_4(__ireg + 0, __v) \ + imsic_write_switchcase_4(__ireg + 4, __v) +#define imsic_write_switchcase_16(__ireg, __v) \ + imsic_write_switchcase_8(__ireg + 0, __v) \ + imsic_write_switchcase_8(__ireg + 8, __v) +#define imsic_write_switchcase_32(__ireg, __v) \ + imsic_write_switchcase_16(__ireg + 0, __v) \ + imsic_write_switchcase_16(__ireg + 16, __v) +#define imsic_write_switchcase_64(__ireg, __v) \ + imsic_write_switchcase_32(__ireg + 0, __v) \ + imsic_write_switchcase_32(__ireg + 32, __v) + +static void imsic_eix_write(int ireg, unsigned long val) +{ + switch (ireg) { + imsic_write_switchcase_64(IMSIC_EIP0, val) + imsic_write_switchcase_64(IMSIC_EIE0, val) + }; +} + +#define imsic_vs_csr_set(__c, __v) \ +do { \ + csr_write(CSR_VSISELECT, __c); \ + csr_set(CSR_VSIREG, __v); \ +} while (0) + +#define imsic_set_switchcase(__ireg, __v) \ + case __ireg: \ + imsic_vs_csr_set(__ireg, __v); \ + break; +#define imsic_set_switchcase_2(__ireg, __v) \ + imsic_set_switchcase(__ireg + 0, __v) \ + imsic_set_switchcase(__ireg + 1, __v) +#define imsic_set_switchcase_4(__ireg, __v) \ + imsic_set_switchcase_2(__ireg + 0, __v) \ + imsic_set_switchcase_2(__ireg + 2, __v) +#define imsic_set_switchcase_8(__ireg, __v) \ + imsic_set_switchcase_4(__ireg + 0, __v) \ + imsic_set_switchcase_4(__ireg + 4, __v) +#define imsic_set_switchcase_16(__ireg, __v) \ + imsic_set_switchcase_8(__ireg + 0, __v) \ + imsic_set_switchcase_8(__ireg + 8, __v) +#define imsic_set_switchcase_32(__ireg, __v) \ + imsic_set_switchcase_16(__ireg + 0, __v) \ + imsic_set_switchcase_16(__ireg + 16, __v) +#define imsic_set_switchcase_64(__ireg, __v) \ + imsic_set_switchcase_32(__ireg + 0, __v) \ + imsic_set_switchcase_32(__ireg + 32, __v) + +static void imsic_eix_set(int ireg, unsigned long val) +{ + switch (ireg) { + imsic_set_switchcase_64(IMSIC_EIP0, val) + imsic_set_switchcase_64(IMSIC_EIE0, val) + }; +} + +static unsigned long imsic_mrif_atomic_rmw(struct imsic_mrif *mrif, + unsigned long *ptr, + unsigned long new_val, + unsigned long wr_mask) +{ + unsigned long old_val = 0, tmp = 0; + + __asm__ __volatile__ ( + "0: lr.w.aq %1, %0\n" + " and %2, %1, %3\n" + " or %2, %2, %4\n" + " sc.w.rl %2, %2, %0\n" + " bnez %2, 0b" + : "+A" (*ptr), "+r" (old_val), "+r" (tmp) + : "r" (~wr_mask), "r" (new_val & wr_mask) + : "memory"); + + return old_val; +} + +static unsigned long imsic_mrif_atomic_or(struct imsic_mrif *mrif, + unsigned long *ptr, + unsigned long val) +{ + return atomic_long_fetch_or(val, (atomic_long_t *)ptr); +} + +#define imsic_mrif_atomic_write(__mrif, __ptr, __new_val) \ + imsic_mrif_atomic_rmw(__mrif, __ptr, __new_val, -1UL) +#define imsic_mrif_atomic_read(__mrif, __ptr) \ + imsic_mrif_atomic_or(__mrif, __ptr, 0) + +static u32 imsic_mrif_topei(struct imsic_mrif *mrif, u32 nr_eix, u32 nr_msis) +{ + struct imsic_mrif_eix *eix; + u32 i, imin, imax, ei, max_msi; + unsigned long eipend[BITS_PER_TYPE(u64) / BITS_PER_LONG]; + unsigned long eithreshold = imsic_mrif_atomic_read(mrif, + &mrif->eithreshold); + + max_msi = (eithreshold && (eithreshold <= nr_msis)) ? + eithreshold : nr_msis; + for (ei = 0; ei < nr_eix; ei++) { + eix = &mrif->eix[ei]; + eipend[0] = imsic_mrif_atomic_read(mrif, &eix->eie[0]) & + imsic_mrif_atomic_read(mrif, &eix->eip[0]); +#ifdef CONFIG_32BIT + eipend[1] = imsic_mrif_atomic_read(mrif, &eix->eie[1]) & + imsic_mrif_atomic_read(mrif, &eix->eip[1]); + if (!eipend[0] && !eipend[1]) +#else + if (!eipend[0]) +#endif + continue; + + imin = ei * BITS_PER_TYPE(u64); + imax = ((imin + BITS_PER_TYPE(u64)) < max_msi) ? + imin + BITS_PER_TYPE(u64) : max_msi; + for (i = (!imin) ? 1 : imin; i < imax; i++) { + if (test_bit(i - imin, eipend)) + return (i << TOPEI_ID_SHIFT) | i; + } + } + + return 0; +} + +static int imsic_mrif_rmw(struct imsic_mrif *mrif, u32 nr_eix, + unsigned long isel, unsigned long *val, + unsigned long new_val, unsigned long wr_mask) +{ + bool pend; + struct imsic_mrif_eix *eix; + unsigned long *ei, num, old_val = 0; + + switch (isel) { + case IMSIC_EIDELIVERY: + old_val = imsic_mrif_atomic_rmw(mrif, &mrif->eidelivery, + new_val, wr_mask & 0x1); + break; + case IMSIC_EITHRESHOLD: + old_val = imsic_mrif_atomic_rmw(mrif, &mrif->eithreshold, + new_val, wr_mask & (IMSIC_MAX_ID - 1)); + break; + case IMSIC_EIP0 ... IMSIC_EIP63: + case IMSIC_EIE0 ... IMSIC_EIE63: + if (isel >= IMSIC_EIP0 && isel <= IMSIC_EIP63) { + pend = true; + num = isel - IMSIC_EIP0; + } else { + pend = false; + num = isel - IMSIC_EIE0; + } + + if ((num / 2) >= nr_eix) + return -EINVAL; + eix = &mrif->eix[num / 2]; + +#ifndef CONFIG_32BIT + if (num & 0x1) + return -EINVAL; + ei = (pend) ? &eix->eip[0] : &eix->eie[0]; +#else + ei = (pend) ? &eix->eip[num & 0x1] : &eix->eie[num & 0x1]; +#endif + + /* Bit0 of EIP0 or EIE0 is read-only */ + if (!num) + wr_mask &= ~BIT(0); + + old_val = imsic_mrif_atomic_rmw(mrif, ei, new_val, wr_mask); + break; + default: + return -ENOENT; + }; + + if (val) + *val = old_val; + + return 0; +} + +struct imsic_vsfile_read_data { + int hgei; + u32 nr_eix; + bool clear; + struct imsic_mrif *mrif; +}; + +static void imsic_vsfile_local_read(void *data) +{ + u32 i; + struct imsic_mrif_eix *eix; + struct imsic_vsfile_read_data *idata = data; + struct imsic_mrif *mrif = idata->mrif; + unsigned long new_hstatus, old_hstatus, old_vsiselect; + + old_vsiselect = csr_read(CSR_VSISELECT); + old_hstatus = csr_read(CSR_HSTATUS); + new_hstatus = old_hstatus & ~HSTATUS_VGEIN; + new_hstatus |= ((unsigned long)idata->hgei) << HSTATUS_VGEIN_SHIFT; + csr_write(CSR_HSTATUS, new_hstatus); + + /* + * We don't use imsic_mrif_atomic_xyz() functions to store + * values in MRIF because imsic_vsfile_read() is always called + * with pointer to temporary MRIF on stack. + */ + + if (idata->clear) { + mrif->eidelivery = imsic_vs_csr_swap(IMSIC_EIDELIVERY, 0); + mrif->eithreshold = imsic_vs_csr_swap(IMSIC_EITHRESHOLD, 0); + for (i = 0; i < idata->nr_eix; i++) { + eix = &mrif->eix[i]; + eix->eip[0] = imsic_eix_swap(IMSIC_EIP0 + i * 2, 0); + eix->eie[0] = imsic_eix_swap(IMSIC_EIE0 + i * 2, 0); +#ifdef CONFIG_32BIT + eix->eip[1] = imsic_eix_swap(IMSIC_EIP0 + i * 2 + 1, 0); + eix->eie[1] = imsic_eix_swap(IMSIC_EIE0 + i * 2 + 1, 0); +#endif + } + } else { + mrif->eidelivery = imsic_vs_csr_read(IMSIC_EIDELIVERY); + mrif->eithreshold = imsic_vs_csr_read(IMSIC_EITHRESHOLD); + for (i = 0; i < idata->nr_eix; i++) { + eix = &mrif->eix[i]; + eix->eip[0] = imsic_eix_read(IMSIC_EIP0 + i * 2); + eix->eie[0] = imsic_eix_read(IMSIC_EIE0 + i * 2); +#ifdef CONFIG_32BIT + eix->eip[1] = imsic_eix_read(IMSIC_EIP0 + i * 2 + 1); + eix->eie[1] = imsic_eix_read(IMSIC_EIE0 + i * 2 + 1); +#endif + } + } + + csr_write(CSR_HSTATUS, old_hstatus); + csr_write(CSR_VSISELECT, old_vsiselect); +} + +static void imsic_vsfile_read(int vsfile_hgei, int vsfile_cpu, u32 nr_eix, + bool clear, struct imsic_mrif *mrif) +{ + struct imsic_vsfile_read_data idata; + + /* We can only read clear if we have a IMSIC VS-file */ + if (vsfile_cpu < 0 || vsfile_hgei <= 0) + return; + + /* We can only read clear on local CPU */ + idata.hgei = vsfile_hgei; + idata.nr_eix = nr_eix; + idata.clear = clear; + idata.mrif = mrif; + on_each_cpu_mask(cpumask_of(vsfile_cpu), + imsic_vsfile_local_read, &idata, 1); +} + +static void imsic_vsfile_local_clear(int vsfile_hgei, u32 nr_eix) +{ + u32 i; + unsigned long new_hstatus, old_hstatus, old_vsiselect; + + /* We can only zero-out if we have a IMSIC VS-file */ + if (vsfile_hgei <= 0) + return; + + old_vsiselect = csr_read(CSR_VSISELECT); + old_hstatus = csr_read(CSR_HSTATUS); + new_hstatus = old_hstatus & ~HSTATUS_VGEIN; + new_hstatus |= ((unsigned long)vsfile_hgei) << HSTATUS_VGEIN_SHIFT; + csr_write(CSR_HSTATUS, new_hstatus); + + imsic_vs_csr_write(IMSIC_EIDELIVERY, 0); + imsic_vs_csr_write(IMSIC_EITHRESHOLD, 0); + for (i = 0; i < nr_eix; i++) { + imsic_eix_write(IMSIC_EIP0 + i * 2, 0); + imsic_eix_write(IMSIC_EIE0 + i * 2, 0); +#ifdef CONFIG_32BIT + imsic_eix_write(IMSIC_EIP0 + i * 2 + 1, 0); + imsic_eix_write(IMSIC_EIE0 + i * 2 + 1, 0); +#endif + } + + csr_write(CSR_HSTATUS, old_hstatus); + csr_write(CSR_VSISELECT, old_vsiselect); +} + +static void imsic_vsfile_local_update(int vsfile_hgei, u32 nr_eix, + struct imsic_mrif *mrif) +{ + u32 i; + struct imsic_mrif_eix *eix; + unsigned long new_hstatus, old_hstatus, old_vsiselect; + + /* We can only update if we have a HW IMSIC context */ + if (vsfile_hgei <= 0) + return; + + /* + * We don't use imsic_mrif_atomic_xyz() functions to read values + * from MRIF in this function because it is always called with + * pointer to temporary MRIF on stack. + */ + + old_vsiselect = csr_read(CSR_VSISELECT); + old_hstatus = csr_read(CSR_HSTATUS); + new_hstatus = old_hstatus & ~HSTATUS_VGEIN; + new_hstatus |= ((unsigned long)vsfile_hgei) << HSTATUS_VGEIN_SHIFT; + csr_write(CSR_HSTATUS, new_hstatus); + + for (i = 0; i < nr_eix; i++) { + eix = &mrif->eix[i]; + imsic_eix_set(IMSIC_EIP0 + i * 2, eix->eip[0]); + imsic_eix_set(IMSIC_EIE0 + i * 2, eix->eie[0]); +#ifdef CONFIG_32BIT + imsic_eix_set(IMSIC_EIP0 + i * 2 + 1, eix->eip[1]); + imsic_eix_set(IMSIC_EIE0 + i * 2 + 1, eix->eie[1]); +#endif + } + imsic_vs_csr_write(IMSIC_EITHRESHOLD, mrif->eithreshold); + imsic_vs_csr_write(IMSIC_EIDELIVERY, mrif->eidelivery); + + csr_write(CSR_HSTATUS, old_hstatus); + csr_write(CSR_VSISELECT, old_vsiselect); +} + +static void imsic_vsfile_cleanup(struct imsic *imsic) +{ + int old_vsfile_hgei, old_vsfile_cpu; + unsigned long flags; + + /* + * We don't use imsic_mrif_atomic_xyz() functions to clear the + * SW-file in this function because it is always called when the + * VCPU is being destroyed. + */ + + write_lock_irqsave(&imsic->vsfile_lock, flags); + old_vsfile_hgei = imsic->vsfile_hgei; + old_vsfile_cpu = imsic->vsfile_cpu; + imsic->vsfile_cpu = imsic->vsfile_hgei = -1; + imsic->vsfile_va = NULL; + imsic->vsfile_pa = 0; + write_unlock_irqrestore(&imsic->vsfile_lock, flags); + + memset(imsic->swfile, 0, sizeof(*imsic->swfile)); + + if (old_vsfile_cpu >= 0) + kvm_riscv_aia_free_hgei(old_vsfile_cpu, old_vsfile_hgei); +} + +static void imsic_swfile_extirq_update(struct kvm_vcpu *vcpu) +{ + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + struct imsic_mrif *mrif = imsic->swfile; + + if (imsic_mrif_atomic_read(mrif, &mrif->eidelivery) && + imsic_mrif_topei(mrif, imsic->nr_eix, imsic->nr_msis)) + kvm_riscv_vcpu_set_interrupt(vcpu, IRQ_VS_EXT); + else + kvm_riscv_vcpu_unset_interrupt(vcpu, IRQ_VS_EXT); +} + +static void imsic_swfile_read(struct kvm_vcpu *vcpu, bool clear, + struct imsic_mrif *mrif) +{ + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + + /* + * We don't use imsic_mrif_atomic_xyz() functions to read and + * write SW-file and MRIF in this function because it is always + * called when VCPU is not using SW-file and the MRIF points to + * a temporary MRIF on stack. + */ + + memcpy(mrif, imsic->swfile, sizeof(*mrif)); + if (clear) { + memset(imsic->swfile, 0, sizeof(*imsic->swfile)); + kvm_riscv_vcpu_unset_interrupt(vcpu, IRQ_VS_EXT); + } +} + +static void imsic_swfile_update(struct kvm_vcpu *vcpu, + struct imsic_mrif *mrif) +{ + u32 i; + struct imsic_mrif_eix *seix, *eix; + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + struct imsic_mrif *smrif = imsic->swfile; + + imsic_mrif_atomic_write(smrif, &smrif->eidelivery, mrif->eidelivery); + imsic_mrif_atomic_write(smrif, &smrif->eithreshold, mrif->eithreshold); + for (i = 0; i < imsic->nr_eix; i++) { + seix = &smrif->eix[i]; + eix = &mrif->eix[i]; + imsic_mrif_atomic_or(smrif, &seix->eip[0], eix->eip[0]); + imsic_mrif_atomic_or(smrif, &seix->eie[0], eix->eie[0]); +#ifdef CONFIG_32BIT + imsic_mrif_atomic_or(smrif, &seix->eip[1], eix->eip[1]); + imsic_mrif_atomic_or(smrif, &seix->eie[1], eix->eie[1]); +#endif + } + + imsic_swfile_extirq_update(vcpu); +} + +void kvm_riscv_vcpu_aia_imsic_release(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + struct imsic_mrif tmrif; + int old_vsfile_hgei, old_vsfile_cpu; + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + + /* Read and clear IMSIC VS-file details */ + write_lock_irqsave(&imsic->vsfile_lock, flags); + old_vsfile_hgei = imsic->vsfile_hgei; + old_vsfile_cpu = imsic->vsfile_cpu; + imsic->vsfile_cpu = imsic->vsfile_hgei = -1; + imsic->vsfile_va = NULL; + imsic->vsfile_pa = 0; + write_unlock_irqrestore(&imsic->vsfile_lock, flags); + + /* Do nothing, if no IMSIC VS-file to release */ + if (old_vsfile_cpu < 0) + return; + + /* + * At this point, all interrupt producers are still using + * the old IMSIC VS-file so we first re-direct all interrupt + * producers. + */ + + /* Purge the G-stage mapping */ + kvm_riscv_gstage_iounmap(vcpu->kvm, + vcpu->arch.aia_context.imsic_addr, + IMSIC_MMIO_PAGE_SZ); + + /* TODO: Purge the IOMMU mapping ??? */ + + /* + * At this point, all interrupt producers have been re-directed + * to somewhere else so we move register state from the old IMSIC + * VS-file to the IMSIC SW-file. + */ + + /* Read and clear register state from old IMSIC VS-file */ + memset(&tmrif, 0, sizeof(tmrif)); + imsic_vsfile_read(old_vsfile_hgei, old_vsfile_cpu, imsic->nr_hw_eix, + true, &tmrif); + + /* Update register state in IMSIC SW-file */ + imsic_swfile_update(vcpu, &tmrif); + + /* Free-up old IMSIC VS-file */ + kvm_riscv_aia_free_hgei(old_vsfile_cpu, old_vsfile_hgei); +} + +int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu) +{ + unsigned long flags; + phys_addr_t new_vsfile_pa; + struct imsic_mrif tmrif; + void __iomem *new_vsfile_va; + struct kvm *kvm = vcpu->kvm; + struct kvm_run *run = vcpu->run; + struct kvm_vcpu_aia *vaia = &vcpu->arch.aia_context; + struct imsic *imsic = vaia->imsic_state; + int ret = 0, new_vsfile_hgei = -1, old_vsfile_hgei, old_vsfile_cpu; + + /* Do nothing for emulation mode */ + if (kvm->arch.aia.mode == KVM_DEV_RISCV_AIA_MODE_EMUL) + return 1; + + /* Read old IMSIC VS-file details */ + read_lock_irqsave(&imsic->vsfile_lock, flags); + old_vsfile_hgei = imsic->vsfile_hgei; + old_vsfile_cpu = imsic->vsfile_cpu; + read_unlock_irqrestore(&imsic->vsfile_lock, flags); + + /* Do nothing if we are continuing on same CPU */ + if (old_vsfile_cpu == vcpu->cpu) + return 1; + + /* Allocate new IMSIC VS-file */ + ret = kvm_riscv_aia_alloc_hgei(vcpu->cpu, vcpu, + &new_vsfile_va, &new_vsfile_pa); + if (ret <= 0) { + /* For HW acceleration mode, we can't continue */ + if (kvm->arch.aia.mode == KVM_DEV_RISCV_AIA_MODE_HWACCEL) { + run->fail_entry.hardware_entry_failure_reason = + CSR_HSTATUS; + run->fail_entry.cpu = vcpu->cpu; + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + return 0; + } + + /* Release old IMSIC VS-file */ + if (old_vsfile_cpu >= 0) + kvm_riscv_vcpu_aia_imsic_release(vcpu); + + /* For automatic mode, we continue */ + goto done; + } + new_vsfile_hgei = ret; + + /* + * At this point, all interrupt producers are still using + * to the old IMSIC VS-file so we first move all interrupt + * producers to the new IMSIC VS-file. + */ + + /* Zero-out new IMSIC VS-file */ + imsic_vsfile_local_clear(new_vsfile_hgei, imsic->nr_hw_eix); + + /* Update G-stage mapping for the new IMSIC VS-file */ + ret = kvm_riscv_gstage_ioremap(kvm, vcpu->arch.aia_context.imsic_addr, + new_vsfile_pa, IMSIC_MMIO_PAGE_SZ, + true, true); + if (ret) + goto fail_free_vsfile_hgei; + + /* TODO: Update the IOMMU mapping ??? */ + + /* Update new IMSIC VS-file details in IMSIC context */ + write_lock_irqsave(&imsic->vsfile_lock, flags); + imsic->vsfile_hgei = new_vsfile_hgei; + imsic->vsfile_cpu = vcpu->cpu; + imsic->vsfile_va = new_vsfile_va; + imsic->vsfile_pa = new_vsfile_pa; + write_unlock_irqrestore(&imsic->vsfile_lock, flags); + + /* + * At this point, all interrupt producers have been moved + * to the new IMSIC VS-file so we move register state from + * the old IMSIC VS/SW-file to the new IMSIC VS-file. + */ + + memset(&tmrif, 0, sizeof(tmrif)); + if (old_vsfile_cpu >= 0) { + /* Read and clear register state from old IMSIC VS-file */ + imsic_vsfile_read(old_vsfile_hgei, old_vsfile_cpu, + imsic->nr_hw_eix, true, &tmrif); + + /* Free-up old IMSIC VS-file */ + kvm_riscv_aia_free_hgei(old_vsfile_cpu, old_vsfile_hgei); + } else { + /* Read and clear register state from IMSIC SW-file */ + imsic_swfile_read(vcpu, true, &tmrif); + } + + /* Restore register state in the new IMSIC VS-file */ + imsic_vsfile_local_update(new_vsfile_hgei, imsic->nr_hw_eix, &tmrif); + +done: + /* Set VCPU HSTATUS.VGEIN to new IMSIC VS-file */ + vcpu->arch.guest_context.hstatus &= ~HSTATUS_VGEIN; + if (new_vsfile_hgei > 0) + vcpu->arch.guest_context.hstatus |= + ((unsigned long)new_vsfile_hgei) << HSTATUS_VGEIN_SHIFT; + + /* Continue run-loop */ + return 1; + +fail_free_vsfile_hgei: + kvm_riscv_aia_free_hgei(vcpu->cpu, new_vsfile_hgei); + return ret; +} + +int kvm_riscv_vcpu_aia_imsic_rmw(struct kvm_vcpu *vcpu, unsigned long isel, + unsigned long *val, unsigned long new_val, + unsigned long wr_mask) +{ + u32 topei; + struct imsic_mrif_eix *eix; + int r, rc = KVM_INSN_CONTINUE_NEXT_SEPC; + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + + if (isel == KVM_RISCV_AIA_IMSIC_TOPEI) { + /* Read pending and enabled interrupt with highest priority */ + topei = imsic_mrif_topei(imsic->swfile, imsic->nr_eix, + imsic->nr_msis); + if (val) + *val = topei; + + /* Writes ignore value and clear top pending interrupt */ + if (topei && wr_mask) { + topei >>= TOPEI_ID_SHIFT; + if (topei) { + eix = &imsic->swfile->eix[topei / + BITS_PER_TYPE(u64)]; + clear_bit(topei & (BITS_PER_TYPE(u64) - 1), + eix->eip); + } + } + } else { + r = imsic_mrif_rmw(imsic->swfile, imsic->nr_eix, isel, + val, new_val, wr_mask); + /* Forward unknown IMSIC register to user-space */ + if (r) + rc = (r == -ENOENT) ? 0 : KVM_INSN_ILLEGAL_TRAP; + } + + if (wr_mask) + imsic_swfile_extirq_update(vcpu); + + return rc; +} + +void kvm_riscv_vcpu_aia_imsic_reset(struct kvm_vcpu *vcpu) +{ + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + + if (!imsic) + return; + + kvm_riscv_vcpu_aia_imsic_release(vcpu); + + memset(imsic->swfile, 0, sizeof(*imsic->swfile)); +} + +int kvm_riscv_vcpu_aia_imsic_inject(struct kvm_vcpu *vcpu, + u32 guest_index, u32 offset, u32 iid) +{ + unsigned long flags; + struct imsic_mrif_eix *eix; + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + + /* We only emulate one IMSIC MMIO page for each Guest VCPU */ + if (!imsic || !iid || guest_index || + (offset != IMSIC_MMIO_SETIPNUM_LE && + offset != IMSIC_MMIO_SETIPNUM_BE)) + return -ENODEV; + + iid = (offset == IMSIC_MMIO_SETIPNUM_BE) ? __swab32(iid) : iid; + if (imsic->nr_msis <= iid) + return -EINVAL; + + read_lock_irqsave(&imsic->vsfile_lock, flags); + + if (imsic->vsfile_cpu >= 0) { + writel(iid, imsic->vsfile_va + IMSIC_MMIO_SETIPNUM_LE); + kvm_vcpu_kick(vcpu); + } else { + eix = &imsic->swfile->eix[iid / BITS_PER_TYPE(u64)]; + set_bit(iid & (BITS_PER_TYPE(u64) - 1), eix->eip); + imsic_swfile_extirq_update(vcpu); + } + + read_unlock_irqrestore(&imsic->vsfile_lock, flags); + + return 0; +} + +static int imsic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + if (len != 4 || (addr & 0x3) != 0) + return -EOPNOTSUPP; + + *((u32 *)val) = 0; + + return 0; +} + +static int imsic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + struct kvm_msi msi = { 0 }; + + if (len != 4 || (addr & 0x3) != 0) + return -EOPNOTSUPP; + + msi.address_hi = addr >> 32; + msi.address_lo = (u32)addr; + msi.data = *((const u32 *)val); + kvm_riscv_aia_inject_msi(vcpu->kvm, &msi); + + return 0; +}; + +static struct kvm_io_device_ops imsic_iodoev_ops = { + .read = imsic_mmio_read, + .write = imsic_mmio_write, +}; + +int kvm_riscv_vcpu_aia_imsic_init(struct kvm_vcpu *vcpu) +{ + int ret = 0; + struct imsic *imsic; + struct page *swfile_page; + struct kvm *kvm = vcpu->kvm; + + /* Fail if we have zero IDs */ + if (!kvm->arch.aia.nr_ids) + return -EINVAL; + + /* Allocate IMSIC context */ + imsic = kzalloc(sizeof(*imsic), GFP_KERNEL); + if (!imsic) + return -ENOMEM; + vcpu->arch.aia_context.imsic_state = imsic; + + /* Setup IMSIC context */ + imsic->nr_msis = kvm->arch.aia.nr_ids + 1; + rwlock_init(&imsic->vsfile_lock); + imsic->nr_eix = BITS_TO_U64(imsic->nr_msis); + imsic->nr_hw_eix = BITS_TO_U64(kvm_riscv_aia_max_ids); + imsic->vsfile_hgei = imsic->vsfile_cpu = -1; + + /* Setup IMSIC SW-file */ + swfile_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, + get_order(sizeof(*imsic->swfile))); + if (!swfile_page) { + ret = -ENOMEM; + goto fail_free_imsic; + } + imsic->swfile = page_to_virt(swfile_page); + imsic->swfile_pa = page_to_phys(swfile_page); + + /* Setup IO device */ + kvm_iodevice_init(&imsic->iodev, &imsic_iodoev_ops); + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, + vcpu->arch.aia_context.imsic_addr, + KVM_DEV_RISCV_IMSIC_SIZE, + &imsic->iodev); + mutex_unlock(&kvm->slots_lock); + if (ret) + goto fail_free_swfile; + + return 0; + +fail_free_swfile: + free_pages((unsigned long)imsic->swfile, + get_order(sizeof(*imsic->swfile))); +fail_free_imsic: + vcpu->arch.aia_context.imsic_state = NULL; + kfree(imsic); + return ret; +} + +void kvm_riscv_vcpu_aia_imsic_cleanup(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct imsic *imsic = vcpu->arch.aia_context.imsic_state; + + if (!imsic) + return; + + imsic_vsfile_cleanup(imsic); + + mutex_lock(&kvm->slots_lock); + kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &imsic->iodev); + mutex_unlock(&kvm->slots_lock); + + free_pages((unsigned long)imsic->swfile, + get_order(sizeof(*imsic->swfile))); + + vcpu->arch.aia_context.imsic_state = NULL; + kfree(imsic); +} -- cgit v1.2.3 From 5463091a51cfaab8922ac94e5178a05dfa836dbb Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Thu, 15 Jun 2023 13:03:53 +0530 Subject: RISC-V: KVM: Expose IMSIC registers as attributes of AIA irqchip We expose IMSIC registers as KVM device attributes of the in-kernel AIA irqchip device. This will allow KVM user-space to save/restore IMISC state of each VCPU using KVM device ioctls(). Signed-off-by: Anup Patel Reviewed-by: Atish Patra Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_aia.h | 3 + arch/riscv/include/uapi/asm/kvm.h | 17 ++++ arch/riscv/kvm/aia_device.c | 29 ++++++- arch/riscv/kvm/aia_imsic.c | 170 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 217 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/kvm_aia.h b/arch/riscv/include/asm/kvm_aia.h index a4f6ebf90e31..1f37b600ca47 100644 --- a/arch/riscv/include/asm/kvm_aia.h +++ b/arch/riscv/include/asm/kvm_aia.h @@ -97,6 +97,9 @@ int kvm_riscv_vcpu_aia_imsic_update(struct kvm_vcpu *vcpu); int kvm_riscv_vcpu_aia_imsic_rmw(struct kvm_vcpu *vcpu, unsigned long isel, unsigned long *val, unsigned long new_val, unsigned long wr_mask); +int kvm_riscv_aia_imsic_rw_attr(struct kvm *kvm, unsigned long type, + bool write, unsigned long *val); +int kvm_riscv_aia_imsic_has_attr(struct kvm *kvm, unsigned long type); void kvm_riscv_vcpu_aia_imsic_reset(struct kvm_vcpu *vcpu); int kvm_riscv_vcpu_aia_imsic_inject(struct kvm_vcpu *vcpu, u32 guest_index, u32 offset, u32 iid); diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index 9ed822fc5589..61d7fecc4899 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -255,6 +255,23 @@ enum KVM_RISCV_SBI_EXT_ID { */ #define KVM_DEV_RISCV_AIA_GRP_APLIC 3 +/* + * The lower 12-bits of the device attribute type contains the iselect + * value of the IMSIC register (range 0x70-0xFF) whereas the higher order + * bits contains the VCPU id. + */ +#define KVM_DEV_RISCV_AIA_GRP_IMSIC 4 +#define KVM_DEV_RISCV_AIA_IMSIC_ISEL_BITS 12 +#define KVM_DEV_RISCV_AIA_IMSIC_ISEL_MASK \ + ((1U << KVM_DEV_RISCV_AIA_IMSIC_ISEL_BITS) - 1) +#define KVM_DEV_RISCV_AIA_IMSIC_MKATTR(__vcpu, __isel) \ + (((__vcpu) << KVM_DEV_RISCV_AIA_IMSIC_ISEL_BITS) | \ + ((__isel) & KVM_DEV_RISCV_AIA_IMSIC_ISEL_MASK)) +#define KVM_DEV_RISCV_AIA_IMSIC_GET_ISEL(__attr) \ + ((__attr) & KVM_DEV_RISCV_AIA_IMSIC_ISEL_MASK) +#define KVM_DEV_RISCV_AIA_IMSIC_GET_VCPU(__attr) \ + ((__attr) >> KVM_DEV_RISCV_AIA_IMSIC_ISEL_BITS) + /* One single KVM irqchip, ie. the AIA */ #define KVM_NR_IRQCHIPS 1 diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c index c649ad6e8e0a..84dae351b6d7 100644 --- a/arch/riscv/kvm/aia_device.c +++ b/arch/riscv/kvm/aia_device.c @@ -327,7 +327,7 @@ static int aia_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) u32 nr; u64 addr; int nr_vcpus, r = -ENXIO; - unsigned long type = (unsigned long)attr->attr; + unsigned long v, type = (unsigned long)attr->attr; void __user *uaddr = (void __user *)(long)attr->addr; switch (attr->group) { @@ -374,6 +374,15 @@ static int aia_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) r = kvm_riscv_aia_aplic_set_attr(dev->kvm, type, nr); mutex_unlock(&dev->kvm->lock); + break; + case KVM_DEV_RISCV_AIA_GRP_IMSIC: + if (copy_from_user(&v, uaddr, sizeof(v))) + return -EFAULT; + + mutex_lock(&dev->kvm->lock); + r = kvm_riscv_aia_imsic_rw_attr(dev->kvm, type, true, &v); + mutex_unlock(&dev->kvm->lock); + break; } @@ -386,7 +395,7 @@ static int aia_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) u64 addr; int nr_vcpus, r = -ENXIO; void __user *uaddr = (void __user *)(long)attr->addr; - unsigned long type = (unsigned long)attr->attr; + unsigned long v, type = (unsigned long)attr->attr; switch (attr->group) { case KVM_DEV_RISCV_AIA_GRP_CONFIG: @@ -435,6 +444,20 @@ static int aia_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) if (copy_to_user(uaddr, &nr, sizeof(nr))) return -EFAULT; + break; + case KVM_DEV_RISCV_AIA_GRP_IMSIC: + if (copy_from_user(&v, uaddr, sizeof(v))) + return -EFAULT; + + mutex_lock(&dev->kvm->lock); + r = kvm_riscv_aia_imsic_rw_attr(dev->kvm, type, false, &v); + mutex_unlock(&dev->kvm->lock); + if (r) + return r; + + if (copy_to_user(uaddr, &v, sizeof(v))) + return -EFAULT; + break; } @@ -473,6 +496,8 @@ static int aia_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) break; case KVM_DEV_RISCV_AIA_GRP_APLIC: return kvm_riscv_aia_aplic_has_attr(dev->kvm, attr->attr); + case KVM_DEV_RISCV_AIA_GRP_IMSIC: + return kvm_riscv_aia_imsic_has_attr(dev->kvm, attr->attr); } return -ENXIO; diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c index 0e6ecf067757..a0c9dbce2b99 100644 --- a/arch/riscv/kvm/aia_imsic.c +++ b/arch/riscv/kvm/aia_imsic.c @@ -278,6 +278,33 @@ static u32 imsic_mrif_topei(struct imsic_mrif *mrif, u32 nr_eix, u32 nr_msis) return 0; } +static int imsic_mrif_isel_check(u32 nr_eix, unsigned long isel) +{ + u32 num = 0; + + switch (isel) { + case IMSIC_EIDELIVERY: + case IMSIC_EITHRESHOLD: + break; + case IMSIC_EIP0 ... IMSIC_EIP63: + num = isel - IMSIC_EIP0; + break; + case IMSIC_EIE0 ... IMSIC_EIE63: + num = isel - IMSIC_EIE0; + break; + default: + return -ENOENT; + }; +#ifndef CONFIG_32BIT + if (num & 0x1) + return -EINVAL; +#endif + if ((num / 2) >= nr_eix) + return -EINVAL; + + return 0; +} + static int imsic_mrif_rmw(struct imsic_mrif *mrif, u32 nr_eix, unsigned long isel, unsigned long *val, unsigned long new_val, unsigned long wr_mask) @@ -408,6 +435,86 @@ static void imsic_vsfile_read(int vsfile_hgei, int vsfile_cpu, u32 nr_eix, imsic_vsfile_local_read, &idata, 1); } +struct imsic_vsfile_rw_data { + int hgei; + int isel; + bool write; + unsigned long val; +}; + +static void imsic_vsfile_local_rw(void *data) +{ + struct imsic_vsfile_rw_data *idata = data; + unsigned long new_hstatus, old_hstatus, old_vsiselect; + + old_vsiselect = csr_read(CSR_VSISELECT); + old_hstatus = csr_read(CSR_HSTATUS); + new_hstatus = old_hstatus & ~HSTATUS_VGEIN; + new_hstatus |= ((unsigned long)idata->hgei) << HSTATUS_VGEIN_SHIFT; + csr_write(CSR_HSTATUS, new_hstatus); + + switch (idata->isel) { + case IMSIC_EIDELIVERY: + if (idata->write) + imsic_vs_csr_write(IMSIC_EIDELIVERY, idata->val); + else + idata->val = imsic_vs_csr_read(IMSIC_EIDELIVERY); + break; + case IMSIC_EITHRESHOLD: + if (idata->write) + imsic_vs_csr_write(IMSIC_EITHRESHOLD, idata->val); + else + idata->val = imsic_vs_csr_read(IMSIC_EITHRESHOLD); + break; + case IMSIC_EIP0 ... IMSIC_EIP63: + case IMSIC_EIE0 ... IMSIC_EIE63: +#ifndef CONFIG_32BIT + if (idata->isel & 0x1) + break; +#endif + if (idata->write) + imsic_eix_write(idata->isel, idata->val); + else + idata->val = imsic_eix_read(idata->isel); + break; + default: + break; + } + + csr_write(CSR_HSTATUS, old_hstatus); + csr_write(CSR_VSISELECT, old_vsiselect); +} + +static int imsic_vsfile_rw(int vsfile_hgei, int vsfile_cpu, u32 nr_eix, + unsigned long isel, bool write, + unsigned long *val) +{ + int rc; + struct imsic_vsfile_rw_data rdata; + + /* We can only access register if we have a IMSIC VS-file */ + if (vsfile_cpu < 0 || vsfile_hgei <= 0) + return -EINVAL; + + /* Check IMSIC register iselect */ + rc = imsic_mrif_isel_check(nr_eix, isel); + if (rc) + return rc; + + /* We can only access register on local CPU */ + rdata.hgei = vsfile_hgei; + rdata.isel = isel; + rdata.write = write; + rdata.val = (write) ? *val : 0; + on_each_cpu_mask(cpumask_of(vsfile_cpu), + imsic_vsfile_local_rw, &rdata, 1); + + if (!write) + *val = rdata.val; + + return 0; +} + static void imsic_vsfile_local_clear(int vsfile_hgei, u32 nr_eix) { u32 i; @@ -759,6 +866,69 @@ int kvm_riscv_vcpu_aia_imsic_rmw(struct kvm_vcpu *vcpu, unsigned long isel, return rc; } +int kvm_riscv_aia_imsic_rw_attr(struct kvm *kvm, unsigned long type, + bool write, unsigned long *val) +{ + u32 isel, vcpu_id; + unsigned long flags; + struct imsic *imsic; + struct kvm_vcpu *vcpu; + int rc, vsfile_hgei, vsfile_cpu; + + if (!kvm_riscv_aia_initialized(kvm)) + return -ENODEV; + + vcpu_id = KVM_DEV_RISCV_AIA_IMSIC_GET_VCPU(type); + vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); + if (!vcpu) + return -ENODEV; + + isel = KVM_DEV_RISCV_AIA_IMSIC_GET_ISEL(type); + imsic = vcpu->arch.aia_context.imsic_state; + + read_lock_irqsave(&imsic->vsfile_lock, flags); + + rc = 0; + vsfile_hgei = imsic->vsfile_hgei; + vsfile_cpu = imsic->vsfile_cpu; + if (vsfile_cpu < 0) { + if (write) { + rc = imsic_mrif_rmw(imsic->swfile, imsic->nr_eix, + isel, NULL, *val, -1UL); + imsic_swfile_extirq_update(vcpu); + } else + rc = imsic_mrif_rmw(imsic->swfile, imsic->nr_eix, + isel, val, 0, 0); + } + + read_unlock_irqrestore(&imsic->vsfile_lock, flags); + + if (!rc && vsfile_cpu >= 0) + rc = imsic_vsfile_rw(vsfile_hgei, vsfile_cpu, imsic->nr_eix, + isel, write, val); + + return rc; +} + +int kvm_riscv_aia_imsic_has_attr(struct kvm *kvm, unsigned long type) +{ + u32 isel, vcpu_id; + struct imsic *imsic; + struct kvm_vcpu *vcpu; + + if (!kvm_riscv_aia_initialized(kvm)) + return -ENODEV; + + vcpu_id = KVM_DEV_RISCV_AIA_IMSIC_GET_VCPU(type); + vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); + if (!vcpu) + return -ENODEV; + + isel = KVM_DEV_RISCV_AIA_IMSIC_GET_ISEL(type); + imsic = vcpu->arch.aia_context.imsic_state; + return imsic_mrif_isel_check(imsic->nr_eix, isel); +} + void kvm_riscv_vcpu_aia_imsic_reset(struct kvm_vcpu *vcpu) { struct imsic *imsic = vcpu->arch.aia_context.imsic_state; -- cgit v1.2.3 From 377f71f6d68d13cf2f5c6c9011ce762224a57b82 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Fri, 16 Jun 2023 15:22:03 +0100 Subject: riscv: kvm: define vcpu_sbi_ext_pmu in header Sparse is giving a warning about vcpu_sbi_ext_pmu not being defined, so add a definition to the relevant header to fix the following: arch/riscv/kvm/vcpu_sbi_pmu.c:81:37: warning: symbol 'vcpu_sbi_ext_pmu' was not declared. Should it be static? Fixes: cbddc4c4cb9e ("RISC-V: KVM: Add SBI PMU extension support") Signed-off-by: Ben Dooks Reviewed-by: Anup Patel Reviewed-by: Conor Dooley Signed-off-by: Anup Patel --- arch/riscv/include/asm/kvm_vcpu_sbi.h | 3 +++ arch/riscv/kvm/vcpu_sbi.c | 4 +--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h index 14ee03df252d..cdcf0ff07be7 100644 --- a/arch/riscv/include/asm/kvm_vcpu_sbi.h +++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h @@ -72,4 +72,7 @@ extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_hsm; extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_experimental; extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_vendor; +#ifdef CONFIG_RISCV_PMU_SBI +extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu; +#endif #endif /* __RISCV_KVM_VCPU_SBI_H__ */ diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index b3e92c11e54f..7b46e04fb667 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -20,9 +20,7 @@ static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_v01 = { }; #endif -#ifdef CONFIG_RISCV_PMU_SBI -extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu; -#else +#ifndef CONFIG_RISCV_PMU_SBI static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_pmu = { .extid_start = -1UL, .extid_end = -1UL, -- cgit v1.2.3 From d75b336225789fb967442feb6cdf9cfc0357b252 Mon Sep 17 00:00:00 2001 From: Anup Patel Date: Mon, 12 Jun 2023 16:40:44 +0530 Subject: RISC-V: KVM: Allow Svnapot extension for Guest/VM We extend the KVM ISA extension ONE_REG interface to allow KVM user space to detect and enable Svnapot extension for Guest/VM. Signed-off-by: Anup Patel Reviewed-by: Andrew Jones Signed-off-by: Anup Patel --- arch/riscv/include/uapi/asm/kvm.h | 1 + arch/riscv/kvm/vcpu.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index 61d7fecc4899..a1ca18408bbd 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -122,6 +122,7 @@ enum KVM_RISCV_ISA_EXT_ID { KVM_RISCV_ISA_EXT_ZICBOZ, KVM_RISCV_ISA_EXT_ZBB, KVM_RISCV_ISA_EXT_SSAIA, + KVM_RISCV_ISA_EXT_SVNAPOT, KVM_RISCV_ISA_EXT_MAX, }; diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 2db62c6c0d3e..7b355900f235 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -61,6 +61,7 @@ static const unsigned long kvm_isa_ext_arr[] = { KVM_ISA_EXT_ARR(SSAIA), KVM_ISA_EXT_ARR(SSTC), KVM_ISA_EXT_ARR(SVINVAL), + KVM_ISA_EXT_ARR(SVNAPOT), KVM_ISA_EXT_ARR(SVPBMT), KVM_ISA_EXT_ARR(ZBB), KVM_ISA_EXT_ARR(ZIHINTPAUSE), @@ -102,6 +103,7 @@ static bool kvm_riscv_vcpu_isa_disable_allowed(unsigned long ext) case KVM_RISCV_ISA_EXT_SSAIA: case KVM_RISCV_ISA_EXT_SSTC: case KVM_RISCV_ISA_EXT_SVINVAL: + case KVM_RISCV_ISA_EXT_SVNAPOT: case KVM_RISCV_ISA_EXT_ZIHINTPAUSE: case KVM_RISCV_ISA_EXT_ZBB: return false; -- cgit v1.2.3 From 07f225b5842420ae9c18cba17873fc71ed69c28e Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 20 Jun 2023 08:40:36 +0800 Subject: RISC-V: KVM: Remove unneeded semicolon ./arch/riscv/kvm/aia_imsic.c:94:2-3: Unneeded semicolon ./arch/riscv/kvm/aia_imsic.c:134:2-3: Unneeded semicolon ./arch/riscv/kvm/aia_imsic.c:173:2-3: Unneeded semicolon ./arch/riscv/kvm/aia_imsic.c:210:2-3: Unneeded semicolon ./arch/riscv/kvm/aia_imsic.c:296:2-3: Unneeded semicolon ./arch/riscv/kvm/aia_imsic.c:354:2-3: Unneeded semicolon ./arch/riscv/kvm/aia_device.c:105:4-5: Unneeded semicolon ./arch/riscv/kvm/aia_device.c:166:2-3: Unneeded semicolon Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=5569 Signed-off-by: Yang Li Signed-off-by: Anup Patel --- arch/riscv/kvm/aia_device.c | 4 ++-- arch/riscv/kvm/aia_imsic.c | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/riscv/kvm/aia_device.c b/arch/riscv/kvm/aia_device.c index 84dae351b6d7..0eb689351b7d 100644 --- a/arch/riscv/kvm/aia_device.c +++ b/arch/riscv/kvm/aia_device.c @@ -102,7 +102,7 @@ static int aia_config(struct kvm *kvm, unsigned long type, break; default: return -EINVAL; - }; + } aia->mode = *nr; } else *nr = aia->mode; @@ -163,7 +163,7 @@ static int aia_config(struct kvm *kvm, unsigned long type, break; default: return -ENXIO; - }; + } return 0; } diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c index a0c9dbce2b99..6cf23b8adb71 100644 --- a/arch/riscv/kvm/aia_imsic.c +++ b/arch/riscv/kvm/aia_imsic.c @@ -92,7 +92,7 @@ static unsigned long imsic_eix_read(int ireg) switch (ireg) { imsic_read_switchcase_64(IMSIC_EIP0) imsic_read_switchcase_64(IMSIC_EIE0) - }; + } return 0; } @@ -132,7 +132,7 @@ static unsigned long imsic_eix_swap(int ireg, unsigned long val) switch (ireg) { imsic_swap_switchcase_64(IMSIC_EIP0, val) imsic_swap_switchcase_64(IMSIC_EIE0, val) - }; + } return 0; } @@ -171,7 +171,7 @@ static void imsic_eix_write(int ireg, unsigned long val) switch (ireg) { imsic_write_switchcase_64(IMSIC_EIP0, val) imsic_write_switchcase_64(IMSIC_EIE0, val) - }; + } } #define imsic_vs_csr_set(__c, __v) \ @@ -208,7 +208,7 @@ static void imsic_eix_set(int ireg, unsigned long val) switch (ireg) { imsic_set_switchcase_64(IMSIC_EIP0, val) imsic_set_switchcase_64(IMSIC_EIE0, val) - }; + } } static unsigned long imsic_mrif_atomic_rmw(struct imsic_mrif *mrif, @@ -294,7 +294,7 @@ static int imsic_mrif_isel_check(u32 nr_eix, unsigned long isel) break; default: return -ENOENT; - }; + } #ifndef CONFIG_32BIT if (num & 0x1) return -EINVAL; @@ -352,7 +352,7 @@ static int imsic_mrif_rmw(struct imsic_mrif *mrif, u32 nr_eix, break; default: return -ENOENT; - }; + } if (val) *val = old_val; -- cgit v1.2.3 From 192df2aa0113ddddee2a93e453ff46610807b425 Mon Sep 17 00:00:00 2001 From: Oliver Upton Date: Thu, 22 Jun 2023 16:09:22 +0000 Subject: KVM: arm64: Fix misuse of KVM_ARM_VCPU_POWER_OFF bit index KVM_ARM_VCPU_POWER_OFF is as bit index, _not_ a literal bitmask. Nonetheless, commit e3c1c0cae31e ("KVM: arm64: Relax invariance of KVM_ARM_VCPU_POWER_OFF") started using it that way, meaning that powering off a vCPU with the KVM_ARM_VCPU_INIT ioctl is completely broken. Fix it by using a shifted bit for the bitwise operations instead. Reported-by: Dan Carpenter Fixes: e3c1c0cae31e ("KVM: arm64: Relax invariance of KVM_ARM_VCPU_POWER_OFF") Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20230622160922.1925530-1-oliver.upton@linux.dev Signed-off-by: Oliver Upton --- arch/arm64/kvm/arm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 2e1f5561141c..c2c14059f6a8 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1280,8 +1280,8 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, * reflecting it in the finalized feature set, thus limiting its scope * to a single KVM_ARM_VCPU_INIT call. */ - if (init->features[0] & KVM_ARM_VCPU_POWER_OFF) { - init->features[0] &= ~KVM_ARM_VCPU_POWER_OFF; + if (init->features[0] & BIT(KVM_ARM_VCPU_POWER_OFF)) { + init->features[0] &= ~BIT(KVM_ARM_VCPU_POWER_OFF); power_off = true; } -- cgit v1.2.3 From b7dac767c9356fe94fe345f907adf573cf745d8d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 11 Apr 2023 10:16:50 -0700 Subject: Documentation/process: Add a label for the tip tree handbook's coding style Add a label for the tip tree's "Coding style notes" so that a forthcoming KVM x86 handbook can reference/piggyback the tip tree's preferred coding style. Link: https://lore.kernel.org/r/20230411171651.1067966-2-seanjc@google.com Signed-off-by: Sean Christopherson --- Documentation/process/maintainer-tip.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/process/maintainer-tip.rst b/Documentation/process/maintainer-tip.rst index 178c95fd17dc..0cac75a86372 100644 --- a/Documentation/process/maintainer-tip.rst +++ b/Documentation/process/maintainer-tip.rst @@ -452,6 +452,8 @@ and can be added to an existing kernel config by running: Some of these options are x86-specific and can be left out when testing on other architectures. +.. _maintainer-tip-coding-style: + Coding style notes ------------------ -- cgit v1.2.3 From 63e2f55cabedf8a7ede928f7cf3ab028af44b9e9 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 11 Apr 2023 10:16:51 -0700 Subject: Documentation/process: Add a maintainer handbook for KVM x86 Add a KVM x86 doc to the subsystem/maintainer handbook section to explain how KVM x86 (currently) operates as a sub-subsystem, and to soapbox on the rules and expectations for contributing to KVM x86. Reviewed-by: Like Xu Link: https://lore.kernel.org/r/20230411171651.1067966-3-seanjc@google.com Signed-off-by: Sean Christopherson --- Documentation/process/maintainer-handbooks.rst | 1 + Documentation/process/maintainer-kvm-x86.rst | 390 +++++++++++++++++++++++++ MAINTAINERS | 1 + 3 files changed, 392 insertions(+) create mode 100644 Documentation/process/maintainer-kvm-x86.rst diff --git a/Documentation/process/maintainer-handbooks.rst b/Documentation/process/maintainer-handbooks.rst index d783060b4cc6..d12cbbe2b7df 100644 --- a/Documentation/process/maintainer-handbooks.rst +++ b/Documentation/process/maintainer-handbooks.rst @@ -17,3 +17,4 @@ Contents: maintainer-tip maintainer-netdev + maintainer-kvm-x86 diff --git a/Documentation/process/maintainer-kvm-x86.rst b/Documentation/process/maintainer-kvm-x86.rst new file mode 100644 index 000000000000..9183bd449762 --- /dev/null +++ b/Documentation/process/maintainer-kvm-x86.rst @@ -0,0 +1,390 @@ +.. SPDX-License-Identifier: GPL-2.0 + +KVM x86 +======= + +Foreword +-------- +KVM strives to be a welcoming community; contributions from newcomers are +valued and encouraged. Please do not be discouraged or intimidated by the +length of this document and the many rules/guidelines it contains. Everyone +makes mistakes, and everyone was a newbie at some point. So long as you make +an honest effort to follow KVM x86's guidelines, are receptive to feedback, +and learn from any mistakes you make, you will be welcomed with open arms, not +torches and pitchforks. + +TL;DR +----- +Testing is mandatory. Be consistent with established styles and patterns. + +Trees +----- +KVM x86 is currently in a transition period from being part of the main KVM +tree, to being "just another KVM arch". As such, KVM x86 is split across the +main KVM tree, ``git.kernel.org/pub/scm/virt/kvm/kvm.git``, and a KVM x86 +specific tree, ``github.com/kvm-x86/linux.git``. + +Generally speaking, fixes for the current cycle are applied directly to the +main KVM tree, while all development for the next cycle is routed through the +KVM x86 tree. In the unlikely event that a fix for the current cycle is routed +through the KVM x86 tree, it will be applied to the ``fixes`` branch before +making its way to the main KVM tree. + +Note, this transition period is expected to last quite some time, i.e. will be +the status quo for the foreseeable future. + +Branches +~~~~~~~~ +The KVM x86 tree is organized into multiple topic branches. The purpose of +using finer-grained topic branches is to make it easier to keep tabs on an area +of development, and to limit the collateral damage of human errors and/or buggy +commits, e.g. dropping the HEAD commit of a topic branch has no impact on other +in-flight commits' SHA1 hashes, and having to reject a pull request due to bugs +delays only that topic branch. + +All topic branches, except for ``next`` and ``fixes``, are rolled into ``next`` +via a Cthulhu merge on an as-needed basis, i.e. when a topic branch is updated. +As a result, force pushes to ``next`` are common. + +Lifecycle +~~~~~~~~~ +Fixes that target the current release, a.k.a. mainline, are typically applied +directly to the main KVM tree, i.e. do not route through the KVM x86 tree. + +Changes that target the next release are routed through the KVM x86 tree. Pull +requests (from KVM x86 to main KVM) are sent for each KVM x86 topic branch, +typically the week before Linus' opening of the merge window, e.g. the week +following rc7 for "normal" releases. If all goes well, the topic branches are +rolled into the main KVM pull request sent during Linus' merge window. + +The KVM x86 tree doesn't have its own official merge window, but there's a soft +close around rc5 for new features, and a soft close around rc6 for fixes (for +the next release; see above for fixes that target the current release). + +Timeline +~~~~~~~~ +Submissions are typically reviewed and applied in FIFO order, with some wiggle +room for the size of a series, patches that are "cache hot", etc. Fixes, +especially for the current release and or stable trees, get to jump the queue. +Patches that will be taken through a non-KVM tree (most often through the tip +tree) and/or have other acks/reviews also jump the queue to some extent. + +Note, the vast majority of review is done between rc1 and rc6, give or take. +The period between rc6 and the next rc1 is used to catch up on other tasks, +i.e. radio silence during this period isn't unusual. + +Pings to get a status update are welcome, but keep in mind the timing of the +current release cycle and have realistic expectations. If you are pinging for +acceptance, i.e. not just for feedback or an update, please do everything you +can, within reason, to ensure that your patches are ready to be merged! Pings +on series that break the build or fail tests lead to unhappy maintainers! + +Development +----------- + +Base Tree/Branch +~~~~~~~~~~~~~~~~ +Fixes that target the current release, a.k.a. mainline, should be based on +``git://git.kernel.org/pub/scm/virt/kvm/kvm.git master``. Note, fixes do not +automatically warrant inclusion in the current release. There is no singular +rule, but typically only fixes for bugs that are urgent, critical, and/or were +introduced in the current release should target the current release. + +Everything else should be based on ``kvm-x86/next``, i.e. there is no need to +select a specific topic branch as the base. If there are conflicts and/or +dependencies across topic branches, it is the maintainer's job to sort them +out. + +The only exception to using ``kvm-x86/next`` as the base is if a patch/series +is a multi-arch series, i.e. has non-trivial modifications to common KVM code +and/or has more than superficial changes to other architectures' code. Multi- +arch patch/series should instead be based on a common, stable point in KVM's +history, e.g. the release candidate upon which ``kvm-x86 next`` is based. If +you're unsure whether a patch/series is truly multi-arch, err on the side of +caution and treat it as multi-arch, i.e. use a common base. + +Coding Style +~~~~~~~~~~~~ +When it comes to style, naming, patterns, etc., consistency is the number one +priority in KVM x86. If all else fails, match what already exists. + +With a few caveats listed below, follow the tip tree maintainers' preferred +:ref:`maintainer-tip-coding-style`, as patches/series often touch both KVM and +non-KVM x86 files, i.e. draw the attention of KVM *and* tip tree maintainers. + +Using reverse fir tree, a.k.a. reverse Christmas tree or reverse XMAS tree, for +variable declarations isn't strictly required, though it is still preferred. + +Except for a handful of special snowflakes, do not use kernel-doc comments for +functions. The vast majority of "public" KVM functions aren't truly public as +they are intended only for KVM-internal consumption (there are plans to +privatize KVM's headers and exports to enforce this). + +Comments +~~~~~~~~ +Write comments using imperative mood and avoid pronouns. Use comments to +provide a high level overview of the code, and/or to explain why the code does +what it does. Do not reiterate what the code literally does; let the code +speak for itself. If the code itself is inscrutable, comments will not help. + +SDM and APM References +~~~~~~~~~~~~~~~~~~~~~~ +Much of KVM's code base is directly tied to architectural behavior defined in +Intel's Software Development Manual (SDM) and AMD's Architecture Programmer’s +Manual (APM). Use of "Intel's SDM" and "AMD's APM", or even just "SDM" or +"APM", without additional context is a-ok. + +Do not reference specific sections, tables, figures, etc. by number, especially +not in comments. Instead, if necessary (see below), copy-paste the relevant +snippet and reference sections/tables/figures by name. The layouts of the SDM +and APM are constantly changing, and so the numbers/labels aren't stable. + +Generally speaking, do not explicitly reference or copy-paste from the SDM or +APM in comments. With few exceptions, KVM *must* honor architectural behavior, +therefore it's implied that KVM behavior is emulating SDM and/or APM behavior. +Note, referencing the SDM/APM in changelogs to justify the change and provide +context is perfectly ok and encouraged. + +Shortlog +~~~~~~~~ +The preferred prefix format is ``KVM: :``, where ```` is one of:: + + - x86 + - x86/mmu + - x86/pmu + - x86/xen + - selftests + - SVM + - nSVM + - VMX + - nVMX + +**DO NOT use x86/kvm!** ``x86/kvm`` is used exclusively for Linux-as-a-KVM-guest +changes, i.e. for arch/x86/kernel/kvm.c. Do not use file names or complete file +paths as the subject/shortlog prefix. + +Note, these don't align with the topics branches (the topic branches care much +more about code conflicts). + +All names are case sensitive! ``KVM: x86:`` is good, ``kvm: vmx:`` is not. + +Capitalize the first word of the condensed patch description, but omit ending +punctionation. E.g.:: + + KVM: x86: Fix a null pointer dereference in function_xyz() + +not:: + + kvm: x86: fix a null pointer dereference in function_xyz. + +If a patch touches multiple topics, traverse up the conceptual tree to find the +first common parent (which is often simply ``x86``). When in doubt, +``git log path/to/file`` should provide a reasonable hint. + +New topics do occasionally pop up, but please start an on-list discussion if +you want to propose introducing a new topic, i.e. don't go rogue. + +See :ref:`the_canonical_patch_format` for more information, with one amendment: +do not treat the 70-75 character limit as an absolute, hard limit. Instead, +use 75 characters as a firm-but-not-hard limit, and use 80 characters as a hard +limit. I.e. let the shortlog run a few characters over the standard limit if +you have good reason to do so. + +Changelog +~~~~~~~~~ +Most importantly, write changelogs using imperative mood and avoid pronouns. + +See :ref:`describe_changes` for more information, with one amendment: lead with +a short blurb on the actual changes, and then follow up with the context and +background. Note! This order directly conflicts with the tip tree's preferred +approach! Please follow the tip tree's preferred style when sending patches +that primarily target arch/x86 code that is _NOT_ KVM code. + +Stating what a patch does before diving into details is preferred by KVM x86 +for several reasons. First and foremost, what code is actually being changed +is arguably the most important information, and so that info should be easy to +find. Changelogs that bury the "what's actually changing" in a one-liner after +3+ paragraphs of background make it very hard to find that information. + +For initial review, one could argue the "what's broken" is more important, but +for skimming logs and git archaeology, the gory details matter less and less. +E.g. when doing a series of "git blame", the details of each change along the +way are useless, the details only matter for the culprit. Providing the "what +changed" makes it easy to quickly determine whether or not a commit might be of +interest. + +Another benefit of stating "what's changing" first is that it's almost always +possible to state "what's changing" in a single sentence. Conversely, all but +the most simple bugs require multiple sentences or paragraphs to fully describe +the problem. If both the "what's changing" and "what's the bug" are super +short then the order doesn't matter. But if one is shorter (almost always the +"what's changing), then covering the shorter one first is advantageous because +it's less of an inconvenience for readers/reviewers that have a strict ordering +preference. E.g. having to skip one sentence to get to the context is less +painful than having to skip three paragraphs to get to "what's changing". + +Fixes +~~~~~ +If a change fixes a KVM/kernel bug, add a Fixes: tag even if the change doesn't +need to be backported to stable kernels, and even if the change fixes a bug in +an older release. + +Conversely, if a fix does need to be backported, explicitly tag the patch with +"Cc: stable@vger.kernel" (though the email itself doesn't need to Cc: stable); +KVM x86 opts out of backporting Fixes: by default. Some auto-selected patches +do get backported, but require explicit maintainer approval (search MANUALSEL). + +Function References +~~~~~~~~~~~~~~~~~~~ +When a function is mentioned in a comment, changelog, or shortlog (or anywhere +for that matter), use the format ``function_name()``. The parentheses provide +context and disambiguate the reference. + +Testing +------- +At a bare minimum, *all* patches in a series must build cleanly for KVM_INTEL=m +KVM_AMD=m, and KVM_WERROR=y. Building every possible combination of Kconfigs +isn't feasible, but the more the merrier. KVM_SMM, KVM_XEN, PROVE_LOCKING, and +X86_64 are particularly interesting knobs to turn. + +Running KVM selftests and KVM-unit-tests is also mandatory (and stating the +obvious, the tests need to pass). The only exception is for changes that have +negligible probability of affecting runtime behavior, e.g. patches that only +modify comments. When possible and relevant, testing on both Intel and AMD is +strongly preferred. Booting an actual VM is encouraged, but not mandatory. + +For changes that touch KVM's shadow paging code, running with TDP (EPT/NPT) +disabled is mandatory. For changes that affect common KVM MMU code, running +with TDP disabled is strongly encouraged. For all other changes, if the code +being modified depends on and/or interacts with a module param, testing with +the relevant settings is mandatory. + +Note, KVM selftests and KVM-unit-tests do have known failures. If you suspect +a failure is not due to your changes, verify that the *exact same* failure +occurs with and without your changes. + +Changes that touch reStructured Text documentation, i.e. .rst files, must build +htmldocs cleanly, i.e. with no new warnings or errors. + +If you can't fully test a change, e.g. due to lack of hardware, clearly state +what level of testing you were able to do, e.g. in the cover letter. + +New Features +~~~~~~~~~~~~ +With one exception, new features *must* come with test coverage. KVM specific +tests aren't strictly required, e.g. if coverage is provided by running a +sufficiently enabled guest VM, or by running a related kernel selftest in a VM, +but dedicated KVM tests are preferred in all cases. Negative testcases in +particular are mandatory for enabling of new hardware features as error and +exception flows are rarely exercised simply by running a VM. + +The only exception to this rule is if KVM is simply advertising support for a +feature via KVM_GET_SUPPORTED_CPUID, i.e. for instructions/features that KVM +can't prevent a guest from using and for which there is no true enabling. + +Note, "new features" does not just mean "new hardware features"! New features +that can't be well validated using existing KVM selftests and/or KVM-unit-tests +must come with tests. + +Posting new feature development without tests to get early feedback is more +than welcome, but such submissions should be tagged RFC, and the cover letter +should clearly state what type of feedback is requested/expected. Do not abuse +the RFC process; RFCs will typically not receive in-depth review. + +Bug Fixes +~~~~~~~~~ +Except for "obvious" found-by-inspection bugs, fixes must be accompanied by a +reproducer for the bug being fixed. In many cases the reproducer is implicit, +e.g. for build errors and test failures, but it should still be clear to +readers what is broken and how to verify the fix. Some leeway is given for +bugs that are found via non-public workloads/tests, but providing regression +tests for such bugs is strongly preferred. + +In general, regression tests are preferred for any bug that is not trivial to +hit. E.g. even if the bug was originally found by a fuzzer such as syzkaller, +a targeted regression test may be warranted if the bug requires hitting a +one-in-a-million type race condition. + +Note, KVM bugs are rarely urgent *and* non-trivial to reproduce. Ask yourself +if a bug is really truly the end of the world before posting a fix without a +reproducer. + +Posting +------- + +Links +~~~~~ +Do not explicitly reference bug reports, prior versions of a patch/series, etc. +via ``In-Reply-To:`` headers. Using ``In-Reply-To:`` becomes an unholy mess +for large series and/or when the version count gets high, and ``In-Reply-To:`` +is useless for anyone that doesn't have the original message, e.g. if someone +wasn't Cc'd on the bug report or if the list of recipients changes between +versions. + +To link to a bug report, previous version, or anything of interest, use lore +links. For referencing previous version(s), generally speaking do not include +a Link: in the changelog as there is no need to record the history in git, i.e. +put the link in the cover letter or in the section git ignores. Do provide a +formal Link: for bug reports and/or discussions that led to the patch. The +context of why a change was made is highly valuable for future readers. + +Git Base +~~~~~~~~ +If you are using git version 2.9.0 or later (Googlers, this is all of you!), +use ``git format-patch`` with the ``--base`` flag to automatically include the +base tree information in the generated patches. + +Note, ``--base=auto`` works as expected if and only if a branch's upstream is +set to the base topic branch, e.g. it will do the wrong thing if your upstream +is set to your personal repository for backup purposes. An alternative "auto" +solution is to derive the names of your development branches based on their +KVM x86 topic, and feed that into ``--base``. E.g. ``x86/pmu/my_branch_name``, +and then write a small wrapper to extract ``pmu`` from the current branch name +to yield ``--base=x/pmu``, where ``x`` is whatever name your repository uses to +track the KVM x86 remote. + +Co-Posting Tests +~~~~~~~~~~~~~~~~ +KVM selftests that are associated with KVM changes, e.g. regression tests for +bug fixes, should be posted along with the KVM changes as a single series. The +standard kernel rules for bisection apply, i.e. KVM changes that result in test +failures should be ordered after the selftests updates, and vice versa, new +tests that fail due to KVM bugs should be ordered after the KVM fixes. + +KVM-unit-tests should *always* be posted separately. Tools, e.g. b4 am, don't +know that KVM-unit-tests is a separate repository and get confused when patches +in a series apply on different trees. To tie KVM-unit-tests patches back to +KVM patches, first post the KVM changes and then provide a lore Link: to the +KVM patch/series in the KVM-unit-tests patch(es). + +Notifications +------------- +When a patch/series is officially accepted, a notification email will be sent +in reply to the original posting (cover letter for multi-patch series). The +notification will include the tree and topic branch, along with the SHA1s of +the commits of applied patches. + +If a subset of patches is applied, this will be clearly stated in the +notification. Unless stated otherwise, it's implied that any patches in the +series that were not accepted need more work and should be submitted in a new +version. + +If for some reason a patch is dropped after officially being accepted, a reply +will be sent to the notification email explaining why the patch was dropped, as +well as the next steps. + +SHA1 Stability +~~~~~~~~~~~~~~ +SHA1s are not 100% guaranteed to be stable until they land in Linus' tree! A +SHA1 is *usually* stable once a notification has been sent, but things happen. +In most cases, an update to the notification email be provided if an applied +patch's SHA1 changes. However, in some scenarios, e.g. if all KVM x86 branches +need to be rebased, individual notifications will not be given. + +Vulnerabilities +--------------- +Bugs that can be exploited by the guest to attack the host (kernel or +userspace), or that can be exploited by a nested VM to *its* host (L2 attacking +L1), are of particular interest to KVM. Please follow the protocol for +:ref:`securitybugs` if you suspect a bug can lead to an escape, data leak, etc. + diff --git a/MAINTAINERS b/MAINTAINERS index e0ad886d3163..d682bfa2b291 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11420,6 +11420,7 @@ M: Sean Christopherson M: Paolo Bonzini L: kvm@vger.kernel.org S: Supported +P: Documentation/process/maintainer-kvm-x86.rst T: git git://git.kernel.org/pub/scm/virt/kvm/kvm.git F: arch/x86/include/asm/kvm* F: arch/x86/include/asm/svm.h -- cgit v1.2.3