6 files changed, 251 insertions, 35 deletions
diff --git a/arch/x86/hyperv/Makefile b/arch/x86/hyperv/Makefile
index 5d2de10809ae..3a1548054b48 100644
--- a/arch/x86/hyperv/Makefile
+++ b/arch/x86/hyperv/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y			:= hv_init.o mmu.o nested.o irqdomain.o ivm.o
 obj-$(CONFIG_X86_64)	+= hv_apic.o hv_proc.o
+obj-$(CONFIG_HYPERV_VTL_MODE)	+= hv_vtl.o
 
 ifdef CONFIG_X86_64
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)	+= hv_spinlock.o
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index fb8b2c088681..1fbda2f94184 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -96,6 +96,11 @@ static void hv_apic_eoi_write(u32 reg, u32 val)
 	wrmsr(HV_X64_MSR_EOI, val, 0);
 }
 
+static bool cpu_is_self(int cpu)
+{
+	return cpu == smp_processor_id();
+}
+
 /*
  * IPI implementation on Hyper-V.
  */
@@ -128,10 +133,9 @@ static bool __send_ipi_mask_ex(const struct cpumask *mask, int vector,
 	 */
 	if (!cpumask_equal(mask, cpu_present_mask) || exclude_self) {
 		ipi_arg->vp_set.format = HV_GENERIC_SET_SPARSE_4K;
-		if (exclude_self)
-			nr_bank = cpumask_to_vpset_noself(&(ipi_arg->vp_set), mask);
-		else
-			nr_bank = cpumask_to_vpset(&(ipi_arg->vp_set), mask);
+
+		nr_bank = cpumask_to_vpset_skip(&(ipi_arg->vp_set), mask,
+				exclude_self ? cpu_is_self : NULL);
 
 		/*
 		 * 'nr_bank <= 0' means some CPUs in cpumask can't be
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index edbc67ec1f3e..a5f9474f08e1 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -63,7 +63,10 @@ static int hyperv_init_ghcb(void)
 	 * memory boundary and map it here.
 	 */
 	rdmsrl(MSR_AMD64_SEV_ES_GHCB, ghcb_gpa);
-	ghcb_va = memremap(ghcb_gpa, HV_HYP_PAGE_SIZE, MEMREMAP_WB);
+
+	/* Mask out vTOM bit. ioremap_cache() maps decrypted */
+	ghcb_gpa &= ~ms_hyperv.shared_gpa_boundary;
+	ghcb_va = (void *)ioremap_cache(ghcb_gpa, HV_HYP_PAGE_SIZE);
 	if (!ghcb_va)
 		return -ENOMEM;
 
@@ -217,7 +220,7 @@ static int hv_cpu_die(unsigned int cpu)
 	if (hv_ghcb_pg) {
 		ghcb_va = (void **)this_cpu_ptr(hv_ghcb_pg);
 		if (*ghcb_va)
-			memunmap(*ghcb_va);
+			iounmap(*ghcb_va);
 		*ghcb_va = NULL;
 	}
 
diff --git a/arch/x86/hyperv/hv_vtl.c b/arch/x86/hyperv/hv_vtl.c
new file mode 100644
index 000000000000..1ba5d3b99b16
--- /dev/null
+++ b/arch/x86/hyperv/hv_vtl.c
@@ -0,0 +1,227 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023, Microsoft Corporation.
+ *
+ * Author:
+ *   Saurabh Sengar <ssengar@microsoft.com>
+ */
+
+#include <asm/apic.h>
+#include <asm/boot.h>
+#include <asm/desc.h>
+#include <asm/i8259.h>
+#include <asm/mshyperv.h>
+#include <asm/realmode.h>
+
+extern struct boot_params boot_params;
+static struct real_mode_header hv_vtl_real_mode_header;
+
+void __init hv_vtl_init_platform(void)
+{
+	pr_info("Linux runs in Hyper-V Virtual Trust Level\n");
+
+	x86_init.irqs.pre_vector_init = x86_init_noop;
+	x86_init.timers.timer_init = x86_init_noop;
+
+	x86_platform.get_wallclock = get_rtc_noop;
+	x86_platform.set_wallclock = set_rtc_noop;
+	x86_platform.get_nmi_reason = hv_get_nmi_reason;
+
+	x86_platform.legacy.i8042 = X86_LEGACY_I8042_PLATFORM_ABSENT;
+	x86_platform.legacy.rtc = 0;
+	x86_platform.legacy.warm_reset = 0;
+	x86_platform.legacy.reserve_bios_regions = 0;
+	x86_platform.legacy.devices.pnpbios = 0;
+}
+
+static inline u64 hv_vtl_system_desc_base(struct ldttss_desc *desc)
+{
+	return ((u64)desc->base3 << 32) | ((u64)desc->base2 << 24) |
+		(desc->base1 << 16) | desc->base0;
+}
+
+static inline u32 hv_vtl_system_desc_limit(struct ldttss_desc *desc)
+{
+	return ((u32)desc->limit1 << 16) | (u32)desc->limit0;
+}
+
+typedef void (*secondary_startup_64_fn)(void*, void*);
+static void hv_vtl_ap_entry(void)
+{
+	((secondary_startup_64_fn)secondary_startup_64)(&boot_params, &boot_params);
+}
+
+static int hv_vtl_bringup_vcpu(u32 target_vp_index, u64 eip_ignored)
+{
+	u64 status;
+	int ret = 0;
+	struct hv_enable_vp_vtl *input;
+	unsigned long irq_flags;
+
+	struct desc_ptr gdt_ptr;
+	struct desc_ptr idt_ptr;
+
+	struct ldttss_desc *tss;
+	struct ldttss_desc *ldt;
+	struct desc_struct *gdt;
+
+	u64 rsp = current->thread.sp;
+	u64 rip = (u64)&hv_vtl_ap_entry;
+
+	native_store_gdt(&gdt_ptr);
+	store_idt(&idt_ptr);
+
+	gdt = (struct desc_struct *)((void *)(gdt_ptr.address));
+	tss = (struct ldttss_desc *)(gdt + GDT_ENTRY_TSS);
+	ldt = (struct ldttss_desc *)(gdt + GDT_ENTRY_LDT);
+
+	local_irq_save(irq_flags);
+
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+
+	input->partition_id = HV_PARTITION_ID_SELF;
+	input->vp_index = target_vp_index;
+	input->target_vtl.target_vtl = HV_VTL_MGMT;
+
+	/*
+	 * The x86_64 Linux kernel follows the 16-bit -> 32-bit -> 64-bit
+	 * mode transition sequence after waking up an AP with SIPI whose
+	 * vector points to the 16-bit AP startup trampoline code. Here in
+	 * VTL2, we can't perform that sequence as the AP has to start in
+	 * the 64-bit mode.
+	 *
+	 * To make this happen, we tell the hypervisor to load a valid 64-bit
+	 * context (most of which is just magic numbers from the CPU manual)
+	 * so that AP jumps right to the 64-bit entry of the kernel, and the
+	 * control registers are loaded with values that let the AP fetch the
+	 * code and data and carry on with work it gets assigned.
+	 */
+
+	input->vp_context.rip = rip;
+	input->vp_context.rsp = rsp;
+	input->vp_context.rflags = 0x0000000000000002;
+	input->vp_context.efer = __rdmsr(MSR_EFER);
+	input->vp_context.cr0 = native_read_cr0();
+	input->vp_context.cr3 = __native_read_cr3();
+	input->vp_context.cr4 = native_read_cr4();
+	input->vp_context.msr_cr_pat = __rdmsr(MSR_IA32_CR_PAT);
+	input->vp_context.idtr.limit = idt_ptr.size;
+	input->vp_context.idtr.base = idt_ptr.address;
+	input->vp_context.gdtr.limit = gdt_ptr.size;
+	input->vp_context.gdtr.base = gdt_ptr.address;
+
+	/* Non-system desc (64bit), long, code, present */
+	input->vp_context.cs.selector = __KERNEL_CS;
+	input->vp_context.cs.base = 0;
+	input->vp_context.cs.limit = 0xffffffff;
+	input->vp_context.cs.attributes = 0xa09b;
+	/* Non-system desc (64bit), data, present, granularity, default */
+	input->vp_context.ss.selector = __KERNEL_DS;
+	input->vp_context.ss.base = 0;
+	input->vp_context.ss.limit = 0xffffffff;
+	input->vp_context.ss.attributes = 0xc093;
+
+	/* System desc (128bit), present, LDT */
+	input->vp_context.ldtr.selector = GDT_ENTRY_LDT * 8;
+	input->vp_context.ldtr.base = hv_vtl_system_desc_base(ldt);
+	input->vp_context.ldtr.limit = hv_vtl_system_desc_limit(ldt);
+	input->vp_context.ldtr.attributes = 0x82;
+
+	/* System desc (128bit), present, TSS, 0x8b - busy, 0x89 -- default */
+	input->vp_context.tr.selector = GDT_ENTRY_TSS * 8;
+	input->vp_context.tr.base = hv_vtl_system_desc_base(tss);
+	input->vp_context.tr.limit = hv_vtl_system_desc_limit(tss);
+	input->vp_context.tr.attributes = 0x8b;
+
+	status = hv_do_hypercall(HVCALL_ENABLE_VP_VTL, input, NULL);
+
+	if (!hv_result_success(status) &&
+	    hv_result(status) != HV_STATUS_VTL_ALREADY_ENABLED) {
+		pr_err("HVCALL_ENABLE_VP_VTL failed for VP : %d ! [Err: %#llx\n]",
+		       target_vp_index, status);
+		ret = -EINVAL;
+		goto free_lock;
+	}
+
+	status = hv_do_hypercall(HVCALL_START_VP, input, NULL);
+
+	if (!hv_result_success(status)) {
+		pr_err("HVCALL_START_VP failed for VP : %d ! [Err: %#llx]\n",
+		       target_vp_index, status);
+		ret = -EINVAL;
+	}
+
+free_lock:
+	local_irq_restore(irq_flags);
+
+	return ret;
+}
+
+static int hv_vtl_apicid_to_vp_id(u32 apic_id)
+{
+	u64 control;
+	u64 status;
+	unsigned long irq_flags;
+	struct hv_get_vp_from_apic_id_in *input;
+	u32 *output, ret;
+
+	local_irq_save(irq_flags);
+
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+	memset(input, 0, sizeof(*input));
+	input->partition_id = HV_PARTITION_ID_SELF;
+	input->apic_ids[0] = apic_id;
+
+	output = (u32 *)input;
+
+	control = HV_HYPERCALL_REP_COMP_1 | HVCALL_GET_VP_ID_FROM_APIC_ID;
+	status = hv_do_hypercall(control, input, output);
+	ret = output[0];
+
+	local_irq_restore(irq_flags);
+
+	if (!hv_result_success(status)) {
+		pr_err("failed to get vp id from apic id %d, status %#llx\n",
+		       apic_id, status);
+		return -EINVAL;
+	}
+
+	return ret;
+}
+
+static int hv_vtl_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
+{
+	int vp_id;
+
+	pr_debug("Bringing up CPU with APIC ID %d in VTL2...\n", apicid);
+	vp_id = hv_vtl_apicid_to_vp_id(apicid);
+
+	if (vp_id < 0) {
+		pr_err("Couldn't find CPU with APIC ID %d\n", apicid);
+		return -EINVAL;
+	}
+	if (vp_id > ms_hyperv.max_vp_index) {
+		pr_err("Invalid CPU id %d for APIC ID %d\n", vp_id, apicid);
+		return -EINVAL;
+	}
+
+	return hv_vtl_bringup_vcpu(vp_id, start_eip);
+}
+
+static int __init hv_vtl_early_init(void)
+{
+	/*
+	 * `boot_cpu_has` returns the runtime feature support,
+	 * and here is the earliest it can be used.
+	 */
+	if (cpu_feature_enabled(X86_FEATURE_XSAVE))
+		panic("XSAVE has to be disabled as it is not supported by this module.\n"
+			  "Please add 'noxsave' to the kernel command line.\n");
+
+	real_mode_header = &hv_vtl_real_mode_header;
+	apic->wakeup_secondary_cpu_64 = hv_vtl_wakeup_secondary_cpu;
+
+	return 0;
+}
+early_initcall(hv_vtl_early_init);
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index f6a020cb1a24..127d5b7b63de 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -376,34 +376,6 @@ void __init hv_vtom_init(void)
 
 #endif /* CONFIG_AMD_MEM_ENCRYPT */
 
-/*
- * hv_map_memory - map memory to extra space in the AMD SEV-SNP Isolation VM.
- */
-void *hv_map_memory(void *addr, unsigned long size)
-{
-	unsigned long *pfns = kcalloc(size / PAGE_SIZE,
-				      sizeof(unsigned long), GFP_KERNEL);
-	void *vaddr;
-	int i;
-
-	if (!pfns)
-		return NULL;
-
-	for (i = 0; i < size / PAGE_SIZE; i++)
-		pfns[i] = vmalloc_to_pfn(addr + i * PAGE_SIZE) +
-			(ms_hyperv.shared_gpa_boundary >> PAGE_SHIFT);
-
-	vaddr = vmap_pfn(pfns, size / PAGE_SIZE, pgprot_decrypted(PAGE_KERNEL));
-	kfree(pfns);
-
-	return vaddr;
-}
-
-void hv_unmap_memory(void *addr)
-{
-	vunmap(addr);
-}
-
 enum hv_isolation_type hv_get_isolation_type(void)
 {
 	if (!(ms_hyperv.priv_high & HV_ISOLATION))
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 0ad2378fe6ad..8460bd35e10c 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -52,6 +52,11 @@ static inline int fill_gva_list(u64 gva_list[], int offset,
 	return gva_n - offset;
 }
 
+static bool cpu_is_lazy(int cpu)
+{
+	return per_cpu(cpu_tlbstate_shared.is_lazy, cpu);
+}
+
 static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
 				   const struct flush_tlb_info *info)
 {
@@ -60,6 +65,7 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
 	struct hv_tlb_flush *flush;
 	u64 status;
 	unsigned long flags;
+	bool do_lazy = !info->freed_tables;
 
 	trace_hyperv_mmu_flush_tlb_multi(cpus, info);
 
@@ -112,6 +118,8 @@ static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
 			goto do_ex_hypercall;
 
 		for_each_cpu(cpu, cpus) {
+			if (do_lazy && cpu_is_lazy(cpu))
+				continue;
 			vcpu = hv_cpu_number_to_vp_number(cpu);
 			if (vcpu == VP_INVAL) {
 				local_irq_restore(flags);
@@ -198,7 +206,8 @@ static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
 	flush->hv_vp_set.valid_bank_mask = 0;
 
 	flush->hv_vp_set.format = HV_GENERIC_SET_SPARSE_4K;
-	nr_bank = cpumask_to_vpset(&(flush->hv_vp_set), cpus);
+	nr_bank = cpumask_to_vpset_skip(&flush->hv_vp_set, cpus,
+			info->freed_tables ? NULL : cpu_is_lazy);
 	if (nr_bank < 0)
 		return HV_STATUS_INVALID_PARAMETER;