Merge tag 'kvm-3.10-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm updates from Gleb Natapov:
 "Highlights of the updates are:

  general:
   - new emulated device API
   - legacy device assignment is now optional
   - irqfd interface is more generic and can be shared between arches

  x86:
   - VMCS shadow support and other nested VMX improvements
   - APIC virtualization and Posted Interrupt hardware support
   - Optimize mmio spte zapping

  ppc:
   - BookE: in-kernel MPIC emulation with irqfd support
   - Book3S: in-kernel XICS emulation (incomplete)
   - Book3S: HV: migration fixes
   - BookE: more debug support preparation
   - BookE: e6500 support

  ARM:
   - reworking of Hyp idmaps

  s390:
   - ioeventfd for virtio-ccw

  And many other bug fixes, cleanups and improvements"

* tag 'kvm-3.10-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (204 commits)
  kvm: Add compat_ioctl for device control API
  KVM: x86: Account for failing enable_irq_window for NMI window request
  KVM: PPC: Book3S: Add API for in-kernel XICS emulation
  kvm/ppc/mpic: fix missing unlock in set_base_addr()
  kvm/ppc: Hold srcu lock when calling kvm_io_bus_read/write
  kvm/ppc/mpic: remove users
  kvm/ppc/mpic: fix mmio region lists when multiple guests used
  kvm/ppc/mpic: remove default routes from documentation
  kvm: KVM_CAP_IOMMU only available with device assignment
  ARM: KVM: iterate over all CPUs for CPU compatibility check
  KVM: ARM: Fix spelling in error message
  ARM: KVM: define KVM_ARM_MAX_VCPUS unconditionally
  KVM: ARM: Fix API documentation for ONE_REG encoding
  ARM: KVM: promote vfp_host pointer to generic host cpu context
  ARM: KVM: add architecture specific hook for capabilities
  ARM: KVM: perform HYP initilization for hotplugged CPUs
  ARM: KVM: switch to a dual-step HYP init code
  ARM: KVM: rework HYP page table freeing
  ARM: KVM: enforce maximum size for identity mapped code
  ARM: KVM: move to a KVM provided HYP idmap
  ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2013-05-05 23:47:31 +0200
committer: Linus Torvalds <torvalds@linux-foundation.org> 2013-05-05 23:47:31 +0200
commit: 01227a889ed56ae53aeebb9f93be9d54dd8b2de8 (patch)
tree: d5eba9359a9827e84d4112b84d48c54df5c5acde /arch/arm/kvm
parent: Give the OID registry file module info to avoid kernel tainting (diff)
parent: kvm: Add compat_ioctl for device control API (diff)
download: linux-01227a889ed56ae53aeebb9f93be9d54dd8b2de8.tar.xz
linux-01227a889ed56ae53aeebb9f93be9d54dd8b2de8.zip
7 files changed, 467 insertions, 278 deletions
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index 49dd64e579c2..370e1a8af6ac 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -41,9 +41,9 @@ config KVM_ARM_HOST
 	  Provides host support for ARM processors.
 
 config KVM_ARM_MAX_VCPUS
-	int "Number maximum supported virtual CPUs per VM"
-	depends on KVM_ARM_HOST
-	default 4
+	int "Number maximum supported virtual CPUs per VM" if KVM_ARM_HOST
+	default 4 if KVM_ARM_HOST
+	default 0
 	help
 	  Static number of max supported virtual CPUs per VM.
 
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index 8dc5e76cb789..53c5ed83d16f 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -18,6 +18,6 @@ kvm-arm-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
 
 obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
-obj-y += coproc.o coproc_a15.o mmio.o psci.o
+obj-y += coproc.o coproc_a15.o mmio.o psci.o perf.o
 obj-$(CONFIG_KVM_ARM_VGIC) += vgic.o
 obj-$(CONFIG_KVM_ARM_TIMER) += arch_timer.o
diff --git a/arch/arm/kvm/arch_timer.c b/arch/arm/kvm/arch_timer.c
index 6ac938d46297..c55b6089e923 100644
--- a/arch/arm/kvm/arch_timer.c
+++ b/arch/arm/kvm/arch_timer.c
@@ -22,6 +22,7 @@
 #include <linux/kvm_host.h>
 #include <linux/interrupt.h>
 
+#include <clocksource/arm_arch_timer.h>
 #include <asm/arch_timer.h>
 
 #include <asm/kvm_vgic.h>
@@ -64,7 +65,7 @@ static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 
-	timer->cntv_ctl |= 1 << 1; /* Mask the interrupt in the guest */
+	timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK;
 	kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
 			    vcpu->arch.timer_cpu.irq->irq,
 			    vcpu->arch.timer_cpu.irq->level);
@@ -133,8 +134,8 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
 	cycle_t cval, now;
 	u64 ns;
 
-	/* Check if the timer is enabled and unmasked first */
-	if ((timer->cntv_ctl & 3) != 1)
+	if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
+		!(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE))
 		return;
 
 	cval = timer->cntv_cval;
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index a0dfc2a53f91..37d216d814cd 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -16,6 +16,7 @@
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  */
 
+#include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kvm_host.h>
@@ -48,7 +49,7 @@ __asm__(".arch_extension	virt");
 #endif
 
 static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page);
-static kvm_kernel_vfp_t __percpu *kvm_host_vfp_state;
+static kvm_cpu_context_t __percpu *kvm_host_cpu_state;
 static unsigned long hyp_default_vectors;
 
 /* Per-CPU variable containing the currently running vcpu. */
@@ -206,7 +207,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 		r = KVM_MAX_VCPUS;
 		break;
 	default:
-		r = 0;
+		r = kvm_arch_dev_ioctl_check_extension(ext);
 		break;
 	}
 	return r;
@@ -218,27 +219,18 @@ long kvm_arch_dev_ioctl(struct file *filp,
 	return -EINVAL;
 }
 
-int kvm_arch_set_memory_region(struct kvm *kvm,
-			       struct kvm_userspace_memory_region *mem,
-			       struct kvm_memory_slot old,
-			       int user_alloc)
-{
-	return 0;
-}
-
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot,
-				   struct kvm_memory_slot old,
 				   struct kvm_userspace_memory_region *mem,
-				   bool user_alloc)
+				   enum kvm_mr_change change)
 {
 	return 0;
 }
 
 void kvm_arch_commit_memory_region(struct kvm *kvm,
 				   struct kvm_userspace_memory_region *mem,
-				   struct kvm_memory_slot old,
-				   bool user_alloc)
+				   const struct kvm_memory_slot *old,
+				   enum kvm_mr_change change)
 {
 }
 
@@ -326,7 +318,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	vcpu->cpu = cpu;
-	vcpu->arch.vfp_host = this_cpu_ptr(kvm_host_vfp_state);
+	vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state);
 
 	/*
 	 * Check whether this vcpu requires the cache to be flushed on
@@ -639,7 +631,8 @@ static int vcpu_interrupt_line(struct kvm_vcpu *vcpu, int number, bool level)
 	return 0;
 }
 
-int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level)
+int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
+			  bool line_status)
 {
 	u32 irq = irq_level->irq;
 	unsigned int irq_type, vcpu_idx, irq_num;
@@ -794,30 +787,48 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 }
 
-static void cpu_init_hyp_mode(void *vector)
+static void cpu_init_hyp_mode(void *dummy)
 {
+	unsigned long long boot_pgd_ptr;
 	unsigned long long pgd_ptr;
 	unsigned long hyp_stack_ptr;
 	unsigned long stack_page;
 	unsigned long vector_ptr;
 
 	/* Switch from the HYP stub to our own HYP init vector */
-	__hyp_set_vectors((unsigned long)vector);
+	__hyp_set_vectors(kvm_get_idmap_vector());
 
+	boot_pgd_ptr = (unsigned long long)kvm_mmu_get_boot_httbr();
 	pgd_ptr = (unsigned long long)kvm_mmu_get_httbr();
 	stack_page = __get_cpu_var(kvm_arm_hyp_stack_page);
 	hyp_stack_ptr = stack_page + PAGE_SIZE;
 	vector_ptr = (unsigned long)__kvm_hyp_vector;
 
-	__cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
+	__cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+}
+
+static int hyp_init_cpu_notify(struct notifier_block *self,
+			       unsigned long action, void *cpu)
+{
+	switch (action) {
+	case CPU_STARTING:
+	case CPU_STARTING_FROZEN:
+		cpu_init_hyp_mode(NULL);
+		break;
+	}
+
+	return NOTIFY_OK;
 }
 
+static struct notifier_block hyp_init_cpu_nb = {
+	.notifier_call = hyp_init_cpu_notify,
+};
+
 /**
  * Inits Hyp-mode on all online CPUs
  */
 static int init_hyp_mode(void)
 {
-	phys_addr_t init_phys_addr;
 	int cpu;
 	int err = 0;
 
@@ -850,24 +861,6 @@ static int init_hyp_mode(void)
 	}
 
 	/*
-	 * Execute the init code on each CPU.
-	 *
-	 * Note: The stack is not mapped yet, so don't do anything else than
-	 * initializing the hypervisor mode on each CPU using a local stack
-	 * space for temporary storage.
-	 */
-	init_phys_addr = virt_to_phys(__kvm_hyp_init);
-	for_each_online_cpu(cpu) {
-		smp_call_function_single(cpu, cpu_init_hyp_mode,
-					 (void *)(long)init_phys_addr, 1);
-	}
-
-	/*
-	 * Unmap the identity mapping
-	 */
-	kvm_clear_hyp_idmap();
-
-	/*
 	 * Map the Hyp-code called directly from the host
 	 */
 	err = create_hyp_mappings(__kvm_hyp_code_start, __kvm_hyp_code_end);
@@ -890,33 +883,38 @@ static int init_hyp_mode(void)
 	}
 
 	/*
-	 * Map the host VFP structures
+	 * Map the host CPU structures
 	 */
-	kvm_host_vfp_state = alloc_percpu(kvm_kernel_vfp_t);
-	if (!kvm_host_vfp_state) {
+	kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t);
+	if (!kvm_host_cpu_state) {
 		err = -ENOMEM;
-		kvm_err("Cannot allocate host VFP state\n");
+		kvm_err("Cannot allocate host CPU state\n");
 		goto out_free_mappings;
 	}
 
 	for_each_possible_cpu(cpu) {
-		kvm_kernel_vfp_t *vfp;
+		kvm_cpu_context_t *cpu_ctxt;
 
-		vfp = per_cpu_ptr(kvm_host_vfp_state, cpu);
-		err = create_hyp_mappings(vfp, vfp + 1);
+		cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu);
+		err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1);
 
 		if (err) {
-			kvm_err("Cannot map host VFP state: %d\n", err);
-			goto out_free_vfp;
+			kvm_err("Cannot map host CPU state: %d\n", err);
+			goto out_free_context;
 		}
 	}
 
 	/*
+	 * Execute the init code on each CPU.
+	 */
+	on_each_cpu(cpu_init_hyp_mode, NULL, 1);
+
+	/*
 	 * Init HYP view of VGIC
 	 */
 	err = kvm_vgic_hyp_init();
 	if (err)
-		goto out_free_vfp;
+		goto out_free_context;
 
 #ifdef CONFIG_KVM_ARM_VGIC
 		vgic_present = true;
@@ -929,12 +927,19 @@ static int init_hyp_mode(void)
 	if (err)
 		goto out_free_mappings;
 
+#ifndef CONFIG_HOTPLUG_CPU
+	free_boot_hyp_pgd();
+#endif
+
+	kvm_perf_init();
+
 	kvm_info("Hyp mode initialized successfully\n");
+
 	return 0;
-out_free_vfp:
-	free_percpu(kvm_host_vfp_state);
+out_free_context:
+	free_percpu(kvm_host_cpu_state);
 out_free_mappings:
-	free_hyp_pmds();
+	free_hyp_pgds();
 out_free_stack_pages:
 	for_each_possible_cpu(cpu)
 		free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
@@ -943,27 +948,42 @@ out_err:
 	return err;
 }
 
+static void check_kvm_target_cpu(void *ret)
+{
+	*(int *)ret = kvm_target_cpu();
+}
+
 /**
  * Initialize Hyp-mode and memory mappings on all CPUs.
  */
 int kvm_arch_init(void *opaque)
 {
 	int err;
+	int ret, cpu;
 
 	if (!is_hyp_mode_available()) {
 		kvm_err("HYP mode not available\n");
 		return -ENODEV;
 	}
 
-	if (kvm_target_cpu() < 0) {
-		kvm_err("Target CPU not supported!\n");
-		return -ENODEV;
+	for_each_online_cpu(cpu) {
+		smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1);
+		if (ret < 0) {
+			kvm_err("Error, CPU %d not supported!\n", cpu);
+			return -ENODEV;
+		}
 	}
 
 	err = init_hyp_mode();
 	if (err)
 		goto out_err;
 
+	err = register_cpu_notifier(&hyp_init_cpu_nb);
+	if (err) {
+		kvm_err("Cannot register HYP init CPU notifier (%d)\n", err);
+		goto out_err;
+	}
+
 	kvm_coproc_table_init();
 	return 0;
 out_err:
@@ -973,6 +993,7 @@ out_err:
 /* NOP: Compiling as a module not supported */
 void kvm_arch_exit(void)
 {
+	kvm_perf_teardown();
 }
 
 static int arm_init(void)
diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S
index 9f37a79b880b..f048338135f7 100644
--- a/arch/arm/kvm/init.S
+++ b/arch/arm/kvm/init.S
@@ -21,13 +21,33 @@
 #include <asm/asm-offsets.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
 
 /********************************************************************
  * Hypervisor initialization
  *   - should be called with:
- *       r0,r1 = Hypervisor pgd pointer
- *       r2 = top of Hyp stack (kernel VA)
- *       r3 = pointer to hyp vectors
+ *       r0 = top of Hyp stack (kernel VA)
+ *       r1 = pointer to hyp vectors
+ *       r2,r3 = Hypervisor pgd pointer
+ *
+ * The init scenario is:
+ * - We jump in HYP with four parameters: boot HYP pgd, runtime HYP pgd,
+ *   runtime stack, runtime vectors
+ * - Enable the MMU with the boot pgd
+ * - Jump to a target into the trampoline page (remember, this is the same
+ *   physical page!)
+ * - Now switch to the runtime pgd (same VA, and still the same physical
+ *   page!)
+ * - Invalidate TLBs
+ * - Set stack and vectors
+ * - Profit! (or eret, if you only care about the code).
+ *
+ * As we only have four registers available to pass parameters (and we
+ * need six), we split the init in two phases:
+ * - Phase 1: r0 = 0, r1 = 0, r2,r3 contain the boot PGD.
+ *   Provides the basic HYP init, and enables the MMU.
+ * - Phase 2: r0 = ToS, r1 = vectors, r2,r3 contain the runtime PGD.
+ *   Switches to the runtime PGD, sets stack and vectors.
  */
 
 	.text
@@ -47,22 +67,25 @@ __kvm_hyp_init:
 	W(b)	.
 
 __do_hyp_init:
+	cmp	r0, #0			@ We have a SP?
+	bne	phase2			@ Yes, second stage init
+
 	@ Set the HTTBR to point to the hypervisor PGD pointer passed
-	mcrr	p15, 4, r0, r1, c2
+	mcrr	p15, 4, r2, r3, c2
 
 	@ Set the HTCR and VTCR to the same shareability and cacheability
 	@ settings as the non-secure TTBCR and with T0SZ == 0.
 	mrc	p15, 4, r0, c2, c0, 2	@ HTCR
-	ldr	r12, =HTCR_MASK
-	bic	r0, r0, r12
+	ldr	r2, =HTCR_MASK
+	bic	r0, r0, r2
 	mrc	p15, 0, r1, c2, c0, 2	@ TTBCR
 	and	r1, r1, #(HTCR_MASK & ~TTBCR_T0SZ)
 	orr	r0, r0, r1
 	mcr	p15, 4, r0, c2, c0, 2	@ HTCR
 
 	mrc	p15, 4, r1, c2, c1, 2	@ VTCR
-	ldr	r12, =VTCR_MASK
-	bic	r1, r1, r12
+	ldr	r2, =VTCR_MASK
+	bic	r1, r1, r2
 	bic	r0, r0, #(~VTCR_HTCR_SH)	@ clear non-reusable HTCR bits
 	orr	r1, r0, r1
 	orr	r1, r1, #(KVM_VTCR_SL0 | KVM_VTCR_T0SZ | KVM_VTCR_S)
@@ -85,24 +108,41 @@ __do_hyp_init:
 	@  - Memory alignment checks: enabled
 	@  - MMU: enabled (this code must be run from an identity mapping)
 	mrc	p15, 4, r0, c1, c0, 0	@ HSCR
-	ldr	r12, =HSCTLR_MASK
-	bic	r0, r0, r12
+	ldr	r2, =HSCTLR_MASK
+	bic	r0, r0, r2
 	mrc	p15, 0, r1, c1, c0, 0	@ SCTLR
-	ldr	r12, =(HSCTLR_EE | HSCTLR_FI | HSCTLR_I | HSCTLR_C)
-	and	r1, r1, r12
- ARM(	ldr	r12, =(HSCTLR_M | HSCTLR_A)			)
- THUMB(	ldr	r12, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE)		)
-	orr	r1, r1, r12
+	ldr	r2, =(HSCTLR_EE | HSCTLR_FI | HSCTLR_I | HSCTLR_C)
+	and	r1, r1, r2
+ ARM(	ldr	r2, =(HSCTLR_M | HSCTLR_A)			)
+ THUMB(	ldr	r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE)		)
+	orr	r1, r1, r2
 	orr	r0, r0, r1
 	isb
 	mcr	p15, 4, r0, c1, c0, 0	@ HSCR
-	isb
 
-	@ Set stack pointer and return to the kernel
-	mov	sp, r2
+	@ End of init phase-1
+	eret
+
+phase2:
+	@ Set stack pointer
+	mov	sp, r0
 
 	@ Set HVBAR to point to the HYP vectors
-	mcr	p15, 4, r3, c12, c0, 0	@ HVBAR
+	mcr	p15, 4, r1, c12, c0, 0	@ HVBAR
+
+	@ Jump to the trampoline page
+	ldr	r0, =TRAMPOLINE_VA
+	adr	r1, target
+	bfi	r0, r1, #0, #PAGE_SHIFT
+	mov	pc, r0
+
+target:	@ We're now in the trampoline code, switch page tables
+	mcrr	p15, 4, r2, r3, c2
+	isb
+
+	@ Invalidate the old TLBs
+	mcr	p15, 4, r0, c8, c7, 0	@ TLBIALLH
+	dsb
 
 	eret
 
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 2f12e4056408..965706578f13 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -32,8 +32,15 @@
 
 extern char  __hyp_idmap_text_start[], __hyp_idmap_text_end[];
 
+static pgd_t *boot_hyp_pgd;
+static pgd_t *hyp_pgd;
 static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
 
+static void *init_bounce_page;
+static unsigned long hyp_idmap_start;
+static unsigned long hyp_idmap_end;
+static phys_addr_t hyp_idmap_vector;
+
 static void kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {
 	kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);
@@ -71,172 +78,224 @@ static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc)
 	return p;
 }
 
-static void free_ptes(pmd_t *pmd, unsigned long addr)
+static void clear_pud_entry(pud_t *pud)
 {
-	pte_t *pte;
-	unsigned int i;
+	pmd_t *pmd_table = pmd_offset(pud, 0);
+	pud_clear(pud);
+	pmd_free(NULL, pmd_table);
+	put_page(virt_to_page(pud));
+}
 
-	for (i = 0; i < PTRS_PER_PMD; i++, addr += PMD_SIZE) {
-		if (!pmd_none(*pmd) && pmd_table(*pmd)) {
-			pte = pte_offset_kernel(pmd, addr);
-			pte_free_kernel(NULL, pte);
-		}
-		pmd++;
+static void clear_pmd_entry(pmd_t *pmd)
+{
+	pte_t *pte_table = pte_offset_kernel(pmd, 0);
+	pmd_clear(pmd);
+	pte_free_kernel(NULL, pte_table);
+	put_page(virt_to_page(pmd));
+}
+
+static bool pmd_empty(pmd_t *pmd)
+{
+	struct page *pmd_page = virt_to_page(pmd);
+	return page_count(pmd_page) == 1;
+}
+
+static void clear_pte_entry(pte_t *pte)
+{
+	if (pte_present(*pte)) {
+		kvm_set_pte(pte, __pte(0));
+		put_page(virt_to_page(pte));
 	}
 }
 
-static void free_hyp_pgd_entry(unsigned long addr)
+static bool pte_empty(pte_t *pte)
+{
+	struct page *pte_page = virt_to_page(pte);
+	return page_count(pte_page) == 1;
+}
+
+static void unmap_range(pgd_t *pgdp, unsigned long long start, u64 size)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
-	unsigned long hyp_addr = KERN_TO_HYP(addr);
+	pte_t *pte;
+	unsigned long long addr = start, end = start + size;
+	u64 range;
+
+	while (addr < end) {
+		pgd = pgdp + pgd_index(addr);
+		pud = pud_offset(pgd, addr);
+		if (pud_none(*pud)) {
+			addr += PUD_SIZE;
+			continue;
+		}
 
-	pgd = hyp_pgd + pgd_index(hyp_addr);
-	pud = pud_offset(pgd, hyp_addr);
+		pmd = pmd_offset(pud, addr);
+		if (pmd_none(*pmd)) {
+			addr += PMD_SIZE;
+			continue;
+		}
 
-	if (pud_none(*pud))
-		return;
-	BUG_ON(pud_bad(*pud));
+		pte = pte_offset_kernel(pmd, addr);
+		clear_pte_entry(pte);
+		range = PAGE_SIZE;
 
-	pmd = pmd_offset(pud, hyp_addr);
-	free_ptes(pmd, addr);
-	pmd_free(NULL, pmd);
-	pud_clear(pud);
+		/* If we emptied the pte, walk back up the ladder */
+		if (pte_empty(pte)) {
+			clear_pmd_entry(pmd);
+			range = PMD_SIZE;
+			if (pmd_empty(pmd)) {
+				clear_pud_entry(pud);
+				range = PUD_SIZE;
+			}
+		}
+
+		addr += range;
+	}
 }
 
 /**
- * free_hyp_pmds - free a Hyp-mode level-2 tables and child level-3 tables
+ * free_boot_hyp_pgd - free HYP boot page tables
  *
- * Assumes this is a page table used strictly in Hyp-mode and therefore contains
- * either mappings in the kernel memory area (above PAGE_OFFSET), or
- * device mappings in the vmalloc range (from VMALLOC_START to VMALLOC_END).
+ * Free the HYP boot page tables. The bounce page is also freed.
  */
-void free_hyp_pmds(void)
+void free_boot_hyp_pgd(void)
 {
-	unsigned long addr;
-
 	mutex_lock(&kvm_hyp_pgd_mutex);
-	for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
-		free_hyp_pgd_entry(addr);
-	for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
-		free_hyp_pgd_entry(addr);
+
+	if (boot_hyp_pgd) {
+		unmap_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
+		unmap_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+		kfree(boot_hyp_pgd);
+		boot_hyp_pgd = NULL;
+	}
+
+	if (hyp_pgd)
+		unmap_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
+
+	kfree(init_bounce_page);
+	init_bounce_page = NULL;
+
 	mutex_unlock(&kvm_hyp_pgd_mutex);
 }
 
-static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
-				    unsigned long end)
+/**
+ * free_hyp_pgds - free Hyp-mode page tables
+ *
+ * Assumes hyp_pgd is a page table used strictly in Hyp-mode and
+ * therefore contains either mappings in the kernel memory area (above
+ * PAGE_OFFSET), or device mappings in the vmalloc range (from
+ * VMALLOC_START to VMALLOC_END).
+ *
+ * boot_hyp_pgd should only map two pages for the init code.
+ */
+void free_hyp_pgds(void)
 {
-	pte_t *pte;
 	unsigned long addr;
-	struct page *page;
 
-	for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
-		unsigned long hyp_addr = KERN_TO_HYP(addr);
+	free_boot_hyp_pgd();
+
+	mutex_lock(&kvm_hyp_pgd_mutex);
 
-		pte = pte_offset_kernel(pmd, hyp_addr);
-		BUG_ON(!virt_addr_valid(addr));
-		page = virt_to_page(addr);
-		kvm_set_pte(pte, mk_pte(page, PAGE_HYP));
+	if (hyp_pgd) {
+		for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
+			unmap_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+		for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
+			unmap_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+		kfree(hyp_pgd);
+		hyp_pgd = NULL;
 	}
+
+	mutex_unlock(&kvm_hyp_pgd_mutex);
 }
 
-static void create_hyp_io_pte_mappings(pmd_t *pmd, unsigned long start,
-				       unsigned long end,
-				       unsigned long *pfn_base)
+static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
+				    unsigned long end, unsigned long pfn,
+				    pgprot_t prot)
 {
 	pte_t *pte;
 	unsigned long addr;
 
-	for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
-		unsigned long hyp_addr = KERN_TO_HYP(addr);
-
-		pte = pte_offset_kernel(pmd, hyp_addr);
-		BUG_ON(pfn_valid(*pfn_base));
-		kvm_set_pte(pte, pfn_pte(*pfn_base, PAGE_HYP_DEVICE));
-		(*pfn_base)++;
-	}
+	addr = start;
+	do {
+		pte = pte_offset_kernel(pmd, addr);
+		kvm_set_pte(pte, pfn_pte(pfn, prot));
+		get_page(virt_to_page(pte));
+		kvm_flush_dcache_to_poc(pte, sizeof(*pte));
+		pfn++;
+	} while (addr += PAGE_SIZE, addr != end);
 }
 
 static int create_hyp_pmd_mappings(pud_t *pud, unsigned long start,
-				   unsigned long end, unsigned long *pfn_base)
+				   unsigned long end, unsigned long pfn,
+				   pgprot_t prot)
 {
 	pmd_t *pmd;
 	pte_t *pte;
 	unsigned long addr, next;
 
-	for (addr = start; addr < end; addr = next) {
-		unsigned long hyp_addr = KERN_TO_HYP(addr);
-		pmd = pmd_offset(pud, hyp_addr);
+	addr = start;
+	do {
+		pmd = pmd_offset(pud, addr);
 
 		BUG_ON(pmd_sect(*pmd));
 
 		if (pmd_none(*pmd)) {
-			pte = pte_alloc_one_kernel(NULL, hyp_addr);
+			pte = pte_alloc_one_kernel(NULL, addr);
 			if (!pte) {
 				kvm_err("Cannot allocate Hyp pte\n");
 				return -ENOMEM;
 			}
 			pmd_populate_kernel(NULL, pmd, pte);
+			get_page(virt_to_page(pmd));
+			kvm_flush_dcache_to_poc(pmd, sizeof(*pmd));
 		}
 
 		next = pmd_addr_end(addr, end);
 
-		/*
-		 * If pfn_base is NULL, we map kernel pages into HYP with the
-		 * virtual address. Otherwise, this is considered an I/O
-		 * mapping and we map the physical region starting at
-		 * *pfn_base to [start, end[.
-		 */
-		if (!pfn_base)
-			create_hyp_pte_mappings(pmd, addr, next);
-		else
-			create_hyp_io_pte_mappings(pmd, addr, next, pfn_base);
-	}
+		create_hyp_pte_mappings(pmd, addr, next, pfn, prot);
+		pfn += (next - addr) >> PAGE_SHIFT;
+	} while (addr = next, addr != end);
 
 	return 0;
 }
 
-static int __create_hyp_mappings(void *from, void *to, unsigned long *pfn_base)
+static int __create_hyp_mappings(pgd_t *pgdp,
+				 unsigned long start, unsigned long end,
+				 unsigned long pfn, pgprot_t prot)
 {
-	unsigned long start = (unsigned long)from;
-	unsigned long end = (unsigned long)to;
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	unsigned long addr, next;
 	int err = 0;
 
-	if (start >= end)
-		return -EINVAL;
-	/* Check for a valid kernel memory mapping */
-	if (!pfn_base && (!virt_addr_valid(from) || !virt_addr_valid(to - 1)))
-		return -EINVAL;
-	/* Check for a valid kernel IO mapping */
-	if (pfn_base && (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1)))
-		return -EINVAL;
-
 	mutex_lock(&kvm_hyp_pgd_mutex);
-	for (addr = start; addr < end; addr = next) {
-		unsigned long hyp_addr = KERN_TO_HYP(addr);
-		pgd = hyp_pgd + pgd_index(hyp_addr);
-		pud = pud_offset(pgd, hyp_addr);
+	addr = start & PAGE_MASK;
+	end = PAGE_ALIGN(end);
+	do {
+		pgd = pgdp + pgd_index(addr);
+		pud = pud_offset(pgd, addr);
 
 		if (pud_none_or_clear_bad(pud)) {
-			pmd = pmd_alloc_one(NULL, hyp_addr);
+			pmd = pmd_alloc_one(NULL, addr);
 			if (!pmd) {
 				kvm_err("Cannot allocate Hyp pmd\n");
 				err = -ENOMEM;
 				goto out;
 			}
 			pud_populate(NULL, pud, pmd);
+			get_page(virt_to_page(pud));
+			kvm_flush_dcache_to_poc(pud, sizeof(*pud));
 		}
 
 		next = pgd_addr_end(addr, end);
-		err = create_hyp_pmd_mappings(pud, addr, next, pfn_base);
+		err = create_hyp_pmd_mappings(pud, addr, next, pfn, prot);
 		if (err)
 			goto out;
-	}
+		pfn += (next - addr) >> PAGE_SHIFT;
+	} while (addr = next, addr != end);
 out:
 	mutex_unlock(&kvm_hyp_pgd_mutex);
 	return err;
@@ -250,27 +309,41 @@ out:
  * The same virtual address as the kernel virtual address is also used
  * in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
  * physical pages.
- *
- * Note: Wrapping around zero in the "to" address is not supported.
  */
 int create_hyp_mappings(void *from, void *to)
 {
-	return __create_hyp_mappings(from, to, NULL);
+	unsigned long phys_addr = virt_to_phys(from);
+	unsigned long start = KERN_TO_HYP((unsigned long)from);
+	unsigned long end = KERN_TO_HYP((unsigned long)to);
+
+	/* Check for a valid kernel memory mapping */
+	if (!virt_addr_valid(from) || !virt_addr_valid(to - 1))
+		return -EINVAL;
+
+	return __create_hyp_mappings(hyp_pgd, start, end,
+				     __phys_to_pfn(phys_addr), PAGE_HYP);
 }
 
 /**
  * create_hyp_io_mappings - duplicate a kernel IO mapping into Hyp mode
  * @from:	The kernel start VA of the range
  * @to:		The kernel end VA of the range (exclusive)
- * @addr:	The physical start address which gets mapped
+ * @phys_addr:	The physical start address which gets mapped
  *
  * The resulting HYP VA is the same as the kernel VA, modulo
  * HYP_PAGE_OFFSET.
  */
-int create_hyp_io_mappings(void *from, void *to, phys_addr_t addr)
+int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
 {
-	unsigned long pfn = __phys_to_pfn(addr);
-	return __create_hyp_mappings(from, to, &pfn);
+	unsigned long start = KERN_TO_HYP((unsigned long)from);
+	unsigned long end = KERN_TO_HYP((unsigned long)to);
+
+	/* Check for a valid kernel IO mapping */
+	if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
+		return -EINVAL;
+
+	return __create_hyp_mappings(hyp_pgd, start, end,
+				     __phys_to_pfn(phys_addr), PAGE_HYP_DEVICE);
 }
 
 /**
@@ -307,42 +380,6 @@ int kvm_alloc_stage2_pgd(struct kvm *kvm)
 	return 0;
 }
 
-static void clear_pud_entry(pud_t *pud)
-{
-	pmd_t *pmd_table = pmd_offset(pud, 0);
-	pud_clear(pud);
-	pmd_free(NULL, pmd_table);
-	put_page(virt_to_page(pud));
-}
-
-static void clear_pmd_entry(pmd_t *pmd)
-{
-	pte_t *pte_table = pte_offset_kernel(pmd, 0);
-	pmd_clear(pmd);
-	pte_free_kernel(NULL, pte_table);
-	put_page(virt_to_page(pmd));
-}
-
-static bool pmd_empty(pmd_t *pmd)
-{
-	struct page *pmd_page = virt_to_page(pmd);
-	return page_count(pmd_page) == 1;
-}
-
-static void clear_pte_entry(pte_t *pte)
-{
-	if (pte_present(*pte)) {
-		kvm_set_pte(pte, __pte(0));
-		put_page(virt_to_page(pte));
-	}
-}
-
-static bool pte_empty(pte_t *pte)
-{
-	struct page *pte_page = virt_to_page(pte);
-	return page_count(pte_page) == 1;
-}
-
 /**
  * unmap_stage2_range -- Clear stage2 page table entries to unmap a range
  * @kvm:   The VM pointer
@@ -356,43 +393,7 @@ static bool pte_empty(pte_t *pte)
  */
 static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
 {
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	phys_addr_t addr = start, end = start + size;
-	u64 range;
-
-	while (addr < end) {
-		pgd = kvm->arch.pgd + pgd_index(addr);
-		pud = pud_offset(pgd, addr);
-		if (pud_none(*pud)) {
-			addr += PUD_SIZE;
-			continue;
-		}
-
-		pmd = pmd_offset(pud, addr);
-		if (pmd_none(*pmd)) {
-			addr += PMD_SIZE;
-			continue;
-		}
-
-		pte = pte_offset_kernel(pmd, addr);
-		clear_pte_entry(pte);
-		range = PAGE_SIZE;
-
-		/* If we emptied the pte, walk back up the ladder */
-		if (pte_empty(pte)) {
-			clear_pmd_entry(pmd);
-			range = PMD_SIZE;
-			if (pmd_empty(pmd)) {
-				clear_pud_entry(pud);
-				range = PUD_SIZE;
-			}
-		}
-
-		addr += range;
-	}
+	unmap_range(kvm->arch.pgd, start, size);
 }
 
 /**
@@ -728,47 +729,105 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 
 phys_addr_t kvm_mmu_get_httbr(void)
 {
-	VM_BUG_ON(!virt_addr_valid(hyp_pgd));
 	return virt_to_phys(hyp_pgd);
 }
 
+phys_addr_t kvm_mmu_get_boot_httbr(void)
+{
+	return virt_to_phys(boot_hyp_pgd);
+}
+
+phys_addr_t kvm_get_idmap_vector(void)
+{
+	return hyp_idmap_vector;
+}
+
 int kvm_mmu_init(void)
 {
-	if (!hyp_pgd) {
+	int err;
+
+	hyp_idmap_start = virt_to_phys(__hyp_idmap_text_start);
+	hyp_idmap_end = virt_to_phys(__hyp_idmap_text_end);
+	hyp_idmap_vector = virt_to_phys(__kvm_hyp_init);
+
+	if ((hyp_idmap_start ^ hyp_idmap_end) & PAGE_MASK) {
+		/*
+		 * Our init code is crossing a page boundary. Allocate
+		 * a bounce page, copy the code over and use that.
+		 */
+		size_t len = __hyp_idmap_text_end - __hyp_idmap_text_start;
+		phys_addr_t phys_base;
+
+		init_bounce_page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+		if (!init_bounce_page) {
+			kvm_err("Couldn't allocate HYP init bounce page\n");
+			err = -ENOMEM;
+			goto out;
+		}
+
+		memcpy(init_bounce_page, __hyp_idmap_text_start, len);
+		/*
+		 * Warning: the code we just copied to the bounce page
+		 * must be flushed to the point of coherency.
+		 * Otherwise, the data may be sitting in L2, and HYP
+		 * mode won't be able to observe it as it runs with
+		 * caches off at that point.
+		 */
+		kvm_flush_dcache_to_poc(init_bounce_page, len);
+
+		phys_base = virt_to_phys(init_bounce_page);
+		hyp_idmap_vector += phys_base - hyp_idmap_start;
+		hyp_idmap_start = phys_base;
+		hyp_idmap_end = phys_base + len;
+
+		kvm_info("Using HYP init bounce page @%lx\n",
+			 (unsigned long)phys_base);
+	}
+
+	hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL);
+	boot_hyp_pgd = kzalloc(PTRS_PER_PGD * sizeof(pgd_t), GFP_KERNEL);
+	if (!hyp_pgd || !boot_hyp_pgd) {
 		kvm_err("Hyp mode PGD not allocated\n");
-		return -ENOMEM;
+		err = -ENOMEM;
+		goto out;
 	}
 
-	return 0;
-}
+	/* Create the idmap in the boot page tables */
+	err = 	__create_hyp_mappings(boot_hyp_pgd,
+				      hyp_idmap_start, hyp_idmap_end,
+				      __phys_to_pfn(hyp_idmap_start),
+				      PAGE_HYP);
 
-/**
- * kvm_clear_idmap - remove all idmaps from the hyp pgd
- *
- * Free the underlying pmds for all pgds in range and clear the pgds (but
- * don't free them) afterwards.
- */
-void kvm_clear_hyp_idmap(void)
-{
-	unsigned long addr, end;
-	unsigned long next;
-	pgd_t *pgd = hyp_pgd;
-	pud_t *pud;
-	pmd_t *pmd;
+	if (err) {
+		kvm_err("Failed to idmap %lx-%lx\n",
+			hyp_idmap_start, hyp_idmap_end);
+		goto out;
+	}
 
-	addr = virt_to_phys(__hyp_idmap_text_start);
-	end = virt_to_phys(__hyp_idmap_text_end);
+	/* Map the very same page at the trampoline VA */
+	err = 	__create_hyp_mappings(boot_hyp_pgd,
+				      TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
+				      __phys_to_pfn(hyp_idmap_start),
+				      PAGE_HYP);
+	if (err) {
+		kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
+			TRAMPOLINE_VA);
+		goto out;
+	}
 
-	pgd += pgd_index(addr);
-	do {
-		next = pgd_addr_end(addr, end);
-		if (pgd_none_or_clear_bad(pgd))
-			continue;
-		pud = pud_offset(pgd, addr);
-		pmd = pmd_offset(pud, addr);
+	/* Map the same page again into the runtime page tables */
+	err = 	__create_hyp_mappings(hyp_pgd,
+				      TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
+				      __phys_to_pfn(hyp_idmap_start),
+				      PAGE_HYP);
+	if (err) {
+		kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
+			TRAMPOLINE_VA);
+		goto out;
+	}
 
-		pud_clear(pud);
-		kvm_clean_pmd_entry(pmd);
-		pmd_free(NULL, (pmd_t *)((unsigned long)pmd & PAGE_MASK));
-	} while (pgd++, addr = next, addr < end);
+	return 0;
+out:
+	free_hyp_pgds();
+	return err;
 }
diff --git a/arch/arm/kvm/perf.c b/arch/arm/kvm/perf.c
new file mode 100644
index 000000000000..1a3849da0b4b
--- /dev/null
+++ b/arch/arm/kvm/perf.c
@@ -0,0 +1,68 @@
+/*
+ * Based on the x86 implementation.
+ *
+ * Copyright (C) 2012 ARM Ltd.
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/kvm_host.h>
+
+#include <asm/kvm_emulate.h>
+
+static int kvm_is_in_guest(void)
+{
+        return kvm_arm_get_running_vcpu() != NULL;
+}
+
+static int kvm_is_user_mode(void)
+{
+	struct kvm_vcpu *vcpu;
+
+	vcpu = kvm_arm_get_running_vcpu();
+
+	if (vcpu)
+		return !vcpu_mode_priv(vcpu);
+
+	return 0;
+}
+
+static unsigned long kvm_get_guest_ip(void)
+{
+	struct kvm_vcpu *vcpu;
+
+	vcpu = kvm_arm_get_running_vcpu();
+
+	if (vcpu)
+		return *vcpu_pc(vcpu);
+
+	return 0;
+}
+
+static struct perf_guest_info_callbacks kvm_guest_cbs = {
+	.is_in_guest	= kvm_is_in_guest,
+	.is_user_mode	= kvm_is_user_mode,
+	.get_guest_ip	= kvm_get_guest_ip,
+};
+
+int kvm_perf_init(void)
+{
+	return perf_register_guest_info_callbacks(&kvm_guest_cbs);
+}
+
+int kvm_perf_teardown(void)
+{
+	return perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+}
author	Linus Torvalds <torvalds@linux-foundation.org>	2013-05-05 23:47:31 +0200
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-05-05 23:47:31 +0200
commit	01227a889ed56ae53aeebb9f93be9d54dd8b2de8 (patch)
tree	d5eba9359a9827e84d4112b84d48c54df5c5acde /arch/arm/kvm
parent	Give the OID registry file module info to avoid kernel tainting (diff)
parent	kvm: Add compat_ioctl for device control API (diff)
download	linux-01227a889ed56ae53aeebb9f93be9d54dd8b2de8.tar.xz linux-01227a889ed56ae53aeebb9f93be9d54dd8b2de8.zip