summaryrefslogtreecommitdiffstats
path: root/virt/kvm
diff options
context:
space:
mode:
authorPaolo Bonzini <pbonzini@redhat.com>2018-12-19 20:33:55 +0100
committerPaolo Bonzini <pbonzini@redhat.com>2018-12-19 20:33:55 +0100
commit8c5e14f438b8cee47ff01fc1cadd15e3eed44b59 (patch)
tree8f5028e0fce8f3a73cb000b47e571f3788429d23 /virt/kvm
parentkvm: selftests: ucall: improve ucall placement in memory, fix unsigned compar... (diff)
parentarm: KVM: Add S2_PMD_{MASK,SIZE} constants (diff)
downloadlinux-8c5e14f438b8cee47ff01fc1cadd15e3eed44b59.tar.xz
linux-8c5e14f438b8cee47ff01fc1cadd15e3eed44b59.zip
Merge tag 'kvmarm-for-v4.21' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm updates for 4.21 - Large PUD support for HugeTLB - Single-stepping fixes - Improved tracing - Various timer and vgic fixups
Diffstat (limited to 'virt/kvm')
-rw-r--r--virt/kvm/arm/arch_timer.c35
-rw-r--r--virt/kvm/arm/arm.c25
-rw-r--r--virt/kvm/arm/hyp/vgic-v3-sr.c6
-rw-r--r--virt/kvm/arm/mmio.c11
-rw-r--r--virt/kvm/arm/mmu.c384
-rw-r--r--virt/kvm/arm/trace.h18
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio.c44
-rw-r--r--virt/kvm/arm/vgic/vgic.c13
8 files changed, 356 insertions, 180 deletions
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 17cecc96f735..b07ac4614e1c 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -70,11 +70,9 @@ static void soft_timer_start(struct hrtimer *hrt, u64 ns)
HRTIMER_MODE_ABS);
}
-static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work)
+static void soft_timer_cancel(struct hrtimer *hrt)
{
hrtimer_cancel(hrt);
- if (work)
- cancel_work_sync(work);
}
static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
@@ -102,23 +100,6 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
return IRQ_HANDLED;
}
-/*
- * Work function for handling the backup timer that we schedule when a vcpu is
- * no longer running, but had a timer programmed to fire in the future.
- */
-static void kvm_timer_inject_irq_work(struct work_struct *work)
-{
- struct kvm_vcpu *vcpu;
-
- vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired);
-
- /*
- * If the vcpu is blocked we want to wake it up so that it will see
- * the timer has expired when entering the guest.
- */
- kvm_vcpu_wake_up(vcpu);
-}
-
static u64 kvm_timer_compute_delta(struct arch_timer_context *timer_ctx)
{
u64 cval, now;
@@ -188,7 +169,7 @@ static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt)
return HRTIMER_RESTART;
}
- schedule_work(&timer->expired);
+ kvm_vcpu_wake_up(vcpu);
return HRTIMER_NORESTART;
}
@@ -300,7 +281,7 @@ static void phys_timer_emulate(struct kvm_vcpu *vcpu)
* then we also don't need a soft timer.
*/
if (kvm_timer_should_fire(ptimer) || !kvm_timer_irq_can_fire(ptimer)) {
- soft_timer_cancel(&timer->phys_timer, NULL);
+ soft_timer_cancel(&timer->phys_timer);
return;
}
@@ -426,7 +407,7 @@ void kvm_timer_unschedule(struct kvm_vcpu *vcpu)
vtimer_restore_state(vcpu);
- soft_timer_cancel(&timer->bg_timer, &timer->expired);
+ soft_timer_cancel(&timer->bg_timer);
}
static void set_cntvoff(u64 cntvoff)
@@ -544,7 +525,7 @@ void kvm_timer_vcpu_put(struct kvm_vcpu *vcpu)
* In any case, we re-schedule the hrtimer for the physical timer when
* coming back to the VCPU thread in kvm_timer_vcpu_load().
*/
- soft_timer_cancel(&timer->phys_timer, NULL);
+ soft_timer_cancel(&timer->phys_timer);
/*
* The kernel may decide to run userspace after calling vcpu_put, so
@@ -637,7 +618,6 @@ void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
update_vtimer_cntvoff(vcpu, kvm_phys_timer_read());
vcpu_ptimer(vcpu)->cntvoff = 0;
- INIT_WORK(&timer->expired, kvm_timer_inject_irq_work);
hrtimer_init(&timer->bg_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
timer->bg_timer.function = kvm_bg_timer_expire;
@@ -792,11 +772,8 @@ out_free_irq:
void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
{
struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
- struct arch_timer_context *vtimer = vcpu_vtimer(vcpu);
- soft_timer_cancel(&timer->bg_timer, &timer->expired);
- soft_timer_cancel(&timer->phys_timer, NULL);
- kvm_vgic_unmap_phys_irq(vcpu, vtimer->irq.irq);
+ soft_timer_cancel(&timer->bg_timer);
}
static bool timer_irqs_are_valid(struct kvm_vcpu *vcpu)
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index e91adf77d99a..87ecb5f3637d 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -66,7 +66,7 @@ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu);
static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
static u32 kvm_next_vmid;
static unsigned int kvm_vmid_bits __read_mostly;
-static DEFINE_RWLOCK(kvm_vmid_lock);
+static DEFINE_SPINLOCK(kvm_vmid_lock);
static bool vgic_present;
@@ -484,7 +484,9 @@ void force_vm_exit(const cpumask_t *mask)
*/
static bool need_new_vmid_gen(struct kvm *kvm)
{
- return unlikely(kvm->arch.vmid_gen != atomic64_read(&kvm_vmid_gen));
+ u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen);
+ smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */
+ return unlikely(READ_ONCE(kvm->arch.vmid_gen) != current_vmid_gen);
}
/**
@@ -499,16 +501,11 @@ static void update_vttbr(struct kvm *kvm)
{
phys_addr_t pgd_phys;
u64 vmid, cnp = kvm_cpu_has_cnp() ? VTTBR_CNP_BIT : 0;
- bool new_gen;
- read_lock(&kvm_vmid_lock);
- new_gen = need_new_vmid_gen(kvm);
- read_unlock(&kvm_vmid_lock);
-
- if (!new_gen)
+ if (!need_new_vmid_gen(kvm))
return;
- write_lock(&kvm_vmid_lock);
+ spin_lock(&kvm_vmid_lock);
/*
* We need to re-check the vmid_gen here to ensure that if another vcpu
@@ -516,7 +513,7 @@ static void update_vttbr(struct kvm *kvm)
* use the same vmid.
*/
if (!need_new_vmid_gen(kvm)) {
- write_unlock(&kvm_vmid_lock);
+ spin_unlock(&kvm_vmid_lock);
return;
}
@@ -539,7 +536,6 @@ static void update_vttbr(struct kvm *kvm)
kvm_call_hyp(__kvm_flush_vm_context);
}
- kvm->arch.vmid_gen = atomic64_read(&kvm_vmid_gen);
kvm->arch.vmid = kvm_next_vmid;
kvm_next_vmid++;
kvm_next_vmid &= (1 << kvm_vmid_bits) - 1;
@@ -550,7 +546,10 @@ static void update_vttbr(struct kvm *kvm)
vmid = ((u64)(kvm->arch.vmid) << VTTBR_VMID_SHIFT) & VTTBR_VMID_MASK(kvm_vmid_bits);
kvm->arch.vttbr = kvm_phys_to_vttbr(pgd_phys) | vmid | cnp;
- write_unlock(&kvm_vmid_lock);
+ smp_wmb();
+ WRITE_ONCE(kvm->arch.vmid_gen, atomic64_read(&kvm_vmid_gen));
+
+ spin_unlock(&kvm_vmid_lock);
}
static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
@@ -674,8 +673,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
ret = kvm_handle_mmio_return(vcpu, vcpu->run);
if (ret)
return ret;
- if (kvm_arm_handle_step_debug(vcpu, vcpu->run))
- return 0;
}
if (run->immediate_exit)
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c
index 616e5a433ab0..9652c453480f 100644
--- a/virt/kvm/arm/hyp/vgic-v3-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v3-sr.c
@@ -1012,8 +1012,10 @@ int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
esr = kvm_vcpu_get_hsr(vcpu);
if (vcpu_mode_is_32bit(vcpu)) {
- if (!kvm_condition_valid(vcpu))
+ if (!kvm_condition_valid(vcpu)) {
+ __kvm_skip_instr(vcpu);
return 1;
+ }
sysreg = esr_cp15_to_sysreg(esr);
} else {
@@ -1123,6 +1125,8 @@ int __hyp_text __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
rt = kvm_vcpu_sys_get_rt(vcpu);
fn(vcpu, vmcr, rt);
+ __kvm_skip_instr(vcpu);
+
return 1;
}
diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c
index dac7ceb1a677..08443a15e6be 100644
--- a/virt/kvm/arm/mmio.c
+++ b/virt/kvm/arm/mmio.c
@@ -117,6 +117,12 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run)
vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data);
}
+ /*
+ * The MMIO instruction is emulated and should not be re-executed
+ * in the guest.
+ */
+ kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+
return 0;
}
@@ -144,11 +150,6 @@ static int decode_hsr(struct kvm_vcpu *vcpu, bool *is_write, int *len)
vcpu->arch.mmio_decode.sign_extend = sign_extend;
vcpu->arch.mmio_decode.rt = rt;
- /*
- * The MMIO instruction is emulated and should not be re-executed
- * in the guest.
- */
- kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
return 0;
}
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 5eca48bdb1a6..dee3dbd98712 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -115,6 +115,25 @@ static void stage2_dissolve_pmd(struct kvm *kvm, phys_addr_t addr, pmd_t *pmd)
put_page(virt_to_page(pmd));
}
+/**
+ * stage2_dissolve_pud() - clear and flush huge PUD entry
+ * @kvm: pointer to kvm structure.
+ * @addr: IPA
+ * @pud: pud pointer for IPA
+ *
+ * Function clears a PUD entry, flushes addr 1st and 2nd stage TLBs. Marks all
+ * pages in the range dirty.
+ */
+static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp)
+{
+ if (!stage2_pud_huge(kvm, *pudp))
+ return;
+
+ stage2_pud_clear(kvm, pudp);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ put_page(virt_to_page(pudp));
+}
+
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
int min, int max)
{
@@ -607,7 +626,7 @@ static void create_hyp_pte_mappings(pmd_t *pmd, unsigned long start,
addr = start;
do {
pte = pte_offset_kernel(pmd, addr);
- kvm_set_pte(pte, pfn_pte(pfn, prot));
+ kvm_set_pte(pte, kvm_pfn_pte(pfn, prot));
get_page(virt_to_page(pte));
pfn++;
} while (addr += PAGE_SIZE, addr != end);
@@ -1022,7 +1041,7 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache
pmd_t *pmd;
pud = stage2_get_pud(kvm, cache, addr);
- if (!pud)
+ if (!pud || stage2_pud_huge(kvm, *pud))
return NULL;
if (stage2_pud_none(kvm, *pud)) {
@@ -1083,29 +1102,103 @@ static int stage2_set_pmd_huge(struct kvm *kvm, struct kvm_mmu_memory_cache
return 0;
}
-static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
+static int stage2_set_pud_huge(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
+ phys_addr_t addr, const pud_t *new_pudp)
{
+ pud_t *pudp, old_pud;
+
+ pudp = stage2_get_pud(kvm, cache, addr);
+ VM_BUG_ON(!pudp);
+
+ old_pud = *pudp;
+
+ /*
+ * A large number of vcpus faulting on the same stage 2 entry,
+ * can lead to a refault due to the
+ * stage2_pud_clear()/tlb_flush(). Skip updating the page
+ * tables if there is no change.
+ */
+ if (pud_val(old_pud) == pud_val(*new_pudp))
+ return 0;
+
+ if (stage2_pud_present(kvm, old_pud)) {
+ stage2_pud_clear(kvm, pudp);
+ kvm_tlb_flush_vmid_ipa(kvm, addr);
+ } else {
+ get_page(virt_to_page(pudp));
+ }
+
+ kvm_set_pud(pudp, *new_pudp);
+ return 0;
+}
+
+/*
+ * stage2_get_leaf_entry - walk the stage2 VM page tables and return
+ * true if a valid and present leaf-entry is found. A pointer to the
+ * leaf-entry is returned in the appropriate level variable - pudpp,
+ * pmdpp, ptepp.
+ */
+static bool stage2_get_leaf_entry(struct kvm *kvm, phys_addr_t addr,
+ pud_t **pudpp, pmd_t **pmdpp, pte_t **ptepp)
+{
+ pud_t *pudp;
pmd_t *pmdp;
pte_t *ptep;
- pmdp = stage2_get_pmd(kvm, NULL, addr);
+ *pudpp = NULL;
+ *pmdpp = NULL;
+ *ptepp = NULL;
+
+ pudp = stage2_get_pud(kvm, NULL, addr);
+ if (!pudp || stage2_pud_none(kvm, *pudp) || !stage2_pud_present(kvm, *pudp))
+ return false;
+
+ if (stage2_pud_huge(kvm, *pudp)) {
+ *pudpp = pudp;
+ return true;
+ }
+
+ pmdp = stage2_pmd_offset(kvm, pudp, addr);
if (!pmdp || pmd_none(*pmdp) || !pmd_present(*pmdp))
return false;
- if (pmd_thp_or_huge(*pmdp))
- return kvm_s2pmd_exec(pmdp);
+ if (pmd_thp_or_huge(*pmdp)) {
+ *pmdpp = pmdp;
+ return true;
+ }
ptep = pte_offset_kernel(pmdp, addr);
if (!ptep || pte_none(*ptep) || !pte_present(*ptep))
return false;
- return kvm_s2pte_exec(ptep);
+ *ptepp = ptep;
+ return true;
+}
+
+static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
+{
+ pud_t *pudp;
+ pmd_t *pmdp;
+ pte_t *ptep;
+ bool found;
+
+ found = stage2_get_leaf_entry(kvm, addr, &pudp, &pmdp, &ptep);
+ if (!found)
+ return false;
+
+ if (pudp)
+ return kvm_s2pud_exec(pudp);
+ else if (pmdp)
+ return kvm_s2pmd_exec(pmdp);
+ else
+ return kvm_s2pte_exec(ptep);
}
static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
phys_addr_t addr, const pte_t *new_pte,
unsigned long flags)
{
+ pud_t *pud;
pmd_t *pmd;
pte_t *pte, old_pte;
bool iomap = flags & KVM_S2PTE_FLAG_IS_IOMAP;
@@ -1114,7 +1207,31 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
VM_BUG_ON(logging_active && !cache);
/* Create stage-2 page table mapping - Levels 0 and 1 */
- pmd = stage2_get_pmd(kvm, cache, addr);
+ pud = stage2_get_pud(kvm, cache, addr);
+ if (!pud) {
+ /*
+ * Ignore calls from kvm_set_spte_hva for unallocated
+ * address ranges.
+ */
+ return 0;
+ }
+
+ /*
+ * While dirty page logging - dissolve huge PUD, then continue
+ * on to allocate page.
+ */
+ if (logging_active)
+ stage2_dissolve_pud(kvm, addr, pud);
+
+ if (stage2_pud_none(kvm, *pud)) {
+ if (!cache)
+ return 0; /* ignore calls from kvm_set_spte_hva */
+ pmd = mmu_memory_cache_alloc(cache);
+ stage2_pud_populate(kvm, pud, pmd);
+ get_page(virt_to_page(pud));
+ }
+
+ pmd = stage2_pmd_offset(kvm, pud, addr);
if (!pmd) {
/*
* Ignore calls from kvm_set_spte_hva for unallocated
@@ -1182,6 +1299,11 @@ static int stage2_pmdp_test_and_clear_young(pmd_t *pmd)
return stage2_ptep_test_and_clear_young((pte_t *)pmd);
}
+static int stage2_pudp_test_and_clear_young(pud_t *pud)
+{
+ return stage2_ptep_test_and_clear_young((pte_t *)pud);
+}
+
/**
* kvm_phys_addr_ioremap - map a device range to guest IPA
*
@@ -1202,7 +1324,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
pfn = __phys_to_pfn(pa);
for (addr = guest_ipa; addr < end; addr += PAGE_SIZE) {
- pte_t pte = pfn_pte(pfn, PAGE_S2_DEVICE);
+ pte_t pte = kvm_pfn_pte(pfn, PAGE_S2_DEVICE);
if (writable)
pte = kvm_s2pte_mkwrite(pte);
@@ -1234,7 +1356,7 @@ static bool transparent_hugepage_adjust(kvm_pfn_t *pfnp, phys_addr_t *ipap)
struct page *page = pfn_to_page(pfn);
/*
- * PageTransCompoungMap() returns true for THP and
+ * PageTransCompoundMap() returns true for THP and
* hugetlbfs. Make sure the adjustment is done only for THP
* pages.
*/
@@ -1347,9 +1469,12 @@ static void stage2_wp_puds(struct kvm *kvm, pgd_t *pgd,
do {
next = stage2_pud_addr_end(kvm, addr, end);
if (!stage2_pud_none(kvm, *pud)) {
- /* TODO:PUD not supported, revisit later if supported */
- BUG_ON(stage2_pud_huge(kvm, *pud));
- stage2_wp_pmds(kvm, pud, addr, next);
+ if (stage2_pud_huge(kvm, *pud)) {
+ if (!kvm_s2pud_readonly(pud))
+ kvm_set_s2pud_readonly(pud);
+ } else {
+ stage2_wp_pmds(kvm, pud, addr, next);
+ }
}
} while (pud++, addr = next, addr != end);
}
@@ -1392,7 +1517,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
*
* Called to start logging dirty pages after memory region
* KVM_MEM_LOG_DIRTY_PAGES operation is called. After this function returns
- * all present PMD and PTEs are write protected in the memory region.
+ * all present PUD, PMD and PTEs are write protected in the memory region.
* Afterwards read of dirty page log can be called.
*
* Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired,
@@ -1470,12 +1595,70 @@ static void kvm_send_hwpoison_signal(unsigned long address,
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, current);
}
+static bool fault_supports_stage2_pmd_mappings(struct kvm_memory_slot *memslot,
+ unsigned long hva)
+{
+ gpa_t gpa_start, gpa_end;
+ hva_t uaddr_start, uaddr_end;
+ size_t size;
+
+ size = memslot->npages * PAGE_SIZE;
+
+ gpa_start = memslot->base_gfn << PAGE_SHIFT;
+ gpa_end = gpa_start + size;
+
+ uaddr_start = memslot->userspace_addr;
+ uaddr_end = uaddr_start + size;
+
+ /*
+ * Pages belonging to memslots that don't have the same alignment
+ * within a PMD for userspace and IPA cannot be mapped with stage-2
+ * PMD entries, because we'll end up mapping the wrong pages.
+ *
+ * Consider a layout like the following:
+ *
+ * memslot->userspace_addr:
+ * +-----+--------------------+--------------------+---+
+ * |abcde|fgh Stage-1 PMD | Stage-1 PMD tv|xyz|
+ * +-----+--------------------+--------------------+---+
+ *
+ * memslot->base_gfn << PAGE_SIZE:
+ * +---+--------------------+--------------------+-----+
+ * |abc|def Stage-2 PMD | Stage-2 PMD |tvxyz|
+ * +---+--------------------+--------------------+-----+
+ *
+ * If we create those stage-2 PMDs, we'll end up with this incorrect
+ * mapping:
+ * d -> f
+ * e -> g
+ * f -> h
+ */
+ if ((gpa_start & ~S2_PMD_MASK) != (uaddr_start & ~S2_PMD_MASK))
+ return false;
+
+ /*
+ * Next, let's make sure we're not trying to map anything not covered
+ * by the memslot. This means we have to prohibit PMD size mappings
+ * for the beginning and end of a non-PMD aligned and non-PMD sized
+ * memory slot (illustrated by the head and tail parts of the
+ * userspace view above containing pages 'abcde' and 'xyz',
+ * respectively).
+ *
+ * Note that it doesn't matter if we do the check using the
+ * userspace_addr or the base_gfn, as both are equally aligned (per
+ * the check above) and equally sized.
+ */
+ return (hva & S2_PMD_MASK) >= uaddr_start &&
+ (hva & S2_PMD_MASK) + S2_PMD_SIZE <= uaddr_end;
+}
+
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
struct kvm_memory_slot *memslot, unsigned long hva,
unsigned long fault_status)
{
int ret;
- bool write_fault, exec_fault, writable, hugetlb = false, force_pte = false;
+ bool write_fault, writable, force_pte = false;
+ bool exec_fault, needs_exec;
unsigned long mmu_seq;
gfn_t gfn = fault_ipa >> PAGE_SHIFT;
struct kvm *kvm = vcpu->kvm;
@@ -1484,7 +1667,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
kvm_pfn_t pfn;
pgprot_t mem_type = PAGE_S2;
bool logging_active = memslot_is_logging(memslot);
- unsigned long flags = 0;
+ unsigned long vma_pagesize, flags = 0;
write_fault = kvm_is_write_fault(vcpu);
exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
@@ -1495,6 +1678,12 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
+ if (!fault_supports_stage2_pmd_mappings(memslot, hva))
+ force_pte = true;
+
+ if (logging_active)
+ force_pte = true;
+
/* Let's check if we will get back a huge page backed by hugetlbfs */
down_read(&current->mm->mmap_sem);
vma = find_vma_intersection(current->mm, hva, hva + 1);
@@ -1504,22 +1693,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
return -EFAULT;
}
- if (vma_kernel_pagesize(vma) == PMD_SIZE && !logging_active) {
- hugetlb = true;
- gfn = (fault_ipa & PMD_MASK) >> PAGE_SHIFT;
- } else {
- /*
- * Pages belonging to memslots that don't have the same
- * alignment for userspace and IPA cannot be mapped using
- * block descriptors even if the pages belong to a THP for
- * the process, because the stage-2 block descriptor will
- * cover more than a single THP and we loose atomicity for
- * unmapping, updates, and splits of the THP or other pages
- * in the stage-2 block range.
- */
- if ((memslot->userspace_addr & ~PMD_MASK) !=
- ((memslot->base_gfn << PAGE_SHIFT) & ~PMD_MASK))
- force_pte = true;
+ vma_pagesize = vma_kernel_pagesize(vma);
+ /*
+ * PUD level may not exist for a VM but PMD is guaranteed to
+ * exist.
+ */
+ if ((vma_pagesize == PMD_SIZE ||
+ (vma_pagesize == PUD_SIZE && kvm_stage2_has_pud(kvm))) &&
+ !force_pte) {
+ gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> PAGE_SHIFT;
}
up_read(&current->mm->mmap_sem);
@@ -1558,7 +1740,6 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
* should not be mapped with huge pages (it introduces churn
* and performance degradation), so force a pte mapping.
*/
- force_pte = true;
flags |= KVM_S2_FLAG_LOGGING_ACTIVE;
/*
@@ -1573,50 +1754,69 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (mmu_notifier_retry(kvm, mmu_seq))
goto out_unlock;
- if (!hugetlb && !force_pte)
- hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);
+ if (vma_pagesize == PAGE_SIZE && !force_pte) {
+ /*
+ * Only PMD_SIZE transparent hugepages(THP) are
+ * currently supported. This code will need to be
+ * updated to support other THP sizes.
+ */
+ if (transparent_hugepage_adjust(&pfn, &fault_ipa))
+ vma_pagesize = PMD_SIZE;
+ }
+
+ if (writable)
+ kvm_set_pfn_dirty(pfn);
- if (hugetlb) {
- pmd_t new_pmd = pfn_pmd(pfn, mem_type);
- new_pmd = pmd_mkhuge(new_pmd);
- if (writable) {
- new_pmd = kvm_s2pmd_mkwrite(new_pmd);
- kvm_set_pfn_dirty(pfn);
- }
+ if (fault_status != FSC_PERM)
+ clean_dcache_guest_page(pfn, vma_pagesize);
+
+ if (exec_fault)
+ invalidate_icache_guest_page(pfn, vma_pagesize);
- if (fault_status != FSC_PERM)
- clean_dcache_guest_page(pfn, PMD_SIZE);
+ /*
+ * If we took an execution fault we have made the
+ * icache/dcache coherent above and should now let the s2
+ * mapping be executable.
+ *
+ * Write faults (!exec_fault && FSC_PERM) are orthogonal to
+ * execute permissions, and we preserve whatever we have.
+ */
+ needs_exec = exec_fault ||
+ (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));
- if (exec_fault) {
+ if (vma_pagesize == PUD_SIZE) {
+ pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
+
+ new_pud = kvm_pud_mkhuge(new_pud);
+ if (writable)
+ new_pud = kvm_s2pud_mkwrite(new_pud);
+
+ if (needs_exec)
+ new_pud = kvm_s2pud_mkexec(new_pud);
+
+ ret = stage2_set_pud_huge(kvm, memcache, fault_ipa, &new_pud);
+ } else if (vma_pagesize == PMD_SIZE) {
+ pmd_t new_pmd = kvm_pfn_pmd(pfn, mem_type);
+
+ new_pmd = kvm_pmd_mkhuge(new_pmd);
+
+ if (writable)
+ new_pmd = kvm_s2pmd_mkwrite(new_pmd);
+
+ if (needs_exec)
new_pmd = kvm_s2pmd_mkexec(new_pmd);
- invalidate_icache_guest_page(pfn, PMD_SIZE);
- } else if (fault_status == FSC_PERM) {
- /* Preserve execute if XN was already cleared */
- if (stage2_is_exec(kvm, fault_ipa))
- new_pmd = kvm_s2pmd_mkexec(new_pmd);
- }
ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
} else {
- pte_t new_pte = pfn_pte(pfn, mem_type);
+ pte_t new_pte = kvm_pfn_pte(pfn, mem_type);
if (writable) {
new_pte = kvm_s2pte_mkwrite(new_pte);
- kvm_set_pfn_dirty(pfn);
mark_page_dirty(kvm, gfn);
}
- if (fault_status != FSC_PERM)
- clean_dcache_guest_page(pfn, PAGE_SIZE);
-
- if (exec_fault) {
+ if (needs_exec)
new_pte = kvm_s2pte_mkexec(new_pte);
- invalidate_icache_guest_page(pfn, PAGE_SIZE);
- } else if (fault_status == FSC_PERM) {
- /* Preserve execute if XN was already cleared */
- if (stage2_is_exec(kvm, fault_ipa))
- new_pte = kvm_s2pte_mkexec(new_pte);
- }
ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte, flags);
}
@@ -1637,6 +1837,7 @@ out_unlock:
*/
static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
{
+ pud_t *pud;
pmd_t *pmd;
pte_t *pte;
kvm_pfn_t pfn;
@@ -1646,24 +1847,23 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
spin_lock(&vcpu->kvm->mmu_lock);
- pmd = stage2_get_pmd(vcpu->kvm, NULL, fault_ipa);
- if (!pmd || pmd_none(*pmd)) /* Nothing there */
+ if (!stage2_get_leaf_entry(vcpu->kvm, fault_ipa, &pud, &pmd, &pte))
goto out;
- if (pmd_thp_or_huge(*pmd)) { /* THP, HugeTLB */
+ if (pud) { /* HugeTLB */
+ *pud = kvm_s2pud_mkyoung(*pud);
+ pfn = kvm_pud_pfn(*pud);
+ pfn_valid = true;
+ } else if (pmd) { /* THP, HugeTLB */
*pmd = pmd_mkyoung(*pmd);
pfn = pmd_pfn(*pmd);
pfn_valid = true;
- goto out;
+ } else {
+ *pte = pte_mkyoung(*pte); /* Just a page... */
+ pfn = pte_pfn(*pte);
+ pfn_valid = true;
}
- pte = pte_offset_kernel(pmd, fault_ipa);
- if (pte_none(*pte)) /* Nothing there either */
- goto out;
-
- *pte = pte_mkyoung(*pte); /* Just a page... */
- pfn = pte_pfn(*pte);
- pfn_valid = true;
out:
spin_unlock(&vcpu->kvm->mmu_lock);
if (pfn_valid)
@@ -1865,48 +2065,44 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
* just like a translation fault and clean the cache to the PoC.
*/
clean_dcache_guest_page(pfn, PAGE_SIZE);
- stage2_pte = pfn_pte(pfn, PAGE_S2);
+ stage2_pte = kvm_pfn_pte(pfn, PAGE_S2);
handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte);
}
static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
+ pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE);
- pmd = stage2_get_pmd(kvm, NULL, gpa);
- if (!pmd || pmd_none(*pmd)) /* Nothing there */
+ WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
+ if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
return 0;
- if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */
+ if (pud)
+ return stage2_pudp_test_and_clear_young(pud);
+ else if (pmd)
return stage2_pmdp_test_and_clear_young(pmd);
-
- pte = pte_offset_kernel(pmd, gpa);
- if (pte_none(*pte))
- return 0;
-
- return stage2_ptep_test_and_clear_young(pte);
+ else
+ return stage2_ptep_test_and_clear_young(pte);
}
static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
+ pud_t *pud;
pmd_t *pmd;
pte_t *pte;
- WARN_ON(size != PAGE_SIZE && size != PMD_SIZE);
- pmd = stage2_get_pmd(kvm, NULL, gpa);
- if (!pmd || pmd_none(*pmd)) /* Nothing there */
+ WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
+ if (!stage2_get_leaf_entry(kvm, gpa, &pud, &pmd, &pte))
return 0;
- if (pmd_thp_or_huge(*pmd)) /* THP, HugeTLB */
+ if (pud)
+ return kvm_s2pud_young(*pud);
+ else if (pmd)
return pmd_young(*pmd);
-
- pte = pte_offset_kernel(pmd, gpa);
- if (!pte_none(*pte)) /* Just a page... */
+ else
return pte_young(*pte);
-
- return 0;
}
int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h
index 57b3edebbb40..3828beab93f2 100644
--- a/virt/kvm/arm/trace.h
+++ b/virt/kvm/arm/trace.h
@@ -26,25 +26,25 @@ TRACE_EVENT(kvm_entry,
);
TRACE_EVENT(kvm_exit,
- TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc),
- TP_ARGS(idx, exit_reason, vcpu_pc),
+ TP_PROTO(int ret, unsigned int esr_ec, unsigned long vcpu_pc),
+ TP_ARGS(ret, esr_ec, vcpu_pc),
TP_STRUCT__entry(
- __field( int, idx )
- __field( unsigned int, exit_reason )
+ __field( int, ret )
+ __field( unsigned int, esr_ec )
__field( unsigned long, vcpu_pc )
),
TP_fast_assign(
- __entry->idx = idx;
- __entry->exit_reason = exit_reason;
+ __entry->ret = ARM_EXCEPTION_CODE(ret);
+ __entry->esr_ec = ARM_EXCEPTION_IS_TRAP(ret) ? esr_ec : 0;
__entry->vcpu_pc = vcpu_pc;
),
TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
- __print_symbolic(__entry->idx, kvm_arm_exception_type),
- __entry->exit_reason,
- __print_symbolic(__entry->exit_reason, kvm_arm_exception_class),
+ __print_symbolic(__entry->ret, kvm_arm_exception_type),
+ __entry->esr_ec,
+ __print_symbolic(__entry->esr_ec, kvm_arm_exception_class),
__entry->vcpu_pc)
);
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index f56ff1cf52ec..ceeda7e04a4d 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -313,36 +313,30 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
spin_lock_irqsave(&irq->irq_lock, flags);
- /*
- * If this virtual IRQ was written into a list register, we
- * have to make sure the CPU that runs the VCPU thread has
- * synced back the LR state to the struct vgic_irq.
- *
- * As long as the conditions below are true, we know the VCPU thread
- * may be on its way back from the guest (we kicked the VCPU thread in
- * vgic_change_active_prepare) and still has to sync back this IRQ,
- * so we release and re-acquire the spin_lock to let the other thread
- * sync back the IRQ.
- *
- * When accessing VGIC state from user space, requester_vcpu is
- * NULL, which is fine, because we guarantee that no VCPUs are running
- * when accessing VGIC state from user space so irq->vcpu->cpu is
- * always -1.
- */
- while (irq->vcpu && /* IRQ may have state in an LR somewhere */
- irq->vcpu != requester_vcpu && /* Current thread is not the VCPU thread */
- irq->vcpu->cpu != -1) /* VCPU thread is running */
- cond_resched_lock(&irq->irq_lock);
-
if (irq->hw) {
vgic_hw_irq_change_active(vcpu, irq, active, !requester_vcpu);
} else {
u32 model = vcpu->kvm->arch.vgic.vgic_model;
+ u8 active_source;
irq->active = active;
+
+ /*
+ * The GICv2 architecture indicates that the source CPUID for
+ * an SGI should be provided during an EOI which implies that
+ * the active state is stored somewhere, but at the same time
+ * this state is not architecturally exposed anywhere and we
+ * have no way of knowing the right source.
+ *
+ * This may lead to a VCPU not being able to receive
+ * additional instances of a particular SGI after migration
+ * for a GICv2 VM on some GIC implementations. Oh well.
+ */
+ active_source = (requester_vcpu) ? requester_vcpu->vcpu_id : 0;
+
if (model == KVM_DEV_TYPE_ARM_VGIC_V2 &&
active && vgic_irq_is_sgi(irq->intid))
- irq->active_source = requester_vcpu->vcpu_id;
+ irq->active_source = active_source;
}
if (irq->active)
@@ -368,14 +362,16 @@ static void vgic_mmio_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
*/
static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
{
- if (intid > VGIC_NR_PRIVATE_IRQS)
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+ intid > VGIC_NR_PRIVATE_IRQS)
kvm_arm_halt_guest(vcpu->kvm);
}
/* See vgic_change_active_prepare */
static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid)
{
- if (intid > VGIC_NR_PRIVATE_IRQS)
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+ intid > VGIC_NR_PRIVATE_IRQS)
kvm_arm_resume_guest(vcpu->kvm);
}
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 7cfdfbc910e0..a6b135491b6c 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -103,13 +103,13 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
{
/* SGIs and PPIs */
if (intid <= VGIC_MAX_PRIVATE) {
- intid = array_index_nospec(intid, VGIC_MAX_PRIVATE);
+ intid = array_index_nospec(intid, VGIC_MAX_PRIVATE + 1);
return &vcpu->arch.vgic_cpu.private_irqs[intid];
}
/* SPIs */
- if (intid <= VGIC_MAX_SPI) {
- intid = array_index_nospec(intid, VGIC_MAX_SPI);
+ if (intid < (kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)) {
+ intid = array_index_nospec(intid, kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS);
return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
}
@@ -908,6 +908,7 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
struct vgic_irq *irq;
bool pending = false;
unsigned long flags;
+ struct vgic_vmcr vmcr;
if (!vcpu->kvm->arch.vgic.enabled)
return false;
@@ -915,11 +916,15 @@ int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
if (vcpu->arch.vgic_cpu.vgic_v3.its_vpe.pending_last)
return true;
+ vgic_get_vmcr(vcpu, &vmcr);
+
spin_lock_irqsave(&vgic_cpu->ap_list_lock, flags);
list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
spin_lock(&irq->irq_lock);
- pending = irq_is_pending(irq) && irq->enabled;
+ pending = irq_is_pending(irq) && irq->enabled &&
+ !irq->active &&
+ irq->priority < vmcr.pmr;
spin_unlock(&irq->irq_lock);
if (pending)