From 8acc0993e3f9a04a407ff1507dd329455f340121 Mon Sep 17 00:00:00 2001
From: Kai Huang <kai.huang@linux.intel.com>
Date: Tue, 15 Jan 2019 17:28:40 +1300
Subject: kvm, x86, mmu: Use kernel generic dynamic physical address mask

AMD's SME/SEV is no longer the only case which reduces supported
physical address bits, since Intel introduced Multi-key Total Memory
Encryption (MKTME), which repurposes high bits of physical address as
keyID, thus effectively shrinks supported physical address bits. To
cover both cases (and potential similar future features), kernel MM
introduced generic dynamaic physical address mask instead of hard-coded
__PHYSICAL_MASK in 'commit 94d49eb30e854 ("x86/mm: Decouple dynamic
__PHYSICAL_MASK from AMD SME")'. KVM should use that too.

Change PT64_BASE_ADDR_MASK to use kernel dynamic physical address mask
when it is enabled, instead of sme_clr. PT64_DIR_BASE_ADDR_MASK is also
deleted since it is not used at all.

Signed-off-by: Kai Huang <kai.huang@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index da9c42349b1f..45eb988aa411 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -109,9 +109,11 @@ module_param(dbg, bool, 0644);
 	(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
 
 
-#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
-#define PT64_DIR_BASE_ADDR_MASK \
-	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
+#ifdef CONFIG_DYNAMIC_PHYSICAL_MASK
+#define PT64_BASE_ADDR_MASK (physical_mask & ~(u64)(PAGE_SIZE-1))
+#else
+#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#endif
 #define PT64_LVL_ADDR_MASK(level) \
 	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
 						* PT64_LEVEL_BITS))) - 1))
-- 
cgit v1.2.3


From 254272ce6505948ecc0b4bf5cd0aa61cdd815994 Mon Sep 17 00:00:00 2001
From: Ben Gardon <bgardon@google.com>
Date: Mon, 11 Feb 2019 11:02:50 -0800
Subject: kvm: x86: Add memcg accounting to KVM allocations

There are many KVM kernel memory allocations which are tied to the life of
the VM process and should be charged to the VM process's cgroup. If the
allocations aren't tied to the process, the OOM killer will not know
that killing the process will free the associated kernel memory.
Add __GFP_ACCOUNT flags to many of the allocations which are not yet being
charged to the VM process's cgroup.

Tested:
	Ran all kvm-unit-tests on a 64 bit Haswell machine, the patch
	introduced no new failures.
	Ran a kernel memory accounting test which creates a VM to touch
	memory and then checks that the kernel memory allocated for the
	process is within certain bounds.
	With this patch we account for much more of the vmalloc and slab memory
	allocated for the VM.

There remain a few allocations which should be charged to the VM's
cgroup but are not. In x86, they include:
	vcpu->arch.pio_data
There allocations are unaccounted in this patch because they are mapped
to userspace, and accounting them to a cgroup causes problems. This
should be addressed in a future patch.

Signed-off-by: Ben Gardon <bgardon@google.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/hyperv.c     |  2 +-
 arch/x86/kvm/i8254.c      |  2 +-
 arch/x86/kvm/i8259.c      |  2 +-
 arch/x86/kvm/ioapic.c     |  2 +-
 arch/x86/kvm/lapic.c      |  7 ++++---
 arch/x86/kvm/mmu.c        |  6 +++---
 arch/x86/kvm/page_track.c |  2 +-
 arch/x86/kvm/x86.c        | 16 +++++++++-------
 8 files changed, 21 insertions(+), 18 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 89d20ed1d2e8..27c43525a05f 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1729,7 +1729,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
 
 	mutex_lock(&hv->hv_lock);
 	ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1,
-			GFP_KERNEL);
+			GFP_KERNEL_ACCOUNT);
 	mutex_unlock(&hv->hv_lock);
 
 	if (ret >= 0)
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index af192895b1fc..4a6dc54cc12b 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -653,7 +653,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 	pid_t pid_nr;
 	int ret;
 
-	pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
+	pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL_ACCOUNT);
 	if (!pit)
 		return NULL;
 
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index bdcd4139eca9..8b38bb4868a6 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -583,7 +583,7 @@ int kvm_pic_init(struct kvm *kvm)
 	struct kvm_pic *s;
 	int ret;
 
-	s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
+	s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL_ACCOUNT);
 	if (!s)
 		return -ENOMEM;
 	spin_lock_init(&s->lock);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 4e822ad363f3..1add1bc881e2 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -622,7 +622,7 @@ int kvm_ioapic_init(struct kvm *kvm)
 	struct kvm_ioapic *ioapic;
 	int ret;
 
-	ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
+	ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL_ACCOUNT);
 	if (!ioapic)
 		return -ENOMEM;
 	spin_lock_init(&ioapic->lock);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4b6c2da7265c..991fdf7fc17f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -181,7 +181,8 @@ static void recalculate_apic_map(struct kvm *kvm)
 			max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
 
 	new = kvzalloc(sizeof(struct kvm_apic_map) +
-	                   sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL);
+	                   sizeof(struct kvm_lapic *) * ((u64)max_id + 1),
+			   GFP_KERNEL_ACCOUNT);
 
 	if (!new)
 		goto out;
@@ -2259,13 +2260,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 	ASSERT(vcpu != NULL);
 	apic_debug("apic_init %d\n", vcpu->vcpu_id);
 
-	apic = kzalloc(sizeof(*apic), GFP_KERNEL);
+	apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT);
 	if (!apic)
 		goto nomem;
 
 	vcpu->arch.apic = apic;
 
-	apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
+	apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
 	if (!apic->regs) {
 		printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
 		       vcpu->vcpu_id);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 45eb988aa411..415d0e62cb3e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -961,7 +961,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 	if (cache->nobjs >= min)
 		return 0;
 	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
+		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
 		if (!obj)
 			return cache->nobjs >= min ? 0 : -ENOMEM;
 		cache->objects[cache->nobjs++] = obj;
@@ -3702,7 +3702,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 
 			u64 *lm_root;
 
-			lm_root = (void*)get_zeroed_page(GFP_KERNEL);
+			lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
 			if (lm_root == NULL)
 				return 1;
 
@@ -5499,7 +5499,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 	 * Therefore we need to allocate shadow page tables in the first
 	 * 4GB of memory, which happens to fit the DMA32 zone.
 	 */
-	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
 	if (!page)
 		return -ENOMEM;
 
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
index 3052a59a3065..fd04d462fdae 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -42,7 +42,7 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
 	for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
 		slot->arch.gfn_track[i] =
 			kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
-				 GFP_KERNEL);
+				 GFP_KERNEL_ACCOUNT);
 		if (!slot->arch.gfn_track[i])
 			goto track_free;
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 96f87d356c79..3de586f89730 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3879,7 +3879,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 		if (!lapic_in_kernel(vcpu))
 			goto out;
-		u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+		u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
+				GFP_KERNEL_ACCOUNT);
 
 		r = -ENOMEM;
 		if (!u.lapic)
@@ -4066,7 +4067,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_GET_XSAVE: {
-		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
 		r = -ENOMEM;
 		if (!u.xsave)
 			break;
@@ -4090,7 +4091,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_GET_XCRS: {
-		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
 		r = -ENOMEM;
 		if (!u.xcrs)
 			break;
@@ -9040,14 +9041,15 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 		static_key_slow_inc(&kvm_no_apic_vcpu);
 
 	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
-				       GFP_KERNEL);
+				       GFP_KERNEL_ACCOUNT);
 	if (!vcpu->arch.mce_banks) {
 		r = -ENOMEM;
 		goto fail_free_lapic;
 	}
 	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
-	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
+	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
+				GFP_KERNEL_ACCOUNT)) {
 		r = -ENOMEM;
 		goto fail_free_mce_banks;
 	}
@@ -9306,13 +9308,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 
 		slot->arch.rmap[i] =
 			kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
-				 GFP_KERNEL);
+				 GFP_KERNEL_ACCOUNT);
 		if (!slot->arch.rmap[i])
 			goto out_free;
 		if (i == 0)
 			continue;
 
-		linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL);
+		linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
 		if (!linfo)
 			goto out_free;
 
-- 
cgit v1.2.3


From 152482580a1b0accb60676063a1ac57b2d12daf6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 12:54:17 -0800
Subject: KVM: Call kvm_arch_memslots_updated() before updating memslots

kvm_arch_memslots_updated() is at this point in time an x86-specific
hook for handling MMIO generation wraparound.  x86 stashes 19 bits of
the memslots generation number in its MMIO sptes in order to avoid
full page fault walks for repeat faults on emulated MMIO addresses.
Because only 19 bits are used, wrapping the MMIO generation number is
possible, if unlikely.  kvm_arch_memslots_updated() alerts x86 that
the generation has changed so that it can invalidate all MMIO sptes in
case the effective MMIO generation has wrapped so as to avoid using a
stale spte, e.g. a (very) old spte that was created with generation==0.

Given that the purpose of kvm_arch_memslots_updated() is to prevent
consuming stale entries, it needs to be called before the new generation
is propagated to memslots.  Invalidating the MMIO sptes after updating
memslots means that there is a window where a vCPU could dereference
the new memslots generation, e.g. 0, and incorrectly reuse an old MMIO
spte that was created with (pre-wrap) generation==0.

Fixes: e59dbe09f8e6 ("KVM: Introduce kvm_arch_memslots_updated()")
Cc: <stable@vger.kernel.org>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/mips/include/asm/kvm_host.h    | 2 +-
 arch/powerpc/include/asm/kvm_host.h | 2 +-
 arch/s390/include/asm/kvm_host.h    | 2 +-
 arch/x86/include/asm/kvm_host.h     | 2 +-
 arch/x86/kvm/mmu.c                  | 4 ++--
 arch/x86/kvm/x86.c                  | 4 ++--
 include/linux/kvm_host.h            | 2 +-
 virt/kvm/arm/mmu.c                  | 2 +-
 virt/kvm/kvm_main.c                 | 7 +++++--
 9 files changed, 15 insertions(+), 12 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index d2abd98471e8..41204a49cf95 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -1134,7 +1134,7 @@ static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_free_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
-static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
+static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 0f98f00da2ea..19693b8add93 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -837,7 +837,7 @@ struct kvm_vcpu_arch {
 static inline void kvm_arch_hardware_disable(void) {}
 static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
-static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
+static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_exit(void) {}
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index d5d24889c3bc..c2b8c8c6c9be 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -878,7 +878,7 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_free_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *free, struct kvm_memory_slot *dont) {}
-static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots) {}
+static inline void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) {}
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *slot) {}
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0e2ef41efb9d..c4758e1a8843 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1254,7 +1254,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
 				   struct kvm_memory_slot *slot,
 				   gfn_t gfn_offset, unsigned long mask);
 void kvm_mmu_zap_all(struct kvm *kvm);
-void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots);
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen);
 unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 415d0e62cb3e..a53a0e7ad9e6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5893,13 +5893,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
 	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
 }
 
-void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
+void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 {
 	/*
 	 * The very rare case: if the generation-number is round,
 	 * zap all shadow pages.
 	 */
-	if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
+	if (unlikely((gen & MMIO_GEN_MASK) == 0)) {
 		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
 		kvm_mmu_invalidate_zap_all_pages(kvm);
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3de586f89730..03d26ffb29cd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9357,13 +9357,13 @@ out_free:
 	return -ENOMEM;
 }
 
-void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
 {
 	/*
 	 * memslots->generation has been incremented.
 	 * mmio generation may have reached its maximum value.
 	 */
-	kvm_mmu_invalidate_mmio_sptes(kvm, slots);
+	kvm_mmu_invalidate_mmio_sptes(kvm, gen);
 }
 
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c38cc5eb7e73..cf761ff58224 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -634,7 +634,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 			   struct kvm_memory_slot *dont);
 int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 			    unsigned long npages);
-void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots);
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen);
 int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot *memslot,
 				const struct kvm_userspace_memory_region *mem,
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index fbdf3ac2f001..e0355e0f8712 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -2350,7 +2350,7 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 	return 0;
 }
 
-void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslots *slots)
+void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
 {
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0a0ea8f4bb1b..d54f6578a849 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -874,6 +874,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 		int as_id, struct kvm_memslots *slots)
 {
 	struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);
+	u64 gen;
 
 	/*
 	 * Set the low bit in the generation, which disables SPTE caching
@@ -896,9 +897,11 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 	 * space 0 will use generations 0, 4, 8, ... while * address space 1 will
 	 * use generations 2, 6, 10, 14, ...
 	 */
-	slots->generation += KVM_ADDRESS_SPACE_NUM * 2 - 1;
+	gen = slots->generation + KVM_ADDRESS_SPACE_NUM * 2 - 1;
 
-	kvm_arch_memslots_updated(kvm, slots);
+	kvm_arch_memslots_updated(kvm, gen);
+
+	slots->generation = gen;
 
 	return old_memslots;
 }
-- 
cgit v1.2.3


From e1359e2beb8b0a1188abc997273acbaedc8ee791 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:12 -0800
Subject: KVM: x86/mmu: Detect MMIO generation wrap in any address space

The check to detect a wrap of the MMIO generation explicitly looks for a
generation number of zero.  Now that unique memslots generation numbers
are assigned to each address space, only address space 0 will get a
generation number of exactly zero when wrapping.  E.g. when address
space 1 goes from 0x7fffe to 0x80002, the MMIO generation number will
wrap to 0x2.  Adjust the MMIO generation to strip the address space
modifier prior to checking for a wrap.

Fixes: 4bd518f1598d ("KVM: use separate generations for each address space")
Cc: <stable@vger.kernel.org>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a53a0e7ad9e6..c2f2c9de63ed 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5895,11 +5895,28 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
 
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 {
+	gen &= MMIO_GEN_MASK;
+
+	/*
+	 * Shift to eliminate the "update in-progress" flag, which isn't
+	 * included in the spte's generation number.
+	 */
+	gen >>= 1;
+
+	/*
+	 * Generation numbers are incremented in multiples of the number of
+	 * address spaces in order to provide unique generations across all
+	 * address spaces.  Strip what is effectively the address space
+	 * modifier prior to checking for a wrap of the MMIO generation so
+	 * that a wrap in any address space is detected.
+	 */
+	gen &= ~((u64)KVM_ADDRESS_SPACE_NUM - 1);
+
 	/*
-	 * The very rare case: if the generation-number is round,
+	 * The very rare case: if the MMIO generation number has wrapped,
 	 * zap all shadow pages.
 	 */
-	if (unlikely((gen & MMIO_GEN_MASK) == 0)) {
+	if (unlikely(gen == 0)) {
 		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
 		kvm_mmu_invalidate_zap_all_pages(kvm);
 	}
-- 
cgit v1.2.3


From 5192f9b976f9687569a90602b8a6c053da4498f6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:15 -0800
Subject: KVM: x86: Use a u64 when passing the MMIO gen around

KVM currently uses an 'unsigned int' for the MMIO generation number
despite it being derived from the 64-bit memslots generation and
being propagated to (potentially) 64-bit sptes.  There is no hidden
agenda behind using an 'unsigned int', it's done simply because the
MMIO generation will never set bits above bit 19.

Passing a u64 will allow the "update in-progress" flag to be relocated
from bit 0 to bit 63 and removes the need to cast the generation back
to a u64 when propagating it to a spte.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c2f2c9de63ed..54bb706e40e8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -348,20 +348,20 @@ static inline bool is_access_track_spte(u64 spte)
 #define MMIO_GEN_LOW_MASK		((1 << MMIO_GEN_LOW_SHIFT) - 2)
 #define MMIO_GEN_MASK			((1 << MMIO_GEN_SHIFT) - 1)
 
-static u64 generation_mmio_spte_mask(unsigned int gen)
+static u64 generation_mmio_spte_mask(u64 gen)
 {
 	u64 mask;
 
 	WARN_ON(gen & ~MMIO_GEN_MASK);
 
 	mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
-	mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
+	mask |= (gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
 	return mask;
 }
 
-static unsigned int get_mmio_spte_generation(u64 spte)
+static u64 get_mmio_spte_generation(u64 spte)
 {
-	unsigned int gen;
+	u64 gen;
 
 	spte &= ~shadow_mmio_mask;
 
@@ -370,7 +370,7 @@ static unsigned int get_mmio_spte_generation(u64 spte)
 	return gen;
 }
 
-static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
+static u64 kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
 {
 	return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
 }
@@ -378,7 +378,7 @@ static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
 			   unsigned access)
 {
-	unsigned int gen = kvm_current_mmio_generation(vcpu);
+	u64 gen = kvm_current_mmio_generation(vcpu);
 	u64 mask = generation_mmio_spte_mask(gen);
 	u64 gpa = gfn << PAGE_SHIFT;
 
@@ -426,7 +426,7 @@ static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 
 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 {
-	unsigned int kvm_gen, spte_gen;
+	u64 kvm_gen, spte_gen;
 
 	kvm_gen = kvm_current_mmio_generation(vcpu);
 	spte_gen = get_mmio_spte_generation(spte);
-- 
cgit v1.2.3


From cae7ed3c2cb06680400adab632a243c5e5f42637 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:16 -0800
Subject: KVM: x86: Refactor the MMIO SPTE generation handling

The code to propagate the memslots generation number into MMIO sptes is
a bit convoluted.  The "what" is relatively straightfoward, e.g. the
comment explaining which bits go where is quite readable, but the "how"
requires a lot of staring to understand what is happening.  For example,
'MMIO_GEN_LOW_SHIFT' is actually used to calculate the high bits of the
spte, while 'MMIO_SPTE_GEN_LOW_SHIFT' is used to calculate the low bits.

Refactor the code to:

  - use #defines whose values align with the bits defined in the comment
  - use consistent code for both the high and low mask
  - explicitly highlight the handling of bit 0 (update in-progress flag)
  - explicitly call out that the defines are for MMIO sptes (to avoid
    confusion with the per-vCPU MMIO cache, which uses the full memslots
    generation)

In addition to making the code a little less magical, this paves the way
for moving the update in-progress flag to bit 63 without having to
simultaneously rewrite all of the MMIO spte code.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 76 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 43 insertions(+), 33 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 54bb706e40e8..364b2a737d94 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -332,30 +332,41 @@ static inline bool is_access_track_spte(u64 spte)
 }
 
 /*
- * the low bit of the generation number is always presumed to be zero.
- * This disables mmio caching during memslot updates.  The concept is
- * similar to a seqcount but instead of retrying the access we just punt
- * and ignore the cache.
+ * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
+ * the memslots generation and is derived as follows:
  *
- * spte bits 3-11 are used as bits 1-9 of the generation number,
- * the bits 52-61 are used as bits 10-19 of the generation number.
+ * Bits 1-9 of the memslot generation are propagated to spte bits 3-11
+ * Bits 10-19 of the memslot generation are propagated to spte bits 52-61
+ *
+ * The MMIO generation starts at bit 1 of the memslots generation in order to
+ * skip over bit 0, the KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag.  Including
+ * the flag would require stealing a bit from the "real" generation number and
+ * thus effectively halve the maximum number of MMIO generations that can be
+ * handled before encountering a wrap (which requires a full MMU zap).  The
+ * flag is instead explicitly queried when checking for MMIO spte cache hits.
  */
-#define MMIO_SPTE_GEN_LOW_SHIFT		2
-#define MMIO_SPTE_GEN_HIGH_SHIFT	52
-
-#define MMIO_GEN_SHIFT			20
-#define MMIO_GEN_LOW_SHIFT		10
-#define MMIO_GEN_LOW_MASK		((1 << MMIO_GEN_LOW_SHIFT) - 2)
-#define MMIO_GEN_MASK			((1 << MMIO_GEN_SHIFT) - 1)
-
+#define MMIO_SPTE_GEN_MASK		GENMASK_ULL(19, 1)
+#define MMIO_SPTE_GEN_SHIFT		1
+
+#define MMIO_SPTE_GEN_LOW_START		3
+#define MMIO_SPTE_GEN_LOW_END		11
+#define MMIO_SPTE_GEN_LOW_MASK		GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
+						    MMIO_SPTE_GEN_LOW_START)
+
+#define MMIO_SPTE_GEN_HIGH_START	52
+#define MMIO_SPTE_GEN_HIGH_END		61
+#define MMIO_SPTE_GEN_HIGH_MASK		GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
+						    MMIO_SPTE_GEN_HIGH_START)
 static u64 generation_mmio_spte_mask(u64 gen)
 {
 	u64 mask;
 
-	WARN_ON(gen & ~MMIO_GEN_MASK);
+	WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
 
-	mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
-	mask |= (gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
+	gen >>= MMIO_SPTE_GEN_SHIFT;
+
+	mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
+	mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
 	return mask;
 }
 
@@ -365,20 +376,15 @@ static u64 get_mmio_spte_generation(u64 spte)
 
 	spte &= ~shadow_mmio_mask;
 
-	gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
-	gen |= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
-	return gen;
-}
-
-static u64 kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
-{
-	return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
+	gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
+	gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
+	return gen << MMIO_SPTE_GEN_SHIFT;
 }
 
 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
 			   unsigned access)
 {
-	u64 gen = kvm_current_mmio_generation(vcpu);
+	u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK;
 	u64 mask = generation_mmio_spte_mask(gen);
 	u64 gpa = gfn << PAGE_SHIFT;
 
@@ -409,7 +415,7 @@ static gfn_t get_mmio_spte_gfn(u64 spte)
 
 static unsigned get_mmio_spte_access(u64 spte)
 {
-	u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
+	u64 mask = generation_mmio_spte_mask(MMIO_SPTE_GEN_MASK) | shadow_mmio_mask;
 	return (spte & ~mask) & ~PAGE_MASK;
 }
 
@@ -426,9 +432,13 @@ static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 
 static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
 {
-	u64 kvm_gen, spte_gen;
+	u64 kvm_gen, spte_gen, gen;
+
+	gen = kvm_vcpu_memslots(vcpu)->generation;
+	if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
+		return false;
 
-	kvm_gen = kvm_current_mmio_generation(vcpu);
+	kvm_gen = gen & MMIO_SPTE_GEN_MASK;
 	spte_gen = get_mmio_spte_generation(spte);
 
 	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
@@ -5895,13 +5905,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
 
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 {
-	gen &= MMIO_GEN_MASK;
+	gen &= MMIO_SPTE_GEN_MASK;
 
 	/*
-	 * Shift to eliminate the "update in-progress" flag, which isn't
-	 * included in the spte's generation number.
+	 * Shift to adjust for the "update in-progress" flag, which isn't
+	 * included in the MMIO generation number.
 	 */
-	gen >>= 1;
+	gen >>= MMIO_SPTE_GEN_SHIFT;
 
 	/*
 	 * Generation numbers are incremented in multiples of the number of
-- 
cgit v1.2.3


From 164bf7e56c5a73f2f819c39ba7e0f20e0f97dc7b Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:18 -0800
Subject: KVM: Move the memslot update in-progress flag to bit 63

...now that KVM won't explode by moving it out of bit 0.  Using bit 63
eliminates the need to jump over bit 0, e.g. when calculating a new
memslots generation or when propagating the memslots generation to an
MMIO spte.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 Documentation/virtual/kvm/mmu.txt | 13 ++++++++-----
 arch/x86/kvm/mmu.c                | 31 ++++++++++++-------------------
 include/linux/kvm_host.h          |  4 ++--
 virt/kvm/kvm_main.c               |  8 ++++----
 4 files changed, 26 insertions(+), 30 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/Documentation/virtual/kvm/mmu.txt b/Documentation/virtual/kvm/mmu.txt
index e507a9e0421e..367a952f50ab 100644
--- a/Documentation/virtual/kvm/mmu.txt
+++ b/Documentation/virtual/kvm/mmu.txt
@@ -452,13 +452,16 @@ stored into the MMIO spte.  Thus, the MMIO spte might be created based on
 out-of-date information, but with an up-to-date generation number.
 
 To avoid this, the generation number is incremented again after synchronize_srcu
-returns; thus, the low bit of kvm_memslots(kvm)->generation is only 1 during a
+returns; thus, bit 63 of kvm_memslots(kvm)->generation set to 1 only during a
 memslot update, while some SRCU readers might be using the old copy.  We do not
 want to use an MMIO sptes created with an odd generation number, and we can do
-this without losing a bit in the MMIO spte.  The low bit of the generation
-is not stored in MMIO spte, and presumed zero when it is extracted out of the
-spte.  If KVM is unlucky and creates an MMIO spte while the low bit is 1,
-the next access to the spte will always be a cache miss.
+this without losing a bit in the MMIO spte.  The "update in-progress" bit of the
+generation is not stored in MMIO spte, and is so is implicitly zero when the
+generation is extracted out of the spte.  If KVM is unlucky and creates an MMIO
+spte while an update is in-progress, the next access to the spte will always be
+a cache miss.  For example, a subsequent access during the update window will
+miss due to the in-progress flag diverging, while an access after the update
+window closes will have a higher generation number (as compared to the spte).
 
 
 Further reading
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 364b2a737d94..bcf62e1e1ff7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -335,18 +335,17 @@ static inline bool is_access_track_spte(u64 spte)
  * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
  * the memslots generation and is derived as follows:
  *
- * Bits 1-9 of the memslot generation are propagated to spte bits 3-11
- * Bits 10-19 of the memslot generation are propagated to spte bits 52-61
+ * Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
+ * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61
  *
- * The MMIO generation starts at bit 1 of the memslots generation in order to
- * skip over bit 0, the KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag.  Including
- * the flag would require stealing a bit from the "real" generation number and
- * thus effectively halve the maximum number of MMIO generations that can be
- * handled before encountering a wrap (which requires a full MMU zap).  The
- * flag is instead explicitly queried when checking for MMIO spte cache hits.
+ * The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
+ * the MMIO generation number, as doing so would require stealing a bit from
+ * the "real" generation number and thus effectively halve the maximum number
+ * of MMIO generations that can be handled before encountering a wrap (which
+ * requires a full MMU zap).  The flag is instead explicitly queried when
+ * checking for MMIO spte cache hits.
  */
-#define MMIO_SPTE_GEN_MASK		GENMASK_ULL(19, 1)
-#define MMIO_SPTE_GEN_SHIFT		1
+#define MMIO_SPTE_GEN_MASK		GENMASK_ULL(18, 0)
 
 #define MMIO_SPTE_GEN_LOW_START		3
 #define MMIO_SPTE_GEN_LOW_END		11
@@ -363,8 +362,6 @@ static u64 generation_mmio_spte_mask(u64 gen)
 
 	WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
 
-	gen >>= MMIO_SPTE_GEN_SHIFT;
-
 	mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
 	mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
 	return mask;
@@ -378,7 +375,7 @@ static u64 get_mmio_spte_generation(u64 spte)
 
 	gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
 	gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
-	return gen << MMIO_SPTE_GEN_SHIFT;
+	return gen;
 }
 
 static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
@@ -5905,13 +5902,9 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
 
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 {
-	gen &= MMIO_SPTE_GEN_MASK;
+	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
 
-	/*
-	 * Shift to adjust for the "update in-progress" flag, which isn't
-	 * included in the MMIO generation number.
-	 */
-	gen >>= MMIO_SPTE_GEN_SHIFT;
+	gen &= MMIO_SPTE_GEN_MASK;
 
 	/*
 	 * Generation numbers are incremented in multiples of the number of
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 5e1cb74922b3..85c0c00d5159 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -49,7 +49,7 @@
 #define KVM_MEMSLOT_INVALID	(1UL << 16)
 
 /*
- * Bit 0 of the memslot generation number is an "update in-progress flag",
+ * Bit 63 of the memslot generation number is an "update in-progress flag",
  * e.g. is temporarily set for the duration of install_new_memslots().
  * This flag effectively creates a unique generation number that is used to
  * mark cached memslot data, e.g. MMIO accesses, as potentially being stale,
@@ -67,7 +67,7 @@
  * the actual generation number against accesses that were inserted into the
  * cache *before* the memslots were updated.
  */
-#define KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS	BIT_ULL(0)
+#define KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS	BIT_ULL(63)
 
 /* Two fragments for cross MMIO pages. */
 #define KVM_MAX_MMIO_FRAGMENTS	2
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5c2e7e173a46..c9d0bc01f8cb 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -657,7 +657,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
 		if (!slots)
 			goto out_err_no_srcu;
 		/* Generations must be different for each address space. */
-		slots->generation = i * 2;
+		slots->generation = i;
 		rcu_assign_pointer(kvm->memslots[i], slots);
 	}
 
@@ -890,10 +890,10 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 	 * Generations must be unique even across address spaces.  We do not need
 	 * a global counter for that, instead the generation space is evenly split
 	 * across address spaces.  For example, with two address spaces, address
-	 * space 0 will use generations 0, 4, 8, ... while address space 1 will
-	 * use generations 2, 6, 10, 14, ...
+	 * space 0 will use generations 0, 2, 4, ... while address space 1 will
+	 * use generations 1, 3, 5, ...
 	 */
-	gen += KVM_ADDRESS_SPACE_NUM * 2;
+	gen += KVM_ADDRESS_SPACE_NUM;
 
 	kvm_arch_memslots_updated(kvm, gen);
 
-- 
cgit v1.2.3


From 85875a133ea3aca5e4ea423c7cb991ad53f4866e Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:19 -0800
Subject: KVM: x86/mmu: Move slot_level_*() helper functions up a few lines

...so that kvm_mmu_invalidate_zap_pages_in_memslot() can utilize the
helpers in future patches.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 139 +++++++++++++++++++++++++++--------------------------
 1 file changed, 70 insertions(+), 69 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bcf62e1e1ff7..5a8af2bcd891 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5487,6 +5487,76 @@ void kvm_disable_tdp(void)
 }
 EXPORT_SYMBOL_GPL(kvm_disable_tdp);
 
+
+/* The return value indicates if tlb flush on all vcpus is needed. */
+typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
+
+/* The caller should hold mmu-lock before calling this function. */
+static __always_inline bool
+slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			slot_level_handler fn, int start_level, int end_level,
+			gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
+{
+	struct slot_rmap_walk_iterator iterator;
+	bool flush = false;
+
+	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
+			end_gfn, &iterator) {
+		if (iterator.rmap)
+			flush |= fn(kvm, iterator.rmap);
+
+		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+			if (flush && lock_flush_tlb) {
+				kvm_flush_remote_tlbs(kvm);
+				flush = false;
+			}
+			cond_resched_lock(&kvm->mmu_lock);
+		}
+	}
+
+	if (flush && lock_flush_tlb) {
+		kvm_flush_remote_tlbs(kvm);
+		flush = false;
+	}
+
+	return flush;
+}
+
+static __always_inline bool
+slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+		  slot_level_handler fn, int start_level, int end_level,
+		  bool lock_flush_tlb)
+{
+	return slot_handle_level_range(kvm, memslot, fn, start_level,
+			end_level, memslot->base_gfn,
+			memslot->base_gfn + memslot->npages - 1,
+			lock_flush_tlb);
+}
+
+static __always_inline bool
+slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+		      slot_level_handler fn, bool lock_flush_tlb)
+{
+	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static __always_inline bool
+slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
+			slot_level_handler fn, bool lock_flush_tlb)
+{
+	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
+				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
+}
+
+static __always_inline bool
+slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
+		 slot_level_handler fn, bool lock_flush_tlb)
+{
+	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
+				 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
+}
+
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
 	free_page((unsigned long)vcpu->arch.mmu->pae_root);
@@ -5561,75 +5631,6 @@ void kvm_mmu_uninit_vm(struct kvm *kvm)
 	kvm_page_track_unregister_notifier(kvm, node);
 }
 
-/* The return value indicates if tlb flush on all vcpus is needed. */
-typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head);
-
-/* The caller should hold mmu-lock before calling this function. */
-static __always_inline bool
-slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot,
-			slot_level_handler fn, int start_level, int end_level,
-			gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
-{
-	struct slot_rmap_walk_iterator iterator;
-	bool flush = false;
-
-	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
-			end_gfn, &iterator) {
-		if (iterator.rmap)
-			flush |= fn(kvm, iterator.rmap);
-
-		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
-			if (flush && lock_flush_tlb) {
-				kvm_flush_remote_tlbs(kvm);
-				flush = false;
-			}
-			cond_resched_lock(&kvm->mmu_lock);
-		}
-	}
-
-	if (flush && lock_flush_tlb) {
-		kvm_flush_remote_tlbs(kvm);
-		flush = false;
-	}
-
-	return flush;
-}
-
-static __always_inline bool
-slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-		  slot_level_handler fn, int start_level, int end_level,
-		  bool lock_flush_tlb)
-{
-	return slot_handle_level_range(kvm, memslot, fn, start_level,
-			end_level, memslot->base_gfn,
-			memslot->base_gfn + memslot->npages - 1,
-			lock_flush_tlb);
-}
-
-static __always_inline bool
-slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-		      slot_level_handler fn, bool lock_flush_tlb)
-{
-	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
-				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
-}
-
-static __always_inline bool
-slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot,
-			slot_level_handler fn, bool lock_flush_tlb)
-{
-	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
-				 PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
-}
-
-static __always_inline bool
-slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot,
-		 slot_level_handler fn, bool lock_flush_tlb)
-{
-	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
-				 PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
-}
-
 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 {
 	struct kvm_memslots *slots;
-- 
cgit v1.2.3


From a21136345cb6f1a5b7f576701b6a454da5b6e606 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:20 -0800
Subject: KVM: x86/mmu: Split remote_flush+zap case out of
 kvm_mmu_flush_or_zap()

...and into a separate helper, kvm_mmu_remote_flush_or_zap(), that does
not require a vcpu so that the code can be (re)used by
kvm_mmu_invalidate_zap_pages_in_memslot().

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5a8af2bcd891..1cce120f06ae 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2240,18 +2240,28 @@ static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	return true;
 }
 
+static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
+					struct list_head *invalid_list,
+					bool remote_flush)
+{
+	if (!remote_flush && !list_empty(invalid_list))
+		return false;
+
+	if (!list_empty(invalid_list))
+		kvm_mmu_commit_zap_page(kvm, invalid_list);
+	else
+		kvm_flush_remote_tlbs(kvm);
+	return true;
+}
+
 static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
 				 struct list_head *invalid_list,
 				 bool remote_flush, bool local_flush)
 {
-	if (!list_empty(invalid_list)) {
-		kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
+	if (kvm_mmu_remote_flush_or_zap(vcpu->kvm, invalid_list, remote_flush))
 		return;
-	}
 
-	if (remote_flush)
-		kvm_flush_remote_tlbs(vcpu->kvm);
-	else if (local_flush)
+	if (local_flush)
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 }
 
-- 
cgit v1.2.3


From 4e103134b862314dc2f2f18f2fb0ab972adc3f5f Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:21 -0800
Subject: KVM: x86/mmu: Zap only the relevant pages when removing a memslot

Modify kvm_mmu_invalidate_zap_pages_in_memslot(), a.k.a. the x86 MMU's
handler for kvm_arch_flush_shadow_memslot(), to zap only the pages/PTEs
that actually belong to the memslot being removed.  This improves
performance, especially why the deleted memslot has only a few shadow
entries, or even no entries.  E.g. a microbenchmark to access regular
memory while concurrently reading PCI ROM to trigger memslot deletion
showed a 5% improvement in throughput.

Cc: Xiao Guangrong <guangrong.xiao@gmail.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1cce120f06ae..b81e2cad0237 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5622,7 +5622,38 @@ static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
 			struct kvm_memory_slot *slot,
 			struct kvm_page_track_notifier_node *node)
 {
-	kvm_mmu_invalidate_zap_all_pages(kvm);
+	struct kvm_mmu_page *sp;
+	LIST_HEAD(invalid_list);
+	unsigned long i;
+	bool flush;
+	gfn_t gfn;
+
+	spin_lock(&kvm->mmu_lock);
+
+	if (list_empty(&kvm->arch.active_mmu_pages))
+		goto out_unlock;
+
+	flush = slot_handle_all_level(kvm, slot, kvm_zap_rmapp, false);
+
+	for (i = 0; i < slot->npages; i++) {
+		gfn = slot->base_gfn + i;
+
+		for_each_valid_sp(kvm, sp, gfn) {
+			if (sp->gfn != gfn)
+				continue;
+
+			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+		}
+		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+			kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+			flush = false;
+			cond_resched_lock(&kvm->mmu_lock);
+		}
+	}
+	kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
+
+out_unlock:
+	spin_unlock(&kvm->mmu_lock);
 }
 
 void kvm_mmu_init_vm(struct kvm *kvm)
-- 
cgit v1.2.3


From 4771450c345dc5e3e3417d82aff62e0d88e7eee6 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:23 -0800
Subject: Revert "KVM: MMU: drop kvm_mmu_zap_mmio_sptes"

Revert back to a dedicated (and slower) mechanism for handling the
scenario where all MMIO shadow PTEs need to be zapped due to overflowing
the MMIO generation number.  The MMIO generation scenario is almost
literally a one-in-a-million occurrence, i.e. is not a performance
sensitive scenario.

Restoring kvm_mmu_zap_mmio_sptes() leaves VM teardown as the only user
of kvm_mmu_invalidate_zap_all_pages() and paves the way for removing
the fast invalidate mechanism altogether.

This reverts commit a8eca9dcc656a405a28ffba43f3d86a1ff0eb331.

Cc: Xiao Guangrong <guangrong.xiao@gmail.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/mmu.c              | 22 +++++++++++++++++++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 69daa57b08a7..aeca3fb1cf63 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -319,6 +319,7 @@ struct kvm_mmu_page {
 	struct list_head link;
 	struct hlist_node hash_link;
 	bool unsync;
+	bool mmio_cached;
 
 	/*
 	 * The following two entries are used to key the shadow page in the
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b81e2cad0237..d80c1558b23c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -391,6 +391,8 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
 	mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
 		<< shadow_nonpresent_or_rsvd_mask_len;
 
+	page_header(__pa(sptep))->mmio_cached = true;
+
 	trace_mark_mmio_spte(sptep, gfn, access, gen);
 	mmu_spte_set(sptep, mask);
 }
@@ -5942,6 +5944,24 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
 	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
 }
 
+static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
+{
+	struct kvm_mmu_page *sp, *node;
+	LIST_HEAD(invalid_list);
+
+	spin_lock(&kvm->mmu_lock);
+restart:
+	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
+		if (!sp->mmio_cached)
+			continue;
+		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+			goto restart;
+	}
+
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+	spin_unlock(&kvm->mmu_lock);
+}
+
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 {
 	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
@@ -5963,7 +5983,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 	 */
 	if (unlikely(gen == 0)) {
 		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
-		kvm_mmu_invalidate_zap_all_pages(kvm);
+		kvm_mmu_zap_mmio_sptes(kvm);
 	}
 }
 
-- 
cgit v1.2.3


From 571c5af06e303b4a69016193fd6b5afbc96eac40 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:24 -0800
Subject: KVM: x86/mmu: Voluntarily reschedule as needed when zapping MMIO
 sptes

Call cond_resched_lock() when zapping MMIO to reschedule if needed or to
release and reacquire mmu_lock in case of contention.  There is no need
to flush or zap when temporarily dropping mmu_lock as zapping MMIO sptes
is done when holding the memslots lock and with the "update in-progress"
bit set in the memslots generation, which disables MMIO spte caching.
The walk does need to be restarted if mmu_lock is dropped as the active
pages list may be modified.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d80c1558b23c..2190679eda39 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5954,7 +5954,8 @@ restart:
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
 		if (!sp->mmio_cached)
 			continue;
-		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) ||
+		    cond_resched_lock(&kvm->mmu_lock))
 			goto restart;
 	}
 
-- 
cgit v1.2.3


From 5ff0568374ed2e585376a3832857ade5daccd381 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:25 -0800
Subject: KVM: x86/mmu: Remove is_obsolete() call

Unwinding usage of is_obsolete() is a step towards removing x86's fast
invalidate mechanism, i.e. this is one part of a revert all patches from
the series that introduced the mechanism[1].

This is a partial revert of commit 05988d728dcd ("KVM: MMU: reduce
KVM_REQ_MMU_RELOAD when root page is zapped").

[1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com

Cc: Xiao Guangrong <guangrong.xiao@gmail.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2190679eda39..6cbffc775220 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2713,11 +2713,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	} else {
 		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 
-		/*
-		 * The obsolete pages can not be used on any vcpus.
-		 * See the comments in kvm_mmu_invalidate_zap_all_pages().
-		 */
-		if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
+		if (!sp->role.invalid)
 			kvm_reload_remote_mmus(kvm);
 	}
 
-- 
cgit v1.2.3


From 52d5dedc79bdcbac2976159a172069618cf31be5 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:26 -0800
Subject: Revert "KVM: MMU: reclaim the zapped-obsolete page first"

Unwinding optimizations related to obsolete pages is a step towards
removing x86 KVM's fast invalidate mechanism, i.e. this is one part of
a revert all patches from the series that introduced the mechanism[1].

This reverts commit 365c886860c4ba670d245e762b23987c912c129a.

[1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com

Cc: Xiao Guangrong <guangrong.xiao@gmail.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/mmu.c              | 21 ++++-----------------
 arch/x86/kvm/x86.c              |  1 -
 3 files changed, 4 insertions(+), 19 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index aeca3fb1cf63..fbe16a908076 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -851,7 +851,6 @@ struct kvm_arch {
 	 * Hash table of struct kvm_mmu_page.
 	 */
 	struct list_head active_mmu_pages;
-	struct list_head zapped_obsolete_pages;
 	struct kvm_page_track_notifier_node mmu_sp_tracker;
 	struct kvm_page_track_notifier_head track_notifier_head;
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6cbffc775220..255b0212fc5b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5858,6 +5858,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
 static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
+	LIST_HEAD(invalid_list);
 	int batch = 0;
 
 restart:
@@ -5890,8 +5891,7 @@ restart:
 			goto restart;
 		}
 
-		ret = kvm_mmu_prepare_zap_page(kvm, sp,
-				&kvm->arch.zapped_obsolete_pages);
+		ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 		batch += ret;
 
 		if (ret)
@@ -5902,7 +5902,7 @@ restart:
 	 * Should flush tlb before free page tables since lockless-walking
 	 * may use the pages.
 	 */
-	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 }
 
 /*
@@ -5935,11 +5935,6 @@ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
 	spin_unlock(&kvm->mmu_lock);
 }
 
-static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
-{
-	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
-}
-
 static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
@@ -6011,24 +6006,16 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 		 * want to shrink a VM that only started to populate its MMU
 		 * anyway.
 		 */
-		if (!kvm->arch.n_used_mmu_pages &&
-		      !kvm_has_zapped_obsolete_pages(kvm))
+		if (!kvm->arch.n_used_mmu_pages)
 			continue;
 
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
 
-		if (kvm_has_zapped_obsolete_pages(kvm)) {
-			kvm_mmu_commit_zap_page(kvm,
-			      &kvm->arch.zapped_obsolete_pages);
-			goto unlock;
-		}
-
 		if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
 			freed++;
 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
-unlock:
 		spin_unlock(&kvm->mmu_lock);
 		srcu_read_unlock(&kvm->srcu, idx);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 03d26ffb29cd..78fb13f190a3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9113,7 +9113,6 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
-	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
 
-- 
cgit v1.2.3


From 210f494261e1e84ad1f15877baa1c615afe3b342 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:27 -0800
Subject: Revert "KVM: MMU: collapse TLB flushes when zap all pages"

Unwinding optimizations related to obsolete pages is a step towards
removing x86 KVM's fast invalidate mechanism, i.e. this is one part of
a revert all patches from the series that introduced the mechanism[1].

This reverts commit f34d251d66ba263c077ed9d2bbd1874339a4c887.

[1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com

Cc: Xiao Guangrong <guangrong.xiao@gmail.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 31 +++----------------------------
 1 file changed, 3 insertions(+), 28 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 255b0212fc5b..e733262027ed 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2211,14 +2211,6 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list);
 
-/*
- * NOTE: we should pay more attention on the zapped-obsolete page
- * (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk
- * since it has been deleted from active_mmu_pages but still can be found
- * at hast list.
- *
- * for_each_valid_sp() has skipped that kind of pages.
- */
 #define for_each_valid_sp(_kvm, _sp, _gfn)				\
 	hlist_for_each_entry(_sp,					\
 	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
@@ -5881,13 +5873,11 @@ restart:
 		if (sp->role.invalid)
 			continue;
 
-		/*
-		 * Need not flush tlb since we only zap the sp with invalid
-		 * generation number.
-		 */
 		if (batch >= BATCH_ZAP_PAGES &&
-		      cond_resched_lock(&kvm->mmu_lock)) {
+		      (need_resched() || spin_needbreak(&kvm->mmu_lock))) {
 			batch = 0;
+			kvm_mmu_commit_zap_page(kvm, &invalid_list);
+			cond_resched_lock(&kvm->mmu_lock);
 			goto restart;
 		}
 
@@ -5898,10 +5888,6 @@ restart:
 			goto restart;
 	}
 
-	/*
-	 * Should flush tlb before free page tables since lockless-walking
-	 * may use the pages.
-	 */
 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 }
 
@@ -5920,17 +5906,6 @@ void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
 	trace_kvm_mmu_invalidate_zap_all_pages(kvm);
 	kvm->arch.mmu_valid_gen++;
 
-	/*
-	 * Notify all vcpus to reload its shadow page table
-	 * and flush TLB. Then all vcpus will switch to new
-	 * shadow page table with the new mmu_valid_gen.
-	 *
-	 * Note: we should do this under the protection of
-	 * mmu-lock, otherwise, vcpu would purge shadow page
-	 * but miss tlb flush.
-	 */
-	kvm_reload_remote_mmus(kvm);
-
 	kvm_zap_obsolete_pages(kvm);
 	spin_unlock(&kvm->mmu_lock);
 }
-- 
cgit v1.2.3


From 43d2b14b105fb00b8864c7b0ee7043cc1cc4a969 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:28 -0800
Subject: Revert "KVM: MMU: zap pages in batch"

Unwinding optimizations related to obsolete pages is a step towards
removing x86 KVM's fast invalidate mechanism, i.e. this is one part of
a revert all patches from the series that introduced the mechanism[1].

This reverts commit e7d11c7a894986a13817c1c001e1e7668c5c4eb4.

[1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com

Cc: Xiao Guangrong <guangrong.xiao@gmail.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e733262027ed..cb9fd69d2632 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5846,18 +5846,14 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
 
-#define BATCH_ZAP_PAGES	10
 static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
 	LIST_HEAD(invalid_list);
-	int batch = 0;
 
 restart:
 	list_for_each_entry_safe_reverse(sp, node,
 	      &kvm->arch.active_mmu_pages, link) {
-		int ret;
-
 		/*
 		 * No obsolete page exists before new created page since
 		 * active_mmu_pages is the FIFO list.
@@ -5866,6 +5862,28 @@ restart:
 			break;
 
 		/*
+		 * Do not repeatedly zap a root page to avoid unnecessary
+		 * KVM_REQ_MMU_RELOAD, otherwise we may not be able to
+		 * progress:
+		 *    vcpu 0                        vcpu 1
+		 *                         call vcpu_enter_guest():
+		 *                            1): handle KVM_REQ_MMU_RELOAD
+		 *                                and require mmu-lock to
+		 *                                load mmu
+		 * repeat:
+		 *    1): zap root page and
+		 *        send KVM_REQ_MMU_RELOAD
+		 *
+		 *    2): if (cond_resched_lock(mmu-lock))
+		 *
+		 *                            2): hold mmu-lock and load mmu
+		 *
+		 *                            3): see KVM_REQ_MMU_RELOAD bit
+		 *                                on vcpu->requests is set
+		 *                                then return 1 to call
+		 *                                vcpu_enter_guest() again.
+		 *            goto repeat;
+		 *
 		 * Since we are reversely walking the list and the invalid
 		 * list will be moved to the head, skip the invalid page
 		 * can help us to avoid the infinity list walking.
@@ -5873,18 +5891,13 @@ restart:
 		if (sp->role.invalid)
 			continue;
 
-		if (batch >= BATCH_ZAP_PAGES &&
-		      (need_resched() || spin_needbreak(&kvm->mmu_lock))) {
-			batch = 0;
+		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
 			kvm_mmu_commit_zap_page(kvm, &invalid_list);
 			cond_resched_lock(&kvm->mmu_lock);
 			goto restart;
 		}
 
-		ret = kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
-		batch += ret;
-
-		if (ret)
+		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
 			goto restart;
 	}
 
-- 
cgit v1.2.3


From 42560fb1f3c6c7f730897b7fa7a478bc37e0be50 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:29 -0800
Subject: Revert "KVM: MMU: add tracepoint for kvm_mmu_invalidate_all_pages"

...as part of removing x86 KVM's fast invalidate mechanism, i.e. this
is one part of a revert all patches from the series that introduced the
mechanism[1].

This reverts commit 35006126f024f68727c67001b9cb703c38f69268.

[1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com

Cc: Xiao Guangrong <guangrong.xiao@gmail.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c      |  1 -
 arch/x86/kvm/mmutrace.h | 21 ---------------------
 2 files changed, 22 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cb9fd69d2632..df4025e6f1e1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5916,7 +5916,6 @@ restart:
 void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
 {
 	spin_lock(&kvm->mmu_lock);
-	trace_kvm_mmu_invalidate_zap_all_pages(kvm);
 	kvm->arch.mmu_valid_gen++;
 
 	kvm_zap_obsolete_pages(kvm);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index c73bf4e4988c..cac88910b310 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -282,27 +282,6 @@ TRACE_EVENT(
 	)
 );
 
-TRACE_EVENT(
-	kvm_mmu_invalidate_zap_all_pages,
-	TP_PROTO(struct kvm *kvm),
-	TP_ARGS(kvm),
-
-	TP_STRUCT__entry(
-		__field(unsigned long, mmu_valid_gen)
-		__field(unsigned int, mmu_used_pages)
-	),
-
-	TP_fast_assign(
-		__entry->mmu_valid_gen = kvm->arch.mmu_valid_gen;
-		__entry->mmu_used_pages = kvm->arch.n_used_mmu_pages;
-	),
-
-	TP_printk("kvm-mmu-valid-gen %lx used_pages %x",
-		  __entry->mmu_valid_gen, __entry->mmu_used_pages
-	)
-);
-
-
 TRACE_EVENT(
 	check_mmio_spte,
 	TP_PROTO(u64 spte, unsigned int kvm_gen, unsigned int spte_gen),
-- 
cgit v1.2.3


From 7390de1e99a70895721165d0ccd4a6e16482960a Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:31 -0800
Subject: Revert "KVM: x86: use the fast way to invalidate all pages"

Revert to a slow kvm_mmu_zap_all() for kvm_arch_flush_shadow_all().
Flushing all shadow entries is only done during VM teardown, i.e.
kvm_arch_flush_shadow_all() is only called when the associated MM struct
is being released or when the VM instance is being freed.

Although the performance of teardown itself isn't critical, KVM should
still voluntarily schedule to play nice with the rest of the kernel;
but that can be done without the fast invalidate mechanism in a future
patch.

This reverts commit 6ca18b6950f8dee29361722f28f69847724b276f.

Cc: Xiao Guangrong <guangrong.xiao@gmail.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 15 +++++++++++++++
 arch/x86/kvm/x86.c |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index df4025e6f1e1..5cdeb88850f8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5846,6 +5846,21 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
 
+void kvm_mmu_zap_all(struct kvm *kvm)
+{
+	struct kvm_mmu_page *sp, *node;
+	LIST_HEAD(invalid_list);
+
+	spin_lock(&kvm->mmu_lock);
+restart:
+	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
+		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+			goto restart;
+
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
+	spin_unlock(&kvm->mmu_lock);
+}
+
 static void kvm_zap_obsolete_pages(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 78fb13f190a3..65e4559eef2f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9470,7 +9470,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 
 void kvm_arch_flush_shadow_all(struct kvm *kvm)
 {
-	kvm_mmu_invalidate_zap_all_pages(kvm);
+	kvm_mmu_zap_all(kvm);
 }
 
 void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
-- 
cgit v1.2.3


From 8a674adc11cd4cc59e51eaea6f0cc4b3d5710411 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:32 -0800
Subject: KVM: x86/mmu: skip over invalid root pages when zapping all sptes

...to guarantee forward progress.  When zapped, root pages are marked
invalid and moved to the head of the active pages list until they are
explicitly freed.  Theoretically, having unzappable root pages at the
head of the list could prevent kvm_mmu_zap_all() from making forward
progress were a future patch to add a loop restart after processing a
page, e.g. to drop mmu_lock on contention.

Although kvm_mmu_prepare_zap_page() can theoretically take action on
invalid pages, e.g. to zap unsync children, functionally it's not
necessary (root pages will be re-zapped when freed) and practically
speaking the odds of e.g. @unsync or @unsync_children becoming %true
while zapping all pages is basically nil.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5cdeb88850f8..c79ad7f31fdb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5853,9 +5853,12 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 
 	spin_lock(&kvm->mmu_lock);
 restart:
-	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
+	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
+		if (sp->role.invalid && sp->root_count)
+			continue;
 		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
 			goto restart;
+	}
 
 	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 	spin_unlock(&kvm->mmu_lock);
-- 
cgit v1.2.3


From 5d6317ca4e61a3fa7528f832cd945c42fde8e67f Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:33 -0800
Subject: KVM: x86/mmu: Voluntarily reschedule as needed when zapping all sptes

Call cond_resched_lock() when zapping all sptes to reschedule if needed
or to release and reacquire mmu_lock in case of contention.  There is no
need to flush or zap when temporarily dropping mmu_lock as zapping all
sptes is done only when the owning userspace VMM has exited or when the
VM is being destroyed, i.e. there is no interplay with memslots or MMIO
generations to worry about.

Be paranoid and restart the walk if mmu_lock is dropped to avoid any
potential issues with consuming a stale iterator.  The overhead in doing
so is negligible as at worst there will be a few root shadow pages at
the head of the list, i.e. the iterator is essentially the head of the
list already.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c79ad7f31fdb..fa153d771f47 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5856,7 +5856,8 @@ restart:
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
 		if (sp->role.invalid && sp->root_count)
 			continue;
-		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
+		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) ||
+		    cond_resched_lock(&kvm->mmu_lock))
 			goto restart;
 	}
 
-- 
cgit v1.2.3


From ea145aacf4ae8485cf179a4d0dc502e9f75044f4 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:34 -0800
Subject: Revert "KVM: MMU: fast invalidate all pages"

Remove x86 KVM's fast invalidate mechanism, i.e. revert all patches
from the original series[1], now that all users of the fast invalidate
mechanism are gone.

This reverts commit 5304b8d37c2a5ebca48330f5e7868d240eafbed1.

[1] https://lkml.kernel.org/r/1369960590-14138-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com

Cc: Xiao Guangrong <guangrong.xiao@gmail.com>
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 -
 arch/x86/kvm/mmu.c              | 98 +----------------------------------------
 arch/x86/kvm/mmu.h              |  1 -
 3 files changed, 1 insertion(+), 100 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fbe16a908076..9417febf8490 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -334,7 +334,6 @@ struct kvm_mmu_page {
 	int root_count;          /* Currently serving as active root */
 	unsigned int unsync_children;
 	struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */
-	unsigned long mmu_valid_gen;
 	DECLARE_BITMAP(unsync_child_bitmap, 512);
 
 #ifdef CONFIG_X86_32
@@ -845,7 +844,6 @@ struct kvm_arch {
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_max_mmu_pages;
 	unsigned int indirect_shadow_pages;
-	unsigned long mmu_valid_gen;
 	struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
 	/*
 	 * Hash table of struct kvm_mmu_page.
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fa153d771f47..6d602d4c3ca4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2060,12 +2060,6 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct
 	if (!direct)
 		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
-
-	/*
-	 * The active_mmu_pages list is the FIFO list, do not move the
-	 * page until it is zapped. kvm_zap_obsolete_pages depends on
-	 * this feature. See the comments in kvm_zap_obsolete_pages().
-	 */
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
 	return sp;
@@ -2214,7 +2208,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 #define for_each_valid_sp(_kvm, _sp, _gfn)				\
 	hlist_for_each_entry(_sp,					\
 	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
-		if (is_obsolete_sp((_kvm), (_sp)) || (_sp)->role.invalid) {    \
+		if ((_sp)->role.invalid) {    \
 		} else
 
 #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn)			\
@@ -2266,11 +2260,6 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
 static void mmu_audit_disable(void) { }
 #endif
 
-static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
-}
-
 static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			 struct list_head *invalid_list)
 {
@@ -2495,7 +2484,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
 			flush |= kvm_sync_pages(vcpu, gfn, &invalid_list);
 	}
-	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
 	clear_page(sp->spt);
 	trace_kvm_mmu_get_page(sp, true);
 
@@ -4206,14 +4194,6 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
 			return false;
 
 		if (cached_root_available(vcpu, new_cr3, new_role)) {
-			/*
-			 * It is possible that the cached previous root page is
-			 * obsolete because of a change in the MMU
-			 * generation number. However, that is accompanied by
-			 * KVM_REQ_MMU_RELOAD, which will free the root that we
-			 * have set here and allocate a new one.
-			 */
-
 			kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
 			if (!skip_tlb_flush) {
 				kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
@@ -5865,82 +5845,6 @@ restart:
 	spin_unlock(&kvm->mmu_lock);
 }
 
-static void kvm_zap_obsolete_pages(struct kvm *kvm)
-{
-	struct kvm_mmu_page *sp, *node;
-	LIST_HEAD(invalid_list);
-
-restart:
-	list_for_each_entry_safe_reverse(sp, node,
-	      &kvm->arch.active_mmu_pages, link) {
-		/*
-		 * No obsolete page exists before new created page since
-		 * active_mmu_pages is the FIFO list.
-		 */
-		if (!is_obsolete_sp(kvm, sp))
-			break;
-
-		/*
-		 * Do not repeatedly zap a root page to avoid unnecessary
-		 * KVM_REQ_MMU_RELOAD, otherwise we may not be able to
-		 * progress:
-		 *    vcpu 0                        vcpu 1
-		 *                         call vcpu_enter_guest():
-		 *                            1): handle KVM_REQ_MMU_RELOAD
-		 *                                and require mmu-lock to
-		 *                                load mmu
-		 * repeat:
-		 *    1): zap root page and
-		 *        send KVM_REQ_MMU_RELOAD
-		 *
-		 *    2): if (cond_resched_lock(mmu-lock))
-		 *
-		 *                            2): hold mmu-lock and load mmu
-		 *
-		 *                            3): see KVM_REQ_MMU_RELOAD bit
-		 *                                on vcpu->requests is set
-		 *                                then return 1 to call
-		 *                                vcpu_enter_guest() again.
-		 *            goto repeat;
-		 *
-		 * Since we are reversely walking the list and the invalid
-		 * list will be moved to the head, skip the invalid page
-		 * can help us to avoid the infinity list walking.
-		 */
-		if (sp->role.invalid)
-			continue;
-
-		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
-			kvm_mmu_commit_zap_page(kvm, &invalid_list);
-			cond_resched_lock(&kvm->mmu_lock);
-			goto restart;
-		}
-
-		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
-			goto restart;
-	}
-
-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
-}
-
-/*
- * Fast invalidate all shadow pages and use lock-break technique
- * to zap obsolete pages.
- *
- * It's required when memslot is being deleted or VM is being
- * destroyed, in these cases, we should ensure that KVM MMU does
- * not use any resource of the being-deleted slot or all slots
- * after calling the function.
- */
-void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
-{
-	spin_lock(&kvm->mmu_lock);
-	kvm->arch.mmu_valid_gen++;
-
-	kvm_zap_obsolete_pages(kvm);
-	spin_unlock(&kvm->mmu_lock);
-}
-
 static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c7b333147c4a..bbdc60f2fae8 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -203,7 +203,6 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
 	return -(u32)fault & errcode;
 }
 
-void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm);
 void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
 
 void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
-- 
cgit v1.2.3


From 83cdb56864bcb1466b454f17fff47348ca7925a2 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:35 -0800
Subject: KVM: x86/mmu: Differentiate between nr zapped and list unstable

The return value of kvm_mmu_prepare_zap_page() has evolved to become
overloaded to convey two separate pieces of information.  1) was at
least one page zapped and 2) has the list of MMU pages become unstable.

In it's original incarnation (as kvm_mmu_zap_page()), there was no
return value at all.  Commit 0738541396be ("KVM: MMU: awareness of new
kvm_mmu_zap_page behaviour") added a return value in preparation for
commit 4731d4c7a077 ("KVM: MMU: out of sync shadow core").  Although
the return value was of type 'int', it was actually used as a boolean
to indicate whether or not active_mmu_pages may have become unstable due
to zapping children.  Walking a list with list_for_each_entry_safe()
only protects against deleting/moving the current entry, i.e. zapping a
child page would break iteration due to modifying any number of entries.

Later, commit 60c8aec6e2c9 ("KVM: MMU: use page array in unsync walk")
modified mmu_zap_unsync_children() to return an approximation of the
number of children zapped.  This was not intentional, it was simply a
side effect of how the code was written.

The unintented side affect was then morphed into an actual feature by
commit 77662e0028c7 ("KVM: MMU: fix kvm_mmu_zap_page() and its calling
path"), which modified kvm_mmu_change_mmu_pages() to use the number of
zapped pages when determining the number of MMU pages in use by the VM.

Finally, commit 54a4f0239f2e ("KVM: MMU: make kvm_mmu_zap_page() return
the number of pages it actually freed") added the initial page to the
return value to make its behavior more consistent with what most users
would expect.  Incorporating the initial parent page in the return value
of kvm_mmu_zap_page() breaks the original usage of restarting a list
walk on a non-zero return value to handle a potentially unstable list,
i.e. walks will unnecessarily restart when any page is zapped.

Fix this by restoring the original behavior of kvm_mmu_zap_page(), i.e.
return a boolean to indicate that the list may be unstable and move the
number of zapped children to a dedicated parameter.  Since the majority
of callers to kvm_mmu_prepare_zap_page() don't care about either return
value, preserve the current definition of kvm_mmu_prepare_zap_page() by
making it a wrapper of a new helper, __kvm_mmu_prepare_zap_page().  This
avoids having to update every call site and also provides cleaner code
for functions that only care about the number of pages zapped.

Fixes: 54a4f0239f2e ("KVM: MMU: make kvm_mmu_zap_page() return
                      the number of pages it actually freed")
Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 36 ++++++++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 10 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6d602d4c3ca4..4b93fcdf0839 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2200,8 +2200,8 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	--kvm->stat.mmu_unsync;
 }
 
-static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
-				    struct list_head *invalid_list);
+static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				     struct list_head *invalid_list);
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list);
 
@@ -2669,17 +2669,22 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
 	return zapped;
 }
 
-static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
-				    struct list_head *invalid_list)
+static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
+				       struct kvm_mmu_page *sp,
+				       struct list_head *invalid_list,
+				       int *nr_zapped)
 {
-	int ret;
+	bool list_unstable;
 
 	trace_kvm_mmu_prepare_zap_page(sp);
 	++kvm->stat.mmu_shadow_zapped;
-	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
+	*nr_zapped = mmu_zap_unsync_children(kvm, sp, invalid_list);
 	kvm_mmu_page_unlink_children(kvm, sp);
 	kvm_mmu_unlink_parents(kvm, sp);
 
+	/* Zapping children means active_mmu_pages has become unstable. */
+	list_unstable = *nr_zapped;
+
 	if (!sp->role.invalid && !sp->role.direct)
 		unaccount_shadowed(kvm, sp);
 
@@ -2687,7 +2692,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 		kvm_unlink_unsync_page(kvm, sp);
 	if (!sp->root_count) {
 		/* Count self */
-		ret++;
+		(*nr_zapped)++;
 		list_move(&sp->link, invalid_list);
 		kvm_mod_used_mmu_pages(kvm, -1);
 	} else {
@@ -2698,7 +2703,16 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	}
 
 	sp->role.invalid = 1;
-	return ret;
+	return list_unstable;
+}
+
+static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				     struct list_head *invalid_list)
+{
+	int nr_zapped;
+
+	__kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, &nr_zapped);
+	return nr_zapped;
 }
 
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
@@ -5830,13 +5844,14 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
 	LIST_HEAD(invalid_list);
+	int ign;
 
 	spin_lock(&kvm->mmu_lock);
 restart:
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
 		if (sp->role.invalid && sp->root_count)
 			continue;
-		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) ||
+		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign) ||
 		    cond_resched_lock(&kvm->mmu_lock))
 			goto restart;
 	}
@@ -5849,13 +5864,14 @@ static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
 {
 	struct kvm_mmu_page *sp, *node;
 	LIST_HEAD(invalid_list);
+	int ign;
 
 	spin_lock(&kvm->mmu_lock);
 restart:
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
 		if (!sp->mmio_cached)
 			continue;
-		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list) ||
+		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign) ||
 		    cond_resched_lock(&kvm->mmu_lock))
 			goto restart;
 	}
-- 
cgit v1.2.3


From 24efe61f696c7b44a66c7d6a41c181b31aa338fc Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:36 -0800
Subject: KVM: x86/mmu: WARN if zapping a MMIO spte results in zapping children

Paolo expressed a concern that kvm_mmu_zap_mmio_sptes() could have a
quadratic runtime[1], i.e. restarting the spte walk while zapping only
MMIO sptes could result in re-walking large portions of the list over
and over due to the non-MMIO sptes encountered before the restart not
being removed.

At the time, the concern was legitimate as the walk was restarted when
any spte was zapped.  But that is no longer the case as the walk is now
restarted iff one or more children have been zapped, which is necessary
because zapping children makes the active_mmu_pages list unstable.

Furthermore, it should be impossible for an MMIO spte to have children,
i.e. zapping an MMIO spte should never result in zapping children.  In
other words, kvm_mmu_zap_mmio_sptes() should never restart its walk, and
so should always execute in linear time.  WARN if this assertion fails.

Although it should never be needed, leave the restart logic in place.
In normal operation, the cost is at worst an extra CMP+Jcc, and if for
some reason the list does become unstable, not restarting would likely
crash KVM, or worse, the kernel.

[1] https://patchwork.kernel.org/patch/10756589/#22452085

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4b93fcdf0839..81618070367b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5871,8 +5871,11 @@ restart:
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
 		if (!sp->mmio_cached)
 			continue;
-		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign) ||
-		    cond_resched_lock(&kvm->mmu_lock))
+		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) {
+			WARN_ON_ONCE(1);
+			goto restart;
+		}
+		if (cond_resched_lock(&kvm->mmu_lock))
 			goto restart;
 	}
 
-- 
cgit v1.2.3


From 8ab3c471eef20925bf64c6d4fa46e88cdb4e86d5 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <sean.j.christopherson@intel.com>
Date: Tue, 5 Feb 2019 13:01:37 -0800
Subject: KVM: x86/mmu: Consolidate kvm_mmu_zap_all() and
 kvm_mmu_zap_mmio_sptes()

...via a new helper, __kvm_mmu_zap_all().  An alternative to passing a
'bool mmio_only' would be to pass a callback function to filter the
shadow page, i.e. to make __kvm_mmu_zap_all() generic and reusable, but
zapping all shadow pages is a last resort, i.e. making the helper less
extensible is a feature of sorts.  And the explicit MMIO parameter makes
it easy to preserve the WARN_ON_ONCE() if a restart is triggered when
zapping MMIO sptes.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 33 ++++++++++-----------------------
 1 file changed, 10 insertions(+), 23 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 81618070367b..8d43b7c0f56f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5840,7 +5840,7 @@ void kvm_mmu_slot_set_dirty(struct kvm *kvm,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
 
-void kvm_mmu_zap_all(struct kvm *kvm)
+static void __kvm_mmu_zap_all(struct kvm *kvm, bool mmio_only)
 {
 	struct kvm_mmu_page *sp, *node;
 	LIST_HEAD(invalid_list);
@@ -5849,30 +5849,12 @@ void kvm_mmu_zap_all(struct kvm *kvm)
 	spin_lock(&kvm->mmu_lock);
 restart:
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
-		if (sp->role.invalid && sp->root_count)
+		if (mmio_only && !sp->mmio_cached)
 			continue;
-		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign) ||
-		    cond_resched_lock(&kvm->mmu_lock))
-			goto restart;
-	}
-
-	kvm_mmu_commit_zap_page(kvm, &invalid_list);
-	spin_unlock(&kvm->mmu_lock);
-}
-
-static void kvm_mmu_zap_mmio_sptes(struct kvm *kvm)
-{
-	struct kvm_mmu_page *sp, *node;
-	LIST_HEAD(invalid_list);
-	int ign;
-
-	spin_lock(&kvm->mmu_lock);
-restart:
-	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
-		if (!sp->mmio_cached)
+		if (sp->role.invalid && sp->root_count)
 			continue;
 		if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) {
-			WARN_ON_ONCE(1);
+			WARN_ON_ONCE(mmio_only);
 			goto restart;
 		}
 		if (cond_resched_lock(&kvm->mmu_lock))
@@ -5883,6 +5865,11 @@ restart:
 	spin_unlock(&kvm->mmu_lock);
 }
 
+void kvm_mmu_zap_all(struct kvm *kvm)
+{
+	return __kvm_mmu_zap_all(kvm, false);
+}
+
 void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 {
 	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
@@ -5904,7 +5891,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
 	 */
 	if (unlikely(gen == 0)) {
 		kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
-		kvm_mmu_zap_mmio_sptes(kvm);
+		__kvm_mmu_zap_all(kvm, true);
 	}
 }
 
-- 
cgit v1.2.3


From 92da008fa21034c369cdb8ca2b629fe5c196826b Mon Sep 17 00:00:00 2001
From: Ben Gardon <bgardon@google.com>
Date: Tue, 12 Mar 2019 11:45:58 -0700
Subject: Revert "KVM/MMU: Flush tlb directly in the kvm_zap_gfn_range()"

This reverts commit 71883a62fcd6c70639fa12cda733378b4d997409.

The above commit contains an optimization to kvm_zap_gfn_range which
uses gfn-limited TLB flushes, if enabled. If using these limited flushes,
kvm_zap_gfn_range passes lock_flush_tlb=false to slot_handle_level_range
which creates a race when the function unlocks to call cond_resched.
See an example of this race below:

CPU 0                   CPU 1                           CPU 3
// zap_direct_gfn_range
mmu_lock()
// *ptep == pte_1
*ptep = 0
if (lock_flush_tlb)
        flush_tlbs()
mmu_unlock()
                        // In invalidate range
                        // MMU notifier
                        mmu_lock()
                        if (pte != 0)
                                *ptep = 0
                                flush = true
                        if (flush)
                                flush_remote_tlbs()
                        mmu_unlock()
                        return
                        // Host MM reallocates
                        // page previously
                        // backing guest memory.
                                                        // Guest accesses
                                                        // invalid page
                                                        // through pte_1
                                                        // in its TLB!!

Tested: Ran all kvm-unit-tests on a Intel Haswell machine with and
	without this patch. The patch introduced no new failures.

Signed-off-by: Ben Gardon <bgardon@google.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

(limited to 'arch/x86/kvm/mmu.c')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8d43b7c0f56f..4cda5ee48845 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5660,13 +5660,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 {
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
-	bool flush_tlb = true;
-	bool flush = false;
 	int i;
 
-	if (kvm_available_flush_tlb_with_range())
-		flush_tlb = false;
-
 	spin_lock(&kvm->mmu_lock);
 	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
 		slots = __kvm_memslots(kvm, i);
@@ -5678,17 +5673,12 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
 			if (start >= end)
 				continue;
 
-			flush |= slot_handle_level_range(kvm, memslot,
-					kvm_zap_rmapp, PT_PAGE_TABLE_LEVEL,
-					PT_MAX_HUGEPAGE_LEVEL, start,
-					end - 1, flush_tlb);
+			slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+						PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
+						start, end - 1, true);
 		}
 	}
 
-	if (flush)
-		kvm_flush_remote_tlbs_with_address(kvm, gfn_start,
-				gfn_end - gfn_start + 1);
-
 	spin_unlock(&kvm->mmu_lock);
 }
 
-- 
cgit v1.2.3