From b254244d2682fe975630f176c25a4444cc4e088d Mon Sep 17 00:00:00 2001
From: Daniel De Graaf <dgdegra@tycho.nsa.gov>
Date: Thu, 24 Mar 2011 08:10:07 -0400
Subject: xen/p2m: Allocate p2m tracking pages on override

It is possible to add a p2m override on pages that are currently mapped
to INVALID_P2M_ENTRY; in particular, this will happen when using
ballooned pages in gntdev. This means that set_phys_to_machine must be
used instead of __set_phys_to_machine.

Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/p2m.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 215a3ce61068..2a44edd606e5 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -671,7 +671,9 @@ int m2p_add_override(unsigned long mfn, struct page *page)
 	page->private = mfn;
 	page->index = pfn_to_mfn(pfn);
 
-	__set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
+	if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
+		return -ENOMEM;
+
 	if (!PageHighMem(page))
 		/* Just zap old mapping for now */
 		pte_clear(&init_mm, address, ptep);
@@ -709,7 +711,7 @@ int m2p_remove_override(struct page *page)
 	spin_lock_irqsave(&m2p_override_lock, flags);
 	list_del(&page->lru);
 	spin_unlock_irqrestore(&m2p_override_lock, flags);
-	__set_phys_to_machine(pfn, page->index);
+	set_phys_to_machine(pfn, page->index);
 
 	if (!PageHighMem(page))
 		set_pte_at(&init_mm, address, ptep,
-- 
cgit v1.2.3


From b83c6e55ac482f08984504d61382ecf05f0afe32 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 24 Mar 2011 13:34:32 -0700
Subject: xen: fix p2m section mismatches

Fix section mismatch warnings:
set_phys_range_identity() is called by __init xen_set_identity(),
so also mark set_phys_range_identity() as __init.
then:
__early_alloc_p2m() is called set_phys_range_identity(), so also mark
__early_alloc_p2m() as __init.

WARNING: arch/x86/built-in.o(.text+0x7856): Section mismatch in reference from the function __early_alloc_p2m() to the function .init.text:extend_brk()
The function __early_alloc_p2m() references
the function __init extend_brk().
This is often because __early_alloc_p2m lacks a __init
annotation or the annotation of extend_brk is wrong.

WARNING: arch/x86/built-in.o(.text+0x7967): Section mismatch in reference from the function set_phys_range_identity() to the function .init.text:extend_brk()
The function set_phys_range_identity() references
the function __init extend_brk().
This is often because set_phys_range_identity lacks a __init
annotation or the annotation of extend_brk is wrong.

[v2: Per Stephen Hemming recommonedation made __early_alloc_p2m static]
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/p2m.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 2a44edd606e5..141eb0de8b06 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -497,7 +497,7 @@ static bool alloc_p2m(unsigned long pfn)
 	return true;
 }
 
-bool __early_alloc_p2m(unsigned long pfn)
+static bool __init __early_alloc_p2m(unsigned long pfn)
 {
 	unsigned topidx, mididx, idx;
 
@@ -530,7 +530,7 @@ bool __early_alloc_p2m(unsigned long pfn)
 	}
 	return idx != 0;
 }
-unsigned long set_phys_range_identity(unsigned long pfn_s,
+unsigned long __init set_phys_range_identity(unsigned long pfn_s,
 				      unsigned long pfn_e)
 {
 	unsigned long pfn;
-- 
cgit v1.2.3


From d88885d0923ae27b01dfcec644f94829b1e46bea Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Mon, 4 Apr 2011 14:48:20 -0400
Subject: xen/debug: Don't be so verbose with WARN on 1-1 mapping errors.

There are valid situations in which this error is not
a warning. Mainly when QEMU maps a guest memory and uses
the VM_IO flag to set the MFNs. For right now make the
WARN be WARN_ONCE. In the future we will:

 1). Remove the VM_IO code handling..
 2). .. which will also remove this debug facility.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index c82df6c9c0f0..a991b57f91fe 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -565,13 +565,13 @@ pte_t xen_make_pte_debug(pteval_t pte)
 	if (io_page &&
 	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
 		other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
-		WARN(addr != other_addr,
+		WARN_ONCE(addr != other_addr,
 			"0x%lx is using VM_IO, but it is 0x%lx!\n",
 			(unsigned long)addr, (unsigned long)other_addr);
 	} else {
 		pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
 		other_addr = (_pte.pte & PTE_PFN_MASK);
-		WARN((addr == other_addr) && (!io_page) && (!iomap_set),
+		WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
 			"0x%lx is missing VM_IO (and wasn't fixed)!\n",
 			(unsigned long)addr);
 	}
-- 
cgit v1.2.3


From 61f4237d5b005767a76f4f3694e68e6f78f392d9 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Sat, 18 Sep 2010 22:25:30 -0700
Subject: xen: just completely disable XSAVE

Some (old) versions of Xen just kill the domain if it tries to set any
unknown bits in CR4, so we can't reliably probe for OSXSAVE in
CR4.

Since Xen doesn't support XSAVE for guests at the moment, and no such
support is being worked on, there's no downside in just unconditionally
masking XSAVE support.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/enlighten.c | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 49dbd78ec3cb..66272a237622 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -250,23 +250,7 @@ static __init void xen_init_cpuid_mask(void)
 			~((1 << X86_FEATURE_APIC) |  /* disable local APIC */
 			  (1 << X86_FEATURE_ACPI));  /* disable ACPI */
 
-	ax = 1;
-	cx = 0;
-	xen_cpuid(&ax, &bx, &cx, &dx);
-
-	/* cpuid claims we support xsave; try enabling it to see what happens */
-	if (cx & (1 << (X86_FEATURE_XSAVE % 32))) {
-		unsigned long cr4;
-
-		set_in_cr4(X86_CR4_OSXSAVE);
-		
-		cr4 = read_cr4();
-
-		if ((cr4 & X86_CR4_OSXSAVE) == 0)
-			cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
-
-		clear_in_cr4(X86_CR4_OSXSAVE);
-	}
+	cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32)); /* disable XSAVE */
 }
 
 static void xen_set_debugreg(int reg, unsigned long val)
-- 
cgit v1.2.3


From 947ccf9c3c30307b774af3666ee74fcd9f47f646 Mon Sep 17 00:00:00 2001
From: Shan Haitao <haitao.shan@intel.com>
Date: Tue, 9 Nov 2010 11:43:36 -0800
Subject: xen: Allow PV-OPS kernel to detect whether XSAVE is supported

Xen fails to mask XSAVE from the cpuid feature, despite not historically
supporting guest use of XSAVE.  However, now that XSAVE support has been
added to Xen, we need to reliably detect its presence.

The most reliable way to do this is to look at the OSXSAVE feature in
cpuid which is set iff the OS (Xen, in this case), has set
CR4.OSXSAVE.

[ Cleaned up conditional a bit. - Jeremy ]

Signed-off-by: Shan Haitao <haitao.shan@intel.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/enlighten.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 66272a237622..e3c6a06cf725 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -238,6 +238,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
 static __init void xen_init_cpuid_mask(void)
 {
 	unsigned int ax, bx, cx, dx;
+	unsigned int xsave_mask;
 
 	cpuid_leaf1_edx_mask =
 		~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
@@ -249,8 +250,16 @@ static __init void xen_init_cpuid_mask(void)
 		cpuid_leaf1_edx_mask &=
 			~((1 << X86_FEATURE_APIC) |  /* disable local APIC */
 			  (1 << X86_FEATURE_ACPI));  /* disable ACPI */
+	ax = 1;
+	xen_cpuid(&ax, &bx, &cx, &dx);
 
-	cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32)); /* disable XSAVE */
+	xsave_mask =
+		(1 << (X86_FEATURE_XSAVE % 32)) |
+		(1 << (X86_FEATURE_OSXSAVE % 32));
+
+	/* Xen will set CR4.OSXSAVE if supported and not disabled by force */
+	if ((cx & xsave_mask) != xsave_mask)
+		cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
 }
 
 static void xen_set_debugreg(int reg, unsigned long val)
-- 
cgit v1.2.3


From d419e4c0f7584ffc5c72d9aeeaac485cc756ebcf Mon Sep 17 00:00:00 2001
From: Shriram Rajagopalan <rshriram@cs.ubc.ca>
Date: Mon, 11 Apr 2011 22:54:48 +0200
Subject: fix XEN_SAVE_RESTORE Kconfig dependencies

Make XEN_SAVE_RESTORE select HIBERNATE_CALLBACKS.
Remove XEN_SAVE_RESTORE dependency from PM_SLEEP.

Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca>
Acked-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 arch/x86/xen/Kconfig | 1 +
 kernel/power/Kconfig | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 1c7121ba18ff..5cc821cb2e09 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -39,6 +39,7 @@ config XEN_MAX_DOMAIN_MEMORY
 config XEN_SAVE_RESTORE
        bool
        depends on XEN
+       select HIBERNATE_CALLBACKS
        default y
 
 config XEN_DEBUG_FS
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 049791468d37..6de9a8fc3417 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -89,7 +89,7 @@ config PM_STD_PARTITION
 
 config PM_SLEEP
 	def_bool y
-	depends on SUSPEND || HIBERNATE_CALLBACKS || XEN_SAVE_RESTORE
+	depends on SUSPEND || HIBERNATE_CALLBACKS
 
 config PM_SLEEP_SMP
 	def_bool y
-- 
cgit v1.2.3


From 24bdb0b62cc82120924762ae6bc85afc8c3f2b26 Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Tue, 12 Apr 2011 12:19:52 +0100
Subject: xen: do not create the extra e820 region at an addr lower than 4G

Do not add the extra e820 region at a physical address lower than 4G
because it breaks e820_end_of_low_ram_pfn().

It is OK for us to move the xen_extra_mem_start up and down because this
is the index of the memory that can be ballooned in/out - it is memory
not available to the kernel during bootup.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/setup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index fa0269a99377..90bac0aac3a5 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -227,7 +227,7 @@ char * __init xen_memory_setup(void)
 
 	memcpy(map_raw, map, sizeof(map));
 	e820.nr_map = 0;
-	xen_extra_mem_start = mem_end;
+	xen_extra_mem_start = max((1ULL << 32), mem_end);
 	for (i = 0; i < memmap.nr_entries; i++) {
 		unsigned long long end;
 
-- 
cgit v1.2.3


From ee176455e28469e2420032aab3db11ac2ae3eaa8 Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Tue, 19 Apr 2011 14:47:31 +0100
Subject: xen: mask_rw_pte: do not apply the early_ioremap checks on x86_32

The two "is_early_ioremap_ptep" checks in mask_rw_pte are only used on
x86_64, in fact early_ioremap is not used at all to setup the initial
pagetable on x86_32.
Moreover on x86_32 the two checks are wrong because the range
pgt_buf_start..pgt_buf_end initially should be mapped RW because
the pages in the range are not pagetable pages yet and haven't been
cleared yet. Afterwards considering the pgt_buf_start..pgt_buf_end is
part of the initial mapping, xen_alloc_pte is capable of turning
the ptes RO when they become pagetable pages.

Fix the issue and improve the readability of the code providing two
different implementation of mask_rw_pte for x86_32 and x86_64.

Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index a991b57f91fe..aef7af92b28b 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1473,16 +1473,20 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
 #endif
 }
 
+#ifdef CONFIG_X86_32
 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
 {
-	unsigned long pfn = pte_pfn(pte);
-
-#ifdef CONFIG_X86_32
 	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
 	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
 		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
 			       pte_val_ma(pte));
-#endif
+
+	return pte;
+}
+#else /* CONFIG_X86_64 */
+static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+	unsigned long pfn = pte_pfn(pte);
 
 	/*
 	 * If the new pfn is within the range of the newly allocated
@@ -1497,6 +1501,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
 
 	return pte;
 }
+#endif /* CONFIG_X86_64 */
 
 /* Init-time set_pte while constructing initial pagetables, which
    doesn't allow RO pagetable pages to be remapped RW */
-- 
cgit v1.2.3


From a38647837a411f7df79623128421eef2118b5884 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Fri, 29 Apr 2011 11:34:00 -0400
Subject: xen/mmu: Add workaround "x86-64, mm: Put early page table high"

As a consequence of the commit:

commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e
Author: Yinghai Lu <yinghai@kernel.org>
Date:   Fri Dec 17 16:58:28 2010 -0800

    x86-64, mm: Put early page table high

it causes the Linux kernel to crash under Xen:

mapping kernel into physical memory
Xen: setup ISA identity maps
about to get started...
(XEN) mm.c:2466:d0 Bad type (saw 7400000000000001 != exp 1000000000000000) for mfn b1d89 (pfn bacf7)
(XEN) mm.c:3027:d0 Error while pinning mfn b1d89
(XEN) traps.c:481:d0 Unhandled invalid opcode fault/trap [#6] on VCPU 0 [ec=0000]
(XEN) domain_crash_sync called from entry.S
(XEN) Domain 0 (vcpu#0) crashed on cpu#0:
...

The reason is that at some point init_memory_mapping is going to reach
the pagetable pages area and map those pages too (mapping them as normal
memory that falls in the range of addresses passed to init_memory_mapping
as argument). Some of those pages are already pagetable pages (they are
in the range pgt_buf_start-pgt_buf_end) therefore they are going to be
mapped RO and everything is fine.
Some of these pages are not pagetable pages yet (they fall in the range
pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
are going to be mapped RW.  When these pages become pagetable pages and
are hooked into the pagetable, xen will find that the guest has already
a RW mapping of them somewhere and fail the operation.
The reason Xen requires pagetables to be RO is that the hypervisor needs
to verify that the pagetables are valid before using them. The validation
operations are called "pinning" (more details in arch/x86/xen/mmu.c).

In order to fix the issue we mark all the pages in the entire range
pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation
is completed only the range pgt_buf_start-pgt_buf_end is reserved by
init_memory_mapping. Hence the kernel is going to crash as soon as one
of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those
ranges are RO).

For this reason, this function is introduced which is called _after_
the init_memory_mapping has completed (in a perfect world we would
call this function from init_memory_mapping, but lets ignore that).

Because we are called _after_ init_memory_mapping the pgt_buf_[start,
end,top] have all changed to new values (b/c another init_memory_mapping
is called). Hence, the first time we enter this function, we save
away the pgt_buf_start value and update the pgt_buf_[end,top].

When we detect that the "old" pgt_buf_start through pgt_buf_end
PFNs have been reserved (so memblock_x86_reserve_range has been called),
we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top.

And then we update those "old" pgt_buf_[end|top] with the new ones
so that we can redo this on the next pagetable.

Acked-by: "H. Peter Anvin" <hpa@zytor.com>
Reviewed-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
[v1: Updated with Jeremy's comments]
[v2: Added the crash output]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 123 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)

(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index aef7af92b28b..1bca25f60ff2 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1463,6 +1463,119 @@ static int xen_pgd_alloc(struct mm_struct *mm)
 	return ret;
 }
 
+#ifdef CONFIG_X86_64
+static __initdata u64 __last_pgt_set_rw = 0;
+static __initdata u64 __pgt_buf_start = 0;
+static __initdata u64 __pgt_buf_end = 0;
+static __initdata u64 __pgt_buf_top = 0;
+/*
+ * As a consequence of the commit:
+ * 
+ * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e
+ * Author: Yinghai Lu <yinghai@kernel.org>
+ * Date:   Fri Dec 17 16:58:28 2010 -0800
+ * 
+ *     x86-64, mm: Put early page table high
+ * 
+ * at some point init_memory_mapping is going to reach the pagetable pages
+ * area and map those pages too (mapping them as normal memory that falls
+ * in the range of addresses passed to init_memory_mapping as argument).
+ * Some of those pages are already pagetable pages (they are in the range
+ * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and
+ * everything is fine.
+ * Some of these pages are not pagetable pages yet (they fall in the range
+ * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
+ * are going to be mapped RW.  When these pages become pagetable pages and
+ * are hooked into the pagetable, xen will find that the guest has already
+ * a RW mapping of them somewhere and fail the operation.
+ * The reason Xen requires pagetables to be RO is that the hypervisor needs
+ * to verify that the pagetables are valid before using them. The validation
+ * operations are called "pinning".
+ * 
+ * In order to fix the issue we mark all the pages in the entire range
+ * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation
+ * is completed only the range pgt_buf_start-pgt_buf_end is reserved by
+ * init_memory_mapping. Hence the kernel is going to crash as soon as one
+ * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those
+ * ranges are RO).
+ * 
+ * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_
+ * the init_memory_mapping has completed (in a perfect world we would
+ * call this function from init_memory_mapping, but lets ignore that).
+ * 
+ * Because we are called _after_ init_memory_mapping the pgt_buf_[start,
+ * end,top] have all changed to new values (b/c init_memory_mapping
+ * is called and setting up another new page-table). Hence, the first time
+ * we enter this function, we save away the pgt_buf_start value and update
+ * the pgt_buf_[end,top].
+ * 
+ * When we detect that the "old" pgt_buf_start through pgt_buf_end
+ * PFNs have been reserved (so memblock_x86_reserve_range has been called),
+ * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top.
+ * 
+ * And then we update those "old" pgt_buf_[end|top] with the new ones
+ * so that we can redo this on the next pagetable.
+ */
+static __init void mark_rw_past_pgt(void) {
+
+	if (pgt_buf_end > pgt_buf_start) {
+		u64 addr, size;
+
+		/* Save it away. */
+		if (!__pgt_buf_start) {
+			__pgt_buf_start = pgt_buf_start;
+			__pgt_buf_end = pgt_buf_end;
+			__pgt_buf_top = pgt_buf_top;
+			return;
+		}
+		/* If we get the range that starts at __pgt_buf_end that means
+		 * the range is reserved, and that in 'init_memory_mapping'
+		 * the 'memblock_x86_reserve_range' has been called with the
+		 * outdated __pgt_buf_start, __pgt_buf_end (the "new"
+		 * pgt_buf_[start|end|top] refer now to a new pagetable.
+		 * Note: we are called _after_ the pgt_buf_[..] have been
+		 * updated.*/
+
+		addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start),
+						       &size, PAGE_SIZE);
+
+		/* Still not reserved, meaning 'memblock_x86_reserve_range'
+		 * hasn't been called yet. Update the _end and _top.*/
+		if (addr == PFN_PHYS(__pgt_buf_start)) {
+			__pgt_buf_end = pgt_buf_end;
+			__pgt_buf_top = pgt_buf_top;
+			return;
+		}
+
+		/* OK, the area is reserved, meaning it is time for us to
+		 * set RW for the old end->top PFNs. */
+
+		/* ..unless we had already done this. */
+		if (__pgt_buf_end == __last_pgt_set_rw)
+			return;
+
+		addr = PFN_PHYS(__pgt_buf_end);
+		
+		/* set as RW the rest */
+		printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n",
+			PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top));
+		
+		while (addr < PFN_PHYS(__pgt_buf_top)) {
+			make_lowmem_page_readwrite(__va(addr));
+			addr += PAGE_SIZE;
+		}
+		/* And update everything so that we are ready for the next
+		 * pagetable (the one created for regions past 4GB) */
+		__last_pgt_set_rw = __pgt_buf_end;
+		__pgt_buf_start = pgt_buf_start;
+		__pgt_buf_end = pgt_buf_end;
+		__pgt_buf_top = pgt_buf_top;
+	}
+	return;
+}
+#else
+static __init void mark_rw_past_pgt(void) { }
+#endif
 static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
 #ifdef CONFIG_X86_64
@@ -1488,6 +1601,14 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
 {
 	unsigned long pfn = pte_pfn(pte);
 
+	/*
+	 * A bit of optimization. We do not need to call the workaround
+	 * when xen_set_pte_init is called with a PTE with 0 as PFN.
+	 * That is b/c the pagetable at that point are just being populated
+	 * with empty values and we can save some cycles by not calling
+	 * the 'memblock' code.*/
+	if (pfn)
+		mark_rw_past_pgt();
 	/*
 	 * If the new pfn is within the range of the newly allocated
 	 * kernel pagetable, and it isn't being mapped into an
@@ -1997,6 +2118,8 @@ __init void xen_ident_map_ISA(void)
 
 static __init void xen_post_allocator_init(void)
 {
+	mark_rw_past_pgt();
+
 #ifdef CONFIG_XEN_DEBUG
 	pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
 #endif
-- 
cgit v1.2.3


From b9269dc7bfdf8c985971c09f2dcb2aa04ad7986d Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Tue, 12 Apr 2011 12:19:49 +0100
Subject: xen: mask_rw_pte mark RO all pagetable pages up to pgt_buf_top

mask_rw_pte is currently checking if a pfn is a pagetable page if it
falls in the range pgt_buf_start - pgt_buf_end but that is incorrect
because pgt_buf_end is a moving target: pgt_buf_top is the real
boundary.

Acked-by: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86/xen')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 1bca25f60ff2..55c965b38c27 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1616,7 +1616,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
 	 * it is RO.
 	 */
 	if (((!is_early_ioremap_ptep(ptep) &&
-			pfn >= pgt_buf_start && pfn < pgt_buf_end)) ||
+			pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
 			(is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
 		pte = pte_wrprotect(pte);
 
-- 
cgit v1.2.3