summaryrefslogtreecommitdiffstats
path: root/arch/x86/xen
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2011-05-07 10:51:38 +0200
committerIngo Molnar <mingo@elte.hu>2011-05-07 10:51:48 +0200
commit4cb1f43ce8c72ee453c00fcb9f6ee9c4ebd03f98 (patch)
tree15e64f192b54ea01fd640d69eed0cabed2baaaa9 /arch/x86/xen
parentx86: Fix spelling error in the memcpy() source code comment (diff)
parentLinux 2.6.39-rc6 (diff)
downloadlinux-4cb1f43ce8c72ee453c00fcb9f6ee9c4ebd03f98.tar.xz
linux-4cb1f43ce8c72ee453c00fcb9f6ee9c4ebd03f98.zip
Merge commit 'v2.6.39-rc6' into x86/cleanups
Merge reason: move to a (much) newer upstream base. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'arch/x86/xen')
-rw-r--r--arch/x86/xen/Kconfig1
-rw-r--r--arch/x86/xen/enlighten.c21
-rw-r--r--arch/x86/xen/mmu.c142
-rw-r--r--arch/x86/xen/p2m.c10
-rw-r--r--arch/x86/xen/setup.c2
5 files changed, 150 insertions, 26 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 1c7121ba18ff..5cc821cb2e09 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -39,6 +39,7 @@ config XEN_MAX_DOMAIN_MEMORY
config XEN_SAVE_RESTORE
bool
depends on XEN
+ select HIBERNATE_CALLBACKS
default y
config XEN_DEBUG_FS
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 49dbd78ec3cb..e3c6a06cf725 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -238,6 +238,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
static __init void xen_init_cpuid_mask(void)
{
unsigned int ax, bx, cx, dx;
+ unsigned int xsave_mask;
cpuid_leaf1_edx_mask =
~((1 << X86_FEATURE_MCE) | /* disable MCE */
@@ -249,24 +250,16 @@ static __init void xen_init_cpuid_mask(void)
cpuid_leaf1_edx_mask &=
~((1 << X86_FEATURE_APIC) | /* disable local APIC */
(1 << X86_FEATURE_ACPI)); /* disable ACPI */
-
ax = 1;
- cx = 0;
xen_cpuid(&ax, &bx, &cx, &dx);
- /* cpuid claims we support xsave; try enabling it to see what happens */
- if (cx & (1 << (X86_FEATURE_XSAVE % 32))) {
- unsigned long cr4;
-
- set_in_cr4(X86_CR4_OSXSAVE);
-
- cr4 = read_cr4();
+ xsave_mask =
+ (1 << (X86_FEATURE_XSAVE % 32)) |
+ (1 << (X86_FEATURE_OSXSAVE % 32));
- if ((cr4 & X86_CR4_OSXSAVE) == 0)
- cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
-
- clear_in_cr4(X86_CR4_OSXSAVE);
- }
+ /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
+ if ((cx & xsave_mask) != xsave_mask)
+ cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
}
static void xen_set_debugreg(int reg, unsigned long val)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index c82df6c9c0f0..55c965b38c27 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -565,13 +565,13 @@ pte_t xen_make_pte_debug(pteval_t pte)
if (io_page &&
(xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
- WARN(addr != other_addr,
+ WARN_ONCE(addr != other_addr,
"0x%lx is using VM_IO, but it is 0x%lx!\n",
(unsigned long)addr, (unsigned long)other_addr);
} else {
pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
other_addr = (_pte.pte & PTE_PFN_MASK);
- WARN((addr == other_addr) && (!io_page) && (!iomap_set),
+ WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
"0x%lx is missing VM_IO (and wasn't fixed)!\n",
(unsigned long)addr);
}
@@ -1463,6 +1463,119 @@ static int xen_pgd_alloc(struct mm_struct *mm)
return ret;
}
+#ifdef CONFIG_X86_64
+static __initdata u64 __last_pgt_set_rw = 0;
+static __initdata u64 __pgt_buf_start = 0;
+static __initdata u64 __pgt_buf_end = 0;
+static __initdata u64 __pgt_buf_top = 0;
+/*
+ * As a consequence of the commit:
+ *
+ * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e
+ * Author: Yinghai Lu <yinghai@kernel.org>
+ * Date: Fri Dec 17 16:58:28 2010 -0800
+ *
+ * x86-64, mm: Put early page table high
+ *
+ * at some point init_memory_mapping is going to reach the pagetable pages
+ * area and map those pages too (mapping them as normal memory that falls
+ * in the range of addresses passed to init_memory_mapping as argument).
+ * Some of those pages are already pagetable pages (they are in the range
+ * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and
+ * everything is fine.
+ * Some of these pages are not pagetable pages yet (they fall in the range
+ * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
+ * are going to be mapped RW. When these pages become pagetable pages and
+ * are hooked into the pagetable, xen will find that the guest has already
+ * a RW mapping of them somewhere and fail the operation.
+ * The reason Xen requires pagetables to be RO is that the hypervisor needs
+ * to verify that the pagetables are valid before using them. The validation
+ * operations are called "pinning".
+ *
+ * In order to fix the issue we mark all the pages in the entire range
+ * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation
+ * is completed only the range pgt_buf_start-pgt_buf_end is reserved by
+ * init_memory_mapping. Hence the kernel is going to crash as soon as one
+ * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those
+ * ranges are RO).
+ *
+ * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_
+ * the init_memory_mapping has completed (in a perfect world we would
+ * call this function from init_memory_mapping, but lets ignore that).
+ *
+ * Because we are called _after_ init_memory_mapping the pgt_buf_[start,
+ * end,top] have all changed to new values (b/c init_memory_mapping
+ * is called and setting up another new page-table). Hence, the first time
+ * we enter this function, we save away the pgt_buf_start value and update
+ * the pgt_buf_[end,top].
+ *
+ * When we detect that the "old" pgt_buf_start through pgt_buf_end
+ * PFNs have been reserved (so memblock_x86_reserve_range has been called),
+ * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top.
+ *
+ * And then we update those "old" pgt_buf_[end|top] with the new ones
+ * so that we can redo this on the next pagetable.
+ */
+static __init void mark_rw_past_pgt(void) {
+
+ if (pgt_buf_end > pgt_buf_start) {
+ u64 addr, size;
+
+ /* Save it away. */
+ if (!__pgt_buf_start) {
+ __pgt_buf_start = pgt_buf_start;
+ __pgt_buf_end = pgt_buf_end;
+ __pgt_buf_top = pgt_buf_top;
+ return;
+ }
+ /* If we get the range that starts at __pgt_buf_end that means
+ * the range is reserved, and that in 'init_memory_mapping'
+ * the 'memblock_x86_reserve_range' has been called with the
+ * outdated __pgt_buf_start, __pgt_buf_end (the "new"
+ * pgt_buf_[start|end|top] refer now to a new pagetable.
+ * Note: we are called _after_ the pgt_buf_[..] have been
+ * updated.*/
+
+ addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start),
+ &size, PAGE_SIZE);
+
+ /* Still not reserved, meaning 'memblock_x86_reserve_range'
+ * hasn't been called yet. Update the _end and _top.*/
+ if (addr == PFN_PHYS(__pgt_buf_start)) {
+ __pgt_buf_end = pgt_buf_end;
+ __pgt_buf_top = pgt_buf_top;
+ return;
+ }
+
+ /* OK, the area is reserved, meaning it is time for us to
+ * set RW for the old end->top PFNs. */
+
+ /* ..unless we had already done this. */
+ if (__pgt_buf_end == __last_pgt_set_rw)
+ return;
+
+ addr = PFN_PHYS(__pgt_buf_end);
+
+ /* set as RW the rest */
+ printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n",
+ PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top));
+
+ while (addr < PFN_PHYS(__pgt_buf_top)) {
+ make_lowmem_page_readwrite(__va(addr));
+ addr += PAGE_SIZE;
+ }
+ /* And update everything so that we are ready for the next
+ * pagetable (the one created for regions past 4GB) */
+ __last_pgt_set_rw = __pgt_buf_end;
+ __pgt_buf_start = pgt_buf_start;
+ __pgt_buf_end = pgt_buf_end;
+ __pgt_buf_top = pgt_buf_top;
+ }
+ return;
+}
+#else
+static __init void mark_rw_past_pgt(void) { }
+#endif
static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
@@ -1473,30 +1586,43 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
#endif
}
+#ifdef CONFIG_X86_32
static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
- unsigned long pfn = pte_pfn(pte);
-
-#ifdef CONFIG_X86_32
/* If there's an existing pte, then don't allow _PAGE_RW to be set */
if (pte_val_ma(*ptep) & _PAGE_PRESENT)
pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
pte_val_ma(pte));
-#endif
+
+ return pte;
+}
+#else /* CONFIG_X86_64 */
+static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
+{
+ unsigned long pfn = pte_pfn(pte);
/*
+ * A bit of optimization. We do not need to call the workaround
+ * when xen_set_pte_init is called with a PTE with 0 as PFN.
+ * That is b/c the pagetable at that point are just being populated
+ * with empty values and we can save some cycles by not calling
+ * the 'memblock' code.*/
+ if (pfn)
+ mark_rw_past_pgt();
+ /*
* If the new pfn is within the range of the newly allocated
* kernel pagetable, and it isn't being mapped into an
* early_ioremap fixmap slot as a freshly allocated page, make sure
* it is RO.
*/
if (((!is_early_ioremap_ptep(ptep) &&
- pfn >= pgt_buf_start && pfn < pgt_buf_end)) ||
+ pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
(is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
pte = pte_wrprotect(pte);
return pte;
}
+#endif /* CONFIG_X86_64 */
/* Init-time set_pte while constructing initial pagetables, which
doesn't allow RO pagetable pages to be remapped RW */
@@ -1992,6 +2118,8 @@ __init void xen_ident_map_ISA(void)
static __init void xen_post_allocator_init(void)
{
+ mark_rw_past_pgt();
+
#ifdef CONFIG_XEN_DEBUG
pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
#endif
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 215a3ce61068..141eb0de8b06 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -497,7 +497,7 @@ static bool alloc_p2m(unsigned long pfn)
return true;
}
-bool __early_alloc_p2m(unsigned long pfn)
+static bool __init __early_alloc_p2m(unsigned long pfn)
{
unsigned topidx, mididx, idx;
@@ -530,7 +530,7 @@ bool __early_alloc_p2m(unsigned long pfn)
}
return idx != 0;
}
-unsigned long set_phys_range_identity(unsigned long pfn_s,
+unsigned long __init set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e)
{
unsigned long pfn;
@@ -671,7 +671,9 @@ int m2p_add_override(unsigned long mfn, struct page *page)
page->private = mfn;
page->index = pfn_to_mfn(pfn);
- __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
+ if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
+ return -ENOMEM;
+
if (!PageHighMem(page))
/* Just zap old mapping for now */
pte_clear(&init_mm, address, ptep);
@@ -709,7 +711,7 @@ int m2p_remove_override(struct page *page)
spin_lock_irqsave(&m2p_override_lock, flags);
list_del(&page->lru);
spin_unlock_irqrestore(&m2p_override_lock, flags);
- __set_phys_to_machine(pfn, page->index);
+ set_phys_to_machine(pfn, page->index);
if (!PageHighMem(page))
set_pte_at(&init_mm, address, ptep,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index fa0269a99377..90bac0aac3a5 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -227,7 +227,7 @@ char * __init xen_memory_setup(void)
memcpy(map_raw, map, sizeof(map));
e820.nr_map = 0;
- xen_extra_mem_start = mem_end;
+ xen_extra_mem_start = max((1ULL << 32), mem_end);
for (i = 0; i < memmap.nr_entries; i++) {
unsigned long long end;