diff options
Diffstat (limited to 'arch/x86/xen')
-rw-r--r-- | arch/x86/xen/Kconfig | 1 |
-rw-r--r-- | arch/x86/xen/enlighten.c | 21 |
-rw-r--r-- | arch/x86/xen/mmu.c | 142 |
-rw-r--r-- | arch/x86/xen/p2m.c | 10 |
-rw-r--r-- | arch/x86/xen/setup.c | 2 |
5 files changed, 150 insertions, 26 deletions
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 1c7121ba18ff..5cc821cb2e09 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -39,6 +39,7 @@ config XEN_MAX_DOMAIN_MEMORY config XEN_SAVE_RESTORE bool depends on XEN + select HIBERNATE_CALLBACKS default y config XEN_DEBUG_FS diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 49dbd78ec3cb..e3c6a06cf725 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -238,6 +238,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, static __init void xen_init_cpuid_mask(void) { unsigned int ax, bx, cx, dx; + unsigned int xsave_mask; cpuid_leaf1_edx_mask = ~((1 << X86_FEATURE_MCE) | /* disable MCE */ @@ -249,24 +250,16 @@ static __init void xen_init_cpuid_mask(void) cpuid_leaf1_edx_mask &= ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ (1 << X86_FEATURE_ACPI)); /* disable ACPI */ - ax = 1; - cx = 0; xen_cpuid(&ax, &bx, &cx, &dx); - /* cpuid claims we support xsave; try enabling it to see what happens */ - if (cx & (1 << (X86_FEATURE_XSAVE % 32))) { - unsigned long cr4; - - set_in_cr4(X86_CR4_OSXSAVE); - - cr4 = read_cr4(); + xsave_mask = + (1 << (X86_FEATURE_XSAVE % 32)) | + (1 << (X86_FEATURE_OSXSAVE % 32)); - if ((cr4 & X86_CR4_OSXSAVE) == 0) - cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32)); - - clear_in_cr4(X86_CR4_OSXSAVE); - } + /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ + if ((cx & xsave_mask) != xsave_mask) + cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ } static void xen_set_debugreg(int reg, unsigned long val) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index c82df6c9c0f0..55c965b38c27 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -565,13 +565,13 @@ pte_t xen_make_pte_debug(pteval_t pte) if (io_page && (xen_initial_domain() || addr >= ISA_END_ADDRESS)) { other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT; - WARN(addr != other_addr, + WARN_ONCE(addr != 
other_addr, "0x%lx is using VM_IO, but it is 0x%lx!\n", (unsigned long)addr, (unsigned long)other_addr); } else { pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP; other_addr = (_pte.pte & PTE_PFN_MASK); - WARN((addr == other_addr) && (!io_page) && (!iomap_set), + WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set), "0x%lx is missing VM_IO (and wasn't fixed)!\n", (unsigned long)addr); } @@ -1463,6 +1463,119 @@ static int xen_pgd_alloc(struct mm_struct *mm) return ret; } +#ifdef CONFIG_X86_64 +static __initdata u64 __last_pgt_set_rw = 0; +static __initdata u64 __pgt_buf_start = 0; +static __initdata u64 __pgt_buf_end = 0; +static __initdata u64 __pgt_buf_top = 0; +/* + * As a consequence of the commit: + * + * commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e + * Author: Yinghai Lu <yinghai@kernel.org> + * Date: Fri Dec 17 16:58:28 2010 -0800 + * + * x86-64, mm: Put early page table high + * + * at some point init_memory_mapping is going to reach the pagetable pages + * area and map those pages too (mapping them as normal memory that falls + * in the range of addresses passed to init_memory_mapping as argument). + * Some of those pages are already pagetable pages (they are in the range + * pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and + * everything is fine. + * Some of these pages are not pagetable pages yet (they fall in the range + * pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they + * are going to be mapped RW. When these pages become pagetable pages and + * are hooked into the pagetable, xen will find that the guest has already + * a RW mapping of them somewhere and fail the operation. + * The reason Xen requires pagetables to be RO is that the hypervisor needs + * to verify that the pagetables are valid before using them. The validation + * operations are called "pinning". 
+ * + * In order to fix the issue we mark all the pages in the entire range + * pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation + * is completed only the range pgt_buf_start-pgt_buf_end is reserved by + * init_memory_mapping. Hence the kernel is going to crash as soon as one + * of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those + * ranges are RO). + * + * For this reason, 'mark_rw_past_pgt' is introduced which is called _after_ + * the init_memory_mapping has completed (in a perfect world we would + * call this function from init_memory_mapping, but lets ignore that). + * + * Because we are called _after_ init_memory_mapping the pgt_buf_[start, + * end,top] have all changed to new values (b/c init_memory_mapping + * is called and setting up another new page-table). Hence, the first time + * we enter this function, we save away the pgt_buf_start value and update + * the pgt_buf_[end,top]. + * + * When we detect that the "old" pgt_buf_start through pgt_buf_end + * PFNs have been reserved (so memblock_x86_reserve_range has been called), + * we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top. + * + * And then we update those "old" pgt_buf_[end|top] with the new ones + * so that we can redo this on the next pagetable. + */ +static __init void mark_rw_past_pgt(void) { + + if (pgt_buf_end > pgt_buf_start) { + u64 addr, size; + + /* Save it away. */ + if (!__pgt_buf_start) { + __pgt_buf_start = pgt_buf_start; + __pgt_buf_end = pgt_buf_end; + __pgt_buf_top = pgt_buf_top; + return; + } + /* If we get the range that starts at __pgt_buf_end that means + * the range is reserved, and that in 'init_memory_mapping' + * the 'memblock_x86_reserve_range' has been called with the + * outdated __pgt_buf_start, __pgt_buf_end (the "new" + * pgt_buf_[start|end|top] refer now to a new pagetable. + * Note: we are called _after_ the pgt_buf_[..] 
have been + * updated.*/ + + addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start), + &size, PAGE_SIZE); + + /* Still not reserved, meaning 'memblock_x86_reserve_range' + * hasn't been called yet. Update the _end and _top.*/ + if (addr == PFN_PHYS(__pgt_buf_start)) { + __pgt_buf_end = pgt_buf_end; + __pgt_buf_top = pgt_buf_top; + return; + } + + /* OK, the area is reserved, meaning it is time for us to + * set RW for the old end->top PFNs. */ + + /* ..unless we had already done this. */ + if (__pgt_buf_end == __last_pgt_set_rw) + return; + + addr = PFN_PHYS(__pgt_buf_end); + + /* set as RW the rest */ + printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", + PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top)); + + while (addr < PFN_PHYS(__pgt_buf_top)) { + make_lowmem_page_readwrite(__va(addr)); + addr += PAGE_SIZE; + } + /* And update everything so that we are ready for the next + * pagetable (the one created for regions past 4GB) */ + __last_pgt_set_rw = __pgt_buf_end; + __pgt_buf_start = pgt_buf_start; + __pgt_buf_end = pgt_buf_end; + __pgt_buf_top = pgt_buf_top; + } + return; +} +#else +static __init void mark_rw_past_pgt(void) { } +#endif static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) { #ifdef CONFIG_X86_64 @@ -1473,30 +1586,43 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) #endif } +#ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { - unsigned long pfn = pte_pfn(pte); - -#ifdef CONFIG_X86_32 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ if (pte_val_ma(*ptep) & _PAGE_PRESENT) pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & pte_val_ma(pte)); -#endif + + return pte; +} +#else /* CONFIG_X86_64 */ +static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); /* + * A bit of optimization. We do not need to call the workaround + * when xen_set_pte_init is called with a PTE with 0 as PFN. 
+ * That is b/c the pagetable at that point are just being populated + * with empty values and we can save some cycles by not calling + * the 'memblock' code.*/ + if (pfn) + mark_rw_past_pgt(); + /* * If the new pfn is within the range of the newly allocated * kernel pagetable, and it isn't being mapped into an * early_ioremap fixmap slot as a freshly allocated page, make sure * it is RO. */ if (((!is_early_ioremap_ptep(ptep) && - pfn >= pgt_buf_start && pfn < pgt_buf_end)) || + pfn >= pgt_buf_start && pfn < pgt_buf_top)) || (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1))) pte = pte_wrprotect(pte); return pte; } +#endif /* CONFIG_X86_64 */ /* Init-time set_pte while constructing initial pagetables, which doesn't allow RO pagetable pages to be remapped RW */ @@ -1992,6 +2118,8 @@ __init void xen_ident_map_ISA(void) static __init void xen_post_allocator_init(void) { + mark_rw_past_pgt(); + #ifdef CONFIG_XEN_DEBUG pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); #endif diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 215a3ce61068..141eb0de8b06 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -497,7 +497,7 @@ static bool alloc_p2m(unsigned long pfn) return true; } -bool __early_alloc_p2m(unsigned long pfn) +static bool __init __early_alloc_p2m(unsigned long pfn) { unsigned topidx, mididx, idx; @@ -530,7 +530,7 @@ bool __early_alloc_p2m(unsigned long pfn) } return idx != 0; } -unsigned long set_phys_range_identity(unsigned long pfn_s, +unsigned long __init set_phys_range_identity(unsigned long pfn_s, unsigned long pfn_e) { unsigned long pfn; @@ -671,7 +671,9 @@ int m2p_add_override(unsigned long mfn, struct page *page) page->private = mfn; page->index = pfn_to_mfn(pfn); - __set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)); + if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) + return -ENOMEM; + if (!PageHighMem(page)) /* Just zap old mapping for now */ pte_clear(&init_mm, address, ptep); @@ -709,7 +711,7 @@ int 
m2p_remove_override(struct page *page) spin_lock_irqsave(&m2p_override_lock, flags); list_del(&page->lru); spin_unlock_irqrestore(&m2p_override_lock, flags); - __set_phys_to_machine(pfn, page->index); + set_phys_to_machine(pfn, page->index); if (!PageHighMem(page)) set_pte_at(&init_mm, address, ptep, diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index fa0269a99377..90bac0aac3a5 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -227,7 +227,7 @@ char * __init xen_memory_setup(void) memcpy(map_raw, map, sizeof(map)); e820.nr_map = 0; - xen_extra_mem_start = mem_end; + xen_extra_mem_start = max((1ULL << 32), mem_end); for (i = 0; i < memmap.nr_entries; i++) { unsigned long long end; |