diff options
Diffstat (limited to 'arch/x86')
41 files changed, 860 insertions, 1079 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 87a693cf2bb7..4d350b5cbc71 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -23,7 +23,7 @@ config X86 select HAVE_KPROBES select HAVE_KRETPROBES select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) - select HAVE_ARCH_KGDB + select HAVE_ARCH_KGDB if !X86_VOYAGER config GENERIC_LOCKBREAK diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 610aaecc19f8..239fd9fba0a5 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -5,6 +5,17 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" +config NONPROMISC_DEVMEM + bool "Disable promiscuous /dev/mem" + help + The /dev/mem file by default only allows userspace access to PCI + space and the BIOS code and data regions. This is sufficient for + dosemu and X and all common users of /dev/mem. With this config + option, you allow userspace access to all of memory, including + kernel and userspace memory. Accidental access to this is + obviously disasterous, but specific access can be used by people + debugging the kernel. + config EARLY_PRINTK bool "Early printk" if EMBEDDED default y diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore index b1bdc4c6f9f2..172cf8a98bdd 100644 --- a/arch/x86/boot/.gitignore +++ b/arch/x86/boot/.gitignore @@ -1,7 +1,8 @@ bootsect bzImage +cpustr.h +mkcpustr +offsets.h setup setup.bin setup.elf -cpustr.h -mkcpustr diff --git a/arch/x86/kernel/acpi/realmode/.gitignore b/arch/x86/kernel/acpi/realmode/.gitignore new file mode 100644 index 000000000000..58f1f48a58f8 --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/.gitignore @@ -0,0 +1,3 @@ +wakeup.bin +wakeup.elf +wakeup.lds diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index df4099dc1c68..65c7857a90dd 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -511,31 +511,30 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) unsigned long flags; char *vaddr; int nr_pages = 2; + struct page *pages[2]; + int i; - BUG_ON(len > sizeof(long)); - BUG_ON((((long)addr + len - 1) & ~(sizeof(long) - 1)) - - ((long)addr & ~(sizeof(long) - 1))); - if (kernel_text_address((unsigned long)addr)) { - struct page *pages[2] = { virt_to_page(addr), - virt_to_page(addr + PAGE_SIZE) }; - if (!pages[1]) - nr_pages = 1; - vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - BUG_ON(!vaddr); - local_irq_save(flags); - memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); - local_irq_restore(flags); - vunmap(vaddr); + if (!core_kernel_text((unsigned long)addr)) { + pages[0] = vmalloc_to_page(addr); + pages[1] = vmalloc_to_page(addr + PAGE_SIZE); } else { - /* - * modules are in vmalloc'ed memory, always writable. - */ - local_irq_save(flags); - memcpy(addr, opcode, len); - local_irq_restore(flags); + pages[0] = virt_to_page(addr); + WARN_ON(!PageReserved(pages[0])); + pages[1] = virt_to_page(addr + PAGE_SIZE); } + BUG_ON(!pages[0]); + if (!pages[1]) + nr_pages = 1; + vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + BUG_ON(!vaddr); + local_irq_save(flags); + memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); + local_irq_restore(flags); + vunmap(vaddr); sync_core(); /* Could also do a CLFLUSH here to speed up CPU recovery; but that causes hangs on some VIA CPUs. */ + for (i = 0; i < len; i++) + BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); return addr; } diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 687208190b06..8317401170b8 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -902,7 +902,7 @@ void __init init_bsp_APIC(void) apic_write_around(APIC_LVT1, value); } -void __cpuinit lapic_setup_esr(void) +static void __cpuinit lapic_setup_esr(void) { unsigned long oldvalue, value, maxlvt; if (lapic_is_integrated() && !esr_disable) { diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 9e8e5c050c55..bf83157337e4 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -429,7 +429,7 @@ void __init setup_boot_APIC_clock(void) * set the DUMMY flag again and force the broadcast mode in the * clockevents layer. */ -void __cpuinit check_boot_apic_timer_broadcast(void) +static void __cpuinit check_boot_apic_timer_broadcast(void) { if (!disable_apic_timer || (lapic_clockevent.features & CLOCK_EVT_FEAT_DUMMY)) @@ -834,7 +834,7 @@ void __cpuinit setup_local_APIC(void) preempt_enable(); } -void __cpuinit lapic_setup_esr(void) +static void __cpuinit lapic_setup_esr(void) { unsigned maxlvt = lapic_get_maxlvt(); diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f0f8934fc303..2a609dc3271c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -409,7 +409,7 @@ restore_nocheck_notrace: irq_return: INTERRUPT_RETURN .section .fixup,"ax" -iret_exc: +ENTRY(iret_exc) pushl $0 # no error code pushl $do_iret_error jmp error_code @@ -1017,6 +1017,13 @@ ENTRY(kernel_thread_helper) ENDPROC(kernel_thread_helper) #ifdef CONFIG_XEN +/* Xen doesn't set %esp to be precisely what the normal sysenter + entrypoint expects, so fix it up before using the normal path. */ +ENTRY(xen_sysenter_target) + RING0_INT_FRAME + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp + ENTRY(xen_hypervisor_callback) CFI_STARTPROC pushl $0 @@ -1035,8 +1042,9 @@ ENTRY(xen_hypervisor_callback) cmpl $xen_iret_end_crit,%eax jae 1f - call xen_iret_crit_fixup + jmp xen_iret_crit_fixup +ENTRY(xen_do_upcall) 1: mov %esp, %eax call xen_evtchn_do_upcall jmp ret_from_intr diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 3733412d1357..74f0c5ea2a03 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -366,11 +366,13 @@ struct pv_mmu_ops pv_mmu_ops = { .flush_tlb_single = native_flush_tlb_single, .flush_tlb_others = native_flush_tlb_others, - .alloc_pt = paravirt_nop, - .alloc_pd = paravirt_nop, - .alloc_pd_clone = paravirt_nop, - .release_pt = paravirt_nop, - .release_pd = paravirt_nop, + .alloc_pte = paravirt_nop, + .alloc_pmd = paravirt_nop, + .alloc_pmd_clone = paravirt_nop, + .alloc_pud = paravirt_nop, + .release_pte = paravirt_nop, + .release_pmd = paravirt_nop, + .release_pud = paravirt_nop, .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 7adad088e373..77de848bd1fb 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -550,7 +550,7 @@ static void hard_enable_TSC(void) write_cr4(read_cr4() & ~X86_CR4_TSD); } -void enable_TSC(void) +static void enable_TSC(void) { preempt_disable(); if (test_and_clear_thread_flag(TIF_NOTSC)) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 891af1a1b48a..131c2ee7ac56 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -562,7 +562,7 @@ static void hard_enable_TSC(void) write_cr4(read_cr4() & ~X86_CR4_TSD); } -void enable_TSC(void) +static void enable_TSC(void) { preempt_disable(); if (test_and_clear_thread_flag(TIF_NOTSC)) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 19c9386ac118..1791a751a772 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -8,6 +8,7 @@ #include <asm/apic.h> #include <asm/desc.h> #include <asm/hpet.h> +#include <asm/pgtable.h> #include <asm/reboot_fixups.h> #include <asm/reboot.h> @@ -15,7 +16,6 @@ # include <linux/dmi.h> # include <linux/ctype.h> # include <linux/mc146818rtc.h> -# include <asm/pgtable.h> #else # include <asm/iommu.h> #endif @@ -275,7 +275,7 @@ void machine_real_restart(unsigned char *code, int length) /* Remap the kernel at virtual address zero, as well as offset zero from the kernel segment. This assumes the kernel segment starts at virtual address PAGE_OFFSET. */ - memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, + memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS); /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0d1f44ae6eea..c0c68c18a788 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -18,8 +18,6 @@ unsigned disabled_cpus __cpuinitdata; unsigned int boot_cpu_physical_apicid = -1U; EXPORT_SYMBOL(boot_cpu_physical_apicid); -physid_mask_t phys_cpu_present_map; - DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID; EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid); diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 78828b0f604f..455d3c80960b 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -442,7 +442,7 @@ static void __init reserve_ebda_region(void) } #ifndef CONFIG_NEED_MULTIPLE_NODES -void __init setup_bootmem_allocator(void); +static void __init setup_bootmem_allocator(void); static unsigned long __init setup_memory(void) { /* @@ -477,7 +477,7 @@ static unsigned long __init setup_memory(void) return max_low_pfn; } -void __init zone_sizes_init(void) +static void __init zone_sizes_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6a925394bc7e..eef79e84145f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -184,7 +184,7 @@ static void unmap_cpu_to_node(int cpu) u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; -void map_cpu_to_logical_apicid(void) +static void map_cpu_to_logical_apicid(void) { int cpu = smp_processor_id(); int apicid = logical_smp_processor_id(); @@ -197,7 +197,7 @@ void map_cpu_to_logical_apicid(void) map_cpu_to_node(cpu, node); } -void unmap_cpu_to_logical_apicid(int cpu) +static void unmap_cpu_to_logical_apicid(int cpu) { cpu_2_logical_apicid[cpu] = BAD_APICID; unmap_cpu_to_node(cpu); @@ -211,7 +211,7 @@ void unmap_cpu_to_logical_apicid(int cpu) * Report back to the Boot Processor. * Running on AP. */ -void __cpuinit smp_callin(void) +static void __cpuinit smp_callin(void) { int cpuid, phys_id; unsigned long timeout; @@ -436,7 +436,7 @@ valid_k7: #endif } -void __cpuinit smp_checks(void) +static void __cpuinit smp_checks(void) { if (smp_b_stepping) printk(KERN_WARNING "WARNING: SMP operation may be unreliable" @@ -565,7 +565,7 @@ void __init smp_alloc_memory(void) } #endif -void impress_friends(void) +static void impress_friends(void) { int cpu; unsigned long bogosum = 0; @@ -1039,8 +1039,8 @@ int __cpuinit native_cpu_up(unsigned int cpu) #ifdef CONFIG_X86_32 /* init low mem mapping */ - clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); + clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); flush_tlb_all(); #endif @@ -1287,7 +1287,7 @@ void cpu_exit_clear(void) } # endif /* CONFIG_X86_32 */ -void remove_siblinginfo(int cpu) +static void remove_siblinginfo(int cpu) { int sibling; struct cpuinfo_x86 *c = &cpu_data(cpu); diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index 1558e513757e..df224a8774cb 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -191,7 +191,7 @@ void native_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, spin_unlock(&f->tlbstate_lock); } -int __cpuinit init_smp_flush(void) +static int __cpuinit init_smp_flush(void) { int i; diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 12affe1f9bce..956f38927aa7 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -320,7 +320,7 @@ static void check_zeroed_page(u32 pfn, int type, struct page *page) * pdes need to be zeroed. */ if (type & VMI_PAGE_CLONE) - limit = USER_PTRS_PER_PGD; + limit = KERNEL_PGD_BOUNDARY; for (i = 0; i < limit; i++) BUG_ON(ptr[i]); } @@ -392,13 +392,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) } #endif -static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn) +static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) { vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); } -static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn) +static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) { /* * This call comes in very early, before mem_map is setup. @@ -409,20 +409,20 @@ static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn) vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); } -static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) +static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) { vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); vmi_check_page_type(clonepfn, VMI_PAGE_L2); vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); } -static void vmi_release_pt(u32 pfn) +static void vmi_release_pte(u32 pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L1); vmi_set_page_type(pfn, VMI_PAGE_NORMAL); } -static void vmi_release_pd(u32 pfn) +static void vmi_release_pmd(u32 pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L2); vmi_set_page_type(pfn, VMI_PAGE_NORMAL); @@ -871,15 +871,15 @@ static inline int __init activate_vmi(void) vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); if (vmi_ops.allocate_page) { - pv_mmu_ops.alloc_pt = vmi_allocate_pt; - pv_mmu_ops.alloc_pd = vmi_allocate_pd; - pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone; + pv_mmu_ops.alloc_pte = vmi_allocate_pte; + pv_mmu_ops.alloc_pmd = vmi_allocate_pmd; + pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone; } vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); if (vmi_ops.release_page) { - pv_mmu_ops.release_pt = vmi_release_pt; - pv_mmu_ops.release_pd = vmi_release_pd; + pv_mmu_ops.release_pte = vmi_release_pte; + pv_mmu_ops.release_pmd = vmi_release_pmd; } /* Set linear is needed in all cases */ diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index edff4c985485..61efa2f7d564 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -216,7 +216,7 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) return 0; } -long __vsyscall(3) venosys_1(void) +static long __vsyscall(3) venosys_1(void) { return -ENOSYS; } diff --git a/arch/x86/mach-visws/mpparse.c b/arch/x86/mach-visws/mpparse.c index 2a8456a1f44f..57484e91ab90 100644 --- a/arch/x86/mach-visws/mpparse.c +++ b/arch/x86/mach-visws/mpparse.c @@ -11,22 +11,9 @@ /* Have we found an MP table */ int smp_found_config; -/* - * Various Linux-internal data structures created from the - * MP-table. - */ -int apic_version [MAX_APICS]; - int pic_mode; -unsigned long mp_lapic_addr; - -/* Processor that is doing the boot up */ -unsigned int boot_cpu_physical_apicid = -1U; - -/* Bitmask of physically existing CPUs */ -physid_mask_t phys_cpu_present_map; -unsigned int __initdata maxcpus = NR_CPUS; +extern unsigned int __cpuinitdata maxcpus; /* * The Visual Workstation is Intel MP compliant in the hardware diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 96f60c7cd124..6e2c4efce0ef 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -206,11 +206,6 @@ static struct irq_chip vic_chip = { /* used to count up as CPUs are brought on line (starts at 0) */ static int cpucount = 0; -/* steal a page from the bottom of memory for the trampoline and - * squirrel its address away here. This will be in kernel virtual - * space */ -unsigned char *trampoline_base; - /* The per cpu profile stuff - used in smp_local_timer_interrupt */ static DEFINE_PER_CPU(int, prof_multiplier) = 1; static DEFINE_PER_CPU(int, prof_old_multiplier) = 1; @@ -427,18 +422,6 @@ void __init smp_store_cpu_info(int id) identify_secondary_cpu(c); } -/* set up the trampoline and return the physical address of the code */ -unsigned long __init setup_trampoline(void) -{ - /* these two are global symbols in trampoline.S */ - extern const __u8 trampoline_end[]; - extern const __u8 trampoline_data[]; - - memcpy(trampoline_base, trampoline_data, - trampoline_end - trampoline_data); - return virt_to_phys(trampoline_base); -} - /* Routine initially called when a non-boot CPU is brought online */ static void __init start_secondary(void *unused) { @@ -560,8 +543,8 @@ static void __init do_boot_cpu(__u8 cpu) hijack_source.idt.Offset, stack_start.sp)); /* init lowmem identity mapping */ - clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); + clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); flush_tlb_all(); if (quad_boot) { diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 20941d2954e2..b7b3e4c7cfc9 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,5 @@ obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o + pat.o pgtable.o obj-$(CONFIG_X86_32) += pgtable_32.o diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 6791b8334bc6..2c24bea92c66 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -324,7 +324,7 @@ static const struct file_operations ptdump_fops = { .release = single_release, }; -int pt_dump_init(void) +static int pt_dump_init(void) { struct dentry *pe; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9ec62da85fd7..baf7c4f643c8 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -71,7 +71,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); + paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); @@ -100,7 +100,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); } - paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); + paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } @@ -227,6 +227,25 @@ static inline int page_kills_ppro(unsigned long pagenr) return 0; } +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} + #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; pgprot_t kmap_prot; @@ -365,7 +384,7 @@ void __init native_pagetable_setup_start(pgd_t *base) pte_clear(NULL, va, pte); } - paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT); + paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); } void __init native_pagetable_setup_done(pgd_t *base) @@ -457,7 +476,7 @@ void zap_low_mappings(void) * Note that "pgd_clear()" doesn't do it for * us, because pgd_clear() is a no-op on i386. */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) { + for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) { #ifdef CONFIG_X86_PAE set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); #else diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ff7906a9a4d..0cca62663037 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -135,7 +135,7 @@ static __init void *spp_getpage(void) return ptr; } -static __init void +static void set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) { pgd_t *pgd; @@ -173,7 +173,7 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); pte = pte_offset_kernel(pmd, vaddr); - if (!pte_none(*pte) && + if (!pte_none(*pte) && pte_val(new_pte) && pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) pte_ERROR(*pte); set_pte(pte, new_pte); @@ -214,8 +214,7 @@ void __init cleanup_highmap(void) } /* NOTE: this is meant to be run only at boot */ -void __init -__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) +void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) { unsigned long address = __fix_to_virt(idx); @@ -664,6 +663,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif /* CONFIG_MEMORY_HOTPLUG */ +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} + + static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 3a4baf95e24d..d176b23110cc 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -336,6 +336,35 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); +/* + * Convert a physical pointer to a virtual kernel pointer for /dev/mem + * access + */ +void *xlate_dev_mem_ptr(unsigned long phys) +{ + void *addr; + unsigned long start = phys & PAGE_MASK; + + /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ + if (page_is_ram(start >> PAGE_SHIFT)) + return __va(phys); + + addr = (void *)ioremap(start, PAGE_SIZE); + if (addr) + addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); + + return addr; +} + +void unxlate_dev_mem_ptr(unsigned long phys, void *addr) +{ + if (page_is_ram(phys >> PAGE_SHIFT)) + return; + + iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); + return; +} + #ifdef CONFIG_X86_32 int __initdata early_ioremap_debug; @@ -407,7 +436,7 @@ void __init early_ioremap_clear(void) pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); pmd_clear(pmd); - paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); + paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); __flush_tlb_all(); } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index f7823a172868..bd5e05c654dc 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -483,9 +483,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) goto out_unlock; pbase = (pte_t *)page_address(base); -#ifdef CONFIG_X86_32 - paravirt_alloc_pt(&init_mm, page_to_pfn(base)); -#endif + paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); #ifdef CONFIG_X86_64 @@ -993,7 +991,7 @@ static const struct file_operations dpa_fops = { .release = single_release, }; -int __init debug_pagealloc_proc_init(void) +static int __init debug_pagealloc_proc_init(void) { struct dentry *de; diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 72c0f6097402..ef8b64b89c7d 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -11,6 +11,7 @@ #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/fs.h> +#include <linux/bootmem.h> #include <asm/msr.h> #include <asm/tlbflush.h> @@ -21,6 +22,7 @@ #include <asm/cacheflush.h> #include <asm/fcntl.h> #include <asm/mtrr.h> +#include <asm/io.h> int pat_wc_enabled = 1; @@ -190,6 +192,21 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, return 0; } +/* + * req_type typically has one of the: + * - _PAGE_CACHE_WB + * - _PAGE_CACHE_WC + * - _PAGE_CACHE_UC_MINUS + * - _PAGE_CACHE_UC + * + * req_type will have a special case value '-1', when requester want to inherit + * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. + * + * If ret_type is NULL, function will return an error if it cannot reserve the + * region with req_type. If ret_type is non-null, function will return + * available type in ret_type in case of no error. In case of any error + * it will return a negative return value. + */ int reserve_memtype(u64 start, u64 end, unsigned long req_type, unsigned long *ret_type) { @@ -200,9 +217,14 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, /* Only track when pat_wc_enabled */ if (!pat_wc_enabled) { - if (ret_type) - *ret_type = req_type; - + /* This is identical to page table setting without PAT */ + if (ret_type) { + if (req_type == -1) { + *ret_type = _PAGE_CACHE_WB; + } else { + *ret_type = req_type; + } + } return 0; } @@ -214,8 +236,29 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, return 0; } - req_type &= _PAGE_CACHE_MASK; - err = pat_x_mtrr_type(start, end, req_type, &actual_type); + if (req_type == -1) { + /* + * Special case where caller wants to inherit from mtrr or + * existing pat mapping, defaulting to UC_MINUS in case of + * no match. + */ + u8 mtrr_type = mtrr_type_lookup(start, end); + if (mtrr_type == 0xFE) { /* MTRR match error */ + err = -1; + } + + if (mtrr_type == MTRR_TYPE_WRBACK) { + req_type = _PAGE_CACHE_WB; + actual_type = _PAGE_CACHE_WB; + } else { + req_type = _PAGE_CACHE_UC_MINUS; + actual_type = _PAGE_CACHE_UC_MINUS; + } + } else { + req_type &= _PAGE_CACHE_MASK; + err = pat_x_mtrr_type(start, end, req_type, &actual_type); + } + if (err) { if (ret_type) *ret_type = actual_type; @@ -241,7 +284,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, struct memtype *saved_ptr; if (parse->start >= end) { - printk("New Entry\n"); + pr_debug("New Entry\n"); list_add(&new_entry->nd, parse->nd.prev); new_entry = NULL; break; @@ -343,7 +386,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, break; } - printk("Overlap at 0x%Lx-0x%Lx\n", + printk(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n", saved_ptr->start, saved_ptr->end); /* No conflict. Go ahead and add this new entry */ list_add(&new_entry->nd, &saved_ptr->nd); @@ -353,7 +396,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } if (err) { - printk( + printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", start, end, cattr_name(new_entry->type), cattr_name(req_type)); @@ -365,16 +408,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (new_entry) { /* No conflict. Not yet added to the list. Add to the tail */ list_add_tail(&new_entry->nd, &memtype_list); - printk("New Entry\n"); - } + pr_debug("New Entry\n"); + } if (ret_type) { - printk( + pr_debug( "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", start, end, cattr_name(actual_type), cattr_name(req_type), cattr_name(*ret_type)); } else { - printk( + pr_debug( "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", start, end, cattr_name(actual_type), cattr_name(req_type)); @@ -411,11 +454,115 @@ int free_memtype(u64 start, u64 end) spin_unlock(&memtype_lock); if (err) { - printk(KERN_DEBUG "%s:%d freeing invalid memtype %Lx-%Lx\n", + printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", current->comm, current->pid, start, end); } - printk( "free_memtype request 0x%Lx-0x%Lx\n", start, end); + pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); return err; } + +/* + * /dev/mem mmap interface. The memtype used for mapping varies: + * - Use UC for mappings with O_SYNC flag + * - Without O_SYNC flag, if there is any conflict in reserve_memtype, + * inherit the memtype from existing mapping. + * - Else use UC_MINUS memtype (for backward compatibility with existing + * X drivers. + */ +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) +{ + return vma_prot; +} + +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot) +{ + u64 offset = ((u64) pfn) << PAGE_SHIFT; + unsigned long flags = _PAGE_CACHE_UC_MINUS; + unsigned long ret_flags; + int retval; + + if (file->f_flags & O_SYNC) { + flags = _PAGE_CACHE_UC; + } + +#ifdef CONFIG_X86_32 + /* + * On the PPro and successors, the MTRRs are used to set + * memory types for physical addresses outside main memory, + * so blindly setting UC or PWT on those pages is wrong. + * For Pentiums and earlier, the surround logic should disable + * caching for the high addresses through the KEN pin, but + * we maintain the tradition of paranoia in this code. + */ + if (!pat_wc_enabled && + ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && + (pfn << PAGE_SHIFT) >= __pa(high_memory)) { + flags = _PAGE_CACHE_UC; + } +#endif + + /* + * With O_SYNC, we can only take UC mapping. Fail if we cannot. + * Without O_SYNC, we want to get + * - WB for WB-able memory and no other conflicting mappings + * - UC_MINUS for non-WB-able memory with no other conflicting mappings + * - Inherit from confliting mappings otherwise + */ + if (flags != _PAGE_CACHE_UC_MINUS) { + retval = reserve_memtype(offset, offset + size, flags, NULL); + } else { + retval = reserve_memtype(offset, offset + size, -1, &ret_flags); + } + + if (retval < 0) + return 0; + + flags = ret_flags; + + if (pfn <= max_pfn_mapped && + ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { + free_memtype(offset, offset + size); + printk(KERN_INFO + "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", + current->comm, current->pid, + cattr_name(flags), + offset, offset + size); + return 0; + } + + *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | + flags); + return 1; +} + +void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) +{ + u64 addr = (u64)pfn << PAGE_SHIFT; + unsigned long flags; + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); + + reserve_memtype(addr, addr + size, want_flags, &flags); + if (flags != want_flags) { + printk(KERN_INFO + "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + addr, addr + size, + cattr_name(flags)); + } +} + +void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) +{ + u64 addr = (u64)pfn << PAGE_SHIFT; + + free_memtype(addr, addr + size); +} + diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c new file mode 100644 index 000000000000..50159764f694 --- /dev/null +++ b/arch/x86/mm/pgtable.c @@ -0,0 +1,276 @@ +#include <linux/mm.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlb.h> + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); +} + +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pte; + +#ifdef CONFIG_HIGHPTE + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); +#else + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); +#endif + if (pte) + pgtable_page_ctor(pte); + return pte; +} + +void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +{ + pgtable_page_dtor(pte); + paravirt_release_pte(page_to_pfn(pte)); + tlb_remove_page(tlb, pte); +} + +#if PAGETABLE_LEVELS > 2 +void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +{ + paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(pmd)); +} + +#if PAGETABLE_LEVELS > 3 +void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +{ + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(pud)); +} +#endif /* PAGETABLE_LEVELS > 3 */ +#endif /* PAGETABLE_LEVELS > 2 */ + +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_add(&page->lru, &pgd_list); +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_del(&page->lru); +} + +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) + +static void pgd_ctor(void *p) +{ + pgd_t *pgd = p; + unsigned long flags; + + /* Clear usermode parts of PGD */ + memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t)); + + spin_lock_irqsave(&pgd_lock, flags); + + /* If the pgd points to a shared pagetable level (either the + ptes in non-PAE, or shared PMD in PAE), then just copy the + references from swapper_pg_dir. */ + if (PAGETABLE_LEVELS == 2 || + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || + PAGETABLE_LEVELS == 4) { + clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT, + __pa(swapper_pg_dir) >> PAGE_SHIFT, + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + } + + /* list required to sync kernel mapping updates */ + if (!SHARED_KERNEL_PMD) + pgd_list_add(pgd); + + spin_unlock_irqrestore(&pgd_lock, flags); +} + +static void pgd_dtor(void *pgd) +{ + unsigned long flags; /* can be called from interrupt context */ + + if (SHARED_KERNEL_PMD) + return; + + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_del(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); +} + +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * -- wli + */ + +#ifdef CONFIG_X86_PAE +/* + * Mop up any pmd pages which may still be attached to the pgd. + * Normally they will be freed by munmap/exit_mmap, but any pmd we + * preallocate which never got a corresponding vma will need to be + * freed manually. + */ +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) +{ + int i; + + for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { + pgd_t pgd = pgdp[i]; + + if (pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + pgdp[i] = native_make_pgd(0); + + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(mm, pmd); + } + } +} + +/* + * In PAE mode, we need to do a cr3 reload (=tlb flush) when + * updating the top-level pagetable entries to guarantee the + * processor notices the update. Since this is expensive, and + * all 4 top-level entries are used almost immediately in a + * new process's life, we just pre-populate them here. + * + * Also, if we're in a paravirt environment where the kernel pmd is + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate + * and initialize the kernel pmds here. + */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + pud_t *pud; + unsigned long addr; + int i; + + pud = pud_offset(pgd, 0); + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; + i++, pud++, addr += PUD_SIZE) { + pmd_t *pmd = pmd_alloc_one(mm, addr); + + if (!pmd) { + pgd_mop_up_pmds(mm, pgd); + return 0; + } + + if (i >= KERNEL_PGD_BOUNDARY) + memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), + sizeof(pmd_t) * PTRS_PER_PMD); + + pud_populate(mm, pud, pmd); + } + + return 1; +} + +void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) +{ + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + + /* Note: almost everything apart from _PAGE_PRESENT is + reserved at the pmd (PDPT) level. */ + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + */ + if (mm == current->active_mm) + write_cr3(read_cr3()); +} +#else /* !CONFIG_X86_PAE */ +/* No need to prepopulate any pagetable entries in non-PAE modes. */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + return 1; +} + +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) +{ +} +#endif /* CONFIG_X86_PAE */ + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + + /* so that alloc_pmd can use it */ + mm->pgd = pgd; + if (pgd) + pgd_ctor(pgd); + + if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { + pgd_dtor(pgd); + free_page((unsigned long)pgd); + pgd = NULL; + } + + return pgd; +} + +void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + pgd_mop_up_pmds(mm, pgd); + pgd_dtor(pgd); + free_page((unsigned long)pgd); +} + +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + + if (changed && dirty) { + *ptep = entry; + pte_update_defer(vma->vm_mm, address, ptep); + flush_tlb_page(vma, address); + } + + return changed; +} + +int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int ret = 0; + + if (pte_young(*ptep)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + &ptep->pte); + + if (ret) + pte_update(vma->vm_mm, addr, ptep); + + return ret; +} + +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int young; + + young = ptep_test_and_clear_young(vma, address, ptep); + if (young) + flush_tlb_page(vma, address); + + return young; +} diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 6fb9e7c6893f..9ee007be9142 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -173,210 +173,6 @@ void reserve_top_address(unsigned long reserve) __VMALLOC_RESERVE += reserve; } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -{ - return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); -} - -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - struct page *pte; - -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); -#endif - if (pte) - pgtable_page_ctor(pte); - return pte; -} - -/* - * List of all pgd's needed for non-PAE so it can invalidate entries - * in both cached and uncached pgd's; not needed for PAE since the - * kernel pmd is shared. If PAE were not to share the pmd a similar - * tactic would be needed. This is essentially codepath-based locking - * against pageattr.c; it is the unique case in which a valid change - * of kernel pagetables can't be lazily synchronized by vmalloc faults. - * vmalloc faults work because attached pagetables are never freed. - * -- wli - */ -static inline void pgd_list_add(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - list_add(&page->lru, &pgd_list); -} - -static inline void pgd_list_del(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - list_del(&page->lru); -} - -#define UNSHARED_PTRS_PER_PGD \ - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) - -static void pgd_ctor(void *p) -{ - pgd_t *pgd = p; - unsigned long flags; - - /* Clear usermode parts of PGD */ - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); - - spin_lock_irqsave(&pgd_lock, flags); - - /* If the pgd points to a shared pagetable level (either the - ptes in non-PAE, or shared PMD in PAE), then just copy the - references from swapper_pg_dir. */ - if (PAGETABLE_LEVELS == 2 || - (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { - clone_pgd_range(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - } - - /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) - pgd_list_add(pgd); - - spin_unlock_irqrestore(&pgd_lock, flags); -} - -static void pgd_dtor(void *pgd) -{ - unsigned long flags; /* can be called from interrupt context */ - - if (SHARED_KERNEL_PMD) - return; - - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); -} - -#ifdef CONFIG_X86_PAE -/* - * Mop up any pmd pages which may still be attached to the pgd. - * Normally they will be freed by munmap/exit_mmap, but any pmd we - * preallocate which never got a corresponding vma will need to be - * freed manually. - */ -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) -{ - int i; - - for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { - pgd_t pgd = pgdp[i]; - - if (pgd_val(pgd) != 0) { - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); - - pgdp[i] = native_make_pgd(0); - - paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); - pmd_free(mm, pmd); - } - } -} - -/* - * In PAE mode, we need to do a cr3 reload (=tlb flush) when - * updating the top-level pagetable entries to guarantee the - * processor notices the update. Since this is expensive, and - * all 4 top-level entries are used almost immediately in a - * new process's life, we just pre-populate them here. - * - * Also, if we're in a paravirt environment where the kernel pmd is - * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate - * and initialize the kernel pmds here. - */ -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) -{ - pud_t *pud; - unsigned long addr; - int i; - - pud = pud_offset(pgd, 0); - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; - i++, pud++, addr += PUD_SIZE) { - pmd_t *pmd = pmd_alloc_one(mm, addr); - - if (!pmd) { - pgd_mop_up_pmds(mm, pgd); - return 0; - } - - if (i >= USER_PTRS_PER_PGD) - memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), - sizeof(pmd_t) * PTRS_PER_PMD); - - pud_populate(mm, pud, pmd); - } - - return 1; -} -#else /* !CONFIG_X86_PAE */ -/* No need to prepopulate any pagetable entries in non-PAE modes. */ -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) -{ - return 1; -} - -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) -{ -} -#endif /* CONFIG_X86_PAE */ - -pgd_t *pgd_alloc(struct mm_struct *mm) -{ - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - - /* so that alloc_pd can use it */ - mm->pgd = pgd; - if (pgd) - pgd_ctor(pgd); - - if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { - pgd_dtor(pgd); - free_page((unsigned long)pgd); - pgd = NULL; - } - - return pgd; -} - -void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - pgd_mop_up_pmds(mm, pgd); - pgd_dtor(pgd); - free_page((unsigned long)pgd); -} - -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) -{ - pgtable_page_dtor(pte); - paravirt_release_pt(page_to_pfn(pte)); - tlb_remove_page(tlb, pte); -} - -#ifdef CONFIG_X86_PAE - -void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) -{ - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(pmd)); -} - -#endif - int pmd_bad(pmd_t pmd) { WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd)); diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index fb43d89f46f3..3890234e5b26 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -163,7 +163,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) pxm, apic_id, node); } -int update_end_of_memory(unsigned long end) {return -1;} +static int update_end_of_memory(unsigned long end) {return -1;} static int hotadd_enough_memory(struct bootnode *nd) {return 1;} #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static inline int save_add_info(void) {return 1;} diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 4d5f2649bee4..2e641be2737e 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,7 +6,7 @@ config XEN bool "Xen guest support" select PARAVIRT depends on X86_32 - depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER) + depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 343df246bd3e..3d8df981d5fd 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,4 @@ -obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \ - events.o time.o manage.o xen-asm.o +obj-y := enlighten.o setup.o multicalls.o mmu.o \ + time.o manage.o xen-asm.o grant-table.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c0388220cf97..c8a56e457d61 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -155,7 +155,8 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, if (*ax == 1) maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ (1 << X86_FEATURE_ACPI) | /* disable ACPI */ - (1 << X86_FEATURE_SEP) | /* disable SEP */ + (1 << X86_FEATURE_MCE) | /* disable MCE */ + (1 << X86_FEATURE_MCA) | /* disable MCA */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ asm(XEN_EMULATE_PREFIX "cpuid" @@ -531,26 +532,37 @@ static void xen_apic_write(unsigned long reg, u32 val) static void xen_flush_tlb(void) { struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + struct multicall_space mcs; + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); op = mcs.args; op->cmd = MMUEXT_TLB_FLUSH_LOCAL; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); } static void xen_flush_tlb_single(unsigned long addr) { struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + struct multicall_space mcs; + + preempt_disable(); + mcs = xen_mc_entry(sizeof(*op)); op = mcs.args; op->cmd = MMUEXT_INVLPG_LOCAL; op->arg1.linear_addr = addr & PAGE_MASK; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); } static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, @@ -655,15 +667,17 @@ static void xen_write_cr3(unsigned long cr3) /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ -static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) +static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) { +#ifdef CONFIG_FLATMEM BUG_ON(mem_map); /* should only be used early */ +#endif make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); } -/* Early release_pt assumes that all pts are pinned, since there's +/* Early release_pte assumes that all pts are pinned, since there's only init_mm and anything attached to that is pinned. */ -static void xen_release_pt_init(u32 pfn) +static void xen_release_pte_init(u32 pfn) { make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } @@ -697,12 +711,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) } } -static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pte(struct mm_struct *mm, u32 pfn) { xen_alloc_ptpage(mm, pfn, PT_PTE); } -static void xen_alloc_pd(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) { xen_alloc_ptpage(mm, pfn, PT_PMD); } @@ -722,12 +736,12 @@ static void xen_release_ptpage(u32 pfn, unsigned level) } } -static void xen_release_pt(u32 pfn) +static void xen_release_pte(u32 pfn) { xen_release_ptpage(pfn, PT_PTE); } -static void xen_release_pd(u32 pfn) +static void xen_release_pmd(u32 pfn) { xen_release_ptpage(pfn, PT_PMD); } @@ -849,10 +863,10 @@ static __init void xen_pagetable_setup_done(pgd_t *base) { /* This will work as long as patching hasn't happened yet (which it hasn't) */ - pv_mmu_ops.alloc_pt = xen_alloc_pt; - pv_mmu_ops.alloc_pd = xen_alloc_pd; - pv_mmu_ops.release_pt = xen_release_pt; - pv_mmu_ops.release_pd = xen_release_pd; + pv_mmu_ops.alloc_pte = xen_alloc_pte; + pv_mmu_ops.alloc_pmd = xen_alloc_pmd; + pv_mmu_ops.release_pte = xen_release_pte; + pv_mmu_ops.release_pmd = xen_release_pmd; pv_mmu_ops.set_pte = xen_set_pte; setup_shared_info(); @@ -994,7 +1008,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .read_pmc = native_read_pmc, .iret = xen_iret, - .irq_enable_syscall_ret = NULL, /* never called */ + .irq_enable_syscall_ret = xen_sysexit, .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, @@ -1059,11 +1073,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, - .alloc_pt = xen_alloc_pt_init, - .release_pt = xen_release_pt_init, - .alloc_pd = xen_alloc_pt_init, - .alloc_pd_clone = paravirt_nop, - .release_pd = xen_release_pt_init, + .alloc_pte = xen_alloc_pte_init, + .release_pte = xen_release_pte_init, + .alloc_pmd = xen_alloc_pte_init, + .alloc_pmd_clone = paravirt_nop, + .release_pmd = xen_release_pte_init, #ifdef CONFIG_HIGHPTE .kmap_atomic_pte = xen_kmap_atomic_pte, diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c deleted file mode 100644 index dcf613e17581..000000000000 --- a/arch/x86/xen/events.c +++ /dev/null @@ -1,591 +0,0 @@ -/* - * Xen event channels - * - * Xen models interrupts with abstract event channels. Because each - * domain gets 1024 event channels, but NR_IRQ is not that large, we - * must dynamically map irqs<->event channels. The event channels - * interface with the rest of the kernel by defining a xen interrupt - * chip. When an event is recieved, it is mapped to an irq and sent - * through the normal interrupt processing path. - * - * There are four kinds of events which can be mapped to an event - * channel: - * - * 1. Inter-domain notifications. This includes all the virtual - * device events, since they're driven by front-ends in another domain - * (typically dom0). - * 2. VIRQs, typically used for timers. These are per-cpu events. - * 3. IPIs. - * 4. Hardware interrupts. Not supported at present. - * - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 - */ - -#include <linux/linkage.h> -#include <linux/interrupt.h> -#include <linux/irq.h> -#include <linux/module.h> -#include <linux/string.h> - -#include <asm/ptrace.h> -#include <asm/irq.h> -#include <asm/sync_bitops.h> -#include <asm/xen/hypercall.h> -#include <asm/xen/hypervisor.h> - -#include <xen/events.h> -#include <xen/interface/xen.h> -#include <xen/interface/event_channel.h> - -#include "xen-ops.h" - -/* - * This lock protects updates to the following mapping and reference-count - * arrays. The lock does not need to be acquired to read the mapping tables. - */ -static DEFINE_SPINLOCK(irq_mapping_update_lock); - -/* IRQ <-> VIRQ mapping. */ -static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; - -/* IRQ <-> IPI mapping */ -static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; - -/* Packed IRQ information: binding type, sub-type index, and event channel. */ -struct packed_irq -{ - unsigned short evtchn; - unsigned char index; - unsigned char type; -}; - -static struct packed_irq irq_info[NR_IRQS]; - -/* Binding types. */ -enum { - IRQT_UNBOUND, - IRQT_PIRQ, - IRQT_VIRQ, - IRQT_IPI, - IRQT_EVTCHN -}; - -/* Convenient shorthand for packed representation of an unbound IRQ. */ -#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) - -static int evtchn_to_irq[NR_EVENT_CHANNELS] = { - [0 ... NR_EVENT_CHANNELS-1] = -1 -}; -static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; -static u8 cpu_evtchn[NR_EVENT_CHANNELS]; - -/* Reference counts for bindings to IRQs. */ -static int irq_bindcount[NR_IRQS]; - -/* Xen will never allocate port zero for any purpose. */ -#define VALID_EVTCHN(chn) ((chn) != 0) - -/* - * Force a proper event-channel callback from Xen after clearing the - * callback mask. We do this in a very simple manner, by making a call - * down into Xen. The pending flag will be checked by Xen on return. - */ -void force_evtchn_callback(void) -{ - (void)HYPERVISOR_xen_version(0, NULL); -} -EXPORT_SYMBOL_GPL(force_evtchn_callback); - -static struct irq_chip xen_dynamic_chip; - -/* Constructor for packed IRQ information. */ -static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn) -{ - return (struct packed_irq) { evtchn, index, type }; -} - -/* - * Accessors for packed IRQ information. - */ -static inline unsigned int evtchn_from_irq(int irq) -{ - return irq_info[irq].evtchn; -} - -static inline unsigned int index_from_irq(int irq) -{ - return irq_info[irq].index; -} - -static inline unsigned int type_from_irq(int irq) -{ - return irq_info[irq].type; -} - -static inline unsigned long active_evtchns(unsigned int cpu, - struct shared_info *sh, - unsigned int idx) -{ - return (sh->evtchn_pending[idx] & - cpu_evtchn_mask[cpu][idx] & - ~sh->evtchn_mask[idx]); -} - -static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) -{ - int irq = evtchn_to_irq[chn]; - - BUG_ON(irq == -1); -#ifdef CONFIG_SMP - irq_desc[irq].affinity = cpumask_of_cpu(cpu); -#endif - - __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); - __set_bit(chn, cpu_evtchn_mask[cpu]); - - cpu_evtchn[chn] = cpu; -} - -static void init_evtchn_cpu_bindings(void) -{ -#ifdef CONFIG_SMP - int i; - /* By default all event channels notify CPU#0. */ - for (i = 0; i < NR_IRQS; i++) - irq_desc[i].affinity = cpumask_of_cpu(0); -#endif - - memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); - memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); -} - -static inline unsigned int cpu_from_evtchn(unsigned int evtchn) -{ - return cpu_evtchn[evtchn]; -} - -static inline void clear_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_clear_bit(port, &s->evtchn_pending[0]); -} - -static inline void set_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_pending[0]); -} - - -/** - * notify_remote_via_irq - send event to remote end of event channel via irq - * @irq: irq of event channel to send event to - * - * Unlike notify_remote_via_evtchn(), this is safe to use across - * save/restore. Notifications on a broken connection are silently - * dropped. - */ -void notify_remote_via_irq(int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - notify_remote_via_evtchn(evtchn); -} -EXPORT_SYMBOL_GPL(notify_remote_via_irq); - -static void mask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_mask[0]); -} - -static void unmask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - unsigned int cpu = get_cpu(); - - BUG_ON(!irqs_disabled()); - - /* Slow path (hypercall) if this is a non-local port. */ - if (unlikely(cpu != cpu_from_evtchn(port))) { - struct evtchn_unmask unmask = { .port = port }; - (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); - } else { - struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - - sync_clear_bit(port, &s->evtchn_mask[0]); - - /* - * The following is basically the equivalent of - * 'hw_resend_irq'. Just like a real IO-APIC we 'lose - * the interrupt edge' if the channel is masked. - */ - if (sync_test_bit(port, &s->evtchn_pending[0]) && - !sync_test_and_set_bit(port / BITS_PER_LONG, - &vcpu_info->evtchn_pending_sel)) - vcpu_info->evtchn_upcall_pending = 1; - } - - put_cpu(); -} - -static int find_unbound_irq(void) -{ - int irq; - - /* Only allocate from dynirq range */ - for (irq = 0; irq < NR_IRQS; irq++) - if (irq_bindcount[irq] == 0) - break; - - if (irq == NR_IRQS) - panic("No available IRQ to bind to: increase NR_IRQS!\n"); - - return irq; -} - -int bind_evtchn_to_irq(unsigned int evtchn) -{ - int irq; - - spin_lock(&irq_mapping_update_lock); - - irq = evtchn_to_irq[evtchn]; - - if (irq == -1) { - irq = find_unbound_irq(); - - dynamic_irq_init(irq); - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "event"); - - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); - } - - irq_bindcount[irq]++; - - spin_unlock(&irq_mapping_update_lock); - - return irq; -} -EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); - -static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) -{ - struct evtchn_bind_ipi bind_ipi; - int evtchn, irq; - - spin_lock(&irq_mapping_update_lock); - - irq = per_cpu(ipi_to_irq, cpu)[ipi]; - if (irq == -1) { - irq = find_unbound_irq(); - if (irq < 0) - goto out; - - dynamic_irq_init(irq); - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "ipi"); - - bind_ipi.vcpu = cpu; - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, - &bind_ipi) != 0) - BUG(); - evtchn = bind_ipi.port; - - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); - - per_cpu(ipi_to_irq, cpu)[ipi] = irq; - - bind_evtchn_to_cpu(evtchn, cpu); - } - - irq_bindcount[irq]++; - - out: - spin_unlock(&irq_mapping_update_lock); - return irq; -} - - -static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) -{ - struct evtchn_bind_virq bind_virq; - int evtchn, irq; - - spin_lock(&irq_mapping_update_lock); - - irq = per_cpu(virq_to_irq, cpu)[virq]; - - if (irq == -1) { - bind_virq.virq = virq; - bind_virq.vcpu = cpu; - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, - &bind_virq) != 0) - BUG(); - evtchn = bind_virq.port; - - irq = find_unbound_irq(); - - dynamic_irq_init(irq); - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "virq"); - - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); - - per_cpu(virq_to_irq, cpu)[virq] = irq; - - bind_evtchn_to_cpu(evtchn, cpu); - } - - irq_bindcount[irq]++; - - spin_unlock(&irq_mapping_update_lock); - - return irq; -} - -static void unbind_from_irq(unsigned int irq) -{ - struct evtchn_close close; - int evtchn = evtchn_from_irq(irq); - - spin_lock(&irq_mapping_update_lock); - - if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) { - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - switch (type_from_irq(irq)) { - case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) - [index_from_irq(irq)] = -1; - break; - default: - break; - } - - /* Closed ports are implicitly re-bound to VCPU0. */ - bind_evtchn_to_cpu(evtchn, 0); - - evtchn_to_irq[evtchn] = -1; - irq_info[irq] = IRQ_UNBOUND; - - dynamic_irq_init(irq); - } - - spin_unlock(&irq_mapping_update_lock); -} - -int bind_evtchn_to_irqhandler(unsigned int evtchn, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, void *dev_id) -{ - unsigned int irq; - int retval; - - irq = bind_evtchn_to_irq(evtchn); - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} -EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); - -int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, - irq_handler_t handler, - unsigned long irqflags, const char *devname, void *dev_id) -{ - unsigned int irq; - int retval; - - irq = bind_virq_to_irq(virq, cpu); - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} -EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); - -int bind_ipi_to_irqhandler(enum ipi_vector ipi, - unsigned int cpu, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, - void *dev_id) -{ - int irq, retval; - - irq = bind_ipi_to_irq(ipi, cpu); - if (irq < 0) - return irq; - - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} - -void unbind_from_irqhandler(unsigned int irq, void *dev_id) -{ - free_irq(irq, dev_id); - unbind_from_irq(irq); -} -EXPORT_SYMBOL_GPL(unbind_from_irqhandler); - -void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) -{ - int irq = per_cpu(ipi_to_irq, cpu)[vector]; - BUG_ON(irq < 0); - notify_remote_via_irq(irq); -} - - -/* - * Search the CPUs pending events bitmasks. For each one found, map - * the event number to an irq, and feed it into do_IRQ() for - * handling. - * - * Xen uses a two-level bitmap to speed searching. The first level is - * a bitset of words which contain pending event bits. The second - * level is a bitset of pending events themselves. - */ -void xen_evtchn_do_upcall(struct pt_regs *regs) -{ - int cpu = get_cpu(); - struct shared_info *s = HYPERVISOR_shared_info; - struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - unsigned long pending_words; - - vcpu_info->evtchn_upcall_pending = 0; - - /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ - pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); - while (pending_words != 0) { - unsigned long pending_bits; - int word_idx = __ffs(pending_words); - pending_words &= ~(1UL << word_idx); - - while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { - int bit_idx = __ffs(pending_bits); - int port = (word_idx * BITS_PER_LONG) + bit_idx; - int irq = evtchn_to_irq[port]; - - if (irq != -1) { - regs->orig_ax = ~irq; - do_IRQ(regs); - } - } - } - - put_cpu(); -} - -/* Rebind an evtchn so that it gets delivered to a specific cpu */ -static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) -{ - struct evtchn_bind_vcpu bind_vcpu; - int evtchn = evtchn_from_irq(irq); - - if (!VALID_EVTCHN(evtchn)) - return; - - /* Send future instances of this interrupt to other vcpu. */ - bind_vcpu.port = evtchn; - bind_vcpu.vcpu = tcpu; - - /* - * If this fails, it usually just indicates that we're dealing with a - * virq or IPI channel, which don't actually need to be rebound. Ignore - * it, but don't do the xenlinux-level rebind in that case. - */ - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) - bind_evtchn_to_cpu(evtchn, tcpu); -} - - -static void set_affinity_irq(unsigned irq, cpumask_t dest) -{ - unsigned tcpu = first_cpu(dest); - rebind_irq_to_cpu(irq, tcpu); -} - -static void enable_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - unmask_evtchn(evtchn); -} - -static void disable_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - mask_evtchn(evtchn); -} - -static void ack_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - - move_native_irq(irq); - - if (VALID_EVTCHN(evtchn)) - clear_evtchn(evtchn); -} - -static int retrigger_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - int ret = 0; - - if (VALID_EVTCHN(evtchn)) { - set_evtchn(evtchn); - ret = 1; - } - - return ret; -} - -static struct irq_chip xen_dynamic_chip __read_mostly = { - .name = "xen-dyn", - .mask = disable_dynirq, - .unmask = enable_dynirq, - .ack = ack_dynirq, - .set_affinity = set_affinity_irq, - .retrigger = retrigger_dynirq, -}; - -void __init xen_init_IRQ(void) -{ - int i; - - init_evtchn_cpu_bindings(); - - /* No event channels are 'live' right now. */ - for (i = 0; i < NR_EVENT_CHANNELS; i++) - mask_evtchn(i); - - /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ - for (i = 0; i < NR_IRQS; i++) - irq_bindcount[i] = 0; - - irq_ctx_init(smp_processor_id()); -} diff --git a/arch/x86/xen/features.c b/arch/x86/xen/features.c deleted file mode 100644 index 0707714e40d6..000000000000 --- a/arch/x86/xen/features.c +++ /dev/null @@ -1,29 +0,0 @@ -/****************************************************************************** - * features.c - * - * Xen feature flags. - * - * Copyright (c) 2006, Ian Campbell, XenSource Inc. - */ -#include <linux/types.h> -#include <linux/cache.h> -#include <linux/module.h> -#include <asm/xen/hypervisor.h> -#include <xen/features.h> - -u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; -EXPORT_SYMBOL_GPL(xen_features); - -void xen_setup_features(void) -{ - struct xen_feature_info fi; - int i, j; - - for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { - fi.submap_idx = i; - if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) - break; - for (j = 0; j < 32; j++) - xen_features[i * 32 + j] = !!(fi.submap & 1<<j); - } -} diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c new file mode 100644 index 000000000000..49ba9b5224d1 --- /dev/null +++ b/arch/x86/xen/grant-table.c @@ -0,0 +1,91 @@ +/****************************************************************************** + * grant_table.c + * x86 specific part + * + * Granting foreign access to our memory reservation. + * + * Copyright (c) 2005-2006, Christopher Clark + * Copyright (c) 2004-2005, K A Fraser + * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan. Split out x86 specific part. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> + +#include <xen/interface/xen.h> +#include <xen/page.h> +#include <xen/grant_table.h> + +#include <asm/pgtable.h> + +static int map_pte_fn(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +{ + unsigned long **frames = (unsigned long **)data; + + set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL)); + (*frames)++; + return 0; +} + +static int unmap_pte_fn(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +{ + + set_pte_at(&init_mm, addr, pte, __pte(0)); + return 0; +} + +int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, + unsigned long max_nr_gframes, + struct grant_entry **__shared) +{ + int rc; + struct grant_entry *shared = *__shared; + + if (shared == NULL) { + struct vm_struct *area = + xen_alloc_vm_area(PAGE_SIZE * max_nr_gframes); + BUG_ON(area == NULL); + shared = area->addr; + *__shared = shared; + } + + rc = apply_to_page_range(&init_mm, (unsigned long)shared, + PAGE_SIZE * nr_gframes, + map_pte_fn, &frames); + return rc; +} + +void arch_gnttab_unmap_shared(struct grant_entry *shared, + unsigned long nr_gframes) +{ + apply_to_page_range(&init_mm, (unsigned long)shared, + PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL); +} diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 2a054ef2a3da..6cbcf65609ad 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -156,6 +156,10 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { + /* updates to init_mm may be done without lock */ + if (mm == &init_mm) + preempt_disable(); + if (mm == current->mm || mm == &init_mm) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { struct multicall_space mcs; @@ -163,14 +167,61 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); xen_mc_issue(PARAVIRT_LAZY_MMU); - return; + goto out; } else if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) - return; + goto out; } xen_set_pte(ptep, pteval); + +out: + if (mm == &init_mm) + preempt_enable(); +} + +pteval_t xen_pte_val(pte_t pte) +{ + pteval_t ret = pte.pte; + + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; + + return ret; +} + +pgdval_t xen_pgd_val(pgd_t pgd) +{ + pgdval_t ret = pgd.pgd; + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; + return ret; +} + +pte_t xen_make_pte(pteval_t pte) +{ + if (pte & _PAGE_PRESENT) { + pte = phys_to_machine(XPADDR(pte)).maddr; + pte &= ~(_PAGE_PCD | _PAGE_PWT); + } + + return (pte_t){ .pte = pte }; } +pgd_t xen_make_pgd(pgdval_t pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd = phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} + +pmdval_t xen_pmd_val(pmd_t pmd) +{ + pmdval_t ret = native_pmd_val(pmd); + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; + return ret; +} #ifdef CONFIG_X86_PAE void xen_set_pud(pud_t *ptr, pud_t val) { @@ -214,100 +265,18 @@ void xen_pmd_clear(pmd_t *pmdp) xen_set_pmd(pmdp, __pmd(0)); } -unsigned long long xen_pte_val(pte_t pte) +pmd_t xen_make_pmd(pmdval_t pmd) { - unsigned long long ret = 0; - - if (pte.pte_low) { - ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low; - ret = machine_to_phys(XMADDR(ret)).paddr | 1; - } - - return ret; -} - -unsigned long long xen_pmd_val(pmd_t pmd) -{ - unsigned long long ret = pmd.pmd; - if (ret) - ret = machine_to_phys(XMADDR(ret)).paddr | 1; - return ret; -} - -unsigned long long xen_pgd_val(pgd_t pgd) -{ - unsigned long long ret = pgd.pgd; - if (ret) - ret = machine_to_phys(XMADDR(ret)).paddr | 1; - return ret; -} - -pte_t xen_make_pte(unsigned long long pte) -{ - if (pte & _PAGE_PRESENT) { - pte = phys_to_machine(XPADDR(pte)).maddr; - pte &= ~(_PAGE_PCD | _PAGE_PWT); - } - - return (pte_t){ .pte = pte }; -} - -pmd_t xen_make_pmd(unsigned long long pmd) -{ - if (pmd & 1) + if (pmd & _PAGE_PRESENT) pmd = phys_to_machine(XPADDR(pmd)).maddr; - return (pmd_t){ pmd }; -} - -pgd_t xen_make_pgd(unsigned long long pgd) -{ - if (pgd & _PAGE_PRESENT) - pgd = phys_to_machine(XPADDR(pgd)).maddr; - - return (pgd_t){ pgd }; + return native_make_pmd(pmd); } #else /* !PAE */ void xen_set_pte(pte_t *ptep, pte_t pte) { *ptep = pte; } - -unsigned long xen_pte_val(pte_t pte) -{ - unsigned long ret = pte.pte_low; - - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr; - - return ret; -} - -unsigned long xen_pgd_val(pgd_t pgd) -{ - unsigned long ret = pgd.pgd; - if (ret) - ret = machine_to_phys(XMADDR(ret)).paddr | 1; - return ret; -} - -pte_t xen_make_pte(unsigned long pte) -{ - if (pte & _PAGE_PRESENT) { - pte = phys_to_machine(XPADDR(pte)).maddr; - pte &= ~(_PAGE_PCD | _PAGE_PWT); - } - - return (pte_t){ pte }; -} - -pgd_t xen_make_pgd(unsigned long pgd) -{ - if (pgd & _PAGE_PRESENT) - pgd = phys_to_machine(XPADDR(pgd)).maddr; - - return (pgd_t){ pgd }; -} #endif /* CONFIG_X86_PAE */ /* diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 2341492bf7a0..82517e4a752a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -16,6 +16,7 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> +#include <xen/interface/callback.h> #include <xen/interface/physdev.h> #include <xen/features.h> @@ -68,6 +69,24 @@ static void __init fiddle_vdso(void) *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; } +void xen_enable_sysenter(void) +{ + int cpu = smp_processor_id(); + extern void xen_sysenter_target(void); + /* Mask events on entry, even though they get enabled immediately */ + static struct callback_register sysenter = { + .type = CALLBACKTYPE_sysenter, + .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target }, + .flags = CALLBACKF_mask_events, + }; + + if (!boot_cpu_has(X86_FEATURE_SEP) || + HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { + clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); + } +} + void __init xen_arch_setup(void) { struct physdev_set_iopl set_iopl; @@ -82,6 +101,8 @@ void __init xen_arch_setup(void) HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, __KERNEL_CS, (unsigned long)xen_failsafe_callback); + xen_enable_sysenter(); + set_iopl.iopl = 1; rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); if (rc != 0) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index e340ff92f6b6..92dd3dbf3ffb 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -36,8 +36,9 @@ #include "mmu.h" static cpumask_t xen_cpu_initialized_map; -static DEFINE_PER_CPU(int, resched_irq); -static DEFINE_PER_CPU(int, callfunc_irq); +static DEFINE_PER_CPU(int, resched_irq) = -1; +static DEFINE_PER_CPU(int, callfunc_irq) = -1; +static DEFINE_PER_CPU(int, debug_irq) = -1; /* * Structure and data for smp_call_function(). This is designed to minimise @@ -72,6 +73,7 @@ static __cpuinit void cpu_bringup_and_idle(void) int cpu = smp_processor_id(); cpu_init(); + xen_enable_sysenter(); preempt_disable(); per_cpu(cpu_state, cpu) = CPU_ONLINE; @@ -88,9 +90,7 @@ static __cpuinit void cpu_bringup_and_idle(void) static int xen_smp_intr_init(unsigned int cpu) { int rc; - const char *resched_name, *callfunc_name; - - per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; + const char *resched_name, *callfunc_name, *debug_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, @@ -114,6 +114,14 @@ static int xen_smp_intr_init(unsigned int cpu) goto fail; per_cpu(callfunc_irq, cpu) = rc; + debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); + rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, + IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING, + debug_name, NULL); + if (rc < 0) + goto fail; + per_cpu(debug_irq, cpu) = rc; + return 0; fail: @@ -121,6 +129,8 @@ static int xen_smp_intr_init(unsigned int cpu) unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); if (per_cpu(callfunc_irq, cpu) >= 0) unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); + if (per_cpu(debug_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); return rc; } diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index fe161ed4b01e..2497a30f41de 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -108,6 +108,20 @@ ENDPATCH(xen_restore_fl_direct) RELOC(xen_restore_fl_direct, 2b+1) /* + We can't use sysexit directly, because we're not running in ring0. + But we can easily fake it up using iret. Assuming xen_sysexit + is jumped to with a standard stack frame, we can just strip it + back to a standard iret frame and use iret. + */ +ENTRY(xen_sysexit) + movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ + orl $X86_EFLAGS_IF, PT_EFLAGS(%esp) + lea PT_EIP(%esp), %esp + + jmp xen_iret +ENDPROC(xen_sysexit) + +/* This is run where a normal iret would be run, with the same stack setup: 8: eflags 4: cs @@ -184,8 +198,12 @@ iret_restore_end: region is OK. */ je xen_hypervisor_callback - iret +1: iret xen_iret_end_crit: +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous hyper_iret: /* put this out of line since its very rarely used */ @@ -219,9 +237,7 @@ hyper_iret: ds } SAVE_ALL state eax } : : - ebx } - ---------------- - return addr <- esp + ebx }<- esp ---------------- In order to deliver the nested exception properly, we need to shift @@ -236,10 +252,8 @@ hyper_iret: it's usermode state which we eventually need to restore. */ ENTRY(xen_iret_crit_fixup) - /* offsets +4 for return address */ - /* - Paranoia: Make sure we're really coming from userspace. + Paranoia: Make sure we're really coming from kernel space. One could imagine a case where userspace jumps into the critical range address, but just before the CPU delivers a GP, it decides to deliver an interrupt instead. Unlikely? @@ -248,32 +262,32 @@ ENTRY(xen_iret_crit_fixup) jump instruction itself, not the destination, but some virtual environments get this wrong. */ - movl PT_CS+4(%esp), %ecx + movl PT_CS(%esp), %ecx andl $SEGMENT_RPL_MASK, %ecx cmpl $USER_RPL, %ecx je 2f - lea PT_ORIG_EAX+4(%esp), %esi - lea PT_EFLAGS+4(%esp), %edi + lea PT_ORIG_EAX(%esp), %esi + lea PT_EFLAGS(%esp), %edi /* If eip is before iret_restore_end then stack hasn't been restored yet. */ cmp $iret_restore_end, %eax jae 1f - movl 0+4(%edi),%eax /* copy EAX */ - movl %eax, PT_EAX+4(%esp) + movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */ + movl %eax, PT_EAX(%esp) lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ /* set up the copy */ 1: std - mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */ + mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */ rep movsl cld lea 4(%edi),%esp /* point esp to new frame */ -2: ret +2: jmp xen_do_upcall /* diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 956a491ea998..f1063ae08037 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -2,6 +2,8 @@ #define XEN_OPS_H #include <linux/init.h> +#include <linux/irqreturn.h> +#include <xen/xen-ops.h> /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; @@ -9,7 +11,6 @@ extern const char xen_failsafe_callback[]; void xen_copy_trap_info(struct trap_info *traps); -DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); DECLARE_PER_CPU(unsigned long, xen_cr3); DECLARE_PER_CPU(unsigned long, xen_current_cr3); @@ -19,6 +20,7 @@ extern struct shared_info *HYPERVISOR_shared_info; char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); +void xen_enable_sysenter(void); void xen_setup_timer(int cpu); void xen_setup_cpu_clockevents(void); @@ -28,6 +30,8 @@ unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); unsigned long long xen_sched_clock(void); +irqreturn_t xen_debug_interrupt(int irq, void *dev_id); + bool xen_vcpu_stolen(int vcpu); void xen_mark_init_mm_pinned(void); @@ -64,4 +68,6 @@ DECL_ASM(unsigned long, xen_save_fl_direct, void); DECL_ASM(void, xen_restore_fl_direct, unsigned long); void xen_iret(void); +void xen_sysexit(void); + #endif /* XEN_OPS_H */ |