diff options
Diffstat (limited to 'arch/x86')
99 files changed, 4855 insertions, 2445 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4d350b5cbc71..a12dbb2b93f3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -86,9 +86,6 @@ config GENERIC_GPIO config ARCH_MAY_HAVE_PC_FDC def_bool y -config DMI - def_bool y - config RWSEM_GENERIC_SPINLOCK def_bool !X86_XADD @@ -114,6 +111,9 @@ config GENERIC_TIME_VSYSCALL config ARCH_HAS_CPU_RELAX def_bool y +config ARCH_HAS_CACHE_LINE_SIZE + def_bool y + config HAVE_SETUP_PER_CPU_AREA def_bool X86_64 || (X86_SMP && !X86_VOYAGER) @@ -142,6 +142,9 @@ config AUDIT_ARCH config ARCH_SUPPORTS_AOUT def_bool y +config ARCH_SUPPORTS_OPTIMIZED_INLINING + def_bool y + # Use the generic interrupt handling code in kernel/irq/: config GENERIC_HARDIRQS bool @@ -370,6 +373,25 @@ config VMI at the moment), by linking the kernel to a GPL-ed ROM module provided by the hypervisor. +config KVM_CLOCK + bool "KVM paravirtualized clock" + select PARAVIRT + depends on !(X86_VISWS || X86_VOYAGER) + help + Turning on this option will allow you to run a paravirtualized clock + when running over the KVM hypervisor. Instead of relying on a PIT + (or probably other) emulation by the underlying device model, the host + provides the guest with timing infrastructure such as time of day, and + system time + +config KVM_GUEST + bool "KVM Guest support" + select PARAVIRT + depends on !(X86_VISWS || X86_VOYAGER) + help + This option enables various optimizations for running under the KVM + hypervisor. + source "arch/x86/lguest/Kconfig" config PARAVIRT @@ -460,6 +482,15 @@ config HPET_EMULATE_RTC # Mark as embedded because too many people got it wrong. # The code disables itself when not needed. +config DMI + default y + bool "Enable DMI scanning" if EMBEDDED + help + Enabled scanning of DMI to identify machine quirks. Say Y + here unless you have verified that your setup is not + affected by entries in the DMI blacklist. Required by PNP + BIOS code. + config GART_IOMMU bool "GART IOMMU support" if EMBEDDED default y @@ -1049,9 +1080,9 @@ config MTRR See <file:Documentation/mtrr.txt> for more information. config X86_PAT - def_bool y + bool prompt "x86 PAT support" - depends on MTRR && NONPROMISC_DEVMEM + depends on MTRR help Use PAT attributes to setup page level cache control. diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 57072f2716f9..7ef18b01f0bc 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -21,8 +21,8 @@ config M386 Here are the settings recommended for greatest speed: - "386" for the AMD/Cyrix/Intel 386DX/DXL/SL/SLC/SX, Cyrix/TI - 486DLC/DLC2, UMC 486SX-S and NexGen Nx586. Only "386" kernels - will run on a 386 class machine. + 486DLC/DLC2, and UMC 486SX-S. Only "386" kernels will run on a 386 + class machine. - "486" for the AMD/Cyrix/IBM/Intel 486DX/DX2/DX4 or SL/SLC/SLC2/SLC3/SX/SX2 and UMC U5D or U5S. - "586" for generic Pentium CPUs lacking the TSC @@ -278,6 +278,11 @@ config GENERIC_CPU endchoice +config X86_CPU + def_bool y + select GENERIC_FIND_FIRST_BIT + select GENERIC_FIND_NEXT_BIT + config X86_GENERIC bool "Generic x86 support" depends on X86_32 @@ -398,7 +403,7 @@ config X86_TSC # generates cmov. config X86_CMOV def_bool y - depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7) + depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64) config X86_MINIMUM_CPU_FAMILY int diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 610aaecc19f8..5b1979a45a1e 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -5,6 +5,17 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" +config NONPROMISC_DEVMEM + bool "Disable promiscuous /dev/mem" + help + The /dev/mem file by default only allows userspace access to PCI + space and the BIOS code and data regions. This is sufficient for + dosemu and X and all common users of /dev/mem. With this config + option, you allow userspace access to all of memory, including + kernel and userspace memory. Accidental access to this is + obviously disasterous, but specific access can be used by people + debugging the kernel. + config EARLY_PRINTK bool "Early printk" if EMBEDDED default y @@ -246,3 +257,16 @@ config CPA_DEBUG Do change_page_attr() self-tests every 30 seconds. endmenu + +config OPTIMIZE_INLINING + bool "Allow gcc to uninline functions marked 'inline'" + default y + help + This option determines if the kernel forces gcc to inline the functions + developers have marked 'inline'. Doing so takes away freedom from gcc to + do what it thinks is best, which is desirable for the gcc 3.x series of + compilers. The gcc 4.x series have a rewritten inlining algorithm and + disabling this option will generate a smaller kernel there. Hopefully + this algorithm is so good that allowing gcc4 to make the decision can + become the default in the future, until then this option is there to + test gcc for this. diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore index b1bdc4c6f9f2..172cf8a98bdd 100644 --- a/arch/x86/boot/.gitignore +++ b/arch/x86/boot/.gitignore @@ -1,7 +1,8 @@ bootsect bzImage +cpustr.h +mkcpustr +offsets.h setup setup.bin setup.elf -cpustr.h -mkcpustr diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 6d2df8d61c54..af86e431acfa 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -120,7 +120,7 @@ _start: # Part 2 of the header, from the old setup.S .ascii "HdrS" # header signature - .word 0x0208 # header version number (>= 0x0105) + .word 0x0209 # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) .globl realmode_swtch realmode_swtch: .word 0, 0 # default_switch, SETUPSEG @@ -227,6 +227,10 @@ hardware_subarch_data: .quad 0 payload_offset: .long input_data payload_length: .long input_data_end-input_data +setup_data: .quad 0 # 64-bit physical pointer to + # single linked list of + # struct setup_data + # End of setup header ##################################################### .section ".inittext", "ax" diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 3df340b54e57..ad7ddaaff588 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1421,6 +1421,7 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set # CONFIG_FRAME_POINTER is not set +CONFIG_OPTIMIZE_INLINING=y # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index eef98cb00c62..2d6f5b2809d2 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1346,6 +1346,7 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set # CONFIG_FRAME_POINTER is not set +CONFIG_OPTIMIZE_INLINING=y # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 05e155d3fb6c..bbed3a26ce55 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -499,11 +499,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, regs->cs = __USER32_CS; regs->ss = __USER32_DS; - set_fs(USER_DS); - regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF); - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); - #if DEBUG_SIG printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", current->comm, current->pid, frame, regs->ip, frame->pretcode); @@ -599,11 +594,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->cs = __USER32_CS; regs->ss = __USER32_DS; - set_fs(USER_DS); - regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF); - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); - #if DEBUG_SIG printk(KERN_DEBUG "SIG deliver (%s:%d): sp=%p pc=%lx ra=%u\n", current->comm, current->pid, frame, regs->ip, frame->pretcode); diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ae7158bce4d6..b5e329da166c 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -430,7 +430,7 @@ ia32_sys_call_table: .quad sys_setuid16 .quad sys_getuid16 .quad compat_sys_stime /* stime */ /* 25 */ - .quad sys32_ptrace /* ptrace */ + .quad compat_sys_ptrace /* ptrace */ .quad sys_alarm .quad sys_fstat /* (old)fstat */ .quad sys_pause diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 90e092d0af0c..fa19c3819540 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -80,6 +80,8 @@ obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o +obj-$(CONFIG_KVM_GUEST) += kvm.o +obj-$(CONFIG_KVM_CLOCK) += kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o ifdef CONFIG_INPUT_PCSPKR diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 057ccf1d5ad4..977ed5cdeaa3 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -697,10 +697,6 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) #define HPET_RESOURCE_NAME_SIZE 9 hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); - if (!hpet_res) - return 0; - - memset(hpet_res, 0, sizeof(*hpet_res)); hpet_res->name = (void *)&hpet_res[1]; hpet_res->flags = IORESOURCE_MEM; snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, "HPET %u", diff --git a/arch/x86/kernel/acpi/realmode/.gitignore b/arch/x86/kernel/acpi/realmode/.gitignore new file mode 100644 index 000000000000..58f1f48a58f8 --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/.gitignore @@ -0,0 +1,3 @@ +wakeup.bin +wakeup.elf +wakeup.lds diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index df4099dc1c68..65c7857a90dd 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -511,31 +511,30 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) unsigned long flags; char *vaddr; int nr_pages = 2; + struct page *pages[2]; + int i; - BUG_ON(len > sizeof(long)); - BUG_ON((((long)addr + len - 1) & ~(sizeof(long) - 1)) - - ((long)addr & ~(sizeof(long) - 1))); - if (kernel_text_address((unsigned long)addr)) { - struct page *pages[2] = { virt_to_page(addr), - virt_to_page(addr + PAGE_SIZE) }; - if (!pages[1]) - nr_pages = 1; - vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - BUG_ON(!vaddr); - local_irq_save(flags); - memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); - local_irq_restore(flags); - vunmap(vaddr); + if (!core_kernel_text((unsigned long)addr)) { + pages[0] = vmalloc_to_page(addr); + pages[1] = vmalloc_to_page(addr + PAGE_SIZE); } else { - /* - * modules are in vmalloc'ed memory, always writable. - */ - local_irq_save(flags); - memcpy(addr, opcode, len); - local_irq_restore(flags); + pages[0] = virt_to_page(addr); + WARN_ON(!PageReserved(pages[0])); + pages[1] = virt_to_page(addr + PAGE_SIZE); } + BUG_ON(!pages[0]); + if (!pages[1]) + nr_pages = 1; + vaddr = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + BUG_ON(!vaddr); + local_irq_save(flags); + memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); + local_irq_restore(flags); + vunmap(vaddr); sync_core(); /* Could also do a CLFLUSH here to speed up CPU recovery; but that causes hangs on some VIA CPUs. */ + for (i = 0; i < len; i++) + BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); return addr; } diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 8317401170b8..4b99b1bdeb6c 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -451,7 +451,8 @@ void __init setup_boot_APIC_clock(void) } /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32); + lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, + lapic_clockevent.shift); lapic_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); lapic_clockevent.min_delta_ns = diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index bf83157337e4..5910020c3f24 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -360,7 +360,8 @@ static void __init calibrate_APIC_clock(void) result / 1000 / 1000, result / 1000 % 1000); /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, 32); + lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, + lapic_clockevent.shift); lapic_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); lapic_clockevent.min_delta_ns = diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index f0030a0999c7..bf9290e29013 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -904,6 +904,7 @@ recalc: original_pm_idle(); else default_idle(); + local_irq_disable(); jiffies_since_last_check = jiffies - last_jiffies; if (jiffies_since_last_check > idle_period) goto recalc; @@ -911,6 +912,8 @@ recalc: if (apm_idle_done) apm_do_busy(); + + local_irq_enable(); } /** @@ -1189,19 +1192,6 @@ static int suspend(int vetoable) int err; struct apm_user *as; - if (pm_send_all(PM_SUSPEND, (void *)3)) { - /* Vetoed */ - if (vetoable) { - if (apm_info.connection_version > 0x100) - set_system_power_state(APM_STATE_REJECT); - err = -EBUSY; - ignore_sys_suspend = 0; - printk(KERN_WARNING "apm: suspend was vetoed.\n"); - goto out; - } - printk(KERN_CRIT "apm: suspend was vetoed, but suspending anyway.\n"); - } - device_suspend(PMSG_SUSPEND); local_irq_disable(); device_power_down(PMSG_SUSPEND); @@ -1224,9 +1214,7 @@ static int suspend(int vetoable) device_power_up(); local_irq_enable(); device_resume(); - pm_send_all(PM_RESUME, (void *)0); queue_event(APM_NORMAL_RESUME, NULL); - out: spin_lock(&user_list_lock); for (as = user_list; as != NULL; as = as->next) { as->suspend_wait = 0; @@ -1337,7 +1325,6 @@ static void check_events(void) if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { device_resume(); - pm_send_all(PM_RESUME, (void *)0); queue_event(event, NULL); } ignore_normal_resume = 0; diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index ee7c45235e54..a0c6f8190887 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -11,7 +11,6 @@ obj-$(CONFIG_X86_32) += cyrix.o obj-$(CONFIG_X86_32) += centaur.o obj-$(CONFIG_X86_32) += transmeta.o obj-$(CONFIG_X86_32) += intel.o -obj-$(CONFIG_X86_32) += nexgen.o obj-$(CONFIG_X86_32) += umc.o obj-$(CONFIG_X86_MCE) += mcheck/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 0173065dc3b7..245866828294 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -343,10 +343,4 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = { .c_size_cache = amd_size_cache, }; -int __init amd_init_cpu(void) -{ - cpu_devs[X86_VENDOR_AMD] = &amd_cpu_dev; - return 0; -} - cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index e2d870de837c..8db8f73503b3 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -339,6 +339,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu) { struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); unsigned int freq; + unsigned int cached_freq; dprintk("get_cur_freq_on_cpu (%d)\n", cpu); @@ -347,7 +348,16 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu) return 0; } + cached_freq = data->freq_table[data->acpi_data->state].frequency; freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data); + if (freq != cached_freq) { + /* + * The dreaded BIOS frequency change behind our back. + * Force set the frequency on next target call. + */ + data->resume = 1; + } + dprintk("cur freq = %u\n", freq); return freq; diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 9a699ed03598..e07e8c068ae0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -49,7 +49,7 @@ static int banks; static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; static unsigned long notify_user; static int rip_msr; -static int mce_bootlog = 1; +static int mce_bootlog = -1; static atomic_t mce_events; static char trigger[128]; @@ -471,13 +471,15 @@ static void mce_init(void *dummy) static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) { /* This should be disabled by the BIOS, but isn't always */ - if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) { - /* disable GART TBL walk error reporting, which trips off - incorrectly with the IOMMU & 3ware & Cerberus. */ - clear_bit(10, &bank[4]); - /* Lots of broken BIOS around that don't clear them - by default and leave crap in there. Don't log. */ - mce_bootlog = 0; + if (c->x86_vendor == X86_VENDOR_AMD) { + if(c->x86 == 15) + /* disable GART TBL walk error reporting, which trips off + incorrectly with the IOMMU & 3ware & Cerberus. */ + clear_bit(10, &bank[4]); + if(c->x86 <= 17 && mce_bootlog < 0) + /* Lots of broken BIOS around that don't clear them + by default and leave crap in there. Don't log. */ + mce_bootlog = 0; } } diff --git a/arch/x86/kernel/cpu/nexgen.c b/arch/x86/kernel/cpu/nexgen.c deleted file mode 100644 index 5d5e1c134123..000000000000 --- a/arch/x86/kernel/cpu/nexgen.c +++ /dev/null @@ -1,59 +0,0 @@ -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/string.h> -#include <asm/processor.h> - -#include "cpu.h" - -/* - * Detect a NexGen CPU running without BIOS hypercode new enough - * to have CPUID. (Thanks to Herbert Oppmann) - */ - -static int __cpuinit deep_magic_nexgen_probe(void) -{ - int ret; - - __asm__ __volatile__ ( - " movw $0x5555, %%ax\n" - " xorw %%dx,%%dx\n" - " movw $2, %%cx\n" - " divw %%cx\n" - " movl $0, %%eax\n" - " jnz 1f\n" - " movl $1, %%eax\n" - "1:\n" - : "=a" (ret) : : "cx", "dx"); - return ret; -} - -static void __cpuinit init_nexgen(struct cpuinfo_x86 *c) -{ - c->x86_cache_size = 256; /* A few had 1 MB... */ -} - -static void __cpuinit nexgen_identify(struct cpuinfo_x86 *c) -{ - /* Detect NexGen with old hypercode */ - if (deep_magic_nexgen_probe()) - strcpy(c->x86_vendor_id, "NexGenDriven"); -} - -static struct cpu_dev nexgen_cpu_dev __cpuinitdata = { - .c_vendor = "Nexgen", - .c_ident = { "NexGenDriven" }, - .c_models = { - { .vendor = X86_VENDOR_NEXGEN, - .family = 5, - .model_names = { [1] = "Nx586" } - }, - }, - .c_init = init_nexgen, - .c_identify = nexgen_identify, -}; - -int __init nexgen_init_cpu(void) -{ - cpu_devs[X86_VENDOR_NEXGEN] = &nexgen_cpu_dev; - return 0; -} diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index b943e10ad814..f9ae93adffe5 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -614,16 +614,6 @@ static struct wd_ops intel_arch_wd_ops __read_mostly = { .evntsel = MSR_ARCH_PERFMON_EVENTSEL1, }; -static struct wd_ops coreduo_wd_ops = { - .reserve = single_msr_reserve, - .unreserve = single_msr_unreserve, - .setup = setup_intel_arch_watchdog, - .rearm = p6_rearm, - .stop = single_msr_stop_watchdog, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .evntsel = MSR_ARCH_PERFMON_EVENTSEL0, -}; - static void probe_nmi_watchdog(void) { switch (boot_cpu_data.x86_vendor) { @@ -637,8 +627,8 @@ static void probe_nmi_watchdog(void) /* Work around Core Duo (Yonah) errata AE49 where perfctr1 doesn't have a working enable bit. */ if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) { - wd_ops = &coreduo_wd_ops; - break; + intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0; + intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0; } if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { wd_ops = &intel_arch_wd_ops; diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 2251d0ae9570..268553817909 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -25,6 +25,7 @@ #include <asm/hpet.h> #include <linux/kdebug.h> #include <asm/smp.h> +#include <asm/reboot.h> #include <mach_ipi.h> @@ -117,7 +118,7 @@ static void nmi_shootdown_cpus(void) } #endif -void machine_crash_shutdown(struct pt_regs *regs) +void native_machine_crash_shutdown(struct pt_regs *regs) { /* This function is only called after the system * has panicked or is otherwise in a critical state. diff --git a/arch/x86/kernel/e820_64.c b/arch/x86/kernel/e820_64.c index cbd42e51cb08..645ee5e32a27 100644 --- a/arch/x86/kernel/e820_64.c +++ b/arch/x86/kernel/e820_64.c @@ -84,14 +84,41 @@ void __init reserve_early(unsigned long start, unsigned long end, char *name) strncpy(r->name, name, sizeof(r->name) - 1); } -void __init early_res_to_bootmem(void) +void __init free_early(unsigned long start, unsigned long end) +{ + struct early_res *r; + int i, j; + + for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { + r = &early_res[i]; + if (start == r->start && end == r->end) + break; + } + if (i >= MAX_EARLY_RES || !early_res[i].end) + panic("free_early on not reserved area: %lx-%lx!", start, end); + + for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++) + ; + + memcpy(&early_res[i], &early_res[i + 1], + (j - 1 - i) * sizeof(struct early_res)); + + early_res[j - 1].end = 0; +} + +void __init early_res_to_bootmem(unsigned long start, unsigned long end) { int i; + unsigned long final_start, final_end; for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { struct early_res *r = &early_res[i]; - printk(KERN_INFO "early res: %d [%lx-%lx] %s\n", i, - r->start, r->end - 1, r->name); - reserve_bootmem_generic(r->start, r->end - r->start); + final_start = max(start, r->start); + final_end = min(end, r->end); + if (final_start >= final_end) + continue; + printk(KERN_INFO " early res: %d [%lx-%lx] %s\n", i, + final_start, final_end - 1, r->name); + reserve_bootmem_generic(final_start, final_end - final_start); } } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f0f8934fc303..2a609dc3271c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -409,7 +409,7 @@ restore_nocheck_notrace: irq_return: INTERRUPT_RETURN .section .fixup,"ax" -iret_exc: +ENTRY(iret_exc) pushl $0 # no error code pushl $do_iret_error jmp error_code @@ -1017,6 +1017,13 @@ ENTRY(kernel_thread_helper) ENDPROC(kernel_thread_helper) #ifdef CONFIG_XEN +/* Xen doesn't set %esp to be precisely what the normal sysenter + entrypoint expects, so fix it up before using the normal path. */ +ENTRY(xen_sysenter_target) + RING0_INT_FRAME + addl $5*4, %esp /* remove xen-provided frame */ + jmp sysenter_past_esp + ENTRY(xen_hypervisor_callback) CFI_STARTPROC pushl $0 @@ -1035,8 +1042,9 @@ ENTRY(xen_hypervisor_callback) cmpl $xen_iret_end_crit,%eax jae 1f - call xen_iret_crit_fixup + jmp xen_iret_crit_fixup +ENTRY(xen_do_upcall) 1: mov %esp, %eax call xen_evtchn_do_upcall jmp ret_from_intr diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 9546ef408b92..021624c83583 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -51,7 +51,7 @@ void __init setup_apic_routing(void) else #endif - if (cpus_weight(cpu_possible_map) <= 8) + if (num_possible_cpus() <= 8) genapic = &apic_flat; else genapic = &apic_physflat; diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 993c76773256..e25c57b8aa84 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -11,6 +11,7 @@ #include <linux/string.h> #include <linux/percpu.h> #include <linux/start_kernel.h> +#include <linux/io.h> #include <asm/processor.h> #include <asm/proto.h> @@ -22,6 +23,7 @@ #include <asm/sections.h> #include <asm/kdebug.h> #include <asm/e820.h> +#include <asm/bios_ebda.h> static void __init zap_identity_mappings(void) { @@ -49,7 +51,6 @@ static void __init copy_bootdata(char *real_mode_data) } } -#define BIOS_EBDA_SEGMENT 0x40E #define BIOS_LOWMEM_KILOBYTES 0x413 /* @@ -80,8 +81,7 @@ static void __init reserve_ebda_region(void) lowmem <<= 10; /* start of EBDA area */ - ebda_addr = *(unsigned short *)__va(BIOS_EBDA_SEGMENT); - ebda_addr <<= 4; + ebda_addr = get_bios_ebda(); /* Fixup: bios puts an EBDA in the top 64K segment */ /* of conventional memory, but does not adjust lowmem. */ @@ -101,6 +101,24 @@ static void __init reserve_ebda_region(void) reserve_early(lowmem, 0x100000, "BIOS reserved"); } +static void __init reserve_setup_data(void) +{ + struct setup_data *data; + unsigned long pa_data; + char buf[32]; + + if (boot_params.hdr.version < 0x0209) + return; + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_ioremap(pa_data, sizeof(*data)); + sprintf(buf, "setup data %x", data->type); + reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); + pa_data = data->next; + early_iounmap(data, sizeof(*data)); + } +} + void __init x86_64_start_kernel(char * real_mode_data) { int i; @@ -157,6 +175,7 @@ void __init x86_64_start_kernel(char * real_mode_data) #endif reserve_ebda_region(); + reserve_setup_data(); /* * At this point everything still needed from the boot loader diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 36652ea1a265..9007f9ea64ee 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -218,7 +218,7 @@ static void hpet_legacy_clockevent_register(void) hpet_freq = 1000000000000000ULL; do_div(hpet_freq, hpet_period); hpet_clockevent.mult = div_sc((unsigned long) hpet_freq, - NSEC_PER_SEC, 32); + NSEC_PER_SEC, hpet_clockevent.shift); /* Calculate the min / max delta */ hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &hpet_clockevent); diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 8540abe86ade..c1b5e3ece1f2 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -115,7 +115,8 @@ void __init setup_pit_timer(void) * IO_APIC has been initialized. */ pit_clockevent.cpumask = cpumask_of_cpu(smp_processor_id()); - pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32); + pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, + pit_clockevent.shift); pit_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_clockevent); pit_clockevent.min_delta_ns = @@ -224,7 +225,8 @@ static int __init init_pit_clocksource(void) pit_clockevent.mode != CLOCK_EVT_MODE_PERIODIC) return 0; - clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20); + clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, + clocksource_pit.shift); return clocksource_register(&clocksource_pit); } arch_initcall(init_pit_clocksource); diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 2e2f42074e18..a40d54fc1fdd 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -2068,7 +2068,7 @@ static void __init setup_nmi(void) * cycles as some i82489DX-based boards have glue logic that keeps the * 8259A interrupt line asserted until INTA. --macro */ -static inline void unlock_ExtINT_logic(void) +static inline void __init unlock_ExtINT_logic(void) { int apic, pin, i; struct IO_APIC_route_entry entry0, entry1; @@ -2444,6 +2444,7 @@ void destroy_irq(unsigned int irq) dynamic_irq_cleanup(irq); spin_lock_irqsave(&vector_lock, flags); + clear_bit(irq_vector[irq], used_vectors); irq_vector[irq] = 0; spin_unlock_irqrestore(&vector_lock, flags); } diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 9ba11d07920f..ef1a8dfcc529 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1599,7 +1599,7 @@ static void __init setup_nmi(void) * cycles as some i82489DX-based boards have glue logic that keeps the * 8259A interrupt line asserted until INTA. --macro */ -static inline void unlock_ExtINT_logic(void) +static inline void __init unlock_ExtINT_logic(void) { int apic, pin, i; struct IO_APIC_route_entry entry0, entry1; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 6ea67b76a214..00bda7bcda63 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -134,7 +134,7 @@ unsigned int do_IRQ(struct pt_regs *regs) : "=a" (arg1), "=d" (arg2), "=b" (bx) : "0" (irq), "1" (desc), "2" (isp), "D" (desc->handle_irq) - : "memory", "cc" + : "memory", "cc", "ecx" ); } else #endif diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index 73354302fda7..c03205991718 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -6,23 +6,171 @@ * * This file is released under the GPLv2. */ - #include <linux/debugfs.h> +#include <linux/uaccess.h> #include <linux/stat.h> #include <linux/init.h> +#include <linux/io.h> +#include <linux/mm.h> #include <asm/setup.h> #ifdef CONFIG_DEBUG_BOOT_PARAMS +struct setup_data_node { + u64 paddr; + u32 type; + u32 len; +}; + +static ssize_t +setup_data_read(struct file *file, char __user *user_buf, size_t count, + loff_t *ppos) +{ + struct setup_data_node *node = file->private_data; + unsigned long remain; + loff_t pos = *ppos; + struct page *pg; + void *p; + u64 pa; + + if (pos < 0) + return -EINVAL; + if (pos >= node->len) + return 0; + + if (count > node->len - pos) + count = node->len - pos; + pa = node->paddr + sizeof(struct setup_data) + pos; + pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); + if (PageHighMem(pg)) { + p = ioremap_cache(pa, count); + if (!p) + return -ENXIO; + } else { + p = __va(pa); + } + + remain = copy_to_user(user_buf, p, count); + + if (PageHighMem(pg)) + iounmap(p); + + if (remain) + return -EFAULT; + + *ppos = pos + count; + + return count; +} + +static int setup_data_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +static const struct file_operations fops_setup_data = { + .read = setup_data_read, + .open = setup_data_open, +}; + +static int __init +create_setup_data_node(struct dentry *parent, int no, + struct setup_data_node *node) +{ + struct dentry *d, *type, *data; + char buf[16]; + int error; + + sprintf(buf, "%d", no); + d = debugfs_create_dir(buf, parent); + if (!d) { + error = -ENOMEM; + goto err_return; + } + type = debugfs_create_x32("type", S_IRUGO, d, &node->type); + if (!type) { + error = -ENOMEM; + goto err_dir; + } + data = debugfs_create_file("data", S_IRUGO, d, node, &fops_setup_data); + if (!data) { + error = -ENOMEM; + goto err_type; + } + return 0; + +err_type: + debugfs_remove(type); +err_dir: + debugfs_remove(d); +err_return: + return error; +} + +static int __init create_setup_data_nodes(struct dentry *parent) +{ + struct setup_data_node *node; + struct setup_data *data; + int error, no = 0; + struct dentry *d; + struct page *pg; + u64 pa_data; + + d = debugfs_create_dir("setup_data", parent); + if (!d) { + error = -ENOMEM; + goto err_return; + } + + pa_data = boot_params.hdr.setup_data; + + while (pa_data) { + node = kmalloc(sizeof(*node), GFP_KERNEL); + if (!node) { + error = -ENOMEM; + goto err_dir; + } + pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); + if (PageHighMem(pg)) { + data = ioremap_cache(pa_data, sizeof(*data)); + if (!data) { + error = -ENXIO; + goto err_dir; + } + } else { + data = __va(pa_data); + } + + node->paddr = pa_data; + node->type = data->type; + node->len = data->len; + error = create_setup_data_node(d, no, node); + pa_data = data->next; + + if (PageHighMem(pg)) + iounmap(data); + if (error) + goto err_dir; + no++; + } + return 0; + +err_dir: + debugfs_remove(d); +err_return: + return error; +} + static struct debugfs_blob_wrapper boot_params_blob = { - .data = &boot_params, - .size = sizeof(boot_params), + .data = &boot_params, + .size = sizeof(boot_params), }; static int __init boot_params_kdebugfs_init(void) { - int error; struct dentry *dbp, *version, *data; + int error; dbp = debugfs_create_dir("boot_params", NULL); if (!dbp) { @@ -41,7 +189,13 @@ static int __init boot_params_kdebugfs_init(void) error = -ENOMEM; goto err_version; } + error = create_setup_data_nodes(dbp); + if (error) + goto err_data; return 0; + +err_data: + debugfs_remove(data); err_version: debugfs_remove(version); err_dir: @@ -61,5 +215,4 @@ static int __init arch_kdebugfs_init(void) return error; } - arch_initcall(arch_kdebugfs_init); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c new file mode 100644 index 000000000000..8b7a3cf37d2b --- /dev/null +++ b/arch/x86/kernel/kvm.c @@ -0,0 +1,248 @@ +/* + * KVM paravirt_ops implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright IBM Corporation, 2007 + * Authors: Anthony Liguori <aliguori@us.ibm.com> + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/kvm_para.h> +#include <linux/cpu.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/hardirq.h> + +#define MMU_QUEUE_SIZE 1024 + +struct kvm_para_state { + u8 mmu_queue[MMU_QUEUE_SIZE]; + int mmu_queue_len; + enum paravirt_lazy_mode mode; +}; + +static DEFINE_PER_CPU(struct kvm_para_state, para_state); + +static struct kvm_para_state *kvm_para_state(void) +{ + return &per_cpu(para_state, raw_smp_processor_id()); +} + +/* + * No need for any "IO delay" on KVM + */ +static void kvm_io_delay(void) +{ +} + +static void kvm_mmu_op(void *buffer, unsigned len) +{ + int r; + unsigned long a1, a2; + + do { + a1 = __pa(buffer); + a2 = 0; /* on i386 __pa() always returns <4G */ + r = kvm_hypercall3(KVM_HC_MMU_OP, len, a1, a2); + buffer += r; + len -= r; + } while (len); +} + +static void mmu_queue_flush(struct kvm_para_state *state) +{ + if (state->mmu_queue_len) { + kvm_mmu_op(state->mmu_queue, state->mmu_queue_len); + state->mmu_queue_len = 0; + } +} + +static void kvm_deferred_mmu_op(void *buffer, int len) +{ + struct kvm_para_state *state = kvm_para_state(); + + if (state->mode != PARAVIRT_LAZY_MMU) { + kvm_mmu_op(buffer, len); + return; + } + if (state->mmu_queue_len + len > sizeof state->mmu_queue) + mmu_queue_flush(state); + memcpy(state->mmu_queue + state->mmu_queue_len, buffer, len); + state->mmu_queue_len += len; +} + +static void kvm_mmu_write(void *dest, u64 val) +{ + __u64 pte_phys; + struct kvm_mmu_op_write_pte wpte; + +#ifdef CONFIG_HIGHPTE + struct page *page; + unsigned long dst = (unsigned long) dest; + + page = kmap_atomic_to_page(dest); + pte_phys = page_to_pfn(page); + pte_phys <<= PAGE_SHIFT; + pte_phys += (dst & ~(PAGE_MASK)); +#else + pte_phys = (unsigned long)__pa(dest); +#endif + wpte.header.op = KVM_MMU_OP_WRITE_PTE; + wpte.pte_val = val; + wpte.pte_phys = pte_phys; + + kvm_deferred_mmu_op(&wpte, sizeof wpte); +} + +/* + * We only need to hook operations that are MMU writes. We hook these so that + * we can use lazy MMU mode to batch these operations. We could probably + * improve the performance of the host code if we used some of the information + * here to simplify processing of batched writes. + */ +static void kvm_set_pte(pte_t *ptep, pte_t pte) +{ + kvm_mmu_write(ptep, pte_val(pte)); +} + +static void kvm_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + kvm_mmu_write(ptep, pte_val(pte)); +} + +static void kvm_set_pmd(pmd_t *pmdp, pmd_t pmd) +{ + kvm_mmu_write(pmdp, pmd_val(pmd)); +} + +#if PAGETABLE_LEVELS >= 3 +#ifdef CONFIG_X86_PAE +static void kvm_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + kvm_mmu_write(ptep, pte_val(pte)); +} + +static void kvm_set_pte_present(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + kvm_mmu_write(ptep, pte_val(pte)); +} + +static void kvm_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + kvm_mmu_write(ptep, 0); +} + +static void kvm_pmd_clear(pmd_t *pmdp) +{ + kvm_mmu_write(pmdp, 0); +} +#endif + +static void kvm_set_pud(pud_t *pudp, pud_t pud) +{ + kvm_mmu_write(pudp, pud_val(pud)); +} + +#if PAGETABLE_LEVELS == 4 +static void kvm_set_pgd(pgd_t *pgdp, pgd_t pgd) +{ + kvm_mmu_write(pgdp, pgd_val(pgd)); +} +#endif +#endif /* PAGETABLE_LEVELS >= 3 */ + +static void kvm_flush_tlb(void) +{ + struct kvm_mmu_op_flush_tlb ftlb = { + .header.op = KVM_MMU_OP_FLUSH_TLB, + }; + + kvm_deferred_mmu_op(&ftlb, sizeof ftlb); +} + +static void kvm_release_pt(u32 pfn) +{ + struct kvm_mmu_op_release_pt rpt = { + .header.op = KVM_MMU_OP_RELEASE_PT, + .pt_phys = (u64)pfn << PAGE_SHIFT, + }; + + kvm_mmu_op(&rpt, sizeof rpt); +} + +static void kvm_enter_lazy_mmu(void) +{ + struct kvm_para_state *state = kvm_para_state(); + + paravirt_enter_lazy_mmu(); + state->mode = paravirt_get_lazy_mode(); +} + +static void kvm_leave_lazy_mmu(void) +{ + struct kvm_para_state *state = kvm_para_state(); + + mmu_queue_flush(state); + paravirt_leave_lazy(paravirt_get_lazy_mode()); + state->mode = paravirt_get_lazy_mode(); +} + +static void paravirt_ops_setup(void) +{ + pv_info.name = "KVM"; + pv_info.paravirt_enabled = 1; + + if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) + pv_cpu_ops.io_delay = kvm_io_delay; + + if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) { + pv_mmu_ops.set_pte = kvm_set_pte; + pv_mmu_ops.set_pte_at = kvm_set_pte_at; + pv_mmu_ops.set_pmd = kvm_set_pmd; +#if PAGETABLE_LEVELS >= 3 +#ifdef CONFIG_X86_PAE + pv_mmu_ops.set_pte_atomic = kvm_set_pte_atomic; + pv_mmu_ops.set_pte_present = kvm_set_pte_present; + pv_mmu_ops.pte_clear = kvm_pte_clear; + pv_mmu_ops.pmd_clear = kvm_pmd_clear; +#endif + pv_mmu_ops.set_pud = kvm_set_pud; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.set_pgd = kvm_set_pgd; +#endif +#endif + pv_mmu_ops.flush_tlb_user = kvm_flush_tlb; + pv_mmu_ops.release_pte = kvm_release_pt; + pv_mmu_ops.release_pmd = kvm_release_pt; + pv_mmu_ops.release_pud = kvm_release_pt; + + pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu; + pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu; + } +} + +void __init kvm_guest_init(void) +{ + if (!kvm_para_available()) + return; + + paravirt_ops_setup(); +} diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c new file mode 100644 index 000000000000..ddee04043aeb --- /dev/null +++ b/arch/x86/kernel/kvmclock.c @@ -0,0 +1,187 @@ +/* KVM paravirtual clock driver. A clocksource implementation + Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include <linux/clocksource.h> +#include <linux/kvm_para.h> +#include <asm/arch_hooks.h> +#include <asm/msr.h> +#include <asm/apic.h> +#include <linux/percpu.h> +#include <asm/reboot.h> + +#define KVM_SCALE 22 + +static int kvmclock = 1; + +static int parse_no_kvmclock(char *arg) +{ + kvmclock = 0; + return 0; +} +early_param("no-kvmclock", parse_no_kvmclock); + +/* The hypervisor will put information about time periodically here */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct kvm_vcpu_time_info, hv_clock); +#define get_clock(cpu, field) per_cpu(hv_clock, cpu).field + +static inline u64 kvm_get_delta(u64 last_tsc) +{ + int cpu = smp_processor_id(); + u64 delta = native_read_tsc() - last_tsc; + return (delta * get_clock(cpu, tsc_to_system_mul)) >> KVM_SCALE; +} + +static struct kvm_wall_clock wall_clock; +static cycle_t kvm_clock_read(void); +/* + * The wallclock is the time of day when we booted. Since then, some time may + * have elapsed since the hypervisor wrote the data. So we try to account for + * that with system time + */ +unsigned long kvm_get_wallclock(void) +{ + u32 wc_sec, wc_nsec; + u64 delta; + struct timespec ts; + int version, nsec; + int low, high; + + low = (int)__pa(&wall_clock); + high = ((u64)__pa(&wall_clock) >> 32); + + delta = kvm_clock_read(); + + native_write_msr(MSR_KVM_WALL_CLOCK, low, high); + do { + version = wall_clock.wc_version; + rmb(); + wc_sec = wall_clock.wc_sec; + wc_nsec = wall_clock.wc_nsec; + rmb(); + } while ((wall_clock.wc_version != version) || (version & 1)); + + delta = kvm_clock_read() - delta; + delta += wc_nsec; + nsec = do_div(delta, NSEC_PER_SEC); + set_normalized_timespec(&ts, wc_sec + delta, nsec); + /* + * Of all mechanisms of time adjustment I've tested, this one + * was the champion! + */ + return ts.tv_sec + 1; +} + +int kvm_set_wallclock(unsigned long now) +{ + return 0; +} + +/* + * This is our read_clock function. The host puts an tsc timestamp each time + * it updates a new time. Without the tsc adjustment, we can have a situation + * in which a vcpu starts to run earlier (smaller system_time), but probes + * time later (compared to another vcpu), leading to backwards time + */ +static cycle_t kvm_clock_read(void) +{ + u64 last_tsc, now; + int cpu; + + preempt_disable(); + cpu = smp_processor_id(); + + last_tsc = get_clock(cpu, tsc_timestamp); + now = get_clock(cpu, system_time); + + now += kvm_get_delta(last_tsc); + preempt_enable(); + + return now; +} +static struct clocksource kvm_clock = { + .name = "kvm-clock", + .read = kvm_clock_read, + .rating = 400, + .mask = CLOCKSOURCE_MASK(64), + .mult = 1 << KVM_SCALE, + .shift = KVM_SCALE, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static int kvm_register_clock(void) +{ + int cpu = smp_processor_id(); + int low, high; + low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; + high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); + + return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); +} + +static void kvm_setup_secondary_clock(void) +{ + /* + * Now that the first cpu already had this clocksource initialized, + * we shouldn't fail. + */ + WARN_ON(kvm_register_clock()); + /* ok, done with our trickery, call native */ + setup_secondary_APIC_clock(); +} + +/* + * After the clock is registered, the host will keep writing to the + * registered memory location. If the guest happens to shutdown, this memory + * won't be valid. In cases like kexec, in which you install a new kernel, this + * means a random memory location will be kept being written. So before any + * kind of shutdown from our side, we unregister the clock by writting anything + * that does not have the 'enable' bit set in the msr + */ +#ifdef CONFIG_KEXEC +static void kvm_crash_shutdown(struct pt_regs *regs) +{ + native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); + native_machine_crash_shutdown(regs); +} +#endif + +static void kvm_shutdown(void) +{ + native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); + native_machine_shutdown(); +} + +void __init kvmclock_init(void) +{ + if (!kvm_para_available()) + return; + + if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { + if (kvm_register_clock()) + return; + pv_time_ops.get_wallclock = kvm_get_wallclock; + pv_time_ops.set_wallclock = kvm_set_wallclock; + pv_time_ops.sched_clock = kvm_clock_read; + pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; + machine_ops.shutdown = kvm_shutdown; +#ifdef CONFIG_KEXEC + machine_ops.crash_shutdown = kvm_crash_shutdown; +#endif + clocksource_register(&kvm_clock); + } +} diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index b402c0f3f192..3cad17fe026b 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -63,7 +63,7 @@ static int __init mfgpt_fix(char *s) /* The following udocumented bit resets the MFGPT timers */ val = 0xFF; dummy = 0; - wrmsr(0x5140002B, val, dummy); + wrmsr(MSR_MFGPT_SETUP, val, dummy); return 1; } __setup("mfgptfix", mfgpt_fix); @@ -127,17 +127,17 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable) * 6; that is, resets for 7 and 8 will be ignored. Is this * a problem? -dilinger */ - msr = MFGPT_NR_MSR; + msr = MSR_MFGPT_NR; mask = 1 << (timer + 24); break; case MFGPT_EVENT_NMI: - msr = MFGPT_NR_MSR; + msr = MSR_MFGPT_NR; mask = 1 << (timer + shift); break; case MFGPT_EVENT_IRQ: - msr = MFGPT_IRQ_MSR; + msr = MSR_MFGPT_IRQ; mask = 1 << (timer + shift); break; @@ -364,7 +364,8 @@ int __init mfgpt_timer_setup(void) geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val); /* Set up the clock event */ - mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC, 32); + mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC, + mfgpt_clockevent.shift); mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &mfgpt_clockevent); mfgpt_clockevent.max_delta_ns = clockevent_delta2ns(0xFFFE, diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 70744e344fa1..3e2c54dc8b29 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -686,13 +686,11 @@ void __init get_smp_config(void) static int __init smp_scan_config(unsigned long base, unsigned long length, unsigned reserve) { - extern void __bad_mpf_size(void); unsigned int *bp = phys_to_virt(base); struct intel_mp_floating *mpf; Dprintk("Scan SMP from %p for %ld bytes.\n", bp, length); - if (sizeof(*mpf) != 16) - __bad_mpf_size(); + BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { mpf = (struct intel_mp_floating *)bp; @@ -801,7 +799,6 @@ void __init find_smp_config(void) #ifdef CONFIG_X86_IO_APIC #define MP_ISA_BUS 0 -#define MP_MAX_IOAPIC_PIN 127 extern struct mp_ioapic_routing mp_ioapic_routing[MAX_IO_APICS]; @@ -820,7 +817,7 @@ static int mp_find_ioapic(int gsi) return -1; } -static u8 uniq_ioapic_id(u8 id) +static u8 __init uniq_ioapic_id(u8 id) { #ifdef CONFIG_X86_32 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && @@ -909,14 +906,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) intsrc.mpc_dstapic = mp_ioapics[ioapic].mpc_apicid; /* APIC ID */ intsrc.mpc_dstirq = pin; /* INTIN# */ - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, %d-%d\n", - intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, intsrc.mpc_dstirq); - - mp_irqs[mp_irq_entries] = intsrc; - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!\n"); + MP_intsrc_info(&intsrc); } int es7000_plat; @@ -985,23 +975,14 @@ void __init mp_config_acpi_legacy_irqs(void) intsrc.mpc_srcbusirq = i; /* Identity mapped */ intsrc.mpc_dstirq = i; - Dprintk("Int: type %d, pol %d, trig %d, bus %d, irq %d, " - "%d-%d\n", intsrc.mpc_irqtype, intsrc.mpc_irqflag & 3, - (intsrc.mpc_irqflag >> 2) & 3, intsrc.mpc_srcbus, - intsrc.mpc_srcbusirq, intsrc.mpc_dstapic, - intsrc.mpc_dstirq); - - mp_irqs[mp_irq_entries] = intsrc; - if (++mp_irq_entries == MAX_IRQ_SOURCES) - panic("Max # of irq sources exceeded!\n"); + MP_intsrc_info(&intsrc); } } int mp_register_gsi(u32 gsi, int triggering, int polarity) { - int ioapic = -1; - int ioapic_pin = 0; - int idx, bit = 0; + int ioapic; + int ioapic_pin; #ifdef CONFIG_X86_32 #define MAX_GSI_NUM 4096 #define IRQ_COMPRESSION_START 64 @@ -1041,15 +1022,13 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) * with redundant pin->gsi mappings (but unique PCI devices); * we only program the IOAPIC on the first. */ - bit = ioapic_pin % 32; - idx = (ioapic_pin < 32) ? 0 : (ioapic_pin / 32); - if (idx > 3) { + if (ioapic_pin > MP_MAX_IOAPIC_PIN) { printk(KERN_ERR "Invalid reference to IOAPIC pin " "%d-%d\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); return gsi; } - if ((1 << bit) & mp_ioapic_routing[ioapic].pin_programmed[idx]) { + if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); #ifdef CONFIG_X86_32 @@ -1059,7 +1038,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) #endif } - mp_ioapic_routing[ioapic].pin_programmed[idx] |= (1 << bit); + set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed); #ifdef CONFIG_X86_32 /* * For GSI >= 64, use IRQ compression diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 3733412d1357..74f0c5ea2a03 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -366,11 +366,13 @@ struct pv_mmu_ops pv_mmu_ops = { .flush_tlb_single = native_flush_tlb_single, .flush_tlb_others = native_flush_tlb_others, - .alloc_pt = paravirt_nop, - .alloc_pd = paravirt_nop, - .alloc_pd_clone = paravirt_nop, - .release_pt = paravirt_nop, - .release_pd = paravirt_nop, + .alloc_pte = paravirt_nop, + .alloc_pmd = paravirt_nop, + .alloc_pmd_clone = paravirt_nop, + .alloc_pud = paravirt_nop, + .release_pte = paravirt_nop, + .release_pmd = paravirt_nop, + .release_pud = paravirt_nop, .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 2edee22e9c30..e28ec497e142 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -43,6 +43,7 @@ #include <asm/system.h> #include <asm/dma.h> #include <asm/rio.h> +#include <asm/bios_ebda.h> #ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT int use_calgary __read_mostly = 1; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 3004d716539d..67e9b4a1e89d 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -4,6 +4,8 @@ #include <linux/smp.h> #include <linux/slab.h> #include <linux/sched.h> +#include <linux/module.h> +#include <linux/pm.h> struct kmem_cache *task_xstate_cachep; @@ -42,3 +44,118 @@ void arch_task_cache_init(void) __alignof__(union thread_xstate), SLAB_PANIC, NULL); } + +static void do_nothing(void *unused) +{ +} + +/* + * cpu_idle_wait - Used to ensure that all the CPUs discard old value of + * pm_idle and update to new pm_idle value. Required while changing pm_idle + * handler on SMP systems. + * + * Caller must have changed pm_idle to the new value before the call. Old + * pm_idle value will not be used by any CPU after the return of this function. + */ +void cpu_idle_wait(void) +{ + smp_mb(); + /* kick all the CPUs so that they exit out of pm_idle */ + smp_call_function(do_nothing, NULL, 0, 1); +} +EXPORT_SYMBOL_GPL(cpu_idle_wait); + +/* + * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, + * which can obviate IPI to trigger checking of need_resched. + * We execute MONITOR against need_resched and enter optimized wait state + * through MWAIT. Whenever someone changes need_resched, we would be woken + * up from MWAIT (without an IPI). + * + * New with Core Duo processors, MWAIT can take some hints based on CPU + * capability. + */ +void mwait_idle_with_hints(unsigned long ax, unsigned long cx) +{ + if (!need_resched()) { + __monitor((void *)¤t_thread_info()->flags, 0, 0); + smp_mb(); + if (!need_resched()) + __mwait(ax, cx); + } +} + +/* Default MONITOR/MWAIT with no hints, used for default C1 state */ +static void mwait_idle(void) +{ + if (!need_resched()) { + __monitor((void *)¤t_thread_info()->flags, 0, 0); + smp_mb(); + if (!need_resched()) + __sti_mwait(0, 0); + else + local_irq_enable(); + } else + local_irq_enable(); +} + + +static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) +{ + if (force_mwait) + return 1; + /* Any C1 states supported? */ + return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0; +} + +/* + * On SMP it's slightly faster (but much more power-consuming!) + * to poll the ->work.need_resched flag instead of waiting for the + * cross-CPU IPI to arrive. Use this option with caution. + */ +static void poll_idle(void) +{ + local_irq_enable(); + cpu_relax(); +} + +void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) +{ + static int selected; + + if (selected) + return; +#ifdef CONFIG_X86_SMP + if (pm_idle == poll_idle && smp_num_siblings > 1) { + printk(KERN_WARNING "WARNING: polling idle and HT enabled," + " performance may degrade.\n"); + } +#endif + if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { + /* + * Skip, if setup has overridden idle. + * One CPU supports mwait => All CPUs supports mwait + */ + if (!pm_idle) { + printk(KERN_INFO "using mwait in idle threads.\n"); + pm_idle = mwait_idle; + } + } + selected = 1; +} + +static int __init idle_setup(char *str) +{ + if (!strcmp(str, "poll")) { + printk("using polling idle threads.\n"); + pm_idle = poll_idle; + } else if (!strcmp(str, "mwait")) + force_mwait = 1; + else + return -1; + + boot_option_idle_override = 1; + return 0; +} +early_param("idle", idle_setup); + diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 77de848bd1fb..f8476dfbb60d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -111,12 +111,10 @@ void default_idle(void) */ smp_mb(); - local_irq_disable(); - if (!need_resched()) { + if (!need_resched()) safe_halt(); /* enables interrupts racelessly */ - local_irq_disable(); - } - local_irq_enable(); + else + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } else { local_irq_enable(); @@ -128,17 +126,6 @@ void default_idle(void) EXPORT_SYMBOL(default_idle); #endif -/* - * On SMP it's slightly faster (but much more power-consuming!) - * to poll the ->work.need_resched flag instead of waiting for the - * cross-CPU IPI to arrive. Use this option with caution. - */ -static void poll_idle(void) -{ - local_irq_enable(); - cpu_relax(); -} - #ifdef CONFIG_HOTPLUG_CPU #include <asm/nmi.h> /* We don't actually take CPU down, just spin without interrupts. */ @@ -196,6 +183,7 @@ void cpu_idle(void) if (cpu_is_offline(cpu)) play_dead(); + local_irq_disable(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; idle(); } @@ -206,104 +194,6 @@ void cpu_idle(void) } } -static void do_nothing(void *unused) -{ -} - -/* - * cpu_idle_wait - Used to ensure that all the CPUs discard old value of - * pm_idle and update to new pm_idle value. Required while changing pm_idle - * handler on SMP systems. - * - * Caller must have changed pm_idle to the new value before the call. Old - * pm_idle value will not be used by any CPU after the return of this function. - */ -void cpu_idle_wait(void) -{ - smp_mb(); - /* kick all the CPUs so that they exit out of pm_idle */ - smp_call_function(do_nothing, NULL, 0, 1); -} -EXPORT_SYMBOL_GPL(cpu_idle_wait); - -/* - * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, - * which can obviate IPI to trigger checking of need_resched. - * We execute MONITOR against need_resched and enter optimized wait state - * through MWAIT. Whenever someone changes need_resched, we would be woken - * up from MWAIT (without an IPI). - * - * New with Core Duo processors, MWAIT can take some hints based on CPU - * capability. - */ -void mwait_idle_with_hints(unsigned long ax, unsigned long cx) -{ - if (!need_resched()) { - __monitor((void *)¤t_thread_info()->flags, 0, 0); - smp_mb(); - if (!need_resched()) - __sti_mwait(ax, cx); - else - local_irq_enable(); - } else - local_irq_enable(); -} - -/* Default MONITOR/MWAIT with no hints, used for default C1 state */ -static void mwait_idle(void) -{ - local_irq_enable(); - mwait_idle_with_hints(0, 0); -} - -static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) -{ - if (force_mwait) - return 1; - /* Any C1 states supported? */ - return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0; -} - -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) -{ - static int selected; - - if (selected) - return; -#ifdef CONFIG_X86_SMP - if (pm_idle == poll_idle && smp_num_siblings > 1) { - printk(KERN_WARNING "WARNING: polling idle and HT enabled," - " performance may degrade.\n"); - } -#endif - if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { - /* - * Skip, if setup has overridden idle. - * One CPU supports mwait => All CPUs supports mwait - */ - if (!pm_idle) { - printk(KERN_INFO "using mwait in idle threads.\n"); - pm_idle = mwait_idle; - } - } - selected = 1; -} - -static int __init idle_setup(char *str) -{ - if (!strcmp(str, "poll")) { - printk("using polling idle threads.\n"); - pm_idle = poll_idle; - } else if (!strcmp(str, "mwait")) - force_mwait = 1; - else - return -1; - - boot_option_idle_override = 1; - return 0; -} -early_param("idle", idle_setup); - void __show_registers(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 131c2ee7ac56..e2319f39988b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -106,26 +106,13 @@ void default_idle(void) * test NEED_RESCHED: */ smp_mb(); - local_irq_disable(); - if (!need_resched()) { + if (!need_resched()) safe_halt(); /* enables interrupts racelessly */ - local_irq_disable(); - } - local_irq_enable(); + else + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } -/* - * On SMP it's slightly faster (but much more power-consuming!) - * to poll the ->need_resched flag instead of waiting for the - * cross-CPU IPI to arrive. Use this option with caution. - */ -static void poll_idle(void) -{ - local_irq_enable(); - cpu_relax(); -} - #ifdef CONFIG_HOTPLUG_CPU DECLARE_PER_CPU(int, cpu_state); @@ -192,110 +179,6 @@ void cpu_idle(void) } } -static void do_nothing(void *unused) -{ -} - -/* - * cpu_idle_wait - Used to ensure that all the CPUs discard old value of - * pm_idle and update to new pm_idle value. Required while changing pm_idle - * handler on SMP systems. - * - * Caller must have changed pm_idle to the new value before the call. Old - * pm_idle value will not be used by any CPU after the return of this function. - */ -void cpu_idle_wait(void) -{ - smp_mb(); - /* kick all the CPUs so that they exit out of pm_idle */ - smp_call_function(do_nothing, NULL, 0, 1); -} -EXPORT_SYMBOL_GPL(cpu_idle_wait); - -/* - * This uses new MONITOR/MWAIT instructions on P4 processors with PNI, - * which can obviate IPI to trigger checking of need_resched. - * We execute MONITOR against need_resched and enter optimized wait state - * through MWAIT. Whenever someone changes need_resched, we would be woken - * up from MWAIT (without an IPI). - * - * New with Core Duo processors, MWAIT can take some hints based on CPU - * capability. - */ -void mwait_idle_with_hints(unsigned long ax, unsigned long cx) -{ - if (!need_resched()) { - __monitor((void *)¤t_thread_info()->flags, 0, 0); - smp_mb(); - if (!need_resched()) - __mwait(ax, cx); - } -} - -/* Default MONITOR/MWAIT with no hints, used for default C1 state */ -static void mwait_idle(void) -{ - if (!need_resched()) { - __monitor((void *)¤t_thread_info()->flags, 0, 0); - smp_mb(); - if (!need_resched()) - __sti_mwait(0, 0); - else - local_irq_enable(); - } else { - local_irq_enable(); - } -} - - -static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) -{ - if (force_mwait) - return 1; - /* Any C1 states supported? */ - return c->cpuid_level >= 5 && ((cpuid_edx(5) >> 4) & 0xf) > 0; -} - -void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) -{ - static int selected; - - if (selected) - return; -#ifdef CONFIG_X86_SMP - if (pm_idle == poll_idle && smp_num_siblings > 1) { - printk(KERN_WARNING "WARNING: polling idle and HT enabled," - " performance may degrade.\n"); - } -#endif - if (cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c)) { - /* - * Skip, if setup has overridden idle. - * One CPU supports mwait => All CPUs supports mwait - */ - if (!pm_idle) { - printk(KERN_INFO "using mwait in idle threads.\n"); - pm_idle = mwait_idle; - } - } - selected = 1; -} - -static int __init idle_setup(char *str) -{ - if (!strcmp(str, "poll")) { - printk("using polling idle threads.\n"); - pm_idle = poll_idle; - } else if (!strcmp(str, "mwait")) - force_mwait = 1; - else - return -1; - - boot_option_idle_override = 1; - return 0; -} -early_param("idle", idle_setup); - /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs * regs) { diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 559c1b027417..fb03ef380f0e 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1207,97 +1207,16 @@ static int genregs32_set(struct task_struct *target, return ret; } -static long ptrace32_siginfo(unsigned request, u32 pid, u32 addr, u32 data) +long compat_arch_ptrace(struct task_struct *child, compat_long_t request, + compat_ulong_t caddr, compat_ulong_t cdata) { - siginfo_t __user *si = compat_alloc_user_space(sizeof(siginfo_t)); - compat_siginfo_t __user *si32 = compat_ptr(data); - siginfo_t ssi; - int ret; - - if (request == PTRACE_SETSIGINFO) { - memset(&ssi, 0, sizeof(siginfo_t)); - ret = copy_siginfo_from_user32(&ssi, si32); - if (ret) - return ret; - if (copy_to_user(si, &ssi, sizeof(siginfo_t))) - return -EFAULT; - } - ret = sys_ptrace(request, pid, addr, (unsigned long)si); - if (ret) - return ret; - if (request == PTRACE_GETSIGINFO) { - if (copy_from_user(&ssi, si, sizeof(siginfo_t))) - return -EFAULT; - ret = copy_siginfo_to_user32(si32, &ssi); - } - return ret; -} - -asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) -{ - struct task_struct *child; - struct pt_regs *childregs; + unsigned long addr = caddr; + unsigned long data = cdata; void __user *datap = compat_ptr(data); int ret; __u32 val; switch (request) { - case PTRACE_TRACEME: - case PTRACE_ATTACH: - case PTRACE_KILL: - case PTRACE_CONT: - case PTRACE_SINGLESTEP: - case PTRACE_SINGLEBLOCK: - case PTRACE_DETACH: - case PTRACE_SYSCALL: - case PTRACE_OLDSETOPTIONS: - case PTRACE_SETOPTIONS: - case PTRACE_SET_THREAD_AREA: - case PTRACE_GET_THREAD_AREA: -#ifdef X86_BTS - case PTRACE_BTS_CONFIG: - case PTRACE_BTS_STATUS: - case PTRACE_BTS_SIZE: - case PTRACE_BTS_GET: - case PTRACE_BTS_CLEAR: - case PTRACE_BTS_DRAIN: -#endif - return sys_ptrace(request, pid, addr, data); - - default: - return -EINVAL; - - case PTRACE_PEEKTEXT: - case PTRACE_PEEKDATA: - case PTRACE_POKEDATA: - case PTRACE_POKETEXT: - case PTRACE_POKEUSR: - case PTRACE_PEEKUSR: - case PTRACE_GETREGS: - case PTRACE_SETREGS: - case PTRACE_SETFPREGS: - case PTRACE_GETFPREGS: - case PTRACE_SETFPXREGS: - case PTRACE_GETFPXREGS: - case PTRACE_GETEVENTMSG: - break; - - case PTRACE_SETSIGINFO: - case PTRACE_GETSIGINFO: - return ptrace32_siginfo(request, pid, addr, data); - } - - child = ptrace_get_task_struct(pid); - if (IS_ERR(child)) - return PTR_ERR(child); - - ret = ptrace_check_attach(child, request == PTRACE_KILL); - if (ret < 0) - goto out; - - childregs = task_pt_regs(child); - - switch (request) { case PTRACE_PEEKUSR: ret = getreg32(child, addr, &val); if (ret == 0) @@ -1343,12 +1262,14 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) sizeof(struct user32_fxsr_struct), datap); + case PTRACE_GET_THREAD_AREA: + case PTRACE_SET_THREAD_AREA: + return arch_ptrace(child, request, addr, data); + default: return compat_ptrace_request(child, request, addr, data); } - out: - put_task_struct(child); return ret; } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 19c9386ac118..a4a838306b2c 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -8,6 +8,7 @@ #include <asm/apic.h> #include <asm/desc.h> #include <asm/hpet.h> +#include <asm/pgtable.h> #include <asm/reboot_fixups.h> #include <asm/reboot.h> @@ -15,7 +16,6 @@ # include <linux/dmi.h> # include <linux/ctype.h> # include <linux/mc146818rtc.h> -# include <asm/pgtable.h> #else # include <asm/iommu.h> #endif @@ -275,7 +275,7 @@ void machine_real_restart(unsigned char *code, int length) /* Remap the kernel at virtual address zero, as well as offset zero from the kernel segment. This assumes the kernel segment starts at virtual address PAGE_OFFSET. */ - memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, + memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS); /* @@ -399,7 +399,7 @@ static void native_machine_emergency_restart(void) } } -static void native_machine_shutdown(void) +void native_machine_shutdown(void) { /* Stop the cpus and apics */ #ifdef CONFIG_SMP @@ -470,7 +470,10 @@ struct machine_ops machine_ops = { .shutdown = native_machine_shutdown, .emergency_restart = native_machine_emergency_restart, .restart = native_machine_restart, - .halt = native_machine_halt + .halt = native_machine_halt, +#ifdef CONFIG_KEXEC + .crash_shutdown = native_machine_crash_shutdown, +#endif }; void machine_power_off(void) @@ -498,3 +501,9 @@ void machine_halt(void) machine_ops.halt(); } +#ifdef CONFIG_KEXEC +void machine_crash_shutdown(struct pt_regs *regs) +{ + machine_ops.crash_shutdown(regs); +} +#endif diff --git a/arch/x86/kernel/setup_32.c b/arch/x86/kernel/setup_32.c index 455d3c80960b..2283422af794 100644 --- a/arch/x86/kernel/setup_32.c +++ b/arch/x86/kernel/setup_32.c @@ -47,6 +47,7 @@ #include <linux/pfn.h> #include <linux/pci.h> #include <linux/init_ohci1394_dma.h> +#include <linux/kvm_para.h> #include <video/edid.h> @@ -389,7 +390,6 @@ unsigned long __init find_max_low_pfn(void) return max_low_pfn; } -#define BIOS_EBDA_SEGMENT 0x40E #define BIOS_LOWMEM_KILOBYTES 0x413 /* @@ -420,8 +420,7 @@ static void __init reserve_ebda_region(void) lowmem <<= 10; /* start of EBDA area */ - ebda_addr = *(unsigned short *)__va(BIOS_EBDA_SEGMENT); - ebda_addr <<= 4; + ebda_addr = get_bios_ebda(); /* Fixup: bios puts an EBDA in the top 64K segment */ /* of conventional memory, but does not adjust lowmem. */ @@ -822,6 +821,10 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = setup_memory(); +#ifdef CONFIG_KVM_CLOCK + kvmclock_init(); +#endif + #ifdef CONFIG_VMI /* * Must be after max_low_pfn is determined, and before kernel @@ -829,6 +832,7 @@ void __init setup_arch(char **cmdline_p) */ vmi_init(); #endif + kvm_guest_init(); /* * NOTE: before this point _nobody_ is allowed to allocate diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index c2ec3dcb6b99..a94fb959a87a 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -42,6 +42,7 @@ #include <linux/ctype.h> #include <linux/uaccess.h> #include <linux/init_ohci1394_dma.h> +#include <linux/kvm_para.h> #include <asm/mtrr.h> #include <asm/uaccess.h> @@ -116,7 +117,7 @@ extern int root_mountflags; char __initdata command_line[COMMAND_LINE_SIZE]; -struct resource standard_io_resources[] = { +static struct resource standard_io_resources[] = { { .name = "dma1", .start = 0x00, .end = 0x1f, .flags = IORESOURCE_BUSY | IORESOURCE_IO }, { .name = "pic1", .start = 0x20, .end = 0x21, @@ -190,6 +191,7 @@ contig_initmem_init(unsigned long start_pfn, unsigned long end_pfn) bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn); e820_register_active_regions(0, start_pfn, end_pfn); free_bootmem_with_active_regions(0, end_pfn); + early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); } #endif @@ -264,6 +266,28 @@ void __attribute__((weak)) __init memory_setup(void) machine_specific_memory_setup(); } +static void __init parse_setup_data(void) +{ + struct setup_data *data; + unsigned long pa_data; + + if (boot_params.hdr.version < 0x0209) + return; + pa_data = boot_params.hdr.setup_data; + while (pa_data) { + data = early_ioremap(pa_data, PAGE_SIZE); + switch (data->type) { + default: + break; + } +#ifndef CONFIG_DEBUG_BOOT_PARAMS + free_early(pa_data, pa_data+sizeof(*data)+data->len); +#endif + pa_data = data->next; + early_iounmap(data, PAGE_SIZE); + } +} + /* * setup_arch - architecture-specific boot-time initializations * @@ -316,6 +340,8 @@ void __init setup_arch(char **cmdline_p) strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; + parse_setup_data(); + parse_early_param(); #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT @@ -359,6 +385,10 @@ void __init setup_arch(char **cmdline_p) io_delay_init(); +#ifdef CONFIG_KVM_CLOCK + kvmclock_init(); +#endif + #ifdef CONFIG_SMP /* setup to use the early static init tables during kernel startup */ x86_cpu_to_apicid_early_ptr = (void *)x86_cpu_to_apicid_init; @@ -397,8 +427,6 @@ void __init setup_arch(char **cmdline_p) contig_initmem_init(0, end_pfn); #endif - early_res_to_bootmem(); - dma32_reserve_bootmem(); #ifdef CONFIG_ACPI_SLEEP @@ -465,6 +493,8 @@ void __init setup_arch(char **cmdline_p) init_apic_mappings(); ioapic_init_mappings(); + kvm_guest_init(); + /* * We trust e820 completely. No explicit ROM probing in memory. */ diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index f1b117930837..8e05e7f7bd40 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -413,16 +413,6 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, regs->ss = __USER_DS; regs->cs = __USER_CS; - /* - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. - */ - regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF); - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); - return 0; give_sigsegv: @@ -501,16 +491,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->ss = __USER_DS; regs->cs = __USER_CS; - /* - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. - */ - regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF); - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); - return 0; give_sigsegv: @@ -566,6 +546,21 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, if (ret) return ret; + /* + * Clear the direction flag as per the ABI for function entry. + */ + regs->flags &= ~X86_EFLAGS_DF; + + /* + * Clear TF when entering the signal handler, but + * notify any tracer that was single-stepping it. + * The tracer may want to single-step inside the + * handler too. + */ + regs->flags &= ~X86_EFLAGS_TF; + if (test_thread_flag(TIF_SINGLESTEP)) + ptrace_notify(SIGTRAP); + spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 827179c5b32a..ccb2a4560c2d 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -285,14 +285,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, even if the handler happens to be interrupting 32-bit code. */ regs->cs = __USER_CS; - /* This, by contrast, has nothing to do with segment registers - - see include/asm-x86_64/uaccess.h for details. */ - set_fs(USER_DS); - - regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF); - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); - return 0; give_sigsegv: @@ -380,6 +372,28 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, ret = setup_rt_frame(sig, ka, info, oldset, regs); if (ret == 0) { + /* + * This has nothing to do with segment registers, + * despite the name. This magic affects uaccess.h + * macros' behavior. Reset it to the normal setting. + */ + set_fs(USER_DS); + + /* + * Clear the direction flag as per the ABI for function entry. + */ + regs->flags &= ~X86_EFLAGS_DF; + + /* + * Clear TF when entering the signal handler, but + * notify any tracer that was single-stepping it. + * The tracer may want to single-step inside the + * handler too. + */ + regs->flags &= ~X86_EFLAGS_TF; + if (test_thread_flag(TIF_SINGLESTEP)) + ptrace_notify(SIGTRAP); + spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ade371f9663a..04c662ba18f1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1039,8 +1039,8 @@ int __cpuinit native_cpu_up(unsigned int cpu) #ifdef CONFIG_X86_32 /* init low mem mapping */ - clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); + clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); flush_tlb_all(); #endif @@ -1058,7 +1058,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) check_tsc_sync_source(cpu); local_irq_restore(flags); - while (!cpu_isset(cpu, cpu_online_map)) { + while (!cpu_online(cpu)) { cpu_relax(); touch_nmi_watchdog(); } @@ -1168,7 +1168,7 @@ static void __init smp_cpu_index_default(void) int i; struct cpuinfo_x86 *c; - for_each_cpu_mask(i, cpu_possible_map) { + for_each_possible_cpu(i) { c = &cpu_data(i); /* mark all to hotplug */ c->cpu_index = NR_CPUS; diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c index 6878a9c2df5d..ae751094eba9 100644 --- a/arch/x86/kernel/summit_32.c +++ b/arch/x86/kernel/summit_32.c @@ -29,6 +29,7 @@ #include <linux/mm.h> #include <linux/init.h> #include <asm/io.h> +#include <asm/bios_ebda.h> #include <asm/mach-summit/mach_mpparse.h> static struct rio_table_hdr *rio_table_hdr __initdata; @@ -140,8 +141,8 @@ void __init setup_summit(void) int i, next_wpeg, next_bus = 0; /* The pointer to the EBDA is stored in the word @ phys 0x40E(40:0E) */ - ptr = *(unsigned short *)phys_to_virt(0x40Eul); - ptr = (unsigned long)phys_to_virt(ptr << 4); + ptr = get_bios_ebda(); + ptr = (unsigned long)phys_to_virt(ptr); rio_table_hdr = NULL; offset = 0x180; diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index df224a8774cb..a1f07d793202 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -195,9 +195,9 @@ static int __cpuinit init_smp_flush(void) { int i; - for_each_cpu_mask(i, cpu_possible_map) { + for_each_possible_cpu(i) spin_lock_init(&per_cpu(flush_state, i).tlbstate_lock); - } + return 0; } core_initcall(init_smp_flush); diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index 64580679861e..d8ccc3c6552f 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S @@ -33,7 +33,7 @@ /* We can free up trampoline after bootup if cpu hotplug is not supported. */ #ifndef CONFIG_HOTPLUG_CPU -.section ".init.data","aw",@progbits +.section ".cpuinit.data","aw",@progbits #else .section .rodata,"a",@progbits #endif diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 471e694d6713..bde6f63e15d5 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -602,7 +602,7 @@ DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) -DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) +DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) void __kprobes do_general_protection(struct pt_regs *regs, long error_code) { diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 12affe1f9bce..956f38927aa7 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -320,7 +320,7 @@ static void check_zeroed_page(u32 pfn, int type, struct page *page) * pdes need to be zeroed. */ if (type & VMI_PAGE_CLONE) - limit = USER_PTRS_PER_PGD; + limit = KERNEL_PGD_BOUNDARY; for (i = 0; i < limit; i++) BUG_ON(ptr[i]); } @@ -392,13 +392,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) } #endif -static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn) +static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) { vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); } -static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn) +static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) { /* * This call comes in very early, before mem_map is setup. @@ -409,20 +409,20 @@ static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn) vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); } -static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) +static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) { vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); vmi_check_page_type(clonepfn, VMI_PAGE_L2); vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); } -static void vmi_release_pt(u32 pfn) +static void vmi_release_pte(u32 pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L1); vmi_set_page_type(pfn, VMI_PAGE_NORMAL); } -static void vmi_release_pd(u32 pfn) +static void vmi_release_pmd(u32 pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L2); vmi_set_page_type(pfn, VMI_PAGE_NORMAL); @@ -871,15 +871,15 @@ static inline int __init activate_vmi(void) vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage); if (vmi_ops.allocate_page) { - pv_mmu_ops.alloc_pt = vmi_allocate_pt; - pv_mmu_ops.alloc_pd = vmi_allocate_pd; - pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone; + pv_mmu_ops.alloc_pte = vmi_allocate_pte; + pv_mmu_ops.alloc_pmd = vmi_allocate_pmd; + pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone; } vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage); if (vmi_ops.release_page) { - pv_mmu_ops.release_pt = vmi_release_pt; - pv_mmu_ops.release_pd = vmi_release_pd; + pv_mmu_ops.release_pte = vmi_release_pte; + pv_mmu_ops.release_pmd = vmi_release_pmd; } /* Set linear is needed in all cases */ diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index b7ab3c335fae..fad3674b06a5 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -209,12 +209,6 @@ SECTIONS EXIT_DATA } -/* vdso blob that is mapped into user space */ - vdso_start = . ; - .vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) } - . = ALIGN(PAGE_SIZE); - vdso_end = .; - #ifdef CONFIG_BLK_DEV_INITRD . = ALIGN(PAGE_SIZE); __initramfs_start = .; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 41962e793c0f..8d45fabc5f3b 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -19,7 +19,7 @@ if VIRTUALIZATION config KVM tristate "Kernel-based Virtual Machine (KVM) support" - depends on HAVE_KVM && EXPERIMENTAL + depends on HAVE_KVM select PREEMPT_NOTIFIERS select ANON_INODES ---help--- @@ -50,6 +50,17 @@ config KVM_AMD Provides support for KVM on AMD processors equipped with the AMD-V (SVM) extensions. +config KVM_TRACE + bool "KVM trace support" + depends on KVM && MARKERS && SYSFS + select RELAY + select DEBUG_FS + default n + ---help--- + This option allows reading a trace of kvm-related events through + relayfs. Note the ABI is not considered stable and will be + modified in future updates. + # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/lguest/Kconfig diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index ffdd0b310784..c97d35c218db 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -3,10 +3,14 @@ # common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) +ifeq ($(CONFIG_KVM_TRACE),y) +common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) +endif EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm -kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o +kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ + i8254.o obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c new file mode 100644 index 000000000000..361e31611276 --- /dev/null +++ b/arch/x86/kvm/i8254.c @@ -0,0 +1,611 @@ +/* + * 8253/8254 interval timer emulation + * + * Copyright (c) 2003-2004 Fabrice Bellard + * Copyright (c) 2006 Intel Corporation + * Copyright (c) 2007 Keir Fraser, XenSource Inc + * Copyright (c) 2008 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * Authors: + * Sheng Yang <sheng.yang@intel.com> + * Based on QEMU and Xen. + */ + +#include <linux/kvm_host.h> + +#include "irq.h" +#include "i8254.h" + +#ifndef CONFIG_X86_64 +#define mod_64(x, y) ((x) - (y) * div64_64(x, y)) +#else +#define mod_64(x, y) ((x) % (y)) +#endif + +#define RW_STATE_LSB 1 +#define RW_STATE_MSB 2 +#define RW_STATE_WORD0 3 +#define RW_STATE_WORD1 4 + +/* Compute with 96 bit intermediate result: (a*b)/c */ +static u64 muldiv64(u64 a, u32 b, u32 c) +{ + union { + u64 ll; + struct { + u32 low, high; + } l; + } u, res; + u64 rl, rh; + + u.ll = a; + rl = (u64)u.l.low * (u64)b; + rh = (u64)u.l.high * (u64)b; + rh += (rl >> 32); + res.l.high = div64_64(rh, c); + res.l.low = div64_64(((mod_64(rh, c) << 32) + (rl & 0xffffffff)), c); + return res.ll; +} + +static void pit_set_gate(struct kvm *kvm, int channel, u32 val) +{ + struct kvm_kpit_channel_state *c = + &kvm->arch.vpit->pit_state.channels[channel]; + + WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + + switch (c->mode) { + default: + case 0: + case 4: + /* XXX: just disable/enable counting */ + break; + case 1: + case 2: + case 3: + case 5: + /* Restart counting on rising edge. */ + if (c->gate < val) + c->count_load_time = ktime_get(); + break; + } + + c->gate = val; +} + +int pit_get_gate(struct kvm *kvm, int channel) +{ + WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + + return kvm->arch.vpit->pit_state.channels[channel].gate; +} + +static int pit_get_count(struct kvm *kvm, int channel) +{ + struct kvm_kpit_channel_state *c = + &kvm->arch.vpit->pit_state.channels[channel]; + s64 d, t; + int counter; + + WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + + t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); + d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); + + switch (c->mode) { + case 0: + case 1: + case 4: + case 5: + counter = (c->count - d) & 0xffff; + break; + case 3: + /* XXX: may be incorrect for odd counts */ + counter = c->count - (mod_64((2 * d), c->count)); + break; + default: + counter = c->count - mod_64(d, c->count); + break; + } + return counter; +} + +static int pit_get_out(struct kvm *kvm, int channel) +{ + struct kvm_kpit_channel_state *c = + &kvm->arch.vpit->pit_state.channels[channel]; + s64 d, t; + int out; + + WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + + t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time)); + d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC); + + switch (c->mode) { + default: + case 0: + out = (d >= c->count); + break; + case 1: + out = (d < c->count); + break; + case 2: + out = ((mod_64(d, c->count) == 0) && (d != 0)); + break; + case 3: + out = (mod_64(d, c->count) < ((c->count + 1) >> 1)); + break; + case 4: + case 5: + out = (d == c->count); + break; + } + + return out; +} + +static void pit_latch_count(struct kvm *kvm, int channel) +{ + struct kvm_kpit_channel_state *c = + &kvm->arch.vpit->pit_state.channels[channel]; + + WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + + if (!c->count_latched) { + c->latched_count = pit_get_count(kvm, channel); + c->count_latched = c->rw_mode; + } +} + +static void pit_latch_status(struct kvm *kvm, int channel) +{ + struct kvm_kpit_channel_state *c = + &kvm->arch.vpit->pit_state.channels[channel]; + + WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); + + if (!c->status_latched) { + /* TODO: Return NULL COUNT (bit 6). */ + c->status = ((pit_get_out(kvm, channel) << 7) | + (c->rw_mode << 4) | + (c->mode << 1) | + c->bcd); + c->status_latched = 1; + } +} + +int __pit_timer_fn(struct kvm_kpit_state *ps) +{ + struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0]; + struct kvm_kpit_timer *pt = &ps->pit_timer; + + atomic_inc(&pt->pending); + smp_mb__after_atomic_inc(); + /* FIXME: handle case where the guest is in guest mode */ + if (vcpu0 && waitqueue_active(&vcpu0->wq)) { + vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; + wake_up_interruptible(&vcpu0->wq); + } + + pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); + pt->scheduled = ktime_to_ns(pt->timer.expires); + + return (pt->period == 0 ? 0 : 1); +} + +int pit_has_pending_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_pit *pit = vcpu->kvm->arch.vpit; + + if (pit && vcpu->vcpu_id == 0) + return atomic_read(&pit->pit_state.pit_timer.pending); + + return 0; +} + +static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) +{ + struct kvm_kpit_state *ps; + int restart_timer = 0; + + ps = container_of(data, struct kvm_kpit_state, pit_timer.timer); + + restart_timer = __pit_timer_fn(ps); + + if (restart_timer) + return HRTIMER_RESTART; + else + return HRTIMER_NORESTART; +} + +static void destroy_pit_timer(struct kvm_kpit_timer *pt) +{ + pr_debug("pit: execute del timer!\n"); + hrtimer_cancel(&pt->timer); +} + +static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) +{ + s64 interval; + + interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); + + pr_debug("pit: create pit timer, interval is %llu nsec\n", interval); + + /* TODO The new value only affected after the retriggered */ + hrtimer_cancel(&pt->timer); + pt->period = (is_period == 0) ? 0 : interval; + pt->timer.function = pit_timer_fn; + atomic_set(&pt->pending, 0); + + hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), + HRTIMER_MODE_ABS); +} + +static void pit_load_count(struct kvm *kvm, int channel, u32 val) +{ + struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; + + WARN_ON(!mutex_is_locked(&ps->lock)); + + pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); + + /* + * Though spec said the state of 8254 is undefined after power-up, + * seems some tricky OS like Windows XP depends on IRQ0 interrupt + * when booting up. + * So here setting initialize rate for it, and not a specific number + */ + if (val == 0) + val = 0x10000; + + ps->channels[channel].count_load_time = ktime_get(); + ps->channels[channel].count = val; + + if (channel != 0) + return; + + /* Two types of timer + * mode 1 is one shot, mode 2 is period, otherwise del timer */ + switch (ps->channels[0].mode) { + case 1: + create_pit_timer(&ps->pit_timer, val, 0); + break; + case 2: + create_pit_timer(&ps->pit_timer, val, 1); + break; + default: + destroy_pit_timer(&ps->pit_timer); + } +} + +void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val) +{ + mutex_lock(&kvm->arch.vpit->pit_state.lock); + pit_load_count(kvm, channel, val); + mutex_unlock(&kvm->arch.vpit->pit_state.lock); +} + +static void pit_ioport_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *data) +{ + struct kvm_pit *pit = (struct kvm_pit *)this->private; + struct kvm_kpit_state *pit_state = &pit->pit_state; + struct kvm *kvm = pit->kvm; + int channel, access; + struct kvm_kpit_channel_state *s; + u32 val = *(u32 *) data; + + val &= 0xff; + addr &= KVM_PIT_CHANNEL_MASK; + + mutex_lock(&pit_state->lock); + + if (val != 0) + pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n", + (unsigned int)addr, len, val); + + if (addr == 3) { + channel = val >> 6; + if (channel == 3) { + /* Read-Back Command. */ + for (channel = 0; channel < 3; channel++) { + s = &pit_state->channels[channel]; + if (val & (2 << channel)) { + if (!(val & 0x20)) + pit_latch_count(kvm, channel); + if (!(val & 0x10)) + pit_latch_status(kvm, channel); + } + } + } else { + /* Select Counter <channel>. */ + s = &pit_state->channels[channel]; + access = (val >> 4) & KVM_PIT_CHANNEL_MASK; + if (access == 0) { + pit_latch_count(kvm, channel); + } else { + s->rw_mode = access; + s->read_state = access; + s->write_state = access; + s->mode = (val >> 1) & 7; + if (s->mode > 5) + s->mode -= 4; + s->bcd = val & 1; + } + } + } else { + /* Write Count. */ + s = &pit_state->channels[addr]; + switch (s->write_state) { + default: + case RW_STATE_LSB: + pit_load_count(kvm, addr, val); + break; + case RW_STATE_MSB: + pit_load_count(kvm, addr, val << 8); + break; + case RW_STATE_WORD0: + s->write_latch = val; + s->write_state = RW_STATE_WORD1; + break; + case RW_STATE_WORD1: + pit_load_count(kvm, addr, s->write_latch | (val << 8)); + s->write_state = RW_STATE_WORD0; + break; + } + } + + mutex_unlock(&pit_state->lock); +} + +static void pit_ioport_read(struct kvm_io_device *this, + gpa_t addr, int len, void *data) +{ + struct kvm_pit *pit = (struct kvm_pit *)this->private; + struct kvm_kpit_state *pit_state = &pit->pit_state; + struct kvm *kvm = pit->kvm; + int ret, count; + struct kvm_kpit_channel_state *s; + + addr &= KVM_PIT_CHANNEL_MASK; + s = &pit_state->channels[addr]; + + mutex_lock(&pit_state->lock); + + if (s->status_latched) { + s->status_latched = 0; + ret = s->status; + } else if (s->count_latched) { + switch (s->count_latched) { + default: + case RW_STATE_LSB: + ret = s->latched_count & 0xff; + s->count_latched = 0; + break; + case RW_STATE_MSB: + ret = s->latched_count >> 8; + s->count_latched = 0; + break; + case RW_STATE_WORD0: + ret = s->latched_count & 0xff; + s->count_latched = RW_STATE_MSB; + break; + } + } else { + switch (s->read_state) { + default: + case RW_STATE_LSB: + count = pit_get_count(kvm, addr); + ret = count & 0xff; + break; + case RW_STATE_MSB: + count = pit_get_count(kvm, addr); + ret = (count >> 8) & 0xff; + break; + case RW_STATE_WORD0: + count = pit_get_count(kvm, addr); + ret = count & 0xff; + s->read_state = RW_STATE_WORD1; + break; + case RW_STATE_WORD1: + count = pit_get_count(kvm, addr); + ret = (count >> 8) & 0xff; + s->read_state = RW_STATE_WORD0; + break; + } + } + + if (len > sizeof(ret)) + len = sizeof(ret); + memcpy(data, (char *)&ret, len); + + mutex_unlock(&pit_state->lock); +} + +static int pit_in_range(struct kvm_io_device *this, gpa_t addr) +{ + return ((addr >= KVM_PIT_BASE_ADDRESS) && + (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); +} + +static void speaker_ioport_write(struct kvm_io_device *this, + gpa_t addr, int len, const void *data) +{ + struct kvm_pit *pit = (struct kvm_pit *)this->private; + struct kvm_kpit_state *pit_state = &pit->pit_state; + struct kvm *kvm = pit->kvm; + u32 val = *(u32 *) data; + + mutex_lock(&pit_state->lock); + pit_state->speaker_data_on = (val >> 1) & 1; + pit_set_gate(kvm, 2, val & 1); + mutex_unlock(&pit_state->lock); +} + +static void speaker_ioport_read(struct kvm_io_device *this, + gpa_t addr, int len, void *data) +{ + struct kvm_pit *pit = (struct kvm_pit *)this->private; + struct kvm_kpit_state *pit_state = &pit->pit_state; + struct kvm *kvm = pit->kvm; + unsigned int refresh_clock; + int ret; + + /* Refresh clock toggles at about 15us. We approximate as 2^14ns. */ + refresh_clock = ((unsigned int)ktime_to_ns(ktime_get()) >> 14) & 1; + + mutex_lock(&pit_state->lock); + ret = ((pit_state->speaker_data_on << 1) | pit_get_gate(kvm, 2) | + (pit_get_out(kvm, 2) << 5) | (refresh_clock << 4)); + if (len > sizeof(ret)) + len = sizeof(ret); + memcpy(data, (char *)&ret, len); + mutex_unlock(&pit_state->lock); +} + +static int speaker_in_range(struct kvm_io_device *this, gpa_t addr) +{ + return (addr == KVM_SPEAKER_BASE_ADDRESS); +} + +void kvm_pit_reset(struct kvm_pit *pit) +{ + int i; + struct kvm_kpit_channel_state *c; + + mutex_lock(&pit->pit_state.lock); + for (i = 0; i < 3; i++) { + c = &pit->pit_state.channels[i]; + c->mode = 0xff; + c->gate = (i != 2); + pit_load_count(pit->kvm, i, 0); + } + mutex_unlock(&pit->pit_state.lock); + + atomic_set(&pit->pit_state.pit_timer.pending, 0); + pit->pit_state.inject_pending = 1; +} + +struct kvm_pit *kvm_create_pit(struct kvm *kvm) +{ + struct kvm_pit *pit; + struct kvm_kpit_state *pit_state; + + pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL); + if (!pit) + return NULL; + + mutex_init(&pit->pit_state.lock); + mutex_lock(&pit->pit_state.lock); + + /* Initialize PIO device */ + pit->dev.read = pit_ioport_read; + pit->dev.write = pit_ioport_write; + pit->dev.in_range = pit_in_range; + pit->dev.private = pit; + kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); + + pit->speaker_dev.read = speaker_ioport_read; + pit->speaker_dev.write = speaker_ioport_write; + pit->speaker_dev.in_range = speaker_in_range; + pit->speaker_dev.private = pit; + kvm_io_bus_register_dev(&kvm->pio_bus, &pit->speaker_dev); + + kvm->arch.vpit = pit; + pit->kvm = kvm; + + pit_state = &pit->pit_state; + pit_state->pit = pit; + hrtimer_init(&pit_state->pit_timer.timer, + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + mutex_unlock(&pit->pit_state.lock); + + kvm_pit_reset(pit); + + return pit; +} + +void kvm_free_pit(struct kvm *kvm) +{ + struct hrtimer *timer; + + if (kvm->arch.vpit) { + mutex_lock(&kvm->arch.vpit->pit_state.lock); + timer = &kvm->arch.vpit->pit_state.pit_timer.timer; + hrtimer_cancel(timer); + mutex_unlock(&kvm->arch.vpit->pit_state.lock); + kfree(kvm->arch.vpit); + } +} + +void __inject_pit_timer_intr(struct kvm *kvm) +{ + mutex_lock(&kvm->lock); + kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); + kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0); + kvm_pic_set_irq(pic_irqchip(kvm), 0, 1); + kvm_pic_set_irq(pic_irqchip(kvm), 0, 0); + mutex_unlock(&kvm->lock); +} + +void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) +{ + struct kvm_pit *pit = vcpu->kvm->arch.vpit; + struct kvm *kvm = vcpu->kvm; + struct kvm_kpit_state *ps; + + if (vcpu && pit) { + ps = &pit->pit_state; + + /* Try to inject pending interrupts when: + * 1. Pending exists + * 2. Last interrupt was accepted or waited for too long time*/ + if (atomic_read(&ps->pit_timer.pending) && + (ps->inject_pending || + (jiffies - ps->last_injected_time + >= KVM_MAX_PIT_INTR_INTERVAL))) { + ps->inject_pending = 0; + __inject_pit_timer_intr(kvm); + ps->last_injected_time = jiffies; + } + } +} + +void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec) +{ + struct kvm_arch *arch = &vcpu->kvm->arch; + struct kvm_kpit_state *ps; + + if (vcpu && arch->vpit) { + ps = &arch->vpit->pit_state; + if (atomic_read(&ps->pit_timer.pending) && + (((arch->vpic->pics[0].imr & 1) == 0 && + arch->vpic->pics[0].irq_base == vec) || + (arch->vioapic->redirtbl[0].fields.vector == vec && + arch->vioapic->redirtbl[0].fields.mask != 1))) { + ps->inject_pending = 1; + atomic_dec(&ps->pit_timer.pending); + ps->channels[0].count_load_time = ktime_get(); + } + } +} diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h new file mode 100644 index 000000000000..db25c2a6c8c4 --- /dev/null +++ b/arch/x86/kvm/i8254.h @@ -0,0 +1,63 @@ +#ifndef __I8254_H +#define __I8254_H + +#include "iodev.h" + +struct kvm_kpit_timer { + struct hrtimer timer; + int irq; + s64 period; /* unit: ns */ + s64 scheduled; + ktime_t last_update; + atomic_t pending; +}; + +struct kvm_kpit_channel_state { + u32 count; /* can be 65536 */ + u16 latched_count; + u8 count_latched; + u8 status_latched; + u8 status; + u8 read_state; + u8 write_state; + u8 write_latch; + u8 rw_mode; + u8 mode; + u8 bcd; /* not supported */ + u8 gate; /* timer start */ + ktime_t count_load_time; +}; + +struct kvm_kpit_state { + struct kvm_kpit_channel_state channels[3]; + struct kvm_kpit_timer pit_timer; + u32 speaker_data_on; + struct mutex lock; + struct kvm_pit *pit; + bool inject_pending; /* if inject pending interrupts */ + unsigned long last_injected_time; +}; + +struct kvm_pit { + unsigned long base_addresss; + struct kvm_io_device dev; + struct kvm_io_device speaker_dev; + struct kvm *kvm; + struct kvm_kpit_state pit_state; +}; + +#define KVM_PIT_BASE_ADDRESS 0x40 +#define KVM_SPEAKER_BASE_ADDRESS 0x61 +#define KVM_PIT_MEM_LENGTH 4 +#define KVM_PIT_FREQ 1193181 +#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 +#define KVM_PIT_CHANNEL_MASK 0x3 + +void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); +void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec); +void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); +struct kvm_pit *kvm_create_pit(struct kvm *kvm); +void kvm_free_pit(struct kvm *kvm); +void kvm_pit_reset(struct kvm_pit *pit); + +#endif diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index e5714759e97f..ce1f583459b1 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -23,6 +23,22 @@ #include <linux/kvm_host.h> #include "irq.h" +#include "i8254.h" + +/* + * check if there are pending timer events + * to be processed. + */ +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) +{ + int ret; + + ret = pit_has_pending_timer(vcpu); + ret |= apic_has_pending_timer(vcpu); + + return ret; +} +EXPORT_SYMBOL(kvm_cpu_has_pending_timer); /* * check if there is pending interrupt without @@ -66,6 +82,7 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { kvm_inject_apic_timer_irqs(vcpu); + kvm_inject_pit_timer_irqs(vcpu); /* TODO: PIT, RTC etc. */ } EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); @@ -73,6 +90,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) { kvm_apic_timer_intr_post(vcpu, vec); + kvm_pit_timer_intr_post(vcpu, vec); /* TODO: PIT, RTC etc. */ } EXPORT_SYMBOL_GPL(kvm_timer_intr_post); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index fa5ed5d59b5d..1802134b836f 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -85,4 +85,7 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); +int pit_has_pending_timer(struct kvm_vcpu *vcpu); +int apic_has_pending_timer(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h index ecdfe97e4635..65ef0fc2c036 100644 --- a/arch/x86/kvm/kvm_svm.h +++ b/arch/x86/kvm/kvm_svm.h @@ -39,6 +39,8 @@ struct vcpu_svm { unsigned long host_db_regs[NUM_DB_REGS]; unsigned long host_dr6; unsigned long host_dr7; + + u32 *msrpm; }; #endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 68a6b1511934..57ac4e4c556a 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -338,10 +338,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } else apic_clear_vector(vector, apic->regs + APIC_TMR); - if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) kvm_vcpu_kick(vcpu); - else if (vcpu->arch.mp_state == VCPU_MP_STATE_HALTED) { - vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; + else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) { + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); } @@ -362,11 +362,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_INIT: if (level) { - if (vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE) + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) printk(KERN_DEBUG "INIT on a runnable vcpu %d\n", vcpu->vcpu_id); - vcpu->arch.mp_state = VCPU_MP_STATE_INIT_RECEIVED; + vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; kvm_vcpu_kick(vcpu); } else { printk(KERN_DEBUG @@ -379,9 +379,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_STARTUP: printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", vcpu->vcpu_id, vector); - if (vcpu->arch.mp_state == VCPU_MP_STATE_INIT_RECEIVED) { + if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { vcpu->arch.sipi_vector = vector; - vcpu->arch.mp_state = VCPU_MP_STATE_SIPI_RECEIVED; + vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; if (waitqueue_active(&vcpu->wq)) wake_up_interruptible(&vcpu->wq); } @@ -658,7 +658,7 @@ static void start_apic_timer(struct kvm_lapic *apic) apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" PRIx64 ", " "timer initial count 0x%x, period %lldns, " - "expire @ 0x%016" PRIx64 ".\n", __FUNCTION__, + "expire @ 0x%016" PRIx64 ".\n", __func__, APIC_BUS_CYCLE_NS, ktime_to_ns(now), apic_get_reg(apic, APIC_TMICT), apic->timer.period, @@ -691,7 +691,7 @@ static void apic_mmio_write(struct kvm_io_device *this, /* too common printing */ if (offset != APIC_EOI) apic_debug("%s: offset 0x%x with length 0x%x, and value is " - "0x%x\n", __FUNCTION__, offset, len, val); + "0x%x\n", __func__, offset, len, val); offset &= 0xff0; @@ -822,6 +822,7 @@ void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) apic_set_tpr(apic, ((cr8 & 0x0f) << 4) | (apic_get_reg(apic, APIC_TASKPRI) & 4)); } +EXPORT_SYMBOL_GPL(kvm_lapic_set_tpr); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu) { @@ -869,7 +870,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) struct kvm_lapic *apic; int i; - apic_debug("%s\n", __FUNCTION__); + apic_debug("%s\n", __func__); ASSERT(vcpu); apic = vcpu->arch.apic; @@ -907,7 +908,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) apic_update_ppr(apic); apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" - "0x%016" PRIx64 ", base_address=0x%0lx.\n", __FUNCTION__, + "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, vcpu, kvm_apic_id(apic), vcpu->arch.apic_base, apic->base_address); } @@ -940,7 +941,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic) atomic_inc(&apic->timer.pending); if (waitqueue_active(q)) { - apic->vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; + apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; wake_up_interruptible(q); } if (apic_lvtt_period(apic)) { @@ -952,6 +953,16 @@ static int __apic_timer_fn(struct kvm_lapic *apic) return result; } +int apic_has_pending_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *lapic = vcpu->arch.apic; + + if (lapic) + return atomic_read(&lapic->timer.pending); + + return 0; +} + static int __inject_apic_timer_irq(struct kvm_lapic *apic) { int vector; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e55af12e11b7..2ad6f5481671 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -27,11 +27,22 @@ #include <linux/highmem.h> #include <linux/module.h> #include <linux/swap.h> +#include <linux/hugetlb.h> +#include <linux/compiler.h> #include <asm/page.h> #include <asm/cmpxchg.h> #include <asm/io.h> +/* + * When setting this variable to true it enables Two-Dimensional-Paging + * where the hardware walks 2 page tables: + * 1. the guest-virtual to guest-physical + * 2. while doing 1. it walks guest-physical to host-physical + * If the hardware supports that we don't need to do shadow paging. + */ +bool tdp_enabled = false; + #undef MMU_DEBUG #undef AUDIT @@ -101,8 +112,6 @@ static int dbg = 1; #define PT_FIRST_AVAIL_BITS_SHIFT 9 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 -#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT) - #define VALID_PAGE(x) ((x) != INVALID_PAGE) #define PT64_LEVEL_BITS 9 @@ -159,6 +168,13 @@ static int dbg = 1; #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +struct kvm_pv_mmu_op_buffer { + void *ptr; + unsigned len; + unsigned processed; + char buf[512] __aligned(sizeof(long)); +}; + struct kvm_rmap_desc { u64 *shadow_ptes[RMAP_EXT]; struct kvm_rmap_desc *more; @@ -200,11 +216,15 @@ static int is_present_pte(unsigned long pte) static int is_shadow_present_pte(u64 pte) { - pte &= ~PT_SHADOW_IO_MARK; return pte != shadow_trap_nonpresent_pte && pte != shadow_notrap_nonpresent_pte; } +static int is_large_pte(u64 pte) +{ + return pte & PT_PAGE_SIZE_MASK; +} + static int is_writeble_pte(unsigned long pte) { return pte & PT_WRITABLE_MASK; @@ -215,14 +235,14 @@ static int is_dirty_pte(unsigned long pte) return pte & PT_DIRTY_MASK; } -static int is_io_pte(unsigned long pte) +static int is_rmap_pte(u64 pte) { - return pte & PT_SHADOW_IO_MARK; + return is_shadow_present_pte(pte); } -static int is_rmap_pte(u64 pte) +static pfn_t spte_to_pfn(u64 pte) { - return is_shadow_present_pte(pte); + return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; } static gfn_t pse36_gfn_delta(u32 gpte) @@ -349,16 +369,100 @@ static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) } /* + * Return the pointer to the largepage write count for a given + * gfn, handling slots that are not large page aligned. + */ +static int *slot_largepage_idx(gfn_t gfn, struct kvm_memory_slot *slot) +{ + unsigned long idx; + + idx = (gfn / KVM_PAGES_PER_HPAGE) - + (slot->base_gfn / KVM_PAGES_PER_HPAGE); + return &slot->lpage_info[idx].write_count; +} + +static void account_shadowed(struct kvm *kvm, gfn_t gfn) +{ + int *write_count; + + write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); + *write_count += 1; + WARN_ON(*write_count > KVM_PAGES_PER_HPAGE); +} + +static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) +{ + int *write_count; + + write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn)); + *write_count -= 1; + WARN_ON(*write_count < 0); +} + +static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn) +{ + struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn); + int *largepage_idx; + + if (slot) { + largepage_idx = slot_largepage_idx(gfn, slot); + return *largepage_idx; + } + + return 1; +} + +static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) +{ + struct vm_area_struct *vma; + unsigned long addr; + + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return 0; + + vma = find_vma(current->mm, addr); + if (vma && is_vm_hugetlb_page(vma)) + return 1; + + return 0; +} + +static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) +{ + struct kvm_memory_slot *slot; + + if (has_wrprotected_page(vcpu->kvm, large_gfn)) + return 0; + + if (!host_largepage_backed(vcpu->kvm, large_gfn)) + return 0; + + slot = gfn_to_memslot(vcpu->kvm, large_gfn); + if (slot && slot->dirty_bitmap) + return 0; + + return 1; +} + +/* * Take gfn and return the reverse mapping to it. * Note: gfn must be unaliased before this function get called */ -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn) +static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int lpage) { struct kvm_memory_slot *slot; + unsigned long idx; slot = gfn_to_memslot(kvm, gfn); - return &slot->rmap[gfn - slot->base_gfn]; + if (!lpage) + return &slot->rmap[gfn - slot->base_gfn]; + + idx = (gfn / KVM_PAGES_PER_HPAGE) - + (slot->base_gfn / KVM_PAGES_PER_HPAGE); + + return &slot->lpage_info[idx].rmap_pde; } /* @@ -370,7 +474,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn) * If rmapp bit zero is one, (then rmap & ~1) points to a struct kvm_rmap_desc * containing more mappings. */ -static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) +static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn, int lpage) { struct kvm_mmu_page *sp; struct kvm_rmap_desc *desc; @@ -382,7 +486,7 @@ static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) gfn = unalias_gfn(vcpu->kvm, gfn); sp = page_header(__pa(spte)); sp->gfns[spte - sp->spt] = gfn; - rmapp = gfn_to_rmap(vcpu->kvm, gfn); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, lpage); if (!*rmapp) { rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); *rmapp = (unsigned long)spte; @@ -435,20 +539,21 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) struct kvm_rmap_desc *desc; struct kvm_rmap_desc *prev_desc; struct kvm_mmu_page *sp; - struct page *page; + pfn_t pfn; unsigned long *rmapp; int i; if (!is_rmap_pte(*spte)) return; sp = page_header(__pa(spte)); - page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); - mark_page_accessed(page); + pfn = spte_to_pfn(*spte); + if (*spte & PT_ACCESSED_MASK) + kvm_set_pfn_accessed(pfn); if (is_writeble_pte(*spte)) - kvm_release_page_dirty(page); + kvm_release_pfn_dirty(pfn); else - kvm_release_page_clean(page); - rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt]); + kvm_release_pfn_clean(pfn); + rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], is_large_pte(*spte)); if (!*rmapp) { printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); BUG(); @@ -514,7 +619,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) int write_protected = 0; gfn = unalias_gfn(kvm, gfn); - rmapp = gfn_to_rmap(kvm, gfn); + rmapp = gfn_to_rmap(kvm, gfn, 0); spte = rmap_next(kvm, rmapp, NULL); while (spte) { @@ -527,8 +632,35 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) } spte = rmap_next(kvm, rmapp, spte); } + if (write_protected) { + pfn_t pfn; + + spte = rmap_next(kvm, rmapp, NULL); + pfn = spte_to_pfn(*spte); + kvm_set_pfn_dirty(pfn); + } + + /* check for huge page mappings */ + rmapp = gfn_to_rmap(kvm, gfn, 1); + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!spte); + BUG_ON(!(*spte & PT_PRESENT_MASK)); + BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); + pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); + if (is_writeble_pte(*spte)) { + rmap_remove(kvm, spte); + --kvm->stat.lpages; + set_shadow_pte(spte, shadow_trap_nonpresent_pte); + write_protected = 1; + } + spte = rmap_next(kvm, rmapp, spte); + } + if (write_protected) kvm_flush_remote_tlbs(kvm); + + account_shadowed(kvm, gfn); } #ifdef MMU_DEBUG @@ -538,8 +670,8 @@ static int is_empty_shadow_page(u64 *spt) u64 *end; for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++) - if ((*pos & ~PT_SHADOW_IO_MARK) != shadow_trap_nonpresent_pte) { - printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__, + if (*pos != shadow_trap_nonpresent_pte) { + printk(KERN_ERR "%s: %p %llx\n", __func__, pos, *pos); return 0; } @@ -559,7 +691,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) static unsigned kvm_page_table_hashfn(gfn_t gfn) { - return gfn; + return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1); } static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, @@ -662,13 +794,14 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) struct kvm_mmu_page *sp; struct hlist_node *node; - pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + pgprintk("%s: looking for gfn %lx\n", __func__, gfn); + index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && !sp->role.metaphysical) { + if (sp->gfn == gfn && !sp->role.metaphysical + && !sp->role.invalid) { pgprintk("%s: found role %x\n", - __FUNCTION__, sp->role.word); + __func__, sp->role.word); return sp; } return NULL; @@ -699,27 +832,27 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__, + pgprintk("%s: looking gfn %lx role %x\n", __func__, gfn, role.word); - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry(sp, node, bucket, hash_link) if (sp->gfn == gfn && sp->role.word == role.word) { mmu_page_add_parent_pte(vcpu, sp, parent_pte); - pgprintk("%s: found\n", __FUNCTION__); + pgprintk("%s: found\n", __func__); return sp; } ++vcpu->kvm->stat.mmu_cache_miss; sp = kvm_mmu_alloc_page(vcpu, parent_pte); if (!sp) return sp; - pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word); + pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word); sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, bucket); - vcpu->arch.mmu.prefetch_page(vcpu, sp); if (!metaphysical) rmap_write_protect(vcpu->kvm, gfn); + vcpu->arch.mmu.prefetch_page(vcpu, sp); return sp; } @@ -745,11 +878,17 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { ent = pt[i]; + if (is_shadow_present_pte(ent)) { + if (!is_large_pte(ent)) { + ent &= PT64_BASE_ADDR_MASK; + mmu_page_remove_parent_pte(page_header(ent), + &pt[i]); + } else { + --kvm->stat.lpages; + rmap_remove(kvm, &pt[i]); + } + } pt[i] = shadow_trap_nonpresent_pte; - if (!is_shadow_present_pte(ent)) - continue; - ent &= PT64_BASE_ADDR_MASK; - mmu_page_remove_parent_pte(page_header(ent), &pt[i]); } kvm_flush_remote_tlbs(kvm); } @@ -789,10 +928,15 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) } kvm_mmu_page_unlink_children(kvm, sp); if (!sp->root_count) { + if (!sp->role.metaphysical) + unaccount_shadowed(kvm, sp->gfn); hlist_del(&sp->hash_link); kvm_mmu_free_page(kvm, sp); - } else + } else { list_move(&sp->link, &kvm->arch.active_mmu_pages); + sp->role.invalid = 1; + kvm_reload_remote_mmus(kvm); + } kvm_mmu_reset_last_pte_updated(kvm); } @@ -838,13 +982,13 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) struct hlist_node *node, *n; int r; - pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn); + pgprintk("%s: looking for gfn %lx\n", __func__, gfn); r = 0; - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + index = kvm_page_table_hashfn(gfn); bucket = &kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) if (sp->gfn == gfn && !sp->role.metaphysical) { - pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn, + pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); kvm_mmu_zap_page(kvm, sp); r = 1; @@ -857,7 +1001,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) struct kvm_mmu_page *sp; while ((sp = kvm_mmu_lookup_page(kvm, gfn)) != NULL) { - pgprintk("%s: zap %lx %x\n", __FUNCTION__, gfn, sp->role.word); + pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); kvm_mmu_zap_page(kvm, sp); } } @@ -889,26 +1033,39 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, - int *ptwrite, gfn_t gfn, struct page *page) + int *ptwrite, int largepage, gfn_t gfn, + pfn_t pfn, bool speculative) { u64 spte; int was_rmapped = 0; int was_writeble = is_writeble_pte(*shadow_pte); - hfn_t host_pfn = (*shadow_pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; pgprintk("%s: spte %llx access %x write_fault %d" " user_fault %d gfn %lx\n", - __FUNCTION__, *shadow_pte, pt_access, + __func__, *shadow_pte, pt_access, write_fault, user_fault, gfn); if (is_rmap_pte(*shadow_pte)) { - if (host_pfn != page_to_pfn(page)) { + /* + * If we overwrite a PTE page pointer with a 2MB PMD, unlink + * the parent of the now unreachable PTE. + */ + if (largepage && !is_large_pte(*shadow_pte)) { + struct kvm_mmu_page *child; + u64 pte = *shadow_pte; + + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(child, shadow_pte); + } else if (pfn != spte_to_pfn(*shadow_pte)) { pgprintk("hfn old %lx new %lx\n", - host_pfn, page_to_pfn(page)); + spte_to_pfn(*shadow_pte), pfn); rmap_remove(vcpu->kvm, shadow_pte); + } else { + if (largepage) + was_rmapped = is_large_pte(*shadow_pte); + else + was_rmapped = 1; } - else - was_rmapped = 1; } /* @@ -917,6 +1074,8 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, * demand paging). */ spte = PT_PRESENT_MASK | PT_DIRTY_MASK; + if (!speculative) + pte_access |= PT_ACCESSED_MASK; if (!dirty) pte_access &= ~ACC_WRITE_MASK; if (!(pte_access & ACC_EXEC_MASK)) @@ -925,15 +1084,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, spte |= PT_PRESENT_MASK; if (pte_access & ACC_USER_MASK) spte |= PT_USER_MASK; + if (largepage) + spte |= PT_PAGE_SIZE_MASK; - if (is_error_page(page)) { - set_shadow_pte(shadow_pte, - shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK); - kvm_release_page_clean(page); - return; - } - - spte |= page_to_phys(page); + spte |= (u64)pfn << PAGE_SHIFT; if ((pte_access & ACC_WRITE_MASK) || (write_fault && !is_write_protection(vcpu) && !user_fault)) { @@ -946,9 +1100,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, } shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow) { + if (shadow || + (largepage && has_wrprotected_page(vcpu->kvm, gfn))) { pgprintk("%s: found shadow page for %lx, marking ro\n", - __FUNCTION__, gfn); + __func__, gfn); pte_access &= ~ACC_WRITE_MASK; if (is_writeble_pte(spte)) { spte &= ~PT_WRITABLE_MASK; @@ -964,18 +1119,25 @@ unshadowed: if (pte_access & ACC_WRITE_MASK) mark_page_dirty(vcpu->kvm, gfn); - pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte); + pgprintk("%s: setting spte %llx\n", __func__, spte); + pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n", + (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", + (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); set_shadow_pte(shadow_pte, spte); + if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK) + && (spte & PT_PRESENT_MASK)) + ++vcpu->kvm->stat.lpages; + page_header_update_slot(vcpu->kvm, shadow_pte, gfn); if (!was_rmapped) { - rmap_add(vcpu, shadow_pte, gfn); + rmap_add(vcpu, shadow_pte, gfn, largepage); if (!is_rmap_pte(*shadow_pte)) - kvm_release_page_clean(page); + kvm_release_pfn_clean(pfn); } else { if (was_writeble) - kvm_release_page_dirty(page); + kvm_release_pfn_dirty(pfn); else - kvm_release_page_clean(page); + kvm_release_pfn_clean(pfn); } if (!ptwrite || !*ptwrite) vcpu->arch.last_pte_updated = shadow_pte; @@ -985,10 +1147,10 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, - gfn_t gfn, struct page *page) +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, + int largepage, gfn_t gfn, pfn_t pfn, + int level) { - int level = PT32E_ROOT_LEVEL; hpa_t table_addr = vcpu->arch.mmu.root_hpa; int pt_write = 0; @@ -1001,8 +1163,14 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, if (level == 1) { mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, gfn, page); - return pt_write || is_io_pte(table[index]); + 0, write, 1, &pt_write, 0, gfn, pfn, false); + return pt_write; + } + + if (largepage && level == 2) { + mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, + 0, write, 1, &pt_write, 1, gfn, pfn, false); + return pt_write; } if (table[index] == shadow_trap_nonpresent_pte) { @@ -1016,7 +1184,7 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, 1, ACC_ALL, &table[index]); if (!new_table) { pgprintk("nonpaging_map: ENOMEM\n"); - kvm_release_page_clean(page); + kvm_release_pfn_clean(pfn); return -ENOMEM; } @@ -1030,21 +1198,30 @@ static int __nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) { int r; - - struct page *page; - - down_read(&vcpu->kvm->slots_lock); + int largepage = 0; + pfn_t pfn; down_read(¤t->mm->mmap_sem); - page = gfn_to_page(vcpu->kvm, gfn); + if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { + gfn &= ~(KVM_PAGES_PER_HPAGE-1); + largepage = 1; + } + + pfn = gfn_to_pfn(vcpu->kvm, gfn); up_read(¤t->mm->mmap_sem); + /* mmio */ + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return 1; + } + spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); - r = __nonpaging_map(vcpu, v, write, gfn, page); + r = __direct_map(vcpu, v, write, largepage, gfn, pfn, + PT32E_ROOT_LEVEL); spin_unlock(&vcpu->kvm->mmu_lock); - up_read(&vcpu->kvm->slots_lock); return r; } @@ -1073,6 +1250,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) sp = page_header(root); --sp->root_count; + if (!sp->root_count && sp->role.invalid) + kvm_mmu_zap_page(vcpu->kvm, sp); vcpu->arch.mmu.root_hpa = INVALID_PAGE; spin_unlock(&vcpu->kvm->mmu_lock); return; @@ -1085,6 +1264,8 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu) root &= PT64_BASE_ADDR_MASK; sp = page_header(root); --sp->root_count; + if (!sp->root_count && sp->role.invalid) + kvm_mmu_zap_page(vcpu->kvm, sp); } vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; } @@ -1097,6 +1278,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) int i; gfn_t root_gfn; struct kvm_mmu_page *sp; + int metaphysical = 0; root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT; @@ -1105,14 +1287,20 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) hpa_t root = vcpu->arch.mmu.root_hpa; ASSERT(!VALID_PAGE(root)); + if (tdp_enabled) + metaphysical = 1; sp = kvm_mmu_get_page(vcpu, root_gfn, 0, - PT64_ROOT_LEVEL, 0, ACC_ALL, NULL); + PT64_ROOT_LEVEL, metaphysical, + ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; vcpu->arch.mmu.root_hpa = root; return; } #endif + metaphysical = !is_paging(vcpu); + if (tdp_enabled) + metaphysical = 1; for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -1126,7 +1314,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) } else if (vcpu->arch.mmu.root_level == 0) root_gfn = 0; sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, - PT32_ROOT_LEVEL, !is_paging(vcpu), + PT32_ROOT_LEVEL, metaphysical, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; @@ -1146,7 +1334,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn; int r; - pgprintk("%s: gva %lx error %x\n", __FUNCTION__, gva, error_code); + pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); r = mmu_topup_memory_caches(vcpu); if (r) return r; @@ -1160,6 +1348,41 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, error_code & PFERR_WRITE_MASK, gfn); } +static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, + u32 error_code) +{ + pfn_t pfn; + int r; + int largepage = 0; + gfn_t gfn = gpa >> PAGE_SHIFT; + + ASSERT(vcpu); + ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + down_read(¤t->mm->mmap_sem); + if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { + gfn &= ~(KVM_PAGES_PER_HPAGE-1); + largepage = 1; + } + pfn = gfn_to_pfn(vcpu->kvm, gfn); + up_read(¤t->mm->mmap_sem); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return 1; + } + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); + r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, + largepage, gfn, pfn, TDP_ROOT_LEVEL); + spin_unlock(&vcpu->kvm->mmu_lock); + + return r; +} + static void nonpaging_free(struct kvm_vcpu *vcpu) { mmu_free_roots(vcpu); @@ -1188,7 +1411,7 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) static void paging_new_cr3(struct kvm_vcpu *vcpu) { - pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->arch.cr3); + pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); mmu_free_roots(vcpu); } @@ -1253,7 +1476,35 @@ static int paging32E_init_context(struct kvm_vcpu *vcpu) return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL); } -static int init_kvm_mmu(struct kvm_vcpu *vcpu) +static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) +{ + struct kvm_mmu *context = &vcpu->arch.mmu; + + context->new_cr3 = nonpaging_new_cr3; + context->page_fault = tdp_page_fault; + context->free = nonpaging_free; + context->prefetch_page = nonpaging_prefetch_page; + context->shadow_root_level = TDP_ROOT_LEVEL; + context->root_hpa = INVALID_PAGE; + + if (!is_paging(vcpu)) { + context->gva_to_gpa = nonpaging_gva_to_gpa; + context->root_level = 0; + } else if (is_long_mode(vcpu)) { + context->gva_to_gpa = paging64_gva_to_gpa; + context->root_level = PT64_ROOT_LEVEL; + } else if (is_pae(vcpu)) { + context->gva_to_gpa = paging64_gva_to_gpa; + context->root_level = PT32E_ROOT_LEVEL; + } else { + context->gva_to_gpa = paging32_gva_to_gpa; + context->root_level = PT32_ROOT_LEVEL; + } + + return 0; +} + +static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -1268,6 +1519,16 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu) return paging32_init_context(vcpu); } +static int init_kvm_mmu(struct kvm_vcpu *vcpu) +{ + vcpu->arch.update_pte.pfn = bad_pfn; + + if (tdp_enabled) + return init_kvm_tdp_mmu(vcpu); + else + return init_kvm_softmmu(vcpu); +} + static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) { ASSERT(vcpu); @@ -1316,7 +1577,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, pte = *spte; if (is_shadow_present_pte(pte)) { - if (sp->role.level == PT_PAGE_TABLE_LEVEL) + if (sp->role.level == PT_PAGE_TABLE_LEVEL || + is_large_pte(pte)) rmap_remove(vcpu->kvm, spte); else { child = page_header(pte & PT64_BASE_ADDR_MASK); @@ -1324,24 +1586,26 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, } } set_shadow_pte(spte, shadow_trap_nonpresent_pte); + if (is_large_pte(pte)) + --vcpu->kvm->stat.lpages; } static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, - const void *new, int bytes, - int offset_in_pte) + const void *new) { - if (sp->role.level != PT_PAGE_TABLE_LEVEL) { + if ((sp->role.level != PT_PAGE_TABLE_LEVEL) + && !vcpu->arch.update_pte.largepage) { ++vcpu->kvm->stat.mmu_pde_zapped; return; } ++vcpu->kvm->stat.mmu_pte_updated; if (sp->role.glevels == PT32_ROOT_LEVEL) - paging32_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); + paging32_update_pte(vcpu, sp, spte, new); else - paging64_update_pte(vcpu, sp, spte, new, bytes, offset_in_pte); + paging64_update_pte(vcpu, sp, spte, new); } static bool need_remote_flush(u64 old, u64 new) @@ -1378,7 +1642,9 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn_t gfn; int r; u64 gpte = 0; - struct page *page; + pfn_t pfn; + + vcpu->arch.update_pte.largepage = 0; if (bytes != 4 && bytes != 8) return; @@ -1408,11 +1674,19 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; down_read(¤t->mm->mmap_sem); - page = gfn_to_page(vcpu->kvm, gfn); + if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { + gfn &= ~(KVM_PAGES_PER_HPAGE-1); + vcpu->arch.update_pte.largepage = 1; + } + pfn = gfn_to_pfn(vcpu->kvm, gfn); up_read(¤t->mm->mmap_sem); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); + return; + } vcpu->arch.update_pte.gfn = gfn; - vcpu->arch.update_pte.page = page; + vcpu->arch.update_pte.pfn = pfn; } void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, @@ -1423,7 +1697,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, struct hlist_node *node, *n; struct hlist_head *bucket; unsigned index; - u64 entry; + u64 entry, gentry; u64 *spte; unsigned offset = offset_in_page(gpa); unsigned pte_size; @@ -1433,8 +1707,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, int level; int flooded = 0; int npte; + int r; - pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); + pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); @@ -1450,7 +1725,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, vcpu->arch.last_pt_write_count = 1; vcpu->arch.last_pte_updated = NULL; } - index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES; + index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { if (sp->gfn != gfn || sp->role.metaphysical) @@ -1496,20 +1771,29 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, continue; } spte = &sp->spt[page_offset / sizeof(*spte)]; + if ((gpa & (pte_size - 1)) || (bytes < pte_size)) { + gentry = 0; + r = kvm_read_guest_atomic(vcpu->kvm, + gpa & ~(u64)(pte_size - 1), + &gentry, pte_size); + new = (const void *)&gentry; + if (r < 0) + new = NULL; + } while (npte--) { entry = *spte; mmu_pte_write_zap_pte(vcpu, sp, spte); - mmu_pte_write_new_pte(vcpu, sp, spte, new, bytes, - page_offset & (pte_size - 1)); + if (new) + mmu_pte_write_new_pte(vcpu, sp, spte, new); mmu_pte_write_flush_tlb(vcpu, entry, *spte); ++spte; } } kvm_mmu_audit(vcpu, "post pte write"); spin_unlock(&vcpu->kvm->mmu_lock); - if (vcpu->arch.update_pte.page) { - kvm_release_page_clean(vcpu->arch.update_pte.page); - vcpu->arch.update_pte.page = NULL; + if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { + kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); + vcpu->arch.update_pte.pfn = bad_pfn; } } @@ -1518,9 +1802,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) gpa_t gpa; int r; - down_read(&vcpu->kvm->slots_lock); gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); - up_read(&vcpu->kvm->slots_lock); spin_lock(&vcpu->kvm->mmu_lock); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); @@ -1577,6 +1859,12 @@ out: } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); +void kvm_enable_tdp(void) +{ + tdp_enabled = true; +} +EXPORT_SYMBOL_GPL(kvm_enable_tdp); + static void free_mmu_pages(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; @@ -1677,7 +1965,53 @@ void kvm_mmu_zap_all(struct kvm *kvm) kvm_flush_remote_tlbs(kvm); } -void kvm_mmu_module_exit(void) +void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) +{ + struct kvm_mmu_page *page; + + page = container_of(kvm->arch.active_mmu_pages.prev, + struct kvm_mmu_page, link); + kvm_mmu_zap_page(kvm, page); +} + +static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) +{ + struct kvm *kvm; + struct kvm *kvm_freed = NULL; + int cache_count = 0; + + spin_lock(&kvm_lock); + + list_for_each_entry(kvm, &vm_list, vm_list) { + int npages; + + spin_lock(&kvm->mmu_lock); + npages = kvm->arch.n_alloc_mmu_pages - + kvm->arch.n_free_mmu_pages; + cache_count += npages; + if (!kvm_freed && nr_to_scan > 0 && npages > 0) { + kvm_mmu_remove_one_alloc_mmu_page(kvm); + cache_count--; + kvm_freed = kvm; + } + nr_to_scan--; + + spin_unlock(&kvm->mmu_lock); + } + if (kvm_freed) + list_move_tail(&kvm_freed->vm_list, &vm_list); + + spin_unlock(&kvm_lock); + + return cache_count; +} + +static struct shrinker mmu_shrinker = { + .shrink = mmu_shrink, + .seeks = DEFAULT_SEEKS * 10, +}; + +void mmu_destroy_caches(void) { if (pte_chain_cache) kmem_cache_destroy(pte_chain_cache); @@ -1687,6 +2021,12 @@ void kvm_mmu_module_exit(void) kmem_cache_destroy(mmu_page_header_cache); } +void kvm_mmu_module_exit(void) +{ + mmu_destroy_caches(); + unregister_shrinker(&mmu_shrinker); +} + int kvm_mmu_module_init(void) { pte_chain_cache = kmem_cache_create("kvm_pte_chain", @@ -1706,10 +2046,12 @@ int kvm_mmu_module_init(void) if (!mmu_page_header_cache) goto nomem; + register_shrinker(&mmu_shrinker); + return 0; nomem: - kvm_mmu_module_exit(); + mmu_destroy_caches(); return -ENOMEM; } @@ -1732,6 +2074,127 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) return nr_mmu_pages; } +static void *pv_mmu_peek_buffer(struct kvm_pv_mmu_op_buffer *buffer, + unsigned len) +{ + if (len > buffer->len) + return NULL; + return buffer->ptr; +} + +static void *pv_mmu_read_buffer(struct kvm_pv_mmu_op_buffer *buffer, + unsigned len) +{ + void *ret; + + ret = pv_mmu_peek_buffer(buffer, len); + if (!ret) + return ret; + buffer->ptr += len; + buffer->len -= len; + buffer->processed += len; + return ret; +} + +static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu, + gpa_t addr, gpa_t value) +{ + int bytes = 8; + int r; + + if (!is_long_mode(vcpu) && !is_pae(vcpu)) + bytes = 4; + + r = mmu_topup_memory_caches(vcpu); + if (r) + return r; + + if (!emulator_write_phys(vcpu, addr, &value, bytes)) + return -EFAULT; + + return 1; +} + +static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) +{ + kvm_x86_ops->tlb_flush(vcpu); + return 1; +} + +static int kvm_pv_mmu_release_pt(struct kvm_vcpu *vcpu, gpa_t addr) +{ + spin_lock(&vcpu->kvm->mmu_lock); + mmu_unshadow(vcpu->kvm, addr >> PAGE_SHIFT); + spin_unlock(&vcpu->kvm->mmu_lock); + return 1; +} + +static int kvm_pv_mmu_op_one(struct kvm_vcpu *vcpu, + struct kvm_pv_mmu_op_buffer *buffer) +{ + struct kvm_mmu_op_header *header; + + header = pv_mmu_peek_buffer(buffer, sizeof *header); + if (!header) + return 0; + switch (header->op) { + case KVM_MMU_OP_WRITE_PTE: { + struct kvm_mmu_op_write_pte *wpte; + + wpte = pv_mmu_read_buffer(buffer, sizeof *wpte); + if (!wpte) + return 0; + return kvm_pv_mmu_write(vcpu, wpte->pte_phys, + wpte->pte_val); + } + case KVM_MMU_OP_FLUSH_TLB: { + struct kvm_mmu_op_flush_tlb *ftlb; + + ftlb = pv_mmu_read_buffer(buffer, sizeof *ftlb); + if (!ftlb) + return 0; + return kvm_pv_mmu_flush_tlb(vcpu); + } + case KVM_MMU_OP_RELEASE_PT: { + struct kvm_mmu_op_release_pt *rpt; + + rpt = pv_mmu_read_buffer(buffer, sizeof *rpt); + if (!rpt) + return 0; + return kvm_pv_mmu_release_pt(vcpu, rpt->pt_phys); + } + default: return 0; + } +} + +int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, + gpa_t addr, unsigned long *ret) +{ + int r; + struct kvm_pv_mmu_op_buffer buffer; + + buffer.ptr = buffer.buf; + buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); + buffer.processed = 0; + + r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); + if (r) + goto out; + + while (buffer.len) { + r = kvm_pv_mmu_op_one(vcpu, &buffer); + if (r < 0) + goto out; + if (r == 0) + break; + } + + r = 1; +out: + *ret = buffer.processed; + return r; +} + #ifdef AUDIT static const char *audit_msg; @@ -1768,8 +2231,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, audit_mappings_page(vcpu, ent, va, level - 1); } else { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); - struct page *page = gpa_to_page(vcpu, gpa); - hpa_t hpa = page_to_phys(page); + hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT; if (is_shadow_present_pte(ent) && (ent & PT64_BASE_ADDR_MASK) != hpa) @@ -1782,7 +2244,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, && !is_error_hpa(hpa)) printk(KERN_ERR "audit: (%s) notrap shadow," " valid guest gva %lx\n", audit_msg, va); - kvm_release_page_clean(page); + kvm_release_pfn_clean(pfn); } } @@ -1867,7 +2329,7 @@ static void audit_rmap(struct kvm_vcpu *vcpu) if (n_rmap != n_actual) printk(KERN_ERR "%s: (%s) rmap %d actual %d\n", - __FUNCTION__, audit_msg, n_rmap, n_actual); + __func__, audit_msg, n_rmap, n_actual); } static void audit_write_protection(struct kvm_vcpu *vcpu) @@ -1887,7 +2349,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu) if (*rmapp) printk(KERN_ERR "%s: (%s) shadow page has writable" " mappings: gfn %lx role %x\n", - __FUNCTION__, audit_msg, sp->gfn, + __func__, audit_msg, sp->gfn, sp->role.word); } } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 1fce19ec7a23..e64e9f56a65e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -3,6 +3,12 @@ #include <linux/kvm_host.h> +#ifdef CONFIG_X86_64 +#define TDP_ROOT_LEVEL PT64_ROOT_LEVEL +#else +#define TDP_ROOT_LEVEL PT32E_ROOT_LEVEL +#endif + static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index ecc0856268c4..156fe10288ae 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -130,7 +130,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, unsigned index, pt_access, pte_access; gpa_t pte_gpa; - pgprintk("%s: addr %lx\n", __FUNCTION__, addr); + pgprintk("%s: addr %lx\n", __func__, addr); walk: walker->level = vcpu->arch.mmu.root_level; pte = vcpu->arch.cr3; @@ -155,7 +155,7 @@ walk: pte_gpa += index * sizeof(pt_element_t); walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__, + pgprintk("%s: table_gfn[%d] %lx\n", __func__, walker->level - 1, table_gfn); kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); @@ -222,7 +222,7 @@ walk: walker->pt_access = pt_access; walker->pte_access = pte_access; pgprintk("%s: pte %llx pte_access %x pt_access %x\n", - __FUNCTION__, (u64)pte, pt_access, pte_access); + __func__, (u64)pte, pt_access, pte_access); return 1; not_present: @@ -243,31 +243,30 @@ err: } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, - u64 *spte, const void *pte, int bytes, - int offset_in_pte) + u64 *spte, const void *pte) { pt_element_t gpte; unsigned pte_access; - struct page *npage; + pfn_t pfn; + int largepage = vcpu->arch.update_pte.largepage; gpte = *(const pt_element_t *)pte; if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { - if (!offset_in_pte && !is_present_pte(gpte)) + if (!is_present_pte(gpte)) set_shadow_pte(spte, shadow_notrap_nonpresent_pte); return; } - if (bytes < sizeof(pt_element_t)) - return; - pgprintk("%s: gpte %llx spte %p\n", __FUNCTION__, (u64)gpte, spte); + pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) return; - npage = vcpu->arch.update_pte.page; - if (!npage) + pfn = vcpu->arch.update_pte.pfn; + if (is_error_pfn(pfn)) return; - get_page(npage); + kvm_get_pfn(pfn); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, - gpte & PT_DIRTY_MASK, NULL, gpte_to_gfn(gpte), npage); + gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), + pfn, true); } /* @@ -275,8 +274,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, */ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, struct guest_walker *walker, - int user_fault, int write_fault, int *ptwrite, - struct page *page) + int user_fault, int write_fault, int largepage, + int *ptwrite, pfn_t pfn) { hpa_t shadow_addr; int level; @@ -304,11 +303,19 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, shadow_ent = ((u64 *)__va(shadow_addr)) + index; if (level == PT_PAGE_TABLE_LEVEL) break; - if (is_shadow_present_pte(*shadow_ent)) { + + if (largepage && level == PT_DIRECTORY_LEVEL) + break; + + if (is_shadow_present_pte(*shadow_ent) + && !is_large_pte(*shadow_ent)) { shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; continue; } + if (is_large_pte(*shadow_ent)) + rmap_remove(vcpu->kvm, shadow_ent); + if (level - 1 == PT_PAGE_TABLE_LEVEL && walker->level == PT_DIRECTORY_LEVEL) { metaphysical = 1; @@ -329,7 +336,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, walker->pte_gpa[level - 2], &curr_pte, sizeof(curr_pte)); if (r || curr_pte != walker->ptes[level - 2]) { - kvm_release_page_clean(page); + kvm_release_pfn_clean(pfn); return NULL; } } @@ -342,7 +349,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, user_fault, write_fault, walker->ptes[walker->level-1] & PT_DIRTY_MASK, - ptwrite, walker->gfn, page); + ptwrite, largepage, walker->gfn, pfn, false); return shadow_ent; } @@ -371,16 +378,16 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u64 *shadow_pte; int write_pt = 0; int r; - struct page *page; + pfn_t pfn; + int largepage = 0; - pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); + pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); kvm_mmu_audit(vcpu, "pre page fault"); r = mmu_topup_memory_caches(vcpu); if (r) return r; - down_read(&vcpu->kvm->slots_lock); /* * Look up the shadow pte for the faulting address. */ @@ -391,40 +398,45 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, * The page is not mapped by the guest. Let the guest handle it. */ if (!r) { - pgprintk("%s: guest page fault\n", __FUNCTION__); + pgprintk("%s: guest page fault\n", __func__); inject_page_fault(vcpu, addr, walker.error_code); vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ - up_read(&vcpu->kvm->slots_lock); return 0; } down_read(¤t->mm->mmap_sem); - page = gfn_to_page(vcpu->kvm, walker.gfn); + if (walker.level == PT_DIRECTORY_LEVEL) { + gfn_t large_gfn; + large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); + if (is_largepage_backed(vcpu, large_gfn)) { + walker.gfn = large_gfn; + largepage = 1; + } + } + pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); up_read(¤t->mm->mmap_sem); + /* mmio */ + if (is_error_pfn(pfn)) { + pgprintk("gfn %x is mmio\n", walker.gfn); + kvm_release_pfn_clean(pfn); + return 1; + } + spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, - &write_pt, page); - pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __FUNCTION__, + largepage, &write_pt, pfn); + + pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, shadow_pte, *shadow_pte, write_pt); if (!write_pt) vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ - /* - * mmio: emulate if accessible, otherwise its a guest fault. - */ - if (shadow_pte && is_io_pte(*shadow_pte)) { - spin_unlock(&vcpu->kvm->mmu_lock); - up_read(&vcpu->kvm->slots_lock); - return 1; - } - ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, "post page fault (fixed)"); spin_unlock(&vcpu->kvm->mmu_lock); - up_read(&vcpu->kvm->slots_lock); return write_pt; } diff --git a/arch/x86/kvm/segment_descriptor.h b/arch/x86/kvm/segment_descriptor.h deleted file mode 100644 index 56fc4c873389..000000000000 --- a/arch/x86/kvm/segment_descriptor.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef __SEGMENT_DESCRIPTOR_H -#define __SEGMENT_DESCRIPTOR_H - -struct segment_descriptor { - u16 limit_low; - u16 base_low; - u8 base_mid; - u8 type : 4; - u8 system : 1; - u8 dpl : 2; - u8 present : 1; - u8 limit_high : 4; - u8 avl : 1; - u8 long_mode : 1; - u8 default_op : 1; - u8 granularity : 1; - u8 base_high; -} __attribute__((packed)); - -#ifdef CONFIG_X86_64 -/* LDT or TSS descriptor in the GDT. 16 bytes. */ -struct segment_descriptor_64 { - struct segment_descriptor s; - u32 base_higher; - u32 pad_zero; -}; - -#endif -#endif diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1a582f1090e8..89e0be2c10d0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -47,6 +47,18 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_LBRV (1 << 1) #define SVM_DEATURE_SVML (1 << 2) +#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) + +/* enable NPT for AMD64 and X86 with PAE */ +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +static bool npt_enabled = true; +#else +static bool npt_enabled = false; +#endif +static int npt = 1; + +module_param(npt, int, S_IRUGO); + static void kvm_reput_irq(struct vcpu_svm *svm); static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) @@ -54,8 +66,7 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_svm, vcpu); } -unsigned long iopm_base; -unsigned long msrpm_base; +static unsigned long iopm_base; struct kvm_ldttss_desc { u16 limit0; @@ -182,7 +193,7 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (!(efer & EFER_LMA)) + if (!npt_enabled && !(efer & EFER_LMA)) efer &= ~EFER_LME; to_svm(vcpu)->vmcb->save.efer = efer | MSR_EFER_SVME_MASK; @@ -219,12 +230,12 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); if (!svm->next_rip) { - printk(KERN_DEBUG "%s: NOP\n", __FUNCTION__); + printk(KERN_DEBUG "%s: NOP\n", __func__); return; } if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", - __FUNCTION__, + __func__, svm->vmcb->save.rip, svm->next_rip); @@ -279,11 +290,7 @@ static void svm_hardware_enable(void *garbage) struct svm_cpu_data *svm_data; uint64_t efer; -#ifdef CONFIG_X86_64 - struct desc_ptr gdt_descr; -#else struct desc_ptr gdt_descr; -#endif struct desc_struct *gdt; int me = raw_smp_processor_id(); @@ -302,7 +309,6 @@ static void svm_hardware_enable(void *garbage) svm_data->asid_generation = 1; svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; svm_data->next_asid = svm_data->max_asid + 1; - svm_features = cpuid_edx(SVM_CPUID_FUNC); asm volatile ("sgdt %0" : "=m"(gdt_descr)); gdt = (struct desc_struct *)gdt_descr.address; @@ -361,12 +367,51 @@ static void set_msr_interception(u32 *msrpm, unsigned msr, BUG(); } +static void svm_vcpu_init_msrpm(u32 *msrpm) +{ + memset(msrpm, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); + +#ifdef CONFIG_X86_64 + set_msr_interception(msrpm, MSR_GS_BASE, 1, 1); + set_msr_interception(msrpm, MSR_FS_BASE, 1, 1); + set_msr_interception(msrpm, MSR_KERNEL_GS_BASE, 1, 1); + set_msr_interception(msrpm, MSR_LSTAR, 1, 1); + set_msr_interception(msrpm, MSR_CSTAR, 1, 1); + set_msr_interception(msrpm, MSR_SYSCALL_MASK, 1, 1); +#endif + set_msr_interception(msrpm, MSR_K6_STAR, 1, 1); + set_msr_interception(msrpm, MSR_IA32_SYSENTER_CS, 1, 1); + set_msr_interception(msrpm, MSR_IA32_SYSENTER_ESP, 1, 1); + set_msr_interception(msrpm, MSR_IA32_SYSENTER_EIP, 1, 1); +} + +static void svm_enable_lbrv(struct vcpu_svm *svm) +{ + u32 *msrpm = svm->msrpm; + + svm->vmcb->control.lbr_ctl = 1; + set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1); + set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1); + set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 1, 1); + set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 1, 1); +} + +static void svm_disable_lbrv(struct vcpu_svm *svm) +{ + u32 *msrpm = svm->msrpm; + + svm->vmcb->control.lbr_ctl = 0; + set_msr_interception(msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0); + set_msr_interception(msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0); + set_msr_interception(msrpm, MSR_IA32_LASTINTFROMIP, 0, 0); + set_msr_interception(msrpm, MSR_IA32_LASTINTTOIP, 0, 0); +} + static __init int svm_hardware_setup(void) { int cpu; struct page *iopm_pages; - struct page *msrpm_pages; - void *iopm_va, *msrpm_va; + void *iopm_va; int r; iopm_pages = alloc_pages(GFP_KERNEL, IOPM_ALLOC_ORDER); @@ -379,41 +424,33 @@ static __init int svm_hardware_setup(void) clear_bit(0x80, iopm_va); /* allow direct access to PC debug port */ iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; + if (boot_cpu_has(X86_FEATURE_NX)) + kvm_enable_efer_bits(EFER_NX); - msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + for_each_online_cpu(cpu) { + r = svm_cpu_init(cpu); + if (r) + goto err; + } - r = -ENOMEM; - if (!msrpm_pages) - goto err_1; + svm_features = cpuid_edx(SVM_CPUID_FUNC); - msrpm_va = page_address(msrpm_pages); - memset(msrpm_va, 0xff, PAGE_SIZE * (1 << MSRPM_ALLOC_ORDER)); - msrpm_base = page_to_pfn(msrpm_pages) << PAGE_SHIFT; + if (!svm_has(SVM_FEATURE_NPT)) + npt_enabled = false; -#ifdef CONFIG_X86_64 - set_msr_interception(msrpm_va, MSR_GS_BASE, 1, 1); - set_msr_interception(msrpm_va, MSR_FS_BASE, 1, 1); - set_msr_interception(msrpm_va, MSR_KERNEL_GS_BASE, 1, 1); - set_msr_interception(msrpm_va, MSR_LSTAR, 1, 1); - set_msr_interception(msrpm_va, MSR_CSTAR, 1, 1); - set_msr_interception(msrpm_va, MSR_SYSCALL_MASK, 1, 1); -#endif - set_msr_interception(msrpm_va, MSR_K6_STAR, 1, 1); - set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_CS, 1, 1); - set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_ESP, 1, 1); - set_msr_interception(msrpm_va, MSR_IA32_SYSENTER_EIP, 1, 1); + if (npt_enabled && !npt) { + printk(KERN_INFO "kvm: Nested Paging disabled\n"); + npt_enabled = false; + } - for_each_online_cpu(cpu) { - r = svm_cpu_init(cpu); - if (r) - goto err_2; + if (npt_enabled) { + printk(KERN_INFO "kvm: Nested Paging enabled\n"); + kvm_enable_tdp(); } + return 0; -err_2: - __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); - msrpm_base = 0; -err_1: +err: __free_pages(iopm_pages, IOPM_ALLOC_ORDER); iopm_base = 0; return r; @@ -421,9 +458,8 @@ err_1: static __exit void svm_hardware_unsetup(void) { - __free_pages(pfn_to_page(msrpm_base >> PAGE_SHIFT), MSRPM_ALLOC_ORDER); __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); - iopm_base = msrpm_base = 0; + iopm_base = 0; } static void init_seg(struct vmcb_seg *seg) @@ -443,15 +479,14 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static void init_vmcb(struct vmcb *vmcb) +static void init_vmcb(struct vcpu_svm *svm) { - struct vmcb_control_area *control = &vmcb->control; - struct vmcb_save_area *save = &vmcb->save; + struct vmcb_control_area *control = &svm->vmcb->control; + struct vmcb_save_area *save = &svm->vmcb->save; control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | - INTERCEPT_CR4_MASK | - INTERCEPT_CR8_MASK; + INTERCEPT_CR4_MASK; control->intercept_cr_write = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | @@ -471,23 +506,13 @@ static void init_vmcb(struct vmcb *vmcb) INTERCEPT_DR7_MASK; control->intercept_exceptions = (1 << PF_VECTOR) | - (1 << UD_VECTOR); + (1 << UD_VECTOR) | + (1 << MC_VECTOR); control->intercept = (1ULL << INTERCEPT_INTR) | (1ULL << INTERCEPT_NMI) | (1ULL << INTERCEPT_SMI) | - /* - * selective cr0 intercept bug? - * 0: 0f 22 d8 mov %eax,%cr3 - * 3: 0f 20 c0 mov %cr0,%eax - * 6: 0d 00 00 00 80 or $0x80000000,%eax - * b: 0f 22 c0 mov %eax,%cr0 - * set cr3 ->interception - * get cr0 ->interception - * set cr0 -> no interception - */ - /* (1ULL << INTERCEPT_SELECTIVE_CR0) | */ (1ULL << INTERCEPT_CPUID) | (1ULL << INTERCEPT_INVD) | (1ULL << INTERCEPT_HLT) | @@ -508,7 +533,7 @@ static void init_vmcb(struct vmcb *vmcb) (1ULL << INTERCEPT_MWAIT); control->iopm_base_pa = iopm_base; - control->msrpm_base_pa = msrpm_base; + control->msrpm_base_pa = __pa(svm->msrpm); control->tsc_offset = 0; control->int_ctl = V_INTR_MASKING_MASK; @@ -550,13 +575,30 @@ static void init_vmcb(struct vmcb *vmcb) save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; save->cr4 = X86_CR4_PAE; /* rdx = ?? */ + + if (npt_enabled) { + /* Setup VMCB for Nested Paging */ + control->nested_ctl = 1; + control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH); + control->intercept_exceptions &= ~(1 << PF_VECTOR); + control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| + INTERCEPT_CR3_MASK); + control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| + INTERCEPT_CR3_MASK); + save->g_pat = 0x0007040600070406ULL; + /* enable caching because the QEMU Bios doesn't enable it */ + save->cr0 = X86_CR0_ET; + save->cr3 = 0; + save->cr4 = 0; + } + force_new_asid(&svm->vcpu); } static int svm_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - init_vmcb(svm->vmcb); + init_vmcb(svm); if (vcpu->vcpu_id != 0) { svm->vmcb->save.rip = 0; @@ -571,6 +613,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) { struct vcpu_svm *svm; struct page *page; + struct page *msrpm_pages; int err; svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); @@ -589,12 +632,19 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto uninit; } + err = -ENOMEM; + msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); + if (!msrpm_pages) + goto uninit; + svm->msrpm = page_address(msrpm_pages); + svm_vcpu_init_msrpm(svm->msrpm); + svm->vmcb = page_address(page); clear_page(svm->vmcb); svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; svm->asid_generation = 0; memset(svm->db_regs, 0, sizeof(svm->db_regs)); - init_vmcb(svm->vmcb); + init_vmcb(svm); fx_init(&svm->vcpu); svm->vcpu.fpu_active = 1; @@ -617,6 +667,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); + __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, svm); } @@ -731,6 +782,13 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->unusable = !var->present; } +static int svm_get_cpl(struct kvm_vcpu *vcpu) +{ + struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; + + return save->cpl; +} + static void svm_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) { struct vcpu_svm *svm = to_svm(vcpu); @@ -784,6 +842,9 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) } } #endif + if (npt_enabled) + goto set; + if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); vcpu->fpu_active = 1; @@ -791,18 +852,29 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vcpu->arch.cr0 = cr0; cr0 |= X86_CR0_PG | X86_CR0_WP; - cr0 &= ~(X86_CR0_CD | X86_CR0_NW); if (!vcpu->fpu_active) { svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); cr0 |= X86_CR0_TS; } +set: + /* + * re-enable caching here because the QEMU bios + * does not do it - this results in some delay at + * reboot + */ + cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; } static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { - vcpu->arch.cr4 = cr4; - to_svm(vcpu)->vmcb->save.cr4 = cr4 | X86_CR4_PAE; + unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; + + vcpu->arch.cr4 = cr4; + if (!npt_enabled) + cr4 |= X86_CR4_PAE; + cr4 |= host_cr4_mce; + to_svm(vcpu)->vmcb->save.cr4 = cr4; } static void svm_set_segment(struct kvm_vcpu *vcpu, @@ -833,13 +905,6 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, } -/* FIXME: - - svm(vcpu)->vmcb->control.int_ctl &= ~V_TPR_MASK; - svm(vcpu)->vmcb->control.int_ctl |= (sregs->cr8 & V_TPR_MASK); - -*/ - static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) { return -EOPNOTSUPP; @@ -920,7 +985,7 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, } default: printk(KERN_DEBUG "%s: unexpected dr %u\n", - __FUNCTION__, dr); + __func__, dr); *exception = UD_VECTOR; return; } @@ -962,6 +1027,19 @@ static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + /* + * On an #MC intercept the MCE handler is not called automatically in + * the host. So do it by hand here. + */ + asm volatile ( + "int $0x12\n"); + /* not sure if we ever come back to this point */ + + return 1; +} + static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { /* @@ -969,7 +1047,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) * so reinitialize it. */ clear_page(svm->vmcb); - init_vmcb(svm->vmcb); + init_vmcb(svm); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; @@ -1033,9 +1111,18 @@ static int invalid_op_interception(struct vcpu_svm *svm, static int task_switch_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - pr_unimpl(&svm->vcpu, "%s: task switch is unsupported\n", __FUNCTION__); - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - return 0; + u16 tss_selector; + + tss_selector = (u16)svm->vmcb->control.exit_info_1; + if (svm->vmcb->control.exit_info_2 & + (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) + return kvm_task_switch(&svm->vcpu, tss_selector, + TASK_SWITCH_IRET); + if (svm->vmcb->control.exit_info_2 & + (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) + return kvm_task_switch(&svm->vcpu, tss_selector, + TASK_SWITCH_JMP); + return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL); } static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) @@ -1049,7 +1136,7 @@ static int emulate_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) - pr_unimpl(&svm->vcpu, "%s: failed\n", __FUNCTION__); + pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); return 1; } @@ -1179,8 +1266,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) svm->vmcb->save.sysenter_esp = data; break; case MSR_IA32_DEBUGCTLMSR: - pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", - __FUNCTION__, data); + if (!svm_has(SVM_FEATURE_LBRV)) { + pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", + __func__, data); + break; + } + if (data & DEBUGCTL_RESERVED_BITS) + return 1; + + svm->vmcb->save.dbgctl = data; + if (data & (1ULL<<0)) + svm_enable_lbrv(svm); + else + svm_disable_lbrv(svm); break; case MSR_K7_EVNTSEL0: case MSR_K7_EVNTSEL1: @@ -1265,6 +1363,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, + [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, [SVM_EXIT_INTR] = nop_on_interception, [SVM_EXIT_NMI] = nop_on_interception, [SVM_EXIT_SMI] = nop_on_interception, @@ -1290,14 +1389,34 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_WBINVD] = emulate_on_interception, [SVM_EXIT_MONITOR] = invalid_op_interception, [SVM_EXIT_MWAIT] = invalid_op_interception, + [SVM_EXIT_NPF] = pf_interception, }; - static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u32 exit_code = svm->vmcb->control.exit_code; + if (npt_enabled) { + int mmu_reload = 0; + if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { + svm_set_cr0(vcpu, svm->vmcb->save.cr0); + mmu_reload = 1; + } + vcpu->arch.cr0 = svm->vmcb->save.cr0; + vcpu->arch.cr3 = svm->vmcb->save.cr3; + if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { + if (!load_pdptrs(vcpu, vcpu->arch.cr3)) { + kvm_inject_gp(vcpu, 0); + return 1; + } + } + if (mmu_reload) { + kvm_mmu_reset_context(vcpu); + kvm_mmu_load(vcpu); + } + } + kvm_reput_irq(svm); if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { @@ -1308,10 +1427,11 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } if (is_external_interrupt(svm->vmcb->control.exit_int_info) && - exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR) + exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && + exit_code != SVM_EXIT_NPF) printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x " "exit_code 0x%x\n", - __FUNCTION__, svm->vmcb->control.exit_int_info, + __func__, svm->vmcb->control.exit_int_info, exit_code); if (exit_code >= ARRAY_SIZE(svm_exit_handlers) @@ -1364,6 +1484,27 @@ static void svm_set_irq(struct kvm_vcpu *vcpu, int irq) svm_inject_irq(svm, irq); } +static void update_cr8_intercept(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + int max_irr, tpr; + + if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr) + return; + + vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; + + max_irr = kvm_lapic_find_highest_irr(vcpu); + if (max_irr == -1) + return; + + tpr = kvm_lapic_get_cr8(vcpu) << 4; + + if (tpr >= (max_irr & 0xf0)) + vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; +} + static void svm_intr_assist(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1376,14 +1517,14 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) SVM_EVTINJ_VEC_MASK; vmcb->control.exit_int_info = 0; svm_inject_irq(svm, intr_vector); - return; + goto out; } if (vmcb->control.int_ctl & V_IRQ_MASK) - return; + goto out; if (!kvm_cpu_has_interrupt(vcpu)) - return; + goto out; if (!(vmcb->save.rflags & X86_EFLAGS_IF) || (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) || @@ -1391,12 +1532,14 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu) /* unable to deliver irq, set pending irq */ vmcb->control.intercept |= (1ULL << INTERCEPT_VINTR); svm_inject_irq(svm, 0x0); - return; + goto out; } /* Okay, we can deliver the interrupt: grab it and update PIC state. */ intr_vector = kvm_cpu_get_interrupt(vcpu); svm_inject_irq(svm, intr_vector); kvm_timer_intr_post(vcpu, intr_vector); +out: + update_cr8_intercept(vcpu); } static void kvm_reput_irq(struct vcpu_svm *svm) @@ -1482,6 +1625,29 @@ static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) { } +static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { + int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; + kvm_lapic_set_tpr(vcpu, cr8); + } +} + +static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 cr8; + + if (!irqchip_in_kernel(vcpu->kvm)) + return; + + cr8 = kvm_get_cr8(vcpu); + svm->vmcb->control.int_ctl &= ~V_TPR_MASK; + svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; +} + static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1491,6 +1657,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) pre_svm_run(svm); + sync_lapic_to_cr8(vcpu); + save_host_msrs(vcpu); fs_selector = read_fs(); gs_selector = read_gs(); @@ -1499,6 +1667,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm->host_dr6 = read_dr6(); svm->host_dr7 = read_dr7(); svm->vmcb->save.cr2 = vcpu->arch.cr2; + /* required for live migration with NPT */ + if (npt_enabled) + svm->vmcb->save.cr3 = vcpu->arch.cr3; if (svm->vmcb->save.dr7 & 0xff) { write_dr7(0); @@ -1635,6 +1806,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) stgi(); + sync_cr8_to_lapic(vcpu); + svm->next_rip = 0; } @@ -1642,6 +1815,12 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); + if (npt_enabled) { + svm->vmcb->control.nested_cr3 = root; + force_new_asid(vcpu); + return; + } + svm->vmcb->save.cr3 = root; force_new_asid(vcpu); @@ -1709,6 +1888,7 @@ static struct kvm_x86_ops svm_x86_ops = { .get_segment_base = svm_get_segment_base, .get_segment = svm_get_segment, .set_segment = svm_set_segment, + .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, diff --git a/arch/x86/kvm/svm.h b/arch/x86/kvm/svm.h index 5fd50491b555..1b8afa78e869 100644 --- a/arch/x86/kvm/svm.h +++ b/arch/x86/kvm/svm.h @@ -238,6 +238,9 @@ struct __attribute__ ((__packed__)) vmcb { #define SVM_EXITINTINFO_VALID SVM_EVTINJ_VALID #define SVM_EXITINTINFO_VALID_ERR SVM_EVTINJ_VALID_ERR +#define SVM_EXITINFOSHIFT_TS_REASON_IRET 36 +#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 + #define SVM_EXIT_READ_CR0 0x000 #define SVM_EXIT_READ_CR3 0x003 #define SVM_EXIT_READ_CR4 0x004 diff --git a/arch/x86/kvm/tss.h b/arch/x86/kvm/tss.h new file mode 100644 index 000000000000..622aa10f692f --- /dev/null +++ b/arch/x86/kvm/tss.h @@ -0,0 +1,59 @@ +#ifndef __TSS_SEGMENT_H +#define __TSS_SEGMENT_H + +struct tss_segment_32 { + u32 prev_task_link; + u32 esp0; + u32 ss0; + u32 esp1; + u32 ss1; + u32 esp2; + u32 ss2; + u32 cr3; + u32 eip; + u32 eflags; + u32 eax; + u32 ecx; + u32 edx; + u32 ebx; + u32 esp; + u32 ebp; + u32 esi; + u32 edi; + u32 es; + u32 cs; + u32 ss; + u32 ds; + u32 fs; + u32 gs; + u32 ldt_selector; + u16 t; + u16 io_map; +}; + +struct tss_segment_16 { + u16 prev_task_link; + u16 sp0; + u16 ss0; + u16 sp1; + u16 ss1; + u16 sp2; + u16 ss2; + u16 ip; + u16 flag; + u16 ax; + u16 cx; + u16 dx; + u16 bx; + u16 sp; + u16 bp; + u16 si; + u16 di; + u16 es; + u16 cs; + u16 ss; + u16 ds; + u16 ldt; +}; + +#endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8e1462880d1f..8e5d6645b90d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -17,7 +17,6 @@ #include "irq.h" #include "vmx.h" -#include "segment_descriptor.h" #include "mmu.h" #include <linux/kvm_host.h> @@ -37,6 +36,12 @@ MODULE_LICENSE("GPL"); static int bypass_guest_pf = 1; module_param(bypass_guest_pf, bool, 0); +static int enable_vpid = 1; +module_param(enable_vpid, bool, 0); + +static int flexpriority_enabled = 1; +module_param(flexpriority_enabled, bool, 0); + struct vmcs { u32 revision_id; u32 abort; @@ -71,6 +76,7 @@ struct vcpu_vmx { unsigned rip; } irq; } rmode; + int vpid; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -85,6 +91,10 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static struct page *vmx_io_bitmap_a; static struct page *vmx_io_bitmap_b; +static struct page *vmx_msr_bitmap; + +static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); +static DEFINE_SPINLOCK(vmx_vpid_lock); static struct vmcs_config { int size; @@ -176,6 +186,11 @@ static inline int is_external_interrupt(u32 intr_info) == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } +static inline int cpu_has_vmx_msr_bitmap(void) +{ + return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS); +} + static inline int cpu_has_vmx_tpr_shadow(void) { return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW); @@ -194,8 +209,9 @@ static inline int cpu_has_secondary_exec_ctrls(void) static inline bool cpu_has_vmx_virtualize_apic_accesses(void) { - return (vmcs_config.cpu_based_2nd_exec_ctrl & - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); + return flexpriority_enabled + && (vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES); } static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) @@ -204,6 +220,12 @@ static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) (irqchip_in_kernel(kvm))); } +static inline int cpu_has_vmx_vpid(void) +{ + return (vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_VPID); +} + static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -214,6 +236,20 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) return -1; } +static inline void __invvpid(int ext, u16 vpid, gva_t gva) +{ + struct { + u64 vpid : 16; + u64 rsvd : 48; + u64 gva; + } operand = { vpid, 0, gva }; + + asm volatile (ASM_VMX_INVVPID + /* CF==1 or ZF==1 --> rc = -1 */ + "; ja 1f ; ud2 ; 1:" + : : "a"(&operand), "c"(ext) : "cc", "memory"); +} + static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -257,6 +293,14 @@ static void vcpu_clear(struct vcpu_vmx *vmx) vmx->launched = 0; } +static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) +{ + if (vmx->vpid == 0) + return; + + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); +} + static unsigned long vmcs_readl(unsigned long field) { unsigned long value; @@ -353,7 +397,7 @@ static void reload_tss(void) * VT restores TR but not its size. Useless. */ struct descriptor_table gdt; - struct segment_descriptor *descs; + struct desc_struct *descs; get_gdt(&gdt); descs = (void *)gdt.base; @@ -485,11 +529,12 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 phys_addr = __pa(vmx->vmcs); - u64 tsc_this, delta; + u64 tsc_this, delta, new_offset; if (vcpu->cpu != cpu) { vcpu_clear(vmx); kvm_migrate_apic_timer(vcpu); + vpid_sync_vcpu_all(vmx); } if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { @@ -524,8 +569,11 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * Make sure the time stamp counter is monotonous. */ rdtscll(tsc_this); - delta = vcpu->arch.host_tsc - tsc_this; - vmcs_write64(TSC_OFFSET, vmcs_read64(TSC_OFFSET) + delta); + if (tsc_this < vcpu->arch.host_tsc) { + delta = vcpu->arch.host_tsc - tsc_this; + new_offset = vmcs_read64(TSC_OFFSET) + delta; + vmcs_write64(TSC_OFFSET, new_offset); + } } } @@ -596,7 +644,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, nr | INTR_TYPE_EXCEPTION - | (has_error_code ? INTR_INFO_DELIEVER_CODE_MASK : 0) + | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) | INTR_INFO_VALID_MASK); if (has_error_code) vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); @@ -959,6 +1007,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING; opt = CPU_BASED_TPR_SHADOW | + CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) @@ -971,7 +1020,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { min = 0; opt = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | - SECONDARY_EXEC_WBINVD_EXITING; + SECONDARY_EXEC_WBINVD_EXITING | + SECONDARY_EXEC_ENABLE_VPID; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) return -EIO; @@ -1080,6 +1130,10 @@ static __init int hardware_setup(void) { if (setup_vmcs_config(&vmcs_config) < 0) return -EIO; + + if (boot_cpu_has(X86_FEATURE_NX)) + kvm_enable_efer_bits(EFER_NX); + return alloc_kvm_area(); } @@ -1214,7 +1268,7 @@ static void enter_lmode(struct kvm_vcpu *vcpu) guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { printk(KERN_DEBUG "%s: tss fixup for long mode. \n", - __FUNCTION__); + __func__); vmcs_write32(GUEST_TR_AR_BYTES, (guest_tr_ar & ~AR_TYPE_MASK) | AR_TYPE_BUSY_64_TSS); @@ -1239,6 +1293,11 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif +static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +{ + vpid_sync_vcpu_all(to_vmx(vcpu)); +} + static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; @@ -1275,6 +1334,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { + vmx_flush_tlb(vcpu); vmcs_writel(GUEST_CR3, cr3); if (vcpu->arch.cr0 & X86_CR0_PE) vmx_fpu_deactivate(vcpu); @@ -1288,14 +1348,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vcpu->arch.cr4 = cr4; } -#ifdef CONFIG_X86_64 - static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); vcpu->arch.shadow_efer = efer; + if (!msr) + return; if (efer & EFER_LMA) { vmcs_write32(VM_ENTRY_CONTROLS, vmcs_read32(VM_ENTRY_CONTROLS) | @@ -1312,8 +1372,6 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) setup_msrs(vmx); } -#endif - static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) { struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -1344,6 +1402,20 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->unusable = (ar >> 16) & 1; } +static int vmx_get_cpl(struct kvm_vcpu *vcpu) +{ + struct kvm_segment kvm_seg; + + if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ + return 0; + + if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ + return 3; + + vmx_get_segment(vcpu, &kvm_seg, VCPU_SREG_CS); + return kvm_seg.selector & 3; +} + static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; @@ -1433,7 +1505,6 @@ static int init_rmode_tss(struct kvm *kvm) int ret = 0; int r; - down_read(&kvm->slots_lock); r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -1456,7 +1527,6 @@ static int init_rmode_tss(struct kvm *kvm) ret = 1; out: - up_read(&kvm->slots_lock); return ret; } @@ -1494,6 +1564,46 @@ out: return r; } +static void allocate_vpid(struct vcpu_vmx *vmx) +{ + int vpid; + + vmx->vpid = 0; + if (!enable_vpid || !cpu_has_vmx_vpid()) + return; + spin_lock(&vmx_vpid_lock); + vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS); + if (vpid < VMX_NR_VPIDS) { + vmx->vpid = vpid; + __set_bit(vpid, vmx_vpid_bitmap); + } + spin_unlock(&vmx_vpid_lock); +} + +void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) +{ + void *va; + + if (!cpu_has_vmx_msr_bitmap()) + return; + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals + * have the write-low and read-high bitmap offsets the wrong way round. + * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. + */ + va = kmap(msr_bitmap); + if (msr <= 0x1fff) { + __clear_bit(msr, va + 0x000); /* read-low */ + __clear_bit(msr, va + 0x800); /* write-low */ + } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + __clear_bit(msr, va + 0x400); /* read-high */ + __clear_bit(msr, va + 0xc00); /* write-high */ + } + kunmap(msr_bitmap); +} + /* * Sets up the vmcs for emulated real mode. */ @@ -1511,6 +1621,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a)); vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b)); + if (cpu_has_vmx_msr_bitmap()) + vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap)); + vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */ /* Control */ @@ -1532,6 +1645,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + if (vmx->vpid == 0) + exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -1613,6 +1728,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) u64 msr; int ret; + down_read(&vcpu->kvm->slots_lock); if (!init_rmode_tss(vmx->vcpu.kvm)) { ret = -ENOMEM; goto out; @@ -1621,7 +1737,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmx->vcpu.arch.rmode.active = 0; vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); - set_cr8(&vmx->vcpu, 0); + kvm_set_cr8(&vmx->vcpu, 0); msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (vmx->vcpu.vcpu_id == 0) msr |= MSR_IA32_APICBASE_BSP; @@ -1704,18 +1820,22 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(vmx->vcpu.kvm->arch.apic_access_page)); + if (vmx->vpid != 0) + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + vmx->vcpu.arch.cr0 = 0x60000010; vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ vmx_set_cr4(&vmx->vcpu, 0); -#ifdef CONFIG_X86_64 vmx_set_efer(&vmx->vcpu, 0); -#endif vmx_fpu_activate(&vmx->vcpu); update_exception_bitmap(&vmx->vcpu); - return 0; + vpid_sync_vcpu_all(vmx); + + ret = 0; out: + up_read(&vcpu->kvm->slots_lock); return ret; } @@ -1723,6 +1843,8 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) { struct vcpu_vmx *vmx = to_vmx(vcpu); + KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); + if (vcpu->arch.rmode.active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = irq; @@ -1844,7 +1966,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if ((vect_info & VECTORING_INFO_VALID_MASK) && !is_page_fault(intr_info)) printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " - "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info); + "intr info 0x%x\n", __func__, vect_info, intr_info); if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) { int irq = vect_info & VECTORING_INFO_VECTOR_MASK; @@ -1869,10 +1991,12 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) error_code = 0; rip = vmcs_readl(GUEST_RIP); - if (intr_info & INTR_INFO_DELIEVER_CODE_MASK) + if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { cr2 = vmcs_readl(EXIT_QUALIFICATION); + KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, + (u32)((u64)cr2 >> 32), handler); return kvm_mmu_page_fault(vcpu, cr2, error_code); } @@ -1901,6 +2025,7 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { ++vcpu->stat.irq_exits; + KVMTRACE_1D(INTR, vcpu, vmcs_read32(VM_EXIT_INTR_INFO), handler); return 1; } @@ -1958,25 +2083,27 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ + KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg], + (u32)((u64)vcpu->arch.regs[reg] >> 32), handler); switch (cr) { case 0: vcpu_load_rsp_rip(vcpu); - set_cr0(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr0(vcpu, vcpu->arch.regs[reg]); skip_emulated_instruction(vcpu); return 1; case 3: vcpu_load_rsp_rip(vcpu); - set_cr3(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr3(vcpu, vcpu->arch.regs[reg]); skip_emulated_instruction(vcpu); return 1; case 4: vcpu_load_rsp_rip(vcpu); - set_cr4(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr4(vcpu, vcpu->arch.regs[reg]); skip_emulated_instruction(vcpu); return 1; case 8: vcpu_load_rsp_rip(vcpu); - set_cr8(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr8(vcpu, vcpu->arch.regs[reg]); skip_emulated_instruction(vcpu); if (irqchip_in_kernel(vcpu->kvm)) return 1; @@ -1990,6 +2117,7 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu->arch.cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); vmx_fpu_activate(vcpu); + KVMTRACE_0D(CLTS, vcpu, handler); skip_emulated_instruction(vcpu); return 1; case 1: /*mov from cr*/ @@ -1998,18 +2126,24 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu_load_rsp_rip(vcpu); vcpu->arch.regs[reg] = vcpu->arch.cr3; vcpu_put_rsp_rip(vcpu); + KVMTRACE_3D(CR_READ, vcpu, (u32)cr, + (u32)vcpu->arch.regs[reg], + (u32)((u64)vcpu->arch.regs[reg] >> 32), + handler); skip_emulated_instruction(vcpu); return 1; case 8: vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = get_cr8(vcpu); + vcpu->arch.regs[reg] = kvm_get_cr8(vcpu); vcpu_put_rsp_rip(vcpu); + KVMTRACE_2D(CR_READ, vcpu, (u32)cr, + (u32)vcpu->arch.regs[reg], handler); skip_emulated_instruction(vcpu); return 1; } break; case 3: /* lmsw */ - lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); + kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); skip_emulated_instruction(vcpu); return 1; @@ -2049,6 +2183,7 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) val = 0; } vcpu->arch.regs[reg] = val; + KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); } else { /* mov to dr */ } @@ -2073,6 +2208,9 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } + KVMTRACE_3D(MSR_READ, vcpu, ecx, (u32)data, (u32)(data >> 32), + handler); + /* FIXME: handling of bits 32:63 of rax, rdx */ vcpu->arch.regs[VCPU_REGS_RAX] = data & -1u; vcpu->arch.regs[VCPU_REGS_RDX] = (data >> 32) & -1u; @@ -2086,6 +2224,9 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); + KVMTRACE_3D(MSR_WRITE, vcpu, ecx, (u32)data, (u32)(data >> 32), + handler); + if (vmx_set_msr(vcpu, ecx, data) != 0) { kvm_inject_gp(vcpu, 0); return 1; @@ -2110,6 +2251,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu, cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + + KVMTRACE_0D(PEND_INTR, vcpu, handler); + /* * If the user space waits to inject interrupts, exit as soon as * possible @@ -2152,6 +2296,8 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) exit_qualification = vmcs_read64(EXIT_QUALIFICATION); offset = exit_qualification & 0xffful; + KVMTRACE_1D(APIC_ACCESS, vcpu, (u32)offset, handler); + er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); if (er != EMULATE_DONE) { @@ -2163,6 +2309,20 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } +static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + unsigned long exit_qualification; + u16 tss_selector; + int reason; + + exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + + reason = (u32)exit_qualification >> 30; + tss_selector = exit_qualification; + + return kvm_task_switch(vcpu, tss_selector, reason); +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -2185,6 +2345,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, [EXIT_REASON_WBINVD] = handle_wbinvd, + [EXIT_REASON_TASK_SWITCH] = handle_task_switch, }; static const int kvm_vmx_max_exit_handlers = @@ -2200,6 +2361,9 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vectoring_info = vmx->idt_vectoring_info; + KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), + (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); + if (unlikely(vmx->fail)) { kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason @@ -2210,7 +2374,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if ((vectoring_info & VECTORING_INFO_VALID_MASK) && exit_reason != EXIT_REASON_EXCEPTION_NMI) printk(KERN_WARNING "%s: unexpected, valid vectoring info and " - "exit reason is 0x%x\n", __FUNCTION__, exit_reason); + "exit reason is 0x%x\n", __func__, exit_reason); if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); @@ -2221,10 +2385,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) return 0; } -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) -{ -} - static void update_tpr_threshold(struct kvm_vcpu *vcpu) { int max_irr, tpr; @@ -2285,11 +2445,13 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) return; } + KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); - if (unlikely(idtv_info_field & INTR_INFO_DELIEVER_CODE_MASK)) + if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, vmcs_read32(IDT_VECTORING_ERROR_CODE)); if (unlikely(has_ext_irq)) @@ -2470,8 +2632,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) intr_info = vmcs_read32(VM_EXIT_INTR_INFO); /* We need to handle NMIs before interrupts are enabled */ - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */ + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ + KVMTRACE_0D(NMI, vcpu, handler); asm("int $2"); + } } static void vmx_free_vmcs(struct kvm_vcpu *vcpu) @@ -2489,6 +2653,10 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + spin_lock(&vmx_vpid_lock); + if (vmx->vpid != 0) + __clear_bit(vmx->vpid, vmx_vpid_bitmap); + spin_unlock(&vmx_vpid_lock); vmx_free_vmcs(vcpu); kfree(vmx->host_msrs); kfree(vmx->guest_msrs); @@ -2505,6 +2673,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!vmx) return ERR_PTR(-ENOMEM); + allocate_vpid(vmx); + err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) goto free_vcpu; @@ -2591,14 +2761,13 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_segment_base = vmx_get_segment_base, .get_segment = vmx_get_segment, .set_segment = vmx_set_segment, + .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, .set_cr4 = vmx_set_cr4, -#ifdef CONFIG_X86_64 .set_efer = vmx_set_efer, -#endif .get_idt = vmx_get_idt, .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, @@ -2626,7 +2795,7 @@ static struct kvm_x86_ops vmx_x86_ops = { static int __init vmx_init(void) { - void *iova; + void *va; int r; vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); @@ -2639,28 +2808,48 @@ static int __init vmx_init(void) goto out; } + vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!vmx_msr_bitmap) { + r = -ENOMEM; + goto out1; + } + /* * Allow direct access to the PC debug port (it is often used for I/O * delays, but the vmexits simply slow things down). */ - iova = kmap(vmx_io_bitmap_a); - memset(iova, 0xff, PAGE_SIZE); - clear_bit(0x80, iova); + va = kmap(vmx_io_bitmap_a); + memset(va, 0xff, PAGE_SIZE); + clear_bit(0x80, va); kunmap(vmx_io_bitmap_a); - iova = kmap(vmx_io_bitmap_b); - memset(iova, 0xff, PAGE_SIZE); + va = kmap(vmx_io_bitmap_b); + memset(va, 0xff, PAGE_SIZE); kunmap(vmx_io_bitmap_b); + va = kmap(vmx_msr_bitmap); + memset(va, 0xff, PAGE_SIZE); + kunmap(vmx_msr_bitmap); + + set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); if (r) - goto out1; + goto out2; + + vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE); + vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE); + vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS); + vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); + vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); if (bypass_guest_pf) kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); return 0; +out2: + __free_page(vmx_msr_bitmap); out1: __free_page(vmx_io_bitmap_b); out: @@ -2670,6 +2859,7 @@ out: static void __exit vmx_exit(void) { + __free_page(vmx_msr_bitmap); __free_page(vmx_io_bitmap_b); __free_page(vmx_io_bitmap_a); diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index d52ae8d7303d..5dff4606b988 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -49,6 +49,7 @@ * Definitions of Secondary Processor-Based VM-Execution Controls. */ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 +#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 @@ -65,6 +66,7 @@ /* VMCS Encodings */ enum vmcs_field { + VIRTUAL_PROCESSOR_ID = 0x00000000, GUEST_ES_SELECTOR = 0x00000800, GUEST_CS_SELECTOR = 0x00000802, GUEST_SS_SELECTOR = 0x00000804, @@ -231,12 +233,12 @@ enum vmcs_field { */ #define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ #define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ -#define INTR_INFO_DELIEVER_CODE_MASK 0x800 /* 11 */ +#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */ #define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ #define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK #define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK -#define VECTORING_INFO_DELIEVER_CODE_MASK INTR_INFO_DELIEVER_CODE_MASK +#define VECTORING_INFO_DELIVER_CODE_MASK INTR_INFO_DELIVER_CODE_MASK #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ @@ -321,4 +323,8 @@ enum vmcs_field { #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 +#define VMX_NR_VPIDS (1 << 16) +#define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 +#define VMX_VPID_EXTENT_ALL_CONTEXT 2 + #endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6b01552bd1f1..0ce556372a4d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -15,10 +15,12 @@ */ #include <linux/kvm_host.h> -#include "segment_descriptor.h" #include "irq.h" #include "mmu.h" +#include "i8254.h" +#include "tss.h" +#include <linux/clocksource.h> #include <linux/kvm.h> #include <linux/fs.h> #include <linux/vmalloc.h> @@ -28,6 +30,7 @@ #include <asm/uaccess.h> #include <asm/msr.h> +#include <asm/desc.h> #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -41,7 +44,15 @@ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) -#define EFER_RESERVED_BITS 0xfffffffffffff2fe +/* EFER defaults: + * - enable syscall per default because its emulated by KVM + * - enable LME and LMA per default on 64 bit KVM + */ +#ifdef CONFIG_X86_64 +static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; +#else +static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; +#endif #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU @@ -63,6 +74,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "irq_window", VCPU_STAT(irq_window_exits) }, { "halt_exits", VCPU_STAT(halt_exits) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, + { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, { "irq_exits", VCPU_STAT(irq_exits) }, { "host_state_reload", VCPU_STAT(host_state_reload) }, @@ -78,6 +90,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_recycled", VM_STAT(mmu_recycled) }, { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, + { "largepages", VM_STAT(lpages) }, { NULL } }; @@ -85,7 +98,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { unsigned long segment_base(u16 selector) { struct descriptor_table gdt; - struct segment_descriptor *d; + struct desc_struct *d; unsigned long table_base; unsigned long v; @@ -101,13 +114,12 @@ unsigned long segment_base(u16 selector) asm("sldt %0" : "=g"(ldt_selector)); table_base = segment_base(ldt_selector); } - d = (struct segment_descriptor *)(table_base + (selector & ~7)); - v = d->base_low | ((unsigned long)d->base_mid << 16) | - ((unsigned long)d->base_high << 24); + d = (struct desc_struct *)(table_base + (selector & ~7)); + v = d->base0 | ((unsigned long)d->base1 << 16) | + ((unsigned long)d->base2 << 24); #ifdef CONFIG_X86_64 - if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) - v |= ((unsigned long) \ - ((struct segment_descriptor_64 *)d)->base_higher) << 32; + if (d->s == 0 && (d->type == 2 || d->type == 9 || d->type == 11)) + v |= ((unsigned long)((struct ldttss_desc64 *)d)->base3) << 32; #endif return v; } @@ -145,11 +157,16 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, u32 error_code) { ++vcpu->stat.pf_guest; - if (vcpu->arch.exception.pending && vcpu->arch.exception.nr == PF_VECTOR) { - printk(KERN_DEBUG "kvm: inject_page_fault:" - " double fault 0x%lx\n", addr); - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; + if (vcpu->arch.exception.pending) { + if (vcpu->arch.exception.nr == PF_VECTOR) { + printk(KERN_DEBUG "kvm: inject_page_fault:" + " double fault 0x%lx\n", addr); + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + } else if (vcpu->arch.exception.nr == DF_VECTOR) { + /* triple fault -> shutdown */ + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + } return; } vcpu->arch.cr2 = addr; @@ -184,7 +201,6 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) int ret; u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; - down_read(&vcpu->kvm->slots_lock); ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, offset * sizeof(u64), sizeof(pdpte)); if (ret < 0) { @@ -201,10 +217,10 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); out: - up_read(&vcpu->kvm->slots_lock); return ret; } +EXPORT_SYMBOL_GPL(load_pdptrs); static bool pdptrs_changed(struct kvm_vcpu *vcpu) { @@ -215,18 +231,16 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu) if (is_long_mode(vcpu) || !is_pae(vcpu)) return false; - down_read(&vcpu->kvm->slots_lock); r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); if (r < 0) goto out; changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; out: - up_read(&vcpu->kvm->slots_lock); return changed; } -void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { if (cr0 & CR0_RESERVED_BITS) { printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", @@ -284,15 +298,18 @@ void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_mmu_reset_context(vcpu); return; } -EXPORT_SYMBOL_GPL(set_cr0); +EXPORT_SYMBOL_GPL(kvm_set_cr0); -void lmsw(struct kvm_vcpu *vcpu, unsigned long msw) +void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); + kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); + KVMTRACE_1D(LMSW, vcpu, + (u32)((vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)), + handler); } -EXPORT_SYMBOL_GPL(lmsw); +EXPORT_SYMBOL_GPL(kvm_lmsw); -void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & CR4_RESERVED_BITS) { printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); @@ -323,9 +340,9 @@ void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vcpu->arch.cr4 = cr4; kvm_mmu_reset_context(vcpu); } -EXPORT_SYMBOL_GPL(set_cr4); +EXPORT_SYMBOL_GPL(kvm_set_cr4); -void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { kvm_mmu_flush_tlb(vcpu); @@ -359,7 +376,6 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) */ } - down_read(&vcpu->kvm->slots_lock); /* * Does the new cr3 value map to physical memory? (Note, we * catch an invalid cr3 even in real-mode, because it would @@ -375,11 +391,10 @@ void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) vcpu->arch.cr3 = cr3; vcpu->arch.mmu.new_cr3(vcpu); } - up_read(&vcpu->kvm->slots_lock); } -EXPORT_SYMBOL_GPL(set_cr3); +EXPORT_SYMBOL_GPL(kvm_set_cr3); -void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { if (cr8 & CR8_RESERVED_BITS) { printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); @@ -391,16 +406,16 @@ void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) else vcpu->arch.cr8 = cr8; } -EXPORT_SYMBOL_GPL(set_cr8); +EXPORT_SYMBOL_GPL(kvm_set_cr8); -unsigned long get_cr8(struct kvm_vcpu *vcpu) +unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) { if (irqchip_in_kernel(vcpu->kvm)) return kvm_lapic_get_cr8(vcpu); else return vcpu->arch.cr8; } -EXPORT_SYMBOL_GPL(get_cr8); +EXPORT_SYMBOL_GPL(kvm_get_cr8); /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS @@ -415,7 +430,8 @@ static u32 msrs_to_save[] = { #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TIME_STAMP_COUNTER, + MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_IA32_PERF_STATUS, }; static unsigned num_msrs_to_save; @@ -424,11 +440,9 @@ static u32 emulated_msrs[] = { MSR_IA32_MISC_ENABLE, }; -#ifdef CONFIG_X86_64 - static void set_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (efer & EFER_RESERVED_BITS) { + if (efer & efer_reserved_bits) { printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", efer); kvm_inject_gp(vcpu, 0); @@ -450,7 +464,12 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer) vcpu->arch.shadow_efer = efer; } -#endif +void kvm_enable_efer_bits(u64 mask) +{ + efer_reserved_bits &= ~mask; +} +EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); + /* * Writes msr value into into the appropriate "register". @@ -470,26 +489,86 @@ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) return kvm_set_msr(vcpu, index, *data); } +static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) +{ + static int version; + struct kvm_wall_clock wc; + struct timespec wc_ts; + + if (!wall_clock) + return; + + version++; + + kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); + + wc_ts = current_kernel_time(); + wc.wc_sec = wc_ts.tv_sec; + wc.wc_nsec = wc_ts.tv_nsec; + wc.wc_version = version; + + kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc)); + + version++; + kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); +} + +static void kvm_write_guest_time(struct kvm_vcpu *v) +{ + struct timespec ts; + unsigned long flags; + struct kvm_vcpu_arch *vcpu = &v->arch; + void *shared_kaddr; + + if ((!vcpu->time_page)) + return; + + /* Keep irq disabled to prevent changes to the clock */ + local_irq_save(flags); + kvm_get_msr(v, MSR_IA32_TIME_STAMP_COUNTER, + &vcpu->hv_clock.tsc_timestamp); + ktime_get_ts(&ts); + local_irq_restore(flags); + + /* With all the info we got, fill in the values */ + + vcpu->hv_clock.system_time = ts.tv_nsec + + (NSEC_PER_SEC * (u64)ts.tv_sec); + /* + * The interface expects us to write an even number signaling that the + * update is finished. Since the guest won't see the intermediate + * state, we just write "2" at the end + */ + vcpu->hv_clock.version = 2; + + shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); + + memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); + + kunmap_atomic(shared_kaddr, KM_USER0); + + mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { -#ifdef CONFIG_X86_64 case MSR_EFER: set_efer(vcpu, data); break; -#endif case MSR_IA32_MC0_STATUS: pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n", - __FUNCTION__, data); + __func__, data); break; case MSR_IA32_MCG_STATUS: pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n", - __FUNCTION__, data); + __func__, data); break; case MSR_IA32_MCG_CTL: pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", - __FUNCTION__, data); + __func__, data); break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: @@ -501,6 +580,42 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; + case MSR_KVM_WALL_CLOCK: + vcpu->kvm->arch.wall_clock = data; + kvm_write_wall_clock(vcpu->kvm, data); + break; + case MSR_KVM_SYSTEM_TIME: { + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } + + vcpu->arch.time = data; + + /* we verify if the enable bit is set... */ + if (!(data & 1)) + break; + + /* ...but clean it before doing the actual write */ + vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); + + vcpu->arch.hv_clock.tsc_to_system_mul = + clocksource_khz2mult(tsc_khz, 22); + vcpu->arch.hv_clock.tsc_shift = 22; + + down_read(¤t->mm->mmap_sem); + vcpu->arch.time_page = + gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); + up_read(¤t->mm->mmap_sem); + + if (is_error_page(vcpu->arch.time_page)) { + kvm_release_page_clean(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } + + kvm_write_guest_time(vcpu); + break; + } default: pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); return 1; @@ -540,7 +655,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MC0_MISC+12: case MSR_IA32_MC0_MISC+16: case MSR_IA32_UCODE_REV: - case MSR_IA32_PERF_STATUS: case MSR_IA32_EBL_CR_POWERON: /* MTRR registers */ case 0xfe: @@ -556,11 +670,21 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MISC_ENABLE: data = vcpu->arch.ia32_misc_enable_msr; break; -#ifdef CONFIG_X86_64 + case MSR_IA32_PERF_STATUS: + /* TSC increment by tick */ + data = 1000ULL; + /* CPU multiplier */ + data |= (((uint64_t)4ULL) << 40); + break; case MSR_EFER: data = vcpu->arch.shadow_efer; break; -#endif + case MSR_KVM_WALL_CLOCK: + data = vcpu->kvm->arch.wall_clock; + break; + case MSR_KVM_SYSTEM_TIME: + data = vcpu->arch.time; + break; default: pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; @@ -584,9 +708,11 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, vcpu_load(vcpu); + down_read(&vcpu->kvm->slots_lock); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; + up_read(&vcpu->kvm->slots_lock); vcpu_put(vcpu); @@ -688,11 +814,24 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_USER_MEMORY: case KVM_CAP_SET_TSS_ADDR: case KVM_CAP_EXT_CPUID: + case KVM_CAP_CLOCKSOURCE: + case KVM_CAP_PIT: + case KVM_CAP_NOP_IO_DELAY: + case KVM_CAP_MP_STATE: r = 1; break; case KVM_CAP_VAPIC: r = !kvm_x86_ops->cpu_has_accelerated_tpr(); break; + case KVM_CAP_NR_VCPUS: + r = KVM_MAX_VCPUS; + break; + case KVM_CAP_NR_MEMSLOTS: + r = KVM_MEMORY_SLOTS; + break; + case KVM_CAP_PV_MMU: + r = !tdp_enabled; + break; default: r = 0; break; @@ -763,6 +902,7 @@ out: void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { kvm_x86_ops->vcpu_load(vcpu, cpu); + kvm_write_guest_time(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -958,32 +1098,32 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, } /* function 4 and 0xb have additional index. */ case 4: { - int index, cache_type; + int i, cache_type; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* read more entries until cache_type is zero */ - for (index = 1; *nent < maxnent; ++index) { - cache_type = entry[index - 1].eax & 0x1f; + for (i = 1; *nent < maxnent; ++i) { + cache_type = entry[i - 1].eax & 0x1f; if (!cache_type) break; - do_cpuid_1_ent(&entry[index], function, index); - entry[index].flags |= + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; } break; } case 0xb: { - int index, level_type; + int i, level_type; entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* read more entries until level_type is zero */ - for (index = 1; *nent < maxnent; ++index) { - level_type = entry[index - 1].ecx & 0xff; + for (i = 1; *nent < maxnent; ++i) { + level_type = entry[i - 1].ecx & 0xff; if (!level_type) break; - do_cpuid_1_ent(&entry[index], function, index); - entry[index].flags |= + do_cpuid_1_ent(&entry[i], function, i); + entry[i].flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; ++*nent; } @@ -1365,6 +1505,23 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) return r; } +static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) +{ + int r = 0; + + memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state)); + return r; +} + +static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) +{ + int r = 0; + + memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state)); + kvm_pit_load_count(kvm, 0, ps->channels[0].count); + return r; +} + /* * Get (and clear) the dirty memory log for a memory slot. */ @@ -1457,6 +1614,12 @@ long kvm_arch_vm_ioctl(struct file *filp, } else goto out; break; + case KVM_CREATE_PIT: + r = -ENOMEM; + kvm->arch.vpit = kvm_create_pit(kvm); + if (kvm->arch.vpit) + r = 0; + break; case KVM_IRQ_LINE: { struct kvm_irq_level irq_event; @@ -1512,6 +1675,37 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_GET_PIT: { + struct kvm_pit_state ps; + r = -EFAULT; + if (copy_from_user(&ps, argp, sizeof ps)) + goto out; + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; + r = kvm_vm_ioctl_get_pit(kvm, &ps); + if (r) + goto out; + r = -EFAULT; + if (copy_to_user(argp, &ps, sizeof ps)) + goto out; + r = 0; + break; + } + case KVM_SET_PIT: { + struct kvm_pit_state ps; + r = -EFAULT; + if (copy_from_user(&ps, argp, sizeof ps)) + goto out; + r = -ENXIO; + if (!kvm->arch.vpit) + goto out; + r = kvm_vm_ioctl_set_pit(kvm, &ps); + if (r) + goto out; + r = 0; + break; + } default: ; } @@ -1570,7 +1764,6 @@ int emulator_read_std(unsigned long addr, void *data = val; int r = X86EMUL_CONTINUE; - down_read(&vcpu->kvm->slots_lock); while (bytes) { gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); unsigned offset = addr & (PAGE_SIZE-1); @@ -1592,7 +1785,6 @@ int emulator_read_std(unsigned long addr, addr += tocopy; } out: - up_read(&vcpu->kvm->slots_lock); return r; } EXPORT_SYMBOL_GPL(emulator_read_std); @@ -1611,9 +1803,7 @@ static int emulator_read_emulated(unsigned long addr, return X86EMUL_CONTINUE; } - down_read(&vcpu->kvm->slots_lock); gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); - up_read(&vcpu->kvm->slots_lock); /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -1646,19 +1836,15 @@ mmio: return X86EMUL_UNHANDLEABLE; } -static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, - const void *val, int bytes) +int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, + const void *val, int bytes) { int ret; - down_read(&vcpu->kvm->slots_lock); ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); - if (ret < 0) { - up_read(&vcpu->kvm->slots_lock); + if (ret < 0) return 0; - } kvm_mmu_pte_write(vcpu, gpa, val, bytes); - up_read(&vcpu->kvm->slots_lock); return 1; } @@ -1670,9 +1856,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_io_device *mmio_dev; gpa_t gpa; - down_read(&vcpu->kvm->slots_lock); gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); - up_read(&vcpu->kvm->slots_lock); if (gpa == UNMAPPED_GVA) { kvm_inject_page_fault(vcpu, addr, 2); @@ -1749,7 +1933,6 @@ static int emulator_cmpxchg_emulated(unsigned long addr, char *kaddr; u64 val; - down_read(&vcpu->kvm->slots_lock); gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); if (gpa == UNMAPPED_GVA || @@ -1769,9 +1952,8 @@ static int emulator_cmpxchg_emulated(unsigned long addr, set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); kunmap_atomic(kaddr, KM_USER0); kvm_release_page_dirty(page); - emul_write: - up_read(&vcpu->kvm->slots_lock); } +emul_write: #endif return emulator_write_emulated(addr, new, bytes, vcpu); @@ -1802,7 +1984,7 @@ int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) *dest = kvm_x86_ops->get_dr(vcpu, dr); return X86EMUL_CONTINUE; default: - pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr); + pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr); return X86EMUL_UNHANDLEABLE; } } @@ -1840,7 +2022,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) } EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); -struct x86_emulate_ops emulate_ops = { +static struct x86_emulate_ops emulate_ops = { .read_std = emulator_read_std, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, @@ -2091,6 +2273,13 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.guest_page_offset = 0; vcpu->arch.pio.rep = 0; + if (vcpu->run->io.direction == KVM_EXIT_IO_IN) + KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, + handler); + else + KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, + handler); + kvm_x86_ops->cache_regs(vcpu); memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); kvm_x86_ops->decache_regs(vcpu); @@ -2129,6 +2318,13 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, vcpu->arch.pio.guest_page_offset = offset_in_page(address); vcpu->arch.pio.rep = rep; + if (vcpu->run->io.direction == KVM_EXIT_IO_IN) + KVMTRACE_2D(IO_READ, vcpu, vcpu->run->io.port, (u32)size, + handler); + else + KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, + handler); + if (!count) { kvm_x86_ops->skip_emulated_instruction(vcpu); return 1; @@ -2163,10 +2359,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, kvm_x86_ops->skip_emulated_instruction(vcpu); for (i = 0; i < nr_pages; ++i) { - down_read(&vcpu->kvm->slots_lock); page = gva_to_page(vcpu, address + i * PAGE_SIZE); vcpu->arch.pio.guest_pages[i] = page; - up_read(&vcpu->kvm->slots_lock); if (!page) { kvm_inject_gp(vcpu, 0); free_pio_guest_pages(vcpu); @@ -2238,10 +2432,13 @@ void kvm_arch_exit(void) int kvm_emulate_halt(struct kvm_vcpu *vcpu) { ++vcpu->stat.halt_exits; + KVMTRACE_0D(HLT, vcpu, handler); if (irqchip_in_kernel(vcpu->kvm)) { - vcpu->arch.mp_state = VCPU_MP_STATE_HALTED; + vcpu->arch.mp_state = KVM_MP_STATE_HALTED; + up_read(&vcpu->kvm->slots_lock); kvm_vcpu_block(vcpu); - if (vcpu->arch.mp_state != VCPU_MP_STATE_RUNNABLE) + down_read(&vcpu->kvm->slots_lock); + if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) return -EINTR; return 1; } else { @@ -2251,9 +2448,19 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_halt); +static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0, + unsigned long a1) +{ + if (is_long_mode(vcpu)) + return a0; + else + return a0 | ((gpa_t)a1 << 32); +} + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; + int r = 1; kvm_x86_ops->cache_regs(vcpu); @@ -2263,6 +2470,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) a2 = vcpu->arch.regs[VCPU_REGS_RDX]; a3 = vcpu->arch.regs[VCPU_REGS_RSI]; + KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); + if (!is_long_mode(vcpu)) { nr &= 0xFFFFFFFF; a0 &= 0xFFFFFFFF; @@ -2275,13 +2484,17 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) case KVM_HC_VAPIC_POLL_IRQ: ret = 0; break; + case KVM_HC_MMU_OP: + r = kvm_pv_mmu_op(vcpu, a0, hc_gpa(vcpu, a1, a2), &ret); + break; default: ret = -KVM_ENOSYS; break; } vcpu->arch.regs[VCPU_REGS_RAX] = ret; kvm_x86_ops->decache_regs(vcpu); - return 0; + ++vcpu->stat.hypercalls; + return r; } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); @@ -2329,7 +2542,7 @@ void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, unsigned long *rflags) { - lmsw(vcpu, msw); + kvm_lmsw(vcpu, msw); *rflags = kvm_x86_ops->get_rflags(vcpu); } @@ -2346,9 +2559,9 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) case 4: return vcpu->arch.cr4; case 8: - return get_cr8(vcpu); + return kvm_get_cr8(vcpu); default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); return 0; } } @@ -2358,23 +2571,23 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, { switch (cr) { case 0: - set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); + kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); *rflags = kvm_x86_ops->get_rflags(vcpu); break; case 2: vcpu->arch.cr2 = val; break; case 3: - set_cr3(vcpu, val); + kvm_set_cr3(vcpu, val); break; case 4: - set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); + kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); break; case 8: - set_cr8(vcpu, val & 0xfUL); + kvm_set_cr8(vcpu, val & 0xfUL); break; default: - vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr); + vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); } } @@ -2447,6 +2660,11 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) } kvm_x86_ops->decache_regs(vcpu); kvm_x86_ops->skip_emulated_instruction(vcpu); + KVMTRACE_5D(CPUID, vcpu, function, + (u32)vcpu->arch.regs[VCPU_REGS_RAX], + (u32)vcpu->arch.regs[VCPU_REGS_RBX], + (u32)vcpu->arch.regs[VCPU_REGS_RCX], + (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); @@ -2469,7 +2687,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; - kvm_run->cr8 = get_cr8(vcpu); + kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); if (irqchip_in_kernel(vcpu->kvm)) kvm_run->ready_for_interrupt_injection = 1; @@ -2509,16 +2727,17 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; - if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) { + if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { pr_debug("vcpu %d received sipi with vector # %x\n", vcpu->vcpu_id, vcpu->arch.sipi_vector); kvm_lapic_reset(vcpu); r = kvm_x86_ops->vcpu_reset(vcpu); if (r) return r; - vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } + down_read(&vcpu->kvm->slots_lock); vapic_enter(vcpu); preempted: @@ -2526,6 +2745,10 @@ preempted: kvm_x86_ops->guest_debug_pre(vcpu); again: + if (vcpu->requests) + if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) + kvm_mmu_unload(vcpu); + r = kvm_mmu_reload(vcpu); if (unlikely(r)) goto out; @@ -2539,6 +2762,11 @@ again: r = 0; goto out; } + if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { + kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; + r = 0; + goto out; + } } kvm_inject_pending_timer_irqs(vcpu); @@ -2557,6 +2785,14 @@ again: goto out; } + if (vcpu->requests) + if (test_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) { + local_irq_enable(); + preempt_enable(); + r = 1; + goto out; + } + if (signal_pending(current)) { local_irq_enable(); preempt_enable(); @@ -2566,6 +2802,13 @@ again: goto out; } + vcpu->guest_mode = 1; + /* + * Make sure that guest_mode assignment won't happen after + * testing the pending IRQ vector bitmap. + */ + smp_wmb(); + if (vcpu->arch.exception.pending) __queue_exception(vcpu); else if (irqchip_in_kernel(vcpu->kvm)) @@ -2575,13 +2818,15 @@ again: kvm_lapic_sync_to_vapic(vcpu); - vcpu->guest_mode = 1; + up_read(&vcpu->kvm->slots_lock); + kvm_guest_enter(); if (vcpu->requests) if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) kvm_x86_ops->tlb_flush(vcpu); + KVMTRACE_0D(VMENTRY, vcpu, entryexit); kvm_x86_ops->run(vcpu, kvm_run); vcpu->guest_mode = 0; @@ -2601,6 +2846,8 @@ again: preempt_enable(); + down_read(&vcpu->kvm->slots_lock); + /* * Profile KVM exit RIPs: */ @@ -2628,14 +2875,18 @@ again: } out: + up_read(&vcpu->kvm->slots_lock); if (r > 0) { kvm_resched(vcpu); + down_read(&vcpu->kvm->slots_lock); goto preempted; } post_kvm_run_save(vcpu, kvm_run); + down_read(&vcpu->kvm->slots_lock); vapic_exit(vcpu); + up_read(&vcpu->kvm->slots_lock); return r; } @@ -2647,7 +2898,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu_load(vcpu); - if (unlikely(vcpu->arch.mp_state == VCPU_MP_STATE_UNINITIALIZED)) { + if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); vcpu_put(vcpu); return -EAGAIN; @@ -2658,7 +2909,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* re-sync apic's tpr */ if (!irqchip_in_kernel(vcpu->kvm)) - set_cr8(vcpu, kvm_run->cr8); + kvm_set_cr8(vcpu, kvm_run->cr8); if (vcpu->arch.pio.cur_count) { r = complete_pio(vcpu); @@ -2670,9 +2921,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); vcpu->mmio_read_completed = 1; vcpu->mmio_needed = 0; + + down_read(&vcpu->kvm->slots_lock); r = emulate_instruction(vcpu, kvm_run, vcpu->arch.mmio_fault_cr2, 0, EMULTYPE_NO_DECODE); + up_read(&vcpu->kvm->slots_lock); if (r == EMULATE_DO_MMIO) { /* * Read-modify-write. Back to userspace. @@ -2773,7 +3027,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) static void get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - return kvm_x86_ops->get_segment(vcpu, var, seg); + kvm_x86_ops->get_segment(vcpu, var, seg); } void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -2816,7 +3070,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, sregs->cr2 = vcpu->arch.cr2; sregs->cr3 = vcpu->arch.cr3; sregs->cr4 = vcpu->arch.cr4; - sregs->cr8 = get_cr8(vcpu); + sregs->cr8 = kvm_get_cr8(vcpu); sregs->efer = vcpu->arch.shadow_efer; sregs->apic_base = kvm_get_apic_base(vcpu); @@ -2836,12 +3090,438 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, return 0; } +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu_load(vcpu); + mp_state->mp_state = vcpu->arch.mp_state; + vcpu_put(vcpu); + return 0; +} + +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + vcpu_load(vcpu); + vcpu->arch.mp_state = mp_state->mp_state; + vcpu_put(vcpu); + return 0; +} + static void set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { - return kvm_x86_ops->set_segment(vcpu, var, seg); + kvm_x86_ops->set_segment(vcpu, var, seg); +} + +static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, + struct kvm_segment *kvm_desct) +{ + kvm_desct->base = seg_desc->base0; + kvm_desct->base |= seg_desc->base1 << 16; + kvm_desct->base |= seg_desc->base2 << 24; + kvm_desct->limit = seg_desc->limit0; + kvm_desct->limit |= seg_desc->limit << 16; + kvm_desct->selector = selector; + kvm_desct->type = seg_desc->type; + kvm_desct->present = seg_desc->p; + kvm_desct->dpl = seg_desc->dpl; + kvm_desct->db = seg_desc->d; + kvm_desct->s = seg_desc->s; + kvm_desct->l = seg_desc->l; + kvm_desct->g = seg_desc->g; + kvm_desct->avl = seg_desc->avl; + if (!selector) + kvm_desct->unusable = 1; + else + kvm_desct->unusable = 0; + kvm_desct->padding = 0; +} + +static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, + u16 selector, + struct descriptor_table *dtable) +{ + if (selector & 1 << 2) { + struct kvm_segment kvm_seg; + + get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); + + if (kvm_seg.unusable) + dtable->limit = 0; + else + dtable->limit = kvm_seg.limit; + dtable->base = kvm_seg.base; + } + else + kvm_x86_ops->get_gdt(vcpu, dtable); +} + +/* allowed just for 8 bytes segments */ +static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, + struct desc_struct *seg_desc) +{ + struct descriptor_table dtable; + u16 index = selector >> 3; + + get_segment_descritptor_dtable(vcpu, selector, &dtable); + + if (dtable.limit < index * 8 + 7) { + kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); + return 1; + } + return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); +} + +/* allowed just for 8 bytes segments */ +static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, + struct desc_struct *seg_desc) +{ + struct descriptor_table dtable; + u16 index = selector >> 3; + + get_segment_descritptor_dtable(vcpu, selector, &dtable); + + if (dtable.limit < index * 8 + 7) + return 1; + return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); +} + +static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, + struct desc_struct *seg_desc) +{ + u32 base_addr; + + base_addr = seg_desc->base0; + base_addr |= (seg_desc->base1 << 16); + base_addr |= (seg_desc->base2 << 24); + + return base_addr; +} + +static int load_tss_segment32(struct kvm_vcpu *vcpu, + struct desc_struct *seg_desc, + struct tss_segment_32 *tss) +{ + u32 base_addr; + + base_addr = get_tss_base_addr(vcpu, seg_desc); + + return kvm_read_guest(vcpu->kvm, base_addr, tss, + sizeof(struct tss_segment_32)); +} + +static int save_tss_segment32(struct kvm_vcpu *vcpu, + struct desc_struct *seg_desc, + struct tss_segment_32 *tss) +{ + u32 base_addr; + + base_addr = get_tss_base_addr(vcpu, seg_desc); + + return kvm_write_guest(vcpu->kvm, base_addr, tss, + sizeof(struct tss_segment_32)); +} + +static int load_tss_segment16(struct kvm_vcpu *vcpu, + struct desc_struct *seg_desc, + struct tss_segment_16 *tss) +{ + u32 base_addr; + + base_addr = get_tss_base_addr(vcpu, seg_desc); + + return kvm_read_guest(vcpu->kvm, base_addr, tss, + sizeof(struct tss_segment_16)); +} + +static int save_tss_segment16(struct kvm_vcpu *vcpu, + struct desc_struct *seg_desc, + struct tss_segment_16 *tss) +{ + u32 base_addr; + + base_addr = get_tss_base_addr(vcpu, seg_desc); + + return kvm_write_guest(vcpu->kvm, base_addr, tss, + sizeof(struct tss_segment_16)); +} + +static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment kvm_seg; + + get_segment(vcpu, &kvm_seg, seg); + return kvm_seg.selector; +} + +static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, + u16 selector, + struct kvm_segment *kvm_seg) +{ + struct desc_struct seg_desc; + + if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) + return 1; + seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); + return 0; +} + +static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, + int type_bits, int seg) +{ + struct kvm_segment kvm_seg; + + if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) + return 1; + kvm_seg.type |= type_bits; + + if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && + seg != VCPU_SREG_LDTR) + if (!kvm_seg.s) + kvm_seg.unusable = 1; + + set_segment(vcpu, &kvm_seg, seg); + return 0; +} + +static void save_state_to_tss32(struct kvm_vcpu *vcpu, + struct tss_segment_32 *tss) +{ + tss->cr3 = vcpu->arch.cr3; + tss->eip = vcpu->arch.rip; + tss->eflags = kvm_x86_ops->get_rflags(vcpu); + tss->eax = vcpu->arch.regs[VCPU_REGS_RAX]; + tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX]; + tss->edx = vcpu->arch.regs[VCPU_REGS_RDX]; + tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX]; + tss->esp = vcpu->arch.regs[VCPU_REGS_RSP]; + tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP]; + tss->esi = vcpu->arch.regs[VCPU_REGS_RSI]; + tss->edi = vcpu->arch.regs[VCPU_REGS_RDI]; + + tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); + tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); + tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); + tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); + tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS); + tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS); + tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); + tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); +} + +static int load_state_from_tss32(struct kvm_vcpu *vcpu, + struct tss_segment_32 *tss) +{ + kvm_set_cr3(vcpu, tss->cr3); + + vcpu->arch.rip = tss->eip; + kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); + + vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax; + vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx; + vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx; + vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx; + vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp; + vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp; + vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; + vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; + + if (load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) + return 1; + + if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + return 1; + + if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + return 1; + + if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + return 1; + + if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + return 1; + + if (load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) + return 1; + + if (load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) + return 1; + return 0; +} + +static void save_state_to_tss16(struct kvm_vcpu *vcpu, + struct tss_segment_16 *tss) +{ + tss->ip = vcpu->arch.rip; + tss->flag = kvm_x86_ops->get_rflags(vcpu); + tss->ax = vcpu->arch.regs[VCPU_REGS_RAX]; + tss->cx = vcpu->arch.regs[VCPU_REGS_RCX]; + tss->dx = vcpu->arch.regs[VCPU_REGS_RDX]; + tss->bx = vcpu->arch.regs[VCPU_REGS_RBX]; + tss->sp = vcpu->arch.regs[VCPU_REGS_RSP]; + tss->bp = vcpu->arch.regs[VCPU_REGS_RBP]; + tss->si = vcpu->arch.regs[VCPU_REGS_RSI]; + tss->di = vcpu->arch.regs[VCPU_REGS_RDI]; + + tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); + tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); + tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); + tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); + tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); + tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); +} + +static int load_state_from_tss16(struct kvm_vcpu *vcpu, + struct tss_segment_16 *tss) +{ + vcpu->arch.rip = tss->ip; + kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); + vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax; + vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx; + vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx; + vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx; + vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp; + vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp; + vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; + vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; + + if (load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) + return 1; + + if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + return 1; + + if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + return 1; + + if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + return 1; + + if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + return 1; + return 0; +} + +int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, + struct desc_struct *cseg_desc, + struct desc_struct *nseg_desc) +{ + struct tss_segment_16 tss_segment_16; + int ret = 0; + + if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16)) + goto out; + + save_state_to_tss16(vcpu, &tss_segment_16); + save_tss_segment16(vcpu, cseg_desc, &tss_segment_16); + + if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16)) + goto out; + if (load_state_from_tss16(vcpu, &tss_segment_16)) + goto out; + + ret = 1; +out: + return ret; +} + +int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, + struct desc_struct *cseg_desc, + struct desc_struct *nseg_desc) +{ + struct tss_segment_32 tss_segment_32; + int ret = 0; + + if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32)) + goto out; + + save_state_to_tss32(vcpu, &tss_segment_32); + save_tss_segment32(vcpu, cseg_desc, &tss_segment_32); + + if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32)) + goto out; + if (load_state_from_tss32(vcpu, &tss_segment_32)) + goto out; + + ret = 1; +out: + return ret; } +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) +{ + struct kvm_segment tr_seg; + struct desc_struct cseg_desc; + struct desc_struct nseg_desc; + int ret = 0; + + get_segment(vcpu, &tr_seg, VCPU_SREG_TR); + + if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) + goto out; + + if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc)) + goto out; + + + if (reason != TASK_SWITCH_IRET) { + int cpl; + + cpl = kvm_x86_ops->get_cpl(vcpu); + if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) { + kvm_queue_exception_e(vcpu, GP_VECTOR, 0); + return 1; + } + } + + if (!nseg_desc.p || (nseg_desc.limit0 | nseg_desc.limit << 16) < 0x67) { + kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); + return 1; + } + + if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { + cseg_desc.type &= ~(1 << 8); //clear the B flag + save_guest_segment_descriptor(vcpu, tr_seg.selector, + &cseg_desc); + } + + if (reason == TASK_SWITCH_IRET) { + u32 eflags = kvm_x86_ops->get_rflags(vcpu); + kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); + } + + kvm_x86_ops->skip_emulated_instruction(vcpu); + kvm_x86_ops->cache_regs(vcpu); + + if (nseg_desc.type & 8) + ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc, + &nseg_desc); + else + ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc, + &nseg_desc); + + if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { + u32 eflags = kvm_x86_ops->get_rflags(vcpu); + kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); + } + + if (reason != TASK_SWITCH_IRET) { + nseg_desc.type |= (1 << 8); + save_guest_segment_descriptor(vcpu, tss_selector, + &nseg_desc); + } + + kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); + seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); + tr_seg.type = 11; + set_segment(vcpu, &tr_seg, VCPU_SREG_TR); +out: + kvm_x86_ops->decache_regs(vcpu); + return ret; +} +EXPORT_SYMBOL_GPL(kvm_task_switch); + int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { @@ -2862,12 +3542,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; vcpu->arch.cr3 = sregs->cr3; - set_cr8(vcpu, sregs->cr8); + kvm_set_cr8(vcpu, sregs->cr8); mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; -#ifdef CONFIG_X86_64 kvm_x86_ops->set_efer(vcpu, sregs->efer); -#endif kvm_set_apic_base(vcpu, sregs->apic_base); kvm_x86_ops->decache_cr4_guest_bits(vcpu); @@ -3141,9 +3819,9 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = INVALID_PAGE; if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0) - vcpu->arch.mp_state = VCPU_MP_STATE_RUNNABLE; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; else - vcpu->arch.mp_state = VCPU_MP_STATE_UNINITIALIZED; + vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) { @@ -3175,7 +3853,9 @@ fail: void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { kvm_free_lapic(vcpu); + down_read(&vcpu->kvm->slots_lock); kvm_mmu_destroy(vcpu); + up_read(&vcpu->kvm->slots_lock); free_page((unsigned long)vcpu->arch.pio_data); } @@ -3219,10 +3899,13 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); kvm_free_physmem(kvm); + if (kvm->arch.apic_access_page) + put_page(kvm->arch.apic_access_page); kfree(kvm); } @@ -3278,8 +3961,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm, int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return vcpu->arch.mp_state == VCPU_MP_STATE_RUNNABLE - || vcpu->arch.mp_state == VCPU_MP_STATE_SIPI_RECEIVED; + return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE + || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED; } static void vcpu_kick_intr(void *info) @@ -3293,11 +3976,17 @@ static void vcpu_kick_intr(void *info) void kvm_vcpu_kick(struct kvm_vcpu *vcpu) { int ipi_pcpu = vcpu->cpu; + int cpu = get_cpu(); if (waitqueue_active(&vcpu->wq)) { wake_up_interruptible(&vcpu->wq); ++vcpu->stat.halt_wakeup; } - if (vcpu->guest_mode) + /* + * We may be called synchronously with irqs disabled in guest mode, + * So need not to call smp_call_function_single() in that case. + */ + if (vcpu->guest_mode && vcpu->cpu != cpu) smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); + put_cpu(); } diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 79586003397a..2ca08386f993 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -65,6 +65,14 @@ #define MemAbs (1<<9) /* Memory operand is absolute displacement */ #define String (1<<10) /* String instruction (rep capable) */ #define Stack (1<<11) /* Stack instruction (push/pop) */ +#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ +#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ +#define GroupMask 0xff /* Group number stored in bits 0:7 */ + +enum { + Group1_80, Group1_81, Group1_82, Group1_83, + Group1A, Group3_Byte, Group3, Group4, Group5, Group7, +}; static u16 opcode_table[256] = { /* 0x00 - 0x07 */ @@ -123,14 +131,14 @@ static u16 opcode_table[256] = { ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, /* 0x80 - 0x87 */ - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, + Group | Group1_80, Group | Group1_81, + Group | Group1_82, Group | Group1_83, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - 0, ModRM | DstReg, 0, DstMem | SrcNone | ModRM | Mov | Stack, + 0, ModRM | DstReg, 0, Group | Group1A, /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, @@ -164,16 +172,15 @@ static u16 opcode_table[256] = { 0, 0, 0, 0, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, - ImplicitOps, ImplicitOps, - ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, - 0, 0, ByteOp | DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM + 0, 0, Group | Group4, Group | Group5, }; static u16 twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, SrcMem | ModRM | DstReg, 0, 0, 0, 0, ImplicitOps, 0, + 0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0, ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, @@ -229,6 +236,56 @@ static u16 twobyte_table[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +static u16 group_table[] = { + [Group1_80*8] = + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + [Group1_81*8] = + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + [Group1_82*8] = + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + [Group1_83*8] = + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + [Group1A*8] = + DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, + [Group3_Byte*8] = + ByteOp | SrcImm | DstMem | ModRM, 0, + ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + 0, 0, 0, 0, + [Group3*8] = + DstMem | SrcImm | ModRM | SrcImm, 0, + DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + 0, 0, 0, 0, + [Group4*8] = + ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + 0, 0, 0, 0, 0, 0, + [Group5*8] = + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, + SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, + [Group7*8] = + 0, 0, ModRM | SrcMem, ModRM | SrcMem, + SrcNone | ModRM | DstMem | Mov, 0, + SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, +}; + +static u16 group2_table[] = { + [Group7*8] = + SrcNone | ModRM, 0, 0, 0, + SrcNone | ModRM | DstMem | Mov, 0, + SrcMem16 | ModRM | Mov, 0, +}; + /* EFLAGS bit definitions. */ #define EFLG_OF (1<<11) #define EFLG_DF (1<<10) @@ -317,7 +374,7 @@ static u16 twobyte_table[256] = { #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ do { \ - unsigned long _tmp; \ + unsigned long __tmp; \ switch ((_dst).bytes) { \ case 1: \ __asm__ __volatile__ ( \ @@ -325,7 +382,7 @@ static u16 twobyte_table[256] = { _op"b %"_bx"3,%1; " \ _POST_EFLAGS("0", "4", "2") \ : "=m" (_eflags), "=m" ((_dst).val), \ - "=&r" (_tmp) \ + "=&r" (__tmp) \ : _by ((_src).val), "i" (EFLAGS_MASK)); \ break; \ default: \ @@ -426,29 +483,40 @@ static u16 twobyte_table[256] = { (_type)_x; \ }) +static inline unsigned long ad_mask(struct decode_cache *c) +{ + return (1UL << (c->ad_bytes << 3)) - 1; +} + /* Access/update address held in a register, based on addressing mode. */ -#define address_mask(reg) \ - ((c->ad_bytes == sizeof(unsigned long)) ? \ - (reg) : ((reg) & ((1UL << (c->ad_bytes << 3)) - 1))) -#define register_address(base, reg) \ - ((base) + address_mask(reg)) -#define register_address_increment(reg, inc) \ - do { \ - /* signed type ensures sign extension to long */ \ - int _inc = (inc); \ - if (c->ad_bytes == sizeof(unsigned long)) \ - (reg) += _inc; \ - else \ - (reg) = ((reg) & \ - ~((1UL << (c->ad_bytes << 3)) - 1)) | \ - (((reg) + _inc) & \ - ((1UL << (c->ad_bytes << 3)) - 1)); \ - } while (0) +static inline unsigned long +address_mask(struct decode_cache *c, unsigned long reg) +{ + if (c->ad_bytes == sizeof(unsigned long)) + return reg; + else + return reg & ad_mask(c); +} -#define JMP_REL(rel) \ - do { \ - register_address_increment(c->eip, rel); \ - } while (0) +static inline unsigned long +register_address(struct decode_cache *c, unsigned long base, unsigned long reg) +{ + return base + address_mask(c, reg); +} + +static inline void +register_address_increment(struct decode_cache *c, unsigned long *reg, int inc) +{ + if (c->ad_bytes == sizeof(unsigned long)) + *reg += inc; + else + *reg = (*reg & ~ad_mask(c)) | ((*reg + inc) & ad_mask(c)); +} + +static inline void jmp_rel(struct decode_cache *c, int rel) +{ + register_address_increment(c, &c->eip, rel); +} static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, @@ -763,7 +831,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct decode_cache *c = &ctxt->decode; int rc = 0; int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes; + int def_op_bytes, def_ad_bytes, group; /* Shadow copy of register state. Committed on successful emulation. */ @@ -864,12 +932,24 @@ done_prefixes: c->b = insn_fetch(u8, 1, c->eip); c->d = twobyte_table[c->b]; } + } - /* Unrecognised? */ - if (c->d == 0) { - DPRINTF("Cannot emulate %02x\n", c->b); - return -1; - } + if (c->d & Group) { + group = c->d & GroupMask; + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + + group = (group << 3) + ((c->modrm >> 3) & 7); + if ((c->d & GroupDual) && (c->modrm >> 6) == 3) + c->d = group2_table[group]; + else + c->d = group_table[group]; + } + + /* Unrecognised? */ + if (c->d == 0) { + DPRINTF("Cannot emulate %02x\n", c->b); + return -1; } if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) @@ -924,6 +1004,7 @@ done_prefixes: */ if ((c->d & ModRM) && c->modrm_mod == 3) { c->src.type = OP_REG; + c->src.val = c->modrm_val; break; } c->src.type = OP_MEM; @@ -967,6 +1048,7 @@ done_prefixes: case DstMem: if ((c->d & ModRM) && c->modrm_mod == 3) { c->dst.type = OP_REG; + c->dst.val = c->dst.orig_val = c->modrm_val; break; } c->dst.type = OP_MEM; @@ -984,8 +1066,8 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt) c->dst.type = OP_MEM; c->dst.bytes = c->op_bytes; c->dst.val = c->src.val; - register_address_increment(c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(ctxt->ss_base, + register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); + c->dst.ptr = (void *) register_address(c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]); } @@ -995,13 +1077,13 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - rc = ops->read_std(register_address(ctxt->ss_base, + rc = ops->read_std(register_address(c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]), &c->dst.val, c->dst.bytes, ctxt->vcpu); if (rc != 0) return rc; - register_address_increment(c->regs[VCPU_REGS_RSP], c->dst.bytes); + register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes); return 0; } @@ -1043,26 +1125,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, switch (c->modrm_reg) { case 0 ... 1: /* test */ - /* - * Special case in Grp3: test has an immediate - * source operand. - */ - c->src.type = OP_IMM; - c->src.ptr = (unsigned long *)c->eip; - c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - if (c->src.bytes == 8) - c->src.bytes = 4; - switch (c->src.bytes) { - case 1: - c->src.val = insn_fetch(s8, 1, c->eip); - break; - case 2: - c->src.val = insn_fetch(s16, 2, c->eip); - break; - case 4: - c->src.val = insn_fetch(s32, 4, c->eip); - break; - } emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); break; case 2: /* not */ @@ -1076,7 +1138,6 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, rc = X86EMUL_UNHANDLEABLE; break; } -done: return rc; } @@ -1084,7 +1145,6 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc; switch (c->modrm_reg) { case 0: /* inc */ @@ -1094,36 +1154,11 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, emulate_1op("dec", c->dst, ctxt->eflags); break; case 4: /* jmp abs */ - if (c->b == 0xff) - c->eip = c->dst.val; - else { - DPRINTF("Cannot emulate %02x\n", c->b); - return X86EMUL_UNHANDLEABLE; - } + c->eip = c->src.val; break; case 6: /* push */ - - /* 64-bit mode: PUSH always pushes a 64-bit operand. */ - - if (ctxt->mode == X86EMUL_MODE_PROT64) { - c->dst.bytes = 8; - rc = ops->read_std((unsigned long)c->dst.ptr, - &c->dst.val, 8, ctxt->vcpu); - if (rc != 0) - return rc; - } - register_address_increment(c->regs[VCPU_REGS_RSP], - -c->dst.bytes); - rc = ops->write_emulated(register_address(ctxt->ss_base, - c->regs[VCPU_REGS_RSP]), &c->dst.val, - c->dst.bytes, ctxt->vcpu); - if (rc != 0) - return rc; - c->dst.type = OP_NONE; + emulate_push(ctxt); break; - default: - DPRINTF("Cannot emulate %02x\n", c->b); - return X86EMUL_UNHANDLEABLE; } return 0; } @@ -1361,19 +1396,19 @@ special_insn: c->dst.type = OP_MEM; c->dst.bytes = c->op_bytes; c->dst.val = c->src.val; - register_address_increment(c->regs[VCPU_REGS_RSP], + register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); c->dst.ptr = (void *) register_address( - ctxt->ss_base, c->regs[VCPU_REGS_RSP]); + c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]); break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: - if ((rc = ops->read_std(register_address(ctxt->ss_base, + if ((rc = ops->read_std(register_address(c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]), c->dst.ptr, c->op_bytes, ctxt->vcpu)) != 0) goto done; - register_address_increment(c->regs[VCPU_REGS_RSP], + register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->op_bytes); c->dst.type = OP_NONE; /* Disable writeback. */ break; @@ -1393,9 +1428,9 @@ special_insn: 1, (c->d & ByteOp) ? 1 : c->op_bytes, c->rep_prefix ? - address_mask(c->regs[VCPU_REGS_RCX]) : 1, + address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, (ctxt->eflags & EFLG_DF), - register_address(ctxt->es_base, + register_address(c, ctxt->es_base, c->regs[VCPU_REGS_RDI]), c->rep_prefix, c->regs[VCPU_REGS_RDX]) == 0) { @@ -1409,9 +1444,9 @@ special_insn: 0, (c->d & ByteOp) ? 1 : c->op_bytes, c->rep_prefix ? - address_mask(c->regs[VCPU_REGS_RCX]) : 1, + address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, (ctxt->eflags & EFLG_DF), - register_address(c->override_base ? + register_address(c, c->override_base ? *c->override_base : ctxt->ds_base, c->regs[VCPU_REGS_RSI]), @@ -1425,7 +1460,7 @@ special_insn: int rel = insn_fetch(s8, 1, c->eip); if (test_cc(c->b, ctxt->eflags)) - JMP_REL(rel); + jmp_rel(c, rel); break; } case 0x80 ... 0x83: /* Grp1 */ @@ -1477,7 +1512,7 @@ special_insn: case 0x88 ... 0x8b: /* mov */ goto mov; case 0x8d: /* lea r16/r32, m */ - c->dst.val = c->modrm_val; + c->dst.val = c->modrm_ea; break; case 0x8f: /* pop (sole member of Grp1a) */ rc = emulate_grp1a(ctxt, ops); @@ -1501,27 +1536,27 @@ special_insn: case 0xa4 ... 0xa5: /* movs */ c->dst.type = OP_MEM; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address( + c->dst.ptr = (unsigned long *)register_address(c, ctxt->es_base, c->regs[VCPU_REGS_RDI]); - if ((rc = ops->read_emulated(register_address( + if ((rc = ops->read_emulated(register_address(c, c->override_base ? *c->override_base : ctxt->ds_base, c->regs[VCPU_REGS_RSI]), &c->dst.val, c->dst.bytes, ctxt->vcpu)) != 0) goto done; - register_address_increment(c->regs[VCPU_REGS_RSI], + register_address_increment(c, &c->regs[VCPU_REGS_RSI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes : c->dst.bytes); - register_address_increment(c->regs[VCPU_REGS_RDI], + register_address_increment(c, &c->regs[VCPU_REGS_RDI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes : c->dst.bytes); break; case 0xa6 ... 0xa7: /* cmps */ c->src.type = OP_NONE; /* Disable writeback. */ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->src.ptr = (unsigned long *)register_address( + c->src.ptr = (unsigned long *)register_address(c, c->override_base ? *c->override_base : ctxt->ds_base, c->regs[VCPU_REGS_RSI]); @@ -1533,7 +1568,7 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address( + c->dst.ptr = (unsigned long *)register_address(c, ctxt->es_base, c->regs[VCPU_REGS_RDI]); if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, @@ -1546,10 +1581,10 @@ special_insn: emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - register_address_increment(c->regs[VCPU_REGS_RSI], + register_address_increment(c, &c->regs[VCPU_REGS_RSI], (ctxt->eflags & EFLG_DF) ? -c->src.bytes : c->src.bytes); - register_address_increment(c->regs[VCPU_REGS_RDI], + register_address_increment(c, &c->regs[VCPU_REGS_RDI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes : c->dst.bytes); @@ -1557,11 +1592,11 @@ special_insn: case 0xaa ... 0xab: /* stos */ c->dst.type = OP_MEM; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; - c->dst.ptr = (unsigned long *)register_address( + c->dst.ptr = (unsigned long *)register_address(c, ctxt->es_base, c->regs[VCPU_REGS_RDI]); c->dst.val = c->regs[VCPU_REGS_RAX]; - register_address_increment(c->regs[VCPU_REGS_RDI], + register_address_increment(c, &c->regs[VCPU_REGS_RDI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes : c->dst.bytes); break; @@ -1569,7 +1604,7 @@ special_insn: c->dst.type = OP_REG; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; - if ((rc = ops->read_emulated(register_address( + if ((rc = ops->read_emulated(register_address(c, c->override_base ? *c->override_base : ctxt->ds_base, c->regs[VCPU_REGS_RSI]), @@ -1577,7 +1612,7 @@ special_insn: c->dst.bytes, ctxt->vcpu)) != 0) goto done; - register_address_increment(c->regs[VCPU_REGS_RSI], + register_address_increment(c, &c->regs[VCPU_REGS_RSI], (ctxt->eflags & EFLG_DF) ? -c->dst.bytes : c->dst.bytes); break; @@ -1616,14 +1651,14 @@ special_insn: goto cannot_emulate; } c->src.val = (unsigned long) c->eip; - JMP_REL(rel); + jmp_rel(c, rel); c->op_bytes = c->ad_bytes; emulate_push(ctxt); break; } case 0xe9: /* jmp rel */ case 0xeb: /* jmp rel short */ - JMP_REL(c->src.val); + jmp_rel(c, c->src.val); c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf4: /* hlt */ @@ -1690,6 +1725,8 @@ twobyte_insn: goto done; kvm_emulate_hypercall(ctxt->vcpu); + /* Disable writeback. */ + c->dst.type = OP_NONE; break; case 2: /* lgdt */ rc = read_descriptor(ctxt, ops, c->src.ptr, @@ -1697,6 +1734,8 @@ twobyte_insn: if (rc) goto done; realmode_lgdt(ctxt->vcpu, size, address); + /* Disable writeback. */ + c->dst.type = OP_NONE; break; case 3: /* lidt/vmmcall */ if (c->modrm_mod == 3 && c->modrm_rm == 1) { @@ -1712,27 +1751,25 @@ twobyte_insn: goto done; realmode_lidt(ctxt->vcpu, size, address); } + /* Disable writeback. */ + c->dst.type = OP_NONE; break; case 4: /* smsw */ - if (c->modrm_mod != 3) - goto cannot_emulate; - *(u16 *)&c->regs[c->modrm_rm] - = realmode_get_cr(ctxt->vcpu, 0); + c->dst.bytes = 2; + c->dst.val = realmode_get_cr(ctxt->vcpu, 0); break; case 6: /* lmsw */ - if (c->modrm_mod != 3) - goto cannot_emulate; - realmode_lmsw(ctxt->vcpu, (u16)c->modrm_val, - &ctxt->eflags); + realmode_lmsw(ctxt->vcpu, (u16)c->src.val, + &ctxt->eflags); break; case 7: /* invlpg*/ emulate_invlpg(ctxt->vcpu, memop); + /* Disable writeback. */ + c->dst.type = OP_NONE; break; default: goto cannot_emulate; } - /* Disable writeback. */ - c->dst.type = OP_NONE; break; case 0x06: emulate_clts(ctxt->vcpu); @@ -1823,7 +1860,7 @@ twobyte_insn: goto cannot_emulate; } if (test_cc(c->b, ctxt->eflags)) - JMP_REL(rel); + jmp_rel(c, rel); c->dst.type = OP_NONE; break; } diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 25df1c1989fe..76f60f52a885 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -11,7 +11,7 @@ lib-y += memcpy_$(BITS).o ifeq ($(CONFIG_X86_32),y) lib-y += checksum_32.o lib-y += strstr_32.o - lib-y += bitops_32.o semaphore_32.o string_32.o + lib-y += semaphore_32.o string_32.o lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o else @@ -21,7 +21,6 @@ else lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o lib-y += thunk_64.o clear_page_64.o copy_page_64.o - lib-y += bitops_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o endif diff --git a/arch/x86/lib/bitops_32.c b/arch/x86/lib/bitops_32.c deleted file mode 100644 index b65440459859..000000000000 --- a/arch/x86/lib/bitops_32.c +++ /dev/null @@ -1,70 +0,0 @@ -#include <linux/bitops.h> -#include <linux/module.h> - -/** - * find_next_bit - find the next set bit in a memory region - * @addr: The address to base the search on - * @offset: The bitnumber to start searching at - * @size: The maximum size to search - */ -int find_next_bit(const unsigned long *addr, int size, int offset) -{ - const unsigned long *p = addr + (offset >> 5); - int set = 0, bit = offset & 31, res; - - if (bit) { - /* - * Look for nonzero in the first 32 bits: - */ - __asm__("bsfl %1,%0\n\t" - "jne 1f\n\t" - "movl $32, %0\n" - "1:" - : "=r" (set) - : "r" (*p >> bit)); - if (set < (32 - bit)) - return set + offset; - set = 32 - bit; - p++; - } - /* - * No set bit yet, search remaining full words for a bit - */ - res = find_first_bit (p, size - 32 * (p - addr)); - return (offset + set + res); -} -EXPORT_SYMBOL(find_next_bit); - -/** - * find_next_zero_bit - find the first zero bit in a memory region - * @addr: The address to base the search on - * @offset: The bitnumber to start searching at - * @size: The maximum size to search - */ -int find_next_zero_bit(const unsigned long *addr, int size, int offset) -{ - const unsigned long *p = addr + (offset >> 5); - int set = 0, bit = offset & 31, res; - - if (bit) { - /* - * Look for zero in the first 32 bits. - */ - __asm__("bsfl %1,%0\n\t" - "jne 1f\n\t" - "movl $32, %0\n" - "1:" - : "=r" (set) - : "r" (~(*p >> bit))); - if (set < (32 - bit)) - return set + offset; - set = 32 - bit; - p++; - } - /* - * No zero yet, search remaining full bytes for a zero - */ - res = find_first_zero_bit(p, size - 32 * (p - addr)); - return (offset + set + res); -} -EXPORT_SYMBOL(find_next_zero_bit); diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c deleted file mode 100644 index 0e8f491e6ccc..000000000000 --- a/arch/x86/lib/bitops_64.c +++ /dev/null @@ -1,175 +0,0 @@ -#include <linux/bitops.h> - -#undef find_first_zero_bit -#undef find_next_zero_bit -#undef find_first_bit -#undef find_next_bit - -static inline long -__find_first_zero_bit(const unsigned long * addr, unsigned long size) -{ - long d0, d1, d2; - long res; - - /* - * We must test the size in words, not in bits, because - * otherwise incoming sizes in the range -63..-1 will not run - * any scasq instructions, and then the flags used by the je - * instruction will have whatever random value was in place - * before. Nobody should call us like that, but - * find_next_zero_bit() does when offset and size are at the - * same word and it fails to find a zero itself. - */ - size += 63; - size >>= 6; - if (!size) - return 0; - asm volatile( - " repe; scasq\n" - " je 1f\n" - " xorq -8(%%rdi),%%rax\n" - " subq $8,%%rdi\n" - " bsfq %%rax,%%rdx\n" - "1: subq %[addr],%%rdi\n" - " shlq $3,%%rdi\n" - " addq %%rdi,%%rdx" - :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2) - :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL), - [addr] "S" (addr) : "memory"); - /* - * Any register would do for [addr] above, but GCC tends to - * prefer rbx over rsi, even though rsi is readily available - * and doesn't have to be saved. - */ - return res; -} - -/** - * find_first_zero_bit - find the first zero bit in a memory region - * @addr: The address to start the search at - * @size: The maximum size to search - * - * Returns the bit-number of the first zero bit, not the number of the byte - * containing a bit. - */ -long find_first_zero_bit(const unsigned long * addr, unsigned long size) -{ - return __find_first_zero_bit (addr, size); -} - -/** - * find_next_zero_bit - find the next zero bit in a memory region - * @addr: The address to base the search on - * @offset: The bitnumber to start searching at - * @size: The maximum size to search - */ -long find_next_zero_bit (const unsigned long * addr, long size, long offset) -{ - const unsigned long * p = addr + (offset >> 6); - unsigned long set = 0; - unsigned long res, bit = offset&63; - - if (bit) { - /* - * Look for zero in first word - */ - asm("bsfq %1,%0\n\t" - "cmoveq %2,%0" - : "=r" (set) - : "r" (~(*p >> bit)), "r"(64L)); - if (set < (64 - bit)) - return set + offset; - set = 64 - bit; - p++; - } - /* - * No zero yet, search remaining full words for a zero - */ - res = __find_first_zero_bit (p, size - 64 * (p - addr)); - - return (offset + set + res); -} - -static inline long -__find_first_bit(const unsigned long * addr, unsigned long size) -{ - long d0, d1; - long res; - - /* - * We must test the size in words, not in bits, because - * otherwise incoming sizes in the range -63..-1 will not run - * any scasq instructions, and then the flags used by the jz - * instruction will have whatever random value was in place - * before. Nobody should call us like that, but - * find_next_bit() does when offset and size are at the same - * word and it fails to find a one itself. - */ - size += 63; - size >>= 6; - if (!size) - return 0; - asm volatile( - " repe; scasq\n" - " jz 1f\n" - " subq $8,%%rdi\n" - " bsfq (%%rdi),%%rax\n" - "1: subq %[addr],%%rdi\n" - " shlq $3,%%rdi\n" - " addq %%rdi,%%rax" - :"=a" (res), "=&c" (d0), "=&D" (d1) - :"0" (0ULL), "1" (size), "2" (addr), - [addr] "r" (addr) : "memory"); - return res; -} - -/** - * find_first_bit - find the first set bit in a memory region - * @addr: The address to start the search at - * @size: The maximum size to search - * - * Returns the bit-number of the first set bit, not the number of the byte - * containing a bit. - */ -long find_first_bit(const unsigned long * addr, unsigned long size) -{ - return __find_first_bit(addr,size); -} - -/** - * find_next_bit - find the first set bit in a memory region - * @addr: The address to base the search on - * @offset: The bitnumber to start searching at - * @size: The maximum size to search - */ -long find_next_bit(const unsigned long * addr, long size, long offset) -{ - const unsigned long * p = addr + (offset >> 6); - unsigned long set = 0, bit = offset & 63, res; - - if (bit) { - /* - * Look for nonzero in the first 64 bits: - */ - asm("bsfq %1,%0\n\t" - "cmoveq %2,%0\n\t" - : "=r" (set) - : "r" (*p >> bit), "r" (64L)); - if (set < (64 - bit)) - return set + offset; - set = 64 - bit; - p++; - } - /* - * No set bit yet, search remaining full words for a bit - */ - res = __find_first_bit (p, size - 64 * (p - addr)); - return (offset + set + res); -} - -#include <linux/module.h> - -EXPORT_SYMBOL(find_next_bit); -EXPORT_SYMBOL(find_first_bit); -EXPORT_SYMBOL(find_first_zero_bit); -EXPORT_SYMBOL(find_next_zero_bit); diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index d05722121d24..8acbf0cdf1a5 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -113,7 +113,7 @@ static inline void send_QIC_CPI(__u32 cpuset, __u8 cpi) for_each_online_cpu(cpu) { if (cpuset & (1 << cpu)) { #ifdef VOYAGER_DEBUG - if (!cpu_isset(cpu, cpu_online_map)) + if (!cpu_online(cpu)) VDEBUG(("CPU%d sending cpi %d to CPU%d not in " "cpu_online_map\n", hard_smp_processor_id(), cpi, cpu)); @@ -543,8 +543,8 @@ static void __init do_boot_cpu(__u8 cpu) hijack_source.idt.Offset, stack_start.sp)); /* init lowmem identity mapping */ - clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, - min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS)); + clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); flush_tlb_all(); if (quad_boot) { @@ -683,9 +683,9 @@ void __init smp_boot_cpus(void) * Code added from smpboot.c */ { unsigned long bogosum = 0; - for (i = 0; i < NR_CPUS; i++) - if (cpu_isset(i, cpu_online_map)) - bogosum += cpu_data(i).loops_per_jiffy; + + for_each_online_cpu(i) + bogosum += cpu_data(i).loops_per_jiffy; printk(KERN_INFO "Total of %d processors activated " "(%lu.%02lu BogoMIPS).\n", cpucount + 1, bogosum / (500000 / HZ), @@ -1838,7 +1838,7 @@ static int __cpuinit voyager_cpu_up(unsigned int cpu) return -EIO; /* Unleash the CPU! */ cpu_set(cpu, smp_commenced_mask); - while (!cpu_isset(cpu, cpu_online_map)) + while (!cpu_online(cpu)) mb(); return 0; } diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 20941d2954e2..b7b3e4c7cfc9 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,5 @@ obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o + pat.o pgtable.o obj-$(CONFIG_X86_32) += pgtable_32.o diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9ec62da85fd7..de236e419cb5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -71,7 +71,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); + paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); pud = pud_offset(pgd, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); @@ -100,7 +100,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); } - paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT); + paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); BUG_ON(page_table != pte_offset_kernel(pmd, 0)); } @@ -227,6 +227,25 @@ static inline int page_kills_ppro(unsigned long pagenr) return 0; } +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} + #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; pgprot_t kmap_prot; @@ -268,47 +287,17 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = pte; } -static void __meminit free_new_highpage(struct page *page) -{ - init_page_count(page); - __free_page(page); - totalhigh_pages++; -} - void __init add_one_highpage_init(struct page *page, int pfn, int bad_ppro) { if (page_is_ram(pfn) && !(bad_ppro && page_kills_ppro(pfn))) { ClearPageReserved(page); - free_new_highpage(page); + init_page_count(page); + __free_page(page); + totalhigh_pages++; } else SetPageReserved(page); } -static int __meminit -add_one_highpage_hotplug(struct page *page, unsigned long pfn) -{ - free_new_highpage(page); - totalram_pages++; -#ifdef CONFIG_FLATMEM - max_mapnr = max(pfn, max_mapnr); -#endif - num_physpages++; - - return 0; -} - -/* - * Not currently handling the NUMA case. - * Assuming single node and all memory that - * has been added dynamically that would be - * onlined here is in HIGHMEM. - */ -void __meminit online_page(struct page *page) -{ - ClearPageReserved(page); - add_one_highpage_hotplug(page, page_to_pfn(page)); -} - #ifndef CONFIG_NUMA static void __init set_highmem_pages_init(int bad_ppro) { @@ -365,7 +354,7 @@ void __init native_pagetable_setup_start(pgd_t *base) pte_clear(NULL, va, pte); } - paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT); + paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); } void __init native_pagetable_setup_done(pgd_t *base) @@ -457,7 +446,7 @@ void zap_low_mappings(void) * Note that "pgd_clear()" doesn't do it for * us, because pgd_clear() is a no-op on i386. */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) { + for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) { #ifdef CONFIG_X86_PAE set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); #else @@ -547,9 +536,9 @@ void __init paging_init(void) /* * Test if the WP bit works in supervisor mode. It isn't supported on 386's - * and also on some strange 486's (NexGen etc.). All 586+'s are OK. This - * used to involve black magic jumps to work around some nasty CPU bugs, - * but fortunately the switch to using exceptions got rid of all that. + * and also on some strange 486's. All 586+'s are OK. This used to involve + * black magic jumps to work around some nasty CPU bugs, but fortunately the + * switch to using exceptions got rid of all that. */ static void __init test_wp_bit(void) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ff7906a9a4d..32ba13b0f818 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -135,7 +135,7 @@ static __init void *spp_getpage(void) return ptr; } -static __init void +static void set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) { pgd_t *pgd; @@ -173,7 +173,7 @@ set_pte_phys(unsigned long vaddr, unsigned long phys, pgprot_t prot) new_pte = pfn_pte(phys >> PAGE_SHIFT, prot); pte = pte_offset_kernel(pmd, vaddr); - if (!pte_none(*pte) && + if (!pte_none(*pte) && pte_val(new_pte) && pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) pte_ERROR(*pte); set_pte(pte, new_pte); @@ -214,8 +214,7 @@ void __init cleanup_highmap(void) } /* NOTE: this is meant to be run only at boot */ -void __init -__set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) +void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot) { unsigned long address = __fix_to_virt(idx); @@ -621,15 +620,6 @@ void __init paging_init(void) /* * Memory hotplug specific functions */ -void online_page(struct page *page) -{ - ClearPageReserved(page); - init_page_count(page); - __free_page(page); - totalram_pages++; - num_physpages++; -} - #ifdef CONFIG_MEMORY_HOTPLUG /* * Memory is added always to NORMAL zone. This means you will never get @@ -664,6 +654,26 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif /* CONFIG_MEMORY_HOTPLUG */ +/* + * devmem_is_allowed() checks to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * + * On x86, access has to be given to the first megabyte of ram because that area + * contains bios code and data regions used by X and dosemu and similar apps. + * Access has to be given to non-kernel-ram areas as well, these contain the PCI + * mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pagenr) +{ + if (pagenr <= 256) + return 1; + if (!page_is_ram(pagenr)) + return 1; + return 0; +} + + static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules, kcore_vsyscall; @@ -791,7 +801,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) void __init reserve_bootmem_generic(unsigned long phys, unsigned len) { #ifdef CONFIG_NUMA - int nid = phys_to_nid(phys); + int nid, next_nid; #endif unsigned long pfn = phys >> PAGE_SHIFT; @@ -810,10 +820,16 @@ void __init reserve_bootmem_generic(unsigned long phys, unsigned len) /* Should check here against the e820 map to avoid double free */ #ifdef CONFIG_NUMA - reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); + nid = phys_to_nid(phys); + next_nid = phys_to_nid(phys + len - 1); + if (nid == next_nid) + reserve_bootmem_node(NODE_DATA(nid), phys, len, BOOTMEM_DEFAULT); + else + reserve_bootmem(phys, len, BOOTMEM_DEFAULT); #else reserve_bootmem(phys, len, BOOTMEM_DEFAULT); #endif + if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) { dma_reserve += len / PAGE_SIZE; set_dma_reserve(dma_reserve); @@ -907,6 +923,10 @@ const char *arch_vma_name(struct vm_area_struct *vma) /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. */ +static long __meminitdata addr_start, addr_end; +static void __meminitdata *p_start, *p_end; +static int __meminitdata node_start; + int __meminit vmemmap_populate(struct page *start_page, unsigned long size, int node) { @@ -941,12 +961,32 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node) PAGE_KERNEL_LARGE); set_pmd(pmd, __pmd(pte_val(entry))); - printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n", - addr, addr + PMD_SIZE - 1, p, node); + /* check to see if we have contiguous blocks */ + if (p_end != p || node_start != node) { + if (p_start) + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + addr_start = addr; + node_start = node; + p_start = p; + } + addr_end = addr + PMD_SIZE; + p_end = p + PMD_SIZE; } else { vmemmap_verify((pte_t *)pmd, node, addr, next); } } return 0; } + +void __meminit vmemmap_populate_print_last(void) +{ + if (p_start) { + printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n", + addr_start, addr_end-1, p_start, p_end-1, node_start); + p_start = NULL; + p_end = NULL; + node_start = 0; + } +} #endif diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 3a4baf95e24d..804de18abcc2 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -117,8 +117,8 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size, * have to convert them into an offset in a page-aligned mapping, but the * caller shouldn't need to know that small detail. */ -static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, - unsigned long prot_val) +static void __iomem *__ioremap_caller(resource_size_t phys_addr, + unsigned long size, unsigned long prot_val, void *caller) { unsigned long pfn, offset, vaddr; resource_size_t last_addr; @@ -212,7 +212,7 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, /* * Ok, go for it.. */ - area = get_vm_area(size, VM_IOREMAP); + area = get_vm_area_caller(size, VM_IOREMAP, caller); if (!area) return NULL; area->phys_addr = phys_addr; @@ -255,7 +255,8 @@ static void __iomem *__ioremap(resource_size_t phys_addr, unsigned long size, */ void __iomem *ioremap_nocache(resource_size_t phys_addr, unsigned long size) { - return __ioremap(phys_addr, size, _PAGE_CACHE_UC); + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_UC, + __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_nocache); @@ -272,7 +273,8 @@ EXPORT_SYMBOL(ioremap_nocache); void __iomem *ioremap_wc(unsigned long phys_addr, unsigned long size) { if (pat_wc_enabled) - return __ioremap(phys_addr, size, _PAGE_CACHE_WC); + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WC, + __builtin_return_address(0)); else return ioremap_nocache(phys_addr, size); } @@ -280,7 +282,8 @@ EXPORT_SYMBOL(ioremap_wc); void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) { - return __ioremap(phys_addr, size, _PAGE_CACHE_WB); + return __ioremap_caller(phys_addr, size, _PAGE_CACHE_WB, + __builtin_return_address(0)); } EXPORT_SYMBOL(ioremap_cache); @@ -336,6 +339,35 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap); +/* + * Convert a physical pointer to a virtual kernel pointer for /dev/mem + * access + */ +void *xlate_dev_mem_ptr(unsigned long phys) +{ + void *addr; + unsigned long start = phys & PAGE_MASK; + + /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ + if (page_is_ram(start >> PAGE_SHIFT)) + return __va(phys); + + addr = (void *)ioremap(start, PAGE_SIZE); + if (addr) + addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); + + return addr; +} + +void unxlate_dev_mem_ptr(unsigned long phys, void *addr) +{ + if (page_is_ram(phys >> PAGE_SHIFT)) + return; + + iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); + return; +} + #ifdef CONFIG_X86_32 int __initdata early_ioremap_debug; @@ -407,7 +439,7 @@ void __init early_ioremap_clear(void) pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); pmd_clear(pmd); - paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT); + paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT); __flush_tlb_all(); } diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 9a6892200b27..c5066d519e5d 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -196,6 +196,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long bootmap_start, nodedata_phys; void *bootmap; const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); + int nid; start = round_up(start, ZONE_ALIGN); @@ -218,9 +219,19 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, NODE_DATA(nodeid)->node_start_pfn = start_pfn; NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn; - /* Find a place for the bootmem map */ + /* + * Find a place for the bootmem map + * nodedata_phys could be on other nodes by alloc_bootmem, + * so need to sure bootmap_start not to be small, otherwise + * early_node_mem will get that with find_e820_area instead + * of alloc_bootmem, that could clash with reserved range + */ bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn); - bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); + nid = phys_to_nid(nodedata_phys); + if (nid == nodeid) + bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); + else + bootmap_start = round_up(start, PAGE_SIZE); /* * SMP_CAHCE_BYTES could be enough, but init_bootmem_node like * to use that to align to PAGE_SIZE @@ -245,10 +256,29 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, free_bootmem_with_active_regions(nodeid, end); - reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size, - BOOTMEM_DEFAULT); - reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, - bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); + /* + * convert early reserve to bootmem reserve earlier + * otherwise early_node_mem could use early reserved mem + * on previous node + */ + early_res_to_bootmem(start, end); + + /* + * in some case early_node_mem could use alloc_bootmem + * to get range on other node, don't reserve that again + */ + if (nid != nodeid) + printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid); + else + reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, + pgdat_size, BOOTMEM_DEFAULT); + nid = phys_to_nid(bootmap_start); + if (nid != nodeid) + printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); + else + reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, + bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); + #ifdef CONFIG_ACPI_NUMA srat_reserve_add_area(nodeid); #endif diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index c29ebd037254..bd5e05c654dc 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -483,9 +483,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) goto out_unlock; pbase = (pte_t *)page_address(base); -#ifdef CONFIG_X86_32 - paravirt_alloc_pt(&init_mm, page_to_pfn(base)); -#endif + paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); #ifdef CONFIG_X86_64 diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 72c0f6097402..277446cd30b6 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -11,16 +11,19 @@ #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/fs.h> +#include <linux/bootmem.h> #include <asm/msr.h> #include <asm/tlbflush.h> #include <asm/processor.h> +#include <asm/page.h> #include <asm/pgtable.h> #include <asm/pat.h> #include <asm/e820.h> #include <asm/cacheflush.h> #include <asm/fcntl.h> #include <asm/mtrr.h> +#include <asm/io.h> int pat_wc_enabled = 1; @@ -190,6 +193,21 @@ static int pat_x_mtrr_type(u64 start, u64 end, unsigned long prot, return 0; } +/* + * req_type typically has one of the: + * - _PAGE_CACHE_WB + * - _PAGE_CACHE_WC + * - _PAGE_CACHE_UC_MINUS + * - _PAGE_CACHE_UC + * + * req_type will have a special case value '-1', when requester want to inherit + * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS. + * + * If ret_type is NULL, function will return an error if it cannot reserve the + * region with req_type. If ret_type is non-null, function will return + * available type in ret_type in case of no error. In case of any error + * it will return a negative return value. + */ int reserve_memtype(u64 start, u64 end, unsigned long req_type, unsigned long *ret_type) { @@ -200,9 +218,14 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, /* Only track when pat_wc_enabled */ if (!pat_wc_enabled) { - if (ret_type) - *ret_type = req_type; - + /* This is identical to page table setting without PAT */ + if (ret_type) { + if (req_type == -1) { + *ret_type = _PAGE_CACHE_WB; + } else { + *ret_type = req_type; + } + } return 0; } @@ -214,8 +237,29 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, return 0; } - req_type &= _PAGE_CACHE_MASK; - err = pat_x_mtrr_type(start, end, req_type, &actual_type); + if (req_type == -1) { + /* + * Special case where caller wants to inherit from mtrr or + * existing pat mapping, defaulting to UC_MINUS in case of + * no match. + */ + u8 mtrr_type = mtrr_type_lookup(start, end); + if (mtrr_type == 0xFE) { /* MTRR match error */ + err = -1; + } + + if (mtrr_type == MTRR_TYPE_WRBACK) { + req_type = _PAGE_CACHE_WB; + actual_type = _PAGE_CACHE_WB; + } else { + req_type = _PAGE_CACHE_UC_MINUS; + actual_type = _PAGE_CACHE_UC_MINUS; + } + } else { + req_type &= _PAGE_CACHE_MASK; + err = pat_x_mtrr_type(start, end, req_type, &actual_type); + } + if (err) { if (ret_type) *ret_type = actual_type; @@ -241,7 +285,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, struct memtype *saved_ptr; if (parse->start >= end) { - printk("New Entry\n"); + pr_debug("New Entry\n"); list_add(&new_entry->nd, parse->nd.prev); new_entry = NULL; break; @@ -291,7 +335,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, break; } - printk("Overlap at 0x%Lx-0x%Lx\n", + pr_debug("Overlap at 0x%Lx-0x%Lx\n", saved_ptr->start, saved_ptr->end); /* No conflict. Go ahead and add this new entry */ list_add(&new_entry->nd, saved_ptr->nd.prev); @@ -343,8 +387,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, break; } - printk("Overlap at 0x%Lx-0x%Lx\n", - saved_ptr->start, saved_ptr->end); + pr_debug(KERN_INFO "Overlap at 0x%Lx-0x%Lx\n", + saved_ptr->start, saved_ptr->end); /* No conflict. Go ahead and add this new entry */ list_add(&new_entry->nd, &saved_ptr->nd); new_entry = NULL; @@ -353,7 +397,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, } if (err) { - printk( + printk(KERN_INFO "reserve_memtype failed 0x%Lx-0x%Lx, track %s, req %s\n", start, end, cattr_name(new_entry->type), cattr_name(req_type)); @@ -365,16 +409,16 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (new_entry) { /* No conflict. Not yet added to the list. Add to the tail */ list_add_tail(&new_entry->nd, &memtype_list); - printk("New Entry\n"); - } + pr_debug("New Entry\n"); + } if (ret_type) { - printk( + pr_debug( "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s, ret %s\n", start, end, cattr_name(actual_type), cattr_name(req_type), cattr_name(*ret_type)); } else { - printk( + pr_debug( "reserve_memtype added 0x%Lx-0x%Lx, track %s, req %s\n", start, end, cattr_name(actual_type), cattr_name(req_type)); @@ -411,11 +455,142 @@ int free_memtype(u64 start, u64 end) spin_unlock(&memtype_lock); if (err) { - printk(KERN_DEBUG "%s:%d freeing invalid memtype %Lx-%Lx\n", + printk(KERN_INFO "%s:%d freeing invalid memtype %Lx-%Lx\n", current->comm, current->pid, start, end); } - printk( "free_memtype request 0x%Lx-0x%Lx\n", start, end); + pr_debug("free_memtype request 0x%Lx-0x%Lx\n", start, end); return err; } + +/* + * /dev/mem mmap interface. The memtype used for mapping varies: + * - Use UC for mappings with O_SYNC flag + * - Without O_SYNC flag, if there is any conflict in reserve_memtype, + * inherit the memtype from existing mapping. + * - Else use UC_MINUS memtype (for backward compatibility with existing + * X drivers. + */ +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot) +{ + return vma_prot; +} + +#ifdef CONFIG_NONPROMISC_DEVMEM +/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/ +static inline int range_is_allowed(unsigned long pfn, unsigned long size) +{ + return 1; +} +#else +static inline int range_is_allowed(unsigned long pfn, unsigned long size) +{ + u64 from = ((u64)pfn) << PAGE_SHIFT; + u64 to = from + size; + u64 cursor = from; + + while (cursor < to) { + if (!devmem_is_allowed(pfn)) { + printk(KERN_INFO + "Program %s tried to access /dev/mem between %Lx->%Lx.\n", + current->comm, from, to); + return 0; + } + cursor += PAGE_SIZE; + pfn++; + } + return 1; +} +#endif /* CONFIG_NONPROMISC_DEVMEM */ + +int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t *vma_prot) +{ + u64 offset = ((u64) pfn) << PAGE_SHIFT; + unsigned long flags = _PAGE_CACHE_UC_MINUS; + int retval; + + if (!range_is_allowed(pfn, size)) + return 0; + + if (file->f_flags & O_SYNC) { + flags = _PAGE_CACHE_UC; + } + +#ifdef CONFIG_X86_32 + /* + * On the PPro and successors, the MTRRs are used to set + * memory types for physical addresses outside main memory, + * so blindly setting UC or PWT on those pages is wrong. + * For Pentiums and earlier, the surround logic should disable + * caching for the high addresses through the KEN pin, but + * we maintain the tradition of paranoia in this code. + */ + if (!pat_wc_enabled && + ! ( test_bit(X86_FEATURE_MTRR, boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_K6_MTRR, boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_CYRIX_ARR, boot_cpu_data.x86_capability) || + test_bit(X86_FEATURE_CENTAUR_MCR, boot_cpu_data.x86_capability)) && + (pfn << PAGE_SHIFT) >= __pa(high_memory)) { + flags = _PAGE_CACHE_UC; + } +#endif + + /* + * With O_SYNC, we can only take UC mapping. Fail if we cannot. + * Without O_SYNC, we want to get + * - WB for WB-able memory and no other conflicting mappings + * - UC_MINUS for non-WB-able memory with no other conflicting mappings + * - Inherit from confliting mappings otherwise + */ + if (flags != _PAGE_CACHE_UC_MINUS) { + retval = reserve_memtype(offset, offset + size, flags, NULL); + } else { + retval = reserve_memtype(offset, offset + size, -1, &flags); + } + + if (retval < 0) + return 0; + + if (pfn <= max_pfn_mapped && + ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { + free_memtype(offset, offset + size); + printk(KERN_INFO + "%s:%d /dev/mem ioremap_change_attr failed %s for %Lx-%Lx\n", + current->comm, current->pid, + cattr_name(flags), + offset, offset + size); + return 0; + } + + *vma_prot = __pgprot((pgprot_val(*vma_prot) & ~_PAGE_CACHE_MASK) | + flags); + return 1; +} + +void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) +{ + u64 addr = (u64)pfn << PAGE_SHIFT; + unsigned long flags; + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); + + reserve_memtype(addr, addr + size, want_flags, &flags); + if (flags != want_flags) { + printk(KERN_INFO + "%s:%d /dev/mem expected mapping type %s for %Lx-%Lx, got %s\n", + current->comm, current->pid, + cattr_name(want_flags), + addr, addr + size, + cattr_name(flags)); + } +} + +void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) +{ + u64 addr = (u64)pfn << PAGE_SHIFT; + + free_memtype(addr, addr + size); +} + diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c new file mode 100644 index 000000000000..50159764f694 --- /dev/null +++ b/arch/x86/mm/pgtable.c @@ -0,0 +1,276 @@ +#include <linux/mm.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlb.h> + +pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) +{ + return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); +} + +pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *pte; + +#ifdef CONFIG_HIGHPTE + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); +#else + pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); +#endif + if (pte) + pgtable_page_ctor(pte); + return pte; +} + +void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +{ + pgtable_page_dtor(pte); + paravirt_release_pte(page_to_pfn(pte)); + tlb_remove_page(tlb, pte); +} + +#if PAGETABLE_LEVELS > 2 +void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +{ + paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(pmd)); +} + +#if PAGETABLE_LEVELS > 3 +void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +{ + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(pud)); +} +#endif /* PAGETABLE_LEVELS > 3 */ +#endif /* PAGETABLE_LEVELS > 2 */ + +static inline void pgd_list_add(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_add(&page->lru, &pgd_list); +} + +static inline void pgd_list_del(pgd_t *pgd) +{ + struct page *page = virt_to_page(pgd); + + list_del(&page->lru); +} + +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) + +static void pgd_ctor(void *p) +{ + pgd_t *pgd = p; + unsigned long flags; + + /* Clear usermode parts of PGD */ + memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t)); + + spin_lock_irqsave(&pgd_lock, flags); + + /* If the pgd points to a shared pagetable level (either the + ptes in non-PAE, or shared PMD in PAE), then just copy the + references from swapper_pg_dir. */ + if (PAGETABLE_LEVELS == 2 || + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || + PAGETABLE_LEVELS == 4) { + clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT, + __pa(swapper_pg_dir) >> PAGE_SHIFT, + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + } + + /* list required to sync kernel mapping updates */ + if (!SHARED_KERNEL_PMD) + pgd_list_add(pgd); + + spin_unlock_irqrestore(&pgd_lock, flags); +} + +static void pgd_dtor(void *pgd) +{ + unsigned long flags; /* can be called from interrupt context */ + + if (SHARED_KERNEL_PMD) + return; + + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_del(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); +} + +/* + * List of all pgd's needed for non-PAE so it can invalidate entries + * in both cached and uncached pgd's; not needed for PAE since the + * kernel pmd is shared. If PAE were not to share the pmd a similar + * tactic would be needed. This is essentially codepath-based locking + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. + * -- wli + */ + +#ifdef CONFIG_X86_PAE +/* + * Mop up any pmd pages which may still be attached to the pgd. + * Normally they will be freed by munmap/exit_mmap, but any pmd we + * preallocate which never got a corresponding vma will need to be + * freed manually. + */ +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) +{ + int i; + + for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { + pgd_t pgd = pgdp[i]; + + if (pgd_val(pgd) != 0) { + pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); + + pgdp[i] = native_make_pgd(0); + + paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); + pmd_free(mm, pmd); + } + } +} + +/* + * In PAE mode, we need to do a cr3 reload (=tlb flush) when + * updating the top-level pagetable entries to guarantee the + * processor notices the update. Since this is expensive, and + * all 4 top-level entries are used almost immediately in a + * new process's life, we just pre-populate them here. + * + * Also, if we're in a paravirt environment where the kernel pmd is + * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate + * and initialize the kernel pmds here. + */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + pud_t *pud; + unsigned long addr; + int i; + + pud = pud_offset(pgd, 0); + for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; + i++, pud++, addr += PUD_SIZE) { + pmd_t *pmd = pmd_alloc_one(mm, addr); + + if (!pmd) { + pgd_mop_up_pmds(mm, pgd); + return 0; + } + + if (i >= KERNEL_PGD_BOUNDARY) + memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), + sizeof(pmd_t) * PTRS_PER_PMD); + + pud_populate(mm, pud, pmd); + } + + return 1; +} + +void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) +{ + paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); + + /* Note: almost everything apart from _PAGE_PRESENT is + reserved at the pmd (PDPT) level. */ + set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); + + /* + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... + */ + if (mm == current->active_mm) + write_cr3(read_cr3()); +} +#else /* !CONFIG_X86_PAE */ +/* No need to prepopulate any pagetable entries in non-PAE modes. */ +static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) +{ + return 1; +} + +static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd) +{ +} +#endif /* CONFIG_X86_PAE */ + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + + /* so that alloc_pmd can use it */ + mm->pgd = pgd; + if (pgd) + pgd_ctor(pgd); + + if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { + pgd_dtor(pgd); + free_page((unsigned long)pgd); + pgd = NULL; + } + + return pgd; +} + +void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + pgd_mop_up_pmds(mm, pgd); + pgd_dtor(pgd); + free_page((unsigned long)pgd); +} + +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + + if (changed && dirty) { + *ptep = entry; + pte_update_defer(vma->vm_mm, address, ptep); + flush_tlb_page(vma, address); + } + + return changed; +} + +int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int ret = 0; + + if (pte_young(*ptep)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + &ptep->pte); + + if (ret) + pte_update(vma->vm_mm, addr, ptep); + + return ret; +} + +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int young; + + young = ptep_test_and_clear_young(vma, address, ptep); + if (young) + flush_tlb_page(vma, address); + + return young; +} diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 6fb9e7c6893f..9ee007be9142 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -173,210 +173,6 @@ void reserve_top_address(unsigned long reserve) __VMALLOC_RESERVE += reserve; } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) -{ - return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); -} - -pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) -{ - struct page *pte; - -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); -#endif - if (pte) - pgtable_page_ctor(pte); - return pte; -} - -/* - * List of all pgd's needed for non-PAE so it can invalidate entries - * in both cached and uncached pgd's; not needed for PAE since the - * kernel pmd is shared. If PAE were not to share the pmd a similar - * tactic would be needed. This is essentially codepath-based locking - * against pageattr.c; it is the unique case in which a valid change - * of kernel pagetables can't be lazily synchronized by vmalloc faults. - * vmalloc faults work because attached pagetables are never freed. - * -- wli - */ -static inline void pgd_list_add(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - list_add(&page->lru, &pgd_list); -} - -static inline void pgd_list_del(pgd_t *pgd) -{ - struct page *page = virt_to_page(pgd); - - list_del(&page->lru); -} - -#define UNSHARED_PTRS_PER_PGD \ - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) - -static void pgd_ctor(void *p) -{ - pgd_t *pgd = p; - unsigned long flags; - - /* Clear usermode parts of PGD */ - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); - - spin_lock_irqsave(&pgd_lock, flags); - - /* If the pgd points to a shared pagetable level (either the - ptes in non-PAE, or shared PMD in PAE), then just copy the - references from swapper_pg_dir. */ - if (PAGETABLE_LEVELS == 2 || - (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { - clone_pgd_range(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - } - - /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) - pgd_list_add(pgd); - - spin_unlock_irqrestore(&pgd_lock, flags); -} - -static void pgd_dtor(void *pgd) -{ - unsigned long flags; /* can be called from interrupt context */ - - if (SHARED_KERNEL_PMD) - return; - - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); -} - -#ifdef CONFIG_X86_PAE -/* - * Mop up any pmd pages which may still be attached to the pgd. - * Normally they will be freed by munmap/exit_mmap, but any pmd we - * preallocate which never got a corresponding vma will need to be - * freed manually. - */ -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) -{ - int i; - - for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) { - pgd_t pgd = pgdp[i]; - - if (pgd_val(pgd) != 0) { - pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); - - pgdp[i] = native_make_pgd(0); - - paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT); - pmd_free(mm, pmd); - } - } -} - -/* - * In PAE mode, we need to do a cr3 reload (=tlb flush) when - * updating the top-level pagetable entries to guarantee the - * processor notices the update. Since this is expensive, and - * all 4 top-level entries are used almost immediately in a - * new process's life, we just pre-populate them here. - * - * Also, if we're in a paravirt environment where the kernel pmd is - * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate - * and initialize the kernel pmds here. - */ -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) -{ - pud_t *pud; - unsigned long addr; - int i; - - pud = pud_offset(pgd, 0); - for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD; - i++, pud++, addr += PUD_SIZE) { - pmd_t *pmd = pmd_alloc_one(mm, addr); - - if (!pmd) { - pgd_mop_up_pmds(mm, pgd); - return 0; - } - - if (i >= USER_PTRS_PER_PGD) - memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), - sizeof(pmd_t) * PTRS_PER_PMD); - - pud_populate(mm, pud, pmd); - } - - return 1; -} -#else /* !CONFIG_X86_PAE */ -/* No need to prepopulate any pagetable entries in non-PAE modes. */ -static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd) -{ - return 1; -} - -static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) -{ -} -#endif /* CONFIG_X86_PAE */ - -pgd_t *pgd_alloc(struct mm_struct *mm) -{ - pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); - - /* so that alloc_pd can use it */ - mm->pgd = pgd; - if (pgd) - pgd_ctor(pgd); - - if (pgd && !pgd_prepopulate_pmd(mm, pgd)) { - pgd_dtor(pgd); - free_page((unsigned long)pgd); - pgd = NULL; - } - - return pgd; -} - -void pgd_free(struct mm_struct *mm, pgd_t *pgd) -{ - pgd_mop_up_pmds(mm, pgd); - pgd_dtor(pgd); - free_page((unsigned long)pgd); -} - -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) -{ - pgtable_page_dtor(pte); - paravirt_release_pt(page_to_pfn(pte)); - tlb_remove_page(tlb, pte); -} - -#ifdef CONFIG_X86_PAE - -void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) -{ - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); - tlb_remove_page(tlb, virt_to_page(pmd)); -} - -#endif - int pmd_bad(pmd_t pmd) { WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd)); diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S index 4b1620a1529e..1d3aa6b87181 100644 --- a/arch/x86/vdso/vdso.S +++ b/arch/x86/vdso/vdso.S @@ -1,2 +1,10 @@ - .section ".vdso","a" +#include <linux/init.h> + +__INITDATA + + .globl vdso_start, vdso_end +vdso_start: .incbin "arch/x86/vdso/vdso.so" +vdso_end: + +__FINIT diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 4d5f2649bee4..2e641be2737e 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,7 +6,7 @@ config XEN bool "Xen guest support" select PARAVIRT depends on X86_32 - depends on X86_CMPXCHG && X86_TSC && !NEED_MULTIPLE_NODES && !(X86_VISWS || X86_VOYAGER) + depends on X86_CMPXCHG && X86_TSC && !(X86_VISWS || X86_VOYAGER) help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 343df246bd3e..3d8df981d5fd 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,4 @@ -obj-y := enlighten.o setup.o features.o multicalls.o mmu.o \ - events.o time.o manage.o xen-asm.o +obj-y := enlighten.o setup.o multicalls.o mmu.o \ + time.o manage.o xen-asm.o grant-table.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c0388220cf97..c8a56e457d61 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -155,7 +155,8 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, if (*ax == 1) maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ (1 << X86_FEATURE_ACPI) | /* disable ACPI */ - (1 << X86_FEATURE_SEP) | /* disable SEP */ + (1 << X86_FEATURE_MCE) | /* disable MCE */ + (1 << X86_FEATURE_MCA) | /* disable MCA */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ asm(XEN_EMULATE_PREFIX "cpuid" @@ -531,26 +532,37 @@ static void xen_apic_write(unsigned long reg, u32 val) static void xen_flush_tlb(void) { struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + struct multicall_space mcs; + + preempt_disable(); + + mcs = xen_mc_entry(sizeof(*op)); op = mcs.args; op->cmd = MMUEXT_TLB_FLUSH_LOCAL; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); } static void xen_flush_tlb_single(unsigned long addr) { struct mmuext_op *op; - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + struct multicall_space mcs; + + preempt_disable(); + mcs = xen_mc_entry(sizeof(*op)); op = mcs.args; op->cmd = MMUEXT_INVLPG_LOCAL; op->arg1.linear_addr = addr & PAGE_MASK; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); } static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm, @@ -655,15 +667,17 @@ static void xen_write_cr3(unsigned long cr3) /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ -static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn) +static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) { +#ifdef CONFIG_FLATMEM BUG_ON(mem_map); /* should only be used early */ +#endif make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); } -/* Early release_pt assumes that all pts are pinned, since there's +/* Early release_pte assumes that all pts are pinned, since there's only init_mm and anything attached to that is pinned. */ -static void xen_release_pt_init(u32 pfn) +static void xen_release_pte_init(u32 pfn) { make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } @@ -697,12 +711,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) } } -static void xen_alloc_pt(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pte(struct mm_struct *mm, u32 pfn) { xen_alloc_ptpage(mm, pfn, PT_PTE); } -static void xen_alloc_pd(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) { xen_alloc_ptpage(mm, pfn, PT_PMD); } @@ -722,12 +736,12 @@ static void xen_release_ptpage(u32 pfn, unsigned level) } } -static void xen_release_pt(u32 pfn) +static void xen_release_pte(u32 pfn) { xen_release_ptpage(pfn, PT_PTE); } -static void xen_release_pd(u32 pfn) +static void xen_release_pmd(u32 pfn) { xen_release_ptpage(pfn, PT_PMD); } @@ -849,10 +863,10 @@ static __init void xen_pagetable_setup_done(pgd_t *base) { /* This will work as long as patching hasn't happened yet (which it hasn't) */ - pv_mmu_ops.alloc_pt = xen_alloc_pt; - pv_mmu_ops.alloc_pd = xen_alloc_pd; - pv_mmu_ops.release_pt = xen_release_pt; - pv_mmu_ops.release_pd = xen_release_pd; + pv_mmu_ops.alloc_pte = xen_alloc_pte; + pv_mmu_ops.alloc_pmd = xen_alloc_pmd; + pv_mmu_ops.release_pte = xen_release_pte; + pv_mmu_ops.release_pmd = xen_release_pmd; pv_mmu_ops.set_pte = xen_set_pte; setup_shared_info(); @@ -994,7 +1008,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .read_pmc = native_read_pmc, .iret = xen_iret, - .irq_enable_syscall_ret = NULL, /* never called */ + .irq_enable_syscall_ret = xen_sysexit, .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, @@ -1059,11 +1073,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, - .alloc_pt = xen_alloc_pt_init, - .release_pt = xen_release_pt_init, - .alloc_pd = xen_alloc_pt_init, - .alloc_pd_clone = paravirt_nop, - .release_pd = xen_release_pt_init, + .alloc_pte = xen_alloc_pte_init, + .release_pte = xen_release_pte_init, + .alloc_pmd = xen_alloc_pte_init, + .alloc_pmd_clone = paravirt_nop, + .release_pmd = xen_release_pte_init, #ifdef CONFIG_HIGHPTE .kmap_atomic_pte = xen_kmap_atomic_pte, diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c deleted file mode 100644 index dcf613e17581..000000000000 --- a/arch/x86/xen/events.c +++ /dev/null @@ -1,591 +0,0 @@ -/* - * Xen event channels - * - * Xen models interrupts with abstract event channels. Because each - * domain gets 1024 event channels, but NR_IRQ is not that large, we - * must dynamically map irqs<->event channels. The event channels - * interface with the rest of the kernel by defining a xen interrupt - * chip. When an event is recieved, it is mapped to an irq and sent - * through the normal interrupt processing path. - * - * There are four kinds of events which can be mapped to an event - * channel: - * - * 1. Inter-domain notifications. This includes all the virtual - * device events, since they're driven by front-ends in another domain - * (typically dom0). - * 2. VIRQs, typically used for timers. These are per-cpu events. - * 3. IPIs. - * 4. Hardware interrupts. Not supported at present. - * - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 - */ - -#include <linux/linkage.h> -#include <linux/interrupt.h> -#include <linux/irq.h> -#include <linux/module.h> -#include <linux/string.h> - -#include <asm/ptrace.h> -#include <asm/irq.h> -#include <asm/sync_bitops.h> -#include <asm/xen/hypercall.h> -#include <asm/xen/hypervisor.h> - -#include <xen/events.h> -#include <xen/interface/xen.h> -#include <xen/interface/event_channel.h> - -#include "xen-ops.h" - -/* - * This lock protects updates to the following mapping and reference-count - * arrays. The lock does not need to be acquired to read the mapping tables. - */ -static DEFINE_SPINLOCK(irq_mapping_update_lock); - -/* IRQ <-> VIRQ mapping. */ -static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; - -/* IRQ <-> IPI mapping */ -static DEFINE_PER_CPU(int, ipi_to_irq[XEN_NR_IPIS]) = {[0 ... XEN_NR_IPIS-1] = -1}; - -/* Packed IRQ information: binding type, sub-type index, and event channel. */ -struct packed_irq -{ - unsigned short evtchn; - unsigned char index; - unsigned char type; -}; - -static struct packed_irq irq_info[NR_IRQS]; - -/* Binding types. */ -enum { - IRQT_UNBOUND, - IRQT_PIRQ, - IRQT_VIRQ, - IRQT_IPI, - IRQT_EVTCHN -}; - -/* Convenient shorthand for packed representation of an unbound IRQ. */ -#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) - -static int evtchn_to_irq[NR_EVENT_CHANNELS] = { - [0 ... NR_EVENT_CHANNELS-1] = -1 -}; -static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; -static u8 cpu_evtchn[NR_EVENT_CHANNELS]; - -/* Reference counts for bindings to IRQs. */ -static int irq_bindcount[NR_IRQS]; - -/* Xen will never allocate port zero for any purpose. */ -#define VALID_EVTCHN(chn) ((chn) != 0) - -/* - * Force a proper event-channel callback from Xen after clearing the - * callback mask. We do this in a very simple manner, by making a call - * down into Xen. The pending flag will be checked by Xen on return. - */ -void force_evtchn_callback(void) -{ - (void)HYPERVISOR_xen_version(0, NULL); -} -EXPORT_SYMBOL_GPL(force_evtchn_callback); - -static struct irq_chip xen_dynamic_chip; - -/* Constructor for packed IRQ information. */ -static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn) -{ - return (struct packed_irq) { evtchn, index, type }; -} - -/* - * Accessors for packed IRQ information. - */ -static inline unsigned int evtchn_from_irq(int irq) -{ - return irq_info[irq].evtchn; -} - -static inline unsigned int index_from_irq(int irq) -{ - return irq_info[irq].index; -} - -static inline unsigned int type_from_irq(int irq) -{ - return irq_info[irq].type; -} - -static inline unsigned long active_evtchns(unsigned int cpu, - struct shared_info *sh, - unsigned int idx) -{ - return (sh->evtchn_pending[idx] & - cpu_evtchn_mask[cpu][idx] & - ~sh->evtchn_mask[idx]); -} - -static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) -{ - int irq = evtchn_to_irq[chn]; - - BUG_ON(irq == -1); -#ifdef CONFIG_SMP - irq_desc[irq].affinity = cpumask_of_cpu(cpu); -#endif - - __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); - __set_bit(chn, cpu_evtchn_mask[cpu]); - - cpu_evtchn[chn] = cpu; -} - -static void init_evtchn_cpu_bindings(void) -{ -#ifdef CONFIG_SMP - int i; - /* By default all event channels notify CPU#0. */ - for (i = 0; i < NR_IRQS; i++) - irq_desc[i].affinity = cpumask_of_cpu(0); -#endif - - memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); - memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); -} - -static inline unsigned int cpu_from_evtchn(unsigned int evtchn) -{ - return cpu_evtchn[evtchn]; -} - -static inline void clear_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_clear_bit(port, &s->evtchn_pending[0]); -} - -static inline void set_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_pending[0]); -} - - -/** - * notify_remote_via_irq - send event to remote end of event channel via irq - * @irq: irq of event channel to send event to - * - * Unlike notify_remote_via_evtchn(), this is safe to use across - * save/restore. Notifications on a broken connection are silently - * dropped. - */ -void notify_remote_via_irq(int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - notify_remote_via_evtchn(evtchn); -} -EXPORT_SYMBOL_GPL(notify_remote_via_irq); - -static void mask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - sync_set_bit(port, &s->evtchn_mask[0]); -} - -static void unmask_evtchn(int port) -{ - struct shared_info *s = HYPERVISOR_shared_info; - unsigned int cpu = get_cpu(); - - BUG_ON(!irqs_disabled()); - - /* Slow path (hypercall) if this is a non-local port. */ - if (unlikely(cpu != cpu_from_evtchn(port))) { - struct evtchn_unmask unmask = { .port = port }; - (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); - } else { - struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - - sync_clear_bit(port, &s->evtchn_mask[0]); - - /* - * The following is basically the equivalent of - * 'hw_resend_irq'. Just like a real IO-APIC we 'lose - * the interrupt edge' if the channel is masked. - */ - if (sync_test_bit(port, &s->evtchn_pending[0]) && - !sync_test_and_set_bit(port / BITS_PER_LONG, - &vcpu_info->evtchn_pending_sel)) - vcpu_info->evtchn_upcall_pending = 1; - } - - put_cpu(); -} - -static int find_unbound_irq(void) -{ - int irq; - - /* Only allocate from dynirq range */ - for (irq = 0; irq < NR_IRQS; irq++) - if (irq_bindcount[irq] == 0) - break; - - if (irq == NR_IRQS) - panic("No available IRQ to bind to: increase NR_IRQS!\n"); - - return irq; -} - -int bind_evtchn_to_irq(unsigned int evtchn) -{ - int irq; - - spin_lock(&irq_mapping_update_lock); - - irq = evtchn_to_irq[evtchn]; - - if (irq == -1) { - irq = find_unbound_irq(); - - dynamic_irq_init(irq); - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "event"); - - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); - } - - irq_bindcount[irq]++; - - spin_unlock(&irq_mapping_update_lock); - - return irq; -} -EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); - -static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) -{ - struct evtchn_bind_ipi bind_ipi; - int evtchn, irq; - - spin_lock(&irq_mapping_update_lock); - - irq = per_cpu(ipi_to_irq, cpu)[ipi]; - if (irq == -1) { - irq = find_unbound_irq(); - if (irq < 0) - goto out; - - dynamic_irq_init(irq); - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "ipi"); - - bind_ipi.vcpu = cpu; - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, - &bind_ipi) != 0) - BUG(); - evtchn = bind_ipi.port; - - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); - - per_cpu(ipi_to_irq, cpu)[ipi] = irq; - - bind_evtchn_to_cpu(evtchn, cpu); - } - - irq_bindcount[irq]++; - - out: - spin_unlock(&irq_mapping_update_lock); - return irq; -} - - -static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) -{ - struct evtchn_bind_virq bind_virq; - int evtchn, irq; - - spin_lock(&irq_mapping_update_lock); - - irq = per_cpu(virq_to_irq, cpu)[virq]; - - if (irq == -1) { - bind_virq.virq = virq; - bind_virq.vcpu = cpu; - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, - &bind_virq) != 0) - BUG(); - evtchn = bind_virq.port; - - irq = find_unbound_irq(); - - dynamic_irq_init(irq); - set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, - handle_level_irq, "virq"); - - evtchn_to_irq[evtchn] = irq; - irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); - - per_cpu(virq_to_irq, cpu)[virq] = irq; - - bind_evtchn_to_cpu(evtchn, cpu); - } - - irq_bindcount[irq]++; - - spin_unlock(&irq_mapping_update_lock); - - return irq; -} - -static void unbind_from_irq(unsigned int irq) -{ - struct evtchn_close close; - int evtchn = evtchn_from_irq(irq); - - spin_lock(&irq_mapping_update_lock); - - if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) { - close.port = evtchn; - if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) - BUG(); - - switch (type_from_irq(irq)) { - case IRQT_VIRQ: - per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) - [index_from_irq(irq)] = -1; - break; - default: - break; - } - - /* Closed ports are implicitly re-bound to VCPU0. */ - bind_evtchn_to_cpu(evtchn, 0); - - evtchn_to_irq[evtchn] = -1; - irq_info[irq] = IRQ_UNBOUND; - - dynamic_irq_init(irq); - } - - spin_unlock(&irq_mapping_update_lock); -} - -int bind_evtchn_to_irqhandler(unsigned int evtchn, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, void *dev_id) -{ - unsigned int irq; - int retval; - - irq = bind_evtchn_to_irq(evtchn); - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} -EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); - -int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, - irq_handler_t handler, - unsigned long irqflags, const char *devname, void *dev_id) -{ - unsigned int irq; - int retval; - - irq = bind_virq_to_irq(virq, cpu); - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} -EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); - -int bind_ipi_to_irqhandler(enum ipi_vector ipi, - unsigned int cpu, - irq_handler_t handler, - unsigned long irqflags, - const char *devname, - void *dev_id) -{ - int irq, retval; - - irq = bind_ipi_to_irq(ipi, cpu); - if (irq < 0) - return irq; - - retval = request_irq(irq, handler, irqflags, devname, dev_id); - if (retval != 0) { - unbind_from_irq(irq); - return retval; - } - - return irq; -} - -void unbind_from_irqhandler(unsigned int irq, void *dev_id) -{ - free_irq(irq, dev_id); - unbind_from_irq(irq); -} -EXPORT_SYMBOL_GPL(unbind_from_irqhandler); - -void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) -{ - int irq = per_cpu(ipi_to_irq, cpu)[vector]; - BUG_ON(irq < 0); - notify_remote_via_irq(irq); -} - - -/* - * Search the CPUs pending events bitmasks. For each one found, map - * the event number to an irq, and feed it into do_IRQ() for - * handling. - * - * Xen uses a two-level bitmap to speed searching. The first level is - * a bitset of words which contain pending event bits. The second - * level is a bitset of pending events themselves. - */ -void xen_evtchn_do_upcall(struct pt_regs *regs) -{ - int cpu = get_cpu(); - struct shared_info *s = HYPERVISOR_shared_info; - struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); - unsigned long pending_words; - - vcpu_info->evtchn_upcall_pending = 0; - - /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ - pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); - while (pending_words != 0) { - unsigned long pending_bits; - int word_idx = __ffs(pending_words); - pending_words &= ~(1UL << word_idx); - - while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { - int bit_idx = __ffs(pending_bits); - int port = (word_idx * BITS_PER_LONG) + bit_idx; - int irq = evtchn_to_irq[port]; - - if (irq != -1) { - regs->orig_ax = ~irq; - do_IRQ(regs); - } - } - } - - put_cpu(); -} - -/* Rebind an evtchn so that it gets delivered to a specific cpu */ -static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) -{ - struct evtchn_bind_vcpu bind_vcpu; - int evtchn = evtchn_from_irq(irq); - - if (!VALID_EVTCHN(evtchn)) - return; - - /* Send future instances of this interrupt to other vcpu. */ - bind_vcpu.port = evtchn; - bind_vcpu.vcpu = tcpu; - - /* - * If this fails, it usually just indicates that we're dealing with a - * virq or IPI channel, which don't actually need to be rebound. Ignore - * it, but don't do the xenlinux-level rebind in that case. - */ - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) - bind_evtchn_to_cpu(evtchn, tcpu); -} - - -static void set_affinity_irq(unsigned irq, cpumask_t dest) -{ - unsigned tcpu = first_cpu(dest); - rebind_irq_to_cpu(irq, tcpu); -} - -static void enable_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - unmask_evtchn(evtchn); -} - -static void disable_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - - if (VALID_EVTCHN(evtchn)) - mask_evtchn(evtchn); -} - -static void ack_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - - move_native_irq(irq); - - if (VALID_EVTCHN(evtchn)) - clear_evtchn(evtchn); -} - -static int retrigger_dynirq(unsigned int irq) -{ - int evtchn = evtchn_from_irq(irq); - int ret = 0; - - if (VALID_EVTCHN(evtchn)) { - set_evtchn(evtchn); - ret = 1; - } - - return ret; -} - -static struct irq_chip xen_dynamic_chip __read_mostly = { - .name = "xen-dyn", - .mask = disable_dynirq, - .unmask = enable_dynirq, - .ack = ack_dynirq, - .set_affinity = set_affinity_irq, - .retrigger = retrigger_dynirq, -}; - -void __init xen_init_IRQ(void) -{ - int i; - - init_evtchn_cpu_bindings(); - - /* No event channels are 'live' right now. */ - for (i = 0; i < NR_EVENT_CHANNELS; i++) - mask_evtchn(i); - - /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ - for (i = 0; i < NR_IRQS; i++) - irq_bindcount[i] = 0; - - irq_ctx_init(smp_processor_id()); -} diff --git a/arch/x86/xen/features.c b/arch/x86/xen/features.c deleted file mode 100644 index 0707714e40d6..000000000000 --- a/arch/x86/xen/features.c +++ /dev/null @@ -1,29 +0,0 @@ -/****************************************************************************** - * features.c - * - * Xen feature flags. - * - * Copyright (c) 2006, Ian Campbell, XenSource Inc. - */ -#include <linux/types.h> -#include <linux/cache.h> -#include <linux/module.h> -#include <asm/xen/hypervisor.h> -#include <xen/features.h> - -u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; -EXPORT_SYMBOL_GPL(xen_features); - -void xen_setup_features(void) -{ - struct xen_feature_info fi; - int i, j; - - for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { - fi.submap_idx = i; - if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) - break; - for (j = 0; j < 32; j++) - xen_features[i * 32 + j] = !!(fi.submap & 1<<j); - } -} diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c new file mode 100644 index 000000000000..49ba9b5224d1 --- /dev/null +++ b/arch/x86/xen/grant-table.c @@ -0,0 +1,91 @@ +/****************************************************************************** + * grant_table.c + * x86 specific part + * + * Granting foreign access to our memory reservation. + * + * Copyright (c) 2005-2006, Christopher Clark + * Copyright (c) 2004-2005, K A Fraser + * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp> + * VA Linux Systems Japan. Split out x86 specific part. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/vmalloc.h> + +#include <xen/interface/xen.h> +#include <xen/page.h> +#include <xen/grant_table.h> + +#include <asm/pgtable.h> + +static int map_pte_fn(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +{ + unsigned long **frames = (unsigned long **)data; + + set_pte_at(&init_mm, addr, pte, mfn_pte((*frames)[0], PAGE_KERNEL)); + (*frames)++; + return 0; +} + +static int unmap_pte_fn(pte_t *pte, struct page *pmd_page, + unsigned long addr, void *data) +{ + + set_pte_at(&init_mm, addr, pte, __pte(0)); + return 0; +} + +int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, + unsigned long max_nr_gframes, + struct grant_entry **__shared) +{ + int rc; + struct grant_entry *shared = *__shared; + + if (shared == NULL) { + struct vm_struct *area = + xen_alloc_vm_area(PAGE_SIZE * max_nr_gframes); + BUG_ON(area == NULL); + shared = area->addr; + *__shared = shared; + } + + rc = apply_to_page_range(&init_mm, (unsigned long)shared, + PAGE_SIZE * nr_gframes, + map_pte_fn, &frames); + return rc; +} + +void arch_gnttab_unmap_shared(struct grant_entry *shared, + unsigned long nr_gframes) +{ + apply_to_page_range(&init_mm, (unsigned long)shared, + PAGE_SIZE * nr_gframes, unmap_pte_fn, NULL); +} diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 2a054ef2a3da..126766d43aea 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -156,6 +156,10 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { + /* updates to init_mm may be done without lock */ + if (mm == &init_mm) + preempt_disable(); + if (mm == current->mm || mm == &init_mm) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { struct multicall_space mcs; @@ -163,14 +167,61 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); xen_mc_issue(PARAVIRT_LAZY_MMU); - return; + goto out; } else if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) - return; + goto out; } xen_set_pte(ptep, pteval); + +out: + if (mm == &init_mm) + preempt_enable(); +} + +pteval_t xen_pte_val(pte_t pte) +{ + pteval_t ret = pte.pte; + + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; + + return ret; +} + +pgdval_t xen_pgd_val(pgd_t pgd) +{ + pgdval_t ret = pgd.pgd; + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; + return ret; +} + +pte_t xen_make_pte(pteval_t pte) +{ + if (pte & _PAGE_PRESENT) { + pte = phys_to_machine(XPADDR(pte)).maddr; + pte &= ~(_PAGE_PCD | _PAGE_PWT); + } + + return (pte_t){ .pte = pte }; } +pgd_t xen_make_pgd(pgdval_t pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd = phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} + +pmdval_t xen_pmd_val(pmd_t pmd) +{ + pmdval_t ret = native_pmd_val(pmd); + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT; + return ret; +} #ifdef CONFIG_X86_PAE void xen_set_pud(pud_t *ptr, pud_t val) { @@ -214,100 +265,18 @@ void xen_pmd_clear(pmd_t *pmdp) xen_set_pmd(pmdp, __pmd(0)); } -unsigned long long xen_pte_val(pte_t pte) +pmd_t xen_make_pmd(pmdval_t pmd) { - unsigned long long ret = 0; - - if (pte.pte_low) { - ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low; - ret = machine_to_phys(XMADDR(ret)).paddr | 1; - } - - return ret; -} - -unsigned long long xen_pmd_val(pmd_t pmd) -{ - unsigned long long ret = pmd.pmd; - if (ret) - ret = machine_to_phys(XMADDR(ret)).paddr | 1; - return ret; -} - -unsigned long long xen_pgd_val(pgd_t pgd) -{ - unsigned long long ret = pgd.pgd; - if (ret) - ret = machine_to_phys(XMADDR(ret)).paddr | 1; - return ret; -} - -pte_t xen_make_pte(unsigned long long pte) -{ - if (pte & _PAGE_PRESENT) { - pte = phys_to_machine(XPADDR(pte)).maddr; - pte &= ~(_PAGE_PCD | _PAGE_PWT); - } - - return (pte_t){ .pte = pte }; -} - -pmd_t xen_make_pmd(unsigned long long pmd) -{ - if (pmd & 1) + if (pmd & _PAGE_PRESENT) pmd = phys_to_machine(XPADDR(pmd)).maddr; - return (pmd_t){ pmd }; -} - -pgd_t xen_make_pgd(unsigned long long pgd) -{ - if (pgd & _PAGE_PRESENT) - pgd = phys_to_machine(XPADDR(pgd)).maddr; - - return (pgd_t){ pgd }; + return native_make_pmd(pmd); } #else /* !PAE */ void xen_set_pte(pte_t *ptep, pte_t pte) { *ptep = pte; } - -unsigned long xen_pte_val(pte_t pte) -{ - unsigned long ret = pte.pte_low; - - if (ret & _PAGE_PRESENT) - ret = machine_to_phys(XMADDR(ret)).paddr; - - return ret; -} - -unsigned long xen_pgd_val(pgd_t pgd) -{ - unsigned long ret = pgd.pgd; - if (ret) - ret = machine_to_phys(XMADDR(ret)).paddr | 1; - return ret; -} - -pte_t xen_make_pte(unsigned long pte) -{ - if (pte & _PAGE_PRESENT) { - pte = phys_to_machine(XPADDR(pte)).maddr; - pte &= ~(_PAGE_PCD | _PAGE_PWT); - } - - return (pte_t){ pte }; -} - -pgd_t xen_make_pgd(unsigned long pgd) -{ - if (pgd & _PAGE_PRESENT) - pgd = phys_to_machine(XPADDR(pgd)).maddr; - - return (pgd_t){ pgd }; -} #endif /* CONFIG_X86_PAE */ /* @@ -418,7 +387,7 @@ static void xen_do_pin(unsigned level, unsigned long pfn) static int pin_page(struct page *page, enum pt_level level) { - unsigned pgfl = test_and_set_bit(PG_pinned, &page->flags); + unsigned pgfl = TestSetPagePinned(page); int flush; if (pgfl) @@ -499,7 +468,7 @@ void __init xen_mark_init_mm_pinned(void) static int unpin_page(struct page *page, enum pt_level level) { - unsigned pgfl = test_and_clear_bit(PG_pinned, &page->flags); + unsigned pgfl = TestClearPagePinned(page); if (pgfl && !PageHighMem(page)) { void *pt = lowmem_page_address(page); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 2341492bf7a0..82517e4a752a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -16,6 +16,7 @@ #include <asm/xen/hypervisor.h> #include <asm/xen/hypercall.h> +#include <xen/interface/callback.h> #include <xen/interface/physdev.h> #include <xen/features.h> @@ -68,6 +69,24 @@ static void __init fiddle_vdso(void) *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; } +void xen_enable_sysenter(void) +{ + int cpu = smp_processor_id(); + extern void xen_sysenter_target(void); + /* Mask events on entry, even though they get enabled immediately */ + static struct callback_register sysenter = { + .type = CALLBACKTYPE_sysenter, + .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target }, + .flags = CALLBACKF_mask_events, + }; + + if (!boot_cpu_has(X86_FEATURE_SEP) || + HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { + clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); + } +} + void __init xen_arch_setup(void) { struct physdev_set_iopl set_iopl; @@ -82,6 +101,8 @@ void __init xen_arch_setup(void) HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, __KERNEL_CS, (unsigned long)xen_failsafe_callback); + xen_enable_sysenter(); + set_iopl.iopl = 1; rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); if (rc != 0) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index e340ff92f6b6..94e69000f982 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -36,8 +36,9 @@ #include "mmu.h" static cpumask_t xen_cpu_initialized_map; -static DEFINE_PER_CPU(int, resched_irq); -static DEFINE_PER_CPU(int, callfunc_irq); +static DEFINE_PER_CPU(int, resched_irq) = -1; +static DEFINE_PER_CPU(int, callfunc_irq) = -1; +static DEFINE_PER_CPU(int, debug_irq) = -1; /* * Structure and data for smp_call_function(). This is designed to minimise @@ -72,6 +73,7 @@ static __cpuinit void cpu_bringup_and_idle(void) int cpu = smp_processor_id(); cpu_init(); + xen_enable_sysenter(); preempt_disable(); per_cpu(cpu_state, cpu) = CPU_ONLINE; @@ -88,9 +90,7 @@ static __cpuinit void cpu_bringup_and_idle(void) static int xen_smp_intr_init(unsigned int cpu) { int rc; - const char *resched_name, *callfunc_name; - - per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; + const char *resched_name, *callfunc_name, *debug_name; resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, @@ -114,6 +114,14 @@ static int xen_smp_intr_init(unsigned int cpu) goto fail; per_cpu(callfunc_irq, cpu) = rc; + debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); + rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, + IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING, + debug_name, NULL); + if (rc < 0) + goto fail; + per_cpu(debug_irq, cpu) = rc; + return 0; fail: @@ -121,6 +129,8 @@ static int xen_smp_intr_init(unsigned int cpu) unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); if (per_cpu(callfunc_irq, cpu) >= 0) unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); + if (per_cpu(debug_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); return rc; } @@ -183,7 +193,7 @@ void __init xen_smp_prepare_cpus(unsigned int max_cpus) /* Restrict the possible_map according to max_cpus. */ while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { - for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--) + for (cpu = NR_CPUS - 1; !cpu_possible(cpu); cpu--) continue; cpu_clear(cpu, cpu_possible_map); } diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index fe161ed4b01e..2497a30f41de 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -108,6 +108,20 @@ ENDPATCH(xen_restore_fl_direct) RELOC(xen_restore_fl_direct, 2b+1) /* + We can't use sysexit directly, because we're not running in ring0. + But we can easily fake it up using iret. Assuming xen_sysexit + is jumped to with a standard stack frame, we can just strip it + back to a standard iret frame and use iret. + */ +ENTRY(xen_sysexit) + movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ + orl $X86_EFLAGS_IF, PT_EFLAGS(%esp) + lea PT_EIP(%esp), %esp + + jmp xen_iret +ENDPROC(xen_sysexit) + +/* This is run where a normal iret would be run, with the same stack setup: 8: eflags 4: cs @@ -184,8 +198,12 @@ iret_restore_end: region is OK. */ je xen_hypervisor_callback - iret +1: iret xen_iret_end_crit: +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous hyper_iret: /* put this out of line since its very rarely used */ @@ -219,9 +237,7 @@ hyper_iret: ds } SAVE_ALL state eax } : : - ebx } - ---------------- - return addr <- esp + ebx }<- esp ---------------- In order to deliver the nested exception properly, we need to shift @@ -236,10 +252,8 @@ hyper_iret: it's usermode state which we eventually need to restore. */ ENTRY(xen_iret_crit_fixup) - /* offsets +4 for return address */ - /* - Paranoia: Make sure we're really coming from userspace. + Paranoia: Make sure we're really coming from kernel space. One could imagine a case where userspace jumps into the critical range address, but just before the CPU delivers a GP, it decides to deliver an interrupt instead. Unlikely? @@ -248,32 +262,32 @@ ENTRY(xen_iret_crit_fixup) jump instruction itself, not the destination, but some virtual environments get this wrong. */ - movl PT_CS+4(%esp), %ecx + movl PT_CS(%esp), %ecx andl $SEGMENT_RPL_MASK, %ecx cmpl $USER_RPL, %ecx je 2f - lea PT_ORIG_EAX+4(%esp), %esi - lea PT_EFLAGS+4(%esp), %edi + lea PT_ORIG_EAX(%esp), %esi + lea PT_EFLAGS(%esp), %edi /* If eip is before iret_restore_end then stack hasn't been restored yet. */ cmp $iret_restore_end, %eax jae 1f - movl 0+4(%edi),%eax /* copy EAX */ - movl %eax, PT_EAX+4(%esp) + movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */ + movl %eax, PT_EAX(%esp) lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ /* set up the copy */ 1: std - mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */ + mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */ rep movsl cld lea 4(%edi),%esp /* point esp to new frame */ -2: ret +2: jmp xen_do_upcall /* diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 956a491ea998..f1063ae08037 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -2,6 +2,8 @@ #define XEN_OPS_H #include <linux/init.h> +#include <linux/irqreturn.h> +#include <xen/xen-ops.h> /* These are code, but not functions. Defined in entry.S */ extern const char xen_hypervisor_callback[]; @@ -9,7 +11,6 @@ extern const char xen_failsafe_callback[]; void xen_copy_trap_info(struct trap_info *traps); -DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); DECLARE_PER_CPU(unsigned long, xen_cr3); DECLARE_PER_CPU(unsigned long, xen_current_cr3); @@ -19,6 +20,7 @@ extern struct shared_info *HYPERVISOR_shared_info; char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); +void xen_enable_sysenter(void); void xen_setup_timer(int cpu); void xen_setup_cpu_clockevents(void); @@ -28,6 +30,8 @@ unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); unsigned long long xen_sched_clock(void); +irqreturn_t xen_debug_interrupt(int irq, void *dev_id); + bool xen_vcpu_stolen(int vcpu); void xen_mark_init_mm_pinned(void); @@ -64,4 +68,6 @@ DECL_ASM(unsigned long, xen_save_fl_direct, void); DECL_ASM(void, xen_restore_fl_direct, unsigned long); void xen_iret(void); +void xen_sysexit(void); + #endif /* XEN_OPS_H */ |