summaryrefslogtreecommitdiffstats
path: root/arch/x86/lguest
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/lguest')
-rw-r--r--arch/x86/lguest/Kconfig1
-rw-r--r--arch/x86/lguest/Makefile1
-rw-r--r--arch/x86/lguest/boot.c193
-rw-r--r--arch/x86/lguest/i386_head.S60
4 files changed, 197 insertions, 58 deletions
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 8dab8f7844d3..38718041efc3 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,7 +2,6 @@ config LGUEST_GUEST
bool "Lguest guest support"
select PARAVIRT
depends on X86_32
- depends on !X86_PAE
select VIRTIO
select VIRTIO_RING
select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/Makefile b/arch/x86/lguest/Makefile
index 27f0c9ed7f60..94e0e54056a9 100644
--- a/arch/x86/lguest/Makefile
+++ b/arch/x86/lguest/Makefile
@@ -1 +1,2 @@
obj-y := i386_head.o boot.o
+CFLAGS_boot.o := $(call cc-option, -fno-stack-protector)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ca7ec44bafc3..7bc65f0f62c4 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -67,6 +67,7 @@
#include <asm/mce.h>
#include <asm/io.h>
#include <asm/i387.h>
+#include <asm/stackprotector.h>
#include <asm/reboot.h> /* for struct machine_ops */
/*G:010 Welcome to the Guest!
@@ -86,7 +87,7 @@ struct lguest_data lguest_data = {
/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a
* ring buffer of stored hypercalls which the Host will run though next time we
- * do a normal hypercall. Each entry in the ring has 4 slots for the hypercall
+ * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall
* arguments, and a "hcall_status" word which is 0 if the call is ready to go,
* and 255 once the Host has finished with it.
*
@@ -95,7 +96,8 @@ struct lguest_data lguest_data = {
* effect of causing the Host to run all the stored calls in the ring buffer
* which empties it for next time! */
static void async_hcall(unsigned long call, unsigned long arg1,
- unsigned long arg2, unsigned long arg3)
+ unsigned long arg2, unsigned long arg3,
+ unsigned long arg4)
{
/* Note: This code assumes we're uniprocessor. */
static unsigned int next_call;
@@ -107,12 +109,13 @@ static void async_hcall(unsigned long call, unsigned long arg1,
local_irq_save(flags);
if (lguest_data.hcall_status[next_call] != 0xFF) {
/* Table full, so do normal hcall which will flush table. */
- kvm_hypercall3(call, arg1, arg2, arg3);
+ kvm_hypercall4(call, arg1, arg2, arg3, arg4);
} else {
lguest_data.hcalls[next_call].arg0 = call;
lguest_data.hcalls[next_call].arg1 = arg1;
lguest_data.hcalls[next_call].arg2 = arg2;
lguest_data.hcalls[next_call].arg3 = arg3;
+ lguest_data.hcalls[next_call].arg4 = arg4;
/* Arguments must all be written before we mark it to go */
wmb();
lguest_data.hcall_status[next_call] = 0;
@@ -140,7 +143,7 @@ static void lazy_hcall1(unsigned long call,
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
kvm_hypercall1(call, arg1);
else
- async_hcall(call, arg1, 0, 0);
+ async_hcall(call, arg1, 0, 0, 0);
}
static void lazy_hcall2(unsigned long call,
@@ -150,7 +153,7 @@ static void lazy_hcall2(unsigned long call,
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
kvm_hypercall2(call, arg1, arg2);
else
- async_hcall(call, arg1, arg2, 0);
+ async_hcall(call, arg1, arg2, 0, 0);
}
static void lazy_hcall3(unsigned long call,
@@ -161,18 +164,38 @@ static void lazy_hcall3(unsigned long call,
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
kvm_hypercall3(call, arg1, arg2, arg3);
else
- async_hcall(call, arg1, arg2, arg3);
+ async_hcall(call, arg1, arg2, arg3, 0);
}
+#ifdef CONFIG_X86_PAE
+static void lazy_hcall4(unsigned long call,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4)
+{
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
+ kvm_hypercall4(call, arg1, arg2, arg3, arg4);
+ else
+ async_hcall(call, arg1, arg2, arg3, arg4);
+}
+#endif
+
/* When lazy mode is turned off reset the per-cpu lazy mode variable and then
* issue the do-nothing hypercall to flush any stored calls. */
-static void lguest_leave_lazy_mode(void)
+static void lguest_leave_lazy_mmu_mode(void)
+{
+ kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+ paravirt_leave_lazy_mmu();
+}
+
+static void lguest_end_context_switch(struct task_struct *next)
{
- paravirt_leave_lazy(paravirt_get_lazy_mode());
kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+ paravirt_end_context_switch(next);
}
-/*G:033
+/*G:032
* After that diversion we return to our first native-instruction
* replacements: four functions for interrupt control.
*
@@ -192,30 +215,28 @@ static unsigned long save_fl(void)
{
return lguest_data.irq_enabled;
}
-PV_CALLEE_SAVE_REGS_THUNK(save_fl);
-
-/* restore_flags() just sets the flags back to the value given. */
-static void restore_fl(unsigned long flags)
-{
- lguest_data.irq_enabled = flags;
-}
-PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
/* Interrupts go off... */
static void irq_disable(void)
{
lguest_data.irq_enabled = 0;
}
+
+/* Let's pause a moment. Remember how I said these are called so often?
+ * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to
+ * break some rules. In particular, these functions are assumed to save their
+ * own registers if they need to: normal C functions assume they can trash the
+ * eax register. To use normal C functions, we use
+ * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the
+ * C function, then restores it. */
+PV_CALLEE_SAVE_REGS_THUNK(save_fl);
PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
+/*:*/
-/* Interrupts go on... */
-static void irq_enable(void)
-{
- lguest_data.irq_enabled = X86_EFLAGS_IF;
-}
-PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
+/* These are in i386_head.S */
+extern void lg_irq_enable(void);
+extern void lg_restore_fl(unsigned long flags);
-/*:*/
/*M:003 Note that we don't check for outstanding interrupts when we re-enable
* them (or when we unmask an interrupt). This seems to work for the moment,
* since interrupts are rare and we'll just get the interrupt on the next timer
@@ -361,8 +382,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
case 1: /* Basic feature request. */
/* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */
*cx &= 0x00002201;
- /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU. */
- *dx &= 0x07808111;
+ /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */
+ *dx &= 0x07808151;
/* The Host can do a nice optimization if it knows that the
* kernel mappings (addresses above 0xC0000000 or whatever
* PAGE_OFFSET is set to) haven't changed. But Linux calls
@@ -381,6 +402,11 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx,
if (*ax > 0x80000008)
*ax = 0x80000008;
break;
+ case 0x80000001:
+ /* Here we should fix nx cap depending on host. */
+ /* For this version of PAE, we just clear NX bit. */
+ *dx &= ~(1 << 20);
+ break;
}
}
@@ -514,25 +540,52 @@ static void lguest_write_cr4(unsigned long val)
static void lguest_pte_update(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
+#ifdef CONFIG_X86_PAE
+ lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr,
+ ptep->pte_low, ptep->pte_high);
+#else
lazy_hcall3(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low);
+#endif
}
static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval)
{
- *ptep = pteval;
+ native_set_pte(ptep, pteval);
lguest_pte_update(mm, addr, ptep);
}
-/* The Guest calls this to set a top-level entry. Again, we set the entry then
- * tell the Host which top-level page we changed, and the index of the entry we
- * changed. */
+/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd
+ * to set a middle-level entry when PAE is activated.
+ * Again, we set the entry then tell the Host which page we changed,
+ * and the index of the entry we changed. */
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pud(pud_t *pudp, pud_t pudval)
+{
+ native_set_pud(pudp, pudval);
+
+ /* 32 bytes aligned pdpt address and the index. */
+ lazy_hcall2(LHCALL_SET_PGD, __pa(pudp) & 0xFFFFFFE0,
+ (__pa(pudp) & 0x1F) / sizeof(pud_t));
+}
+
static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
{
- *pmdp = pmdval;
+ native_set_pmd(pmdp, pmdval);
lazy_hcall2(LHCALL_SET_PMD, __pa(pmdp) & PAGE_MASK,
- (__pa(pmdp) & (PAGE_SIZE - 1)) / 4);
+ (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
}
+#else
+
+/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not
+ * activated. */
+static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+ native_set_pmd(pmdp, pmdval);
+ lazy_hcall2(LHCALL_SET_PGD, __pa(pmdp) & PAGE_MASK,
+ (__pa(pmdp) & (PAGE_SIZE - 1)) / sizeof(pmd_t));
+}
+#endif
/* There are a couple of legacy places where the kernel sets a PTE, but we
* don't know the top level any more. This is useless for us, since we don't
@@ -545,11 +598,31 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
* which brings boot back to 0.25 seconds. */
static void lguest_set_pte(pte_t *ptep, pte_t pteval)
{
- *ptep = pteval;
+ native_set_pte(ptep, pteval);
if (cr3_changed)
lazy_hcall1(LHCALL_FLUSH_TLB, 1);
}
+#ifdef CONFIG_X86_PAE
+static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte)
+{
+ native_set_pte_atomic(ptep, pte);
+ if (cr3_changed)
+ lazy_hcall1(LHCALL_FLUSH_TLB, 1);
+}
+
+void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ native_pte_clear(mm, addr, ptep);
+ lguest_pte_update(mm, addr, ptep);
+}
+
+void lguest_pmd_clear(pmd_t *pmdp)
+{
+ lguest_set_pmd(pmdp, __pmd(0));
+}
+#endif
+
/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on
* native page table operations. On native hardware you can set a new page
* table entry whenever you want, but if you want to remove one you have to do
@@ -621,13 +694,12 @@ static void __init lguest_init_IRQ(void)
{
unsigned int i;
- for (i = 0; i < LGUEST_IRQS; i++) {
- int vector = FIRST_EXTERNAL_VECTOR + i;
+ for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
/* Some systems map "vectors" to interrupts weirdly. Lguest has
* a straightforward 1 to 1 mapping, so force that here. */
- __get_cpu_var(vector_irq)[vector] = i;
- if (vector != SYSCALL_VECTOR)
- set_intr_gate(vector, interrupt[i]);
+ __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR;
+ if (i != SYSCALL_VECTOR)
+ set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
}
/* This call is required to set up for 4k stacks, where we have
* separate stacks for hard and soft interrupts. */
@@ -636,7 +708,7 @@ static void __init lguest_init_IRQ(void)
void lguest_setup_irq(unsigned int irq)
{
- irq_to_desc_alloc_cpu(irq, 0);
+ irq_to_desc_alloc_node(irq, 0);
set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
handle_level_irq, "level");
}
@@ -966,10 +1038,10 @@ static void lguest_restart(char *reason)
*
* Our current solution is to allow the paravirt back end to optionally patch
* over the indirect calls to replace them with something more efficient. We
- * patch the four most commonly called functions: disable interrupts, enable
- * interrupts, restore interrupts and save interrupts. We usually have 6 or 10
- * bytes to patch into: the Guest versions of these operations are small enough
- * that we can fit comfortably.
+ * patch two of the simplest of the most commonly called functions: disable
+ * interrupts and save interrupts. We usually have 6 or 10 bytes to patch
+ * into: the Guest versions of these operations are small enough that we can
+ * fit comfortably.
*
* First we need assembly templates of each of the patchable Guest operations,
* and these are in i386_head.S. */
@@ -980,8 +1052,6 @@ static const struct lguest_insns
const char *start, *end;
} lguest_insns[] = {
[PARAVIRT_PATCH(pv_irq_ops.irq_disable)] = { lgstart_cli, lgend_cli },
- [PARAVIRT_PATCH(pv_irq_ops.irq_enable)] = { lgstart_sti, lgend_sti },
- [PARAVIRT_PATCH(pv_irq_ops.restore_fl)] = { lgstart_popf, lgend_popf },
[PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf },
};
@@ -1019,6 +1089,7 @@ __init void lguest_init(void)
pv_info.name = "lguest";
pv_info.paravirt_enabled = 1;
pv_info.kernel_rpl = 1;
+ pv_info.shared_kernel_pmd = 1;
/* We set up all the lguest overrides for sensitive operations. These
* are detailed with the operations themselves. */
@@ -1026,9 +1097,9 @@ __init void lguest_init(void)
/* interrupt-related operations */
pv_irq_ops.init_IRQ = lguest_init_IRQ;
pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
- pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+ pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl);
pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
- pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
+ pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable);
pv_irq_ops.safe_halt = lguest_safe_halt;
/* init-time operations */
@@ -1053,8 +1124,8 @@ __init void lguest_init(void)
pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
pv_cpu_ops.wbinvd = lguest_wbinvd;
- pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
- pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+ pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
+ pv_cpu_ops.end_context_switch = lguest_end_context_switch;
/* pagetable management */
pv_mmu_ops.write_cr3 = lguest_write_cr3;
@@ -1064,10 +1135,16 @@ __init void lguest_init(void)
pv_mmu_ops.set_pte = lguest_set_pte;
pv_mmu_ops.set_pte_at = lguest_set_pte_at;
pv_mmu_ops.set_pmd = lguest_set_pmd;
+#ifdef CONFIG_X86_PAE
+ pv_mmu_ops.set_pte_atomic = lguest_set_pte_atomic;
+ pv_mmu_ops.pte_clear = lguest_pte_clear;
+ pv_mmu_ops.pmd_clear = lguest_pmd_clear;
+ pv_mmu_ops.set_pud = lguest_set_pud;
+#endif
pv_mmu_ops.read_cr2 = lguest_read_cr2;
pv_mmu_ops.read_cr3 = lguest_read_cr3;
pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
- pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+ pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
pv_mmu_ops.pte_update = lguest_pte_update;
pv_mmu_ops.pte_update_defer = lguest_pte_update;
@@ -1088,13 +1165,21 @@ __init void lguest_init(void)
* lguest_init() where the rest of the fairly chaotic boot setup
* occurs. */
+ /* The stack protector is a weird thing where gcc places a canary
+ * value on the stack and then checks it on return. This file is
+ * compiled with -fno-stack-protector it, so we got this far without
+ * problems. The value of the canary is kept at offset 20 from the
+ * %gs register, so we need to set that up before calling C functions
+ * in other files. */
+ setup_stack_canary_segment(0);
+ /* We could just call load_stack_canary_segment(), but we might as
+ * call switch_to_new_gdt() which loads the whole table and sets up
+ * the per-cpu segment descriptor register %fs as well. */
+ switch_to_new_gdt(0);
+
/* As described in head_32.S, we map the first 128M of memory. */
max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
- /* Load the %fs segment register (the per-cpu segment register) with
- * the normal data segment to get through booting. */
- asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
-
/* The Host<->Guest Switcher lives at the top of our address space, and
* the Host told us how big it is when we made LGUEST_INIT hypercall:
* it put the answer in lguest_data.reserve_mem */
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index f79541989471..a9c8cfe61cd4 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -46,10 +46,64 @@ ENTRY(lguest_entry)
.globl lgstart_##name; .globl lgend_##name
LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(sti, movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled)
-LGUEST_PATCH(popf, movl %eax, lguest_data+LGUEST_DATA_irq_enabled)
LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax)
-/*:*/
+
+/*G:033 But using those wrappers is inefficient (we'll see why that doesn't
+ * matter for save_fl and irq_disable later). If we write our routines
+ * carefully in assembler, we can avoid clobbering any registers and avoid
+ * jumping through the wrapper functions.
+ *
+ * I skipped over our first piece of assembler, but this one is worth studying
+ * in a bit more detail so I'll describe in easy stages. First, the routine
+ * to enable interrupts: */
+ENTRY(lg_irq_enable)
+ /* The reverse of irq_disable, this sets lguest_data.irq_enabled to
+ * X86_EFLAGS_IF (ie. "Interrupts enabled"). */
+ movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled
+ /* But now we need to check if the Host wants to know: there might have
+ * been interrupts waiting to be delivered, in which case it will have
+ * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we
+ * jump to send_interrupts, otherwise we're done. */
+ testl $0, lguest_data+LGUEST_DATA_irq_pending
+ jnz send_interrupts
+ /* One cool thing about x86 is that you can do many things without using
+ * a register. In this case, the normal path hasn't needed to save or
+ * restore any registers at all! */
+ ret
+send_interrupts:
+ /* OK, now we need a register: eax is used for the hypercall number,
+ * which is LHCALL_SEND_INTERRUPTS.
+ *
+ * We used not to bother with this pending detection at all, which was
+ * much simpler. Sooner or later the Host would realize it had to
+ * send us an interrupt. But that turns out to make performance 7
+ * times worse on a simple tcp benchmark. So now we do this the hard
+ * way. */
+ pushl %eax
+ movl $LHCALL_SEND_INTERRUPTS, %eax
+ /* This is a vmcall instruction (same thing that KVM uses). Older
+ * assembler versions might not know the "vmcall" instruction, so we
+ * create one manually here. */
+ .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */
+ popl %eax
+ ret
+
+/* Finally, the "popf" or "restore flags" routine. The %eax register holds the
+ * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're
+ * enabling interrupts again, if it's 0 we're leaving them off. */
+ENTRY(lg_restore_fl)
+ /* This is just "lguest_data.irq_enabled = flags;" */
+ movl %eax, lguest_data+LGUEST_DATA_irq_enabled
+ /* Now, if the %eax value has enabled interrupts and
+ * lguest_data.irq_pending is set, we want to tell the Host so it can
+ * deliver any outstanding interrupts. Fortunately, both values will
+ * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl"
+ * instruction will AND them together for us. If both are set, we
+ * jump to send_interrupts. */
+ testl lguest_data+LGUEST_DATA_irq_pending, %eax
+ jnz send_interrupts
+ /* Again, the normal path has used no extra registers. Clever, huh? */
+ ret
/* These demark the EIP range where host should never deliver interrupts. */
.global lguest_noirq_start