From 4595f9620cda8a1e973588e743cf5f8436dd20c6 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 10 Jan 2009 21:58:09 -0800
Subject: x86: change flush_tlb_others to take a const struct cpumask

Impact: reduce stack usage, use new cpumask API.

This is made a little more tricky by uv_flush_tlb_others which
actually alters its argument, for an IPI to be sent to the remaining
cpus in the mask.

I solve this by allocating a cpumask_var_t for this case and falling back
to IPI should this fail.

To eliminate temporaries in the caller, all flush_tlb_others implementations
now do the this-cpu-elimination step themselves.

Note also the curious "cpus_or(f->flush_cpumask, cpumask, f->flush_cpumask)"
which has been there since pre-git and yet f->flush_cpumask is always zero
at this point.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 arch/x86/xen/enlighten.c | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

(limited to 'arch/x86/xen/enlighten.c')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bea215230b20..965539ec425f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -634,35 +634,27 @@ static void xen_flush_tlb_single(unsigned long addr)
 	preempt_enable();
 }
 
-static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
-				 unsigned long va)
+static void xen_flush_tlb_others(const struct cpumask *cpus,
+				 struct mm_struct *mm, unsigned long va)
 {
 	struct {
 		struct mmuext_op op;
-		cpumask_t mask;
+		DECLARE_BITMAP(mask, NR_CPUS);
 	} *args;
-	cpumask_t cpumask = *cpus;
 	struct multicall_space mcs;
 
-	/*
-	 * A couple of (to be removed) sanity checks:
-	 *
-	 * - current CPU must not be in mask
-	 * - mask must exist :)
-	 */
-	BUG_ON(cpus_empty(cpumask));
-	BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+	BUG_ON(cpumask_empty(cpus));
 	BUG_ON(!mm);
 
-	/* If a CPU which we ran on has gone down, OK. */
-	cpus_and(cpumask, cpumask, cpu_online_map);
-	if (cpus_empty(cpumask))
-		return;
-
 	mcs = xen_mc_entry(sizeof(*args));
 	args = mcs.args;
-	args->mask = cpumask;
-	args->op.arg2.vcpumask = &args->mask;
+	args->op.arg2.vcpumask = to_cpumask(args->mask);
+
+	/* Remove us, and any offline CPUS. */
+	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
+	if (unlikely(cpumask_empty(to_cpumask(args->mask))))
+		goto issue;
 
 	if (va == TLB_FLUSH_ALL) {
 		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
@@ -673,6 +665,7 @@ static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
 
 	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
 
+issue:
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
 
-- 
cgit v1.2.3


From 004aa322f855a765741d9437a98dd8fe2e4f32a6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 13 Jan 2009 20:41:35 +0900
Subject: x86: misc clean up after the percpu update

Do the following cleanups:

* kill x86_64_init_pda() which now is equivalent to pda_init()

* use per_cpu_offset() instead of cpu_pda() when initializing
  initial_gs

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/setup.h | 1 -
 arch/x86/kernel/acpi/sleep.c | 2 +-
 arch/x86/kernel/head64.c     | 7 +------
 arch/x86/kernel/smpboot.c    | 2 +-
 arch/x86/xen/enlighten.c     | 2 +-
 5 files changed, 4 insertions(+), 10 deletions(-)

(limited to 'arch/x86/xen/enlighten.c')

diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ebe858cdc8a3..536949749bc2 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -100,7 +100,6 @@ extern unsigned long init_pg_tables_start;
 extern unsigned long init_pg_tables_end;
 
 #else
-void __init x86_64_init_pda(void);
 void __init x86_64_start_kernel(char *real_mode);
 void __init x86_64_start_reservations(char *real_mode_data);
 
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 9ff67f8dc2c0..4abff454c55b 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -101,7 +101,7 @@ int acpi_save_state_mem(void)
 	stack_start.sp = temp_stack + sizeof(temp_stack);
 	early_gdt_descr.address =
 			(unsigned long)get_cpu_gdt_table(smp_processor_id());
-	initial_gs = (unsigned long)cpu_pda(smp_processor_id());
+	initial_gs = per_cpu_offset(smp_processor_id());
 #endif
 	initial_code = (unsigned long)wakeup_long64;
 	saved_magic = 0x123456789abcdef0;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 71b6f6ec96a2..af67d3227ea6 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,11 +26,6 @@
 #include <asm/bios_ebda.h>
 #include <asm/trampoline.h>
 
-void __init x86_64_init_pda(void)
-{
-	pda_init(0);
-}
-
 static void __init zap_identity_mappings(void)
 {
 	pgd_t *pgd = pgd_offset_k(0UL);
@@ -96,7 +91,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	if (console_loglevel == 10)
 		early_printk("Kernel alive\n");
 
-	x86_64_init_pda();
+	pda_init(0);
 
 	x86_64_start_reservations(real_mode_data);
 }
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f2f77ca494d4..2f0e0f1090f6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -798,7 +798,7 @@ do_rest:
 #else
 	cpu_pda(cpu)->pcurrent = c_idle.idle;
 	clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
-	initial_gs = (unsigned long)cpu_pda(cpu);
+	initial_gs = per_cpu_offset(cpu);
 #endif
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
 	initial_code = (unsigned long)start_secondary;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 965539ec425f..312414ef9365 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1645,7 +1645,7 @@ asmlinkage void __init xen_start_kernel(void)
 #ifdef CONFIG_X86_64
 	/* Disable until direct per-cpu data access. */
 	have_vcpu_info_placement = 0;
-	x86_64_init_pda();
+	pda_init(0);
 #endif
 
 	xen_smp_init();
-- 
cgit v1.2.3


From 6dbde3530850d4d8bfc1b6bd4006d92786a2787f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 15 Jan 2009 22:15:53 +0900
Subject: percpu: add optimized generic percpu accessors

It is an optimization and a cleanup, and adds the following new
generic percpu methods:

  percpu_read()
  percpu_write()
  percpu_add()
  percpu_sub()
  percpu_and()
  percpu_or()
  percpu_xor()

and implements support for them on x86. (other architectures will fall
back to a default implementation)

The advantage is that for example to read a local percpu variable,
instead of this sequence:

 return __get_cpu_var(var);

 ffffffff8102ca2b:	48 8b 14 fd 80 09 74 	mov    -0x7e8bf680(,%rdi,8),%rdx
 ffffffff8102ca32:	81
 ffffffff8102ca33:	48 c7 c0 d8 59 00 00 	mov    $0x59d8,%rax
 ffffffff8102ca3a:	48 8b 04 10          	mov    (%rax,%rdx,1),%rax

We can get a single instruction by using the optimized variants:

 return percpu_read(var);

 ffffffff8102ca3f:	65 48 8b 05 91 8f fd 	mov    %gs:0x7efd8f91(%rip),%rax

I also cleaned up the x86-specific APIs and made the x86 code use
these new generic percpu primitives.

tj: * fixed generic percpu_sub() definition as Roel Kluin pointed out
    * added percpu_and() for completeness's sake
    * made generic percpu ops atomic against preemption

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/x86/include/asm/current.h        |  2 +-
 arch/x86/include/asm/irq_regs_32.h    |  4 +--
 arch/x86/include/asm/mmu_context_32.h | 12 ++++----
 arch/x86/include/asm/pda.h            | 10 +++----
 arch/x86/include/asm/percpu.h         | 24 ++++++++--------
 arch/x86/include/asm/smp.h            |  2 +-
 arch/x86/kernel/process_32.c          |  2 +-
 arch/x86/kernel/tlb_32.c              | 10 +++----
 arch/x86/mach-voyager/voyager_smp.c   |  4 +--
 arch/x86/xen/enlighten.c              | 14 +++++-----
 arch/x86/xen/irq.c                    |  8 +++---
 arch/x86/xen/mmu.c                    |  2 +-
 arch/x86/xen/multicalls.h             |  2 +-
 arch/x86/xen/smp.c                    |  2 +-
 include/asm-generic/percpu.h          | 52 +++++++++++++++++++++++++++++++++++
 15 files changed, 102 insertions(+), 48 deletions(-)

(limited to 'arch/x86/xen/enlighten.c')

diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 0930b4f8d672..0728480f5c56 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -10,7 +10,7 @@ struct task_struct;
 DECLARE_PER_CPU(struct task_struct *, current_task);
 static __always_inline struct task_struct *get_current(void)
 {
-	return x86_read_percpu(current_task);
+	return percpu_read(current_task);
 }
 
 #else /* X86_32 */
diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h
index 86afd7473457..d7ed33ee94e9 100644
--- a/arch/x86/include/asm/irq_regs_32.h
+++ b/arch/x86/include/asm/irq_regs_32.h
@@ -15,7 +15,7 @@ DECLARE_PER_CPU(struct pt_regs *, irq_regs);
 
 static inline struct pt_regs *get_irq_regs(void)
 {
-	return x86_read_percpu(irq_regs);
+	return percpu_read(irq_regs);
 }
 
 static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
@@ -23,7 +23,7 @@ static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
 	struct pt_regs *old_regs;
 
 	old_regs = get_irq_regs();
-	x86_write_percpu(irq_regs, new_regs);
+	percpu_write(irq_regs, new_regs);
 
 	return old_regs;
 }
diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h
index 7e98ce1d2c0e..08b53454f831 100644
--- a/arch/x86/include/asm/mmu_context_32.h
+++ b/arch/x86/include/asm/mmu_context_32.h
@@ -4,8 +4,8 @@
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
 #ifdef CONFIG_SMP
-	if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
-		x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
+	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+		percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
 #endif
 }
 
@@ -19,8 +19,8 @@ static inline void switch_mm(struct mm_struct *prev,
 		/* stop flush ipis for the previous mm */
 		cpu_clear(cpu, prev->cpu_vm_mask);
 #ifdef CONFIG_SMP
-		x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
-		x86_write_percpu(cpu_tlbstate.active_mm, next);
+		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+		percpu_write(cpu_tlbstate.active_mm, next);
 #endif
 		cpu_set(cpu, next->cpu_vm_mask);
 
@@ -35,8 +35,8 @@ static inline void switch_mm(struct mm_struct *prev,
 	}
 #ifdef CONFIG_SMP
 	else {
-		x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
-		BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
+		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+		BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
 
 		if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
 			/* We were in lazy tlb mode and leave_mm disabled
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index e3d3a081d798..47f274fe6953 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -45,11 +45,11 @@ extern void pda_init(int);
 
 #define cpu_pda(cpu)		(&per_cpu(__pda, cpu))
 
-#define read_pda(field)		x86_read_percpu(__pda.field)
-#define write_pda(field, val)	x86_write_percpu(__pda.field, val)
-#define add_pda(field, val)	x86_add_percpu(__pda.field, val)
-#define sub_pda(field, val)	x86_sub_percpu(__pda.field, val)
-#define or_pda(field, val)	x86_or_percpu(__pda.field, val)
+#define read_pda(field)		percpu_read(__pda.field)
+#define write_pda(field, val)	percpu_write(__pda.field, val)
+#define add_pda(field, val)	percpu_add(__pda.field, val)
+#define sub_pda(field, val)	percpu_sub(__pda.field, val)
+#define or_pda(field, val)	percpu_or(__pda.field, val)
 
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */
 #define test_and_clear_bit_pda(bit, field)				\
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 328b31a429d7..03aa4b00a1c3 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -40,16 +40,11 @@
 
 #ifdef CONFIG_SMP
 #define __percpu_seg_str	"%%"__stringify(__percpu_seg)":"
-#define __my_cpu_offset		x86_read_percpu(this_cpu_off)
+#define __my_cpu_offset		percpu_read(this_cpu_off)
 #else
 #define __percpu_seg_str
 #endif
 
-#include <asm-generic/percpu.h>
-
-/* We can use this directly for local CPU (faster). */
-DECLARE_PER_CPU(unsigned long, this_cpu_off);
-
 /* For arch-specific code, we can use direct single-insn ops (they
  * don't give an lvalue though). */
 extern void __bad_percpu_size(void);
@@ -115,11 +110,13 @@ do {							\
 	ret__;						\
 })
 
-#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
-#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val)
-#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val)
-#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
-#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
+#define percpu_read(var)	percpu_from_op("mov", per_cpu__##var)
+#define percpu_write(var, val)	percpu_to_op("mov", per_cpu__##var, val)
+#define percpu_add(var, val)	percpu_to_op("add", per_cpu__##var, val)
+#define percpu_sub(var, val)	percpu_to_op("sub", per_cpu__##var, val)
+#define percpu_and(var, val)	percpu_to_op("and", per_cpu__##var, val)
+#define percpu_or(var, val)	percpu_to_op("or", per_cpu__##var, val)
+#define percpu_xor(var, val)	percpu_to_op("xor", per_cpu__##var, val)
 
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */
 #define x86_test_and_clear_bit_percpu(bit, var)				\
@@ -131,6 +128,11 @@ do {							\
 	old__;								\
 })
 
+#include <asm-generic/percpu.h>
+
+/* We can use this directly for local CPU (faster). */
+DECLARE_PER_CPU(unsigned long, this_cpu_off);
+
 #ifdef CONFIG_X86_64
 extern void load_pda_offset(int cpu);
 #else
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 127415402ea1..c7bbbbe65d3f 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -160,7 +160,7 @@ extern unsigned disabled_cpus __cpuinitdata;
  * from the initial startup. We map APIC_BASE very early in page_setup(),
  * so this is correct in the x86 case.
  */
-#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
+#define raw_smp_processor_id() (percpu_read(cpu_number))
 extern int safe_smp_processor_id(void);
 
 #elif defined(CONFIG_X86_64_SMP)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a546f55c77b4..77d546817d94 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -591,7 +591,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (prev->gs | next->gs)
 		loadsegment(gs, next->gs);
 
-	x86_write_percpu(current_task, next_p);
+	percpu_write(current_task, next_p);
 
 	return prev_p;
 }
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index ec53818f4e38..e65449d0f7d9 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -34,8 +34,8 @@ static DEFINE_SPINLOCK(tlbstate_lock);
  */
 void leave_mm(int cpu)
 {
-	BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
-	cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
+	BUG_ON(percpu_read(cpu_tlbstate.state) == TLBSTATE_OK);
+	cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
 	load_cr3(swapper_pg_dir);
 }
 EXPORT_SYMBOL_GPL(leave_mm);
@@ -103,8 +103,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
 		 * BUG();
 		 */
 
-	if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
-		if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
+	if (flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+		if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
 			if (flush_va == TLB_FLUSH_ALL)
 				local_flush_tlb();
 			else
@@ -222,7 +222,7 @@ static void do_flush_tlb_all(void *info)
 	unsigned long cpu = smp_processor_id();
 
 	__flush_tlb_all();
-	if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
+	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
 		leave_mm(cpu);
 }
 
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 1a48368acb09..96f15b09a4c5 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -402,7 +402,7 @@ void __init find_smp_config(void)
 	     VOYAGER_SUS_IN_CONTROL_PORT);
 
 	current_thread_info()->cpu = boot_cpu_id;
-	x86_write_percpu(cpu_number, boot_cpu_id);
+	percpu_write(cpu_number, boot_cpu_id);
 }
 
 /*
@@ -1782,7 +1782,7 @@ static void __init voyager_smp_cpus_done(unsigned int max_cpus)
 void __init smp_setup_processor_id(void)
 {
 	current_thread_info()->cpu = hard_smp_processor_id();
-	x86_write_percpu(cpu_number, hard_smp_processor_id());
+	percpu_write(cpu_number, hard_smp_processor_id());
 }
 
 static void voyager_send_call_func(cpumask_t callmask)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 312414ef9365..75b94139e1f2 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -695,17 +695,17 @@ static void xen_write_cr0(unsigned long cr0)
 
 static void xen_write_cr2(unsigned long cr2)
 {
-	x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
+	percpu_read(xen_vcpu)->arch.cr2 = cr2;
 }
 
 static unsigned long xen_read_cr2(void)
 {
-	return x86_read_percpu(xen_vcpu)->arch.cr2;
+	return percpu_read(xen_vcpu)->arch.cr2;
 }
 
 static unsigned long xen_read_cr2_direct(void)
 {
-	return x86_read_percpu(xen_vcpu_info.arch.cr2);
+	return percpu_read(xen_vcpu_info.arch.cr2);
 }
 
 static void xen_write_cr4(unsigned long cr4)
@@ -718,12 +718,12 @@ static void xen_write_cr4(unsigned long cr4)
 
 static unsigned long xen_read_cr3(void)
 {
-	return x86_read_percpu(xen_cr3);
+	return percpu_read(xen_cr3);
 }
 
 static void set_current_cr3(void *v)
 {
-	x86_write_percpu(xen_current_cr3, (unsigned long)v);
+	percpu_write(xen_current_cr3, (unsigned long)v);
 }
 
 static void __xen_write_cr3(bool kernel, unsigned long cr3)
@@ -748,7 +748,7 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3)
 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 
 	if (kernel) {
-		x86_write_percpu(xen_cr3, cr3);
+		percpu_write(xen_cr3, cr3);
 
 		/* Update xen_current_cr3 once the batch has actually
 		   been submitted. */
@@ -764,7 +764,7 @@ static void xen_write_cr3(unsigned long cr3)
 
 	/* Update while interrupts are disabled, so its atomic with
 	   respect to ipis */
-	x86_write_percpu(xen_cr3, cr3);
+	percpu_write(xen_cr3, cr3);
 
 	__xen_write_cr3(true, cr3);
 
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index bb042608c602..2e8271431e1a 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -39,7 +39,7 @@ static unsigned long xen_save_fl(void)
 	struct vcpu_info *vcpu;
 	unsigned long flags;
 
-	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu = percpu_read(xen_vcpu);
 
 	/* flag has opposite sense of mask */
 	flags = !vcpu->evtchn_upcall_mask;
@@ -62,7 +62,7 @@ static void xen_restore_fl(unsigned long flags)
 	   make sure we're don't switch CPUs between getting the vcpu
 	   pointer and updating the mask. */
 	preempt_disable();
-	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu = percpu_read(xen_vcpu);
 	vcpu->evtchn_upcall_mask = flags;
 	preempt_enable_no_resched();
 
@@ -83,7 +83,7 @@ static void xen_irq_disable(void)
 	   make sure we're don't switch CPUs between getting the vcpu
 	   pointer and updating the mask. */
 	preempt_disable();
-	x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+	percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
 	preempt_enable_no_resched();
 }
 
@@ -96,7 +96,7 @@ static void xen_irq_enable(void)
 	   the caller is confused and is trying to re-enable interrupts
 	   on an indeterminate processor. */
 
-	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu = percpu_read(xen_vcpu);
 	vcpu->evtchn_upcall_mask = 0;
 
 	/* Doesn't matter if we get preempted here, because any
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 503c240e26c7..7bc7852cc5c4 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1074,7 +1074,7 @@ static void drop_other_mm_ref(void *info)
 
 	/* If this cpu still has a stale cr3 reference, then make sure
 	   it has been flushed. */
-	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+	if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
 		load_cr3(swapper_pg_dir);
 		arch_flush_lazy_cpu_mode();
 	}
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 858938241616..e786fa7f2615 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -39,7 +39,7 @@ static inline void xen_mc_issue(unsigned mode)
 		xen_mc_flush();
 
 	/* restore flags saved in xen_mc_batch */
-	local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
+	local_irq_restore(percpu_read(xen_mc_irq_flags));
 }
 
 /* Set up a callback to be called when the current batch is flushed */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 83fa4236477d..3bfd6dd0b47c 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -78,7 +78,7 @@ static __cpuinit void cpu_bringup(void)
 	xen_setup_cpu_clockevents();
 
 	cpu_set(cpu, cpu_online_map);
-	x86_write_percpu(cpu_state, CPU_ONLINE);
+	percpu_write(cpu_state, CPU_ONLINE);
 	wmb();
 
 	/* We can take interrupts now: we're officially "up". */
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index b0e63c672ebd..00f45ff081a6 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -80,4 +80,56 @@ extern void setup_per_cpu_areas(void);
 #define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
 					__typeof__(type) per_cpu_var(name)
 
+/*
+ * Optional methods for optimized non-lvalue per-cpu variable access.
+ *
+ * @var can be a percpu variable or a field of it and its size should
+ * equal char, int or long.  percpu_read() evaluates to a lvalue and
+ * all others to void.
+ *
+ * These operations are guaranteed to be atomic w.r.t. preemption.
+ * The generic versions use plain get/put_cpu_var().  Archs are
+ * encouraged to implement single-instruction alternatives which don't
+ * require preemption protection.
+ */
+#ifndef percpu_read
+# define percpu_read(var)						\
+  ({									\
+	typeof(per_cpu_var(var)) __tmp_var__;				\
+	__tmp_var__ = get_cpu_var(var);					\
+	put_cpu_var(var);						\
+	__tmp_var__;							\
+  })
+#endif
+
+#define __percpu_generic_to_op(var, val, op)				\
+do {									\
+	get_cpu_var(var) op val;					\
+	put_cpu_var(var);						\
+} while (0)
+
+#ifndef percpu_write
+# define percpu_write(var, val)		__percpu_generic_to_op(var, (val), =)
+#endif
+
+#ifndef percpu_add
+# define percpu_add(var, val)		__percpu_generic_to_op(var, (val), +=)
+#endif
+
+#ifndef percpu_sub
+# define percpu_sub(var, val)		__percpu_generic_to_op(var, (val), -=)
+#endif
+
+#ifndef percpu_and
+# define percpu_and(var, val)		__percpu_generic_to_op(var, (val), &=)
+#endif
+
+#ifndef percpu_or
+# define percpu_or(var, val)		__percpu_generic_to_op(var, (val), |=)
+#endif
+
+#ifndef percpu_xor
+# define percpu_xor(var, val)		__percpu_generic_to_op(var, (val), ^=)
+#endif
+
 #endif /* _ASM_GENERIC_PERCPU_H_ */
-- 
cgit v1.2.3


From 8ce031972b40da58c268caba8c5ea3c0856d7131 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 19 Jan 2009 12:21:27 +0900
Subject: x86: remove pda_init()

Impact: cleanup

Copy the code to cpu_init() to satisfy the requirement that the cpu
be reinitialized.  Remove all other calls, since the segments are
already initialized in head_64.S.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/x86/include/asm/pda.h   |  1 -
 arch/x86/kernel/cpu/common.c | 15 +++------------
 arch/x86/kernel/head64.c     |  2 --
 arch/x86/xen/enlighten.c     |  1 -
 4 files changed, 3 insertions(+), 16 deletions(-)

(limited to 'arch/x86/xen/enlighten.c')

diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 4a8c9d382c93..b473e952439a 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -24,7 +24,6 @@ struct x8664_pda {
 } ____cacheline_aligned_in_smp;
 
 DECLARE_PER_CPU(struct x8664_pda, __pda);
-extern void pda_init(int);
 
 #define cpu_pda(cpu)		(&per_cpu(__pda, cpu))
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7976a6a0f65c..f83a4d6160f0 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -895,15 +895,6 @@ EXPORT_PER_CPU_SYMBOL(kernel_stack);
 
 DEFINE_PER_CPU(unsigned int, irq_count) = -1;
 
-void __cpuinit pda_init(int cpu)
-{
-	/* Setup up data that may be needed in __get_free_pages early */
-	loadsegment(fs, 0);
-	loadsegment(gs, 0);
-
-	load_pda_offset(cpu);
-}
-
 static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
 	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
 	__aligned(PAGE_SIZE);
@@ -967,9 +958,9 @@ void __cpuinit cpu_init(void)
 	struct task_struct *me;
 	int i;
 
-	/* CPU 0 is initialised in head64.c */
-	if (cpu != 0)
-		pda_init(cpu);
+	loadsegment(fs, 0);
+	loadsegment(gs, 0);
+	load_pda_offset(cpu);
 
 #ifdef CONFIG_NUMA
 	if (cpu != 0 && percpu_read(node_number) == 0 &&
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index af67d3227ea6..f5b272247690 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -91,8 +91,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	if (console_loglevel == 10)
 		early_printk("Kernel alive\n");
 
-	pda_init(0);
-
 	x86_64_start_reservations(real_mode_data);
 }
 
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 75b94139e1f2..bef941f61451 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1645,7 +1645,6 @@ asmlinkage void __init xen_start_kernel(void)
 #ifdef CONFIG_X86_64
 	/* Disable until direct per-cpu data access. */
 	have_vcpu_info_placement = 0;
-	pda_init(0);
 #endif
 
 	xen_smp_init();
-- 
cgit v1.2.3


From ab897d2013128f470240a541b31cf5e636984e71 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 22 Jan 2009 14:24:16 -0800
Subject: x86/pvops: remove pte_flags pvop

pte_flags() was introduced as a new pvop in order to extract just the
flags portion of a pte, which is a potentially cheaper operation than
extracting the page number as well.  It turns out this operation is
not needed, because simply using a mask to extract the flags from a
pte is sufficient for all current users.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/page.h     |  3 +--
 arch/x86/include/asm/paravirt.h | 18 ------------------
 arch/x86/kernel/paravirt.c      |  1 -
 arch/x86/xen/enlighten.c        |  1 -
 4 files changed, 1 insertion(+), 22 deletions(-)

(limited to 'arch/x86/xen/enlighten.c')

diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index e9873a2e8695..6b9810859daf 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -147,7 +147,7 @@ static inline pteval_t native_pte_val(pte_t pte)
 	return pte.pte;
 }
 
-static inline pteval_t native_pte_flags(pte_t pte)
+static inline pteval_t pte_flags(pte_t pte)
 {
 	return native_pte_val(pte) & PTE_FLAGS_MASK;
 }
@@ -173,7 +173,6 @@ static inline pteval_t native_pte_flags(pte_t pte)
 #endif
 
 #define pte_val(x)	native_pte_val(x)
-#define pte_flags(x)	native_pte_flags(x)
 #define __pte(x)	native_make_pte(x)
 
 #endif	/* CONFIG_PARAVIRT */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ba3e2ff6aedc..e25c410f3d8c 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -279,7 +279,6 @@ struct pv_mmu_ops {
 					pte_t *ptep, pte_t pte);
 
 	pteval_t (*pte_val)(pte_t);
-	pteval_t (*pte_flags)(pte_t);
 	pte_t (*make_pte)(pteval_t pte);
 
 	pgdval_t (*pgd_val)(pgd_t);
@@ -1084,23 +1083,6 @@ static inline pteval_t pte_val(pte_t pte)
 	return ret;
 }
 
-static inline pteval_t pte_flags(pte_t pte)
-{
-	pteval_t ret;
-
-	if (sizeof(pteval_t) > sizeof(long))
-		ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags,
-				 pte.pte, (u64)pte.pte >> 32);
-	else
-		ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags,
-				 pte.pte);
-
-#ifdef CONFIG_PARAVIRT_DEBUG
-	BUG_ON(ret & PTE_PFN_MASK);
-#endif
-	return ret;
-}
-
 static inline pgd_t __pgd(pgdval_t val)
 {
 	pgdval_t ret;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e4c8fb608873..202514be5923 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -435,7 +435,6 @@ struct pv_mmu_ops pv_mmu_ops = {
 #endif /* PAGETABLE_LEVELS >= 3 */
 
 	.pte_val = native_pte_val,
-	.pte_flags = native_pte_flags,
 	.pgd_val = native_pgd_val,
 
 	.make_pte = native_make_pte,
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bea215230b20..6f1bb71aa13a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1314,7 +1314,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
 
 	.pte_val = xen_pte_val,
-	.pte_flags = native_pte_flags,
 	.pgd_val = xen_pgd_val,
 
 	.make_pte = xen_make_pte,
-- 
cgit v1.2.3


From 319f3ba52c71630865b10ac3b99dd020440d681d Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 28 Jan 2009 14:35:01 -0800
Subject: xen: move remaining mmu-related stuff into mmu.c

Impact: Cleanup

Move remaining mmu-related stuff into mmu.c.
A general cleanup, and lay the groundwork for later patches.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/xen/enlighten.c | 731 +----------------------------------------------
 arch/x86/xen/mmu.c       | 700 +++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/xen/mmu.h       |   3 +
 arch/x86/xen/xen-ops.h   |   8 +
 4 files changed, 712 insertions(+), 730 deletions(-)

(limited to 'arch/x86/xen/enlighten.c')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 6b3f7eef57e3..0cd2a165f179 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -61,35 +61,6 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 enum xen_domain_type xen_domain_type = XEN_NATIVE;
 EXPORT_SYMBOL_GPL(xen_domain_type);
 
-/*
- * Identity map, in addition to plain kernel map.  This needs to be
- * large enough to allocate page table pages to allocate the rest.
- * Each page can map 2MB.
- */
-static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
-
-#ifdef CONFIG_X86_64
-/* l3 pud for userspace vsyscall mapping */
-static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
-#endif /* CONFIG_X86_64 */
-
-/*
- * Note about cr3 (pagetable base) values:
- *
- * xen_cr3 contains the current logical cr3 value; it contains the
- * last set cr3.  This may not be the current effective cr3, because
- * its update may be being lazily deferred.  However, a vcpu looking
- * at its own cr3 can use this value knowing that it everything will
- * be self-consistent.
- *
- * xen_current_cr3 contains the actual vcpu cr3; it is set once the
- * hypercall to set the vcpu cr3 is complete (so it may be a little
- * out of date, but it will never be set early).  If one vcpu is
- * looking at another vcpu's cr3 value, it should use this variable.
- */
-DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
-DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
-
 struct start_info *xen_start_info;
 EXPORT_SYMBOL_GPL(xen_start_info);
 
@@ -237,7 +208,7 @@ static unsigned long xen_get_debugreg(int reg)
 	return HYPERVISOR_get_debugreg(reg);
 }
 
-static void xen_leave_lazy(void)
+void xen_leave_lazy(void)
 {
 	paravirt_leave_lazy(paravirt_get_lazy_mode());
 	xen_mc_flush();
@@ -598,76 +569,6 @@ static struct apic_ops xen_basic_apic_ops = {
 
 #endif
 
-static void xen_flush_tlb(void)
-{
-	struct mmuext_op *op;
-	struct multicall_space mcs;
-
-	preempt_disable();
-
-	mcs = xen_mc_entry(sizeof(*op));
-
-	op = mcs.args;
-	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
-	xen_mc_issue(PARAVIRT_LAZY_MMU);
-
-	preempt_enable();
-}
-
-static void xen_flush_tlb_single(unsigned long addr)
-{
-	struct mmuext_op *op;
-	struct multicall_space mcs;
-
-	preempt_disable();
-
-	mcs = xen_mc_entry(sizeof(*op));
-	op = mcs.args;
-	op->cmd = MMUEXT_INVLPG_LOCAL;
-	op->arg1.linear_addr = addr & PAGE_MASK;
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
-	xen_mc_issue(PARAVIRT_LAZY_MMU);
-
-	preempt_enable();
-}
-
-static void xen_flush_tlb_others(const struct cpumask *cpus,
-				 struct mm_struct *mm, unsigned long va)
-{
-	struct {
-		struct mmuext_op op;
-		DECLARE_BITMAP(mask, NR_CPUS);
-	} *args;
-	struct multicall_space mcs;
-
-	BUG_ON(cpumask_empty(cpus));
-	BUG_ON(!mm);
-
-	mcs = xen_mc_entry(sizeof(*args));
-	args = mcs.args;
-	args->op.arg2.vcpumask = to_cpumask(args->mask);
-
-	/* Remove us, and any offline CPUS. */
-	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
-	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
-	if (unlikely(cpumask_empty(to_cpumask(args->mask))))
-		goto issue;
-
-	if (va == TLB_FLUSH_ALL) {
-		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
-	} else {
-		args->op.cmd = MMUEXT_INVLPG_MULTI;
-		args->op.arg1.linear_addr = va;
-	}
-
-	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
-
-issue:
-	xen_mc_issue(PARAVIRT_LAZY_MMU);
-}
 
 static void xen_clts(void)
 {
@@ -693,21 +594,6 @@ static void xen_write_cr0(unsigned long cr0)
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
 }
 
-static void xen_write_cr2(unsigned long cr2)
-{
-	percpu_read(xen_vcpu)->arch.cr2 = cr2;
-}
-
-static unsigned long xen_read_cr2(void)
-{
-	return percpu_read(xen_vcpu)->arch.cr2;
-}
-
-static unsigned long xen_read_cr2_direct(void)
-{
-	return percpu_read(xen_vcpu_info.arch.cr2);
-}
-
 static void xen_write_cr4(unsigned long cr4)
 {
 	cr4 &= ~X86_CR4_PGE;
@@ -716,71 +602,6 @@ static void xen_write_cr4(unsigned long cr4)
 	native_write_cr4(cr4);
 }
 
-static unsigned long xen_read_cr3(void)
-{
-	return percpu_read(xen_cr3);
-}
-
-static void set_current_cr3(void *v)
-{
-	percpu_write(xen_current_cr3, (unsigned long)v);
-}
-
-static void __xen_write_cr3(bool kernel, unsigned long cr3)
-{
-	struct mmuext_op *op;
-	struct multicall_space mcs;
-	unsigned long mfn;
-
-	if (cr3)
-		mfn = pfn_to_mfn(PFN_DOWN(cr3));
-	else
-		mfn = 0;
-
-	WARN_ON(mfn == 0 && kernel);
-
-	mcs = __xen_mc_entry(sizeof(*op));
-
-	op = mcs.args;
-	op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
-	op->arg1.mfn = mfn;
-
-	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
-	if (kernel) {
-		percpu_write(xen_cr3, cr3);
-
-		/* Update xen_current_cr3 once the batch has actually
-		   been submitted. */
-		xen_mc_callback(set_current_cr3, (void *)cr3);
-	}
-}
-
-static void xen_write_cr3(unsigned long cr3)
-{
-	BUG_ON(preemptible());
-
-	xen_mc_batch();  /* disables interrupts */
-
-	/* Update while interrupts are disabled, so its atomic with
-	   respect to ipis */
-	percpu_write(xen_cr3, cr3);
-
-	__xen_write_cr3(true, cr3);
-
-#ifdef CONFIG_X86_64
-	{
-		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
-		if (user_pgd)
-			__xen_write_cr3(false, __pa(user_pgd));
-		else
-			__xen_write_cr3(false, 0);
-	}
-#endif
-
-	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
-}
-
 static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 {
 	int ret;
@@ -822,185 +643,6 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 	return ret;
 }
 
-/* Early in boot, while setting up the initial pagetable, assume
-   everything is pinned. */
-static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
-{
-#ifdef CONFIG_FLATMEM
-	BUG_ON(mem_map);	/* should only be used early */
-#endif
-	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-}
-
-/* Early release_pte assumes that all pts are pinned, since there's
-   only init_mm and anything attached to that is pinned. */
-static void xen_release_pte_init(unsigned long pfn)
-{
-	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
-}
-
-static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
-{
-	struct mmuext_op op;
-	op.cmd = cmd;
-	op.arg1.mfn = pfn_to_mfn(pfn);
-	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-		BUG();
-}
-
-/* This needs to make sure the new pte page is pinned iff its being
-   attached to a pinned pagetable. */
-static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
-{
-	struct page *page = pfn_to_page(pfn);
-
-	if (PagePinned(virt_to_page(mm->pgd))) {
-		SetPagePinned(page);
-
-		vm_unmap_aliases();
-		if (!PageHighMem(page)) {
-			make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
-			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
-				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
-		} else {
-			/* make sure there are no stray mappings of
-			   this page */
-			kmap_flush_unused();
-		}
-	}
-}
-
-static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
-{
-	xen_alloc_ptpage(mm, pfn, PT_PTE);
-}
-
-static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
-{
-	xen_alloc_ptpage(mm, pfn, PT_PMD);
-}
-
-static int xen_pgd_alloc(struct mm_struct *mm)
-{
-	pgd_t *pgd = mm->pgd;
-	int ret = 0;
-
-	BUG_ON(PagePinned(virt_to_page(pgd)));
-
-#ifdef CONFIG_X86_64
-	{
-		struct page *page = virt_to_page(pgd);
-		pgd_t *user_pgd;
-
-		BUG_ON(page->private != 0);
-
-		ret = -ENOMEM;
-
-		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-		page->private = (unsigned long)user_pgd;
-
-		if (user_pgd != NULL) {
-			user_pgd[pgd_index(VSYSCALL_START)] =
-				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
-			ret = 0;
-		}
-
-		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
-	}
-#endif
-
-	return ret;
-}
-
-static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
-#ifdef CONFIG_X86_64
-	pgd_t *user_pgd = xen_get_user_pgd(pgd);
-
-	if (user_pgd)
-		free_page((unsigned long)user_pgd);
-#endif
-}
-
-/* This should never happen until we're OK to use struct page */
-static void xen_release_ptpage(unsigned long pfn, unsigned level)
-{
-	struct page *page = pfn_to_page(pfn);
-
-	if (PagePinned(page)) {
-		if (!PageHighMem(page)) {
-			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
-				pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
-			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
-		}
-		ClearPagePinned(page);
-	}
-}
-
-static void xen_release_pte(unsigned long pfn)
-{
-	xen_release_ptpage(pfn, PT_PTE);
-}
-
-static void xen_release_pmd(unsigned long pfn)
-{
-	xen_release_ptpage(pfn, PT_PMD);
-}
-
-#if PAGETABLE_LEVELS == 4
-static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
-{
-	xen_alloc_ptpage(mm, pfn, PT_PUD);
-}
-
-static void xen_release_pud(unsigned long pfn)
-{
-	xen_release_ptpage(pfn, PT_PUD);
-}
-#endif
-
-#ifdef CONFIG_HIGHPTE
-static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
-{
-	pgprot_t prot = PAGE_KERNEL;
-
-	if (PagePinned(page))
-		prot = PAGE_KERNEL_RO;
-
-	if (0 && PageHighMem(page))
-		printk("mapping highpte %lx type %d prot %s\n",
-		       page_to_pfn(page), type,
-		       (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
-
-	return kmap_atomic_prot(page, type, prot);
-}
-#endif
-
-#ifdef CONFIG_X86_32
-static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
-{
-	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
-	if (pte_val_ma(*ptep) & _PAGE_PRESENT)
-		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
-			       pte_val_ma(pte));
-
-	return pte;
-}
-
-/* Init-time set_pte while constructing initial pagetables, which
-   doesn't allow RO pagetable pages to be remapped RW */
-static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
-{
-	pte = mask_rw_pte(ptep, pte);
-
-	xen_set_pte(ptep, pte);
-}
-#endif
-
-static __init void xen_pagetable_setup_start(pgd_t *base)
-{
-}
-
 void xen_setup_shared_info(void)
 {
 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
@@ -1021,37 +663,6 @@ void xen_setup_shared_info(void)
 	xen_setup_mfn_list_list();
 }
 
-static __init void xen_pagetable_setup_done(pgd_t *base)
-{
-	xen_setup_shared_info();
-}
-
-static __init void xen_post_allocator_init(void)
-{
-	pv_mmu_ops.set_pte = xen_set_pte;
-	pv_mmu_ops.set_pmd = xen_set_pmd;
-	pv_mmu_ops.set_pud = xen_set_pud;
-#if PAGETABLE_LEVELS == 4
-	pv_mmu_ops.set_pgd = xen_set_pgd;
-#endif
-
-	/* This will work as long as patching hasn't happened yet
-	   (which it hasn't) */
-	pv_mmu_ops.alloc_pte = xen_alloc_pte;
-	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
-	pv_mmu_ops.release_pte = xen_release_pte;
-	pv_mmu_ops.release_pmd = xen_release_pmd;
-#if PAGETABLE_LEVELS == 4
-	pv_mmu_ops.alloc_pud = xen_alloc_pud;
-	pv_mmu_ops.release_pud = xen_release_pud;
-#endif
-
-#ifdef CONFIG_X86_64
-	SetPagePinned(virt_to_page(level3_user_vsyscall));
-#endif
-	xen_mark_init_mm_pinned();
-}
-
 /* This is called once we have the cpu_possible_map */
 void xen_setup_vcpu_info_placement(void)
 {
@@ -1126,49 +737,6 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
 	return ret;
 }
 
-static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
-{
-	pte_t pte;
-
-	phys >>= PAGE_SHIFT;
-
-	switch (idx) {
-	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
-#ifdef CONFIG_X86_F00F_BUG
-	case FIX_F00F_IDT:
-#endif
-#ifdef CONFIG_X86_32
-	case FIX_WP_TEST:
-	case FIX_VDSO:
-# ifdef CONFIG_HIGHMEM
-	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
-# endif
-#else
-	case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
-#endif
-#ifdef CONFIG_X86_LOCAL_APIC
-	case FIX_APIC_BASE:	/* maps dummy local APIC */
-#endif
-		pte = pfn_pte(phys, prot);
-		break;
-
-	default:
-		pte = mfn_pte(phys, prot);
-		break;
-	}
-
-	__native_set_fixmap(idx, pte);
-
-#ifdef CONFIG_X86_64
-	/* Replicate changes to map the vsyscall page into the user
-	   pagetable vsyscall mapping. */
-	if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
-		unsigned long vaddr = __fix_to_virt(idx);
-		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
-	}
-#endif
-}
-
 static const struct pv_info xen_info __initdata = {
 	.paravirt_enabled = 1,
 	.shared_kernel_pmd = 0,
@@ -1264,86 +832,6 @@ static const struct pv_apic_ops xen_apic_ops __initdata = {
 #endif
 };
 
-static const struct pv_mmu_ops xen_mmu_ops __initdata = {
-	.pagetable_setup_start = xen_pagetable_setup_start,
-	.pagetable_setup_done = xen_pagetable_setup_done,
-
-	.read_cr2 = xen_read_cr2,
-	.write_cr2 = xen_write_cr2,
-
-	.read_cr3 = xen_read_cr3,
-	.write_cr3 = xen_write_cr3,
-
-	.flush_tlb_user = xen_flush_tlb,
-	.flush_tlb_kernel = xen_flush_tlb,
-	.flush_tlb_single = xen_flush_tlb_single,
-	.flush_tlb_others = xen_flush_tlb_others,
-
-	.pte_update = paravirt_nop,
-	.pte_update_defer = paravirt_nop,
-
-	.pgd_alloc = xen_pgd_alloc,
-	.pgd_free = xen_pgd_free,
-
-	.alloc_pte = xen_alloc_pte_init,
-	.release_pte = xen_release_pte_init,
-	.alloc_pmd = xen_alloc_pte_init,
-	.alloc_pmd_clone = paravirt_nop,
-	.release_pmd = xen_release_pte_init,
-
-#ifdef CONFIG_HIGHPTE
-	.kmap_atomic_pte = xen_kmap_atomic_pte,
-#endif
-
-#ifdef CONFIG_X86_64
-	.set_pte = xen_set_pte,
-#else
-	.set_pte = xen_set_pte_init,
-#endif
-	.set_pte_at = xen_set_pte_at,
-	.set_pmd = xen_set_pmd_hyper,
-
-	.ptep_modify_prot_start = __ptep_modify_prot_start,
-	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
-
-	.pte_val = xen_pte_val,
-	.pgd_val = xen_pgd_val,
-
-	.make_pte = xen_make_pte,
-	.make_pgd = xen_make_pgd,
-
-#ifdef CONFIG_X86_PAE
-	.set_pte_atomic = xen_set_pte_atomic,
-	.set_pte_present = xen_set_pte_at,
-	.pte_clear = xen_pte_clear,
-	.pmd_clear = xen_pmd_clear,
-#endif	/* CONFIG_X86_PAE */
-	.set_pud = xen_set_pud_hyper,
-
-	.make_pmd = xen_make_pmd,
-	.pmd_val = xen_pmd_val,
-
-#if PAGETABLE_LEVELS == 4
-	.pud_val = xen_pud_val,
-	.make_pud = xen_make_pud,
-	.set_pgd = xen_set_pgd_hyper,
-
-	.alloc_pud = xen_alloc_pte_init,
-	.release_pud = xen_release_pte_init,
-#endif	/* PAGETABLE_LEVELS == 4 */
-
-	.activate_mm = xen_activate_mm,
-	.dup_mmap = xen_dup_mmap,
-	.exit_mmap = xen_exit_mmap,
-
-	.lazy_mode = {
-		.enter = paravirt_enter_lazy_mmu,
-		.leave = xen_leave_lazy,
-	},
-
-	.set_fixmap = xen_set_fixmap,
-};
-
 static void xen_reboot(int reason)
 {
 	struct sched_shutdown r = { .reason = reason };
@@ -1386,223 +874,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
 };
 
 
-static void __init xen_reserve_top(void)
-{
-#ifdef CONFIG_X86_32
-	unsigned long top = HYPERVISOR_VIRT_START;
-	struct xen_platform_parameters pp;
-
-	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
-		top = pp.virt_start;
-
-	reserve_top_address(-top);
-#endif	/* CONFIG_X86_32 */
-}
-
-/*
- * Like __va(), but returns address in the kernel mapping (which is
- * all we have until the physical memory mapping has been set up.
- */
-static void *__ka(phys_addr_t paddr)
-{
-#ifdef CONFIG_X86_64
-	return (void *)(paddr + __START_KERNEL_map);
-#else
-	return __va(paddr);
-#endif
-}
-
-/* Convert a machine address to physical address */
-static unsigned long m2p(phys_addr_t maddr)
-{
-	phys_addr_t paddr;
-
-	maddr &= PTE_PFN_MASK;
-	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
-
-	return paddr;
-}
-
-/* Convert a machine address to kernel virtual */
-static void *m2v(phys_addr_t maddr)
-{
-	return __ka(m2p(maddr));
-}
-
-static void set_page_prot(void *addr, pgprot_t prot)
-{
-	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
-	pte_t pte = pfn_pte(pfn, prot);
-
-	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
-		BUG();
-}
-
-static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
-{
-	unsigned pmdidx, pteidx;
-	unsigned ident_pte;
-	unsigned long pfn;
-
-	ident_pte = 0;
-	pfn = 0;
-	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
-		pte_t *pte_page;
-
-		/* Reuse or allocate a page of ptes */
-		if (pmd_present(pmd[pmdidx]))
-			pte_page = m2v(pmd[pmdidx].pmd);
-		else {
-			/* Check for free pte pages */
-			if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
-				break;
-
-			pte_page = &level1_ident_pgt[ident_pte];
-			ident_pte += PTRS_PER_PTE;
-
-			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
-		}
-
-		/* Install mappings */
-		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
-			pte_t pte;
-
-			if (pfn > max_pfn_mapped)
-				max_pfn_mapped = pfn;
-
-			if (!pte_none(pte_page[pteidx]))
-				continue;
-
-			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
-			pte_page[pteidx] = pte;
-		}
-	}
-
-	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
-		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
-
-	set_page_prot(pmd, PAGE_KERNEL_RO);
-}
-
-#ifdef CONFIG_X86_64
-static void convert_pfn_mfn(void *v)
-{
-	pte_t *pte = v;
-	int i;
-
-	/* All levels are converted the same way, so just treat them
-	   as ptes. */
-	for (i = 0; i < PTRS_PER_PTE; i++)
-		pte[i] = xen_make_pte(pte[i].pte);
-}
-
-/*
- * Set up the inital kernel pagetable.
- *
- * We can construct this by grafting the Xen provided pagetable into
- * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
- * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
- * means that only the kernel has a physical mapping to start with -
- * but that's enough to get __va working.  We need to fill in the rest
- * of the physical mapping once some sort of allocator has been set
- * up.
- */
-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
-						unsigned long max_pfn)
-{
-	pud_t *l3;
-	pmd_t *l2;
-
-	/* Zap identity mapping */
-	init_level4_pgt[0] = __pgd(0);
-
-	/* Pre-constructed entries are in pfn, so convert to mfn */
-	convert_pfn_mfn(init_level4_pgt);
-	convert_pfn_mfn(level3_ident_pgt);
-	convert_pfn_mfn(level3_kernel_pgt);
-
-	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
-	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
-
-	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
-	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
-
-	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
-	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
-	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
-
-	/* Set up identity map */
-	xen_map_identity_early(level2_ident_pgt, max_pfn);
-
-	/* Make pagetable pieces RO */
-	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
-	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
-	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
-	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
-	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
-	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
-
-	/* Pin down new L4 */
-	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
-			  PFN_DOWN(__pa_symbol(init_level4_pgt)));
-
-	/* Unpin Xen-provided one */
-	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
-
-	/* Switch over */
-	pgd = init_level4_pgt;
-
-	/*
-	 * At this stage there can be no user pgd, and no page
-	 * structure to attach it to, so make sure we just set kernel
-	 * pgd.
-	 */
-	xen_mc_batch();
-	__xen_write_cr3(true, __pa(pgd));
-	xen_mc_issue(PARAVIRT_LAZY_CPU);
-
-	reserve_early(__pa(xen_start_info->pt_base),
-		      __pa(xen_start_info->pt_base +
-			   xen_start_info->nr_pt_frames * PAGE_SIZE),
-		      "XEN PAGETABLES");
-
-	return pgd;
-}
-#else	/* !CONFIG_X86_64 */
-static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
-
-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
-						unsigned long max_pfn)
-{
-	pmd_t *kernel_pmd;
-
-	init_pg_tables_start = __pa(pgd);
-	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
-	max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
-
-	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
-	memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
-
-	xen_map_identity_early(level2_kernel_pgt, max_pfn);
-
-	memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
-	set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
-			__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
-
-	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
-	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
-	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
-
-	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
-
-	xen_write_cr3(__pa(swapper_pg_dir));
-
-	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
-
-	return swapper_pg_dir;
-}
-#endif	/* CONFIG_X86_64 */
-
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 98cb9869eb24..94e452c0b00c 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -47,6 +47,7 @@
 #include <asm/tlbflush.h>
 #include <asm/fixmap.h>
 #include <asm/mmu_context.h>
+#include <asm/setup.h>
 #include <asm/paravirt.h>
 #include <asm/linkage.h>
 
@@ -55,6 +56,8 @@
 
 #include <xen/page.h>
 #include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+#include <xen/hvc-console.h>
 
 #include "multicalls.h"
 #include "mmu.h"
@@ -114,6 +117,37 @@ static inline void check_zero(void)
 
 #endif /* CONFIG_XEN_DEBUG_FS */
 
+
+/*
+ * Identity map, in addition to plain kernel map.  This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
+
+/*
+ * Note about cr3 (pagetable base) values:
+ *
+ * xen_cr3 contains the current logical cr3 value; it contains the
+ * last set cr3.  This may not be the current effective cr3, because
+ * its update may be being lazily deferred.  However, a vcpu looking
+ * at its own cr3 can use this value knowing that it everything will
+ * be self-consistent.
+ *
+ * xen_current_cr3 contains the actual vcpu cr3; it is set once the
+ * hypercall to set the vcpu cr3 is complete (so it may be a little
+ * out of date, but it will never be set early).  If one vcpu is
+ * looking at another vcpu's cr3 value, it should use this variable.
+ */
+DEFINE_PER_CPU(unsigned long, xen_cr3);	 /* cr3 stored as physaddr */
+DEFINE_PER_CPU(unsigned long, xen_current_cr3);	 /* actual vcpu cr3 */
+
+
 /*
  * Just beyond the highest usermode address.  STACK_TOP_MAX has a
  * redzone above it, so round it up to a PGD boundary.
@@ -1152,6 +1186,672 @@ void xen_exit_mmap(struct mm_struct *mm)
 	spin_unlock(&mm->page_table_lock);
 }
 
+static __init void xen_pagetable_setup_start(pgd_t *base)
+{
+}
+
+static __init void xen_pagetable_setup_done(pgd_t *base)
+{
+	xen_setup_shared_info();
+}
+
+static void xen_write_cr2(unsigned long cr2)
+{
+	percpu_read(xen_vcpu)->arch.cr2 = cr2;
+}
+
+static unsigned long xen_read_cr2(void)
+{
+	return percpu_read(xen_vcpu)->arch.cr2;
+}
+
+unsigned long xen_read_cr2_direct(void)
+{
+	return percpu_read(xen_vcpu_info.arch.cr2);
+}
+
+static void xen_flush_tlb(void)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	preempt_disable();
+
+	mcs = xen_mc_entry(sizeof(*op));
+
+	op = mcs.args;
+	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+	preempt_enable();
+}
+
+static void xen_flush_tlb_single(unsigned long addr)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+
+	preempt_disable();
+
+	mcs = xen_mc_entry(sizeof(*op));
+	op = mcs.args;
+	op->cmd = MMUEXT_INVLPG_LOCAL;
+	op->arg1.linear_addr = addr & PAGE_MASK;
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+	preempt_enable();
+}
+
+static void xen_flush_tlb_others(const struct cpumask *cpus,
+				 struct mm_struct *mm, unsigned long va)
+{
+	struct {
+		struct mmuext_op op;
+		DECLARE_BITMAP(mask, NR_CPUS);
+	} *args;
+	struct multicall_space mcs;
+
+	BUG_ON(cpumask_empty(cpus));
+	BUG_ON(!mm);
+
+	mcs = xen_mc_entry(sizeof(*args));
+	args = mcs.args;
+	args->op.arg2.vcpumask = to_cpumask(args->mask);
+
+	/* Remove us, and any offline CPUS. */
+	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));
+	if (unlikely(cpumask_empty(to_cpumask(args->mask))))
+		goto issue;
+
+	if (va == TLB_FLUSH_ALL) {
+		args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+	} else {
+		args->op.cmd = MMUEXT_INVLPG_MULTI;
+		args->op.arg1.linear_addr = va;
+	}
+
+	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+
+issue:
+	xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static unsigned long xen_read_cr3(void)
+{
+	return percpu_read(xen_cr3);
+}
+
+static void set_current_cr3(void *v)
+{
+	percpu_write(xen_current_cr3, (unsigned long)v);
+}
+
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
+{
+	struct mmuext_op *op;
+	struct multicall_space mcs;
+	unsigned long mfn;
+
+	if (cr3)
+		mfn = pfn_to_mfn(PFN_DOWN(cr3));
+	else
+		mfn = 0;
+
+	WARN_ON(mfn == 0 && kernel);
+
+	mcs = __xen_mc_entry(sizeof(*op));
+
+	op = mcs.args;
+	op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
+	op->arg1.mfn = mfn;
+
+	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+	if (kernel) {
+		percpu_write(xen_cr3, cr3);
+
+		/* Update xen_current_cr3 once the batch has actually
+		   been submitted. */
+		xen_mc_callback(set_current_cr3, (void *)cr3);
+	}
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
+	BUG_ON(preemptible());
+
+	xen_mc_batch();  /* disables interrupts */
+
+	/* Update while interrupts are disabled, so its atomic with
+	   respect to ipis */
+	percpu_write(xen_cr3, cr3);
+
+	__xen_write_cr3(true, cr3);
+
+#ifdef CONFIG_X86_64
+	{
+		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+		if (user_pgd)
+			__xen_write_cr3(false, __pa(user_pgd));
+		else
+			__xen_write_cr3(false, 0);
+	}
+#endif
+
+	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
+}
+
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+	pgd_t *pgd = mm->pgd;
+	int ret = 0;
+
+	BUG_ON(PagePinned(virt_to_page(pgd)));
+
+#ifdef CONFIG_X86_64
+	{
+		struct page *page = virt_to_page(pgd);
+		pgd_t *user_pgd;
+
+		BUG_ON(page->private != 0);
+
+		ret = -ENOMEM;
+
+		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+		page->private = (unsigned long)user_pgd;
+
+		if (user_pgd != NULL) {
+			user_pgd[pgd_index(VSYSCALL_START)] =
+				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+			ret = 0;
+		}
+
+		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+	}
+#endif
+
+	return ret;
+}
+
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+	pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+	if (user_pgd)
+		free_page((unsigned long)user_pgd);
+#endif
+}
+
+
+/* Early in boot, while setting up the initial pagetable, assume
+   everything is pinned. */
+static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
+{
+#ifdef CONFIG_FLATMEM
+	BUG_ON(mem_map);	/* should only be used early */
+#endif
+	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+}
+
+/* Early release_pte assumes that all pts are pinned, since there's
+   only init_mm and anything attached to that is pinned. */
+static void xen_release_pte_init(unsigned long pfn)
+{
+	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+}
+
+static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+	struct mmuext_op op;
+	op.cmd = cmd;
+	op.arg1.mfn = pfn_to_mfn(pfn);
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
+/* This needs to make sure the new pte page is pinned iff its being
+   attached to a pinned pagetable. */
+static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
+{
+	struct page *page = pfn_to_page(pfn);
+
+	if (PagePinned(virt_to_page(mm->pgd))) {
+		SetPagePinned(page);
+
+		vm_unmap_aliases();
+		if (!PageHighMem(page)) {
+			make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
+			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+		} else {
+			/* make sure there are no stray mappings of
+			   this page */
+			kmap_flush_unused();
+		}
+	}
+}
+
+static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
+{
+	xen_alloc_ptpage(mm, pfn, PT_PTE);
+}
+
+static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
+{
+	xen_alloc_ptpage(mm, pfn, PT_PMD);
+}
+
+/* This should never happen until we're OK to use struct page */
+static void xen_release_ptpage(unsigned long pfn, unsigned level)
+{
+	struct page *page = pfn_to_page(pfn);
+
+	if (PagePinned(page)) {
+		if (!PageHighMem(page)) {
+			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
+				pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
+			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
+		}
+		ClearPagePinned(page);
+	}
+}
+
+static void xen_release_pte(unsigned long pfn)
+{
+	xen_release_ptpage(pfn, PT_PTE);
+}
+
+static void xen_release_pmd(unsigned long pfn)
+{
+	xen_release_ptpage(pfn, PT_PMD);
+}
+
+#if PAGETABLE_LEVELS == 4
+static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
+{
+	xen_alloc_ptpage(mm, pfn, PT_PUD);
+}
+
+static void xen_release_pud(unsigned long pfn)
+{
+	xen_release_ptpage(pfn, PT_PUD);
+}
+#endif
+
+void __init xen_reserve_top(void)
+{
+#ifdef CONFIG_X86_32
+	unsigned long top = HYPERVISOR_VIRT_START;
+	struct xen_platform_parameters pp;
+
+	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
+		top = pp.virt_start;
+
+	reserve_top_address(-top);
+#endif	/* CONFIG_X86_32 */
+}
+
+/*
+ * Like __va(), but returns address in the kernel mapping (which is
+ * all we have until the physical memory mapping has been set up.
+ */
+static void *__ka(phys_addr_t paddr)
+{
+#ifdef CONFIG_X86_64
+	return (void *)(paddr + __START_KERNEL_map);
+#else
+	return __va(paddr);
+#endif
+}
+
+/* Convert a machine address to physical address */
+static unsigned long m2p(phys_addr_t maddr)
+{
+	phys_addr_t paddr;
+
+	maddr &= PTE_PFN_MASK;
+	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
+
+	return paddr;
+}
+
+/* Convert a machine address to kernel virtual */
+static void *m2v(phys_addr_t maddr)
+{
+	return __ka(m2p(maddr));
+}
+
+static void set_page_prot(void *addr, pgprot_t prot)
+{
+	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+	pte_t pte = pfn_pte(pfn, prot);
+
+	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
+		BUG();
+}
+
+static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
+{
+	unsigned pmdidx, pteidx;
+	unsigned ident_pte;
+	unsigned long pfn;
+
+	ident_pte = 0;
+	pfn = 0;
+	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+		pte_t *pte_page;
+
+		/* Reuse or allocate a page of ptes */
+		if (pmd_present(pmd[pmdidx]))
+			pte_page = m2v(pmd[pmdidx].pmd);
+		else {
+			/* Check for free pte pages */
+			if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
+				break;
+
+			pte_page = &level1_ident_pgt[ident_pte];
+			ident_pte += PTRS_PER_PTE;
+
+			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
+		}
+
+		/* Install mappings */
+		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
+			pte_t pte;
+
+			if (pfn > max_pfn_mapped)
+				max_pfn_mapped = pfn;
+
+			if (!pte_none(pte_page[pteidx]))
+				continue;
+
+			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
+			pte_page[pteidx] = pte;
+		}
+	}
+
+	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
+		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
+
+	set_page_prot(pmd, PAGE_KERNEL_RO);
+}
+
+#ifdef CONFIG_X86_64
+static void convert_pfn_mfn(void *v)
+{
+	pte_t *pte = v;
+	int i;
+
+	/* All levels are converted the same way, so just treat them
+	   as ptes. */
+	for (i = 0; i < PTRS_PER_PTE; i++)
+		pte[i] = xen_make_pte(pte[i].pte);
+}
+
+/*
+ * Set up the inital kernel pagetable.
+ *
+ * We can construct this by grafting the Xen provided pagetable into
+ * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
+ * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt.  This
+ * means that only the kernel has a physical mapping to start with -
+ * but that's enough to get __va working.  We need to fill in the rest
+ * of the physical mapping once some sort of allocator has been set
+ * up.
+ */
+__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+					 unsigned long max_pfn)
+{
+	pud_t *l3;
+	pmd_t *l2;
+
+	/* Zap identity mapping */
+	init_level4_pgt[0] = __pgd(0);
+
+	/* Pre-constructed entries are in pfn, so convert to mfn */
+	convert_pfn_mfn(init_level4_pgt);
+	convert_pfn_mfn(level3_ident_pgt);
+	convert_pfn_mfn(level3_kernel_pgt);
+
+	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
+	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
+
+	memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+	memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+	l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
+	l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
+	memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+	/* Set up identity map */
+	xen_map_identity_early(level2_ident_pgt, max_pfn);
+
+	/* Make pagetable pieces RO */
+	set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
+	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+
+	/* Pin down new L4 */
+	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+			  PFN_DOWN(__pa_symbol(init_level4_pgt)));
+
+	/* Unpin Xen-provided one */
+	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+	/* Switch over */
+	pgd = init_level4_pgt;
+
+	/*
+	 * At this stage there can be no user pgd, and no page
+	 * structure to attach it to, so make sure we just set kernel
+	 * pgd.
+	 */
+	xen_mc_batch();
+	__xen_write_cr3(true, __pa(pgd));
+	xen_mc_issue(PARAVIRT_LAZY_CPU);
+
+	reserve_early(__pa(xen_start_info->pt_base),
+		      __pa(xen_start_info->pt_base +
+			   xen_start_info->nr_pt_frames * PAGE_SIZE),
+		      "XEN PAGETABLES");
+
+	return pgd;
+}
+#else	/* !CONFIG_X86_64 */
+static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
+
+__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
+					 unsigned long max_pfn)
+{
+	pmd_t *kernel_pmd;
+
+	init_pg_tables_start = __pa(pgd);
+	init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+	max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+
+	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+	memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
+
+	xen_map_identity_early(level2_kernel_pgt, max_pfn);
+
+	memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+	set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
+			__pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+
+	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
+
+	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+	xen_write_cr3(__pa(swapper_pg_dir));
+
+	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
+
+	return swapper_pg_dir;
+}
+#endif	/* CONFIG_X86_64 */
+
+static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
+{
+	pte_t pte;
+
+	phys >>= PAGE_SHIFT;
+
+	switch (idx) {
+	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
+#ifdef CONFIG_X86_F00F_BUG
+	case FIX_F00F_IDT:
+#endif
+#ifdef CONFIG_X86_32
+	case FIX_WP_TEST:
+	case FIX_VDSO:
+# ifdef CONFIG_HIGHMEM
+	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+# endif
+#else
+	case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+#endif
+#ifdef CONFIG_X86_LOCAL_APIC
+	case FIX_APIC_BASE:	/* maps dummy local APIC */
+#endif
+		pte = pfn_pte(phys, prot);
+		break;
+
+	default:
+		pte = mfn_pte(phys, prot);
+		break;
+	}
+
+	__native_set_fixmap(idx, pte);
+
+#ifdef CONFIG_X86_64
+	/* Replicate changes to map the vsyscall page into the user
+	   pagetable vsyscall mapping. */
+	if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
+		unsigned long vaddr = __fix_to_virt(idx);
+		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+	}
+#endif
+}
+
+__init void xen_post_allocator_init(void)
+{
+	pv_mmu_ops.set_pte = xen_set_pte;
+	pv_mmu_ops.set_pmd = xen_set_pmd;
+	pv_mmu_ops.set_pud = xen_set_pud;
+#if PAGETABLE_LEVELS == 4
+	pv_mmu_ops.set_pgd = xen_set_pgd;
+#endif
+
+	/* This will work as long as patching hasn't happened yet
+	   (which it hasn't) */
+	pv_mmu_ops.alloc_pte = xen_alloc_pte;
+	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
+	pv_mmu_ops.release_pte = xen_release_pte;
+	pv_mmu_ops.release_pmd = xen_release_pmd;
+#if PAGETABLE_LEVELS == 4
+	pv_mmu_ops.alloc_pud = xen_alloc_pud;
+	pv_mmu_ops.release_pud = xen_release_pud;
+#endif
+
+#ifdef CONFIG_X86_64
+	SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
+	xen_mark_init_mm_pinned();
+}
+
+
+const struct pv_mmu_ops xen_mmu_ops __initdata = {
+	.pagetable_setup_start = xen_pagetable_setup_start,
+	.pagetable_setup_done = xen_pagetable_setup_done,
+
+	.read_cr2 = xen_read_cr2,
+	.write_cr2 = xen_write_cr2,
+
+	.read_cr3 = xen_read_cr3,
+	.write_cr3 = xen_write_cr3,
+
+	.flush_tlb_user = xen_flush_tlb,
+	.flush_tlb_kernel = xen_flush_tlb,
+	.flush_tlb_single = xen_flush_tlb_single,
+	.flush_tlb_others = xen_flush_tlb_others,
+
+	.pte_update = paravirt_nop,
+	.pte_update_defer = paravirt_nop,
+
+	.pgd_alloc = xen_pgd_alloc,
+	.pgd_free = xen_pgd_free,
+
+	.alloc_pte = xen_alloc_pte_init,
+	.release_pte = xen_release_pte_init,
+	.alloc_pmd = xen_alloc_pte_init,
+	.alloc_pmd_clone = paravirt_nop,
+	.release_pmd = xen_release_pte_init,
+
+#ifdef CONFIG_HIGHPTE
+	.kmap_atomic_pte = xen_kmap_atomic_pte,
+#endif
+
+#ifdef CONFIG_X86_64
+	.set_pte = xen_set_pte,
+#else
+	.set_pte = xen_set_pte_init,
+#endif
+	.set_pte_at = xen_set_pte_at,
+	.set_pmd = xen_set_pmd_hyper,
+
+	.ptep_modify_prot_start = __ptep_modify_prot_start,
+	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
+
+	.pte_val = xen_pte_val,
+	.pgd_val = xen_pgd_val,
+
+	.make_pte = xen_make_pte,
+	.make_pgd = xen_make_pgd,
+
+#ifdef CONFIG_X86_PAE
+	.set_pte_atomic = xen_set_pte_atomic,
+	.set_pte_present = xen_set_pte_at,
+	.pte_clear = xen_pte_clear,
+	.pmd_clear = xen_pmd_clear,
+#endif	/* CONFIG_X86_PAE */
+	.set_pud = xen_set_pud_hyper,
+
+	.make_pmd = xen_make_pmd,
+	.pmd_val = xen_pmd_val,
+
+#if PAGETABLE_LEVELS == 4
+	.pud_val = xen_pud_val,
+	.make_pud = xen_make_pud,
+	.set_pgd = xen_set_pgd_hyper,
+
+	.alloc_pud = xen_alloc_pte_init,
+	.release_pud = xen_release_pte_init,
+#endif	/* PAGETABLE_LEVELS == 4 */
+
+	.activate_mm = xen_activate_mm,
+	.dup_mmap = xen_dup_mmap,
+	.exit_mmap = xen_exit_mmap,
+
+	.lazy_mode = {
+		.enter = paravirt_enter_lazy_mmu,
+		.leave = xen_leave_lazy,
+	},
+
+	.set_fixmap = xen_set_fixmap,
+};
+
+
 #ifdef CONFIG_XEN_DEBUG_FS
 
 static struct dentry *d_mmu_debug;
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 98d71659da5a..24d1b44a337d 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -54,4 +54,7 @@ pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t
 void  xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 				  pte_t *ptep, pte_t pte);
 
+unsigned long xen_read_cr2_direct(void);
+
+extern const struct pv_mmu_ops xen_mmu_ops;
 #endif	/* _XEN_MMU_H */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index c1f8faf0a2c5..11913fc94c14 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -13,6 +13,7 @@ extern const char xen_failsafe_callback[];
 struct trap_info;
 void xen_copy_trap_info(struct trap_info *traps);
 
+DECLARE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 DECLARE_PER_CPU(unsigned long, xen_cr3);
 DECLARE_PER_CPU(unsigned long, xen_current_cr3);
 
@@ -22,6 +23,13 @@ extern struct shared_info *HYPERVISOR_shared_info;
 
 void xen_setup_mfn_list_list(void);
 void xen_setup_shared_info(void);
+void xen_setup_machphys_mapping(void);
+pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
+void xen_ident_map_ISA(void);
+void xen_reserve_top(void);
+
+void xen_leave_lazy(void);
+void xen_post_allocator_init(void);
 
 char * __init xen_memory_setup(void);
 void __init xen_arch_setup(void);
-- 
cgit v1.2.3


From ecb93d1ccd0aac63f03be2db3cac3fa974716f4c Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Wed, 28 Jan 2009 14:35:05 -0800
Subject: x86/paravirt: add register-saving thunks to reduce caller register
 pressure

Impact: Optimization

One of the problems with inserting a pile of C calls where previously
there were none is that the register pressure is greatly increased.
The C calling convention says that the caller must expect a certain
set of registers may be trashed by the callee, and that the callee can
use those registers without restriction.  This includes the function
argument registers, and several others.

This patch seeks to alleviate this pressure by introducing wrapper
thunks that will do the register saving/restoring, so that the
callsite doesn't need to worry about it, but the callee function can
be conventional compiler-generated code.  In many cases (particularly
performance-sensitive cases) the callee will be in assembler anyway,
and need not use the compiler's calling convention.

Standard calling convention is:
	 arguments	    return	scratch
x86-32	 eax edx ecx	    eax		?
x86-64	 rdi rsi rdx rcx    rax		r8 r9 r10 r11

The thunk preserves all argument and scratch registers.  The return
register is not preserved, and is available as a scratch register for
unwrapped callee code (and of course the return value).

Wrapped function pointers are themselves wrapped in a struct
paravirt_callee_save structure, in order to get some warning from the
compiler when functions with mismatched calling conventions are used.

The most common paravirt ops, both statically and dynamically, are
interrupt enable/disable/save/restore, so handle them first.  This is
particularly easy since their calls are handled specially anyway.

XXX Deal with VMI.  What's their calling convention?

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/paravirt.h | 126 +++++++++++++++++++++++++++++++---------
 arch/x86/kernel/paravirt.c      |   8 +--
 arch/x86/kernel/vsmp_64.c       |  12 ++--
 arch/x86/lguest/boot.c          |  13 +++--
 arch/x86/xen/enlighten.c        |   8 +--
 arch/x86/xen/irq.c              |  14 +++--
 6 files changed, 132 insertions(+), 49 deletions(-)

(limited to 'arch/x86/xen/enlighten.c')

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index dcce961262bf..f9107b88631b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -17,6 +17,10 @@
 #ifdef CONFIG_X86_32
 /* CLBR_ANY should match all regs platform has. For i386, that's just it */
 #define CLBR_ANY  ((1 << 4) - 1)
+
+#define CLBR_ARG_REGS	(CLBR_EAX | CLBR_EDX | CLBR_ECX)
+#define CLBR_RET_REG	(CLBR_EAX)
+#define CLBR_SCRATCH	(0)
 #else
 #define CLBR_RAX  CLBR_EAX
 #define CLBR_RCX  CLBR_ECX
@@ -27,16 +31,19 @@
 #define CLBR_R9   (1 << 6)
 #define CLBR_R10  (1 << 7)
 #define CLBR_R11  (1 << 8)
+
 #define CLBR_ANY  ((1 << 9) - 1)
 
 #define CLBR_ARG_REGS	(CLBR_RDI | CLBR_RSI | CLBR_RDX | \
 			 CLBR_RCX | CLBR_R8 | CLBR_R9)
-#define CLBR_RET_REG	(CLBR_RAX | CLBR_RDX)
+#define CLBR_RET_REG	(CLBR_RAX)
 #define CLBR_SCRATCH	(CLBR_R10 | CLBR_R11)
 
 #include <asm/desc_defs.h>
 #endif /* X86_64 */
 
+#define CLBR_CALLEE_SAVE ((CLBR_ARG_REGS | CLBR_SCRATCH) & ~CLBR_RET_REG)
+
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 #include <linux/cpumask.h>
@@ -50,6 +57,14 @@ struct tss_struct;
 struct mm_struct;
 struct desc_struct;
 
+/*
+ * Wrapper type for pointers to code which uses the non-standard
+ * calling convention.  See PV_CALL_SAVE_REGS_THUNK below.
+ */
+struct paravirt_callee_save {
+	void *func;
+};
+
 /* general info */
 struct pv_info {
 	unsigned int kernel_rpl;
@@ -199,11 +214,15 @@ struct pv_irq_ops {
 	 * expected to use X86_EFLAGS_IF; all other bits
 	 * returned from save_fl are undefined, and may be ignored by
 	 * restore_fl.
+	 *
+	 * NOTE: These functions callers expect the callee to preserve
+	 * more registers than the standard C calling convention.
 	 */
-	unsigned long (*save_fl)(void);
-	void (*restore_fl)(unsigned long);
-	void (*irq_disable)(void);
-	void (*irq_enable)(void);
+	struct paravirt_callee_save save_fl;
+	struct paravirt_callee_save restore_fl;
+	struct paravirt_callee_save irq_disable;
+	struct paravirt_callee_save irq_enable;
+
 	void (*safe_halt)(void);
 	void (*halt)(void);
 
@@ -1437,12 +1456,37 @@ extern struct paravirt_patch_site __parainstructions[],
 	__parainstructions_end[];
 
 #ifdef CONFIG_X86_32
-#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;"
-#define PV_RESTORE_REGS "popl %%edx; popl %%ecx"
+#define PV_SAVE_REGS "pushl %ecx; pushl %edx;"
+#define PV_RESTORE_REGS "popl %edx; popl %ecx;"
+
+/* save and restore all caller-save registers, except return value */
+#define PV_SAVE_ALL_CALLER_REGS PV_SAVE_REGS
+#define PV_RESTORE_ALL_CALLER_REGS PV_RESTORE_REGS
+
 #define PV_FLAGS_ARG "0"
 #define PV_EXTRA_CLOBBERS
 #define PV_VEXTRA_CLOBBERS
 #else
+/* save and restore all caller-save registers, except return value */
+#define PV_SAVE_ALL_CALLER_REGS						\
+	"push %rcx;"							\
+	"push %rdx;"							\
+	"push %rsi;"							\
+	"push %rdi;"							\
+	"push %r8;"							\
+	"push %r9;"							\
+	"push %r10;"							\
+	"push %r11;"
+#define PV_RESTORE_ALL_CALLER_REGS					\
+	"pop %r11;"							\
+	"pop %r10;"							\
+	"pop %r9;"							\
+	"pop %r8;"							\
+	"pop %rdi;"							\
+	"pop %rsi;"							\
+	"pop %rdx;"							\
+	"pop %rcx;"
+
 /* We save some registers, but all of them, that's too much. We clobber all
  * caller saved registers but the argument parameter */
 #define PV_SAVE_REGS "pushq %%rdi;"
@@ -1452,52 +1496,76 @@ extern struct paravirt_patch_site __parainstructions[],
 #define PV_FLAGS_ARG "D"
 #endif
 
+/*
+ * Generate a thunk around a function which saves all caller-save
+ * registers except for the return value.  This allows C functions to
+ * be called from assembler code where fewer than normal registers are
+ * available.  It may also help code generation around calls from C
+ * code if the common case doesn't use many registers.
+ *
+ * When a callee is wrapped in a thunk, the caller can assume that all
+ * arg regs and all scratch registers are preserved across the
+ * call. The return value in rax/eax will not be saved, even for void
+ * functions.
+ */
+#define PV_CALLEE_SAVE_REGS_THUNK(func)					\
+	extern typeof(func) __raw_callee_save_##func;			\
+	static void *__##func##__ __used = func;			\
+									\
+	asm(".pushsection .text;"					\
+	    "__raw_callee_save_" #func ": "				\
+	    PV_SAVE_ALL_CALLER_REGS					\
+	    "call " #func ";"						\
+	    PV_RESTORE_ALL_CALLER_REGS					\
+	    "ret;"							\
+	    ".popsection")
+
+/* Get a reference to a callee-save function */
+#define PV_CALLEE_SAVE(func)						\
+	((struct paravirt_callee_save) { __raw_callee_save_##func })
+
+/* Promise that "func" already uses the right calling convention */
+#define __PV_IS_CALLEE_SAVE(func)			\
+	((struct paravirt_callee_save) { func })
+
 static inline unsigned long __raw_local_save_flags(void)
 {
 	unsigned long f;
 
-	asm volatile(paravirt_alt(PV_SAVE_REGS
-				  PARAVIRT_CALL
-				  PV_RESTORE_REGS)
+	asm volatile(paravirt_alt(PARAVIRT_CALL)
 		     : "=a"(f)
 		     : paravirt_type(pv_irq_ops.save_fl),
 		       paravirt_clobber(CLBR_EAX)
-		     : "memory", "cc" PV_VEXTRA_CLOBBERS);
+		     : "memory", "cc");
 	return f;
 }
 
 static inline void raw_local_irq_restore(unsigned long f)
 {
-	asm volatile(paravirt_alt(PV_SAVE_REGS
-				  PARAVIRT_CALL
-				  PV_RESTORE_REGS)
+	asm volatile(paravirt_alt(PARAVIRT_CALL)
 		     : "=a"(f)
 		     : PV_FLAGS_ARG(f),
 		       paravirt_type(pv_irq_ops.restore_fl),
 		       paravirt_clobber(CLBR_EAX)
-		     : "memory", "cc" PV_EXTRA_CLOBBERS);
+		     : "memory", "cc");
 }
 
 static inline void raw_local_irq_disable(void)
 {
-	asm volatile(paravirt_alt(PV_SAVE_REGS
-				  PARAVIRT_CALL
-				  PV_RESTORE_REGS)
+	asm volatile(paravirt_alt(PARAVIRT_CALL)
 		     :
 		     : paravirt_type(pv_irq_ops.irq_disable),
 		       paravirt_clobber(CLBR_EAX)
-		     : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
+		     : "memory", "eax", "cc");
 }
 
 static inline void raw_local_irq_enable(void)
 {
-	asm volatile(paravirt_alt(PV_SAVE_REGS
-				  PARAVIRT_CALL
-				  PV_RESTORE_REGS)
+	asm volatile(paravirt_alt(PARAVIRT_CALL)
 		     :
 		     : paravirt_type(pv_irq_ops.irq_enable),
 		       paravirt_clobber(CLBR_EAX)
-		     : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
+		     : "memory", "eax", "cc");
 }
 
 static inline unsigned long __raw_local_irq_save(void)
@@ -1541,9 +1609,9 @@ static inline unsigned long __raw_local_irq_save(void)
 
 
 #define COND_PUSH(set, mask, reg)			\
-	.if ((~set) & mask); push %reg; .endif
+	.if ((~(set)) & mask); push %reg; .endif
 #define COND_POP(set, mask, reg)			\
-	.if ((~set) & mask); pop %reg; .endif
+	.if ((~(set)) & mask); pop %reg; .endif
 
 #ifdef CONFIG_X86_64
 
@@ -1594,15 +1662,15 @@ static inline unsigned long __raw_local_irq_save(void)
 
 #define DISABLE_INTERRUPTS(clobbers)					\
 	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
-		  PV_SAVE_REGS(clobbers);				\
+		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);		\
 		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable);	\
-		  PV_RESTORE_REGS(clobbers);)
+		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
 
 #define ENABLE_INTERRUPTS(clobbers)					\
 	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers,	\
-		  PV_SAVE_REGS(clobbers);				\
+		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);		\
 		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable);	\
-		  PV_RESTORE_REGS(clobbers);)
+		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
 
 #define USERGS_SYSRET32							\
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32),	\
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index dd25e2b1593b..8adb6b5aa421 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -310,10 +310,10 @@ struct pv_time_ops pv_time_ops = {
 
 struct pv_irq_ops pv_irq_ops = {
 	.init_IRQ = native_init_IRQ,
-	.save_fl = native_save_fl,
-	.restore_fl = native_restore_fl,
-	.irq_disable = native_irq_disable,
-	.irq_enable = native_irq_enable,
+	.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
+	.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
+	.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
+	.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
 	.safe_halt = native_safe_halt,
 	.halt = native_halt,
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index a688f3bfaec2..c609205df594 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -37,6 +37,7 @@ static unsigned long vsmp_save_fl(void)
 		flags &= ~X86_EFLAGS_IF;
 	return flags;
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl);
 
 static void vsmp_restore_fl(unsigned long flags)
 {
@@ -46,6 +47,7 @@ static void vsmp_restore_fl(unsigned long flags)
 		flags |= X86_EFLAGS_AC;
 	native_restore_fl(flags);
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl);
 
 static void vsmp_irq_disable(void)
 {
@@ -53,6 +55,7 @@ static void vsmp_irq_disable(void)
 
 	native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC);
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable);
 
 static void vsmp_irq_enable(void)
 {
@@ -60,6 +63,7 @@ static void vsmp_irq_enable(void)
 
 	native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
 }
+PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable);
 
 static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
 				  unsigned long addr, unsigned len)
@@ -90,10 +94,10 @@ static void __init set_vsmp_pv_ops(void)
 	       cap, ctl);
 	if (cap & ctl & (1 << 4)) {
 		/* Setup irq ops and turn on vSMP  IRQ fastpath handling */
-		pv_irq_ops.irq_disable = vsmp_irq_disable;
-		pv_irq_ops.irq_enable  = vsmp_irq_enable;
-		pv_irq_ops.save_fl  = vsmp_save_fl;
-		pv_irq_ops.restore_fl  = vsmp_restore_fl;
+		pv_irq_ops.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable);
+		pv_irq_ops.irq_enable  = PV_CALLEE_SAVE(vsmp_irq_enable);
+		pv_irq_ops.save_fl  = PV_CALLEE_SAVE(vsmp_save_fl);
+		pv_irq_ops.restore_fl  = PV_CALLEE_SAVE(vsmp_restore_fl);
 		pv_init_ops.patch = vsmp_patch;
 
 		ctl &= ~(1 << 4);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 92f1c6f3e19d..19e33b6cd593 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -173,24 +173,29 @@ static unsigned long save_fl(void)
 {
 	return lguest_data.irq_enabled;
 }
+PV_CALLEE_SAVE_REGS_THUNK(save_fl);
 
 /* restore_flags() just sets the flags back to the value given. */
 static void restore_fl(unsigned long flags)
 {
 	lguest_data.irq_enabled = flags;
 }
+PV_CALLEE_SAVE_REGS_THUNK(restore_fl);
 
 /* Interrupts go off... */
 static void irq_disable(void)
 {
 	lguest_data.irq_enabled = 0;
 }
+PV_CALLEE_SAVE_REGS_THUNK(irq_disable);
 
 /* Interrupts go on... */
 static void irq_enable(void)
 {
 	lguest_data.irq_enabled = X86_EFLAGS_IF;
 }
+PV_CALLEE_SAVE_REGS_THUNK(irq_enable);
+
 /*:*/
 /*M:003 Note that we don't check for outstanding interrupts when we re-enable
  * them (or when we unmask an interrupt).  This seems to work for the moment,
@@ -984,10 +989,10 @@ __init void lguest_init(void)
 
 	/* interrupt-related operations */
 	pv_irq_ops.init_IRQ = lguest_init_IRQ;
-	pv_irq_ops.save_fl = save_fl;
-	pv_irq_ops.restore_fl = restore_fl;
-	pv_irq_ops.irq_disable = irq_disable;
-	pv_irq_ops.irq_enable = irq_enable;
+	pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl);
+	pv_irq_ops.restore_fl = PV_CALLEE_SAVE(restore_fl);
+	pv_irq_ops.irq_disable = PV_CALLEE_SAVE(irq_disable);
+	pv_irq_ops.irq_enable = PV_CALLEE_SAVE(irq_enable);
 	pv_irq_ops.safe_halt = lguest_safe_halt;
 
 	/* init-time operations */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0cd2a165f179..ff6d530ccc77 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -676,10 +676,10 @@ void xen_setup_vcpu_info_placement(void)
 	if (have_vcpu_info_placement) {
 		printk(KERN_INFO "Xen: using vcpu_info placement\n");
 
-		pv_irq_ops.save_fl = xen_save_fl_direct;
-		pv_irq_ops.restore_fl = xen_restore_fl_direct;
-		pv_irq_ops.irq_disable = xen_irq_disable_direct;
-		pv_irq_ops.irq_enable = xen_irq_enable_direct;
+		pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
+		pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
+		pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
+		pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
 		pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
 	}
 }
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 2e8271431e1a..5a070900ad35 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -50,6 +50,7 @@ static unsigned long xen_save_fl(void)
 	*/
 	return (-flags) & X86_EFLAGS_IF;
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
 
 static void xen_restore_fl(unsigned long flags)
 {
@@ -76,6 +77,7 @@ static void xen_restore_fl(unsigned long flags)
 			xen_force_evtchn_callback();
 	}
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
 
 static void xen_irq_disable(void)
 {
@@ -86,6 +88,7 @@ static void xen_irq_disable(void)
 	percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
 	preempt_enable_no_resched();
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
 
 static void xen_irq_enable(void)
 {
@@ -106,6 +109,7 @@ static void xen_irq_enable(void)
 	if (unlikely(vcpu->evtchn_upcall_pending))
 		xen_force_evtchn_callback();
 }
+PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
 
 static void xen_safe_halt(void)
 {
@@ -124,10 +128,12 @@ static void xen_halt(void)
 
 static const struct pv_irq_ops xen_irq_ops __initdata = {
 	.init_IRQ = __xen_init_IRQ,
-	.save_fl = xen_save_fl,
-	.restore_fl = xen_restore_fl,
-	.irq_disable = xen_irq_disable,
-	.irq_enable = xen_irq_enable,
+
+	.save_fl = PV_CALLEE_SAVE(xen_save_fl),
+	.restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
+	.irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
+	.irq_enable = PV_CALLEE_SAVE(xen_irq_enable),
+
 	.safe_halt = xen_safe_halt,
 	.halt = xen_halt,
 #ifdef CONFIG_X86_64
-- 
cgit v1.2.3


From 795f99b61d20c34cb04d17d8906b32f745a635ec Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Fri, 30 Jan 2009 17:47:54 +0900
Subject: xen: setup percpu data pointers

Impact: fix xen booting

We need to access percpu data fairly early, so set up the percpu
registers as soon as possible.  We only need to load the appropriate
segment register.  We already have a GDT, but its hard to change it
early because we need to manipulate the pagetable to do so, and that
hasn't been set up yet.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/x86/xen/enlighten.c | 3 +++
 arch/x86/xen/smp.c       | 5 ++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch/x86/xen/enlighten.c')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bef941f61451..fe19c88a5029 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1647,6 +1647,9 @@ asmlinkage void __init xen_start_kernel(void)
 	have_vcpu_info_placement = 0;
 #endif
 
+	/* setup percpu state */
+	load_percpu_segment(0);
+
 	xen_smp_init();
 
 	/* Get mfn list */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 7735e3dd359c..88d5d5ec6beb 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -170,7 +170,8 @@ static void __init xen_smp_prepare_boot_cpu(void)
 
 	/* We've switched to the "real" per-cpu gdt, so make sure the
 	   old memory can be recycled */
-	make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
+	make_lowmem_page_readwrite(__per_cpu_load +
+				   (unsigned long)&per_cpu_var(gdt_page));
 
 	xen_setup_vcpu_info_placement();
 }
@@ -235,6 +236,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	ctxt->user_regs.ss = __KERNEL_DS;
 #ifdef CONFIG_X86_32
 	ctxt->user_regs.fs = __KERNEL_PERCPU;
+#else
+	ctxt->gs_base_kernel = per_cpu_offset(cpu);
 #endif
 	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
 	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
-- 
cgit v1.2.3


From 383414322b3b3ced0cbc146801e0cc6c60a6c5f4 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Mon, 2 Feb 2009 13:55:31 -0800
Subject: xen: setup percpu data pointers

We need to access percpu data fairly early, so set up the percpu
registers as soon as possible.  We only need to load the appropriate
segment register.  We already have a GDT, but its hard to change it
early because we need to manipulate the pagetable to do so, and that
hasn't been set up yet.

Also, set the kernel stack when bringing up secondary CPUs.  If we
don't they all end up sharing the same stack...

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/xen/enlighten.c | 15 ++++++++++++++-
 arch/x86/xen/smp.c       |  6 ++++--
 arch/x86/xen/xen-ops.h   |  2 ++
 3 files changed, 20 insertions(+), 3 deletions(-)

(limited to 'arch/x86/xen/enlighten.c')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index cd022c43dfbc..aed7ceeb4b65 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -66,6 +66,8 @@ EXPORT_SYMBOL_GPL(xen_start_info);
 
 struct shared_info xen_dummy_shared_info;
 
+void *xen_initial_gdt;
+
 /*
  * Point at some empty memory to start with. We map the real shared_info
  * page as soon as fixmap is up and running.
@@ -917,8 +919,19 @@ asmlinkage void __init xen_start_kernel(void)
 	have_vcpu_info_placement = 0;
 #endif
 
-	/* setup percpu state */
+#ifdef CONFIG_X86_64
+	/*
+	 * Setup percpu state.  We only need to do this for 64-bit
+	 * because 32-bit already has %fs set properly.
+	 */
 	load_percpu_segment(0);
+#endif
+	/*
+	 * The only reliable way to retain the initial address of the
+	 * percpu gdt_page is to remember it here, so we can go and
+	 * mark it RW later, when the initial percpu area is freed.
+	 */
+	xen_initial_gdt = &per_cpu(gdt_page, 0);
 
 	xen_smp_init();
 
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 88d5d5ec6beb..035582ae815d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -170,8 +170,7 @@ static void __init xen_smp_prepare_boot_cpu(void)
 
 	/* We've switched to the "real" per-cpu gdt, so make sure the
 	   old memory can be recycled */
-	make_lowmem_page_readwrite(__per_cpu_load +
-				   (unsigned long)&per_cpu_var(gdt_page));
+	make_lowmem_page_readwrite(xen_initial_gdt);
 
 	xen_setup_vcpu_info_placement();
 }
@@ -287,6 +286,9 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
 	irq_ctx_init(cpu);
 #else
 	clear_tsk_thread_flag(idle, TIF_FORK);
+	per_cpu(kernel_stack, cpu) =
+		(unsigned long)task_stack_page(idle) -
+		KERNEL_STACK_OFFSET + THREAD_SIZE;
 #endif
 	xen_setup_timer(cpu);
 	xen_init_lock_cpu(cpu);
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 11913fc94c14..2f5ef2632ea2 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -10,6 +10,8 @@
 extern const char xen_hypervisor_callback[];
 extern const char xen_failsafe_callback[];
 
+extern void *xen_initial_gdt;
+
 struct trap_info;
 void xen_copy_trap_info(struct trap_info *traps);
 
-- 
cgit v1.2.3


From e4d0407185cdbdcfd99fc23bde2e5454bbc46329 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Mon, 2 Feb 2009 13:55:54 -0800
Subject: xen: use direct ops on 64-bit

Enable the use of the direct vcpu-access operations on 64-bit.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/xen/enlighten.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

(limited to 'arch/x86/xen/enlighten.c')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index aed7ceeb4b65..37230342c2c4 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -87,14 +87,7 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
  *
  * 0: not available, 1: available
  */
-static int have_vcpu_info_placement =
-#ifdef CONFIG_X86_32
-	1
-#else
-	0
-#endif
-	;
-
+static int have_vcpu_info_placement = 1;
 
 static void xen_vcpu_setup(int cpu)
 {
@@ -914,11 +907,6 @@ asmlinkage void __init xen_start_kernel(void)
 
 	machine_ops = xen_machine_ops;
 
-#ifdef CONFIG_X86_64
-	/* Disable until direct per-cpu data access. */
-	have_vcpu_info_placement = 0;
-#endif
-
 #ifdef CONFIG_X86_64
 	/*
 	 * Setup percpu state.  We only need to do this for 64-bit
-- 
cgit v1.2.3


From ccbeed3a05908d201b47b6c3dd1a373138bba566 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 9 Feb 2009 22:17:40 +0900
Subject: x86: make lazy %gs optional on x86_32

Impact: pt_regs changed, lazy gs handling made optional, add slight
        overhead to SAVE_ALL, simplifies error_code path a bit

On x86_32, %gs hasn't been used by kernel and handled lazily.  pt_regs
doesn't have place for it and gs is saved/loaded only when necessary.
In preparation for stack protector support, this patch makes lazy %gs
handling optional by doing the followings.

* Add CONFIG_X86_32_LAZY_GS and place for gs in pt_regs.

* Save and restore %gs along with other registers in entry_32.S unless
  LAZY_GS.  Note that this unfortunately adds "pushl $0" on SAVE_ALL
  even when LAZY_GS.  However, it adds no overhead to common exit path
  and simplifies entry path with error code.

* Define different user_gs accessors depending on LAZY_GS and add
  lazy_save_gs() and lazy_load_gs() which are noop if !LAZY_GS.  The
  lazy_*_gs() ops are used to save, load and clear %gs lazily.

* Define ELF_CORE_COPY_KERNEL_REGS() which always read %gs directly.

xen and lguest changes need to be verified.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                   |   4 ++
 arch/x86/include/asm/elf.h         |  15 ++++-
 arch/x86/include/asm/mmu_context.h |   2 +-
 arch/x86/include/asm/ptrace.h      |   4 +-
 arch/x86/include/asm/system.h      |  12 +++-
 arch/x86/kernel/asm-offsets_32.c   |   1 +
 arch/x86/kernel/entry_32.S         | 132 +++++++++++++++++++++++++++++++------
 arch/x86/kernel/process_32.c       |   4 +-
 arch/x86/kernel/ptrace.c           |   5 +-
 arch/x86/lguest/boot.c             |   2 +-
 arch/x86/xen/enlighten.c           |  17 ++---
 11 files changed, 158 insertions(+), 40 deletions(-)

(limited to 'arch/x86/xen/enlighten.c')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5c8e353c1122..5bcdede71ba4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -207,6 +207,10 @@ config X86_TRAMPOLINE
 	depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP)
 	default y
 
+config X86_32_LAZY_GS
+	def_bool y
+	depends on X86_32
+
 config KTIME_SCALAR
 	def_bool X86_32
 source "init/Kconfig"
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 39b0aac1675c..83c1bc8d2e8a 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -112,7 +112,7 @@ extern unsigned int vdso_enabled;
  * now struct_user_regs, they are different)
  */
 
-#define ELF_CORE_COPY_REGS(pr_reg, regs)	\
+#define ELF_CORE_COPY_REGS_COMMON(pr_reg, regs)	\
 do {						\
 	pr_reg[0] = regs->bx;			\
 	pr_reg[1] = regs->cx;			\
@@ -124,7 +124,6 @@ do {						\
 	pr_reg[7] = regs->ds & 0xffff;		\
 	pr_reg[8] = regs->es & 0xffff;		\
 	pr_reg[9] = regs->fs & 0xffff;		\
-	pr_reg[10] = get_user_gs(regs);		\
 	pr_reg[11] = regs->orig_ax;		\
 	pr_reg[12] = regs->ip;			\
 	pr_reg[13] = regs->cs & 0xffff;		\
@@ -133,6 +132,18 @@ do {						\
 	pr_reg[16] = regs->ss & 0xffff;		\
 } while (0);
 
+#define ELF_CORE_COPY_REGS(pr_reg, regs)	\
+do {						\
+	ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
+	pr_reg[10] = get_user_gs(regs);		\
+} while (0);
+
+#define ELF_CORE_COPY_KERNEL_REGS(pr_reg, regs)	\
+do {						\
+	ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
+	savesegment(gs, pr_reg[10]);		\
+} while (0);
+
 #define ELF_PLATFORM	(utsname()->machine)
 #define set_personality_64bit()	do { } while (0)
 
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 4955165682c5..f923203dc39a 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -79,7 +79,7 @@ do {						\
 #ifdef CONFIG_X86_32
 #define deactivate_mm(tsk, mm)			\
 do {						\
-	set_user_gs(task_pt_regs(tsk), 0);	\
+	lazy_load_gs(0);			\
 } while (0)
 #else
 #define deactivate_mm(tsk, mm)			\
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 6d34d954c228..e304b66abeea 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -28,7 +28,7 @@ struct pt_regs {
 	int  xds;
 	int  xes;
 	int  xfs;
-	/* int  gs; */
+	int  xgs;
 	long orig_eax;
 	long eip;
 	int  xcs;
@@ -50,7 +50,7 @@ struct pt_regs {
 	unsigned long ds;
 	unsigned long es;
 	unsigned long fs;
-	/* int  gs; */
+	unsigned long gs;
 	unsigned long orig_ax;
 	unsigned long ip;
 	unsigned long cs;
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 70c74b8db875..79b98e5b96f4 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -186,10 +186,20 @@ extern void native_load_gs_index(unsigned);
  * x86_32 user gs accessors.
  */
 #ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_32_LAZY_GS
 #define get_user_gs(regs)	(u16)({unsigned long v; savesegment(gs, v); v;})
 #define set_user_gs(regs, v)	loadsegment(gs, (unsigned long)(v))
 #define task_user_gs(tsk)	((tsk)->thread.gs)
-#endif
+#define lazy_save_gs(v)		savesegment(gs, (v))
+#define lazy_load_gs(v)		loadsegment(gs, (v))
+#else	/* X86_32_LAZY_GS */
+#define get_user_gs(regs)	(u16)((regs)->gs)
+#define set_user_gs(regs, v)	do { (regs)->gs = (v); } while (0)
+#define task_user_gs(tsk)	(task_pt_regs(tsk)->gs)
+#define lazy_save_gs(v)		do { } while (0)
+#define lazy_load_gs(v)		do { } while (0)
+#endif	/* X86_32_LAZY_GS */
+#endif	/* X86_32 */
 
 static inline unsigned long get_limit(unsigned long segment)
 {
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index ee4df08feee6..fbf2f33e3080 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -75,6 +75,7 @@ void foo(void)
 	OFFSET(PT_DS,  pt_regs, ds);
 	OFFSET(PT_ES,  pt_regs, es);
 	OFFSET(PT_FS,  pt_regs, fs);
+	OFFSET(PT_GS,  pt_regs, gs);
 	OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
 	OFFSET(PT_EIP, pt_regs, ip);
 	OFFSET(PT_CS,  pt_regs, cs);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c461925d3b64..82e6868bee47 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -30,12 +30,13 @@
  *	1C(%esp) - %ds
  *	20(%esp) - %es
  *	24(%esp) - %fs
- *	28(%esp) - orig_eax
- *	2C(%esp) - %eip
- *	30(%esp) - %cs
- *	34(%esp) - %eflags
- *	38(%esp) - %oldesp
- *	3C(%esp) - %oldss
+ *	28(%esp) - %gs		saved iff !CONFIG_X86_32_LAZY_GS
+ *	2C(%esp) - orig_eax
+ *	30(%esp) - %eip
+ *	34(%esp) - %cs
+ *	38(%esp) - %eflags
+ *	3C(%esp) - %oldesp
+ *	40(%esp) - %oldss
  *
  * "current" is in register %ebx during any slow entries.
  */
@@ -101,8 +102,99 @@
 #define resume_userspace_sig	resume_userspace
 #endif
 
+/*
+ * User gs save/restore
+ *
+ * %gs is used for userland TLS and kernel only uses it for stack
+ * canary which is required to be at %gs:20 by gcc.  Read the comment
+ * at the top of stackprotector.h for more info.
+ *
+ * Local labels 98 and 99 are used.
+ */
+#ifdef CONFIG_X86_32_LAZY_GS
+
+ /* unfortunately push/pop can't be no-op */
+.macro PUSH_GS
+	pushl $0
+	CFI_ADJUST_CFA_OFFSET 4
+.endm
+.macro POP_GS pop=0
+	addl $(4 + \pop), %esp
+	CFI_ADJUST_CFA_OFFSET -(4 + \pop)
+.endm
+.macro POP_GS_EX
+.endm
+
+ /* all the rest are no-op */
+.macro PTGS_TO_GS
+.endm
+.macro PTGS_TO_GS_EX
+.endm
+.macro GS_TO_REG reg
+.endm
+.macro REG_TO_PTGS reg
+.endm
+.macro SET_KERNEL_GS reg
+.endm
+
+#else	/* CONFIG_X86_32_LAZY_GS */
+
+.macro PUSH_GS
+	pushl %gs
+	CFI_ADJUST_CFA_OFFSET 4
+	/*CFI_REL_OFFSET gs, 0*/
+.endm
+
+.macro POP_GS pop=0
+98:	popl %gs
+	CFI_ADJUST_CFA_OFFSET -4
+	/*CFI_RESTORE gs*/
+  .if \pop <> 0
+	add $\pop, %esp
+	CFI_ADJUST_CFA_OFFSET -\pop
+  .endif
+.endm
+.macro POP_GS_EX
+.pushsection .fixup, "ax"
+99:	movl $0, (%esp)
+	jmp 98b
+.section __ex_table, "a"
+	.align 4
+	.long 98b, 99b
+.popsection
+.endm
+
+.macro PTGS_TO_GS
+98:	mov PT_GS(%esp), %gs
+.endm
+.macro PTGS_TO_GS_EX
+.pushsection .fixup, "ax"
+99:	movl $0, PT_GS(%esp)
+	jmp 98b
+.section __ex_table, "a"
+	.align 4
+	.long 98b, 99b
+.popsection
+.endm
+
+.macro GS_TO_REG reg
+	movl %gs, \reg
+	/*CFI_REGISTER gs, \reg*/
+.endm
+.macro REG_TO_PTGS reg
+	movl \reg, PT_GS(%esp)
+	/*CFI_REL_OFFSET gs, PT_GS*/
+.endm
+.macro SET_KERNEL_GS reg
+	xorl \reg, \reg
+	movl \reg, %gs
+.endm
+
+#endif	/* CONFIG_X86_32_LAZY_GS */
+
 .macro SAVE_ALL
 	cld
+	PUSH_GS
 	pushl %fs
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET fs, 0;*/
@@ -138,6 +230,7 @@
 	movl %edx, %es
 	movl $(__KERNEL_PERCPU), %edx
 	movl %edx, %fs
+	SET_KERNEL_GS %edx
 .endm
 
 .macro RESTORE_INT_REGS
@@ -164,7 +257,7 @@
 	CFI_RESTORE eax
 .endm
 
-.macro RESTORE_REGS
+.macro RESTORE_REGS pop=0
 	RESTORE_INT_REGS
 1:	popl %ds
 	CFI_ADJUST_CFA_OFFSET -4
@@ -175,6 +268,7 @@
 3:	popl %fs
 	CFI_ADJUST_CFA_OFFSET -4
 	/*CFI_RESTORE fs;*/
+	POP_GS \pop
 .pushsection .fixup, "ax"
 4:	movl $0, (%esp)
 	jmp 1b
@@ -188,6 +282,7 @@
 	.long 2b, 5b
 	.long 3b, 6b
 .popsection
+	POP_GS_EX
 .endm
 
 .macro RING0_INT_FRAME
@@ -368,6 +463,7 @@ sysenter_exit:
 	xorl %ebp,%ebp
 	TRACE_IRQS_ON
 1:	mov  PT_FS(%esp), %fs
+	PTGS_TO_GS
 	ENABLE_INTERRUPTS_SYSEXIT
 
 #ifdef CONFIG_AUDITSYSCALL
@@ -416,6 +512,7 @@ sysexit_audit:
 	.align 4
 	.long 1b,2b
 .popsection
+	PTGS_TO_GS_EX
 ENDPROC(ia32_sysenter_target)
 
 	# system call handler stub
@@ -458,8 +555,7 @@ restore_all:
 restore_nocheck:
 	TRACE_IRQS_IRET
 restore_nocheck_notrace:
-	RESTORE_REGS
-	addl $4, %esp			# skip orig_eax/error_code
+	RESTORE_REGS 4			# skip orig_eax/error_code
 	CFI_ADJUST_CFA_OFFSET -4
 irq_return:
 	INTERRUPT_RETURN
@@ -1078,7 +1174,10 @@ ENTRY(page_fault)
 	CFI_ADJUST_CFA_OFFSET 4
 	ALIGN
 error_code:
-	/* the function address is in %fs's slot on the stack */
+	/* the function address is in %gs's slot on the stack */
+	pushl %fs
+	CFI_ADJUST_CFA_OFFSET 4
+	/*CFI_REL_OFFSET fs, 0*/
 	pushl %es
 	CFI_ADJUST_CFA_OFFSET 4
 	/*CFI_REL_OFFSET es, 0*/
@@ -1107,20 +1206,15 @@ error_code:
 	CFI_ADJUST_CFA_OFFSET 4
 	CFI_REL_OFFSET ebx, 0
 	cld
-	pushl %fs
-	CFI_ADJUST_CFA_OFFSET 4
-	/*CFI_REL_OFFSET fs, 0*/
 	movl $(__KERNEL_PERCPU), %ecx
 	movl %ecx, %fs
 	UNWIND_ESPFIX_STACK
-	popl %ecx
-	CFI_ADJUST_CFA_OFFSET -4
-	/*CFI_REGISTER es, ecx*/
-	movl PT_FS(%esp), %edi		# get the function address
+	GS_TO_REG %ecx
+	movl PT_GS(%esp), %edi		# get the function address
 	movl PT_ORIG_EAX(%esp), %edx	# get the error code
 	movl $-1, PT_ORIG_EAX(%esp)	# no syscall to restart
-	mov  %ecx, PT_FS(%esp)
-	/*CFI_REL_OFFSET fs, ES*/
+	REG_TO_PTGS %ecx
+	SET_KERNEL_GS %ecx
 	movl $(__USER_DS), %ecx
 	movl %ecx, %ds
 	movl %ecx, %es
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index d58a340e1be3..86122fa2a1ba 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -539,7 +539,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * used %fs or %gs (it does not today), or if the kernel is
 	 * running inside of a hypervisor layer.
 	 */
-	savesegment(gs, prev->gs);
+	lazy_save_gs(prev->gs);
 
 	/*
 	 * Load the per-thread Thread-Local Storage descriptor.
@@ -585,7 +585,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * Restore %gs if needed (which is common)
 	 */
 	if (prev->gs | next->gs)
-		loadsegment(gs, next->gs);
+		lazy_load_gs(next->gs);
 
 	percpu_write(current_task, next_p);
 
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 508b6b57d0c3..7ec39ab37a2d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -75,10 +75,7 @@ static inline bool invalid_selector(u16 value)
 static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
 {
 	BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
-	regno >>= 2;
-	if (regno > FS)
-		--regno;
-	return &regs->bx + regno;
+	return &regs->bx + (regno >> 2);
 }
 
 static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 19e33b6cd593..da2e314f61b5 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -283,7 +283,7 @@ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu)
 	/* There's one problem which normal hardware doesn't have: the Host
 	 * can't handle us removing entries we're currently using.  So we clear
 	 * the GS register here: if it's needed it'll be reloaded anyway. */
-	loadsegment(gs, 0);
+	lazy_load_gs(0);
 	lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
 }
 
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 37230342c2c4..95ff6a0e942a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -323,13 +323,14 @@ static void load_TLS_descriptor(struct thread_struct *t,
 static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
 {
 	/*
-	 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
-	 * it means we're in a context switch, and %gs has just been
-	 * saved.  This means we can zero it out to prevent faults on
-	 * exit from the hypervisor if the next process has no %gs.
-	 * Either way, it has been saved, and the new value will get
-	 * loaded properly.  This will go away as soon as Xen has been
-	 * modified to not save/restore %gs for normal hypercalls.
+	 * XXX sleazy hack: If we're being called in a lazy-cpu zone
+	 * and lazy gs handling is enabled, it means we're in a
+	 * context switch, and %gs has just been saved.  This means we
+	 * can zero it out to prevent faults on exit from the
+	 * hypervisor if the next process has no %gs.  Either way, it
+	 * has been saved, and the new value will get loaded properly.
+	 * This will go away as soon as Xen has been modified to not
+	 * save/restore %gs for normal hypercalls.
 	 *
 	 * On x86_64, this hack is not used for %gs, because gs points
 	 * to KERNEL_GS_BASE (and uses it for PDA references), so we
@@ -341,7 +342,7 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
 	 */
 	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
 #ifdef CONFIG_X86_32
-		loadsegment(gs, 0);
+		lazy_load_gs(0);
 #else
 		loadsegment(fs, 0);
 #endif
-- 
cgit v1.2.3


From c1eeb2de41d7015678bdd412b48a5f071b84e29a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 16 Feb 2009 23:02:14 -0800
Subject: x86: fold apic_ops into genapic

Impact: cleanup

make it simpler, don't need have one extra struct.

v2: fix the sgi_uv build

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/apic.h               | 97 ++++++++++---------------------
 arch/x86/include/asm/genapic.h            | 61 +++++++++++++++++++
 arch/x86/kernel/apic.c                    | 63 ++------------------
 arch/x86/kernel/bigsmp_32.c               |  7 +++
 arch/x86/kernel/cpu/mcheck/mce_amd_64.c   |  2 +-
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c |  2 +-
 arch/x86/kernel/cpu/perfctr-watchdog.c    |  2 +-
 arch/x86/kernel/es7000_32.c               |  7 +++
 arch/x86/kernel/genapic_64.c              |  2 +-
 arch/x86/kernel/genapic_flat_64.c         | 16 ++++-
 arch/x86/kernel/genx2apic_cluster.c       | 11 +++-
 arch/x86/kernel/genx2apic_phys.c          | 11 +++-
 arch/x86/kernel/genx2apic_uv_x.c          |  9 ++-
 arch/x86/kernel/ipi.c                     |  2 +-
 arch/x86/kernel/irq.c                     |  2 +-
 arch/x86/kernel/nmi.c                     |  2 +-
 arch/x86/kernel/numaq_32.c                |  7 +++
 arch/x86/kernel/probe_32.c                |  7 +++
 arch/x86/kernel/summit_32.c               |  7 +++
 arch/x86/kernel/uv_irq.c                  |  2 +-
 arch/x86/kernel/vmi_32.c                  |  6 +-
 arch/x86/kernel/vmiclock_32.c             |  2 +-
 arch/x86/lguest/boot.c                    | 19 +++---
 arch/x86/xen/enlighten.c                  | 21 +++----
 24 files changed, 205 insertions(+), 162 deletions(-)

(limited to 'arch/x86/xen/enlighten.c')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index dc1db99cd40e..4f56e053d347 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -92,6 +92,12 @@ static inline u32 native_apic_mem_read(u32 reg)
 	return *((volatile u32 *)(APIC_BASE + reg));
 }
 
+extern void native_apic_wait_icr_idle(void);
+extern u32 native_safe_apic_wait_icr_idle(void);
+extern void native_apic_icr_write(u32 low, u32 id);
+extern u64 native_apic_icr_read(void);
+
+#ifdef CONFIG_X86_X2APIC
 static inline void native_apic_msr_write(u32 reg, u32 v)
 {
 	if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
@@ -112,7 +118,31 @@ static inline u32 native_apic_msr_read(u32 reg)
 	return low;
 }
 
-#ifdef CONFIG_X86_X2APIC
+static inline void native_x2apic_wait_icr_idle(void)
+{
+	/* no need to wait for icr idle in x2apic */
+	return;
+}
+
+static inline u32 native_safe_x2apic_wait_icr_idle(void)
+{
+	/* no need to wait for icr idle in x2apic */
+	return 0;
+}
+
+static inline void native_x2apic_icr_write(u32 low, u32 id)
+{
+	wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
+}
+
+static inline u64 native_x2apic_icr_read(void)
+{
+	unsigned long val;
+
+	rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
+	return val;
+}
+
 extern int x2apic;
 extern void check_x2apic(void);
 extern void enable_x2apic(void);
@@ -146,47 +176,6 @@ static inline int x2apic_enabled(void)
 }
 #endif
 
-struct apic_ops {
-	u32 (*read)(u32 reg);
-	void (*write)(u32 reg, u32 v);
-	u64 (*icr_read)(void);
-	void (*icr_write)(u32 low, u32 high);
-	void (*wait_icr_idle)(void);
-	u32 (*safe_wait_icr_idle)(void);
-};
-
-extern struct apic_ops *apic_ops;
-
-static inline u32 apic_read(u32 reg)
-{
-	return apic_ops->read(reg);
-}
-
-static inline void apic_write(u32 reg, u32 val)
-{
-	apic_ops->write(reg, val);
-}
-
-static inline u64 apic_icr_read(void)
-{
-	return apic_ops->icr_read();
-}
-
-static inline void apic_icr_write(u32 low, u32 high)
-{
-	apic_ops->icr_write(low, high);
-}
-
-static inline void apic_wait_icr_idle(void)
-{
-	apic_ops->wait_icr_idle();
-}
-
-static inline u32 safe_apic_wait_icr_idle(void)
-{
-	return apic_ops->safe_wait_icr_idle();
-}
-
 extern int get_physical_broadcast(void);
 
 #ifdef CONFIG_X86_X2APIC
@@ -197,18 +186,6 @@ static inline void ack_x2APIC_irq(void)
 }
 #endif
 
-
-static inline void ack_APIC_irq(void)
-{
-	/*
-	 * ack_APIC_irq() actually gets compiled as a single instruction
-	 * ... yummie.
-	 */
-
-	/* Docs say use 0 for future compatibility */
-	apic_write(APIC_EOI, 0);
-}
-
 extern int lapic_get_maxlvt(void);
 extern void clear_local_APIC(void);
 extern void connect_bsp_APIC(void);
@@ -256,18 +233,6 @@ static inline void disable_local_APIC(void) { }
 #define	SET_APIC_ID(x)		(apic->set_apic_id(x))
 #else
 
-#ifdef CONFIG_X86_LOCAL_APIC
-static inline unsigned default_get_apic_id(unsigned long x)
-{
-	unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
-
-	if (APIC_XAPIC(ver))
-		return (x >> 24) & 0xFF;
-	else
-		return (x >> 24) & 0x0F;
-}
-#endif
-
 #endif
 
 #endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/genapic.h b/arch/x86/include/asm/genapic.h
index 273b99452ae0..a6d0b00a544c 100644
--- a/arch/x86/include/asm/genapic.h
+++ b/arch/x86/include/asm/genapic.h
@@ -5,6 +5,7 @@
 
 #include <asm/mpspec.h>
 #include <asm/atomic.h>
+#include <asm/apic.h>
 
 /*
  * Copyright 2004 James Cleverdon, IBM.
@@ -83,10 +84,70 @@ struct genapic {
 	void (*smp_callin_clear_local_apic)(void);
 	void (*store_NMI_vector)(unsigned short *high, unsigned short *low);
 	void (*inquire_remote_apic)(int apicid);
+
+	/* apic ops */
+	u32 (*read)(u32 reg);
+	void (*write)(u32 reg, u32 v);
+	u64 (*icr_read)(void);
+	void (*icr_write)(u32 low, u32 high);
+	void (*wait_icr_idle)(void);
+	u32 (*safe_wait_icr_idle)(void);
 };
 
 extern struct genapic *apic;
 
+static inline u32 apic_read(u32 reg)
+{
+	return apic->read(reg);
+}
+
+static inline void apic_write(u32 reg, u32 val)
+{
+	apic->write(reg, val);
+}
+
+static inline u64 apic_icr_read(void)
+{
+	return apic->icr_read();
+}
+
+static inline void apic_icr_write(u32 low, u32 high)
+{
+	apic->icr_write(low, high);
+}
+
+static inline void apic_wait_icr_idle(void)
+{
+	apic->wait_icr_idle();
+}
+
+static inline u32 safe_apic_wait_icr_idle(void)
+{
+	return apic->safe_wait_icr_idle();
+}
+
+
+static inline void ack_APIC_irq(void)
+{
+	/*
+	 * ack_APIC_irq() actually gets compiled as a single instruction
+	 * ... yummie.
+	 */
+
+	/* Docs say use 0 for future compatibility */
+	apic_write(APIC_EOI, 0);
+}
+
+static inline unsigned default_get_apic_id(unsigned long x)
+{
+	unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
+
+	if (APIC_XAPIC(ver))
+		return (x >> 24) & 0xFF;
+	else
+		return (x >> 24) & 0x0F;
+}
+
 /*
  * Warm reset vector default position:
  */
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 004aa1c31e4f..af494bad8858 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -210,18 +210,13 @@ static int modern_apic(void)
 	return lapic_get_version() >= 0x14;
 }
 
-/*
- * Paravirt kernels also might be using these below ops. So we still
- * use generic apic_read()/apic_write(), which might be pointing to different
- * ops in PARAVIRT case.
- */
-void xapic_wait_icr_idle(void)
+void native_apic_wait_icr_idle(void)
 {
 	while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
 		cpu_relax();
 }
 
-u32 safe_xapic_wait_icr_idle(void)
+u32 native_safe_apic_wait_icr_idle(void)
 {
 	u32 send_status;
 	int timeout;
@@ -237,13 +232,13 @@ u32 safe_xapic_wait_icr_idle(void)
 	return send_status;
 }
 
-void xapic_icr_write(u32 low, u32 id)
+void native_apic_icr_write(u32 low, u32 id)
 {
 	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
 	apic_write(APIC_ICR, low);
 }
 
-static u64 xapic_icr_read(void)
+u64 native_apic_icr_read(void)
 {
 	u32 icr1, icr2;
 
@@ -253,54 +248,6 @@ static u64 xapic_icr_read(void)
 	return icr1 | ((u64)icr2 << 32);
 }
 
-static struct apic_ops xapic_ops = {
-	.read = native_apic_mem_read,
-	.write = native_apic_mem_write,
-	.icr_read = xapic_icr_read,
-	.icr_write = xapic_icr_write,
-	.wait_icr_idle = xapic_wait_icr_idle,
-	.safe_wait_icr_idle = safe_xapic_wait_icr_idle,
-};
-
-struct apic_ops __read_mostly *apic_ops = &xapic_ops;
-EXPORT_SYMBOL_GPL(apic_ops);
-
-#ifdef CONFIG_X86_X2APIC
-static void x2apic_wait_icr_idle(void)
-{
-	/* no need to wait for icr idle in x2apic */
-	return;
-}
-
-static u32 safe_x2apic_wait_icr_idle(void)
-{
-	/* no need to wait for icr idle in x2apic */
-	return 0;
-}
-
-void x2apic_icr_write(u32 low, u32 id)
-{
-	wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
-}
-
-static u64 x2apic_icr_read(void)
-{
-	unsigned long val;
-
-	rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
-	return val;
-}
-
-static struct apic_ops x2apic_ops = {
-	.read = native_apic_msr_read,
-	.write = native_apic_msr_write,
-	.icr_read = x2apic_icr_read,
-	.icr_write = x2apic_icr_write,
-	.wait_icr_idle = x2apic_wait_icr_idle,
-	.safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
-};
-#endif
-
 /**
  * enable_NMI_through_LVT0 - enable NMI through local vector table 0
  */
@@ -1329,7 +1276,6 @@ void check_x2apic(void)
 	if (msr & X2APIC_ENABLE) {
 		pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
 		x2apic_preenabled = x2apic = 1;
-		apic_ops = &x2apic_ops;
 	}
 }
 
@@ -1403,7 +1349,6 @@ void __init enable_IR_x2apic(void)
 
 	if (!x2apic) {
 		x2apic = 1;
-		apic_ops = &x2apic_ops;
 		enable_x2apic();
 	}
 
diff --git a/arch/x86/kernel/bigsmp_32.c b/arch/x86/kernel/bigsmp_32.c
index 47a62f46afdb..9eeb714c5ded 100644
--- a/arch/x86/kernel/bigsmp_32.c
+++ b/arch/x86/kernel/bigsmp_32.c
@@ -263,4 +263,11 @@ struct genapic apic_bigsmp = {
 	.smp_callin_clear_local_apic	= NULL,
 	.store_NMI_vector		= NULL,
 	.inquire_remote_apic		= default_inquire_remote_apic,
+
+	.read				= native_apic_mem_read,
+	.write				= native_apic_mem_write,
+	.icr_read			= native_apic_icr_read,
+	.icr_write			= native_apic_icr_write,
+	.wait_icr_idle			= native_apic_wait_icr_idle,
+	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 };
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 4772e91e8246..e22d6ed26e61 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -24,7 +24,7 @@
 #include <linux/smp.h>
 #include <linux/sysdev.h>
 #include <linux/sysfs.h>
-#include <asm/apic.h>
+#include <asm/genapic.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 #include <asm/percpu.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 5e8c79e748a6..42f090702f02 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -7,7 +7,7 @@
 #include <linux/interrupt.h>
 #include <linux/percpu.h>
 #include <asm/processor.h>
-#include <asm/apic.h>
+#include <asm/genapic.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
 #include <asm/hw_irq.h>
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9abd48b22674..f6c70a164e32 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -19,7 +19,7 @@
 #include <linux/nmi.h>
 #include <linux/kprobes.h>
 
-#include <asm/apic.h>
+#include <asm/genapic.h>
 #include <asm/intel_arch_perfmon.h>
 
 struct nmi_watchdog_ctlblk {
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
index 55515d73d9c2..23f1df4ce18e 100644
--- a/arch/x86/kernel/es7000_32.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -806,4 +806,11 @@ struct genapic apic_es7000 = {
 	.smp_callin_clear_local_apic	= NULL,
 	.store_NMI_vector		= NULL,
 	.inquire_remote_apic		= default_inquire_remote_apic,
+
+	.read				= native_apic_mem_read,
+	.write				= native_apic_mem_write,
+	.icr_read			= native_apic_icr_read,
+	.icr_write			= native_apic_icr_write,
+	.wait_icr_idle			= native_apic_wait_icr_idle,
+	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 };
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index cdc4772d9c87..70b616b4c629 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -19,8 +19,8 @@
 #include <linux/dmar.h>
 
 #include <asm/smp.h>
-#include <asm/ipi.h>
 #include <asm/genapic.h>
+#include <asm/ipi.h>
 #include <asm/setup.h>
 
 extern struct genapic apic_flat;
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 249d2d3c034c..36ee760fd133 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -17,8 +17,8 @@
 #include <linux/init.h>
 #include <linux/hardirq.h>
 #include <asm/smp.h>
-#include <asm/ipi.h>
 #include <asm/genapic.h>
+#include <asm/ipi.h>
 
 #ifdef CONFIG_ACPI
 #include <acpi/acpi_bus.h>
@@ -229,6 +229,13 @@ struct genapic apic_flat =  {
 	.smp_callin_clear_local_apic	= NULL,
 	.store_NMI_vector		= NULL,
 	.inquire_remote_apic		= NULL,
+
+	.read				= native_apic_mem_read,
+	.write				= native_apic_mem_write,
+	.icr_read			= native_apic_icr_read,
+	.icr_write			= native_apic_icr_write,
+	.wait_icr_idle			= native_apic_wait_icr_idle,
+	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 };
 
 /*
@@ -374,4 +381,11 @@ struct genapic apic_physflat =  {
 	.smp_callin_clear_local_apic	= NULL,
 	.store_NMI_vector		= NULL,
 	.inquire_remote_apic		= NULL,
+
+	.read				= native_apic_mem_read,
+	.write				= native_apic_mem_write,
+	.icr_read			= native_apic_icr_read,
+	.icr_write			= native_apic_icr_write,
+	.wait_icr_idle			= native_apic_wait_icr_idle,
+	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 };
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
index 7c87156b6411..dd6e8d685426 100644
--- a/arch/x86/kernel/genx2apic_cluster.c
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -7,8 +7,8 @@
 #include <linux/dmar.h>
 
 #include <asm/smp.h>
-#include <asm/ipi.h>
 #include <asm/genapic.h>
+#include <asm/ipi.h>
 
 DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
 
@@ -46,7 +46,7 @@ static void
 	/*
 	 * send the IPI.
 	 */
-	x2apic_icr_write(cfg, apicid);
+	native_x2apic_icr_write(cfg, apicid);
 }
 
 /*
@@ -234,4 +234,11 @@ struct genapic apic_x2apic_cluster = {
 	.smp_callin_clear_local_apic	= NULL,
 	.store_NMI_vector		= NULL,
 	.inquire_remote_apic		= NULL,
+
+	.read				= native_apic_msr_read,
+	.write				= native_apic_msr_write,
+	.icr_read			= native_x2apic_icr_read,
+	.icr_write			= native_x2apic_icr_write,
+	.wait_icr_idle			= native_x2apic_wait_icr_idle,
+	.safe_wait_icr_idle		= native_safe_x2apic_wait_icr_idle,
 };
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
index 5cbae8aa0408..eb1486bb002e 100644
--- a/arch/x86/kernel/genx2apic_phys.c
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -7,8 +7,8 @@
 #include <linux/dmar.h>
 
 #include <asm/smp.h>
-#include <asm/ipi.h>
 #include <asm/genapic.h>
+#include <asm/ipi.h>
 
 static int x2apic_phys;
 
@@ -50,7 +50,7 @@ static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
 	/*
 	 * send the IPI.
 	 */
-	x2apic_icr_write(cfg, apicid);
+	native_x2apic_icr_write(cfg, apicid);
 }
 
 static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
@@ -220,4 +220,11 @@ struct genapic apic_x2apic_phys = {
 	.smp_callin_clear_local_apic	= NULL,
 	.store_NMI_vector		= NULL,
 	.inquire_remote_apic		= NULL,
+
+	.read				= native_apic_msr_read,
+	.write				= native_apic_msr_write,
+	.icr_read			= native_x2apic_icr_read,
+	.icr_write			= native_x2apic_icr_write,
+	.wait_icr_idle			= native_x2apic_wait_icr_idle,
+	.safe_wait_icr_idle		= native_safe_x2apic_wait_icr_idle,
 };
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 89b84e004f04..9ae4a92fac8c 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -22,8 +22,8 @@
 #include <linux/proc_fs.h>
 #include <asm/current.h>
 #include <asm/smp.h>
-#include <asm/ipi.h>
 #include <asm/genapic.h>
+#include <asm/ipi.h>
 #include <asm/pgtable.h>
 #include <asm/uv/uv.h>
 #include <asm/uv/uv_mmrs.h>
@@ -292,6 +292,13 @@ struct genapic apic_x2apic_uv_x = {
 	.smp_callin_clear_local_apic	= NULL,
 	.store_NMI_vector		= NULL,
 	.inquire_remote_apic		= NULL,
+
+	.read				= native_apic_msr_read,
+	.write				= native_apic_msr_write,
+	.icr_read			= native_x2apic_icr_read,
+	.icr_write			= native_x2apic_icr_write,
+	.wait_icr_idle			= native_x2apic_wait_icr_idle,
+	.safe_wait_icr_idle		= native_safe_x2apic_wait_icr_idle,
 };
 
 static __cpuinit void set_x2apic_extra_bits(int pnode)
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index dbf5445727a9..1326272cae43 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -15,7 +15,7 @@
 #include <asm/mtrr.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
-#include <asm/apic.h>
+#include <asm/genapic.h>
 #include <asm/proto.h>
 #include <asm/ipi.h>
 
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index f13ca1650aaf..3957776b1930 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -8,7 +8,7 @@
 #include <linux/smp.h>
 #include <linux/ftrace.h>
 
-#include <asm/apic.h>
+#include <asm/genapic.h>
 #include <asm/io_apic.h>
 #include <asm/irq.h>
 #include <asm/idle.h>
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index bdfad80c3cf1..48b9ca5e088c 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -11,7 +11,7 @@
  *  Mikael Pettersson	: PM converted to driver model. Disable/enable API.
  */
 
-#include <asm/apic.h>
+#include <asm/genapic.h>
 
 #include <linux/nmi.h>
 #include <linux/mm.h>
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index 0cc41a1d2550..f0f0c2f0596b 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -569,4 +569,11 @@ struct genapic apic_numaq = {
 	.smp_callin_clear_local_apic	= numaq_smp_callin_clear_local_apic,
 	.store_NMI_vector		= numaq_store_NMI_vector,
 	.inquire_remote_apic		= NULL,
+
+	.read				= native_apic_mem_read,
+	.write				= native_apic_mem_write,
+	.icr_read			= native_apic_icr_read,
+	.icr_write			= native_apic_icr_write,
+	.wait_icr_idle			= native_apic_wait_icr_idle,
+	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 };
diff --git a/arch/x86/kernel/probe_32.c b/arch/x86/kernel/probe_32.c
index 22337b75de62..1f701caa95bc 100644
--- a/arch/x86/kernel/probe_32.c
+++ b/arch/x86/kernel/probe_32.c
@@ -127,6 +127,13 @@ struct genapic apic_default = {
 	.smp_callin_clear_local_apic	= NULL,
 	.store_NMI_vector		= NULL,
 	.inquire_remote_apic		= default_inquire_remote_apic,
+
+	.read				= native_apic_mem_read,
+	.write				= native_apic_mem_write,
+	.icr_read			= native_apic_icr_read,
+	.icr_write			= native_apic_icr_write,
+	.wait_icr_idle			= native_apic_wait_icr_idle,
+	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 };
 
 extern struct genapic apic_numaq;
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index 1e733eff9b33..1cf32c325d12 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -599,4 +599,11 @@ struct genapic apic_summit = {
 	.smp_callin_clear_local_apic	= NULL,
 	.store_NMI_vector		= NULL,
 	.inquire_remote_apic		= default_inquire_remote_apic,
+
+	.read				= native_apic_mem_read,
+	.write				= native_apic_mem_write,
+	.icr_read			= native_apic_icr_read,
+	.icr_write			= native_apic_icr_write,
+	.wait_icr_idle			= native_apic_wait_icr_idle,
+	.safe_wait_icr_idle		= native_safe_apic_wait_icr_idle,
 };
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index aeef529917e4..75eb5ec5dd2a 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -11,7 +11,7 @@
 #include <linux/module.h>
 #include <linux/irq.h>
 
-#include <asm/apic.h>
+#include <asm/genapic.h>
 #include <asm/uv/uv_irq.h>
 
 static void uv_noop(unsigned int irq)
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index f052c84ecbe4..a1c7b71dc0d0 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -32,7 +32,7 @@
 #include <asm/io.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
-#include <asm/apic.h>
+#include <asm/genapic.h>
 #include <asm/processor.h>
 #include <asm/timer.h>
 #include <asm/vmi_time.h>
@@ -798,8 +798,8 @@ static inline int __init activate_vmi(void)
 #endif
 
 #ifdef CONFIG_X86_LOCAL_APIC
-       para_fill(apic_ops->read, APICRead);
-       para_fill(apic_ops->write, APICWrite);
+       para_fill(apic->read, APICRead);
+       para_fill(apic->write, APICWrite);
 #endif
 
 	/*
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index a4791ef412d1..2a5e0e6a7c0d 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -30,7 +30,7 @@
 #include <asm/vmi_time.h>
 #include <asm/arch_hooks.h>
 #include <asm/apicdef.h>
-#include <asm/apic.h>
+#include <asm/genapic.h>
 #include <asm/timer.h>
 #include <asm/i8253.h>
 #include <asm/irq_vectors.h>
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index da2e314f61b5..bc9893f2c383 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -55,7 +55,7 @@
 #include <linux/lguest_launcher.h>
 #include <linux/virtio_console.h>
 #include <linux/pm.h>
-#include <asm/apic.h>
+#include <asm/genapic.h>
 #include <asm/lguest.h>
 #include <asm/paravirt.h>
 #include <asm/param.h>
@@ -828,13 +828,14 @@ static u32 lguest_apic_safe_wait_icr_idle(void)
 	return 0;
 }
 
-static struct apic_ops lguest_basic_apic_ops = {
-	.read = lguest_apic_read,
-	.write = lguest_apic_write,
-	.icr_read = lguest_apic_icr_read,
-	.icr_write = lguest_apic_icr_write,
-	.wait_icr_idle = lguest_apic_wait_icr_idle,
-	.safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle,
+static void set_lguest_basic_apic_ops(void)
+{
+	apic->read = lguest_apic_read;
+	apic->write = lguest_apic_write;
+	apic->icr_read = lguest_apic_icr_read;
+	apic->icr_write = lguest_apic_icr_write;
+	apic->wait_icr_idle = lguest_apic_wait_icr_idle;
+	apic->safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle;
 };
 #endif
 
@@ -1035,7 +1036,7 @@ __init void lguest_init(void)
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	/* apic read/write intercepts */
-	apic_ops = &lguest_basic_apic_ops;
+	set_lguest_basic_apic_ops();
 #endif
 
 	/* time operations */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 95ff6a0e942a..e3dd3fb67290 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -36,7 +36,7 @@
 #include <xen/hvc-console.h>
 
 #include <asm/paravirt.h>
-#include <asm/apic.h>
+#include <asm/genapic.h>
 #include <asm/page.h>
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
@@ -554,14 +554,15 @@ static u32 xen_safe_apic_wait_icr_idle(void)
         return 0;
 }
 
-static struct apic_ops xen_basic_apic_ops = {
-	.read = xen_apic_read,
-	.write = xen_apic_write,
-	.icr_read = xen_apic_icr_read,
-	.icr_write = xen_apic_icr_write,
-	.wait_icr_idle = xen_apic_wait_icr_idle,
-	.safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
-};
+static void set_xen_basic_apic_ops(void)
+{
+	apic->read = xen_apic_read;
+	apic->write = xen_apic_write;
+	apic->icr_read = xen_apic_icr_read;
+	apic->icr_write = xen_apic_icr_write;
+	apic->wait_icr_idle = xen_apic_wait_icr_idle;
+	apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
+}
 
 #endif
 
@@ -898,7 +899,7 @@ asmlinkage void __init xen_start_kernel(void)
 	/*
 	 * set up the basic apic ops.
 	 */
-	apic_ops = &xen_basic_apic_ops;
+	set_xen_basic_apic_ops();
 #endif
 
 	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
-- 
cgit v1.2.3


From 7b6aa335ca1a845c2262ec7a595b4521bca0f79d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 17 Feb 2009 13:58:15 +0100
Subject: x86, apic: remove genapic.h

Impact: cleanup

Remove genapic.h and remove all references to it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/ipi.h                 | 2 +-
 arch/x86/kernel/acpi/boot.c                | 2 +-
 arch/x86/kernel/apic.c                     | 2 +-
 arch/x86/kernel/bigsmp_32.c                | 2 +-
 arch/x86/kernel/cpu/addon_cpuid_features.c | 2 +-
 arch/x86/kernel/cpu/amd.c                  | 2 +-
 arch/x86/kernel/cpu/common.c               | 4 ++--
 arch/x86/kernel/cpu/intel.c                | 2 +-
 arch/x86/kernel/cpu/mcheck/mce_amd_64.c    | 2 +-
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c  | 2 +-
 arch/x86/kernel/cpu/mcheck/p4.c            | 2 +-
 arch/x86/kernel/crash.c                    | 2 +-
 arch/x86/kernel/es7000_32.c                | 4 ++--
 arch/x86/kernel/genapic_64.c               | 2 +-
 arch/x86/kernel/genapic_flat_64.c          | 2 +-
 arch/x86/kernel/genx2apic_cluster.c        | 2 +-
 arch/x86/kernel/genx2apic_phys.c           | 2 +-
 arch/x86/kernel/genx2apic_uv_x.c           | 2 +-
 arch/x86/kernel/io_apic.c                  | 2 +-
 arch/x86/kernel/ipi.c                      | 2 +-
 arch/x86/kernel/irq.c                      | 2 +-
 arch/x86/kernel/irq_32.c                   | 2 +-
 arch/x86/kernel/kgdb.c                     | 2 +-
 arch/x86/kernel/mpparse.c                  | 2 +-
 arch/x86/kernel/nmi.c                      | 2 +-
 arch/x86/kernel/numaq_32.c                 | 4 ++--
 arch/x86/kernel/probe_32.c                 | 8 ++++----
 arch/x86/kernel/reboot.c                   | 2 +-
 arch/x86/kernel/setup.c                    | 2 +-
 arch/x86/kernel/smp.c                      | 2 +-
 arch/x86/kernel/smpboot.c                  | 4 ++--
 arch/x86/kernel/summit_32.c                | 2 +-
 arch/x86/kernel/tlb_uv.c                   | 4 ++--
 arch/x86/kernel/uv_irq.c                   | 2 +-
 arch/x86/kernel/visws_quirks.c             | 4 ++--
 arch/x86/kernel/vmi_32.c                   | 2 +-
 arch/x86/kernel/vmiclock_32.c              | 2 +-
 arch/x86/lguest/boot.c                     | 2 +-
 arch/x86/mm/srat_64.c                      | 2 +-
 arch/x86/mm/tlb.c                          | 2 +-
 arch/x86/oprofile/nmi_int.c                | 2 +-
 arch/x86/oprofile/op_model_p4.c            | 2 +-
 arch/x86/oprofile/op_model_ppro.c          | 2 +-
 arch/x86/pci/numaq_32.c                    | 2 +-
 arch/x86/xen/enlighten.c                   | 2 +-
 45 files changed, 54 insertions(+), 54 deletions(-)

(limited to 'arch/x86/xen/enlighten.c')

diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index 5f2efc5d9927..3395c680a976 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -123,7 +123,7 @@ extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask,
 						 int vector);
 extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
 							 int vector);
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
 extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
 							 int vector);
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 956c1dee6fbe..42814152c940 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -37,7 +37,7 @@
 #include <asm/pgtable.h>
 #include <asm/io_apic.h>
 #include <asm/apic.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/io.h>
 #include <asm/mpspec.h>
 #include <asm/smp.h>
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index af494bad8858..7db03a9b61dc 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -36,7 +36,7 @@
 
 #include <asm/arch_hooks.h>
 #include <asm/pgalloc.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/atomic.h>
 #include <asm/mpspec.h>
 #include <asm/i8253.h>
diff --git a/arch/x86/kernel/bigsmp_32.c b/arch/x86/kernel/bigsmp_32.c
index 9eeb714c5ded..72f4e534051e 100644
--- a/arch/x86/kernel/bigsmp_32.c
+++ b/arch/x86/kernel/bigsmp_32.c
@@ -6,7 +6,7 @@
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <asm/mpspec.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
 #include <asm/ipi.h>
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index e48640cfac0c..6882a735d9c0 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -7,7 +7,7 @@
 #include <asm/pat.h>
 #include <asm/processor.h>
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
 struct cpuid_bit {
 	u16 feature;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ff4d7b9e32e4..c94ba9311e65 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -12,7 +12,7 @@
 # include <asm/cacheflush.h>
 #endif
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
 #include "cpu.h"
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4b5d13e472d6..41f3788ec9b9 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -26,8 +26,8 @@
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/mpspec.h>
 #include <asm/apic.h>
-#include <asm/genapic.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
+#include <asm/apic.h>
 #include <asm/uv/uv.h>
 #endif
 
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1f137a87d4bd..290f92e2b7c6 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -24,7 +24,7 @@
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/mpspec.h>
 #include <asm/apic.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #endif
 
 static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index e22d6ed26e61..4772e91e8246 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -24,7 +24,7 @@
 #include <linux/smp.h>
 #include <linux/sysdev.h>
 #include <linux/sysfs.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/mce.h>
 #include <asm/msr.h>
 #include <asm/percpu.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 42f090702f02..5e8c79e748a6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -7,7 +7,7 @@
 #include <linux/interrupt.h>
 #include <linux/percpu.h>
 #include <asm/processor.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/msr.h>
 #include <asm/mce.h>
 #include <asm/hw_irq.h>
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index f9c92b66dfbe..9b60fce09f75 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -11,7 +11,7 @@
 #include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/msr.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
 #include <asm/therm_throt.h>
 
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index ad7f2a696f4a..3340cc0f244e 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -28,7 +28,7 @@
 #include <asm/reboot.h>
 #include <asm/virtext.h>
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
 
 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c
index 23f1df4ce18e..6cdfb3e4dc33 100644
--- a/arch/x86/kernel/es7000_32.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -40,7 +40,7 @@
 #include <asm/smp.h>
 #include <asm/atomic.h>
 #include <asm/apicdef.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/setup.h>
 
 /*
@@ -387,7 +387,7 @@ void __init es7000_enable_apic_mode(void)
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <asm/mpspec.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
 #include <linux/kernel.h>
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index ef7886353240..91cae6f6e730 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -19,7 +19,7 @@
 #include <linux/dmar.h>
 
 #include <asm/smp.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/ipi.h>
 #include <asm/setup.h>
 
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 36ee760fd133..a7d847636487 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -17,7 +17,7 @@
 #include <linux/init.h>
 #include <linux/hardirq.h>
 #include <asm/smp.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/ipi.h>
 
 #ifdef CONFIG_ACPI
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
index dd6e8d685426..f5e02cffa264 100644
--- a/arch/x86/kernel/genx2apic_cluster.c
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -7,7 +7,7 @@
 #include <linux/dmar.h>
 
 #include <asm/smp.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/ipi.h>
 
 DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
index eb1486bb002e..11eb4cb7ca3f 100644
--- a/arch/x86/kernel/genx2apic_phys.c
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -7,7 +7,7 @@
 #include <linux/dmar.h>
 
 #include <asm/smp.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/ipi.h>
 
 static int x2apic_phys;
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 9ae4a92fac8c..c1746a198bde 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -22,7 +22,7 @@
 #include <linux/proc_fs.h>
 #include <asm/current.h>
 #include <asm/smp.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/ipi.h>
 #include <asm/pgtable.h>
 #include <asm/uv/uv.h>
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index a89878e08a42..00e6071cefc4 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -62,7 +62,7 @@
 #include <asm/uv/uv_hub.h>
 #include <asm/uv/uv_irq.h>
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
 #define __apicdebuginit(type) static type __init
 
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index 1326272cae43..dbf5445727a9 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -15,7 +15,7 @@
 #include <asm/mtrr.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/proto.h>
 #include <asm/ipi.h>
 
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3957776b1930..f13ca1650aaf 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -8,7 +8,7 @@
 #include <linux/smp.h>
 #include <linux/ftrace.h>
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/io_apic.h>
 #include <asm/irq.h>
 #include <asm/idle.h>
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 4beb9a13873d..e4ac7c80e2df 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -212,7 +212,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
 /* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
 void fixup_irqs(void)
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 5c4f55483849..eedfaebe1063 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -46,7 +46,7 @@
 #include <asm/apicdef.h>
 #include <asm/system.h>
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
 /*
  * Put the error code here just in case the user cares:
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 200764453195..7f4d2586972e 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -29,7 +29,7 @@
 #include <asm/setup.h>
 #include <asm/smp.h>
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 /*
  * Checksum an MP configuration block.
  */
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 48b9ca5e088c..bdfad80c3cf1 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -11,7 +11,7 @@
  *  Mikael Pettersson	: PM converted to driver model. Disable/enable API.
  */
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
 #include <linux/nmi.h>
 #include <linux/mm.h>
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index f0f0c2f0596b..5a2d75d1fd44 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -31,7 +31,7 @@
 
 #include <asm/processor.h>
 #include <asm/topology.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/numaq.h>
 #include <asm/setup.h>
 #include <asm/e820.h>
@@ -301,7 +301,7 @@ int __init get_memcfg_numaq(void)
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <asm/mpspec.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
 #include <asm/ipi.h>
diff --git a/arch/x86/kernel/probe_32.c b/arch/x86/kernel/probe_32.c
index b3d5d74e5229..be0d554984a7 100644
--- a/arch/x86/kernel/probe_32.c
+++ b/arch/x86/kernel/probe_32.c
@@ -17,20 +17,20 @@
 #include <asm/fixmap.h>
 #include <asm/mpspec.h>
 #include <asm/apicdef.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/setup.h>
 
 #include <linux/threads.h>
 #include <linux/cpumask.h>
 #include <asm/mpspec.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/smp.h>
 #include <linux/init.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/ipi.h>
 
 #include <linux/smp.h>
@@ -41,7 +41,7 @@
 #include <asm/e820.h>
 #include <asm/setup.h>
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
 #ifdef CONFIG_HOTPLUG_CPU
 #define DEFAULT_SEND_IPI	(1)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 32e8f0af292c..7c8cd447d5ed 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -24,7 +24,7 @@
 # include <asm/iommu.h>
 #endif
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
 /*
  * Power off function, if any
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 43d964411c0d..deaafd2693ee 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -97,7 +97,7 @@
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/paravirt.h>
 #include <asm/hypervisor.h>
 
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index eaaffae31cc0..13f33ea8ccaa 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -26,7 +26,7 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 #include <asm/proto.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 /*
  *	Some notes on x86 processor bugs affecting SMP operation:
  *
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b5f2b698973f..562a9fc3bc34 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -60,12 +60,12 @@
 #include <asm/tlbflush.h>
 #include <asm/mtrr.h>
 #include <asm/vmi.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/setup.h>
 #include <asm/uv/uv.h>
 #include <linux/mc146818rtc.h>
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/smpboot_hooks.h>
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index 1cf32c325d12..eb31ba276bb3 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -40,7 +40,7 @@
 #include <asm/mpspec.h>
 #include <asm/apic.h>
 #include <asm/smp.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
 #include <asm/ipi.h>
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index f396e61bcb34..1a7dfa7cb525 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -15,12 +15,12 @@
 #include <asm/uv/uv_mmrs.h>
 #include <asm/uv/uv_hub.h>
 #include <asm/uv/uv_bau.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/idle.h>
 #include <asm/tsc.h>
 #include <asm/irq_vectors.h>
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
 static struct bau_control	**uv_bau_table_bases __read_mostly;
 static int			uv_bau_retry_limit __read_mostly;
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index 75eb5ec5dd2a..aeef529917e4 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -11,7 +11,7 @@
 #include <linux/module.h>
 #include <linux/irq.h>
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/uv/uv_irq.h>
 
 static void uv_noop(unsigned int irq)
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 4fd646e6dd43..5264fea6c281 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -32,9 +32,9 @@
 #include <asm/e820.h>
 #include <asm/io.h>
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
 #include <linux/kernel_stat.h>
 
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index a1c7b71dc0d0..2cc4a90e2cb3 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -32,7 +32,7 @@
 #include <asm/io.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/processor.h>
 #include <asm/timer.h>
 #include <asm/vmi_time.h>
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 2a5e0e6a7c0d..a4791ef412d1 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -30,7 +30,7 @@
 #include <asm/vmi_time.h>
 #include <asm/arch_hooks.h>
 #include <asm/apicdef.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/timer.h>
 #include <asm/i8253.h>
 #include <asm/irq_vectors.h>
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index bc9893f2c383..f3a5305b8adf 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -55,7 +55,7 @@
 #include <linux/lguest_launcher.h>
 #include <linux/virtio_console.h>
 #include <linux/pm.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/lguest.h>
 #include <asm/paravirt.h>
 #include <asm/param.h>
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 15df1baee100..574c8bc95ef0 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -20,7 +20,7 @@
 #include <asm/proto.h>
 #include <asm/numa.h>
 #include <asm/e820.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/uv/uv.h>
 
 int acpi_numa __initdata;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 14c5af4d11e6..b641349fe077 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -14,7 +14,7 @@
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
 			= { &init_mm, 0, };
 
-#include <asm/genapic.h>
+#include <asm/apic.h>
 /*
  *	Smarter SMP flushing macros.
  *		c/o Linus Torvalds.
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index a32a5c7a8ef6..202864ad49a7 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -19,7 +19,7 @@
 #include <linux/cpu.h>
 #include <asm/nmi.h>
 #include <asm/msr.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
 #include "op_counter.h"
 #include "op_x86_model.h"
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 09a237bc9ef6..4c4a51c90bc2 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -14,7 +14,7 @@
 #include <linux/nmi.h>
 #include <asm/msr.h>
 #include <asm/fixmap.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 
 
 #include "op_x86_model.h"
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 5ebd8f605d76..e9f80c744cf3 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -16,7 +16,7 @@
 #include <linux/slab.h>
 #include <asm/ptrace.h>
 #include <asm/msr.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/nmi.h>
 #include <asm/intel_arch_perfmon.h>
 
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 5601e829c387..8eb295e116f6 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -5,7 +5,7 @@
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/nodemask.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/mpspec.h>
 #include <asm/pci_x86.h>
 
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index e3dd3fb67290..86497d5f44cd 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -36,7 +36,7 @@
 #include <xen/hvc-console.h>
 
 #include <asm/paravirt.h>
-#include <asm/genapic.h>
+#include <asm/apic.h>
 #include <asm/page.h>
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
-- 
cgit v1.2.3


From 9976b39b5031bbf76f715893cf080b6a17683881 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Fri, 27 Feb 2009 09:19:26 -0800
Subject: xen: deal with virtually mapped percpu data

The virtually mapped percpu space causes us two problems:

 - for hypercalls which take an mfn, we need to do a full pagetable
   walk to convert the percpu va into an mfn, and

 - when a hypercall requires a page to be mapped RO via all its aliases,
   we need to make sure its RO in both the percpu mapping and in the
   linear mapping

This primarily affects the gdt and the vcpu info structure.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Xen-devel <xen-devel@lists.xensource.com>
Cc: Gerd Hoffmann <kraxel@redhat.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Tejun Heo <htejun@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/xen/page.h |  1 +
 arch/x86/xen/enlighten.c        | 10 ++++++----
 arch/x86/xen/mmu.c              |  7 +++++++
 arch/x86/xen/smp.c              |  6 +++++-
 4 files changed, 19 insertions(+), 5 deletions(-)

(limited to 'arch/x86/xen/enlighten.c')

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 4bd990ee43df..1a918dde46b5 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -164,6 +164,7 @@ static inline pte_t __pte_ma(pteval_t x)
 
 
 xmaddr_t arbitrary_virt_to_machine(void *address);
+unsigned long arbitrary_virt_to_mfn(void *vaddr);
 void make_lowmem_page_readonly(void *vaddr);
 void make_lowmem_page_readwrite(void *vaddr);
 
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 86497d5f44cd..352ea6830659 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -103,7 +103,7 @@ static void xen_vcpu_setup(int cpu)
 
 	vcpup = &per_cpu(xen_vcpu_info, cpu);
 
-	info.mfn = virt_to_mfn(vcpup);
+	info.mfn = arbitrary_virt_to_mfn(vcpup);
 	info.offset = offset_in_page(vcpup);
 
 	printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
@@ -301,8 +301,10 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 	frames = mcs.args;
 
 	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
-		frames[f] = virt_to_mfn(va);
+		frames[f] = arbitrary_virt_to_mfn((void *)va);
+
 		make_lowmem_page_readonly((void *)va);
+		make_lowmem_page_readonly(mfn_to_virt(frames[f]));
 	}
 
 	MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
@@ -314,7 +316,7 @@ static void load_TLS_descriptor(struct thread_struct *t,
 				unsigned int cpu, unsigned int i)
 {
 	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-	xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+	xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
 	struct multicall_space mc = __xen_mc_entry(0);
 
 	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
@@ -488,7 +490,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
 		break;
 
 	default: {
-		xmaddr_t maddr = virt_to_machine(&dt[entry]);
+		xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);
 
 		xen_mc_flush();
 		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 319bd40a57c2..cb6afa4ec95c 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -276,6 +276,13 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
 	p2m_top[topidx][idx] = mfn;
 }
 
+unsigned long arbitrary_virt_to_mfn(void *vaddr)
+{
+	xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
+
+	return PFN_DOWN(maddr.maddr);
+}
+
 xmaddr_t arbitrary_virt_to_machine(void *vaddr)
 {
 	unsigned long address = (unsigned long)vaddr;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 035582ae815d..8d470562ffc9 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -219,6 +219,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 {
 	struct vcpu_guest_context *ctxt;
 	struct desc_struct *gdt;
+	unsigned long gdt_mfn;
 
 	if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
 		return 0;
@@ -248,9 +249,12 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	ctxt->ldt_ents = 0;
 
 	BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+
+	gdt_mfn = arbitrary_virt_to_mfn(gdt);
 	make_lowmem_page_readonly(gdt);
+	make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
 
-	ctxt->gdt_frames[0] = virt_to_mfn(gdt);
+	ctxt->gdt_frames[0] = gdt_mfn;
 	ctxt->gdt_ents      = GDT_ENTRIES;
 
 	ctxt->user_regs.cs = __KERNEL_CS;
-- 
cgit v1.2.3