summaryrefslogtreecommitdiffstats
path: root/arch/x86_64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86_64')
-rw-r--r--arch/x86_64/Kconfig66
-rw-r--r--arch/x86_64/Makefile2
-rw-r--r--arch/x86_64/boot/compressed/head.S11
-rw-r--r--arch/x86_64/boot/compressed/misc.c12
-rw-r--r--arch/x86_64/boot/install.sh2
-rw-r--r--arch/x86_64/boot/setup.S8
-rw-r--r--arch/x86_64/boot/tools/build.c5
-rw-r--r--arch/x86_64/ia32/ia32entry.S2
-rw-r--r--arch/x86_64/kernel/Makefile1
-rw-r--r--arch/x86_64/kernel/acpi/wakeup.S2
-rw-r--r--arch/x86_64/kernel/apic.c46
-rw-r--r--arch/x86_64/kernel/crash.c35
-rw-r--r--arch/x86_64/kernel/e820.c6
-rw-r--r--arch/x86_64/kernel/genapic_flat.c110
-rw-r--r--arch/x86_64/kernel/head.S18
-rw-r--r--arch/x86_64/kernel/i387.c2
-rw-r--r--arch/x86_64/kernel/i8259.c12
-rw-r--r--arch/x86_64/kernel/io_apic.c36
-rw-r--r--arch/x86_64/kernel/irq.c29
-rw-r--r--arch/x86_64/kernel/machine_kexec.c250
-rw-r--r--arch/x86_64/kernel/mce.c10
-rw-r--r--arch/x86_64/kernel/mce_intel.c4
-rw-r--r--arch/x86_64/kernel/nmi.c4
-rw-r--r--arch/x86_64/kernel/process.c31
-rw-r--r--arch/x86_64/kernel/reboot.c62
-rw-r--r--arch/x86_64/kernel/relocate_kernel.S143
-rw-r--r--arch/x86_64/kernel/setup.c47
-rw-r--r--arch/x86_64/kernel/setup64.c6
-rw-r--r--arch/x86_64/kernel/smp.c10
-rw-r--r--arch/x86_64/kernel/smpboot.c317
-rw-r--r--arch/x86_64/kernel/suspend.c18
-rw-r--r--arch/x86_64/kernel/traps.c8
-rw-r--r--arch/x86_64/kernel/vmlinux.lds.S128
-rw-r--r--arch/x86_64/mm/numa.c2
34 files changed, 1186 insertions, 259 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index db259757dc8a..d09437b5c48f 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -207,33 +207,6 @@ config SMP
If you don't know what to do here, say N.
-config PREEMPT
- bool "Preemptible Kernel"
- ---help---
- This option reduces the latency of the kernel when reacting to
- real-time or interactive events by allowing a low priority process to
- be preempted even if it is in kernel mode executing a system call.
- This allows applications to run more reliably even when the system is
- under load. On contrary it may also break your drivers and add
- priority inheritance problems to your system. Don't select it if
- you rely on a stable system or have slightly obscure hardware.
- It's also not very well tested on x86-64 currently.
- You have been warned.
-
- Say Y here if you are feeling brave and building a kernel for a
- desktop, embedded or real-time system. Say N if you are unsure.
-
-config PREEMPT_BKL
- bool "Preempt The Big Kernel Lock"
- depends on PREEMPT
- default y
- help
- This option reduces the latency of the kernel by making the
- big kernel lock preemptible.
-
- Say Y here if you are building a kernel for a desktop system.
- Say N if you are unsure.
-
config SCHED_SMT
bool "SMT (Hyperthreading) scheduler support"
depends on SMP
@@ -244,6 +217,8 @@ config SCHED_SMT
cost of slightly increased overhead in some places. If unsure say
N here.
+source "kernel/Kconfig.preempt"
+
config K8_NUMA
bool "K8 NUMA support"
select NUMA
@@ -313,6 +288,15 @@ config NR_CPUS
This is purely to save memory - each supported CPU requires
memory in the static kernel configuration.
+config HOTPLUG_CPU
+ bool "Support for hot-pluggable CPUs (EXPERIMENTAL)"
+ depends on SMP && HOTPLUG && EXPERIMENTAL
+ help
+ Say Y here to experiment with turning CPUs off and on. CPUs
+ can be controlled through /sys/devices/system/cpu/cpu#.
+ Say N if you want to disable CPU hotplug.
+
+
config HPET_TIMER
bool
default y
@@ -385,6 +369,34 @@ config X86_MCE_INTEL
Additional support for intel specific MCE features such as
the thermal monitor.
+config PHYSICAL_START
+ hex "Physical address where the kernel is loaded" if EMBEDDED
+ default "0x100000"
+ help
+ This gives the physical address where the kernel is loaded.
+ Primarily used in the case of kexec on panic where the
+ fail safe kernel needs to run at a different address than
+ the panic-ed kernel.
+
+ Don't change this unless you know what you are doing.
+
+config KEXEC
+ bool "kexec system call (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ kexec is a system call that implements the ability to shutdown your
+ current kernel, and to start another kernel. It is like a reboot
+ but it is indepedent of the system firmware. And like a reboot
+ you can start any kernel with it, not just Linux.
+
+ The name comes from the similiarity to the exec system call.
+
+ It is an ongoing process to be certain the hardware in a machine
+ is properly shutdown, so do not be surprised if this code does not
+ initially work for you. It may help to enable device hotplugging
+ support. As of this writing the exact hardware interface is
+ strongly in flux, so no good recommendation can be made.
+
config SECCOMP
bool "Enable seccomp to safely compute untrusted bytecode"
depends on PROC_FS
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 6f90c246c418..8a73794f9b90 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -35,7 +35,7 @@ export IA32_CC IA32_LD IA32_AS IA32_OBJCOPY IA32_CPP
LDFLAGS := -m elf_x86_64
OBJCOPYFLAGS := -O binary -R .note -R .comment -S
-LDFLAGS_vmlinux := -e stext
+LDFLAGS_vmlinux :=
CHECKFLAGS += -D__x86_64__ -m64
diff --git a/arch/x86_64/boot/compressed/head.S b/arch/x86_64/boot/compressed/head.S
index 27264dbd575c..6f55565e4d42 100644
--- a/arch/x86_64/boot/compressed/head.S
+++ b/arch/x86_64/boot/compressed/head.S
@@ -2,8 +2,6 @@
* linux/boot/head.S
*
* Copyright (C) 1991, 1992, 1993 Linus Torvalds
- *
- * $Id: head.S,v 1.3 2001/04/20 00:59:28 ak Exp $
*/
/*
@@ -21,13 +19,14 @@
*/
/*
- * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
+ * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
*/
.code32
.text
#include <linux/linkage.h>
#include <asm/segment.h>
+#include <asm/page.h>
.code32
.globl startup_32
@@ -77,7 +76,7 @@ startup_32:
jnz 3f
addl $8,%esp
xorl %ebx,%ebx
- ljmp $(__KERNEL_CS), $0x100000
+ ljmp $(__KERNEL_CS), $__PHYSICAL_START
/*
* We come here, if we were loaded high.
@@ -103,7 +102,7 @@ startup_32:
popl %ecx # lcount
popl %edx # high_buffer_start
popl %eax # hcount
- movl $0x100000,%edi
+ movl $__PHYSICAL_START,%edi
cli # make sure we don't get interrupted
ljmp $(__KERNEL_CS), $0x1000 # and jump to the move routine
@@ -128,7 +127,7 @@ move_routine_start:
movsl
movl %ebx,%esi # Restore setup pointer
xorl %ebx,%ebx
- ljmp $(__KERNEL_CS), $0x100000
+ ljmp $(__KERNEL_CS), $__PHYSICAL_START
move_routine_end:
diff --git a/arch/x86_64/boot/compressed/misc.c b/arch/x86_64/boot/compressed/misc.c
index c8b9216f9e63..b38d5b8b5fb8 100644
--- a/arch/x86_64/boot/compressed/misc.c
+++ b/arch/x86_64/boot/compressed/misc.c
@@ -11,6 +11,7 @@
#include "miscsetup.h"
#include <asm/io.h>
+#include <asm/page.h>
/*
* gzip declarations
@@ -92,8 +93,11 @@ static unsigned long output_ptr = 0;
static void *malloc(int size);
static void free(void *where);
+void* memset(void* s, int c, unsigned n);
+void* memcpy(void* dest, const void* src, unsigned n);
+
static void putstr(const char *);
-
+
extern int end;
static long free_mem_ptr = (long)&end;
static long free_mem_end_ptr;
@@ -284,7 +288,7 @@ void setup_normal_output_buffer(void)
#else
if ((ALT_MEM_K > EXT_MEM_K ? ALT_MEM_K : EXT_MEM_K) < 1024) error("Less than 2MB of memory");
#endif
- output_data = (char *)0x100000; /* Points to 1M */
+ output_data = (char *)__PHYSICAL_START; /* Normally Points to 1M */
free_mem_end_ptr = (long)real_mode;
}
@@ -307,8 +311,8 @@ void setup_output_buffer_if_we_run_high(struct moveparams *mv)
low_buffer_size = low_buffer_end - LOW_BUFFER_START;
high_loaded = 1;
free_mem_end_ptr = (long)high_buffer_start;
- if ( (0x100000 + low_buffer_size) > ((ulg)high_buffer_start)) {
- high_buffer_start = (uch *)(0x100000 + low_buffer_size);
+ if ( (__PHYSICAL_START + low_buffer_size) > ((ulg)high_buffer_start)) {
+ high_buffer_start = (uch *)(__PHYSICAL_START + low_buffer_size);
mv->hcount = 0; /* say: we need not to move high_buffer */
}
else mv->hcount = -1;
diff --git a/arch/x86_64/boot/install.sh b/arch/x86_64/boot/install.sh
index f17b40dfc0f4..198af15a7758 100644
--- a/arch/x86_64/boot/install.sh
+++ b/arch/x86_64/boot/install.sh
@@ -1,6 +1,6 @@
#!/bin/sh
#
-# arch/i386/boot/install.sh
+# arch/x86_64/boot/install.sh
#
# This file is subject to the terms and conditions of the GNU General Public
# License. See the file "COPYING" in the main directory of this archive
diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S
index 75d4d2ad93b3..ff58b2832b75 100644
--- a/arch/x86_64/boot/setup.S
+++ b/arch/x86_64/boot/setup.S
@@ -33,7 +33,7 @@
* Transcribed from Intel (as86) -> AT&T (gas) by Chris Noe, May 1999.
* <stiker@northlink.com>
*
- * Fix to work around buggy BIOSes which dont use carry bit correctly
+ * Fix to work around buggy BIOSes which don't use carry bit correctly
* and/or report extended memory in CX/DX for e801h memory size detection
* call. As a result the kernel got wrong figures. The int15/e801h docs
* from Ralf Brown interrupt list seem to indicate AX/BX should be used
@@ -383,7 +383,7 @@ sse_ok:
# a whole bunch of different types, and allows memory holes and
# everything. We scan through this memory map and build a list
# of the first 32 memory areas, which we return at [E820MAP].
-# This is documented at http://www.teleport.com/~acpi/acpihtml/topic245.htm
+# This is documented at http://www.acpi.info/, in the ACPI 2.0 specification.
#define SMAP 0x534d4150
@@ -436,7 +436,7 @@ bail820:
meme801:
stc # fix to work around buggy
- xorw %cx,%cx # BIOSes which dont clear/set
+ xorw %cx,%cx # BIOSes which don't clear/set
xorw %dx,%dx # carry on pass/error of
# e801h memory size call
# or merely pass cx,dx though
@@ -733,7 +733,7 @@ flush_instr:
#
# but we yet haven't reloaded the CS register, so the default size
# of the target offset still is 16 bit.
-# However, using an operant prefix (0x66), the CPU will properly
+# However, using an operand prefix (0x66), the CPU will properly
# take our 48 bit far pointer. (INTeL 80386 Programmer's Reference
# Manual, Mixing 16-bit and 32-bit code, page 16-6)
diff --git a/arch/x86_64/boot/tools/build.c b/arch/x86_64/boot/tools/build.c
index c2fa66313170..18b5bac1c428 100644
--- a/arch/x86_64/boot/tools/build.c
+++ b/arch/x86_64/boot/tools/build.c
@@ -1,6 +1,4 @@
/*
- * $Id: build.c,v 1.3 2001/06/26 15:14:50 pavel Exp $
- *
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright (C) 1997 Martin Mares
*/
@@ -8,7 +6,8 @@
/*
* This file builds a disk-image from three different files:
*
- * - bootsect: exactly 512 bytes of 8086 machine code, loads the rest
+ * - bootsect: compatibility mbr which prints an error message if
+ * someone tries to boot the kernel directly.
* - setup: 8086 machine code, sets up system parm
* - system: 80386 code for actual system
*
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index f3ca0db85b5b..cc935427d532 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -589,7 +589,7 @@ ia32_sys_call_table:
.quad compat_sys_mq_timedreceive /* 280 */
.quad compat_sys_mq_notify
.quad compat_sys_mq_getsetattr
- .quad quiet_ni_syscall /* reserved for kexec */
+ .quad compat_sys_kexec_load /* reserved for kexec */
.quad compat_sys_waitid
.quad quiet_ni_syscall /* sys_altroot */
.quad sys_add_key
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 5ca4a4598fda..48f9e2c19cd6 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_SMP) += smp.o smpboot.o trampoline.o
obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o mpparse.o \
genapic.o genapic_cluster.o genapic_flat.o
+obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o
obj-$(CONFIG_PM) += suspend.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o
obj-$(CONFIG_CPU_FREQ) += cpufreq/
diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S
index a4c630034cd4..185faa911db5 100644
--- a/arch/x86_64/kernel/acpi/wakeup.S
+++ b/arch/x86_64/kernel/acpi/wakeup.S
@@ -67,7 +67,7 @@ wakeup_code:
shll $4, %eax
addl $(gdta - wakeup_code), %eax
movl %eax, gdt_48a +2 - wakeup_code
- lgdt %ds:gdt_48a - wakeup_code # load gdt with whatever is
+ lgdtl %ds:gdt_48a - wakeup_code # load gdt with whatever is
# appropriate
movl $1, %eax # protected mode (PE) bit
diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index f8e6cc4fecd4..375d369570ca 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -133,7 +133,7 @@ void __init connect_bsp_APIC(void)
}
}
-void disconnect_bsp_APIC(void)
+void disconnect_bsp_APIC(int virt_wire_setup)
{
if (pic_mode) {
/*
@@ -146,6 +146,42 @@ void disconnect_bsp_APIC(void)
outb(0x70, 0x22);
outb(0x00, 0x23);
}
+ else {
+ /* Go back to Virtual Wire compatibility mode */
+ unsigned long value;
+
+ /* For the spurious interrupt use vector F, and enable it */
+ value = apic_read(APIC_SPIV);
+ value &= ~APIC_VECTOR_MASK;
+ value |= APIC_SPIV_APIC_ENABLED;
+ value |= 0xf;
+ apic_write_around(APIC_SPIV, value);
+
+ if (!virt_wire_setup) {
+ /* For LVT0 make it edge triggered, active high, external and enabled */
+ value = apic_read(APIC_LVT0);
+ value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED );
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
+ apic_write_around(APIC_LVT0, value);
+ }
+ else {
+ /* Disable LVT0 */
+ apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
+ }
+
+ /* For LVT1 make it edge triggered, active high, nmi and enabled */
+ value = apic_read(APIC_LVT1);
+ value &= ~(
+ APIC_MODE_MASK | APIC_SEND_PENDING |
+ APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
+ APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
+ value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
+ value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
+ apic_write_around(APIC_LVT1, value);
+ }
}
void disable_local_APIC(void)
@@ -285,7 +321,7 @@ void __init init_bsp_APIC(void)
apic_write_around(APIC_LVT1, value);
}
-void __init setup_local_APIC (void)
+void __cpuinit setup_local_APIC (void)
{
unsigned int value, ver, maxlvt;
@@ -534,7 +570,7 @@ static struct sys_device device_lapic = {
.cls = &lapic_sysclass,
};
-static void __init apic_pm_activate(void)
+static void __cpuinit apic_pm_activate(void)
{
apic_pm_state.active = 1;
}
@@ -774,14 +810,14 @@ void __init setup_boot_APIC_clock (void)
local_irq_enable();
}
-void __init setup_secondary_APIC_clock(void)
+void __cpuinit setup_secondary_APIC_clock(void)
{
local_irq_disable(); /* FIXME: Do we need this? --RR */
setup_APIC_timer(calibration_result);
local_irq_enable();
}
-void __init disable_APIC_timer(void)
+void __cpuinit disable_APIC_timer(void)
{
if (using_apic_timer) {
unsigned long v;
diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c
new file mode 100644
index 000000000000..d7fa4248501c
--- /dev/null
+++ b/arch/x86_64/kernel/crash.c
@@ -0,0 +1,35 @@
+/*
+ * Architecture specific (x86_64) functions for kexec based crash dumps.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ *
+ * Copyright (C) IBM Corporation, 2004. All rights reserved.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/irq.h>
+#include <linux/reboot.h>
+#include <linux/kexec.h>
+
+#include <asm/processor.h>
+#include <asm/hardirq.h>
+#include <asm/nmi.h>
+#include <asm/hw_irq.h>
+
+note_buf_t crash_notes[NR_CPUS];
+
+void machine_crash_shutdown(struct pt_regs *regs)
+{
+ /* This function is only called after the system
+ * has paniced or is otherwise in a critical state.
+ * The minimum amount of code to allow a kexec'd kernel
+ * to run successfully needs to happen here.
+ *
+ * In practice this means shooting down the other cpus in
+ * an SMP system.
+ */
+}
diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c
index 7c154dfff64a..6ded3a50dfe6 100644
--- a/arch/x86_64/kernel/e820.c
+++ b/arch/x86_64/kernel/e820.c
@@ -16,6 +16,7 @@
#include <linux/bootmem.h>
#include <linux/ioport.h>
#include <linux/string.h>
+#include <linux/kexec.h>
#include <asm/page.h>
#include <asm/e820.h>
#include <asm/proto.h>
@@ -191,8 +192,6 @@ void __init e820_reserve_resources(void)
int i;
for (i = 0; i < e820.nr_map; i++) {
struct resource *res;
- if (e820.map[i].addr + e820.map[i].size > 0x100000000ULL)
- continue;
res = alloc_bootmem_low(sizeof(struct resource));
switch (e820.map[i].type) {
case E820_RAM: res->name = "System RAM"; break;
@@ -212,6 +211,9 @@ void __init e820_reserve_resources(void)
*/
request_resource(res, &code_resource);
request_resource(res, &data_resource);
+#ifdef CONFIG_KEXEC
+ request_resource(res, &crashk_res);
+#endif
}
}
}
diff --git a/arch/x86_64/kernel/genapic_flat.c b/arch/x86_64/kernel/genapic_flat.c
index b4cbbad04226..282846965080 100644
--- a/arch/x86_64/kernel/genapic_flat.c
+++ b/arch/x86_64/kernel/genapic_flat.c
@@ -7,6 +7,8 @@
* Hacked for x86-64 by James Cleverdon from i386 architecture code by
* Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
* James Cleverdon.
+ * Ashok Raj <ashok.raj@intel.com>
+ * Removed IPI broadcast shortcut to support CPU hotplug
*/
#include <linux/config.h>
#include <linux/threads.h>
@@ -18,6 +20,46 @@
#include <asm/smp.h>
#include <asm/ipi.h>
+/*
+ * The following permit choosing broadcast IPI shortcut v.s sending IPI only
+ * to online cpus via the send_IPI_mask varient.
+ * The mask version is my preferred option, since it eliminates a lot of
+ * other extra code that would need to be written to cleanup intrs sent
+ * to a CPU while offline.
+ *
+ * Sending broadcast introduces lots of trouble in CPU hotplug situations.
+ * These IPI's are delivered to cpu's irrespective of their offline status
+ * and could pickup stale intr data when these CPUS are turned online.
+ *
+ * Not using broadcast is a cleaner approach IMO, but Andi Kleen disagrees with
+ * the idea of not using broadcast IPI's anymore. Hence the run time check
+ * is introduced, on his request so we can choose an alternate mechanism.
+ *
+ * Initial wacky performance tests that collect cycle counts show
+ * no increase in using mask v.s broadcast version. In fact they seem
+ * identical in terms of cycle counts.
+ *
+ * if we need to use broadcast, we need to do the following.
+ *
+ * cli;
+ * hold call_lock;
+ * clear any pending IPI, just ack and clear all pending intr
+ * set cpu_online_map;
+ * release call_lock;
+ * sti;
+ *
+ * The complicated dummy irq processing shown above is not required if
+ * we didnt sent IPI's to wrong CPU's in the first place.
+ *
+ * - Ashok Raj <ashok.raj@intel.com>
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+#define DEFAULT_SEND_IPI (1)
+#else
+#define DEFAULT_SEND_IPI (0)
+#endif
+
+static int no_broadcast=DEFAULT_SEND_IPI;
static cpumask_t flat_target_cpus(void)
{
@@ -45,22 +87,6 @@ static void flat_init_apic_ldr(void)
apic_write_around(APIC_LDR, val);
}
-static void flat_send_IPI_allbutself(int vector)
-{
- /*
- * if there are no other CPUs in the system then
- * we get an APIC send error if we try to broadcast.
- * thus we have to avoid sending IPIs in this case.
- */
- if (num_online_cpus() > 1)
- __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
-}
-
-static void flat_send_IPI_all(int vector)
-{
- __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
-}
-
static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
{
unsigned long mask = cpus_addr(cpumask)[0];
@@ -93,6 +119,39 @@ static void flat_send_IPI_mask(cpumask_t cpumask, int vector)
local_irq_restore(flags);
}
+static inline void __local_flat_send_IPI_allbutself(int vector)
+{
+ if (no_broadcast) {
+ cpumask_t mask = cpu_online_map;
+ int this_cpu = get_cpu();
+
+ cpu_clear(this_cpu, mask);
+ flat_send_IPI_mask(mask, vector);
+ put_cpu();
+ }
+ else
+ __send_IPI_shortcut(APIC_DEST_ALLBUT, vector, APIC_DEST_LOGICAL);
+}
+
+static inline void __local_flat_send_IPI_all(int vector)
+{
+ if (no_broadcast)
+ flat_send_IPI_mask(cpu_online_map, vector);
+ else
+ __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
+}
+
+static void flat_send_IPI_allbutself(int vector)
+{
+ if (((num_online_cpus()) - 1) >= 1)
+ __local_flat_send_IPI_allbutself(vector);
+}
+
+static void flat_send_IPI_all(int vector)
+{
+ __local_flat_send_IPI_all(vector);
+}
+
static int flat_apic_id_registered(void)
{
return physid_isset(GET_APIC_ID(apic_read(APIC_ID)), phys_cpu_present_map);
@@ -111,6 +170,16 @@ static unsigned int phys_pkg_id(int index_msb)
return ((ebx >> 24) & 0xFF) >> index_msb;
}
+static __init int no_ipi_broadcast(char *str)
+{
+ get_option(&str, &no_broadcast);
+ printk ("Using %s mode\n", no_broadcast ? "No IPI Broadcast" :
+ "IPI Broadcast");
+ return 1;
+}
+
+__setup("no_ipi_broadcast", no_ipi_broadcast);
+
struct genapic apic_flat = {
.name = "flat",
.int_delivery_mode = dest_LowestPrio,
@@ -125,3 +194,12 @@ struct genapic apic_flat = {
.cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
.phys_pkg_id = phys_pkg_id,
};
+
+static int __init print_ipi_mode(void)
+{
+ printk ("Using IPI %s mode\n", no_broadcast ? "No-Shortcut" :
+ "Shortcut");
+ return 0;
+}
+
+late_initcall(print_ipi_mode);
diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index 9bd2e7a4b81e..8d765aa77a26 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -248,23 +248,23 @@ ENTRY(_stext)
*/
.org 0x1000
ENTRY(init_level4_pgt)
- .quad 0x0000000000102007 /* -> level3_ident_pgt */
+ .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */
.fill 255,8,0
- .quad 0x000000000010a007
+ .quad 0x000000000000a007 + __PHYSICAL_START
.fill 254,8,0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad 0x0000000000103007 /* -> level3_kernel_pgt */
+ .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */
.org 0x2000
ENTRY(level3_ident_pgt)
- .quad 0x0000000000104007
+ .quad 0x0000000000004007 + __PHYSICAL_START
.fill 511,8,0
.org 0x3000
ENTRY(level3_kernel_pgt)
.fill 510,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
- .quad 0x0000000000105007 /* -> level2_kernel_pgt */
+ .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt */
.fill 1,8,0
.org 0x4000
@@ -337,17 +337,17 @@ ENTRY(empty_bad_pmd_table)
.org 0xa000
ENTRY(level3_physmem_pgt)
- .quad 0x0000000000105007 /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
+ .quad 0x0000000000005007 + __PHYSICAL_START /* -> level2_kernel_pgt (so that __va works even before pagetable_init) */
.org 0xb000
#ifdef CONFIG_ACPI_SLEEP
ENTRY(wakeup_level4_pgt)
- .quad 0x0000000000102007 /* -> level3_ident_pgt */
+ .quad 0x0000000000002007 + __PHYSICAL_START /* -> level3_ident_pgt */
.fill 255,8,0
- .quad 0x000000000010a007
+ .quad 0x000000000000a007 + __PHYSICAL_START
.fill 254,8,0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
- .quad 0x0000000000103007 /* -> level3_kernel_pgt */
+ .quad 0x0000000000003007 + __PHYSICAL_START /* -> level3_kernel_pgt */
#endif
.data
diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c
index ba139cac57ce..d9b22b633e39 100644
--- a/arch/x86_64/kernel/i387.c
+++ b/arch/x86_64/kernel/i387.c
@@ -42,7 +42,7 @@ void mxcsr_feature_mask_init(void)
* Called at bootup to set up the initial FPU state that is later cloned
* into all processes.
*/
-void __init fpu_init(void)
+void __cpuinit fpu_init(void)
{
unsigned long oldcr0 = read_cr0();
extern void __bad_fxsave_alignment(void);
diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 19eafa0aa95c..a89169095129 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -414,10 +414,22 @@ static int i8259A_suspend(struct sys_device *dev, pm_message_t state)
return 0;
}
+static int i8259A_shutdown(struct sys_device *dev)
+{
+ /* Put the i8259A into a quiescent state that
+ * the kernel initialization code can get it
+ * out of.
+ */
+ outb(0xff, 0x21); /* mask all of 8259A-1 */
+ outb(0xff, 0xA1); /* mask all of 8259A-1 */
+ return 0;
+}
+
static struct sysdev_class i8259_sysdev_class = {
set_kset_name("i8259"),
.suspend = i8259A_suspend,
.resume = i8259A_resume,
+ .shutdown = i8259A_shutdown,
};
static struct sys_device device_i8259A = {
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index afd87e64d0a8..157190d986bb 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -330,7 +330,7 @@ static int find_irq_entry(int apic, int pin, int type)
/*
* Find the pin to which IRQ[irq] (ISA) is connected
*/
-static int __init find_isa_irq_pin(int irq, int type)
+static int find_isa_irq_pin(int irq, int type)
{
int i;
@@ -1132,12 +1132,44 @@ static void __init enable_IO_APIC(void)
*/
void disable_IO_APIC(void)
{
+ int pin;
/*
* Clear the IO-APIC before rebooting:
*/
clear_IO_APIC();
- disconnect_bsp_APIC();
+ /*
+ * If the i82559 is routed through an IOAPIC
+ * Put that IOAPIC in virtual wire mode
+ * so legacy interrups can be delivered.
+ */
+ pin = find_isa_irq_pin(0, mp_ExtINT);
+ if (pin != -1) {
+ struct IO_APIC_route_entry entry;
+ unsigned long flags;
+
+ memset(&entry, 0, sizeof(entry));
+ entry.mask = 0; /* Enabled */
+ entry.trigger = 0; /* Edge */
+ entry.irr = 0;
+ entry.polarity = 0; /* High */
+ entry.delivery_status = 0;
+ entry.dest_mode = 0; /* Physical */
+ entry.delivery_mode = 7; /* ExtInt */
+ entry.vector = 0;
+ entry.dest.physical.physical_dest = 0;
+
+
+ /*
+ * Add it to the IO-APIC irq-routing table:
+ */
+ spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
+ io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+ spin_unlock_irqrestore(&ioapic_lock, flags);
+ }
+
+ disconnect_bsp_APIC(pin != -1);
}
/*
diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c
index 62b112e4deb4..cc3fb85f5145 100644
--- a/arch/x86_64/kernel/irq.c
+++ b/arch/x86_64/kernel/irq.c
@@ -14,6 +14,7 @@
#include <linux/interrupt.h>
#include <linux/seq_file.h>
#include <linux/module.h>
+#include <linux/delay.h>
#include <asm/uaccess.h>
#include <asm/io_apic.h>
@@ -106,3 +107,31 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
return 1;
}
+#ifdef CONFIG_HOTPLUG_CPU
+void fixup_irqs(cpumask_t map)
+{
+ unsigned int irq;
+ static int warned;
+
+ for (irq = 0; irq < NR_IRQS; irq++) {
+ cpumask_t mask;
+ if (irq == 2)
+ continue;
+
+ cpus_and(mask, irq_affinity[irq], map);
+ if (any_online_cpu(mask) == NR_CPUS) {
+ printk("Breaking affinity for irq %i\n", irq);
+ mask = map;
+ }
+ if (irq_desc[irq].handler->set_affinity)
+ irq_desc[irq].handler->set_affinity(irq, mask);
+ else if (irq_desc[irq].action && !(warned++))
+ printk("Cannot set affinity for irq %i\n", irq);
+ }
+
+ /* That doesn't seem sufficient. Give it 1ms. */
+ local_irq_enable();
+ mdelay(1);
+ local_irq_disable();
+}
+#endif
diff --git a/arch/x86_64/kernel/machine_kexec.c b/arch/x86_64/kernel/machine_kexec.c
new file mode 100644
index 000000000000..60d1eff41567
--- /dev/null
+++ b/arch/x86_64/kernel/machine_kexec.c
@@ -0,0 +1,250 @@
+/*
+ * machine_kexec.c - handle transition of Linux booting another kernel
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/kexec.h>
+#include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/reboot.h>
+#include <asm/pda.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/io.h>
+#include <asm/apic.h>
+#include <asm/cpufeature.h>
+#include <asm/hw_irq.h>
+
+#define LEVEL0_SIZE (1UL << 12UL)
+#define LEVEL1_SIZE (1UL << 21UL)
+#define LEVEL2_SIZE (1UL << 30UL)
+#define LEVEL3_SIZE (1UL << 39UL)
+#define LEVEL4_SIZE (1UL << 48UL)
+
+#define L0_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L1_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE)
+#define L2_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+#define L3_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
+
+static void init_level2_page(u64 *level2p, unsigned long addr)
+{
+ unsigned long end_addr;
+
+ addr &= PAGE_MASK;
+ end_addr = addr + LEVEL2_SIZE;
+ while (addr < end_addr) {
+ *(level2p++) = addr | L1_ATTR;
+ addr += LEVEL1_SIZE;
+ }
+}
+
+static int init_level3_page(struct kimage *image, u64 *level3p,
+ unsigned long addr, unsigned long last_addr)
+{
+ unsigned long end_addr;
+ int result;
+
+ result = 0;
+ addr &= PAGE_MASK;
+ end_addr = addr + LEVEL3_SIZE;
+ while ((addr < last_addr) && (addr < end_addr)) {
+ struct page *page;
+ u64 *level2p;
+
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ level2p = (u64 *)page_address(page);
+ init_level2_page(level2p, addr);
+ *(level3p++) = __pa(level2p) | L2_ATTR;
+ addr += LEVEL2_SIZE;
+ }
+ /* clear the unused entries */
+ while (addr < end_addr) {
+ *(level3p++) = 0;
+ addr += LEVEL2_SIZE;
+ }
+out:
+ return result;
+}
+
+
+static int init_level4_page(struct kimage *image, u64 *level4p,
+ unsigned long addr, unsigned long last_addr)
+{
+ unsigned long end_addr;
+ int result;
+
+ result = 0;
+ addr &= PAGE_MASK;
+ end_addr = addr + LEVEL4_SIZE;
+ while ((addr < last_addr) && (addr < end_addr)) {
+ struct page *page;
+ u64 *level3p;
+
+ page = kimage_alloc_control_pages(image, 0);
+ if (!page) {
+ result = -ENOMEM;
+ goto out;
+ }
+ level3p = (u64 *)page_address(page);
+ result = init_level3_page(image, level3p, addr, last_addr);
+ if (result) {
+ goto out;
+ }
+ *(level4p++) = __pa(level3p) | L3_ATTR;
+ addr += LEVEL3_SIZE;
+ }
+ /* clear the unused entries */
+ while (addr < end_addr) {
+ *(level4p++) = 0;
+ addr += LEVEL3_SIZE;
+ }
+out:
+ return result;
+}
+
+
+static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
+{
+ u64 *level4p;
+ level4p = (u64 *)__va(start_pgtable);
+ return init_level4_page(image, level4p, 0, end_pfn << PAGE_SHIFT);
+}
+
+static void set_idt(void *newidt, u16 limit)
+{
+ unsigned char curidt[10];
+
+ /* x86-64 supports unaliged loads & stores */
+ (*(u16 *)(curidt)) = limit;
+ (*(u64 *)(curidt +2)) = (unsigned long)(newidt);
+
+ __asm__ __volatile__ (
+ "lidt %0\n"
+ : "=m" (curidt)
+ );
+};
+
+
+static void set_gdt(void *newgdt, u16 limit)
+{
+ unsigned char curgdt[10];
+
+ /* x86-64 supports unaligned loads & stores */
+ (*(u16 *)(curgdt)) = limit;
+ (*(u64 *)(curgdt +2)) = (unsigned long)(newgdt);
+
+ __asm__ __volatile__ (
+ "lgdt %0\n"
+ : "=m" (curgdt)
+ );
+};
+
+static void load_segments(void)
+{
+ __asm__ __volatile__ (
+ "\tmovl $"STR(__KERNEL_DS)",%eax\n"
+ "\tmovl %eax,%ds\n"
+ "\tmovl %eax,%es\n"
+ "\tmovl %eax,%ss\n"
+ "\tmovl %eax,%fs\n"
+ "\tmovl %eax,%gs\n"
+ );
+#undef STR
+#undef __STR
+}
+
+typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page,
+ unsigned long control_code_buffer,
+ unsigned long start_address,
+ unsigned long pgtable) ATTRIB_NORET;
+
+const extern unsigned char relocate_new_kernel[];
+const extern unsigned long relocate_new_kernel_size;
+
+int machine_kexec_prepare(struct kimage *image)
+{
+ unsigned long start_pgtable, control_code_buffer;
+ int result;
+
+ /* Calculate the offsets */
+ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+ control_code_buffer = start_pgtable + 4096UL;
+
+ /* Setup the identity mapped 64bit page table */
+ result = init_pgtable(image, start_pgtable);
+ if (result)
+ return result;
+
+ /* Place the code in the reboot code buffer */
+ memcpy(__va(control_code_buffer), relocate_new_kernel,
+ relocate_new_kernel_size);
+
+ return 0;
+}
+
+void machine_kexec_cleanup(struct kimage *image)
+{
+ return;
+}
+
+/*
+ * Do not allocate memory (or fail in any way) in machine_kexec().
+ * We are past the point of no return, committed to rebooting now.
+ */
+NORET_TYPE void machine_kexec(struct kimage *image)
+{
+ unsigned long page_list;
+ unsigned long control_code_buffer;
+ unsigned long start_pgtable;
+ relocate_new_kernel_t rnk;
+
+ /* Interrupts aren't acceptable while we reboot */
+ local_irq_disable();
+
+ /* Calculate the offsets */
+ page_list = image->head;
+ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT;
+ control_code_buffer = start_pgtable + 4096UL;
+
+ /* Set the low half of the page table to my identity mapped
+ * page table for kexec. Leave the high half pointing at the
+ * kernel pages. Don't bother to flush the global pages
+ * as that will happen when I fully switch to my identity mapped
+ * page table anyway.
+ */
+ memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2);
+ __flush_tlb();
+
+
+ /* The segment registers are funny things, they are
+ * automatically loaded from a table, in memory wherever you
+ * set them to a specific selector, but this table is never
+ * accessed again unless you set the segment to a different selector.
+ *
+ * The more common model are caches where the behide
+ * the scenes work is done, but is also dropped at arbitrary
+ * times.
+ *
+ * I take advantage of this here by force loading the
+ * segments, before I zap the gdt with an invalid value.
+ */
+ load_segments();
+ /* The gdt & idt are now invalid.
+ * If you want to load them you must set up your own idt & gdt.
+ */
+ set_gdt(phys_to_virt(0),0);
+ set_idt(phys_to_virt(0),0);
+ /* now call it */
+ rnk = (relocate_new_kernel_t) control_code_buffer;
+ (*rnk)(page_list, control_code_buffer, image->start, start_pgtable);
+}
diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c
index 3a89d735a4f6..21e70625a495 100644
--- a/arch/x86_64/kernel/mce.c
+++ b/arch/x86_64/kernel/mce.c
@@ -327,7 +327,7 @@ static void mce_init(void *dummy)
}
/* Add per CPU specific workarounds here */
-static void __init mce_cpu_quirks(struct cpuinfo_x86 *c)
+static void __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
{
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD && c->x86 == 15) {
@@ -337,7 +337,7 @@ static void __init mce_cpu_quirks(struct cpuinfo_x86 *c)
}
}
-static void __init mce_cpu_features(struct cpuinfo_x86 *c)
+static void __cpuinit mce_cpu_features(struct cpuinfo_x86 *c)
{
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
@@ -352,7 +352,7 @@ static void __init mce_cpu_features(struct cpuinfo_x86 *c)
* Called for each booted CPU to set up machine checks.
* Must be called with preempt off.
*/
-void __init mcheck_init(struct cpuinfo_x86 *c)
+void __cpuinit mcheck_init(struct cpuinfo_x86 *c)
{
static cpumask_t mce_cpus __initdata = CPU_MASK_NONE;
@@ -411,7 +411,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff
memset(mcelog.entry, 0, next * sizeof(struct mce));
mcelog.next = 0;
- synchronize_kernel();
+ synchronize_sched();
/* Collect entries that were still getting written before the synchronize. */
@@ -542,7 +542,7 @@ ACCESSOR(bank4ctl,bank[4],mce_restart())
ACCESSOR(tolerant,tolerant,)
ACCESSOR(check_interval,check_interval,mce_restart())
-static __init int mce_init_device(void)
+static __cpuinit int mce_init_device(void)
{
int err;
if (!mce_available(&boot_cpu_data))
diff --git a/arch/x86_64/kernel/mce_intel.c b/arch/x86_64/kernel/mce_intel.c
index 4db9a640069f..0be0a7959814 100644
--- a/arch/x86_64/kernel/mce_intel.c
+++ b/arch/x86_64/kernel/mce_intel.c
@@ -42,7 +42,7 @@ done:
irq_exit();
}
-static void __init intel_init_thermal(struct cpuinfo_x86 *c)
+static void __cpuinit intel_init_thermal(struct cpuinfo_x86 *c)
{
u32 l, h;
int tm2 = 0;
@@ -93,7 +93,7 @@ static void __init intel_init_thermal(struct cpuinfo_x86 *c)
return;
}
-void __init mce_intel_feature_init(struct cpuinfo_x86 *c)
+void __cpuinit mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
}
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 31c0f2e6ac91..4e44d6e6b7e5 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -98,7 +98,7 @@ static unsigned int nmi_p4_cccr_val;
(P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
-static __init inline int nmi_known_cpu(void)
+static __cpuinit inline int nmi_known_cpu(void)
{
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
@@ -110,7 +110,7 @@ static __init inline int nmi_known_cpu(void)
}
/* Run after command line and cpu_init init, but before all other checks */
-void __init nmi_watchdog_default(void)
+void __cpuinit nmi_watchdog_default(void)
{
if (nmi_watchdog != NMI_DEFAULT)
return;
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index e59d1f9d6163..1d91271796e5 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -8,7 +8,8 @@
*
* X86-64 port
* Andi Kleen.
- *
+ *
+ * CPU hotplug support - ashok.raj@intel.com
* $Id: process.c,v 1.38 2002/01/15 10:08:03 ak Exp $
*/
@@ -18,6 +19,7 @@
#include <stdarg.h>
+#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
@@ -154,6 +156,29 @@ void cpu_idle_wait(void)
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
+#ifdef CONFIG_HOTPLUG_CPU
+DECLARE_PER_CPU(int, cpu_state);
+
+#include <asm/nmi.h>
+/* We don't actually take CPU down, just spin without interrupts. */
+static inline void play_dead(void)
+{
+ idle_task_exit();
+ wbinvd();
+ mb();
+ /* Ack it */
+ __get_cpu_var(cpu_state) = CPU_DEAD;
+
+ while (1)
+ safe_halt();
+}
+#else
+static inline void play_dead(void)
+{
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
/*
* The idle thread. There's no useful work to be
* done, so just try to conserve power and have a
@@ -174,6 +199,8 @@ void cpu_idle (void)
idle = pm_idle;
if (!idle)
idle = default_idle;
+ if (cpu_is_offline(smp_processor_id()))
+ play_dead();
idle();
}
@@ -204,7 +231,7 @@ static void mwait_idle(void)
}
}
-void __init select_idle_routine(const struct cpuinfo_x86 *c)
+void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
static int printed;
if (cpu_has(c, X86_FEATURE_MWAIT)) {
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
index be4b36f762cf..57e71dbdfd69 100644
--- a/arch/x86_64/kernel/reboot.c
+++ b/arch/x86_64/kernel/reboot.c
@@ -66,41 +66,47 @@ static int __init reboot_setup(char *str)
__setup("reboot=", reboot_setup);
-#ifdef CONFIG_SMP
-static void smp_halt(void)
+static inline void kb_wait(void)
{
- int cpuid = safe_smp_processor_id();
- static int first_entry = 1;
+ int i;
- if (reboot_force)
- return;
+ for (i=0; i<0x10000; i++)
+ if ((inb_p(0x64) & 0x02) == 0)
+ break;
+}
- if (first_entry) {
- first_entry = 0;
- smp_call_function((void *)machine_restart, NULL, 1, 0);
- }
-
- smp_stop_cpu();
+void machine_shutdown(void)
+{
+ /* Stop the cpus and apics */
+#ifdef CONFIG_SMP
+ int reboot_cpu_id;
- /* AP calling this. Just halt */
- if (cpuid != boot_cpu_id) {
- for (;;)
- asm("hlt");
+ /* The boot cpu is always logical cpu 0 */
+ reboot_cpu_id = 0;
+
+ /* Make certain the cpu I'm about to reboot on is online */
+ if (!cpu_isset(reboot_cpu_id, cpu_online_map)) {
+ reboot_cpu_id = smp_processor_id();
}
- /* Wait for all other CPUs to have run smp_stop_cpu */
- while (!cpus_empty(cpu_online_map))
- rep_nop();
-}
+ /* Make certain I only run on the appropriate processor */
+ set_cpus_allowed(current, cpumask_of_cpu(reboot_cpu_id));
+
+ /* O.K Now that I'm on the appropriate processor,
+ * stop all of the others.
+ */
+ smp_send_stop();
#endif
-static inline void kb_wait(void)
-{
- int i;
+ local_irq_disable();
- for (i=0; i<0x10000; i++)
- if ((inb_p(0x64) & 0x02) == 0)
- break;
+#ifndef CONFIG_SMP
+ disable_local_APIC();
+#endif
+
+ disable_IO_APIC();
+
+ local_irq_enable();
}
void machine_restart(char * __unused)
@@ -109,9 +115,7 @@ void machine_restart(char * __unused)
printk("machine restart\n");
-#ifdef CONFIG_SMP
- smp_halt();
-#endif
+ machine_shutdown();
if (!reboot_force) {
local_irq_disable();
diff --git a/arch/x86_64/kernel/relocate_kernel.S b/arch/x86_64/kernel/relocate_kernel.S
new file mode 100644
index 000000000000..d24fa9b72a2b
--- /dev/null
+++ b/arch/x86_64/kernel/relocate_kernel.S
@@ -0,0 +1,143 @@
+/*
+ * relocate_kernel.S - put the kernel image in place to boot
+ * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/linkage.h>
+
+ /*
+ * Must be relocatable PIC code callable as a C function, that once
+ * it starts can not use the previous processes stack.
+ */
+ .globl relocate_new_kernel
+ .code64
+relocate_new_kernel:
+ /* %rdi page_list
+ * %rsi reboot_code_buffer
+ * %rdx start address
+ * %rcx page_table
+ * %r8 arg5
+ * %r9 arg6
+ */
+
+ /* zero out flags, and disable interrupts */
+ pushq $0
+ popfq
+
+ /* set a new stack at the bottom of our page... */
+ lea 4096(%rsi), %rsp
+
+ /* store the parameters back on the stack */
+ pushq %rdx /* store the start address */
+
+ /* Set cr0 to a known state:
+ * 31 1 == Paging enabled
+ * 18 0 == Alignment check disabled
+ * 16 0 == Write protect disabled
+ * 3 0 == No task switch
+ * 2 0 == Don't do FP software emulation.
+ * 0 1 == Proctected mode enabled
+ */
+ movq %cr0, %rax
+ andq $~((1<<18)|(1<<16)|(1<<3)|(1<<2)), %rax
+ orl $((1<<31)|(1<<0)), %eax
+ movq %rax, %cr0
+
+ /* Set cr4 to a known state:
+ * 10 0 == xmm exceptions disabled
+ * 9 0 == xmm registers instructions disabled
+ * 8 0 == performance monitoring counter disabled
+ * 7 0 == page global disabled
+ * 6 0 == machine check exceptions disabled
+ * 5 1 == physical address extension enabled
+ * 4 0 == page size extensions disabled
+ * 3 0 == Debug extensions disabled
+ * 2 0 == Time stamp disable (disabled)
+ * 1 0 == Protected mode virtual interrupts disabled
+ * 0 0 == VME disabled
+ */
+
+ movq $((1<<5)), %rax
+ movq %rax, %cr4
+
+ jmp 1f
+1:
+
+ /* Switch to the identity mapped page tables,
+ * and flush the TLB.
+ */
+ movq %rcx, %cr3
+
+ /* Do the copies */
+ movq %rdi, %rcx /* Put the page_list in %rcx */
+ xorq %rdi, %rdi
+ xorq %rsi, %rsi
+ jmp 1f
+
+0: /* top, read another word for the indirection page */
+
+ movq (%rbx), %rcx
+ addq $8, %rbx
+1:
+ testq $0x1, %rcx /* is it a destination page? */
+ jz 2f
+ movq %rcx, %rdi
+ andq $0xfffffffffffff000, %rdi
+ jmp 0b
+2:
+ testq $0x2, %rcx /* is it an indirection page? */
+ jz 2f
+ movq %rcx, %rbx
+ andq $0xfffffffffffff000, %rbx
+ jmp 0b
+2:
+ testq $0x4, %rcx /* is it the done indicator? */
+ jz 2f
+ jmp 3f
+2:
+ testq $0x8, %rcx /* is it the source indicator? */
+ jz 0b /* Ignore it otherwise */
+ movq %rcx, %rsi /* For ever source page do a copy */
+ andq $0xfffffffffffff000, %rsi
+
+ movq $512, %rcx
+ rep ; movsq
+ jmp 0b
+3:
+
+ /* To be certain of avoiding problems with self-modifying code
+ * I need to execute a serializing instruction here.
+ * So I flush the TLB by reloading %cr3 here, it's handy,
+ * and not processor dependent.
+ */
+ movq %cr3, %rax
+ movq %rax, %cr3
+
+ /* set all of the registers to known values */
+ /* leave %rsp alone */
+
+ xorq %rax, %rax
+ xorq %rbx, %rbx
+ xorq %rcx, %rcx
+ xorq %rdx, %rdx
+ xorq %rsi, %rsi
+ xorq %rdi, %rdi
+ xorq %rbp, %rbp
+ xorq %r8, %r8
+ xorq %r9, %r9
+ xorq %r10, %r9
+ xorq %r11, %r11
+ xorq %r12, %r12
+ xorq %r13, %r13
+ xorq %r14, %r14
+ xorq %r15, %r15
+
+ ret
+relocate_new_kernel_end:
+
+ .globl relocate_new_kernel_size
+relocate_new_kernel_size:
+ .quad relocate_new_kernel_end - relocate_new_kernel
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 000015dd5a8b..b02d921da4f7 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -41,6 +41,7 @@
#include <linux/kallsyms.h>
#include <linux/edd.h>
#include <linux/mmzone.h>
+#include <linux/kexec.h>
#include <asm/mtrr.h>
#include <asm/uaccess.h>
@@ -367,6 +368,27 @@ static __init void parse_cmdline_early (char ** cmdline_p)
if (!memcmp(from, "noexec=", 7))
nonx_setup(from + 7);
+#ifdef CONFIG_KEXEC
+ /* crashkernel=size@addr specifies the location to reserve for
+ * a crash kernel. By reserving this memory we guarantee
+ * that linux never set's it up as a DMA target.
+ * Useful for holding code to do something appropriate
+ * after a kernel panic.
+ */
+ else if (!memcmp(from, "crashkernel=", 12)) {
+ unsigned long size, base;
+ size = memparse(from+12, &from);
+ if (*from == '@') {
+ base = memparse(from+1, &from);
+ /* FIXME: Do I want a sanity check
+ * to validate the memory range?
+ */
+ crashk_res.start = base;
+ crashk_res.end = base + size - 1;
+ }
+ }
+#endif
+
next_char:
c = *(from++);
if (!c)
@@ -625,6 +647,13 @@ void __init setup_arch(char **cmdline_p)
#endif
sparse_init();
+
+#ifdef CONFIG_KEXEC
+ if (crashk_res.start != crashk_res.end) {
+ reserve_bootmem(crashk_res.start,
+ crashk_res.end - crashk_res.start + 1);
+ }
+#endif
paging_init();
check_ioapic();
@@ -676,7 +705,7 @@ void __init setup_arch(char **cmdline_p)
#endif
}
-static int __init get_model_name(struct cpuinfo_x86 *c)
+static int __cpuinit get_model_name(struct cpuinfo_x86 *c)
{
unsigned int *v;
@@ -692,7 +721,7 @@ static int __init get_model_name(struct cpuinfo_x86 *c)
}
-static void __init display_cacheinfo(struct cpuinfo_x86 *c)
+static void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
{
unsigned int n, dummy, eax, ebx, ecx, edx;
@@ -803,7 +832,7 @@ static int __init init_amd(struct cpuinfo_x86 *c)
return r;
}
-static void __init detect_ht(struct cpuinfo_x86 *c)
+static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
u32 eax, ebx, ecx, edx;
@@ -864,7 +893,7 @@ static void __init detect_ht(struct cpuinfo_x86 *c)
/*
* find out the number of processor cores on the die
*/
-static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c)
+static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
{
unsigned int eax;
@@ -882,7 +911,7 @@ static int __init intel_num_cpu_cores(struct cpuinfo_x86 *c)
return 1;
}
-static void __init init_intel(struct cpuinfo_x86 *c)
+static void __cpuinit init_intel(struct cpuinfo_x86 *c)
{
/* Cache sizes */
unsigned n;
@@ -902,7 +931,7 @@ static void __init init_intel(struct cpuinfo_x86 *c)
c->x86_num_cores = intel_num_cpu_cores(c);
}
-void __init get_cpu_vendor(struct cpuinfo_x86 *c)
+void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
{
char *v = c->x86_vendor_id;
@@ -923,7 +952,7 @@ struct cpu_model_info {
/* Do some early cpuid on the boot CPU to get some parameter that are
needed before check_bugs. Everything advanced is in identify_cpu
below. */
-void __init early_identify_cpu(struct cpuinfo_x86 *c)
+void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
{
u32 tfms;
@@ -977,7 +1006,7 @@ void __init early_identify_cpu(struct cpuinfo_x86 *c)
/*
* This does the hard work of actually picking apart the CPU stuff...
*/
-void __init identify_cpu(struct cpuinfo_x86 *c)
+void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
{
int i;
u32 xlvl;
@@ -1054,7 +1083,7 @@ void __init identify_cpu(struct cpuinfo_x86 *c)
}
-void __init print_cpu_info(struct cpuinfo_x86 *c)
+void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
{
if (c->x86_model_id[0])
printk("%s", c->x86_model_id);
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 678b7ac33b8b..f2b9c6bc999a 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -29,7 +29,7 @@
char x86_boot_params[BOOT_PARAM_SIZE] __initdata = {0,};
-cpumask_t cpu_initialized __initdata = CPU_MASK_NONE;
+cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned;
@@ -171,7 +171,7 @@ void syscall_init(void)
wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
}
-void __init check_efer(void)
+void __cpuinit check_efer(void)
{
unsigned long efer;
@@ -188,7 +188,7 @@ void __init check_efer(void)
* 'CPU state barrier', nothing should get across.
* A lot of state is already set up in PDA init.
*/
-void __init cpu_init (void)
+void __cpuinit cpu_init (void)
{
#ifdef CONFIG_SMP
int cpu = stack_smp_processor_id();
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 1e379ed17b1d..ccae392886af 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -283,6 +283,16 @@ struct call_data_struct {
static struct call_data_struct * call_data;
+void lock_ipi_call_lock(void)
+{
+ spin_lock_irq(&call_lock);
+}
+
+void unlock_ipi_call_lock(void)
+{
+ spin_unlock_irq(&call_lock);
+}
+
/*
* this function sends a 'generic call function' IPI to all other CPUs
* in the system.
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index f1ec0f345941..b969ee128728 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -34,6 +34,7 @@
* Andi Kleen : Converted to new state machine.
* Various cleanups.
* Probably mostly hotplug CPU ready now.
+ * Ashok Raj : CPU hotplug support
*/
@@ -58,11 +59,6 @@
#include <asm/proto.h>
#include <asm/nmi.h>
-/* Change for real CPU hotplug. Note other files need to be fixed
- first too. */
-#define __cpuinit __init
-#define __cpuinitdata __initdata
-
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
/* Package ID of each logical CPU */
@@ -103,6 +99,37 @@ EXPORT_SYMBOL(cpu_core_map);
extern unsigned char trampoline_data[];
extern unsigned char trampoline_end[];
+/* State of each CPU */
+DEFINE_PER_CPU(int, cpu_state) = { 0 };
+
+/*
+ * Store all idle threads, this can be reused instead of creating
+ * a new thread. Also avoids complicated thread destroy functionality
+ * for idle threads.
+ */
+struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
+
+#define get_idle_for_cpu(x) (idle_thread_array[(x)])
+#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
+
+/*
+ * cpu_possible_map should be static, it cannot change as cpu's
+ * are onlined, or offlined. The reason is per-cpu data-structures
+ * are allocated by some modules at init time, and dont expect to
+ * do this dynamically on cpu arrival/departure.
+ * cpu_present_map on the other hand can change dynamically.
+ * In case when cpu_hotplug is not compiled, then we resort to current
+ * behaviour, which is cpu_possible == cpu_present.
+ * If cpu-hotplug is supported, then we need to preallocate for all
+ * those NR_CPUS, hence cpu_possible_map represents entire NR_CPUS range.
+ * - Ashok Raj
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+#define fixup_cpu_possible_map(x) cpu_set((x), cpu_possible_map)
+#else
+#define fixup_cpu_possible_map(x)
+#endif
+
/*
* Currently trivial. Write the real->protected mode
* bootstrap into the page concerned. The caller
@@ -418,6 +445,33 @@ void __cpuinit smp_callin(void)
cpu_set(cpuid, cpu_callin_map);
}
+static inline void set_cpu_sibling_map(int cpu)
+{
+ int i;
+
+ if (smp_num_siblings > 1) {
+ for_each_cpu(i) {
+ if (cpu_core_id[cpu] == cpu_core_id[i]) {
+ cpu_set(i, cpu_sibling_map[cpu]);
+ cpu_set(cpu, cpu_sibling_map[i]);
+ }
+ }
+ } else {
+ cpu_set(cpu, cpu_sibling_map[cpu]);
+ }
+
+ if (current_cpu_data.x86_num_cores > 1) {
+ for_each_cpu(i) {
+ if (phys_proc_id[cpu] == phys_proc_id[i]) {
+ cpu_set(i, cpu_core_map[cpu]);
+ cpu_set(cpu, cpu_core_map[i]);
+ }
+ }
+ } else {
+ cpu_core_map[cpu] = cpu_sibling_map[cpu];
+ }
+}
+
/*
* Setup code on secondary processor (after comming out of the trampoline)
*/
@@ -448,9 +502,28 @@ void __cpuinit start_secondary(void)
enable_APIC_timer();
/*
+ * The sibling maps must be set before turing the online map on for
+ * this cpu
+ */
+ set_cpu_sibling_map(smp_processor_id());
+
+ /*
+ * We need to hold call_lock, so there is no inconsistency
+ * between the time smp_call_function() determines number of
+ * IPI receipients, and the time when the determination is made
+ * for which cpus receive the IPI in genapic_flat.c. Holding this
+ * lock helps us to not include this cpu in a currently in progress
+ * smp_call_function().
+ */
+ lock_ipi_call_lock();
+
+ /*
* Allow the master to continue.
*/
cpu_set(smp_processor_id(), cpu_online_map);
+ per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+ unlock_ipi_call_lock();
+
mb();
/* Wait for TSC sync to not schedule things before.
@@ -628,33 +701,77 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
return (send_status | accept_status);
}
+struct create_idle {
+ struct task_struct *idle;
+ struct completion done;
+ int cpu;
+};
+
+void do_fork_idle(void *_c_idle)
+{
+ struct create_idle *c_idle = _c_idle;
+
+ c_idle->idle = fork_idle(c_idle->cpu);
+ complete(&c_idle->done);
+}
+
/*
* Boot one CPU.
*/
static int __cpuinit do_boot_cpu(int cpu, int apicid)
{
- struct task_struct *idle;
unsigned long boot_error;
int timeout;
unsigned long start_rip;
+ struct create_idle c_idle = {
+ .cpu = cpu,
+ .done = COMPLETION_INITIALIZER(c_idle.done),
+ };
+ DECLARE_WORK(work, do_fork_idle, &c_idle);
+
+ c_idle.idle = get_idle_for_cpu(cpu);
+
+ if (c_idle.idle) {
+ c_idle.idle->thread.rsp = (unsigned long) (((struct pt_regs *)
+ (THREAD_SIZE + (unsigned long) c_idle.idle->thread_info)) - 1);
+ init_idle(c_idle.idle, cpu);
+ goto do_rest;
+ }
+
/*
- * We can't use kernel_thread since we must avoid to
- * reschedule the child.
+ * During cold boot process, keventd thread is not spun up yet.
+ * When we do cpu hot-add, we create idle threads on the fly, we should
+ * not acquire any attributes from the calling context. Hence the clean
+ * way to create kernel_threads() is to do that from keventd().
+ * We do the current_is_keventd() due to the fact that ACPI notifier
+ * was also queuing to keventd() and when the caller is already running
+ * in context of keventd(), we would end up with locking up the keventd
+ * thread.
*/
- idle = fork_idle(cpu);
- if (IS_ERR(idle)) {
+ if (!keventd_up() || current_is_keventd())
+ work.func(work.data);
+ else {
+ schedule_work(&work);
+ wait_for_completion(&c_idle.done);
+ }
+
+ if (IS_ERR(c_idle.idle)) {
printk("failed fork for CPU %d\n", cpu);
- return PTR_ERR(idle);
+ return PTR_ERR(c_idle.idle);
}
- cpu_pda[cpu].pcurrent = idle;
+ set_idle_for_cpu(cpu, c_idle.idle);
+
+do_rest:
+
+ cpu_pda[cpu].pcurrent = c_idle.idle;
start_rip = setup_trampoline();
- init_rsp = idle->thread.rsp;
+ init_rsp = c_idle.idle->thread.rsp;
per_cpu(init_tss,cpu).rsp0 = init_rsp;
initial_code = start_secondary;
- clear_ti_thread_flag(idle->thread_info, TIF_FORK);
+ clear_ti_thread_flag(c_idle.idle->thread_info, TIF_FORK);
printk(KERN_INFO "Booting processor %d/%d rip %lx rsp %lx\n", cpu, apicid,
start_rip, init_rsp);
@@ -746,51 +863,6 @@ cycles_t cacheflush_time;
unsigned long cache_decay_ticks;
/*
- * Construct cpu_sibling_map[], so that we can tell the sibling CPU
- * on SMT systems efficiently.
- */
-static __cpuinit void detect_siblings(void)
-{
- int cpu;
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- cpus_clear(cpu_sibling_map[cpu]);
- cpus_clear(cpu_core_map[cpu]);
- }
-
- for_each_online_cpu (cpu) {
- struct cpuinfo_x86 *c = cpu_data + cpu;
- int siblings = 0;
- int i;
- if (smp_num_siblings > 1) {
- for_each_online_cpu (i) {
- if (cpu_core_id[cpu] == cpu_core_id[i]) {
- siblings++;
- cpu_set(i, cpu_sibling_map[cpu]);
- }
- }
- } else {
- siblings++;
- cpu_set(cpu, cpu_sibling_map[cpu]);
- }
-
- if (siblings != smp_num_siblings) {
- printk(KERN_WARNING
- "WARNING: %d siblings found for CPU%d, should be %d\n",
- siblings, cpu, smp_num_siblings);
- smp_num_siblings = siblings;
- }
- if (c->x86_num_cores > 1) {
- for_each_online_cpu(i) {
- if (phys_proc_id[cpu] == phys_proc_id[i])
- cpu_set(i, cpu_core_map[cpu]);
- }
- } else
- cpu_core_map[cpu] = cpu_sibling_map[cpu];
- }
-}
-
-/*
* Cleanup possible dangling ends...
*/
static __cpuinit void smp_cleanup_boot(void)
@@ -823,7 +895,7 @@ static __cpuinit void smp_cleanup_boot(void)
*
* RED-PEN audit/test this more. I bet there is more state messed up here.
*/
-static __cpuinit void disable_smp(void)
+static __init void disable_smp(void)
{
cpu_present_map = cpumask_of_cpu(0);
cpu_possible_map = cpumask_of_cpu(0);
@@ -838,7 +910,7 @@ static __cpuinit void disable_smp(void)
/*
* Handle user cpus=... parameter.
*/
-static __cpuinit void enforce_max_cpus(unsigned max_cpus)
+static __init void enforce_max_cpus(unsigned max_cpus)
{
int i, k;
k = 0;
@@ -855,7 +927,7 @@ static __cpuinit void enforce_max_cpus(unsigned max_cpus)
/*
* Various sanity checks.
*/
-static int __cpuinit smp_sanity_check(unsigned max_cpus)
+static int __init smp_sanity_check(unsigned max_cpus)
{
if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
printk("weird, boot CPU (#%d) not listed by the BIOS.\n",
@@ -913,7 +985,7 @@ static int __cpuinit smp_sanity_check(unsigned max_cpus)
* Prepare for SMP bootup. The MP table or ACPI has been read
* earlier. Just do some sanity checking here and enable APIC mode.
*/
-void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
+void __init smp_prepare_cpus(unsigned int max_cpus)
{
int i;
@@ -930,10 +1002,9 @@ void __cpuinit smp_prepare_cpus(unsigned int max_cpus)
int apicid = cpu_present_to_apicid(i);
if (physid_isset(apicid, phys_cpu_present_map)) {
cpu_set(i, cpu_present_map);
- /* possible map would be different if we supported real
- CPU hotplug. */
cpu_set(i, cpu_possible_map);
}
+ fixup_cpu_possible_map(i);
}
if (smp_sanity_check(max_cpus) < 0) {
@@ -978,13 +1049,13 @@ void __init smp_prepare_boot_cpu(void)
int me = smp_processor_id();
cpu_set(me, cpu_online_map);
cpu_set(me, cpu_callout_map);
+ cpu_set(0, cpu_sibling_map[0]);
+ cpu_set(0, cpu_core_map[0]);
+ per_cpu(cpu_state, me) = CPU_ONLINE;
}
/*
* Entry point to boot a CPU.
- *
- * This is all __cpuinit, not __devinit for now because we don't support
- * CPU hotplug (yet).
*/
int __cpuinit __cpu_up(unsigned int cpu)
{
@@ -1001,6 +1072,15 @@ int __cpuinit __cpu_up(unsigned int cpu)
return -EINVAL;
}
+ /*
+ * Already booted CPU?
+ */
+ if (cpu_isset(cpu, cpu_callin_map)) {
+ Dprintk("do_boot_cpu %d Already started\n", cpu);
+ return -ENOSYS;
+ }
+
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* Boot it! */
err = do_boot_cpu(cpu, apicid);
if (err < 0) {
@@ -1013,23 +1093,118 @@ int __cpuinit __cpu_up(unsigned int cpu)
while (!cpu_isset(cpu, cpu_online_map))
cpu_relax();
- return 0;
+ err = 0;
+
+ return err;
}
/*
* Finish the SMP boot.
*/
-void __cpuinit smp_cpus_done(unsigned int max_cpus)
+void __init smp_cpus_done(unsigned int max_cpus)
{
+#ifndef CONFIG_HOTPLUG_CPU
zap_low_mappings();
+#endif
smp_cleanup_boot();
#ifdef CONFIG_X86_IO_APIC
setup_ioapic_dest();
#endif
- detect_siblings();
time_init_gtod();
check_nmi_watchdog();
}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void remove_siblinginfo(int cpu)
+{
+ int sibling;
+
+ for_each_cpu_mask(sibling, cpu_sibling_map[cpu])
+ cpu_clear(cpu, cpu_sibling_map[sibling]);
+ for_each_cpu_mask(sibling, cpu_core_map[cpu])
+ cpu_clear(cpu, cpu_core_map[sibling]);
+ cpus_clear(cpu_sibling_map[cpu]);
+ cpus_clear(cpu_core_map[cpu]);
+ phys_proc_id[cpu] = BAD_APICID;
+ cpu_core_id[cpu] = BAD_APICID;
+}
+
+void remove_cpu_from_maps(void)
+{
+ int cpu = smp_processor_id();
+
+ cpu_clear(cpu, cpu_callout_map);
+ cpu_clear(cpu, cpu_callin_map);
+ clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
+}
+
+int __cpu_disable(void)
+{
+ int cpu = smp_processor_id();
+
+ /*
+ * Perhaps use cpufreq to drop frequency, but that could go
+ * into generic code.
+ *
+ * We won't take down the boot processor on i386 due to some
+ * interrupts only being able to be serviced by the BSP.
+ * Especially so if we're not using an IOAPIC -zwane
+ */
+ if (cpu == 0)
+ return -EBUSY;
+
+ disable_APIC_timer();
+
+ /*
+ * HACK:
+ * Allow any queued timer interrupts to get serviced
+ * This is only a temporary solution until we cleanup
+ * fixup_irqs as we do for IA64.
+ */
+ local_irq_enable();
+ mdelay(1);
+
+ local_irq_disable();
+ remove_siblinginfo(cpu);
+
+ /* It's now safe to remove this processor from the online map */
+ cpu_clear(cpu, cpu_online_map);
+ remove_cpu_from_maps();
+ fixup_irqs(cpu_online_map);
+ return 0;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+ /* We don't do anything here: idle task is faking death itself. */
+ unsigned int i;
+
+ for (i = 0; i < 10; i++) {
+ /* They ack this in play_dead by setting CPU_DEAD */
+ if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+ printk ("CPU %d is now offline\n", cpu);
+ return;
+ }
+ current->state = TASK_UNINTERRUPTIBLE;
+ schedule_timeout(HZ/10);
+ }
+ printk(KERN_ERR "CPU %u didn't die...\n", cpu);
+}
+
+#else /* ... !CONFIG_HOTPLUG_CPU */
+
+int __cpu_disable(void)
+{
+ return -ENOSYS;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+ /* We said "no" in __cpu_disable */
+ BUG();
+}
+#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index ebaa1e37d657..6c0f402e3a88 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -44,7 +44,6 @@ void __save_processor_state(struct saved_context *ctxt)
*/
asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
- asm volatile ("sldt %0" : "=m" (ctxt->ldt));
asm volatile ("str %0" : "=m" (ctxt->tr));
/* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
@@ -69,6 +68,7 @@ void __save_processor_state(struct saved_context *ctxt)
asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2));
asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3));
asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4));
+ asm volatile ("movq %%cr8, %0" : "=r" (ctxt->cr8));
}
void save_processor_state(void)
@@ -90,12 +90,20 @@ void __restore_processor_state(struct saved_context *ctxt)
/*
* control registers
*/
+ asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8));
asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4));
asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3));
asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2));
asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0));
/*
+ * now restore the descriptor tables to their proper values
+ * ltr is done i fix_processor_context().
+ */
+ asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
+ asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
+
+ /*
* segment registers
*/
asm volatile ("movw %0, %%ds" :: "r" (ctxt->ds));
@@ -108,14 +116,6 @@ void __restore_processor_state(struct saved_context *ctxt)
wrmsrl(MSR_GS_BASE, ctxt->gs_base);
wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
- /*
- * now restore the descriptor tables to their proper values
- * ltr is done i fix_processor_context().
- */
- asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
- asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
- asm volatile ("lldt %0" :: "m" (ctxt->ldt));
-
fix_processor_context();
do_fpu_end();
diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 121646fc43f6..102736630002 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -586,11 +586,17 @@ static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
asmlinkage void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
+ int cpu;
+
+ cpu = smp_processor_id();
/* Only the BSP gets external NMIs from the system. */
- if (!smp_processor_id())
+ if (!cpu)
reason = get_nmi_reason();
+ if (!cpu_online(cpu))
+ return;
+
if (!(reason & 0xc0)) {
if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 0, SIGINT)
== NOTIFY_STOP)
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 59ebd5beda87..73389f51c4e5 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -2,7 +2,10 @@
* Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
*/
+#define LOAD_OFFSET __START_KERNEL_map
+
#include <asm-generic/vmlinux.lds.h>
+#include <asm/page.h>
#include <linux/config.h>
OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
@@ -11,28 +14,30 @@ ENTRY(phys_startup_64)
jiffies_64 = jiffies;
SECTIONS
{
- . = 0xffffffff80100000;
+ . = __START_KERNEL;
phys_startup_64 = startup_64 - LOAD_OFFSET;
_text = .; /* Text and read-only data */
- .text : {
+ .text : AT(ADDR(.text) - LOAD_OFFSET) {
*(.text)
SCHED_TEXT
LOCK_TEXT
*(.fixup)
*(.gnu.warning)
} = 0x9090
- .text.lock : { *(.text.lock) } /* out-of-line lock text */
+ /* out-of-line lock text */
+ .text.lock : AT(ADDR(.text.lock) - LOAD_OFFSET) { *(.text.lock) }
_etext = .; /* End of text section */
. = ALIGN(16); /* Exception table */
__start___ex_table = .;
- __ex_table : { *(__ex_table) }
+ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) }
__stop___ex_table = .;
RODATA
- .data : { /* Data */
+ /* Data */
+ .data : AT(ADDR(.data) - LOAD_OFFSET) {
*(.data)
CONSTRUCTORS
}
@@ -40,62 +45,95 @@ SECTIONS
_edata = .; /* End of data section */
__bss_start = .; /* BSS */
- .bss : {
+ .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
*(.bss.page_aligned)
*(.bss)
}
__bss_end = .;
+ . = ALIGN(PAGE_SIZE);
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .data.cacheline_aligned : { *(.data.cacheline_aligned) }
+ .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+ *(.data.cacheline_aligned)
+ }
+
+#define VSYSCALL_ADDR (-10*1024*1024)
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095))
+
+#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
+#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
+
+#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
+#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
-#define AFTER(x) BINALIGN(LOADADDR(x) + SIZEOF(x), 16)
-#define BINALIGN(x,y) (((x) + (y) - 1) & ~((y) - 1))
-#define CACHE_ALIGN(x) BINALIGN(x, CONFIG_X86_L1_CACHE_BYTES)
+ . = VSYSCALL_ADDR;
+ .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) }
+ __vsyscall_0 = VSYSCALL_VIRT_ADDR;
- .vsyscall_0 -10*1024*1024: AT ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) { *(.vsyscall_0) }
- __vsyscall_0 = LOADADDR(.vsyscall_0);
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .xtime_lock : AT CACHE_ALIGN(AFTER(.vsyscall_0)) { *(.xtime_lock) }
- xtime_lock = LOADADDR(.xtime_lock);
- .vxtime : AT AFTER(.xtime_lock) { *(.vxtime) }
- vxtime = LOADADDR(.vxtime);
- .wall_jiffies : AT AFTER(.vxtime) { *(.wall_jiffies) }
- wall_jiffies = LOADADDR(.wall_jiffies);
- .sys_tz : AT AFTER(.wall_jiffies) { *(.sys_tz) }
- sys_tz = LOADADDR(.sys_tz);
- .sysctl_vsyscall : AT AFTER(.sys_tz) { *(.sysctl_vsyscall) }
- sysctl_vsyscall = LOADADDR(.sysctl_vsyscall);
- .xtime : AT AFTER(.sysctl_vsyscall) { *(.xtime) }
- xtime = LOADADDR(.xtime);
+ .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) }
+ xtime_lock = VVIRT(.xtime_lock);
+
+ .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) }
+ vxtime = VVIRT(.vxtime);
+
+ .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) }
+ wall_jiffies = VVIRT(.wall_jiffies);
+
+ .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) }
+ sys_tz = VVIRT(.sys_tz);
+
+ .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) }
+ sysctl_vsyscall = VVIRT(.sysctl_vsyscall);
+
+ .xtime : AT(VLOAD(.xtime)) { *(.xtime) }
+ xtime = VVIRT(.xtime);
+
. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
- .jiffies : AT CACHE_ALIGN(AFTER(.xtime)) { *(.jiffies) }
- jiffies = LOADADDR(.jiffies);
- .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT (LOADADDR(.vsyscall_0) + 1024) { *(.vsyscall_1) }
- . = LOADADDR(.vsyscall_0) + 4096;
+ .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
+ jiffies = VVIRT(.jiffies);
+
+ .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) }
+ .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) }
+ .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) }
+
+ . = VSYSCALL_VIRT_ADDR + 4096;
+
+#undef VSYSCALL_ADDR
+#undef VSYSCALL_PHYS_ADDR
+#undef VSYSCALL_VIRT_ADDR
+#undef VLOAD_OFFSET
+#undef VLOAD
+#undef VVIRT_OFFSET
+#undef VVIRT
. = ALIGN(8192); /* init_task */
- .data.init_task : { *(.data.init_task) }
+ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+ *(.data.init_task)
+ }
. = ALIGN(4096);
- .data.page_aligned : { *(.data.page_aligned) }
+ .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+ *(.data.page_aligned)
+ }
. = ALIGN(4096); /* Init code and data */
__init_begin = .;
- .init.text : {
+ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
_sinittext = .;
*(.init.text)
_einittext = .;
}
__initdata_begin = .;
- .init.data : { *(.init.data) }
+ .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { *(.init.data) }
__initdata_end = .;
. = ALIGN(16);
__setup_start = .;
- .init.setup : { *(.init.setup) }
+ .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { *(.init.setup) }
__setup_end = .;
__initcall_start = .;
- .initcall.init : {
+ .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
*(.initcall1.init)
*(.initcall2.init)
*(.initcall3.init)
@@ -106,32 +144,38 @@ SECTIONS
}
__initcall_end = .;
__con_initcall_start = .;
- .con_initcall.init : { *(.con_initcall.init) }
+ .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+ *(.con_initcall.init)
+ }
__con_initcall_end = .;
SECURITY_INIT
. = ALIGN(8);
__alt_instructions = .;
- .altinstructions : { *(.altinstructions) }
+ .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+ *(.altinstructions)
+ }
__alt_instructions_end = .;
- .altinstr_replacement : { *(.altinstr_replacement) }
+ .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+ *(.altinstr_replacement)
+ }
/* .exit.text is discard at runtime, not link time, to deal with references
from .altinstructions and .eh_frame */
- .exit.text : { *(.exit.text) }
- .exit.data : { *(.exit.data) }
+ .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
+ .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) }
. = ALIGN(4096);
__initramfs_start = .;
- .init.ramfs : { *(.init.ramfs) }
+ .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { *(.init.ramfs) }
__initramfs_end = .;
. = ALIGN(32);
__per_cpu_start = .;
- .data.percpu : { *(.data.percpu) }
+ .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
__per_cpu_end = .;
. = ALIGN(4096);
__init_end = .;
. = ALIGN(4096);
__nosave_begin = .;
- .data_nosave : { *(.data.nosave) }
+ .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
. = ALIGN(4096);
__nosave_end = .;
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 84cde796ecb1..ac61c186eb02 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -251,7 +251,7 @@ void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
-__init void numa_add_cpu(int cpu)
+__cpuinit void numa_add_cpu(int cpu)
{
/* BP is initialized elsewhere */
if (cpu)