From 4116c527ea9517623369a5b3b037aedde280d672 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Sat, 3 Sep 2005 15:56:27 -0700
Subject: [PATCH] hpet: use read_timer_tsc only when CPU has TSC

Only use read_timer_tsc only when CPU has TSC.  Thanks to Andrea for
pointing this out.  Should not be issue on any platforms as all recent
systems that has HPET also has CPUs that supports TSC.  The patch is still
required for correctness.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/timers/timer_hpet.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c
index ef8dac5dd33b..cbb3f221ced8 100644
--- a/arch/i386/kernel/timers/timer_hpet.c
+++ b/arch/i386/kernel/timers/timer_hpet.c
@@ -136,6 +136,8 @@ static void delay_hpet(unsigned long loops)
 	} while ((hpet_end - hpet_start) < (loops));
 }
 
+static struct timer_opts timer_hpet;
+
 static int __init init_hpet(char* override)
 {
 	unsigned long result, remain;
@@ -163,6 +165,8 @@ static int __init init_hpet(char* override)
 			}
 			set_cyc2ns_scale(cpu_khz/1000);
 		}
+		/* set this only when cpu_has_tsc */
+		timer_hpet.read_timer = read_timer_tsc;
 	}
 
 	/*
@@ -186,7 +190,6 @@ static struct timer_opts timer_hpet __read_mostly = {
 	.get_offset =		get_offset_hpet,
 	.monotonic_clock =	monotonic_clock_hpet,
 	.delay = 		delay_hpet,
-	.read_timer = 		read_timer_tsc,
 };
 
 struct init_timer_opts __initdata timer_hpet_init = {
-- 
cgit v1.2.3


From 7ae65fd334232468a9d6b523a4fc141cd6ec5ea4 Mon Sep 17 00:00:00 2001
From: Matt Tolentino <metolent@snoqualmie.dp.intel.com>
Date: Sat, 3 Sep 2005 15:56:27 -0700
Subject: [PATCH] x86: fix EFI memory map parsing

The memory descriptors that comprise the EFI memory map are not fixed in
stone such that the size could change in the future.  This uses the memory
descriptor size obtained from EFI to iterate over the memory map entries
during boot.  This enables the removal of an x86 specific pad (and ifdef)
in the EFI header.  I also couldn't stomach the broken up nature of the
function to put EFI runtime calls into virtual mode any longer so I fixed
that up a bit as well.

For reference, this patch only impacts x86.

Signed-off-by: Matt Tolentino <matthew.e.tolentino@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/efi.c   | 101 +++++++++++++++++++++++------------------------
 arch/i386/kernel/setup.c |  14 ++++---
 arch/i386/mm/init.c      |   5 ++-
 include/asm-i386/setup.h |   2 +-
 include/linux/efi.h      |  14 ++-----
 5 files changed, 67 insertions(+), 69 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index 385883ea8c19..850648ae8305 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -233,22 +233,23 @@ void __init efi_map_memmap(void)
 {
 	memmap.map = NULL;
 
-	memmap.map = (efi_memory_desc_t *)
-		bt_ioremap((unsigned long) memmap.phys_map,
-			(memmap.nr_map * sizeof(efi_memory_desc_t)));
-
+	memmap.map = bt_ioremap((unsigned long) memmap.phys_map,
+			(memmap.nr_map * memmap.desc_size));
 	if (memmap.map == NULL)
 		printk(KERN_ERR PFX "Could not remap the EFI memmap!\n");
+
+	memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
 }
 
 #if EFI_DEBUG
 static void __init print_efi_memmap(void)
 {
 	efi_memory_desc_t *md;
+	void *p;
 	int i;
 
-	for (i = 0; i < memmap.nr_map; i++) {
-		md = &memmap.map[i];
+	for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) {
+		md = p;
 		printk(KERN_INFO "mem%02u: type=%u, attr=0x%llx, "
 			"range=[0x%016llx-0x%016llx) (%lluMB)\n",
 			i, md->type, md->attribute, md->phys_addr,
@@ -271,10 +272,10 @@ void efi_memmap_walk(efi_freemem_callback_t callback, void *arg)
 	} prev, curr;
 	efi_memory_desc_t *md;
 	unsigned long start, end;
-	int i;
+	void *p;
 
-	for (i = 0; i < memmap.nr_map; i++) {
-		md = &memmap.map[i];
+	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+		md = p;
 
 		if ((md->num_pages == 0) || (!is_available_memory(md)))
 			continue;
@@ -325,6 +326,7 @@ void __init efi_init(void)
 	memmap.phys_map = EFI_MEMMAP;
 	memmap.nr_map = EFI_MEMMAP_SIZE/EFI_MEMDESC_SIZE;
 	memmap.desc_version = EFI_MEMDESC_VERSION;
+	memmap.desc_size = EFI_MEMDESC_SIZE;
 
 	efi.systab = (efi_system_table_t *)
 		boot_ioremap((unsigned long) efi_phys.systab,
@@ -428,22 +430,30 @@ void __init efi_init(void)
 		printk(KERN_ERR PFX "Could not map the runtime service table!\n");
 
 	/* Map the EFI memory map for use until paging_init() */
-
-	memmap.map = (efi_memory_desc_t *)
-		boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE);
-
+	memmap.map = boot_ioremap((unsigned long) EFI_MEMMAP, EFI_MEMMAP_SIZE);
 	if (memmap.map == NULL)
 		printk(KERN_ERR PFX "Could not map the EFI memory map!\n");
 
-	if (EFI_MEMDESC_SIZE != sizeof(efi_memory_desc_t)) {
-		printk(KERN_WARNING PFX "Warning! Kernel-defined memdesc doesn't "
-			   "match the one from EFI!\n");
-	}
+	memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
+
 #if EFI_DEBUG
 	print_efi_memmap();
 #endif
 }
 
+static inline void __init check_range_for_systab(efi_memory_desc_t *md)
+{
+	if (((unsigned long)md->phys_addr <= (unsigned long)efi_phys.systab) &&
+		((unsigned long)efi_phys.systab < md->phys_addr +
+		((unsigned long)md->num_pages << EFI_PAGE_SHIFT))) {
+		unsigned long addr;
+
+		addr = md->virt_addr - md->phys_addr +
+			(unsigned long)efi_phys.systab;
+		efi.systab = (efi_system_table_t *)addr;
+	}
+}
+
 /*
  * This function will switch the EFI runtime services to virtual mode.
  * Essentially, look through the EFI memmap and map every region that
@@ -457,43 +467,32 @@ void __init efi_enter_virtual_mode(void)
 {
 	efi_memory_desc_t *md;
 	efi_status_t status;
-	int i;
+	void *p;
 
 	efi.systab = NULL;
 
-	for (i = 0; i < memmap.nr_map; i++) {
-		md = &memmap.map[i];
+	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+		md = p;
 
-		if (md->attribute & EFI_MEMORY_RUNTIME) {
-			md->virt_addr =
-				(unsigned long)ioremap(md->phys_addr,
-					md->num_pages << EFI_PAGE_SHIFT);
-			if (!(unsigned long)md->virt_addr) {
-				printk(KERN_ERR PFX "ioremap of 0x%lX failed\n",
-					(unsigned long)md->phys_addr);
-			}
+		if (!(md->attribute & EFI_MEMORY_RUNTIME))
+			continue;
 
-			if (((unsigned long)md->phys_addr <=
-					(unsigned long)efi_phys.systab) &&
-				((unsigned long)efi_phys.systab <
-					md->phys_addr +
-					((unsigned long)md->num_pages <<
-						EFI_PAGE_SHIFT))) {
-				unsigned long addr;
-
-				addr = md->virt_addr - md->phys_addr +
-						(unsigned long)efi_phys.systab;
-				efi.systab = (efi_system_table_t *)addr;
-			}
+		md->virt_addr = (unsigned long)ioremap(md->phys_addr,
+			md->num_pages << EFI_PAGE_SHIFT);
+		if (!(unsigned long)md->virt_addr) {
+			printk(KERN_ERR PFX "ioremap of 0x%lX failed\n",
+				(unsigned long)md->phys_addr);
 		}
+		/* update the virtual address of the EFI system table */
+		check_range_for_systab(md);
 	}
 
 	if (!efi.systab)
 		BUG();
 
 	status = phys_efi_set_virtual_address_map(
-			sizeof(efi_memory_desc_t) * memmap.nr_map,
-			sizeof(efi_memory_desc_t),
+			memmap.desc_size * memmap.nr_map,
+			memmap.desc_size,
 			memmap.desc_version,
 		       	memmap.phys_map);
 
@@ -533,10 +532,10 @@ efi_initialize_iomem_resources(struct resource *code_resource,
 {
 	struct resource *res;
 	efi_memory_desc_t *md;
-	int i;
+	void *p;
 
-	for (i = 0; i < memmap.nr_map; i++) {
-		md = &memmap.map[i];
+	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+		md = p;
 
 		if ((md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT)) >
 		    0x100000000ULL)
@@ -613,10 +612,10 @@ efi_initialize_iomem_resources(struct resource *code_resource,
 u32 efi_mem_type(unsigned long phys_addr)
 {
 	efi_memory_desc_t *md;
-	int i;
+	void *p;
 
-	for (i = 0; i < memmap.nr_map; i++) {
-		md = &memmap.map[i];
+	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+		md = p;
 		if ((md->phys_addr <= phys_addr) && (phys_addr <
 			(md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
 			return md->type;
@@ -627,10 +626,10 @@ u32 efi_mem_type(unsigned long phys_addr)
 u64 efi_mem_attributes(unsigned long phys_addr)
 {
 	efi_memory_desc_t *md;
-	int i;
+	void *p;
 
-	for (i = 0; i < memmap.nr_map; i++) {
-		md = &memmap.map[i];
+	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+		md = p;
 		if ((md->phys_addr <= phys_addr) && (phys_addr <
 			(md->phys_addr + (md-> num_pages << EFI_PAGE_SHIFT)) ))
 			return md->attribute;
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index af4de58cab54..9adbf710ec8d 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -370,12 +370,16 @@ static void __init limit_regions(unsigned long long size)
 	int i;
 
 	if (efi_enabled) {
-		for (i = 0; i < memmap.nr_map; i++) {
-			current_addr = memmap.map[i].phys_addr +
-				       (memmap.map[i].num_pages << 12);
-			if (memmap.map[i].type == EFI_CONVENTIONAL_MEMORY) {
+		efi_memory_desc_t *md;
+		void *p;
+
+		for (p = memmap.map, i = 0; p < memmap.map_end;
+			p += memmap.desc_size, i++) {
+			md = p;
+			current_addr = md->phys_addr + (md->num_pages << 12);
+			if (md->type == EFI_CONVENTIONAL_MEMORY) {
 				if (current_addr >= size) {
-					memmap.map[i].num_pages -=
+					md->num_pages -=
 						(((current_addr-size) + PAGE_SIZE-1) >> PAGE_SHIFT);
 					memmap.nr_map = i + 1;
 					return;
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 12216b52e28b..d8b23ab76533 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -198,9 +198,10 @@ int page_is_ram(unsigned long pagenr)
 
 	if (efi_enabled) {
 		efi_memory_desc_t *md;
+		void *p;
 
-		for (i = 0; i < memmap.nr_map; i++) {
-			md = &memmap.map[i];
+		for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
+			md = p;
 			if (!is_available_memory(md))
 				continue;
 			addr = (md->phys_addr+PAGE_SIZE-1) >> PAGE_SHIFT;
diff --git a/include/asm-i386/setup.h b/include/asm-i386/setup.h
index 7a32184d54bf..826a8ca50ac8 100644
--- a/include/asm-i386/setup.h
+++ b/include/asm-i386/setup.h
@@ -44,7 +44,7 @@ extern unsigned char boot_params[PARAM_SIZE];
 #define EFI_SYSTAB ((efi_system_table_t *) *((unsigned long *)(PARAM+0x1c4)))
 #define EFI_MEMDESC_SIZE (*((unsigned long *) (PARAM+0x1c8)))
 #define EFI_MEMDESC_VERSION (*((unsigned long *) (PARAM+0x1cc)))
-#define EFI_MEMMAP ((efi_memory_desc_t *) *((unsigned long *)(PARAM+0x1d0)))
+#define EFI_MEMMAP ((void *) *((unsigned long *)(PARAM+0x1d0)))
 #define EFI_MEMMAP_SIZE (*((unsigned long *) (PARAM+0x1d4)))
 #define MOUNT_ROOT_RDONLY (*(unsigned short *) (PARAM+0x1F2))
 #define RAMDISK_FLAGS (*(unsigned short *) (PARAM+0x1F8))
diff --git a/include/linux/efi.h b/include/linux/efi.h
index 73781ec165b4..c7c5dd316182 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -91,11 +91,6 @@ typedef	struct {
 
 #define EFI_PAGE_SHIFT		12
 
-/*
- * For current x86 implementations of EFI, there is
- * additional padding in the mem descriptors.  This is not
- * the case in ia64.  Need to have this fixed in the f/w.
- */
 typedef struct {
 	u32 type;
 	u32 pad;
@@ -103,9 +98,6 @@ typedef struct {
 	u64 virt_addr;
 	u64 num_pages;
 	u64 attribute;
-#if defined (__i386__)
-	u64 pad1;
-#endif
 } efi_memory_desc_t;
 
 typedef int (*efi_freemem_callback_t) (unsigned long start, unsigned long end, void *arg);
@@ -240,10 +232,12 @@ typedef struct {
 } efi_system_table_t;
 
 struct efi_memory_map {
-	efi_memory_desc_t *phys_map;
-	efi_memory_desc_t *map;
+	void *phys_map;
+	void *map;
+	void *map_end;
 	int nr_map;
 	unsigned long desc_version;
+	unsigned long desc_size;
 };
 
 /*
-- 
cgit v1.2.3


From 5fd75ebb1a58c1a3c9e3d9fdf75ce7286b79bb74 Mon Sep 17 00:00:00 2001
From: Petr Tesarik <kernel@tesarici.cz>
Date: Sat, 3 Sep 2005 15:56:28 -0700
Subject: [PATCH] vm86: Honor TF bit when emulating an instruction

If the virtual 86 machine reaches an instruction which raises a General
Protection Fault (such as CLI or STI), the instruction is emulated (in
handle_vm86_fault).  However, the emulation ignored the TF bit, so the
hardware debug interrupt was not invoked after such an emulated instruction
(and the DOS debugger missed it).

This patch fixes the problem by emulating the hardware debug interrupt as
the last action before control is returned to the VM86 program.

Signed-off-by: Petr Tesarik <kernel@tesarici.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/vm86.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index ec0f68ce6886..2daa06fb4a88 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -542,7 +542,7 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
 	unsigned char opcode;
 	unsigned char __user *csp;
 	unsigned char __user *ssp;
-	unsigned short ip, sp;
+	unsigned short ip, sp, orig_flags;
 	int data32, pref_done;
 
 #define CHECK_IF_IN_TRAP \
@@ -551,8 +551,12 @@ void handle_vm86_fault(struct kernel_vm86_regs * regs, long error_code)
 #define VM86_FAULT_RETURN do { \
 	if (VMPI.force_return_for_pic  && (VEFLAGS & (IF_MASK | VIF_MASK))) \
 		return_to_32bit(regs, VM86_PICRETURN); \
+	if (orig_flags & TF_MASK) \
+		handle_vm86_trap(regs, 0, 1); \
 	return; } while (0)
 
+	orig_flags = *(unsigned short *)&regs->eflags;
+
 	csp = (unsigned char __user *) (regs->cs << 4);
 	ssp = (unsigned char __user *) (regs->ss << 4);
 	sp = SP(regs);
-- 
cgit v1.2.3


From 484b90c4b965d54037ff99b198d84cdf144f8a35 Mon Sep 17 00:00:00 2001
From: Vivek Goyal <vgoyal@in.ibm.com>
Date: Sat, 3 Sep 2005 15:56:31 -0700
Subject: [PATCH] kdump: Save parameter segment in protected mode (x86)

o With introduction of kexec as boot-loader, the assumption that parameter
  segment will always be loaded at lower address than kernel and will be
  addressable by early bootup page tables is no longer valid. In kexec on
  panic case parameter segment might well be loaded beyond kernel image and
  might not be addressable by early boot page tables.
o This case might hit in the scenario where user has reserved a chunk of
  memory for second kernel, for example 16MB to 64MB, and has also built
  second kernel for physical memory location 16MB. In this case kexec has no
  choice but to load the parameter segment at a higher address than new kernel
  image at safe location where new kernel does not stomp it.
o Though problem should automatically go away once relocatable kernel for i386
  is in place and kexec can determine the location of new kernel at run time
  and load parameter segment at lower address than kernel image. But till then
  this patch can go in (assuming it does not break something else).
o This patch moves up the boot parameter saving code. Now boot parameters
  are copied out in protected mode before page tables are initialized. This
  will ensure that parameter segment is always addressable irrespective of
  its physical location.

Signed-off-by: Vivek Goyal <vgoyal@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/head.S | 48 ++++++++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 22 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/head.S b/arch/i386/kernel/head.S
index 4477bb107098..0480ca9e9e57 100644
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -77,6 +77,32 @@ ENTRY(startup_32)
 	subl %edi,%ecx
 	shrl $2,%ecx
 	rep ; stosl
+/*
+ * Copy bootup parameters out of the way.
+ * Note: %esi still has the pointer to the real-mode data.
+ * With the kexec as boot loader, parameter segment might be loaded beyond
+ * kernel image and might not even be addressable by early boot page tables.
+ * (kexec on panic case). Hence copy out the parameters before initializing
+ * page tables.
+ */
+	movl $(boot_params - __PAGE_OFFSET),%edi
+	movl $(PARAM_SIZE/4),%ecx
+	cld
+	rep
+	movsl
+	movl boot_params - __PAGE_OFFSET + NEW_CL_POINTER,%esi
+	andl %esi,%esi
+	jnz 2f			# New command line protocol
+	cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR
+	jne 1f
+	movzwl OLD_CL_OFFSET,%esi
+	addl $(OLD_CL_BASE_ADDR),%esi
+2:
+	movl $(saved_command_line - __PAGE_OFFSET),%edi
+	movl $(COMMAND_LINE_SIZE/4),%ecx
+	rep
+	movsl
+1:
 
 /*
  * Initialize page tables.  This creates a PDE and a set of page
@@ -214,28 +240,6 @@ ENTRY(startup_32_smp)
  */
 	call setup_idt
 
-/*
- * Copy bootup parameters out of the way.
- * Note: %esi still has the pointer to the real-mode data.
- */
-	movl $boot_params,%edi
-	movl $(PARAM_SIZE/4),%ecx
-	cld
-	rep
-	movsl
-	movl boot_params+NEW_CL_POINTER,%esi
-	andl %esi,%esi
-	jnz 2f			# New command line protocol
-	cmpw $(OLD_CL_MAGIC),OLD_CL_MAGIC_ADDR
-	jne 1f
-	movzwl OLD_CL_OFFSET,%esi
-	addl $(OLD_CL_BASE_ADDR),%esi
-2:
-	movl $saved_command_line,%edi
-	movl $(COMMAND_LINE_SIZE/4),%ecx
-	rep
-	movsl
-1:
 checkCPUtype:
 
 	movl $-1,X86_CPUID		#  -1 for no CPUID initially
-- 
cgit v1.2.3


From 911a62d42365076209e2c327e7688db296e35d62 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Sat, 3 Sep 2005 15:56:31 -0700
Subject: [PATCH] x86: sutomatically enable bigsmp when we have more than 8
 CPUs

i386 generic subarchitecture requires explicit dmi strings or command line
to enable bigsmp mode.  The patch below removes that restriction, and uses
bigsmp as soon as it finds more than 8 logical CPUs, Intel processors and
xAPIC support.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/acpi/boot.c              |  3 +++
 arch/i386/kernel/mpparse.c                |  9 +++++++++
 arch/i386/kernel/setup.c                  |  8 +++++++-
 arch/i386/mach-generic/bigsmp.c           |  5 ++++-
 arch/i386/mach-generic/probe.c            | 20 ++++++++++++++++++++
 include/asm-i386/apicdef.h                |  1 +
 include/asm-i386/mach-generic/mach_apic.h |  2 ++
 include/asm-i386/mpspec.h                 |  1 +
 8 files changed, 47 insertions(+), 2 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/acpi/boot.c b/arch/i386/kernel/acpi/boot.c
index b7808a89d945..34ee500c26e5 100644
--- a/arch/i386/kernel/acpi/boot.c
+++ b/arch/i386/kernel/acpi/boot.c
@@ -833,6 +833,9 @@ acpi_process_madt(void)
 		if (!error) {
 			acpi_lapic = 1;
 
+#ifdef CONFIG_X86_GENERICARCH
+			generic_bigsmp_probe();
+#endif
 			/*
 			 * Parse MADT IO-APIC entries
 			 */
diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index ce838abb27d8..788efffa9930 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -65,6 +65,8 @@ int nr_ioapics;
 int pic_mode;
 unsigned long mp_lapic_addr;
 
+unsigned int def_to_bigsmp = 0;
+
 /* Processor that is doing the boot up */
 unsigned int boot_cpu_physical_apicid = -1U;
 /* Internal processor count */
@@ -213,6 +215,13 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
 		ver = 0x10;
 	}
 	apic_version[m->mpc_apicid] = ver;
+	if ((num_processors > 8) &&
+	    APIC_XAPIC(ver) &&
+	    (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL))
+		def_to_bigsmp = 1;
+	else
+		def_to_bigsmp = 0;
+
 	bios_cpu_apicid[num_processors - 1] = m->mpc_apicid;
 }
 
diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 9adbf710ec8d..294bcca985ab 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -1585,8 +1585,14 @@ void __init setup_arch(char **cmdline_p)
 	 */
 	acpi_boot_table_init();
 	acpi_boot_init();
-#endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC)
+	if (def_to_bigsmp)
+		printk(KERN_WARNING "More than 8 CPUs detected and "
+			"CONFIG_X86_PC cannot handle it.\nUse "
+			"CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
+#endif
+#endif
 #ifdef CONFIG_X86_LOCAL_APIC
 	if (smp_found_config)
 		get_smp_config();
diff --git a/arch/i386/mach-generic/bigsmp.c b/arch/i386/mach-generic/bigsmp.c
index 25883b44f625..037b2af1a1f4 100644
--- a/arch/i386/mach-generic/bigsmp.c
+++ b/arch/i386/mach-generic/bigsmp.c
@@ -47,7 +47,10 @@ static struct dmi_system_id __initdata bigsmp_dmi_table[] = {
 
 static __init int probe_bigsmp(void)
 { 
-	dmi_check_system(bigsmp_dmi_table);
+	if (def_to_bigsmp)
+        	dmi_bigsmp = 1;
+	else
+		dmi_check_system(bigsmp_dmi_table);
 	return dmi_bigsmp; 
 } 
 
diff --git a/arch/i386/mach-generic/probe.c b/arch/i386/mach-generic/probe.c
index 5497c65a8790..cea5b3ce4b57 100644
--- a/arch/i386/mach-generic/probe.c
+++ b/arch/i386/mach-generic/probe.c
@@ -30,6 +30,25 @@ struct genapic *apic_probe[] __initdata = {
 	NULL,
 };
 
+static int cmdline_apic;
+
+void __init generic_bigsmp_probe(void)
+{
+	/*
+	 * This routine is used to switch to bigsmp mode when
+	 * - There is no apic= option specified by the user
+	 * - generic_apic_probe() has choosen apic_default as the sub_arch
+	 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
+	 */
+
+	if (!cmdline_apic && genapic == &apic_default)
+		if (apic_bigsmp.probe()) {
+			genapic = &apic_bigsmp;
+			printk(KERN_INFO "Overriding APIC driver with %s\n",
+			       genapic->name);
+		}
+}
+
 void __init generic_apic_probe(char *command_line) 
 { 
 	char *s;
@@ -52,6 +71,7 @@ void __init generic_apic_probe(char *command_line)
 		if (!changed)
 			printk(KERN_ERR "Unknown genapic `%s' specified.\n", s);
 		*p = old;
+		cmdline_apic = changed;
 	} 
 	for (i = 0; !changed && apic_probe[i]; i++) { 
 		if (apic_probe[i]->probe()) {
diff --git a/include/asm-i386/apicdef.h b/include/asm-i386/apicdef.h
index a96a8f48fbfc..03185cef8e0a 100644
--- a/include/asm-i386/apicdef.h
+++ b/include/asm-i386/apicdef.h
@@ -16,6 +16,7 @@
 #define			GET_APIC_VERSION(x)	((x)&0xFF)
 #define			GET_APIC_MAXLVT(x)	(((x)>>16)&0xFF)
 #define			APIC_INTEGRATED(x)	((x)&0xF0)
+#define			APIC_XAPIC(x)		((x) >= 0x14)
 #define		APIC_TASKPRI	0x80
 #define			APIC_TPRI_MASK		0xFF
 #define		APIC_ARBPRI	0x90
diff --git a/include/asm-i386/mach-generic/mach_apic.h b/include/asm-i386/mach-generic/mach_apic.h
index b13767a4e934..d9dc039da94a 100644
--- a/include/asm-i386/mach-generic/mach_apic.h
+++ b/include/asm-i386/mach-generic/mach_apic.h
@@ -28,4 +28,6 @@
 #define enable_apic_mode (genapic->enable_apic_mode)
 #define phys_pkg_id (genapic->phys_pkg_id)
 
+extern void generic_bigsmp_probe(void);
+
 #endif /* __ASM_MACH_APIC_H */
diff --git a/include/asm-i386/mpspec.h b/include/asm-i386/mpspec.h
index d9fafba075bc..d84a9c326c22 100644
--- a/include/asm-i386/mpspec.h
+++ b/include/asm-i386/mpspec.h
@@ -11,6 +11,7 @@ extern int mp_bus_id_to_local [MAX_MP_BUSSES];
 extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
 extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES];
 
+extern unsigned int def_to_bigsmp;
 extern unsigned int boot_cpu_physical_apicid;
 extern int smp_found_config;
 extern void find_smp_config (void);
-- 
cgit v1.2.3


From 252943efcfce945d8dd3738ca4c4b9cbeb4f3fa9 Mon Sep 17 00:00:00 2001
From: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Date: Sat, 3 Sep 2005 15:56:32 -0700
Subject: [PATCH] x86: Add the check for all the cores in a package in cache
 information

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/intel_cacheinfo.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index 6c55b50cf048..9e0d5f83cb9f 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -305,6 +305,9 @@ static void __devinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
 {
 	struct _cpuid4_info	*this_leaf;
 	unsigned long num_threads_sharing;
+#ifdef CONFIG_X86_HT
+	struct cpuinfo_x86 *c = cpu_data + cpu;
+#endif
 
 	this_leaf = CPUID4_INFO_IDX(cpu, index);
 	num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing;
@@ -314,10 +317,12 @@ static void __devinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
 #ifdef CONFIG_X86_HT
 	else if (num_threads_sharing == smp_num_siblings)
 		this_leaf->shared_cpu_map = cpu_sibling_map[cpu];
-#endif
+	else if (num_threads_sharing == (c->x86_num_cores * smp_num_siblings))
+		this_leaf->shared_cpu_map = cpu_core_map[cpu];
 	else
-		printk(KERN_INFO "Number of CPUs sharing cache didn't match "
+		printk(KERN_DEBUG "Number of CPUs sharing cache didn't match "
 				"any known set of CPUs\n");
+#endif
 }
 #else
 static void __init cache_shared_cpu_map_setup(unsigned int cpu, int index) {}
-- 
cgit v1.2.3


From 2a0694d15d55d0deed928786a6393d5e45e37d76 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Sat, 3 Sep 2005 15:56:35 -0700
Subject: [PATCH] i386: clean up vDSO alignment padding

This makes the vDSO use nops for all its padding around instructions,
rather than sometimes zeros, and nop-pads the end of the area containing
instructions to a 32-byte cache line, to keep text and data in separate
lines.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/vsyscall-sigreturn.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/vsyscall-sigreturn.S b/arch/i386/kernel/vsyscall-sigreturn.S
index c8fcf75b9be3..68afa50dd7cf 100644
--- a/arch/i386/kernel/vsyscall-sigreturn.S
+++ b/arch/i386/kernel/vsyscall-sigreturn.S
@@ -15,7 +15,7 @@
 */
 
 	.text
-	.org	__kernel_vsyscall+32
+	.org __kernel_vsyscall+32,0x90
 	.globl __kernel_sigreturn
 	.type __kernel_sigreturn,@function
 __kernel_sigreturn:
@@ -35,6 +35,7 @@ __kernel_rt_sigreturn:
 	int $0x80
 .LEND_rt_sigreturn:
 	.size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn
+	.balign 32
 	.previous
 
 	.section .eh_frame,"a",@progbits
-- 
cgit v1.2.3


From 4bb0d3ec3e5b1e9e2399cdc641b3b6521ac9cdaa Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Sat, 3 Sep 2005 15:56:36 -0700
Subject: [PATCH] i386: inline asm cleanup

i386 Inline asm cleanup.  Use cr/dr accessor functions.

Also, a potential bugfix.  Also, some CR accessors really should be volatile.
Reads from CR0 (numeric state may change in an exception handler), writes to
CR4 (flipping CR4.TSD) and reads from CR2 (page fault) prevent instruction
re-ordering.  I did not add memory clobber to CR3 / CR4 / CR0 updates, as it
was not there to begin with, and in no case should kernel memory be clobbered,
except when doing a TLB flush, which already has memory clobber.

I noticed that page invalidation does not have a memory clobber.  I can't find
a bug as a result, but there is definitely a potential for a bug here:

#define __flush_tlb_single(addr) \
	__asm__ __volatile__("invlpg %0": :"m" (*(char *) addr))

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/common.c           | 12 ++++++------
 arch/i386/kernel/cpu/cpufreq/longhaul.c | 12 +++---------
 arch/i386/kernel/cpu/cyrix.c            |  6 +-----
 arch/i386/kernel/efi.c                  |  4 ++--
 arch/i386/kernel/machine_kexec.c        |  8 +-------
 arch/i386/kernel/process.c              | 16 ++++++----------
 arch/i386/kernel/smp.c                  |  2 +-
 arch/i386/mm/fault.c                    |  6 +++---
 arch/i386/mm/pageattr.c                 |  2 +-
 arch/i386/power/cpu.c                   | 16 ++++++++--------
 include/asm-i386/agp.h                  |  2 +-
 include/asm-i386/bugs.h                 |  5 ++++-
 include/asm-i386/processor.h            | 22 +++++++++-------------
 include/asm-i386/system.h               | 28 +++++++++++++++++++++++++---
 include/asm-i386/xor.h                  | 26 +++++++++++++-------------
 15 files changed, 84 insertions(+), 83 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 4553ffd94b1f..361f2e7ccb12 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -642,12 +642,12 @@ void __devinit cpu_init(void)
 	asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs");
 
 	/* Clear all 6 debug registers: */
-
-#define CD(register) set_debugreg(0, register)
-
-	CD(0); CD(1); CD(2); CD(3); /* no db4 and db5 */; CD(6); CD(7);
-
-#undef CD
+	set_debugreg(0, 0);
+	set_debugreg(0, 1);
+	set_debugreg(0, 2);
+	set_debugreg(0, 3);
+	set_debugreg(0, 6);
+	set_debugreg(0, 7);
 
 	/*
 	 * Force FPU initialization:
diff --git a/arch/i386/kernel/cpu/cpufreq/longhaul.c b/arch/i386/kernel/cpu/cpufreq/longhaul.c
index 04e3563da4fe..bf02b5026e62 100644
--- a/arch/i386/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/i386/kernel/cpu/cpufreq/longhaul.c
@@ -64,8 +64,6 @@ static int dont_scale_voltage;
 #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "longhaul", msg)
 
 
-#define __hlt()     __asm__ __volatile__("hlt": : :"memory")
-
 /* Clock ratios multiplied by 10 */
 static int clock_ratio[32];
 static int eblcr_table[32];
@@ -168,11 +166,9 @@ static void do_powersaver(union msr_longhaul *longhaul,
 	outb(0xFE,0x21);	/* TMR0 only */
 	outb(0xFF,0x80);	/* delay */
 
-	local_irq_enable();
-
-	__hlt();
+	safe_halt();
 	wrmsrl(MSR_VIA_LONGHAUL, longhaul->val);
-	__hlt();
+	halt();
 
 	local_irq_disable();
 
@@ -251,9 +247,7 @@ static void longhaul_setstate(unsigned int clock_ratio_index)
 		bcr2.bits.CLOCKMUL = clock_ratio_index;
 		local_irq_disable();
 		wrmsrl (MSR_VIA_BCR2, bcr2.val);
-		local_irq_enable();
-
-		__hlt();
+		safe_halt();
 
 		/* Disable software clock multiplier */
 		rdmsrl (MSR_VIA_BCR2, bcr2.val);
diff --git a/arch/i386/kernel/cpu/cyrix.c b/arch/i386/kernel/cpu/cyrix.c
index ba4b01138c8f..ff87cc22b323 100644
--- a/arch/i386/kernel/cpu/cyrix.c
+++ b/arch/i386/kernel/cpu/cyrix.c
@@ -132,11 +132,7 @@ static void __init set_cx86_memwb(void)
 	setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04);
 	/* set 'Not Write-through' */
 	cr0 = 0x20000000;
-	__asm__("movl %%cr0,%%eax\n\t"
-		"orl %0,%%eax\n\t"
-		"movl %%eax,%%cr0\n"
-		: : "r" (cr0)
-		:"ax");
+	write_cr0(read_cr0() | cr0);
 	/* CCR2 bit 2: lock NW bit and set WT1 */
 	setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14 );
 }
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index 850648ae8305..921fdb15fc9b 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -79,7 +79,7 @@ static void efi_call_phys_prelog(void)
 	 * directory. If I have PSE, I just need to duplicate one entry in
 	 * page directory.
 	 */
-	__asm__ __volatile__("movl %%cr4, %0":"=r"(cr4));
+	cr4 = read_cr4();
 
 	if (cr4 & X86_CR4_PSE) {
 		efi_bak_pg_dir_pointer[0].pgd =
@@ -115,7 +115,7 @@ static void efi_call_phys_epilog(void)
 	cpu_gdt_descr[0].address =
 		(unsigned long) __va(cpu_gdt_descr[0].address);
 	__asm__ __volatile__("lgdt %0":"=m"(cpu_gdt_descr));
-	__asm__ __volatile__("movl %%cr4, %0":"=r"(cr4));
+	cr4 = read_cr4();
 
 	if (cr4 & X86_CR4_PSE) {
 		swapper_pg_dir[pgd_index(0)].pgd =
diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c
index cb699a2aa1f8..f19f6d34bcbf 100644
--- a/arch/i386/kernel/machine_kexec.c
+++ b/arch/i386/kernel/machine_kexec.c
@@ -17,13 +17,7 @@
 #include <asm/apic.h>
 #include <asm/cpufeature.h>
 #include <asm/desc.h>
-
-static inline unsigned long read_cr3(void)
-{
-	unsigned long cr3;
-	asm volatile("movl %%cr3,%0": "=r"(cr3));
-	return cr3;
-}
+#include <asm/system.h>
 
 #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
 
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index e3f362e8af5b..761d4ed47ef3 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -313,16 +313,12 @@ void show_regs(struct pt_regs * regs)
 	printk(" DS: %04x ES: %04x\n",
 		0xffff & regs->xds,0xffff & regs->xes);
 
-	__asm__("movl %%cr0, %0": "=r" (cr0));
-	__asm__("movl %%cr2, %0": "=r" (cr2));
-	__asm__("movl %%cr3, %0": "=r" (cr3));
-	/* This could fault if %cr4 does not exist */
-	__asm__("1: movl %%cr4, %0		\n"
-		"2:				\n"
-		".section __ex_table,\"a\"	\n"
-		".long 1b,2b			\n"
-		".previous			\n"
-		: "=r" (cr4): "0" (0));
+	cr0 = read_cr0();
+	cr2 = read_cr2();
+	cr3 = read_cr3();
+	if (current_cpu_data.x86 > 4) {
+		cr4 = read_cr4();
+	}
 	printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4);
 	show_trace(NULL, &regs->esp);
 }
diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c
index cec4bde67161..48b55db3680f 100644
--- a/arch/i386/kernel/smp.c
+++ b/arch/i386/kernel/smp.c
@@ -576,7 +576,7 @@ static void stop_this_cpu (void * dummy)
 	local_irq_disable();
 	disable_local_APIC();
 	if (cpu_data[smp_processor_id()].hlt_works_ok)
-		for(;;) __asm__("hlt");
+		for(;;) halt();
 	for (;;);
 }
 
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index 61d9e34af5a6..411b8500ad1b 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -233,7 +233,7 @@ fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	int write, si_code;
 
 	/* get the address */
-	__asm__("movl %%cr2,%0":"=r" (address));
+        address = read_cr2();
 
 	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
 					SIGSEGV) == NOTIFY_STOP)
@@ -453,7 +453,7 @@ no_context:
 	printk(" at virtual address %08lx\n",address);
 	printk(KERN_ALERT " printing eip:\n");
 	printk("%08lx\n", regs->eip);
-	asm("movl %%cr3,%0":"=r" (page));
+	page = read_cr3();
 	page = ((unsigned long *) __va(page))[address >> 22];
 	printk(KERN_ALERT "*pde = %08lx\n", page);
 	/*
@@ -526,7 +526,7 @@ vmalloc_fault:
 		pmd_t *pmd, *pmd_k;
 		pte_t *pte_k;
 
-		asm("movl %%cr3,%0":"=r" (pgd_paddr));
+		pgd_paddr = read_cr3();
 		pgd = index + (pgd_t *)__va(pgd_paddr);
 		pgd_k = init_mm.pgd + index;
 
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index cb3da6baa704..bce06a79eafa 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -62,7 +62,7 @@ static void flush_kernel_map(void *dummy)
 { 
 	/* Could use CLFLUSH here if the CPU supports it (Hammer,P4) */
 	if (boot_cpu_data.x86_model >= 4) 
-		asm volatile("wbinvd":::"memory"); 
+		wbinvd();
 	/* Flush all to work around Errata in early athlons regarding 
 	 * large page flushing. 
 	 */
diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c
index c547c1af6fa1..4e19c43e0954 100644
--- a/arch/i386/power/cpu.c
+++ b/arch/i386/power/cpu.c
@@ -57,10 +57,10 @@ void __save_processor_state(struct saved_context *ctxt)
 	/*
 	 * control registers 
 	 */
-	asm volatile ("movl %%cr0, %0" : "=r" (ctxt->cr0));
-	asm volatile ("movl %%cr2, %0" : "=r" (ctxt->cr2));
-	asm volatile ("movl %%cr3, %0" : "=r" (ctxt->cr3));
-	asm volatile ("movl %%cr4, %0" : "=r" (ctxt->cr4));
+	ctxt->cr0 = read_cr0();
+	ctxt->cr2 = read_cr2();
+	ctxt->cr3 = read_cr3();
+	ctxt->cr4 = read_cr4();
 }
 
 void save_processor_state(void)
@@ -109,10 +109,10 @@ void __restore_processor_state(struct saved_context *ctxt)
 	/*
 	 * control registers
 	 */
-	asm volatile ("movl %0, %%cr4" :: "r" (ctxt->cr4));
-	asm volatile ("movl %0, %%cr3" :: "r" (ctxt->cr3));
-	asm volatile ("movl %0, %%cr2" :: "r" (ctxt->cr2));
-	asm volatile ("movl %0, %%cr0" :: "r" (ctxt->cr0));
+	write_cr4(ctxt->cr4);
+	write_cr3(ctxt->cr3);
+	write_cr2(ctxt->cr2);
+	write_cr2(ctxt->cr0);
 
 	/*
 	 * now restore the descriptor tables to their proper values
diff --git a/include/asm-i386/agp.h b/include/asm-i386/agp.h
index b82f5f3ab887..9075083bab76 100644
--- a/include/asm-i386/agp.h
+++ b/include/asm-i386/agp.h
@@ -19,7 +19,7 @@ int unmap_page_from_agp(struct page *page);
 /* Could use CLFLUSH here if the cpu supports it. But then it would
    need to be called for each cacheline of the whole page so it may not be 
    worth it. Would need a page for it. */
-#define flush_agp_cache() asm volatile("wbinvd":::"memory")
+#define flush_agp_cache() wbinvd()
 
 /* Convert a physical address to an address suitable for the GART. */
 #define phys_to_gart(x) (x)
diff --git a/include/asm-i386/bugs.h b/include/asm-i386/bugs.h
index 6789fc275da3..ea54540638d2 100644
--- a/include/asm-i386/bugs.h
+++ b/include/asm-i386/bugs.h
@@ -118,7 +118,10 @@ static void __init check_hlt(void)
 		printk("disabled\n");
 		return;
 	}
-	__asm__ __volatile__("hlt ; hlt ; hlt ; hlt");
+	halt();
+	halt();
+	halt();
+	halt();
 	printk("OK.\n");
 }
 
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index d0d8b0160090..7e17d3b4f65a 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -203,9 +203,7 @@ static inline unsigned int cpuid_edx(unsigned int op)
 	return edx;
 }
 
-#define load_cr3(pgdir) \
-	asm volatile("movl %0,%%cr3": :"r" (__pa(pgdir)))
-
+#define load_cr3(pgdir) write_cr3(__pa(pgdir))
 
 /*
  * Intel CPU features in CR4
@@ -232,22 +230,20 @@ extern unsigned long mmu_cr4_features;
 
 static inline void set_in_cr4 (unsigned long mask)
 {
+	unsigned cr4;
 	mmu_cr4_features |= mask;
-	__asm__("movl %%cr4,%%eax\n\t"
-		"orl %0,%%eax\n\t"
-		"movl %%eax,%%cr4\n"
-		: : "irg" (mask)
-		:"ax");
+	cr4 = read_cr4();
+	cr4 |= mask;
+	write_cr4(cr4);
 }
 
 static inline void clear_in_cr4 (unsigned long mask)
 {
+	unsigned cr4;
 	mmu_cr4_features &= ~mask;
-	__asm__("movl %%cr4,%%eax\n\t"
-		"andl %0,%%eax\n\t"
-		"movl %%eax,%%cr4\n"
-		: : "irg" (~mask)
-		:"ax");
+	cr4 = read_cr4();
+	cr4 &= ~mask;
+	write_cr4(cr4);
 }
 
 /*
diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h
index 3db717a244f0..8048a5e018cd 100644
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -107,13 +107,33 @@ static inline unsigned long _get_base(char * addr)
 #define clts() __asm__ __volatile__ ("clts")
 #define read_cr0() ({ \
 	unsigned int __dummy; \
-	__asm__( \
+	__asm__ __volatile__( \
 		"movl %%cr0,%0\n\t" \
 		:"=r" (__dummy)); \
 	__dummy; \
 })
 #define write_cr0(x) \
-	__asm__("movl %0,%%cr0": :"r" (x));
+	__asm__ __volatile__("movl %0,%%cr0": :"r" (x));
+
+#define read_cr2() ({ \
+	unsigned int __dummy; \
+	__asm__ __volatile__( \
+		"movl %%cr2,%0\n\t" \
+		:"=r" (__dummy)); \
+	__dummy; \
+})
+#define write_cr2(x) \
+	__asm__ __volatile__("movl %0,%%cr2": :"r" (x));
+
+#define read_cr3() ({ \
+	unsigned int __dummy; \
+	__asm__ ( \
+		"movl %%cr3,%0\n\t" \
+		:"=r" (__dummy)); \
+	__dummy; \
+})
+#define write_cr3(x) \
+	__asm__ __volatile__("movl %0,%%cr3": :"r" (x));
 
 #define read_cr4() ({ \
 	unsigned int __dummy; \
@@ -123,7 +143,7 @@ static inline unsigned long _get_base(char * addr)
 	__dummy; \
 })
 #define write_cr4(x) \
-	__asm__("movl %0,%%cr4": :"r" (x));
+	__asm__ __volatile__("movl %0,%%cr4": :"r" (x));
 #define stts() write_cr0(8 | read_cr0())
 
 #endif	/* __KERNEL__ */
@@ -447,6 +467,8 @@ struct alt_instr {
 #define local_irq_enable()	__asm__ __volatile__("sti": : :"memory")
 /* used in the idle loop; sti takes one instruction cycle to complete */
 #define safe_halt()		__asm__ __volatile__("sti; hlt": : :"memory")
+/* used when interrupts are already enabled or to shutdown the processor */
+#define halt()			__asm__ __volatile__("hlt": : :"memory")
 
 #define irqs_disabled()			\
 ({					\
diff --git a/include/asm-i386/xor.h b/include/asm-i386/xor.h
index f80e2dbe1b56..23c86cef3b25 100644
--- a/include/asm-i386/xor.h
+++ b/include/asm-i386/xor.h
@@ -535,14 +535,14 @@ static struct xor_block_template xor_block_p5_mmx = {
 
 #define XMMS_SAVE do {				\
 	preempt_disable();			\
+	cr0 = read_cr0();			\
+	clts();					\
 	__asm__ __volatile__ ( 			\
-		"movl %%cr0,%0		;\n\t"	\
-		"clts			;\n\t"	\
-		"movups %%xmm0,(%1)	;\n\t"	\
-		"movups %%xmm1,0x10(%1)	;\n\t"	\
-		"movups %%xmm2,0x20(%1)	;\n\t"	\
-		"movups %%xmm3,0x30(%1)	;\n\t"	\
-		: "=&r" (cr0)			\
+		"movups %%xmm0,(%0)	;\n\t"	\
+		"movups %%xmm1,0x10(%0)	;\n\t"	\
+		"movups %%xmm2,0x20(%0)	;\n\t"	\
+		"movups %%xmm3,0x30(%0)	;\n\t"	\
+		:				\
 		: "r" (xmm_save) 		\
 		: "memory");			\
 } while(0)
@@ -550,14 +550,14 @@ static struct xor_block_template xor_block_p5_mmx = {
 #define XMMS_RESTORE do {			\
 	__asm__ __volatile__ ( 			\
 		"sfence			;\n\t"	\
-		"movups (%1),%%xmm0	;\n\t"	\
-		"movups 0x10(%1),%%xmm1	;\n\t"	\
-		"movups 0x20(%1),%%xmm2	;\n\t"	\
-		"movups 0x30(%1),%%xmm3	;\n\t"	\
-		"movl 	%0,%%cr0	;\n\t"	\
+		"movups (%0),%%xmm0	;\n\t"	\
+		"movups 0x10(%0),%%xmm1	;\n\t"	\
+		"movups 0x20(%0),%%xmm2	;\n\t"	\
+		"movups 0x30(%0),%%xmm3	;\n\t"	\
 		:				\
-		: "r" (cr0), "r" (xmm_save)	\
+		: "r" (xmm_save)		\
 		: "memory");			\
+	write_cr0(cr0);				\
 	preempt_enable();			\
 } while(0)
 
-- 
cgit v1.2.3


From 245067d1674d451855692fcd4647daf9fd47f82d Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Sat, 3 Sep 2005 15:56:37 -0700
Subject: [PATCH] i386: cleanup serialize msr

i386 arch cleanup.  Introduce the serialize macro to serialize processor
state.  Why the microcode update needs it I am not quite sure, since wrmsr()
is already a serializing instruction, but it is a microcode update, so I will
keep the semantic the same, since this could be a timing workaround.  As far
as I can tell, this has always been there since the original microcode update
source.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/microcode.c   | 7 +++++--
 include/asm-i386/processor.h   | 5 +++++
 include/asm-x86_64/processor.h | 5 +++++
 3 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/microcode.c b/arch/i386/kernel/microcode.c
index a77c612aad00..165f13158c60 100644
--- a/arch/i386/kernel/microcode.c
+++ b/arch/i386/kernel/microcode.c
@@ -164,7 +164,8 @@ static void collect_cpu_info (void *unused)
 	}
 
 	wrmsr(MSR_IA32_UCODE_REV, 0, 0);
-	__asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
+	/* see notes above for revision 1.07.  Apparent chip bug */
+	serialize_cpu();
 	/* get the current revision from MSR 0x8B */
 	rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
 	pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
@@ -377,7 +378,9 @@ static void do_update_one (void * unused)
 		(unsigned long) uci->mc->bits >> 16 >> 16);
 	wrmsr(MSR_IA32_UCODE_REV, 0, 0);
 
-	__asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
+	/* see notes above for revision 1.07.  Apparent chip bug */
+	serialize_cpu();
+
 	/* get the current revision from MSR 0x8B */
 	rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
 
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index 7e17d3b4f65a..a0489b222702 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -277,6 +277,11 @@ static inline void clear_in_cr4 (unsigned long mask)
 	outb((data), 0x23); \
 } while (0)
 
+static inline void serialize_cpu(void)
+{
+	 __asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
+}
+
 static inline void __monitor(const void *eax, unsigned long ecx,
 		unsigned long edx)
 {
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index 85549e656eeb..194160f6a43f 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -437,6 +437,11 @@ static inline void prefetchw(void *x)
 	outb((data), 0x23); \
 } while (0)
 
+static inline void serialize_cpu(void)
+{
+	__asm__ __volatile__ ("cpuid" : : : "ax", "bx", "cx", "dx");
+}
+
 static inline void __monitor(const void *eax, unsigned long ecx,
 		unsigned long edx)
 {
-- 
cgit v1.2.3


From 4d37e7e3fd851428dede4d05d3e69d03795a744a Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Sat, 3 Sep 2005 15:56:38 -0700
Subject: [PATCH] i386: inline assembler: cleanup and encapsulate descriptor
 and task register management

i386 inline assembler cleanup.

This change encapsulates descriptor and task register management.  Also,
it is possible to improve assembler generation in two cases; savesegment
may store the value in a register instead of a memory location, which
allows GCC to optimize stack variables into registers, and MOV MEM, SEG
is always a 16-bit write to memory, making the casting in math-emu
unnecessary.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/common.c    |  4 ++--
 arch/i386/kernel/doublefault.c   |  2 +-
 arch/i386/kernel/efi.c           |  5 ++---
 arch/i386/kernel/reboot.c        |  9 +++++----
 arch/i386/kernel/signal.c        |  4 ++--
 arch/i386/kernel/traps.c         |  2 +-
 arch/i386/kernel/vm86.c          |  4 ++--
 arch/i386/math-emu/get_address.c | 13 +++----------
 arch/i386/power/cpu.c            | 26 +++++++++++++-------------
 include/asm-i386/desc.h          | 10 ++++++++++
 include/asm-i386/system.h        |  4 ++--
 11 files changed, 43 insertions(+), 40 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 361f2e7ccb12..46ce9b248f55 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -613,8 +613,8 @@ void __devinit cpu_init(void)
 	memcpy(thread->tls_array, &per_cpu(cpu_gdt_table, cpu),
 		GDT_ENTRY_TLS_ENTRIES * 8);
 
-	__asm__ __volatile__("lgdt %0" : : "m" (cpu_gdt_descr[cpu]));
-	__asm__ __volatile__("lidt %0" : : "m" (idt_descr));
+	load_gdt(&cpu_gdt_descr[cpu]);
+	load_idt(&idt_descr);
 
 	/*
 	 * Delete NT
diff --git a/arch/i386/kernel/doublefault.c b/arch/i386/kernel/doublefault.c
index 789af3e9fb1f..5edb1d379add 100644
--- a/arch/i386/kernel/doublefault.c
+++ b/arch/i386/kernel/doublefault.c
@@ -20,7 +20,7 @@ static void doublefault_fn(void)
 	struct Xgt_desc_struct gdt_desc = {0, 0};
 	unsigned long gdt, tss;
 
-	__asm__ __volatile__("sgdt %0": "=m" (gdt_desc): :"memory");
+	store_gdt(&gdt_desc);
 	gdt = gdt_desc.address;
 
 	printk("double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c
index 921fdb15fc9b..ecad519fd395 100644
--- a/arch/i386/kernel/efi.c
+++ b/arch/i386/kernel/efi.c
@@ -104,8 +104,7 @@ static void efi_call_phys_prelog(void)
 	local_flush_tlb();
 
 	cpu_gdt_descr[0].address = __pa(cpu_gdt_descr[0].address);
-	__asm__ __volatile__("lgdt %0":"=m"
-			    (*(struct Xgt_desc_struct *) __pa(&cpu_gdt_descr[0])));
+	load_gdt((struct Xgt_desc_struct *) __pa(&cpu_gdt_descr[0]));
 }
 
 static void efi_call_phys_epilog(void)
@@ -114,7 +113,7 @@ static void efi_call_phys_epilog(void)
 
 	cpu_gdt_descr[0].address =
 		(unsigned long) __va(cpu_gdt_descr[0].address);
-	__asm__ __volatile__("lgdt %0":"=m"(cpu_gdt_descr));
+	load_gdt(&cpu_gdt_descr[0]);
 	cr4 = read_cr4();
 
 	if (cr4 & X86_CR4_PSE) {
diff --git a/arch/i386/kernel/reboot.c b/arch/i386/kernel/reboot.c
index c71fef31dc47..1cbb9c0f4704 100644
--- a/arch/i386/kernel/reboot.c
+++ b/arch/i386/kernel/reboot.c
@@ -13,6 +13,7 @@
 #include <linux/dmi.h>
 #include <asm/uaccess.h>
 #include <asm/apic.h>
+#include <asm/desc.h>
 #include "mach_reboot.h"
 #include <linux/reboot_fixups.h>
 
@@ -242,13 +243,13 @@ void machine_real_restart(unsigned char *code, int length)
 
 	/* Set up the IDT for real mode. */
 
-	__asm__ __volatile__ ("lidt %0" : : "m" (real_mode_idt));
+	load_idt(&real_mode_idt);
 
 	/* Set up a GDT from which we can load segment descriptors for real
 	   mode.  The GDT is not used in real mode; it is just needed here to
 	   prepare the descriptors. */
 
-	__asm__ __volatile__ ("lgdt %0" : : "m" (real_mode_gdt));
+	load_gdt(&real_mode_gdt);
 
 	/* Load the data segment registers, and thus the descriptors ready for
 	   real mode.  The base address of each segment is 0x100, 16 times the
@@ -316,7 +317,7 @@ void machine_emergency_restart(void)
 	if (!reboot_thru_bios) {
 		if (efi_enabled) {
 			efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL);
-			__asm__ __volatile__("lidt %0": :"m" (no_idt));
+			load_idt(&no_idt);
 			__asm__ __volatile__("int3");
 		}
 		/* rebooting needs to touch the page at absolute addr 0 */
@@ -325,7 +326,7 @@ void machine_emergency_restart(void)
 			mach_reboot_fixups(); /* for board specific fixups */
 			mach_reboot();
 			/* That didn't work - force a triple fault.. */
-			__asm__ __volatile__("lidt %0": :"m" (no_idt));
+			load_idt(&no_idt);
 			__asm__ __volatile__("int3");
 		}
 	}
diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 140e340569c6..7bcda6d69d87 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -278,9 +278,9 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
 	int tmp, err = 0;
 
 	tmp = 0;
-	__asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
+	savesegment(gs, tmp);
 	err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
-	__asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp));
+	savesegment(fs, tmp);
 	err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
 
 	err |= __put_user(regs->xes, (unsigned int __user *)&sc->es);
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index cd2d5d5514fe..1144df0c48d6 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -1008,7 +1008,7 @@ void __init trap_init_f00f_bug(void)
 	 * it uses the read-only mapped virtual address.
 	 */
 	idt_descr.address = fix_to_virt(FIX_F00F_IDT);
-	__asm__ __volatile__("lidt %0" : : "m" (idt_descr));
+	load_idt(&idt_descr);
 }
 #endif
 
diff --git a/arch/i386/kernel/vm86.c b/arch/i386/kernel/vm86.c
index 2daa06fb4a88..16b485009622 100644
--- a/arch/i386/kernel/vm86.c
+++ b/arch/i386/kernel/vm86.c
@@ -294,8 +294,8 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
  */
 	info->regs32->eax = 0;
 	tsk->thread.saved_esp0 = tsk->thread.esp0;
-	asm volatile("mov %%fs,%0":"=m" (tsk->thread.saved_fs));
-	asm volatile("mov %%gs,%0":"=m" (tsk->thread.saved_gs));
+	savesegment(fs, tsk->thread.saved_fs);
+	savesegment(gs, tsk->thread.saved_gs);
 
 	tss = &per_cpu(init_tss, get_cpu());
 	tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0;
diff --git a/arch/i386/math-emu/get_address.c b/arch/i386/math-emu/get_address.c
index 91175738e948..9819b705efa4 100644
--- a/arch/i386/math-emu/get_address.c
+++ b/arch/i386/math-emu/get_address.c
@@ -155,7 +155,6 @@ static long pm_address(u_char FPU_modrm, u_char segment,
 { 
   struct desc_struct descriptor;
   unsigned long base_address, limit, address, seg_top;
-  unsigned short selector;
 
   segment--;
 
@@ -173,17 +172,11 @@ static long pm_address(u_char FPU_modrm, u_char segment,
       /* fs and gs aren't used by the kernel, so they still have their
 	 user-space values. */
     case PREFIX_FS_-1:
-      /* The cast is needed here to get gcc 2.8.0 to use a 16 bit register
-	 in the assembler statement. */
-
-      __asm__("mov %%fs,%0":"=r" (selector));
-      addr->selector = selector;
+      /* N.B. - movl %seg, mem is a 2 byte write regardless of prefix */
+      savesegment(fs, addr->selector);
       break;
     case PREFIX_GS_-1:
-      /* The cast is needed here to get gcc 2.8.0 to use a 16 bit register
-	 in the assembler statement. */
-      __asm__("mov %%gs,%0":"=r" (selector));
-      addr->selector = selector;
+      savesegment(gs, addr->selector);
       break;
     default:
       addr->selector = PM_REG_(segment);
diff --git a/arch/i386/power/cpu.c b/arch/i386/power/cpu.c
index 4e19c43e0954..c7a6436aa938 100644
--- a/arch/i386/power/cpu.c
+++ b/arch/i386/power/cpu.c
@@ -42,17 +42,17 @@ void __save_processor_state(struct saved_context *ctxt)
 	/*
 	 * descriptor tables
 	 */
-	asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
-	asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
-	asm volatile ("str %0"  : "=m" (ctxt->tr));
+ 	store_gdt(&ctxt->gdt_limit);
+ 	store_idt(&ctxt->idt_limit);
+ 	store_tr(ctxt->tr);
 
 	/*
 	 * segment registers
 	 */
-	asm volatile ("movw %%es, %0" : "=m" (ctxt->es));
-	asm volatile ("movw %%fs, %0" : "=m" (ctxt->fs));
-	asm volatile ("movw %%gs, %0" : "=m" (ctxt->gs));
-	asm volatile ("movw %%ss, %0" : "=m" (ctxt->ss));
+ 	savesegment(es, ctxt->es);
+ 	savesegment(fs, ctxt->fs);
+ 	savesegment(gs, ctxt->gs);
+ 	savesegment(ss, ctxt->ss);
 
 	/*
 	 * control registers 
@@ -118,16 +118,16 @@ void __restore_processor_state(struct saved_context *ctxt)
 	 * now restore the descriptor tables to their proper values
 	 * ltr is done i fix_processor_context().
 	 */
-	asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
-	asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
+ 	load_gdt(&ctxt->gdt_limit);
+ 	load_idt(&ctxt->idt_limit);
 
 	/*
 	 * segment registers
 	 */
-	asm volatile ("movw %0, %%es" :: "r" (ctxt->es));
-	asm volatile ("movw %0, %%fs" :: "r" (ctxt->fs));
-	asm volatile ("movw %0, %%gs" :: "r" (ctxt->gs));
-	asm volatile ("movw %0, %%ss" :: "r" (ctxt->ss));
+ 	loadsegment(es, ctxt->es);
+ 	loadsegment(fs, ctxt->fs);
+ 	loadsegment(gs, ctxt->gs);
+ 	loadsegment(ss, ctxt->ss);
 
 	/*
 	 * sysenter MSRs
diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h
index 11e67811a990..9a0b85a52e71 100644
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -30,6 +30,16 @@ extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS];
 #define load_TR_desc() __asm__ __volatile__("ltr %%ax"::"a" (GDT_ENTRY_TSS*8))
 #define load_LDT_desc() __asm__ __volatile__("lldt %%ax"::"a" (GDT_ENTRY_LDT*8))
 
+#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
+#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
+#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
+#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
+
+#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
+#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
+#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
+#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
+
 /*
  * This is the ldt that every process will get unless we need
  * something other than this.
diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h
index 8048a5e018cd..37fd2f8c7196 100644
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -93,13 +93,13 @@ static inline unsigned long _get_base(char * addr)
 		".align 4\n\t"			\
 		".long 1b,3b\n"			\
 		".previous"			\
-		: :"m" (value))
+		: :"rm" (value))
 
 /*
  * Save a segment register away
  */
 #define savesegment(seg, value) \
-	asm volatile("mov %%" #seg ",%0":"=m" (value))
+	asm volatile("mov %%" #seg ",%0":"=rm" (value))
 
 /*
  * Clear and set 'TS' bit respectively
-- 
cgit v1.2.3


From e7a2ff593c0e48b130434dee4d2fd3452a850e6f Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Sat, 3 Sep 2005 15:56:39 -0700
Subject: [PATCH] i386: load_tls() fix

Subtle fix: load_TLS has been moved after saving %fs and %gs segments to avoid
creating non-reversible segments.  This could conceivably cause a bug if the
kernel ever needed to save and restore fs/gs from the NMI handler.  It
currently does not, but this is the safest approach to avoiding fs/gs
corruption.  SMIs are safe, since SMI saves the descriptor hidden state.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/process.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 761d4ed47ef3..9d94995e9672 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -678,21 +678,26 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 	__unlazy_fpu(prev_p);
 
 	/*
-	 * Reload esp0, LDT and the page table pointer:
+	 * Reload esp0.
 	 */
 	load_esp0(tss, next);
 
 	/*
-	 * Load the per-thread Thread-Local Storage descriptor.
+	 * Save away %fs and %gs. No need to save %es and %ds, as
+	 * those are always kernel segments while inside the kernel.
+	 * Doing this before setting the new TLS descriptors avoids
+	 * the situation where we temporarily have non-reloadable
+	 * segments in %fs and %gs.  This could be an issue if the
+	 * NMI handler ever used %fs or %gs (it does not today), or
+	 * if the kernel is running inside of a hypervisor layer.
 	 */
-	load_TLS(next, cpu);
+	savesegment(fs, prev->fs);
+	savesegment(gs, prev->gs);
 
 	/*
-	 * Save away %fs and %gs. No need to save %es and %ds, as
-	 * those are always kernel segments while inside the kernel.
+	 * Load the per-thread Thread-Local Storage descriptor.
 	 */
-	asm volatile("mov %%fs,%0":"=m" (prev->fs));
-	asm volatile("mov %%gs,%0":"=m" (prev->gs));
+	load_TLS(next, cpu);
 
 	/*
 	 * Restore %fs and %gs if needed.
-- 
cgit v1.2.3


From f2ab4461249df85b20930a7a57b54f39c5ae291a Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Sat, 3 Sep 2005 15:56:42 -0700
Subject: [PATCH] x86: more asm cleanups

Some more assembler cleanups I noticed along the way.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/intel.c           |  9 +++------
 arch/i386/kernel/crash.c               |  2 +-
 arch/i386/kernel/machine_kexec.c       | 10 ++--------
 arch/i386/kernel/msr.c                 | 31 ++++++-------------------------
 arch/i386/kernel/process.c             |  2 +-
 arch/i386/mach-voyager/voyager_basic.c | 14 ++++++--------
 arch/i386/mach-voyager/voyager_smp.c   |  2 +-
 include/asm-i386/msr.h                 | 15 +++++++++++++++
 8 files changed, 35 insertions(+), 50 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c
index a2c33c1a46c5..43601de0f633 100644
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -82,16 +82,13 @@ static void __devinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
  */
 static int __devinit num_cpu_cores(struct cpuinfo_x86 *c)
 {
-	unsigned int eax;
+	unsigned int eax, ebx, ecx, edx;
 
 	if (c->cpuid_level < 4)
 		return 1;
 
-	__asm__("cpuid"
-		: "=a" (eax)
-		: "0" (4), "c" (0)
-		: "bx", "dx");
-
+	/* Intel has a non-standard dependency on %ecx for this CPUID level. */
+	cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
 	if (eax & 0x1f)
 		return ((eax >> 26) + 1);
 	else
diff --git a/arch/i386/kernel/crash.c b/arch/i386/kernel/crash.c
index e5fab12f7926..913be77bb844 100644
--- a/arch/i386/kernel/crash.c
+++ b/arch/i386/kernel/crash.c
@@ -153,7 +153,7 @@ static int crash_nmi_callback(struct pt_regs *regs, int cpu)
 	disable_local_APIC();
 	atomic_dec(&waiting_for_crash_ipi);
 	/* Assume hlt works */
-	__asm__("hlt");
+	halt();
 	for(;;);
 
 	return 1;
diff --git a/arch/i386/kernel/machine_kexec.c b/arch/i386/kernel/machine_kexec.c
index f19f6d34bcbf..a912fed48482 100644
--- a/arch/i386/kernel/machine_kexec.c
+++ b/arch/i386/kernel/machine_kexec.c
@@ -93,10 +93,7 @@ static void set_idt(void *newidt, __u16 limit)
 	curidt.size    = limit;
 	curidt.address = (unsigned long)newidt;
 
-	__asm__ __volatile__ (
-		"lidtl %0\n"
-		: : "m" (curidt)
-		);
+	load_idt(&curidt);
 };
 
 
@@ -108,10 +105,7 @@ static void set_gdt(void *newgdt, __u16 limit)
 	curgdt.size    = limit;
 	curgdt.address = (unsigned long)newgdt;
 
-	__asm__ __volatile__ (
-		"lgdtl %0\n"
-		: : "m" (curgdt)
-		);
+	load_gdt(&curgdt);
 };
 
 static void load_segments(void)
diff --git a/arch/i386/kernel/msr.c b/arch/i386/kernel/msr.c
index b2f03c39a6fe..03100d6fc5d6 100644
--- a/arch/i386/kernel/msr.c
+++ b/arch/i386/kernel/msr.c
@@ -46,23 +46,13 @@
 
 static struct class *msr_class;
 
-/* Note: "err" is handled in a funny way below.  Otherwise one version
-   of gcc or another breaks. */
-
 static inline int wrmsr_eio(u32 reg, u32 eax, u32 edx)
 {
 	int err;
 
-	asm volatile ("1:	wrmsr\n"
-		      "2:\n"
-		      ".section .fixup,\"ax\"\n"
-		      "3:	movl %4,%0\n"
-		      "	jmp 2b\n"
-		      ".previous\n"
-		      ".section __ex_table,\"a\"\n"
-		      "	.align 4\n" "	.long 1b,3b\n" ".previous":"=&bDS" (err)
-		      :"a"(eax), "d"(edx), "c"(reg), "i"(-EIO), "0"(0));
-
+	err = wrmsr_safe(reg, eax, edx);
+	if (err)
+		err = -EIO;
 	return err;
 }
 
@@ -70,18 +60,9 @@ static inline int rdmsr_eio(u32 reg, u32 *eax, u32 *edx)
 {
 	int err;
 
-	asm volatile ("1:	rdmsr\n"
-		      "2:\n"
-		      ".section .fixup,\"ax\"\n"
-		      "3:	movl %4,%0\n"
-		      "	jmp 2b\n"
-		      ".previous\n"
-		      ".section __ex_table,\"a\"\n"
-		      "	.align 4\n"
-		      "	.long 1b,3b\n"
-		      ".previous":"=&bDS" (err), "=a"(*eax), "=d"(*edx)
-		      :"c"(reg), "i"(-EIO), "0"(0));
-
+	err = rdmsr_safe(reg, eax, edx);
+	if (err)
+		err = -EIO;
 	return err;
 }
 
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 9d94995e9672..660997800393 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -164,7 +164,7 @@ static inline void play_dead(void)
 	 */
 	local_irq_disable();
 	while (1)
-		__asm__ __volatile__("hlt":::"memory");
+		halt();
 }
 #else
 static inline void play_dead(void)
diff --git a/arch/i386/mach-voyager/voyager_basic.c b/arch/i386/mach-voyager/voyager_basic.c
index c6384061328a..cc69875d979b 100644
--- a/arch/i386/mach-voyager/voyager_basic.c
+++ b/arch/i386/mach-voyager/voyager_basic.c
@@ -234,10 +234,9 @@ voyager_power_off(void)
 #endif
 	}
 	/* and wait for it to happen */
-	for(;;) {
-		__asm("cli");
-		__asm("hlt");
-	}
+	local_irq_disable();
+	for(;;)
+		halt();
 }
 
 /* copied from process.c */
@@ -278,10 +277,9 @@ machine_restart(char *cmd)
 		outb(basebd | 0x08, VOYAGER_MC_SETUP);
 		outb(0x02, catbase + 0x21);
 	}
-	for(;;) {
-		asm("cli");
-		asm("hlt");
-	}
+	local_irq_disable();
+	for(;;)
+		halt();
 }
 
 void
diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c
index 0e1f4208b07c..16790b798613 100644
--- a/arch/i386/mach-voyager/voyager_smp.c
+++ b/arch/i386/mach-voyager/voyager_smp.c
@@ -1015,7 +1015,7 @@ smp_stop_cpu_function(void *dummy)
 	cpu_clear(smp_processor_id(), cpu_online_map);
 	local_irq_disable();
 	for(;;)
-	       __asm__("hlt");
+		halt();
 }
 
 static DEFINE_SPINLOCK(call_lock);
diff --git a/include/asm-i386/msr.h b/include/asm-i386/msr.h
index c76fce8badbb..62b76cd96957 100644
--- a/include/asm-i386/msr.h
+++ b/include/asm-i386/msr.h
@@ -47,6 +47,21 @@ static inline void wrmsrl (unsigned long msr, unsigned long long val)
 		     : "c" (msr), "0" (a), "d" (b), "i" (-EFAULT));\
 	ret__; })
 
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr,a,b) ({ int ret__;						\
+	asm volatile("2: rdmsr ; xorl %0,%0\n"						\
+		     "1:\n\t"								\
+		     ".section .fixup,\"ax\"\n\t"					\
+		     "3:  movl %4,%0 ; jmp 1b\n\t"					\
+		     ".previous\n\t"							\
+ 		     ".section __ex_table,\"a\"\n"					\
+		     "   .align 4\n\t"							\
+		     "   .long 	2b,3b\n\t"						\
+		     ".previous"							\
+		     : "=r" (ret__), "=a" (*(a)), "=d" (*(b))				\
+		     : "c" (msr), "i" (-EFAULT));\
+	ret__; })
+
 #define rdtsc(low,high) \
      __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
 
-- 
cgit v1.2.3


From 0998e4228aca046fbd747c3fed909791d52e88eb Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Sat, 3 Sep 2005 15:56:43 -0700
Subject: [PATCH] x86: privilege cleanup

Privilege checking cleanup.  Originally, these diffs were much greater, but
recent cleanups in Linux have already done much of the cleanup.  I added
some explanatory comments in places where the reasoning behind certain
tests is rather subtle.

Also, in traps.c, we can skip the user_mode check in handle_BUG().  The
reason is, there are only two call chains - one via die_if_kernel() and one
via do_page_fault(), both entering from die().  Both of these paths already
ensure that a kernel mode failure has happened.  Also, the original check
here, if (user_mode(regs)) was insufficient anyways, since it would not
rule out BUG faults from V8086 mode execution.

Saving the %ss segment in show_regs() rather than assuming a fixed value
also gives better information about the current kernel state in the
register dump.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/signal.c | 4 +++-
 arch/i386/kernel/traps.c  | 5 +----
 include/asm-i386/ptrace.h | 7 +++++++
 3 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
index 7bcda6d69d87..61eb0c8a6e47 100644
--- a/arch/i386/kernel/signal.c
+++ b/arch/i386/kernel/signal.c
@@ -604,7 +604,9 @@ int fastcall do_signal(struct pt_regs *regs, sigset_t *oldset)
 	 * We want the common case to go fast, which
 	 * is why we may in certain cases get here from
 	 * kernel mode. Just return without doing anything
-	 * if so.
+ 	 * if so.  vm86 regs switched out by assembly code
+ 	 * before reaching here, so testing against kernel
+ 	 * CS suffices.
 	 */
 	if (!user_mode(regs))
 		return 1;
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 1144df0c48d6..b2b4bb890bb7 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -210,7 +210,7 @@ void show_registers(struct pt_regs *regs)
 	unsigned short ss;
 
 	esp = (unsigned long) (&regs->esp);
-	ss = __KERNEL_DS;
+	savesegment(ss, ss);
 	if (user_mode(regs)) {
 		in_kernel = 0;
 		esp = regs->esp;
@@ -267,9 +267,6 @@ static void handle_BUG(struct pt_regs *regs)
 	char c;
 	unsigned long eip;
 
-	if (user_mode(regs))
-		goto no_bug;		/* Not in kernel */
-
 	eip = regs->eip;
 
 	if (eip < PAGE_OFFSET)
diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h
index 05532875e39e..7e0f2945d17d 100644
--- a/include/asm-i386/ptrace.h
+++ b/include/asm-i386/ptrace.h
@@ -61,6 +61,13 @@ struct pt_regs {
 struct task_struct;
 extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code);
 
+/*
+ * user_mode_vm(regs) determines whether a register set came from user mode.
+ * This is true if V8086 mode was enabled OR if the register set was from
+ * protected mode with RPL-3 CS value.  This tricky test checks that with
+ * one comparison.  Many places in the kernel can bypass this full check
+ * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ */
 static inline int user_mode(struct pt_regs *regs)
 {
 	return (regs->xcs & 3) != 0;
-- 
cgit v1.2.3


From a5201129307f414890f9a4410e38da205f5d7359 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Sat, 3 Sep 2005 15:56:44 -0700
Subject: [PATCH] x86: make IOPL explicit

The pushf/popf in switch_to are ONLY used to switch IOPL.  Making this
explicit in C code is more clear.  This pushf/popf pair was added as a
bugfix for leaking IOPL to unprivileged processes when using
sysenter/sysexit based system calls (sysexit does not restore flags).

When requesting an IOPL change in sys_iopl(), it is just as easy to change
the current flags and the flags in the stack image (in case an IRET is
required), but there is no reason to force an IRET if we came in from the
SYSENTER path.

This change is the minimal solution for supporting a paravirtualized Linux
kernel that allows user processes to run with I/O privilege.  Other
solutions require radical rewrites of part of the low level fault / system
call handling code, or do not fully support sysenter based system calls.

Unfortunately, this added one field to the thread_struct.  But as a bonus,
on P4, the fastest time measured for switch_to() went from 312 to 260
cycles, a win of about 17% in the fast case through this performance
critical path.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/ioport.c    |  7 ++++---
 arch/i386/kernel/process.c   |  6 ++++++
 include/asm-i386/processor.h | 16 ++++++++++++++++
 include/asm-i386/system.h    |  4 +---
 4 files changed, 27 insertions(+), 6 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/ioport.c b/arch/i386/kernel/ioport.c
index 8b25160393c1..f2b37654777f 100644
--- a/arch/i386/kernel/ioport.c
+++ b/arch/i386/kernel/ioport.c
@@ -132,6 +132,7 @@ asmlinkage long sys_iopl(unsigned long unused)
 	volatile struct pt_regs * regs = (struct pt_regs *) &unused;
 	unsigned int level = regs->ebx;
 	unsigned int old = (regs->eflags >> 12) & 3;
+	struct thread_struct *t = &current->thread;
 
 	if (level > 3)
 		return -EINVAL;
@@ -140,8 +141,8 @@ asmlinkage long sys_iopl(unsigned long unused)
 		if (!capable(CAP_SYS_RAWIO))
 			return -EPERM;
 	}
-	regs->eflags = (regs->eflags &~ 0x3000UL) | (level << 12);
-	/* Make sure we return the long way (not sysenter) */
-	set_thread_flag(TIF_IRET);
+	t->iopl = level << 12;
+	regs->eflags = (regs->eflags & ~X86_EFLAGS_IOPL) | t->iopl;
+	set_iopl_mask(t->iopl);
 	return 0;
 }
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 660997800393..b45cbf93d439 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -711,6 +711,12 @@ struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct tas
 	if (prev->gs | next->gs)
 		loadsegment(gs, next->gs);
 
+	/*
+	 * Restore IOPL if needed.
+	 */
+	if (unlikely(prev->iopl != next->iopl))
+		set_iopl_mask(next->iopl);
+
 	/*
 	 * Now maybe reload the debug registers
 	 */
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index 992224bff549..37bef8ed7bed 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -455,6 +455,7 @@ struct thread_struct {
 	unsigned int		saved_fs, saved_gs;
 /* IO permissions */
 	unsigned long	*io_bitmap_ptr;
+ 	unsigned long	iopl;
 /* max allowed port in the bitmap, in bytes: */
 	unsigned long	io_bitmap_max;
 };
@@ -511,6 +512,21 @@ static inline void load_esp0(struct tss_struct *tss, struct thread_struct *threa
 			: /* no output */			\
 			:"r" (value))
 
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static inline void set_iopl_mask(unsigned mask)
+{
+	unsigned int reg;
+	__asm__ __volatile__ ("pushfl;"
+			      "popl %0;"
+			      "andl %1, %0;"
+			      "orl %2, %0;"
+			      "pushl %0;"
+			      "popfl"
+				: "=&r" (reg)
+				: "i" (~X86_EFLAGS_IOPL), "r" (mask));
+}
 
 /* Forward declaration, a strange C thing */
 struct task_struct;
diff --git a/include/asm-i386/system.h b/include/asm-i386/system.h
index 37fd2f8c7196..acd5c26b69ba 100644
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -14,8 +14,7 @@ extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struc
 
 #define switch_to(prev,next,last) do {					\
 	unsigned long esi,edi;						\
-	asm volatile("pushfl\n\t"					\
-		     "pushl %%ebp\n\t"					\
+	asm volatile("pushl %%ebp\n\t"					\
 		     "movl %%esp,%0\n\t"	/* save ESP */		\
 		     "movl %5,%%esp\n\t"	/* restore ESP */	\
 		     "movl $1f,%1\n\t"		/* save EIP */		\
@@ -23,7 +22,6 @@ extern struct task_struct * FASTCALL(__switch_to(struct task_struct *prev, struc
 		     "jmp __switch_to\n"				\
 		     "1:\t"						\
 		     "popl %%ebp\n\t"					\
-		     "popfl"						\
 		     :"=m" (prev->thread.esp),"=m" (prev->thread.eip),	\
 		      "=a" (last),"=S" (esi),"=D" (edi)			\
 		     :"m" (next->thread.esp),"m" (next->thread.eip),	\
-- 
cgit v1.2.3


From f2f30ebca6c0c95e987cb9a1fd1495770a75432e Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Sat, 3 Sep 2005 15:56:47 -0700
Subject: [PATCH] x86: introduce a write acessor for updating the current LDT

Introduce a write acessor for updating the current LDT.  This is required
for hypervisors like Xen that do not allow LDT pages to be directly
written.

Testing - here's a fun little LDT test that can be trivially modified to
test limits as well.

/*
 * Copyright (c) 2005, Zachary Amsden (zach@vmware.com)
 * This is licensed under the GPL.
 */

#include <stdio.h>
#include <signal.h>
#include <asm/ldt.h>
#include <asm/segment.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/mman.h>
#define __KERNEL__
#include <asm/page.h>

void main(void)
{
        struct user_desc desc;
        char *code;
        unsigned long long tsc;

        code = (char *)mmap(0, 8192, PROT_EXEC|PROT_READ|PROT_WRITE,
                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        desc.entry_number = 0;
        desc.base_addr = code;
        desc.limit = 1;
        desc.seg_32bit = 1;
        desc.contents = MODIFY_LDT_CONTENTS_CODE;
        desc.read_exec_only = 0;
        desc.limit_in_pages = 1;
        desc.seg_not_present = 0;
        desc.useable = 1;
        if (modify_ldt(1, &desc, sizeof(desc)) != 0) {
                perror("modify_ldt");
        }
        printf("code base is 0x%08x\n", (unsigned)code);
        code[0x0ffe] = 0x0f;  /* rdtsc */
        code[0x0fff] = 0x31;
        code[0x1000] = 0xcb;  /* lret */
        __asm__ __volatile("lcall $7,$0xffe" : "=A" (tsc));
        printf("TSC is 0x%016llx\n", tsc);
}

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/ldt.c  | 7 ++-----
 include/asm-i386/desc.h | 7 +++++++
 2 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/ldt.c b/arch/i386/kernel/ldt.c
index bb50afbee921..fe1ffa55587d 100644
--- a/arch/i386/kernel/ldt.c
+++ b/arch/i386/kernel/ldt.c
@@ -177,7 +177,7 @@ static int read_default_ldt(void __user * ptr, unsigned long bytecount)
 static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
 {
 	struct mm_struct * mm = current->mm;
-	__u32 entry_1, entry_2, *lp;
+	__u32 entry_1, entry_2;
 	int error;
 	struct user_desc ldt_info;
 
@@ -205,8 +205,6 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
 			goto out_unlock;
 	}
 
-	lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);
-
    	/* Allow LDTs to be cleared by the user. */
    	if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
 		if (oldmode || LDT_empty(&ldt_info)) {
@@ -223,8 +221,7 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
 
 	/* Install the new entry ...  */
 install:
-	*lp	= entry_1;
-	*(lp+1)	= entry_2;
+	write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2);
 	error = 0;
 
 out_unlock:
diff --git a/include/asm-i386/desc.h b/include/asm-i386/desc.h
index ccb11c032839..6df1a53c190e 100644
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -96,6 +96,13 @@ static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
 	(info)->seg_not_present	== 1	&& \
 	(info)->useable		== 0	)
 
+static inline void write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
+{
+	__u32 *lp = (__u32 *)((char *)ldt + entry*8);
+	*lp = entry_a;
+	*(lp+1) = entry_b;
+}
+
 #if TLS_SIZE != 24
 # error update this code.
 #endif
-- 
cgit v1.2.3


From 748f2edb52712aa3d926470a888608dc500d17e8 Mon Sep 17 00:00:00 2001
From: George Anzinger <george@mvista.com>
Date: Sat, 3 Sep 2005 15:56:48 -0700
Subject: [PATCH] x86 NMI: better support for debuggers

This patch adds a notify to the die_nmi notify that the system is about to
be taken down.  If the notify is handled with a NOTIFY_STOP return, the
system is given a new lease on life.

We also change the nmi watchdog to carry on if die_nmi returns.

This give debug code a chance to a) catch watchdog timeouts and b) possibly
allow the system to continue, realizing that the time out may be due to
debugger activities such as single stepping which is usually done with
"other" cpus held.

Signed-off-by: George Anzinger<george@mvista.com>
Cc: Keith Owens <kaos@ocs.com.au>
Signed-off-by: George Anzinger <george@mvista.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/nmi.c    |  5 ++++-
 arch/i386/kernel/traps.c  |  4 ++++
 include/asm-i386/kdebug.h | 11 +++++++++--
 3 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 8c242bb1ef45..8bbdbda07a2d 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -501,8 +501,11 @@ void nmi_watchdog_tick (struct pt_regs * regs)
 		 */
 		alert_counter[cpu]++;
 		if (alert_counter[cpu] == 5*nmi_hz)
+			/*
+			 * die_nmi will return ONLY if NOTIFY_STOP happens..
+			 */
 			die_nmi(regs, "NMI Watchdog detected LOCKUP");
-	} else {
+
 		last_irq_sums[cpu] = sum;
 		alert_counter[cpu] = 0;
 	}
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index b2b4bb890bb7..54629bb5893a 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -565,6 +565,10 @@ static DEFINE_SPINLOCK(nmi_print_lock);
 
 void die_nmi (struct pt_regs *regs, const char *msg)
 {
+	if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 0, SIGINT) ==
+	    NOTIFY_STOP)
+		return;
+
 	spin_lock(&nmi_print_lock);
 	/*
 	* We are in trouble anyway, lets at least try
diff --git a/include/asm-i386/kdebug.h b/include/asm-i386/kdebug.h
index b3f8d5f59d5d..316138e89910 100644
--- a/include/asm-i386/kdebug.h
+++ b/include/asm-i386/kdebug.h
@@ -41,9 +41,16 @@ enum die_val {
 	DIE_PAGE_FAULT,
 };
 
-static inline int notify_die(enum die_val val,char *str,struct pt_regs *regs,long err,int trap, int sig)
+static inline int notify_die(enum die_val val, const char *str,
+			struct pt_regs *regs, long err, int trap, int sig)
 {
-	struct die_args args = { .regs=regs, .str=str, .err=err, .trapnr=trap,.signr=sig };
+	struct die_args args = {
+		.regs = regs,
+		.str = str,
+		.err = err,
+		.trapnr = trap,
+		.signr = sig
+	};
 	return notifier_call_chain(&i386die_chain, val, &args);
 }
 
-- 
cgit v1.2.3


From d7271b14b2e9e5905aba0fbf5c4dc4f8980c0cb2 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zach@vmware.com>
Date: Sat, 3 Sep 2005 15:56:50 -0700
Subject: [PATCH] i386: encapsulate copying of pgd entries

Add a clone operation for pgd updates.

This helps complete the encapsulation of updates to page tables (or pages
about to become page tables) into accessor functions rather than using
memcpy() to duplicate them.  This is both generally good for consistency
and also necessary for running in a hypervisor which requires explicit
updates to page table entries.

The new function is:

clone_pgd_range(pgd_t *dst, pgd_t *src, int count);

   dst - pointer to pgd range anwhere on a pgd page
   src - ""
   count - the number of pgds to copy.

   dst and src can be on the same page, but the range must not overlap
   and must not cross a page boundary.

Note that I ommitted using this call to copy pgd entries into the
software suspend page root, since this is not technically a live paging
structure, rather it is used on resume from suspend.  CC'ing Pavel in case
he has any feedback on this.

Thanks to Chris Wright for noticing that this could be more optimal in
PAE compiles by eliminating the memset.

Signed-off-by: Zachary Amsden <zach@vmware.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/smpboot.c |  4 ++--
 arch/i386/mm/pgtable.c     | 10 +++++-----
 include/asm-i386/pgtable.h | 15 +++++++++++++++
 3 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 8ac8e9fd5614..8e950cdff1a0 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -1017,8 +1017,8 @@ int __devinit smp_prepare_cpu(int cpu)
 	tsc_sync_disabled = 1;
 
 	/* init low mem mapping */
-	memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
-			sizeof(swapper_pg_dir[0]) * KERNEL_PGD_PTRS);
+	clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
+			KERNEL_PGD_PTRS);
 	flush_tlb_all();
 	schedule_work(&task);
 	wait_for_completion(&done);
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index bd2f7afc7a2a..dcdce2c6c532 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -207,19 +207,19 @@ void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
 {
 	unsigned long flags;
 
-	if (PTRS_PER_PMD == 1)
+	if (PTRS_PER_PMD == 1) {
+		memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
 		spin_lock_irqsave(&pgd_lock, flags);
+	}
 
-	memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
+	clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
 			swapper_pg_dir + USER_PTRS_PER_PGD,
-			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
-
+			KERNEL_PGD_PTRS);
 	if (PTRS_PER_PMD > 1)
 		return;
 
 	pgd_list_add(pgd);
 	spin_unlock_irqrestore(&pgd_lock, flags);
-	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
 }
 
 /* never called when PTRS_PER_PMD > 1 */
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index d74185aee15b..47bc1ffa3d4c 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -277,6 +277,21 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr,
 	clear_bit(_PAGE_BIT_RW, &ptep->pte_low);
 }
 
+/*
+ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
+ *
+ *  dst - pointer to pgd range anwhere on a pgd page
+ *  src - ""
+ *  count - the number of pgds to copy.
+ *
+ * dst and src can be on the same page, but the range must not overlap,
+ * and must not cross a page boundary.
+ */
+static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+{
+       memcpy(dst, src, count * sizeof(pgd_t));
+}
+
 /*
  * Macro to mark a page protection value as "uncacheable".  On processors which do not support
  * it, this is a no-op.
-- 
cgit v1.2.3


From 4ad8d38342430f8b52f7a8458dce90caf8c8ca64 Mon Sep 17 00:00:00 2001
From: Zwane Mwaikambo <zwane@arm.linux.org.uk>
Date: Sat, 3 Sep 2005 15:56:51 -0700
Subject: [PATCH] i386 boottime for_each_cpu broken

for_each_cpu walks through all processors in cpu_possible_map, which is
defined as cpu_callout_map on i386 and isn't initialised until all
processors have been booted. This breaks things which do for_each_cpu
iterations early during boot. So, define cpu_possible_map as a bitmap with
NR_CPUS bits populated. This was triggered by a patch i'm working on which
does alloc_percpu before bringing up secondary processors.

From: Alexander Nyberg <alexn@telia.com>

i386-boottime-for_each_cpu-broken.patch
i386-boottime-for_each_cpu-broken-fix.patch

The SMP version of __alloc_percpu checks the cpu_possible_map before
allocating memory for a certain cpu.  With the above patches the BSP cpuid
is never set in cpu_possible_map which breaks CONFIG_SMP on uniprocessor
machines (as soon as someone tries to dereference something allocated via
__alloc_percpu, which in fact is never allocated since the cpu is not set
in cpu_possible_map).

Signed-off-by: Zwane Mwaikambo <zwane@arm.linux.org.uk>
Signed-off-by: Alexander Nyberg <alexn@telia.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/mpparse.c           | 8 +++++++-
 arch/i386/kernel/smpboot.c           | 3 +++
 arch/i386/mach-voyager/voyager_smp.c | 3 +++
 include/asm-i386/smp.h               | 2 +-
 4 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/mpparse.c b/arch/i386/kernel/mpparse.c
index 788efffa9930..5d0b9a8fc43d 100644
--- a/arch/i386/kernel/mpparse.c
+++ b/arch/i386/kernel/mpparse.c
@@ -122,7 +122,7 @@ static int MP_valid_apicid(int apicid, int version)
 
 static void __init MP_processor_info (struct mpc_config_processor *m)
 {
- 	int ver, apicid;
+ 	int ver, apicid, cpu, found_bsp = 0;
 	physid_mask_t tmp;
  	
 	if (!(m->mpc_cpuflag & CPU_ENABLED))
@@ -181,6 +181,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
 	if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
 		Dprintk("    Bootup CPU\n");
 		boot_cpu_physical_apicid = m->mpc_apicid;
+		found_bsp = 1;
 	}
 
 	if (num_processors >= NR_CPUS) {
@@ -204,6 +205,11 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
 		return;
 	}
 
+	if (found_bsp)
+		cpu = 0;
+	else
+		cpu = num_processors - 1;
+	cpu_set(cpu, cpu_possible_map);
 	tmp = apicid_to_cpu_present(apicid);
 	physids_or(phys_cpu_present_map, phys_cpu_present_map, tmp);
 	
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 8e950cdff1a0..5e4893d2b9f2 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -88,6 +88,8 @@ EXPORT_SYMBOL(cpu_online_map);
 cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
 EXPORT_SYMBOL(cpu_callout_map);
+cpumask_t cpu_possible_map;
+EXPORT_SYMBOL(cpu_possible_map);
 static cpumask_t smp_commenced_mask;
 
 /* TSC's upper 32 bits can't be written in eariler CPU (before prescott), there
@@ -1265,6 +1267,7 @@ void __devinit smp_prepare_boot_cpu(void)
 	cpu_set(smp_processor_id(), cpu_online_map);
 	cpu_set(smp_processor_id(), cpu_callout_map);
 	cpu_set(smp_processor_id(), cpu_present_map);
+	cpu_set(smp_processor_id(), cpu_possible_map);
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 }
 
diff --git a/arch/i386/mach-voyager/voyager_smp.c b/arch/i386/mach-voyager/voyager_smp.c
index 16790b798613..46b0cf4a31e0 100644
--- a/arch/i386/mach-voyager/voyager_smp.c
+++ b/arch/i386/mach-voyager/voyager_smp.c
@@ -242,6 +242,8 @@ static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
 cpumask_t cpu_callin_map = CPU_MASK_NONE;
 cpumask_t cpu_callout_map = CPU_MASK_NONE;
 EXPORT_SYMBOL(cpu_callout_map);
+cpumask_t cpu_possible_map = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_possible_map);
 
 /* The per processor IRQ masks (these are usually kept in sync) */
 static __u16 vic_irq_mask[NR_CPUS] __cacheline_aligned;
@@ -1910,6 +1912,7 @@ void __devinit smp_prepare_boot_cpu(void)
 {
 	cpu_set(smp_processor_id(), cpu_online_map);
 	cpu_set(smp_processor_id(), cpu_callout_map);
+	cpu_set(smp_processor_id(), cpu_possible_map);
 }
 
 int __devinit
diff --git a/include/asm-i386/smp.h b/include/asm-i386/smp.h
index a283738b80b3..13250199976d 100644
--- a/include/asm-i386/smp.h
+++ b/include/asm-i386/smp.h
@@ -59,7 +59,7 @@ extern void cpu_uninit(void);
 
 extern cpumask_t cpu_callout_map;
 extern cpumask_t cpu_callin_map;
-#define cpu_possible_map cpu_callout_map
+extern cpumask_t cpu_possible_map;
 
 /* We don't mark CPUs online until __cpu_up(), so we need another measure */
 static inline int num_booting_cpus(void)
-- 
cgit v1.2.3


From 52fdd08903a1d1162e184114837e232640191627 Mon Sep 17 00:00:00 2001
From: Benjamin LaHaise <bcrl@kvack.org>
Date: Sat, 3 Sep 2005 15:56:52 -0700
Subject: [PATCH] unify x86/x86-64 semaphore code

This patch moves the common code in x86 and x86-64's semaphore.c into a
single file in lib/semaphore-sleepers.c.  The arch specific asm stubs are
left in the arch tree (in semaphore.c for i386 and in the asm for x86-64).
There should be no changes in code/functionality with this patch.

Signed-off-by: Benjamin LaHaise <benjamin.c.lahaise@intel.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig              |   4 +
 arch/i386/kernel/semaphore.c   | 162 -------------------------------------
 arch/um/Kconfig_i386           |   4 +
 arch/um/Kconfig_x86_64         |   4 +
 arch/um/sys-x86_64/Makefile    |   5 +-
 arch/x86_64/Kconfig            |   4 +
 arch/x86_64/kernel/Makefile    |   2 +-
 arch/x86_64/kernel/semaphore.c | 180 -----------------------------------------
 lib/Makefile                   |   1 +
 lib/semaphore-sleepers.c       | 177 ++++++++++++++++++++++++++++++++++++++++
 10 files changed, 197 insertions(+), 346 deletions(-)
 delete mode 100644 arch/x86_64/kernel/semaphore.c
 create mode 100644 lib/semaphore-sleepers.c

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index dcb0ad098c60..3b3b017e1c15 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -14,6 +14,10 @@ config X86
 	  486, 586, Pentiums, and various instruction-set-compatible chips by
 	  AMD, Cyrix, and others.
 
+config SEMAPHORE_SLEEPERS
+	bool
+	default y
+
 config MMU
 	bool
 	default y
diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c
index 469f496e55c0..7455ab643943 100644
--- a/arch/i386/kernel/semaphore.c
+++ b/arch/i386/kernel/semaphore.c
@@ -13,170 +13,8 @@
  * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
  */
 #include <linux/config.h>
-#include <linux/sched.h>
-#include <linux/err.h>
-#include <linux/init.h>
 #include <asm/semaphore.h>
 
-/*
- * Semaphores are implemented using a two-way counter:
- * The "count" variable is decremented for each process
- * that tries to acquire the semaphore, while the "sleeping"
- * variable is a count of such acquires.
- *
- * Notably, the inline "up()" and "down()" functions can
- * efficiently test if they need to do any extra work (up
- * needs to do something only if count was negative before
- * the increment operation.
- *
- * "sleeping" and the contention routine ordering is protected
- * by the spinlock in the semaphore's waitqueue head.
- *
- * Note that these functions are only called when there is
- * contention on the lock, and as such all this is the
- * "non-critical" part of the whole semaphore business. The
- * critical part is the inline stuff in <asm/semaphore.h>
- * where we want to avoid any extra jumps and calls.
- */
-
-/*
- * Logic:
- *  - only on a boundary condition do we need to care. When we go
- *    from a negative count to a non-negative, we wake people up.
- *  - when we go from a non-negative count to a negative do we
- *    (a) synchronize with the "sleeper" count and (b) make sure
- *    that we're on the wakeup list before we synchronize so that
- *    we cannot lose wakeup events.
- */
-
-static fastcall void __attribute_used__  __up(struct semaphore *sem)
-{
-	wake_up(&sem->wait);
-}
-
-static fastcall void __attribute_used__ __sched __down(struct semaphore * sem)
-{
-	struct task_struct *tsk = current;
-	DECLARE_WAITQUEUE(wait, tsk);
-	unsigned long flags;
-
-	tsk->state = TASK_UNINTERRUPTIBLE;
-	spin_lock_irqsave(&sem->wait.lock, flags);
-	add_wait_queue_exclusive_locked(&sem->wait, &wait);
-
-	sem->sleepers++;
-	for (;;) {
-		int sleepers = sem->sleepers;
-
-		/*
-		 * Add "everybody else" into it. They aren't
-		 * playing, because we own the spinlock in
-		 * the wait_queue_head.
-		 */
-		if (!atomic_add_negative(sleepers - 1, &sem->count)) {
-			sem->sleepers = 0;
-			break;
-		}
-		sem->sleepers = 1;	/* us - see -1 above */
-		spin_unlock_irqrestore(&sem->wait.lock, flags);
-
-		schedule();
-
-		spin_lock_irqsave(&sem->wait.lock, flags);
-		tsk->state = TASK_UNINTERRUPTIBLE;
-	}
-	remove_wait_queue_locked(&sem->wait, &wait);
-	wake_up_locked(&sem->wait);
-	spin_unlock_irqrestore(&sem->wait.lock, flags);
-	tsk->state = TASK_RUNNING;
-}
-
-static fastcall int __attribute_used__ __sched __down_interruptible(struct semaphore * sem)
-{
-	int retval = 0;
-	struct task_struct *tsk = current;
-	DECLARE_WAITQUEUE(wait, tsk);
-	unsigned long flags;
-
-	tsk->state = TASK_INTERRUPTIBLE;
-	spin_lock_irqsave(&sem->wait.lock, flags);
-	add_wait_queue_exclusive_locked(&sem->wait, &wait);
-
-	sem->sleepers++;
-	for (;;) {
-		int sleepers = sem->sleepers;
-
-		/*
-		 * With signals pending, this turns into
-		 * the trylock failure case - we won't be
-		 * sleeping, and we* can't get the lock as
-		 * it has contention. Just correct the count
-		 * and exit.
-		 */
-		if (signal_pending(current)) {
-			retval = -EINTR;
-			sem->sleepers = 0;
-			atomic_add(sleepers, &sem->count);
-			break;
-		}
-
-		/*
-		 * Add "everybody else" into it. They aren't
-		 * playing, because we own the spinlock in
-		 * wait_queue_head. The "-1" is because we're
-		 * still hoping to get the semaphore.
-		 */
-		if (!atomic_add_negative(sleepers - 1, &sem->count)) {
-			sem->sleepers = 0;
-			break;
-		}
-		sem->sleepers = 1;	/* us - see -1 above */
-		spin_unlock_irqrestore(&sem->wait.lock, flags);
-
-		schedule();
-
-		spin_lock_irqsave(&sem->wait.lock, flags);
-		tsk->state = TASK_INTERRUPTIBLE;
-	}
-	remove_wait_queue_locked(&sem->wait, &wait);
-	wake_up_locked(&sem->wait);
-	spin_unlock_irqrestore(&sem->wait.lock, flags);
-
-	tsk->state = TASK_RUNNING;
-	return retval;
-}
-
-/*
- * Trylock failed - make sure we correct for
- * having decremented the count.
- *
- * We could have done the trylock with a
- * single "cmpxchg" without failure cases,
- * but then it wouldn't work on a 386.
- */
-static fastcall int __attribute_used__ __down_trylock(struct semaphore * sem)
-{
-	int sleepers;
-	unsigned long flags;
-
-	spin_lock_irqsave(&sem->wait.lock, flags);
-	sleepers = sem->sleepers + 1;
-	sem->sleepers = 0;
-
-	/*
-	 * Add "everybody else" and us into it. They aren't
-	 * playing, because we own the spinlock in the
-	 * wait_queue_head.
-	 */
-	if (!atomic_add_negative(sleepers, &sem->count)) {
-		wake_up_locked(&sem->wait);
-	}
-
-	spin_unlock_irqrestore(&sem->wait.lock, flags);
-	return 1;
-}
-
-
 /*
  * The semaphore operations have a special calling sequence that
  * allow us to do a simpler in-line version of them. These routines
diff --git a/arch/um/Kconfig_i386 b/arch/um/Kconfig_i386
index 27c18a8d9d17..8ad156a00499 100644
--- a/arch/um/Kconfig_i386
+++ b/arch/um/Kconfig_i386
@@ -6,6 +6,10 @@ config 64BIT
 	bool
 	default n
 
+config SEMAPHORE_SLEEPERS
+	bool
+	default y
+
 config TOP_ADDR
  	hex
  	default 0xc0000000 if !HOST_2G_2G
diff --git a/arch/um/Kconfig_x86_64 b/arch/um/Kconfig_x86_64
index 735a047c890c..6e5357c5bcfb 100644
--- a/arch/um/Kconfig_x86_64
+++ b/arch/um/Kconfig_x86_64
@@ -6,6 +6,10 @@ config 64BIT
 	bool
 	default y
 
+config SEMAPHORE_SLEEPERS
+	bool
+	default y
+
 config TOP_ADDR
  	hex
 	default 0x80000000
diff --git a/arch/um/sys-x86_64/Makefile b/arch/um/sys-x86_64/Makefile
index 7488206ce6f4..736d3edec581 100644
--- a/arch/um/sys-x86_64/Makefile
+++ b/arch/um/sys-x86_64/Makefile
@@ -6,7 +6,7 @@
 
 #XXX: why into lib-y?
 lib-y = bitops.o bugs.o csum-partial.o delay.o fault.o mem.o memcpy.o \
-	ptrace.o ptrace_user.o semaphore.o sigcontext.o signal.o stub.o \
+	ptrace.o ptrace_user.o sigcontext.o signal.o stub.o \
 	stub_segv.o syscalls.o syscall_table.o sysrq.o thunk.o
 
 obj-y := ksyms.o
@@ -15,7 +15,7 @@ obj-$(CONFIG_MODULES) += module.o um_module.o
 USER_OBJS := ptrace_user.o sigcontext.o
 
 SYMLINKS = bitops.c csum-copy.S csum-partial.c csum-wrappers.c memcpy.S \
-	semaphore.c thunk.S module.c
+	thunk.S module.c
 
 include arch/um/scripts/Makefile.rules
 
@@ -24,7 +24,6 @@ csum-copy.S-dir = lib
 csum-partial.c-dir = lib
 csum-wrappers.c-dir = lib
 memcpy.S-dir = lib
-semaphore.c-dir = kernel
 thunk.S-dir = lib
 module.c-dir = kernel
 
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 660a03a89e66..75e52c57f19c 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -24,6 +24,10 @@ config X86
 	bool
 	default y
 
+config SEMAPHORE_SLEEPERS
+	bool
+	default y
+
 config MMU
 	bool
 	default y
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index 48f9e2c19cd6..427501a336c4 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -4,7 +4,7 @@
 
 extra-y 	:= head.o head64.o init_task.o vmlinux.lds
 EXTRA_AFLAGS	:= -traditional
-obj-y	:= process.o semaphore.o signal.o entry.o traps.o irq.o \
+obj-y	:= process.o signal.o entry.o traps.o irq.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \
 		x8664_ksyms.o i387.o syscall.o vsyscall.o \
 		setup64.o bootflag.o e820.o reboot.o quirks.o
diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c
deleted file mode 100644
index 48f7c18172b9..000000000000
--- a/arch/x86_64/kernel/semaphore.c
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * x86_64 semaphore implementation.
- *
- * (C) Copyright 1999 Linus Torvalds
- *
- * Portions Copyright 1999 Red Hat, Inc.
- *
- *	This program is free software; you can redistribute it and/or
- *	modify it under the terms of the GNU General Public License
- *	as published by the Free Software Foundation; either version
- *	2 of the License, or (at your option) any later version.
- *
- * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
- */
-#include <linux/config.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <asm/errno.h>
-
-#include <asm/semaphore.h>
-
-/*
- * Semaphores are implemented using a two-way counter:
- * The "count" variable is decremented for each process
- * that tries to acquire the semaphore, while the "sleeping"
- * variable is a count of such acquires.
- *
- * Notably, the inline "up()" and "down()" functions can
- * efficiently test if they need to do any extra work (up
- * needs to do something only if count was negative before
- * the increment operation.
- *
- * "sleeping" and the contention routine ordering is protected
- * by the spinlock in the semaphore's waitqueue head.
- *
- * Note that these functions are only called when there is
- * contention on the lock, and as such all this is the
- * "non-critical" part of the whole semaphore business. The
- * critical part is the inline stuff in <asm/semaphore.h>
- * where we want to avoid any extra jumps and calls.
- */
-
-/*
- * Logic:
- *  - only on a boundary condition do we need to care. When we go
- *    from a negative count to a non-negative, we wake people up.
- *  - when we go from a non-negative count to a negative do we
- *    (a) synchronize with the "sleeper" count and (b) make sure
- *    that we're on the wakeup list before we synchronize so that
- *    we cannot lose wakeup events.
- */
-
-void __up(struct semaphore *sem)
-{
-	wake_up(&sem->wait);
-}
-
-void __sched __down(struct semaphore * sem)
-{
-	struct task_struct *tsk = current;
-	DECLARE_WAITQUEUE(wait, tsk);
-	unsigned long flags;
-
-	tsk->state = TASK_UNINTERRUPTIBLE;
-	spin_lock_irqsave(&sem->wait.lock, flags);
-	add_wait_queue_exclusive_locked(&sem->wait, &wait);
-
-	sem->sleepers++;
-	for (;;) {
-		int sleepers = sem->sleepers;
-
-		/*
-		 * Add "everybody else" into it. They aren't
-		 * playing, because we own the spinlock in
-		 * the wait_queue_head.
-		 */
-		if (!atomic_add_negative(sleepers - 1, &sem->count)) {
-			sem->sleepers = 0;
-			break;
-		}
-		sem->sleepers = 1;	/* us - see -1 above */
-		spin_unlock_irqrestore(&sem->wait.lock, flags);
-
-		schedule();
-
-		spin_lock_irqsave(&sem->wait.lock, flags);
-		tsk->state = TASK_UNINTERRUPTIBLE;
-	}
-	remove_wait_queue_locked(&sem->wait, &wait);
-	wake_up_locked(&sem->wait);
-	spin_unlock_irqrestore(&sem->wait.lock, flags);
-	tsk->state = TASK_RUNNING;
-}
-
-int __sched __down_interruptible(struct semaphore * sem)
-{
-	int retval = 0;
-	struct task_struct *tsk = current;
-	DECLARE_WAITQUEUE(wait, tsk);
-	unsigned long flags;
-
-	tsk->state = TASK_INTERRUPTIBLE;
-	spin_lock_irqsave(&sem->wait.lock, flags);
-	add_wait_queue_exclusive_locked(&sem->wait, &wait);
-
-	sem->sleepers++;
-	for (;;) {
-		int sleepers = sem->sleepers;
-
-		/*
-		 * With signals pending, this turns into
-		 * the trylock failure case - we won't be
-		 * sleeping, and we* can't get the lock as
-		 * it has contention. Just correct the count
-		 * and exit.
-		 */
-		if (signal_pending(current)) {
-			retval = -EINTR;
-			sem->sleepers = 0;
-			atomic_add(sleepers, &sem->count);
-			break;
-		}
-
-		/*
-		 * Add "everybody else" into it. They aren't
-		 * playing, because we own the spinlock in
-		 * wait_queue_head. The "-1" is because we're
-		 * still hoping to get the semaphore.
-		 */
-		if (!atomic_add_negative(sleepers - 1, &sem->count)) {
-			sem->sleepers = 0;
-			break;
-		}
-		sem->sleepers = 1;	/* us - see -1 above */
-		spin_unlock_irqrestore(&sem->wait.lock, flags);
-
-		schedule();
-
-		spin_lock_irqsave(&sem->wait.lock, flags);
-		tsk->state = TASK_INTERRUPTIBLE;
-	}
-	remove_wait_queue_locked(&sem->wait, &wait);
-	wake_up_locked(&sem->wait);
-	spin_unlock_irqrestore(&sem->wait.lock, flags);
-
-	tsk->state = TASK_RUNNING;
-	return retval;
-}
-
-/*
- * Trylock failed - make sure we correct for
- * having decremented the count.
- *
- * We could have done the trylock with a
- * single "cmpxchg" without failure cases,
- * but then it wouldn't work on a 386.
- */
-int __down_trylock(struct semaphore * sem)
-{
-	int sleepers;
-	unsigned long flags;
-
-	spin_lock_irqsave(&sem->wait.lock, flags);
-	sleepers = sem->sleepers + 1;
-	sem->sleepers = 0;
-
-	/*
-	 * Add "everybody else" and us into it. They aren't
-	 * playing, because we own the spinlock in the
-	 * wait_queue_head.
-	 */
-	if (!atomic_add_negative(sleepers, &sem->count)) {
-		wake_up_locked(&sem->wait);
-	}
-
-	spin_unlock_irqrestore(&sem->wait.lock, flags);
-	return 1;
-}
-
-
diff --git a/lib/Makefile b/lib/Makefile
index 52f83380f704..3e2bd0df23bb 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -18,6 +18,7 @@ endif
 
 lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
+lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
diff --git a/lib/semaphore-sleepers.c b/lib/semaphore-sleepers.c
new file mode 100644
index 000000000000..4d5f18889fa5
--- /dev/null
+++ b/lib/semaphore-sleepers.c
@@ -0,0 +1,177 @@
+/*
+ * i386 and x86-64 semaphore implementation.
+ *
+ * (C) Copyright 1999 Linus Torvalds
+ *
+ * Portions Copyright 1999 Red Hat, Inc.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
+ */
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <asm/semaphore.h>
+
+/*
+ * Semaphores are implemented using a two-way counter:
+ * The "count" variable is decremented for each process
+ * that tries to acquire the semaphore, while the "sleeping"
+ * variable is a count of such acquires.
+ *
+ * Notably, the inline "up()" and "down()" functions can
+ * efficiently test if they need to do any extra work (up
+ * needs to do something only if count was negative before
+ * the increment operation.
+ *
+ * "sleeping" and the contention routine ordering is protected
+ * by the spinlock in the semaphore's waitqueue head.
+ *
+ * Note that these functions are only called when there is
+ * contention on the lock, and as such all this is the
+ * "non-critical" part of the whole semaphore business. The
+ * critical part is the inline stuff in <asm/semaphore.h>
+ * where we want to avoid any extra jumps and calls.
+ */
+
+/*
+ * Logic:
+ *  - only on a boundary condition do we need to care. When we go
+ *    from a negative count to a non-negative, we wake people up.
+ *  - when we go from a non-negative count to a negative do we
+ *    (a) synchronize with the "sleeper" count and (b) make sure
+ *    that we're on the wakeup list before we synchronize so that
+ *    we cannot lose wakeup events.
+ */
+
+fastcall void __up(struct semaphore *sem)
+{
+	wake_up(&sem->wait);
+}
+
+fastcall void __sched __down(struct semaphore * sem)
+{
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+	unsigned long flags;
+
+	tsk->state = TASK_UNINTERRUPTIBLE;
+	spin_lock_irqsave(&sem->wait.lock, flags);
+	add_wait_queue_exclusive_locked(&sem->wait, &wait);
+
+	sem->sleepers++;
+	for (;;) {
+		int sleepers = sem->sleepers;
+
+		/*
+		 * Add "everybody else" into it. They aren't
+		 * playing, because we own the spinlock in
+		 * the wait_queue_head.
+		 */
+		if (!atomic_add_negative(sleepers - 1, &sem->count)) {
+			sem->sleepers = 0;
+			break;
+		}
+		sem->sleepers = 1;	/* us - see -1 above */
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+		schedule();
+
+		spin_lock_irqsave(&sem->wait.lock, flags);
+		tsk->state = TASK_UNINTERRUPTIBLE;
+	}
+	remove_wait_queue_locked(&sem->wait, &wait);
+	wake_up_locked(&sem->wait);
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+	tsk->state = TASK_RUNNING;
+}
+
+fastcall int __sched __down_interruptible(struct semaphore * sem)
+{
+	int retval = 0;
+	struct task_struct *tsk = current;
+	DECLARE_WAITQUEUE(wait, tsk);
+	unsigned long flags;
+
+	tsk->state = TASK_INTERRUPTIBLE;
+	spin_lock_irqsave(&sem->wait.lock, flags);
+	add_wait_queue_exclusive_locked(&sem->wait, &wait);
+
+	sem->sleepers++;
+	for (;;) {
+		int sleepers = sem->sleepers;
+
+		/*
+		 * With signals pending, this turns into
+		 * the trylock failure case - we won't be
+		 * sleeping, and we* can't get the lock as
+		 * it has contention. Just correct the count
+		 * and exit.
+		 */
+		if (signal_pending(current)) {
+			retval = -EINTR;
+			sem->sleepers = 0;
+			atomic_add(sleepers, &sem->count);
+			break;
+		}
+
+		/*
+		 * Add "everybody else" into it. They aren't
+		 * playing, because we own the spinlock in
+		 * wait_queue_head. The "-1" is because we're
+		 * still hoping to get the semaphore.
+		 */
+		if (!atomic_add_negative(sleepers - 1, &sem->count)) {
+			sem->sleepers = 0;
+			break;
+		}
+		sem->sleepers = 1;	/* us - see -1 above */
+		spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+		schedule();
+
+		spin_lock_irqsave(&sem->wait.lock, flags);
+		tsk->state = TASK_INTERRUPTIBLE;
+	}
+	remove_wait_queue_locked(&sem->wait, &wait);
+	wake_up_locked(&sem->wait);
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+	tsk->state = TASK_RUNNING;
+	return retval;
+}
+
+/*
+ * Trylock failed - make sure we correct for
+ * having decremented the count.
+ *
+ * We could have done the trylock with a
+ * single "cmpxchg" without failure cases,
+ * but then it wouldn't work on a 386.
+ */
+fastcall int __down_trylock(struct semaphore * sem)
+{
+	int sleepers;
+	unsigned long flags;
+
+	spin_lock_irqsave(&sem->wait.lock, flags);
+	sleepers = sem->sleepers + 1;
+	sem->sleepers = 0;
+
+	/*
+	 * Add "everybody else" and us into it. They aren't
+	 * playing, because we own the spinlock in the
+	 * wait_queue_head.
+	 */
+	if (!atomic_add_negative(sleepers, &sem->count)) {
+		wake_up_locked(&sem->wait);
+	}
+
+	spin_unlock_irqrestore(&sem->wait.lock, flags);
+	return 1;
+}
-- 
cgit v1.2.3


From 795312e763569ce4df67e7a0ca726a9901358fa2 Mon Sep 17 00:00:00 2001
From: Pierre Ossman <drzeus@drzeus.cx>
Date: Sat, 3 Sep 2005 15:56:54 -0700
Subject: [PATCH] ISA DMA suspend for i386

Reset the ISA DMA controller into a known state after a suspend.  Primary
concern was reenabling the cascading DMA channel (4).

Signed-off-by: Pierre Ossman <drzeus@drzeus.cx>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/Makefile |  2 +-
 arch/i386/kernel/i8237.c  | 67 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 arch/i386/kernel/i8237.c

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile
index 4cc83b322b36..64682a0edacf 100644
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -7,7 +7,7 @@ extra-y := head.o init_task.o vmlinux.lds
 obj-y	:= process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
 		pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
-		doublefault.o quirks.o
+		doublefault.o quirks.o i8237.o
 
 obj-y				+= cpu/
 obj-y				+= timers/
diff --git a/arch/i386/kernel/i8237.c b/arch/i386/kernel/i8237.c
new file mode 100644
index 000000000000..c36d1c006c2f
--- /dev/null
+++ b/arch/i386/kernel/i8237.c
@@ -0,0 +1,67 @@
+/*
+ * i8237.c: 8237A DMA controller suspend functions.
+ *
+ * Written by Pierre Ossman, 2005.
+ */
+
+#include <linux/init.h>
+#include <linux/sysdev.h>
+
+#include <asm/dma.h>
+
+/*
+ * This module just handles suspend/resume issues with the
+ * 8237A DMA controller (used for ISA and LPC).
+ * Allocation is handled in kernel/dma.c and normal usage is
+ * in asm/dma.h.
+ */
+
+static int i8237A_resume(struct sys_device *dev)
+{
+	unsigned long flags;
+	int i;
+
+	flags = claim_dma_lock();
+
+	dma_outb(DMA1_RESET_REG, 0);
+	dma_outb(DMA2_RESET_REG, 0);
+
+	for (i = 0;i < 8;i++) {
+		set_dma_addr(i, 0x000000);
+		/* DMA count is a bit weird so this is not 0 */
+		set_dma_count(i, 1);
+	}
+
+	/* Enable cascade DMA or channel 0-3 won't work */
+	enable_dma(4);
+
+	release_dma_lock(flags);
+
+	return 0;
+}
+
+static int i8237A_suspend(struct sys_device *dev, pm_message_t state)
+{
+	return 0;
+}
+
+static struct sysdev_class i8237_sysdev_class = {
+	set_kset_name("i8237"),
+	.suspend = i8237A_suspend,
+	.resume = i8237A_resume,
+};
+
+static struct sys_device device_i8237A = {
+	.id	= 0,
+	.cls	= &i8237_sysdev_class,
+};
+
+static int __init i8237A_init_sysfs(void)
+{
+	int error = sysdev_class_register(&i8237_sysdev_class);
+	if (!error)
+		error = sysdev_register(&device_i8237A);
+	return error;
+}
+
+device_initcall(i8237A_init_sysfs);
-- 
cgit v1.2.3


From 829ca9a30a2ddb727981d80fabdbff2ea86bc9ea Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Sat, 3 Sep 2005 15:56:56 -0700
Subject: [PATCH] swsusp: fix remaining u32 vs. pm_message_t confusion

Fix remaining bits of u32 vs.  pm_message confusion.  Should not break
anything.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/cpu/mtrr/main.c              |  2 +-
 arch/ppc/platforms/pmac_pic.c                 |  2 +-
 arch/ppc/syslib/open_pic.c                    |  2 +-
 arch/x86_64/kernel/nmi.c                      |  2 +-
 drivers/ide/ppc/pmac.c                        |  8 ++++----
 drivers/macintosh/mediabay.c                  |  2 +-
 drivers/media/video/bttv-driver.c             |  1 -
 drivers/media/video/msp3400.c                 |  4 ++--
 drivers/media/video/tda9887.c                 |  2 +-
 drivers/media/video/tuner-core.c              |  2 +-
 drivers/net/bnx2.c                            | 18 +++++++++---------
 drivers/net/e1000/e1000_main.c                | 14 ++++++--------
 drivers/net/irda/vlsi_ir.c                    | 11 +++--------
 drivers/net/phy/mdio_bus.c                    |  2 +-
 drivers/net/wireless/orinoco_pci.c            |  2 --
 drivers/net/wireless/prism54/islpci_hotplug.c |  2 --
 drivers/video/s1d13xxxfb.c                    |  2 +-
 drivers/video/savage/savagefb_driver.c        |  1 -
 sound/pci/atiixp.c                            |  4 ++--
 19 files changed, 35 insertions(+), 48 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
index 764cac64e211..dd4ebd6af7e4 100644
--- a/arch/i386/kernel/cpu/mtrr/main.c
+++ b/arch/i386/kernel/cpu/mtrr/main.c
@@ -561,7 +561,7 @@ struct mtrr_value {
 
 static struct mtrr_value * mtrr_state;
 
-static int mtrr_save(struct sys_device * sysdev, u32 state)
+static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
 {
 	int i;
 	int size = num_var_ranges * sizeof(struct mtrr_value);
diff --git a/arch/ppc/platforms/pmac_pic.c b/arch/ppc/platforms/pmac_pic.c
index 9f92e1bb7f34..2ce058895e03 100644
--- a/arch/ppc/platforms/pmac_pic.c
+++ b/arch/ppc/platforms/pmac_pic.c
@@ -619,7 +619,7 @@ not_found:
 	return viaint;
 }
 
-static int pmacpic_suspend(struct sys_device *sysdev, u32 state)
+static int pmacpic_suspend(struct sys_device *sysdev, pm_message_t state)
 {
 	int viaint = pmacpic_find_viaint();
 
diff --git a/arch/ppc/syslib/open_pic.c b/arch/ppc/syslib/open_pic.c
index ad39b86ca92c..53da58523e39 100644
--- a/arch/ppc/syslib/open_pic.c
+++ b/arch/ppc/syslib/open_pic.c
@@ -948,7 +948,7 @@ static void openpic_cached_disable_irq(u_int irq)
  * we need something better to deal with that... Maybe switch to S1 for
  * cpufreq changes
  */
-int openpic_suspend(struct sys_device *sysdev, u32 state)
+int openpic_suspend(struct sys_device *sysdev, pm_message_t state)
 {
 	int	i;
 	unsigned long flags;
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 4e44d6e6b7e5..64a8e05d5811 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -290,7 +290,7 @@ void enable_timer_nmi_watchdog(void)
 
 static int nmi_pm_active; /* nmi_active before suspend */
 
-static int lapic_nmi_suspend(struct sys_device *dev, u32 state)
+static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
 {
 	nmi_pm_active = nmi_active;
 	disable_lapic_nmi_watchdog();
diff --git a/drivers/ide/ppc/pmac.c b/drivers/ide/ppc/pmac.c
index ea65b070a367..d2760b8ca159 100644
--- a/drivers/ide/ppc/pmac.c
+++ b/drivers/ide/ppc/pmac.c
@@ -1504,7 +1504,7 @@ pmac_ide_macio_attach(struct macio_dev *mdev, const struct of_device_id *match)
 }
 
 static int
-pmac_ide_macio_suspend(struct macio_dev *mdev, u32 state)
+pmac_ide_macio_suspend(struct macio_dev *mdev, pm_message_t state)
 {
 	ide_hwif_t	*hwif = (ide_hwif_t *)dev_get_drvdata(&mdev->ofdev.dev);
 	int		rc = 0;
@@ -1527,7 +1527,7 @@ pmac_ide_macio_resume(struct macio_dev *mdev)
 	if (mdev->ofdev.dev.power.power_state != 0) {
 		rc = pmac_ide_do_resume(hwif);
 		if (rc == 0)
-			mdev->ofdev.dev.power.power_state = 0;
+			mdev->ofdev.dev.power.power_state = PMSG_ON;
 	}
 
 	return rc;
@@ -1608,7 +1608,7 @@ pmac_ide_pci_attach(struct pci_dev *pdev, const struct pci_device_id *id)
 }
 
 static int
-pmac_ide_pci_suspend(struct pci_dev *pdev, u32 state)
+pmac_ide_pci_suspend(struct pci_dev *pdev, pm_message_t state)
 {
 	ide_hwif_t	*hwif = (ide_hwif_t *)pci_get_drvdata(pdev);
 	int		rc = 0;
@@ -1631,7 +1631,7 @@ pmac_ide_pci_resume(struct pci_dev *pdev)
 	if (pdev->dev.power.power_state != 0) {
 		rc = pmac_ide_do_resume(hwif);
 		if (rc == 0)
-			pdev->dev.power.power_state = 0;
+			pdev->dev.power.power_state = PMSG_ON;
 	}
 
 	return rc;
diff --git a/drivers/macintosh/mediabay.c b/drivers/macintosh/mediabay.c
index 7c16c25fc5d4..7e1afca75e41 100644
--- a/drivers/macintosh/mediabay.c
+++ b/drivers/macintosh/mediabay.c
@@ -724,7 +724,7 @@ static int __pmac media_bay_resume(struct macio_dev *mdev)
 	struct media_bay_info	*bay = macio_get_drvdata(mdev);
 
 	if (mdev->ofdev.dev.power.power_state != 0) {
-		mdev->ofdev.dev.power.power_state = 0;
+		mdev->ofdev.dev.power.power_state = PMSG_ON;
 
 	       	/* We re-enable the bay using it's previous content
 	       	   only if it did not change. Note those bozo timings,
diff --git a/drivers/media/video/bttv-driver.c b/drivers/media/video/bttv-driver.c
index eee9322ce21b..087efb4dea09 100644
--- a/drivers/media/video/bttv-driver.c
+++ b/drivers/media/video/bttv-driver.c
@@ -4047,7 +4047,6 @@ static int bttv_suspend(struct pci_dev *pci_dev, pm_message_t state)
 	struct bttv_buffer_set idle;
 	unsigned long flags;
 
-	dprintk("bttv%d: suspend %d\n", btv->c.nr, state);
 
 	/* stop dma + irqs */
 	spin_lock_irqsave(&btv->s_lock,flags);
diff --git a/drivers/media/video/msp3400.c b/drivers/media/video/msp3400.c
index 62f1b8ddb98b..e956234abf24 100644
--- a/drivers/media/video/msp3400.c
+++ b/drivers/media/video/msp3400.c
@@ -1416,7 +1416,7 @@ static int msp_detach(struct i2c_client *client);
 static int msp_probe(struct i2c_adapter *adap);
 static int msp_command(struct i2c_client *client, unsigned int cmd, void *arg);
 
-static int msp_suspend(struct device * dev, u32 state, u32 level);
+static int msp_suspend(struct device * dev, pm_message_t state, u32 level);
 static int msp_resume(struct device * dev, u32 level);
 
 static void msp_wake_thread(struct i2c_client *client);
@@ -1817,7 +1817,7 @@ static int msp_command(struct i2c_client *client, unsigned int cmd, void *arg)
 	return 0;
 }
 
-static int msp_suspend(struct device * dev, u32 state, u32 level)
+static int msp_suspend(struct device * dev, pm_message_t state, u32 level)
 {
 	struct i2c_client *c = container_of(dev, struct i2c_client, dev);
 
diff --git a/drivers/media/video/tda9887.c b/drivers/media/video/tda9887.c
index 108c3ad7d622..a28a395d6dfe 100644
--- a/drivers/media/video/tda9887.c
+++ b/drivers/media/video/tda9887.c
@@ -760,7 +760,7 @@ tda9887_command(struct i2c_client *client, unsigned int cmd, void *arg)
 	return 0;
 }
 
-static int tda9887_suspend(struct device * dev, u32 state, u32 level)
+static int tda9887_suspend(struct device * dev, pm_message_t state, u32 level)
 {
 	dprintk("tda9887: suspend\n");
 	return 0;
diff --git a/drivers/media/video/tuner-core.c b/drivers/media/video/tuner-core.c
index f0a579827a24..a155e99a263b 100644
--- a/drivers/media/video/tuner-core.c
+++ b/drivers/media/video/tuner-core.c
@@ -672,7 +672,7 @@ static int tuner_command(struct i2c_client *client, unsigned int cmd, void *arg)
 	return 0;
 }
 
-static int tuner_suspend(struct device *dev, u32 state, u32 level)
+static int tuner_suspend(struct device *dev, pm_message_t state, u32 level)
 {
 	struct i2c_client *c = container_of (dev, struct i2c_client, dev);
 	struct tuner *t = i2c_get_clientdata (c);
diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c
index 7babf6af4e28..55a72c7ad001 100644
--- a/drivers/net/bnx2.c
+++ b/drivers/net/bnx2.c
@@ -2004,14 +2004,14 @@ bnx2_init_cpus(struct bnx2 *bp)
 }
 
 static int
-bnx2_set_power_state(struct bnx2 *bp, int state)
+bnx2_set_power_state(struct bnx2 *bp, pci_power_t state)
 {
 	u16 pmcsr;
 
 	pci_read_config_word(bp->pdev, bp->pm_cap + PCI_PM_CTRL, &pmcsr);
 
 	switch (state) {
-	case 0: {
+	case PCI_D0: {
 		u32 val;
 
 		pci_write_config_word(bp->pdev, bp->pm_cap + PCI_PM_CTRL,
@@ -2032,7 +2032,7 @@ bnx2_set_power_state(struct bnx2 *bp, int state)
 		REG_WR(bp, BNX2_RPM_CONFIG, val);
 		break;
 	}
-	case 3: {
+	case PCI_D3hot: {
 		int i;
 		u32 val, wol_msg;
 
@@ -3886,7 +3886,7 @@ bnx2_open(struct net_device *dev)
 	struct bnx2 *bp = dev->priv;
 	int rc;
 
-	bnx2_set_power_state(bp, 0);
+	bnx2_set_power_state(bp, PCI_D0);
 	bnx2_disable_int(bp);
 
 	rc = bnx2_alloc_mem(bp);
@@ -4197,7 +4197,7 @@ bnx2_close(struct net_device *dev)
 	bnx2_free_mem(bp);
 	bp->link_up = 0;
 	netif_carrier_off(bp->dev);
-	bnx2_set_power_state(bp, 3);
+	bnx2_set_power_state(bp, PCI_D3hot);
 	return 0;
 }
 
@@ -5203,7 +5203,7 @@ bnx2_init_board(struct pci_dev *pdev, struct net_device *dev)
 			       BNX2_PCICFG_MISC_CONFIG_REG_WINDOW_ENA |
 			       BNX2_PCICFG_MISC_CONFIG_TARGET_MB_WORD_SWAP);
 
-	bnx2_set_power_state(bp, 0);
+	bnx2_set_power_state(bp, PCI_D0);
 
 	bp->chip_id = REG_RD(bp, BNX2_MISC_ID);
 
@@ -5495,7 +5495,7 @@ bnx2_remove_one(struct pci_dev *pdev)
 }
 
 static int
-bnx2_suspend(struct pci_dev *pdev, u32 state)
+bnx2_suspend(struct pci_dev *pdev, pm_message_t state)
 {
 	struct net_device *dev = pci_get_drvdata(pdev);
 	struct bnx2 *bp = dev->priv;
@@ -5513,7 +5513,7 @@ bnx2_suspend(struct pci_dev *pdev, u32 state)
 		reset_code = BNX2_DRV_MSG_CODE_SUSPEND_NO_WOL;
 	bnx2_reset_chip(bp, reset_code);
 	bnx2_free_skbs(bp);
-	bnx2_set_power_state(bp, state);
+	bnx2_set_power_state(bp, pci_choose_state(pdev, state));
 	return 0;
 }
 
@@ -5526,7 +5526,7 @@ bnx2_resume(struct pci_dev *pdev)
 	if (!netif_running(dev))
 		return 0;
 
-	bnx2_set_power_state(bp, 0);
+	bnx2_set_power_state(bp, PCI_D0);
 	netif_device_attach(dev);
 	bnx2_init_nic(bp);
 	bnx2_netif_start(bp);
diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c
index 9b596e0bbf95..7c8a0a22dcd5 100644
--- a/drivers/net/e1000/e1000_main.c
+++ b/drivers/net/e1000/e1000_main.c
@@ -162,7 +162,7 @@ static void e1000_vlan_rx_add_vid(struct net_device *netdev, uint16_t vid);
 static void e1000_vlan_rx_kill_vid(struct net_device *netdev, uint16_t vid);
 static void e1000_restore_vlan(struct e1000_adapter *adapter);
 
-static int e1000_suspend(struct pci_dev *pdev, uint32_t state);
+static int e1000_suspend(struct pci_dev *pdev, pm_message_t state);
 #ifdef CONFIG_PM
 static int e1000_resume(struct pci_dev *pdev);
 #endif
@@ -3642,7 +3642,7 @@ e1000_set_spd_dplx(struct e1000_adapter *adapter, uint16_t spddplx)
 }
 
 static int
-e1000_suspend(struct pci_dev *pdev, uint32_t state)
+e1000_suspend(struct pci_dev *pdev, pm_message_t state)
 {
 	struct net_device *netdev = pci_get_drvdata(pdev);
 	struct e1000_adapter *adapter = netdev_priv(netdev);
@@ -3726,9 +3726,7 @@ e1000_suspend(struct pci_dev *pdev, uint32_t state)
 	}
 
 	pci_disable_device(pdev);
-
-	state = (state > 0) ? 3 : 0;
-	pci_set_power_state(pdev, state);
+	pci_set_power_state(pdev, pci_choose_state(pdev, state));
 
 	return 0;
 }
@@ -3741,13 +3739,13 @@ e1000_resume(struct pci_dev *pdev)
 	struct e1000_adapter *adapter = netdev_priv(netdev);
 	uint32_t manc, ret_val, swsm;
 
-	pci_set_power_state(pdev, 0);
+	pci_set_power_state(pdev, PCI_D0);
 	pci_restore_state(pdev);
 	ret_val = pci_enable_device(pdev);
 	pci_set_master(pdev);
 
-	pci_enable_wake(pdev, 3, 0);
-	pci_enable_wake(pdev, 4, 0); /* 4 == D3 cold */
+	pci_enable_wake(pdev, PCI_D3hot, 0);
+	pci_enable_wake(pdev, PCI_D3cold, 0);
 
 	e1000_reset(adapter);
 	E1000_WRITE_REG(&adapter->hw, WUS, ~0);
diff --git a/drivers/net/irda/vlsi_ir.c b/drivers/net/irda/vlsi_ir.c
index 006e4f575606..4be95398bac4 100644
--- a/drivers/net/irda/vlsi_ir.c
+++ b/drivers/net/irda/vlsi_ir.c
@@ -1749,11 +1749,6 @@ static int vlsi_irda_suspend(struct pci_dev *pdev, pm_message_t state)
 	struct net_device *ndev = pci_get_drvdata(pdev);
 	vlsi_irda_dev_t *idev;
 
-	if (state < 1 || state > 3 ) {
-		IRDA_ERROR("%s - %s: invalid pm state request: %u\n",
-			   __FUNCTION__, PCIDEV_NAME(pdev), state);
-		return 0;
-	}
 	if (!ndev) {
 		IRDA_ERROR("%s - %s: no netdevice \n",
 			   __FUNCTION__, PCIDEV_NAME(pdev));
@@ -1781,7 +1776,7 @@ static int vlsi_irda_suspend(struct pci_dev *pdev, pm_message_t state)
 			idev->new_baud = idev->baud;
 	}
 
-	pci_set_power_state(pdev,state);
+	pci_set_power_state(pdev, pci_choose_state(pdev, state));
 	pdev->current_state = state;
 	idev->resume_ok = 1;
 	up(&idev->sem);
@@ -1807,8 +1802,8 @@ static int vlsi_irda_resume(struct pci_dev *pdev)
 		return 0;
 	}
 	
-	pci_set_power_state(pdev, 0);
-	pdev->current_state = 0;
+	pci_set_power_state(pdev, PCI_D0);
+	pdev->current_state = PM_EVENT_ON;
 
 	if (!idev->resume_ok) {
 		/* should be obsolete now - but used to happen due to:
diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index 41f62c0c5fcb..5e81494e9a9a 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -128,7 +128,7 @@ static int mdio_bus_match(struct device *dev, struct device_driver *drv)
 /* Suspend and resume.  Copied from platform_suspend and
  * platform_resume
  */
-static int mdio_bus_suspend(struct device * dev, u32 state)
+static int mdio_bus_suspend(struct device * dev, pm_message_t state)
 {
 	int ret = 0;
 	struct device_driver *drv = dev->driver;
diff --git a/drivers/net/wireless/orinoco_pci.c b/drivers/net/wireless/orinoco_pci.c
index 7a6f52ea7faa..42e03438291b 100644
--- a/drivers/net/wireless/orinoco_pci.c
+++ b/drivers/net/wireless/orinoco_pci.c
@@ -301,8 +301,6 @@ static int orinoco_pci_suspend(struct pci_dev *pdev, pm_message_t state)
 	unsigned long flags;
 	int err;
 	
-	printk(KERN_DEBUG "%s: Orinoco-PCI entering sleep mode (state=%d)\n",
-	       dev->name, state);
 
 	err = orinoco_lock(priv, &flags);
 	if (err) {
diff --git a/drivers/net/wireless/prism54/islpci_hotplug.c b/drivers/net/wireless/prism54/islpci_hotplug.c
index c17391d947f3..dc040caab7d7 100644
--- a/drivers/net/wireless/prism54/islpci_hotplug.c
+++ b/drivers/net/wireless/prism54/islpci_hotplug.c
@@ -267,8 +267,6 @@ prism54_suspend(struct pci_dev *pdev, pm_message_t state)
 	islpci_private *priv = ndev ? netdev_priv(ndev) : NULL;
 	BUG_ON(!priv);
 
-	printk(KERN_NOTICE "%s: got suspend request (state %d)\n",
-	       ndev->name, state);
 
 	pci_save_state(pdev);
 
diff --git a/drivers/video/s1d13xxxfb.c b/drivers/video/s1d13xxxfb.c
index 3848be2b9d2d..fa98d91c42eb 100644
--- a/drivers/video/s1d13xxxfb.c
+++ b/drivers/video/s1d13xxxfb.c
@@ -655,7 +655,7 @@ bail:
 }
 
 #ifdef CONFIG_PM
-static int s1d13xxxfb_suspend(struct device *dev, u32 state, u32 level)
+static int s1d13xxxfb_suspend(struct device *dev, pm_message_t state, u32 level)
 {
 	struct fb_info *info = dev_get_drvdata(dev);
 	struct s1d13xxxfb_par *s1dfb = info->par;
diff --git a/drivers/video/savage/savagefb_driver.c b/drivers/video/savage/savagefb_driver.c
index f4633d1891f1..117ad42f120d 100644
--- a/drivers/video/savage/savagefb_driver.c
+++ b/drivers/video/savage/savagefb_driver.c
@@ -2110,7 +2110,6 @@ static int savagefb_suspend (struct pci_dev* dev, pm_message_t state)
 	struct savagefb_par *par = (struct savagefb_par *)info->par;
 
 	DBG("savagefb_suspend");
-	printk(KERN_DEBUG "state: %u\n", state);
 
 	acquire_console_sem();
 	fb_set_suspend(info, pci_choose_state(dev, state));
diff --git a/sound/pci/atiixp.c b/sound/pci/atiixp.c
index 904d17394e1c..188df085b7ee 100644
--- a/sound/pci/atiixp.c
+++ b/sound/pci/atiixp.c
@@ -1427,7 +1427,7 @@ static int snd_atiixp_suspend(snd_card_t *card, pm_message_t state)
 	snd_atiixp_aclink_down(chip);
 	snd_atiixp_chip_stop(chip);
 
-	pci_set_power_state(chip->pci, 3);
+	pci_set_power_state(chip->pci, PCI_D3hot);
 	pci_disable_device(chip->pci);
 	return 0;
 }
@@ -1438,7 +1438,7 @@ static int snd_atiixp_resume(snd_card_t *card)
 	int i;
 
 	pci_enable_device(chip->pci);
-	pci_set_power_state(chip->pci, 0);
+	pci_set_power_state(chip->pci, PCI_D0);
 	pci_set_master(chip->pci);
 
 	snd_atiixp_aclink_reset(chip);
-- 
cgit v1.2.3


From c3c433e4f33afe255389ba3b1a003dc8deb3de9a Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Sat, 3 Sep 2005 15:57:07 -0700
Subject: [PATCH] add suspend/resume for timer

The timers lack .suspend/.resume methods.  Because of this, jiffies got a
big compensation after a S3 resume.  And then softlockup watchdog reports
an oops.  This occured with HPET enabled, but it's also possible for other
timers.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/time.c              | 10 ++++++++++
 arch/i386/kernel/timers/timer_hpet.c | 14 ++++++++++++++
 arch/i386/kernel/timers/timer_pit.c  | 27 ---------------------------
 arch/i386/kernel/timers/timer_pm.c   |  9 +++++++++
 arch/i386/kernel/timers/timer_tsc.c  | 14 ++++++++++++++
 include/asm-i386/timer.h             |  3 +++
 6 files changed, 50 insertions(+), 27 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 0ee9dee8af06..6f794a78ee1e 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -383,6 +383,7 @@ void notify_arch_cmos_timer(void)
 
 static long clock_cmos_diff, sleep_start;
 
+static struct timer_opts *last_timer;
 static int timer_suspend(struct sys_device *dev, pm_message_t state)
 {
 	/*
@@ -391,6 +392,10 @@ static int timer_suspend(struct sys_device *dev, pm_message_t state)
 	clock_cmos_diff = -get_cmos_time();
 	clock_cmos_diff += get_seconds();
 	sleep_start = get_cmos_time();
+	last_timer = cur_timer;
+	cur_timer = &timer_none;
+	if (last_timer->suspend)
+		last_timer->suspend(state);
 	return 0;
 }
 
@@ -404,6 +409,7 @@ static int timer_resume(struct sys_device *dev)
 	if (is_hpet_enabled())
 		hpet_reenable();
 #endif
+	setup_pit_timer();
 	sec = get_cmos_time() + clock_cmos_diff;
 	sleep_length = (get_cmos_time() - sleep_start) * HZ;
 	write_seqlock_irqsave(&xtime_lock, flags);
@@ -412,6 +418,10 @@ static int timer_resume(struct sys_device *dev)
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 	jiffies += sleep_length;
 	wall_jiffies += sleep_length;
+	if (last_timer->resume)
+		last_timer->resume();
+	cur_timer = last_timer;
+	last_timer = NULL;
 	return 0;
 }
 
diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c
index cbb3f221ced8..001de97c9e4a 100644
--- a/arch/i386/kernel/timers/timer_hpet.c
+++ b/arch/i386/kernel/timers/timer_hpet.c
@@ -181,6 +181,19 @@ static int __init init_hpet(char* override)
 	return 0;
 }
 
+static int hpet_resume(void)
+{
+	write_seqlock(&monotonic_lock);
+	/* Assume this is the last mark offset time */
+	rdtsc(last_tsc_low, last_tsc_high);
+
+	if (hpet_use_timer)
+		hpet_last = hpet_readl(HPET_T0_CMP) - hpet_tick;
+	else
+		hpet_last = hpet_readl(HPET_COUNTER);
+	write_sequnlock(&monotonic_lock);
+	return 0;
+}
 /************************************************************/
 
 /* tsc timer_opts struct */
@@ -190,6 +203,7 @@ static struct timer_opts timer_hpet __read_mostly = {
 	.get_offset =		get_offset_hpet,
 	.monotonic_clock =	monotonic_clock_hpet,
 	.delay = 		delay_hpet,
+	.resume	=		hpet_resume,
 };
 
 struct init_timer_opts __initdata timer_hpet_init = {
diff --git a/arch/i386/kernel/timers/timer_pit.c b/arch/i386/kernel/timers/timer_pit.c
index 06de036a820c..eddb64038234 100644
--- a/arch/i386/kernel/timers/timer_pit.c
+++ b/arch/i386/kernel/timers/timer_pit.c
@@ -175,30 +175,3 @@ void setup_pit_timer(void)
 	outb(LATCH >> 8 , PIT_CH0);	/* MSB */
 	spin_unlock_irqrestore(&i8253_lock, flags);
 }
-
-static int timer_resume(struct sys_device *dev)
-{
-	setup_pit_timer();
-	return 0;
-}
-
-static struct sysdev_class timer_sysclass = {
-	set_kset_name("timer_pit"),
-	.resume	= timer_resume,
-};
-
-static struct sys_device device_timer = {
-	.id	= 0,
-	.cls	= &timer_sysclass,
-};
-
-static int __init init_timer_sysfs(void)
-{
-	int error = sysdev_class_register(&timer_sysclass);
-	if (!error)
-		error = sysdev_register(&device_timer);
-	return error;
-}
-
-device_initcall(init_timer_sysfs);
-
diff --git a/arch/i386/kernel/timers/timer_pm.c b/arch/i386/kernel/timers/timer_pm.c
index 4ef20e663498..264edaaac315 100644
--- a/arch/i386/kernel/timers/timer_pm.c
+++ b/arch/i386/kernel/timers/timer_pm.c
@@ -186,6 +186,14 @@ static void mark_offset_pmtmr(void)
 	}
 }
 
+static int pmtmr_resume(void)
+{
+	write_seqlock(&monotonic_lock);
+	/* Assume this is the last mark offset time */
+	offset_tick = read_pmtmr();
+	write_sequnlock(&monotonic_lock);
+	return 0;
+}
 
 static unsigned long long monotonic_clock_pmtmr(void)
 {
@@ -247,6 +255,7 @@ static struct timer_opts timer_pmtmr = {
 	.monotonic_clock 	= monotonic_clock_pmtmr,
 	.delay 			= delay_pmtmr,
 	.read_timer 		= read_timer_tsc,
+	.resume			= pmtmr_resume,
 };
 
 struct init_timer_opts __initdata timer_pmtmr_init = {
diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c
index 8f4e4d5bc560..6dd470cc9f72 100644
--- a/arch/i386/kernel/timers/timer_tsc.c
+++ b/arch/i386/kernel/timers/timer_tsc.c
@@ -543,6 +543,19 @@ static int __init init_tsc(char* override)
 	return -ENODEV;
 }
 
+static int tsc_resume(void)
+{
+	write_seqlock(&monotonic_lock);
+	/* Assume this is the last mark offset time */
+	rdtsc(last_tsc_low, last_tsc_high);
+#ifdef CONFIG_HPET_TIMER
+	if (is_hpet_enabled() && hpet_use_timer)
+		hpet_last = hpet_readl(HPET_COUNTER);
+#endif
+	write_sequnlock(&monotonic_lock);
+	return 0;
+}
+
 #ifndef CONFIG_X86_TSC
 /* disable flag for tsc.  Takes effect by clearing the TSC cpu flag
  * in cpu/common.c */
@@ -573,6 +586,7 @@ static struct timer_opts timer_tsc = {
 	.monotonic_clock = monotonic_clock_tsc,
 	.delay = delay_tsc,
 	.read_timer = read_timer_tsc,
+	.resume	= tsc_resume,
 };
 
 struct init_timer_opts __initdata timer_tsc_init = {
diff --git a/include/asm-i386/timer.h b/include/asm-i386/timer.h
index dcf1e07db08a..aed16437479d 100644
--- a/include/asm-i386/timer.h
+++ b/include/asm-i386/timer.h
@@ -1,6 +1,7 @@
 #ifndef _ASMi386_TIMER_H
 #define _ASMi386_TIMER_H
 #include <linux/init.h>
+#include <linux/pm.h>
 
 /**
  * struct timer_ops - used to define a timer source
@@ -23,6 +24,8 @@ struct timer_opts {
 	unsigned long long (*monotonic_clock)(void);
 	void (*delay)(unsigned long);
 	unsigned long (*read_timer)(void);
+	int (*suspend)(pm_message_t state);
+	int (*resume)(void);
 };
 
 struct init_timer_opts {
-- 
cgit v1.2.3


From 94c80b2598dbd2b8a6fe5f5c2c3af1beb37f66c7 Mon Sep 17 00:00:00 2001
From: Bodo Stroesser <bstroesser@fujitsu-siemens.com>
Date: Sat, 3 Sep 2005 15:57:13 -0700
Subject: [PATCH] Ptrace/i386: fix "syscall audit" interaction with singlestep

      Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>

Avoid giving two traps for singlestep instead of one, when syscall auditing is
enabled.

In fact no singlestep trap is sent on syscall entry, only on syscall exit, as
can be seen in entry.S:

# Note that in this mask _TIF_SINGLESTEP is not tested !!! <<<<<<<<<<<<<<
        testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp)
        jnz syscall_trace_entry
	...
syscall_trace_entry:
	...
	call do_syscall_trace

But auditing a SINGLESTEP'ed process causes do_syscall_trace to be called, so
the tracer will get one more trap on the syscall entry path, which it
shouldn't.

Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
CC: Roland McGrath <roland@redhat.com>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/ptrace.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 0da59b42843c..5ee9e1d60653 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -683,8 +683,19 @@ void do_syscall_trace(struct pt_regs *regs, int entryexit)
 	/* do the secure computing check first */
 	secure_computing(regs->orig_eax);
 
-	if (unlikely(current->audit_context) && entryexit)
-		audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax);
+	if (unlikely(current->audit_context)) {
+		if (entryexit)
+			audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax);
+
+		/* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
+		 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
+		 * not used, entry.S will call us only on syscall exit, not
+		 * entry ; so when TIF_SYSCALL_AUDIT is used we must avoid
+		 * calling send_sigtrap() on syscall entry.
+		 */
+		else if (is_singlestep)
+			goto out;
+	}
 
 	if (!(current->ptrace & PT_PTRACED))
 		goto out;
-- 
cgit v1.2.3


From ed75e8d58010fdc06e2c3a81bfbebae92314c7e3 Mon Sep 17 00:00:00 2001
From: Laurent Vivier <LaurentVivier@wanadoo.fr>
Date: Sat, 3 Sep 2005 15:57:18 -0700
Subject: [PATCH] UML Support - Ptrace: adds the host SYSEMU support, for UML
 and general usage

      Jeff Dike <jdike@addtoit.com>,
      Paolo 'Blaisorblade' Giarrusso <blaisorblade_spam@yahoo.it>,
      Bodo Stroesser <bstroesser@fujitsu-siemens.com>

Adds a new ptrace(2) mode, called PTRACE_SYSEMU, resembling PTRACE_SYSCALL
except that the kernel does not execute the requested syscall; this is useful
to improve performance for virtual environments, like UML, which want to run
the syscall on their own.

In fact, using PTRACE_SYSCALL means stopping child execution twice, on entry
and on exit, and each time you also have two context switches; with SYSEMU you
avoid the 2nd stop and so save two context switches per syscall.

Also, some architectures don't have support in the host for changing the
syscall number via ptrace(), which is currently needed to skip syscall
execution (UML turns any syscall into getpid() to avoid it being executed on
the host).  Fixing that is hard, while SYSEMU is easier to implement.

* This version of the patch includes some suggestions of Jeff Dike to avoid
  adding any instructions to the syscall fast path, plus some other little
  changes, by myself, to make it work even when the syscall is executed with
  SYSENTER (but I'm unsure about them). It has been widely tested for quite a
  lot of time.

* Various fixed were included to handle the various switches between
  various states, i.e. when for instance a syscall entry is traced with one of
  PT_SYSCALL / _SYSEMU / _SINGLESTEP and another one is used on exit.
  Basically, this is done by remembering which one of them was used even after
  the call to ptrace_notify().

* We're combining TIF_SYSCALL_EMU with TIF_SYSCALL_TRACE or TIF_SINGLESTEP
  to make do_syscall_trace() notice that the current syscall was started with
  SYSEMU on entry, so that no notification ought to be done in the exit path;
  this is a bit of a hack, so this problem is solved in another way in next
  patches.

* Also, the effects of the patch:
"Ptrace - i386: fix Syscall Audit interaction with singlestep"
are cancelled; they are restored back in the last patch of this series.

Detailed descriptions of the patches doing this kind of processing follow (but
I've already summed everything up).

* Fix behaviour when changing interception kind #1.

  In do_syscall_trace(), we check the status of the TIF_SYSCALL_EMU flag
  only after doing the debugger notification; but the debugger might have
  changed the status of this flag because he continued execution with
  PTRACE_SYSCALL, so this is wrong.  This patch fixes it by saving the flag
  status before calling ptrace_notify().

* Fix behaviour when changing interception kind #2:
  avoid intercepting syscall on return when using SYSCALL again.

  A guest process switching from using PTRACE_SYSEMU to PTRACE_SYSCALL
  crashes.

  The problem is in arch/i386/kernel/entry.S.  The current SYSEMU patch
  inhibits the syscall-handler to be called, but does not prevent
  do_syscall_trace() to be called after this for syscall completion
  interception.

  The appended patch fixes this.  It reuses the flag TIF_SYSCALL_EMU to
  remember "we come from PTRACE_SYSEMU and now are in PTRACE_SYSCALL", since
  the flag is unused in the depicted situation.

* Fix behaviour when changing interception kind #3:
  avoid intercepting syscall on return when using SINGLESTEP.

  When testing 2.6.9 and the skas3.v6 patch, with my latest patch and had
  problems with singlestepping on UML in SKAS with SYSEMU.  It looped
  receiving SIGTRAPs without moving forward.  EIP of the traced process was
  the same for all SIGTRAPs.

What's missing is to handle switching from PTRACE_SYSCALL_EMU to
PTRACE_SINGLESTEP in a way very similar to what is done for the change from
PTRACE_SYSCALL_EMU to PTRACE_SYSCALL_TRACE.

I.e., after calling ptrace(PTRACE_SYSEMU), on the return path, the debugger is
notified and then wake ups the process; the syscall is executed (or skipped,
when do_syscall_trace() returns 0, i.e.  when using PTRACE_SYSEMU), and
do_syscall_trace() is called again.  Since we are on the return path of a
SYSEMU'd syscall, if the wake up is performed through ptrace(PTRACE_SYSCALL),
we must still avoid notifying the parent of the syscall exit.  Now, this
behaviour is extended even to resuming with PTRACE_SINGLESTEP.

Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/entry.S       |  9 ++++---
 arch/i386/kernel/ptrace.c      | 57 +++++++++++++++++++++++++++++-------------
 include/asm-i386/thread_info.h |  5 +++-
 include/linux/ptrace.h         |  1 +
 kernel/fork.c                  |  3 +++
 5 files changed, 53 insertions(+), 22 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index a991d4e5edd2..b389e5f3bdee 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -203,7 +203,7 @@ sysenter_past_esp:
 	GET_THREAD_INFO(%ebp)
 
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-	testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp)
+	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
 	jnz syscall_trace_entry
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
@@ -226,9 +226,9 @@ ENTRY(system_call)
 	pushl %eax			# save orig_eax
 	SAVE_ALL
 	GET_THREAD_INFO(%ebp)
-					# system call tracing in operation
+					# system call tracing in operation / emulation
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
-	testw $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),TI_flags(%ebp)
+	testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
 	jnz syscall_trace_entry
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
@@ -338,6 +338,9 @@ syscall_trace_entry:
 	movl %esp, %eax
 	xorl %edx,%edx
 	call do_syscall_trace
+	cmpl $0, %eax
+	jne syscall_exit		# ret != 0 -> running under PTRACE_SYSEMU,
+					# so must skip actual syscall
 	movl ORIG_EAX(%esp), %eax
 	cmpl $(nr_syscalls), %eax
 	jnae syscall_call
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 5ee9e1d60653..5b569dc1c227 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -509,15 +509,27 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
 		  }
 		  break;
 
+	case PTRACE_SYSEMU: /* continue and stop at next syscall, which will not be executed */
 	case PTRACE_SYSCALL:	/* continue and stop at next (return from) syscall */
 	case PTRACE_CONT:	/* restart after signal. */
 		ret = -EIO;
 		if (!valid_signal(data))
 			break;
+		/* If we came here with PTRACE_SYSEMU and now continue with
+		 * PTRACE_SYSCALL, entry.S used to intercept the syscall return.
+		 * But it shouldn't!
+		 * So we don't clear TIF_SYSCALL_EMU, which is always unused in
+		 * this special case, to remember, we came from SYSEMU. That
+		 * flag will be cleared by do_syscall_trace().
+		 */
+		if (request == PTRACE_SYSEMU) {
+			set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+		} else if (request == PTRACE_CONT) {
+			clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+		}
 		if (request == PTRACE_SYSCALL) {
 			set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		}
-		else {
+		} else {
 			clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 		}
 		child->exit_code = data;
@@ -546,6 +558,8 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
 		ret = -EIO;
 		if (!valid_signal(data))
 			break;
+		/*See do_syscall_trace to know why we don't clear
+		 * TIF_SYSCALL_EMU.*/
 		clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 		set_singlestep(child);
 		child->exit_code = data;
@@ -678,37 +692,43 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
  * - triggered by current->work.syscall_trace
  */
 __attribute__((regparm(3)))
-void do_syscall_trace(struct pt_regs *regs, int entryexit)
+int do_syscall_trace(struct pt_regs *regs, int entryexit)
 {
+	int is_sysemu, is_systrace, is_singlestep, ret = 0;
 	/* do the secure computing check first */
 	secure_computing(regs->orig_eax);
 
-	if (unlikely(current->audit_context)) {
-		if (entryexit)
-			audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax);
-
-		/* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
-		 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
-		 * not used, entry.S will call us only on syscall exit, not
-		 * entry ; so when TIF_SYSCALL_AUDIT is used we must avoid
-		 * calling send_sigtrap() on syscall entry.
-		 */
-		else if (is_singlestep)
-			goto out;
-	}
+	if (unlikely(current->audit_context) && entryexit)
+		audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax);
 
 	if (!(current->ptrace & PT_PTRACED))
 		goto out;
 
+	is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
+	is_systrace = test_thread_flag(TIF_SYSCALL_TRACE);
+	is_singlestep = test_thread_flag(TIF_SINGLESTEP);
+
+	/* We can detect the case of coming from PTRACE_SYSEMU and now running
+	 * with PTRACE_SYSCALL or PTRACE_SINGLESTEP, by TIF_SYSCALL_EMU being
+	 * set additionally.
+	 * If so let's reset the flag and return without action (no singlestep
+	 * nor syscall tracing, since no actual step has been executed).
+	 */
+	if (is_sysemu && (is_systrace || is_singlestep)) {
+		clear_thread_flag(TIF_SYSCALL_EMU);
+		goto out;
+	}
+
 	/* Fake a debug trap */
 	if (test_thread_flag(TIF_SINGLESTEP))
 		send_sigtrap(current, regs, 0);
 
-	if (!test_thread_flag(TIF_SYSCALL_TRACE))
+	if (!is_systrace && !is_sysemu)
 		goto out;
 
 	/* the 0x80 provides a way for the tracing parent to distinguish
 	   between a syscall stop and SIGTRAP delivery */
+	/* Note that the debugger could change the result of test_thread_flag!*/
 	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80 : 0));
 
 	/*
@@ -720,9 +740,10 @@ void do_syscall_trace(struct pt_regs *regs, int entryexit)
 		send_sig(current->exit_code, current, 1);
 		current->exit_code = 0;
 	}
+	ret = is_sysemu;
  out:
 	if (unlikely(current->audit_context) && !entryexit)
 		audit_syscall_entry(current, AUDIT_ARCH_I386, regs->orig_eax,
 				    regs->ebx, regs->ecx, regs->edx, regs->esi);
-
+	return ret;
 }
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index 95add81237ea..e2cb9fa6f563 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -139,6 +139,7 @@ register unsigned long current_stack_pointer asm("esp") __attribute_used__;
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
 #define TIF_IRET		5	/* return with iret */
+#define TIF_SYSCALL_EMU		6	/* syscall emulation active */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
@@ -150,13 +151,15 @@ register unsigned long current_stack_pointer asm("esp") __attribute_used__;
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
 #define _TIF_IRET		(1<<TIF_IRET)
+#define _TIF_SYSCALL_EMU	(1<<TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 
 /* work to do on interrupt/exception return */
 #define _TIF_WORK_MASK \
-  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP|_TIF_SECCOMP))
+  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP|\
+		  _TIF_SECCOMP|_TIF_SYSCALL_EMU))
 /* work to do on any return to u-space */
 #define _TIF_ALLWORK_MASK	(0x0000FFFF & ~_TIF_SECCOMP)
 
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index a373fc254df2..7528afb6b2ad 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -20,6 +20,7 @@
 #define PTRACE_DETACH		0x11
 
 #define PTRACE_SYSCALL		  24
+#define PTRACE_SYSEMU		  31
 
 /* 0x4200-0x4300 are reserved for architecture-independent additions.  */
 #define PTRACE_SETOPTIONS	0x4200
diff --git a/kernel/fork.c b/kernel/fork.c
index b65187f0c74e..7e1ead9a6ba4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -994,6 +994,9 @@ static task_t *copy_process(unsigned long clone_flags,
 	 * of CLONE_PTRACE.
 	 */
 	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
+#ifdef TIF_SYSCALL_EMU
+	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
+#endif
 
 	/* Our parent execution domain becomes current domain
 	   These must match for thread signalling to apply */
-- 
cgit v1.2.3


From c8c86cecd1d1a2722acb28a01d1babf7b6993697 Mon Sep 17 00:00:00 2001
From: Bodo Stroesser <bstroesser@fujitsu-siemens.com>
Date: Sat, 3 Sep 2005 15:57:19 -0700
Subject: [PATCH] Uml support: reorganize PTRACE_SYSEMU support

With this patch, we change the way we handle switching from PTRACE_SYSEMU to
PTRACE_{SINGLESTEP,SYSCALL}, to free TIF_SYSCALL_EMU from double use as a
preparation for PTRACE_SYSEMU_SINGLESTEP extension, without changing the
behavior of the host kernel.

Signed-off-by: Bodo Stroesser <bstroesser@fujitsu-siemens.com>
Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/entry.S  |  8 +++++++-
 arch/i386/kernel/ptrace.c | 43 ++++++++++++++-----------------------------
 2 files changed, 21 insertions(+), 30 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index b389e5f3bdee..9a47723469c6 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -339,12 +339,18 @@ syscall_trace_entry:
 	xorl %edx,%edx
 	call do_syscall_trace
 	cmpl $0, %eax
-	jne syscall_exit		# ret != 0 -> running under PTRACE_SYSEMU,
+	jne syscall_skip		# ret != 0 -> running under PTRACE_SYSEMU,
 					# so must skip actual syscall
 	movl ORIG_EAX(%esp), %eax
 	cmpl $(nr_syscalls), %eax
 	jnae syscall_call
 	jmp syscall_exit
+syscall_skip:
+	cli				# make sure we don't miss an interrupt
+					# setting need_resched or sigpending
+					# between sampling and the iret
+	movl TI_flags(%ebp), %ecx
+	jmp work_pending
 
 	# perform syscall exit tracing
 	ALIGN
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 5b569dc1c227..18642f05dde1 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -515,21 +515,14 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
 		ret = -EIO;
 		if (!valid_signal(data))
 			break;
-		/* If we came here with PTRACE_SYSEMU and now continue with
-		 * PTRACE_SYSCALL, entry.S used to intercept the syscall return.
-		 * But it shouldn't!
-		 * So we don't clear TIF_SYSCALL_EMU, which is always unused in
-		 * this special case, to remember, we came from SYSEMU. That
-		 * flag will be cleared by do_syscall_trace().
-		 */
 		if (request == PTRACE_SYSEMU) {
 			set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-		} else if (request == PTRACE_CONT) {
-			clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
-		}
-		if (request == PTRACE_SYSCALL) {
+			clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+		} else if (request == PTRACE_SYSCALL) {
 			set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+			clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 		} else {
+			clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 			clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 		}
 		child->exit_code = data;
@@ -558,8 +551,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
 		ret = -EIO;
 		if (!valid_signal(data))
 			break;
-		/*See do_syscall_trace to know why we don't clear
-		 * TIF_SYSCALL_EMU.*/
+		clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 		clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 		set_singlestep(child);
 		child->exit_code = data;
@@ -694,7 +686,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
 __attribute__((regparm(3)))
 int do_syscall_trace(struct pt_regs *regs, int entryexit)
 {
-	int is_sysemu, is_systrace, is_singlestep, ret = 0;
+	int is_sysemu, is_singlestep, ret = 0;
 	/* do the secure computing check first */
 	secure_computing(regs->orig_eax);
 
@@ -705,25 +697,13 @@ int do_syscall_trace(struct pt_regs *regs, int entryexit)
 		goto out;
 
 	is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
-	is_systrace = test_thread_flag(TIF_SYSCALL_TRACE);
 	is_singlestep = test_thread_flag(TIF_SINGLESTEP);
 
-	/* We can detect the case of coming from PTRACE_SYSEMU and now running
-	 * with PTRACE_SYSCALL or PTRACE_SINGLESTEP, by TIF_SYSCALL_EMU being
-	 * set additionally.
-	 * If so let's reset the flag and return without action (no singlestep
-	 * nor syscall tracing, since no actual step has been executed).
-	 */
-	if (is_sysemu && (is_systrace || is_singlestep)) {
-		clear_thread_flag(TIF_SYSCALL_EMU);
-		goto out;
-	}
-
 	/* Fake a debug trap */
-	if (test_thread_flag(TIF_SINGLESTEP))
+	if (is_singlestep)
 		send_sigtrap(current, regs, 0);
 
-	if (!is_systrace && !is_sysemu)
+ 	if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
 		goto out;
 
 	/* the 0x80 provides a way for the tracing parent to distinguish
@@ -745,5 +725,10 @@ int do_syscall_trace(struct pt_regs *regs, int entryexit)
 	if (unlikely(current->audit_context) && !entryexit)
 		audit_syscall_entry(current, AUDIT_ARCH_I386, regs->orig_eax,
 				    regs->ebx, regs->ecx, regs->edx, regs->esi);
-	return ret;
+	if (ret == 0)
+		return 0;
+
+	if (unlikely(current->audit_context))
+		audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax);
+	return 1;
 }
-- 
cgit v1.2.3


From 1b38f0064e4e0b9ec626e39f0740b1cf2e295743 Mon Sep 17 00:00:00 2001
From: Bodo Stroesser <bstroesser@fujitsu-siemens.com>
Date: Sat, 3 Sep 2005 15:57:20 -0700
Subject: [PATCH] Uml support: add PTRACE_SYSEMU_SINGLESTEP option to i386

This patch implements the new ptrace option PTRACE_SYSEMU_SINGLESTEP, which
can be used by UML to singlestep a process: it will receive SINGLESTEP
interceptions for normal instructions and syscalls, but syscall execution will
be skipped just like with PTRACE_SYSEMU.

Signed-off-by: Bodo Stroesser <bstroesser@fujitsu-siemens.com>
Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/ptrace.c | 21 +++++++++++++++++----
 include/linux/ptrace.h    |  1 +
 2 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 18642f05dde1..3196ba50fcd5 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -547,11 +547,17 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
 		wake_up_process(child);
 		break;
 
+	case PTRACE_SYSEMU_SINGLESTEP: /* Same as SYSEMU, but singlestep if not syscall */
 	case PTRACE_SINGLESTEP:	/* set the trap flag. */
 		ret = -EIO;
 		if (!valid_signal(data))
 			break;
-		clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+
+		if (request == PTRACE_SYSEMU_SINGLESTEP)
+			set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+		else
+			clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
+
 		clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 		set_singlestep(child);
 		child->exit_code = data;
@@ -686,7 +692,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
 __attribute__((regparm(3)))
 int do_syscall_trace(struct pt_regs *regs, int entryexit)
 {
-	int is_sysemu, is_singlestep, ret = 0;
+	int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU), ret = 0;
+	/* With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP */
+	int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
+
 	/* do the secure computing check first */
 	secure_computing(regs->orig_eax);
 
@@ -696,8 +705,11 @@ int do_syscall_trace(struct pt_regs *regs, int entryexit)
 	if (!(current->ptrace & PT_PTRACED))
 		goto out;
 
-	is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
-	is_singlestep = test_thread_flag(TIF_SINGLESTEP);
+	/* If a process stops on the 1st tracepoint with SYSCALL_TRACE
+	 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
+	 * here. We have to check this and return */
+	if (is_sysemu && entryexit)
+		return 0;
 
 	/* Fake a debug trap */
 	if (is_singlestep)
@@ -728,6 +740,7 @@ int do_syscall_trace(struct pt_regs *regs, int entryexit)
 	if (ret == 0)
 		return 0;
 
+	regs->orig_eax = -1; /* force skip of syscall restarting */
 	if (unlikely(current->audit_context))
 		audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax);
 	return 1;
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 7528afb6b2ad..2afdafb62123 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -21,6 +21,7 @@
 
 #define PTRACE_SYSCALL		  24
 #define PTRACE_SYSEMU		  31
+#define PTRACE_SYSEMU_SINGLESTEP  32
 
 /* 0x4200-0x4300 are reserved for architecture-independent additions.  */
 #define PTRACE_SETOPTIONS	0x4200
-- 
cgit v1.2.3


From ab1c23c24471c760c573f4fb0dd78e166ddfd844 Mon Sep 17 00:00:00 2001
From: Bodo Stroesser <bstroesser@fujitsu-siemens.com>
Date: Sat, 3 Sep 2005 15:57:21 -0700
Subject: [PATCH] SYSEMU: fix sysaudit / singlestep interaction

      Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>

This is simply an adjustment for "Ptrace - i386: fix Syscall Audit interaction
with singlestep" to work on top of SYSEMU patches, too.  On this patch, I have
some doubts: I wonder why we need to alter that way ptrace_disable().

I left the patch this way because it has been extensively tested, but I don't
understand the reason.

The current PTRACE_DETACH handling simply clears child->ptrace; actually this
is not enough because entry.S just looks at the thread_flags; actually,
do_syscall_trace checks current->ptrace but I don't think depending on that is
good, at least for performance, so I think the clearing is done elsewhere.
For instance, on PTRACE_CONT it's done, but doing PTRACE_DETACH without
PTRACE_CONT is possible (and happens when gdb crashes and one kills it
manually).

Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
CC: Roland McGrath <roland@redhat.com>
Cc: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/ptrace.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index 3196ba50fcd5..340980203b09 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -271,6 +271,8 @@ static void clear_singlestep(struct task_struct *child)
 void ptrace_disable(struct task_struct *child)
 { 
 	clear_singlestep(child);
+	clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
+	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 }
 
 /*
@@ -693,14 +695,29 @@ __attribute__((regparm(3)))
 int do_syscall_trace(struct pt_regs *regs, int entryexit)
 {
 	int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU), ret = 0;
-	/* With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP */
+	/* With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
+	 * interception. */
 	int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
 
 	/* do the secure computing check first */
 	secure_computing(regs->orig_eax);
 
-	if (unlikely(current->audit_context) && entryexit)
-		audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax);
+	if (unlikely(current->audit_context)) {
+		if (entryexit)
+			audit_syscall_exit(current, AUDITSC_RESULT(regs->eax), regs->eax);
+		/* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
+		 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
+		 * not used, entry.S will call us only on syscall exit, not
+		 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
+		 * calling send_sigtrap() on syscall entry.
+		 *
+		 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
+		 * is_singlestep is false, despite his name, so we will still do
+		 * the correct thing.
+		 */
+		else if (is_singlestep)
+			goto out;
+	}
 
 	if (!(current->ptrace & PT_PTRACED))
 		goto out;
-- 
cgit v1.2.3


From 640aa46e25922a00b805e6b0d0b5181ad9cf736a Mon Sep 17 00:00:00 2001
From: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Date: Sat, 3 Sep 2005 15:57:22 -0700
Subject: [PATCH] uml: SYSEMU: slight cleanup and speedup

As a follow-up to "UML Support - Ptrace: adds the host SYSEMU support, for
UML and general usage" (i.e.  uml-support-* in current mm).

Avoid unconditionally jumping to work_pending and code copying, just reuse
the already existing resume_userspace path.

One interesting note, from Charles P.  Wright, suggested that the API is
improvable with no downsides for UML (except that it will have to support
yet another host API, since dropping support for the current API, for UML,
is not reasonable from users' point of view).

Signed-off-by: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
CC: Charles P. Wright <cwright@cs.sunysb.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/entry.S | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 9a47723469c6..abb909793efc 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -339,18 +339,12 @@ syscall_trace_entry:
 	xorl %edx,%edx
 	call do_syscall_trace
 	cmpl $0, %eax
-	jne syscall_skip		# ret != 0 -> running under PTRACE_SYSEMU,
+	jne resume_userspace		# ret != 0 -> running under PTRACE_SYSEMU,
 					# so must skip actual syscall
 	movl ORIG_EAX(%esp), %eax
 	cmpl $(nr_syscalls), %eax
 	jnae syscall_call
 	jmp syscall_exit
-syscall_skip:
-	cli				# make sure we don't miss an interrupt
-					# setting need_resched or sigpending
-					# between sampling and the iret
-	movl TI_flags(%ebp), %ecx
-	jmp work_pending
 
 	# perform syscall exit tracing
 	ALIGN
-- 
cgit v1.2.3


From 54d5d42404e7705cf3804593189e963350d470e5 Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Tue, 6 Sep 2005 15:16:15 -0700
Subject: [PATCH] x86/x86_64: deferred handling of writes to
 /proc/irqxx/smp_affinity

When handling writes to /proc/irq, current code is re-programming rte
entries directly. This is not recommended and could potentially cause
chipset's to lockup, or cause missing interrupts.

CONFIG_IRQ_BALANCE does this correctly, where it re-programs only when the
interrupt is pending. The same needs to be done for /proc/irq handling as well.
Otherwise user space irq balancers are really not doing the right thing.

- Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for
  lack of a generic name.
- added move_irq out of IRQ_BALANCE, and added this same to X86_64
- Added new proc handler for write, so we can do deferred write at irq
  handling time.
- Display of /proc/irq/XX/smp_affinity used to display CPU_MASKALL, instead
  it now shows only active cpu masks, or exactly what was set.
- Provided a common move_irq implementation, instead of duplicating
  when using generic irq framework.

Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off.
Tested UP builds as well.

MSI testing: tbd: I have cards, need to look for a x-over cable, although I
did test an earlier version of this patch.  Will test in a couple days.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Acked-by: Zwane Mwaikambo <zwane@holomorphy.com>
Grudgingly-acked-by: Andi Kleen <ak@muc.de>
Signed-off-by: Coywolf Qi Hunt <coywolf@lovecn.org>
Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/Kconfig            |   5 ++
 arch/i386/kernel/io_apic.c   |  55 ++++++++++---------
 arch/ia64/Kconfig            |   5 ++
 arch/ia64/kernel/irq.c       |  39 +-------------
 arch/x86_64/Kconfig          |   5 ++
 arch/x86_64/kernel/io_apic.c | 102 ++++++++++++++++++++++-------------
 drivers/pci/msi.c            |  17 ++----
 drivers/pci/msi.h            |   5 --
 include/asm-ia64/hw_irq.h    |   7 ---
 include/asm-ia64/irq.h       |   6 ---
 include/linux/irq.h          | 123 +++++++++++++++++++++++++++++++++++++++++++
 kernel/irq/manage.c          |   4 ++
 kernel/irq/proc.c            |  14 ++++-
 13 files changed, 253 insertions(+), 134 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 3b3b017e1c15..4b7de3e1e57b 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -1318,6 +1318,11 @@ config GENERIC_IRQ_PROBE
 	bool
 	default y
 
+config GENERIC_PENDING_IRQ
+	bool
+	depends on GENERIC_HARDIRQS && SMP
+	default y
+
 config X86_SMP
 	bool
 	depends on SMP && !X86_VOYAGER
diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 6578f40bd501..4a5940431579 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -33,6 +33,7 @@
 #include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/sysdev.h>
+
 #include <asm/io.h>
 #include <asm/smp.h>
 #include <asm/desc.h>
@@ -222,13 +223,21 @@ static void clear_IO_APIC (void)
 			clear_IO_APIC_pin(apic, pin);
 }
 
+#ifdef CONFIG_SMP
 static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 {
 	unsigned long flags;
 	int pin;
 	struct irq_pin_list *entry = irq_2_pin + irq;
 	unsigned int apicid_value;
+	cpumask_t tmp;
 	
+	cpus_and(tmp, cpumask, cpu_online_map);
+	if (cpus_empty(tmp))
+		tmp = TARGET_CPUS;
+
+	cpus_and(cpumask, tmp, CPU_MASK_ALL);
+
 	apicid_value = cpu_mask_to_apicid(cpumask);
 	/* Prepare to do the io_apic_write */
 	apicid_value = apicid_value << 24;
@@ -242,6 +251,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 			break;
 		entry = irq_2_pin + entry->next;
 	}
+	set_irq_info(irq, cpumask);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
@@ -259,7 +269,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
 #  define Dprintk(x...) 
 # endif
 
-cpumask_t __cacheline_aligned pending_irq_balance_cpumask[NR_IRQS];
 
 #define IRQBALANCE_CHECK_ARCH -999
 static int irqbalance_disabled = IRQBALANCE_CHECK_ARCH;
@@ -328,12 +337,7 @@ static inline void balance_irq(int cpu, int irq)
 	cpus_and(allowed_mask, cpu_online_map, irq_affinity[irq]);
 	new_cpu = move(cpu, allowed_mask, now, 1);
 	if (cpu != new_cpu) {
-		irq_desc_t *desc = irq_desc + irq;
-		unsigned long flags;
-
-		spin_lock_irqsave(&desc->lock, flags);
-		pending_irq_balance_cpumask[irq] = cpumask_of_cpu(new_cpu);
-		spin_unlock_irqrestore(&desc->lock, flags);
+		set_pending_irq(irq, cpumask_of_cpu(new_cpu));
 	}
 }
 
@@ -528,16 +532,12 @@ tryanotherirq:
 	cpus_and(tmp, target_cpu_mask, allowed_mask);
 
 	if (!cpus_empty(tmp)) {
-		irq_desc_t *desc = irq_desc + selected_irq;
-		unsigned long flags;
 
 		Dprintk("irq = %d moved to cpu = %d\n",
 				selected_irq, min_loaded);
 		/* mark for change destination */
-		spin_lock_irqsave(&desc->lock, flags);
-		pending_irq_balance_cpumask[selected_irq] =
-					cpumask_of_cpu(min_loaded);
-		spin_unlock_irqrestore(&desc->lock, flags);
+		set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
+
 		/* Since we made a change, come back sooner to 
 		 * check for more variation.
 		 */
@@ -568,7 +568,8 @@ static int balanced_irq(void *unused)
 	
 	/* push everything to CPU 0 to give us a starting point.  */
 	for (i = 0 ; i < NR_IRQS ; i++) {
-		pending_irq_balance_cpumask[i] = cpumask_of_cpu(0);
+		pending_irq_cpumask[i] = cpumask_of_cpu(0);
+		set_pending_irq(i, cpumask_of_cpu(0));
 	}
 
 	for ( ; ; ) {
@@ -647,20 +648,9 @@ int __init irqbalance_disable(char *str)
 
 __setup("noirqbalance", irqbalance_disable);
 
-static inline void move_irq(int irq)
-{
-	/* note - we hold the desc->lock */
-	if (unlikely(!cpus_empty(pending_irq_balance_cpumask[irq]))) {
-		set_ioapic_affinity_irq(irq, pending_irq_balance_cpumask[irq]);
-		cpus_clear(pending_irq_balance_cpumask[irq]);
-	}
-}
-
 late_initcall(balanced_irq_init);
-
-#else /* !CONFIG_IRQBALANCE */
-static inline void move_irq(int irq) { }
 #endif /* CONFIG_IRQBALANCE */
+#endif /* CONFIG_SMP */
 
 #ifndef CONFIG_SMP
 void fastcall send_IPI_self(int vector)
@@ -820,6 +810,7 @@ EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
  * we need to reprogram the ioredtbls to cater for the cpus which have come online
  * so mask in all cases should simply be TARGET_CPUS
  */
+#ifdef CONFIG_SMP
 void __init setup_ioapic_dest(void)
 {
 	int pin, ioapic, irq, irq_entry;
@@ -838,6 +829,7 @@ void __init setup_ioapic_dest(void)
 
 	}
 }
+#endif
 
 /*
  * EISA Edge/Level control register, ELCR
@@ -1249,6 +1241,7 @@ static void __init setup_IO_APIC_irqs(void)
 		spin_lock_irqsave(&ioapic_lock, flags);
 		io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
 		io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
+		set_native_irq_info(irq, TARGET_CPUS);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 	}
@@ -1944,6 +1937,7 @@ static void ack_edge_ioapic_vector(unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
+	move_irq(vector);
 	ack_edge_ioapic_irq(irq);
 }
 
@@ -1958,6 +1952,7 @@ static void end_level_ioapic_vector (unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
+	move_irq(vector);
 	end_level_ioapic_irq(irq);
 }
 
@@ -1975,14 +1970,17 @@ static void unmask_IO_APIC_vector (unsigned int vector)
 	unmask_IO_APIC_irq(irq);
 }
 
+#ifdef CONFIG_SMP
 static void set_ioapic_affinity_vector (unsigned int vector,
 					cpumask_t cpu_mask)
 {
 	int irq = vector_to_irq(vector);
 
+	set_native_irq_info(vector, cpu_mask);
 	set_ioapic_affinity_irq(irq, cpu_mask);
 }
 #endif
+#endif
 
 /*
  * Level and edge triggered IO-APIC interrupts need different handling,
@@ -2000,7 +1998,9 @@ static struct hw_interrupt_type ioapic_edge_type = {
 	.disable 	= disable_edge_ioapic,
 	.ack 		= ack_edge_ioapic,
 	.end 		= end_edge_ioapic,
+#ifdef CONFIG_SMP
 	.set_affinity 	= set_ioapic_affinity,
+#endif
 };
 
 static struct hw_interrupt_type ioapic_level_type = {
@@ -2011,7 +2011,9 @@ static struct hw_interrupt_type ioapic_level_type = {
 	.disable 	= disable_level_ioapic,
 	.ack 		= mask_and_ack_level_ioapic,
 	.end 		= end_level_ioapic,
+#ifdef CONFIG_SMP
 	.set_affinity 	= set_ioapic_affinity,
+#endif
 };
 
 static inline void init_IO_APIC_traps(void)
@@ -2569,6 +2571,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 	spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
 	io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
+	set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return 0;
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 3deced637f07..17b5dbf8c311 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -434,6 +434,11 @@ config GENERIC_IRQ_PROBE
 	bool
 	default y
 
+config GENERIC_PENDING_IRQ
+	bool
+	depends on GENERIC_HARDIRQS && SMP
+	default y
+
 source "arch/ia64/hp/sim/Kconfig"
 
 source "arch/ia64/oprofile/Kconfig"
diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c
index 28f2aadc38d0..205d98028261 100644
--- a/arch/ia64/kernel/irq.c
+++ b/arch/ia64/kernel/irq.c
@@ -91,23 +91,8 @@ skip:
 }
 
 #ifdef CONFIG_SMP
-/*
- * This is updated when the user sets irq affinity via /proc
- */
-static cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
-static unsigned long pending_irq_redir[BITS_TO_LONGS(NR_IRQS)];
-
 static char irq_redir [NR_IRQS]; // = { [0 ... NR_IRQS-1] = 1 };
 
-/*
- * Arch specific routine for deferred write to iosapic rte to reprogram
- * intr destination.
- */
-void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
-{
-	pending_irq_cpumask[irq] = mask_val;
-}
-
 void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
 {
 	cpumask_t mask = CPU_MASK_NONE;
@@ -116,32 +101,10 @@ void set_irq_affinity_info (unsigned int irq, int hwid, int redir)
 
 	if (irq < NR_IRQS) {
 		irq_affinity[irq] = mask;
+		set_irq_info(irq, mask);
 		irq_redir[irq] = (char) (redir & 0xff);
 	}
 }
-
-
-void move_irq(int irq)
-{
-	/* note - we hold desc->lock */
-	cpumask_t tmp;
-	irq_desc_t *desc = irq_descp(irq);
-	int redir = test_bit(irq, pending_irq_redir);
-
-	if (unlikely(!desc->handler->set_affinity))
-		return;
-
-	if (!cpus_empty(pending_irq_cpumask[irq])) {
-		cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
-		if (unlikely(!cpus_empty(tmp))) {
-			desc->handler->set_affinity(irq | (redir ? IA64_IRQ_REDIRECTED : 0),
-						    pending_irq_cpumask[irq]);
-		}
-		cpus_clear(pending_irq_cpumask[irq]);
-	}
-}
-
-
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_HOTPLUG_CPU
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 75e52c57f19c..251ce7cf1a38 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -441,6 +441,11 @@ config ISA_DMA_API
 	bool
 	default y
 
+config GENERIC_PENDING_IRQ
+	bool
+	depends on GENERIC_HARDIRQS && SMP
+	default y
+
 menu "Power management options"
 
 source kernel/power/Kconfig
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index d206d7e49cf5..76bcc4e6979d 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -78,6 +78,54 @@ int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
 #define vector_to_irq(vector)	(vector)
 #endif
 
+#define __DO_ACTION(R, ACTION, FINAL)					\
+									\
+{									\
+	int pin;							\
+	struct irq_pin_list *entry = irq_2_pin + irq;			\
+									\
+	for (;;) {							\
+		unsigned int reg;					\
+		pin = entry->pin;					\
+		if (pin == -1)						\
+			break;						\
+		reg = io_apic_read(entry->apic, 0x10 + R + pin*2);	\
+		reg ACTION;						\
+		io_apic_modify(entry->apic, reg);			\
+		if (!entry->next)					\
+			break;						\
+		entry = irq_2_pin + entry->next;			\
+	}								\
+	FINAL;								\
+}
+
+#ifdef CONFIG_SMP
+static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
+{
+	unsigned long flags;
+	unsigned int dest;
+	cpumask_t tmp;
+
+	cpus_and(tmp, mask, cpu_online_map);
+	if (cpus_empty(tmp))
+		tmp = TARGET_CPUS;
+
+	cpus_and(mask, tmp, CPU_MASK_ALL);
+
+	dest = cpu_mask_to_apicid(mask);
+
+	/*
+	 * Only the high 8 bits are valid.
+	 */
+	dest = SET_APIC_LOGICAL_ID(dest);
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__DO_ACTION(1, = dest, )
+	set_irq_info(irq, mask);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+#endif
+
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
  * shared ISA-space IRQs, so we have to support them. We are super
@@ -101,26 +149,6 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
 	entry->pin = pin;
 }
 
-#define __DO_ACTION(R, ACTION, FINAL)					\
-									\
-{									\
-	int pin;							\
-	struct irq_pin_list *entry = irq_2_pin + irq;			\
-									\
-	for (;;) {							\
-		unsigned int reg;					\
-		pin = entry->pin;					\
-		if (pin == -1)						\
-			break;						\
-		reg = io_apic_read(entry->apic, 0x10 + R + pin*2);	\
-		reg ACTION;						\
-		io_apic_modify(entry->apic, reg);			\
-		if (!entry->next)					\
-			break;						\
-		entry = irq_2_pin + entry->next;			\
-	}								\
-	FINAL;								\
-}
 
 #define DO_ACTION(name,R,ACTION, FINAL)					\
 									\
@@ -767,6 +795,7 @@ static void __init setup_IO_APIC_irqs(void)
 		spin_lock_irqsave(&ioapic_lock, flags);
 		io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
 		io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
+		set_native_irq_info(irq, TARGET_CPUS);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 	}
@@ -1314,6 +1343,7 @@ static unsigned int startup_edge_ioapic_irq(unsigned int irq)
  */
 static void ack_edge_ioapic_irq(unsigned int irq)
 {
+	move_irq(irq);
 	if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
 					== (IRQ_PENDING | IRQ_DISABLED))
 		mask_IO_APIC_irq(irq);
@@ -1343,26 +1373,10 @@ static unsigned int startup_level_ioapic_irq (unsigned int irq)
 
 static void end_level_ioapic_irq (unsigned int irq)
 {
+	move_irq(irq);
 	ack_APIC_irq();
 }
 
-static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
-{
-	unsigned long flags;
-	unsigned int dest;
-
-	dest = cpu_mask_to_apicid(mask);
-
-	/*
-	 * Only the high 8 bits are valid.
-	 */
-	dest = SET_APIC_LOGICAL_ID(dest);
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	__DO_ACTION(1, = dest, )
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
 #ifdef CONFIG_PCI_MSI
 static unsigned int startup_edge_ioapic_vector(unsigned int vector)
 {
@@ -1375,6 +1389,7 @@ static void ack_edge_ioapic_vector(unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
+	move_native_irq(vector);
 	ack_edge_ioapic_irq(irq);
 }
 
@@ -1389,6 +1404,7 @@ static void end_level_ioapic_vector (unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
 
+	move_native_irq(vector);
 	end_level_ioapic_irq(irq);
 }
 
@@ -1406,14 +1422,17 @@ static void unmask_IO_APIC_vector (unsigned int vector)
 	unmask_IO_APIC_irq(irq);
 }
 
+#ifdef CONFIG_SMP
 static void set_ioapic_affinity_vector (unsigned int vector,
 					cpumask_t cpu_mask)
 {
 	int irq = vector_to_irq(vector);
 
+	set_native_irq_info(vector, cpu_mask);
 	set_ioapic_affinity_irq(irq, cpu_mask);
 }
-#endif
+#endif // CONFIG_SMP
+#endif // CONFIG_PCI_MSI
 
 /*
  * Level and edge triggered IO-APIC interrupts need different handling,
@@ -1432,7 +1451,9 @@ static struct hw_interrupt_type ioapic_edge_type = {
 	.disable 	= disable_edge_ioapic,
 	.ack 		= ack_edge_ioapic,
 	.end 		= end_edge_ioapic,
+#ifdef CONFIG_SMP
 	.set_affinity = set_ioapic_affinity,
+#endif
 };
 
 static struct hw_interrupt_type ioapic_level_type = {
@@ -1443,7 +1464,9 @@ static struct hw_interrupt_type ioapic_level_type = {
 	.disable 	= disable_level_ioapic,
 	.ack 		= mask_and_ack_level_ioapic,
 	.end 		= end_level_ioapic,
+#ifdef CONFIG_SMP
 	.set_affinity = set_ioapic_affinity,
+#endif
 };
 
 static inline void init_IO_APIC_traps(void)
@@ -1918,6 +1941,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
 	spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
 	io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
+	set_native_irq_info(use_pci_vector() ?  entry.vector : irq, TARGET_CPUS);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return 0;
@@ -1931,6 +1955,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int a
  * we need to reprogram the ioredtbls to cater for the cpus which have come online
  * so mask in all cases should simply be TARGET_CPUS
  */
+#ifdef CONFIG_SMP
 void __init setup_ioapic_dest(void)
 {
 	int pin, ioapic, irq, irq_entry;
@@ -1949,3 +1974,4 @@ void __init setup_ioapic_dest(void)
 
 	}
 }
+#endif
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 2b85aa39f954..532f73bb2224 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -91,6 +91,7 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask)
 {
 	struct msi_desc *entry;
 	struct msg_address address;
+	unsigned int irq = vector;
 
 	entry = (struct msi_desc *)msi_desc[vector];
 	if (!entry || !entry->dev)
@@ -112,6 +113,7 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask)
 		entry->msi_attrib.current_cpu = cpu_mask_to_apicid(cpu_mask);
 		pci_write_config_dword(entry->dev, msi_lower_address_reg(pos),
 			address.lo_address.value);
+		set_native_irq_info(irq, cpu_mask);
 		break;
 	}
 	case PCI_CAP_ID_MSIX:
@@ -125,22 +127,13 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask)
 			MSI_TARGET_CPU_SHIFT);
 		entry->msi_attrib.current_cpu = cpu_mask_to_apicid(cpu_mask);
 		writel(address.lo_address.value, entry->mask_base + offset);
+		set_native_irq_info(irq, cpu_mask);
 		break;
 	}
 	default:
 		break;
 	}
 }
-
-#ifdef CONFIG_IRQBALANCE
-static inline void move_msi(int vector)
-{
-	if (!cpus_empty(pending_irq_balance_cpumask[vector])) {
-		set_msi_affinity(vector, pending_irq_balance_cpumask[vector]);
-		cpus_clear(pending_irq_balance_cpumask[vector]);
-	}
-}
-#endif /* CONFIG_IRQBALANCE */
 #endif /* CONFIG_SMP */
 
 static void mask_MSI_irq(unsigned int vector)
@@ -191,13 +184,13 @@ static void shutdown_msi_irq(unsigned int vector)
 
 static void end_msi_irq_wo_maskbit(unsigned int vector)
 {
-	move_msi(vector);
+	move_native_irq(vector);
 	ack_APIC_irq();
 }
 
 static void end_msi_irq_w_maskbit(unsigned int vector)
 {
-	move_msi(vector);
+	move_native_irq(vector);
 	unmask_MSI_irq(vector);
 	ack_APIC_irq();
 }
diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h
index 390f1851c0f1..402136a5c9e4 100644
--- a/drivers/pci/msi.h
+++ b/drivers/pci/msi.h
@@ -19,7 +19,6 @@
 #define NR_HP_RESERVED_VECTORS 	20
 
 extern int vector_irq[NR_VECTORS];
-extern cpumask_t pending_irq_balance_cpumask[NR_IRQS];
 extern void (*interrupt[NR_IRQS])(void);
 extern int pci_vector_resources(int last, int nr_released);
 
@@ -29,10 +28,6 @@ extern int pci_vector_resources(int last, int nr_released);
 #define set_msi_irq_affinity	NULL
 #endif
 
-#ifndef CONFIG_IRQBALANCE
-static inline void move_msi(int vector) {}
-#endif
-
 /*
  * MSI-X Address Register
  */
diff --git a/include/asm-ia64/hw_irq.h b/include/asm-ia64/hw_irq.h
index 041ab8c51a64..0cf119b42f7d 100644
--- a/include/asm-ia64/hw_irq.h
+++ b/include/asm-ia64/hw_irq.h
@@ -116,13 +116,6 @@ __ia64_local_vector_to_irq (ia64_vector vec)
  * and to obtain the irq descriptor for a given irq number.
  */
 
-/* Return a pointer to the irq descriptor for IRQ.  */
-static inline irq_desc_t *
-irq_descp (int irq)
-{
-	return irq_desc + irq;
-}
-
 /* Extract the IA-64 vector that corresponds to IRQ.  */
 static inline ia64_vector
 irq_to_vector (int irq)
diff --git a/include/asm-ia64/irq.h b/include/asm-ia64/irq.h
index bd07d11d9f37..5d930fdc0bea 100644
--- a/include/asm-ia64/irq.h
+++ b/include/asm-ia64/irq.h
@@ -30,12 +30,6 @@ extern void disable_irq_nosync (unsigned int);
 extern void enable_irq (unsigned int);
 extern void set_irq_affinity_info (unsigned int irq, int dest, int redir);
 
-#ifdef CONFIG_SMP
-extern void move_irq(int irq);
-#else
-#define move_irq(irq)
-#endif
-
 struct irqaction;
 struct pt_regs;
 int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 069d3b84d311..4a362b9ec966 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -71,16 +71,139 @@ typedef struct irq_desc {
 	unsigned int irq_count;		/* For detecting broken interrupts */
 	unsigned int irqs_unhandled;
 	spinlock_t lock;
+#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
+	unsigned int move_irq;		/* Flag need to re-target intr dest*/
+#endif
 } ____cacheline_aligned irq_desc_t;
 
 extern irq_desc_t irq_desc [NR_IRQS];
 
+/* Return a pointer to the irq descriptor for IRQ.  */
+static inline irq_desc_t *
+irq_descp (int irq)
+{
+	return irq_desc + irq;
+}
+
 #include <asm/hw_irq.h> /* the arch dependent stuff */
 
 extern int setup_irq(unsigned int irq, struct irqaction * new);
 
 #ifdef CONFIG_GENERIC_HARDIRQS
 extern cpumask_t irq_affinity[NR_IRQS];
+
+#ifdef CONFIG_SMP
+static inline void set_native_irq_info(int irq, cpumask_t mask)
+{
+	irq_affinity[irq] = mask;
+}
+#else
+static inline void set_native_irq_info(int irq, cpumask_t mask)
+{
+}
+#endif
+
+#ifdef CONFIG_SMP
+
+#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
+extern cpumask_t pending_irq_cpumask[NR_IRQS];
+
+static inline void set_pending_irq(unsigned int irq, cpumask_t mask)
+{
+	irq_desc_t *desc = irq_desc + irq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	desc->move_irq = 1;
+	pending_irq_cpumask[irq] = mask;
+	spin_unlock_irqrestore(&desc->lock, flags);
+}
+
+static inline void
+move_native_irq(int irq)
+{
+	cpumask_t tmp;
+	irq_desc_t *desc = irq_descp(irq);
+
+	if (likely (!desc->move_irq))
+		return;
+
+	desc->move_irq = 0;
+
+	if (likely(cpus_empty(pending_irq_cpumask[irq])))
+		return;
+
+	if (!desc->handler->set_affinity)
+		return;
+
+	/* note - we hold the desc->lock */
+	cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map);
+
+	/*
+	 * If there was a valid mask to work with, please
+	 * do the disable, re-program, enable sequence.
+	 * This is *not* particularly important for level triggered
+	 * but in a edge trigger case, we might be setting rte
+	 * when an active trigger is comming in. This could
+	 * cause some ioapics to mal-function.
+	 * Being paranoid i guess!
+	 */
+	if (unlikely(!cpus_empty(tmp))) {
+		desc->handler->disable(irq);
+		desc->handler->set_affinity(irq,tmp);
+		desc->handler->enable(irq);
+	}
+	cpus_clear(pending_irq_cpumask[irq]);
+}
+
+#ifdef CONFIG_PCI_MSI
+/*
+ * Wonder why these are dummies?
+ * For e.g the set_ioapic_affinity_vector() calls the set_ioapic_affinity_irq()
+ * counter part after translating the vector to irq info. We need to perform
+ * this operation on the real irq, when we dont use vector, i.e when
+ * pci_use_vector() is false.
+ */
+static inline void move_irq(int irq)
+{
+}
+
+static inline void set_irq_info(int irq, cpumask_t mask)
+{
+}
+
+#else // CONFIG_PCI_MSI
+
+static inline void move_irq(int irq)
+{
+	move_native_irq(irq);
+}
+
+static inline void set_irq_info(int irq, cpumask_t mask)
+{
+	set_native_irq_info(irq, mask);
+}
+#endif // CONFIG_PCI_MSI
+
+#else	// CONFIG_GENERIC_PENDING_IRQ || CONFIG_IRQBALANCE
+
+#define move_irq(x)
+#define move_native_irq(x)
+#define set_pending_irq(x,y)
+static inline void set_irq_info(int irq, cpumask_t mask)
+{
+	set_native_irq_info(irq, mask);
+}
+
+#endif // CONFIG_GENERIC_PENDING_IRQ
+
+#else // CONFIG_SMP
+
+#define move_irq(x)
+#define move_native_irq(x)
+
+#endif // CONFIG_SMP
+
 extern int no_irq_affinity;
 extern int noirqdebug_setup(char *str);
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index ac6700985705..1cfdb08ddf20 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -18,6 +18,10 @@
 
 cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
 
+#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
+cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
+#endif
+
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
  *
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 85d08daa6600..f26e534c6585 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
  */
 static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
 
-void __attribute__((weak))
-proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
+{
+	/*
+	 * Save these away for later use. Re-progam when the
+	 * interrupt is pending
+	 */
+	set_pending_irq(irq, mask_val);
+}
+#else
+void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
 {
 	irq_affinity[irq] = mask_val;
 	irq_desc[irq].handler->set_affinity(irq, mask_val);
 }
+#endif
 
 static int irq_affinity_read_proc(char *page, char **start, off_t off,
 				  int count, int *eof, void *data)
-- 
cgit v1.2.3


From 8446f1d391f3d27e6bf9c43d4cbcdac0ca720417 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 6 Sep 2005 15:16:27 -0700
Subject: [PATCH] detect soft lockups

This patch adds a new kernel debug feature: CONFIG_DETECT_SOFTLOCKUP.

When enabled then per-CPU watchdog threads are started, which try to run
once per second.  If they get delayed for more than 10 seconds then a
callback from the timer interrupt detects this condition and prints out a
warning message and a stack dump (once per lockup incident).  The feature
is otherwise non-intrusive, it doesnt try to unlock the box in any way, it
only gets the debug info out, automatically, and on all CPUs affected by
the lockup.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Signed-Off-By: Matthias Urlichs <smurf@smurf.noris.de>
Signed-off-by: Richard Purdie <rpurdie@rpsys.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/nmi.c       |   5 ++
 arch/i386/kernel/time.c      |   1 +
 arch/x86_64/kernel/nmi.c     |   2 +
 arch/x86_64/kernel/time.c    |   1 +
 drivers/mtd/nand/nand_base.c |   1 +
 include/linux/sched.h        |  17 +++++
 init/main.c                  |   1 +
 kernel/Makefile              |   1 +
 kernel/power/swsusp.c        |   1 +
 kernel/softlockup.c          | 151 +++++++++++++++++++++++++++++++++++++++++++
 kernel/timer.c               |   1 +
 lib/Kconfig.debug            |  19 ++++++
 12 files changed, 201 insertions(+)
 create mode 100644 kernel/softlockup.c

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/nmi.c b/arch/i386/kernel/nmi.c
index 8bbdbda07a2d..0178457db721 100644
--- a/arch/i386/kernel/nmi.c
+++ b/arch/i386/kernel/nmi.c
@@ -478,6 +478,11 @@ void touch_nmi_watchdog (void)
 	 */
 	for (i = 0; i < NR_CPUS; i++)
 		alert_counter[i] = 0;
+
+	/*
+	 * Tickle the softlockup detector too:
+	 */
+	touch_softlockup_watchdog();
 }
 
 extern void die_nmi(struct pt_regs *, const char *msg);
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 6f794a78ee1e..b0c5ee2b3446 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -422,6 +422,7 @@ static int timer_resume(struct sys_device *dev)
 		last_timer->resume();
 	cur_timer = last_timer;
 	last_timer = NULL;
+	touch_softlockup_watchdog();
 	return 0;
 }
 
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 64a8e05d5811..84cae81fff8b 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -463,6 +463,8 @@ void touch_nmi_watchdog (void)
 	 */
 	for (i = 0; i < NR_CPUS; i++)
 		per_cpu(nmi_touch, i) = 1;
+
+ 	touch_softlockup_watchdog();
 }
 
 void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 66bf6ddeb0c3..2b5d9da912a2 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -1041,6 +1041,7 @@ static int timer_resume(struct sys_device *dev)
 	write_sequnlock_irqrestore(&xtime_lock,flags);
 	jiffies += sleep_length;
 	wall_jiffies += sleep_length;
+	touch_softlockup_watchdog();
 	return 0;
 }
 
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index eee5115658c8..04e54318bc6a 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -526,6 +526,7 @@ static void nand_wait_ready(struct mtd_info *mtd)
 	do {
 		if (this->dev_ready(mtd))
 			return;
+		touch_softlockup_watchdog();
 	} while (time_before(jiffies, timeo));	
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dec5827c7742..5fb31bede103 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -176,6 +176,23 @@ extern void trap_init(void);
 extern void update_process_times(int user);
 extern void scheduler_tick(void);
 
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+extern void softlockup_tick(struct pt_regs *regs);
+extern void spawn_softlockup_task(void);
+extern void touch_softlockup_watchdog(void);
+#else
+static inline void softlockup_tick(struct pt_regs *regs)
+{
+}
+static inline void spawn_softlockup_task(void)
+{
+}
+static inline void touch_softlockup_watchdog(void)
+{
+}
+#endif
+
+
 /* Attach to any functions which should be ignored in wchan output. */
 #define __sched		__attribute__((__section__(".sched.text")))
 /* Is this address in the __sched functions? */
diff --git a/init/main.c b/init/main.c
index ff410063e4e1..a29fb2ac7240 100644
--- a/init/main.c
+++ b/init/main.c
@@ -614,6 +614,7 @@ static void do_pre_smp_initcalls(void)
 	migration_init();
 #endif
 	spawn_ksoftirqd();
+	spawn_softlockup_task();
 }
 
 static void run_init_process(char *init_filename)
diff --git a/kernel/Makefile b/kernel/Makefile
index cb05cd05d237..8d57a2f1226b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,6 +27,7 @@ obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_SYSFS) += ksysfs.o
+obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index eaacd5cb5889..d967e875ee82 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1059,6 +1059,7 @@ int swsusp_resume(void)
 	BUG_ON(!error);
 	restore_processor_state();
 	restore_highmem();
+	touch_softlockup_watchdog();
 	device_power_up();
 	local_irq_enable();
 	return error;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
new file mode 100644
index 000000000000..75976209cea7
--- /dev/null
+++ b/kernel/softlockup.c
@@ -0,0 +1,151 @@
+/*
+ * Detect Soft Lockups
+ *
+ * started by Ingo Molnar, (C) 2005, Red Hat
+ *
+ * this code detects soft lockups: incidents in where on a CPU
+ * the kernel does not reschedule for 10 seconds or more.
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+
+static DEFINE_SPINLOCK(print_lock);
+
+static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
+static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
+static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
+
+static int did_panic = 0;
+static int softlock_panic(struct notifier_block *this, unsigned long event,
+				void *ptr)
+{
+	did_panic = 1;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+	.notifier_call = softlock_panic,
+};
+
+void touch_softlockup_watchdog(void)
+{
+	per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
+}
+EXPORT_SYMBOL(touch_softlockup_watchdog);
+
+/*
+ * This callback runs from the timer interrupt, and checks
+ * whether the watchdog thread has hung or not:
+ */
+void softlockup_tick(struct pt_regs *regs)
+{
+	int this_cpu = smp_processor_id();
+	unsigned long timestamp = per_cpu(timestamp, this_cpu);
+
+	if (per_cpu(print_timestamp, this_cpu) == timestamp)
+		return;
+
+	/* Do not cause a second panic when there already was one */
+	if (did_panic)
+		return;
+
+	if (time_after(jiffies, timestamp + 10*HZ)) {
+		per_cpu(print_timestamp, this_cpu) = timestamp;
+
+		spin_lock(&print_lock);
+		printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
+			this_cpu);
+		show_regs(regs);
+		spin_unlock(&print_lock);
+	}
+}
+
+/*
+ * The watchdog thread - runs every second and touches the timestamp.
+ */
+static int watchdog(void * __bind_cpu)
+{
+	struct sched_param param = { .sched_priority = 99 };
+	int this_cpu = (long) __bind_cpu;
+
+	printk("softlockup thread %d started up.\n", this_cpu);
+
+	sched_setscheduler(current, SCHED_FIFO, &param);
+	current->flags |= PF_NOFREEZE;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	/*
+	 * Run briefly once per second - if this gets delayed for
+	 * more than 10 seconds then the debug-printout triggers
+	 * in softlockup_tick():
+	 */
+	while (!kthread_should_stop()) {
+		msleep_interruptible(1000);
+		touch_softlockup_watchdog();
+	}
+	__set_current_state(TASK_RUNNING);
+
+	return 0;
+}
+
+/*
+ * Create/destroy watchdog threads as CPUs come and go:
+ */
+static int __devinit
+cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	int hotcpu = (unsigned long)hcpu;
+	struct task_struct *p;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		BUG_ON(per_cpu(watchdog_task, hotcpu));
+		p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
+		if (IS_ERR(p)) {
+			printk("watchdog for %i failed\n", hotcpu);
+			return NOTIFY_BAD;
+		}
+  		per_cpu(watchdog_task, hotcpu) = p;
+		kthread_bind(p, hotcpu);
+ 		break;
+	case CPU_ONLINE:
+
+		wake_up_process(per_cpu(watchdog_task, hotcpu));
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+		/* Unbind so it can run.  Fall thru. */
+		kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
+	case CPU_DEAD:
+		p = per_cpu(watchdog_task, hotcpu);
+		per_cpu(watchdog_task, hotcpu) = NULL;
+		kthread_stop(p);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ 	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+	.notifier_call = cpu_callback
+};
+
+__init void spawn_softlockup_task(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+
+	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&cpu_nfb);
+
+	notifier_chain_register(&panic_notifier_list, &panic_block);
+}
+
diff --git a/kernel/timer.c b/kernel/timer.c
index 5377f40723ff..1433d87f46b3 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs)
 {
 	jiffies_64++;
 	update_times();
+	softlockup_tick(regs);
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 299f7f3b5b08..3754c9a8f5c8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -46,6 +46,25 @@ config LOG_BUF_SHIFT
 		     13 =>  8 KB
 		     12 =>  4 KB
 
+config DETECT_SOFTLOCKUP
+	bool "Detect Soft Lockups"
+	depends on DEBUG_KERNEL
+	default y
+	help
+	  Say Y here to enable the kernel to detect "soft lockups",
+	  which are bugs that cause the kernel to loop in kernel
+	  mode for more than 10 seconds, without giving other tasks a
+	  chance to run.
+
+	  When a soft-lockup is detected, the kernel will print the
+	  current stack trace (which you should report), but the
+	  system will stay locked up. This feature has negligible
+	  overhead.
+
+	  (Note that "hard lockups" are separate type of bugs that
+	   can be detected via the NMI-watchdog, on platforms that
+	   support it.)
+
 config SCHEDSTATS
 	bool "Collect scheduler statistics"
 	depends on DEBUG_KERNEL && PROC_FS
-- 
cgit v1.2.3


From c3d8c1414573be8cf7c8fdc1e076935697c7f6af Mon Sep 17 00:00:00 2001
From: Christoph Lameter <christoph@scalex86.org>
Date: Tue, 6 Sep 2005 15:16:33 -0700
Subject: [PATCH] More __read_mostly variables

Move some more frequently read variables that showed up during some of our
performance tests as sometimes ending up in hot cachelines to the
read_mostly section.

Fix: Move the __read_mostly from before hpet_usec_quotient to follow the
variable like the other uses of __read_mostly.

Signed-off-by: Alok N Kataria <alokk@calsoftinc.com>
Signed-off-by: Christoph Lameter <christoph@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/setup.c             | 2 +-
 arch/i386/kernel/timers/timer_hpet.c | 2 +-
 mm/mmap.c                            | 2 +-
 mm/page_alloc.c                      | 8 ++++----
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index 294bcca985ab..e29fd5aeaf8e 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -82,7 +82,7 @@ EXPORT_SYMBOL(efi_enabled);
 /* cpu data as detected by the assembly code in head.S */
 struct cpuinfo_x86 new_cpu_data __initdata = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
 /* common cpu data for all cpus */
-struct cpuinfo_x86 boot_cpu_data = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
+struct cpuinfo_x86 boot_cpu_data __read_mostly = { 0, 0, 0, 0, -1, 1, 0, 0, -1 };
 EXPORT_SYMBOL(boot_cpu_data);
 
 unsigned long mmu_cr4_features;
diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c
index 001de97c9e4a..6dbb29f834e8 100644
--- a/arch/i386/kernel/timers/timer_hpet.c
+++ b/arch/i386/kernel/timers/timer_hpet.c
@@ -18,7 +18,7 @@
 #include "mach_timer.h"
 #include <asm/hpet.h>
 
-static unsigned long __read_mostly hpet_usec_quotient;	/* convert hpet clks to usec */
+static unsigned long hpet_usec_quotient __read_mostly;	/* convert hpet clks to usec */
 static unsigned long tsc_hpet_quotient;		/* convert tsc to hpet clks */
 static unsigned long hpet_last; 	/* hpet counter value at last tick*/
 static unsigned long last_tsc_low;	/* lsb 32 bits of Time Stamp Counter */
diff --git a/mm/mmap.c b/mm/mmap.c
index 404319477e71..bb43340d3973 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -61,7 +61,7 @@ pgprot_t protection_map[16] = {
 
 int sysctl_overcommit_memory = OVERCOMMIT_GUESS;  /* heuristic overcommit */
 int sysctl_overcommit_ratio = 50;	/* default is 50% */
-int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
+int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
 atomic_t vm_committed_space = ATOMIC_INIT(0);
 
 /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b06a9636d971..34bba8f1144e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -42,11 +42,11 @@
  * MCD - HACK: Find somewhere to initialize this EARLY, or make this
  * initializer cleaner
  */
-nodemask_t node_online_map = { { [0] = 1UL } };
+nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
 EXPORT_SYMBOL(node_online_map);
-nodemask_t node_possible_map = NODE_MASK_ALL;
+nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
 EXPORT_SYMBOL(node_possible_map);
-struct pglist_data *pgdat_list;
+struct pglist_data *pgdat_list __read_mostly;
 unsigned long totalram_pages;
 unsigned long totalhigh_pages;
 long nr_swap_pages;
@@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages);
  * Used by page_zone() to look up the address of the struct zone whose
  * id is encoded in the upper bits of page->flags
  */
-struct zone *zone_table[1 << ZONETABLE_SHIFT];
+struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
 EXPORT_SYMBOL(zone_table);
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
-- 
cgit v1.2.3


From 19306059cd7fedaf96b4b0260a9a8a45e513c857 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@us.ibm.com>
Date: Tue, 6 Sep 2005 15:16:35 -0700
Subject: [PATCH] NMI: Update NMI users of RCU to use new API

Uses of RCU for dynamically changeable NMI handlers need to use the new
rcu_dereference() and rcu_assign_pointer() facilities.  This change makes
it clear that these uses are safe from a memory-barrier viewpoint, but the
main purpose is to document exactly what operations are being protected by
RCU.  This has been tested on x86 and x86-64, which are the only
architectures affected by this change.

Signed-off-by: <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/RCU/NMI-RCU.txt | 112 ++++++++++++++++++++++++++++++++++++++++++
 arch/i386/kernel/traps.c      |   4 +-
 arch/x86_64/kernel/nmi.c      |   4 +-
 3 files changed, 116 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/RCU/NMI-RCU.txt

(limited to 'arch/i386/kernel')

diff --git a/Documentation/RCU/NMI-RCU.txt b/Documentation/RCU/NMI-RCU.txt
new file mode 100644
index 000000000000..d0634a5c3445
--- /dev/null
+++ b/Documentation/RCU/NMI-RCU.txt
@@ -0,0 +1,112 @@
+Using RCU to Protect Dynamic NMI Handlers
+
+
+Although RCU is usually used to protect read-mostly data structures,
+it is possible to use RCU to provide dynamic non-maskable interrupt
+handlers, as well as dynamic irq handlers.  This document describes
+how to do this, drawing loosely from Zwane Mwaikambo's NMI-timer
+work in "arch/i386/oprofile/nmi_timer_int.c" and in
+"arch/i386/kernel/traps.c".
+
+The relevant pieces of code are listed below, each followed by a
+brief explanation.
+
+	static int dummy_nmi_callback(struct pt_regs *regs, int cpu)
+	{
+		return 0;
+	}
+
+The dummy_nmi_callback() function is a "dummy" NMI handler that does
+nothing, but returns zero, thus saying that it did nothing, allowing
+the NMI handler to take the default machine-specific action.
+
+	static nmi_callback_t nmi_callback = dummy_nmi_callback;
+
+This nmi_callback variable is a global function pointer to the current
+NMI handler.
+
+	fastcall void do_nmi(struct pt_regs * regs, long error_code)
+	{
+		int cpu;
+
+		nmi_enter();
+
+		cpu = smp_processor_id();
+		++nmi_count(cpu);
+
+		if (!rcu_dereference(nmi_callback)(regs, cpu))
+			default_do_nmi(regs);
+
+		nmi_exit();
+	}
+
+The do_nmi() function processes each NMI.  It first disables preemption
+in the same way that a hardware irq would, then increments the per-CPU
+count of NMIs.  It then invokes the NMI handler stored in the nmi_callback
+function pointer.  If this handler returns zero, do_nmi() invokes the
+default_do_nmi() function to handle a machine-specific NMI.  Finally,
+preemption is restored.
+
+Strictly speaking, rcu_dereference() is not needed, since this code runs
+only on i386, which does not need rcu_dereference() anyway.  However,
+it is a good documentation aid, particularly for anyone attempting to
+do something similar on Alpha.
+
+Quick Quiz:  Why might the rcu_dereference() be necessary on Alpha,
+	     given that the code referenced by the pointer is read-only?
+
+
+Back to the discussion of NMI and RCU...
+
+	void set_nmi_callback(nmi_callback_t callback)
+	{
+		rcu_assign_pointer(nmi_callback, callback);
+	}
+
+The set_nmi_callback() function registers an NMI handler.  Note that any
+data that is to be used by the callback must be initialized up -before-
+the call to set_nmi_callback().  On architectures that do not order
+writes, the rcu_assign_pointer() ensures that the NMI handler sees the
+initialized values.
+
+	void unset_nmi_callback(void)
+	{
+		rcu_assign_pointer(nmi_callback, dummy_nmi_callback);
+	}
+
+This function unregisters an NMI handler, restoring the original
+dummy_nmi_handler().  However, there may well be an NMI handler
+currently executing on some other CPU.  We therefore cannot free
+up any data structures used by the old NMI handler until execution
+of it completes on all other CPUs.
+
+One way to accomplish this is via synchronize_sched(), perhaps as
+follows:
+
+	unset_nmi_callback();
+	synchronize_sched();
+	kfree(my_nmi_data);
+
+This works because synchronize_sched() blocks until all CPUs complete
+any preemption-disabled segments of code that they were executing.
+Since NMI handlers disable preemption, synchronize_sched() is guaranteed
+not to return until all ongoing NMI handlers exit.  It is therefore safe
+to free up the handler's data as soon as synchronize_sched() returns.
+
+
+Answer to Quick Quiz
+
+	Why might the rcu_dereference() be necessary on Alpha, given
+	that the code referenced by the pointer is read-only?
+
+	Answer: The caller to set_nmi_callback() might well have
+		initialized some data that is to be used by the
+		new NMI handler.  In this case, the rcu_dereference()
+		would be needed, because otherwise a CPU that received
+		an NMI just after the new handler was set might see
+		the pointer to the new NMI handler, but the old
+		pre-initialized version of the handler's data.
+
+		More important, the rcu_dereference() makes it clear
+		to someone reading the code that the pointer is being
+		protected by RCU.
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 54629bb5893a..029bf94cda7d 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -657,7 +657,7 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code)
 
 	++nmi_count(cpu);
 
-	if (!nmi_callback(regs, cpu))
+	if (!rcu_dereference(nmi_callback)(regs, cpu))
 		default_do_nmi(regs);
 
 	nmi_exit();
@@ -665,7 +665,7 @@ fastcall void do_nmi(struct pt_regs * regs, long error_code)
 
 void set_nmi_callback(nmi_callback_t callback)
 {
-	nmi_callback = callback;
+	rcu_assign_pointer(nmi_callback, callback);
 }
 EXPORT_SYMBOL_GPL(set_nmi_callback);
 
diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c
index 84cae81fff8b..caf164959e19 100644
--- a/arch/x86_64/kernel/nmi.c
+++ b/arch/x86_64/kernel/nmi.c
@@ -524,14 +524,14 @@ asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
 
 	nmi_enter();
 	add_pda(__nmi_count,1);
-	if (!nmi_callback(regs, cpu))
+	if (!rcu_dereference(nmi_callback)(regs, cpu))
 		default_do_nmi(regs);
 	nmi_exit();
 }
 
 void set_nmi_callback(nmi_callback_t callback)
 {
-	nmi_callback = callback;
+	rcu_assign_pointer(nmi_callback, callback);
 }
 
 void unset_nmi_callback(void)
-- 
cgit v1.2.3


From 7f4bde9a3486cd7e70bedd2aff35b38667d50173 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Tue, 6 Sep 2005 15:17:39 -0700
Subject: [PATCH] remove the second arg of do_timer_interrupt()

The second arg of do_timer_interrupt() is not used in the functions, and
all callers pass NULL.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Cc: Paul Mundt <lethal@Linux-SH.ORG>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/time.c | 5 ++---
 arch/sh/kernel/time.c   | 4 ++--
 arch/sh64/kernel/time.c | 4 ++--
 3 files changed, 6 insertions(+), 7 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index b0c5ee2b3446..9b94d84a6c3b 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -252,8 +252,7 @@ EXPORT_SYMBOL(profile_pc);
  * timer_interrupt() needs to keep up the real-time clock,
  * as well as call the "do_timer()" routine every clocktick
  */
-static inline void do_timer_interrupt(int irq, void *dev_id,
-					struct pt_regs *regs)
+static inline void do_timer_interrupt(int irq, struct pt_regs *regs)
 {
 #ifdef CONFIG_X86_IO_APIC
 	if (timer_ack) {
@@ -307,7 +306,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 
 	cur_timer->mark_offset();
  
-	do_timer_interrupt(irq, NULL, regs);
+	do_timer_interrupt(irq, regs);
 
 	write_sequnlock(&xtime_lock);
 	return IRQ_HANDLED;
diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index df7a9b9d4cbf..d5f5aedde0a3 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -234,7 +234,7 @@ static long last_rtc_update;
  * timer_interrupt() needs to keep up the real-time clock,
  * as well as call the "do_timer()" routine every clocktick
  */
-static inline void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+static inline void do_timer_interrupt(int irq, struct pt_regs *regs)
 {
 	do_timer(regs);
 #ifndef CONFIG_SMP
@@ -285,7 +285,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 	 * locally disabled. -arca
 	 */
 	write_seqlock(&xtime_lock);
-	do_timer_interrupt(irq, NULL, regs);
+	do_timer_interrupt(irq, regs);
 	write_sequnlock(&xtime_lock);
 
 	return IRQ_HANDLED;
diff --git a/arch/sh64/kernel/time.c b/arch/sh64/kernel/time.c
index 6c84da3efc73..926c6fc0619c 100644
--- a/arch/sh64/kernel/time.c
+++ b/arch/sh64/kernel/time.c
@@ -303,7 +303,7 @@ static long last_rtc_update = 0;
  * timer_interrupt() needs to keep up the real-time clock,
  * as well as call the "do_timer()" routine every clocktick
  */
-static inline void do_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+static inline void do_timer_interrupt(int irq, struct pt_regs *regs)
 {
 	unsigned long long current_ctc;
 	asm ("getcon cr62, %0" : "=r" (current_ctc));
@@ -361,7 +361,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 	 * locally disabled. -arca
 	 */
 	write_lock(&xtime_lock);
-	do_timer_interrupt(irq, NULL, regs);
+	do_timer_interrupt(irq, regs);
 	write_unlock(&xtime_lock);
 
 	return IRQ_HANDLED;
-- 
cgit v1.2.3


From 6c231b7bab0aa6860cd9da2de8a064eddc34c146 Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Tue, 6 Sep 2005 15:17:45 -0700
Subject: [PATCH] Additions to .data.read_mostly section

Mark variables which are usually accessed for reads with __readmostly.

Signed-off-by: Alok N Kataria <alokk@calsoftinc.com>
Signed-off-by: Shai Fultheim <shai@scalex86.org>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/io_apic.c           | 10 +++++-----
 arch/i386/kernel/timers/timer_hpet.c |  2 +-
 arch/i386/mm/discontig.c             |  8 ++++----
 arch/i386/mm/init.c                  |  2 +-
 arch/x86_64/kernel/genapic.c         |  2 +-
 arch/x86_64/kernel/io_apic.c         | 10 +++++-----
 arch/x86_64/kernel/setup.c           |  2 +-
 arch/x86_64/kernel/setup64.c         |  2 +-
 arch/x86_64/kernel/smpboot.c         | 10 +++++-----
 arch/x86_64/mm/numa.c                |  6 +++---
 fs/namespace.c                       |  2 +-
 mm/page_alloc.c                      |  4 ++--
 mm/shmem.c                           |  2 +-
 13 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 4a5940431579..0e727e6da5c9 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -78,7 +78,7 @@ static struct irq_pin_list {
 	int apic, pin, next;
 } irq_2_pin[PIN_MAP_SIZE];
 
-int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
+int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
 #ifdef CONFIG_PCI_MSI
 #define vector_to_irq(vector) 	\
 	(platform_legacy_irq(vector) ? vector : vector_irq[vector])
@@ -1119,7 +1119,7 @@ static inline int IO_APIC_irq_trigger(int irq)
 }
 
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
+u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
 
 int assign_irq_vector(int irq)
 {
@@ -1990,7 +1990,7 @@ static void set_ioapic_affinity_vector (unsigned int vector,
  * edge-triggered handler, without risking IRQ storms and other ugly
  * races.
  */
-static struct hw_interrupt_type ioapic_edge_type = {
+static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
 	.typename 	= "IO-APIC-edge",
 	.startup 	= startup_edge_ioapic,
 	.shutdown 	= shutdown_edge_ioapic,
@@ -2003,7 +2003,7 @@ static struct hw_interrupt_type ioapic_edge_type = {
 #endif
 };
 
-static struct hw_interrupt_type ioapic_level_type = {
+static struct hw_interrupt_type ioapic_level_type __read_mostly = {
 	.typename 	= "IO-APIC-level",
 	.startup 	= startup_level_ioapic,
 	.shutdown 	= shutdown_level_ioapic,
@@ -2076,7 +2076,7 @@ static void ack_lapic_irq (unsigned int irq)
 
 static void end_lapic_irq (unsigned int i) { /* nothing */ }
 
-static struct hw_interrupt_type lapic_irq_type = {
+static struct hw_interrupt_type lapic_irq_type __read_mostly = {
 	.typename 	= "local-APIC-edge",
 	.startup 	= NULL, /* startup_irq() not used for IRQ0 */
 	.shutdown 	= NULL, /* shutdown_irq() not used for IRQ0 */
diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c
index 6dbb29f834e8..d973a8b681fd 100644
--- a/arch/i386/kernel/timers/timer_hpet.c
+++ b/arch/i386/kernel/timers/timer_hpet.c
@@ -19,7 +19,7 @@
 #include <asm/hpet.h>
 
 static unsigned long hpet_usec_quotient __read_mostly;	/* convert hpet clks to usec */
-static unsigned long tsc_hpet_quotient;		/* convert tsc to hpet clks */
+static unsigned long tsc_hpet_quotient __read_mostly;	/* convert tsc to hpet clks */
 static unsigned long hpet_last; 	/* hpet counter value at last tick*/
 static unsigned long last_tsc_low;	/* lsb 32 bits of Time Stamp Counter */
 static unsigned long last_tsc_high; 	/* msb 32 bits of Time Stamp Counter */
diff --git a/arch/i386/mm/discontig.c b/arch/i386/mm/discontig.c
index 6711ce3f6916..244d8ec66be2 100644
--- a/arch/i386/mm/discontig.c
+++ b/arch/i386/mm/discontig.c
@@ -37,7 +37,7 @@
 #include <asm/mmzone.h>
 #include <bios_ebda.h>
 
-struct pglist_data *node_data[MAX_NUMNODES];
+struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 bootmem_data_t node0_bdata;
 
@@ -49,8 +49,8 @@ bootmem_data_t node0_bdata;
  * 2) node_start_pfn   - the starting page frame number for a node
  * 3) node_end_pfn     - the ending page fram number for a node
  */
-unsigned long node_start_pfn[MAX_NUMNODES];
-unsigned long node_end_pfn[MAX_NUMNODES];
+unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
+unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
 
 
 #ifdef CONFIG_DISCONTIGMEM
@@ -66,7 +66,7 @@ unsigned long node_end_pfn[MAX_NUMNODES];
  *     physnode_map[4-7] = 1;
  *     physnode_map[8- ] = -1;
  */
-s8 physnode_map[MAX_ELEMENTS] = { [0 ... (MAX_ELEMENTS - 1)] = -1};
+s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1};
 EXPORT_SYMBOL(physnode_map);
 
 void memory_present(int nid, unsigned long start, unsigned long end)
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 9edfc058b894..2ebaf75f732e 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -393,7 +393,7 @@ void zap_low_mappings (void)
 }
 
 static int disable_nx __initdata = 0;
-u64 __supported_pte_mask = ~_PAGE_NX;
+u64 __supported_pte_mask __read_mostly = ~_PAGE_NX;
 
 /*
  * noexec = on|off
diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c
index 30c843a5efdd..f062aa03bab7 100644
--- a/arch/x86_64/kernel/genapic.c
+++ b/arch/x86_64/kernel/genapic.c
@@ -25,7 +25,7 @@
 #endif
 
 /* which logical CPU number maps to which CPU (physical APIC ID) */
-u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 EXPORT_SYMBOL(x86_cpu_to_apicid);
 u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
 
diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c
index 76bcc4e6979d..ba1a744e9bf0 100644
--- a/arch/x86_64/kernel/io_apic.c
+++ b/arch/x86_64/kernel/io_apic.c
@@ -70,7 +70,7 @@ static struct irq_pin_list {
 	short apic, pin, next;
 } irq_2_pin[PIN_MAP_SIZE];
 
-int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
+int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1};
 #ifdef CONFIG_PCI_MSI
 #define vector_to_irq(vector) 	\
 	(platform_legacy_irq(vector) ? vector : vector_irq[vector])
@@ -683,7 +683,7 @@ static inline int IO_APIC_irq_trigger(int irq)
 }
 
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
-u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
+u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
 
 int assign_irq_vector(int irq)
 {
@@ -1443,7 +1443,7 @@ static void set_ioapic_affinity_vector (unsigned int vector,
  * races.
  */
 
-static struct hw_interrupt_type ioapic_edge_type = {
+static struct hw_interrupt_type ioapic_edge_type __read_mostly = {
 	.typename = "IO-APIC-edge",
 	.startup 	= startup_edge_ioapic,
 	.shutdown 	= shutdown_edge_ioapic,
@@ -1456,7 +1456,7 @@ static struct hw_interrupt_type ioapic_edge_type = {
 #endif
 };
 
-static struct hw_interrupt_type ioapic_level_type = {
+static struct hw_interrupt_type ioapic_level_type __read_mostly = {
 	.typename = "IO-APIC-level",
 	.startup 	= startup_level_ioapic,
 	.shutdown 	= shutdown_level_ioapic,
@@ -1529,7 +1529,7 @@ static void ack_lapic_irq (unsigned int irq)
 
 static void end_lapic_irq (unsigned int i) { /* nothing */ }
 
-static struct hw_interrupt_type lapic_irq_type = {
+static struct hw_interrupt_type lapic_irq_type __read_mostly = {
 	.typename = "local-APIC-edge",
 	.startup = NULL, /* startup_irq() not used for IRQ0 */
 	.shutdown = NULL, /* shutdown_irq() not used for IRQ0 */
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index 116a491e2961..b356f8e6adfe 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -65,7 +65,7 @@
  * Machine setup..
  */
 
-struct cpuinfo_x86 boot_cpu_data;
+struct cpuinfo_x86 boot_cpu_data __read_mostly;
 
 unsigned long mmu_cr4_features;
 
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 34082c1cc41e..e3ffcacc8c90 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -36,7 +36,7 @@ struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table };
 
 char boot_cpu_stack[IRQSTACKSIZE] __attribute__((section(".bss.page_aligned")));
 
-unsigned long __supported_pte_mask = ~0UL;
+unsigned long __supported_pte_mask __read_mostly = ~0UL;
 static int do_not_nx __initdata = 0;
 
 /* noexec=on|off
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index fa25e39fe54d..90aeccd15190 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -62,13 +62,13 @@
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
 /* Package ID of each logical CPU */
-u8 phys_proc_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
-u8 cpu_core_id[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
 EXPORT_SYMBOL(phys_proc_id);
 EXPORT_SYMBOL(cpu_core_id);
 
 /* Bitmask of currently online CPUs */
-cpumask_t cpu_online_map;
+cpumask_t cpu_online_map __read_mostly;
 
 EXPORT_SYMBOL(cpu_online_map);
 
@@ -88,8 +88,8 @@ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned;
 /* Set when the idlers are all forked */
 int smp_threads_ready;
 
-cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
-cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned;
+cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
+cpumask_t cpu_core_map[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(cpu_core_map);
 
 /*
diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index 6a156f5692ae..04f7a33e144c 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -22,14 +22,14 @@
 #define Dprintk(x...)
 #endif
 
-struct pglist_data *node_data[MAX_NUMNODES];
+struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
 
 int memnode_shift;
 u8  memnodemap[NODEMAPSIZE];
 
-unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
-cpumask_t     node_to_cpumask[MAX_NUMNODES];
+unsigned char cpu_to_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
+cpumask_t     node_to_cpumask[MAX_NUMNODES] __read_mostly;
 
 int numa_off __initdata;
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 79bd8a46e1e7..29b70669435c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -40,7 +40,7 @@ static inline int sysfs_init(void)
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
 
 static struct list_head *mount_hashtable;
-static int hash_mask, hash_bits;
+static int hash_mask __read_mostly, hash_bits __read_mostly;
 static kmem_cache_t *mnt_cache; 
 
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 34bba8f1144e..14d7032c1d12 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -47,8 +47,8 @@ EXPORT_SYMBOL(node_online_map);
 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
 EXPORT_SYMBOL(node_possible_map);
 struct pglist_data *pgdat_list __read_mostly;
-unsigned long totalram_pages;
-unsigned long totalhigh_pages;
+unsigned long totalram_pages __read_mostly;
+unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
 
 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index bdc4bbb6ddbb..db2c9e8d9909 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -180,7 +180,7 @@ static struct inode_operations shmem_inode_operations;
 static struct inode_operations shmem_dir_inode_operations;
 static struct vm_operations_struct shmem_vm_ops;
 
-static struct backing_dev_info shmem_backing_dev_info = {
+static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
 	.ra_pages	= 0,	/* No readahead */
 	.capabilities	= BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK,
 	.unplug_io_fn	= default_unplug_io_fn,
-- 
cgit v1.2.3


From b149ee2233edf08fb59b11e879a2c5941929bcb8 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Tue, 6 Sep 2005 15:17:46 -0700
Subject: [PATCH] NTP: ntp-helper functions

This patch cleans up a commonly repeated set of changes to the NTP state
variables by adding two helper inline functions:

ntp_clear(): Clears the ntp state variables

ntp_synced(): Returns 1 if the system is synced with a time server.

This was compile tested for alpha, arm, i386, x86-64, ppc64, s390, sparc,
sparc64.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/time.c         |  7 ++-----
 arch/arm/kernel/time.c           |  7 ++-----
 arch/arm26/kernel/time.c         |  7 ++-----
 arch/cris/arch-v10/kernel/time.c |  2 +-
 arch/cris/kernel/time.c          |  5 +----
 arch/frv/kernel/time.c           |  7 ++-----
 arch/h8300/kernel/time.c         |  5 +----
 arch/i386/kernel/time.c          |  7 ++-----
 arch/m32r/kernel/time.c          |  7 ++-----
 arch/m68k/kernel/time.c          |  5 +----
 arch/m68knommu/kernel/time.c     |  7 ++-----
 arch/mips/kernel/sysirix.c       |  5 +----
 arch/mips/kernel/time.c          |  7 ++-----
 arch/mips/sgi-ip27/ip27-timer.c  |  2 +-
 arch/parisc/kernel/time.c        |  5 +----
 arch/ppc/kernel/time.c           |  7 ++-----
 arch/ppc64/kernel/time.c         |  7 ++-----
 arch/s390/kernel/time.c          |  5 +----
 arch/sh/kernel/time.c            |  7 ++-----
 arch/sh64/kernel/time.c          |  7 ++-----
 arch/sparc/kernel/pcic.c         |  5 +----
 arch/sparc/kernel/time.c         |  7 ++-----
 arch/sparc64/kernel/time.c       |  2 +-
 arch/v850/kernel/time.c          |  7 ++-----
 arch/x86_64/kernel/time.c        |  7 ++-----
 arch/xtensa/kernel/time.c        |  7 ++-----
 include/linux/timex.h            | 23 +++++++++++++++++++++++
 27 files changed, 65 insertions(+), 111 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c
index 8226c5cd788c..67be50b7d80a 100644
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -149,7 +149,7 @@ irqreturn_t timer_interrupt(int irq, void *dev, struct pt_regs * regs)
 	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
 	 * called as close as possible to 500 ms before the new second starts.
 	 */
-	if ((time_status & STA_UNSYNC) == 0
+	if (ntp_synced()
 	    && xtime.tv_sec > state.last_rtc_update + 660
 	    && xtime.tv_nsec >= 500000 - ((unsigned) TICK_SIZE) / 2
 	    && xtime.tv_nsec <= 500000 + ((unsigned) TICK_SIZE) / 2) {
@@ -502,10 +502,7 @@ do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c
index 8880482dcbff..69449a818dcc 100644
--- a/arch/arm/kernel/time.c
+++ b/arch/arm/kernel/time.c
@@ -102,7 +102,7 @@ static unsigned long next_rtc_update;
  */
 static inline void do_set_rtc(void)
 {
-	if (time_status & STA_UNSYNC || set_rtc == NULL)
+	if (!ntp_synced() || set_rtc == NULL)
 		return;
 
 	if (next_rtc_update &&
@@ -292,10 +292,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
diff --git a/arch/arm26/kernel/time.c b/arch/arm26/kernel/time.c
index 549a6b2e177e..e66aedd02fad 100644
--- a/arch/arm26/kernel/time.c
+++ b/arch/arm26/kernel/time.c
@@ -114,7 +114,7 @@ static unsigned long next_rtc_update;
  */
 static inline void do_set_rtc(void)
 {
-	if (time_status & STA_UNSYNC || set_rtc == NULL)
+	if (!ntp_synced() || set_rtc == NULL)
 		return;
 
 //FIXME - timespec.tv_sec is a time_t not unsigned long
@@ -189,10 +189,7 @@ int do_settimeofday(struct timespec *tv)
 
 	xtime.tv_sec = tv->tv_sec;
 	xtime.tv_nsec = tv->tv_nsec;
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
diff --git a/arch/cris/arch-v10/kernel/time.c b/arch/cris/arch-v10/kernel/time.c
index 6b7b4e0802e3..dc3dfe9b4a1a 100644
--- a/arch/cris/arch-v10/kernel/time.c
+++ b/arch/cris/arch-v10/kernel/time.c
@@ -240,7 +240,7 @@ timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 	 * The division here is not time critical since it will run once in 
 	 * 11 minutes
 	 */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - (tick_nsec / 1000) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + (tick_nsec / 1000) / 2) {
diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c
index fa2d4323da25..a2d99b4aedcd 100644
--- a/arch/cris/kernel/time.c
+++ b/arch/cris/kernel/time.c
@@ -114,10 +114,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c
index 075db6644694..8d6558b00e44 100644
--- a/arch/frv/kernel/time.c
+++ b/arch/frv/kernel/time.c
@@ -85,7 +85,7 @@ static irqreturn_t timer_interrupt(int irq, void *dummy, struct pt_regs * regs)
 	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
 	 * called as close as possible to 500 ms before the new second starts.
 	 */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2
@@ -216,10 +216,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
diff --git a/arch/h8300/kernel/time.c b/arch/h8300/kernel/time.c
index 8a600218334d..af8c5d2057dd 100644
--- a/arch/h8300/kernel/time.c
+++ b/arch/h8300/kernel/time.c
@@ -116,10 +116,7 @@ int do_settimeofday(struct timespec *tv)
 
 	xtime.tv_sec = tv->tv_sec;
 	xtime.tv_nsec = tv->tv_nsec;
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 9b94d84a6c3b..eefea7c55008 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -194,10 +194,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
@@ -347,7 +344,7 @@ static void sync_cmos_clock(unsigned long dummy)
 	 * This code is run on a timer.  If the clock is set, that timer
 	 * may not expire at the correct time.  Thus, we adjust...
 	 */
-	if ((time_status & STA_UNSYNC) != 0)
+	if (!ntp_synced())
 		/*
 		 * Not synced, exit, do not restart a timer (if one is
 		 * running, let it run out).
diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c
index 8a2b77bc5749..539c562cd54d 100644
--- a/arch/m32r/kernel/time.c
+++ b/arch/m32r/kernel/time.c
@@ -171,10 +171,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 
@@ -221,7 +218,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 	 * called as close as possible to 500 ms before the new second starts.
 	 */
 	write_seqlock(&xtime_lock);
-	if ((time_status & STA_UNSYNC) == 0
+	if (ntp_synced()
 		&& xtime.tv_sec > last_rtc_update + 660
 		&& (xtime.tv_nsec / 1000) >= 500000 - ((unsigned)TICK_SIZE) / 2
 		&& (xtime.tv_nsec / 1000) <= 500000 + ((unsigned)TICK_SIZE) / 2)
diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c
index e47e19588525..4ec95e3cb874 100644
--- a/arch/m68k/kernel/time.c
+++ b/arch/m68k/kernel/time.c
@@ -166,10 +166,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c
index 5c3ca671627c..b17c1ecba966 100644
--- a/arch/m68knommu/kernel/time.c
+++ b/arch/m68knommu/kernel/time.c
@@ -68,7 +68,7 @@ static irqreturn_t timer_interrupt(int irq, void *dummy, struct pt_regs * regs)
 	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
 	 * called as close as possible to 500 ms before the new second starts.
 	 */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec  / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
@@ -178,10 +178,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c
index f3bf0e43b8bb..b46595462717 100644
--- a/arch/mips/kernel/sysirix.c
+++ b/arch/mips/kernel/sysirix.c
@@ -632,10 +632,7 @@ asmlinkage int irix_stime(int value)
 	write_seqlock_irq(&xtime_lock);
 	xtime.tv_sec = value;
 	xtime.tv_nsec = 0;
-	time_adjust = 0;			/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 
 	return 0;
diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c
index 648c82292ed6..0dd0df7a3b04 100644
--- a/arch/mips/kernel/time.c
+++ b/arch/mips/kernel/time.c
@@ -223,10 +223,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;			/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
@@ -442,7 +439,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 	 * called as close as possible to 500 ms before the new second starts.
 	 */
 	write_seqlock(&xtime_lock);
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
diff --git a/arch/mips/sgi-ip27/ip27-timer.c b/arch/mips/sgi-ip27/ip27-timer.c
index 8c1b96fffa76..cddf1cedf007 100644
--- a/arch/mips/sgi-ip27/ip27-timer.c
+++ b/arch/mips/sgi-ip27/ip27-timer.c
@@ -118,7 +118,7 @@ again:
 	 * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
 	 * called as close as possible to when a second starts.
 	 */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 6cf7407344ba..7ff67f8e9f8c 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -188,10 +188,7 @@ do_settimeofday (struct timespec *tv)
 		set_normalized_timespec(&xtime, sec, nsec);
 		set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-		time_adjust = 0;		/* stop active adjtime() */
-		time_status |= STA_UNSYNC;
-		time_maxerror = NTP_PHASE_LIMIT;
-		time_esterror = NTP_PHASE_LIMIT;
+		ntp_clear();
 	}
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
diff --git a/arch/ppc/kernel/time.c b/arch/ppc/kernel/time.c
index bf4ddca5e853..a3c5281a5d2d 100644
--- a/arch/ppc/kernel/time.c
+++ b/arch/ppc/kernel/time.c
@@ -169,7 +169,7 @@ void timer_interrupt(struct pt_regs * regs)
 		 * We should have an rtc call that only sets the minutes and
 		 * seconds like on Intel to avoid problems with non UTC clocks.
 		 */
-		if ( ppc_md.set_rtc_time && (time_status & STA_UNSYNC) == 0 &&
+		if ( ppc_md.set_rtc_time && ntp_synced() &&
 		     xtime.tv_sec - last_rtc_update >= 659 &&
 		     abs((xtime.tv_nsec / 1000) - (1000000-1000000/HZ)) < 500000/HZ &&
 		     jiffies - wall_jiffies == 1) {
@@ -271,10 +271,7 @@ int do_settimeofday(struct timespec *tv)
 	 */
 	last_rtc_update = new_sec - 658;
 
-	time_adjust = 0;                /* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 	clock_was_set();
 	return 0;
diff --git a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c
index 91ef95ccda4f..9939c206afa4 100644
--- a/arch/ppc64/kernel/time.c
+++ b/arch/ppc64/kernel/time.c
@@ -128,7 +128,7 @@ static __inline__ void timer_check_rtc(void)
          * We should have an rtc call that only sets the minutes and
          * seconds like on Intel to avoid problems with non UTC clocks.
          */
-        if ( (time_status & STA_UNSYNC) == 0 &&
+        if (ntp_synced() &&
              xtime.tv_sec - last_rtc_update >= 659 &&
              abs((xtime.tv_nsec/1000) - (1000000-1000000/HZ)) < 500000/HZ &&
              jiffies - wall_jiffies == 1) {
@@ -435,10 +435,7 @@ int do_settimeofday(struct timespec *tv)
 	 */
 	last_rtc_update = new_sec - 658;
 
-	time_adjust = 0;                /* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 
 	delta_xsec = mulhdu( (tb_last_stamp-do_gtod.varp->tb_orig_stamp),
 			     do_gtod.varp->tb_to_xs );
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 8ca485676780..2fd75da15495 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -139,10 +139,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 	return 0;
diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index d5f5aedde0a3..02ca69918d7c 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -215,10 +215,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 
@@ -252,7 +249,7 @@ static inline void do_timer_interrupt(int irq, struct pt_regs *regs)
 	 * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
 	 * called as close as possible to 500 ms before the new second starts.
 	 */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
diff --git a/arch/sh64/kernel/time.c b/arch/sh64/kernel/time.c
index 926c6fc0619c..f4a62a10053c 100644
--- a/arch/sh64/kernel/time.c
+++ b/arch/sh64/kernel/time.c
@@ -247,10 +247,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 
@@ -328,7 +325,7 @@ static inline void do_timer_interrupt(int irq, struct pt_regs *regs)
 	 * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
 	 * called as close as possible to 500 ms before the new second starts.
 	 */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
diff --git a/arch/sparc/kernel/pcic.c b/arch/sparc/kernel/pcic.c
index 597d3ff6ad68..36a40697b8d6 100644
--- a/arch/sparc/kernel/pcic.c
+++ b/arch/sparc/kernel/pcic.c
@@ -840,10 +840,7 @@ static int pci_do_settimeofday(struct timespec *tv)
 
 	xtime.tv_sec = tv->tv_sec;
 	xtime.tv_nsec = tv->tv_nsec;
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	return 0;
 }
 
diff --git a/arch/sparc/kernel/time.c b/arch/sparc/kernel/time.c
index 3b759aefc170..bc015e980341 100644
--- a/arch/sparc/kernel/time.c
+++ b/arch/sparc/kernel/time.c
@@ -139,7 +139,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs * regs)
 
 
 	/* Determine when to update the Mostek clock. */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
@@ -554,10 +554,7 @@ static int sbus_do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	return 0;
 }
 
diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c
index 362b9c26871b..3f08a32f51a1 100644
--- a/arch/sparc64/kernel/time.c
+++ b/arch/sparc64/kernel/time.c
@@ -449,7 +449,7 @@ static inline void timer_check_rtc(void)
 	static long last_rtc_update;
 
 	/* Determine when to update the Mostek clock. */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
diff --git a/arch/v850/kernel/time.c b/arch/v850/kernel/time.c
index f722a268238a..ea3fd8844ff0 100644
--- a/arch/v850/kernel/time.c
+++ b/arch/v850/kernel/time.c
@@ -66,7 +66,7 @@ static irqreturn_t timer_interrupt (int irq, void *dummy, struct pt_regs *regs)
 	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
 	 * called as close as possible to 500 ms before the new second starts.
 	 */
-	if ((time_status & STA_UNSYNC) == 0 &&
+	if (ntp_synced() &&
 	    xtime.tv_sec > last_rtc_update + 660 &&
 	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
 	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
@@ -169,10 +169,7 @@ int do_settimeofday(struct timespec *tv)
 	xtime.tv_sec = tv->tv_sec;
 	xtime.tv_nsec = tv->tv_nsec;
 
-	time_adjust = 0;		/* stop active adjtime () */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 
 	write_sequnlock_irq (&xtime_lock);
 	clock_was_set();
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 2b5d9da912a2..7b6abe058253 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -176,10 +176,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
@@ -471,7 +468,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
  * off) isn't likely to go away much sooner anyway.
  */
 
-	if ((~time_status & STA_UNSYNC) && xtime.tv_sec > rtc_update &&
+	if (ntp_synced() && xtime.tv_sec > rtc_update &&
 		abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) {
 		set_rtc_mmss(xtime.tv_sec);
 		rtc_update = xtime.tv_sec + 660;
diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c
index e07287db5a40..1ac7d5ce7456 100644
--- a/arch/xtensa/kernel/time.c
+++ b/arch/xtensa/kernel/time.c
@@ -122,10 +122,7 @@ int do_settimeofday(struct timespec *tv)
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-	time_adjust = 0;                /* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	ntp_clear();
 	write_sequnlock_irq(&xtime_lock);
 	return 0;
 }
@@ -184,7 +181,7 @@ again:
 		next += CCOUNT_PER_JIFFY;
 		do_timer (regs); /* Linux handler in kernel/timer.c */
 
-		if ((time_status & STA_UNSYNC) == 0 &&
+		if (ntp_synced() &&
 		    xtime.tv_sec - last_rtc_update >= 659 &&
 		    abs((xtime.tv_nsec/1000)-(1000000-1000000/HZ))<5000000/HZ &&
 		    jiffies - wall_jiffies == 1) {
diff --git a/include/linux/timex.h b/include/linux/timex.h
index 74fdd07d3792..7e050a2cc35b 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -260,6 +260,29 @@ extern long pps_calcnt;		/* calibration intervals */
 extern long pps_errcnt;		/* calibration errors */
 extern long pps_stbcnt;		/* stability limit exceeded */
 
+/**
+ * ntp_clear - Clears the NTP state variables
+ *
+ * Must be called while holding a write on the xtime_lock
+ */
+static inline void ntp_clear(void)
+{
+	time_adjust = 0;		/* stop active adjtime() */
+	time_status |= STA_UNSYNC;
+	time_maxerror = NTP_PHASE_LIMIT;
+	time_esterror = NTP_PHASE_LIMIT;
+}
+
+/**
+ * ntp_synced - Returns 1 if the NTP status is not UNSYNC
+ *
+ */
+static inline int ntp_synced(void)
+{
+	return !(time_status & STA_UNSYNC);
+}
+
+
 #ifdef CONFIG_TIME_INTERPOLATION
 
 #define TIME_SOURCE_CPU 0
-- 
cgit v1.2.3


From 61e032fa2f659fada02ede5087b46963a1c7de34 Mon Sep 17 00:00:00 2001
From: Andrey Panin <pazke@donpac.ru>
Date: Tue, 6 Sep 2005 15:18:26 -0700
Subject: [PATCH] dmi: remove uneeded function

After elimination of central DMI blacklist dmi_scan_machine() function became
a wrapper for dmi_iterate().  This patch moves some code around to kill
unneeded function.

Signed-off-by: Andrey Panin <pazke@donpac.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/dmi_scan.c | 85 +++++++++++++++++++++------------------------
 1 file changed, 40 insertions(+), 45 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c
index a3cdf894302b..6ae4ef472dbf 100644
--- a/arch/i386/kernel/dmi_scan.c
+++ b/arch/i386/kernel/dmi_scan.c
@@ -84,49 +84,6 @@ static int __init dmi_checksum(u8 *buf)
 	return sum == 0;
 }
 
-static int __init dmi_iterate(void (*decode)(struct dmi_header *))
-{
-	u8 buf[15];
-	char __iomem *p, *q;
-
-	/*
-	 * no iounmap() for that ioremap(); it would be a no-op, but it's
-	 * so early in setup that sucker gets confused into doing what
-	 * it shouldn't if we actually call it.
-	 */
-	p = ioremap(0xF0000, 0x10000);
-	if (p == NULL)
-		return -1;
-
-	for (q = p; q < p + 0x10000; q += 16) {
-		memcpy_fromio(buf, q, 15);
-		if ((memcmp(buf, "_DMI_", 5) == 0) && dmi_checksum(buf)) {
-			u16 num = (buf[13] << 8) | buf[12];
-			u16 len = (buf[7] << 8) | buf[6];
-			u32 base = (buf[11] << 24) | (buf[10] << 16) |
-				   (buf[9] << 8) | buf[8];
-
-			/*
-			 * DMI version 0.0 means that the real version is taken from
-			 * the SMBIOS version, which we don't know at this point.
-			 */
-			if (buf[14] != 0)
-				printk(KERN_INFO "DMI %d.%d present.\n",
-					buf[14] >> 4, buf[14] & 0xF);
-			else
-				printk(KERN_INFO "DMI present.\n");
-
-			dmi_printk((KERN_INFO "%d structures occupying %d bytes.\n",
-				num, len));
-			dmi_printk((KERN_INFO "DMI table at 0x%08X.\n", base));
-
-			if (dmi_table(base,len, num, decode) == 0)
-				return 0;
-		}
-	}
-	return -1;
-}
-
 static char *dmi_ident[DMI_STRING_MAX];
 
 /*
@@ -190,8 +147,46 @@ static void __init dmi_decode(struct dmi_header *dm)
 
 void __init dmi_scan_machine(void)
 {
-	if (dmi_iterate(dmi_decode))
-		printk(KERN_INFO "DMI not present.\n");
+	u8 buf[15];
+	char __iomem *p, *q;
+
+	/*
+	 * no iounmap() for that ioremap(); it would be a no-op, but it's
+	 * so early in setup that sucker gets confused into doing what
+	 * it shouldn't if we actually call it.
+	 */
+	p = ioremap(0xF0000, 0x10000);
+	if (p == NULL)
+		goto out;
+
+	for (q = p; q < p + 0x10000; q += 16) {
+		memcpy_fromio(buf, q, 15);
+		if ((memcmp(buf, "_DMI_", 5) == 0) && dmi_checksum(buf)) {
+			u16 num = (buf[13] << 8) | buf[12];
+			u16 len = (buf[7] << 8) | buf[6];
+			u32 base = (buf[11] << 24) | (buf[10] << 16) |
+				   (buf[9] << 8) | buf[8];
+
+			/*
+			 * DMI version 0.0 means that the real version is taken from
+			 * the SMBIOS version, which we don't know at this point.
+			 */
+			if (buf[14] != 0)
+				printk(KERN_INFO "DMI %d.%d present.\n",
+					buf[14] >> 4, buf[14] & 0xF);
+			else
+				printk(KERN_INFO "DMI present.\n");
+
+			dmi_printk((KERN_INFO "%d structures occupying %d bytes.\n",
+				num, len));
+			dmi_printk((KERN_INFO "DMI table at 0x%08X.\n", base));
+
+			if (dmi_table(base,len, num, dmi_decode) == 0)
+				return;
+		}
+	}
+
+out:	printk(KERN_INFO "DMI not present.\n");
 }
 
 
-- 
cgit v1.2.3


From 4e70b9a3d68909ad7e79bf6e1b0dcec6de922a7c Mon Sep 17 00:00:00 2001
From: Andrey Panin <pazke@donpac.ru>
Date: Tue, 6 Sep 2005 15:18:28 -0700
Subject: [PATCH] dmi: remove old debugging code

DMI debugging code is unused for ages.  This patch removes it.

Signed-off-by: Andrey Panin <pazke@donpac.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/dmi_scan.c | 21 ---------------------
 1 file changed, 21 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c
index 6ae4ef472dbf..ecdaae262755 100644
--- a/arch/i386/kernel/dmi_scan.c
+++ b/arch/i386/kernel/dmi_scan.c
@@ -12,13 +12,6 @@ struct dmi_header {
 	u16 handle;
 };
 
-#undef DMI_DEBUG
-
-#ifdef DMI_DEBUG
-#define dmi_printk(x) printk x
-#else
-#define dmi_printk(x)
-#endif
 
 static char * __init dmi_string(struct dmi_header *dm, u8 s)
 {
@@ -117,29 +110,19 @@ static void __init dmi_decode(struct dmi_header *dm)
 	
 	switch(dm->type) {
 	case  0:
-		dmi_printk(("BIOS Vendor: %s\n", dmi_string(dm, data[4])));
 		dmi_save_ident(dm, DMI_BIOS_VENDOR, 4);
-		dmi_printk(("BIOS Version: %s\n", dmi_string(dm, data[5])));
 		dmi_save_ident(dm, DMI_BIOS_VERSION, 5);
-		dmi_printk(("BIOS Release: %s\n", dmi_string(dm, data[8])));
 		dmi_save_ident(dm, DMI_BIOS_DATE, 8);
 		break;
 	case 1:
-		dmi_printk(("System Vendor: %s\n", dmi_string(dm, data[4])));
 		dmi_save_ident(dm, DMI_SYS_VENDOR, 4);
-		dmi_printk(("Product Name: %s\n", dmi_string(dm, data[5])));
 		dmi_save_ident(dm, DMI_PRODUCT_NAME, 5);
-		dmi_printk(("Version: %s\n", dmi_string(dm, data[6])));
 		dmi_save_ident(dm, DMI_PRODUCT_VERSION, 6);
-		dmi_printk(("Serial Number: %s\n", dmi_string(dm, data[7])));
 		dmi_save_ident(dm, DMI_PRODUCT_SERIAL, 7);
 		break;
 	case 2:
-		dmi_printk(("Board Vendor: %s\n", dmi_string(dm, data[4])));
 		dmi_save_ident(dm, DMI_BOARD_VENDOR, 4);
-		dmi_printk(("Board Name: %s\n", dmi_string(dm, data[5])));
 		dmi_save_ident(dm, DMI_BOARD_NAME, 5);
-		dmi_printk(("Board Version: %s\n", dmi_string(dm, data[6])));
 		dmi_save_ident(dm, DMI_BOARD_VERSION, 6);
 		break;
 	}
@@ -177,10 +160,6 @@ void __init dmi_scan_machine(void)
 			else
 				printk(KERN_INFO "DMI present.\n");
 
-			dmi_printk((KERN_INFO "%d structures occupying %d bytes.\n",
-				num, len));
-			dmi_printk((KERN_INFO "DMI table at 0x%08X.\n", base));
-
 			if (dmi_table(base,len, num, dmi_decode) == 0)
 				return;
 		}
-- 
cgit v1.2.3


From c3c7120d552989be94c9137989be5abb6da8954f Mon Sep 17 00:00:00 2001
From: Andrey Panin <pazke@donpac.ru>
Date: Tue, 6 Sep 2005 15:18:28 -0700
Subject: [PATCH] dmi: make dmi_string() behave like strdup()

This patch changes dmi_string() function to allocate string copy by itself, to
avoid code duplication in the next patch.

Signed-off-by: Andrey Panin <pazke@donpac.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/dmi_scan.c | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c
index ecdaae262755..ae1a1aed2fc0 100644
--- a/arch/i386/kernel/dmi_scan.c
+++ b/arch/i386/kernel/dmi_scan.c
@@ -16,15 +16,25 @@ struct dmi_header {
 static char * __init dmi_string(struct dmi_header *dm, u8 s)
 {
 	u8 *bp = ((u8 *) dm) + dm->length;
+	char *str = "";
 
-	if (!s)
-		return "";
-	s--;
-	while (s > 0 && *bp) {
-		bp += strlen(bp) + 1;
+	if (s) {
 		s--;
-	}
-	return bp;
+		while (s > 0 && *bp) {
+			bp += strlen(bp) + 1;
+			s--;
+		}
+
+		if (*bp != 0) {
+			str = alloc_bootmem(strlen(bp) + 1);
+			if (str != NULL)
+				strcpy(str, bp);
+			else
+				printk(KERN_ERR "dmi_string: out of memory.\n");
+		}
+ 	}
+
+	return str;
 }
 
 /*
@@ -84,19 +94,16 @@ static char *dmi_ident[DMI_STRING_MAX];
  */
 static void __init dmi_save_ident(struct dmi_header *dm, int slot, int string)
 {
-	char *d = (char*)dm;
-	char *p = dmi_string(dm, d[string]);
+	char *p, *d = (char*) dm;
 
-	if (p == NULL || *p == 0)
-		return;
 	if (dmi_ident[slot])
 		return;
 
-	dmi_ident[slot] = alloc_bootmem(strlen(p) + 1);
-	if(dmi_ident[slot])
-		strcpy(dmi_ident[slot], p);
-	else
-		printk(KERN_ERR "dmi_save_ident: out of memory.\n");
+	p = dmi_string(dm, d[string]);
+	if (p == NULL)
+		return;
+
+	dmi_ident[slot] = p;
 }
 
 /*
-- 
cgit v1.2.3


From ebad6a4230bdb5927495e28bc7837f515bf667a7 Mon Sep 17 00:00:00 2001
From: Andrey Panin <pazke@donpac.ru>
Date: Tue, 6 Sep 2005 15:18:29 -0700
Subject: [PATCH] dmi: add onboard devices discovery

This patch adds onboard devices and IPMI BMC discovery into DMI scan code.
Drivers can use dmi_find_device() function to search for devices by type and
name.

Signed-off-by: Andrey Panin <pazke@donpac.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/dmi_scan.c | 102 ++++++++++++++++++++++++++++++++++++++------
 include/linux/dmi.h         |  36 ++++++++++++++--
 2 files changed, 123 insertions(+), 15 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c
index ae1a1aed2fc0..c4a73855e38c 100644
--- a/arch/i386/kernel/dmi_scan.c
+++ b/arch/i386/kernel/dmi_scan.c
@@ -6,13 +6,6 @@
 #include <linux/bootmem.h>
 
 
-struct dmi_header {
-	u8 type;
-	u8 length;
-	u16 handle;
-};
-
-
 static char * __init dmi_string(struct dmi_header *dm, u8 s)
 {
 	u8 *bp = ((u8 *) dm) + dm->length;
@@ -88,6 +81,7 @@ static int __init dmi_checksum(u8 *buf)
 }
 
 static char *dmi_ident[DMI_STRING_MAX];
+static LIST_HEAD(dmi_devices);
 
 /*
  *	Save a DMI string
@@ -106,6 +100,58 @@ static void __init dmi_save_ident(struct dmi_header *dm, int slot, int string)
 	dmi_ident[slot] = p;
 }
 
+static void __init dmi_save_devices(struct dmi_header *dm)
+{
+	int i, count = (dm->length - sizeof(struct dmi_header)) / 2;
+	struct dmi_device *dev;
+
+	for (i = 0; i < count; i++) {
+		char *d = ((char *) dm) + (i * 2);
+
+		/* Skip disabled device */
+		if ((*d & 0x80) == 0)
+			continue;
+
+		dev = alloc_bootmem(sizeof(*dev));
+		if (!dev) {
+			printk(KERN_ERR "dmi_save_devices: out of memory.\n");
+			break;
+		}
+
+		dev->type = *d++ & 0x7f;
+		dev->name = dmi_string(dm, *d);
+		dev->device_data = NULL;
+
+		list_add(&dev->list, &dmi_devices);
+	}
+}
+
+static void __init dmi_save_ipmi_device(struct dmi_header *dm)
+{
+	struct dmi_device *dev;
+	void * data;
+
+	data = alloc_bootmem(dm->length);
+	if (data == NULL) {
+		printk(KERN_ERR "dmi_save_ipmi_device: out of memory.\n");
+		return;
+	}
+
+	memcpy(data, dm, dm->length);
+
+	dev = alloc_bootmem(sizeof(*dev));
+	if (!dev) {
+		printk(KERN_ERR "dmi_save_ipmi_device: out of memory.\n");
+		return;
+	}
+
+	dev->type = DMI_DEV_TYPE_IPMI;
+	dev->name = "IPMI controller";
+	dev->device_data = data;
+
+	list_add(&dev->list, &dmi_devices);
+}
+
 /*
  *	Process a DMI table entry. Right now all we care about are the BIOS
  *	and machine entries. For 2.5 we should pull the smbus controller info
@@ -113,25 +159,28 @@ static void __init dmi_save_ident(struct dmi_header *dm, int slot, int string)
  */
 static void __init dmi_decode(struct dmi_header *dm)
 {
-	u8 *data __attribute__((__unused__)) = (u8 *)dm;
-	
 	switch(dm->type) {
-	case  0:
+	case 0:		/* BIOS Information */
 		dmi_save_ident(dm, DMI_BIOS_VENDOR, 4);
 		dmi_save_ident(dm, DMI_BIOS_VERSION, 5);
 		dmi_save_ident(dm, DMI_BIOS_DATE, 8);
 		break;
-	case 1:
+	case 1:		/* System Information */
 		dmi_save_ident(dm, DMI_SYS_VENDOR, 4);
 		dmi_save_ident(dm, DMI_PRODUCT_NAME, 5);
 		dmi_save_ident(dm, DMI_PRODUCT_VERSION, 6);
 		dmi_save_ident(dm, DMI_PRODUCT_SERIAL, 7);
 		break;
-	case 2:
+	case 2:		/* Base Board Information */
 		dmi_save_ident(dm, DMI_BOARD_VENDOR, 4);
 		dmi_save_ident(dm, DMI_BOARD_NAME, 5);
 		dmi_save_ident(dm, DMI_BOARD_VERSION, 6);
 		break;
+	case 10:	/* Onboard Devices Information */
+		dmi_save_devices(dm);
+		break;
+	case 38:	/* IPMI Device Information */
+		dmi_save_ipmi_device(dm);
 	}
 }
 
@@ -221,3 +270,32 @@ char *dmi_get_system_info(int field)
 	return dmi_ident[field];
 }
 EXPORT_SYMBOL(dmi_get_system_info);
+
+/**
+ *	dmi_find_device - find onboard device by type/name
+ *	@type: device type or %DMI_DEV_TYPE_ANY to match all device types
+ *	@desc: device name string or %NULL to match all
+ *	@from: previous device found in search, or %NULL for new search.
+ *
+ *	Iterates through the list of known onboard devices. If a device is
+ *	found with a matching @vendor and @device, a pointer to its device
+ *	structure is returned.  Otherwise, %NULL is returned.
+ *	A new search is initiated by passing %NULL to the @from argument.
+ *	If @from is not %NULL, searches continue from next device.
+ */
+struct dmi_device * dmi_find_device(int type, const char *name,
+				    struct dmi_device *from)
+{
+	struct list_head *d, *head = from ? &from->list : &dmi_devices;
+
+	for(d = head->next; d != &dmi_devices; d = d->next) {
+		struct dmi_device *dev = list_entry(d, struct dmi_device, list);
+
+		if (((type == DMI_DEV_TYPE_ANY) || (dev->type == type)) &&
+		    ((name == NULL) || (strcmp(dev->name, name) == 0)))
+			return dev;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(dmi_find_device);
diff --git a/include/linux/dmi.h b/include/linux/dmi.h
index 5e93e6dce9a4..c30175e8dec6 100644
--- a/include/linux/dmi.h
+++ b/include/linux/dmi.h
@@ -1,6 +1,8 @@
 #ifndef __DMI_H__
 #define __DMI_H__
 
+#include <linux/list.h>
+
 enum dmi_field {
 	DMI_NONE,
 	DMI_BIOS_VENDOR,
@@ -16,6 +18,24 @@ enum dmi_field {
 	DMI_STRING_MAX,
 };
 
+enum dmi_device_type {
+	DMI_DEV_TYPE_ANY = 0,
+	DMI_DEV_TYPE_OTHER,
+	DMI_DEV_TYPE_UNKNOWN,
+	DMI_DEV_TYPE_VIDEO,
+	DMI_DEV_TYPE_SCSI,
+	DMI_DEV_TYPE_ETHERNET,
+	DMI_DEV_TYPE_TOKENRING,
+	DMI_DEV_TYPE_SOUND,
+	DMI_DEV_TYPE_IPMI = -1
+};
+
+struct dmi_header {
+	u8 type;
+	u8 length;
+	u16 handle;
+};
+
 /*
  *	DMI callbacks for problem boards
  */
@@ -26,22 +46,32 @@ struct dmi_strmatch {
 
 struct dmi_system_id {
 	int (*callback)(struct dmi_system_id *);
-	char *ident;
+	const char *ident;
 	struct dmi_strmatch matches[4];
 	void *driver_data;
 };
 
-#define DMI_MATCH(a,b)	{ a, b }
+#define DMI_MATCH(a, b)	{ a, b }
+
+struct dmi_device {
+	struct list_head list;
+	int type;
+	const char *name;
+	void *device_data;	/* Type specific data */
+};
 
 #if defined(CONFIG_X86) && !defined(CONFIG_X86_64)
 
 extern int dmi_check_system(struct dmi_system_id *list);
 extern char * dmi_get_system_info(int field);
-
+extern struct dmi_device * dmi_find_device(int type, const char *name,
+	struct dmi_device *from);
 #else
 
 static inline int dmi_check_system(struct dmi_system_id *list) { return 0; }
 static inline char * dmi_get_system_info(int field) { return NULL; }
+static struct dmi_device * dmi_find_device(int type, const char *name,
+	struct dmi_device *from) { return NULL; }
 
 #endif
 
-- 
cgit v1.2.3


From 640e803376b9c4072f69fec42e304c974a631298 Mon Sep 17 00:00:00 2001
From: Robert Love <rml@novell.com>
Date: Tue, 6 Sep 2005 15:18:30 -0700
Subject: [PATCH] fix: dmi_check_system

Background:

	1) dmi_check_system() returns the count of the number of
	   matches.  Zero thus means no matches.
	2) A match callback can return nonzero to stop the match
	   checking.

Bug: The count is incremented after we check for the nonzero return value,
so it does not reflect the actual count.  We could say this is intended,
for some dumb reason, except that it means that a match on the first check
returns zero--no matches--if the callback returns nonzero.

Attached patch implements the count before calling the callback and thus
before potentially short-circuiting.

Signed-off-by: Robert Love <rml@novell.com>
Cc: Andrey Panin <pazke@donpac.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/dmi_scan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/dmi_scan.c b/arch/i386/kernel/dmi_scan.c
index c4a73855e38c..58516e2ac172 100644
--- a/arch/i386/kernel/dmi_scan.c
+++ b/arch/i386/kernel/dmi_scan.c
@@ -248,9 +248,9 @@ int dmi_check_system(struct dmi_system_id *list)
 			/* No match */
 			goto fail;
 		}
+		count++;
 		if (d->callback && d->callback(d))
 			break;
-		count++;
 fail:		d++;
 	}
 
-- 
cgit v1.2.3


From 3d97ae5b958855ac007b6f56a0f94ab8ade09e9e Mon Sep 17 00:00:00 2001
From: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Date: Tue, 6 Sep 2005 15:19:27 -0700
Subject: [PATCH] kprobes: prevent possible race conditions i386 changes

This patch contains the i386 architecture specific changes to prevent the
possible race conditions.

Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/entry.S       | 13 ++++++++-----
 arch/i386/kernel/kprobes.c     | 29 +++++++++++++++--------------
 arch/i386/kernel/traps.c       | 12 +++++++-----
 arch/i386/kernel/vmlinux.lds.S |  1 +
 arch/i386/mm/fault.c           |  4 +++-
 5 files changed, 34 insertions(+), 25 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index abb909793efc..3aad03839660 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -507,7 +507,7 @@ label:						\
 	pushl $__KERNEL_CS;			\
 	pushl $sysenter_past_esp
 
-ENTRY(debug)
+KPROBE_ENTRY(debug)
 	cmpl $sysenter_entry,(%esp)
 	jne debug_stack_correct
 	FIX_STACK(12, debug_stack_correct, debug_esp_fix_insn)
@@ -518,7 +518,7 @@ debug_stack_correct:
 	movl %esp,%eax			# pt_regs pointer
 	call do_debug
 	jmp ret_from_exception
-
+	.previous .text
 /*
  * NMI is doubly nasty. It can happen _while_ we're handling
  * a debug fault, and the debug fault hasn't yet been able to
@@ -591,13 +591,14 @@ nmi_16bit_stack:
 	.long 1b,iret_exc
 .previous
 
-ENTRY(int3)
+KPROBE_ENTRY(int3)
 	pushl $-1			# mark this as an int
 	SAVE_ALL
 	xorl %edx,%edx		# zero error code
 	movl %esp,%eax		# pt_regs pointer
 	call do_int3
 	jmp ret_from_exception
+	.previous .text
 
 ENTRY(overflow)
 	pushl $0
@@ -631,17 +632,19 @@ ENTRY(stack_segment)
 	pushl $do_stack_segment
 	jmp error_code
 
-ENTRY(general_protection)
+KPROBE_ENTRY(general_protection)
 	pushl $do_general_protection
 	jmp error_code
+	.previous .text
 
 ENTRY(alignment_check)
 	pushl $do_alignment_check
 	jmp error_code
 
-ENTRY(page_fault)
+KPROBE_ENTRY(page_fault)
 	pushl $do_page_fault
 	jmp error_code
+	.previous .text
 
 #ifdef CONFIG_X86_MCE
 ENTRY(machine_check)
diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index a6d8c45961d3..7fb5a6f4a563 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -62,32 +62,32 @@ static inline int is_IF_modifier(kprobe_opcode_t opcode)
 	return 0;
 }
 
-int arch_prepare_kprobe(struct kprobe *p)
+int __kprobes arch_prepare_kprobe(struct kprobe *p)
 {
 	return 0;
 }
 
-void arch_copy_kprobe(struct kprobe *p)
+void __kprobes arch_copy_kprobe(struct kprobe *p)
 {
 	memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
 	p->opcode = *p->addr;
 }
 
-void arch_arm_kprobe(struct kprobe *p)
+void __kprobes arch_arm_kprobe(struct kprobe *p)
 {
 	*p->addr = BREAKPOINT_INSTRUCTION;
 	flush_icache_range((unsigned long) p->addr,
 			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
 
-void arch_disarm_kprobe(struct kprobe *p)
+void __kprobes arch_disarm_kprobe(struct kprobe *p)
 {
 	*p->addr = p->opcode;
 	flush_icache_range((unsigned long) p->addr,
 			   (unsigned long) p->addr + sizeof(kprobe_opcode_t));
 }
 
-void arch_remove_kprobe(struct kprobe *p)
+void __kprobes arch_remove_kprobe(struct kprobe *p)
 {
 }
 
@@ -127,7 +127,8 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 		regs->eip = (unsigned long)&p->ainsn.insn;
 }
 
-void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
+void __kprobes arch_prepare_kretprobe(struct kretprobe *rp,
+				      struct pt_regs *regs)
 {
 	unsigned long *sara = (unsigned long *)&regs->esp;
         struct kretprobe_instance *ri;
@@ -150,7 +151,7 @@ void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs)
  * Interrupts are disabled on entry as trap3 is an interrupt gate and they
  * remain disabled thorough out this function.
  */
-static int kprobe_handler(struct pt_regs *regs)
+static int __kprobes kprobe_handler(struct pt_regs *regs)
 {
 	struct kprobe *p;
 	int ret = 0;
@@ -259,7 +260,7 @@ no_kprobe:
 /*
  * Called when we hit the probe point at kretprobe_trampoline
  */
-int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
+int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 {
         struct kretprobe_instance *ri = NULL;
         struct hlist_head *head;
@@ -338,7 +339,7 @@ int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
  * that is atop the stack is the address following the copied instruction.
  * We need to make it the address following the original instruction.
  */
-static void resume_execution(struct kprobe *p, struct pt_regs *regs)
+static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs)
 {
 	unsigned long *tos = (unsigned long *)&regs->esp;
 	unsigned long next_eip = 0;
@@ -444,8 +445,8 @@ static inline int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
 /*
  * Wrapper routine to for handling exceptions.
  */
-int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
-			     void *data)
+int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
+				       unsigned long val, void *data)
 {
 	struct die_args *args = (struct die_args *)data;
 	switch (val) {
@@ -473,7 +474,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
 	return NOTIFY_DONE;
 }
 
-int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct jprobe *jp = container_of(p, struct jprobe, kp);
 	unsigned long addr;
@@ -495,7 +496,7 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	return 1;
 }
 
-void jprobe_return(void)
+void __kprobes jprobe_return(void)
 {
 	preempt_enable_no_resched();
 	asm volatile ("       xchgl   %%ebx,%%esp     \n"
@@ -506,7 +507,7 @@ void jprobe_return(void)
 		      (jprobe_saved_esp):"memory");
 }
 
-int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	u8 *addr = (u8 *) (regs->eip - 1);
 	unsigned long stack_addr = (unsigned long)jprobe_saved_esp;
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index 029bf94cda7d..09a58cb6daa7 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -363,8 +363,9 @@ static inline void die_if_kernel(const char * str, struct pt_regs * regs, long e
 		die(str, regs, err);
 }
 
-static void do_trap(int trapnr, int signr, char *str, int vm86,
-			   struct pt_regs * regs, long error_code, siginfo_t *info)
+static void __kprobes do_trap(int trapnr, int signr, char *str, int vm86,
+			      struct pt_regs * regs, long error_code,
+			      siginfo_t *info)
 {
 	struct task_struct *tsk = current;
 	tsk->thread.error_code = error_code;
@@ -460,7 +461,8 @@ DO_ERROR(12, SIGBUS,  "stack segment", stack_segment)
 DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
 DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0)
 
-fastcall void do_general_protection(struct pt_regs * regs, long error_code)
+fastcall void __kprobes do_general_protection(struct pt_regs * regs,
+					      long error_code)
 {
 	int cpu = get_cpu();
 	struct tss_struct *tss = &per_cpu(init_tss, cpu);
@@ -676,7 +678,7 @@ void unset_nmi_callback(void)
 EXPORT_SYMBOL_GPL(unset_nmi_callback);
 
 #ifdef CONFIG_KPROBES
-fastcall void do_int3(struct pt_regs *regs, long error_code)
+fastcall void __kprobes do_int3(struct pt_regs *regs, long error_code)
 {
 	if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
 			== NOTIFY_STOP)
@@ -710,7 +712,7 @@ fastcall void do_int3(struct pt_regs *regs, long error_code)
  * find every occurrence of the TF bit that could be saved away even
  * by user code)
  */
-fastcall void do_debug(struct pt_regs * regs, long error_code)
+fastcall void __kprobes do_debug(struct pt_regs * regs, long error_code)
 {
 	unsigned int condition;
 	struct task_struct *tsk = current;
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 761972f8cb6c..13b9c62cbbb4 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -22,6 +22,7 @@ SECTIONS
 	*(.text)
 	SCHED_TEXT
 	LOCK_TEXT
+	KPROBES_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x9090
diff --git a/arch/i386/mm/fault.c b/arch/i386/mm/fault.c
index 411b8500ad1b..9edd4485b91e 100644
--- a/arch/i386/mm/fault.c
+++ b/arch/i386/mm/fault.c
@@ -21,6 +21,7 @@
 #include <linux/vt_kern.h>		/* For unblank_screen() */
 #include <linux/highmem.h>
 #include <linux/module.h>
+#include <linux/kprobes.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -223,7 +224,8 @@ fastcall void do_invalid_op(struct pt_regs *, unsigned long);
  *	bit 1 == 0 means read, 1 means write
  *	bit 2 == 0 means kernel, 1 means user-mode
  */
-fastcall void do_page_fault(struct pt_regs *regs, unsigned long error_code)
+fastcall void __kprobes do_page_fault(struct pt_regs *regs,
+				      unsigned long error_code)
 {
 	struct task_struct *tsk;
 	struct mm_struct *mm;
-- 
cgit v1.2.3


From bce0649417d6e71f6df8ab7b11103d247913b142 Mon Sep 17 00:00:00 2001
From: Jim Keniston <jkenisto@us.ibm.com>
Date: Tue, 6 Sep 2005 15:19:34 -0700
Subject: [PATCH] kprobes: fix handling of simultaneous probe hit/unregister

This patch fixes a bug in kprobes's handling of a corner case on i386 and
x86_64.  On an SMP system, if one CPU unregisters a kprobe just after
another CPU hits that probepoint, kprobe_handler() on the latter CPU sees
that the kprobe has been unregistered, and attempts to let the CPU continue
as if the probepoint hadn't been hit.  The bug is that on i386 and x86_64,
we were neglecting to set the IP back to the beginning of the probed
instruction.  This could cause an oops or crash.

This bug doesn't exist on ppc64 and ia64, where a breakpoint instruction
leaves the IP pointing to the beginning of the instruction.  I don't know
about sparc64.  (Dave, could you please advise?)

This fix has been tested on i386 and x86_64 SMP systems.  To reproduce the
problem, set one CPU to work registering and unregistering a kprobe
repeatedly, and another CPU pounding the probepoint in a tight loop.

Acked-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Jim Keniston <jkenisto@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c   | 3 +++
 arch/x86_64/kernel/kprobes.c | 3 +++
 2 files changed, 6 insertions(+)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index 7fb5a6f4a563..e5cec32018a5 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -221,7 +221,10 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 			 * either a probepoint or a debugger breakpoint
 			 * at this address.  In either case, no further
 			 * handling of this interrupt is appropriate.
+			 * Back up over the (now missing) int3 and run
+			 * the original instruction.
 			 */
+			regs->eip -= sizeof(kprobe_opcode_t);
 			ret = 1;
 		}
 		/* Not one of ours: let kernel handle it */
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index c21cceaea275..2d7658fbbb28 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -361,7 +361,10 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 			 * either a probepoint or a debugger breakpoint
 			 * at this address.  In either case, no further
 			 * handling of this interrupt is appropriate.
+			 * Back up over the (now missing) int3 and run
+			 * the original instruction.
 			 */
+			regs->rip = (unsigned long)addr;
 			ret = 1;
 		}
 		/* Not one of ours: let kernel handle it */
-- 
cgit v1.2.3


From deac66ae454cacf942c051b86d9232af546fb187 Mon Sep 17 00:00:00 2001
From: Keshavamurthy Anil S <anil.s.keshavamurthy@intel.com>
Date: Tue, 6 Sep 2005 15:19:35 -0700
Subject: [PATCH] kprobes: fix bug when probed on task and isr functions

This patch fixes a race condition where in system used to hang or sometime
crash within minutes when kprobes are inserted on ISR routine and a task
routine.

The fix has been stress tested on i386, ia64, pp64 and on x86_64.  To
reproduce the problem insert kprobes on schedule() and do_IRQ() functions
and you should see hang or system crash.

Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Acked-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c   |  3 ++-
 arch/ia64/kernel/kprobes.c   | 22 +++++++++++++++++++---
 arch/ppc64/kernel/kprobes.c  | 11 ++++++-----
 arch/x86_64/kernel/kprobes.c |  3 ++-
 include/asm-ia64/kprobes.h   |  1 +
 include/asm-ppc64/kprobes.h  |  3 +++
 kernel/kprobes.c             | 22 ++++++++++++++++++++++
 7 files changed, 55 insertions(+), 10 deletions(-)

(limited to 'arch/i386/kernel')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index e5cec32018a5..6345b430b105 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -177,7 +177,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 		   Disarm the probe we just hit, and ignore it. */
 		p = get_kprobe(addr);
 		if (p) {
-			if (kprobe_status == KPROBE_HIT_SS) {
+			if (kprobe_status == KPROBE_HIT_SS &&
+				*p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
 				regs->eflags &= ~TF_MASK;
 				regs->eflags |= kprobe_saved_eflags;
 				unlock_kprobes();
diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
index 4b1bd539ec47..471086b808a4 100644
--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -95,6 +95,17 @@ static void __kprobes update_kprobe_inst_flag(uint template, uint  slot,
 	p->ainsn.inst_flag = 0;
 	p->ainsn.target_br_reg = 0;
 
+	/* Check for Break instruction
+ 	 * Bits 37:40 Major opcode to be zero
+	 * Bits 27:32 X6 to be zero
+	 * Bits 32:35 X3 to be zero
+	 */
+	if ((!major_opcode) && (!((kprobe_inst >> 27) & 0x1FF)) ) {
+		/* is a break instruction */
+	 	p->ainsn.inst_flag |= INST_FLAG_BREAK_INST;
+		return;
+	}
+
 	if (bundle_encoding[template][slot] == B) {
 		switch (major_opcode) {
 		  case INDIRECT_CALL_OPCODE:
@@ -542,8 +553,11 @@ static void __kprobes prepare_ss(struct kprobe *p, struct pt_regs *regs)
 	unsigned long bundle_addr = (unsigned long) &p->opcode.bundle;
 	unsigned long slot = (unsigned long)p->addr & 0xf;
 
-	/* Update instruction pointer (IIP) and slot number (IPSR.ri) */
-	regs->cr_iip = bundle_addr & ~0xFULL;
+	/* single step inline if break instruction */
+	if (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)
+		regs->cr_iip = (unsigned long)p->addr & ~0xFULL;
+	else
+		regs->cr_iip = bundle_addr & ~0xFULL;
 
 	if (slot > 2)
 		slot = 0;
@@ -599,7 +613,9 @@ static int __kprobes pre_kprobes_handler(struct die_args *args)
 	if (kprobe_running()) {
 		p = get_kprobe(addr);
 		if (p) {
-			if (kprobe_status == KPROBE_HIT_SS) {
+			if ( (kprobe_status == KPROBE_HIT_SS) &&
+	 		     (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)) {
+  				ia64_psr(regs)->ss = 0;
 				unlock_kprobes();
 				goto no_kprobe;
 			}
diff --git a/arch/ppc64/kernel/kprobes.c b/arch/ppc64/kernel/kprobes.c
index 591e4b67b5a5..7e80d49c589a 100644
--- a/arch/ppc64/kernel/kprobes.c
+++ b/arch/ppc64/kernel/kprobes.c
@@ -102,7 +102,7 @@ static inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
 	regs->msr |= MSR_SE;
 
 	/* single step inline if it is a trap variant */
-	if (IS_TW(insn) || IS_TD(insn) || IS_TWI(insn) || IS_TDI(insn))
+	if (is_trap(insn))
 		regs->nip = (unsigned long)p->addr;
 	else
 		regs->nip = (unsigned long)p->ainsn.insn;
@@ -152,7 +152,9 @@ static inline int kprobe_handler(struct pt_regs *regs)
 		   Disarm the probe we just hit, and ignore it. */
 		p = get_kprobe(addr);
 		if (p) {
-			if (kprobe_status == KPROBE_HIT_SS) {
+			kprobe_opcode_t insn = *p->ainsn.insn;
+			if (kprobe_status == KPROBE_HIT_SS &&
+					is_trap(insn)) {
 				regs->msr &= ~MSR_SE;
 				regs->msr |= kprobe_saved_msr;
 				unlock_kprobes();
@@ -192,8 +194,7 @@ static inline int kprobe_handler(struct pt_regs *regs)
 			 * trap variant, it could belong to someone else
 			 */
 			kprobe_opcode_t cur_insn = *addr;
-			if (IS_TW(cur_insn) || IS_TD(cur_insn) ||
-					IS_TWI(cur_insn) || IS_TDI(cur_insn))
+			if (is_trap(cur_insn))
 		       		goto no_kprobe;
 			/*
 			 * The breakpoint instruction was removed right
@@ -403,7 +404,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 	default:
 		break;
 	}
-	preempt_enable();
+	preempt_enable_no_resched();
 	return ret;
 }
 
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index 2d7658fbbb28..df08c43276a0 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -311,7 +311,8 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 		   Disarm the probe we just hit, and ignore it. */
 		p = get_kprobe(addr);
 		if (p) {
-			if (kprobe_status == KPROBE_HIT_SS) {
+			if (kprobe_status == KPROBE_HIT_SS &&
+				*p->ainsn.insn == BREAKPOINT_INSTRUCTION) {
 				regs->eflags &= ~TF_MASK;
 				regs->eflags |= kprobe_saved_rflags;
 				unlock_kprobes();
diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h
index bf36a32e37e4..573a3574a24f 100644
--- a/include/asm-ia64/kprobes.h
+++ b/include/asm-ia64/kprobes.h
@@ -92,6 +92,7 @@ struct arch_specific_insn {
 	kprobe_opcode_t insn;
  #define INST_FLAG_FIX_RELATIVE_IP_ADDR		1
  #define INST_FLAG_FIX_BRANCH_REG		2
+ #define INST_FLAG_BREAK_INST			4
  	unsigned long inst_flag;
  	unsigned short target_br_reg;
 };
diff --git a/include/asm-ppc64/kprobes.h b/include/asm-ppc64/kprobes.h
index 0802919c3235..d9129d2b038e 100644
--- a/include/asm-ppc64/kprobes.h
+++ b/include/asm-ppc64/kprobes.h
@@ -42,6 +42,9 @@ typedef unsigned int kprobe_opcode_t;
 
 #define JPROBE_ENTRY(pentry)	(kprobe_opcode_t *)((func_descr_t *)pentry)
 
+#define is_trap(instr)	(IS_TW(instr) || IS_TD(instr) || \
+			IS_TWI(instr) || IS_TDI(instr))
+
 #define ARCH_SUPPORTS_KRETPROBES
 void kretprobe_trampoline(void);
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 3b7653f2e7ae..f3ea492ab44d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -155,14 +155,36 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
 /* Locks kprobe: irqs must be disabled */
 void __kprobes lock_kprobes(void)
 {
+	unsigned long flags = 0;
+
+	/* Avoiding local interrupts to happen right after we take the kprobe_lock
+	 * and before we get a chance to update kprobe_cpu, this to prevent
+	 * deadlock when we have a kprobe on ISR routine and a kprobe on task
+	 * routine
+	 */
+	local_irq_save(flags);
+
 	spin_lock(&kprobe_lock);
 	kprobe_cpu = smp_processor_id();
+
+ 	local_irq_restore(flags);
 }
 
 void __kprobes unlock_kprobes(void)
 {
+	unsigned long flags = 0;
+
+	/* Avoiding local interrupts to happen right after we update
+	 * kprobe_cpu and before we get a a chance to release kprobe_lock,
+	 * this to prevent deadlock when we have a kprobe on ISR routine and
+	 * a kprobe on task routine
+	 */
+	local_irq_save(flags);
+
 	kprobe_cpu = NR_CPUS;
 	spin_unlock(&kprobe_lock);
+
+ 	local_irq_restore(flags);
 }
 
 /* You have to be holding the kprobe_lock */
-- 
cgit v1.2.3