Diffstat (limited to 'arch/ia64/kernel')
-rw-r--r-- | arch/ia64/kernel/Makefile | 2
-rw-r--r-- | arch/ia64/kernel/acpi.c | 88
-rw-r--r-- | arch/ia64/kernel/domain.c | 78
-rw-r--r-- | arch/ia64/kernel/entry.S | 122
-rw-r--r-- | arch/ia64/kernel/entry.h | 2
-rw-r--r-- | arch/ia64/kernel/fsys.S | 147
-rw-r--r-- | arch/ia64/kernel/gate.S | 62
-rw-r--r-- | arch/ia64/kernel/ia64_ksyms.c | 3
-rw-r--r-- | arch/ia64/kernel/iosapic.c | 147
-rw-r--r-- | arch/ia64/kernel/irq_ia64.c | 15
-rw-r--r-- | arch/ia64/kernel/ivt.S | 211
-rw-r--r-- | arch/ia64/kernel/jprobes.S | 61
-rw-r--r-- | arch/ia64/kernel/kprobes.c | 721
-rw-r--r-- | arch/ia64/kernel/mca.c | 2
-rw-r--r-- | arch/ia64/kernel/numa.c | 57
-rw-r--r-- | arch/ia64/kernel/perfmon.c | 1
-rw-r--r-- | arch/ia64/kernel/process.c | 24
-rw-r--r-- | arch/ia64/kernel/ptrace.c | 37
-rw-r--r-- | arch/ia64/kernel/setup.c | 52
-rw-r--r-- | arch/ia64/kernel/signal.c | 7
-rw-r--r-- | arch/ia64/kernel/smp.c | 5
-rw-r--r-- | arch/ia64/kernel/smpboot.c | 45
-rw-r--r-- | arch/ia64/kernel/topology.c | 9
-rw-r--r-- | arch/ia64/kernel/traps.c | 39
-rw-r--r-- | arch/ia64/kernel/unwind.c | 12
-rw-r--r-- | arch/ia64/kernel/vmlinux.lds.S | 7
26 files changed, 1573 insertions(+), 383 deletions(-)
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 4c73d8ba2e3d..e1fb68ddec26 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -17,9 +17,11 @@ obj-$(CONFIG_IA64_PALINFO) += palinfo.o obj-$(CONFIG_IOSAPIC) += iosapic.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_SMP) += smp.o smpboot.o domain.o +obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o obj-$(CONFIG_IA64_CYCLONE) += cyclone.o obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o +obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o mca_recovery-y += mca_drv.o mca_drv_asm.o diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 72dfd9e7de0f..9609f243e5d0 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -11,6 +11,7 @@ * Copyright (C) 2001 Jenna Hall <jenna.s.hall@intel.com> * Copyright (C) 2001 Takayoshi Kochi <t-kochi@bq.jp.nec.com> * Copyright (C) 2002 Erich Focht <efocht@ess.nec.de> + * Copyright (C) 2004 Ashok Raj <ashok.raj@intel.com> * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * @@ -67,6 +68,11 @@ EXPORT_SYMBOL(pm_power_off); unsigned char acpi_kbd_controller_present = 1; unsigned char acpi_legacy_devices; +static unsigned int __initdata acpi_madt_rev; + +unsigned int acpi_cpei_override; +unsigned int acpi_cpei_phys_cpuid; + #define MAX_SAPICS 256 u16 ia64_acpiid_to_sapicid[MAX_SAPICS] = { [0 ... MAX_SAPICS - 1] = -1 }; @@ -236,9 +242,7 @@ acpi_parse_iosapic (acpi_table_entry_header *header, const unsigned long end) if (BAD_MADT_ENTRY(iosapic, end)) return -EINVAL; - iosapic_init(iosapic->address, iosapic->global_irq_base); - - return 0; + return iosapic_init(iosapic->address, iosapic->global_irq_base); } @@ -267,10 +271,56 @@ acpi_parse_plat_int_src ( (plintsrc->flags.trigger == 1) ? IOSAPIC_EDGE : IOSAPIC_LEVEL); platform_intr_list[plintsrc->type] = vector; + if (acpi_madt_rev > 1) { + acpi_cpei_override = plintsrc->plint_flags.cpei_override_flag; + } + + /* + * Save the physical id, so we can check when its being removed + */ + acpi_cpei_phys_cpuid = ((plintsrc->id << 8) | (plintsrc->eid)) & 0xffff; + return 0; } +unsigned int can_cpei_retarget(void) +{ + extern int cpe_vector; + + /* + * Only if CPEI is supported and the override flag + * is present, otherwise return that its re-targettable + * if we are in polling mode. 
+ */ + if (cpe_vector > 0 && !acpi_cpei_override) + return 0; + else + return 1; +} + +unsigned int is_cpu_cpei_target(unsigned int cpu) +{ + unsigned int logical_id; + + logical_id = cpu_logical_id(acpi_cpei_phys_cpuid); + + if (logical_id == cpu) + return 1; + else + return 0; +} + +void set_cpei_target_cpu(unsigned int cpu) +{ + acpi_cpei_phys_cpuid = cpu_physical_id(cpu); +} + +unsigned int get_cpei_target_cpu(void) +{ + return acpi_cpei_phys_cpuid; +} + static int __init acpi_parse_int_src_ovr ( acpi_table_entry_header *header, const unsigned long end) @@ -328,6 +378,8 @@ acpi_parse_madt (unsigned long phys_addr, unsigned long size) acpi_madt = (struct acpi_table_madt *) __va(phys_addr); + acpi_madt_rev = acpi_madt->header.revision; + /* remember the value for reference after free_initmem() */ #ifdef CONFIG_ITANIUM has_8259 = 1; /* Firmware on old Itanium systems is broken */ @@ -642,9 +694,11 @@ acpi_boot_init (void) if (smp_boot_data.cpu_phys_id[cpu] != hard_smp_processor_id()) node_cpuid[i++].phys_id = smp_boot_data.cpu_phys_id[cpu]; } - build_cpu_to_node_map(); # endif #endif +#ifdef CONFIG_ACPI_NUMA + build_cpu_to_node_map(); +#endif /* Make boot-up look pretty */ printk(KERN_INFO "%d CPUs available, %d CPUs total\n", available_cpus, total_cpus); return 0; @@ -772,7 +826,7 @@ EXPORT_SYMBOL(acpi_unmap_lsapic); #ifdef CONFIG_ACPI_NUMA -acpi_status __init +acpi_status __devinit acpi_map_iosapic (acpi_handle handle, u32 depth, void *context, void **ret) { struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; @@ -825,4 +879,28 @@ acpi_map_iosapic (acpi_handle handle, u32 depth, void *context, void **ret) return AE_OK; } #endif /* CONFIG_NUMA */ + +int +acpi_register_ioapic (acpi_handle handle, u64 phys_addr, u32 gsi_base) +{ + int err; + + if ((err = iosapic_init(phys_addr, gsi_base))) + return err; + +#if CONFIG_ACPI_NUMA + acpi_map_iosapic(handle, 0, NULL, NULL); +#endif /* CONFIG_ACPI_NUMA */ + + return 0; +} +EXPORT_SYMBOL(acpi_register_ioapic); + +int +acpi_unregister_ioapic (acpi_handle handle, u32 gsi_base) +{ + return iosapic_remove(gsi_base); +} +EXPORT_SYMBOL(acpi_unregister_ioapic); + #endif /* CONFIG_ACPI_BOOT */ diff --git a/arch/ia64/kernel/domain.c b/arch/ia64/kernel/domain.c index fe532c970438..d65e87b6394f 100644 --- a/arch/ia64/kernel/domain.c +++ b/arch/ia64/kernel/domain.c @@ -14,7 +14,7 @@ #include <linux/topology.h> #include <linux/nodemask.h> -#define SD_NODES_PER_DOMAIN 6 +#define SD_NODES_PER_DOMAIN 16 #ifdef CONFIG_NUMA /** @@ -27,7 +27,7 @@ * * Should use nodemask_t. */ -static int __devinit find_next_best_node(int node, unsigned long *used_nodes) +static int find_next_best_node(int node, unsigned long *used_nodes) { int i, n, val, min_val, best_node = 0; @@ -66,7 +66,7 @@ static int __devinit find_next_best_node(int node, unsigned long *used_nodes) * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. 
*/ -static cpumask_t __devinit sched_domain_node_span(int node) +static cpumask_t sched_domain_node_span(int node) { int i; cpumask_t span, nodemask; @@ -96,7 +96,7 @@ static cpumask_t __devinit sched_domain_node_span(int node) #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; -static int __devinit cpu_to_cpu_group(int cpu) +static int cpu_to_cpu_group(int cpu) { return cpu; } @@ -104,7 +104,7 @@ static int __devinit cpu_to_cpu_group(int cpu) static DEFINE_PER_CPU(struct sched_domain, phys_domains); static struct sched_group sched_group_phys[NR_CPUS]; -static int __devinit cpu_to_phys_group(int cpu) +static int cpu_to_phys_group(int cpu) { #ifdef CONFIG_SCHED_SMT return first_cpu(cpu_sibling_map[cpu]); @@ -125,44 +125,36 @@ static struct sched_group *sched_group_nodes[MAX_NUMNODES]; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); static struct sched_group sched_group_allnodes[MAX_NUMNODES]; -static int __devinit cpu_to_allnodes_group(int cpu) +static int cpu_to_allnodes_group(int cpu) { return cpu_to_node(cpu); } #endif /* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus */ -void __devinit arch_init_sched_domains(void) +void build_sched_domains(const cpumask_t *cpu_map) { int i; - cpumask_t cpu_default_map; /* - * Setup mask for cpus without special case scheduling requirements. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. + * Set up domains for cpus specified by the cpu_map. */ - cpus_complement(cpu_default_map, cpu_isolated_map); - cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); - - /* - * Set up domains. Isolated domains just stay on the dummy domain. 
- */ - for_each_cpu_mask(i, cpu_default_map) { + for_each_cpu_mask(i, *cpu_map) { int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); - cpus_and(nodemask, nodemask, cpu_default_map); + cpus_and(nodemask, nodemask, *cpu_map); #ifdef CONFIG_NUMA if (num_online_cpus() > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { sd = &per_cpu(allnodes_domains, i); *sd = SD_ALLNODES_INIT; - sd->span = cpu_default_map; + sd->span = *cpu_map; group = cpu_to_allnodes_group(i); sd->groups = &sched_group_allnodes[group]; p = sd; @@ -173,7 +165,7 @@ void __devinit arch_init_sched_domains(void) *sd = SD_NODE_INIT; sd->span = sched_domain_node_span(cpu_to_node(i)); sd->parent = p; - cpus_and(sd->span, sd->span, cpu_default_map); + cpus_and(sd->span, sd->span, *cpu_map); #endif p = sd; @@ -190,7 +182,7 @@ void __devinit arch_init_sched_domains(void) group = cpu_to_cpu_group(i); *sd = SD_SIBLING_INIT; sd->span = cpu_sibling_map[i]; - cpus_and(sd->span, sd->span, cpu_default_map); + cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; sd->groups = &sched_group_cpus[group]; #endif @@ -198,9 +190,9 @@ void __devinit arch_init_sched_domains(void) #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_cpu_mask(i, cpu_default_map) { + for_each_cpu_mask(i, *cpu_map) { cpumask_t this_sibling_map = cpu_sibling_map[i]; - cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); + cpus_and(this_sibling_map, this_sibling_map, *cpu_map); if (i != first_cpu(this_sibling_map)) continue; @@ -213,7 +205,7 @@ void __devinit arch_init_sched_domains(void) for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); - cpus_and(nodemask, nodemask, cpu_default_map); + cpus_and(nodemask, nodemask, *cpu_map); if (cpus_empty(nodemask)) continue; @@ -222,7 +214,7 @@ void __devinit arch_init_sched_domains(void) } #ifdef CONFIG_NUMA - init_sched_build_groups(sched_group_allnodes, cpu_default_map, + init_sched_build_groups(sched_group_allnodes, *cpu_map, &cpu_to_allnodes_group); for (i = 0; i < MAX_NUMNODES; i++) { @@ -233,12 +225,12 @@ void __devinit arch_init_sched_domains(void) cpumask_t covered = CPU_MASK_NONE; int j; - cpus_and(nodemask, nodemask, cpu_default_map); + cpus_and(nodemask, nodemask, *cpu_map); if (cpus_empty(nodemask)) continue; domainspan = sched_domain_node_span(i); - cpus_and(domainspan, domainspan, cpu_default_map); + cpus_and(domainspan, domainspan, *cpu_map); sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); sched_group_nodes[i] = sg; @@ -266,7 +258,7 @@ void __devinit arch_init_sched_domains(void) int n = (i + j) % MAX_NUMNODES; cpus_complement(notcovered, covered); - cpus_and(tmp, notcovered, cpu_default_map); + cpus_and(tmp, notcovered, *cpu_map); cpus_and(tmp, tmp, domainspan); if (cpus_empty(tmp)) break; @@ -293,7 +285,7 @@ void __devinit arch_init_sched_domains(void) #endif /* Calculate CPU power for physical packages and nodes */ - for_each_cpu_mask(i, cpu_default_map) { + for_each_cpu_mask(i, *cpu_map) { int power; struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT @@ -359,13 +351,35 @@ next_sg: cpu_attach_domain(sd, i); } } +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + */ +void arch_init_sched_domains(const cpumask_t *cpu_map) +{ + cpumask_t cpu_default_map; + + /* + * Setup mask for cpus without special case scheduling requirements. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. 
+ */ + cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); + + build_sched_domains(&cpu_default_map); +} -void __devinit arch_destroy_sched_domains(void) +void arch_destroy_sched_domains(const cpumask_t *cpu_map) { #ifdef CONFIG_NUMA int i; for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); struct sched_group *oldsg, *sg = sched_group_nodes[i]; + + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) + continue; + if (sg == NULL) continue; sg = sg->next; diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index b1d5d3d5276c..9be53e1ea404 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -470,18 +470,6 @@ ENTRY(load_switch_stack) br.cond.sptk.many b7 END(load_switch_stack) -GLOBAL_ENTRY(__ia64_syscall) - .regstk 6,0,0,0 - mov r15=in5 // put syscall number in place - break __BREAK_SYSCALL - movl r2=errno - cmp.eq p6,p7=-1,r10 - ;; -(p6) st4 [r2]=r8 -(p6) mov r8=-1 - br.ret.sptk.many rp -END(__ia64_syscall) - GLOBAL_ENTRY(execve) mov r15=__NR_execve // put syscall number in place break __BREAK_SYSCALL @@ -637,7 +625,7 @@ END(ia64_ret_from_syscall) * r8-r11: restored (syscall return value(s)) * r12: restored (user-level stack pointer) * r13: restored (user-level thread pointer) - * r14: cleared + * r14: set to __kernel_syscall_via_epc * r15: restored (syscall #) * r16-r17: cleared * r18: user-level b6 @@ -658,7 +646,7 @@ END(ia64_ret_from_syscall) * pr: restored (user-level pr) * b0: restored (user-level rp) * b6: restored - * b7: cleared + * b7: set to __kernel_syscall_via_epc * ar.unat: restored (user-level ar.unat) * ar.pfs: restored (user-level ar.pfs) * ar.rsc: restored (user-level ar.rsc) @@ -704,72 +692,79 @@ ENTRY(ia64_leave_syscall) ;; (p6) ld4 r31=[r18] // load current_thread_info()->flags ld8 r19=[r2],PT(B6)-PT(LOADRS) // load ar.rsc value for "loadrs" - mov b7=r0 // clear b7 + nop.i 0 ;; - ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE) // load ar.bspstore (may be garbage) + mov r16=ar.bsp // M2 get existing backing store pointer ld8 r18=[r2],PT(R9)-PT(B6) // load b6 (p6) and r15=TIF_WORK_MASK,r31 // any work other than TIF_SYSCALL_TRACE? ;; - mov r16=ar.bsp // M2 get existing backing store pointer + ld8 r23=[r3],PT(R11)-PT(AR_BSPSTORE) // load ar.bspstore (may be garbage) (p6) cmp4.ne.unc p6,p0=r15, r0 // any special work pending? (p6) br.cond.spnt .work_pending_syscall ;; // start restoring the state saved on the kernel stack (struct pt_regs): ld8 r9=[r2],PT(CR_IPSR)-PT(R9) ld8 r11=[r3],PT(CR_IIP)-PT(R11) - mov f6=f0 // clear f6 +(pNonSys) break 0 // bug check: we shouldn't be here if pNonSys is TRUE! 
;; invala // M0|1 invalidate ALAT - rsm psr.i | psr.ic // M2 initiate turning off of interrupt and interruption collection - mov f9=f0 // clear f9 + rsm psr.i | psr.ic // M2 turn off interrupts and interruption collection + cmp.eq p9,p0=r0,r0 // A set p9 to indicate that we should restore cr.ifs - ld8 r29=[r2],16 // load cr.ipsr - ld8 r28=[r3],16 // load cr.iip - mov f8=f0 // clear f8 + ld8 r29=[r2],16 // M0|1 load cr.ipsr + ld8 r28=[r3],16 // M0|1 load cr.iip + mov r22=r0 // A clear r22 ;; ld8 r30=[r2],16 // M0|1 load cr.ifs ld8 r25=[r3],16 // M0|1 load ar.unat - cmp.eq p9,p0=r0,r0 // set p9 to indicate that we should restore cr.ifs +(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13 ;; ld8 r26=[r2],PT(B0)-PT(AR_PFS) // M0|1 load ar.pfs -(pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled - mov f10=f0 // clear f10 +(pKStk) mov r22=psr // M2 read PSR now that interrupts are disabled + nop 0 ;; - ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // load b0 - ld8 r27=[r3],PT(PR)-PT(AR_RSC) // load ar.rsc - mov f11=f0 // clear f11 + ld8 r21=[r2],PT(AR_RNAT)-PT(B0) // M0|1 load b0 + ld8 r27=[r3],PT(PR)-PT(AR_RSC) // M0|1 load ar.rsc + mov f6=f0 // F clear f6 ;; - ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT) // load ar.rnat (may be garbage) - ld8 r31=[r3],PT(R1)-PT(PR) // load predicates -(pUStk) add r14=IA64_TASK_THREAD_ON_USTACK_OFFSET,r13 + ld8 r24=[r2],PT(AR_FPSR)-PT(AR_RNAT) // M0|1 load ar.rnat (may be garbage) + ld8 r31=[r3],PT(R1)-PT(PR) // M0|1 load predicates + mov f7=f0 // F clear f7 ;; - ld8 r20=[r2],PT(R12)-PT(AR_FPSR) // load ar.fpsr - ld8.fill r1=[r3],16 // load r1 -(pUStk) mov r17=1 + ld8 r20=[r2],PT(R12)-PT(AR_FPSR) // M0|1 load ar.fpsr + ld8.fill r1=[r3],16 // M0|1 load r1 +(pUStk) mov r17=1 // A ;; - srlz.d // M0 ensure interruption collection is off - ld8.fill r13=[r3],16 - mov f7=f0 // clear f7 +(pUStk) st1 [r14]=r17 // M2|3 + ld8.fill r13=[r3],16 // M0|1 + mov f8=f0 // F clear f8 ;; - ld8.fill r12=[r2] // restore r12 (sp) - mov.m ar.ssd=r0 // M2 clear ar.ssd - mov r22=r0 // clear r22 + ld8.fill r12=[r2] // M0|1 restore r12 (sp) + ld8.fill r15=[r3] // M0|1 restore r15 + mov b6=r18 // I0 restore b6 + + addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 // A + mov f9=f0 // F clear f9 +(pKStk) br.cond.dpnt.many skip_rbs_switch // B - ld8.fill r15=[r3] // restore r15 -(pUStk) st1 [r14]=r17 - addl r3=THIS_CPU(ia64_phys_stacked_size_p8),r0 + srlz.d // M0 ensure interruption collection is off (for cover) + shr.u r18=r19,16 // I0|1 get byte size of existing "dirty" partition + cover // B add current frame into dirty partition & set cr.ifs ;; -(pUStk) ld4 r17=[r3] // r17 = cpu_data->phys_stacked_size_p8 - mov.m ar.csd=r0 // M2 clear ar.csd - mov b6=r18 // I0 restore b6 +(pUStk) ld4 r17=[r17] // M0|1 r17 = cpu_data->phys_stacked_size_p8 + mov r19=ar.bsp // M2 get new backing store pointer + mov f10=f0 // F clear f10 + + nop.m 0 + movl r14=__kernel_syscall_via_epc // X ;; - mov r14=r0 // clear r14 - shr.u r18=r19,16 // I0|1 get byte size of existing "dirty" partition -(pKStk) br.cond.dpnt.many skip_rbs_switch + mov.m ar.csd=r0 // M2 clear ar.csd + mov.m ar.ccv=r0 // M2 clear ar.ccv + mov b7=r14 // I0 clear b7 (hint with __kernel_syscall_via_epc) - mov.m ar.ccv=r0 // clear ar.ccv -(pNonSys) br.cond.dpnt.many dont_preserve_current_frame - br.cond.sptk.many rbs_switch + mov.m ar.ssd=r0 // M2 clear ar.ssd + mov f11=f0 // F clear f11 + br.cond.sptk.many rbs_switch // B END(ia64_leave_syscall) #ifdef CONFIG_IA32_SUPPORT @@ -885,7 +880,7 @@ GLOBAL_ENTRY(ia64_leave_kernel) ldf.fill 
f7=[r2],PT(F11)-PT(F7) ldf.fill f8=[r3],32 ;; - srlz.i // ensure interruption collection is off + srlz.d // ensure that inter. collection is off (VHPT is don't care, since text is pinned) mov ar.ccv=r15 ;; ldf.fill f11=[r2] @@ -945,11 +940,10 @@ GLOBAL_ENTRY(ia64_leave_kernel) * NOTE: alloc, loadrs, and cover can't be predicated. */ (pNonSys) br.cond.dpnt dont_preserve_current_frame - -rbs_switch: cover // add current frame into dirty partition and set cr.ifs ;; mov r19=ar.bsp // get new backing store pointer +rbs_switch: sub r16=r16,r18 // krbs = old bsp - size of dirty partition cmp.ne p9,p0=r0,r0 // clear p9 to skip restore of cr.ifs ;; @@ -1024,14 +1018,14 @@ rse_clear_invalid: mov loc5=0 mov loc6=0 mov loc7=0 -(pRecurse) br.call.sptk.few b0=rse_clear_invalid +(pRecurse) br.call.dptk.few b0=rse_clear_invalid ;; mov loc8=0 mov loc9=0 cmp.ne pReturn,p0=r0,in1 // if recursion count != 0, we need to do a br.ret mov loc10=0 mov loc11=0 -(pReturn) br.ret.sptk.many b0 +(pReturn) br.ret.dptk.many b0 #endif /* !CONFIG_ITANIUM */ # undef pRecurse # undef pReturn @@ -1255,7 +1249,7 @@ ENTRY(sys_rt_sigreturn) stf.spill [r17]=f11 adds out0=16,sp // out0 = &sigscratch br.call.sptk.many rp=ia64_rt_sigreturn -.ret19: .restore sp 0 +.ret19: .restore sp,0 adds sp=16,sp ;; ld8 r9=[sp] // load new ar.unat @@ -1577,11 +1571,11 @@ sys_call_table: data8 sys_add_key data8 sys_request_key data8 sys_keyctl + data8 sys_ioprio_set + data8 sys_ioprio_get // 1275 data8 sys_ni_syscall - data8 sys_ni_syscall // 1275 - data8 sys_set_zone_reclaim - data8 sys_ni_syscall - data8 sys_ni_syscall - data8 sys_ni_syscall + data8 sys_inotify_init + data8 sys_inotify_add_watch + data8 sys_inotify_rm_watch .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls diff --git a/arch/ia64/kernel/entry.h b/arch/ia64/kernel/entry.h index 6d4ecec989b5..78eeb0793419 100644 --- a/arch/ia64/kernel/entry.h +++ b/arch/ia64/kernel/entry.h @@ -60,7 +60,7 @@ .spillsp @priunat,SW(AR_UNAT)+16+(off); \ .spillsp ar.rnat,SW(AR_RNAT)+16+(off); \ .spillsp ar.bspstore,SW(AR_BSPSTORE)+16+(off); \ - .spillsp pr,SW(PR)+16+(off)) + .spillsp pr,SW(PR)+16+(off) #define DO_SAVE_SWITCH_STACK \ movl r28=1f; \ diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S index 962b6c4e32b5..7d7684a369d3 100644 --- a/arch/ia64/kernel/fsys.S +++ b/arch/ia64/kernel/fsys.S @@ -531,93 +531,114 @@ GLOBAL_ENTRY(fsys_bubble_down) .altrp b6 .body /* - * We get here for syscalls that don't have a lightweight handler. For those, we - * need to bubble down into the kernel and that requires setting up a minimal - * pt_regs structure, and initializing the CPU state more or less as if an - * interruption had occurred. To make syscall-restarts work, we setup pt_regs - * such that cr_iip points to the second instruction in syscall_via_break. - * Decrementing the IP hence will restart the syscall via break and not - * decrementing IP will return us to the caller, as usual. Note that we preserve - * the value of psr.pp rather than initializing it from dcr.pp. This makes it - * possible to distinguish fsyscall execution from other privileged execution. + * We get here for syscalls that don't have a lightweight + * handler. For those, we need to bubble down into the kernel + * and that requires setting up a minimal pt_regs structure, + * and initializing the CPU state more or less as if an + * interruption had occurred. To make syscall-restarts work, + * we setup pt_regs such that cr_iip points to the second + * instruction in syscall_via_break. 
Decrementing the IP + * hence will restart the syscall via break and not + * decrementing IP will return us to the caller, as usual. + * Note that we preserve the value of psr.pp rather than + * initializing it from dcr.pp. This makes it possible to + * distinguish fsyscall execution from other privileged + * execution. * * On entry: - * - normal fsyscall handler register usage, except that we also have: + * - normal fsyscall handler register usage, except + * that we also have: * - r18: address of syscall entry point * - r21: ar.fpsr * - r26: ar.pfs * - r27: ar.rsc * - r29: psr + * + * We used to clear some PSR bits here but that requires slow + * serialization. Fortuntely, that isn't really necessary. + * The rationale is as follows: we used to clear bits + * ~PSR_PRESERVED_BITS in PSR.L. Since + * PSR_PRESERVED_BITS==PSR.{UP,MFL,MFH,PK,DT,PP,SP,RT,IC}, we + * ended up clearing PSR.{BE,AC,I,DFL,DFH,DI,DB,SI,TB}. + * However, + * + * PSR.BE : already is turned off in __kernel_syscall_via_epc() + * PSR.AC : don't care (kernel normally turns PSR.AC on) + * PSR.I : already turned off by the time fsys_bubble_down gets + * invoked + * PSR.DFL: always 0 (kernel never turns it on) + * PSR.DFH: don't care --- kernel never touches f32-f127 on its own + * initiative + * PSR.DI : always 0 (kernel never turns it on) + * PSR.SI : always 0 (kernel never turns it on) + * PSR.DB : don't care --- kernel never enables kernel-level + * breakpoints + * PSR.TB : must be 0 already; if it wasn't zero on entry to + * __kernel_syscall_via_epc, the branch to fsys_bubble_down + * will trigger a taken branch; the taken-trap-handler then + * converts the syscall into a break-based system-call. */ -# define PSR_PRESERVED_BITS (IA64_PSR_UP | IA64_PSR_MFL | IA64_PSR_MFH | IA64_PSR_PK \ - | IA64_PSR_DT | IA64_PSR_PP | IA64_PSR_SP | IA64_PSR_RT \ - | IA64_PSR_IC) /* - * Reading psr.l gives us only bits 0-31, psr.it, and psr.mc. The rest we have - * to synthesize. + * Reading psr.l gives us only bits 0-31, psr.it, and psr.mc. + * The rest we have to synthesize. 
*/ -# define PSR_ONE_BITS ((3 << IA64_PSR_CPL0_BIT) | (0x1 << IA64_PSR_RI_BIT) \ +# define PSR_ONE_BITS ((3 << IA64_PSR_CPL0_BIT) \ + | (0x1 << IA64_PSR_RI_BIT) \ | IA64_PSR_BN | IA64_PSR_I) - invala - movl r8=PSR_ONE_BITS + invala // M0|1 + movl r14=ia64_ret_from_syscall // X - mov r25=ar.unat // save ar.unat (5 cyc) - movl r9=PSR_PRESERVED_BITS + nop.m 0 + movl r28=__kernel_syscall_via_break // X create cr.iip + ;; - mov ar.rsc=0 // set enforced lazy mode, pl 0, little-endian, loadrs=0 - movl r28=__kernel_syscall_via_break + mov r2=r16 // A get task addr to addl-addressable register + adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 // A + mov r31=pr // I0 save pr (2 cyc) ;; - mov r23=ar.bspstore // save ar.bspstore (12 cyc) - mov r31=pr // save pr (2 cyc) - mov r20=r1 // save caller's gp in r20 + st1 [r16]=r0 // M2|3 clear current->thread.on_ustack flag + addl r22=IA64_RBS_OFFSET,r2 // A compute base of RBS + add r3=TI_FLAGS+IA64_TASK_SIZE,r2 // A ;; - mov r2=r16 // copy current task addr to addl-addressable register - and r9=r9,r29 - mov r19=b6 // save b6 (2 cyc) + ld4 r3=[r3] // M0|1 r3 = current_thread_info()->flags + lfetch.fault.excl.nt1 [r22] // M0|1 prefetch register backing-store + nop.i 0 ;; - mov psr.l=r9 // slam the door (17 cyc to srlz.i) - or r29=r8,r29 // construct cr.ipsr value to save - addl r22=IA64_RBS_OFFSET,r2 // compute base of RBS + mov ar.rsc=0 // M2 set enforced lazy mode, pl 0, LE, loadrs=0 + nop.m 0 + nop.i 0 ;; - // GAS reports a spurious RAW hazard on the read of ar.rnat because it thinks - // we may be reading ar.itc after writing to psr.l. Avoid that message with - // this directive: - dv_serialize_data - mov.m r24=ar.rnat // read ar.rnat (5 cyc lat) - lfetch.fault.excl.nt1 [r22] - adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r2 - - // ensure previous insn group is issued before we stall for srlz.i: + mov r23=ar.bspstore // M2 (12 cyc) save ar.bspstore + mov.m r24=ar.rnat // M2 (5 cyc) read ar.rnat (dual-issues!) 
+ nop.i 0 ;; - srlz.i // ensure new psr.l has been established - ///////////////////////////////////////////////////////////////////////////// - ////////// from this point on, execution is not interruptible anymore - ///////////////////////////////////////////////////////////////////////////// - addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2 // compute base of memory stack - cmp.ne pKStk,pUStk=r0,r0 // set pKStk <- 0, pUStk <- 1 + mov ar.bspstore=r22 // M2 (6 cyc) switch to kernel RBS + movl r8=PSR_ONE_BITS // X ;; - st1 [r16]=r0 // clear current->thread.on_ustack flag - mov ar.bspstore=r22 // switch to kernel RBS - mov b6=r18 // copy syscall entry-point to b6 (7 cyc) - add r3=TI_FLAGS+IA64_TASK_SIZE,r2 + mov r25=ar.unat // M2 (5 cyc) save ar.unat + mov r19=b6 // I0 save b6 (2 cyc) + mov r20=r1 // A save caller's gp in r20 ;; - ld4 r3=[r3] // r2 = current_thread_info()->flags - mov r18=ar.bsp // save (kernel) ar.bsp (12 cyc) - mov ar.rsc=0x3 // set eager mode, pl 0, little-endian, loadrs=0 - br.call.sptk.many b7=ia64_syscall_setup - ;; - ssm psr.i - movl r2=ia64_ret_from_syscall + or r29=r8,r29 // A construct cr.ipsr value to save + mov b6=r18 // I0 copy syscall entry-point to b6 (7 cyc) + addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r2 // A compute base of memory stack + + mov r18=ar.bsp // M2 save (kernel) ar.bsp (12 cyc) + cmp.ne pKStk,pUStk=r0,r0 // A set pKStk <- 0, pUStk <- 1 + br.call.sptk.many b7=ia64_syscall_setup // B ;; - mov rp=r2 // set the real return addr - and r3=_TIF_SYSCALL_TRACEAUDIT,r3 + mov ar.rsc=0x3 // M2 set eager mode, pl 0, LE, loadrs=0 + mov rp=r14 // I0 set the real return addr + and r3=_TIF_SYSCALL_TRACEAUDIT,r3 // A ;; - cmp.eq p8,p0=r3,r0 + ssm psr.i // M2 we're on kernel stacks now, reenable irqs + cmp.eq p8,p0=r3,r0 // A +(p10) br.cond.spnt.many ia64_ret_from_syscall // B return if bad call-frame or r15 is a NaT -(p10) br.cond.spnt.many ia64_ret_from_syscall // p10==true means out registers are more than 8 -(p8) br.call.sptk.many b6=b6 // ignore this return addr - br.cond.sptk ia64_trace_syscall + nop.m 0 +(p8) br.call.sptk.many b6=b6 // B (ignore return address) + br.cond.spnt ia64_trace_syscall // B END(fsys_bubble_down) .rodata diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S index facf75acdc85..86948ce63e43 100644 --- a/arch/ia64/kernel/gate.S +++ b/arch/ia64/kernel/gate.S @@ -72,38 +72,40 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) * bundle get executed. The remaining code must be safe even if * they do not get executed. */ - adds r17=-1024,r15 - mov r10=0 // default to successful syscall execution - epc + adds r17=-1024,r15 // A + mov r10=0 // A default to successful syscall execution + epc // B causes split-issue } ;; - rsm psr.be // note: on McKinley "rsm psr.be/srlz.d" is slightly faster than "rum psr.be" - LOAD_FSYSCALL_TABLE(r14) - - mov r16=IA64_KR(CURRENT) // 12 cycle read latency - tnat.nz p10,p9=r15 - mov r19=NR_syscalls-1 + rsm psr.be | psr.i // M2 (5 cyc to srlz.d) + LOAD_FSYSCALL_TABLE(r14) // X ;; - shladd r18=r17,3,r14 - - srlz.d - cmp.ne p8,p0=r0,r0 // p8 <- FALSE - /* Note: if r17 is a NaT, p6 will be set to zero. */ - cmp.geu p6,p7=r19,r17 // (syscall > 0 && syscall < 1024+NR_syscalls)? 
- ;; -(p6) ld8 r18=[r18] - mov r21=ar.fpsr - add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry - ;; -(p6) mov b7=r18 -(p6) tbit.z p8,p0=r18,0 -(p8) br.dptk.many b7 - -(p6) rsm psr.i - mov r27=ar.rsc - mov r26=ar.pfs + mov r16=IA64_KR(CURRENT) // M2 (12 cyc) + shladd r18=r17,3,r14 // A + mov r19=NR_syscalls-1 // A + ;; + lfetch [r18] // M0|1 + mov r29=psr // M2 (12 cyc) + // If r17 is a NaT, p6 will be zero + cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)? + ;; + mov r21=ar.fpsr // M2 (12 cyc) + tnat.nz p10,p9=r15 // I0 + mov.i r26=ar.pfs // I0 (would stall anyhow due to srlz.d...) + ;; + srlz.d // M0 (forces split-issue) ensure PSR.BE==0 +(p6) ld8 r18=[r18] // M0|1 + nop.i 0 + ;; + nop.m 0 +(p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!) + nop.i 0 ;; - mov r29=psr // read psr (12 cyc load latency) +(p8) ssm psr.i +(p6) mov b7=r18 // I0 +(p8) br.dptk.many b7 // B + + mov r27=ar.rsc // M2 (12 cyc) /* * brl.cond doesn't work as intended because the linker would convert this branch * into a branch to a PLT. Perhaps there will be a way to avoid this with some @@ -111,6 +113,8 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) * instead. */ #ifdef CONFIG_ITANIUM +(p6) add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry + ;; (p6) ld8 r14=[r14] // r14 <- fsys_bubble_down ;; (p6) mov b7=r14 @@ -118,7 +122,7 @@ GLOBAL_ENTRY(__kernel_syscall_via_epc) #else BRL_COND_FSYS_BUBBLE_DOWN(p6) #endif - + ssm psr.i mov r10=-1 (p10) mov r8=EINVAL (p9) mov r8=ENOSYS diff --git a/arch/ia64/kernel/ia64_ksyms.c b/arch/ia64/kernel/ia64_ksyms.c index 7bbf019c9867..01572814abe4 100644 --- a/arch/ia64/kernel/ia64_ksyms.c +++ b/arch/ia64/kernel/ia64_ksyms.c @@ -58,9 +58,6 @@ EXPORT_SYMBOL(__strlen_user); EXPORT_SYMBOL(__strncpy_from_user); EXPORT_SYMBOL(__strnlen_user); -#include <asm/unistd.h> -EXPORT_SYMBOL(__ia64_syscall); - /* from arch/ia64/lib */ extern void __divsi3(void); extern void __udivsi3(void); diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c index 88b014381df5..7936b62f7a2e 100644 --- a/arch/ia64/kernel/iosapic.c +++ b/arch/ia64/kernel/iosapic.c @@ -129,14 +129,13 @@ static struct iosapic { char __iomem *addr; /* base address of IOSAPIC */ unsigned int gsi_base; /* first GSI assigned to this IOSAPIC */ unsigned short num_rte; /* number of RTE in this IOSAPIC */ + int rtes_inuse; /* # of RTEs in use on this IOSAPIC */ #ifdef CONFIG_NUMA unsigned short node; /* numa node association via pxm */ #endif } iosapic_lists[NR_IOSAPICS]; -static int num_iosapic; - -static unsigned char pcat_compat __initdata; /* 8259 compatibility flag */ +static unsigned char pcat_compat __devinitdata; /* 8259 compatibility flag */ static int iosapic_kmalloc_ok; static LIST_HEAD(free_rte_list); @@ -149,7 +148,7 @@ find_iosapic (unsigned int gsi) { int i; - for (i = 0; i < num_iosapic; i++) { + for (i = 0; i < NR_IOSAPICS; i++) { if ((unsigned) (gsi - iosapic_lists[i].gsi_base) < iosapic_lists[i].num_rte) return i; } @@ -490,8 +489,6 @@ static int iosapic_find_sharable_vector (unsigned long trigger, unsigned long po } } } - if (vector < 0) - panic("%s: out of interrupt vectors!\n", __FUNCTION__); return vector; } @@ -507,6 +504,8 @@ iosapic_reassign_vector (int vector) if (!list_empty(&iosapic_intr_info[vector].rtes)) { new_vector = assign_irq_vector(AUTO_ASSIGN); + if (new_vector < 0) + panic("%s: out of interrupt vectors!\n", __FUNCTION__); printk(KERN_INFO "Reassigning vector %d to %d\n", vector, new_vector); memcpy(&iosapic_intr_info[new_vector], 
&iosapic_intr_info[vector], sizeof(struct iosapic_intr_info)); @@ -598,6 +597,7 @@ register_intr (unsigned int gsi, int vector, unsigned char delivery, rte->refcnt++; list_add_tail(&rte->rte_list, &iosapic_intr_info[vector].rtes); iosapic_intr_info[vector].count++; + iosapic_lists[index].rtes_inuse++; } else if (vector_is_shared(vector)) { struct iosapic_intr_info *info = &iosapic_intr_info[vector]; @@ -734,9 +734,12 @@ again: spin_unlock_irqrestore(&iosapic_lock, flags); /* If vector is running out, we try to find a sharable vector */ - vector = assign_irq_vector_nopanic(AUTO_ASSIGN); - if (vector < 0) + vector = assign_irq_vector(AUTO_ASSIGN); + if (vector < 0) { vector = iosapic_find_sharable_vector(trigger, polarity); + if (vector < 0) + panic("%s: out of interrupt vectors!\n", __FUNCTION__); + } spin_lock_irqsave(&irq_descp(vector)->lock, flags); spin_lock(&iosapic_lock); @@ -778,7 +781,7 @@ void iosapic_unregister_intr (unsigned int gsi) { unsigned long flags; - int irq, vector; + int irq, vector, index; irq_desc_t *idesc; u32 low32; unsigned long trigger, polarity; @@ -819,6 +822,9 @@ iosapic_unregister_intr (unsigned int gsi) list_del(&rte->rte_list); iosapic_intr_info[vector].count--; iosapic_free_rte(rte); + index = find_iosapic(gsi); + iosapic_lists[index].rtes_inuse--; + WARN_ON(iosapic_lists[index].rtes_inuse < 0); trigger = iosapic_intr_info[vector].trigger; polarity = iosapic_intr_info[vector].polarity; @@ -881,6 +887,8 @@ iosapic_register_platform_intr (u32 int_type, unsigned int gsi, break; case ACPI_INTERRUPT_INIT: vector = assign_irq_vector(AUTO_ASSIGN); + if (vector < 0) + panic("%s: out of interrupt vectors!\n", __FUNCTION__); delivery = IOSAPIC_INIT; break; case ACPI_INTERRUPT_CPEI: @@ -952,30 +960,86 @@ iosapic_system_init (int system_pcat_compat) } } -void __init +static inline int +iosapic_alloc (void) +{ + int index; + + for (index = 0; index < NR_IOSAPICS; index++) + if (!iosapic_lists[index].addr) + return index; + + printk(KERN_WARNING "%s: failed to allocate iosapic\n", __FUNCTION__); + return -1; +} + +static inline void +iosapic_free (int index) +{ + memset(&iosapic_lists[index], 0, sizeof(iosapic_lists[0])); +} + +static inline int +iosapic_check_gsi_range (unsigned int gsi_base, unsigned int ver) +{ + int index; + unsigned int gsi_end, base, end; + + /* check gsi range */ + gsi_end = gsi_base + ((ver >> 16) & 0xff); + for (index = 0; index < NR_IOSAPICS; index++) { + if (!iosapic_lists[index].addr) + continue; + + base = iosapic_lists[index].gsi_base; + end = base + iosapic_lists[index].num_rte - 1; + + if (gsi_base < base && gsi_end < base) + continue;/* OK */ + + if (gsi_base > end && gsi_end > end) + continue; /* OK */ + + return -EBUSY; + } + return 0; +} + +int __devinit iosapic_init (unsigned long phys_addr, unsigned int gsi_base) { - int num_rte; + int num_rte, err, index; unsigned int isa_irq, ver; char __iomem *addr; + unsigned long flags; - addr = ioremap(phys_addr, 0); - ver = iosapic_version(addr); + spin_lock_irqsave(&iosapic_lock, flags); + { + addr = ioremap(phys_addr, 0); + ver = iosapic_version(addr); - /* - * The MAX_REDIR register holds the highest input pin - * number (starting from 0). - * We add 1 so that we can use it for number of pins (= RTEs) - */ - num_rte = ((ver >> 16) & 0xff) + 1; + if ((err = iosapic_check_gsi_range(gsi_base, ver))) { + iounmap(addr); + spin_unlock_irqrestore(&iosapic_lock, flags); + return err; + } + + /* + * The MAX_REDIR register holds the highest input pin + * number (starting from 0). 
+ * We add 1 so that we can use it for number of pins (= RTEs) + */ + num_rte = ((ver >> 16) & 0xff) + 1; - iosapic_lists[num_iosapic].addr = addr; - iosapic_lists[num_iosapic].gsi_base = gsi_base; - iosapic_lists[num_iosapic].num_rte = num_rte; + index = iosapic_alloc(); + iosapic_lists[index].addr = addr; + iosapic_lists[index].gsi_base = gsi_base; + iosapic_lists[index].num_rte = num_rte; #ifdef CONFIG_NUMA - iosapic_lists[num_iosapic].node = MAX_NUMNODES; + iosapic_lists[index].node = MAX_NUMNODES; #endif - num_iosapic++; + } + spin_unlock_irqrestore(&iosapic_lock, flags); if ((gsi_base == 0) && pcat_compat) { /* @@ -986,10 +1050,43 @@ iosapic_init (unsigned long phys_addr, unsigned int gsi_base) for (isa_irq = 0; isa_irq < 16; ++isa_irq) iosapic_override_isa_irq(isa_irq, isa_irq, IOSAPIC_POL_HIGH, IOSAPIC_EDGE); } + return 0; +} + +#ifdef CONFIG_HOTPLUG +int +iosapic_remove (unsigned int gsi_base) +{ + int index, err = 0; + unsigned long flags; + + spin_lock_irqsave(&iosapic_lock, flags); + { + index = find_iosapic(gsi_base); + if (index < 0) { + printk(KERN_WARNING "%s: No IOSAPIC for GSI base %u\n", + __FUNCTION__, gsi_base); + goto out; + } + + if (iosapic_lists[index].rtes_inuse) { + err = -EBUSY; + printk(KERN_WARNING "%s: IOSAPIC for GSI base %u is busy\n", + __FUNCTION__, gsi_base); + goto out; + } + + iounmap(iosapic_lists[index].addr); + iosapic_free(index); + } + out: + spin_unlock_irqrestore(&iosapic_lock, flags); + return err; } +#endif /* CONFIG_HOTPLUG */ #ifdef CONFIG_NUMA -void __init +void __devinit map_iosapic_to_node(unsigned int gsi_base, int node) { int index; diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 4fe60c7a2e90..6c4d59fd0364 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -63,30 +63,19 @@ EXPORT_SYMBOL(isa_irq_to_vector_map); static unsigned long ia64_vector_mask[BITS_TO_LONGS(IA64_NUM_DEVICE_VECTORS)]; int -assign_irq_vector_nopanic (int irq) +assign_irq_vector (int irq) { int pos, vector; again: pos = find_first_zero_bit(ia64_vector_mask, IA64_NUM_DEVICE_VECTORS); vector = IA64_FIRST_DEVICE_VECTOR + pos; if (vector > IA64_LAST_DEVICE_VECTOR) - return -1; + return -ENOSPC; if (test_and_set_bit(pos, ia64_vector_mask)) goto again; return vector; } -int -assign_irq_vector (int irq) -{ - int vector = assign_irq_vector_nopanic(irq); - - if (vector < 0) - panic("assign_irq_vector: out of interrupt vectors!"); - - return vector; -} - void free_irq_vector (int vector) { diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S index d9c05d53435b..3bb3a13c4047 100644 --- a/arch/ia64/kernel/ivt.S +++ b/arch/ia64/kernel/ivt.S @@ -1,7 +1,7 @@ /* * arch/ia64/kernel/ivt.S * - * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co + * Copyright (C) 1998-2001, 2003, 2005 Hewlett-Packard Co * Stephane Eranian <eranian@hpl.hp.com> * David Mosberger <davidm@hpl.hp.com> * Copyright (C) 2000, 2002-2003 Intel Co @@ -405,17 +405,22 @@ ENTRY(nested_dtlb_miss) * r30: continuation address * r31: saved pr * - * Clobbered: b0, r18, r19, r21, psr.dt (cleared) + * Clobbered: b0, r18, r19, r21, r22, psr.dt (cleared) */ rsm psr.dt // switch to using physical data addressing mov r19=IA64_KR(PT_BASE) // get the page table base address shl r21=r16,3 // shift bit 60 into sign bit + mov r18=cr.itir ;; shr.u r17=r16,61 // get the region number into r17 + extr.u r18=r18,2,6 // get the faulting page size ;; cmp.eq p6,p7=5,r17 // is faulting address in region 5? 
- shr.u r18=r16,PGDIR_SHIFT // get bits 33-63 of faulting address + add r22=-PAGE_SHIFT,r18 // adjustment for hugetlb address + add r18=PGDIR_SHIFT-PAGE_SHIFT,r18 ;; + shr.u r22=r16,r22 + shr.u r18=r16,r18 (p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place srlz.d @@ -428,7 +433,7 @@ ENTRY(nested_dtlb_miss) (p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=PTA + IFA(33,42)*8 (p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=PTA + (((IFA(61,63) << 7) | IFA(33,39))*8) cmp.eq p7,p6=0,r21 // unused address bits all zeroes? - shr.u r18=r16,PMD_SHIFT // shift L2 index into position + shr.u r18=r22,PMD_SHIFT // shift L2 index into position ;; ld8 r17=[r17] // fetch the L1 entry (may be 0) ;; @@ -436,7 +441,7 @@ ENTRY(nested_dtlb_miss) dep r17=r18,r17,3,(PAGE_SHIFT-3) // compute address of L2 page table entry ;; (p7) ld8 r17=[r17] // fetch the L2 entry (may be 0) - shr.u r19=r16,PAGE_SHIFT // shift L3 index into position + shr.u r19=r22,PAGE_SHIFT // shift L3 index into position ;; (p7) cmp.eq.or.andcm p6,p7=r17,r0 // was L2 entry NULL? dep r17=r19,r17,3,(PAGE_SHIFT-3) // compute address of L3 page table entry @@ -687,82 +692,118 @@ ENTRY(break_fault) * to prevent leaking bits from kernel to user level. */ DBG_FAULT(11) - mov r16=IA64_KR(CURRENT) // r16 = current task; 12 cycle read lat. - mov r17=cr.iim - mov r18=__IA64_BREAK_SYSCALL - mov r21=ar.fpsr - mov r29=cr.ipsr - mov r19=b6 - mov r25=ar.unat - mov r27=ar.rsc - mov r26=ar.pfs - mov r28=cr.iip - mov r31=pr // prepare to save predicates - mov r20=r1 - ;; + mov.m r16=IA64_KR(CURRENT) // M2 r16 <- current task (12 cyc) + mov r29=cr.ipsr // M2 (12 cyc) + mov r31=pr // I0 (2 cyc) + + mov r17=cr.iim // M2 (2 cyc) + mov.m r27=ar.rsc // M2 (12 cyc) + mov r18=__IA64_BREAK_SYSCALL // A + + mov.m ar.rsc=0 // M2 + mov.m r21=ar.fpsr // M2 (12 cyc) + mov r19=b6 // I0 (2 cyc) + ;; + mov.m r23=ar.bspstore // M2 (12 cyc) + mov.m r24=ar.rnat // M2 (5 cyc) + mov.i r26=ar.pfs // I0 (2 cyc) + + invala // M0|1 + nop.m 0 // M + mov r20=r1 // A save r1 + + nop.m 0 + movl r30=sys_call_table // X + + mov r28=cr.iip // M2 (2 cyc) + cmp.eq p0,p7=r18,r17 // I0 is this a system call? +(p7) br.cond.spnt non_syscall // B no -> + // + // From this point on, we are definitely on the syscall-path + // and we can use (non-banked) scratch registers. + // +/////////////////////////////////////////////////////////////////////// + mov r1=r16 // A move task-pointer to "addl"-addressable reg + mov r2=r16 // A setup r2 for ia64_syscall_setup + add r9=TI_FLAGS+IA64_TASK_SIZE,r16 // A r9 = ¤t_thread_info()->flags + adds r16=IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 - cmp.eq p0,p7=r18,r17 // is this a system call? (p7 <- false, if so) -(p7) br.cond.spnt non_syscall + adds r15=-1024,r15 // A subtract 1024 from syscall number + mov r3=NR_syscalls - 1 ;; - ld1 r17=[r16] // load current->thread.on_ustack flag - st1 [r16]=r0 // clear current->thread.on_ustack flag - add r1=-IA64_TASK_THREAD_ON_USTACK_OFFSET,r16 // set r1 for MINSTATE_START_SAVE_MIN_VIRT + ld1.bias r17=[r16] // M0|1 r17 = current->thread.on_ustack flag + ld4 r9=[r9] // M0|1 r9 = current_thread_info()->flags + extr.u r8=r29,41,2 // I0 extract ei field from cr.ipsr + + shladd r30=r15,3,r30 // A r30 = sys_call_table + 8*(syscall-1024) + addl r22=IA64_RBS_OFFSET,r1 // A compute base of RBS + cmp.leu p6,p7=r15,r3 // A syscall number in range? 
;; - invala - /* adjust return address so we skip over the break instruction: */ + lfetch.fault.excl.nt1 [r22] // M0|1 prefetch RBS +(p6) ld8 r30=[r30] // M0|1 load address of syscall entry point + tnat.nz.or p7,p0=r15 // I0 is syscall nr a NaT? - extr.u r8=r29,41,2 // extract ei field from cr.ipsr - ;; - cmp.eq p6,p7=2,r8 // isr.ei==2? - mov r2=r1 // setup r2 for ia64_syscall_setup - ;; -(p6) mov r8=0 // clear ei to 0 -(p6) adds r28=16,r28 // switch cr.iip to next bundle cr.ipsr.ei wrapped -(p7) adds r8=1,r8 // increment ei to next slot - ;; - cmp.eq pKStk,pUStk=r0,r17 // are we in kernel mode already? - dep r29=r8,r29,41,2 // insert new ei into cr.ipsr + mov.m ar.bspstore=r22 // M2 switch to kernel RBS + cmp.eq p8,p9=2,r8 // A isr.ei==2? ;; - // switch from user to kernel RBS: - MINSTATE_START_SAVE_MIN_VIRT - br.call.sptk.many b7=ia64_syscall_setup - ;; - MINSTATE_END_SAVE_MIN_VIRT // switch to bank 1 - ssm psr.ic | PSR_DEFAULT_BITS - ;; - srlz.i // guarantee that interruption collection is on - mov r3=NR_syscalls - 1 - ;; -(p15) ssm psr.i // restore psr.i - // p10==true means out registers are more than 8 or r15's Nat is true -(p10) br.cond.spnt.many ia64_ret_from_syscall - ;; - movl r16=sys_call_table +(p8) mov r8=0 // A clear ei to 0 +(p7) movl r30=sys_ni_syscall // X - adds r15=-1024,r15 // r15 contains the syscall number---subtract 1024 - movl r2=ia64_ret_from_syscall - ;; - shladd r20=r15,3,r16 // r20 = sys_call_table + 8*(syscall-1024) - cmp.leu p6,p7=r15,r3 // (syscall > 0 && syscall < 1024 + NR_syscalls) ? - mov rp=r2 // set the real return addr +(p8) adds r28=16,r28 // A switch cr.iip to next bundle +(p9) adds r8=1,r8 // A increment ei to next slot + nop.i 0 ;; -(p6) ld8 r20=[r20] // load address of syscall entry point -(p7) movl r20=sys_ni_syscall - add r2=TI_FLAGS+IA64_TASK_SIZE,r13 - ;; - ld4 r2=[r2] // r2 = current_thread_info()->flags - ;; - and r2=_TIF_SYSCALL_TRACEAUDIT,r2 // mask trace or audit + mov.m r25=ar.unat // M2 (5 cyc) + dep r29=r8,r29,41,2 // I0 insert new ei into cr.ipsr + adds r15=1024,r15 // A restore original syscall number + // + // If any of the above loads miss in L1D, we'll stall here until + // the data arrives. + // +/////////////////////////////////////////////////////////////////////// + st1 [r16]=r0 // M2|3 clear current->thread.on_ustack flag + mov b6=r30 // I0 setup syscall handler branch reg early + cmp.eq pKStk,pUStk=r0,r17 // A were we on kernel stacks already? + + and r9=_TIF_SYSCALL_TRACEAUDIT,r9 // A mask trace or audit + mov r18=ar.bsp // M2 (12 cyc) +(pKStk) br.cond.spnt .break_fixup // B we're already in kernel-mode -- fix up RBS + ;; +.back_from_break_fixup: +(pUStk) addl r1=IA64_STK_OFFSET-IA64_PT_REGS_SIZE,r1 // A compute base of memory stack + cmp.eq p14,p0=r9,r0 // A are syscalls being traced/audited? 
+ br.call.sptk.many b7=ia64_syscall_setup // B +1: + mov ar.rsc=0x3 // M2 set eager mode, pl 0, LE, loadrs=0 + nop 0 + bsw.1 // B (6 cyc) regs are saved, switch to bank 1 ;; - cmp.eq p8,p0=r2,r0 - mov b6=r20 + + ssm psr.ic | PSR_DEFAULT_BITS // M2 now it's safe to re-enable intr.-collection + movl r3=ia64_ret_from_syscall // X ;; -(p8) br.call.sptk.many b6=b6 // ignore this return addr - br.cond.sptk ia64_trace_syscall + + srlz.i // M0 ensure interruption collection is on + mov rp=r3 // I0 set the real return addr +(p10) br.cond.spnt.many ia64_ret_from_syscall // B return if bad call-frame or r15 is a NaT + +(p15) ssm psr.i // M2 restore psr.i +(p14) br.call.sptk.many b6=b6 // B invoke syscall-handker (ignore return addr) + br.cond.spnt.many ia64_trace_syscall // B do syscall-tracing thingamagic // NOT REACHED +/////////////////////////////////////////////////////////////////////// + // On entry, we optimistically assumed that we're coming from user-space. + // For the rare cases where a system-call is done from within the kernel, + // we fix things up at this point: +.break_fixup: + add r1=-IA64_PT_REGS_SIZE,sp // A allocate space for pt_regs structure + mov ar.rnat=r24 // M2 restore kernel's AR.RNAT + ;; + mov ar.bspstore=r23 // M2 restore kernel's AR.BSPSTORE + br.cond.sptk .back_from_break_fixup END(break_fault) .org ia64_ivt+0x3000 @@ -837,8 +878,6 @@ END(interrupt) * - r31: saved pr * - b0: original contents (to be saved) * On exit: - * - executing on bank 1 registers - * - psr.ic enabled, interrupts restored * - p10: TRUE if syscall is invoked with more than 8 out * registers or r15's Nat is true * - r1: kernel's gp @@ -846,8 +885,11 @@ END(interrupt) * - r8: -EINVAL if p10 is true * - r12: points to kernel stack * - r13: points to current task + * - r14: preserved (same as on entry) + * - p13: preserved * - p15: TRUE if interrupts need to be re-enabled * - ar.fpsr: set to kernel settings + * - b6: preserved (same as on entry) */ GLOBAL_ENTRY(ia64_syscall_setup) #if PT(B6) != 0 @@ -915,10 +957,10 @@ GLOBAL_ENTRY(ia64_syscall_setup) (p13) mov in5=-1 ;; st8 [r16]=r21,PT(R8)-PT(AR_FPSR) // save ar.fpsr - tnat.nz p14,p0=in6 + tnat.nz p13,p0=in6 cmp.lt p10,p9=r11,r8 // frame size can't be more than local+8 ;; - stf8 [r16]=f1 // ensure pt_regs.r8 != 0 (see handle_syscall_error) + mov r8=1 (p9) tnat.nz p10,p0=r15 adds r12=-16,r1 // switch to kernel memory stack (with 16 bytes of scratch) @@ -929,9 +971,9 @@ GLOBAL_ENTRY(ia64_syscall_setup) mov r13=r2 // establish `current' movl r1=__gp // establish kernel global pointer ;; -(p14) mov in6=-1 + st8 [r16]=r8 // ensure pt_regs.r8 != 0 (see handle_syscall_error) +(p13) mov in6=-1 (p8) mov in7=-1 - nop.i 0 cmp.eq pSys,pNonSys=r0,r0 // set pSys=1, pNonSys=0 movl r17=FPSR_DEFAULT @@ -1002,6 +1044,8 @@ END(dispatch_illegal_op_fault) FAULT(17) ENTRY(non_syscall) + mov ar.rsc=r27 // restore ar.rsc before SAVE_MIN_WITH_COVER + ;; SAVE_MIN_WITH_COVER // There is no particular reason for this code to be here, other than that @@ -1199,6 +1243,25 @@ END(disabled_fp_reg) // 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50) ENTRY(nat_consumption) DBG_FAULT(26) + + mov r16=cr.ipsr + mov r17=cr.isr + mov r31=pr // save PR + ;; + and r18=0xf,r17 // r18 = cr.ipsr.code{3:0} + tbit.z p6,p0=r17,IA64_ISR_NA_BIT + ;; + cmp.ne.or p6,p0=IA64_ISR_CODE_LFETCH,r18 + dep r16=-1,r16,IA64_PSR_ED_BIT,1 +(p6) br.cond.spnt 1f // branch if (cr.ispr.na == 0 || cr.ipsr.code{3:0} != LFETCH) + ;; + mov cr.ipsr=r16 // set cr.ipsr.na + mov pr=r31,-1 + ;; + rfi + +1: mov 
pr=r31,-1 + ;; FAULT(26) END(nat_consumption) diff --git a/arch/ia64/kernel/jprobes.S b/arch/ia64/kernel/jprobes.S new file mode 100644 index 000000000000..b7fa3ccd2b0f --- /dev/null +++ b/arch/ia64/kernel/jprobes.S @@ -0,0 +1,61 @@ +/* + * Jprobe specific operations + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) Intel Corporation, 2005 + * + * 2005-May Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy + * <anil.s.keshavamurthy@intel.com> initial implementation + * + * Jprobes (a.k.a. "jump probes" which is built on-top of kprobes) allow a + * probe to be inserted into the beginning of a function call. The fundamental + * difference between a jprobe and a kprobe is the jprobe handler is executed + * in the same context as the target function, while the kprobe handlers + * are executed in interrupt context. + * + * For jprobes we initially gain control by placing a break point in the + * first instruction of the targeted function. When we catch that specific + * break, we: + * * set the return address to our jprobe_inst_return() function + * * jump to the jprobe handler function + * + * Since we fixed up the return address, the jprobe handler will return to our + * jprobe_inst_return() function, giving us control again. At this point we + * are back in the parents frame marker, so we do yet another call to our + * jprobe_break() function to fix up the frame marker as it would normally + * exist in the target function. + * + * Our jprobe_return function then transfers control back to kprobes.c by + * executing a break instruction using one of our reserved numbers. When we + * catch that break in kprobes.c, we continue like we do for a normal kprobe + * by single stepping the emulated instruction, and then returning execution + * to the correct location. + */ +#include <asm/asmmacro.h> + + /* + * void jprobe_break(void) + */ +ENTRY(jprobe_break) + break.m 0x80300 +END(jprobe_break) + + /* + * void jprobe_inst_return(void) + */ +GLOBAL_ENTRY(jprobe_inst_return) + br.call.sptk.many b0=jprobe_break +END(jprobe_inst_return) diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c new file mode 100644 index 000000000000..884f5cd27d8a --- /dev/null +++ b/arch/ia64/kernel/kprobes.c @@ -0,0 +1,721 @@ +/* + * Kernel Probes (KProbes) + * arch/ia64/kernel/kprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * Copyright (C) Intel Corporation, 2005 + * + * 2005-Apr Rusty Lynch <rusty.lynch@intel.com> and Anil S Keshavamurthy + * <anil.s.keshavamurthy@intel.com> adapted from i386 + */ + +#include <linux/config.h> +#include <linux/kprobes.h> +#include <linux/ptrace.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/preempt.h> +#include <linux/moduleloader.h> + +#include <asm/pgtable.h> +#include <asm/kdebug.h> +#include <asm/sections.h> + +extern void jprobe_inst_return(void); + +/* kprobe_status settings */ +#define KPROBE_HIT_ACTIVE 0x00000001 +#define KPROBE_HIT_SS 0x00000002 + +static struct kprobe *current_kprobe, *kprobe_prev; +static unsigned long kprobe_status, kprobe_status_prev; +static struct pt_regs jprobe_saved_regs; + +enum instruction_type {A, I, M, F, B, L, X, u}; +static enum instruction_type bundle_encoding[32][3] = { + { M, I, I }, /* 00 */ + { M, I, I }, /* 01 */ + { M, I, I }, /* 02 */ + { M, I, I }, /* 03 */ + { M, L, X }, /* 04 */ + { M, L, X }, /* 05 */ + { u, u, u }, /* 06 */ + { u, u, u }, /* 07 */ + { M, M, I }, /* 08 */ + { M, M, I }, /* 09 */ + { M, M, I }, /* 0A */ + { M, M, I }, /* 0B */ + { M, F, I }, /* 0C */ + { M, F, I }, /* 0D */ + { M, M, F }, /* 0E */ + { M, M, F }, /* 0F */ + { M, I, B }, /* 10 */ + { M, I, B }, /* 11 */ + { M, B, B }, /* 12 */ + { M, B, B }, /* 13 */ + { u, u, u }, /* 14 */ + { u, u, u }, /* 15 */ + { B, B, B }, /* 16 */ + { B, B, B }, /* 17 */ + { M, M, B }, /* 18 */ + { M, M, B }, /* 19 */ + { u, u, u }, /* 1A */ + { u, u, u }, /* 1B */ + { M, F, B }, /* 1C */ + { M, F, B }, /* 1D */ + { u, u, u }, /* 1E */ + { u, u, u }, /* 1F */ +}; + +/* + * In this function we check to see if the instruction + * is IP relative instruction and update the kprobe + * inst flag accordingly + */ +static void update_kprobe_inst_flag(uint template, uint slot, uint major_opcode, + unsigned long kprobe_inst, struct kprobe *p) +{ + p->ainsn.inst_flag = 0; + p->ainsn.target_br_reg = 0; + + if (bundle_encoding[template][slot] == B) { + switch (major_opcode) { + case INDIRECT_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + case IP_RELATIVE_PREDICT_OPCODE: + case IP_RELATIVE_BRANCH_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; + break; + case IP_RELATIVE_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + } + } else if (bundle_encoding[template][slot] == X) { + switch (major_opcode) { + case LONG_CALL_OPCODE: + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; + } + } + return; +} + +/* + * In this function we check to see if the instruction + * on which we are inserting kprobe is supported. 
+ * Returns 0 if supported + * Returns -EINVAL if unsupported + */ +static int unsupported_inst(uint template, uint slot, uint major_opcode, + unsigned long kprobe_inst, struct kprobe *p) +{ + unsigned long addr = (unsigned long)p->addr; + + if (bundle_encoding[template][slot] == I) { + switch (major_opcode) { + case 0x0: //I_UNIT_MISC_OPCODE: + /* + * Check for Integer speculation instruction + * - Bit 33-35 to be equal to 0x1 + */ + if (((kprobe_inst >> 33) & 0x7) == 1) { + printk(KERN_WARNING + "Kprobes on speculation inst at <0x%lx> not supported\n", + addr); + return -EINVAL; + } + + /* + * IP relative mov instruction + * - Bit 27-35 to be equal to 0x30 + */ + if (((kprobe_inst >> 27) & 0x1FF) == 0x30) { + printk(KERN_WARNING + "Kprobes on \"mov r1=ip\" at <0x%lx> not supported\n", + addr); + return -EINVAL; + + } + } + } + return 0; +} + + +/* + * In this function we check to see if the instruction + * (qp) cmpx.crel.ctype p1,p2=r2,r3 + * on which we are inserting kprobe is cmp instruction + * with ctype as unc. + */ +static uint is_cmp_ctype_unc_inst(uint template, uint slot, uint major_opcode, +unsigned long kprobe_inst) +{ + cmp_inst_t cmp_inst; + uint ctype_unc = 0; + + if (!((bundle_encoding[template][slot] == I) || + (bundle_encoding[template][slot] == M))) + goto out; + + if (!((major_opcode == 0xC) || (major_opcode == 0xD) || + (major_opcode == 0xE))) + goto out; + + cmp_inst.l = kprobe_inst; + if ((cmp_inst.f.x2 == 0) || (cmp_inst.f.x2 == 1)) { + /* Integere compare - Register Register (A6 type)*/ + if ((cmp_inst.f.tb == 0) && (cmp_inst.f.ta == 0) + &&(cmp_inst.f.c == 1)) + ctype_unc = 1; + } else if ((cmp_inst.f.x2 == 2)||(cmp_inst.f.x2 == 3)) { + /* Integere compare - Immediate Register (A8 type)*/ + if ((cmp_inst.f.ta == 0) &&(cmp_inst.f.c == 1)) + ctype_unc = 1; + } +out: + return ctype_unc; +} + +/* + * In this function we override the bundle with + * the break instruction at the given slot. 
+ */ +static void prepare_break_inst(uint template, uint slot, uint major_opcode, + unsigned long kprobe_inst, struct kprobe *p) +{ + unsigned long break_inst = BREAK_INST; + bundle_t *bundle = &p->ainsn.insn.bundle; + + /* + * Copy the original kprobe_inst qualifying predicate(qp) + * to the break instruction iff !is_cmp_ctype_unc_inst + * because for cmp instruction with ctype equal to unc, + * which is a special instruction always needs to be + * executed regradless of qp + */ + if (!is_cmp_ctype_unc_inst(template, slot, major_opcode, kprobe_inst)) + break_inst |= (0x3f & kprobe_inst); + + switch (slot) { + case 0: + bundle->quad0.slot0 = break_inst; + break; + case 1: + bundle->quad0.slot1_p0 = break_inst; + bundle->quad1.slot1_p1 = break_inst >> (64-46); + break; + case 2: + bundle->quad1.slot2 = break_inst; + break; + } + + /* + * Update the instruction flag, so that we can + * emulate the instruction properly after we + * single step on original instruction + */ + update_kprobe_inst_flag(template, slot, major_opcode, kprobe_inst, p); +} + +static inline void get_kprobe_inst(bundle_t *bundle, uint slot, + unsigned long *kprobe_inst, uint *major_opcode) +{ + unsigned long kprobe_inst_p0, kprobe_inst_p1; + unsigned int template; + + template = bundle->quad0.template; + + switch (slot) { + case 0: + *major_opcode = (bundle->quad0.slot0 >> SLOT0_OPCODE_SHIFT); + *kprobe_inst = bundle->quad0.slot0; + break; + case 1: + *major_opcode = (bundle->quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT); + kprobe_inst_p0 = bundle->quad0.slot1_p0; + kprobe_inst_p1 = bundle->quad1.slot1_p1; + *kprobe_inst = kprobe_inst_p0 | (kprobe_inst_p1 << (64-46)); + break; + case 2: + *major_opcode = (bundle->quad1.slot2 >> SLOT2_OPCODE_SHIFT); + *kprobe_inst = bundle->quad1.slot2; + break; + } +} + +/* Returns non-zero if the addr is in the Interrupt Vector Table */ +static inline int in_ivt_functions(unsigned long addr) +{ + return (addr >= (unsigned long)__start_ivt_text + && addr < (unsigned long)__end_ivt_text); +} + +static int valid_kprobe_addr(int template, int slot, unsigned long addr) +{ + if ((slot > 2) || ((bundle_encoding[template][1] == L) && slot > 1)) { + printk(KERN_WARNING "Attempting to insert unaligned kprobe " + "at 0x%lx\n", addr); + return -EINVAL; + } + + if (in_ivt_functions(addr)) { + printk(KERN_WARNING "Kprobes can't be inserted inside " + "IVT functions at 0x%lx\n", addr); + return -EINVAL; + } + + if (slot == 1 && bundle_encoding[template][1] != L) { + printk(KERN_WARNING "Inserting kprobes on slot #1 " + "is not supported\n"); + return -EINVAL; + } + + return 0; +} + +static inline void save_previous_kprobe(void) +{ + kprobe_prev = current_kprobe; + kprobe_status_prev = kprobe_status; +} + +static inline void restore_previous_kprobe(void) +{ + current_kprobe = kprobe_prev; + kprobe_status = kprobe_status_prev; +} + +static inline void set_current_kprobe(struct kprobe *p) +{ + current_kprobe = p; +} + +static void kretprobe_trampoline(void) +{ +} + +/* + * At this point the target function has been tricked into + * returning into our trampoline. 
Lookup the associated instance
+ * and then:
+ *    - call the handler function
+ *    - cleanup by marking the instance as unused
+ *    - long jump back to the original return address
+ */
+int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct kretprobe_instance *ri = NULL;
+	struct hlist_head *head;
+	struct hlist_node *node, *tmp;
+	unsigned long orig_ret_address = 0;
+	unsigned long trampoline_address =
+		((struct fnptr *)kretprobe_trampoline)->ip;
+
+	head = kretprobe_inst_table_head(current);
+
+	/*
+	 * It is possible to have multiple instances associated with a given
+	 * task either because multiple functions in the call path
+	 * have a return probe installed on them, and/or more than one
+	 * return probe was registered for a target function.
+	 *
+	 * We can handle this because:
+	 *     - instances are always inserted at the head of the list
+	 *     - when multiple return probes are registered for the same
+	 *       function, the first instance's ret_addr will point to the
+	 *       real return address, and all the rest will point to
+	 *       kretprobe_trampoline
+	 */
+	hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+		if (ri->task != current)
+			/* another task is sharing our hash bucket */
+			continue;
+
+		if (ri->rp && ri->rp->handler)
+			ri->rp->handler(ri, regs);
+
+		orig_ret_address = (unsigned long)ri->ret_addr;
+		recycle_rp_inst(ri);
+
+		if (orig_ret_address != trampoline_address)
+			/*
+			 * This is the real return address. Any other
+			 * instances associated with this task are for
+			 * other calls deeper on the call stack
+			 */
+			break;
+	}
+
+	BUG_ON(!orig_ret_address || (orig_ret_address == trampoline_address));
+	regs->cr_iip = orig_ret_address;
+
+	unlock_kprobes();
+	preempt_enable_no_resched();
+
+	/*
+	 * By returning a non-zero value, we are telling
+	 * kprobe_handler() that we have handled unlocking
+	 * and re-enabling preemption.
+ */ + return 1; +} + +void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) +{ + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; + ri->ret_addr = (kprobe_opcode_t *)regs->b0; + + /* Replace the return addr with trampoline addr */ + regs->b0 = ((struct fnptr *)kretprobe_trampoline)->ip; + + add_rp_inst(ri); + } else { + rp->nmissed++; + } +} + +int arch_prepare_kprobe(struct kprobe *p) +{ + unsigned long addr = (unsigned long) p->addr; + unsigned long *kprobe_addr = (unsigned long *)(addr & ~0xFULL); + unsigned long kprobe_inst=0; + unsigned int slot = addr & 0xf, template, major_opcode = 0; + bundle_t *bundle = &p->ainsn.insn.bundle; + + memcpy(&p->opcode.bundle, kprobe_addr, sizeof(bundle_t)); + memcpy(&p->ainsn.insn.bundle, kprobe_addr, sizeof(bundle_t)); + + template = bundle->quad0.template; + + if(valid_kprobe_addr(template, slot, addr)) + return -EINVAL; + + /* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */ + if (slot == 1 && bundle_encoding[template][1] == L) + slot++; + + /* Get kprobe_inst and major_opcode from the bundle */ + get_kprobe_inst(bundle, slot, &kprobe_inst, &major_opcode); + + if (unsupported_inst(template, slot, major_opcode, kprobe_inst, p)) + return -EINVAL; + + prepare_break_inst(template, slot, major_opcode, kprobe_inst, p); + + return 0; +} + +void arch_arm_kprobe(struct kprobe *p) +{ + unsigned long addr = (unsigned long)p->addr; + unsigned long arm_addr = addr & ~0xFULL; + + memcpy((char *)arm_addr, &p->ainsn.insn.bundle, sizeof(bundle_t)); + flush_icache_range(arm_addr, arm_addr + sizeof(bundle_t)); +} + +void arch_disarm_kprobe(struct kprobe *p) +{ + unsigned long addr = (unsigned long)p->addr; + unsigned long arm_addr = addr & ~0xFULL; + + /* p->opcode contains the original unaltered bundle */ + memcpy((char *) arm_addr, (char *) &p->opcode.bundle, sizeof(bundle_t)); + flush_icache_range(arm_addr, arm_addr + sizeof(bundle_t)); +} + +void arch_remove_kprobe(struct kprobe *p) +{ +} + +/* + * We are resuming execution after a single step fault, so the pt_regs + * structure reflects the register state after we executed the instruction + * located in the kprobe (p->ainsn.insn.bundle). We still need to adjust + * the ip to point back to the original stack address. To set the IP address + * to original stack address, handle the case where we need to fixup the + * relative IP address and/or fixup branch register. 
+ */ +static void resume_execution(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long bundle_addr = ((unsigned long) (&p->opcode.bundle)) & ~0xFULL; + unsigned long resume_addr = (unsigned long)p->addr & ~0xFULL; + unsigned long template; + int slot = ((unsigned long)p->addr & 0xf); + + template = p->opcode.bundle.quad0.template; + + if (slot == 1 && bundle_encoding[template][1] == L) + slot = 2; + + if (p->ainsn.inst_flag) { + + if (p->ainsn.inst_flag & INST_FLAG_FIX_RELATIVE_IP_ADDR) { + /* Fix relative IP address */ + regs->cr_iip = (regs->cr_iip - bundle_addr) + resume_addr; + } + + if (p->ainsn.inst_flag & INST_FLAG_FIX_BRANCH_REG) { + /* + * Fix target branch register, software convention is + * to use either b0 or b6 or b7, so just checking + * only those registers + */ + switch (p->ainsn.target_br_reg) { + case 0: + if ((regs->b0 == bundle_addr) || + (regs->b0 == bundle_addr + 0x10)) { + regs->b0 = (regs->b0 - bundle_addr) + + resume_addr; + } + break; + case 6: + if ((regs->b6 == bundle_addr) || + (regs->b6 == bundle_addr + 0x10)) { + regs->b6 = (regs->b6 - bundle_addr) + + resume_addr; + } + break; + case 7: + if ((regs->b7 == bundle_addr) || + (regs->b7 == bundle_addr + 0x10)) { + regs->b7 = (regs->b7 - bundle_addr) + + resume_addr; + } + break; + } /* end switch */ + } + goto turn_ss_off; + } + + if (slot == 2) { + if (regs->cr_iip == bundle_addr + 0x10) { + regs->cr_iip = resume_addr + 0x10; + } + } else { + if (regs->cr_iip == bundle_addr) { + regs->cr_iip = resume_addr; + } + } + +turn_ss_off: + /* Turn off Single Step bit */ + ia64_psr(regs)->ss = 0; +} + +static void prepare_ss(struct kprobe *p, struct pt_regs *regs) +{ + unsigned long bundle_addr = (unsigned long) &p->opcode.bundle; + unsigned long slot = (unsigned long)p->addr & 0xf; + + /* Update instruction pointer (IIP) and slot number (IPSR.ri) */ + regs->cr_iip = bundle_addr & ~0xFULL; + + if (slot > 2) + slot = 0; + + ia64_psr(regs)->ri = slot; + + /* turn on single stepping */ + ia64_psr(regs)->ss = 1; +} + +static int pre_kprobes_handler(struct die_args *args) +{ + struct kprobe *p; + int ret = 0; + struct pt_regs *regs = args->regs; + kprobe_opcode_t *addr = (kprobe_opcode_t *)instruction_pointer(regs); + + preempt_disable(); + + /* Handle recursion cases */ + if (kprobe_running()) { + p = get_kprobe(addr); + if (p) { + if (kprobe_status == KPROBE_HIT_SS) { + unlock_kprobes(); + goto no_kprobe; + } + /* We have reentered the pre_kprobe_handler(), since + * another probe was hit while within the handler. + * We here save the original kprobes variables and + * just single step on the instruction of the new probe + * without calling any user handlers. + */ + save_previous_kprobe(); + set_current_kprobe(p); + p->nmissed++; + prepare_ss(p, regs); + kprobe_status = KPROBE_REENTER; + return 1; + } else if (args->err == __IA64_BREAK_JPROBE) { + /* + * jprobe instrumented function just completed + */ + p = current_kprobe; + if (p->break_handler && p->break_handler(p, regs)) { + goto ss_probe; + } + } else { + /* Not our break */ + goto no_kprobe; + } + } + + lock_kprobes(); + p = get_kprobe(addr); + if (!p) { + unlock_kprobes(); + goto no_kprobe; + } + + kprobe_status = KPROBE_HIT_ACTIVE; + set_current_kprobe(p); + + if (p->pre_handler && p->pre_handler(p, regs)) + /* + * Our pre-handler is specifically requesting that we just + * do a return. 
This is used for both the jprobe pre-handler + * and the kretprobe trampoline + */ + return 1; + +ss_probe: + prepare_ss(p, regs); + kprobe_status = KPROBE_HIT_SS; + return 1; + +no_kprobe: + preempt_enable_no_resched(); + return ret; +} + +static int post_kprobes_handler(struct pt_regs *regs) +{ + if (!kprobe_running()) + return 0; + + if ((kprobe_status != KPROBE_REENTER) && current_kprobe->post_handler) { + kprobe_status = KPROBE_HIT_SSDONE; + current_kprobe->post_handler(current_kprobe, regs, 0); + } + + resume_execution(current_kprobe, regs); + + /*Restore back the original saved kprobes variables and continue. */ + if (kprobe_status == KPROBE_REENTER) { + restore_previous_kprobe(); + goto out; + } + + unlock_kprobes(); + +out: + preempt_enable_no_resched(); + return 1; +} + +static int kprobes_fault_handler(struct pt_regs *regs, int trapnr) +{ + if (!kprobe_running()) + return 0; + + if (current_kprobe->fault_handler && + current_kprobe->fault_handler(current_kprobe, regs, trapnr)) + return 1; + + if (kprobe_status & KPROBE_HIT_SS) { + resume_execution(current_kprobe, regs); + unlock_kprobes(); + preempt_enable_no_resched(); + } + + return 0; +} + +int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, + void *data) +{ + struct die_args *args = (struct die_args *)data; + switch(val) { + case DIE_BREAK: + if (pre_kprobes_handler(args)) + return NOTIFY_STOP; + break; + case DIE_SS: + if (post_kprobes_handler(args->regs)) + return NOTIFY_STOP; + break; + case DIE_PAGE_FAULT: + if (kprobes_fault_handler(args->regs, args->trapnr)) + return NOTIFY_STOP; + default: + break; + } + return NOTIFY_DONE; +} + +int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct jprobe *jp = container_of(p, struct jprobe, kp); + unsigned long addr = ((struct fnptr *)(jp->entry))->ip; + + /* save architectural state */ + jprobe_saved_regs = *regs; + + /* after rfi, execute the jprobe instrumented function */ + regs->cr_iip = addr & ~0xFULL; + ia64_psr(regs)->ri = addr & 0xf; + regs->r1 = ((struct fnptr *)(jp->entry))->gp; + + /* + * fix the return address to our jprobe_inst_return() function + * in the jprobes.S file + */ + regs->b0 = ((struct fnptr *)(jprobe_inst_return))->ip; + + return 1; +} + +int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) +{ + *regs = jprobe_saved_regs; + return 1; +} + +static struct kprobe trampoline_p = { + .pre_handler = trampoline_probe_handler +}; + +int __init arch_init_kprobes(void) +{ + trampoline_p.addr = + (kprobe_opcode_t *)((struct fnptr *)kretprobe_trampoline)->ip; + return register_kprobe(&trampoline_p); +} diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 736e328b5e61..4ebbf3974381 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -271,7 +271,7 @@ ia64_mca_log_sal_error_record(int sal_info_type) #ifdef CONFIG_ACPI -static int cpe_vector = -1; +int cpe_vector = -1; static irqreturn_t ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs) diff --git a/arch/ia64/kernel/numa.c b/arch/ia64/kernel/numa.c new file mode 100644 index 000000000000..a68ce6678092 --- /dev/null +++ b/arch/ia64/kernel/numa.c @@ -0,0 +1,57 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * ia64 kernel NUMA specific stuff + * + * Copyright (C) 2002 Erich Focht <efocht@ess.nec.de> + * Copyright (C) 2004 Silicon Graphics, Inc. + * Jesse Barnes <jbarnes@sgi.com> + */ +#include <linux/config.h> +#include <linux/topology.h> +#include <linux/module.h> +#include <asm/processor.h> +#include <asm/smp.h> + +u8 cpu_to_node_map[NR_CPUS] __cacheline_aligned; +EXPORT_SYMBOL(cpu_to_node_map); + +cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; + +/** + * build_cpu_to_node_map - setup cpu to node and node to cpumask arrays + * + * Build cpu to node mapping and initialize the per node cpu masks using + * info from the node_cpuid array handed to us by ACPI. + */ +void __init build_cpu_to_node_map(void) +{ + int cpu, i, node; + + for(node=0; node < MAX_NUMNODES; node++) + cpus_clear(node_to_cpu_mask[node]); + + for(cpu = 0; cpu < NR_CPUS; ++cpu) { + node = -1; + for (i = 0; i < NR_CPUS; ++i) + if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) { + node = node_cpuid[i].nid; + break; + } + cpu_to_node_map[cpu] = (node >= 0) ? node : 0; + if (node >= 0) + cpu_set(cpu, node_to_cpu_mask[node]); + } +} diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 6407bff6bfd7..b8ebb8e427ef 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -37,7 +37,6 @@ #include <linux/vfs.h> #include <linux/pagemap.h> #include <linux/mount.h> -#include <linux/version.h> #include <linux/bitops.h> #include <asm/errno.h> diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index ebb71f3d6d19..051e050359e4 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -27,6 +27,7 @@ #include <linux/efi.h> #include <linux/interrupt.h> #include <linux/delay.h> +#include <linux/kprobes.h> #include <asm/cpu.h> #include <asm/delay.h> @@ -178,7 +179,7 @@ static int can_do_pal_halt = 1; static int __init nohalt_setup(char * str) { - pal_halt = 0; + pal_halt = can_do_pal_halt = 0; return 1; } __setup("nohalt", nohalt_setup); @@ -195,6 +196,7 @@ update_pal_halt_status(int status) void default_idle (void) { + local_irq_enable(); while (!need_resched()) if (can_do_pal_halt) safe_halt(); @@ -707,6 +709,13 @@ kernel_thread_helper (int (*fn)(void *), void *arg) void flush_thread (void) { + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. + */ + kprobe_flush_task(current); + /* drop floating-point and debug-register state if it exists: */ current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID); ia64_drop_fpu(current); @@ -721,6 +730,14 @@ flush_thread (void) void exit_thread (void) { + + /* + * Remove function-return probe instances associated with this task + * and put them back on the free list. Do not insert an exit probe for + * this function, it will be disabled by kprobe_flush_task if you do. 
+ */ + kprobe_flush_task(current); + ia64_drop_fpu(current); #ifdef CONFIG_PERFMON /* if needed, stop monitoring and flush state to perfmon context */ @@ -790,16 +807,12 @@ machine_restart (char *restart_cmd) (*efi.reset_system)(EFI_RESET_WARM, 0, 0, NULL); } -EXPORT_SYMBOL(machine_restart); - void machine_halt (void) { cpu_halt(); } -EXPORT_SYMBOL(machine_halt); - void machine_power_off (void) { @@ -808,4 +821,3 @@ machine_power_off (void) machine_halt(); } -EXPORT_SYMBOL(machine_power_off); diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 575a8f657b31..bbb8bc7c0552 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -725,12 +725,32 @@ convert_to_non_syscall (struct task_struct *child, struct pt_regs *pt, break; } + /* + * Note: at the time of this call, the target task is blocked + * in notify_resume_user() and by clearling PRED_LEAVE_SYSCALL + * (aka, "pLvSys") we redirect execution from + * .work_pending_syscall_end to .work_processed_kernel. + */ unw_get_pr(&prev_info, &pr); - pr &= ~(1UL << PRED_SYSCALL); + pr &= ~((1UL << PRED_SYSCALL) | (1UL << PRED_LEAVE_SYSCALL)); pr |= (1UL << PRED_NON_SYSCALL); unw_set_pr(&prev_info, pr); pt->cr_ifs = (1UL << 63) | cfm; + /* + * Clear the memory that is NOT written on syscall-entry to + * ensure we do not leak kernel-state to user when execution + * resumes. + */ + pt->r2 = 0; + pt->r3 = 0; + pt->r14 = 0; + memset(&pt->r16, 0, 16*8); /* clear r16-r31 */ + memset(&pt->f6, 0, 6*16); /* clear f6-f11 */ + pt->b7 = 0; + pt->ar_ccv = 0; + pt->ar_csd = 0; + pt->ar_ssd = 0; } static int @@ -945,6 +965,13 @@ access_uarea (struct task_struct *child, unsigned long addr, *data = (pt->cr_ipsr & IPSR_MASK); return 0; + case PT_AR_RSC: + if (write_access) + pt->ar_rsc = *data | (3 << 2); /* force PL3 */ + else + *data = pt->ar_rsc; + return 0; + case PT_AR_RNAT: urbs_end = ia64_get_user_rbs_end(child, pt, NULL); rnat_addr = (long) ia64_rse_rnat_addr((long *) @@ -996,9 +1023,6 @@ access_uarea (struct task_struct *child, unsigned long addr, case PT_AR_BSPSTORE: ptr = pt_reg_addr(pt, ar_bspstore); break; - case PT_AR_RSC: - ptr = pt_reg_addr(pt, ar_rsc); - break; case PT_AR_UNAT: ptr = pt_reg_addr(pt, ar_unat); break; @@ -1234,7 +1258,7 @@ ptrace_getregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) static long ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) { - unsigned long psr, ec, lc, rnat, bsp, cfm, nat_bits, val = 0; + unsigned long psr, rsc, ec, lc, rnat, bsp, cfm, nat_bits, val = 0; struct unw_frame_info info; struct switch_stack *sw; struct ia64_fpreg fpval; @@ -1267,7 +1291,7 @@ ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) /* app regs */ retval |= __get_user(pt->ar_pfs, &ppr->ar[PT_AUR_PFS]); - retval |= __get_user(pt->ar_rsc, &ppr->ar[PT_AUR_RSC]); + retval |= __get_user(rsc, &ppr->ar[PT_AUR_RSC]); retval |= __get_user(pt->ar_bspstore, &ppr->ar[PT_AUR_BSPSTORE]); retval |= __get_user(pt->ar_unat, &ppr->ar[PT_AUR_UNAT]); retval |= __get_user(pt->ar_ccv, &ppr->ar[PT_AUR_CCV]); @@ -1365,6 +1389,7 @@ ptrace_setregs (struct task_struct *child, struct pt_all_user_regs __user *ppr) retval |= __get_user(nat_bits, &ppr->nat); retval |= access_uarea(child, PT_CR_IPSR, &psr, 1); + retval |= access_uarea(child, PT_AR_RSC, &rsc, 1); retval |= access_uarea(child, PT_AR_EC, &ec, 1); retval |= access_uarea(child, PT_AR_LC, &lc, 1); retval |= access_uarea(child, PT_AR_RNAT, &rnat, 1); diff --git a/arch/ia64/kernel/setup.c 
b/arch/ia64/kernel/setup.c index d14692e0920a..84f89da7c640 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -20,6 +20,7 @@ * 02/01/00 R.Seth fixed get_cpuinfo for SMP * 01/07/99 S.Eranian added the support for command line argument * 06/24/99 W.Drummond added boot_cpu_data. + * 05/28/05 Z. Menyhart Dynamic stride size for "flush_icache_range()" */ #include <linux/config.h> #include <linux/module.h> @@ -40,6 +41,8 @@ #include <linux/serial_core.h> #include <linux/efi.h> #include <linux/initrd.h> +#include <linux/platform.h> +#include <linux/pm.h> #include <asm/ia32.h> #include <asm/machvec.h> @@ -72,6 +75,8 @@ DEFINE_PER_CPU(unsigned long, ia64_phys_stacked_size_p8); unsigned long ia64_cycles_per_usec; struct ia64_boot_param *ia64_boot_param; struct screen_info screen_info; +unsigned long vga_console_iobase; +unsigned long vga_console_membase; unsigned long ia64_max_cacheline_size; unsigned long ia64_iobase; /* virtual address for I/O accesses */ @@ -81,6 +86,13 @@ EXPORT_SYMBOL(io_space); unsigned int num_io_spaces; /* + * "flush_icache_range()" needs to know what processor dependent stride size to use + * when it makes i-cache(s) coherent with d-caches. + */ +#define I_CACHE_STRIDE_SHIFT 5 /* Safest way to go: 32 bytes by 32 bytes */ +unsigned long ia64_i_cache_stride_shift = ~0; + +/* * The merge_mask variable needs to be set to (max(iommu_page_size(iommu)) - 1). This * mask specifies a mask of address bits that must be 0 in order for two buffers to be * mergeable by the I/O MMU (i.e., the end address of the first buffer and the start @@ -273,23 +285,25 @@ io_port_init (void) static inline int __init early_console_setup (char *cmdline) { + int earlycons = 0; + #ifdef CONFIG_SERIAL_SGI_L1_CONSOLE { extern int sn_serial_console_early_setup(void); if (!sn_serial_console_early_setup()) - return 0; + earlycons++; } #endif #ifdef CONFIG_EFI_PCDP if (!efi_setup_pcdp_console(cmdline)) - return 0; + earlycons++; #endif #ifdef CONFIG_SERIAL_8250_CONSOLE if (!early_serial_console_init(cmdline)) - return 0; + earlycons++; #endif - return -1; + return (earlycons) ? 0 : -1; } static inline void @@ -622,6 +636,12 @@ setup_per_cpu_areas (void) /* start_kernel() requires this... */ } +/* + * Calculate the max. cache line size. + * + * In addition, the minimum of the i-cache stride sizes is calculated for + * "flush_icache_range()". 
+ */ static void get_max_cacheline_size (void) { @@ -635,6 +655,8 @@ get_max_cacheline_size (void) printk(KERN_ERR "%s: ia64_pal_cache_summary() failed (status=%ld)\n", __FUNCTION__, status); max = SMP_CACHE_BYTES; + /* Safest setup for "flush_icache_range()" */ + ia64_i_cache_stride_shift = I_CACHE_STRIDE_SHIFT; goto out; } @@ -643,14 +665,31 @@ get_max_cacheline_size (void) &cci); if (status != 0) { printk(KERN_ERR - "%s: ia64_pal_cache_config_info(l=%lu) failed (status=%ld)\n", + "%s: ia64_pal_cache_config_info(l=%lu, 2) failed (status=%ld)\n", __FUNCTION__, l, status); max = SMP_CACHE_BYTES; + /* The safest setup for "flush_icache_range()" */ + cci.pcci_stride = I_CACHE_STRIDE_SHIFT; + cci.pcci_unified = 1; } line_size = 1 << cci.pcci_line_size; if (line_size > max) max = line_size; - } + if (!cci.pcci_unified) { + status = ia64_pal_cache_config_info(l, + /* cache_type (instruction)= */ 1, + &cci); + if (status != 0) { + printk(KERN_ERR + "%s: ia64_pal_cache_config_info(l=%lu, 1) failed (status=%ld)\n", + __FUNCTION__, l, status); + /* The safest setup for "flush_icache_range()" */ + cci.pcci_stride = I_CACHE_STRIDE_SHIFT; + } + } + if (cci.pcci_stride < ia64_i_cache_stride_shift) + ia64_i_cache_stride_shift = cci.pcci_stride; + } out: if (max > ia64_max_cacheline_size) ia64_max_cacheline_size = max; @@ -779,6 +818,7 @@ cpu_init (void) /* size of physical stacked register partition plus 8 bytes: */ __get_cpu_var(ia64_phys_stacked_size_p8) = num_phys_stacked*8 + 8; platform_cpu_init(); + pm_idle = default_idle; } void diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 499b7e5317cf..b8a0a7d257a9 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -94,7 +94,7 @@ sys_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, long arg2, static long restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) { - unsigned long ip, flags, nat, um, cfm; + unsigned long ip, flags, nat, um, cfm, rsc; long err; /* Always make any pending restarted system calls return -EINTR */ @@ -106,7 +106,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) err |= __get_user(ip, &sc->sc_ip); /* instruction pointer */ err |= __get_user(cfm, &sc->sc_cfm); err |= __get_user(um, &sc->sc_um); /* user mask */ - err |= __get_user(scr->pt.ar_rsc, &sc->sc_ar_rsc); + err |= __get_user(rsc, &sc->sc_ar_rsc); err |= __get_user(scr->pt.ar_unat, &sc->sc_ar_unat); err |= __get_user(scr->pt.ar_fpsr, &sc->sc_ar_fpsr); err |= __get_user(scr->pt.ar_pfs, &sc->sc_ar_pfs); @@ -119,6 +119,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) err |= __copy_from_user(&scr->pt.r15, &sc->sc_gr[15], 8); /* r15 */ scr->pt.cr_ifs = cfm | (1UL << 63); + scr->pt.ar_rsc = rsc | (3 << 2); /* force PL3 */ /* establish new instruction pointer: */ scr->pt.cr_iip = ip & ~0x3UL; @@ -142,6 +143,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) __copy_from_user(current->thread.fph, &sc->sc_fr[32], 96*16); psr->mfh = 0; /* drop signal handler's fph contents... 
*/ + preempt_disable(); if (psr->dfh) ia64_drop_fpu(current); else { @@ -149,6 +151,7 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) __ia64_load_fpu(current->thread.fph); ia64_set_local_fpu_owner(current); } + preempt_enable(); } return err; } diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 953095e2ce15..0166a9847095 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -231,13 +231,16 @@ smp_flush_tlb_all (void) void smp_flush_tlb_mm (struct mm_struct *mm) { + preempt_disable(); /* this happens for the common case of a single-threaded fork(): */ if (likely(mm == current->active_mm && atomic_read(&mm->mm_users) == 1)) { local_finish_flush_tlb_mm(mm); + preempt_enable(); return; } + preempt_enable(); /* * We could optimize this further by using mm->cpu_vm_mask to track which CPUs * have been running in the address space. It's not clear that this is worth the @@ -269,7 +272,7 @@ smp_call_function_single (int cpuid, void (*func) (void *info), void *info, int int me = get_cpu(); /* prevent preemption and reschedule on another processor */ if (cpuid == me) { - printk("%s: trying to call self\n", __FUNCTION__); + printk(KERN_INFO "%s: trying to call self\n", __FUNCTION__); put_cpu(); return -EBUSY; } diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index 3865f088ffa2..7d72c0d872b3 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -346,6 +346,7 @@ smp_callin (void) lock_ipi_calllock(); cpu_set(cpuid, cpu_online_map); unlock_ipi_calllock(); + per_cpu(cpu_state, cpuid) = CPU_ONLINE; smp_setup_percpu_timer(); @@ -524,47 +525,6 @@ smp_build_cpu_map (void) } } -#ifdef CONFIG_NUMA - -/* on which node is each logical CPU (one cacheline even for 64 CPUs) */ -u8 cpu_to_node_map[NR_CPUS] __cacheline_aligned; -EXPORT_SYMBOL(cpu_to_node_map); -/* which logical CPUs are on which nodes */ -cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; - -/* - * Build cpu to node mapping and initialize the per node cpu masks. - */ -void __init -build_cpu_to_node_map (void) -{ - int cpu, i, node; - - for(node=0; node<MAX_NUMNODES; node++) - cpus_clear(node_to_cpu_mask[node]); - for(cpu = 0; cpu < NR_CPUS; ++cpu) { - /* - * All Itanium NUMA platforms I know use ACPI, so maybe we - * can drop this ifdef completely. [EF] - */ -#ifdef CONFIG_ACPI_NUMA - node = -1; - for (i = 0; i < NR_CPUS; ++i) - if (cpu_physical_id(cpu) == node_cpuid[i].phys_id) { - node = node_cpuid[i].nid; - break; - } -#else -# error Fixme: Dunno how to build CPU-to-node map. -#endif - cpu_to_node_map[cpu] = (node >= 0) ? node : 0; - if (node >= 0) - cpu_set(cpu, node_to_cpu_mask[node]); - } -} - -#endif /* CONFIG_NUMA */ - /* * Cycle through the APs sending Wakeup IPIs to boot each. 
 */
@@ -611,6 +571,7 @@ void __devinit smp_prepare_boot_cpu(void)
 {
 	cpu_set(smp_processor_id(), cpu_online_map);
 	cpu_set(smp_processor_id(), cpu_callin_map);
+	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 }
 
 /*
@@ -688,6 +649,7 @@ int __cpu_disable(void)
 		return -EBUSY;
 
 	remove_siblinginfo(cpu);
+	cpu_clear(cpu, cpu_online_map);
 	fixup_irqs();
 	local_flush_tlb_all();
 	cpu_clear(cpu, cpu_callin_map);
@@ -774,6 +736,7 @@ __cpu_up (unsigned int cpu)
 	if (cpu_isset(cpu, cpu_callin_map))
 		return -EINVAL;
 
+	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 	/* Processor goes to start_secondary(), sets online flag */
 	ret = do_boot_cpu(sapicid, cpu);
 	if (ret < 0)
diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c
index f1aafd4c05f9..92ff46ad21e2 100644
--- a/arch/ia64/kernel/topology.c
+++ b/arch/ia64/kernel/topology.c
@@ -36,6 +36,15 @@ int arch_register_cpu(int num)
 		parent = &sysfs_nodes[cpu_to_node(num)];
 #endif /* CONFIG_NUMA */
 
+#ifdef CONFIG_ACPI_BOOT
+	/*
+	 * If CPEI cannot be re-targeted and this is
+	 * the CPEI target, then don't create the control file
+	 */
+	if (!can_cpei_retarget() && is_cpu_cpei_target(num))
+		sysfs_cpus[num].cpu.no_control = 1;
+#endif
+
 	return register_cpu(&sysfs_cpus[num].cpu, num, parent);
 }
 
diff --git a/arch/ia64/kernel/traps.c b/arch/ia64/kernel/traps.c
index 1861173bd4f6..4440c8343fa4 100644
--- a/arch/ia64/kernel/traps.c
+++ b/arch/ia64/kernel/traps.c
@@ -21,12 +21,26 @@
 #include <asm/intrinsics.h>
 #include <asm/processor.h>
 #include <asm/uaccess.h>
+#include <asm/kdebug.h>
 
 extern spinlock_t timerlist_lock;
 
 fpswa_interface_t *fpswa_interface;
 EXPORT_SYMBOL(fpswa_interface);
 
+struct notifier_block *ia64die_chain;
+static DEFINE_SPINLOCK(die_notifier_lock);
+
+int register_die_notifier(struct notifier_block *nb)
+{
+	int err = 0;
+	unsigned long flags;
+	spin_lock_irqsave(&die_notifier_lock, flags);
+	err = notifier_chain_register(&ia64die_chain, nb);
+	spin_unlock_irqrestore(&die_notifier_lock, flags);
+	return err;
+}
+
 void __init
 trap_init (void)
 {
@@ -76,14 +90,16 @@ die (const char *str, struct pt_regs *regs, long err)
 		.lock_owner_depth = 0
 	};
 	static int die_counter;
+	int cpu = get_cpu();
 
-	if (die.lock_owner != smp_processor_id()) {
+	if (die.lock_owner != cpu) {
 		console_verbose();
 		spin_lock_irq(&die.lock);
-		die.lock_owner = smp_processor_id();
+		die.lock_owner = cpu;
 		die.lock_owner_depth = 0;
 		bust_spinlocks(1);
 	}
+	put_cpu();
 
 	if (++die.lock_owner_depth < 3) {
 		printk("%s[%d]: %s %ld [%d]\n",
@@ -137,6 +153,10 @@ ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
 
 	switch (break_num) {
 	      case 0: /* unknown error (used by GCC for __builtin_abort()) */
+		if (notify_die(DIE_BREAK, "break 0", regs, break_num, TRAP_BRKPT, SIGTRAP)
+				== NOTIFY_STOP) {
+			return;
+		}
 		die_if_kernel("bugcheck!", regs, break_num);
 		sig = SIGILL; code = ILL_ILLOPC;
 		break;
@@ -189,6 +209,15 @@ ia64_bad_break (unsigned long break_num, struct pt_regs *regs)
 		sig = SIGILL; code = __ILL_BNDMOD;
 		break;
 
+	      case 0x80200:
+	      case 0x80300:
+		if (notify_die(DIE_BREAK, "kprobe", regs, break_num, TRAP_BRKPT, SIGTRAP)
+				== NOTIFY_STOP) {
+			return;
+		}
+		sig = SIGTRAP; code = TRAP_BRKPT;
+		break;
+
 	      default:
 		if (break_num < 0x40000 || break_num > 0x100000)
 			die_if_kernel("Bad break", regs, break_num);
@@ -548,7 +577,11 @@ ia64_fault (unsigned long vector, unsigned long isr, unsigned long ifa,
 #endif
 		break;
 	      case 35: siginfo.si_code = TRAP_BRANCH; ifa = 0; break;
-	      case 36: siginfo.si_code = TRAP_TRACE; ifa = 0; break;
+	      case 36:
		if (notify_die(DIE_SS, "ss", &regs, vector,
			       vector, SIGTRAP)
== NOTIFY_STOP) + return; + siginfo.si_code = TRAP_TRACE; ifa = 0; break; } siginfo.si_signo = SIGTRAP; siginfo.si_errno = 0; diff --git a/arch/ia64/kernel/unwind.c b/arch/ia64/kernel/unwind.c index 2776a074c6f1..3288be47bc75 100644 --- a/arch/ia64/kernel/unwind.c +++ b/arch/ia64/kernel/unwind.c @@ -362,7 +362,7 @@ unw_access_gr (struct unw_frame_info *info, int regnum, unsigned long *val, char if (info->pri_unat_loc) nat_addr = info->pri_unat_loc; else - nat_addr = &info->sw->ar_unat; + nat_addr = &info->sw->caller_unat; nat_mask = (1UL << ((long) addr & 0x1f8)/8); } } else { @@ -524,7 +524,7 @@ unw_access_ar (struct unw_frame_info *info, int regnum, unsigned long *val, int case UNW_AR_UNAT: addr = info->unat_loc; if (!addr) - addr = &info->sw->ar_unat; + addr = &info->sw->caller_unat; break; case UNW_AR_LC: @@ -1775,7 +1775,7 @@ run_script (struct unw_script *script, struct unw_frame_info *state) case UNW_INSN_SETNAT_MEMSTK: if (!state->pri_unat_loc) - state->pri_unat_loc = &state->sw->ar_unat; + state->pri_unat_loc = &state->sw->caller_unat; /* register off. is a multiple of 8, so the least 3 bits (type) are 0 */ s[dst+1] = ((unsigned long) state->pri_unat_loc - s[dst]) | UNW_NAT_MEMSTK; break; @@ -2243,11 +2243,11 @@ unw_init (void) if (8*sizeof(unw_hash_index_t) < UNW_LOG_HASH_SIZE) unw_hash_index_t_is_too_narrow(); - unw.sw_off[unw.preg_index[UNW_REG_PRI_UNAT_GR]] = SW(AR_UNAT); + unw.sw_off[unw.preg_index[UNW_REG_PRI_UNAT_GR]] = SW(CALLER_UNAT); unw.sw_off[unw.preg_index[UNW_REG_BSPSTORE]] = SW(AR_BSPSTORE); - unw.sw_off[unw.preg_index[UNW_REG_PFS]] = SW(AR_UNAT); + unw.sw_off[unw.preg_index[UNW_REG_PFS]] = SW(AR_PFS); unw.sw_off[unw.preg_index[UNW_REG_RP]] = SW(B0); - unw.sw_off[unw.preg_index[UNW_REG_UNAT]] = SW(AR_UNAT); + unw.sw_off[unw.preg_index[UNW_REG_UNAT]] = SW(CALLER_UNAT); unw.sw_off[unw.preg_index[UNW_REG_PR]] = SW(PR); unw.sw_off[unw.preg_index[UNW_REG_LC]] = SW(AR_LC); unw.sw_off[unw.preg_index[UNW_REG_FPSR]] = SW(AR_FPSR); diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index b9f0db4c1b04..a676e79e0681 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -8,6 +8,11 @@ #define LOAD_OFFSET (KERNEL_START - KERNEL_TR_PAGE_SIZE) #include <asm-generic/vmlinux.lds.h> +#define IVT_TEXT \ + VMLINUX_SYMBOL(__start_ivt_text) = .; \ + *(.text.ivt) \ + VMLINUX_SYMBOL(__end_ivt_text) = .; + OUTPUT_FORMAT("elf64-ia64-little") OUTPUT_ARCH(ia64) ENTRY(phys_start) @@ -39,7 +44,7 @@ SECTIONS .text : AT(ADDR(.text) - LOAD_OFFSET) { - *(.text.ivt) + IVT_TEXT *(.text) SCHED_TEXT LOCK_TEXT |
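
Usage note (illustrative only, not part of the patch): the hunks above wire up the arch side of kprobes on ia64; a consumer still goes through the generic register_kprobe() API. The sketch below is a minimal, hypothetical module assuming that API. The sample_ names, the choice of schedule() as the probed function, and the printk text are assumptions made for illustration; the (struct fnptr *) dereference mirrors how the patch itself obtains the code address of kretprobe_trampoline and the jprobe entry point, since on ia64 a function symbol resolves to a descriptor whose ->ip field holds the bundle address.

/* Hypothetical sample module -- not part of this series. */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/kprobes.h>

/* Pre-handler: runs when the break inserted at the probed bundle fires. */
static int sample_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "kprobe hit at %p, b0=0x%lx\n", p->addr, regs->b0);
	return 0;	/* 0: let the saved bundle be single-stepped and resumed */
}

static struct kprobe sample_kp = {
	.pre_handler = sample_pre_handler,
};

static int __init sample_init(void)
{
	/* ia64: the function symbol is a descriptor; the code address is ->ip */
	sample_kp.addr = (kprobe_opcode_t *)((struct fnptr *)schedule)->ip;
	return register_kprobe(&sample_kp);
}

static void __exit sample_exit(void)
{
	unregister_kprobe(&sample_kp);
}

module_init(sample_init);
module_exit(sample_exit);
MODULE_LICENSE("GPL");

Returning 0 from the pre-handler takes the normal path through pre_kprobes_handler()/post_kprobes_handler() above: the original bundle is single-stepped out of line and execution resumes at the following slot.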