From 904464b91eca8c665acea033489225af02eeb75a Mon Sep 17 00:00:00 2001
From: Shawn Guo <shawn.guo@linaro.org>
Date: Fri, 22 Feb 2013 14:01:42 +0100
Subject: ARM: 7655/1: smp_twd: make twd_local_timer_of_register() no-op for
 nosmp

When booting a SMP build kernel with nosmp on kernel cmdline, the
following fat warning will be hit.

------------[ cut here ]------------
WARNING: at arch/arm/kernel/smp_twd.c:345
twd_local_timer_of_register+0x7c/0x90()
twd_local_timer_of_register failed (-6)
Modules linked in:
Backtrace:
[<80011f14>] (dump_backtrace+0x0/0x10c) from [<8044dd30>]
(dump_stack+0x18/0x1c)
 r7:805e9f58 r6:805ba84c r5:80539331 r4:00000159
[<8044dd18>] (dump_stack+0x0/0x1c) from [<80020fbc>]
(warn_slowpath_common+0x54/0x6c)
[<80020f68>] (warn_slowpath_common+0x0/0x6c) from [<80021078>]
(warn_slowpath_fmt+0x38/0x40)
 r9:412fc09a r8:8fffffff r7:ffffffff r6:00000001 r5:80633b8c
r4:80b32da8
[<80021040>] (warn_slowpath_fmt+0x0/0x40) from [<805ba84]
(twd_local_timer_of_register+0x7c/0x90)
 r3:fffffffa r2:8053934b
[<805ba7d0>] (twd_local_timer_of_register+0x0/0x90) from [<805c0bec>]
(imx6q_timer_init+0x18/0x4c)
 r5:80633800 r4:8053b701
[<805c0bd4>] (imx6q_timer_init+0x0/0x4c) from [<805ba4e8>]
(time_init+0x28/0x38)
 r5:80633800 r4:805dc0f4
[<805ba4c0>] (time_init+0x0/0x38) from [<805b6854>]
(start_kernel+0x1a0/0x310)
[<805b66b4>] (start_kernel+0x0/0x310) from [<10008044>] (0x10008044)
 r8:1000406a r7:805f3f8c r6:805dc0c4 r5:805f0518 r4:10c5387d
---[ end trace 1b75b31a2719ed1c ]---

Check (!is_smp() || !setup_max_cpus) in twd_local_timer_of_register()
to make it be a no-op for the conditions, thus avoid above warning.

Reported-by: Dirk Behme <dirk.behme@de.bosch.com>
Signed-off-by: Shawn Guo <shawn.guo@linaro.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/kernel/smp_twd.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/arm/kernel')

diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c
index c092115d903a..3f2565037480 100644
--- a/arch/arm/kernel/smp_twd.c
+++ b/arch/arm/kernel/smp_twd.c
@@ -22,6 +22,7 @@
 #include <linux/of_irq.h>
 #include <linux/of_address.h>
 
+#include <asm/smp_plat.h>
 #include <asm/smp_twd.h>
 #include <asm/localtimer.h>
 
@@ -373,6 +374,9 @@ void __init twd_local_timer_of_register(void)
 	struct device_node *np;
 	int err;
 
+	if (!is_smp() || !setup_max_cpus)
+		return;
+
 	np = of_find_matching_node(NULL, twd_of_match);
 	if (!np)
 		return;
-- 
cgit v1.2.3


From d61947a164760ac520cb416768afdf38c33d60e7 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 28 Feb 2013 17:46:16 +0100
Subject: ARM: 7657/1: head: fix swapper and idmap population with LPAE and
 big-endian

The LPAE page table format uses 64-bit descriptors, so we need to take
endianness into account when populating the swapper and idmap tables
during early initialisation.

This patch ensures that we store the two words making up each page table
entry in the correct order when running big-endian.

Cc: <stable@vger.kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Tested-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/kernel/head.S | 26 ++++++++++++++++++++++----
 1 file changed, 22 insertions(+), 4 deletions(-)

(limited to 'arch/arm/kernel')

diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S
index 486a15ae9011..e0eb9a1cae77 100644
--- a/arch/arm/kernel/head.S
+++ b/arch/arm/kernel/head.S
@@ -184,13 +184,22 @@ __create_page_tables:
 	orr	r3, r3, #3			@ PGD block type
 	mov	r6, #4				@ PTRS_PER_PGD
 	mov	r7, #1 << (55 - 32)		@ L_PGD_SWAPPER
-1:	str	r3, [r0], #4			@ set bottom PGD entry bits
+1:
+#ifdef CONFIG_CPU_ENDIAN_BE8
 	str	r7, [r0], #4			@ set top PGD entry bits
+	str	r3, [r0], #4			@ set bottom PGD entry bits
+#else
+	str	r3, [r0], #4			@ set bottom PGD entry bits
+	str	r7, [r0], #4			@ set top PGD entry bits
+#endif
 	add	r3, r3, #0x1000			@ next PMD table
 	subs	r6, r6, #1
 	bne	1b
 
 	add	r4, r4, #0x1000			@ point to the PMD tables
+#ifdef CONFIG_CPU_ENDIAN_BE8
+	add	r4, r4, #4			@ we only write the bottom word
+#endif
 #endif
 
 	ldr	r7, [r10, #PROCINFO_MM_MMUFLAGS] @ mm_mmuflags
@@ -258,6 +267,11 @@ __create_page_tables:
 	addne	r6, r6, #1 << SECTION_SHIFT
 	strne	r6, [r3]
 
+#if defined(CONFIG_LPAE) && defined(CONFIG_CPU_ENDIAN_BE8)
+	sub	r4, r4, #4			@ Fixup page table pointer
+						@ for 64-bit descriptors
+#endif
+
 #ifdef CONFIG_DEBUG_LL
 #if !defined(CONFIG_DEBUG_ICEDCC) && !defined(CONFIG_DEBUG_SEMIHOSTING)
 	/*
@@ -276,13 +290,17 @@ __create_page_tables:
 	orr	r3, r7, r3, lsl #SECTION_SHIFT
 #ifdef CONFIG_ARM_LPAE
 	mov	r7, #1 << (54 - 32)		@ XN
+#ifdef CONFIG_CPU_ENDIAN_BE8
+	str	r7, [r0], #4
+	str	r3, [r0], #4
 #else
-	orr	r3, r3, #PMD_SECT_XN
-#endif
 	str	r3, [r0], #4
-#ifdef CONFIG_ARM_LPAE
 	str	r7, [r0], #4
 #endif
+#else
+	orr	r3, r3, #PMD_SECT_XN
+	str	r3, [r0], #4
+#endif
 
 #else /* CONFIG_DEBUG_ICEDCC || CONFIG_DEBUG_SEMIHOSTING */
 	/* we don't need any serial debugging mappings */
-- 
cgit v1.2.3


From 8a4e3a9ead7e37ce1505602b564c15da09ac039f Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 28 Feb 2013 17:47:36 +0100
Subject: ARM: 7659/1: mm: make mm->context.id an atomic64_t variable

mm->context.id is updated under asid_lock when a new ASID is allocated
to an mm_struct. However, it is also read without the lock when a task
is being scheduled and checking whether or not the current ASID
generation is up-to-date.

If two threads of the same process are being scheduled in parallel and
the bottom bits of the generation in their mm->context.id match the
current generation (that is, the mm_struct has not been used for ~2^24
rollovers) then the non-atomic, lockless access to mm->context.id may
yield the incorrect ASID.

This patch fixes this issue by making mm->context.id and atomic64_t,
ensuring that the generation is always read consistently. For code that
only requires access to the ASID bits (e.g. TLB flushing by mm), then
the value is accessed directly, which GCC converts to an ldrb.

Cc: <stable@vger.kernel.org> # 3.8
Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/mmu.h         |  8 ++++----
 arch/arm/include/asm/mmu_context.h |  2 +-
 arch/arm/kernel/asm-offsets.c      |  2 +-
 arch/arm/mm/context.c              | 21 +++++++++++++--------
 4 files changed, 19 insertions(+), 14 deletions(-)

(limited to 'arch/arm/kernel')

diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h
index 9f77e7804f3b..e3d55547e755 100644
--- a/arch/arm/include/asm/mmu.h
+++ b/arch/arm/include/asm/mmu.h
@@ -5,15 +5,15 @@
 
 typedef struct {
 #ifdef CONFIG_CPU_HAS_ASID
-	u64 id;
+	atomic64_t	id;
 #endif
-	unsigned int vmalloc_seq;
+	unsigned int	vmalloc_seq;
 } mm_context_t;
 
 #ifdef CONFIG_CPU_HAS_ASID
 #define ASID_BITS	8
 #define ASID_MASK	((~0ULL) << ASID_BITS)
-#define ASID(mm)	((mm)->context.id & ~ASID_MASK)
+#define ASID(mm)	((mm)->context.id.counter & ~ASID_MASK)
 #else
 #define ASID(mm)	(0)
 #endif
@@ -26,7 +26,7 @@ typedef struct {
  *  modified for 2.6 by Hyok S. Choi <hyok.choi@samsung.com>
  */
 typedef struct {
-	unsigned long		end_brk;
+	unsigned long	end_brk;
 } mm_context_t;
 
 #endif
diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h
index e1f644bc7cc5..863a6611323c 100644
--- a/arch/arm/include/asm/mmu_context.h
+++ b/arch/arm/include/asm/mmu_context.h
@@ -25,7 +25,7 @@ void __check_vmalloc_seq(struct mm_struct *mm);
 #ifdef CONFIG_CPU_HAS_ASID
 
 void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk);
-#define init_new_context(tsk,mm)	({ mm->context.id = 0; })
+#define init_new_context(tsk,mm)	({ atomic64_set(&mm->context.id, 0); 0; })
 
 #else	/* !CONFIG_CPU_HAS_ASID */
 
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index 5ce738b43508..923eec7105cf 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -110,7 +110,7 @@ int main(void)
   BLANK();
 #endif
 #ifdef CONFIG_CPU_HAS_ASID
-  DEFINE(MM_CONTEXT_ID,		offsetof(struct mm_struct, context.id));
+  DEFINE(MM_CONTEXT_ID,		offsetof(struct mm_struct, context.id.counter));
   BLANK();
 #endif
   DEFINE(VMA_VM_MM,		offsetof(struct vm_area_struct, vm_mm));
diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c
index 03ba181e359c..44d4ee52f3e2 100644
--- a/arch/arm/mm/context.c
+++ b/arch/arm/mm/context.c
@@ -152,9 +152,9 @@ static int is_reserved_asid(u64 asid)
 	return 0;
 }
 
-static void new_context(struct mm_struct *mm, unsigned int cpu)
+static u64 new_context(struct mm_struct *mm, unsigned int cpu)
 {
-	u64 asid = mm->context.id;
+	u64 asid = atomic64_read(&mm->context.id);
 	u64 generation = atomic64_read(&asid_generation);
 
 	if (asid != 0 && is_reserved_asid(asid)) {
@@ -181,13 +181,14 @@ static void new_context(struct mm_struct *mm, unsigned int cpu)
 		cpumask_clear(mm_cpumask(mm));
 	}
 
-	mm->context.id = asid;
+	return asid;
 }
 
 void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk)
 {
 	unsigned long flags;
 	unsigned int cpu = smp_processor_id();
+	u64 asid;
 
 	if (unlikely(mm->context.vmalloc_seq != init_mm.context.vmalloc_seq))
 		__check_vmalloc_seq(mm);
@@ -198,19 +199,23 @@ void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk)
 	 */
 	cpu_set_reserved_ttbr0();
 
-	if (!((mm->context.id ^ atomic64_read(&asid_generation)) >> ASID_BITS)
-	    && atomic64_xchg(&per_cpu(active_asids, cpu), mm->context.id))
+	asid = atomic64_read(&mm->context.id);
+	if (!((asid ^ atomic64_read(&asid_generation)) >> ASID_BITS)
+	    && atomic64_xchg(&per_cpu(active_asids, cpu), asid))
 		goto switch_mm_fastpath;
 
 	raw_spin_lock_irqsave(&cpu_asid_lock, flags);
 	/* Check that our ASID belongs to the current generation. */
-	if ((mm->context.id ^ atomic64_read(&asid_generation)) >> ASID_BITS)
-		new_context(mm, cpu);
+	asid = atomic64_read(&mm->context.id);
+	if ((asid ^ atomic64_read(&asid_generation)) >> ASID_BITS) {
+		asid = new_context(mm, cpu);
+		atomic64_set(&mm->context.id, asid);
+	}
 
 	if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
 		local_flush_tlb_all();
 
-	atomic64_set(&per_cpu(active_asids, cpu), mm->context.id);
+	atomic64_set(&per_cpu(active_asids, cpu), asid);
 	cpumask_set_cpu(cpu, mm_cpumask(mm));
 	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
 
-- 
cgit v1.2.3


From 862c588f062fe9339a180cf6429e4df1855c376a Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 28 Feb 2013 17:48:11 +0100
Subject: ARM: 7660/1: tlb: add branch predictor maintenance operations

The ARM architecture requires explicit branch predictor maintenance
when updating an instruction stream for a given virtual address. In
reality, this isn't so much of a burden because the branch predictor
is flushed during the cache maintenance required to make the new
instructions visible to the I-side of the processor.

However, there are still some cases where explicit flushing is required,
so add a local_bp_flush_all operation to deal with this.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/tlbflush.h | 34 ++++++++++++++++++++++++++++------
 arch/arm/kernel/smp_tlb.c       | 12 ++++++++++++
 2 files changed, 40 insertions(+), 6 deletions(-)

(limited to 'arch/arm/kernel')

diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
index 6e924d3a77eb..4db8c8820f0d 100644
--- a/arch/arm/include/asm/tlbflush.h
+++ b/arch/arm/include/asm/tlbflush.h
@@ -34,10 +34,13 @@
 #define TLB_V6_D_ASID	(1 << 17)
 #define TLB_V6_I_ASID	(1 << 18)
 
+#define TLB_V6_BP	(1 << 19)
+
 /* Unified Inner Shareable TLB operations (ARMv7 MP extensions) */
-#define TLB_V7_UIS_PAGE	(1 << 19)
-#define TLB_V7_UIS_FULL (1 << 20)
-#define TLB_V7_UIS_ASID (1 << 21)
+#define TLB_V7_UIS_PAGE	(1 << 20)
+#define TLB_V7_UIS_FULL (1 << 21)
+#define TLB_V7_UIS_ASID (1 << 22)
+#define TLB_V7_UIS_BP	(1 << 23)
 
 #define TLB_BARRIER	(1 << 28)
 #define TLB_L2CLEAN_FR	(1 << 29)		/* Feroceon */
@@ -150,7 +153,8 @@
 #define v6wbi_tlb_flags (TLB_WB | TLB_DCLEAN | TLB_BARRIER | \
 			 TLB_V6_I_FULL | TLB_V6_D_FULL | \
 			 TLB_V6_I_PAGE | TLB_V6_D_PAGE | \
-			 TLB_V6_I_ASID | TLB_V6_D_ASID)
+			 TLB_V6_I_ASID | TLB_V6_D_ASID | \
+			 TLB_V6_BP)
 
 #ifdef CONFIG_CPU_TLB_V6
 # define v6wbi_possible_flags	v6wbi_tlb_flags
@@ -166,9 +170,11 @@
 #endif
 
 #define v7wbi_tlb_flags_smp	(TLB_WB | TLB_DCLEAN | TLB_BARRIER | \
-			 TLB_V7_UIS_FULL | TLB_V7_UIS_PAGE | TLB_V7_UIS_ASID)
+				 TLB_V7_UIS_FULL | TLB_V7_UIS_PAGE | \
+				 TLB_V7_UIS_ASID | TLB_V7_UIS_BP)
 #define v7wbi_tlb_flags_up	(TLB_WB | TLB_DCLEAN | TLB_BARRIER | \
-			 TLB_V6_U_FULL | TLB_V6_U_PAGE | TLB_V6_U_ASID)
+				 TLB_V6_U_FULL | TLB_V6_U_PAGE | \
+				 TLB_V6_U_ASID | TLB_V6_BP)
 
 #ifdef CONFIG_CPU_TLB_V7
 
@@ -430,6 +436,20 @@ static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
 	}
 }
 
+static inline void local_flush_bp_all(void)
+{
+	const int zero = 0;
+	const unsigned int __tlb_flag = __cpu_tlb_flags;
+
+	if (tlb_flag(TLB_V7_UIS_BP))
+		asm("mcr p15, 0, %0, c7, c1, 6" : : "r" (zero));
+	else if (tlb_flag(TLB_V6_BP))
+		asm("mcr p15, 0, %0, c7, c5, 6" : : "r" (zero));
+
+	if (tlb_flag(TLB_BARRIER))
+		isb();
+}
+
 /*
  *	flush_pmd_entry
  *
@@ -480,6 +500,7 @@ static inline void clean_pmd_entry(void *pmd)
 #define flush_tlb_kernel_page	local_flush_tlb_kernel_page
 #define flush_tlb_range		local_flush_tlb_range
 #define flush_tlb_kernel_range	local_flush_tlb_kernel_range
+#define flush_bp_all		local_flush_bp_all
 #else
 extern void flush_tlb_all(void);
 extern void flush_tlb_mm(struct mm_struct *mm);
@@ -487,6 +508,7 @@ extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr);
 extern void flush_tlb_kernel_page(unsigned long kaddr);
 extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end);
 extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
+extern void flush_bp_all(void);
 #endif
 
 /*
diff --git a/arch/arm/kernel/smp_tlb.c b/arch/arm/kernel/smp_tlb.c
index 02c5d2ce23bf..bd0300531399 100644
--- a/arch/arm/kernel/smp_tlb.c
+++ b/arch/arm/kernel/smp_tlb.c
@@ -64,6 +64,11 @@ static inline void ipi_flush_tlb_kernel_range(void *arg)
 	local_flush_tlb_kernel_range(ta->ta_start, ta->ta_end);
 }
 
+static inline void ipi_flush_bp_all(void *ignored)
+{
+	local_flush_bp_all();
+}
+
 void flush_tlb_all(void)
 {
 	if (tlb_ops_need_broadcast())
@@ -127,3 +132,10 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 		local_flush_tlb_kernel_range(start, end);
 }
 
+void flush_bp_all(void)
+{
+	if (tlb_ops_need_broadcast())
+		on_each_cpu(ipi_flush_bp_all, NULL, 1);
+	else
+		local_flush_bp_all();
+}
-- 
cgit v1.2.3


From 89c7e4b8bbb3d4fa52df5746a8ad38e610143651 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 28 Feb 2013 17:48:40 +0100
Subject: ARM: 7661/1: mm: perform explicit branch predictor maintenance when
 required

The ARM ARM requires branch predictor maintenance if, for a given ASID,
the instructions at a specific virtual address appear to change.

From the kernel's point of view, that means:

	- Changing the kernel's view of memory (e.g. switching to the
	  identity map)
	- ASID rollover (since ASIDs will be re-allocated to new tasks)

This patch adds explicit branch predictor maintenance when either of the
two conditions above are met.

Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/kernel/smp.c     | 1 +
 arch/arm/kernel/suspend.c | 1 +
 arch/arm/mm/context.c     | 4 +++-
 arch/arm/mm/idmap.c       | 1 +
 4 files changed, 6 insertions(+), 1 deletion(-)

(limited to 'arch/arm/kernel')

diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 1bdfd87c8e41..31644f1978d5 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -285,6 +285,7 @@ asmlinkage void __cpuinit secondary_start_kernel(void)
 	 * switch away from it before attempting any exclusive accesses.
 	 */
 	cpu_switch_mm(mm->pgd, mm);
+	local_flush_bp_all();
 	enter_lazy_tlb(mm, current);
 	local_flush_tlb_all();
 
diff --git a/arch/arm/kernel/suspend.c b/arch/arm/kernel/suspend.c
index 358bca3a995e..c59c97ea8268 100644
--- a/arch/arm/kernel/suspend.c
+++ b/arch/arm/kernel/suspend.c
@@ -68,6 +68,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long))
 	ret = __cpu_suspend(arg, fn);
 	if (ret == 0) {
 		cpu_switch_mm(mm->pgd, mm);
+		local_flush_bp_all();
 		local_flush_tlb_all();
 	}
 
diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c
index 44d4ee52f3e2..a5a4b2bc42ba 100644
--- a/arch/arm/mm/context.c
+++ b/arch/arm/mm/context.c
@@ -212,8 +212,10 @@ void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk)
 		atomic64_set(&mm->context.id, asid);
 	}
 
-	if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
+	if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending)) {
+		local_flush_bp_all();
 		local_flush_tlb_all();
+	}
 
 	atomic64_set(&per_cpu(active_asids, cpu), asid);
 	cpumask_set_cpu(cpu, mm_cpumask(mm));
diff --git a/arch/arm/mm/idmap.c b/arch/arm/mm/idmap.c
index 2dffc010cc41..5ee505c937d1 100644
--- a/arch/arm/mm/idmap.c
+++ b/arch/arm/mm/idmap.c
@@ -141,6 +141,7 @@ void setup_mm_for_reboot(void)
 {
 	/* Switch to the identity mapping. */
 	cpu_switch_mm(idmap_pgd, &init_mm);
+	local_flush_bp_all();
 
 #ifdef CONFIG_CPU_HAS_ASID
 	/*
-- 
cgit v1.2.3


From 1a8e611874da714ee7ef1e92e5160b38dc54959b Mon Sep 17 00:00:00 2001
From: Dietmar Eggemann <Dietmar.Eggemann@arm.com>
Date: Thu, 28 Feb 2013 17:48:57 +0100
Subject: ARM: 7662/1: hw_breakpoint: reset debug logic on secondary CPUs in
 s2ram resume

We must mask out the CPU_TASKS_FROZEN bit so that reset_ctrl_regs is
also called on a secondary CPU during s2ram resume, where only the boot
CPU will receive the PM_EXIT notification.

Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/kernel/hw_breakpoint.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/arm/kernel')

diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index 5eae53e7a2e1..96093b75ab90 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -1023,7 +1023,7 @@ out_mdbgen:
 static int __cpuinit dbg_reset_notify(struct notifier_block *self,
 				      unsigned long action, void *cpu)
 {
-	if (action == CPU_ONLINE)
+	if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE)
 		smp_call_function_single((int)cpu, reset_ctrl_regs, NULL, 1);
 
 	return NOTIFY_OK;
-- 
cgit v1.2.3


From f2fe09b055e2549de41fb107b34c60bac4a1b0cf Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Thu, 28 Feb 2013 17:49:11 +0100
Subject: ARM: 7663/1: perf: fix ARMv7 EVTYPE_MASK to include NSH bit

Masked out PMXEVTYPER.NSH means that we can't enable profiling at PL2,
regardless of the settings in the HDCR.

This patch fixes the broken mask.

Cc: <stable@vger.kernel.org>
Reported-by: Christoffer Dall <cdall@cs.columbia.edu>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/kernel/perf_event_v7.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/arm/kernel')

diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index 8c79a9e70b83..039cffb053a7 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -774,7 +774,7 @@ static const unsigned armv7_a7_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 /*
  * PMXEVTYPER: Event selection reg
  */
-#define	ARMV7_EVTYPE_MASK	0xc00000ff	/* Mask for writable bits */
+#define	ARMV7_EVTYPE_MASK	0xc80000ff	/* Mask for writable bits */
 #define	ARMV7_EVTYPE_EVENT	0xff		/* Mask for EVENT bits */
 
 /*
-- 
cgit v1.2.3


From e595ede6050b1ce982d74f7084f93715bcc32359 Mon Sep 17 00:00:00 2001
From: Chen Gang <gang.chen@asianux.com>
Date: Thu, 28 Feb 2013 17:51:29 +0100
Subject: ARM: 7664/1: perf: remove erroneous semicolon from event
 initialisation

Commit 9dcbf466559f ("ARM: perf: simplify __hw_perf_event_init err
handling") tidied up the error handling code for perf event
initialisation on ARM, but a copy-and-paste error left a dangling
semicolon at the end of an if statement.

This patch removes the broken semicolon, restoring the old group
validation semantics.

Cc: Mark Rutland <mark.rutland@arm.com>
Acked-by: Dirk Behme <dirk.behme@gmail.com>
Signed-off-by: Chen Gang <gang.chen@asianux.com>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/kernel/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/arm/kernel')

diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 31e0eb353cd8..a89206799db8 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -400,7 +400,7 @@ __hw_perf_event_init(struct perf_event *event)
 	}
 
 	if (event->group_leader != event) {
-		if (validate_group(event) != 0);
+		if (validate_group(event) != 0)
 			return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From 3f7d1fe108dbaefd0c57a41753fc2c90b395f458 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <cyrill.gorcunov@gmail.com>
Date: Thu, 28 Feb 2013 21:53:35 +0100
Subject: ARM: 7665/1: Wire up kcmp syscall

Wire up kcmp syscall for ability to proceed checkpoint/restore
procedure on ARM platform.

Signed-off-by: Alexander Kartashov <alekskartashov@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/uapi/asm/unistd.h | 2 +-
 arch/arm/kernel/calls.S            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/arm/kernel')

diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h
index 4da7cde70b5d..af33b44990ed 100644
--- a/arch/arm/include/uapi/asm/unistd.h
+++ b/arch/arm/include/uapi/asm/unistd.h
@@ -404,7 +404,7 @@
 #define __NR_setns			(__NR_SYSCALL_BASE+375)
 #define __NR_process_vm_readv		(__NR_SYSCALL_BASE+376)
 #define __NR_process_vm_writev		(__NR_SYSCALL_BASE+377)
-					/* 378 for kcmp */
+#define __NR_kcmp			(__NR_SYSCALL_BASE+378)
 #define __NR_finit_module		(__NR_SYSCALL_BASE+379)
 
 /*
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index 0cc57611fc4f..c6ca7e376773 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -387,7 +387,7 @@
 /* 375 */	CALL(sys_setns)
 		CALL(sys_process_vm_readv)
 		CALL(sys_process_vm_writev)
-		CALL(sys_ni_syscall)	/* reserved for sys_kcmp */
+		CALL(sys_kcmp)
 		CALL(sys_finit_module)
 #ifndef syscalls_counted
 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
-- 
cgit v1.2.3


From 44d6b1fc3e3c6a3af8e599b724972e881c81e1c9 Mon Sep 17 00:00:00 2001
From: Stephen Boyd <sboyd@codeaurora.org>
Date: Tue, 5 Mar 2013 03:54:06 +0100
Subject: ARM: 7667/1: perf: Fix section mismatch on armpmu_init()

WARNING: vmlinux.o(.text+0xfb80): Section mismatch in reference
from the function armpmu_register() to the function
.init.text:armpmu_init()
The function armpmu_register() references
the function __init armpmu_init().
This is often because armpmu_register lacks a __init
annotation or the annotation of armpmu_init is wrong.

Just drop the __init marking on armpmu_init() because
armpmu_register() no longer has an __init marking.

Acked-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/kernel/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/arm/kernel')

diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index a89206799db8..146157dfe27c 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -484,7 +484,7 @@ const struct dev_pm_ops armpmu_dev_pm_ops = {
 	SET_RUNTIME_PM_OPS(armpmu_runtime_suspend, armpmu_runtime_resume, NULL)
 };
 
-static void __init armpmu_init(struct arm_pmu *armpmu)
+static void armpmu_init(struct arm_pmu *armpmu)
 {
 	atomic_set(&armpmu->active_events, 0);
 	mutex_init(&armpmu->reserve_mutex);
-- 
cgit v1.2.3


From f7db706b132f11c79ae1d74b2382e0926cf31644 Mon Sep 17 00:00:00 2001
From: Santosh Shilimkar <santosh.shilimkar@ti.com>
Date: Thu, 14 Mar 2013 10:03:03 +0100
Subject: ARM: 7674/1: smp: Avoid dummy clockevent being preferred over real
 hardware clock-event

With recent arm broadcast time clean-up from Mark Rutland, the dummy
broadcast device is always registered with timer subsystem. And since
the rating of the dummy clock event is very high, it may be preferred
over a real clock event.

This is a change in behavior from past and not an intended one. So
reduce the rating of the dummy clock-event so that real clock-event
device is selected when available.

Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Santosh Shilimkar <santosh.shilimkar@ti.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/kernel/smp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/arm/kernel')

diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 31644f1978d5..79078edbb9bc 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -480,7 +480,7 @@ static void __cpuinit broadcast_timer_setup(struct clock_event_device *evt)
 	evt->features	= CLOCK_EVT_FEAT_ONESHOT |
 			  CLOCK_EVT_FEAT_PERIODIC |
 			  CLOCK_EVT_FEAT_DUMMY;
-	evt->rating	= 400;
+	evt->rating	= 100;
 	evt->mult	= 1;
 	evt->set_mode	= broadcast_timer_set_mode;
 
-- 
cgit v1.2.3


From 7fe31d28e839f9565c8176ec584676a045970802 Mon Sep 17 00:00:00 2001
From: Dave Martin <dave.martin@linaro.org>
Date: Tue, 17 Jul 2012 14:25:42 +0100
Subject: ARM: mcpm: introduce helpers for platform coherency exit/setup

This provides helper methods to coordinate between CPUs coming down
and CPUs going up, as well as documentation on the used algorithms,
so that cluster teardown and setup
operations are not done for a cluster simultaneously.

For use in the power_down() implementation:
  * __mcpm_cpu_going_down(unsigned int cluster, unsigned int cpu)
  * __mcpm_outbound_enter_critical(unsigned int cluster)
  * __mcpm_outbound_leave_critical(unsigned int cluster)
  * __mcpm_cpu_down(unsigned int cluster, unsigned int cpu)

The power_up_setup() helper should do platform-specific setup in
preparation for turning the CPU on, such as invalidating local caches
or entering coherency.  It must be assembler for now, since it must
run before the MMU can be switched on.  It is passed the affinity level
for which initialization should be performed.

Because the mcpm_sync_struct content is looked-up and modified
with the cache enabled or disabled depending on the code path, it is
crucial to always ensure proper cache maintenance to update main memory
right away.  The sync_cache_*() helpers are used to that end.

Also, in order to prevent a cached writer from interfering with an
adjacent non-cached writer, we ensure each state variable is located to
a separate cache line.

Thanks to Nicolas Pitre and Achin Gupta for the help with this
patch.

Signed-off-by: Dave Martin <dave.martin@linaro.org>
Signed-off-by: Nicolas Pitre <nico@linaro.org>
Reviewed-by: Will Deacon <will.deacon@arm.com>
---
 Documentation/arm/cluster-pm-race-avoidance.txt | 498 ++++++++++++++++++++++++
 arch/arm/common/mcpm_entry.c                    | 150 +++++++
 arch/arm/common/mcpm_head.S                     | 106 ++++-
 arch/arm/include/asm/mcpm.h                     |  73 ++++
 arch/arm/kernel/asm-offsets.c                   |   3 +
 5 files changed, 828 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/arm/cluster-pm-race-avoidance.txt

(limited to 'arch/arm/kernel')

diff --git a/Documentation/arm/cluster-pm-race-avoidance.txt b/Documentation/arm/cluster-pm-race-avoidance.txt
new file mode 100644
index 000000000000..750b6fc24af9
--- /dev/null
+++ b/Documentation/arm/cluster-pm-race-avoidance.txt
@@ -0,0 +1,498 @@
+Cluster-wide Power-up/power-down race avoidance algorithm
+=========================================================
+
+This file documents the algorithm which is used to coordinate CPU and
+cluster setup and teardown operations and to manage hardware coherency
+controls safely.
+
+The section "Rationale" explains what the algorithm is for and why it is
+needed.  "Basic model" explains general concepts using a simplified view
+of the system.  The other sections explain the actual details of the
+algorithm in use.
+
+
+Rationale
+---------
+
+In a system containing multiple CPUs, it is desirable to have the
+ability to turn off individual CPUs when the system is idle, reducing
+power consumption and thermal dissipation.
+
+In a system containing multiple clusters of CPUs, it is also desirable
+to have the ability to turn off entire clusters.
+
+Turning entire clusters off and on is a risky business, because it
+involves performing potentially destructive operations affecting a group
+of independently running CPUs, while the OS continues to run.  This
+means that we need some coordination in order to ensure that critical
+cluster-level operations are only performed when it is truly safe to do
+so.
+
+Simple locking may not be sufficient to solve this problem, because
+mechanisms like Linux spinlocks may rely on coherency mechanisms which
+are not immediately enabled when a cluster powers up.  Since enabling or
+disabling those mechanisms may itself be a non-atomic operation (such as
+writing some hardware registers and invalidating large caches), other
+methods of coordination are required in order to guarantee safe
+power-down and power-up at the cluster level.
+
+The mechanism presented in this document describes a coherent memory
+based protocol for performing the needed coordination.  It aims to be as
+lightweight as possible, while providing the required safety properties.
+
+
+Basic model
+-----------
+
+Each cluster and CPU is assigned a state, as follows:
+
+	DOWN
+	COMING_UP
+	UP
+	GOING_DOWN
+
+	    +---------> UP ----------+
+	    |                        v
+
+	COMING_UP                GOING_DOWN
+
+	    ^                        |
+	    +--------- DOWN <--------+
+
+
+DOWN:	The CPU or cluster is not coherent, and is either powered off or
+	suspended, or is ready to be powered off or suspended.
+
+COMING_UP: The CPU or cluster has committed to moving to the UP state.
+	It may be part way through the process of initialisation and
+	enabling coherency.
+
+UP:	The CPU or cluster is active and coherent at the hardware
+	level.  A CPU in this state is not necessarily being used
+	actively by the kernel.
+
+GOING_DOWN: The CPU or cluster has committed to moving to the DOWN
+	state.  It may be part way through the process of teardown and
+	coherency exit.
+
+
+Each CPU has one of these states assigned to it at any point in time.
+The CPU states are described in the "CPU state" section, below.
+
+Each cluster is also assigned a state, but it is necessary to split the
+state value into two parts (the "cluster" state and "inbound" state) and
+to introduce additional states in order to avoid races between different
+CPUs in the cluster simultaneously modifying the state.  The cluster-
+level states are described in the "Cluster state" section.
+
+To help distinguish the CPU states from cluster states in this
+discussion, the state names are given a CPU_ prefix for the CPU states,
+and a CLUSTER_ or INBOUND_ prefix for the cluster states.
+
+
+CPU state
+---------
+
+In this algorithm, each individual core in a multi-core processor is
+referred to as a "CPU".  CPUs are assumed to be single-threaded:
+therefore, a CPU can only be doing one thing at a single point in time.
+
+This means that CPUs fit the basic model closely.
+
+The algorithm defines the following states for each CPU in the system:
+
+	CPU_DOWN
+	CPU_COMING_UP
+	CPU_UP
+	CPU_GOING_DOWN
+
+	 cluster setup and
+	CPU setup complete          policy decision
+	      +-----------> CPU_UP ------------+
+	      |                                v
+
+	CPU_COMING_UP                   CPU_GOING_DOWN
+
+	      ^                                |
+	      +----------- CPU_DOWN <----------+
+	 policy decision           CPU teardown complete
+	or hardware event
+
+
+The definitions of the four states correspond closely to the states of
+the basic model.
+
+Transitions between states occur as follows.
+
+A trigger event (spontaneous) means that the CPU can transition to the
+next state as a result of making local progress only, with no
+requirement for any external event to happen.
+
+
+CPU_DOWN:
+
+	A CPU reaches the CPU_DOWN state when it is ready for
+	power-down.  On reaching this state, the CPU will typically
+	power itself down or suspend itself, via a WFI instruction or a
+	firmware call.
+
+	Next state:	CPU_COMING_UP
+	Conditions:	none
+
+	Trigger events:
+
+		a) an explicit hardware power-up operation, resulting
+		   from a policy decision on another CPU;
+
+		b) a hardware event, such as an interrupt.
+
+
+CPU_COMING_UP:
+
+	A CPU cannot start participating in hardware coherency until the
+	cluster is set up and coherent.  If the cluster is not ready,
+	then the CPU will wait in the CPU_COMING_UP state until the
+	cluster has been set up.
+
+	Next state:	CPU_UP
+	Conditions:	The CPU's parent cluster must be in CLUSTER_UP.
+	Trigger events:	Transition of the parent cluster to CLUSTER_UP.
+
+	Refer to the "Cluster state" section for a description of the
+	CLUSTER_UP state.
+
+
+CPU_UP:
+	When a CPU reaches the CPU_UP state, it is safe for the CPU to
+	start participating in local coherency.
+
+	This is done by jumping to the kernel's CPU resume code.
+
+	Note that the definition of this state is slightly different
+	from the basic model definition: CPU_UP does not mean that the
+	CPU is coherent yet, but it does mean that it is safe to resume
+	the kernel.  The kernel handles the rest of the resume
+	procedure, so the remaining steps are not visible as part of the
+	race avoidance algorithm.
+
+	The CPU remains in this state until an explicit policy decision
+	is made to shut down or suspend the CPU.
+
+	Next state:	CPU_GOING_DOWN
+	Conditions:	none
+	Trigger events:	explicit policy decision
+
+
+CPU_GOING_DOWN:
+
+	While in this state, the CPU exits coherency, including any
+	operations required to achieve this (such as cleaning data
+	caches).
+
+	Next state:	CPU_DOWN
+	Conditions:	local CPU teardown complete
+	Trigger events:	(spontaneous)
+
+
+Cluster state
+-------------
+
+A cluster is a group of connected CPUs with some common resources.
+Because a cluster contains multiple CPUs, it can be doing multiple
+things at the same time.  This has some implications.  In particular, a
+CPU can start up while another CPU is tearing the cluster down.
+
+In this discussion, the "outbound side" is the view of the cluster state
+as seen by a CPU tearing the cluster down.  The "inbound side" is the
+view of the cluster state as seen by a CPU setting the CPU up.
+
+In order to enable safe coordination in such situations, it is important
+that a CPU which is setting up the cluster can advertise its state
+independently of the CPU which is tearing down the cluster.  For this
+reason, the cluster state is split into two parts:
+
+	"cluster" state: The global state of the cluster; or the state
+		on the outbound side:
+
+		CLUSTER_DOWN
+		CLUSTER_UP
+		CLUSTER_GOING_DOWN
+
+	"inbound" state: The state of the cluster on the inbound side.
+
+		INBOUND_NOT_COMING_UP
+		INBOUND_COMING_UP
+
+
+	The different pairings of these states results in six possible
+	states for the cluster as a whole:
+
+	                            CLUSTER_UP
+	          +==========> INBOUND_NOT_COMING_UP -------------+
+	          #                                               |
+	                                                          |
+	     CLUSTER_UP     <----+                                |
+	  INBOUND_COMING_UP      |                                v
+
+	          ^             CLUSTER_GOING_DOWN       CLUSTER_GOING_DOWN
+	          #              INBOUND_COMING_UP <=== INBOUND_NOT_COMING_UP
+
+	    CLUSTER_DOWN         |                                |
+	  INBOUND_COMING_UP <----+                                |
+	                                                          |
+	          ^                                               |
+	          +===========     CLUSTER_DOWN      <------------+
+	                       INBOUND_NOT_COMING_UP
+
+	Transitions -----> can only be made by the outbound CPU, and
+	only involve changes to the "cluster" state.
+
+	Transitions ===##> can only be made by the inbound CPU, and only
+	involve changes to the "inbound" state, except where there is no
+	further transition possible on the outbound side (i.e., the
+	outbound CPU has put the cluster into the CLUSTER_DOWN state).
+
+	The race avoidance algorithm does not provide a way to determine
+	which exact CPUs within the cluster play these roles.  This must
+	be decided in advance by some other means.  Refer to the section
+	"Last man and first man selection" for more explanation.
+
+
+	CLUSTER_DOWN/INBOUND_NOT_COMING_UP is the only state where the
+	cluster can actually be powered down.
+
+	The parallelism of the inbound and outbound CPUs is observed by
+	the existence of two different paths from CLUSTER_GOING_DOWN/
+	INBOUND_NOT_COMING_UP (corresponding to GOING_DOWN in the basic
+	model) to CLUSTER_DOWN/INBOUND_COMING_UP (corresponding to
+	COMING_UP in the basic model).  The second path avoids cluster
+	teardown completely.
+
+	CLUSTER_UP/INBOUND_COMING_UP is equivalent to UP in the basic
+	model.  The final transition to CLUSTER_UP/INBOUND_NOT_COMING_UP
+	is trivial and merely resets the state machine ready for the
+	next cycle.
+
+	Details of the allowable transitions follow.
+
+	The next state in each case is notated
+
+		<cluster state>/<inbound state> (<transitioner>)
+
+	where the <transitioner> is the side on which the transition
+	can occur; either the inbound or the outbound side.
+
+
+CLUSTER_DOWN/INBOUND_NOT_COMING_UP:
+
+	Next state:	CLUSTER_DOWN/INBOUND_COMING_UP (inbound)
+	Conditions:	none
+	Trigger events:
+
+		a) an explicit hardware power-up operation, resulting
+		   from a policy decision on another CPU;
+
+		b) a hardware event, such as an interrupt.
+
+
+CLUSTER_DOWN/INBOUND_COMING_UP:
+
+	In this state, an inbound CPU sets up the cluster, including
+	enabling of hardware coherency at the cluster level and any
+	other operations (such as cache invalidation) which are required
+	in order to achieve this.
+
+	The purpose of this state is to do sufficient cluster-level
+	setup to enable other CPUs in the cluster to enter coherency
+	safely.
+
+	Next state:	CLUSTER_UP/INBOUND_COMING_UP (inbound)
+	Conditions:	cluster-level setup and hardware coherency complete
+	Trigger events:	(spontaneous)
+
+
+CLUSTER_UP/INBOUND_COMING_UP:
+
+	Cluster-level setup is complete and hardware coherency is
+	enabled for the cluster.  Other CPUs in the cluster can safely
+	enter coherency.
+
+	This is a transient state, leading immediately to
+	CLUSTER_UP/INBOUND_NOT_COMING_UP.  All other CPUs on the cluster
+	should consider treat these two states as equivalent.
+
+	Next state:	CLUSTER_UP/INBOUND_NOT_COMING_UP (inbound)
+	Conditions:	none
+	Trigger events:	(spontaneous)
+
+
+CLUSTER_UP/INBOUND_NOT_COMING_UP:
+
+	Cluster-level setup is complete and hardware coherency is
+	enabled for the cluster.  Other CPUs in the cluster can safely
+	enter coherency.
+
+	The cluster will remain in this state until a policy decision is
+	made to power the cluster down.
+
+	Next state:	CLUSTER_GOING_DOWN/INBOUND_NOT_COMING_UP (outbound)
+	Conditions:	none
+	Trigger events:	policy decision to power down the cluster
+
+
+CLUSTER_GOING_DOWN/INBOUND_NOT_COMING_UP:
+
+	An outbound CPU is tearing the cluster down.  The selected CPU
+	must wait in this state until all CPUs in the cluster are in the
+	CPU_DOWN state.
+
+	When all CPUs are in the CPU_DOWN state, the cluster can be torn
+	down, for example by cleaning data caches and exiting
+	cluster-level coherency.
+
+	To avoid wasteful unnecessary teardown operations, the outbound
+	should check the inbound cluster state for asynchronous
+	transitions to INBOUND_COMING_UP.  Alternatively, individual
+	CPUs can be checked for entry into CPU_COMING_UP or CPU_UP.
+
+
+	Next states:
+
+	CLUSTER_DOWN/INBOUND_NOT_COMING_UP (outbound)
+		Conditions:	cluster torn down and ready to power off
+		Trigger events:	(spontaneous)
+
+	CLUSTER_GOING_DOWN/INBOUND_COMING_UP (inbound)
+		Conditions:	none
+		Trigger events:
+
+			a) an explicit hardware power-up operation,
+			   resulting from a policy decision on another
+			   CPU;
+
+			b) a hardware event, such as an interrupt.
+
+
+CLUSTER_GOING_DOWN/INBOUND_COMING_UP:
+
+	The cluster is (or was) being torn down, but another CPU has
+	come online in the meantime and is trying to set up the cluster
+	again.
+
+	If the outbound CPU observes this state, it has two choices:
+
+		a) back out of teardown, restoring the cluster to the
+		   CLUSTER_UP state;
+
+		b) finish tearing the cluster down and put the cluster
+		   in the CLUSTER_DOWN state; the inbound CPU will
+		   set up the cluster again from there.
+
+	Choice (a) permits the removal of some latency by avoiding
+	unnecessary teardown and setup operations in situations where
+	the cluster is not really going to be powered down.
+
+
+	Next states:
+
+	CLUSTER_UP/INBOUND_COMING_UP (outbound)
+		Conditions:	cluster-level setup and hardware
+				coherency complete
+		Trigger events:	(spontaneous)
+
+	CLUSTER_DOWN/INBOUND_COMING_UP (outbound)
+		Conditions:	cluster torn down and ready to power off
+		Trigger events:	(spontaneous)
+
+
+Last man and First man selection
+--------------------------------
+
+The CPU which performs cluster tear-down operations on the outbound side
+is commonly referred to as the "last man".
+
+The CPU which performs cluster setup on the inbound side is commonly
+referred to as the "first man".
+
+The race avoidance algorithm documented above does not provide a
+mechanism to choose which CPUs should play these roles.
+
+
+Last man:
+
+When shutting down the cluster, all the CPUs involved are initially
+executing Linux and hence coherent.  Therefore, ordinary spinlocks can
+be used to select a last man safely, before the CPUs become
+non-coherent.
+
+
+First man:
+
+Because CPUs may power up asynchronously in response to external wake-up
+events, a dynamic mechanism is needed to make sure that only one CPU
+attempts to play the first man role and do the cluster-level
+initialisation: any other CPUs must wait for this to complete before
+proceeding.
+
+Cluster-level initialisation may involve actions such as configuring
+coherency controls in the bus fabric.
+
+The current implementation in mcpm_head.S uses a separate mutual exclusion
+mechanism to do this arbitration.  This mechanism is documented in
+detail in vlocks.txt.
+
+
+Features and Limitations
+------------------------
+
+Implementation:
+
+	The current ARM-based implementation is split between
+	arch/arm/common/mcpm_head.S (low-level inbound CPU operations) and
+	arch/arm/common/mcpm_entry.c (everything else):
+
+	__mcpm_cpu_going_down() signals the transition of a CPU to the
+		CPU_GOING_DOWN state.
+
+	__mcpm_cpu_down() signals the transition of a CPU to the CPU_DOWN
+		state.
+
+	A CPU transitions to CPU_COMING_UP and then to CPU_UP via the
+		low-level power-up code in mcpm_head.S.  This could
+		involve CPU-specific setup code, but in the current
+		implementation it does not.
+
+	__mcpm_outbound_enter_critical() and __mcpm_outbound_leave_critical()
+		handle transitions from CLUSTER_UP to CLUSTER_GOING_DOWN
+		and from there to CLUSTER_DOWN or back to CLUSTER_UP (in
+		the case of an aborted cluster power-down).
+
+		These functions are more complex than the __mcpm_cpu_*()
+		functions due to the extra inter-CPU coordination which
+		is needed for safe transitions at the cluster level.
+
+	A cluster transitions from CLUSTER_DOWN back to CLUSTER_UP via
+		the low-level power-up code in mcpm_head.S.  This
+		typically involves platform-specific setup code,
+		provided by the platform-specific power_up_setup
+		function registered via mcpm_sync_init.
+
+Deep topologies:
+
+	As currently described and implemented, the algorithm does not
+	support CPU topologies involving more than two levels (i.e.,
+	clusters of clusters are not supported).  The algorithm could be
+	extended by replicating the cluster-level states for the
+	additional topological levels, and modifying the transition
+	rules for the intermediate (non-outermost) cluster levels.
+
+
+Colophon
+--------
+
+Originally created and documented by Dave Martin for Linaro Limited, in
+collaboration with Nicolas Pitre and Achin Gupta.
+
+Copyright (C) 2012-2013  Linaro Limited
+Distributed under the terms of Version 2 of the GNU General Public
+License, as defined in linux/COPYING.
diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c
index 5d72889a58a4..370236dd1a03 100644
--- a/arch/arm/common/mcpm_entry.c
+++ b/arch/arm/common/mcpm_entry.c
@@ -16,6 +16,7 @@
 #include <asm/mcpm.h>
 #include <asm/cacheflush.h>
 #include <asm/idmap.h>
+#include <asm/cputype.h>
 
 extern unsigned long mcpm_entry_vectors[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER];
 
@@ -111,3 +112,152 @@ int mcpm_cpu_powered_up(void)
 		platform_ops->powered_up();
 	return 0;
 }
+
+struct sync_struct mcpm_sync;
+
+/*
+ * __mcpm_cpu_going_down: Indicates that the cpu is being torn down.
+ *    This must be called at the point of committing to teardown of a CPU.
+ *    The CPU cache (SCTRL.C bit) is expected to still be active.
+ */
+void __mcpm_cpu_going_down(unsigned int cpu, unsigned int cluster)
+{
+	mcpm_sync.clusters[cluster].cpus[cpu].cpu = CPU_GOING_DOWN;
+	sync_cache_w(&mcpm_sync.clusters[cluster].cpus[cpu].cpu);
+}
+
+/*
+ * __mcpm_cpu_down: Indicates that cpu teardown is complete and that the
+ *    cluster can be torn down without disrupting this CPU.
+ *    To avoid deadlocks, this must be called before a CPU is powered down.
+ *    The CPU cache (SCTRL.C bit) is expected to be off.
+ *    However L2 cache might or might not be active.
+ */
+void __mcpm_cpu_down(unsigned int cpu, unsigned int cluster)
+{
+	dmb();
+	mcpm_sync.clusters[cluster].cpus[cpu].cpu = CPU_DOWN;
+	sync_cache_w(&mcpm_sync.clusters[cluster].cpus[cpu].cpu);
+	dsb_sev();
+}
+
+/*
+ * __mcpm_outbound_leave_critical: Leave the cluster teardown critical section.
+ * @state: the final state of the cluster:
+ *     CLUSTER_UP: no destructive teardown was done and the cluster has been
+ *         restored to the previous state (CPU cache still active); or
+ *     CLUSTER_DOWN: the cluster has been torn-down, ready for power-off
+ *         (CPU cache disabled, L2 cache either enabled or disabled).
+ */
+void __mcpm_outbound_leave_critical(unsigned int cluster, int state)
+{
+	dmb();
+	mcpm_sync.clusters[cluster].cluster = state;
+	sync_cache_w(&mcpm_sync.clusters[cluster].cluster);
+	dsb_sev();
+}
+
+/*
+ * __mcpm_outbound_enter_critical: Enter the cluster teardown critical section.
+ * This function should be called by the last man, after local CPU teardown
+ * is complete.  CPU cache expected to be active.
+ *
+ * Returns:
+ *     false: the critical section was not entered because an inbound CPU was
+ *         observed, or the cluster is already being set up;
+ *     true: the critical section was entered: it is now safe to tear down the
+ *         cluster.
+ */
+bool __mcpm_outbound_enter_critical(unsigned int cpu, unsigned int cluster)
+{
+	unsigned int i;
+	struct mcpm_sync_struct *c = &mcpm_sync.clusters[cluster];
+
+	/* Warn inbound CPUs that the cluster is being torn down: */
+	c->cluster = CLUSTER_GOING_DOWN;
+	sync_cache_w(&c->cluster);
+
+	/* Back out if the inbound cluster is already in the critical region: */
+	sync_cache_r(&c->inbound);
+	if (c->inbound == INBOUND_COMING_UP)
+		goto abort;
+
+	/*
+	 * Wait for all CPUs to get out of the GOING_DOWN state, so that local
+	 * teardown is complete on each CPU before tearing down the cluster.
+	 *
+	 * If any CPU has been woken up again from the DOWN state, then we
+	 * shouldn't be taking the cluster down at all: abort in that case.
+	 */
+	sync_cache_r(&c->cpus);
+	for (i = 0; i < MAX_CPUS_PER_CLUSTER; i++) {
+		int cpustate;
+
+		if (i == cpu)
+			continue;
+
+		while (1) {
+			cpustate = c->cpus[i].cpu;
+			if (cpustate != CPU_GOING_DOWN)
+				break;
+
+			wfe();
+			sync_cache_r(&c->cpus[i].cpu);
+		}
+
+		switch (cpustate) {
+		case CPU_DOWN:
+			continue;
+
+		default:
+			goto abort;
+		}
+	}
+
+	return true;
+
+abort:
+	__mcpm_outbound_leave_critical(cluster, CLUSTER_UP);
+	return false;
+}
+
+int __mcpm_cluster_state(unsigned int cluster)
+{
+	sync_cache_r(&mcpm_sync.clusters[cluster].cluster);
+	return mcpm_sync.clusters[cluster].cluster;
+}
+
+extern unsigned long mcpm_power_up_setup_phys;
+
+int __init mcpm_sync_init(
+	void (*power_up_setup)(unsigned int affinity_level))
+{
+	unsigned int i, j, mpidr, this_cluster;
+
+	BUILD_BUG_ON(MCPM_SYNC_CLUSTER_SIZE * MAX_NR_CLUSTERS != sizeof mcpm_sync);
+	BUG_ON((unsigned long)&mcpm_sync & (__CACHE_WRITEBACK_GRANULE - 1));
+
+	/*
+	 * Set initial CPU and cluster states.
+	 * Only one cluster is assumed to be active at this point.
+	 */
+	for (i = 0; i < MAX_NR_CLUSTERS; i++) {
+		mcpm_sync.clusters[i].cluster = CLUSTER_DOWN;
+		mcpm_sync.clusters[i].inbound = INBOUND_NOT_COMING_UP;
+		for (j = 0; j < MAX_CPUS_PER_CLUSTER; j++)
+			mcpm_sync.clusters[i].cpus[j].cpu = CPU_DOWN;
+	}
+	mpidr = read_cpuid_mpidr();
+	this_cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+	for_each_online_cpu(i)
+		mcpm_sync.clusters[this_cluster].cpus[i].cpu = CPU_UP;
+	mcpm_sync.clusters[this_cluster].cluster = CLUSTER_UP;
+	sync_cache_w(&mcpm_sync);
+
+	if (power_up_setup) {
+		mcpm_power_up_setup_phys = virt_to_phys(power_up_setup);
+		sync_cache_w(&mcpm_power_up_setup_phys);
+	}
+
+	return 0;
+}
diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S
index 68c9903075a9..7d729bd72674 100644
--- a/arch/arm/common/mcpm_head.S
+++ b/arch/arm/common/mcpm_head.S
@@ -7,11 +7,19 @@
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
+ *
+ *
+ * Refer to Documentation/arm/cluster-pm-race-avoidance.txt
+ * for details of the synchronisation algorithms used here.
  */
 
 #include <linux/linkage.h>
 #include <asm/mcpm.h>
 
+.if MCPM_SYNC_CLUSTER_CPUS
+.error "cpus must be the first member of struct mcpm_sync_struct"
+.endif
+
 	.macro	pr_dbg	string
 #if defined(CONFIG_DEBUG_LL) && defined(DEBUG)
 	b	1901f
@@ -57,24 +65,114 @@ ENTRY(mcpm_entry_point)
 2:	pr_dbg	"kernel mcpm_entry_point\n"
 
 	/*
-	 * MMU is off so we need to get to mcpm_entry_vectors in a
+	 * MMU is off so we need to get to various variables in a
 	 * position independent way.
 	 */
 	adr	r5, 3f
-	ldr	r6, [r5]
+	ldmia	r5, {r6, r7, r8}
 	add	r6, r5, r6			@ r6 = mcpm_entry_vectors
+	ldr	r7, [r5, r7]			@ r7 = mcpm_power_up_setup_phys
+	add	r8, r5, r8			@ r8 = mcpm_sync
+
+	mov	r0, #MCPM_SYNC_CLUSTER_SIZE
+	mla	r8, r0, r10, r8			@ r8 = sync cluster base
+
+	@ Signal that this CPU is coming UP:
+	mov	r0, #CPU_COMING_UP
+	mov	r5, #MCPM_SYNC_CPU_SIZE
+	mla	r5, r9, r5, r8			@ r5 = sync cpu address
+	strb	r0, [r5]
+
+	@ At this point, the cluster cannot unexpectedly enter the GOING_DOWN
+	@ state, because there is at least one active CPU (this CPU).
+
+	@ Note: the following is racy as another CPU might be testing
+	@ the same flag at the same moment.  That'll be fixed later.
+	ldrb	r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER]
+	cmp	r0, #CLUSTER_UP			@ cluster already up?
+	bne	mcpm_setup			@ if not, set up the cluster
+
+	@ Otherwise, skip setup:
+	b	mcpm_setup_complete
+
+mcpm_setup:
+	@ Control dependency implies strb not observable before previous ldrb.
+
+	@ Signal that the cluster is being brought up:
+	mov	r0, #INBOUND_COMING_UP
+	strb	r0, [r8, #MCPM_SYNC_CLUSTER_INBOUND]
+	dmb
+
+	@ Any CPU trying to take the cluster into CLUSTER_GOING_DOWN from this
+	@ point onwards will observe INBOUND_COMING_UP and abort.
+
+	@ Wait for any previously-pending cluster teardown operations to abort
+	@ or complete:
+mcpm_teardown_wait:
+	ldrb	r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER]
+	cmp	r0, #CLUSTER_GOING_DOWN
+	bne	first_man_setup
+	wfe
+	b	mcpm_teardown_wait
+
+first_man_setup:
+	dmb
+
+	@ If the outbound gave up before teardown started, skip cluster setup:
+
+	cmp	r0, #CLUSTER_UP
+	beq	mcpm_setup_leave
+
+	@ power_up_setup is now responsible for setting up the cluster:
+
+	cmp	r7, #0
+	mov	r0, #1		@ second (cluster) affinity level
+	blxne	r7		@ Call power_up_setup if defined
+	dmb
+
+	mov	r0, #CLUSTER_UP
+	strb	r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER]
+	dmb
+
+mcpm_setup_leave:
+	@ Leave the cluster setup critical section:
+
+	mov	r0, #INBOUND_NOT_COMING_UP
+	strb	r0, [r8, #MCPM_SYNC_CLUSTER_INBOUND]
+	dsb
+	sev
+
+mcpm_setup_complete:
+	@ If a platform-specific CPU setup hook is needed, it is
+	@ called from here.
+
+	cmp	r7, #0
+	mov	r0, #0		@ first (CPU) affinity level
+	blxne	r7		@ Call power_up_setup if defined
+	dmb
+
+	@ Mark the CPU as up:
+
+	mov	r0, #CPU_UP
+	strb	r0, [r5]
+
+	@ Observability order of CPU_UP and opening of the gate does not matter.
 
 mcpm_entry_gated:
 	ldr	r5, [r6, r4, lsl #2]		@ r5 = CPU entry vector
 	cmp	r5, #0
 	wfeeq
 	beq	mcpm_entry_gated
+	dmb
+
 	pr_dbg	"released\n"
 	bx	r5
 
 	.align	2
 
 3:	.word	mcpm_entry_vectors - .
+	.word	mcpm_power_up_setup_phys - 3b
+	.word	mcpm_sync - 3b
 
 ENDPROC(mcpm_entry_point)
 
@@ -84,3 +182,7 @@ ENDPROC(mcpm_entry_point)
 	.type	mcpm_entry_vectors, #object
 ENTRY(mcpm_entry_vectors)
 	.space	4 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER
+
+	.type	mcpm_power_up_setup_phys, #object
+ENTRY(mcpm_power_up_setup_phys)
+	.space  4		@ set by mcpm_sync_init()
diff --git a/arch/arm/include/asm/mcpm.h b/arch/arm/include/asm/mcpm.h
index 627761fce780..3046e90210cb 100644
--- a/arch/arm/include/asm/mcpm.h
+++ b/arch/arm/include/asm/mcpm.h
@@ -24,6 +24,9 @@
 
 #ifndef __ASSEMBLY__
 
+#include <linux/types.h>
+#include <asm/cacheflush.h>
+
 /*
  * Platform specific code should use this symbol to set up secondary
  * entry location for processors to use when released from reset.
@@ -130,5 +133,75 @@ struct mcpm_platform_ops {
  */
 int __init mcpm_platform_register(const struct mcpm_platform_ops *ops);
 
+/* Synchronisation structures for coordinating safe cluster setup/teardown: */
+
+/*
+ * When modifying this structure, make sure you update the MCPM_SYNC_ defines
+ * to match.
+ */
+struct mcpm_sync_struct {
+	/* individual CPU states */
+	struct {
+		s8 cpu __aligned(__CACHE_WRITEBACK_GRANULE);
+	} cpus[MAX_CPUS_PER_CLUSTER];
+
+	/* cluster state */
+	s8 cluster __aligned(__CACHE_WRITEBACK_GRANULE);
+
+	/* inbound-side state */
+	s8 inbound __aligned(__CACHE_WRITEBACK_GRANULE);
+};
+
+struct sync_struct {
+	struct mcpm_sync_struct clusters[MAX_NR_CLUSTERS];
+};
+
+extern unsigned long sync_phys;	/* physical address of *mcpm_sync */
+
+void __mcpm_cpu_going_down(unsigned int cpu, unsigned int cluster);
+void __mcpm_cpu_down(unsigned int cpu, unsigned int cluster);
+void __mcpm_outbound_leave_critical(unsigned int cluster, int state);
+bool __mcpm_outbound_enter_critical(unsigned int this_cpu, unsigned int cluster);
+int __mcpm_cluster_state(unsigned int cluster);
+
+int __init mcpm_sync_init(
+	void (*power_up_setup)(unsigned int affinity_level));
+
+#else
+
+/* 
+ * asm-offsets.h causes trouble when included in .c files, and cacheflush.h
+ * cannot be included in asm files.  Let's work around the conflict like this.
+ */
+#include <asm/asm-offsets.h>
+#define __CACHE_WRITEBACK_GRANULE CACHE_WRITEBACK_GRANULE
+
 #endif /* ! __ASSEMBLY__ */
+
+/* Definitions for mcpm_sync_struct */
+#define CPU_DOWN		0x11
+#define CPU_COMING_UP		0x12
+#define CPU_UP			0x13
+#define CPU_GOING_DOWN		0x14
+
+#define CLUSTER_DOWN		0x21
+#define CLUSTER_UP		0x22
+#define CLUSTER_GOING_DOWN	0x23
+
+#define INBOUND_NOT_COMING_UP	0x31
+#define INBOUND_COMING_UP	0x32
+
+/*
+ * Offsets for the mcpm_sync_struct members, for use in asm.
+ * We don't want to make them global to the kernel via asm-offsets.c.
+ */
+#define MCPM_SYNC_CLUSTER_CPUS	0
+#define MCPM_SYNC_CPU_SIZE	__CACHE_WRITEBACK_GRANULE
+#define MCPM_SYNC_CLUSTER_CLUSTER \
+	(MCPM_SYNC_CLUSTER_CPUS + MCPM_SYNC_CPU_SIZE * MAX_CPUS_PER_CLUSTER)
+#define MCPM_SYNC_CLUSTER_INBOUND \
+	(MCPM_SYNC_CLUSTER_CLUSTER + __CACHE_WRITEBACK_GRANULE)
+#define MCPM_SYNC_CLUSTER_SIZE \
+	(MCPM_SYNC_CLUSTER_INBOUND + __CACHE_WRITEBACK_GRANULE)
+
 #endif
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index 923eec7105cf..1bed82a0a9e0 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -149,6 +149,9 @@ int main(void)
   DEFINE(DMA_BIDIRECTIONAL,	DMA_BIDIRECTIONAL);
   DEFINE(DMA_TO_DEVICE,		DMA_TO_DEVICE);
   DEFINE(DMA_FROM_DEVICE,	DMA_FROM_DEVICE);
+  BLANK();
+  DEFINE(CACHE_WRITEBACK_GRANULE, __CACHE_WRITEBACK_GRANULE);
+  BLANK();
 #ifdef CONFIG_KVM_ARM_HOST
   DEFINE(VCPU_KVM,		offsetof(struct kvm_vcpu, kvm));
   DEFINE(VCPU_MIDR,		offsetof(struct kvm_vcpu, arch.midr));
-- 
cgit v1.2.3


From 1ae98561b16f305e43151405f226727c00ee52bc Mon Sep 17 00:00:00 2001
From: Dave Martin <dave.martin@linaro.org>
Date: Fri, 17 Aug 2012 16:07:02 +0100
Subject: ARM: mcpm_head.S: vlock-based first man election

Instead of requiring the first man to be elected in advance (which
can be suboptimal in some situations), this patch uses a per-
cluster mutex to co-ordinate selection of the first man.

This should also make it more feasible to reuse this code path for
asynchronous cluster resume (as in CPUidle scenarios).

We must ensure that the vlock data doesn't share a cacheline with
anything else, or dirty cache eviction could corrupt it.

Signed-off-by: Dave Martin <dave.martin@linaro.org>
Signed-off-by: Nicolas Pitre <nicolas.pitre@linaro.org>
Reviewed-by: Santosh Shilimkar <santosh.shilimkar@ti.com>
Reviewed-by: Will Deacon <will.deacon@arm.com>
---
 arch/arm/common/Makefile      |  2 +-
 arch/arm/common/mcpm_head.S   | 41 ++++++++++++++++++++++++++++++++++++-----
 arch/arm/kernel/asm-offsets.c |  1 +
 3 files changed, 38 insertions(+), 6 deletions(-)

(limited to 'arch/arm/kernel')

diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile
index b070671033ae..9ec273188ccb 100644
--- a/arch/arm/common/Makefile
+++ b/arch/arm/common/Makefile
@@ -11,4 +11,4 @@ obj-$(CONFIG_SHARP_PARAM)	+= sharpsl_param.o
 obj-$(CONFIG_SHARP_SCOOP)	+= scoop.o
 obj-$(CONFIG_PCI_HOST_ITE8152)  += it8152.o
 obj-$(CONFIG_ARM_TIMER_SP804)	+= timer-sp.o
-obj-$(CONFIG_MCPM)		+= mcpm_head.o mcpm_entry.o
+obj-$(CONFIG_MCPM)		+= mcpm_head.o mcpm_entry.o vlock.o
diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S
index 7d729bd72674..8178705c4b24 100644
--- a/arch/arm/common/mcpm_head.S
+++ b/arch/arm/common/mcpm_head.S
@@ -16,6 +16,8 @@
 #include <linux/linkage.h>
 #include <asm/mcpm.h>
 
+#include "vlock.h"
+
 .if MCPM_SYNC_CLUSTER_CPUS
 .error "cpus must be the first member of struct mcpm_sync_struct"
 .endif
@@ -69,10 +71,11 @@ ENTRY(mcpm_entry_point)
 	 * position independent way.
 	 */
 	adr	r5, 3f
-	ldmia	r5, {r6, r7, r8}
+	ldmia	r5, {r6, r7, r8, r11}
 	add	r6, r5, r6			@ r6 = mcpm_entry_vectors
 	ldr	r7, [r5, r7]			@ r7 = mcpm_power_up_setup_phys
 	add	r8, r5, r8			@ r8 = mcpm_sync
+	add	r11, r5, r11			@ r11 = first_man_locks
 
 	mov	r0, #MCPM_SYNC_CLUSTER_SIZE
 	mla	r8, r0, r10, r8			@ r8 = sync cluster base
@@ -86,13 +89,22 @@ ENTRY(mcpm_entry_point)
 	@ At this point, the cluster cannot unexpectedly enter the GOING_DOWN
 	@ state, because there is at least one active CPU (this CPU).
 
-	@ Note: the following is racy as another CPU might be testing
-	@ the same flag at the same moment.  That'll be fixed later.
+	mov	r0, #VLOCK_SIZE
+	mla	r11, r0, r10, r11		@ r11 = cluster first man lock
+	mov	r0, r11
+	mov	r1, r9				@ cpu
+	bl	vlock_trylock			@ implies DMB
+
+	cmp	r0, #0				@ failed to get the lock?
+	bne	mcpm_setup_wait		@ wait for cluster setup if so
+
 	ldrb	r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER]
 	cmp	r0, #CLUSTER_UP			@ cluster already up?
 	bne	mcpm_setup			@ if not, set up the cluster
 
-	@ Otherwise, skip setup:
+	@ Otherwise, release the first man lock and skip setup:
+	mov	r0, r11
+	bl	vlock_unlock
 	b	mcpm_setup_complete
 
 mcpm_setup:
@@ -142,6 +154,19 @@ mcpm_setup_leave:
 	dsb
 	sev
 
+	mov	r0, r11
+	bl	vlock_unlock	@ implies DMB
+	b	mcpm_setup_complete
+
+	@ In the contended case, non-first men wait here for cluster setup
+	@ to complete:
+mcpm_setup_wait:
+	ldrb	r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER]
+	cmp	r0, #CLUSTER_UP
+	wfene
+	bne	mcpm_setup_wait
+	dmb
+
 mcpm_setup_complete:
 	@ If a platform-specific CPU setup hook is needed, it is
 	@ called from here.
@@ -173,11 +198,17 @@ mcpm_entry_gated:
 3:	.word	mcpm_entry_vectors - .
 	.word	mcpm_power_up_setup_phys - 3b
 	.word	mcpm_sync - 3b
+	.word	first_man_locks - 3b
 
 ENDPROC(mcpm_entry_point)
 
 	.bss
-	.align	5
+
+	.align	CACHE_WRITEBACK_ORDER
+	.type	first_man_locks, #object
+first_man_locks:
+	.space	VLOCK_SIZE * MAX_NR_CLUSTERS
+	.align	CACHE_WRITEBACK_ORDER
 
 	.type	mcpm_entry_vectors, #object
 ENTRY(mcpm_entry_vectors)
diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c
index 1bed82a0a9e0..3f088225e71c 100644
--- a/arch/arm/kernel/asm-offsets.c
+++ b/arch/arm/kernel/asm-offsets.c
@@ -150,6 +150,7 @@ int main(void)
   DEFINE(DMA_TO_DEVICE,		DMA_TO_DEVICE);
   DEFINE(DMA_FROM_DEVICE,	DMA_FROM_DEVICE);
   BLANK();
+  DEFINE(CACHE_WRITEBACK_ORDER, __CACHE_WRITEBACK_ORDER);
   DEFINE(CACHE_WRITEBACK_GRANULE, __CACHE_WRITEBACK_GRANULE);
   BLANK();
 #ifdef CONFIG_KVM_ARM_HOST
-- 
cgit v1.2.3