From 76c714be0e5e60c935a53b31be58939510ba1d0f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 30 Oct 2015 18:56:19 +0000 Subject: arm64: pgtable: implement pte_accessible() This patch implements the pte_accessible() macro, which can be used to test whether or not a given pte is a candidate for allocation in the TLB. Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7e074f93f383..450b355f3f49 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -167,6 +167,16 @@ extern struct page *empty_zero_page; ((pte_val(pte) & (PTE_VALID | PTE_USER)) == (PTE_VALID | PTE_USER)) #define pte_valid_not_user(pte) \ ((pte_val(pte) & (PTE_VALID | PTE_USER)) == PTE_VALID) +#define pte_valid_young(pte) \ + ((pte_val(pte) & (PTE_VALID | PTE_AF)) == (PTE_VALID | PTE_AF)) + +/* + * Could the pte be present in the TLB? We must check mm_tlb_flush_pending + * so that we don't erroneously return false for pages that have been + * remapped as PROT_NONE but are yet to be flushed from the TLB. + */ +#define pte_accessible(mm, pte) \ + (mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid_young(pte)) static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) { -- cgit v1.2.3 From b9b7aebb42d1b1392f3111de61136bb6cf3aae3f Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Wed, 2 Dec 2015 14:00:10 +0000 Subject: arm64: fix COMPAT_SHMLBA definition for large pages ARM glibc uses (4 * __getpagesize()) for SHMLBA, which is correct for 4KB pages and works fine for 64KB pages, but the kernel uses a hardcoded 16KB that is too small for 64KB page based kernels. This changes the definition to what user space sees when using 64KB pages. Acked-by: Arnd Bergmann Signed-off-by: Yury Norov Signed-off-by: Will Deacon --- arch/arm64/include/asm/shmparam.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/shmparam.h b/arch/arm64/include/asm/shmparam.h index 4df608a8459e..e368a55ebd22 100644 --- a/arch/arm64/include/asm/shmparam.h +++ b/arch/arm64/include/asm/shmparam.h @@ -21,7 +21,7 @@ * alignment value. Since we don't have aliasing D-caches, the rest of * the time we can safely use PAGE_SIZE. */ -#define COMPAT_SHMLBA 0x4000 +#define COMPAT_SHMLBA (4 * PAGE_SIZE) #include -- cgit v1.2.3 From d86b8da04dfa4771a68bdbad6c424d40f22f0d14 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 19 Nov 2015 17:48:31 +0000 Subject: arm64: spinlock: serialise spin_unlock_wait against concurrent lockers Boqun Feng reported a rather nasty ordering issue with spin_unlock_wait on architectures implementing spin_lock with LL/SC sequences and acquire semantics: | CPU 1 CPU 2 CPU 3 | ================== ==================== ============== | spin_unlock(&lock); | spin_lock(&lock): | r1 = *lock; // r1 == 0; | o = READ_ONCE(object); // reordered here | object = NULL; | smp_mb(); | spin_unlock_wait(&lock); | *lock = 1; | smp_mb(); | o->dead = true; | if (o) // true | BUG_ON(o->dead); // true!! The crux of the problem is that spin_unlock_wait(&lock) can return on CPU 1 whilst CPU 2 is in the process of taking the lock. This can be resolved by upgrading spin_unlock_wait to a LOCK operation, forcing it to serialise against a concurrent locker and giving it acquire semantics in the process (although it is not at all clear whether this is needed - different callers seem to assume different things about the barrier semantics and architectures are similarly disjoint in their implementations of the macro). This patch implements spin_unlock_wait using an LL/SC sequence with acquire semantics on arm64. For v8.1 systems with the LSE atomics, the exclusive writeback is omitted, since the spin_lock operation is indivisible and no intermediate state can be observed. Signed-off-by: Will Deacon --- arch/arm64/include/asm/spinlock.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index c85e96d174a5..fc9682bfe002 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -26,9 +26,28 @@ * The memory barriers are implicit with the load-acquire and store-release * instructions. */ +static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) +{ + unsigned int tmp; + arch_spinlock_t lockval; -#define arch_spin_unlock_wait(lock) \ - do { while (arch_spin_is_locked(lock)) cpu_relax(); } while (0) + asm volatile( +" sevl\n" +"1: wfe\n" +"2: ldaxr %w0, %2\n" +" eor %w1, %w0, %w0, ror #16\n" +" cbnz %w1, 1b\n" + ARM64_LSE_ATOMIC_INSN( + /* LL/SC */ +" stxr %w1, %w0, %2\n" +" cbnz %w1, 2b\n", /* Serialise against any concurrent lockers */ + /* LSE atomics */ +" nop\n" +" nop\n") + : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock) + : + : "memory"); +} #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock) -- cgit v1.2.3 From 6cdf9c7ca687e01840d0215437620a20263012fc Mon Sep 17 00:00:00 2001 From: Jungseok Lee Date: Fri, 4 Dec 2015 11:02:25 +0000 Subject: arm64: Store struct thread_info in sp_el0 There is need for figuring out how to manage struct thread_info data when IRQ stack is introduced. struct thread_info information should be copied to IRQ stack under the current thread_info calculation logic whenever context switching is invoked. This is too expensive to keep supporting the approach. Instead, this patch pays attention to sp_el0 which is an unused scratch register in EL1 context. sp_el0 utilization not only simplifies the management, but also prevents text section size from being increased largely due to static allocated IRQ stack as removing masking operation using THREAD_SIZE in many places. Reviewed-by: Catalin Marinas Signed-off-by: Jungseok Lee Signed-off-by: James Morse Signed-off-by: Will Deacon --- arch/arm64/include/asm/thread_info.h | 10 ++++++++-- arch/arm64/kernel/entry.S | 15 ++++++++++++--- arch/arm64/kernel/head.S | 5 +++++ arch/arm64/kernel/sleep.S | 3 +++ 4 files changed, 28 insertions(+), 5 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 90c7ff233735..abd64bd1f6d9 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -73,10 +73,16 @@ register unsigned long current_stack_pointer asm ("sp"); */ static inline struct thread_info *current_thread_info(void) __attribute_const__; +/* + * struct thread_info can be accessed directly via sp_el0. + */ static inline struct thread_info *current_thread_info(void) { - return (struct thread_info *) - (current_stack_pointer & ~(THREAD_SIZE - 1)); + unsigned long sp_el0; + + asm ("mrs %0, sp_el0" : "=r" (sp_el0)); + + return (struct thread_info *)sp_el0; } #define thread_saved_pc(tsk) \ diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index e5b25389c48f..245fa6837880 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -88,7 +88,8 @@ .if \el == 0 mrs x21, sp_el0 - get_thread_info tsk // Ensure MDSCR_EL1.SS is clear, + mov tsk, sp + and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear, ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug disable_step_tsk x19, x20 // exceptions when scheduling. .else @@ -107,6 +108,13 @@ str x21, [sp, #S_SYSCALLNO] .endif + /* + * Set sp_el0 to current thread_info. + */ + .if \el == 0 + msr sp_el0, tsk + .endif + /* * Registers that may be useful after this macro is invoked: * @@ -164,8 +172,7 @@ alternative_endif .endm .macro get_thread_info, rd - mov \rd, sp - and \rd, \rd, #~(THREAD_SIZE - 1) // top of stack + mrs \rd, sp_el0 .endm /* @@ -599,6 +606,8 @@ ENTRY(cpu_switch_to) ldp x29, x9, [x8], #16 ldr lr, [x8] mov sp, x9 + and x9, x9, #~(THREAD_SIZE - 1) + msr sp_el0, x9 ret ENDPROC(cpu_switch_to) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 23cfc08fc8ba..b363f340f2c7 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -424,6 +424,9 @@ __mmap_switched: b 1b 2: adr_l sp, initial_sp, x4 + mov x4, sp + and x4, x4, #~(THREAD_SIZE - 1) + msr sp_el0, x4 // Save thread_info str_l x21, __fdt_pointer, x5 // Save FDT pointer str_l x24, memstart_addr, x6 // Save PHYS_OFFSET mov x29, #0 @@ -606,6 +609,8 @@ ENDPROC(secondary_startup) ENTRY(__secondary_switched) ldr x0, [x21] // get secondary_data.stack mov sp, x0 + and x0, x0, #~(THREAD_SIZE - 1) + msr sp_el0, x0 // save thread_info mov x29, #0 b secondary_start_kernel ENDPROC(__secondary_switched) diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index f586f7c875e2..e33fe33876ab 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -173,6 +173,9 @@ ENTRY(cpu_resume) /* load physical address of identity map page table in x1 */ adrp x1, idmap_pg_dir mov sp, x2 + /* save thread_info */ + and x2, x2, #~(THREAD_SIZE - 1) + msr sp_el0, x2 /* * cpu_do_resume expects x0 to contain context physical address * pointer and x1 to contain physical address of 1:1 page tables -- cgit v1.2.3 From 132cd887b5c54758d04bf25c52fa48f45e843a30 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Fri, 4 Dec 2015 11:02:26 +0000 Subject: arm64: Modify stack trace and dump for use with irq_stack This patch allows unwind_frame() to traverse from interrupt stack to task stack correctly. It requires data from a dummy stack frame, created during irq_stack_entry(), added by a later patch. A similar approach is taken to modify dump_backtrace(), which expects to find struct pt_regs underneath any call to functions marked __exception. When on an irq_stack, the struct pt_regs is stored on the old task stack, the location of which is stored in the dummy stack frame. Reviewed-by: Catalin Marinas Signed-off-by: AKASHI Takahiro [james.morse: merged two patches, reworked for per_cpu irq_stacks, and no alignment guarantees, added irq_stack definitions] Signed-off-by: James Morse Signed-off-by: Will Deacon --- arch/arm64/include/asm/irq.h | 32 ++++++++++++++++++++++++++++++++ arch/arm64/kernel/irq.c | 3 +++ arch/arm64/kernel/stacktrace.c | 29 +++++++++++++++++++++++++++-- arch/arm64/kernel/traps.c | 14 +++++++++++++- 4 files changed, 75 insertions(+), 3 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 8e8d30684392..e2f3f135a3bc 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -1,10 +1,32 @@ #ifndef __ASM_IRQ_H #define __ASM_IRQ_H +#define IRQ_STACK_SIZE THREAD_SIZE +#define IRQ_STACK_START_SP THREAD_START_SP + +#ifndef __ASSEMBLER__ + +#include + #include +#include struct pt_regs; +DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); + +/* + * The highest address on the stack, and the first to be used. Used to + * find the dummy-stack frame put down by el?_irq() in entry.S. + */ +#define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) + +/* + * The offset from irq_stack_ptr where entry.S will store the original + * stack pointer. Used by unwind_frame() and dump_backtrace(). + */ +#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x10)); + extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); static inline int nr_legacy_irqs(void) @@ -12,4 +34,14 @@ static inline int nr_legacy_irqs(void) return 0; } +static inline bool on_irq_stack(unsigned long sp, int cpu) +{ + /* variable names the same as kernel/stacktrace.c */ + unsigned long low = (unsigned long)per_cpu(irq_stack, cpu); + unsigned long high = low + IRQ_STACK_START_SP; + + return (low <= sp && sp <= high); +} + +#endif /* !__ASSEMBLER__ */ #endif diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 9f17ec071ee0..1e3cef578e21 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -30,6 +30,9 @@ unsigned long irq_err_count; +/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned */ +DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); + int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_list(p, prec); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index ccb6078ed9f2..b947eeffa5b2 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -20,6 +20,7 @@ #include #include +#include #include /* @@ -39,17 +40,41 @@ int notrace unwind_frame(struct stackframe *frame) { unsigned long high, low; unsigned long fp = frame->fp; + unsigned long irq_stack_ptr; + + /* + * Use raw_smp_processor_id() to avoid false-positives from + * CONFIG_DEBUG_PREEMPT. get_wchan() calls unwind_frame() on sleeping + * task stacks, we can be pre-empted in this case, so + * {raw_,}smp_processor_id() may give us the wrong value. Sleeping + * tasks can't ever be on an interrupt stack, so regardless of cpu, + * the checks will always fail. + */ + irq_stack_ptr = IRQ_STACK_PTR(raw_smp_processor_id()); low = frame->sp; - high = ALIGN(low, THREAD_SIZE); + /* irq stacks are not THREAD_SIZE aligned */ + if (on_irq_stack(frame->sp, raw_smp_processor_id())) + high = irq_stack_ptr; + else + high = ALIGN(low, THREAD_SIZE) - 0x20; - if (fp < low || fp > high - 0x18 || fp & 0xf) + if (fp < low || fp > high || fp & 0xf) return -EINVAL; frame->sp = fp + 0x10; frame->fp = *(unsigned long *)(fp); frame->pc = *(unsigned long *)(fp + 8); + /* + * Check whether we are going to walk through from interrupt stack + * to task stack. + * If we reach the end of the stack - and its an interrupt stack, + * read the original task stack pointer from the dummy frame. + */ + if (frame->sp == irq_stack_ptr) + frame->sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + return 0; } diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index e9b9b5364393..8a0084541f84 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -146,6 +146,7 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; + unsigned long irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); @@ -180,9 +181,20 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) if (ret < 0) break; stack = frame.sp; - if (in_exception_text(where)) + if (in_exception_text(where)) { + /* + * If we switched to the irq_stack before calling this + * exception handler, then the pt_regs will be on the + * task stack. The easiest way to tell is if the large + * pt_regs would overlap with the end of the irq_stack. + */ + if (stack < irq_stack_ptr && + (stack + sizeof(struct pt_regs)) > irq_stack_ptr) + stack = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + dump_mem("", "Exception stack", stack, stack + sizeof(struct pt_regs), false); + } } } -- cgit v1.2.3 From 8e23dacd12a48e58125b84c817da50850b73280a Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 4 Dec 2015 11:02:27 +0000 Subject: arm64: Add do_softirq_own_stack() and enable irq_stacks entry.S is modified to switch to the per_cpu irq_stack during el{0,1}_irq. irq_count is used to detect recursive interrupts on the irq_stack, it is updated late by do_softirq_own_stack(), when called on the irq_stack, before __do_softirq() re-enables interrupts to process softirqs. do_softirq_own_stack() is added by this patch, but does not yet switch stack. This patch adds the dummy stack frame and data needed by the previous stack tracing patches. Reviewed-by: Catalin Marinas Signed-off-by: James Morse Signed-off-by: Will Deacon --- arch/arm64/include/asm/irq.h | 2 ++ arch/arm64/kernel/entry.S | 42 ++++++++++++++++++++++++++++++++++++++++-- arch/arm64/kernel/irq.c | 38 +++++++++++++++++++++++++++++++++++++- 3 files changed, 79 insertions(+), 3 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index e2f3f135a3bc..fa2a8d0e4792 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -11,6 +11,8 @@ #include #include +#define __ARCH_HAS_DO_SOFTIRQ + struct pt_regs; DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 245fa6837880..8f7e737949fe 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -175,6 +176,42 @@ alternative_endif mrs \rd, sp_el0 .endm + .macro irq_stack_entry, dummy_lr + mov x19, sp // preserve the original sp + + adr_l x25, irq_stack + mrs x26, tpidr_el1 + add x25, x25, x26 + + /* + * Check the lowest address on irq_stack for the irq_count value, + * incremented by do_softirq_own_stack if we have re-enabled irqs + * while on the irq_stack. + */ + ldr x26, [x25] + cbnz x26, 9998f // recursive use? + + /* switch to the irq stack */ + mov x26, #IRQ_STACK_START_SP + add x26, x25, x26 + mov sp, x26 + + /* Add a dummy stack frame */ + stp x29, \dummy_lr, [sp, #-16]! // dummy stack frame + mov x29, sp + stp xzr, x19, [sp, #-16]! + +9998: + .endm + + /* + * x19 should be preserved between irq_stack_entry and + * irq_stack_exit. + */ + .macro irq_stack_exit + mov sp, x19 + .endm + /* * These are the registers used in the syscall handler, and allow us to * have in theory up to 7 arguments to a function - x0 to x6. @@ -190,10 +227,11 @@ tsk .req x28 // current thread_info * Interrupt handling. */ .macro irq_handler - adrp x1, handle_arch_irq - ldr x1, [x1, #:lo12:handle_arch_irq] + ldr_l x1, handle_arch_irq mov x0, sp + irq_stack_entry x22 blr x1 + irq_stack_exit .endm .text diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 1e3cef578e21..ff7ebb710e51 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -25,14 +25,24 @@ #include #include #include +#include #include #include unsigned long irq_err_count; -/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned */ +/* + * irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. + * irq_stack[0] is used as irq_count, a non-zero value indicates the stack + * is in use, and el?_irq() shouldn't switch to it. This is used to detect + * recursive use of the irq_stack, it is lazily updated by + * do_softirq_own_stack(), which is called on the irq_stack, before + * re-enabling interrupts to process softirqs. + */ DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); +#define IRQ_COUNT() (*per_cpu(irq_stack, smp_processor_id())) + int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_list(p, prec); @@ -56,3 +66,29 @@ void __init init_IRQ(void) if (!handle_arch_irq) panic("No interrupt controller found."); } + +/* + * do_softirq_own_stack() is called from irq_exit() before __do_softirq() + * re-enables interrupts, at which point we may re-enter el?_irq(). We + * increase irq_count here so that el1_irq() knows that it is already on the + * irq stack. + * + * Called with interrupts disabled, so we don't worry about moving cpu, or + * being interrupted while modifying irq_count. + * + * This function doesn't actually switch stack. + */ +void do_softirq_own_stack(void) +{ + int cpu = smp_processor_id(); + + WARN_ON_ONCE(!irqs_disabled()); + + if (on_irq_stack(current_stack_pointer, cpu)) { + IRQ_COUNT()++; + __do_softirq(); + IRQ_COUNT()--; + } else { + __do_softirq(); + } +} -- cgit v1.2.3 From 7596abf2e5661d52c4f414f37addeed54e098880 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 9 Dec 2015 13:58:42 +0000 Subject: arm64: irq: fix walking from irq stack to task stack Running with CONFIG_DEBUG_SPINLOCK=y can trigger a BUG with the new IRQ stack code: BUG: spinlock lockup suspected on CPU#1 This is due to the IRQ_STACK_TO_TASK_STACK macro incorrectly retrieving the task stack pointer stashed at the top of the IRQ stack. Sayeth James: | Yup, this is what is happening. Its an off-by-one due to broken | thinking about how the stack works. My broken thinking was: | | > top ------------ | > | dummy_lr | <- irq_stack_ptr | > ------------ | > | x29 | | > ------------ | > | x19 | <- irq_stack_ptr - 0x10 | > ------------ | > | xzr | | > ------------ | | But the stack-pointer is decreased before use. So it actually looks | like this: | | > ------------ | > | | <- irq_stack_ptr | > top ------------ | > | dummy_lr | | > ------------ | > | x29 | <- irq_stack_ptr - 0x10 | > ------------ | > | x19 | | > ------------ | > | xzr | <- irq_stack_ptr - 0x20 | > ------------ | | The value being used as the original stack is x29, which in all the | tests is sp but without the current frames data, hence there are no | missing frames in the output. | | Jungseok Lee picked it up with a 32bit user space because aarch32 | can't use x29, so it remains 0 forever. The fix he posted is correct. This patch fixes the macro and adds some of this wisdom to a comment, so that the layout of the IRQ stack is well understood. Cc: James Morse Reported-by: Jungseok Lee Signed-off-by: Will Deacon --- arch/arm64/include/asm/irq.h | 20 ++++++++++++++++++-- arch/arm64/kernel/entry.S | 2 +- 2 files changed, 19 insertions(+), 3 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index fa2a8d0e4792..877c7e358384 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -19,7 +19,23 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); /* * The highest address on the stack, and the first to be used. Used to - * find the dummy-stack frame put down by el?_irq() in entry.S. + * find the dummy-stack frame put down by el?_irq() in entry.S, which + * is structured as follows: + * + * ------------ + * | | <- irq_stack_ptr + * top ------------ + * | elr_el1 | + * ------------ + * | x29 | <- irq_stack_ptr - 0x10 + * ------------ + * | xzr | + * ------------ + * | x19 | <- irq_stack_ptr - 0x20 + * ------------ + * + * where x19 holds a copy of the task stack pointer. + * */ #define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) @@ -27,7 +43,7 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); * The offset from irq_stack_ptr where entry.S will store the original * stack pointer. Used by unwind_frame() and dump_backtrace(). */ -#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x10)); +#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x20)); extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 8f7e737949fe..be7ec544b540 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -199,7 +199,7 @@ alternative_endif /* Add a dummy stack frame */ stp x29, \dummy_lr, [sp, #-16]! // dummy stack frame mov x29, sp - stp xzr, x19, [sp, #-16]! + stp x19, xzr, [sp, #-16]! 9998: .endm -- cgit v1.2.3 From f7d924894265794f447ea799dd853400749b5a22 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 30 Nov 2015 13:28:19 +0100 Subject: arm64/efi: refactor EFI init and runtime code for reuse by 32-bit ARM This refactors the EFI init and runtime code that will be shared between arm64 and ARM so that it can be built for both archs. Reviewed-by: Matt Fleming Signed-off-by: Ard Biesheuvel Signed-off-by: Will Deacon --- arch/arm64/include/asm/efi.h | 9 +++++++ arch/arm64/kernel/efi.c | 23 ++++++++++++++++++ drivers/firmware/efi/arm-init.c | 7 +++--- drivers/firmware/efi/arm-runtime.c | 48 +++++++++++++------------------------- drivers/firmware/efi/efi.c | 2 ++ 5 files changed, 54 insertions(+), 35 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index ef572206f1c3..8e88a696c9cb 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -2,7 +2,9 @@ #define _ASM_EFI_H #include +#include #include +#include #ifdef CONFIG_EFI extern void efi_init(void); @@ -10,6 +12,8 @@ extern void efi_init(void); #define efi_init() #endif +int efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md); + #define efi_call_virt(f, ...) \ ({ \ efi_##f##_t *__f; \ @@ -63,6 +67,11 @@ extern void efi_init(void); * Services are enabled and the EFI_RUNTIME_SERVICES bit set. */ +static inline void efi_set_pgd(struct mm_struct *mm) +{ + switch_mm(NULL, mm, NULL); +} + void efi_virtmap_load(void); void efi_virtmap_unload(void); diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index bd3b2f5adf0c..b6abc852f2a1 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -17,6 +17,29 @@ #include +int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) +{ + pteval_t prot_val; + + /* + * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be + * executable, everything else can be mapped with the XN bits + * set. + */ + if ((md->attribute & EFI_MEMORY_WB) == 0) + prot_val = PROT_DEVICE_nGnRE; + else if (md->type == EFI_RUNTIME_SERVICES_CODE || + !PAGE_ALIGNED(md->phys_addr)) + prot_val = pgprot_val(PAGE_KERNEL_EXEC); + else + prot_val = pgprot_val(PAGE_KERNEL); + + create_pgd_mapping(mm, md->phys_addr, md->virt_addr, + md->num_pages << EFI_PAGE_SHIFT, + __pgprot(prot_val | PTE_NG)); + return 0; +} + static int __init arm64_dmi_init(void) { /* diff --git a/drivers/firmware/efi/arm-init.c b/drivers/firmware/efi/arm-init.c index ffdd76a51929..9e15d571b53c 100644 --- a/drivers/firmware/efi/arm-init.c +++ b/drivers/firmware/efi/arm-init.c @@ -57,7 +57,7 @@ static int __init uefi_init(void) { efi_char16_t *c16; void *config_tables; - u64 table_size; + size_t table_size; char vendor[100] = "unknown"; int i, retval; @@ -69,7 +69,8 @@ static int __init uefi_init(void) } set_bit(EFI_BOOT, &efi.flags); - set_bit(EFI_64BIT, &efi.flags); + if (IS_ENABLED(CONFIG_64BIT)) + set_bit(EFI_64BIT, &efi.flags); /* * Verify the EFI Table @@ -107,7 +108,7 @@ static int __init uefi_init(void) goto out; } retval = efi_config_parse_tables(config_tables, efi.systab->nr_tables, - sizeof(efi_config_table_64_t), NULL); + sizeof(efi_config_table_t), NULL); early_memunmap(config_tables, table_size); out: diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 974743e13a4d..6ae21e41a429 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -12,6 +12,7 @@ */ #include +#include #include #include #include @@ -23,18 +24,14 @@ #include #include -#include -#include #include +#include #include -static pgd_t efi_pgd[PTRS_PER_PGD] __page_aligned_bss; - extern u64 efi_system_table; static struct mm_struct efi_mm = { .mm_rb = RB_ROOT, - .pgd = efi_pgd, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), .mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem), @@ -46,35 +43,27 @@ static bool __init efi_virtmap_init(void) { efi_memory_desc_t *md; + efi_mm.pgd = pgd_alloc(&efi_mm); init_new_context(NULL, &efi_mm); for_each_efi_memory_desc(&memmap, md) { - pgprot_t prot; + phys_addr_t phys = md->phys_addr; + int ret; if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; if (md->virt_addr == 0) return false; - pr_info(" EFI remap 0x%016llx => %p\n", - md->phys_addr, (void *)md->virt_addr); - - /* - * Only regions of type EFI_RUNTIME_SERVICES_CODE need to be - * executable, everything else can be mapped with the XN bits - * set. - */ - if ((md->attribute & EFI_MEMORY_WB) == 0) - prot = __pgprot(PROT_DEVICE_nGnRE); - else if (md->type == EFI_RUNTIME_SERVICES_CODE || - !PAGE_ALIGNED(md->phys_addr)) - prot = PAGE_KERNEL_EXEC; - else - prot = PAGE_KERNEL; - - create_pgd_mapping(&efi_mm, md->phys_addr, md->virt_addr, - md->num_pages << EFI_PAGE_SHIFT, - __pgprot(pgprot_val(prot) | PTE_NG)); + ret = efi_create_mapping(&efi_mm, md); + if (!ret) { + pr_info(" EFI remap %pa => %p\n", + &phys, (void *)(unsigned long)md->virt_addr); + } else { + pr_warn(" EFI remap %pa: failed to create mapping (%d)\n", + &phys, ret); + return false; + } } return true; } @@ -84,7 +73,7 @@ static bool __init efi_virtmap_init(void) * non-early mapping of the UEFI system table and virtual mappings for all * EFI_MEMORY_RUNTIME regions. */ -static int __init arm64_enable_runtime_services(void) +static int __init arm_enable_runtime_services(void) { u64 mapsize; @@ -131,12 +120,7 @@ static int __init arm64_enable_runtime_services(void) return 0; } -early_initcall(arm64_enable_runtime_services); - -static void efi_set_pgd(struct mm_struct *mm) -{ - switch_mm(NULL, mm, NULL); -} +early_initcall(arm_enable_runtime_services); void efi_virtmap_load(void) { diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 027ca212179f..cffa89b3317b 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -25,6 +25,8 @@ #include #include +#include + struct efi __read_mostly efi = { .mps = EFI_INVALID_TABLE_ADDR, .acpi = EFI_INVALID_TABLE_ADDR, -- cgit v1.2.3 From aa4d5d3cbc258c355151a3903211b27359390ec5 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 10 Dec 2015 10:22:39 +0000 Subject: arm64: Add this_cpu_ptr() assembler macro for use in entry.S irq_stack is a per_cpu variable, that needs to be access from entry.S. Use an assembler macro instead of the unreadable details. Signed-off-by: James Morse Signed-off-by: Will Deacon --- arch/arm64/include/asm/assembler.h | 11 +++++++++++ arch/arm64/kernel/entry.S | 4 +--- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 12eff928ef8b..bb7b72734c24 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -193,6 +193,17 @@ lr .req x30 // link register str \src, [\tmp, :lo12:\sym] .endm + /* + * @sym: The name of the per-cpu variable + * @reg: Result of per_cpu(sym, smp_processor_id()) + * @tmp: scratch register + */ + .macro this_cpu_ptr, sym, reg, tmp + adr_l \reg, \sym + mrs \tmp, tpidr_el1 + add \reg, \reg, \tmp + .endm + /* * Annotate a function as position independent, i.e., safe to be called before * the kernel virtual mapping is activated. diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index be7ec544b540..e394f8c9595a 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -179,9 +179,7 @@ alternative_endif .macro irq_stack_entry, dummy_lr mov x19, sp // preserve the original sp - adr_l x25, irq_stack - mrs x26, tpidr_el1 - add x25, x25, x26 + this_cpu_ptr irq_stack, x25, x26 /* * Check the lowest address on irq_stack for the irq_count value, -- cgit v1.2.3 From 9aa4ec1571da62366cfddc20f3b923609604fe63 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 9 Dec 2015 12:44:38 +0000 Subject: arm64: mm: fold alternatives into .init Currently we treat the alternatives separately from other data that's only used during initialisation, using separate .altinstructions and .altinstr_replacement linker sections. These are freed for general allocation separately from .init*. This is problematic as: * We do not remove execute permissions, as we do for .init, leaving the memory executable. * We pad between them, making the kernel Image bianry up to PAGE_SIZE bytes larger than necessary. This patch moves the two sections into the contiguous region used for .init*. This saves some memory, ensures that we remove execute permissions, and allows us to remove some code made redundant by this reorganisation. Signed-off-by: Mark Rutland Cc: Andre Przywara Cc: Catalin Marinas Cc: Jeremy Linton Cc: Laura Abbott Cc: Will Deacon Signed-off-by: Will Deacon --- arch/arm64/include/asm/alternative.h | 1 - arch/arm64/kernel/alternative.c | 6 ------ arch/arm64/kernel/vmlinux.lds.S | 5 ++--- arch/arm64/mm/init.c | 1 - 4 files changed, 2 insertions(+), 11 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index d56ec0715157..e4962f04201e 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -19,7 +19,6 @@ struct alt_instr { void __init apply_alternatives_all(void); void apply_alternatives(void *start, size_t length); -void free_alternatives_memory(void); #define ALTINSTR_ENTRY(feature) \ " .word 661b - .\n" /* label */ \ diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index ab9db0e9818c..d2ee1b21a10d 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -158,9 +158,3 @@ void apply_alternatives(void *start, size_t length) __apply_alternatives(®ion); } - -void free_alternatives_memory(void) -{ - free_reserved_area(__alt_instructions, __alt_instructions_end, - 0, "alternatives"); -} diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index a64340df2448..f943a848f844 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -140,9 +140,6 @@ SECTIONS PERCPU_SECTION(64) - . = ALIGN(PAGE_SIZE); - __init_end = .; - . = ALIGN(4); .altinstructions : { __alt_instructions = .; @@ -154,6 +151,8 @@ SECTIONS } . = ALIGN(PAGE_SIZE); + __init_end = .; + _data = .; _sdata = .; RW_DATA_SECTION(64, PAGE_SIZE, THREAD_SIZE) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 9b8cc673f43c..2ee6c208c318 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -360,7 +360,6 @@ void free_initmem(void) { fixup_init(); free_initmem_default(0); - free_alternatives_memory(); } #ifdef CONFIG_BLK_DEV_INITRD -- cgit v1.2.3 From 4a6ccf30263f4e265c0f171561bf4c40bed5f273 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 10 Dec 2015 16:54:32 +0000 Subject: arm64: cmpxchg: Don't incldue linux/mmdebug.h The arm64 asm/cmpxchg.h includes linux/mmdebug.h but doesn't so far as I can tell actually use anything from it. Removing the inclusion reduces spurious header dependency rebuilds and also avoids issues with recursive inclusions of headers causing build breaks due to attempts to use things before they are defined if linux/mmdebug.h starts pulling in more low level headers. Such errors have happened in -next recently, for example: In file included from include/linux/completion.h:11:0, from include/linux/rcupdate.h:43, from include/linux/tracepoint.h:19, from include/linux/mmdebug.h:6, from ./arch/arm64/include/asm/cmpxchg.h:22, from ./arch/arm64/include/asm/atomic.h:41, from include/linux/atomic.h:4, from include/linux/spinlock.h:406, from include/linux/seqlock.h:35, from include/linux/time.h:5, from include/uapi/linux/timex.h:56, from include/linux/timex.h:56, from include/linux/sched.h:19, from arch/arm64/kernel/asm-offsets.c:21: include/linux/wait.h: In function 'wait_on_atomic_t': include/linux/wait.h:1218:2: error: implicit declaration of function 'atomic_read' [-Werror=implicit-function-declaration] if (atomic_read(val) == 0) Signed-off-by: Mark Brown Signed-off-by: Will Deacon --- arch/arm64/include/asm/cmpxchg.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h index 9ea611ea69df..510c7b404454 100644 --- a/arch/arm64/include/asm/cmpxchg.h +++ b/arch/arm64/include/asm/cmpxchg.h @@ -19,7 +19,6 @@ #define __ASM_CMPXCHG_H #include -#include #include #include -- cgit v1.2.3 From 971c67ce37cfeeaf560e792a2c3bc21d8b67163a Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 15 Dec 2015 11:21:25 +0000 Subject: arm64: reduce stack use in irq_handler The code for switching to irq_stack stores three pieces of information on the stack, fp+lr, as a fake stack frame (that lets us walk back onto the interrupted tasks stack frame), and the address of the struct pt_regs that contains the register values from kernel entry. (which dump_backtrace() will print in any stack trace). To reduce this, we store fp, and the pointer to the struct pt_regs. unwind_frame() can recognise this as the irq_stack dummy frame, (as it only appears at the top of the irq_stack), and use the struct pt_regs values to find the missing interrupted link-register. Suggested-by: Will Deacon Signed-off-by: James Morse Signed-off-by: Will Deacon --- arch/arm64/include/asm/irq.h | 11 ++++------- arch/arm64/kernel/entry.S | 12 +++++++----- arch/arm64/kernel/stacktrace.c | 19 ++++++++++++++++--- 3 files changed, 27 insertions(+), 15 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 877c7e358384..3bece4379bd9 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -25,16 +25,13 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); * ------------ * | | <- irq_stack_ptr * top ------------ - * | elr_el1 | + * | x19 | <- irq_stack_ptr - 0x08 * ------------ * | x29 | <- irq_stack_ptr - 0x10 * ------------ - * | xzr | - * ------------ - * | x19 | <- irq_stack_ptr - 0x20 - * ------------ * - * where x19 holds a copy of the task stack pointer. + * where x19 holds a copy of the task stack pointer where the struct pt_regs + * from kernel_entry can be found. * */ #define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) @@ -43,7 +40,7 @@ DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); * The offset from irq_stack_ptr where entry.S will store the original * stack pointer. Used by unwind_frame() and dump_backtrace(). */ -#define IRQ_STACK_TO_TASK_STACK(ptr) *((unsigned long *)(ptr - 0x20)); +#define IRQ_STACK_TO_TASK_STACK(ptr) (*((unsigned long *)((ptr) - 0x08))) extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 2284c296e3f7..0667fb7d8bb1 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -178,7 +178,7 @@ alternative_endif mrs \rd, sp_el0 .endm - .macro irq_stack_entry, dummy_lr + .macro irq_stack_entry mov x19, sp // preserve the original sp this_cpu_ptr irq_stack, x25, x26 @@ -196,10 +196,12 @@ alternative_endif add x26, x25, x26 mov sp, x26 - /* Add a dummy stack frame */ - stp x29, \dummy_lr, [sp, #-16]! // dummy stack frame + /* + * Add a dummy stack frame, this non-standard format is fixed up + * by unwind_frame() + */ + stp x29, x19, [sp, #-16]! mov x29, sp - stp x19, xzr, [sp, #-16]! 9998: .endm @@ -229,7 +231,7 @@ tsk .req x28 // current thread_info .macro irq_handler ldr_l x1, handle_arch_irq mov x0, sp - irq_stack_entry x22 + irq_stack_entry blr x1 irq_stack_exit .endm diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index d916d5b6aef6..b9fd3a8abfc1 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -70,17 +70,30 @@ int notrace unwind_frame(struct stackframe *frame) * Check whether we are going to walk through from interrupt stack * to task stack. * If we reach the end of the stack - and its an interrupt stack, - * read the original task stack pointer from the dummy frame. + * unpack the dummy frame to find the original elr. * * Check the frame->fp we read from the bottom of the irq_stack, * and the original task stack pointer are both in current->stack. */ if (frame->sp == irq_stack_ptr) { + struct pt_regs *irq_args; unsigned long orig_sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); - if(object_is_on_stack((void *)orig_sp) && - object_is_on_stack((void *)frame->fp)) + if (object_is_on_stack((void *)orig_sp) && + object_is_on_stack((void *)frame->fp)) { frame->sp = orig_sp; + + /* orig_sp is the saved pt_regs, find the elr */ + irq_args = (struct pt_regs *)orig_sp; + frame->pc = irq_args->pc; + } else { + /* + * This frame has a non-standard format, and we + * didn't fix it, because the data looked wrong. + * Refuse to output this frame. + */ + return -EINVAL; + } } return 0; -- cgit v1.2.3 From 0a28714c53fd4f7aea709be7577dfbe0095c8c3e Mon Sep 17 00:00:00 2001 From: Ashok Kumar Date: Thu, 17 Dec 2015 01:38:32 -0800 Subject: arm64: Use PoU cache instr for I/D coherency In systems with three levels of cache(PoU at L1 and PoC at L3), PoC cache flush instructions flushes L2 and L3 caches which could affect performance. For cache flushes for I and D coherency, PoU should suffice. So changing all I and D coherency related cache flushes to PoU. Introduced a new __clean_dcache_area_pou API for dcache flush till PoU and provided a common macro for __flush_dcache_area and __clean_dcache_area_pou. Also, now in __sync_icache_dcache, icache invalidation for non-aliasing VIPT icache is done only for that particular page instead of the earlier __flush_icache_all. Reviewed-by: Catalin Marinas Reviewed-by: Mark Rutland Signed-off-by: Ashok Kumar Signed-off-by: Will Deacon --- arch/arm64/include/asm/cacheflush.h | 1 + arch/arm64/mm/cache.S | 28 +++++++++++++++++----------- arch/arm64/mm/flush.c | 33 ++++++++++++++++++--------------- arch/arm64/mm/proc-macros.S | 22 ++++++++++++++++++++++ 4 files changed, 58 insertions(+), 26 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 54efedaf331f..7fc294c3bc5b 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -68,6 +68,7 @@ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void flush_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(void *addr, size_t len); +extern void __clean_dcache_area_pou(void *addr, size_t len); extern long __flush_cache_user_range(unsigned long start, unsigned long end); static inline void flush_cache_mm(struct mm_struct *mm) diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index cfa44a6adc0a..6df07069a025 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -81,25 +81,31 @@ ENDPROC(__flush_cache_user_range) /* * __flush_dcache_area(kaddr, size) * - * Ensure that the data held in the page kaddr is written back to the - * page in question. + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are cleaned and invalidated to the PoC. * * - kaddr - kernel address * - size - size in question */ ENTRY(__flush_dcache_area) - dcache_line_size x2, x3 - add x1, x0, x1 - sub x3, x2, #1 - bic x0, x0, x3 -1: dc civac, x0 // clean & invalidate D line / unified line - add x0, x0, x2 - cmp x0, x1 - b.lo 1b - dsb sy + dcache_by_line_op civac, sy, x0, x1, x2, x3 ret ENDPIPROC(__flush_dcache_area) +/* + * __clean_dcache_area_pou(kaddr, size) + * + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are cleaned to the PoU. + * + * - kaddr - kernel address + * - size - size in question + */ +ENTRY(__clean_dcache_area_pou) + dcache_by_line_op cvau, ish, x0, x1, x2, x3 + ret +ENDPROC(__clean_dcache_area_pou) + /* * __inval_cache_range(start, end) * - start - start address of region diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index c26b804015e8..46649d6e6c5a 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -34,19 +34,24 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, __flush_icache_all(); } +static void sync_icache_aliases(void *kaddr, unsigned long len) +{ + unsigned long addr = (unsigned long)kaddr; + + if (icache_is_aliasing()) { + __clean_dcache_area_pou(kaddr, len); + __flush_icache_all(); + } else { + flush_icache_range(addr, addr + len); + } +} + static void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, unsigned long uaddr, void *kaddr, unsigned long len) { - if (vma->vm_flags & VM_EXEC) { - unsigned long addr = (unsigned long)kaddr; - if (icache_is_aliasing()) { - __flush_dcache_area(kaddr, len); - __flush_icache_all(); - } else { - flush_icache_range(addr, addr + len); - } - } + if (vma->vm_flags & VM_EXEC) + sync_icache_aliases(kaddr, len); } /* @@ -74,13 +79,11 @@ void __sync_icache_dcache(pte_t pte, unsigned long addr) if (!page_mapping(page)) return; - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) { - __flush_dcache_area(page_address(page), - PAGE_SIZE << compound_order(page)); + if (!test_and_set_bit(PG_dcache_clean, &page->flags)) + sync_icache_aliases(page_address(page), + PAGE_SIZE << compound_order(page)); + else if (icache_is_aivivt()) __flush_icache_all(); - } else if (icache_is_aivivt()) { - __flush_icache_all(); - } } /* diff --git a/arch/arm64/mm/proc-macros.S b/arch/arm64/mm/proc-macros.S index 4c4d93c4bf65..146bd99a7532 100644 --- a/arch/arm64/mm/proc-macros.S +++ b/arch/arm64/mm/proc-macros.S @@ -62,3 +62,25 @@ bfi \valreg, \tmpreg, #TCR_T0SZ_OFFSET, #TCR_TxSZ_WIDTH #endif .endm + +/* + * Macro to perform a data cache maintenance for the interval + * [kaddr, kaddr + size) + * + * op: operation passed to dc instruction + * domain: domain used in dsb instruciton + * kaddr: starting virtual address of the region + * size: size of the region + * Corrupts: kaddr, size, tmp1, tmp2 + */ + .macro dcache_by_line_op op, domain, kaddr, size, tmp1, tmp2 + dcache_line_size \tmp1, \tmp2 + add \size, \kaddr, \size + sub \tmp2, \tmp1, #1 + bic \kaddr, \kaddr, \tmp2 +9998: dc \op, \kaddr + add \kaddr, \kaddr, \tmp1 + cmp \kaddr, \size + b.lo 9998b + dsb \domain + .endm -- cgit v1.2.3 From 66b3923a1a0f77a563b43f43f6ad091354abbfe9 Mon Sep 17 00:00:00 2001 From: David Woods Date: Thu, 17 Dec 2015 14:31:26 -0500 Subject: arm64: hugetlb: add support for PTE contiguous bit The arm64 MMU supports a Contiguous bit which is a hint that the TTE is one of a set of contiguous entries which can be cached in a single TLB entry. Supporting this bit adds new intermediate huge page sizes. The set of huge page sizes available depends on the base page size. Without using contiguous pages the huge page sizes are as follows. 4KB: 2MB 1GB 64KB: 512MB With a 4KB granule, the contiguous bit groups together sets of 16 pages and with a 64KB granule it groups sets of 32 pages. This enables two new huge page sizes in each case, so that the full set of available sizes is as follows. 4KB: 64KB 2MB 32MB 1GB 64KB: 2MB 512MB 16GB If a 16KB granule is used then the contiguous bit groups 128 pages at the PTE level and 32 pages at the PMD level. If the base page size is set to 64KB then 2MB pages are enabled by default. It is possible in the future to make 2MB the default huge page size for both 4KB and 64KB granules. Reviewed-by: Chris Metcalf Reviewed-by: Steve Capper Signed-off-by: David Woods Signed-off-by: Will Deacon --- arch/arm64/Kconfig | 3 - arch/arm64/include/asm/hugetlb.h | 44 ++---- arch/arm64/include/asm/pgtable-hwdef.h | 18 ++- arch/arm64/include/asm/pgtable.h | 10 +- arch/arm64/mm/hugetlbpage.c | 274 ++++++++++++++++++++++++++++++++- include/linux/hugetlb.h | 2 - 6 files changed, 313 insertions(+), 38 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4876459c0838..ffa3c549a4ba 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -530,9 +530,6 @@ config HW_PERF_EVENTS config SYS_SUPPORTS_HUGETLBFS def_bool y -config ARCH_WANT_GENERAL_HUGETLB - def_bool y - config ARCH_WANT_HUGE_PMD_SHARE def_bool y if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index bb4052e85dba..bbc1e35aa601 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -26,36 +26,7 @@ static inline pte_t huge_ptep_get(pte_t *ptep) return *ptep; } -static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - set_pte_at(mm, addr, ptep, pte); -} - -static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) -{ - ptep_clear_flush(vma, addr, ptep); -} - -static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - ptep_set_wrprotect(mm, addr, ptep); -} -static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - return ptep_get_and_clear(mm, addr, ptep); -} - -static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, - pte_t pte, int dirty) -{ - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); -} static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, @@ -97,4 +68,19 @@ static inline void arch_clear_hugepage_flags(struct page *page) clear_bit(PG_dcache_clean, &page->flags); } +extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, + struct page *page, int writable); +#define arch_make_huge_pte arch_make_huge_pte +extern void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty); +extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); +extern void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep); +extern void huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); + #endif /* __ASM_HUGETLB_H */ diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index d6739e836f7b..5c25b831273d 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -90,7 +90,23 @@ /* * Contiguous page definitions. */ -#define CONT_PTES (_AC(1, UL) << CONT_SHIFT) +#ifdef CONFIG_ARM64_64K_PAGES +#define CONT_PTE_SHIFT 5 +#define CONT_PMD_SHIFT 5 +#elif defined(CONFIG_ARM64_16K_PAGES) +#define CONT_PTE_SHIFT 7 +#define CONT_PMD_SHIFT 5 +#else +#define CONT_PTE_SHIFT 4 +#define CONT_PMD_SHIFT 4 +#endif + +#define CONT_PTES (1 << CONT_PTE_SHIFT) +#define CONT_PTE_SIZE (CONT_PTES * PAGE_SIZE) +#define CONT_PTE_MASK (~(CONT_PTE_SIZE - 1)) +#define CONT_PMDS (1 << CONT_PMD_SHIFT) +#define CONT_PMD_SIZE (CONT_PMDS * PMD_SIZE) +#define CONT_PMD_MASK (~(CONT_PMD_SIZE - 1)) /* the the numerical offset of the PTE within a range of CONT_PTES */ #define CONT_RANGE_OFFSET(addr) (((addr)>>PAGE_SHIFT)&(CONT_PTES-1)) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 450b355f3f49..35a318c2fd87 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -227,7 +227,8 @@ static inline pte_t pte_mkspecial(pte_t pte) static inline pte_t pte_mkcont(pte_t pte) { - return set_pte_bit(pte, __pgprot(PTE_CONT)); + pte = set_pte_bit(pte, __pgprot(PTE_CONT)); + return set_pte_bit(pte, __pgprot(PTE_TYPE_PAGE)); } static inline pte_t pte_mknoncont(pte_t pte) @@ -235,6 +236,11 @@ static inline pte_t pte_mknoncont(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_CONT)); } +static inline pmd_t pmd_mkcont(pmd_t pmd) +{ + return __pmd(pmd_val(pmd) | PMD_SECT_CONT); +} + static inline void set_pte(pte_t *ptep, pte_t pte) { *ptep = pte; @@ -304,7 +310,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, /* * Hugetlb definitions. */ -#define HUGE_MAX_HSTATE 2 +#define HUGE_MAX_HSTATE 4 #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1, UL) << HPAGE_SHIFT) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 383b03ff38f8..82d607c3614e 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -41,17 +41,289 @@ int pud_huge(pud_t pud) #endif } +static int find_num_contig(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, size_t *pgsize) +{ + pgd_t *pgd = pgd_offset(mm, addr); + pud_t *pud; + pmd_t *pmd; + + *pgsize = PAGE_SIZE; + if (!pte_cont(pte)) + return 1; + if (!pgd_present(*pgd)) { + VM_BUG_ON(!pgd_present(*pgd)); + return 1; + } + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) { + VM_BUG_ON(!pud_present(*pud)); + return 1; + } + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + VM_BUG_ON(!pmd_present(*pmd)); + return 1; + } + if ((pte_t *)pmd == ptep) { + *pgsize = PMD_SIZE; + return CONT_PMDS; + } + return CONT_PTES; +} + +void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + size_t pgsize; + int i; + int ncontig = find_num_contig(mm, addr, ptep, pte, &pgsize); + unsigned long pfn; + pgprot_t hugeprot; + + if (ncontig == 1) { + set_pte_at(mm, addr, ptep, pte); + return; + } + + pfn = pte_pfn(pte); + hugeprot = __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); + for (i = 0; i < ncontig; i++) { + pr_debug("%s: set pte %p to 0x%llx\n", __func__, ptep, + pte_val(pfn_pte(pfn, hugeprot))); + set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + ptep++; + pfn += pgsize >> PAGE_SHIFT; + addr += pgsize; + } +} + +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgd; + pud_t *pud; + pte_t *pte = NULL; + + pr_debug("%s: addr:0x%lx sz:0x%lx\n", __func__, addr, sz); + pgd = pgd_offset(mm, addr); + pud = pud_alloc(mm, pgd, addr); + if (!pud) + return NULL; + + if (sz == PUD_SIZE) { + pte = (pte_t *)pud; + } else if (sz == (PAGE_SIZE * CONT_PTES)) { + pmd_t *pmd = pmd_alloc(mm, pud, addr); + + WARN_ON(addr & (sz - 1)); + /* + * Note that if this code were ever ported to the + * 32-bit arm platform then it will cause trouble in + * the case where CONFIG_HIGHPTE is set, since there + * will be no pte_unmap() to correspond with this + * pte_alloc_map(). + */ + pte = pte_alloc_map(mm, NULL, pmd, addr); + } else if (sz == PMD_SIZE) { + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && + pud_none(*pud)) + pte = huge_pmd_share(mm, addr, pud); + else + pte = (pte_t *)pmd_alloc(mm, pud, addr); + } else if (sz == (PMD_SIZE * CONT_PMDS)) { + pmd_t *pmd; + + pmd = pmd_alloc(mm, pud, addr); + WARN_ON(addr & (sz - 1)); + return (pte_t *)pmd; + } + + pr_debug("%s: addr:0x%lx sz:0x%lx ret pte=%p/0x%llx\n", __func__, addr, + sz, pte, pte_val(*pte)); + return pte; +} + +pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + pte_t *pte = NULL; + + pgd = pgd_offset(mm, addr); + pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd); + if (!pgd_present(*pgd)) + return NULL; + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + return NULL; + + if (pud_huge(*pud)) + return (pte_t *)pud; + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return NULL; + + if (pte_cont(pmd_pte(*pmd))) { + pmd = pmd_offset( + pud, (addr & CONT_PMD_MASK)); + return (pte_t *)pmd; + } + if (pmd_huge(*pmd)) + return (pte_t *)pmd; + pte = pte_offset_kernel(pmd, addr); + if (pte_present(*pte) && pte_cont(*pte)) { + pte = pte_offset_kernel( + pmd, (addr & CONT_PTE_MASK)); + return pte; + } + return NULL; +} + +pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, + struct page *page, int writable) +{ + size_t pagesize = huge_page_size(hstate_vma(vma)); + + if (pagesize == CONT_PTE_SIZE) { + entry = pte_mkcont(entry); + } else if (pagesize == CONT_PMD_SIZE) { + entry = pmd_pte(pmd_mkcont(pte_pmd(entry))); + } else if (pagesize != PUD_SIZE && pagesize != PMD_SIZE) { + pr_warn("%s: unrecognized huge page size 0x%lx\n", + __func__, pagesize); + } + return entry; +} + +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte; + + if (pte_cont(*ptep)) { + int ncontig, i; + size_t pgsize; + pte_t *cpte; + bool is_dirty = false; + + cpte = huge_pte_offset(mm, addr); + ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + /* save the 1st pte to return */ + pte = ptep_get_and_clear(mm, addr, cpte); + for (i = 1; i < ncontig; ++i) { + /* + * If HW_AFDBM is enabled, then the HW could + * turn on the dirty bit for any of the page + * in the set, so check them all. + */ + ++cpte; + if (pte_dirty(ptep_get_and_clear(mm, addr, cpte))) + is_dirty = true; + } + if (is_dirty) + return pte_mkdirty(pte); + else + return pte; + } else { + return ptep_get_and_clear(mm, addr, ptep); + } +} + +int huge_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty) +{ + pte_t *cpte; + + if (pte_cont(pte)) { + int ncontig, i, changed = 0; + size_t pgsize = 0; + unsigned long pfn = pte_pfn(pte); + /* Select all bits except the pfn */ + pgprot_t hugeprot = + __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ + pte_val(pte)); + + cpte = huge_pte_offset(vma->vm_mm, addr); + pfn = pte_pfn(*cpte); + ncontig = find_num_contig(vma->vm_mm, addr, cpte, + *cpte, &pgsize); + for (i = 0; i < ncontig; ++i, ++cpte) { + changed = ptep_set_access_flags(vma, addr, cpte, + pfn_pte(pfn, + hugeprot), + dirty); + pfn += pgsize >> PAGE_SHIFT; + } + return changed; + } else { + return ptep_set_access_flags(vma, addr, ptep, pte, dirty); + } +} + +void huge_ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + if (pte_cont(*ptep)) { + int ncontig, i; + pte_t *cpte; + size_t pgsize = 0; + + cpte = huge_pte_offset(mm, addr); + ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); + for (i = 0; i < ncontig; ++i, ++cpte) + ptep_set_wrprotect(mm, addr, cpte); + } else { + ptep_set_wrprotect(mm, addr, ptep); + } +} + +void huge_ptep_clear_flush(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + if (pte_cont(*ptep)) { + int ncontig, i; + pte_t *cpte; + size_t pgsize = 0; + + cpte = huge_pte_offset(vma->vm_mm, addr); + ncontig = find_num_contig(vma->vm_mm, addr, cpte, + *cpte, &pgsize); + for (i = 0; i < ncontig; ++i, ++cpte) + ptep_clear_flush(vma, addr, cpte); + } else { + ptep_clear_flush(vma, addr, ptep); + } +} + static __init int setup_hugepagesz(char *opt) { unsigned long ps = memparse(opt, &opt); + if (ps == PMD_SIZE) { hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); } else if (ps == PUD_SIZE) { hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + } else if (ps == (PAGE_SIZE * CONT_PTES)) { + hugetlb_add_hstate(CONT_PTE_SHIFT); + } else if (ps == (PMD_SIZE * CONT_PMDS)) { + hugetlb_add_hstate((PMD_SHIFT + CONT_PMD_SHIFT) - PAGE_SHIFT); } else { - pr_err("hugepagesz: Unsupported page size %lu M\n", ps >> 20); + pr_err("hugepagesz: Unsupported page size %lu K\n", ps >> 10); return 0; } return 1; } __setup("hugepagesz=", setup_hugepagesz); + +#ifdef CONFIG_ARM64_64K_PAGES +static __init int add_default_hugepagesz(void) +{ + if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL) + hugetlb_add_hstate(CONT_PMD_SHIFT); + return 0; +} +arch_initcall(add_default_hugepagesz); +#endif diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 685c262e0be8..b0eb06423d5e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -96,9 +96,7 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, struct address_space *mapping, pgoff_t idx, unsigned long address); -#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); -#endif extern int hugepages_treat_as_movable; extern int sysctl_hugetlb_shm_group; -- cgit v1.2.3 From d224a69e3d80fe08f285d1f41d21b590bae4fa9f Mon Sep 17 00:00:00 2001 From: James Morse Date: Fri, 18 Dec 2015 16:01:47 +0000 Subject: arm64: remove irq_count and do_softirq_own_stack() sysrq_handle_reboot() re-enables interrupts while on the irq stack. The irq_stack implementation wrongly assumed this would only ever happen via the softirq path, allowing it to update irq_count late, in do_softirq_own_stack(). This means if an irq occurs in sysrq_handle_reboot(), during emergency_restart() the stack will be corrupted, as irq_count wasn't updated. Lose the optimisation, and instead of moving the adding/subtracting of irq_count into irq_stack_entry/irq_stack_exit, remove it, and compare sp_el0 (struct thread_info) with sp & ~(THREAD_SIZE - 1). This tells us if we are on a task stack, if so, we can safely switch to the irq stack. Finally, remove do_softirq_own_stack(), we don't need it anymore. Reported-by: Will Deacon Signed-off-by: James Morse [will: use get_thread_info macro] Signed-off-by: Will Deacon --- arch/arm64/include/asm/irq.h | 2 -- arch/arm64/kernel/entry.S | 19 ++++++++++--------- arch/arm64/kernel/irq.c | 38 +------------------------------------- 3 files changed, 11 insertions(+), 48 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/irq.h b/arch/arm64/include/asm/irq.h index 3bece4379bd9..b77197d941fc 100644 --- a/arch/arm64/include/asm/irq.h +++ b/arch/arm64/include/asm/irq.h @@ -11,8 +11,6 @@ #include #include -#define __ARCH_HAS_DO_SOFTIRQ - struct pt_regs; DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 0667fb7d8bb1..c0db321db7e1 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -181,19 +181,20 @@ alternative_endif .macro irq_stack_entry mov x19, sp // preserve the original sp - this_cpu_ptr irq_stack, x25, x26 - /* - * Check the lowest address on irq_stack for the irq_count value, - * incremented by do_softirq_own_stack if we have re-enabled irqs - * while on the irq_stack. + * Compare sp with the current thread_info, if the top + * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and + * should switch to the irq stack. */ - ldr x26, [x25] - cbnz x26, 9998f // recursive use? + and x25, x19, #~(THREAD_SIZE - 1) + cmp x25, tsk + b.ne 9998f - /* switch to the irq stack */ + this_cpu_ptr irq_stack, x25, x26 mov x26, #IRQ_STACK_START_SP add x26, x25, x26 + + /* switch to the irq stack */ mov sp, x26 /* @@ -405,10 +406,10 @@ el1_irq: bl trace_hardirqs_off #endif + get_thread_info tsk irq_handler #ifdef CONFIG_PREEMPT - get_thread_info tsk ldr w24, [tsk, #TI_PREEMPT] // get preempt count cbnz w24, 1f // preempt count != 0 ldr x0, [tsk, #TI_FLAGS] // get flags diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index ff7ebb710e51..2386b26c0712 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -25,24 +25,14 @@ #include #include #include -#include #include #include unsigned long irq_err_count; -/* - * irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. - * irq_stack[0] is used as irq_count, a non-zero value indicates the stack - * is in use, and el?_irq() shouldn't switch to it. This is used to detect - * recursive use of the irq_stack, it is lazily updated by - * do_softirq_own_stack(), which is called on the irq_stack, before - * re-enabling interrupts to process softirqs. - */ +/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. */ DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); -#define IRQ_COUNT() (*per_cpu(irq_stack, smp_processor_id())) - int arch_show_interrupts(struct seq_file *p, int prec) { show_ipi_list(p, prec); @@ -66,29 +56,3 @@ void __init init_IRQ(void) if (!handle_arch_irq) panic("No interrupt controller found."); } - -/* - * do_softirq_own_stack() is called from irq_exit() before __do_softirq() - * re-enables interrupts, at which point we may re-enter el?_irq(). We - * increase irq_count here so that el1_irq() knows that it is already on the - * irq stack. - * - * Called with interrupts disabled, so we don't worry about moving cpu, or - * being interrupted while modifying irq_count. - * - * This function doesn't actually switch stack. - */ -void do_softirq_own_stack(void) -{ - int cpu = smp_processor_id(); - - WARN_ON_ONCE(!irqs_disabled()); - - if (on_irq_stack(current_stack_pointer, cpu)) { - IRQ_COUNT()++; - __do_softirq(); - IRQ_COUNT()--; - } else { - __do_softirq(); - } -} -- cgit v1.2.3 From fe13f95b720075327a761fe6ddb45b0c90cab504 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Tue, 15 Dec 2015 17:33:40 +0900 Subject: arm64: pass a task parameter to unwind_frame() Function graph tracer modifies a return address (LR) in a stack frame to hook a function's return. This will result in many useless entries (return_to_handler) showing up in a call stack list. We will fix this problem in a later patch ("arm64: ftrace: fix a stack tracer's output under function graph tracer"). But since real return addresses are saved in ret_stack[] array in struct task_struct, unwind functions need to be notified of, in addition to a stack pointer address, which task is being traced in order to find out real return addresses. This patch extends unwind functions' interfaces by adding an extra argument of a pointer to task_struct. Signed-off-by: AKASHI Takahiro Signed-off-by: Will Deacon --- arch/arm64/include/asm/stacktrace.h | 6 ++++-- arch/arm64/kernel/perf_callchain.c | 2 +- arch/arm64/kernel/process.c | 2 +- arch/arm64/kernel/return_address.c | 2 +- arch/arm64/kernel/stacktrace.c | 8 ++++---- arch/arm64/kernel/time.c | 2 +- arch/arm64/kernel/traps.c | 2 +- 7 files changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 7318f6d54aa9..6fb61c5090b4 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -16,14 +16,16 @@ #ifndef __ASM_STACKTRACE_H #define __ASM_STACKTRACE_H +struct task_struct; + struct stackframe { unsigned long fp; unsigned long sp; unsigned long pc; }; -extern int unwind_frame(struct stackframe *frame); -extern void walk_stackframe(struct stackframe *frame, +extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); +extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data); #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 3aa74830cc69..797220da912b 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -165,7 +165,7 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, frame.sp = regs->sp; frame.pc = regs->pc; - walk_stackframe(&frame, callchain_trace, entry); + walk_stackframe(current, &frame, callchain_trace, entry); } unsigned long perf_instruction_pointer(struct pt_regs *regs) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index f75b540bc3b4..98bf5461d4b6 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -348,7 +348,7 @@ unsigned long get_wchan(struct task_struct *p) do { if (frame.sp < stack_page || frame.sp >= stack_page + THREAD_SIZE || - unwind_frame(&frame)) + unwind_frame(p, &frame)) return 0; if (!in_sched_functions(frame.pc)) return frame.pc; diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 6c4fd2810ecb..07b37ac05be4 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -44,7 +44,7 @@ void *return_address(unsigned int level) frame.sp = current_stack_pointer; frame.pc = (unsigned long)return_address; /* dummy */ - walk_stackframe(&frame, save_return_addr, &data); + walk_stackframe(current, &frame, save_return_addr, &data); if (!data.level) return data.addr; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index b9fd3a8abfc1..f7ee597ec883 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -36,7 +36,7 @@ * ldp x29, x30, [sp] * add sp, sp, #0x10 */ -int notrace unwind_frame(struct stackframe *frame) +int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) { unsigned long high, low; unsigned long fp = frame->fp; @@ -99,7 +99,7 @@ int notrace unwind_frame(struct stackframe *frame) return 0; } -void notrace walk_stackframe(struct stackframe *frame, +void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data) { while (1) { @@ -107,7 +107,7 @@ void notrace walk_stackframe(struct stackframe *frame, if (fn(frame, data)) break; - ret = unwind_frame(frame); + ret = unwind_frame(tsk, frame); if (ret < 0) break; } @@ -159,7 +159,7 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) frame.pc = (unsigned long)save_stack_trace_tsk; } - walk_stackframe(&frame, save_trace, &data); + walk_stackframe(tsk, &frame, save_trace, &data); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 13339b6ffc1a..6e5c521f123a 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -53,7 +53,7 @@ unsigned long profile_pc(struct pt_regs *regs) frame.sp = regs->sp; frame.pc = regs->pc; do { - int ret = unwind_frame(&frame); + int ret = unwind_frame(NULL, &frame); if (ret < 0) return 0; } while (in_lock_functions(frame.pc)); diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 8a0084541f84..937008523fa5 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -177,7 +177,7 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) int ret; dump_backtrace_entry(where); - ret = unwind_frame(&frame); + ret = unwind_frame(tsk, &frame); if (ret < 0) break; stack = frame.sp; -- cgit v1.2.3 From 20380bb390a443b2c5c8800cec59743faf8151b4 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Tue, 15 Dec 2015 17:33:41 +0900 Subject: arm64: ftrace: fix a stack tracer's output under function graph tracer Function graph tracer modifies a return address (LR) in a stack frame to hook a function return. This will result in many useless entries (return_to_handler) showing up in a) a stack tracer's output b) perf call graph (with perf record -g) c) dump_backtrace (at panic et al.) For example, in case of a), $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ echo 1 > /proc/sys/kernel/stack_trace_enabled $ cat /sys/kernel/debug/tracing/stack_trace Depth Size Location (54 entries) ----- ---- -------- 0) 4504 16 gic_raise_softirq+0x28/0x150 1) 4488 80 smp_cross_call+0x38/0xb8 2) 4408 48 return_to_handler+0x0/0x40 3) 4360 32 return_to_handler+0x0/0x40 ... In case of b), $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ perf record -e mem:XXX:x -ag -- sleep 10 $ perf report ... | | |--0.22%-- 0x550f8 | | | 0x10888 | | | el0_svc_naked | | | sys_openat | | | return_to_handler | | | return_to_handler ... In case of c), $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ echo c > /proc/sysrq-trigger ... Call trace: [] sysrq_handle_crash+0x24/0x30 [] return_to_handler+0x0/0x40 [] return_to_handler+0x0/0x40 ... This patch replaces such entries with real addresses preserved in current->ret_stack[] at unwind_frame(). This way, we can cover all the cases. Reviewed-by: Jungseok Lee Signed-off-by: AKASHI Takahiro [will: fixed minor context changes conflicting with irq stack bits] Signed-off-by: Will Deacon --- arch/arm64/include/asm/ftrace.h | 2 ++ arch/arm64/include/asm/stacktrace.h | 3 +++ arch/arm64/kernel/perf_callchain.c | 3 +++ arch/arm64/kernel/process.c | 3 +++ arch/arm64/kernel/return_address.c | 3 +++ arch/arm64/kernel/stacktrace.c | 17 +++++++++++++++++ arch/arm64/kernel/time.c | 3 +++ arch/arm64/kernel/traps.c | 26 ++++++++++++++++++++------ 8 files changed, 54 insertions(+), 6 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h index c5534facf941..3c60f37e48ab 100644 --- a/arch/arm64/include/asm/ftrace.h +++ b/arch/arm64/include/asm/ftrace.h @@ -28,6 +28,8 @@ struct dyn_arch_ftrace { extern unsigned long ftrace_graph_call; +extern void return_to_handler(void); + static inline unsigned long ftrace_call_adjust(unsigned long addr) { /* diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 6fb61c5090b4..801a16dbbdf6 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -22,6 +22,9 @@ struct stackframe { unsigned long fp; unsigned long sp; unsigned long pc; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + unsigned int graph; +#endif }; extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame); diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c index 797220da912b..ff4665462a02 100644 --- a/arch/arm64/kernel/perf_callchain.c +++ b/arch/arm64/kernel/perf_callchain.c @@ -164,6 +164,9 @@ void perf_callchain_kernel(struct perf_callchain_entry *entry, frame.fp = regs->regs[29]; frame.sp = regs->sp; frame.pc = regs->pc; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = current->curr_ret_stack; +#endif walk_stackframe(current, &frame, callchain_trace, entry); } diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 98bf5461d4b6..88d742ba19d5 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -344,6 +344,9 @@ unsigned long get_wchan(struct task_struct *p) frame.fp = thread_saved_fp(p); frame.sp = thread_saved_sp(p); frame.pc = thread_saved_pc(p); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = p->curr_ret_stack; +#endif stack_page = (unsigned long)task_stack_page(p); do { if (frame.sp < stack_page || diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 07b37ac05be4..1718706fde83 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -43,6 +43,9 @@ void *return_address(unsigned int level) frame.fp = (unsigned long)__builtin_frame_address(0); frame.sp = current_stack_pointer; frame.pc = (unsigned long)return_address; /* dummy */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = current->curr_ret_stack; +#endif walk_stackframe(current, &frame, save_return_addr, &data); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index f7ee597ec883..4fad9787ab46 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -17,6 +17,7 @@ */ #include #include +#include #include #include @@ -66,6 +67,19 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) frame->fp = *(unsigned long *)(fp); frame->pc = *(unsigned long *)(fp + 8); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (tsk && tsk->ret_stack && + (frame->pc == (unsigned long)return_to_handler)) { + /* + * This is a case where function graph tracer has + * modified a return address (LR) in a stack frame + * to hook a function return. + * So replace it to an original value. + */ + frame->pc = tsk->ret_stack[frame->graph--].ret; + } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Check whether we are going to walk through from interrupt stack * to task stack. @@ -158,6 +172,9 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) frame.sp = current_stack_pointer; frame.pc = (unsigned long)save_stack_trace_tsk; } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = tsk->curr_ret_stack; +#endif walk_stackframe(tsk, &frame, save_trace, &data); if (trace->nr_entries < trace->max_entries) diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 6e5c521f123a..59779699a1a4 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -52,6 +52,9 @@ unsigned long profile_pc(struct pt_regs *regs) frame.fp = regs->regs[29]; frame.sp = regs->sp; frame.pc = regs->pc; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = -1; /* no task info */ +#endif do { int ret = unwind_frame(NULL, &frame); if (ret < 0) diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 937008523fa5..bdc293f6adc4 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -147,17 +147,14 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; unsigned long irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); + int skip; pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); if (!tsk) tsk = current; - if (regs) { - frame.fp = regs->regs[29]; - frame.sp = regs->sp; - frame.pc = regs->pc; - } else if (tsk == current) { + if (tsk == current) { frame.fp = (unsigned long)__builtin_frame_address(0); frame.sp = current_stack_pointer; frame.pc = (unsigned long)dump_backtrace; @@ -169,14 +166,31 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) frame.sp = thread_saved_sp(tsk); frame.pc = thread_saved_pc(tsk); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame.graph = tsk->curr_ret_stack; +#endif + skip = !!regs; pr_emerg("Call trace:\n"); while (1) { unsigned long where = frame.pc; unsigned long stack; int ret; - dump_backtrace_entry(where); + /* skip until specified stack frame */ + if (!skip) { + dump_backtrace_entry(where); + } else if (frame.fp == regs->regs[29]) { + skip = 0; + /* + * Mostly, this is the case where this function is + * called in panic/abort. As exception handler's + * stack frame does not contain the corresponding pc + * at which an exception has taken place, use regs->pc + * instead. + */ + dump_backtrace_entry(regs->pc); + } ret = unwind_frame(tsk, &frame); if (ret < 0) break; -- cgit v1.2.3 From 39b5be9b4233a9f212b98242bddf008f379b5122 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 5 Jan 2016 15:36:59 +0000 Subject: arm64: mm: move pgd_cache initialisation to pgtable_cache_init Initialising the suppport for EFI runtime services requires us to allocate a pgd off the back of an early_initcall. On systems where the PGD_SIZE is smaller than PAGE_SIZE (e.g. 64k pages and 48-bit VA), the pgd_cache isn't initialised at this stage, and we panic with a NULL dereference during boot: Unable to handle kernel NULL pointer dereference at virtual address 00000000 __create_mapping.isra.5+0x84/0x350 create_pgd_mapping+0x20/0x28 efi_create_mapping+0x5c/0x6c arm_enable_runtime_services+0x154/0x1e4 do_one_initcall+0x8c/0x190 kernel_init_freeable+0x84/0x1ec kernel_init+0x10/0xe0 ret_from_fork+0x10/0x50 This patch fixes the problem by initialising the pgd_cache earlier, in the pgtable_cache_init callback, which sounds suspiciously like what it was intended for. Reported-by: Dennis Chen Signed-off-by: Will Deacon --- arch/arm64/include/asm/pgtable.h | 3 ++- arch/arm64/mm/pgd.c | 12 ++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/arm64/include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 35a318c2fd87..a87e964d2791 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -676,7 +676,8 @@ extern int kern_addr_valid(unsigned long addr); #include -#define pgtable_cache_init() do { } while (0) +void pgd_cache_init(void); +#define pgtable_cache_init pgd_cache_init /* * On AArch64, the cache coherency is handled via the set_pte_at() function. diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index cb3ba1b812e7..ae11d4e03d0e 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -46,14 +46,14 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) kmem_cache_free(pgd_cache, pgd); } -static int __init pgd_cache_init(void) +void __init pgd_cache_init(void) { + if (PGD_SIZE == PAGE_SIZE) + return; + /* * Naturally aligned pgds required by the architecture. */ - if (PGD_SIZE != PAGE_SIZE) - pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_SIZE, - SLAB_PANIC, NULL); - return 0; + pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_SIZE, + SLAB_PANIC, NULL); } -core_initcall(pgd_cache_init); -- cgit v1.2.3