author		Marc Zyngier <maz@kernel.org>	2022-07-27 19:33:27 +0200
committer	Marc Zyngier <maz@kernel.org>	2022-07-27 19:33:27 +0200
commit		0982c8d859f8f7022b9fd44d421c7ec721bb41f9
tree		f6f9cb6622374da505eb728ab88e7a5f91c93583 /arch/arm64
parent		Merge branch kvm-arm64/sysreg-cleanup-5.20 into kvmarm-master/next
parent		arm64: Update 'unwinder howto'
Merge branch kvm-arm64/nvhe-stacktrace into kvmarm-master/next
* kvm-arm64/nvhe-stacktrace: (27 commits)
: .
: Add an overflow stack to the nVHE EL2 code, allowing
: the implementation of an unwinder, courtesy of
: Kalesh Singh. From the cover letter (slightly edited):
:
: "nVHE has two modes of operation: protected (pKVM) and unprotected
: (conventional nVHE). Depending on the mode, a slightly different approach
: is used to dump the hypervisor stacktrace, but the core unwinding logic
: remains the same.
:
: * Protected nVHE (pKVM) stacktraces:
:
: In protected nVHE mode, the host cannot directly access hypervisor memory.
:
: The hypervisor stack unwinding happens in EL2 and is made accessible to
: the host via a shared buffer. Symbolizing and printing the stacktrace
: addresses is delegated to the host and happens in EL1.
:
: * Non-protected (Conventional) nVHE stacktraces:
:
: In non-protected mode, the host is able to directly access the hypervisor
: stack pages.
:
: The hypervisor stack unwinding and dumping of the stacktrace are performed
: by the host in EL1, as this avoids the memory overhead of setting up
: shared buffers between the host and hypervisor."
:
: Additional patches from Oliver Upton and Marc Zyngier, tidying up
: the initial series.
: .
arm64: Update 'unwinder howto'
KVM: arm64: Don't open code ARRAY_SIZE()
KVM: arm64: Move nVHE-only helpers into kvm/stacktrace.c
KVM: arm64: Make unwind()/on_accessible_stack() per-unwinder functions
KVM: arm64: Move nVHE stacktrace unwinding into its own compilation unit
KVM: arm64: Move PROTECTED_NVHE_STACKTRACE around
KVM: arm64: Introduce pkvm_dump_backtrace()
KVM: arm64: Implement protected nVHE hyp stack unwinder
KVM: arm64: Save protected-nVHE (pKVM) hyp stacktrace
KVM: arm64: Stub implementation of pKVM HYP stack unwinder
KVM: arm64: Allocate shared pKVM hyp stacktrace buffers
KVM: arm64: Add PROTECTED_NVHE_STACKTRACE Kconfig
KVM: arm64: Introduce hyp_dump_backtrace()
KVM: arm64: Implement non-protected nVHE hyp stack unwinder
KVM: arm64: Prepare non-protected nVHE hypervisor stacktrace
KVM: arm64: Stub implementation of non-protected nVHE HYP stack unwinder
KVM: arm64: On stack overflow switch to hyp overflow_stack
arm64: stacktrace: Add description of stacktrace/common.h
arm64: stacktrace: Factor out common unwind()
arm64: stacktrace: Handle frame pointer from different address spaces
...
Signed-off-by: Marc Zyngier <maz@kernel.org>
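
The description above comes down to one host-side dispatch once the hypervisor
has panicked: in pKVM the host replays a stacktrace buffer that EL2 already
filled, while in conventional nVHE the host unwinds the hypervisor stack
itself. A minimal sketch of that split, condensed from the
arch/arm64/kvm/stacktrace.c hunk in the diff below (the two helpers are static
functions defined in that file, so this is an illustration rather than a
drop-in unit):

    /* Condensed illustration of the host-side dispatch; not a drop-in file. */
    #include <linux/kvm_host.h>
    #include <asm/stacktrace/nvhe.h>

    void kvm_nvhe_dump_backtrace(unsigned long hyp_offset)
    {
    	if (is_protected_kvm_enabled())
    		pkvm_dump_backtrace(hyp_offset);	/* read the buffer EL2 filled */
    	else
    		hyp_dump_backtrace(hyp_offset);		/* unwind the hyp stack from EL1 */
    }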
Diffstat (limited to 'arch/arm64')
 arch/arm64/include/asm/kvm_asm.h            |  16
 arch/arm64/include/asm/memory.h             |   8
 arch/arm64/include/asm/stacktrace.h         |  62
 arch/arm64/include/asm/stacktrace/common.h  | 199
 arch/arm64/include/asm/stacktrace/nvhe.h    |  55
 arch/arm64/kernel/Makefile                  |   5
 arch/arm64/kernel/stacktrace.c              | 184
 arch/arm64/kvm/Kconfig                      |  13
 arch/arm64/kvm/Makefile                     |   2
 arch/arm64/kvm/arm.c                        |   2
 arch/arm64/kvm/handle_exit.c                |   4
 arch/arm64/kvm/hyp/nvhe/Makefile            |   2
 arch/arm64/kvm/hyp/nvhe/host.S              |   9
 arch/arm64/kvm/hyp/nvhe/stacktrace.c        | 160
 arch/arm64/kvm/hyp/nvhe/switch.c            |   6
 arch/arm64/kvm/stacktrace.c                 | 218
 16 files changed, 775 insertions(+), 170 deletions(-)
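
Both unwinders walk standard AArch64 frame records: each record is two longs,
the saved x29 (pointing at the caller's record) followed by the saved x30 (the
return address), which is why unwind_next_common() in the new
stacktrace/common.h reads *(fp) and *(fp + 8). A deliberately simplified
sketch of that walk, with the stack-range, nesting and FP-translation checks
of the real code omitted:

    /*
     * Illustrative only: the real unwind_next_common() also checks that the
     * record lies on an accessible stack, that stack nesting is monotonic,
     * and optionally translates the fp to a kernel VA before dereferencing.
     */
    struct frame_record {
    	unsigned long fp;	/* saved x29: address of the caller's record */
    	unsigned long lr;	/* saved x30: return address */
    };

    static void walk_frames(unsigned long fp, bool (*consume)(unsigned long pc))
    {
    	while (fp && !(fp & 0x7)) {		/* records are 8-byte aligned */
    		struct frame_record *rec = (struct frame_record *)fp;

    		if (!consume(rec->lr))
    			break;
    		fp = rec->fp;			/* step out to the caller */
    	}
    }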
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 2e277f2ed671..53035763e48e 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -176,6 +176,22 @@ struct kvm_nvhe_init_params { unsigned long vtcr; }; +/* + * Used by the host in EL1 to dump the nVHE hypervisor backtrace on + * hyp_panic() in non-protected mode. + * + * @stack_base: hyp VA of the hyp_stack base. + * @overflow_stack_base: hyp VA of the hyp_overflow_stack base. + * @fp: hyp FP where the backtrace begins. + * @pc: hyp PC where the backtrace begins. + */ +struct kvm_nvhe_stacktrace_info { + unsigned long stack_base; + unsigned long overflow_stack_base; + unsigned long fp; + unsigned long pc; +}; + /* Translate a kernel address @ptr into its equivalent linear mapping */ #define kvm_ksym_ref(ptr) \ ({ \ diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 0af70d9abede..cab80a9a4086 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -114,6 +114,14 @@ #define OVERFLOW_STACK_SIZE SZ_4K /* + * With the minimum frame size of [x29, x30], exactly half the combined + * sizes of the hyp and overflow stacks is the maximum size needed to + * save the unwinded stacktrace; plus an additional entry to delimit the + * end. + */ +#define NVHE_STACKTRACE_SIZE ((OVERFLOW_STACK_SIZE + PAGE_SIZE) / 2 + sizeof(long)) + +/* * Alignment of kernel segments (e.g. .text, .data). * * 4 KB granule: 16 level 3 entries, with contiguous bit diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index aec9315bf156..6ebdcdff77f5 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -8,52 +8,20 @@ #include <linux/percpu.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> -#include <linux/types.h> #include <linux/llist.h> #include <asm/memory.h> +#include <asm/pointer_auth.h> #include <asm/ptrace.h> #include <asm/sdei.h> -enum stack_type { - STACK_TYPE_UNKNOWN, - STACK_TYPE_TASK, - STACK_TYPE_IRQ, - STACK_TYPE_OVERFLOW, - STACK_TYPE_SDEI_NORMAL, - STACK_TYPE_SDEI_CRITICAL, - __NR_STACK_TYPES -}; - -struct stack_info { - unsigned long low; - unsigned long high; - enum stack_type type; -}; +#include <asm/stacktrace/common.h> extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk, const char *loglvl); DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); -static inline bool on_stack(unsigned long sp, unsigned long size, - unsigned long low, unsigned long high, - enum stack_type type, struct stack_info *info) -{ - if (!low) - return false; - - if (sp < low || sp + size < sp || sp + size > high) - return false; - - if (info) { - info->low = low; - info->high = high; - info->type = type; - } - return true; -} - static inline bool on_irq_stack(unsigned long sp, unsigned long size, struct stack_info *info) { @@ -89,30 +57,4 @@ static inline bool on_overflow_stack(unsigned long sp, unsigned long size, struct stack_info *info) { return false; } #endif - -/* - * We can only safely access per-cpu stacks from current in a non-preemptible - * context. 
- */ -static inline bool on_accessible_stack(const struct task_struct *tsk, - unsigned long sp, unsigned long size, - struct stack_info *info) -{ - if (info) - info->type = STACK_TYPE_UNKNOWN; - - if (on_task_stack(tsk, sp, size, info)) - return true; - if (tsk != current || preemptible()) - return false; - if (on_irq_stack(sp, size, info)) - return true; - if (on_overflow_stack(sp, size, info)) - return true; - if (on_sdei_stack(sp, size, info)) - return true; - - return false; -} - #endif /* __ASM_STACKTRACE_H */ diff --git a/arch/arm64/include/asm/stacktrace/common.h b/arch/arm64/include/asm/stacktrace/common.h new file mode 100644 index 000000000000..f58eb944c46f --- /dev/null +++ b/arch/arm64/include/asm/stacktrace/common.h @@ -0,0 +1,199 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Common arm64 stack unwinder code. + * + * To implement a new arm64 stack unwinder: + * 1) Include this header + * + * 2) Call into unwind_next_common() from your top level unwind + * function, passing it the validation and translation callbacks + * (though the later can be NULL if no translation is required). + * + * See: arch/arm64/kernel/stacktrace.c for the reference implementation. + * + * Copyright (C) 2012 ARM Ltd. + */ +#ifndef __ASM_STACKTRACE_COMMON_H +#define __ASM_STACKTRACE_COMMON_H + +#include <linux/bitmap.h> +#include <linux/bitops.h> +#include <linux/kprobes.h> +#include <linux/types.h> + +enum stack_type { + STACK_TYPE_UNKNOWN, + STACK_TYPE_TASK, + STACK_TYPE_IRQ, + STACK_TYPE_OVERFLOW, + STACK_TYPE_SDEI_NORMAL, + STACK_TYPE_SDEI_CRITICAL, + STACK_TYPE_HYP, + __NR_STACK_TYPES +}; + +struct stack_info { + unsigned long low; + unsigned long high; + enum stack_type type; +}; + +/* + * A snapshot of a frame record or fp/lr register values, along with some + * accounting information necessary for robust unwinding. + * + * @fp: The fp value in the frame record (or the real fp) + * @pc: The lr value in the frame record (or the real lr) + * + * @stacks_done: Stacks which have been entirely unwound, for which it is no + * longer valid to unwind to. + * + * @prev_fp: The fp that pointed to this frame record, or a synthetic value + * of 0. This is used to ensure that within a stack, each + * subsequent frame record is at an increasing address. + * @prev_type: The type of stack this frame record was on, or a synthetic + * value of STACK_TYPE_UNKNOWN. This is used to detect a + * transition from one stack to another. + * + * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance + * associated with the most recently encountered replacement lr + * value. + * + * @task: The task being unwound. + */ +struct unwind_state { + unsigned long fp; + unsigned long pc; + DECLARE_BITMAP(stacks_done, __NR_STACK_TYPES); + unsigned long prev_fp; + enum stack_type prev_type; +#ifdef CONFIG_KRETPROBES + struct llist_node *kr_cur; +#endif + struct task_struct *task; +}; + +static inline bool on_stack(unsigned long sp, unsigned long size, + unsigned long low, unsigned long high, + enum stack_type type, struct stack_info *info) +{ + if (!low) + return false; + + if (sp < low || sp + size < sp || sp + size > high) + return false; + + if (info) { + info->low = low; + info->high = high; + info->type = type; + } + return true; +} + +static inline void unwind_init_common(struct unwind_state *state, + struct task_struct *task) +{ + state->task = task; +#ifdef CONFIG_KRETPROBES + state->kr_cur = NULL; +#endif + + /* + * Prime the first unwind. 
+ * + * In unwind_next() we'll check that the FP points to a valid stack, + * which can't be STACK_TYPE_UNKNOWN, and the first unwind will be + * treated as a transition to whichever stack that happens to be. The + * prev_fp value won't be used, but we set it to 0 such that it is + * definitely not an accessible stack address. + */ + bitmap_zero(state->stacks_done, __NR_STACK_TYPES); + state->prev_fp = 0; + state->prev_type = STACK_TYPE_UNKNOWN; +} + +/* + * stack_trace_translate_fp_fn() - Translates a non-kernel frame pointer to + * a kernel address. + * + * @fp: the frame pointer to be updated to its kernel address. + * @type: the stack type associated with frame pointer @fp + * + * Returns true and success and @fp is updated to the corresponding + * kernel virtual address; otherwise returns false. + */ +typedef bool (*stack_trace_translate_fp_fn)(unsigned long *fp, + enum stack_type type); + +/* + * on_accessible_stack_fn() - Check whether a stack range is on any + * of the possible stacks. + * + * @tsk: task whose stack is being unwound + * @sp: stack address being checked + * @size: size of the stack range being checked + * @info: stack unwinding context + */ +typedef bool (*on_accessible_stack_fn)(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info); + +static inline int unwind_next_common(struct unwind_state *state, + struct stack_info *info, + on_accessible_stack_fn accessible, + stack_trace_translate_fp_fn translate_fp) +{ + unsigned long fp = state->fp, kern_fp = fp; + struct task_struct *tsk = state->task; + + if (fp & 0x7) + return -EINVAL; + + if (!accessible(tsk, fp, 16, info)) + return -EINVAL; + + if (test_bit(info->type, state->stacks_done)) + return -EINVAL; + + /* + * If fp is not from the current address space perform the necessary + * translation before dereferencing it to get the next fp. + */ + if (translate_fp && !translate_fp(&kern_fp, info->type)) + return -EINVAL; + + /* + * As stacks grow downward, any valid record on the same stack must be + * at a strictly higher address than the prior record. + * + * Stacks can nest in several valid orders, e.g. + * + * TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL + * TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW + * HYP -> OVERFLOW + * + * ... but the nesting itself is strict. Once we transition from one + * stack to another, it's never valid to unwind back to that first + * stack. + */ + if (info->type == state->prev_type) { + if (fp <= state->prev_fp) + return -EINVAL; + } else { + __set_bit(state->prev_type, state->stacks_done); + } + + /* + * Record this frame record's values and location. The prev_fp and + * prev_type are only meaningful to the next unwind_next() invocation. + */ + state->fp = READ_ONCE(*(unsigned long *)(kern_fp)); + state->pc = READ_ONCE(*(unsigned long *)(kern_fp + 8)); + state->prev_fp = fp; + state->prev_type = info->type; + + return 0; +} + +#endif /* __ASM_STACKTRACE_COMMON_H */ diff --git a/arch/arm64/include/asm/stacktrace/nvhe.h b/arch/arm64/include/asm/stacktrace/nvhe.h new file mode 100644 index 000000000000..d5527b600390 --- /dev/null +++ b/arch/arm64/include/asm/stacktrace/nvhe.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * KVM nVHE hypervisor stack tracing support. + * + * The unwinder implementation depends on the nVHE mode: + * + * 1) Non-protected nVHE mode - the host can directly access the + * HYP stack pages and unwind the HYP stack in EL1. 
This saves having + * to allocate shared buffers for the host to read the unwinded + * stacktrace. + * + * 2) pKVM (protected nVHE) mode - the host cannot directly access + * the HYP memory. The stack is unwinded in EL2 and dumped to a shared + * buffer where the host can read and print the stacktrace. + * + * Copyright (C) 2022 Google LLC + */ +#ifndef __ASM_STACKTRACE_NVHE_H +#define __ASM_STACKTRACE_NVHE_H + +#include <asm/stacktrace/common.h> + +/* + * kvm_nvhe_unwind_init - Start an unwind from the given nVHE HYP fp and pc + * + * @state : unwind_state to initialize + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. + */ +static inline void kvm_nvhe_unwind_init(struct unwind_state *state, + unsigned long fp, + unsigned long pc) +{ + unwind_init_common(state, NULL); + + state->fp = fp; + state->pc = pc; +} + +#ifndef __KVM_NVHE_HYPERVISOR__ +/* + * Conventional (non-protected) nVHE HYP stack unwinder + * + * In non-protected mode, the unwinding is done from kernel proper context + * (by the host in EL1). + */ + +DECLARE_KVM_NVHE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack); +DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_stacktrace_info, kvm_stacktrace_info); +DECLARE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); + +void kvm_nvhe_dump_backtrace(unsigned long hyp_offset); + +#endif /* __KVM_NVHE_HYPERVISOR__ */ +#endif /* __ASM_STACKTRACE_NVHE_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index fa7981d0d917..7075a9c6a4a6 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -14,6 +14,11 @@ CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_syscall.o = -fstack-protector -fstack-protector-strong CFLAGS_syscall.o += -fno-stack-protector +# When KASAN is enabled, a stack trace is recorded for every alloc/free, which +# can significantly impact performance. Avoid instrumenting the stack trace +# collection code to minimize this impact. +KASAN_SANITIZE_stacktrace.o := n + # It's not safe to invoke KCOV when portions of the kernel environment aren't # available or are out-of-sync with HW state. Since `noinstr` doesn't always # inhibit KCOV instrumentation, disable it for the entire compilation unit. diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 0467cb79f080..ce190ee18a20 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -7,72 +7,90 @@ #include <linux/kernel.h> #include <linux/export.h> #include <linux/ftrace.h> -#include <linux/kprobes.h> #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> #include <linux/stacktrace.h> #include <asm/irq.h> -#include <asm/pointer_auth.h> #include <asm/stack_pointer.h> #include <asm/stacktrace.h> /* - * A snapshot of a frame record or fp/lr register values, along with some - * accounting information necessary for robust unwinding. + * Start an unwind from a pt_regs. * - * @fp: The fp value in the frame record (or the real fp) - * @pc: The lr value in the frame record (or the real lr) + * The unwind will begin at the PC within the regs. * - * @stacks_done: Stacks which have been entirely unwound, for which it is no - * longer valid to unwind to. + * The regs must be on a stack currently owned by the calling task. 
+ */ +static inline void unwind_init_from_regs(struct unwind_state *state, + struct pt_regs *regs) +{ + unwind_init_common(state, current); + + state->fp = regs->regs[29]; + state->pc = regs->pc; +} + +/* + * Start an unwind from a caller. * - * @prev_fp: The fp that pointed to this frame record, or a synthetic value - * of 0. This is used to ensure that within a stack, each - * subsequent frame record is at an increasing address. - * @prev_type: The type of stack this frame record was on, or a synthetic - * value of STACK_TYPE_UNKNOWN. This is used to detect a - * transition from one stack to another. + * The unwind will begin at the caller of whichever function this is inlined + * into. * - * @kr_cur: When KRETPROBES is selected, holds the kretprobe instance - * associated with the most recently encountered replacement lr - * value. + * The function which invokes this must be noinline. */ -struct unwind_state { - unsigned long fp; - unsigned long pc; - DECLARE_BITMAP(stacks_done, __NR_STACK_TYPES); - unsigned long prev_fp; - enum stack_type prev_type; -#ifdef CONFIG_KRETPROBES - struct llist_node *kr_cur; -#endif -}; +static __always_inline void unwind_init_from_caller(struct unwind_state *state) +{ + unwind_init_common(state, current); + + state->fp = (unsigned long)__builtin_frame_address(1); + state->pc = (unsigned long)__builtin_return_address(0); +} -static notrace void unwind_init(struct unwind_state *state, unsigned long fp, - unsigned long pc) +/* + * Start an unwind from a blocked task. + * + * The unwind will begin at the blocked tasks saved PC (i.e. the caller of + * cpu_switch_to()). + * + * The caller should ensure the task is blocked in cpu_switch_to() for the + * duration of the unwind, or the unwind will be bogus. It is never valid to + * call this for the current task. + */ +static inline void unwind_init_from_task(struct unwind_state *state, + struct task_struct *task) { - state->fp = fp; - state->pc = pc; -#ifdef CONFIG_KRETPROBES - state->kr_cur = NULL; -#endif + unwind_init_common(state, task); + + state->fp = thread_saved_fp(task); + state->pc = thread_saved_pc(task); +} - /* - * Prime the first unwind. - * - * In unwind_next() we'll check that the FP points to a valid stack, - * which can't be STACK_TYPE_UNKNOWN, and the first unwind will be - * treated as a transition to whichever stack that happens to be. The - * prev_fp value won't be used, but we set it to 0 such that it is - * definitely not an accessible stack address. - */ - bitmap_zero(state->stacks_done, __NR_STACK_TYPES); - state->prev_fp = 0; - state->prev_type = STACK_TYPE_UNKNOWN; +/* + * We can only safely access per-cpu stacks from current in a non-preemptible + * context. + */ +static bool on_accessible_stack(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info) +{ + if (info) + info->type = STACK_TYPE_UNKNOWN; + + if (on_task_stack(tsk, sp, size, info)) + return true; + if (tsk != current || preemptible()) + return false; + if (on_irq_stack(sp, size, info)) + return true; + if (on_overflow_stack(sp, size, info)) + return true; + if (on_sdei_stack(sp, size, info)) + return true; + + return false; } -NOKPROBE_SYMBOL(unwind_init); /* * Unwind from one frame record (A) to the next frame record (B). @@ -81,53 +99,20 @@ NOKPROBE_SYMBOL(unwind_init); * records (e.g. a cycle), determined based on the location and fp value of A * and the location (but not the fp value) of B. 
*/ -static int notrace unwind_next(struct task_struct *tsk, - struct unwind_state *state) +static int notrace unwind_next(struct unwind_state *state) { + struct task_struct *tsk = state->task; unsigned long fp = state->fp; struct stack_info info; + int err; /* Final frame; nothing to unwind */ if (fp == (unsigned long)task_pt_regs(tsk)->stackframe) return -ENOENT; - if (fp & 0x7) - return -EINVAL; - - if (!on_accessible_stack(tsk, fp, 16, &info)) - return -EINVAL; - - if (test_bit(info.type, state->stacks_done)) - return -EINVAL; - - /* - * As stacks grow downward, any valid record on the same stack must be - * at a strictly higher address than the prior record. - * - * Stacks can nest in several valid orders, e.g. - * - * TASK -> IRQ -> OVERFLOW -> SDEI_NORMAL - * TASK -> SDEI_NORMAL -> SDEI_CRITICAL -> OVERFLOW - * - * ... but the nesting itself is strict. Once we transition from one - * stack to another, it's never valid to unwind back to that first - * stack. - */ - if (info.type == state->prev_type) { - if (fp <= state->prev_fp) - return -EINVAL; - } else { - set_bit(state->prev_type, state->stacks_done); - } - - /* - * Record this frame record's values and location. The prev_fp and - * prev_type are only meaningful to the next unwind_next() invocation. - */ - state->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp)); - state->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8)); - state->prev_fp = fp; - state->prev_type = info.type; + err = unwind_next_common(state, &info, on_accessible_stack, NULL); + if (err) + return err; state->pc = ptrauth_strip_insn_pac(state->pc); @@ -157,8 +142,7 @@ static int notrace unwind_next(struct task_struct *tsk, } NOKPROBE_SYMBOL(unwind_next); -static void notrace unwind(struct task_struct *tsk, - struct unwind_state *state, +static void notrace unwind(struct unwind_state *state, stack_trace_consume_fn consume_entry, void *cookie) { while (1) { @@ -166,7 +150,7 @@ static void notrace unwind(struct task_struct *tsk, if (!consume_entry(cookie, state->pc)) break; - ret = unwind_next(tsk, state); + ret = unwind_next(state); if (ret < 0) break; } @@ -212,15 +196,15 @@ noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry, { struct unwind_state state; - if (regs) - unwind_init(&state, regs->regs[29], regs->pc); - else if (task == current) - unwind_init(&state, - (unsigned long)__builtin_frame_address(1), - (unsigned long)__builtin_return_address(0)); - else - unwind_init(&state, thread_saved_fp(task), - thread_saved_pc(task)); - - unwind(task, &state, consume_entry, cookie); + if (regs) { + if (task != current) + return; + unwind_init_from_regs(&state, regs); + } else if (task == current) { + unwind_init_from_caller(&state); + } else { + unwind_init_from_task(&state, task); + } + + unwind(&state, consume_entry, cookie); } diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 8a5fbbf084df..815cc118c675 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -56,4 +56,17 @@ config NVHE_EL2_DEBUG If unsure, say N. +config PROTECTED_NVHE_STACKTRACE + bool "Protected KVM hypervisor stacktraces" + depends on NVHE_EL2_DEBUG + default n + help + Say Y here to enable pKVM hypervisor stacktraces on hyp_panic() + + If using protected nVHE mode, but cannot afford the associated + memory cost (less than 0.75 page per CPU) of pKVM stacktraces, + say N. + + If unsure, or not using protected nVHE (pKVM), say N. 
+ endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile index aa127ae9f675..5e33c2d4645a 100644 --- a/arch/arm64/kvm/Makefile +++ b/arch/arm64/kvm/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_KVM) += hyp/ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \ inject_fault.o va_layout.o handle_exit.o \ - guest.o debug.o reset.o sys_regs.o \ + guest.o debug.o reset.o sys_regs.o stacktrace.o \ vgic-sys-reg-v3.o fpsimd.o pkvm.o \ arch_timer.o trng.o vmid.o \ vgic/vgic.o vgic/vgic-init.o \ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index b81f34800d3c..8fe73ee5fa84 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -49,7 +49,7 @@ DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); DECLARE_KVM_HYP_PER_CPU(unsigned long, kvm_hyp_vector); -static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); +DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); unsigned long kvm_arm_hyp_percpu_base[NR_CPUS]; DECLARE_KVM_NVHE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index cb0bc77c16f9..bbe5b393d689 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -17,6 +17,7 @@ #include <asm/kvm_emulate.h> #include <asm/kvm_mmu.h> #include <asm/debug-monitors.h> +#include <asm/stacktrace/nvhe.h> #include <asm/traps.h> #include <kvm/arm_hypercalls.h> @@ -353,6 +354,9 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, (void *)(panic_addr + kaslr_offset())); } + /* Dump the nVHE hypervisor backtrace */ + kvm_nvhe_dump_backtrace(hyp_offset); + /* * Hyp has panicked and we're going to handle that by panicking the * kernel. The kernel offset will be revealed in the panic so we're diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile index a2b0d043dddf..ed5d222e2826 100644 --- a/arch/arm64/kvm/hyp/nvhe/Makefile +++ b/arch/arm64/kvm/hyp/nvhe/Makefile @@ -14,7 +14,7 @@ lib-objs := $(addprefix ../../../lib/, $(lib-objs)) hyp-obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ hyp-main.o hyp-smp.o psci-relay.o early_alloc.o page_alloc.o \ - cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o + cache.o setup.o mm.o mem_protect.o sys_regs.o pkvm.o stacktrace.o hyp-obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o hyp-obj-$(CONFIG_DEBUG_LIST) += list_debug.o diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index ea6a397b64a6..b6c0188c4b35 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -177,13 +177,8 @@ SYM_FUNC_END(__host_hvc) b hyp_panic .L__hyp_sp_overflow\@: - /* - * Reset SP to the top of the stack, to allow handling the hyp_panic. - * This corrupts the stack but is ok, since we won't be attempting - * any unwinding here. - */ - ldr_this_cpu x0, kvm_init_params + NVHE_INIT_STACK_HYP_VA, x1 - mov sp, x0 + /* Switch to the overflow stack */ + adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0 b hyp_panic_bad_stack ASM_BUG() diff --git a/arch/arm64/kvm/hyp/nvhe/stacktrace.c b/arch/arm64/kvm/hyp/nvhe/stacktrace.c new file mode 100644 index 000000000000..58f645ad66bc --- /dev/null +++ b/arch/arm64/kvm/hyp/nvhe/stacktrace.c @@ -0,0 +1,160 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM nVHE hypervisor stack tracing support. 
+ * + * Copyright (C) 2022 Google LLC + */ +#include <asm/kvm_asm.h> +#include <asm/kvm_hyp.h> +#include <asm/memory.h> +#include <asm/percpu.h> + +DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) + __aligned(16); + +DEFINE_PER_CPU(struct kvm_nvhe_stacktrace_info, kvm_stacktrace_info); + +/* + * hyp_prepare_backtrace - Prepare non-protected nVHE backtrace. + * + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. + * + * Save the information needed by the host to unwind the non-protected + * nVHE hypervisor stack in EL1. + */ +static void hyp_prepare_backtrace(unsigned long fp, unsigned long pc) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info = this_cpu_ptr(&kvm_stacktrace_info); + struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + + stacktrace_info->stack_base = (unsigned long)(params->stack_hyp_va - PAGE_SIZE); + stacktrace_info->overflow_stack_base = (unsigned long)this_cpu_ptr(overflow_stack); + stacktrace_info->fp = fp; + stacktrace_info->pc = pc; +} + +#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE +#include <asm/stacktrace/nvhe.h> + +DEFINE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], pkvm_stacktrace); + +static bool on_overflow_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + unsigned long low = (unsigned long)this_cpu_ptr(overflow_stack); + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info); +} + +static bool on_hyp_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); + unsigned long high = params->stack_hyp_va; + unsigned long low = high - PAGE_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_HYP, info); +} + +static bool on_accessible_stack(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info) +{ + if (info) + info->type = STACK_TYPE_UNKNOWN; + + return (on_overflow_stack(sp, size, info) || + on_hyp_stack(sp, size, info)); +} + +static int unwind_next(struct unwind_state *state) +{ + struct stack_info info; + + return unwind_next_common(state, &info, on_accessible_stack, NULL); +} + +static void notrace unwind(struct unwind_state *state, + stack_trace_consume_fn consume_entry, + void *cookie) +{ + while (1) { + int ret; + + if (!consume_entry(cookie, state->pc)) + break; + ret = unwind_next(state); + if (ret < 0) + break; + } +} + +/* + * pkvm_save_backtrace_entry - Saves a protected nVHE HYP stacktrace entry + * + * @arg : index of the entry in the stacktrace buffer + * @where : the program counter corresponding to the stack frame + * + * Save the return address of a stack frame to the shared stacktrace buffer. + * The host can access this shared buffer from EL1 to dump the backtrace. + */ +static bool pkvm_save_backtrace_entry(void *arg, unsigned long where) +{ + unsigned long *stacktrace = this_cpu_ptr(pkvm_stacktrace); + int *idx = (int *)arg; + + /* + * Need 2 free slots: 1 for current entry and 1 for the + * delimiter. + */ + if (*idx > ARRAY_SIZE(pkvm_stacktrace) - 2) + return false; + + stacktrace[*idx] = where; + stacktrace[++*idx] = 0UL; + + return true; +} + +/* + * pkvm_save_backtrace - Saves the protected nVHE HYP stacktrace + * + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. 
+ * + * Save the unwinded stack addresses to the shared stacktrace buffer. + * The host can access this shared buffer from EL1 to dump the backtrace. + */ +static void pkvm_save_backtrace(unsigned long fp, unsigned long pc) +{ + struct unwind_state state; + int idx = 0; + + kvm_nvhe_unwind_init(&state, fp, pc); + + unwind(&state, pkvm_save_backtrace_entry, &idx); +} +#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */ +static void pkvm_save_backtrace(unsigned long fp, unsigned long pc) +{ +} +#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ + +/* + * kvm_nvhe_prepare_backtrace - prepare to dump the nVHE backtrace + * + * @fp : frame pointer at which to start the unwinding. + * @pc : program counter at which to start the unwinding. + * + * Saves the information needed by the host to dump the nVHE hypervisor + * backtrace. + */ +void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc) +{ + if (is_protected_kvm_enabled()) + pkvm_save_backtrace(fp, pc); + else + hyp_prepare_backtrace(fp, pc); +} diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index ab0e19117dfe..9f6385702061 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -34,6 +34,8 @@ DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); DEFINE_PER_CPU(unsigned long, kvm_hyp_vector); +extern void kvm_nvhe_prepare_backtrace(unsigned long fp, unsigned long pc); + static void __activate_traps(struct kvm_vcpu *vcpu) { u64 val; @@ -375,6 +377,10 @@ asmlinkage void __noreturn hyp_panic(void) __sysreg_restore_state_nvhe(host_ctxt); } + /* Prepare to dump kvm nvhe hyp stacktrace */ + kvm_nvhe_prepare_backtrace((unsigned long)__builtin_frame_address(0), + _THIS_IP_); + __hyp_do_panic(host_ctxt, spsr, elr, par); unreachable(); } diff --git a/arch/arm64/kvm/stacktrace.c b/arch/arm64/kvm/stacktrace.c new file mode 100644 index 000000000000..949d19d603fb --- /dev/null +++ b/arch/arm64/kvm/stacktrace.c @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * KVM nVHE hypervisor stack tracing support. + * + * The unwinder implementation depends on the nVHE mode: + * + * 1) Non-protected nVHE mode - the host can directly access the + * HYP stack pages and unwind the HYP stack in EL1. This saves having + * to allocate shared buffers for the host to read the unwinded + * stacktrace. + * + * 2) pKVM (protected nVHE) mode - the host cannot directly access + * the HYP memory. The stack is unwinded in EL2 and dumped to a shared + * buffer where the host can read and print the stacktrace. + * + * Copyright (C) 2022 Google LLC + */ + +#include <linux/kvm.h> +#include <linux/kvm_host.h> + +#include <asm/stacktrace/nvhe.h> + +/* + * kvm_nvhe_stack_kern_va - Convert KVM nVHE HYP stack addresses to a kernel VAs + * + * The nVHE hypervisor stack is mapped in the flexible 'private' VA range, to + * allow for guard pages below the stack. Consequently, the fixed offset address + * translation macros won't work here. + * + * The kernel VA is calculated as an offset from the kernel VA of the hypervisor + * stack base. + * + * Returns true on success and updates @addr to its corresponding kernel VA; + * otherwise returns false. 
+ */ +static bool kvm_nvhe_stack_kern_va(unsigned long *addr, + enum stack_type type) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info; + unsigned long hyp_base, kern_base, hyp_offset; + + stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + + switch (type) { + case STACK_TYPE_HYP: + kern_base = (unsigned long)*this_cpu_ptr(&kvm_arm_hyp_stack_page); + hyp_base = (unsigned long)stacktrace_info->stack_base; + break; + case STACK_TYPE_OVERFLOW: + kern_base = (unsigned long)this_cpu_ptr_nvhe_sym(overflow_stack); + hyp_base = (unsigned long)stacktrace_info->overflow_stack_base; + break; + default: + return false; + } + + hyp_offset = *addr - hyp_base; + + *addr = kern_base + hyp_offset; + + return true; +} + +static bool on_overflow_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info + = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + unsigned long low = (unsigned long)stacktrace_info->overflow_stack_base; + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_OVERFLOW, info); +} + +static bool on_hyp_stack(unsigned long sp, unsigned long size, + struct stack_info *info) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info + = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + unsigned long low = (unsigned long)stacktrace_info->stack_base; + unsigned long high = low + PAGE_SIZE; + + return on_stack(sp, size, low, high, STACK_TYPE_HYP, info); +} + +static bool on_accessible_stack(const struct task_struct *tsk, + unsigned long sp, unsigned long size, + struct stack_info *info) +{ + if (info) + info->type = STACK_TYPE_UNKNOWN; + + return (on_overflow_stack(sp, size, info) || + on_hyp_stack(sp, size, info)); +} + +static int unwind_next(struct unwind_state *state) +{ + struct stack_info info; + + return unwind_next_common(state, &info, on_accessible_stack, + kvm_nvhe_stack_kern_va); +} + +static void unwind(struct unwind_state *state, + stack_trace_consume_fn consume_entry, void *cookie) +{ + while (1) { + int ret; + + if (!consume_entry(cookie, state->pc)) + break; + ret = unwind_next(state); + if (ret < 0) + break; + } +} + +/* + * kvm_nvhe_dump_backtrace_entry - Symbolize and print an nVHE backtrace entry + * + * @arg : the hypervisor offset, used for address translation + * @where : the program counter corresponding to the stack frame + */ +static bool kvm_nvhe_dump_backtrace_entry(void *arg, unsigned long where) +{ + unsigned long va_mask = GENMASK_ULL(vabits_actual - 1, 0); + unsigned long hyp_offset = (unsigned long)arg; + + /* Mask tags and convert to kern addr */ + where = (where & va_mask) + hyp_offset; + kvm_err(" [<%016lx>] %pB\n", where, (void *)(where + kaslr_offset())); + + return true; +} + +static void kvm_nvhe_dump_backtrace_start(void) +{ + kvm_err("nVHE call trace:\n"); +} + +static void kvm_nvhe_dump_backtrace_end(void) +{ + kvm_err("---[ end nVHE call trace ]---\n"); +} + +/* + * hyp_dump_backtrace - Dump the non-protected nVHE backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. + * + * The host can directly access HYP stack pages in non-protected + * mode, so the unwinding is done directly from EL1. This removes + * the need for shared buffers between host and hypervisor for + * the stacktrace. 
+ */ +static void hyp_dump_backtrace(unsigned long hyp_offset) +{ + struct kvm_nvhe_stacktrace_info *stacktrace_info; + struct unwind_state state; + + stacktrace_info = this_cpu_ptr_nvhe_sym(kvm_stacktrace_info); + + kvm_nvhe_unwind_init(&state, stacktrace_info->fp, stacktrace_info->pc); + + kvm_nvhe_dump_backtrace_start(); + unwind(&state, kvm_nvhe_dump_backtrace_entry, (void *)hyp_offset); + kvm_nvhe_dump_backtrace_end(); +} + +#ifdef CONFIG_PROTECTED_NVHE_STACKTRACE +DECLARE_KVM_NVHE_PER_CPU(unsigned long [NVHE_STACKTRACE_SIZE/sizeof(long)], + pkvm_stacktrace); + +/* + * pkvm_dump_backtrace - Dump the protected nVHE HYP backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. + * + * Dumping of the pKVM HYP backtrace is done by reading the + * stack addresses from the shared stacktrace buffer, since the + * host cannot directly access hypervisor memory in protected + * mode. + */ +static void pkvm_dump_backtrace(unsigned long hyp_offset) +{ + unsigned long *stacktrace + = (unsigned long *) this_cpu_ptr_nvhe_sym(pkvm_stacktrace); + int i; + + kvm_nvhe_dump_backtrace_start(); + /* The saved stacktrace is terminated by a null entry */ + for (i = 0; + i < ARRAY_SIZE(kvm_nvhe_sym(pkvm_stacktrace)) && stacktrace[i]; + i++) + kvm_nvhe_dump_backtrace_entry((void *)hyp_offset, stacktrace[i]); + kvm_nvhe_dump_backtrace_end(); +} +#else /* !CONFIG_PROTECTED_NVHE_STACKTRACE */ +static void pkvm_dump_backtrace(unsigned long hyp_offset) +{ + kvm_err("Cannot dump pKVM nVHE stacktrace: !CONFIG_PROTECTED_NVHE_STACKTRACE\n"); +} +#endif /* CONFIG_PROTECTED_NVHE_STACKTRACE */ + +/* + * kvm_nvhe_dump_backtrace - Dump KVM nVHE hypervisor backtrace. + * + * @hyp_offset: hypervisor offset, used for address translation. + */ +void kvm_nvhe_dump_backtrace(unsigned long hyp_offset) +{ + if (is_protected_kvm_enabled()) + pkvm_dump_backtrace(hyp_offset); + else + hyp_dump_backtrace(hyp_offset); +} |
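
For reference, the dump path above prints through kvm_err(), so a hypervisor
panic should be followed in the kernel log (modulo the usual kvm_err prefix)
by a block of the following shape. The addresses and symbol names here are
invented for illustration; the real entries use the "[<%016lx>] %pB" format
emitted by kvm_nvhe_dump_backtrace_entry():

    nVHE call trace:
     [<ffff800008f8a1d4>] __kvm_nvhe_hyp_panic+0xa4/0xf8
     [<ffff800008f8a6b8>] __kvm_nvhe_handle_trap+0x78/0x1a0
     [<ffff800008f8b0c0>] __kvm_nvhe___host_exit+0x64/0x64
    ---[ end nVHE call trace ]---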