diff options
Diffstat (limited to 'arch/arm64/kernel')
31 files changed, 599 insertions, 418 deletions
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index ed65576ce710..6cc97730790e 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -9,6 +9,11 @@ CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_insn.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) +# Remove stack protector to avoid triggering unneeded stack canary +# checks due to randomize_kstack_offset. +CFLAGS_REMOVE_syscall.o = -fstack-protector -fstack-protector-strong +CFLAGS_syscall.o += -fno-stack-protector + # Object file lists. obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ entry-common.o entry-fpsimd.o process.o ptrace.o \ diff --git a/arch/arm64/kernel/acpi_parking_protocol.c b/arch/arm64/kernel/acpi_parking_protocol.c index e7c941d8340d..bfeeb5319abf 100644 --- a/arch/arm64/kernel/acpi_parking_protocol.c +++ b/arch/arm64/kernel/acpi_parking_protocol.c @@ -99,7 +99,8 @@ static int acpi_parking_protocol_cpu_boot(unsigned int cpu) * that read this address need to convert this address to the * Boot-Loader's endianness before jumping. */ - writeq_relaxed(__pa_symbol(secondary_entry), &mailbox->entry_point); + writeq_relaxed(__pa_symbol(function_nocfi(secondary_entry)), + &mailbox->entry_point); writel_relaxed(cpu_entry->gic_cpu_id, &mailbox->cpu_id); arch_send_wakeup_ipi_mask(cpumask_of(cpu)); diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 1184c44ea2c7..abc84636af07 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -133,8 +133,8 @@ static void clean_dcache_range_nopatch(u64 start, u64 end) } while (cur += d_size, cur < end); } -static void __apply_alternatives(void *alt_region, bool is_module, - unsigned long *feature_mask) +static void __nocfi __apply_alternatives(void *alt_region, bool is_module, + unsigned long *feature_mask) { struct alt_instr *alt; struct alt_region *region = alt_region; diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 8060e58a0739..0cb34ccb6e73 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -43,6 +43,7 @@ int main(void) #endif BLANK(); DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context)); + DEFINE(THREAD_SCTLR_USER, offsetof(struct task_struct, thread.sctlr_user)); #ifdef CONFIG_ARM64_PTR_AUTH DEFINE(THREAD_KEYS_USER, offsetof(struct task_struct, thread.keys_user)); DEFINE(THREAD_KEYS_KERNEL, offsetof(struct task_struct, thread.keys_kernel)); @@ -152,10 +153,6 @@ int main(void) #endif #ifdef CONFIG_ARM64_PTR_AUTH DEFINE(PTRAUTH_USER_KEY_APIA, offsetof(struct ptrauth_keys_user, apia)); - DEFINE(PTRAUTH_USER_KEY_APIB, offsetof(struct ptrauth_keys_user, apib)); - DEFINE(PTRAUTH_USER_KEY_APDA, offsetof(struct ptrauth_keys_user, apda)); - DEFINE(PTRAUTH_USER_KEY_APDB, offsetof(struct ptrauth_keys_user, apdb)); - DEFINE(PTRAUTH_USER_KEY_APGA, offsetof(struct ptrauth_keys_user, apga)); DEFINE(PTRAUTH_KERNEL_KEY_APIA, offsetof(struct ptrauth_keys_kernel, apia)); BLANK(); #endif diff --git a/arch/arm64/kernel/cpu-reset.h b/arch/arm64/kernel/cpu-reset.h index ed50e9587ad8..9a7b1262ef17 100644 --- a/arch/arm64/kernel/cpu-reset.h +++ b/arch/arm64/kernel/cpu-reset.h @@ -13,16 +13,16 @@ void __cpu_soft_restart(unsigned long el2_switch, unsigned long entry, unsigned long arg0, unsigned long arg1, unsigned long arg2); -static inline void __noreturn cpu_soft_restart(unsigned long entry, - unsigned long arg0, - unsigned long arg1, - unsigned long arg2) +static inline void __noreturn __nocfi cpu_soft_restart(unsigned long entry, + unsigned long arg0, + unsigned long arg1, + unsigned long arg2) { typeof(__cpu_soft_restart) *restart; unsigned long el2_switch = !is_kernel_in_hyp_mode() && is_hyp_mode_available(); - restart = (void *)__pa_symbol(__cpu_soft_restart); + restart = (void *)__pa_symbol(function_nocfi(__cpu_soft_restart)); cpu_install_idmap(); restart(el2_switch, entry, arg0, arg1, arg2); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index e3e0dcbf6b2f..30c82d38c189 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1451,7 +1451,7 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, } #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 -static void +static void __nocfi kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) { typedef void (kpti_remap_fn)(int, int, phys_addr_t); @@ -1468,7 +1468,7 @@ kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) if (arm64_use_ng_mappings) return; - remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); + remap_fn = (void *)__pa_symbol(function_nocfi(idmap_kpti_install_ng_mappings)); cpu_install_idmap(); remap_fn(cpu, num_online_cpus(), __pa_symbol(swapper_pg_dir)); @@ -1827,6 +1827,18 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_pan, }, #endif /* CONFIG_ARM64_PAN */ +#ifdef CONFIG_ARM64_EPAN + { + .desc = "Enhanced Privileged Access Never", + .capability = ARM64_HAS_EPAN, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64MMFR1_EL1, + .field_pos = ID_AA64MMFR1_PAN_SHIFT, + .sign = FTR_UNSIGNED, + .min_field_value = 3, + }, +#endif /* CONFIG_ARM64_EPAN */ #ifdef CONFIG_ARM64_LSE_ATOMICS { .desc = "LSE atomic instructions", diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 9d3588450473..a1ec351c36bd 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -37,6 +37,8 @@ static void noinstr enter_from_kernel_mode(struct pt_regs *regs) lockdep_hardirqs_off(CALLER_ADDR0); rcu_irq_enter_check_tick(); trace_hardirqs_off_finish(); + + mte_check_tfsr_entry(); } /* @@ -47,6 +49,8 @@ static void noinstr exit_to_kernel_mode(struct pt_regs *regs) { lockdep_assert_irqs_disabled(); + mte_check_tfsr_exit(); + if (interrupts_enabled(regs)) { if (regs->exit_rcu) { trace_hardirqs_on_prepare(); @@ -293,6 +297,8 @@ asmlinkage void noinstr enter_from_user_mode(void) asmlinkage void noinstr exit_to_user_mode(void) { + mte_check_tfsr_exit(); + trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(CALLER_ADDR0); user_enter_irqoff(); diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S index 2ca395c25448..3ecec60d3295 100644 --- a/arch/arm64/kernel/entry-fpsimd.S +++ b/arch/arm64/kernel/entry-fpsimd.S @@ -48,6 +48,11 @@ SYM_FUNC_START(sve_get_vl) ret SYM_FUNC_END(sve_get_vl) +SYM_FUNC_START(sve_set_vq) + sve_load_vq x0, x1, x2 + ret +SYM_FUNC_END(sve_set_vq) + /* * Load SVE state from FPSIMD state. * diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index a31a0a713c85..4ac5455c0ead 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -148,16 +148,18 @@ alternative_cb_end .endm /* Check for MTE asynchronous tag check faults */ - .macro check_mte_async_tcf, flgs, tmp + .macro check_mte_async_tcf, tmp, ti_flags #ifdef CONFIG_ARM64_MTE + .arch_extension lse alternative_if_not ARM64_MTE b 1f alternative_else_nop_endif mrs_s \tmp, SYS_TFSRE0_EL1 tbz \tmp, #SYS_TFSR_EL1_TF0_SHIFT, 1f /* Asynchronous TCF occurred for TTBR0 access, set the TI flag */ - orr \flgs, \flgs, #_TIF_MTE_ASYNC_FAULT - str \flgs, [tsk, #TSK_TI_FLAGS] + mov \tmp, #_TIF_MTE_ASYNC_FAULT + add \ti_flags, tsk, #TSK_TI_FLAGS + stset \tmp, [\ti_flags] msr_s SYS_TFSRE0_EL1, xzr 1: #endif @@ -244,10 +246,32 @@ alternative_else_nop_endif disable_step_tsk x19, x20 /* Check for asynchronous tag check faults in user space */ - check_mte_async_tcf x19, x22 + check_mte_async_tcf x22, x23 apply_ssbd 1, x22, x23 - ptrauth_keys_install_kernel tsk, x20, x22, x23 +#ifdef CONFIG_ARM64_PTR_AUTH +alternative_if ARM64_HAS_ADDRESS_AUTH + /* + * Enable IA for in-kernel PAC if the task had it disabled. Although + * this could be implemented with an unconditional MRS which would avoid + * a load, this was measured to be slower on Cortex-A75 and Cortex-A76. + * + * Install the kernel IA key only if IA was enabled in the task. If IA + * was disabled on kernel exit then we would have left the kernel IA + * installed so there is no need to install it again. + */ + ldr x0, [tsk, THREAD_SCTLR_USER] + tbz x0, SCTLR_ELx_ENIA_SHIFT, 1f + __ptrauth_keys_install_kernel_nosync tsk, x20, x22, x23 + b 2f +1: + mrs x0, sctlr_el1 + orr x0, x0, SCTLR_ELx_ENIA + msr sctlr_el1, x0 +2: + isb +alternative_else_nop_endif +#endif mte_set_kernel_gcr x22, x23 @@ -351,8 +375,26 @@ alternative_else_nop_endif 3: scs_save tsk, x0 - /* No kernel C function calls after this as user keys are set. */ - ptrauth_keys_install_user tsk, x0, x1, x2 +#ifdef CONFIG_ARM64_PTR_AUTH +alternative_if ARM64_HAS_ADDRESS_AUTH + /* + * IA was enabled for in-kernel PAC. Disable it now if needed, or + * alternatively install the user's IA. All other per-task keys and + * SCTLR bits were updated on task switch. + * + * No kernel C function calls after this. + */ + ldr x0, [tsk, THREAD_SCTLR_USER] + tbz x0, SCTLR_ELx_ENIA_SHIFT, 1f + __ptrauth_keys_install_user tsk, x0, x1, x2 + b 2f +1: + mrs x0, sctlr_el1 + bic x0, x0, SCTLR_ELx_ENIA + msr sctlr_el1, x0 +2: +alternative_else_nop_endif +#endif mte_set_user_gcr tsk, x0, x1 @@ -491,28 +533,14 @@ tsk .req x28 // current thread_info /* * Interrupt handling. */ - .macro irq_handler - ldr_l x1, handle_arch_irq + .macro irq_handler, handler:req + ldr_l x1, \handler mov x0, sp irq_stack_entry blr x1 irq_stack_exit .endm -#ifdef CONFIG_ARM64_PSEUDO_NMI - /* - * Set res to 0 if irqs were unmasked in interrupted context. - * Otherwise set res to non-0 value. - */ - .macro test_irqs_unmasked res:req, pmr:req -alternative_if ARM64_HAS_IRQ_PRIO_MASKING - sub \res, \pmr, #GIC_PRIO_IRQON -alternative_else - mov \res, xzr -alternative_endif - .endm -#endif - .macro gic_prio_kentry_setup, tmp:req #ifdef CONFIG_ARM64_PSEUDO_NMI alternative_if ARM64_HAS_IRQ_PRIO_MASKING @@ -531,6 +559,47 @@ alternative_endif #endif .endm + .macro el1_interrupt_handler, handler:req + gic_prio_irq_setup pmr=x20, tmp=x1 + enable_da + + mov x0, sp + bl enter_el1_irq_or_nmi + + irq_handler \handler + +#ifdef CONFIG_PREEMPTION + ldr x24, [tsk, #TSK_TI_PREEMPT] // get preempt count +alternative_if ARM64_HAS_IRQ_PRIO_MASKING + /* + * DA were cleared at start of handling, and IF are cleared by + * the GIC irqchip driver using gic_arch_enable_irqs() for + * normal IRQs. If anything is set, it means we come back from + * an NMI instead of a normal IRQ, so skip preemption + */ + mrs x0, daif + orr x24, x24, x0 +alternative_else_nop_endif + cbnz x24, 1f // preempt count != 0 || NMI return path + bl arm64_preempt_schedule_irq // irq en/disable is done inside +1: +#endif + + mov x0, sp + bl exit_el1_irq_or_nmi + .endm + + .macro el0_interrupt_handler, handler:req + gic_prio_irq_setup pmr=x20, tmp=x0 + user_exit_irqoff + enable_da + + tbz x22, #55, 1f + bl do_el0_irq_bp_hardening +1: + irq_handler \handler + .endm + .text /* @@ -547,18 +616,18 @@ SYM_CODE_START(vectors) kernel_ventry 1, sync // Synchronous EL1h kernel_ventry 1, irq // IRQ EL1h - kernel_ventry 1, fiq_invalid // FIQ EL1h + kernel_ventry 1, fiq // FIQ EL1h kernel_ventry 1, error // Error EL1h kernel_ventry 0, sync // Synchronous 64-bit EL0 kernel_ventry 0, irq // IRQ 64-bit EL0 - kernel_ventry 0, fiq_invalid // FIQ 64-bit EL0 + kernel_ventry 0, fiq // FIQ 64-bit EL0 kernel_ventry 0, error // Error 64-bit EL0 #ifdef CONFIG_COMPAT kernel_ventry 0, sync_compat, 32 // Synchronous 32-bit EL0 kernel_ventry 0, irq_compat, 32 // IRQ 32-bit EL0 - kernel_ventry 0, fiq_invalid_compat, 32 // FIQ 32-bit EL0 + kernel_ventry 0, fiq_compat, 32 // FIQ 32-bit EL0 kernel_ventry 0, error_compat, 32 // Error 32-bit EL0 #else kernel_ventry 0, sync_invalid, 32 // Synchronous 32-bit EL0 @@ -624,12 +693,6 @@ SYM_CODE_START_LOCAL(el0_error_invalid) inv_entry 0, BAD_ERROR SYM_CODE_END(el0_error_invalid) -#ifdef CONFIG_COMPAT -SYM_CODE_START_LOCAL(el0_fiq_invalid_compat) - inv_entry 0, BAD_FIQ, 32 -SYM_CODE_END(el0_fiq_invalid_compat) -#endif - SYM_CODE_START_LOCAL(el1_sync_invalid) inv_entry 1, BAD_SYNC SYM_CODE_END(el1_sync_invalid) @@ -660,35 +723,16 @@ SYM_CODE_END(el1_sync) .align 6 SYM_CODE_START_LOCAL_NOALIGN(el1_irq) kernel_entry 1 - gic_prio_irq_setup pmr=x20, tmp=x1 - enable_da_f - - mov x0, sp - bl enter_el1_irq_or_nmi - - irq_handler - -#ifdef CONFIG_PREEMPTION - ldr x24, [tsk, #TSK_TI_PREEMPT] // get preempt count -alternative_if ARM64_HAS_IRQ_PRIO_MASKING - /* - * DA_F were cleared at start of handling. If anything is set in DAIF, - * we come back from an NMI, so skip preemption - */ - mrs x0, daif - orr x24, x24, x0 -alternative_else_nop_endif - cbnz x24, 1f // preempt count != 0 || NMI return path - bl arm64_preempt_schedule_irq // irq en/disable is done inside -1: -#endif - - mov x0, sp - bl exit_el1_irq_or_nmi - + el1_interrupt_handler handle_arch_irq kernel_exit 1 SYM_CODE_END(el1_irq) +SYM_CODE_START_LOCAL_NOALIGN(el1_fiq) + kernel_entry 1 + el1_interrupt_handler handle_arch_fiq + kernel_exit 1 +SYM_CODE_END(el1_fiq) + /* * EL0 mode handlers. */ @@ -715,6 +759,11 @@ SYM_CODE_START_LOCAL_NOALIGN(el0_irq_compat) b el0_irq_naked SYM_CODE_END(el0_irq_compat) +SYM_CODE_START_LOCAL_NOALIGN(el0_fiq_compat) + kernel_entry 0, 32 + b el0_fiq_naked +SYM_CODE_END(el0_fiq_compat) + SYM_CODE_START_LOCAL_NOALIGN(el0_error_compat) kernel_entry 0, 32 b el0_error_naked @@ -725,18 +774,17 @@ SYM_CODE_END(el0_error_compat) SYM_CODE_START_LOCAL_NOALIGN(el0_irq) kernel_entry 0 el0_irq_naked: - gic_prio_irq_setup pmr=x20, tmp=x0 - user_exit_irqoff - enable_da_f - - tbz x22, #55, 1f - bl do_el0_irq_bp_hardening -1: - irq_handler - + el0_interrupt_handler handle_arch_irq b ret_to_user SYM_CODE_END(el0_irq) +SYM_CODE_START_LOCAL_NOALIGN(el0_fiq) + kernel_entry 0 +el0_fiq_naked: + el0_interrupt_handler handle_arch_fiq + b ret_to_user +SYM_CODE_END(el0_fiq) + SYM_CODE_START_LOCAL(el1_error) kernel_entry 1 mrs x1, esr_el1 @@ -757,7 +805,7 @@ el0_error_naked: mov x0, sp mov x1, x25 bl do_serror - enable_da_f + enable_da b ret_to_user SYM_CODE_END(el0_error) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 823e3a8a8871..ad3dd34a83cf 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -285,7 +285,7 @@ static void task_fpsimd_load(void) WARN_ON(!system_supports_fpsimd()); WARN_ON(!have_cpu_fpsimd_context()); - if (system_supports_sve() && test_thread_flag(TIF_SVE)) + if (IS_ENABLED(CONFIG_ARM64_SVE) && test_thread_flag(TIF_SVE)) sve_load_state(sve_pffr(¤t->thread), ¤t->thread.uw.fpsimd_state.fpsr, sve_vq_from_vl(current->thread.sve_vl) - 1); @@ -307,7 +307,8 @@ static void fpsimd_save(void) WARN_ON(!have_cpu_fpsimd_context()); if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { - if (system_supports_sve() && test_thread_flag(TIF_SVE)) { + if (IS_ENABLED(CONFIG_ARM64_SVE) && + test_thread_flag(TIF_SVE)) { if (WARN_ON(sve_get_vl() != last->sve_vl)) { /* * Can't save the user regs, so current would @@ -926,9 +927,8 @@ void fpsimd_release_task(struct task_struct *dead_task) * Trapped SVE access * * Storage is allocated for the full SVE state, the current FPSIMD - * register contents are migrated across, and TIF_SVE is set so that - * the SVE access trap will be disabled the next time this task - * reaches ret_to_user. + * register contents are migrated across, and the access trap is + * disabled. * * TIF_SVE should be clear on entry: otherwise, fpsimd_restore_current_state() * would have disabled the SVE access trap for userspace during @@ -946,15 +946,24 @@ void do_sve_acc(unsigned int esr, struct pt_regs *regs) get_cpu_fpsimd_context(); - fpsimd_save(); - - /* Force ret_to_user to reload the registers: */ - fpsimd_flush_task_state(current); - - fpsimd_to_sve(current); if (test_and_set_thread_flag(TIF_SVE)) WARN_ON(1); /* SVE access shouldn't have trapped */ + /* + * Convert the FPSIMD state to SVE, zeroing all the state that + * is not shared with FPSIMD. If (as is likely) the current + * state is live in the registers then do this there and + * update our metadata for the current task including + * disabling the trap, otherwise update our in-memory copy. + */ + if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) { + sve_set_vq(sve_vq_from_vl(current->thread.sve_vl) - 1); + sve_flush_live(); + fpsimd_bind_task_to_cpu(); + } else { + fpsimd_to_sve(current); + } + put_cpu_fpsimd_context(); } @@ -1092,7 +1101,7 @@ void fpsimd_preserve_current_state(void) void fpsimd_signal_preserve_current_state(void) { fpsimd_preserve_current_state(); - if (system_supports_sve() && test_thread_flag(TIF_SVE)) + if (test_thread_flag(TIF_SVE)) sve_to_fpsimd(current); } @@ -1181,7 +1190,7 @@ void fpsimd_update_current_state(struct user_fpsimd_state const *state) get_cpu_fpsimd_context(); current->thread.uw.fpsimd_state = *state; - if (system_supports_sve() && test_thread_flag(TIF_SVE)) + if (test_thread_flag(TIF_SVE)) fpsimd_to_sve(current); task_fpsimd_load(); diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 86a5cf9bc19a..b5d3ddaf69d9 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -55,7 +55,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func) unsigned long pc; u32 new; - pc = (unsigned long)&ftrace_call; + pc = (unsigned long)function_nocfi(ftrace_call); new = aarch64_insn_gen_branch_imm(pc, (unsigned long)func, AARCH64_INSN_BRANCH_LINK); diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index dfb1feab867d..bda49430c9ea 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -71,13 +71,44 @@ static void init_irq_stacks(void) } #endif +static void default_handle_irq(struct pt_regs *regs) +{ + panic("IRQ taken without a root IRQ handler\n"); +} + +static void default_handle_fiq(struct pt_regs *regs) +{ + panic("FIQ taken without a root FIQ handler\n"); +} + +void (*handle_arch_irq)(struct pt_regs *) __ro_after_init = default_handle_irq; +void (*handle_arch_fiq)(struct pt_regs *) __ro_after_init = default_handle_fiq; + +int __init set_handle_irq(void (*handle_irq)(struct pt_regs *)) +{ + if (handle_arch_irq != default_handle_irq) + return -EBUSY; + + handle_arch_irq = handle_irq; + pr_info("Root IRQ handler: %ps\n", handle_irq); + return 0; +} + +int __init set_handle_fiq(void (*handle_fiq)(struct pt_regs *)) +{ + if (handle_arch_fiq != default_handle_fiq) + return -EBUSY; + + handle_arch_fiq = handle_fiq; + pr_info("Root FIQ handler: %ps\n", handle_fiq); + return 0; +} + void __init init_IRQ(void) { init_irq_stacks(); init_irq_scs(); irqchip_init(); - if (!handle_arch_irq) - panic("No interrupt controller found."); if (system_uses_irq_prio_masking()) { /* diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index 27f8939deb1b..341342b207f6 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -128,15 +128,17 @@ u64 __init kaslr_early_init(void) /* use the top 16 bits to randomize the linear region */ memstart_offset_seed = seed >> 48; - if (IS_ENABLED(CONFIG_KASAN_GENERIC) || - IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + if (!IS_ENABLED(CONFIG_KASAN_VMALLOC) && + (IS_ENABLED(CONFIG_KASAN_GENERIC) || + IS_ENABLED(CONFIG_KASAN_SW_TAGS))) /* - * KASAN does not expect the module region to intersect the - * vmalloc region, since shadow memory is allocated for each - * module at load time, whereas the vmalloc region is shadowed - * by KASAN zero pages. So keep modules out of the vmalloc - * region if KASAN is enabled, and put the kernel well within - * 4 GB of the module region. + * KASAN without KASAN_VMALLOC does not expect the module region + * to intersect the vmalloc region, since shadow memory is + * allocated for each module at load time, whereas the vmalloc + * region is shadowed by KASAN zero pages. So keep modules + * out of the vmalloc region if KASAN is enabled without + * KASAN_VMALLOC, and put the kernel well within 4 GB of the + * module region. */ return offset % SZ_2G; diff --git a/arch/arm64/kernel/machine_kexec_file.c b/arch/arm64/kernel/machine_kexec_file.c index 0cde47a63beb..63634b4d72c1 100644 --- a/arch/arm64/kernel/machine_kexec_file.c +++ b/arch/arm64/kernel/machine_kexec_file.c @@ -15,23 +15,12 @@ #include <linux/kexec.h> #include <linux/libfdt.h> #include <linux/memblock.h> +#include <linux/of.h> #include <linux/of_fdt.h> -#include <linux/random.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/types.h> #include <linux/vmalloc.h> -#include <asm/byteorder.h> - -/* relevant device tree properties */ -#define FDT_PROP_KEXEC_ELFHDR "linux,elfcorehdr" -#define FDT_PROP_MEM_RANGE "linux,usable-memory-range" -#define FDT_PROP_INITRD_START "linux,initrd-start" -#define FDT_PROP_INITRD_END "linux,initrd-end" -#define FDT_PROP_BOOTARGS "bootargs" -#define FDT_PROP_KASLR_SEED "kaslr-seed" -#define FDT_PROP_RNG_SEED "rng-seed" -#define RNG_SEED_SIZE 128 const struct kexec_file_ops * const kexec_file_loaders[] = { &kexec_image_ops, @@ -40,174 +29,16 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { int arch_kimage_file_post_load_cleanup(struct kimage *image) { - vfree(image->arch.dtb); + kvfree(image->arch.dtb); image->arch.dtb = NULL; - vfree(image->arch.elf_headers); - image->arch.elf_headers = NULL; - image->arch.elf_headers_sz = 0; + vfree(image->elf_headers); + image->elf_headers = NULL; + image->elf_headers_sz = 0; return kexec_image_post_load_cleanup_default(image); } -static int setup_dtb(struct kimage *image, - unsigned long initrd_load_addr, unsigned long initrd_len, - char *cmdline, void *dtb) -{ - int off, ret; - - ret = fdt_path_offset(dtb, "/chosen"); - if (ret < 0) - goto out; - - off = ret; - - ret = fdt_delprop(dtb, off, FDT_PROP_KEXEC_ELFHDR); - if (ret && ret != -FDT_ERR_NOTFOUND) - goto out; - ret = fdt_delprop(dtb, off, FDT_PROP_MEM_RANGE); - if (ret && ret != -FDT_ERR_NOTFOUND) - goto out; - - if (image->type == KEXEC_TYPE_CRASH) { - /* add linux,elfcorehdr */ - ret = fdt_appendprop_addrrange(dtb, 0, off, - FDT_PROP_KEXEC_ELFHDR, - image->arch.elf_headers_mem, - image->arch.elf_headers_sz); - if (ret) - return (ret == -FDT_ERR_NOSPACE ? -ENOMEM : -EINVAL); - - /* add linux,usable-memory-range */ - ret = fdt_appendprop_addrrange(dtb, 0, off, - FDT_PROP_MEM_RANGE, - crashk_res.start, - crashk_res.end - crashk_res.start + 1); - if (ret) - return (ret == -FDT_ERR_NOSPACE ? -ENOMEM : -EINVAL); - } - - /* add bootargs */ - if (cmdline) { - ret = fdt_setprop_string(dtb, off, FDT_PROP_BOOTARGS, cmdline); - if (ret) - goto out; - } else { - ret = fdt_delprop(dtb, off, FDT_PROP_BOOTARGS); - if (ret && (ret != -FDT_ERR_NOTFOUND)) - goto out; - } - - /* add initrd-* */ - if (initrd_load_addr) { - ret = fdt_setprop_u64(dtb, off, FDT_PROP_INITRD_START, - initrd_load_addr); - if (ret) - goto out; - - ret = fdt_setprop_u64(dtb, off, FDT_PROP_INITRD_END, - initrd_load_addr + initrd_len); - if (ret) - goto out; - } else { - ret = fdt_delprop(dtb, off, FDT_PROP_INITRD_START); - if (ret && (ret != -FDT_ERR_NOTFOUND)) - goto out; - - ret = fdt_delprop(dtb, off, FDT_PROP_INITRD_END); - if (ret && (ret != -FDT_ERR_NOTFOUND)) - goto out; - } - - /* add kaslr-seed */ - ret = fdt_delprop(dtb, off, FDT_PROP_KASLR_SEED); - if (ret == -FDT_ERR_NOTFOUND) - ret = 0; - else if (ret) - goto out; - - if (rng_is_initialized()) { - u64 seed = get_random_u64(); - ret = fdt_setprop_u64(dtb, off, FDT_PROP_KASLR_SEED, seed); - if (ret) - goto out; - } else { - pr_notice("RNG is not initialised: omitting \"%s\" property\n", - FDT_PROP_KASLR_SEED); - } - - /* add rng-seed */ - if (rng_is_initialized()) { - void *rng_seed; - ret = fdt_setprop_placeholder(dtb, off, FDT_PROP_RNG_SEED, - RNG_SEED_SIZE, &rng_seed); - if (ret) - goto out; - get_random_bytes(rng_seed, RNG_SEED_SIZE); - } else { - pr_notice("RNG is not initialised: omitting \"%s\" property\n", - FDT_PROP_RNG_SEED); - } - -out: - if (ret) - return (ret == -FDT_ERR_NOSPACE) ? -ENOMEM : -EINVAL; - - return 0; -} - -/* - * More space needed so that we can add initrd, bootargs, kaslr-seed, - * rng-seed, userable-memory-range and elfcorehdr. - */ -#define DTB_EXTRA_SPACE 0x1000 - -static int create_dtb(struct kimage *image, - unsigned long initrd_load_addr, unsigned long initrd_len, - char *cmdline, void **dtb) -{ - void *buf; - size_t buf_size; - size_t cmdline_len; - int ret; - - cmdline_len = cmdline ? strlen(cmdline) : 0; - buf_size = fdt_totalsize(initial_boot_params) - + cmdline_len + DTB_EXTRA_SPACE; - - for (;;) { - buf = vmalloc(buf_size); - if (!buf) - return -ENOMEM; - - /* duplicate a device tree blob */ - ret = fdt_open_into(initial_boot_params, buf, buf_size); - if (ret) { - vfree(buf); - return -EINVAL; - } - - ret = setup_dtb(image, initrd_load_addr, initrd_len, - cmdline, buf); - if (ret) { - vfree(buf); - if (ret == -ENOMEM) { - /* unlikely, but just in case */ - buf_size += DTB_EXTRA_SPACE; - continue; - } else { - return ret; - } - } - - /* trim it */ - fdt_pack(buf); - *dtb = buf; - - return 0; - } -} - static int prepare_elf_headers(void **addr, unsigned long *sz) { struct crash_mem *cmem; @@ -284,12 +115,12 @@ int load_other_segments(struct kimage *image, vfree(headers); goto out_err; } - image->arch.elf_headers = headers; - image->arch.elf_headers_mem = kbuf.mem; - image->arch.elf_headers_sz = headers_sz; + image->elf_headers = headers; + image->elf_load_addr = kbuf.mem; + image->elf_headers_sz = headers_sz; pr_debug("Loaded elf core header at 0x%lx bufsz=0x%lx memsz=0x%lx\n", - image->arch.elf_headers_mem, kbuf.bufsz, kbuf.memsz); + image->elf_load_addr, kbuf.bufsz, kbuf.memsz); } /* load initrd */ @@ -314,12 +145,15 @@ int load_other_segments(struct kimage *image, } /* load dtb */ - ret = create_dtb(image, initrd_load_addr, initrd_len, cmdline, &dtb); - if (ret) { + dtb = of_kexec_alloc_and_setup_fdt(image, initrd_load_addr, + initrd_len, cmdline, 0); + if (!dtb) { pr_err("Preparing for new dtb failed\n"); goto out_err; } + /* trim it */ + fdt_pack(dtb); dtb_len = fdt_totalsize(dtb); kbuf.buffer = dtb; kbuf.bufsz = dtb_len; @@ -343,6 +177,6 @@ int load_other_segments(struct kimage *image, out_err: image->nr_segments = orig_segments; - vfree(dtb); + kvfree(dtb); return ret; } diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index fe21e0f06492..b5ec010c481f 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -40,14 +40,16 @@ void *module_alloc(unsigned long size) NUMA_NO_NODE, __builtin_return_address(0)); if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && - !IS_ENABLED(CONFIG_KASAN_GENERIC) && - !IS_ENABLED(CONFIG_KASAN_SW_TAGS)) + (IS_ENABLED(CONFIG_KASAN_VMALLOC) || + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + !IS_ENABLED(CONFIG_KASAN_SW_TAGS)))) /* - * KASAN can only deal with module allocations being served - * from the reserved module region, since the remainder of - * the vmalloc region is already backed by zero shadow pages, - * and punching holes into it is non-trivial. Since the module - * region is not randomized when KASAN is enabled, it is even + * KASAN without KASAN_VMALLOC can only deal with module + * allocations being served from the reserved module region, + * since the remainder of the vmalloc region is already + * backed by zero shadow pages, and punching holes into it + * is non-trivial. Since the module region is not randomized + * when KASAN is enabled without KASAN_VMALLOC, it is even * less likely that the module region gets exhausted, so we * can simply omit this fallback in that case. */ diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index b3c70a612c7a..125a10e413e9 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -26,6 +26,12 @@ u64 gcr_kernel_excl __ro_after_init; static bool report_fault_once = true; +#ifdef CONFIG_KASAN_HW_TAGS +/* Whether the MTE asynchronous mode is enabled. */ +DEFINE_STATIC_KEY_FALSE(mte_async_mode); +EXPORT_SYMBOL_GPL(mte_async_mode); +#endif + static void mte_sync_page_tags(struct page *page, pte_t *ptep, bool check_swap) { pte_t old_pte = READ_ONCE(*ptep); @@ -107,13 +113,45 @@ void mte_init_tags(u64 max_tag) write_sysreg_s(SYS_GCR_EL1_RRND | gcr_kernel_excl, SYS_GCR_EL1); } -void mte_enable_kernel(void) +static inline void __mte_enable_kernel(const char *mode, unsigned long tcf) { /* Enable MTE Sync Mode for EL1. */ - sysreg_clear_set(sctlr_el1, SCTLR_ELx_TCF_MASK, SCTLR_ELx_TCF_SYNC); + sysreg_clear_set(sctlr_el1, SCTLR_ELx_TCF_MASK, tcf); isb(); + + pr_info_once("MTE: enabled in %s mode at EL1\n", mode); +} + +#ifdef CONFIG_KASAN_HW_TAGS +void mte_enable_kernel_sync(void) +{ + /* + * Make sure we enter this function when no PE has set + * async mode previously. + */ + WARN_ONCE(system_uses_mte_async_mode(), + "MTE async mode enabled system wide!"); + + __mte_enable_kernel("synchronous", SCTLR_ELx_TCF_SYNC); } +void mte_enable_kernel_async(void) +{ + __mte_enable_kernel("asynchronous", SCTLR_ELx_TCF_ASYNC); + + /* + * MTE async mode is set system wide by the first PE that + * executes this function. + * + * Note: If in future KASAN acquires a runtime switching + * mode in between sync and async, this strategy needs + * to be reviewed. + */ + if (!system_uses_mte_async_mode()) + static_branch_enable(&mte_async_mode); +} +#endif + void mte_set_report_once(bool state) { WRITE_ONCE(report_fault_once, state); @@ -124,25 +162,28 @@ bool mte_report_once(void) return READ_ONCE(report_fault_once); } -static void update_sctlr_el1_tcf0(u64 tcf0) +#ifdef CONFIG_KASAN_HW_TAGS +void mte_check_tfsr_el1(void) { - /* ISB required for the kernel uaccess routines */ - sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF0_MASK, tcf0); - isb(); -} + u64 tfsr_el1; -static void set_sctlr_el1_tcf0(u64 tcf0) -{ - /* - * mte_thread_switch() checks current->thread.sctlr_tcf0 as an - * optimisation. Disable preemption so that it does not see - * the variable update before the SCTLR_EL1.TCF0 one. - */ - preempt_disable(); - current->thread.sctlr_tcf0 = tcf0; - update_sctlr_el1_tcf0(tcf0); - preempt_enable(); + if (!system_supports_mte()) + return; + + tfsr_el1 = read_sysreg_s(SYS_TFSR_EL1); + + if (unlikely(tfsr_el1 & SYS_TFSR_EL1_TF1)) { + /* + * Note: isb() is not required after this direct write + * because there is no indirect read subsequent to it + * (per ARM DDI 0487F.c table D13-1). + */ + write_sysreg_s(0, SYS_TFSR_EL1); + + kasan_report_async(); + } } +#endif static void update_gcr_el1_excl(u64 excl) { @@ -166,7 +207,7 @@ static void set_gcr_el1_excl(u64 excl) */ } -void flush_mte_state(void) +void mte_thread_init_user(void) { if (!system_supports_mte()) return; @@ -176,19 +217,39 @@ void flush_mte_state(void) write_sysreg_s(0, SYS_TFSRE0_EL1); clear_thread_flag(TIF_MTE_ASYNC_FAULT); /* disable tag checking */ - set_sctlr_el1_tcf0(SCTLR_EL1_TCF0_NONE); + set_task_sctlr_el1((current->thread.sctlr_user & ~SCTLR_EL1_TCF0_MASK) | + SCTLR_EL1_TCF0_NONE); /* reset tag generation mask */ set_gcr_el1_excl(SYS_GCR_EL1_EXCL_MASK); } void mte_thread_switch(struct task_struct *next) { + /* + * Check if an async tag exception occurred at EL1. + * + * Note: On the context switch path we rely on the dsb() present + * in __switch_to() to guarantee that the indirect writes to TFSR_EL1 + * are synchronized before this point. + */ + isb(); + mte_check_tfsr_el1(); +} + +void mte_suspend_enter(void) +{ if (!system_supports_mte()) return; - /* avoid expensive SCTLR_EL1 accesses if no change */ - if (current->thread.sctlr_tcf0 != next->thread.sctlr_tcf0) - update_sctlr_el1_tcf0(next->thread.sctlr_tcf0); + /* + * The barriers are required to guarantee that the indirect writes + * to TFSR_EL1 are synchronized before we report the state. + */ + dsb(nsh); + isb(); + + /* Report SYS_TFSR_EL1 before suspend entry */ + mte_check_tfsr_el1(); } void mte_suspend_exit(void) @@ -201,7 +262,7 @@ void mte_suspend_exit(void) long set_mte_ctrl(struct task_struct *task, unsigned long arg) { - u64 tcf0; + u64 sctlr = task->thread.sctlr_user & ~SCTLR_EL1_TCF0_MASK; u64 gcr_excl = ~((arg & PR_MTE_TAG_MASK) >> PR_MTE_TAG_SHIFT) & SYS_GCR_EL1_EXCL_MASK; @@ -210,23 +271,23 @@ long set_mte_ctrl(struct task_struct *task, unsigned long arg) switch (arg & PR_MTE_TCF_MASK) { case PR_MTE_TCF_NONE: - tcf0 = SCTLR_EL1_TCF0_NONE; + sctlr |= SCTLR_EL1_TCF0_NONE; break; case PR_MTE_TCF_SYNC: - tcf0 = SCTLR_EL1_TCF0_SYNC; + sctlr |= SCTLR_EL1_TCF0_SYNC; break; case PR_MTE_TCF_ASYNC: - tcf0 = SCTLR_EL1_TCF0_ASYNC; + sctlr |= SCTLR_EL1_TCF0_ASYNC; break; default: return -EINVAL; } if (task != current) { - task->thread.sctlr_tcf0 = tcf0; + task->thread.sctlr_user = sctlr; task->thread.gcr_user_excl = gcr_excl; } else { - set_sctlr_el1_tcf0(tcf0); + set_task_sctlr_el1(sctlr); set_gcr_el1_excl(gcr_excl); } @@ -243,7 +304,7 @@ long get_mte_ctrl(struct task_struct *task) ret = incl << PR_MTE_TAG_SHIFT; - switch (task->thread.sctlr_tcf0) { + switch (task->thread.sctlr_user & SCTLR_EL1_TCF0_MASK) { case SCTLR_EL1_TCF0_NONE: ret |= PR_MTE_TCF_NONE; break; diff --git a/arch/arm64/kernel/paravirt.c b/arch/arm64/kernel/paravirt.c index c07d7a034941..75fed4460407 100644 --- a/arch/arm64/kernel/paravirt.c +++ b/arch/arm64/kernel/paravirt.c @@ -18,6 +18,7 @@ #include <linux/reboot.h> #include <linux/slab.h> #include <linux/types.h> +#include <linux/static_call.h> #include <asm/paravirt.h> #include <asm/pvclock-abi.h> @@ -26,8 +27,12 @@ struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; -struct paravirt_patch_template pv_ops; -EXPORT_SYMBOL_GPL(pv_ops); +static u64 native_steal_clock(int cpu) +{ + return 0; +} + +DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); struct pv_time_stolen_time_region { struct pvclock_vcpu_stolen_time *kaddr; @@ -45,7 +50,7 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); /* return stolen time in ns by asking the hypervisor */ -static u64 pv_steal_clock(int cpu) +static u64 para_steal_clock(int cpu) { struct pv_time_stolen_time_region *reg; @@ -150,7 +155,7 @@ int __init pv_time_init(void) if (ret) return ret; - pv_ops.time.steal_clock = pv_steal_clock; + static_call_update(pv_steal_clock, para_steal_clock); static_key_slow_inc(¶virt_steal_enabled); if (steal_acc) diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 4658fcf88c2b..f594957e29bd 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -470,9 +470,8 @@ static inline u64 armv8pmu_read_evcntr(int idx) static inline u64 armv8pmu_read_hw_counter(struct perf_event *event) { int idx = event->hw.idx; - u64 val = 0; + u64 val = armv8pmu_read_evcntr(idx); - val = armv8pmu_read_evcntr(idx); if (armv8pmu_event_is_chained(event)) val = (val << 32) | armv8pmu_read_evcntr(idx - 1); return val; @@ -520,7 +519,7 @@ static u64 armv8pmu_read_counter(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - u64 value = 0; + u64 value; if (idx == ARMV8_IDX_CYCLE_COUNTER) value = read_sysreg(pmccntr_el0); diff --git a/arch/arm64/kernel/pointer_auth.c b/arch/arm64/kernel/pointer_auth.c index adb955fd9bdd..60901ab0a7fe 100644 --- a/arch/arm64/kernel/pointer_auth.c +++ b/arch/arm64/kernel/pointer_auth.c @@ -43,6 +43,69 @@ int ptrauth_prctl_reset_keys(struct task_struct *tsk, unsigned long arg) get_random_bytes(&keys->apdb, sizeof(keys->apdb)); if (arg & PR_PAC_APGAKEY) get_random_bytes(&keys->apga, sizeof(keys->apga)); + ptrauth_keys_install_user(keys); return 0; } + +static u64 arg_to_enxx_mask(unsigned long arg) +{ + u64 sctlr_enxx_mask = 0; + + WARN_ON(arg & ~PR_PAC_ENABLED_KEYS_MASK); + if (arg & PR_PAC_APIAKEY) + sctlr_enxx_mask |= SCTLR_ELx_ENIA; + if (arg & PR_PAC_APIBKEY) + sctlr_enxx_mask |= SCTLR_ELx_ENIB; + if (arg & PR_PAC_APDAKEY) + sctlr_enxx_mask |= SCTLR_ELx_ENDA; + if (arg & PR_PAC_APDBKEY) + sctlr_enxx_mask |= SCTLR_ELx_ENDB; + return sctlr_enxx_mask; +} + +int ptrauth_set_enabled_keys(struct task_struct *tsk, unsigned long keys, + unsigned long enabled) +{ + u64 sctlr = tsk->thread.sctlr_user; + + if (!system_supports_address_auth()) + return -EINVAL; + + if (is_compat_thread(task_thread_info(tsk))) + return -EINVAL; + + if ((keys & ~PR_PAC_ENABLED_KEYS_MASK) || (enabled & ~keys)) + return -EINVAL; + + sctlr &= ~arg_to_enxx_mask(keys); + sctlr |= arg_to_enxx_mask(enabled); + if (tsk == current) + set_task_sctlr_el1(sctlr); + else + tsk->thread.sctlr_user = sctlr; + + return 0; +} + +int ptrauth_get_enabled_keys(struct task_struct *tsk) +{ + int retval = 0; + + if (!system_supports_address_auth()) + return -EINVAL; + + if (is_compat_thread(task_thread_info(tsk))) + return -EINVAL; + + if (tsk->thread.sctlr_user & SCTLR_ELx_ENIA) + retval |= PR_PAC_APIAKEY; + if (tsk->thread.sctlr_user & SCTLR_ELx_ENIB) + retval |= PR_PAC_APIBKEY; + if (tsk->thread.sctlr_user & SCTLR_ELx_ENDA) + retval |= PR_PAC_APDAKEY; + if (tsk->thread.sctlr_user & SCTLR_ELx_ENDB) + retval |= PR_PAC_APDBKEY; + + return retval; +} diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 66aac2881ba8..d607c9912025 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -264,13 +264,14 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr) * normal page fault. */ instruction_pointer_set(regs, (unsigned long) cur->addr); - if (!instruction_pointer(regs)) - BUG(); + BUG_ON(!instruction_pointer(regs)); - if (kcb->kprobe_status == KPROBE_REENTER) + if (kcb->kprobe_status == KPROBE_REENTER) { restore_previous_kprobe(kcb); - else + } else { + kprobes_restore_local_irqflag(kcb, regs); reset_current_kprobe(); + } break; case KPROBE_HIT_ACTIVE: diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 6e60aa3b5ea9..cbf52109583b 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -86,7 +86,7 @@ static void noinstr __cpu_do_idle_irqprio(void) unsigned long daif_bits; daif_bits = read_sysreg(daif); - write_sysreg(daif_bits | PSR_I_BIT, daif); + write_sysreg(daif_bits | PSR_I_BIT | PSR_F_BIT, daif); /* * Unmask PMR before going idle to make sure interrupts can @@ -341,7 +341,6 @@ void flush_thread(void) tls_thread_flush(); flush_ptrace_hw_breakpoint(current); flush_tagged_addr_state(); - flush_mte_state(); } void release_thread(struct task_struct *dead_task) @@ -531,6 +530,31 @@ static void erratum_1418040_thread_switch(struct task_struct *prev, write_sysreg(val, cntkctl_el1); } +static void update_sctlr_el1(u64 sctlr) +{ + /* + * EnIA must not be cleared while in the kernel as this is necessary for + * in-kernel PAC. It will be cleared on kernel exit if needed. + */ + sysreg_clear_set(sctlr_el1, SCTLR_USER_MASK & ~SCTLR_ELx_ENIA, sctlr); + + /* ISB required for the kernel uaccess routines when setting TCF0. */ + isb(); +} + +void set_task_sctlr_el1(u64 sctlr) +{ + /* + * __switch_to() checks current->thread.sctlr as an + * optimisation. Disable preemption so that it does not see + * the variable update before the SCTLR_EL1 one. + */ + preempt_disable(); + current->thread.sctlr_user = sctlr; + update_sctlr_el1(sctlr); + preempt_enable(); +} + /* * Thread switching. */ @@ -546,6 +570,7 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, entry_task_switch(next); ssbs_thread_switch(next); erratum_1418040_thread_switch(prev, next); + ptrauth_thread_switch_user(next); /* * Complete any pending TLB or cache maintenance on this CPU in case @@ -561,6 +586,9 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, * registers. */ mte_thread_switch(next); + /* avoid expensive SCTLR_EL1 accesses if no change */ + if (prev->thread.sctlr_user != next->thread.sctlr_user) + update_sctlr_el1(next->thread.sctlr_user); /* the actual thread switch */ last = cpu_switch_to(prev, next); @@ -610,7 +638,8 @@ void arch_setup_new_exec(void) { current->mm->context.flags = is_compat_task() ? MMCF_AARCH32 : 0; - ptrauth_thread_init_user(current); + ptrauth_thread_init_user(); + mte_thread_init_user(); if (task_spec_ssb_noexec(current)) { arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS, diff --git a/arch/arm64/kernel/psci.c b/arch/arm64/kernel/psci.c index 62d2bda7adb8..ab7f4c476104 100644 --- a/arch/arm64/kernel/psci.c +++ b/arch/arm64/kernel/psci.c @@ -38,7 +38,8 @@ static int __init cpu_psci_cpu_prepare(unsigned int cpu) static int cpu_psci_cpu_boot(unsigned int cpu) { - int err = psci_ops.cpu_on(cpu_logical_map(cpu), __pa_symbol(secondary_entry)); + phys_addr_t pa_secondary_entry = __pa_symbol(function_nocfi(secondary_entry)); + int err = psci_ops.cpu_on(cpu_logical_map(cpu), pa_secondary_entry); if (err) pr_err("failed to boot CPU%d (%d)\n", cpu, err); diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 170f42fd6101..eb2f73939b7b 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -909,6 +909,38 @@ static int pac_mask_get(struct task_struct *target, return membuf_write(&to, &uregs, sizeof(uregs)); } +static int pac_enabled_keys_get(struct task_struct *target, + const struct user_regset *regset, + struct membuf to) +{ + long enabled_keys = ptrauth_get_enabled_keys(target); + + if (IS_ERR_VALUE(enabled_keys)) + return enabled_keys; + + return membuf_write(&to, &enabled_keys, sizeof(enabled_keys)); +} + +static int pac_enabled_keys_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + long enabled_keys = ptrauth_get_enabled_keys(target); + + if (IS_ERR_VALUE(enabled_keys)) + return enabled_keys; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &enabled_keys, 0, + sizeof(long)); + if (ret) + return ret; + + return ptrauth_set_enabled_keys(target, PR_PAC_ENABLED_KEYS_MASK, + enabled_keys); +} + #ifdef CONFIG_CHECKPOINT_RESTORE static __uint128_t pac_key_to_user(const struct ptrauth_key *key) { @@ -1074,6 +1106,7 @@ enum aarch64_regset { #endif #ifdef CONFIG_ARM64_PTR_AUTH REGSET_PAC_MASK, + REGSET_PAC_ENABLED_KEYS, #ifdef CONFIG_CHECKPOINT_RESTORE REGSET_PACA_KEYS, REGSET_PACG_KEYS, @@ -1160,6 +1193,14 @@ static const struct user_regset aarch64_regsets[] = { .regset_get = pac_mask_get, /* this cannot be set dynamically */ }, + [REGSET_PAC_ENABLED_KEYS] = { + .core_note_type = NT_ARM_PAC_ENABLED_KEYS, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .regset_get = pac_enabled_keys_get, + .set = pac_enabled_keys_set, + }, #ifdef CONFIG_CHECKPOINT_RESTORE [REGSET_PACA_KEYS] = { .core_note_type = NT_ARM_PACA_KEYS, diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 5bfd9b87f85d..4ea9392f86e0 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -134,7 +134,7 @@ SYM_FUNC_START(_cpu_resume) */ bl cpu_do_resume -#if defined(CONFIG_KASAN) && CONFIG_KASAN_STACK +#if defined(CONFIG_KASAN) && defined(CONFIG_KASAN_STACK) mov x0, sp bl kasan_unpoison_task_stack_below #endif diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 357590beaabb..dcd7041b2b07 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -188,6 +188,7 @@ static void init_gic_priority_masking(void) cpuflags = read_sysreg(daif); WARN_ON(!(cpuflags & PSR_I_BIT)); + WARN_ON(!(cpuflags & PSR_F_BIT)); gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); } diff --git a/arch/arm64/kernel/smp_spin_table.c b/arch/arm64/kernel/smp_spin_table.c index 056772c26098..c45a83512805 100644 --- a/arch/arm64/kernel/smp_spin_table.c +++ b/arch/arm64/kernel/smp_spin_table.c @@ -66,6 +66,7 @@ static int smp_spin_table_cpu_init(unsigned int cpu) static int smp_spin_table_cpu_prepare(unsigned int cpu) { __le64 __iomem *release_addr; + phys_addr_t pa_holding_pen = __pa_symbol(function_nocfi(secondary_holding_pen)); if (!cpu_release_addr[cpu]) return -ENODEV; @@ -88,7 +89,7 @@ static int smp_spin_table_cpu_prepare(unsigned int cpu) * boot-loader's endianness before jumping. This is mandated by * the boot protocol. */ - writeq_relaxed(__pa_symbol(secondary_holding_pen), release_addr); + writeq_relaxed(pa_holding_pen, release_addr); __flush_dcache_area((__force void *)release_addr, sizeof(*release_addr)); diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index d55bdfb7789c..84b676bcf867 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -32,6 +32,30 @@ * add sp, sp, #0x10 */ + +void start_backtrace(struct stackframe *frame, unsigned long fp, + unsigned long pc) +{ + frame->fp = fp; + frame->pc = pc; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + frame->graph = 0; +#endif + + /* + * Prime the first unwind. + * + * In unwind_frame() we'll check that the FP points to a valid stack, + * which can't be STACK_TYPE_UNKNOWN, and the first unwind will be + * treated as a transition to whichever stack that happens to be. The + * prev_fp value won't be used, but we set it to 0 such that it is + * definitely not an accessible stack address. + */ + bitmap_zero(frame->stacks_done, __NR_STACK_TYPES); + frame->prev_fp = 0; + frame->prev_type = STACK_TYPE_UNKNOWN; +} + /* * Unwind from one frame record (A) to the next frame record (B). * diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index d7564891ffe1..e3f72df9509d 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -74,8 +74,9 @@ void notrace __cpu_suspend_exit(void) */ spectre_v4_enable_mitigation(NULL); - /* Restore additional MTE-specific configuration */ + /* Restore additional feature-specific configuration */ mte_suspend_exit(); + ptrauth_suspend_exit(); } /* @@ -91,6 +92,9 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) unsigned long flags; struct sleep_stack_data state; + /* Report any MTE async fault before going to suspend */ + mte_suspend_enter(); + /* * From this point debug exceptions are disabled to prevent * updates to mdscr register (saved and restored along with diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index b9cf12b271d7..263d6c1a525f 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -5,6 +5,7 @@ #include <linux/errno.h> #include <linux/nospec.h> #include <linux/ptrace.h> +#include <linux/randomize_kstack.h> #include <linux/syscalls.h> #include <asm/daifflags.h> @@ -43,6 +44,8 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, { long ret; + add_random_kstack_offset(); + if (scno < sc_nr) { syscall_fn_t syscall_fn; syscall_fn = syscall_table[array_index_nospec(scno, sc_nr)]; @@ -55,6 +58,19 @@ static void invoke_syscall(struct pt_regs *regs, unsigned int scno, ret = lower_32_bits(ret); regs->regs[0] = ret; + + /* + * Ultimately, this value will get limited by KSTACK_OFFSET_MAX(), + * but not enough for arm64 stack utilization comfort. To keep + * reasonable stack head room, reduce the maximum offset to 9 bits. + * + * The actual entropy will be further reduced by the compiler when + * applying stack alignment constraints: the AAPCS mandates a + * 16-byte (i.e. 4-bit) aligned SP at function boundaries. + * + * The resulting 5 bits of entropy is seen in SP[8:4]. + */ + choose_random_kstack_offset(get_random_int() & 0x1FF); } static inline bool has_syscall_work(unsigned long flags) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index e08a4126453a..4dd14a6620c1 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -199,12 +199,47 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) return 0; } -static DEFINE_STATIC_KEY_FALSE(amu_fie_key); -#define amu_freq_invariant() static_branch_unlikely(&amu_fie_key) +static void amu_scale_freq_tick(void) +{ + u64 prev_core_cnt, prev_const_cnt; + u64 core_cnt, const_cnt, scale; + + prev_const_cnt = this_cpu_read(arch_const_cycles_prev); + prev_core_cnt = this_cpu_read(arch_core_cycles_prev); + + update_freq_counters_refs(); + + const_cnt = this_cpu_read(arch_const_cycles_prev); + core_cnt = this_cpu_read(arch_core_cycles_prev); + + if (unlikely(core_cnt <= prev_core_cnt || + const_cnt <= prev_const_cnt)) + return; + + /* + * /\core arch_max_freq_scale + * scale = ------- * -------------------- + * /\const SCHED_CAPACITY_SCALE + * + * See validate_cpu_freq_invariance_counters() for details on + * arch_max_freq_scale and the use of SCHED_CAPACITY_SHIFT. + */ + scale = core_cnt - prev_core_cnt; + scale *= this_cpu_read(arch_max_freq_scale); + scale = div64_u64(scale >> SCHED_CAPACITY_SHIFT, + const_cnt - prev_const_cnt); + + scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); + this_cpu_write(arch_freq_scale, (unsigned long)scale); +} + +static struct scale_freq_data amu_sfd = { + .source = SCALE_FREQ_SOURCE_ARCH, + .set_freq_scale = amu_scale_freq_tick, +}; static void amu_fie_setup(const struct cpumask *cpus) { - bool invariant; int cpu; /* We are already set since the last insmod of cpufreq driver */ @@ -221,25 +256,10 @@ static void amu_fie_setup(const struct cpumask *cpus) cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); - invariant = topology_scale_freq_invariant(); - - /* We aren't fully invariant yet */ - if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask)) - return; - - static_branch_enable(&amu_fie_key); + topology_set_scale_freq_source(&amu_sfd, amu_fie_cpus); pr_debug("CPUs[%*pbl]: counters will be used for FIE.", cpumask_pr_args(cpus)); - - /* - * Task scheduler behavior depends on frequency invariance support, - * either cpufreq or counter driven. If the support status changes as - * a result of counter initialisation and use, retrigger the build of - * scheduling domains to ensure the information is propagated properly. - */ - if (!invariant) - rebuild_sched_domains_energy(); } static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val, @@ -256,8 +276,8 @@ static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val, * initialized AMU support and enabled invariance. The AMU counters will * keep on working just fine in the absence of the cpufreq driver, and * for the CPUs for which there are no counters available, the last set - * value of freq_scale will remain valid as that is the frequency those - * CPUs are running at. + * value of arch_freq_scale will remain valid as that is the frequency + * those CPUs are running at. */ return 0; @@ -283,53 +303,6 @@ static int __init init_amu_fie(void) } core_initcall(init_amu_fie); -bool arch_freq_counters_available(const struct cpumask *cpus) -{ - return amu_freq_invariant() && - cpumask_subset(cpus, amu_fie_cpus); -} - -void topology_scale_freq_tick(void) -{ - u64 prev_core_cnt, prev_const_cnt; - u64 core_cnt, const_cnt, scale; - int cpu = smp_processor_id(); - - if (!amu_freq_invariant()) - return; - - if (!cpumask_test_cpu(cpu, amu_fie_cpus)) - return; - - prev_const_cnt = this_cpu_read(arch_const_cycles_prev); - prev_core_cnt = this_cpu_read(arch_core_cycles_prev); - - update_freq_counters_refs(); - - const_cnt = this_cpu_read(arch_const_cycles_prev); - core_cnt = this_cpu_read(arch_core_cycles_prev); - - if (unlikely(core_cnt <= prev_core_cnt || - const_cnt <= prev_const_cnt)) - return; - - /* - * /\core arch_max_freq_scale - * scale = ------- * -------------------- - * /\const SCHED_CAPACITY_SCALE - * - * See validate_cpu_freq_invariance_counters() for details on - * arch_max_freq_scale and the use of SCHED_CAPACITY_SHIFT. - */ - scale = core_cnt - prev_core_cnt; - scale *= this_cpu_read(arch_max_freq_scale); - scale = div64_u64(scale >> SCHED_CAPACITY_SHIFT, - const_cnt - prev_const_cnt); - - scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); - this_cpu_write(freq_scale, (unsigned long)scale); -} - #ifdef CONFIG_ACPI_CPPC_LIB #include <acpi/cppc_acpi.h> diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index cee5d04ea9ad..a61fc4f989b3 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -86,7 +86,7 @@ static int vdso_mremap(const struct vm_special_mapping *sm, return 0; } -static int __vdso_init(enum vdso_abi abi) +static int __init __vdso_init(enum vdso_abi abi) { int i; struct page **vdso_pagelist; @@ -271,6 +271,14 @@ enum aarch32_map { static struct page *aarch32_vectors_page __ro_after_init; static struct page *aarch32_sig_page __ro_after_init; +static int aarch32_sigpage_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + current->mm->context.sigpage = (void *)new_vma->vm_start; + + return 0; +} + static struct vm_special_mapping aarch32_vdso_maps[] = { [AA32_MAP_VECTORS] = { .name = "[vectors]", /* ABI */ @@ -279,6 +287,7 @@ static struct vm_special_mapping aarch32_vdso_maps[] = { [AA32_MAP_SIGPAGE] = { .name = "[sigpage]", /* ABI */ .pages = &aarch32_sig_page, + .mremap = aarch32_sigpage_mremap, }, [AA32_MAP_VVAR] = { .name = "[vvar]", @@ -299,34 +308,35 @@ static int aarch32_alloc_kuser_vdso_page(void) if (!IS_ENABLED(CONFIG_KUSER_HELPERS)) return 0; - vdso_page = get_zeroed_page(GFP_ATOMIC); + vdso_page = get_zeroed_page(GFP_KERNEL); if (!vdso_page) return -ENOMEM; memcpy((void *)(vdso_page + 0x1000 - kuser_sz), __kuser_helper_start, kuser_sz); aarch32_vectors_page = virt_to_page(vdso_page); - flush_dcache_page(aarch32_vectors_page); return 0; } +#define COMPAT_SIGPAGE_POISON_WORD 0xe7fddef1 static int aarch32_alloc_sigpage(void) { extern char __aarch32_sigret_code_start[], __aarch32_sigret_code_end[]; int sigret_sz = __aarch32_sigret_code_end - __aarch32_sigret_code_start; - unsigned long sigpage; + __le32 poison = cpu_to_le32(COMPAT_SIGPAGE_POISON_WORD); + void *sigpage; - sigpage = get_zeroed_page(GFP_ATOMIC); + sigpage = (void *)__get_free_page(GFP_KERNEL); if (!sigpage) return -ENOMEM; - memcpy((void *)sigpage, __aarch32_sigret_code_start, sigret_sz); + memset32(sigpage, (__force u32)poison, PAGE_SIZE / sizeof(poison)); + memcpy(sigpage, __aarch32_sigret_code_start, sigret_sz); aarch32_sig_page = virt_to_page(sigpage); - flush_dcache_page(aarch32_sig_page); return 0; } -static int __aarch32_alloc_vdso_pages(void) +static int __init __aarch32_alloc_vdso_pages(void) { if (!IS_ENABLED(CONFIG_COMPAT_VDSO)) |