From 5844cc25fd121074de7895181a2fa1ce100a0fdd Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 26 Nov 2020 20:25:27 +1000 Subject: powerpc/64s: Fix hash ISA v3.0 TLBIEL instruction generation A typo has the R field of the instruction assigned by lucky dip a la register allocator. Fixes: d4748276ae14c ("powerpc/64s: Improve local TLB flush for boot and MCE on POWER9") Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201126102530.691335-2-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_native.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 0203cdf48c54..97fa42d7027e 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -68,7 +68,7 @@ static __always_inline void tlbiel_hash_set_isa300(unsigned int set, unsigned in rs = ((unsigned long)pid << PPC_BITLSHIFT(31)); asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) - : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "r"(r) + : : "r"(rb), "r"(rs), "i"(ric), "i"(prs), "i"(r) : "memory"); } -- cgit v1.2.3 From c0b27c517acf8a2b359dd373a7e7e88b01a8308e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 26 Nov 2020 20:25:28 +1000 Subject: powerpc/64s/pseries: Fix hash tlbiel_all_isa300 for guest kernels tlbiel_all() can not be usable in !HVMODE when running hash presently, remove HV privileged flushes when running in guest to make it usable. Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201126102530.691335-3-npiggin@gmail.com --- arch/powerpc/mm/book3s64/hash_native.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 97fa42d7027e..52e170bd95ae 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -92,16 +92,15 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) asm volatile("ptesync": : :"memory"); /* - * Flush the first set of the TLB, and any caching of partition table - * entries. Then flush the remaining sets of the TLB. Hash mode uses - * partition scoped TLB translations. + * Flush the partition table cache if this is HV mode. */ - tlbiel_hash_set_isa300(0, is, 0, 2, 0); - for (set = 1; set < num_sets; set++) - tlbiel_hash_set_isa300(set, is, 0, 0, 0); + if (early_cpu_has_feature(CPU_FTR_HVMODE)) + tlbiel_hash_set_isa300(0, is, 0, 2, 0); /* - * Now invalidate the process table cache. + * Now invalidate the process table cache. UPRT=0 HPT modes (what + * current hardware implements) do not use the process table, but + * add the flushes anyway. * * From ISA v3.0B p. 1078: * The following forms are invalid. @@ -110,6 +109,14 @@ static void tlbiel_all_isa300(unsigned int num_sets, unsigned int is) */ tlbiel_hash_set_isa300(0, is, 0, 2, 1); + /* + * Then flush the sets of the TLB proper. Hash mode uses + * partition scoped TLB translations, which may be flushed + * in !HV mode. 
+ */ + for (set = 0; set < num_sets; set++) + tlbiel_hash_set_isa300(set, is, 0, 0, 0); + ppc_after_tlbiel_barrier(); asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT "; isync" : : :"memory"); -- cgit v1.2.3 From 01b0f0eae0812e80efeee4ee17687e5386335e08 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 26 Nov 2020 20:25:30 +1000 Subject: powerpc/64s: Trim offlined CPUs from mm_cpumasks When offlining a CPU, powerpc/64s does not flush TLBs, rather it just leaves the CPU set in mm_cpumasks, so it continues to receive TLBIEs to manage its TLBs. However the exit_flush_lazy_tlbs() function expects that after returning, all CPUs (except self) have flushed TLBs for that mm, in which case TLBIEL can be used for this flush. This breaks for offline CPUs because they don't get the IPI to flush their TLB. This can lead to stale translations. Fix this by clearing the CPU from mm_cpumasks, then flushing all TLBs before going offline. These offlined CPU bits stuck in the cpumask also prevents the cpumask from being trimmed back to local mode, which means continual broadcast IPIs or TLBIEs are needed for TLB flushing. This patch prevents that situation too. A cast of many were involved in working this out, but in particular Milton, Aneesh, Paul made key discoveries. Fixes: 0cef77c7798a7 ("powerpc/64s/radix: flush remote CPUs out of single-threaded mm_cpumask") Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Debugged-by: Milton Miller Debugged-by: Aneesh Kumar K.V Debugged-by: Paul Mackerras Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201126102530.691335-5-npiggin@gmail.com --- arch/powerpc/include/asm/book3s/64/mmu.h | 12 ++++++++++++ arch/powerpc/mm/book3s64/mmu_context.c | 20 ++++++++++++++++++++ arch/powerpc/platforms/powermac/smp.c | 2 ++ arch/powerpc/platforms/powernv/smp.c | 3 +++ arch/powerpc/platforms/pseries/hotplug-cpu.c | 3 +++ 5 files changed, 40 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index e0b52940e43c..750918451dd2 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -242,6 +242,18 @@ extern void radix_init_pseries(void); static inline void radix_init_pseries(void) { }; #endif +#ifdef CONFIG_HOTPLUG_CPU +#define arch_clear_mm_cpumask_cpu(cpu, mm) \ + do { \ + if (cpumask_test_cpu(cpu, mm_cpumask(mm))) { \ + atomic_dec(&(mm)->context.active_cpus); \ + cpumask_clear_cpu(cpu, mm_cpumask(mm)); \ + } \ + } while (0) + +void cleanup_cpu_mmu_context(void); +#endif + static inline int get_user_context(mm_context_t *ctx, unsigned long ea) { int index = ea >> MAX_EA_BITS_PER_CONTEXT; diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index 1c54821de7bf..0c8557220ae2 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -307,3 +308,22 @@ void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) isync(); } #endif + +/** + * cleanup_cpu_mmu_context - Clean up MMU details for this CPU (newly offlined) + * + * This clears the CPU from mm_cpumask for all processes, and then flushes the + * local TLB to ensure TLB coherency in case the CPU is onlined again. + * + * KVM guest translations are not necessarily flushed here. If KVM started + * using mm_cpumask or the Linux APIs which do, this would have to be resolved. 
+ */ +#ifdef CONFIG_HOTPLUG_CPU +void cleanup_cpu_mmu_context(void) +{ + int cpu = smp_processor_id(); + + clear_tasks_mm_cpumask(cpu); + tlbiel_all(); +} +#endif diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index 74ebe664b016..adae2a6712e1 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -911,6 +911,8 @@ static int smp_core99_cpu_disable(void) mpic_cpu_set_priority(0xf); + cleanup_cpu_mmu_context(); + return 0; } diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 54c4ba45c7ce..cbb67813cd5d 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -143,6 +143,9 @@ static int pnv_smp_cpu_disable(void) xive_smp_disable_cpu(); else xics_migrate_irqs_away(); + + cleanup_cpu_mmu_context(); + return 0; } diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index f2837e33bf5d..a02012f1b04a 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -90,6 +90,9 @@ static int pseries_cpu_disable(void) xive_smp_disable_cpu(); else xics_migrate_irqs_away(); + + cleanup_cpu_mmu_context(); + return 0; } -- cgit v1.2.3 From 10f78fd0dabbc3856ddd67b09a46abdedb045913 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Fri, 27 Nov 2020 11:07:38 +0530 Subject: powerpc/numa: Fix a regression on memoryless node 0 Commit e75130f20b1f ("powerpc/numa: Offline memoryless cpuless node 0") offlines node 0 and expects nodes to be subsequently onlined when CPUs or nodes are detected. Commit 6398eaa26816 ("powerpc/numa: Prefer node id queried from vphn") skips onlining node 0 when CPUs are associated with node 0. On systems with node 0 having CPUs but no memory, this causes node 0 be marked offline. This causes issues at boot time when trying to set memory node for online CPUs while building the zonelist. 
0:mon> t [link register ] c000000000400354 __build_all_zonelists+0x164/0x280 [c00000000161bda0] c0000000016533c8 node_states+0x20/0xa0 (unreliable) [c00000000161bdc0] c000000000400384 __build_all_zonelists+0x194/0x280 [c00000000161be30] c000000001041800 build_all_zonelists_init+0x4c/0x118 [c00000000161be80] c0000000004020d0 build_all_zonelists+0x190/0x1b0 [c00000000161bef0] c000000001003cf8 start_kernel+0x18c/0x6a8 [c00000000161bf90] c00000000000adb4 start_here_common+0x1c/0x3e8 0:mon> r R00 = c000000000400354 R16 = 000000000b57a0e8 R01 = c00000000161bda0 R17 = 000000000b57a6b0 R02 = c00000000161ce00 R18 = 000000000b5afee8 R03 = 0000000000000000 R19 = 000000000b6448a0 R04 = 0000000000000000 R20 = fffffffffffffffd R05 = 0000000000000000 R21 = 0000000001400000 R06 = 0000000000000000 R22 = 000000001ec00000 R07 = 0000000000000001 R23 = c000000001175580 R08 = 0000000000000000 R24 = c000000001651ed8 R09 = c0000000017e84d8 R25 = c000000001652480 R10 = 0000000000000000 R26 = c000000001175584 R11 = c000000c7fac0d10 R27 = c0000000019568d0 R12 = c000000000400180 R28 = 0000000000000000 R13 = c000000002200000 R29 = c00000000164dd78 R14 = 000000000b579f78 R30 = 0000000000000000 R15 = 000000000b57a2b8 R31 = c000000001175584 pc = c000000000400194 local_memory_node+0x24/0x80 cfar= c000000000074334 mcount+0xc/0x10 lr = c000000000400354 __build_all_zonelists+0x164/0x280 msr = 8000000002001033 cr = 44002284 ctr = c000000000400180 xer = 0000000000000001 trap = 380 dar = 0000000000001388 dsisr = c00000000161bc90 0:mon> Fix this by setting node to be online while onlining CPUs that belong to node 0. Fixes: e75130f20b1f ("powerpc/numa: Offline memoryless cpuless node 0") Fixes: 6398eaa26816 ("powerpc/numa: Prefer node id queried from vphn") Reported-by: Milan Mohanty Signed-off-by: Srikar Dronamraju Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201127053738.10085-1-srikar@linux.vnet.ibm.com --- arch/powerpc/mm/numa.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 63f61d8b55e5..f2bf98bdcea2 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -742,8 +742,7 @@ static int __init parse_numa_properties(void) of_node_put(cpu); } - if (likely(nid > 0)) - node_set_online(nid); + node_set_online(nid); } get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells); -- cgit v1.2.3 From 9ea69a55b3b9a71cded9726af591949c1138f235 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Thu, 26 Nov 2020 09:28:52 +0100 Subject: powerpc/pseries: Pass MSI affinity to irq_create_mapping() With virtio multiqueue, normally each queue IRQ is mapped to a CPU. Commit 0d9f0a52c8b9f ("virtio_scsi: use virtio IRQ affinity") exposed an existing shortcoming of the arch code by moving virtio_scsi to the automatic IRQ affinity assignment. The affinity is correctly computed in msi_desc but this is not applied to the system IRQs. It appears the affinity is correctly passed to rtas_setup_msi_irqs() but lost at this point and never passed to irq_domain_alloc_descs() (see commit 06ee6d571f0e ("genirq: Add affinity hint to irq allocation")) because irq_create_mapping() doesn't take an affinity parameter. Use the new irq_create_mapping_affinity() function, which allows to forward the affinity setting from rtas_setup_msi_irqs() to irq_domain_alloc_descs(). With this change, the virtqueues are correctly dispatched between the CPUs on pseries. 
Fixes: e75eafb9b039 ("genirq/msi: Switch to new irq spreading infrastructure") Signed-off-by: Laurent Vivier Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kurz Acked-by: Michael Ellerman Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201126082852.1178497-3-lvivier@redhat.com --- arch/powerpc/platforms/pseries/msi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c index 133f6adcb39c..b3ac2455faad 100644 --- a/arch/powerpc/platforms/pseries/msi.c +++ b/arch/powerpc/platforms/pseries/msi.c @@ -458,7 +458,8 @@ again: return hwirq; } - virq = irq_create_mapping(NULL, hwirq); + virq = irq_create_mapping_affinity(NULL, hwirq, + entry->affinity); if (!virq) { pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq); -- cgit v1.2.3 From ca1314d73eed493c49bb1932c60a8605530db2e4 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 30 Nov 2020 11:59:40 +0000 Subject: arm64: syscall: exit userspace before unmasking exceptions In el0_svc_common() we unmask exceptions before we call user_exit(), and so there's a window where an IRQ or debug exception can be taken while RCU is not watching. In do_debug_exception() we account for this in via debug_exception_{enter,exit}(), but in the el1_irq asm we do not and we call trace functions which rely on RCU before we have a guarantee that RCU is watching. Let's avoid this by having el0_svc_common() exit userspace before unmasking exceptions, matching what we do for all other EL0 entry paths. We can use user_exit_irqoff() to avoid the pointless save/restore of IRQ flags while we're sure exceptions are masked in DAIF. The workaround for Cortex-A76 erratum 1463225 may trigger a debug exception before this point, but the debug code invoked in this case is safe even when RCU is not watching. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201130115950.22492-2-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/syscall.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index e4c0dadf0d92..13fe79f8e2db 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -120,8 +120,8 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, */ cortex_a76_erratum_1463225_svc_handler(); + user_exit_irqoff(); local_daif_restore(DAIF_PROCCTX); - user_exit(); if (system_supports_mte() && (flags & _TIF_MTE_ASYNC_FAULT)) { /* -- cgit v1.2.3 From 114e0a684753516ef4b71ccb55a8ebcfa8735edb Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 30 Nov 2020 11:59:41 +0000 Subject: arm64: mark idle code as noinstr Core code disables RCU when calling arch_cpu_idle(), so it's not safe for arch_cpu_idle() or its calees to be instrumented, as the instrumentation callbacks may attempt to use RCU or other features which are unsafe to use in this context. Mark them noinstr to prevent issues. The use of local_irq_enable() in arch_cpu_idle() is similarly problematic, and the "sched/idle: Fix arch_cpu_idle() vs tracing" patch queued in the tip tree addresses that case. 
Reported-by: Marco Elver Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201130115950.22492-3-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/process.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index a47a40ec6ad9..0c65a91f8f53 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -72,13 +72,13 @@ EXPORT_SYMBOL_GPL(pm_power_off); void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); -static void __cpu_do_idle(void) +static void noinstr __cpu_do_idle(void) { dsb(sy); wfi(); } -static void __cpu_do_idle_irqprio(void) +static void noinstr __cpu_do_idle_irqprio(void) { unsigned long pmr; unsigned long daif_bits; @@ -108,7 +108,7 @@ static void __cpu_do_idle_irqprio(void) * ensure that interrupts are not masked at the PMR (because the core will * not wake up if we block the wake up signal in the interrupt controller). */ -void cpu_do_idle(void) +void noinstr cpu_do_idle(void) { if (system_uses_irq_prio_masking()) __cpu_do_idle_irqprio(); @@ -119,7 +119,7 @@ void cpu_do_idle(void) /* * This is our default idle handler. */ -void arch_cpu_idle(void) +void noinstr arch_cpu_idle(void) { /* * This should do all the clock switching and wait for interrupt -- cgit v1.2.3 From da192676483232a0a9478c89cdddd412e5167470 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 30 Nov 2020 11:59:42 +0000 Subject: arm64: entry: mark entry code as noinstr Functions in entry-common.c are marked as notrace and NOKPROBE_SYMBOL(), but they're still subject to other instrumentation which may rely on lockdep/rcu/context-tracking being up-to-date, and may cause nested exceptions (e.g. for WARN/BUG or KASAN's use of BRK) which will corrupt exceptions registers which have not yet been read. Prevent this by marking all functions in entry-common.c as noinstr to prevent compiler instrumentation. This also blacklists the functions for tracing and kprobes, so we don't need to handle that separately. Functions elsewhere will be dealt with in subsequent patches. 
Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201130115950.22492-4-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry-common.c | 75 ++++++++++++++-------------------------- 1 file changed, 25 insertions(+), 50 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 43d4c329775f..75e99161f79e 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -17,7 +17,7 @@ #include #include -static void notrace el1_abort(struct pt_regs *regs, unsigned long esr) +static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -25,32 +25,28 @@ static void notrace el1_abort(struct pt_regs *regs, unsigned long esr) far = untagged_addr(far); do_mem_abort(far, esr, regs); } -NOKPROBE_SYMBOL(el1_abort); -static void notrace el1_pc(struct pt_regs *regs, unsigned long esr) +static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); local_daif_inherit(regs); do_sp_pc_abort(far, esr, regs); } -NOKPROBE_SYMBOL(el1_pc); -static void notrace el1_undef(struct pt_regs *regs) +static void noinstr el1_undef(struct pt_regs *regs) { local_daif_inherit(regs); do_undefinstr(regs); } -NOKPROBE_SYMBOL(el1_undef); -static void notrace el1_inv(struct pt_regs *regs, unsigned long esr) +static void noinstr el1_inv(struct pt_regs *regs, unsigned long esr) { local_daif_inherit(regs); bad_mode(regs, 0, esr); } -NOKPROBE_SYMBOL(el1_inv); -static void notrace el1_dbg(struct pt_regs *regs, unsigned long esr) +static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -64,16 +60,14 @@ static void notrace el1_dbg(struct pt_regs *regs, unsigned long esr) do_debug_exception(far, esr, regs); } -NOKPROBE_SYMBOL(el1_dbg); -static void notrace el1_fpac(struct pt_regs *regs, unsigned long esr) +static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) { local_daif_inherit(regs); do_ptrauth_fault(regs, esr); } -NOKPROBE_SYMBOL(el1_fpac); -asmlinkage void notrace el1_sync_handler(struct pt_regs *regs) +asmlinkage void noinstr el1_sync_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); @@ -106,9 +100,8 @@ asmlinkage void notrace el1_sync_handler(struct pt_regs *regs) el1_inv(regs, esr); } } -NOKPROBE_SYMBOL(el1_sync_handler); -static void notrace el0_da(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_da(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -117,9 +110,8 @@ static void notrace el0_da(struct pt_regs *regs, unsigned long esr) far = untagged_addr(far); do_mem_abort(far, esr, regs); } -NOKPROBE_SYMBOL(el0_da); -static void notrace el0_ia(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -135,41 +127,36 @@ static void notrace el0_ia(struct pt_regs *regs, unsigned long esr) local_daif_restore(DAIF_PROCCTX); do_mem_abort(far, esr, regs); } -NOKPROBE_SYMBOL(el0_ia); -static void notrace el0_fpsimd_acc(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_fpsimd_acc(struct pt_regs *regs, unsigned long esr) { user_exit_irqoff(); local_daif_restore(DAIF_PROCCTX); do_fpsimd_acc(esr, regs); } -NOKPROBE_SYMBOL(el0_fpsimd_acc); -static void notrace el0_sve_acc(struct pt_regs *regs, unsigned long 
esr) +static void noinstr el0_sve_acc(struct pt_regs *regs, unsigned long esr) { user_exit_irqoff(); local_daif_restore(DAIF_PROCCTX); do_sve_acc(esr, regs); } -NOKPROBE_SYMBOL(el0_sve_acc); -static void notrace el0_fpsimd_exc(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_fpsimd_exc(struct pt_regs *regs, unsigned long esr) { user_exit_irqoff(); local_daif_restore(DAIF_PROCCTX); do_fpsimd_exc(esr, regs); } -NOKPROBE_SYMBOL(el0_fpsimd_exc); -static void notrace el0_sys(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_sys(struct pt_regs *regs, unsigned long esr) { user_exit_irqoff(); local_daif_restore(DAIF_PROCCTX); do_sysinstr(esr, regs); } -NOKPROBE_SYMBOL(el0_sys); -static void notrace el0_pc(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -180,41 +167,36 @@ static void notrace el0_pc(struct pt_regs *regs, unsigned long esr) local_daif_restore(DAIF_PROCCTX); do_sp_pc_abort(far, esr, regs); } -NOKPROBE_SYMBOL(el0_pc); -static void notrace el0_sp(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_sp(struct pt_regs *regs, unsigned long esr) { user_exit_irqoff(); local_daif_restore(DAIF_PROCCTX); do_sp_pc_abort(regs->sp, esr, regs); } -NOKPROBE_SYMBOL(el0_sp); -static void notrace el0_undef(struct pt_regs *regs) +static void noinstr el0_undef(struct pt_regs *regs) { user_exit_irqoff(); local_daif_restore(DAIF_PROCCTX); do_undefinstr(regs); } -NOKPROBE_SYMBOL(el0_undef); -static void notrace el0_bti(struct pt_regs *regs) +static void noinstr el0_bti(struct pt_regs *regs) { user_exit_irqoff(); local_daif_restore(DAIF_PROCCTX); do_bti(regs); } -NOKPROBE_SYMBOL(el0_bti); -static void notrace el0_inv(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr) { user_exit_irqoff(); local_daif_restore(DAIF_PROCCTX); bad_el0_sync(regs, 0, esr); } -NOKPROBE_SYMBOL(el0_inv); -static void notrace el0_dbg(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_dbg(struct pt_regs *regs, unsigned long esr) { /* Only watchpoints write FAR_EL1, otherwise its UNKNOWN */ unsigned long far = read_sysreg(far_el1); @@ -226,26 +208,23 @@ static void notrace el0_dbg(struct pt_regs *regs, unsigned long esr) do_debug_exception(far, esr, regs); local_daif_restore(DAIF_PROCCTX_NOIRQ); } -NOKPROBE_SYMBOL(el0_dbg); -static void notrace el0_svc(struct pt_regs *regs) +static void noinstr el0_svc(struct pt_regs *regs) { if (system_uses_irq_prio_masking()) gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); do_el0_svc(regs); } -NOKPROBE_SYMBOL(el0_svc); -static void notrace el0_fpac(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr) { user_exit_irqoff(); local_daif_restore(DAIF_PROCCTX); do_ptrauth_fault(regs, esr); } -NOKPROBE_SYMBOL(el0_fpac); -asmlinkage void notrace el0_sync_handler(struct pt_regs *regs) +asmlinkage void noinstr el0_sync_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); @@ -297,27 +276,24 @@ asmlinkage void notrace el0_sync_handler(struct pt_regs *regs) el0_inv(regs, esr); } } -NOKPROBE_SYMBOL(el0_sync_handler); #ifdef CONFIG_COMPAT -static void notrace el0_cp15(struct pt_regs *regs, unsigned long esr) +static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr) { user_exit_irqoff(); local_daif_restore(DAIF_PROCCTX); do_cp15instr(esr, regs); } -NOKPROBE_SYMBOL(el0_cp15); -static void notrace 
el0_svc_compat(struct pt_regs *regs) +static void noinstr el0_svc_compat(struct pt_regs *regs) { if (system_uses_irq_prio_masking()) gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); do_el0_svc_compat(regs); } -NOKPROBE_SYMBOL(el0_svc_compat); -asmlinkage void notrace el0_sync_compat_handler(struct pt_regs *regs) +asmlinkage void noinstr el0_sync_compat_handler(struct pt_regs *regs) { unsigned long esr = read_sysreg(esr_el1); @@ -360,5 +336,4 @@ asmlinkage void notrace el0_sync_compat_handler(struct pt_regs *regs) el0_inv(regs, esr); } } -NOKPROBE_SYMBOL(el0_sync_compat_handler); #endif /* CONFIG_COMPAT */ -- cgit v1.2.3 From 2f911d494f3f028bbe6346e383a354225682cf1b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 30 Nov 2020 11:59:43 +0000 Subject: arm64: entry: move enter_from_user_mode to entry-common.c In later patches we'll want to extend enter_from_user_mode() and add a corresponding exit_to_user_mode(). As these will be common for all entries/exits from userspace, it'd be better for these to live in entry-common.c with the rest of the entry logic. This patch moves enter_from_user_mode() into entry-common.c. As with other functions in entry-common.c it is marked as noinstr (which prevents all instrumentation, tracing, and kprobes) but there are no other functional changes. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201130115950.22492-5-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry-common.c | 6 ++++++ arch/arm64/kernel/traps.c | 7 ------- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 75e99161f79e..9a685e7686fe 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -101,6 +101,12 @@ asmlinkage void noinstr el1_sync_handler(struct pt_regs *regs) } } +asmlinkage void noinstr enter_from_user_mode(void) +{ + CT_WARN_ON(ct_state() != CONTEXT_USER); + user_exit_irqoff(); +} + static void noinstr el0_da(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 8af4e0e85736..580c60afc39a 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -876,13 +876,6 @@ asmlinkage void do_serror(struct pt_regs *regs, unsigned int esr) nmi_exit(); } -asmlinkage void enter_from_user_mode(void) -{ - CT_WARN_ON(ct_state() != CONTEXT_USER); - user_exit_irqoff(); -} -NOKPROBE_SYMBOL(enter_from_user_mode); - /* GENERIC_BUG traps */ int is_valid_bugaddr(unsigned long addr) -- cgit v1.2.3 From 3cb5ed4d76c15fb97c10e5e9f5268d92c68222ca Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 30 Nov 2020 11:59:44 +0000 Subject: arm64: entry: prepare ret_to_user for function call In a subsequent patch ret_to_user will need to make a C function call (in some configurations) which may clobber x0-x18 at the start of the finish_ret_to_user block, before enable_step_tsk consumes the flags loaded into x1. In preparation for this, let's load the flags into x19, which is preserved across C function calls. This avoids a redundant reload of the flags and ensures we operate on a consistent shapshot regardless. There should be no functional change as a result of this patch. At this point of the entry/exit paths we only need to preserve x28 (tsk) and the sp, and x19 is free for this use. 
Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201130115950.22492-6-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry.S | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index b295fb912b12..84aec600eeed 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -774,13 +774,13 @@ SYM_CODE_END(el0_error) SYM_CODE_START_LOCAL(ret_to_user) disable_daif gic_prio_kentry_setup tmp=x3 - ldr x1, [tsk, #TSK_TI_FLAGS] - and x2, x1, #_TIF_WORK_MASK + ldr x19, [tsk, #TSK_TI_FLAGS] + and x2, x19, #_TIF_WORK_MASK cbnz x2, work_pending finish_ret_to_user: /* Ignore asynchronous tag check faults in the uaccess routines */ clear_mte_async_tcf - enable_step_tsk x1, x2 + enable_step_tsk x19, x2 #ifdef CONFIG_GCC_PLUGIN_STACKLEAK bl stackleak_erase #endif @@ -791,11 +791,12 @@ finish_ret_to_user: */ work_pending: mov x0, sp // 'regs' + mov x1, x19 bl do_notify_resume #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_on // enabled while in userspace #endif - ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step + ldr x19, [tsk, #TSK_TI_FLAGS] // re-check for single-step b finish_ret_to_user SYM_CODE_END(ret_to_user) -- cgit v1.2.3 From 105fc3352077bba5faaf12cf39f7e3aad26fb70b Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 30 Nov 2020 11:59:45 +0000 Subject: arm64: entry: move el1 irq/nmi logic to C In preparation for reworking the EL1 irq/nmi entry code, move the existing logic to C. We no longer need the asm_nmi_enter() and asm_nmi_exit() wrappers, so these are removed. The new C functions are marked noinstr, which prevents compiler instrumentation and runtime probing. In subsequent patches we'll want the new C helpers to be called in all cases, so we don't bother wrapping the calls with ifdeferry. Even when the new C functions are stubs the trivial calls are unlikely to have a measurable impact on the IRQ or NMI paths anyway. Prototypes are added to as otherwise (in some configurations) GCC will complain about the lack of a forward declaration. We already do this for existing function, e.g. enter_from_user_mode(). The new helpers are marked as noinstr (which prevents all instrumentation, tracing, and kprobes). Otherwise, there should be no functional change as a result of this patch. 
Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201130115950.22492-7-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/exception.h | 2 ++ arch/arm64/kernel/entry-common.c | 16 ++++++++++++++++ arch/arm64/kernel/entry.S | 34 ++++------------------------------ arch/arm64/kernel/irq.c | 15 --------------- 4 files changed, 22 insertions(+), 45 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 99b9383cd036..d69d53dd7be7 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -31,6 +31,8 @@ static inline u32 disr_to_esr(u64 disr) return esr; } +asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs); +asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs); asmlinkage void enter_from_user_mode(void); void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs); void do_undefinstr(struct pt_regs *regs); diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 9a685e7686fe..920da254be1d 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -17,6 +17,22 @@ #include #include +asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) + nmi_enter(); + + trace_hardirqs_off(); +} + +asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs) +{ + if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) + nmi_exit(); + else + trace_hardirqs_on(); +} + static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 84aec600eeed..53e30750fc28 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -637,16 +637,8 @@ SYM_CODE_START_LOCAL_NOALIGN(el1_irq) gic_prio_irq_setup pmr=x20, tmp=x1 enable_da_f -#ifdef CONFIG_ARM64_PSEUDO_NMI - test_irqs_unmasked res=x0, pmr=x20 - cbz x0, 1f - bl asm_nmi_enter -1: -#endif - -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_off -#endif + mov x0, sp + bl enter_el1_irq_or_nmi irq_handler @@ -665,26 +657,8 @@ alternative_else_nop_endif 1: #endif -#ifdef CONFIG_ARM64_PSEUDO_NMI - /* - * When using IRQ priority masking, we can get spurious interrupts while - * PMR is set to GIC_PRIO_IRQOFF. An NMI might also have occurred in a - * section with interrupts disabled. Skip tracing in those cases. 
- */ - test_irqs_unmasked res=x0, pmr=x20 - cbz x0, 1f - bl asm_nmi_exit -1: -#endif - -#ifdef CONFIG_TRACE_IRQFLAGS -#ifdef CONFIG_ARM64_PSEUDO_NMI - test_irqs_unmasked res=x0, pmr=x20 - cbnz x0, 1f -#endif - bl trace_hardirqs_on -1: -#endif + mov x0, sp + bl exit_el1_irq_or_nmi kernel_exit 1 SYM_CODE_END(el1_irq) diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 9cf2fb87584a..60456a62da11 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -67,18 +67,3 @@ void __init init_IRQ(void) local_daif_restore(DAIF_PROCCTX_NOIRQ); } } - -/* - * Stubs to make nmi_enter/exit() code callable from ASM - */ -asmlinkage void notrace asm_nmi_enter(void) -{ - nmi_enter(); -} -NOKPROBE_SYMBOL(asm_nmi_enter); - -asmlinkage void notrace asm_nmi_exit(void) -{ - nmi_exit(); -} -NOKPROBE_SYMBOL(asm_nmi_exit); -- cgit v1.2.3 From 23529049c68423820487304f244144e0d576e85a Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 30 Nov 2020 11:59:46 +0000 Subject: arm64: entry: fix non-NMI user<->kernel transitions When built with PROVE_LOCKING, NO_HZ_FULL, and CONTEXT_TRACKING_FORCE will WARN() at boot time that interrupts are enabled when we call context_tracking_user_enter(), despite the DAIF flags indicating that IRQs are masked. The problem is that we're not tracking IRQ flag changes accurately, and so lockdep believes interrupts are enabled when they are not (and vice-versa). We can shuffle things so to make this more accurate. For kernel->user transitions there are a number of constraints we need to consider: 1) When we call __context_tracking_user_enter() HW IRQs must be disabled and lockdep must be up-to-date with this. 2) Userspace should be treated as having IRQs enabled from the PoV of both lockdep and tracing. 3) As context_tracking_user_enter() stops RCU from watching, we cannot use RCU after calling it. 4) IRQ flag tracing and lockdep have state that must be manipulated before RCU is disabled. ... with similar constraints applying for user->kernel transitions, with the ordering reversed. The generic entry code has enter_from_user_mode() and exit_to_user_mode() helpers to handle this. We can't use those directly, so we add arm64 copies for now (without the instrumentation markers which aren't used on arm64). These replace the existing user_exit() and user_exit_irqoff() calls spread throughout handlers, and the exception unmasking is left as-is. Note that: * The accounting for debug exceptions from userspace now happens in el0_dbg() and ret_to_user(), so this is removed from debug_exception_enter() and debug_exception_exit(). As user_exit_irqoff() wakes RCU, the userspace-specific check is removed. * The accounting for syscalls now happens in el0_svc(), el0_svc_compat(), and ret_to_user(), so this is removed from el0_svc_common(). This does not adversely affect the workaround for erratum 1463225, as this does not depend on any of the state tracking. * In ret_to_user() we mask interrupts with local_daif_mask(), and so we need to inform lockdep and tracing. Here a trace_hardirqs_off() is sufficient and safe as we have not yet exited kernel context and RCU is usable. * As PROVE_LOCKING selects TRACE_IRQFLAGS, the ifdeferry in entry.S only needs to check for the latter. * EL0 SError handling will be dealt with in a subsequent patch, as this needs to be treated as an NMI. 
Prior to this patch, booting an appropriately-configured kernel would result in spats as below: | DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled()) | WARNING: CPU: 2 PID: 1 at kernel/locking/lockdep.c:5280 check_flags.part.54+0x1dc/0x1f0 | Modules linked in: | CPU: 2 PID: 1 Comm: init Not tainted 5.10.0-rc3 #3 | Hardware name: linux,dummy-virt (DT) | pstate: 804003c5 (Nzcv DAIF +PAN -UAO -TCO BTYPE=--) | pc : check_flags.part.54+0x1dc/0x1f0 | lr : check_flags.part.54+0x1dc/0x1f0 | sp : ffff80001003bd80 | x29: ffff80001003bd80 x28: ffff66ce801e0000 | x27: 00000000ffffffff x26: 00000000000003c0 | x25: 0000000000000000 x24: ffffc31842527258 | x23: ffffc31842491368 x22: ffffc3184282d000 | x21: 0000000000000000 x20: 0000000000000001 | x19: ffffc318432ce000 x18: 0080000000000000 | x17: 0000000000000000 x16: ffffc31840f18a78 | x15: 0000000000000001 x14: ffffc3184285c810 | x13: 0000000000000001 x12: 0000000000000000 | x11: ffffc318415857a0 x10: ffffc318406614c0 | x9 : ffffc318415857a0 x8 : ffffc31841f1d000 | x7 : 647261685f706564 x6 : ffffc3183ff7c66c | x5 : ffff66ce801e0000 x4 : 0000000000000000 | x3 : ffffc3183fe00000 x2 : ffffc31841500000 | x1 : e956dc24146b3500 x0 : 0000000000000000 | Call trace: | check_flags.part.54+0x1dc/0x1f0 | lock_is_held_type+0x10c/0x188 | rcu_read_lock_sched_held+0x70/0x98 | __context_tracking_enter+0x310/0x350 | context_tracking_enter.part.3+0x5c/0xc8 | context_tracking_user_enter+0x6c/0x80 | finish_ret_to_user+0x2c/0x13cr Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201130115950.22492-8-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/exception.h | 1 + arch/arm64/kernel/entry-common.c | 40 +++++++++++++++++++++++++------------- arch/arm64/kernel/entry.S | 35 +++++++++++++-------------------- arch/arm64/kernel/syscall.c | 1 - arch/arm64/mm/fault.c | 22 ++++++++++----------- 5 files changed, 51 insertions(+), 48 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index d69d53dd7be7..d579b2e6db7a 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -34,6 +34,7 @@ static inline u32 disr_to_esr(u64 disr) asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs); asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs); asmlinkage void enter_from_user_mode(void); +asmlinkage void exit_to_user_mode(void); void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs); void do_undefinstr(struct pt_regs *regs); void do_bti(struct pt_regs *regs); diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 920da254be1d..49d1c1dd9baf 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -119,15 +119,25 @@ asmlinkage void noinstr el1_sync_handler(struct pt_regs *regs) asmlinkage void noinstr enter_from_user_mode(void) { + lockdep_hardirqs_off(CALLER_ADDR0); CT_WARN_ON(ct_state() != CONTEXT_USER); user_exit_irqoff(); + trace_hardirqs_off_finish(); +} + +asmlinkage void noinstr exit_to_user_mode(void) +{ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + user_enter_irqoff(); + lockdep_hardirqs_on(CALLER_ADDR0); } static void noinstr el0_da(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); - user_exit_irqoff(); + enter_from_user_mode(); local_daif_restore(DAIF_PROCCTX); far = untagged_addr(far); do_mem_abort(far, esr, regs); @@ -145,35 
+155,35 @@ static void noinstr el0_ia(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(far)) arm64_apply_bp_hardening(); - user_exit_irqoff(); + enter_from_user_mode(); local_daif_restore(DAIF_PROCCTX); do_mem_abort(far, esr, regs); } static void noinstr el0_fpsimd_acc(struct pt_regs *regs, unsigned long esr) { - user_exit_irqoff(); + enter_from_user_mode(); local_daif_restore(DAIF_PROCCTX); do_fpsimd_acc(esr, regs); } static void noinstr el0_sve_acc(struct pt_regs *regs, unsigned long esr) { - user_exit_irqoff(); + enter_from_user_mode(); local_daif_restore(DAIF_PROCCTX); do_sve_acc(esr, regs); } static void noinstr el0_fpsimd_exc(struct pt_regs *regs, unsigned long esr) { - user_exit_irqoff(); + enter_from_user_mode(); local_daif_restore(DAIF_PROCCTX); do_fpsimd_exc(esr, regs); } static void noinstr el0_sys(struct pt_regs *regs, unsigned long esr) { - user_exit_irqoff(); + enter_from_user_mode(); local_daif_restore(DAIF_PROCCTX); do_sysinstr(esr, regs); } @@ -185,35 +195,35 @@ static void noinstr el0_pc(struct pt_regs *regs, unsigned long esr) if (!is_ttbr0_addr(instruction_pointer(regs))) arm64_apply_bp_hardening(); - user_exit_irqoff(); + enter_from_user_mode(); local_daif_restore(DAIF_PROCCTX); do_sp_pc_abort(far, esr, regs); } static void noinstr el0_sp(struct pt_regs *regs, unsigned long esr) { - user_exit_irqoff(); + enter_from_user_mode(); local_daif_restore(DAIF_PROCCTX); do_sp_pc_abort(regs->sp, esr, regs); } static void noinstr el0_undef(struct pt_regs *regs) { - user_exit_irqoff(); + enter_from_user_mode(); local_daif_restore(DAIF_PROCCTX); do_undefinstr(regs); } static void noinstr el0_bti(struct pt_regs *regs) { - user_exit_irqoff(); + enter_from_user_mode(); local_daif_restore(DAIF_PROCCTX); do_bti(regs); } static void noinstr el0_inv(struct pt_regs *regs, unsigned long esr) { - user_exit_irqoff(); + enter_from_user_mode(); local_daif_restore(DAIF_PROCCTX); bad_el0_sync(regs, 0, esr); } @@ -226,7 +236,7 @@ static void noinstr el0_dbg(struct pt_regs *regs, unsigned long esr) if (system_uses_irq_prio_masking()) gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); - user_exit_irqoff(); + enter_from_user_mode(); do_debug_exception(far, esr, regs); local_daif_restore(DAIF_PROCCTX_NOIRQ); } @@ -236,12 +246,13 @@ static void noinstr el0_svc(struct pt_regs *regs) if (system_uses_irq_prio_masking()) gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); + enter_from_user_mode(); do_el0_svc(regs); } static void noinstr el0_fpac(struct pt_regs *regs, unsigned long esr) { - user_exit_irqoff(); + enter_from_user_mode(); local_daif_restore(DAIF_PROCCTX); do_ptrauth_fault(regs, esr); } @@ -302,7 +313,7 @@ asmlinkage void noinstr el0_sync_handler(struct pt_regs *regs) #ifdef CONFIG_COMPAT static void noinstr el0_cp15(struct pt_regs *regs, unsigned long esr) { - user_exit_irqoff(); + enter_from_user_mode(); local_daif_restore(DAIF_PROCCTX); do_cp15instr(esr, regs); } @@ -312,6 +323,7 @@ static void noinstr el0_svc_compat(struct pt_regs *regs) if (system_uses_irq_prio_masking()) gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); + enter_from_user_mode(); do_el0_svc_compat(regs); } diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 53e30750fc28..d72c818b019c 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -30,18 +30,18 @@ #include /* - * Context tracking subsystem. Used to instrument transitions - * between user and kernel mode. + * Context tracking and irqflag tracing need to instrument transitions between + * user and kernel mode. 
*/ - .macro ct_user_exit_irqoff -#ifdef CONFIG_CONTEXT_TRACKING + .macro user_exit_irqoff +#if defined(CONFIG_CONTEXT_TRACKING) || defined(CONFIG_TRACE_IRQFLAGS) bl enter_from_user_mode #endif .endm - .macro ct_user_enter -#ifdef CONFIG_CONTEXT_TRACKING - bl context_tracking_user_enter + .macro user_enter_irqoff +#if defined(CONFIG_CONTEXT_TRACKING) || defined(CONFIG_TRACE_IRQFLAGS) + bl exit_to_user_mode #endif .endm @@ -298,9 +298,6 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING alternative_else_nop_endif ldp x21, x22, [sp, #S_PC] // load ELR, SPSR - .if \el == 0 - ct_user_enter - .endif #ifdef CONFIG_ARM64_SW_TTBR0_PAN alternative_if_not ARM64_HAS_PAN @@ -700,21 +697,14 @@ SYM_CODE_START_LOCAL_NOALIGN(el0_irq) kernel_entry 0 el0_irq_naked: gic_prio_irq_setup pmr=x20, tmp=x0 - ct_user_exit_irqoff + user_exit_irqoff enable_da_f -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_off -#endif - tbz x22, #55, 1f bl do_el0_irq_bp_hardening 1: irq_handler -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_on -#endif b ret_to_user SYM_CODE_END(el0_irq) @@ -733,7 +723,7 @@ SYM_CODE_START_LOCAL(el0_error) el0_error_naked: mrs x25, esr_el1 gic_prio_kentry_setup tmp=x2 - ct_user_exit_irqoff + user_exit_irqoff enable_dbg mov x0, sp mov x1, x25 @@ -748,10 +738,14 @@ SYM_CODE_END(el0_error) SYM_CODE_START_LOCAL(ret_to_user) disable_daif gic_prio_kentry_setup tmp=x3 +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off +#endif ldr x19, [tsk, #TSK_TI_FLAGS] and x2, x19, #_TIF_WORK_MASK cbnz x2, work_pending finish_ret_to_user: + user_enter_irqoff /* Ignore asynchronous tag check faults in the uaccess routines */ clear_mte_async_tcf enable_step_tsk x19, x2 @@ -767,9 +761,6 @@ work_pending: mov x0, sp // 'regs' mov x1, x19 bl do_notify_resume -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_on // enabled while in userspace -#endif ldr x19, [tsk, #TSK_TI_FLAGS] // re-check for single-step b finish_ret_to_user SYM_CODE_END(ret_to_user) diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index 13fe79f8e2db..f8f758e4a306 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -120,7 +120,6 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, */ cortex_a76_erratum_1463225_svc_handler(); - user_exit_irqoff(); local_daif_restore(DAIF_PROCCTX); if (system_supports_mte() && (flags & _TIF_MTE_ASYNC_FAULT)) { diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 1ee94002801f..1f450b784d2c 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -789,16 +789,14 @@ void __init hook_debug_fault_code(int nr, */ static void debug_exception_enter(struct pt_regs *regs) { - /* - * Tell lockdep we disabled irqs in entry.S. Do nothing if they were - * already disabled to preserve the last enabled/disabled addresses. - */ - if (interrupts_enabled(regs)) - trace_hardirqs_off(); + if (!user_mode(regs)) { + /* + * Tell lockdep we disabled irqs in entry.S. Do nothing if they were + * already disabled to preserve the last enabled/disabled addresses. + */ + if (interrupts_enabled(regs)) + trace_hardirqs_off(); - if (user_mode(regs)) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - } else { /* * We might have interrupted pretty much anything. 
In * fact, if we're a debug exception, we can even interrupt @@ -819,8 +817,10 @@ static void debug_exception_exit(struct pt_regs *regs) { preempt_enable_no_resched(); - if (!user_mode(regs)) - rcu_nmi_exit(); + if (user_mode(regs)) + return; + + rcu_nmi_exit(); if (interrupts_enabled(regs)) trace_hardirqs_on(); -- cgit v1.2.3 From 1ec2f2c05b2ab845d068bff29bd32dbfc6a6ad4c Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 30 Nov 2020 11:59:47 +0000 Subject: arm64: ptrace: prepare for EL1 irq/rcu tracking Exceptions from EL1 may be taken when RCU isn't watching (e.g. in idle sequences), or when the lockdep hardirqs transiently out-of-sync with the hardware state (e.g. in the middle of local_irq_enable()). To correctly handle these cases, we'll need to save/restore this state across some exceptions taken from EL1. A series of subsequent patches will update EL1 exception handlers to handle this. In preparation for this, and to avoid dependencies between those patches, this patch adds two new fields to struct pt_regs so that exception handlers can track this state. Note that this is placed in pt_regs as some entry/exit sequences such as el1_irq are invoked from assembly, which makes it very difficult to add a separate structure as with the irqentry_state used by x86. We can separate this once more of the exception logic is moved to C. While the fields only need to be bool, they are both made u64 to keep pt_regs 16-byte aligned. There should be no functional change as a result of this patch. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201130115950.22492-9-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/ptrace.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 997cf8c8cd52..28c85b87b8cd 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -193,6 +193,10 @@ struct pt_regs { /* Only valid when ARM64_HAS_IRQ_PRIO_MASKING is enabled. */ u64 pmr_save; u64 stackframe[2]; + + /* Only valid for some EL1 exceptions. */ + u64 lockdep_hardirqs; + u64 exit_rcu; }; static inline bool in_syscall(struct pt_regs const *regs) -- cgit v1.2.3 From 7cd1ea1010acbede7eb87b6abb6198921fb36957 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 30 Nov 2020 11:59:48 +0000 Subject: arm64: entry: fix non-NMI kernel<->kernel transitions There are periods in kernel mode when RCU is not watching and/or the scheduler tick is disabled, but we can still take exceptions such as interrupts. The arm64 exception handlers do not account for this, and it's possible that RCU is not watching while an exception handler runs. The x86/generic entry code handles this by ensuring that all (non-NMI) kernel exception handlers call irqentry_enter() and irqentry_exit(), which handle RCU, lockdep, and IRQ flag tracing. We can't yet move to the generic entry code, and already hadnle the user<->kernel transitions elsewhere, so we add new kernel<->kernel transition helpers alog the lines of the generic entry code. Since we now track interrupts becoming masked when an exception is taken, local_daif_inherit() is modified to track interrupts becoming re-enabled when the original context is inherited. To balance the entry/exit paths, each handler masks all DAIF exceptions before exit_to_kernel_mode(). 
Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201130115950.22492-10-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/daifflags.h | 3 ++ arch/arm64/kernel/entry-common.c | 67 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/daifflags.h b/arch/arm64/include/asm/daifflags.h index ec213b4a1650..1c26d7baa67f 100644 --- a/arch/arm64/include/asm/daifflags.h +++ b/arch/arm64/include/asm/daifflags.h @@ -128,6 +128,9 @@ static inline void local_daif_inherit(struct pt_regs *regs) { unsigned long flags = regs->pstate & DAIF_MASK; + if (interrupts_enabled(regs)) + trace_hardirqs_on(); + /* * We can't use local_daif_restore(regs->pstate) here as * system_has_prio_mask_debugging() won't restore the I bit if it can diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 49d1c1dd9baf..86622d8dd0d8 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -17,12 +17,58 @@ #include #include +/* + * This is intended to match the logic in irqentry_enter(), handling the kernel + * mode transitions only. + */ +static void noinstr enter_from_kernel_mode(struct pt_regs *regs) +{ + regs->exit_rcu = false; + + if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) { + lockdep_hardirqs_off(CALLER_ADDR0); + rcu_irq_enter(); + trace_hardirqs_off_finish(); + + regs->exit_rcu = true; + return; + } + + lockdep_hardirqs_off(CALLER_ADDR0); + rcu_irq_enter_check_tick(); + trace_hardirqs_off_finish(); +} + +/* + * This is intended to match the logic in irqentry_exit(), handling the kernel + * mode transitions only, and with preemption handled elsewhere. 
+ */ +static void noinstr exit_to_kernel_mode(struct pt_regs *regs) +{ + lockdep_assert_irqs_disabled(); + + if (interrupts_enabled(regs)) { + if (regs->exit_rcu) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + rcu_irq_exit(); + lockdep_hardirqs_on(CALLER_ADDR0); + return; + } + + trace_hardirqs_on(); + } else { + if (regs->exit_rcu) + rcu_irq_exit(); + } +} + asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) nmi_enter(); - - trace_hardirqs_off(); + else + enter_from_kernel_mode(regs); } asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs) @@ -30,36 +76,48 @@ asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs) if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) nmi_exit(); else - trace_hardirqs_on(); + exit_to_kernel_mode(regs); } static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); + enter_from_kernel_mode(regs); local_daif_inherit(regs); far = untagged_addr(far); do_mem_abort(far, esr, regs); + local_daif_mask(); + exit_to_kernel_mode(regs); } static void noinstr el1_pc(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); + enter_from_kernel_mode(regs); local_daif_inherit(regs); do_sp_pc_abort(far, esr, regs); + local_daif_mask(); + exit_to_kernel_mode(regs); } static void noinstr el1_undef(struct pt_regs *regs) { + enter_from_kernel_mode(regs); local_daif_inherit(regs); do_undefinstr(regs); + local_daif_mask(); + exit_to_kernel_mode(regs); } static void noinstr el1_inv(struct pt_regs *regs, unsigned long esr) { + enter_from_kernel_mode(regs); local_daif_inherit(regs); bad_mode(regs, 0, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); } static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) @@ -79,8 +137,11 @@ static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) { + enter_from_kernel_mode(regs); local_daif_inherit(regs); do_ptrauth_fault(regs, esr); + local_daif_mask(); + exit_to_kernel_mode(regs); } asmlinkage void noinstr el1_sync_handler(struct pt_regs *regs) -- cgit v1.2.3 From f0cd5ac1e4c53cb691b3ed3cda1031e1c42153e2 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 30 Nov 2020 11:59:49 +0000 Subject: arm64: entry: fix NMI {user, kernel}->kernel transitions Exceptions which can be taken at (almost) any time are consdiered to be NMIs. On arm64 that includes: * SDEI events * GICv3 Pseudo-NMIs * Kernel stack overflows * Unexpected/unhandled exceptions ... but currently debug exceptions (BRKs, breakpoints, watchpoints, single-step) are not considered NMIs. As these can be taken at any time, kernel features (lockdep, RCU, ftrace) may not be in a consistent kernel state. For example, we may take an NMI from the idle code or partway through an entry/exit path. While nmi_enter() and nmi_exit() handle most of this state, notably they don't save/restore the lockdep state across an NMI being taken and handled. When interrupts are enabled and an NMI is taken, lockdep may see interrupts become disabled within the NMI code, but not see interrupts become enabled when returning from the NMI, leaving lockdep believing interrupts are disabled when they are actually disabled. The x86 code handles this in idtentry_{enter,exit}_nmi(), which will shortly be moved to the generic entry code. 
As we can't use either yet, we copy the x86 approach in arm64-specific helpers. All the NMI entrypoints are marked as noinstr to prevent any instrumentation handling code being invoked before the state has been corrected. Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201130115950.22492-11-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/exception.h | 2 ++ arch/arm64/kernel/entry-common.c | 34 ++++++++++++++++++++++++++++++++-- arch/arm64/kernel/sdei.c | 7 ++++--- arch/arm64/kernel/traps.c | 15 ++++++++++----- 4 files changed, 48 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index d579b2e6db7a..0756191f44f6 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -35,6 +35,8 @@ asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs); asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs); asmlinkage void enter_from_user_mode(void); asmlinkage void exit_to_user_mode(void); +void arm64_enter_nmi(struct pt_regs *regs); +void arm64_exit_nmi(struct pt_regs *regs); void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs); void do_undefinstr(struct pt_regs *regs); void do_bti(struct pt_regs *regs); diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 86622d8dd0d8..6d70be0d3e8f 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -63,10 +63,40 @@ static void noinstr exit_to_kernel_mode(struct pt_regs *regs) } } +void noinstr arm64_enter_nmi(struct pt_regs *regs) +{ + regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); + + __nmi_enter(); + lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirq_enter(); + rcu_nmi_enter(); + + trace_hardirqs_off_finish(); + ftrace_nmi_enter(); +} + +void noinstr arm64_exit_nmi(struct pt_regs *regs) +{ + bool restore = regs->lockdep_hardirqs; + + ftrace_nmi_exit(); + if (restore) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + } + + rcu_nmi_exit(); + lockdep_hardirq_exit(); + if (restore) + lockdep_hardirqs_on(CALLER_ADDR0); + __nmi_exit(); +} + asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) - nmi_enter(); + arm64_enter_nmi(regs); else enter_from_kernel_mode(regs); } @@ -74,7 +104,7 @@ asmlinkage void noinstr enter_el1_irq_or_nmi(struct pt_regs *regs) asmlinkage void noinstr exit_el1_irq_or_nmi(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_ARM64_PSEUDO_NMI) && !interrupts_enabled(regs)) - nmi_exit(); + arm64_exit_nmi(regs); else exit_to_kernel_mode(regs); } diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index 7689f2031c0c..793c46d6a447 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -223,16 +224,16 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, } -asmlinkage __kprobes notrace unsigned long +asmlinkage noinstr unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { unsigned long ret; - nmi_enter(); + arm64_enter_nmi(regs); ret = _sdei_handler(regs, arg); - nmi_exit(); + arm64_exit_nmi(regs); return ret; } diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 580c60afc39a..2059d8f43f55 100644 --- a/arch/arm64/kernel/traps.c +++ 
b/arch/arm64/kernel/traps.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -753,8 +754,10 @@ const char *esr_get_class_string(u32 esr) * bad_mode handles the impossible case in the exception vector. This is always * fatal. */ -asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr) +asmlinkage void notrace bad_mode(struct pt_regs *regs, int reason, unsigned int esr) { + arm64_enter_nmi(regs); + console_verbose(); pr_crit("Bad mode in %s handler detected on CPU%d, code 0x%08x -- %s\n", @@ -786,7 +789,7 @@ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) __aligned(16); -asmlinkage void handle_bad_stack(struct pt_regs *regs) +asmlinkage void noinstr handle_bad_stack(struct pt_regs *regs) { unsigned long tsk_stk = (unsigned long)current->stack; unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr); @@ -794,6 +797,8 @@ asmlinkage void handle_bad_stack(struct pt_regs *regs) unsigned int esr = read_sysreg(esr_el1); unsigned long far = read_sysreg(far_el1); + arm64_enter_nmi(regs); + console_verbose(); pr_emerg("Insufficient stack space to handle exception!"); @@ -865,15 +870,15 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr) } } -asmlinkage void do_serror(struct pt_regs *regs, unsigned int esr) +asmlinkage void noinstr do_serror(struct pt_regs *regs, unsigned int esr) { - nmi_enter(); + arm64_enter_nmi(regs); /* non-RAS errors are not containable */ if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr)) arm64_serror_panic(regs, esr); - nmi_exit(); + arm64_exit_nmi(regs); } /* GENERIC_BUG traps */ -- cgit v1.2.3 From 2a9b3e6ac69a8bf177d8496a11e749e2dc72fa22 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 30 Nov 2020 11:59:50 +0000 Subject: arm64: entry: fix EL1 debug transitions In debug_exception_enter() and debug_exception_exit() we trace hardirqs on/off while RCU isn't guaranteed to be watching, and we don't save and restore the hardirq state, and so may return with this having changed. Handle this appropriately with new entry/exit helpers which do the bare minimum to ensure this is appropriately maintained, without marking debug exceptions as NMIs. These are placed in entry-common.c with the other entry/exit helpers. In future we'll want to reconsider whether some debug exceptions should be NMIs, but this will require a significant refactoring, and for now this should prevent issues with lockdep and RCU. 
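Condensed from the helpers in this patch and the previous one, the two entry paths differ only in the NMI bookkeeping (a comparison sketch, not literal code):

    /*
     *   arm64_enter_nmi():             arm64_enter_el1_dbg():
     *     save lockdep hardirq state     save lockdep hardirq state
     *     __nmi_enter()                  (skipped: not marked an NMI)
     *     lockdep_hardirqs_off()         lockdep_hardirqs_off()
     *     lockdep_hardirq_enter()        (skipped)
     *     rcu_nmi_enter()                rcu_nmi_enter()
     *     trace_hardirqs_off_finish()    trace_hardirqs_off_finish()
     *     ftrace_nmi_enter()             (skipped)
     */

so in_nmi() stays false across a debug exception while RCU is still told to watch.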
Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20201130115950.22492-12-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/entry-common.c | 26 ++++++++++++++++++++++++++ arch/arm64/mm/fault.c | 25 ------------------------- 2 files changed, 26 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 6d70be0d3e8f..70e0a7591245 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -150,6 +150,30 @@ static void noinstr el1_inv(struct pt_regs *regs, unsigned long esr) exit_to_kernel_mode(regs); } +static void noinstr arm64_enter_el1_dbg(struct pt_regs *regs) +{ + regs->lockdep_hardirqs = lockdep_hardirqs_enabled(); + + lockdep_hardirqs_off(CALLER_ADDR0); + rcu_nmi_enter(); + + trace_hardirqs_off_finish(); +} + +static void noinstr arm64_exit_el1_dbg(struct pt_regs *regs) +{ + bool restore = regs->lockdep_hardirqs; + + if (restore) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + } + + rcu_nmi_exit(); + if (restore) + lockdep_hardirqs_on(CALLER_ADDR0); +} + static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) { unsigned long far = read_sysreg(far_el1); @@ -162,7 +186,9 @@ static void noinstr el1_dbg(struct pt_regs *regs, unsigned long esr) if (system_uses_irq_prio_masking()) gic_write_pmr(GIC_PRIO_IRQON | GIC_PRIO_PSR_I_SET); + arm64_enter_el1_dbg(regs); do_debug_exception(far, esr, regs); + arm64_exit_el1_dbg(regs); } static void noinstr el1_fpac(struct pt_regs *regs, unsigned long esr) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 1f450b784d2c..795d224f184f 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -789,23 +789,6 @@ void __init hook_debug_fault_code(int nr, */ static void debug_exception_enter(struct pt_regs *regs) { - if (!user_mode(regs)) { - /* - * Tell lockdep we disabled irqs in entry.S. Do nothing if they were - * already disabled to preserve the last enabled/disabled addresses. - */ - if (interrupts_enabled(regs)) - trace_hardirqs_off(); - - /* - * We might have interrupted pretty much anything. In - * fact, if we're a debug exception, we can even interrupt - * NMI processing. We don't want this code makes in_nmi() - * to return true, but we need to notify RCU. - */ - rcu_nmi_enter(); - } - preempt_disable(); /* This code is a bit fragile. Test it. */ @@ -816,14 +799,6 @@ NOKPROBE_SYMBOL(debug_exception_enter); static void debug_exception_exit(struct pt_regs *regs) { preempt_enable_no_resched(); - - if (user_mode(regs)) - return; - - rcu_nmi_exit(); - - if (interrupts_enabled(regs)) - trace_hardirqs_on(); } NOKPROBE_SYMBOL(debug_exception_exit); -- cgit v1.2.3 From 9e5344e0ffc33f4fee899f98b6939a0682b1d9c3 Mon Sep 17 00:00:00 2001 From: Vincenzo Frascino Date: Mon, 30 Nov 2020 17:07:09 +0000 Subject: arm64: mte: Fix typo in macro definition UL in the definition of SYS_TFSR_EL1_TF1 was misspelled, causing compilation issues when trying to implement in-kernel MTE async mode. Fix the macro by correcting the typo. Note: MTE async mode will be introduced with a future series.
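For reference, the intended pattern is one bit per tag-fault flag, shifted into place with UL() so the constant is unsigned long; the stray UK(2) both referenced a nonexistent macro and encoded the wrong value:

    #define SYS_TFSR_EL1_TF0_SHIFT  0
    #define SYS_TFSR_EL1_TF1_SHIFT  1
    #define SYS_TFSR_EL1_TF0        (UL(1) << SYS_TFSR_EL1_TF0_SHIFT)
    #define SYS_TFSR_EL1_TF1        (UL(1) << SYS_TFSR_EL1_TF1_SHIFT)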
Fixes: c058b1c4a5ea ("arm64: mte: system register definitions") Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Vincenzo Frascino Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20201130170709.22309-1-vincenzo.frascino@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/sysreg.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 174817ba119c..c8ab9900293f 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -983,7 +983,7 @@ #define SYS_TFSR_EL1_TF0_SHIFT 0 #define SYS_TFSR_EL1_TF1_SHIFT 1 #define SYS_TFSR_EL1_TF0 (UL(1) << SYS_TFSR_EL1_TF0_SHIFT) -#define SYS_TFSR_EL1_TF1 (UK(2) << SYS_TFSR_EL1_TF1_SHIFT) +#define SYS_TFSR_EL1_TF1 (UL(1) << SYS_TFSR_EL1_TF1_SHIFT) /* Safe value for MPIDR_EL1: Bit31:RES1, Bit30:U:0, Bit24:MT:0 */ #define SYS_MPIDR_SAFE_VAL (BIT(31)) -- cgit v1.2.3 From f54db39fbe40731c40aefdd3bc26e7d56d668c64 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 30 Nov 2020 13:19:27 +0100 Subject: KVM: PPC: Book3S HV: XIVE: Fix vCPU id sanity check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 062cfab7069f ("KVM: PPC: Book3S HV: XIVE: Make VP block size configurable") updated kvmppc_xive_vcpu_id_valid() in a way that allows userspace to trigger an assertion in skiboot and crash the host: [ 696.186248988,3] XIVE[ IC 08 ] eq_blk != vp_blk (0 vs. 1) for target 0x4300008c/0 [ 696.186314757,0] Assert fail: hw/xive.c:2370:0 [ 696.186342458,0] Aborting! xive-kvCPU 0043 Backtrace: S: 0000000031e2b8f0 R: 0000000030013840 .backtrace+0x48 S: 0000000031e2b990 R: 000000003001b2d0 ._abort+0x4c S: 0000000031e2ba10 R: 000000003001b34c .assert_fail+0x34 S: 0000000031e2ba90 R: 0000000030058984 .xive_eq_for_target.part.20+0xb0 S: 0000000031e2bb40 R: 0000000030059fdc .xive_setup_silent_gather+0x2c S: 0000000031e2bc20 R: 000000003005a334 .opal_xive_set_vp_info+0x124 S: 0000000031e2bd20 R: 00000000300051a4 opal_entry+0x134 --- OPAL call token: 0x8a caller R1: 0xc000001f28563850 --- XIVE maintains the interrupt context state of non-dispatched vCPUs in an internal VP structure. We allocate a bunch of those on startup to accommodate all possible vCPUs. Each VP has an id that we derive from the vCPU id for efficiency: static inline u32 kvmppc_xive_vp(struct kvmppc_xive *xive, u32 server) { return xive->vp_base + kvmppc_pack_vcpu_id(xive->kvm, server); } The KVM XIVE device used to allocate KVM_MAX_VCPUS VPs. This was limiting the number of concurrent VMs because the VP space is limited on the HW. Since most of the time VMs run with far fewer vCPUs, commit 062cfab7069f ("KVM: PPC: Book3S HV: XIVE: Make VP block size configurable") gave the possibility for userspace to tune the size of the VP block through the KVM_DEV_XIVE_NR_SERVERS attribute. The check in kvmppc_xive_vcpu_id_valid() was changed from cpu < KVM_MAX_VCPUS * xive->kvm->arch.emul_smt_mode to cpu < xive->nr_servers * xive->kvm->arch.emul_smt_mode The previous check was based on the fact that the VP block had KVM_MAX_VCPUS entries and that kvmppc_pack_vcpu_id() guarantees that packed vCPU ids are below KVM_MAX_VCPUS. We've changed the size of the VP block, but kvmppc_pack_vcpu_id() has nothing to do with it and it certainly doesn't ensure that the packed vCPU ids are below xive->nr_servers.
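Schematically, the old and new checks compare different quantities (an illustration; the packing arithmetic itself is elided):

    /* Old: bounds the raw vCPU id. */
    valid = cpu < xive->nr_servers * xive->kvm->arch.emul_smt_mode;

    /* New: bounds the packed id, i.e. the value actually used to
     * index the VP block. */
    valid = kvmppc_pack_vcpu_id(xive->kvm, cpu) < xive->nr_servers;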
kvmppc_xive_vcpu_id_valid() might thus return true when the VM was configured with a non-standard VSMT mode, even if the packed vCPU id is higher than what we expect. We end up using an unallocated VP id, which confuses OPAL. The assert in OPAL is probably abusive and should be converted to a regular error that the kernel can handle, but we shouldn't really use broken VP ids in the first place. Fix kvmppc_xive_vcpu_id_valid() so that it checks the packed vCPU id is below xive->nr_servers, which is explicitly what we want. Fixes: 062cfab7069f ("KVM: PPC: Book3S HV: XIVE: Make VP block size configurable") Cc: stable@vger.kernel.org # v5.5+ Signed-off-by: Greg Kurz Reviewed-by: Cédric Le Goater Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/160673876747.695514.1809676603724514920.stgit@bahia.lan --- arch/powerpc/kvm/book3s_xive.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index 85215e79db42..a0ebc29f30b2 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -1214,12 +1214,9 @@ void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu) static bool kvmppc_xive_vcpu_id_valid(struct kvmppc_xive *xive, u32 cpu) { /* We have a block of xive->nr_servers VPs. We just need to check - * raw vCPU ids are below the expected limit for this guest's - * core stride ; kvmppc_pack_vcpu_id() will pack them down to an - * index that can be safely used to compute a VP id that belongs - * to the VP block. + * packed vCPU ids are below that. */ - return cpu < xive->nr_servers * xive->kvm->arch.emul_smt_mode; + return kvmppc_pack_vcpu_id(xive->kvm, cpu) < xive->nr_servers; } int kvmppc_xive_compute_vp_id(struct kvmppc_xive *xive, u32 cpu, u32 *vp) -- cgit v1.2.3 From 59612b24f78a0b61fe078ec9dff2e48e9cec52c0 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 19 Nov 2020 13:46:56 -0700 Subject: kbuild: Hoist '--orphan-handling' into Kconfig Currently, '--orphan-handling=warn' is spread out across four different architectures in their respective Makefiles, which makes it a little unruly to deal with in case it needs to be disabled for a specific linker version (in this case, ld.lld 10.0.1). To make it easier to control this, hoist this warning into Kconfig and the main Makefile so that disabling it is simpler, as the warning will only be enabled in a couple places (main Makefile and a couple of compressed boot folders that blow away LDFLAGS_vmlinux) and making it conditional is easier due to Kconfig syntax. One small additional benefit of this is saving a call to ld-option on incremental builds because we will have already evaluated it for CONFIG_LD_ORPHAN_WARN. To keep the list of supported architectures the same, introduce CONFIG_ARCH_WANT_LD_ORPHAN_WARN, which an architecture can select to gain this automatically after all of the sections are specified and size-asserted. A special thanks to Kees Cook for the help text on this config.
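As a concrete illustration of what the flag catches (a hypothetical example, not part of this patch): any input section the linker script does not explicitly place is an orphan, and --orphan-handling=warn makes the linker report it instead of placing it by heuristic.

    /* orphan.c - hypothetical example. ".mydata" is not named in the
     * kernel linker scripts, so linking an object like this with
     * --orphan-handling=warn draws a warning from ld. */
    static const int magic __attribute__((__section__(".mydata"), __used__)) = 42;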
Link: https://github.com/ClangBuiltLinux/linux/issues/1187 Acked-by: Kees Cook Acked-by: Michael Ellerman (powerpc) Reviewed-by: Nick Desaulniers Tested-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Signed-off-by: Masahiro Yamada --- Makefile | 6 ++++++ arch/Kconfig | 9 +++++++++ arch/arm/Kconfig | 1 + arch/arm/Makefile | 4 ---- arch/arm/boot/compressed/Makefile | 4 +++- arch/arm64/Kconfig | 1 + arch/arm64/Makefile | 4 ---- arch/powerpc/Kconfig | 1 + arch/powerpc/Makefile | 1 - arch/x86/Kconfig | 1 + arch/x86/Makefile | 3 --- arch/x86/boot/compressed/Makefile | 4 +++- init/Kconfig | 5 +++++ 13 files changed, 30 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/Makefile b/Makefile index ae1592c1f5d6..8327725e5d76 100644 --- a/Makefile +++ b/Makefile @@ -986,6 +986,12 @@ ifeq ($(CONFIG_RELR),y) LDFLAGS_vmlinux += --pack-dyn-relocs=relr endif +# We never want expected sections to be placed heuristically by the +# linker. All sections should be explicitly named in the linker script. +ifdef CONFIG_LD_ORPHAN_WARN +LDFLAGS_vmlinux += --orphan-handling=warn +endif + # Align the bit size of userspace programs with the kernel KBUILD_USERCFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CFLAGS)) KBUILD_USERLDFLAGS += $(filter -m32 -m64 --target=%, $(KBUILD_CFLAGS)) diff --git a/arch/Kconfig b/arch/Kconfig index 56b6ccc0e32d..ba4e966484ab 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1028,6 +1028,15 @@ config HAVE_STATIC_CALL_INLINE bool depends on HAVE_STATIC_CALL +config ARCH_WANT_LD_ORPHAN_WARN + bool + help + An arch should select this symbol once all linker sections are explicitly + included, size-asserted, or discarded in the linker scripts. This is + important because we never want expected sections to be placed heuristically + by the linker, since the locations of such sections can change between linker + versions. + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index fe2f17eb2b50..002e0cf025f5 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -35,6 +35,7 @@ config ARM select ARCH_USE_CMPXCHG_LOCKREF select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_IPC_PARSE_VERSION + select ARCH_WANT_LD_ORPHAN_WARN select BINFMT_FLAT_ARGVP_ENVP_ON_STACK select BUILDTIME_TABLE_SORT if MMU select CLONE_BACKWARDS diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 4d76eab2b22d..e15f76ca2887 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -16,10 +16,6 @@ LDFLAGS_vmlinux += --be8 KBUILD_LDFLAGS_MODULE += --be8 endif -# We never want expected sections to be placed heuristically by the -# linker. All sections should be explicitly named in the linker script. 
-LDFLAGS_vmlinux += $(call ld-option, --orphan-handling=warn) - GZFLAGS :=-9 #KBUILD_CFLAGS +=-pipe diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index 47f001ca5499..e1567418a2b1 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -129,7 +129,9 @@ LDFLAGS_vmlinux += --no-undefined # Delete all temporary local symbols LDFLAGS_vmlinux += -X # Report orphan sections -LDFLAGS_vmlinux += $(call ld-option, --orphan-handling=warn) +ifdef CONFIG_LD_ORPHAN_WARN +LDFLAGS_vmlinux += --orphan-handling=warn +endif # Next argument is a linker script LDFLAGS_vmlinux += -T diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 1515f6f153a0..a6b5b7ef40ae 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -81,6 +81,7 @@ config ARM64 select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36) + select ARCH_WANT_LD_ORPHAN_WARN select ARCH_HAS_UBSAN_SANITIZE_ALL select ARM_AMBA select ARM_ARCH_TIMER diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 5789c2d18d43..6a87d592bd00 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -28,10 +28,6 @@ LDFLAGS_vmlinux += --fix-cortex-a53-843419 endif endif -# We never want expected sections to be placed heuristically by the -# linker. All sections should be explicitly named in the linker script. -LDFLAGS_vmlinux += $(call ld-option, --orphan-handling=warn) - ifeq ($(CONFIG_ARM64_USE_LSE_ATOMICS), y) ifneq ($(CONFIG_ARM64_LSE_ATOMICS), y) $(warning LSE atomics not supported by binutils) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e9f13fe08492..5181872f9452 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -152,6 +152,7 @@ config PPC select ARCH_USE_QUEUED_SPINLOCKS if PPC_QUEUED_SPINLOCKS select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_IRQS_OFF_ACTIVATE_MM + select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WEAK_RELEASE_ACQUIRE select BINFMT_ELF select BUILDTIME_TABLE_SORT diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index a4d56f0a41d9..d9eb0da845e1 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -123,7 +123,6 @@ endif LDFLAGS_vmlinux-y := -Bstatic LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie LDFLAGS_vmlinux := $(LDFLAGS_vmlinux-y) -LDFLAGS_vmlinux += $(call ld-option,--orphan-handling=warn) ifdef CONFIG_PPC64 ifeq ($(call cc-option-yn,-mcmodel=medium),y) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f6946b81f74a..fbf26e0f7a6a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -100,6 +100,7 @@ config X86 select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_HUGE_PMD_SHARE + select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_TABLE_SORT select CLKEVT_I8253 diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 154259f18b8b..1bf21746f4ce 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -209,9 +209,6 @@ ifdef CONFIG_X86_64 LDFLAGS_vmlinux += -z max-page-size=0x200000 endif -# We never want expected sections to be placed heuristically by the -# linker. All sections should be explicitly named in the linker script. 
-LDFLAGS_vmlinux += $(call ld-option, --orphan-handling=warn) archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/x86/tools relocs diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index ee249088cbfe..40b8fd375d52 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -61,7 +61,9 @@ KBUILD_LDFLAGS += $(call ld-option,--no-ld-generated-unwind-info) # Compressed kernel should be built as PIE since it may be loaded at any # address by the bootloader. LDFLAGS_vmlinux := -pie $(call ld-option, --no-dynamic-linker) -LDFLAGS_vmlinux += $(call ld-option, --orphan-handling=warn) +ifdef CONFIG_LD_ORPHAN_WARN +LDFLAGS_vmlinux += --orphan-handling=warn +endif LDFLAGS_vmlinux += -T hostprogs := mkpiggy diff --git a/init/Kconfig b/init/Kconfig index c9446911cf41..92c58b45abb8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1348,6 +1348,11 @@ config LD_DEAD_CODE_DATA_ELIMINATION present. This option is not well tested yet, so use at your own risk. +config LD_ORPHAN_WARN + def_bool y + depends on ARCH_WANT_LD_ORPHAN_WARN + depends on $(ld-option,--orphan-handling=warn) + config SYSCTL bool -- cgit v1.2.3 From fae3a13d2a3d49a89391889808428cf1e72afbd7 Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Mon, 30 Nov 2020 09:57:20 -0600 Subject: x86/resctrl: Fix AMD L3 QOS CDP enable/disable When the AMD QoS feature CDP (code and data prioritization) is enabled or disabled, the CDP bit in MSR 0000_0C81 is written on one of the CPUs in an L3 domain (core complex). That is not correct - the CDP bit needs to be updated on all the logical CPUs in the domain. This was not spelled out clearly in the spec earlier. The specification has been updated and the updated document, "AMD64 Technology Platform Quality of Service Extensions Publication # 56375 Revision: 1.02 Issue Date: October 2020" is available now. Refer to the section: Code and Data Prioritization. Fix the issue by adding a new flag, arch_has_per_cpu_cfg, in the rdt_cache data structure. The documentation can be obtained at: https://developer.amd.com/wp-content/resources/56375.pdf Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 [ bp: Massage commit message.
] Fixes: 4d05bf71f157 ("x86/resctrl: Introduce AMD QOS feature") Signed-off-by: Babu Moger Signed-off-by: Borislav Petkov Reviewed-by: Reinette Chatre Link: https://lkml.kernel.org/r/160675180380.15628.3309402017215002347.stgit@bmoger-ubuntu --- arch/x86/kernel/cpu/resctrl/core.c | 4 ++++ arch/x86/kernel/cpu/resctrl/internal.h | 3 +++ arch/x86/kernel/cpu/resctrl/rdtgroup.c | 9 +++++++-- 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index e5f4ee8f4c3b..e8b5f1cf1ae8 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -570,6 +570,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r) if (d) { cpumask_set_cpu(cpu, &d->cpu_mask); + if (r->cache.arch_has_per_cpu_cfg) + rdt_domain_reconfigure_cdp(r); return; } @@ -923,6 +925,7 @@ static __init void rdt_init_res_defs_intel(void) r->rid == RDT_RESOURCE_L2CODE) { r->cache.arch_has_sparse_bitmaps = false; r->cache.arch_has_empty_bitmaps = false; + r->cache.arch_has_per_cpu_cfg = false; } else if (r->rid == RDT_RESOURCE_MBA) { r->msr_base = MSR_IA32_MBA_THRTL_BASE; r->msr_update = mba_wrmsr_intel; @@ -943,6 +946,7 @@ static __init void rdt_init_res_defs_amd(void) r->rid == RDT_RESOURCE_L2CODE) { r->cache.arch_has_sparse_bitmaps = true; r->cache.arch_has_empty_bitmaps = true; + r->cache.arch_has_per_cpu_cfg = true; } else if (r->rid == RDT_RESOURCE_MBA) { r->msr_base = MSR_IA32_MBA_BW_BASE; r->msr_update = mba_wrmsr_amd; diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index 80fa997fae60..f65d3c0dbc41 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -360,6 +360,8 @@ struct msr_param { * executing entities * @arch_has_sparse_bitmaps: True if a bitmap like f00f is valid. * @arch_has_empty_bitmaps: True if the '0' bitmap is valid. + * @arch_has_per_cpu_cfg: True if QOS_CFG register for this cache + * level has CPU scope. */ struct rdt_cache { unsigned int cbm_len; @@ -369,6 +371,7 @@ struct rdt_cache { unsigned int shareable_bits; bool arch_has_sparse_bitmaps; bool arch_has_empty_bitmaps; + bool arch_has_per_cpu_cfg; }; /** diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 6f4ca4bea625..f3418428682b 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -1909,8 +1909,13 @@ static int set_cache_qos_cfg(int level, bool enable) r_l = &rdt_resources_all[level]; list_for_each_entry(d, &r_l->domains, list) { - /* Pick one CPU from each domain instance to update MSR */ - cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + if (r_l->cache.arch_has_per_cpu_cfg) + /* Pick all the CPUs in the domain instance */ + for_each_cpu(cpu, &d->cpu_mask) + cpumask_set_cpu(cpu, cpu_mask); + else + /* Pick one CPU from each domain instance to update MSR */ + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); } cpu = get_cpu(); /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */ -- cgit v1.2.3 From a1ee28117077c3bf24e5ab6324c835eaab629c45 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 28 Nov 2020 17:07:21 +1000 Subject: powerpc/64s/powernv: Fix memory corruption when saving SLB entries on MCE This can be hit by an HPT guest running on an HPT host and bring down the host, so it's quite important to fix. 
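As the diff below shows, the allocation was sized in entries rather than bytes, so the MCE handler saving SLB contents wrote past the end of each per-CPU buffer. Condensed from the diff:

    /* Before (buggy): only mmu_slb_size bytes per CPU. */
    paca_ptrs[i]->mce_faulty_slbs =
            memblock_alloc_node(mmu_slb_size,
                                __alignof__(struct slb_entry), cpu_to_node(i));

    /* After: bytes = entry size * number of entries. */
    paca_ptrs[i]->mce_faulty_slbs =
            memblock_alloc_node(sizeof(struct slb_entry) * mmu_slb_size,
                                __alignof__(struct slb_entry), cpu_to_node(i));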
Fixes: 7290f3b3d3e6 ("powerpc/64s/powernv: machine check dump SLB contents") Cc: stable@vger.kernel.org # v5.4+ Signed-off-by: Nicholas Piggin Acked-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201128070728.825934-2-npiggin@gmail.com --- arch/powerpc/platforms/powernv/setup.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 46115231a3b2..4426a109ec2f 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -211,11 +211,16 @@ static void __init pnv_init(void) add_preferred_console("hvc", 0, NULL); if (!radix_enabled()) { + size_t size = sizeof(struct slb_entry) * mmu_slb_size; int i; /* Allocate per cpu area to save old slb contents during MCE */ - for_each_possible_cpu(i) - paca_ptrs[i]->mce_faulty_slbs = memblock_alloc_node(mmu_slb_size, __alignof__(*paca_ptrs[i]->mce_faulty_slbs), cpu_to_node(i)); + for_each_possible_cpu(i) { + paca_ptrs[i]->mce_faulty_slbs = + memblock_alloc_node(size, + __alignof__(struct slb_entry), + cpu_to_node(i)); + } } } -- cgit v1.2.3 From a2bd4097b3ec242f4de4924db463a9c94530e03a Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Thu, 26 Nov 2020 18:00:37 +0100 Subject: s390/pci: fix CPU address in MSI for directed IRQ The directed MSIs are delivered to CPUs whose address is written to the MSI message address. The current code assumes that a CPU logical number (as it is seen by the kernel) is also the CPU address. The above assumption is not correct, as the CPU address is rather the value returned by STAP instruction. That value does not necessarily match the kernel logical CPU number. Fixes: e979ce7bced2 ("s390/pci: provide support for CPU directed interrupts") Cc: # v5.2+ Signed-off-by: Alexander Gordeev Reviewed-by: Halil Pasic Reviewed-by: Niklas Schnelle Signed-off-by: Niklas Schnelle Signed-off-by: Heiko Carstens --- arch/s390/pci/pci_irq.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 743f257cf2cb..75217fb63d7b 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -103,9 +103,10 @@ static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *de { struct msi_desc *entry = irq_get_msi_desc(data->irq); struct msi_msg msg = entry->msg; + int cpu_addr = smp_cpu_get_cpu_address(cpumask_first(dest)); msg.address_lo &= 0xff0000ff; - msg.address_lo |= (cpumask_first(dest) << 8); + msg.address_lo |= (cpu_addr << 8); pci_write_msi_msg(data->irq, &msg); return IRQ_SET_MASK_OK; @@ -238,6 +239,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) unsigned long bit; struct msi_desc *msi; struct msi_msg msg; + int cpu_addr; int rc, irq; zdev->aisb = -1UL; @@ -287,9 +289,15 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) handle_percpu_irq); msg.data = hwirq - bit; if (irq_delivery == DIRECTED) { + if (msi->affinity) + cpu = cpumask_first(&msi->affinity->mask); + else + cpu = 0; + cpu_addr = smp_cpu_get_cpu_address(cpu); + msg.address_lo = zdev->msi_addr & 0xff0000ff; - msg.address_lo |= msi->affinity ? 
- (cpumask_first(&msi->affinity->mask) << 8) : 0; + msg.address_lo |= (cpu_addr << 8); + for_each_possible_cpu(cpu) { airq_iv_set_data(zpci_ibv[cpu], hwirq, irq); } -- cgit v1.2.3 From b1cae1f84a0f609a34ebcaa087fbecef32f69882 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 2 Dec 2020 11:46:01 +0100 Subject: s390: fix irq state tracing With commit 58c644ba512c ("sched/idle: Fix arch_cpu_idle() vs tracing") common code calls arch_cpu_idle() with a lockdep state that tells irqs are on. This doesn't work very well for s390: psw_idle() will enable interrupts to wait for an interrupt. As soon as an interrupt occurs the interrupt handler will verify if the old context was psw_idle(). If that is the case the interrupt enablement bits in the old program status word will be cleared. A subsequent test in both the external as well as the io interrupt handler checks if in the old context interrupts were enabled. Due to the above patching of the old program status word it is assumed the old context had interrupts disabled, and therefore a call to TRACE_IRQS_OFF (aka trace_hardirqs_off_caller) is skipped. Which in turn makes lockdep incorrectly "think" that interrupts are enabled within the interrupt handler. Fix this by unconditionally calling TRACE_IRQS_OFF when entering interrupt handlers. Also call unconditionally TRACE_IRQS_ON when leaving interrupts handlers. This leaves the special psw_idle() case, which now returns with interrupts disabled, but has an "irqs on" lockdep state. So callers of psw_idle() must adjust the state on their own, if required. This is currently only __udelay_disabled(). Fixes: 58c644ba512c ("sched/idle: Fix arch_cpu_idle() vs tracing") Acked-by: Peter Zijlstra (Intel) Signed-off-by: Heiko Carstens --- arch/s390/kernel/entry.S | 15 --------------- arch/s390/lib/delay.c | 5 ++--- 2 files changed, 2 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 26bb0603c5a1..92beb1444644 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -763,12 +763,7 @@ ENTRY(io_int_handler) xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ jo .Lio_restore -#if IS_ENABLED(CONFIG_TRACE_IRQFLAGS) - tmhh %r8,0x300 - jz 1f TRACE_IRQS_OFF -1: -#endif xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) .Lio_loop: lgr %r2,%r11 # pass pointer to pt_regs @@ -791,12 +786,7 @@ ENTRY(io_int_handler) TSTMSK __LC_CPU_FLAGS,_CIF_WORK jnz .Lio_work .Lio_restore: -#if IS_ENABLED(CONFIG_TRACE_IRQFLAGS) - tm __PT_PSW(%r11),3 - jno 0f TRACE_IRQS_ON -0: -#endif mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) tm __PT_PSW+1(%r11),0x01 # returning to user ? 
jno .Lio_exit_kernel @@ -976,12 +966,7 @@ ENTRY(ext_int_handler) xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) TSTMSK __LC_CPU_FLAGS,_CIF_IGNORE_IRQ jo .Lio_restore -#if IS_ENABLED(CONFIG_TRACE_IRQFLAGS) - tmhh %r8,0x300 - jz 1f TRACE_IRQS_OFF -1: -#endif xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs lghi %r3,EXT_INTERRUPT diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c index daca7bad66de..8c0c68e7770e 100644 --- a/arch/s390/lib/delay.c +++ b/arch/s390/lib/delay.c @@ -33,7 +33,7 @@ EXPORT_SYMBOL(__delay); static void __udelay_disabled(unsigned long long usecs) { - unsigned long cr0, cr0_new, psw_mask, flags; + unsigned long cr0, cr0_new, psw_mask; struct s390_idle_data idle; u64 end; @@ -45,9 +45,8 @@ static void __udelay_disabled(unsigned long long usecs) psw_mask = __extract_psw() | PSW_MASK_EXT | PSW_MASK_WAIT; set_clock_comparator(end); set_cpu_flag(CIF_IGNORE_IRQ); - local_irq_save(flags); psw_idle(&idle, psw_mask); - local_irq_restore(flags); + trace_hardirqs_off(); clear_cpu_flag(CIF_IGNORE_IRQ); set_clock_comparator(S390_lowcore.clock_comparator); __ctl_load(cr0, 0, 0); -- cgit v1.2.3 From 5debf02131227d39988e44adf5090fb796fa8466 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 26 Nov 2020 20:09:21 +0900 Subject: perf/x86/intel: Fix a warning on x86_pmu_stop() with large PEBS The commit 3966c3feca3f ("x86/perf/amd: Remove need to check "running" bit in NMI handler") introduced this. It seems x86_pmu_stop() can be called recursively (like when it loses some samples) like below: x86_pmu_stop intel_pmu_disable_event (x86_pmu_disable) intel_pmu_pebs_disable intel_pmu_drain_pebs_nhm (x86_pmu_drain_pebs_buffer) x86_pmu_stop While commit 35d1ce6bec13 ("perf/x86/intel/ds: Fix x86_pmu_stop warning for large PEBS") fixed it for the normal cases, there's another path to call x86_pmu_stop() recursively when a PEBS error was detected (like two or more counters overflowed at the same time). Like in Kan's previous fix, we can skip the interrupt accounting for large PEBS, so check the iregs, which is set for PMI only. Fixes: 3966c3feca3f ("x86/perf/amd: Remove need to check "running" bit in NMI handler") Reported-by: John Sperbeck Suggested-by: Peter Zijlstra Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201126110922.317681-1-namhyung@kernel.org --- arch/x86/events/intel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index b47cc4226934..89dba588636e 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1940,7 +1940,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d if (error[bit]) { perf_log_lost_samples(event, error[bit]); - if (perf_event_account_interrupt(event)) + if (iregs && perf_event_account_interrupt(event)) x86_pmu_stop(event, 0); } -- cgit v1.2.3 From fc17db8aa4c53cbd2d5469bb0521ea0f0a6dbb27 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 26 Nov 2020 20:09:22 +0900 Subject: perf/x86/intel: Check PEBS status correctly The kernel cannot disambiguate when 2+ PEBS counters overflow at the same time. This is what the comment for this code suggests. However, I see the comparison is done with the unfiltered p->status which is a copy of IA32_PERF_GLOBAL_STATUS at the time of the sample. This register contains more than the PEBS counter overflow bits. It also includes many other bits which could also be set.
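A standalone illustration of the failure mode (hypothetical values; in the kernel, pebs_status is the same idea applied to p->status, masked down to the enabled PEBS counters):

    #include <stdint.h>
    #include <stdbool.h>

    /* An exact-match test on the raw GLOBAL_STATUS copy fails whenever
     * any non-PEBS status bit is also set, misclassifying a clean
     * single-counter overflow as a collision. */
    static bool single_pebs_overflow(uint64_t status, uint64_t pebs_enabled,
                                     unsigned int bit)
    {
            uint64_t pebs_status = status & pebs_enabled;

            return pebs_status == (1ULL << bit);
    }

With, say, bit 0 (a PEBS counter) plus a high housekeeping bit set in status and pebs_enabled = 1, masking first returns true for bit 0, while comparing the raw status against 1ULL << 0 would not.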
Signed-off-by: Namhyung Kim Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201126110922.317681-2-namhyung@kernel.org --- arch/x86/events/intel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 89dba588636e..485c5066f8b8 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1916,7 +1916,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs, struct perf_sample_d * that caused the PEBS record. It's called collision. * If collision happened, the record will be dropped. */ - if (p->status != (1ULL << bit)) { + if (pebs_status != (1ULL << bit)) { for_each_set_bit(i, (unsigned long *)&pebs_status, size) error[i]++; continue; -- cgit v1.2.3 From 8dcc0e19dfbd73ad6b3172924d6da8f7f3f8b3b0 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 3 Dec 2020 09:22:52 -0600 Subject: x86/platform/uv: Fix UV4 hub revision adjustment Currently, UV4 is incorrectly identified as UV4A and UV4A as UV5. Hub chip starts with revision 1, fix it. [ bp: Massage commit message. ] Fixes: 647128f1536e ("x86/platform/uv: Update UV MMRs for UV5") Signed-off-by: Mike Travis Signed-off-by: Borislav Petkov Reviewed-by: Steve Wahl Acked-by: Dimitri Sivanich Link: https://lkml.kernel.org/r/20201203152252.371199-1-mike.travis@hpe.com --- arch/x86/kernel/apic/x2apic_uv_x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 1b98f8c12b96..235f5cde06fc 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -161,7 +161,7 @@ static int __init early_set_hub_type(void) /* UV4/4A only have a revision difference */ case UV4_HUB_PART_NUMBER: uv_min_hub_revision_id = node_id.s.revision - + UV4_HUB_REVISION_BASE; + + UV4_HUB_REVISION_BASE - 1; uv_hub_type_set(UV4); if (uv_min_hub_revision_id == UV4A_HUB_REVISION_BASE) uv_hub_type_set(UV4|UV4A); -- cgit v1.2.3 From 45c5775460f32ed8cdb7c16986ae1a2c254346b3 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Mon, 30 Nov 2020 09:30:33 +0100 Subject: usb: ohci-omap: Fix descriptor conversion There were a bunch of issues with the patch converting the OMAP1 OSK board to use descriptors for controlling the USB host: - The chip label was incorrect - The GPIO offset was off-by-one - The code should use sleeping accessors This patch tries to fix all issues at the same time. 
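All three fixes show up in the diff below; on the accessor point, the power GPIO is supplied by the I2C-attached TPS65010, and lines behind an I2C expander may sleep, so the _cansleep variant is required (sketch):

    /* May sleep: flipping the line costs an I2C transfer to the
     * TPS65010, so plain gpiod_set_value() would WARN here. */
    gpiod_set_value_cansleep(priv->power, 1);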
Cc: Aaro Koskinen Reported-by: Aaro Koskinen Fixes: 15d157e87443 ("usb: ohci-omap: Convert to use GPIO descriptors") Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20201130083033.29435-1-linus.walleij@linaro.org Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-omap1/board-osk.c | 2 +- drivers/usb/host/ohci-omap.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-omap1/board-osk.c b/arch/arm/mach-omap1/board-osk.c index 144b9caa935c..a720259099ed 100644 --- a/arch/arm/mach-omap1/board-osk.c +++ b/arch/arm/mach-omap1/board-osk.c @@ -288,7 +288,7 @@ static struct gpiod_lookup_table osk_usb_gpio_table = { .dev_id = "ohci", .table = { /* Power GPIO on the I2C-attached TPS65010 */ - GPIO_LOOKUP("i2c-tps65010", 1, "power", GPIO_ACTIVE_HIGH), + GPIO_LOOKUP("tps65010", 0, "power", GPIO_ACTIVE_HIGH), GPIO_LOOKUP(OMAP_GPIO_LABEL, 9, "overcurrent", GPIO_ACTIVE_HIGH), }, diff --git a/drivers/usb/host/ohci-omap.c b/drivers/usb/host/ohci-omap.c index 9ccdf2c216b5..6374501ba139 100644 --- a/drivers/usb/host/ohci-omap.c +++ b/drivers/usb/host/ohci-omap.c @@ -91,14 +91,14 @@ static int omap_ohci_transceiver_power(struct ohci_omap_priv *priv, int on) | ((1 << 5/*usb1*/) | (1 << 3/*usb2*/)), INNOVATOR_FPGA_CAM_USB_CONTROL); else if (priv->power) - gpiod_set_value(priv->power, 0); + gpiod_set_value_cansleep(priv->power, 0); } else { if (machine_is_omap_innovator() && cpu_is_omap1510()) __raw_writeb(__raw_readb(INNOVATOR_FPGA_CAM_USB_CONTROL) & ~((1 << 5/*usb1*/) | (1 << 3/*usb2*/)), INNOVATOR_FPGA_CAM_USB_CONTROL); else if (priv->power) - gpiod_set_value(priv->power, 1); + gpiod_set_value_cansleep(priv->power, 1); } return 0; -- cgit v1.2.3 From 4e9a5ae8df5b3365183150f6df49e49dece80d8c Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 3 Dec 2020 13:50:37 +0900 Subject: x86/uprobes: Do not use prefixes.nbytes when looping over prefixes.bytes Since insn.prefixes.nbytes can be bigger than the size of insn.prefixes.bytes[] when a prefix is repeated, the proper check must be insn.prefixes.bytes[i] != 0 and i < 4 instead of using insn.prefixes.nbytes. Introduce a for_each_insn_prefix() macro for this purpose. Debugged by Kees Cook . [ bp: Massage commit message, sync with the respective header in tools/ and drop "we". ] Fixes: 2b1444983508 ("uprobes, mm, x86: Add the ability to install and remove uprobes breakpoints") Reported-by: syzbot+9b64b619f10f19d19a7c@syzkaller.appspotmail.com Signed-off-by: Masami Hiramatsu Signed-off-by: Borislav Petkov Reviewed-by: Srikar Dronamraju Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/160697103739.3146288.7437620795200799020.stgit@devnote2 --- arch/x86/include/asm/insn.h | 15 +++++++++++++++ arch/x86/kernel/uprobes.c | 10 ++++++---- tools/arch/x86/include/asm/insn.h | 15 +++++++++++++++ 3 files changed, 36 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 5c1ae3eff9d4..a8c3d284fa46 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -201,6 +201,21 @@ static inline int insn_offset_immediate(struct insn *insn) return insn_offset_displacement(insn) + insn->displacement.nbytes; } +/** + * for_each_insn_prefix() -- Iterate prefixes in the instruction + * @insn: Pointer to struct insn. + * @idx: Index storage. + * @prefix: Prefix byte. + * + * Iterate prefix bytes of given @insn. 
Each prefix byte is stored in @prefix + * and the index is stored in @idx (note that this @idx is just for a cursor, + * do not change it.) + * Since prefixes.nbytes can be bigger than 4 if some prefixes + * are repeated, it cannot be used for looping over the prefixes. + */ +#define for_each_insn_prefix(insn, idx, prefix) \ + for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) + #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 3fdaa042823d..138bdb1fd136 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -255,12 +255,13 @@ static volatile u32 good_2byte_insns[256 / 32] = { static bool is_prefix_bad(struct insn *insn) { + insn_byte_t p; int i; - for (i = 0; i < insn->prefixes.nbytes; i++) { + for_each_insn_prefix(insn, i, p) { insn_attr_t attr; - attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]); + attr = inat_get_opcode_attribute(p); switch (attr) { case INAT_MAKE_PREFIX(INAT_PFX_ES): case INAT_MAKE_PREFIX(INAT_PFX_CS): @@ -715,6 +716,7 @@ static const struct uprobe_xol_ops push_xol_ops = { static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) { u8 opc1 = OPCODE1(insn); + insn_byte_t p; int i; switch (opc1) { @@ -746,8 +748,8 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) * Intel and AMD behavior differ in 64-bit mode: Intel ignores 66 prefix. * No one uses these insns, reject any branch insns with such prefix. */ - for (i = 0; i < insn->prefixes.nbytes; i++) { - if (insn->prefixes.bytes[i] == 0x66) + for_each_insn_prefix(insn, i, p) { + if (p == 0x66) return -ENOTSUPP; } diff --git a/tools/arch/x86/include/asm/insn.h b/tools/arch/x86/include/asm/insn.h index 568854b14d0a..52c6262e6bfd 100644 --- a/tools/arch/x86/include/asm/insn.h +++ b/tools/arch/x86/include/asm/insn.h @@ -201,6 +201,21 @@ static inline int insn_offset_immediate(struct insn *insn) return insn_offset_displacement(insn) + insn->displacement.nbytes; } +/** + * for_each_insn_prefix() -- Iterate prefixes in the instruction + * @insn: Pointer to struct insn. + * @idx: Index storage. + * @prefix: Prefix byte. + * + * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix + * and the index is stored in @idx (note that this @idx is just for a cursor, + * do not change it.) + * Since prefixes.nbytes can be bigger than 4 if some prefixes + * are repeated, it cannot be used for looping over the prefixes. + */ +#define for_each_insn_prefix(insn, idx, prefix) \ + for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) + #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e -- cgit v1.2.3 From 12cb908a11b2544b5f53e9af856e6b6a90ed5533 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 3 Dec 2020 13:50:50 +0900 Subject: x86/insn-eval: Use new for_each_insn_prefix() macro to loop over prefixes bytes Since insn.prefixes.nbytes can be bigger than the size of insn.prefixes.bytes[] when a prefix is repeated, the proper check must be insn.prefixes.bytes[i] != 0 and i < 4 instead of using insn.prefixes.nbytes. Use the new for_each_insn_prefix() macro which does it correctly. Debugged by Kees Cook . [ bp: Massage commit message. 
] Fixes: 32d0b95300db ("x86/insn-eval: Add utility functions to get segment selector") Reported-by: syzbot+9b64b619f10f19d19a7c@syzkaller.appspotmail.com Signed-off-by: Masami Hiramatsu Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/160697104969.3146288.16329307586428270032.stgit@devnote2 --- arch/x86/lib/insn-eval.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c index 58f7fb95c7f4..4229950a5d78 100644 --- a/arch/x86/lib/insn-eval.c +++ b/arch/x86/lib/insn-eval.c @@ -63,13 +63,12 @@ static bool is_string_insn(struct insn *insn) */ bool insn_has_rep_prefix(struct insn *insn) { + insn_byte_t p; int i; insn_get_prefixes(insn); - for (i = 0; i < insn->prefixes.nbytes; i++) { - insn_byte_t p = insn->prefixes.bytes[i]; - + for_each_insn_prefix(insn, i, p) { if (p == 0xf2 || p == 0xf3) return true; } @@ -95,14 +94,15 @@ static int get_seg_reg_override_idx(struct insn *insn) { int idx = INAT_SEG_REG_DEFAULT; int num_overrides = 0, i; + insn_byte_t p; insn_get_prefixes(insn); /* Look for any segment override prefixes. */ - for (i = 0; i < insn->prefixes.nbytes; i++) { + for_each_insn_prefix(insn, i, p) { insn_attr_t attr; - attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]); + attr = inat_get_opcode_attribute(p); switch (attr) { case INAT_MAKE_PREFIX(INAT_PFX_CS): idx = INAT_SEG_REG_CS; -- cgit v1.2.3 From 84da009f06e60cf59d5e861f8e2101d2d3885517 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 3 Dec 2020 13:51:01 +0900 Subject: x86/sev-es: Use new for_each_insn_prefix() macro to loop over prefixes bytes Since insn.prefixes.nbytes can be bigger than the size of insn.prefixes.bytes[] when a prefix is repeated, the proper check must be: insn.prefixes.bytes[i] != 0 and i < 4 instead of using insn.prefixes.nbytes. Use the new for_each_insn_prefix() macro which does it correctly. Debugged by Kees Cook . [ bp: Massage commit message. ] Fixes: 25189d08e516 ("x86/sev-es: Add support for handling IOIO exceptions") Reported-by: syzbot+9b64b619f10f19d19a7c@syzkaller.appspotmail.com Signed-off-by: Masami Hiramatsu Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/160697106089.3146288.2052422845039649176.stgit@devnote2 --- arch/x86/boot/compressed/sev-es.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/boot/compressed/sev-es.c b/arch/x86/boot/compressed/sev-es.c index 954cb2702e23..27826c265aab 100644 --- a/arch/x86/boot/compressed/sev-es.c +++ b/arch/x86/boot/compressed/sev-es.c @@ -32,13 +32,12 @@ struct ghcb *boot_ghcb; */ static bool insn_has_rep_prefix(struct insn *insn) { + insn_byte_t p; int i; insn_get_prefixes(insn); - for (i = 0; i < insn->prefixes.nbytes; i++) { - insn_byte_t p = insn->prefixes.bytes[i]; - + for_each_insn_prefix(insn, i, p) { if (p == 0xf2 || p == 0xf3) return true; } -- cgit v1.2.3 From e91d8d78237de8d7120c320b3645b7100848f24d Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 5 Dec 2020 22:14:51 -0800 Subject: mm/zsmalloc.c: drop ZSMALLOC_PGTABLE_MAPPING While I was doing zram testing, I found that decompression sometimes failed since the compression buffer was corrupted. On investigation, I found that the commit below calls cond_resched() unconditionally, so it can cause a problem in atomic context if the task is rescheduled.
BUG: sleeping function called from invalid context at mm/vmalloc.c:108 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 946, name: memhog 3 locks held by memhog/946: #0: ffff9d01d4b193e8 (&mm->mmap_lock#2){++++}-{4:4}, at: __mm_populate+0x103/0x160 #1: ffffffffa3d53de0 (fs_reclaim){+.+.}-{0:0}, at: __alloc_pages_slowpath.constprop.0+0xa98/0x1160 #2: ffff9d01d56b8110 (&zspage->lock){.+.+}-{3:3}, at: zs_map_object+0x8e/0x1f0 CPU: 0 PID: 946 Comm: memhog Not tainted 5.9.3-00011-gc5bfc0287345-dirty #316 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014 Call Trace: unmap_kernel_range_noflush+0x2eb/0x350 unmap_kernel_range+0x14/0x30 zs_unmap_object+0xd5/0xe0 zram_bvec_rw.isra.0+0x38c/0x8e0 zram_rw_page+0x90/0x101 bdev_write_page+0x92/0xe0 __swap_writepage+0x94/0x4a0 pageout+0xe3/0x3a0 shrink_page_list+0xb94/0xd60 shrink_inactive_list+0x158/0x460 We can fix this by removing the ZSMALLOC_PGTABLE_MAPPING feature (which contains the offending calling code) from zsmalloc. Even though this option showed some improvement (e.g., 30%) on some arm32 platforms, it has been a headache to maintain since it has abused APIs[1] (e.g., unmap_kernel_range in atomic context). Since we are approaching the deprecation of 32-bit machines, the config option has only been available for builtin builds since v5.8, and it has not been the default option in zsmalloc, it's time to drop the option for better maintenance. [1] http://lore.kernel.org/linux-mm/20201105170249.387069-1-minchan@kernel.org Fixes: e47110e90584 ("mm/vunmap: add cond_resched() in vunmap_pmd_range") Signed-off-by: Minchan Kim Signed-off-by: Andrew Morton Reviewed-by: Sergey Senozhatsky Cc: Tony Lindgren Cc: Christoph Hellwig Cc: Harish Sriram Cc: Uladzislau Rezki Cc: Link: https://lkml.kernel.org/r/20201117202916.GA3856507@google.com Signed-off-by: Linus Torvalds --- arch/arm/configs/omap2plus_defconfig | 1 - include/linux/zsmalloc.h | 1 - mm/Kconfig | 13 --------- mm/zsmalloc.c | 54 ------------------------------------ 4 files changed, 69 deletions(-) (limited to 'arch') diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 34793aabdb65..58df9fd79a76 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -81,7 +81,6 @@ CONFIG_PARTITION_ADVANCED=y CONFIG_BINFMT_MISC=y CONFIG_CMA=y CONFIG_ZSMALLOC=m -CONFIG_ZSMALLOC_PGTABLE_MAPPING=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_UNIX=y diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 0fdbf653b173..4807ca4d52e0 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -20,7 +20,6 @@ * zsmalloc mapping modes * * NOTE: These only make a difference when a mapped object spans pages. - * They also have no effect when ZSMALLOC_PGTABLE_MAPPING is selected. */ enum zs_mapmode { ZS_MM_RW, /* normal read-write mapping */ diff --git a/mm/Kconfig b/mm/Kconfig index d42423f884a7..390165ffbb0f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -707,19 +707,6 @@ config ZSMALLOC returned by an alloc(). This handle must be mapped in order to access the allocated space. -config ZSMALLOC_PGTABLE_MAPPING - bool "Use page table mapping to access object in zsmalloc" - depends on ZSMALLOC=y - help - By default, zsmalloc uses a copy-based object mapping method to - access allocations that span two pages. However, if a particular - architecture (ex, ARM) performs VM mapping faster than copying, - then you should select this.
This causes zsmalloc to use page table - mapping rather than copying for object mapping. - - You can check speed with zsmalloc benchmark: - https://github.com/spartacus06/zsmapbench - config ZSMALLOC_STAT bool "Export zsmalloc statistics" depends on ZSMALLOC diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 918c7b019b3d..cdfaaadea8ff 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -293,11 +293,7 @@ struct zspage { }; struct mapping_area { -#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING - struct vm_struct *vm; /* vm area for mapping object that span pages */ -#else char *vm_buf; /* copy buffer for objects that span pages */ -#endif char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ }; @@ -1113,54 +1109,6 @@ static struct zspage *find_get_zspage(struct size_class *class) return zspage; } -#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING -static inline int __zs_cpu_up(struct mapping_area *area) -{ - /* - * Make sure we don't leak memory if a cpu UP notification - * and zs_init() race and both call zs_cpu_up() on the same cpu - */ - if (area->vm) - return 0; - area->vm = get_vm_area(PAGE_SIZE * 2, 0); - if (!area->vm) - return -ENOMEM; - - /* - * Populate ptes in advance to avoid pte allocation with GFP_KERNEL - * in non-preemtible context of zs_map_object. - */ - return apply_to_page_range(&init_mm, (unsigned long)area->vm->addr, - PAGE_SIZE * 2, NULL, NULL); -} - -static inline void __zs_cpu_down(struct mapping_area *area) -{ - if (area->vm) - free_vm_area(area->vm); - area->vm = NULL; -} - -static inline void *__zs_map_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - unsigned long addr = (unsigned long)area->vm->addr; - - BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0); - area->vm_addr = area->vm->addr; - return area->vm_addr + off; -} - -static inline void __zs_unmap_object(struct mapping_area *area, - struct page *pages[2], int off, int size) -{ - unsigned long addr = (unsigned long)area->vm_addr; - - unmap_kernel_range(addr, PAGE_SIZE * 2); -} - -#else /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ - static inline int __zs_cpu_up(struct mapping_area *area) { /* @@ -1241,8 +1189,6 @@ out: pagefault_enable(); } -#endif /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */ - static int zs_cpu_prepare(unsigned int cpu) { struct mapping_area *area; -- cgit v1.2.3
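For context on what remains after this removal: the copy-based path stitches an object that spans two pages into a per-CPU buffer under kmap_atomic(), so no page-table work (and none of the cond_resched()-prone vunmap machinery) runs while zspage->lock is held. A simplified sketch of the surviving __zs_map_object() logic:

    /* Simplified from mm/zsmalloc.c; error handling and the read/write
     * mode distinction are omitted. */
    static void *copy_map_object(struct mapping_area *area,
                                 struct page *pages[2], int off, int size)
    {
            size_t sizes[2];
            char *addr;

            sizes[0] = PAGE_SIZE - off;
            sizes[1] = size - sizes[0];

            /* Copy both halves of the spanning object into the buffer. */
            addr = kmap_atomic(pages[0]);
            memcpy(area->vm_buf, addr + off, sizes[0]);
            kunmap_atomic(addr);

            addr = kmap_atomic(pages[1]);
            memcpy(area->vm_buf + sizes[0], addr, sizes[1]);
            kunmap_atomic(addr);

            return area->vm_buf;
    }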