From 5d1335dabb3c493a3d6d5b233953b6ac7b6c1ff2 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Mon, 19 Dec 2022 20:56:36 +0100 Subject: parisc: Fix return code of pdc_iodc_print() There is an off-by-one if the printed string includes a new-line char. Cc: stable@vger.kernel.org Signed-off-by: Helge Deller --- arch/parisc/kernel/firmware.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/parisc/kernel/firmware.c b/arch/parisc/kernel/firmware.c index 4dfe1f49c5c8..6817892a2c58 100644 --- a/arch/parisc/kernel/firmware.c +++ b/arch/parisc/kernel/firmware.c @@ -1303,7 +1303,7 @@ static char iodc_dbuf[4096] __page_aligned_bss; */ int pdc_iodc_print(const unsigned char *str, unsigned count) { - unsigned int i; + unsigned int i, found = 0; unsigned long flags; count = min_t(unsigned int, count, sizeof(iodc_dbuf)); @@ -1315,6 +1315,7 @@ int pdc_iodc_print(const unsigned char *str, unsigned count) iodc_dbuf[i+0] = '\r'; iodc_dbuf[i+1] = '\n'; i += 2; + found = 1; goto print; default: iodc_dbuf[i] = str[i]; @@ -1330,7 +1331,7 @@ print: __pa(pdc_result), 0, __pa(iodc_dbuf), i, 0); spin_unlock_irqrestore(&pdc_lock, flags); - return i; + return i - found; } #if !defined(BOOTLOADER) -- cgit v1.2.3 From a23eaf9368aafa4defcc8904b20391b6ea07bb1e Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 27 Jan 2023 07:54:48 +0800 Subject: KVM: arm64: Add helper vgic_write_guest_lock() Currently, the unknown no-running-vcpu sites are reported when a dirty page is tracked by mark_page_dirty_in_slot(). Until now, the only known no-running-vcpu site is saving vgic/its tables through KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_SAVE_TABLES} command on KVM device "kvm-arm-vgic-its". Unfortunately, there are more unknown sites to be handled and no-running-vcpu context will be allowed in these sites: (1) KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_RESTORE_TABLES} command on KVM device "kvm-arm-vgic-its" to restore vgic/its tables. The vgic3 LPI pending status could be restored. (2) Save vgic3 pending table through KVM_DEV_ARM_{VGIC_GRP_CTRL, VGIC_SAVE_PENDING_TABLES} command on KVM device "kvm-arm-vgic-v3". In order to handle those unknown cases, we need a unified helper vgic_write_guest_lock(). struct vgic_dist::save_its_tables_in_progress is also renamed to struct vgic_dist::save_tables_in_progress. No functional change intended. 
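In rough terms, the change moves the in-progress flag from the single command-level site into the write helper itself; a condensed before/after sketch of the vgic_its_ctrl() hunk below (abridged for illustration, not a literal patch hunk):

        /* Before: only the ITS table save flagged itself, around the whole command. */
        case KVM_DEV_ARM_ITS_SAVE_TABLES:
                dist->save_its_tables_in_progress = true;
                ret = abi->save_tables(its);
                dist->save_its_tables_in_progress = false;
                break;

        /* After: every guest table write sets the flag inside vgic_write_guest_lock(),
         * so the restore path and the vgic3 pending-table save are covered as well. */
        case KVM_DEV_ARM_ITS_SAVE_TABLES:
                ret = abi->save_tables(its);    /* writes go via vgic_write_guest_lock() */
                break;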
Suggested-by: Oliver Upton Signed-off-by: Gavin Shan Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230126235451.469087-3-gshan@redhat.com --- arch/arm64/kvm/vgic/vgic-its.c | 13 +++++-------- arch/arm64/kvm/vgic/vgic.h | 14 ++++++++++++++ include/kvm/arm_vgic.h | 2 +- 3 files changed, 20 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 94a666dd1443..2642e9ce2819 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -2187,7 +2187,7 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | ite->collection->collection_id; val = cpu_to_le64(val); - return kvm_write_guest_lock(kvm, gpa, &val, ite_esz); + return vgic_write_guest_lock(kvm, gpa, &val, ite_esz); } /** @@ -2339,7 +2339,7 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | (dev->num_eventid_bits - 1)); val = cpu_to_le64(val); - return kvm_write_guest_lock(kvm, ptr, &val, dte_esz); + return vgic_write_guest_lock(kvm, ptr, &val, dte_esz); } /** @@ -2526,7 +2526,7 @@ static int vgic_its_save_cte(struct vgic_its *its, ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) | collection->collection_id); val = cpu_to_le64(val); - return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz); + return vgic_write_guest_lock(its->dev->kvm, gpa, &val, esz); } /* @@ -2607,7 +2607,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its) */ val = 0; BUG_ON(cte_esz > sizeof(val)); - ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); + ret = vgic_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); return ret; } @@ -2743,7 +2743,6 @@ static int vgic_its_has_attr(struct kvm_device *dev, static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) { const struct vgic_its_abi *abi = vgic_its_get_abi(its); - struct vgic_dist *dist = &kvm->arch.vgic; int ret = 0; if (attr == KVM_DEV_ARM_VGIC_CTRL_INIT) /* Nothing to do */ @@ -2763,9 +2762,7 @@ static int vgic_its_ctrl(struct kvm *kvm, struct vgic_its *its, u64 attr) vgic_its_reset(kvm, its); break; case KVM_DEV_ARM_ITS_SAVE_TABLES: - dist->save_its_tables_in_progress = true; ret = abi->save_tables(its); - dist->save_its_tables_in_progress = false; break; case KVM_DEV_ARM_ITS_RESTORE_TABLES: ret = abi->restore_tables(its); @@ -2792,7 +2789,7 @@ bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm) { struct vgic_dist *dist = &kvm->arch.vgic; - return dist->save_its_tables_in_progress; + return dist->table_write_in_progress; } static int vgic_its_set_attr(struct kvm_device *dev, diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index 23e280fa0a16..7f7f3c5ed85a 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -6,6 +6,7 @@ #define __KVM_ARM_VGIC_NEW_H__ #include +#include #define PRODUCT_ID_KVM 0x4b /* ASCII code K */ #define IMPLEMENTER_ARM 0x43b @@ -131,6 +132,19 @@ static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq) return vgic_irq_get_lr_count(irq) > 1; } +static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa, + const void *data, unsigned long len) +{ + struct vgic_dist *dist = &kvm->arch.vgic; + int ret; + + dist->table_write_in_progress = true; + ret = kvm_write_guest_lock(kvm, gpa, data, len); + dist->table_write_in_progress = false; + + return ret; +} + /* * This struct provides an 
intermediate representation of the fields contained * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index 9270cd87da3f..6470f67e63c4 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -263,7 +263,7 @@ struct vgic_dist { struct vgic_io_device dist_iodev; bool has_its; - bool save_its_tables_in_progress; + bool table_write_in_progress; /* * Contains the attributes and gpa of the LPI configuration table. -- cgit v1.2.3 From 2f8b1ad2228a7f1f1e2458864f4bfc1cbdf511ed Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 27 Jan 2023 07:54:50 +0800 Subject: KVM: arm64: Allow no running vcpu on restoring vgic3 LPI pending status We don't have a running VCPU context to restore vgic3 LPI pending status due to command KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_RESTORE_TABLES} on KVM device "kvm-arm-vgic-its". Use vgic_write_guest_lock() to restore vgic3 LPI pending status. Signed-off-by: Gavin Shan Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230126235451.469087-4-gshan@redhat.com --- Documentation/virt/kvm/api.rst | 8 +++++--- arch/arm64/kvm/vgic/vgic-v3.c | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index dbed78a9c31b..25e9095596af 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8070,9 +8070,11 @@ considering the state as complete. VMM needs to ensure that the dirty state is final and avoid missing dirty pages from another ioctl ordered after the bitmap collection. -NOTE: One example of using the backup bitmap is saving arm64 vgic/its -tables through KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_SAVE_TABLES} command on -KVM device "kvm-arm-vgic-its" when dirty ring is enabled. +NOTE: Multiple examples of using the backup bitmap: (1) save vgic/its +tables through command KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_SAVE_TABLES} on +KVM device "kvm-arm-vgic-its". (2) restore vgic/its tables through +command KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_RESTORE_TABLES} on KVM device +"kvm-arm-vgic-its". VGICv3 LPI pending status is restored. 8.30 KVM_CAP_XEN_HVM -------------------- diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 2624963cb95b..3dab286ed23f 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -339,7 +339,7 @@ retry: if (status) { /* clear consumed data */ val &= ~(1 << bit_nr); - ret = kvm_write_guest_lock(kvm, ptr, &val, 1); + ret = vgic_write_guest_lock(kvm, ptr, &val, 1); if (ret) return ret; } -- cgit v1.2.3 From 6028acbe3a5f2119a2a6ddd3e06453c87c09cae0 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 27 Jan 2023 07:54:51 +0800 Subject: KVM: arm64: Allow no running vcpu on saving vgic3 pending table We don't have a running VCPU context to save vgic3 pending table due to KVM_DEV_ARM_VGIC_{GRP_CTRL, SAVE_PENDING_TABLES} command on KVM device "kvm-arm-vgic-v3". The unknown case is caught by kvm-unit-tests. 
# ./kvm-unit-tests/tests/its-pending-migration WARNING: CPU: 120 PID: 7973 at arch/arm64/kvm/../../../virt/kvm/kvm_main.c:3325 \ mark_page_dirty_in_slot+0x60/0xe0 : mark_page_dirty_in_slot+0x60/0xe0 __kvm_write_guest_page+0xcc/0x100 kvm_write_guest+0x7c/0xb0 vgic_v3_save_pending_tables+0x148/0x2a0 vgic_set_common_attr+0x158/0x240 vgic_v3_set_attr+0x4c/0x5c kvm_device_ioctl+0x100/0x160 __arm64_sys_ioctl+0xa8/0xf0 invoke_syscall.constprop.0+0x7c/0xd0 el0_svc_common.constprop.0+0x144/0x160 do_el0_svc+0x34/0x60 el0_svc+0x3c/0x1a0 el0t_64_sync_handler+0xb4/0x130 el0t_64_sync+0x178/0x17c Use vgic_write_guest_lock() to save vgic3 pending table. Reported-by: Zenghui Yu Signed-off-by: Gavin Shan Reviewed-by: Oliver Upton Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20230126235451.469087-5-gshan@redhat.com --- Documentation/virt/kvm/api.rst | 4 +++- arch/arm64/kvm/vgic/vgic-v3.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 25e9095596af..5006901ad6c6 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -8074,7 +8074,9 @@ NOTE: Multiple examples of using the backup bitmap: (1) save vgic/its tables through command KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_SAVE_TABLES} on KVM device "kvm-arm-vgic-its". (2) restore vgic/its tables through command KVM_DEV_ARM_{VGIC_GRP_CTRL, ITS_RESTORE_TABLES} on KVM device -"kvm-arm-vgic-its". VGICv3 LPI pending status is restored. +"kvm-arm-vgic-its". VGICv3 LPI pending status is restored. (3) save +vgic3 pending table through KVM_DEV_ARM_VGIC_{GRP_CTRL, SAVE_PENDING_TABLES} +command on KVM device "kvm-arm-vgic-v3". 8.30 KVM_CAP_XEN_HVM -------------------- diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 3dab286ed23f..684bdfaad4a9 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -434,7 +434,7 @@ int vgic_v3_save_pending_tables(struct kvm *kvm) else val &= ~(1 << bit_nr); - ret = kvm_write_guest_lock(kvm, ptr, &val, 1); + ret = vgic_write_guest_lock(kvm, ptr, &val, 1); if (ret) goto out; } -- cgit v1.2.3 From 8afffce6aa3bddc940ac1909627ff1e772b6cbf1 Mon Sep 17 00:00:00 2001 From: Sathvika Vasireddy Date: Sat, 28 Jan 2023 18:11:38 +0530 Subject: powerpc/85xx: Fix unannotated intra-function call warning objtool throws the following warning: arch/powerpc/kernel/head_85xx.o: warning: objtool: .head.text+0x1a6c: unannotated intra-function call Fix the warning by annotating KernelSPE symbol with SYM_FUNC_START_LOCAL and SYM_FUNC_END macros. Reported-by: kernel test robot Signed-off-by: Sathvika Vasireddy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20230128124138.1066176-1-sv@linux.ibm.com --- arch/powerpc/kernel/head_85xx.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S index d438ca74e96c..fdbee1093e2b 100644 --- a/arch/powerpc/kernel/head_85xx.S +++ b/arch/powerpc/kernel/head_85xx.S @@ -864,7 +864,7 @@ _GLOBAL(load_up_spe) * SPE unavailable trap from kernel - print a message, but let * the task use SPE in the kernel until it returns to user mode. 
*/ -KernelSPE: +SYM_FUNC_START_LOCAL(KernelSPE) lwz r3,_MSR(r1) oris r3,r3,MSR_SPE@h stw r3,_MSR(r1) /* enable use of SPE after return */ @@ -881,6 +881,7 @@ KernelSPE: #endif .align 4,0 +SYM_FUNC_END(KernelSPE) #endif /* CONFIG_SPE */ /* -- cgit v1.2.3 From fe6de81b610e5d0b9d2231acff2de74a35482e7d Mon Sep 17 00:00:00 2001 From: Sathvika Vasireddy Date: Sat, 28 Jan 2023 18:11:58 +0530 Subject: powerpc/kvm: Fix unannotated intra-function call warning objtool throws the following warning: arch/powerpc/kvm/booke.o: warning: objtool: kvmppc_fill_pt_regs+0x30: unannotated intra-function call Fix the warning by setting the value of 'nip' using the _THIS_IP_ macro, without using an assembly bl/mflr sequence to save the instruction pointer. Reported-by: kernel test robot Suggested-by: Michael Ellerman Signed-off-by: Sathvika Vasireddy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20230128124158.1066251-1-sv@linux.ibm.com --- arch/powerpc/kvm/booke.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 0dce93ccaadf..e89281d3ba28 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -912,16 +912,15 @@ static int kvmppc_handle_debug(struct kvm_vcpu *vcpu) static void kvmppc_fill_pt_regs(struct pt_regs *regs) { - ulong r1, ip, msr, lr; + ulong r1, msr, lr; asm("mr %0, 1" : "=r"(r1)); asm("mflr %0" : "=r"(lr)); asm("mfmsr %0" : "=r"(msr)); - asm("bl 1f; 1: mflr %0" : "=r"(ip)); memset(regs, 0, sizeof(*regs)); regs->gpr[1] = r1; - regs->nip = ip; + regs->nip = _THIS_IP_; regs->msr = msr; regs->link = lr; } -- cgit v1.2.3 From bc88ef663265676419555df2dc469a471c0add31 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 21 Jan 2023 19:53:52 +1000 Subject: powerpc/64s: Fix local irq disable when PMIs are disabled When PMI interrupts are soft-masked, local_irq_save() will clear the PMI mask bit, allowing PMIs in and causing a race condition. This causes a deadlock in native_hpte_insert via hash_preload, which depends on PMIs being disabled since commit 8b91cee5eadd ("powerpc/64s/hash: Make hash faults work in NMI context"). native_hpte_insert calls local_irq_save(). It's possible the lpar hash code is also affected when tracing is enabled because __trace_hcall_entry() calls local_irq_save(). Fix this by making arch_local_irq_save() _or_ the IRQS_DISABLED bit into the mask. This was found with the stress_hpt option with a kbuild workload running together with `perf record -g`. 
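Concretely, with the soft-mask encoding from asm/hw_irq.h, where IRQS_DISABLED and IRQS_PMI_DISABLED are independent bits, the difference looks like this (illustrative sketch, not a hunk from the patch):

        /* PMIs soft-masked on entry: mask == IRQS_PMI_DISABLED */

        flags = irq_soft_mask_set_return(IRQS_DISABLED);
        /* mask is now just IRQS_DISABLED: the PMI bit has been dropped,
         * so a PMI can be taken inside the supposedly-disabled section */

        flags = irq_soft_mask_or_return(IRQS_DISABLED);
        /* mask is now IRQS_DISABLED | IRQS_PMI_DISABLED: the PMI bit
         * is preserved, closing the race */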
Fixes: f442d004806e ("powerpc/64s: Add support to mask perf interrupts and replay them") Fixes: 8b91cee5eadd ("powerpc/64s/hash: Make hash faults work in NMI context") Signed-off-by: Nicholas Piggin [mpe: Just take the fix without the new warning] Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20230121095352.2823517-1-npiggin@gmail.com --- arch/powerpc/include/asm/hw_irq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 77fa88c2aed0..0b7d01d408ac 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -192,7 +192,7 @@ static inline void arch_local_irq_enable(void) static inline unsigned long arch_local_irq_save(void) { - return irq_soft_mask_set_return(IRQS_DISABLED); + return irq_soft_mask_or_return(IRQS_DISABLED); } static inline bool arch_irqs_disabled_flags(unsigned long flags) -- cgit v1.2.3 From c28548012ee2bac55772ef7685138bd1124b80c3 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sat, 21 Jan 2023 20:01:56 +1000 Subject: powerpc/64: Fix perf profiling asynchronous interrupt handlers Interrupt entry sets the soft mask to IRQS_ALL_DISABLED to match the hard irq disabled state. So when should_hard_irq_enable() returns true because we want PMI interrupts in irq handlers, MSR[EE] is enabled but PMIs just get soft-masked. Fix this by clearing IRQS_PMI_DISABLED before enabling MSR[EE]. This also tidies some of the warnings, no need to duplicate them in both should_hard_irq_enable() and do_hard_irq_enable(). Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20230121100156.2824054-1-npiggin@gmail.com --- arch/powerpc/include/asm/hw_irq.h | 41 +++++++++++++++++++++++++++------------ arch/powerpc/kernel/dbell.c | 2 +- arch/powerpc/kernel/irq.c | 2 +- arch/powerpc/kernel/time.c | 2 +- 4 files changed, 32 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 0b7d01d408ac..eb6d094083fd 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -173,6 +173,15 @@ static inline notrace unsigned long irq_soft_mask_or_return(unsigned long mask) return flags; } +static inline notrace unsigned long irq_soft_mask_andc_return(unsigned long mask) +{ + unsigned long flags = irq_soft_mask_return(); + + irq_soft_mask_set(flags & ~mask); + + return flags; +} + static inline unsigned long arch_local_save_flags(void) { return irq_soft_mask_return(); @@ -331,10 +340,11 @@ bool power_pmu_wants_prompt_pmi(void); * is a different soft-masked interrupt pending that requires hard * masking. 
*/ -static inline bool should_hard_irq_enable(void) +static inline bool should_hard_irq_enable(struct pt_regs *regs) { if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { - WARN_ON(irq_soft_mask_return() == IRQS_ENABLED); + WARN_ON(irq_soft_mask_return() != IRQS_ALL_DISABLED); + WARN_ON(!(get_paca()->irq_happened & PACA_IRQ_HARD_DIS)); WARN_ON(mfmsr() & MSR_EE); } @@ -347,8 +357,17 @@ static inline bool should_hard_irq_enable(void) * * TODO: Add test for 64e */ - if (IS_ENABLED(CONFIG_PPC_BOOK3S_64) && !power_pmu_wants_prompt_pmi()) - return false; + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) { + if (!power_pmu_wants_prompt_pmi()) + return false; + /* + * If PMIs are disabled then IRQs should be disabled as well, + * so we shouldn't see this condition, check for it just in + * case because we are about to enable PMIs. + */ + if (WARN_ON_ONCE(regs->softe & IRQS_PMI_DISABLED)) + return false; + } if (get_paca()->irq_happened & PACA_IRQ_MUST_HARD_MASK) return false; @@ -358,18 +377,16 @@ static inline bool should_hard_irq_enable(void) /* * Do the hard enabling, only call this if should_hard_irq_enable is true. + * This allows PMI interrupts to profile irq handlers. */ static inline void do_hard_irq_enable(void) { - if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) { - WARN_ON(irq_soft_mask_return() == IRQS_ENABLED); - WARN_ON(get_paca()->irq_happened & PACA_IRQ_MUST_HARD_MASK); - WARN_ON(mfmsr() & MSR_EE); - } /* - * This allows PMI interrupts (and watchdog soft-NMIs) through. - * There is no other reason to enable this way. + * Asynch interrupts come in with IRQS_ALL_DISABLED, + * PACA_IRQ_HARD_DIS, and MSR[EE]=0. */ + if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) + irq_soft_mask_andc_return(IRQS_PMI_DISABLED); get_paca()->irq_happened &= ~PACA_IRQ_HARD_DIS; __hard_irq_enable(); } @@ -452,7 +469,7 @@ static inline bool arch_irq_disabled_regs(struct pt_regs *regs) return !(regs->msr & MSR_EE); } -static __always_inline bool should_hard_irq_enable(void) +static __always_inline bool should_hard_irq_enable(struct pt_regs *regs) { return false; } diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c index f55c6fb34a3a..5712dd846263 100644 --- a/arch/powerpc/kernel/dbell.c +++ b/arch/powerpc/kernel/dbell.c @@ -27,7 +27,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(doorbell_exception) ppc_msgsync(); - if (should_hard_irq_enable()) + if (should_hard_irq_enable(regs)) do_hard_irq_enable(); kvmppc_clear_host_ipi(smp_processor_id()); diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index c5b9ce887483..c9535f2760b5 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -238,7 +238,7 @@ static void __do_irq(struct pt_regs *regs, unsigned long oldsp) irq = static_call(ppc_get_irq)(); /* We can hard enable interrupts now to allow perf interrupts */ - if (should_hard_irq_enable()) + if (should_hard_irq_enable(regs)) do_hard_irq_enable(); /* And finally process it */ diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index d68de3618741..e26eb6618ae5 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -515,7 +515,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt) } /* Conditionally hard-enable interrupts. */ - if (should_hard_irq_enable()) { + if (should_hard_irq_enable(regs)) { /* * Ensure a positive value is written to the decrementer, or * else some CPUs will continue to take decrementer exceptions. 
-- cgit v1.2.3 From ad53db4acb415976761d7302f5b02e97f2bd097e Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 30 Jan 2023 12:44:01 +1100 Subject: powerpc/imc-pmu: Revert nest_init_lock to being a mutex The recent commit 76d588dddc45 ("powerpc/imc-pmu: Fix use of mutex in IRQs disabled section") fixed warnings (and possible deadlocks) in the IMC PMU driver by converting the locking to use spinlocks. It also converted the init-time nest_init_lock to a spinlock, even though it's not used at runtime in IRQ disabled sections or while holding other spinlocks. This leads to warnings such as: BUG: sleeping function called from invalid context at include/linux/percpu-rwsem.h:49 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 1, name: swapper/0 preempt_count: 1, expected: 0 CPU: 7 PID: 1 Comm: swapper/0 Not tainted 6.2.0-rc2-14719-gf12cd06109f4-dirty #1 Hardware name: Mambo,Simulated-System POWER9 0x4e1203 opal:v6.6.6 PowerNV Call Trace: dump_stack_lvl+0x74/0xa8 (unreliable) __might_resched+0x178/0x1a0 __cpuhp_setup_state+0x64/0x1e0 init_imc_pmu+0xe48/0x1250 opal_imc_counters_probe+0x30c/0x6a0 platform_probe+0x78/0x110 really_probe+0x104/0x420 __driver_probe_device+0xb0/0x170 driver_probe_device+0x58/0x180 __driver_attach+0xd8/0x250 bus_for_each_dev+0xb4/0x140 driver_attach+0x34/0x50 bus_add_driver+0x1e8/0x2d0 driver_register+0xb4/0x1c0 __platform_driver_register+0x38/0x50 opal_imc_driver_init+0x2c/0x40 do_one_initcall+0x80/0x360 kernel_init_freeable+0x310/0x3b8 kernel_init+0x30/0x1a0 ret_from_kernel_thread+0x5c/0x64 Fix it by converting nest_init_lock back to a mutex, so that we can call sleeping functions while holding it. There is no interaction between nest_init_lock and the runtime spinlocks used by the actual PMU routines. Fixes: 76d588dddc45 ("powerpc/imc-pmu: Fix use of mutex in IRQs disabled section") Tested-by: Kajol Jain Reviewed-by: Kajol Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20230130014401.540543-1-mpe@ellerman.id.au --- arch/powerpc/perf/imc-pmu.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 100e97daf76b..9d229ef7f86e 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -22,7 +22,7 @@ * Used to avoid races in counting the nest-pmu units during hotplug * register and unregister */ -static DEFINE_SPINLOCK(nest_init_lock); +static DEFINE_MUTEX(nest_init_lock); static DEFINE_PER_CPU(struct imc_pmu_ref *, local_nest_imc_refc); static struct imc_pmu **per_nest_pmu_arr; static cpumask_t nest_imc_cpumask; @@ -1629,7 +1629,7 @@ static void imc_common_mem_free(struct imc_pmu *pmu_ptr) static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) { if (pmu_ptr->domain == IMC_DOMAIN_NEST) { - spin_lock(&nest_init_lock); + mutex_lock(&nest_init_lock); if (nest_pmus == 1) { cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE); kfree(nest_imc_refc); @@ -1639,7 +1639,7 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) if (nest_pmus > 0) nest_pmus--; - spin_unlock(&nest_init_lock); + mutex_unlock(&nest_init_lock); } /* Free core_imc memory */ @@ -1796,11 +1796,11 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id * rest. To handle the cpuhotplug callback unregister, we track * the number of nest pmus in "nest_pmus". 
*/ - spin_lock(&nest_init_lock); + mutex_lock(&nest_init_lock); if (nest_pmus == 0) { ret = init_nest_pmu_ref(); if (ret) { - spin_unlock(&nest_init_lock); + mutex_unlock(&nest_init_lock); kfree(per_nest_pmu_arr); per_nest_pmu_arr = NULL; goto err_free_mem; @@ -1808,7 +1808,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id /* Register for cpu hotplug notification. */ ret = nest_pmu_cpumask_init(); if (ret) { - spin_unlock(&nest_init_lock); + mutex_unlock(&nest_init_lock); kfree(nest_imc_refc); kfree(per_nest_pmu_arr); per_nest_pmu_arr = NULL; @@ -1816,7 +1816,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id } } nest_pmus++; - spin_unlock(&nest_init_lock); + mutex_unlock(&nest_init_lock); break; case IMC_DOMAIN_CORE: ret = core_imc_pmu_cpumask_init(); -- cgit v1.2.3 From 7294194b47e994753a86eee8cf1c61f3f36458a3 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 30 Jan 2023 12:47:07 +1100 Subject: powerpc/kexec_file: Fix division by zero in extra size estimation In kexec_extra_fdt_size_ppc64() there's logic to estimate how much extra space will be needed in the device tree for some memory related properties. That logic uses the size of RAM divided by drmem_lmb_size() to do the estimation. However drmem_lmb_size() can be zero if the machine has no hotpluggable memory configured, which is the case when booting with qemu and no maxmem=x parameter is passed (the default). The division by zero is reported by UBSAN, and can also lead to an overflow and a warning from kvmalloc, and kdump kernel loading fails: WARNING: CPU: 0 PID: 133 at mm/util.c:596 kvmalloc_node+0x15c/0x160 Modules linked in: CPU: 0 PID: 133 Comm: kexec Not tainted 6.2.0-rc5-03455-g07358bd97810 #223 Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1200 0xf000005 of:SLOF,git-dd0dca pSeries NIP: c00000000041ff4c LR: c00000000041fe58 CTR: 0000000000000000 REGS: c0000000096ef750 TRAP: 0700 Not tainted (6.2.0-rc5-03455-g07358bd97810) MSR: 800000000282b033 CR: 24248242 XER: 2004011e CFAR: c00000000041fed0 IRQMASK: 0 ... NIP kvmalloc_node+0x15c/0x160 LR kvmalloc_node+0x68/0x160 Call Trace: kvmalloc_node+0x68/0x160 (unreliable) of_kexec_alloc_and_setup_fdt+0xb8/0x7d0 elf64_load+0x25c/0x4a0 kexec_image_load_default+0x58/0x80 sys_kexec_file_load+0x5c0/0x920 system_call_exception+0x128/0x330 system_call_vectored_common+0x15c/0x2ec To fix it, skip the calculation if drmem_lmb_size() is zero. Fixes: 2377c92e37fe ("powerpc/kexec_file: fix FDT size estimation for kdump kernel") Cc: stable@vger.kernel.org # v5.12+ Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20230130014707.541110-1-mpe@ellerman.id.au --- arch/powerpc/kexec/file_load_64.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index af8854f9eae3..19d084682bc2 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -989,10 +989,13 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image) * linux,drconf-usable-memory properties. Get an approximate on the * number of usable memory entries and use for FDT size estimation. 
*/ - usm_entries = ((memblock_end_of_DRAM() / drmem_lmb_size()) + - (2 * (resource_size(&crashk_res) / drmem_lmb_size()))); - - extra_size = (unsigned int)(usm_entries * sizeof(u64)); + if (drmem_lmb_size()) { + usm_entries = ((memblock_end_of_DRAM() / drmem_lmb_size()) + + (2 * (resource_size(&crashk_res) / drmem_lmb_size()))); + extra_size = (unsigned int)(usm_entries * sizeof(u64)); + } else { + extra_size = 0; + } /* * Get the number of CPU nodes in the current DT. This allows to -- cgit v1.2.3 From 98d0219e043e09013e883eacde3b93e0b2bf944d Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 10 Jan 2023 23:47:52 +1100 Subject: powerpc/64s/radix: Fix crash with unaligned relocated kernel If a relocatable kernel is loaded at an address that is not 2MB aligned and told not to relocate to zero, the kernel can crash due to mark_rodata_ro() incorrectly changing some read-write data to read-only. Scenarios where the misalignment can occur are when the kernel is loaded by kdump or using the RELOCATABLE_TEST config option. Example crash with the kernel loaded at 5MB: Run /sbin/init as init process BUG: Unable to handle kernel data access on write at 0xc000000000452000 Faulting instruction address: 0xc0000000005b6730 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries CPU: 1 PID: 1 Comm: init Not tainted 6.2.0-rc1-00011-g349188be4841 #166 Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1202 0xf000005 of:SLOF,git-5b4c5a hv:linux,kvm pSeries NIP: c0000000005b6730 LR: c000000000ae9ab8 CTR: 0000000000000380 REGS: c000000004503250 TRAP: 0300 Not tainted (6.2.0-rc1-00011-g349188be4841) MSR: 8000000000009033 CR: 44288480 XER: 00000000 CFAR: c0000000005b66ec DAR: c000000000452000 DSISR: 0a000000 IRQMASK: 0 ... NIP memset+0x68/0x104 LR zero_user_segments.constprop.0+0xa8/0xf0 Call Trace: ext4_mpage_readpages+0x7f8/0x830 ext4_readahead+0x48/0x60 read_pages+0xb8/0x380 page_cache_ra_unbounded+0x19c/0x250 filemap_fault+0x58c/0xae0 __do_fault+0x60/0x100 __handle_mm_fault+0x1230/0x1a40 handle_mm_fault+0x120/0x300 ___do_page_fault+0x20c/0xa80 do_page_fault+0x30/0xc0 data_access_common_virt+0x210/0x220 This happens because mark_rodata_ro() tries to change permissions on the range _stext..__end_rodata, but _stext sits in the middle of the 2MB page from 4MB to 6MB: radix-mmu: Mapped 0x0000000000000000-0x0000000000200000 with 2.00 MiB pages (exec) radix-mmu: Mapped 0x0000000000200000-0x0000000000400000 with 2.00 MiB pages radix-mmu: Mapped 0x0000000000400000-0x0000000002400000 with 2.00 MiB pages (exec) The logic that changes the permissions assumes the linear mapping was split correctly at boot, so it marks the entire 2MB page read-only. That leads to the write fault above. To fix it, the boot time mapping logic needs to consider that if the kernel is running at a non-zero address then _stext is a boundary where it must split the mapping. 
That leads to the mapping being split correctly, allowing the rodata permission change to take effect correctly, with no spillover: radix-mmu: Mapped 0x0000000000000000-0x0000000000200000 with 2.00 MiB pages (exec) radix-mmu: Mapped 0x0000000000200000-0x0000000000400000 with 2.00 MiB pages radix-mmu: Mapped 0x0000000000400000-0x0000000000500000 with 64.0 KiB pages radix-mmu: Mapped 0x0000000000500000-0x0000000000600000 with 64.0 KiB pages (exec) radix-mmu: Mapped 0x0000000000600000-0x0000000002400000 with 2.00 MiB pages (exec) If the kernel is loaded at a 2MB aligned address, the mapping continues to use 2MB pages as before: radix-mmu: Mapped 0x0000000000000000-0x0000000000200000 with 2.00 MiB pages (exec) radix-mmu: Mapped 0x0000000000200000-0x0000000000400000 with 2.00 MiB pages radix-mmu: Mapped 0x0000000000400000-0x0000000002c00000 with 2.00 MiB pages (exec) radix-mmu: Mapped 0x0000000002c00000-0x0000000100000000 with 2.00 MiB pages Fixes: c55d7b5e6426 ("powerpc: Remove STRICT_KERNEL_RWX incompatibility with RELOCATABLE") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20230110124753.1325426-1-mpe@ellerman.id.au --- arch/powerpc/mm/book3s64/radix_pgtable.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index cac727b01799..5a2384ed1727 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -262,6 +262,17 @@ print_mapping(unsigned long start, unsigned long end, unsigned long size, bool e static unsigned long next_boundary(unsigned long addr, unsigned long end) { #ifdef CONFIG_STRICT_KERNEL_RWX + unsigned long stext_phys; + + stext_phys = __pa_symbol(_stext); + + // Relocatable kernel running at non-zero real address + if (stext_phys != 0) { + // Start of relocated kernel text is a rodata boundary + if (addr < stext_phys) + return stext_phys; + } + if (addr < __pa_symbol(__srwx_boundary)) return __pa_symbol(__srwx_boundary); #endif -- cgit v1.2.3 From 111bcb37385353f0510e5847d5abcd1c613dba23 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 10 Jan 2023 23:47:53 +1100 Subject: powerpc/64s/radix: Fix RWX mapping with relocated kernel If a relocatable kernel is loaded at a non-zero address and told not to relocate to zero (kdump or RELOCATABLE_TEST), the mapping of the interrupt code at zero is left with RWX permissions. That is a security weakness, and leads to a warning at boot if CONFIG_DEBUG_WX is enabled: powerpc/mm: Found insecure W+X mapping at address 00000000056435bc/0xc000000000000000 WARNING: CPU: 1 PID: 1 at arch/powerpc/mm/ptdump/ptdump.c:193 note_page+0x484/0x4c0 CPU: 1 PID: 1 Comm: swapper/0 Not tainted 6.2.0-rc1-00001-g8ae8e98aea82-dirty #175 Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1202 0xf000005 of:SLOF,git-dd0dca hv:linux,kvm pSeries NIP: c0000000004a1c34 LR: c0000000004a1c30 CTR: 0000000000000000 REGS: c000000003503770 TRAP: 0700 Not tainted (6.2.0-rc1-00001-g8ae8e98aea82-dirty) MSR: 8000000002029033 CR: 24000220 XER: 00000000 CFAR: c000000000545a58 IRQMASK: 0 ... NIP note_page+0x484/0x4c0 LR note_page+0x480/0x4c0 Call Trace: note_page+0x480/0x4c0 (unreliable) ptdump_pmd_entry+0xc8/0x100 walk_pgd_range+0x618/0xab0 walk_page_range_novma+0x74/0xc0 ptdump_walk_pgd+0x98/0x170 ptdump_check_wx+0x94/0x100 mark_rodata_ro+0x30/0x70 kernel_init+0x78/0x1a0 ret_from_kernel_thread+0x5c/0x64 The fix has two parts. 
Firstly the pages from zero up to the end of interrupts need to be marked read-only, so that they are left with R-X permissions. Secondly the mapping logic needs to be taught to ensure there is a page boundary at the end of the interrupt region, so that the permission change only applies to the interrupt text, and not the region following it. Fixes: c55d7b5e6426 ("powerpc: Remove STRICT_KERNEL_RWX incompatibility with RELOCATABLE") Reported-by: Sachin Sant Tested-by: Sachin Sant Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20230110124753.1325426-2-mpe@ellerman.id.au --- arch/powerpc/mm/book3s64/radix_pgtable.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 5a2384ed1727..26245aaf12b8 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -234,6 +234,14 @@ void radix__mark_rodata_ro(void) end = (unsigned long)__end_rodata; radix__change_memory_range(start, end, _PAGE_WRITE); + + for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) { + end = start + PAGE_SIZE; + if (overlaps_interrupt_vector_text(start, end)) + radix__change_memory_range(start, end, _PAGE_WRITE); + else + break; + } } void radix__mark_initmem_nx(void) @@ -268,6 +276,11 @@ static unsigned long next_boundary(unsigned long addr, unsigned long end) // Relocatable kernel running at non-zero real address if (stext_phys != 0) { + // The end of interrupts code at zero is a rodata boundary + unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys; + if (addr < end_intr) + return end_intr; + // Start of relocated kernel text is a rodata boundary if (addr < stext_phys) return stext_phys; -- cgit v1.2.3 From 9d2c7203ffdb846399b82b0660563c89e918c751 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 31 Jan 2023 09:57:18 +0100 Subject: x86/debug: Fix stack recursion caused by wrongly ordered DR7 accesses In kernels compiled with CONFIG_PARAVIRT=n, the compiler re-orders the DR7 read in exc_nmi() to happen before the call to sev_es_ist_enter(). This is problematic when running as an SEV-ES guest because in this environment the DR7 read might cause a #VC exception, and taking #VC exceptions is not safe in exc_nmi() before sev_es_ist_enter() has run. The result is stack recursion if the NMI was caused on the #VC IST stack, because a subsequent #VC exception in the NMI handler will overwrite the stack frame of the interrupted #VC handler. As there are no compiler barriers affecting the ordering of DR7 reads/writes, make the accesses to this register volatile, forbidding the compiler to re-order them. [ bp: Massage text, make them volatile too, to make sure some aggressive compiler optimization pass doesn't discard them. 
] Fixes: 315562c9af3d ("x86/sev-es: Adjust #VC IST Stack on entering NMI handler") Reported-by: Alexey Kardashevskiy Signed-off-by: Joerg Roedel Signed-off-by: Borislav Petkov (AMD) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230127035616.508966-1-aik@amd.com --- arch/x86/include/asm/debugreg.h | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index b049d950612f..ca97442e8d49 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -39,7 +39,20 @@ static __always_inline unsigned long native_get_debugreg(int regno) asm("mov %%db6, %0" :"=r" (val)); break; case 7: - asm("mov %%db7, %0" :"=r" (val)); + /* + * Apply __FORCE_ORDER to DR7 reads to forbid re-ordering them + * with other code. + * + * This is needed because a DR7 access can cause a #VC exception + * when running under SEV-ES. Taking a #VC exception is not a + * safe thing to do just anywhere in the entry code and + * re-ordering might place the access into an unsafe location. + * + * This happened in the NMI handler, where the DR7 read was + * re-ordered to happen before the call to sev_es_ist_enter(), + * causing stack recursion. + */ + asm volatile("mov %%db7, %0" : "=r" (val) : __FORCE_ORDER); break; default: BUG(); @@ -66,7 +79,16 @@ static __always_inline void native_set_debugreg(int regno, unsigned long value) asm("mov %0, %%db6" ::"r" (value)); break; case 7: - asm("mov %0, %%db7" ::"r" (value)); + /* + * Apply __FORCE_ORDER to DR7 writes to forbid re-ordering them + * with other code. + * + * While it didn't happen with a DR7 write (see the DR7 read + * comment above which explains where it happened), add the + * __FORCE_ORDER here too to avoid similar problems in the + * future. + */ + asm volatile("mov %0, %%db7" ::"r" (value), __FORCE_ORDER); break; default: BUG(); -- cgit v1.2.3 From 6f28a2613497fc587e347afa99fa2c52230678a7 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 17 Jan 2023 15:16:32 +0000 Subject: ia64: fix build error due to switch case label appearing next to declaration Since commit aa06a9bd8533 ("ia64: fix clock_getres(CLOCK_MONOTONIC) to report ITC frequency"), gcc 10.1.0 fails to build ia64 with the gnomic: | ../arch/ia64/kernel/sys_ia64.c: In function 'ia64_clock_getres': | ../arch/ia64/kernel/sys_ia64.c:189:3: error: a label can only be part of a statement and a declaration is not a statement | 189 | s64 tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq); This line appears immediately after a case label in a switch. Move the declarations out of the case, to the top of the function. 
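The underlying C rule, in a standalone sketch (hypothetical function names, not the ia64 code): a label may only be attached to a statement, and a declaration is not a statement until C23.

        long broken(int which_clock)
        {
                switch (which_clock) {
                case 1:
                        long tick_ns = 42;      /* error: declaration directly after a case label */
                        return tick_ns;
                }
                return 0;
        }

        long fixed(int which_clock)
        {
                long tick_ns;                   /* declaration hoisted to the top of the function */

                switch (which_clock) {
                case 1:
                        tick_ns = 42;           /* the label now precedes a statement */
                        return tick_ns;
                }
                return 0;
        }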
Link: https://lkml.kernel.org/r/20230117151632.393836-1-james.morse@arm.com Fixes: aa06a9bd8533 ("ia64: fix clock_getres(CLOCK_MONOTONIC) to report ITC frequency") Signed-off-by: James Morse Reviewed-by: Sergei Trofimovich Cc: Émeric Maschino Cc: matoro Cc: John Paul Adrian Glaubitz Cc: Signed-off-by: Andrew Morton --- arch/ia64/kernel/sys_ia64.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index f6a502e8f02c..6e948d015332 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -170,6 +170,9 @@ ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, u asmlinkage long ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *tp) { + struct timespec64 rtn_tp; + s64 tick_ns; + /* * ia64's clock_gettime() syscall is implemented as a vdso call * fsys_clock_gettime(). Currently it handles only @@ -185,8 +188,8 @@ ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user * switch (which_clock) { case CLOCK_REALTIME: case CLOCK_MONOTONIC: - s64 tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq); - struct timespec64 rtn_tp = ns_to_timespec64(tick_ns); + tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq); + rtn_tp = ns_to_timespec64(tick_ns); return put_timespec64(&rtn_tp, tp); } -- cgit v1.2.3 From c1c551bebf928889e7a8fef7415b44f9a64975f4 Mon Sep 17 00:00:00 2001 From: Tom Saeger Date: Mon, 23 Jan 2023 17:09:35 -0700 Subject: sh: define RUNTIME_DISCARD_EXIT sh vmlinux fails to link with GNU ld < 2.40 (likely < 2.36) since commit 99cb0d917ffa ("arch: fix broken BuildID for arm64 and riscv"). This is similar to fixes for powerpc and s390: commit 4b9880dbf3bd ("powerpc/vmlinux.lds: Define RUNTIME_DISCARD_EXIT"). commit a494398bde27 ("s390: define RUNTIME_DISCARD_EXIT to fix link error with GNU ld < 2.36"). $ sh4-linux-gnu-ld --version | head -n1 GNU ld (GNU Binutils for Debian) 2.35.2 $ make ARCH=sh CROSS_COMPILE=sh4-linux-gnu- microdev_defconfig $ make ARCH=sh CROSS_COMPILE=sh4-linux-gnu- `.exit.text' referenced in section `__bug_table' of crypto/algboss.o: defined in discarded section `.exit.text' of crypto/algboss.o `.exit.text' referenced in section `__bug_table' of drivers/char/hw_random/core.o: defined in discarded section `.exit.text' of drivers/char/hw_random/core.o make[2]: *** [scripts/Makefile.vmlinux:34: vmlinux] Error 1 make[1]: *** [Makefile:1252: vmlinux] Error 2 arch/sh/kernel/vmlinux.lds.S keeps EXIT_TEXT: /* * .exit.text is discarded at runtime, not link time, to deal with * references from __bug_table */ .exit.text : AT(ADDR(.exit.text)) { EXIT_TEXT } However, EXIT_TEXT is thrown away by DISCARD(include/asm-generic/vmlinux.lds.h) because sh does not define RUNTIME_DISCARD_EXIT. GNU ld 2.40 does not have this issue and builds fine. This corresponds with Masahiro's comments in a494398bde27: "Nathan [Chancellor] also found that binutils commit 21401fc7bf67 ("Duplicate output sections in scripts") cured this issue, so we cannot reproduce it with binutils 2.36+, but it is better to not rely on it." 
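The mechanism, roughly, as a sketch condensed from include/asm-generic/vmlinux.lds.h of this era (not the verbatim header): defining RUNTIME_DISCARD_EXIT before including the generic script empties the exit-section part of the /DISCARD/ rule, so the arch script can keep emitting .exit.text for discard at runtime instead of link time.

        #ifdef RUNTIME_DISCARD_EXIT
        #define EXIT_DISCARDS
        #else
        #define EXIT_DISCARDS   EXIT_TEXT EXIT_DATA
        #endif

        #define DISCARDS                        \
                /DISCARD/ : {                   \
                        EXIT_DISCARDS           \
                        EXIT_CALL               \
                        COMMON_DISCARDS         \
                }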
Link: https://lkml.kernel.org/r/9166a8abdc0f979e50377e61780a4bba1dfa2f52.1674518464.git.tom.saeger@oracle.com Fixes: 99cb0d917ffa ("arch: fix broken BuildID for arm64 and riscv") Link: https://lore.kernel.org/all/Y7Jal56f6UBh1abE@dev-arch.thelio-3990X/ Link: https://lore.kernel.org/all/20230123194218.47ssfzhrpnv3xfez@oracle.com/ Signed-off-by: Tom Saeger Tested-by: John Paul Adrian Glaubitz Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dennis Gilmore Cc: Greg Kroah-Hartman Cc: Masahiro Yamada Cc: Naresh Kamboju Cc: Nathan Chancellor Cc: Palmer Dabbelt Cc: Rich Felker Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/sh/kernel/vmlinux.lds.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 3161b9ccd2a5..b6276a3521d7 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -4,6 +4,7 @@ * Written by Niibe Yutaka and Paul Mundt */ OUTPUT_ARCH(sh) +#define RUNTIME_DISCARD_EXIT #include <asm/thread_info.h> #include <asm/cache.h> #include <asm/vmlinux.lds.h> -- cgit v1.2.3 From fc546faa559538fb312c77e055243ece18ab3288 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Tue, 31 Jan 2023 08:36:15 +0530 Subject: powerpc/kexec_file: Count hot-pluggable memory in FDT estimate On systems where online memory is smaller than max memory, the kexec_file_load system call may fail to load the kdump kernel with the below errors: "Failed to update fdt with linux,drconf-usable-memory property" "Error setting up usable-memory property for kdump kernel" This happens because the size estimation for usable memory properties for the kdump kernel's FDT is based on the online memory whereas the usable memory properties include max memory. In short, the hot-pluggable memory is not accounted for while estimating the size of the usable memory properties. The issue is addressed by calculating usable memory property size using max hotplug address instead of the last online memory address. Fixes: 2377c92e37fe ("powerpc/kexec_file: fix FDT size estimation for kdump kernel") Signed-off-by: Sourabh Jain Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20230131030615.729894-1-sourabhjain@linux.ibm.com --- arch/powerpc/kexec/file_load_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index 19d084682bc2..52085751f5f4 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -990,7 +990,7 @@ unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image) * number of usable memory entries and use for FDT size estimation. */ if (drmem_lmb_size()) { - usm_entries = ((memblock_end_of_DRAM() / drmem_lmb_size()) + + usm_entries = ((memory_hotplug_max() / drmem_lmb_size()) + (2 * (resource_size(&crashk_res) / drmem_lmb_size()))); extra_size = (unsigned int)(usm_entries * sizeof(u64)); } else { extra_size = 0; } -- cgit v1.2.3 From 0b1d60d6dd9e2e867cc6e4277d73ea5a7ff2d4d0 Mon Sep 17 00:00:00 2001 From: Samuel Holland Date: Thu, 22 Sep 2022 01:09:58 -0500 Subject: riscv: Fix build with CONFIG_CC_OPTIMIZE_FOR_SIZE=y commit 8eb060e10185 ("arch/riscv: add Zihintpause support") broke building with CONFIG_CC_OPTIMIZE_FOR_SIZE enabled (gcc 11.1.0): CC arch/riscv/kernel/vdso/vgettimeofday.o In file included from <command-line>: ./arch/riscv/include/asm/jump_label.h: In function 'cpu_relax': ././include/linux/compiler_types.h:285:33: warning: 'asm' operand 0 probably does not match constraints 285 | #define asm_volatile_goto(x...) 
asm goto(x) | ^~~ ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto' 41 | asm_volatile_goto( | ^~~~~~~~~~~~~~~~~ ././include/linux/compiler_types.h:285:33: error: impossible constraint in 'asm' 285 | #define asm_volatile_goto(x...) asm goto(x) | ^~~ ./arch/riscv/include/asm/jump_label.h:41:9: note: in expansion of macro 'asm_volatile_goto' 41 | asm_volatile_goto( | ^~~~~~~~~~~~~~~~~ make[1]: *** [scripts/Makefile.build:249: arch/riscv/kernel/vdso/vgettimeofday.o] Error 1 make: *** [arch/riscv/Makefile:128: vdso_prepare] Error 2 Having a static branch in cpu_relax() is problematic because that function is widely inlined, including in some quite complex functions like in the VDSO. A quick measurement shows this static branch is responsible by itself for around 40% of the jump table. Drop the static branch, which ends up being the same number of instructions anyway. If Zihintpause is supported, we trade the nop from the static branch for a div. If Zihintpause is unsupported, we trade the jump from the static branch for (what gets interpreted as) a nop. Fixes: 8eb060e10185 ("arch/riscv: add Zihintpause support") Signed-off-by: Samuel Holland Reviewed-by: Conor Dooley Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/hwcap.h | 3 --- arch/riscv/include/asm/vdso/processor.h | 25 ++++++++++--------------- 2 files changed, 10 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h index 3c8a5ca95c72..3bf10a8e665a 100644 --- a/arch/riscv/include/asm/hwcap.h +++ b/arch/riscv/include/asm/hwcap.h @@ -67,7 +67,6 @@ enum riscv_isa_ext_id { */ enum riscv_isa_ext_key { RISCV_ISA_EXT_KEY_FPU, /* For 'F' and 'D' */ - RISCV_ISA_EXT_KEY_ZIHINTPAUSE, RISCV_ISA_EXT_KEY_MAX, }; @@ -87,8 +86,6 @@ static __always_inline int riscv_isa_ext2key(int num) return RISCV_ISA_EXT_KEY_FPU; case RISCV_ISA_EXT_d: return RISCV_ISA_EXT_KEY_FPU; - case RISCV_ISA_EXT_ZIHINTPAUSE: - return RISCV_ISA_EXT_KEY_ZIHINTPAUSE; default: return -EINVAL; } diff --git a/arch/riscv/include/asm/vdso/processor.h b/arch/riscv/include/asm/vdso/processor.h index 1e4f8b4aef79..789bdb8211a2 100644 --- a/arch/riscv/include/asm/vdso/processor.h +++ b/arch/riscv/include/asm/vdso/processor.h @@ -4,30 +4,25 @@ #ifndef __ASSEMBLY__ -#include #include -#include static inline void cpu_relax(void) { - if (!static_branch_likely(&riscv_isa_ext_keys[RISCV_ISA_EXT_KEY_ZIHINTPAUSE])) { #ifdef __riscv_muldiv - int dummy; - /* In lieu of a halt instruction, induce a long-latency stall. */ - __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy)); + int dummy; + /* In lieu of a halt instruction, induce a long-latency stall. */ + __asm__ __volatile__ ("div %0, %0, zero" : "=r" (dummy)); #endif - } else { - /* - * Reduce instruction retirement. - * This assumes the PC changes. - */ + /* + * Reduce instruction retirement. + * This assumes the PC changes. 
+ */ #ifdef __riscv_zihintpause - __asm__ __volatile__ ("pause"); + __asm__ __volatile__ ("pause"); #else - /* Encoding of the pause instruction */ - __asm__ __volatile__ (".4byte 0x100000F"); + /* Encoding of the pause instruction */ + __asm__ __volatile__ (".4byte 0x100000F"); #endif - } barrier(); } -- cgit v1.2.3 From 3f0c17809a098d3f0c1ec83f1fb3ca61638d3dcd Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 1 Feb 2023 11:04:06 +0100 Subject: parisc: Replace hardcoded value with PRIV_USER constant in ptrace.c Prefer usage of the PRIV_USER constant over the hard-coded value to set the lowest 2 bits for the userspace privilege. Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # 5.16+ --- arch/parisc/kernel/ptrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c index 69c62933e952..31735e982b6b 100644 --- a/arch/parisc/kernel/ptrace.c +++ b/arch/parisc/kernel/ptrace.c @@ -166,7 +166,7 @@ long arch_ptrace(struct task_struct *child, long request, addr >= sizeof(struct pt_regs)) break; if (addr == PT_IAOQ0 || addr == PT_IAOQ1) { - data |= 3; /* ensure userspace privilege */ + data |= PRIV_USER; /* ensure userspace privilege */ } if ((addr >= PT_GR1 && addr <= PT_GR31) || addr == PT_IAOQ0 || addr == PT_IAOQ1 || @@ -285,7 +285,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, if (addr >= sizeof(struct pt_regs)) break; if (addr == PT_IAOQ0+4 || addr == PT_IAOQ1+4) { - data |= 3; /* ensure userspace privilege */ + data |= PRIV_USER; /* ensure userspace privilege */ } if (addr >= PT_FR0 && addr <= PT_FR31 + 4) { /* Special case, fp regs are 64 bits anyway */ @@ -484,7 +484,7 @@ static void set_reg(struct pt_regs *regs, int num, unsigned long val) case RI(iaoq[0]): case RI(iaoq[1]): /* set 2 lowest bits to ensure userspace privilege: */ - regs->iaoq[num - RI(iaoq[0])] = val | 3; + regs->iaoq[num - RI(iaoq[0])] = val | PRIV_USER; return; case RI(sar): regs->sar = val; return; -- cgit v1.2.3 From 316f1f42b5cc1d95124c1f0387c867c1ba7b6d0e Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 1 Feb 2023 16:41:54 +0100 Subject: parisc: Wire up PTRACE_GETREGS/PTRACE_SETREGS for compat case Wire up the missing ptrace requests PTRACE_GETREGS, PTRACE_SETREGS, PTRACE_GETFPREGS and PTRACE_SETFPREGS when running 32-bit applications on 64-bit kernels. Signed-off-by: Helge Deller Cc: stable@vger.kernel.org # 4.7+ --- arch/parisc/kernel/ptrace.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c index 31735e982b6b..ceb45f51d52e 100644 --- a/arch/parisc/kernel/ptrace.c +++ b/arch/parisc/kernel/ptrace.c @@ -126,6 +126,12 @@ long arch_ptrace(struct task_struct *child, long request, unsigned long tmp; long ret = -EIO; + unsigned long user_regs_struct_size = sizeof(struct user_regs_struct); +#ifdef CONFIG_64BIT + if (is_compat_task()) + user_regs_struct_size /= 2; +#endif + switch (request) { /* Read the word at location addr in the USER area. For ptraced @@ -181,14 +187,14 @@ long arch_ptrace(struct task_struct *child, long request, return copy_regset_to_user(child, task_user_regset_view(current), REGSET_GENERAL, - 0, sizeof(struct user_regs_struct), + 0, user_regs_struct_size, datap); case PTRACE_SETREGS: /* Set all gp regs in the child. 
*/ return copy_regset_from_user(child, task_user_regset_view(current), REGSET_GENERAL, - 0, sizeof(struct user_regs_struct), + 0, user_regs_struct_size, datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ @@ -302,6 +308,11 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, } } break; + case PTRACE_GETREGS: + case PTRACE_SETREGS: + case PTRACE_GETFPREGS: + case PTRACE_SETFPREGS: + return arch_ptrace(child, request, addr, data); default: ret = compat_ptrace_request(child, request, addr, data); -- cgit v1.2.3 From 1665c027afb225882a5a0b014c45e84290b826c2 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 31 Jan 2023 22:14:07 +1100 Subject: powerpc/64s: Reconnect tlb_flush() to hash__tlb_flush() Commit baf1ed24b27d ("powerpc/mm: Remove empty hash__ functions") removed some empty hash MMU flushing routines, but got a bit overeager and also removed the call to hash__tlb_flush() from tlb_flush(). In regular use this doesn't lead to any noticeable breakage, which is a little concerning. Presumably there are flushes happening via other paths such as arch_leave_lazy_mmu_mode(), and/or a bit of luck. Fix it by reinstating the call to hash__tlb_flush(). Fixes: baf1ed24b27d ("powerpc/mm: Remove empty hash__ functions") Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20230131111407.806770-1-mpe@ellerman.id.au --- arch/powerpc/include/asm/book3s/64/tlbflush.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index dd39313242b4..d5cd16270c5d 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -97,6 +97,8 @@ static inline void tlb_flush(struct mmu_gather *tlb) { if (radix_enabled()) radix__tlb_flush(tlb); + + return hash__tlb_flush(tlb); } #ifdef CONFIG_SMP -- cgit v1.2.3 From 87f48c7ccc73afc78630530d9af51f458f58cab8 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Tue, 31 Jan 2023 23:06:04 -0500 Subject: riscv: kprobe: Fixup kernel panic when probing an illegal position The kernel would panic when a probe was installed at an illegal position, e.g. (with CONFIG_RISCV_ISA_C=n): echo 'p:hello kernel_clone+0x16 a0=%a0' >> kprobe_events echo 1 > events/kprobes/hello/enable cat trace Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: __do_sys_newfstatat+0xb8/0xb8 CPU: 0 PID: 111 Comm: sh Not tainted 6.2.0-rc1-00027-g2d398fe49a4d #490 Hardware name: riscv-virtio,qemu (DT) Call Trace: [] dump_backtrace+0x38/0x48 [] show_stack+0x50/0x68 [] dump_stack_lvl+0x60/0x84 [] dump_stack+0x20/0x30 [] panic+0x160/0x374 [] generic_handle_arch_irq+0x0/0xa8 [] sys_newstat+0x0/0x30 [] sys_clone+0x20/0x30 [] ret_from_syscall+0x0/0x4 ---[ end Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: __do_sys_newfstatat+0xb8/0xb8 ]--- That is because the kprobe's ebreak instruction broke the kernel's original code. The user should guarantee the correctness of the probe position, but an incorrect position must not be able to make the kernel panic. This patch adds arch_check_kprobe() in arch_prepare_kprobe() to reject an illegal position (such as the middle of an instruction). 
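For context, the boundary walk added below relies on RISC-V's instruction-length encoding: the two low bits of the first halfword are 0b11 for a 32-bit instruction and anything else for a 16-bit compressed one, which is what the kernel's GET_INSN_LENGTH() computes. As a standalone sketch (hypothetical helper name):

        /* Kernel text mixes 2-byte (RVC) and 4-byte instructions, so an
         * address can only be validated by stepping from a known boundary
         * such as the symbol's entry point, as arch_check_kprobe() does. */
        static unsigned int insn_length(u16 first_halfword)
        {
                return (first_halfword & 0x3) == 0x3 ? 4 : 2;
        }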
Fixes: c22b0bcb1dd0 ("riscv: Add kprobes supported") Signed-off-by: Guo Ren Signed-off-by: Guo Ren Reviewed-by: Björn Töpel Link: https://lore.kernel.org/r/20230201040604.3390509-1-guoren@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/probes/kprobes.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch') diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index f21592d20306..41c7481afde3 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -48,6 +48,21 @@ static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) post_kprobe_handler(p, kcb, regs); } +static bool __kprobes arch_check_kprobe(struct kprobe *p) +{ + unsigned long tmp = (unsigned long)p->addr - p->offset; + unsigned long addr = (unsigned long)p->addr; + + while (tmp <= addr) { + if (tmp == addr) + return true; + + tmp += GET_INSN_LENGTH(*(u16 *)tmp); + } + + return false; +} + int __kprobes arch_prepare_kprobe(struct kprobe *p) { unsigned long probe_addr = (unsigned long)p->addr; @@ -55,6 +70,9 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) if (probe_addr & 0x1) return -EILSEQ; + if (!arch_check_kprobe(p)) + return -EILSEQ; + /* copy instruction */ p->opcode = *p->addr; -- cgit v1.2.3 From 2f394c0e7d1129a35156e492bc8f445fb20f43ac Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Wed, 1 Feb 2023 10:29:45 +0100 Subject: riscv: disable generation of unwind tables GCC 13 will enable -fasynchronous-unwind-tables by default on riscv. In the kernel, we don't have any use for unwind tables yet, so disable them. More importantly, the .eh_frame section brings relocations (R_RISC_32_PCREL, R_RISCV_SET{6,8,16}, R_RISCV_SUB{6,8,16}) into modules that we are not prepared to handle. Signed-off-by: Andreas Schwab Link: https://lore.kernel.org/r/mvmzg9xybqu.fsf@suse.de Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/Makefile | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index faf2c2177094..82153960ac00 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -80,6 +80,9 @@ ifeq ($(CONFIG_PERF_EVENTS),y) KBUILD_CFLAGS += -fno-omit-frame-pointer endif +# Avoid generating .eh_frame sections. +KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -fno-unwind-tables + KBUILD_CFLAGS_MODULE += $(call cc-option,-mno-relax) KBUILD_AFLAGS_MODULE += $(call as-option,-Wa$(comma)-mno-relax) -- cgit v1.2.3