diff options
Diffstat (limited to 'arch')
55 files changed, 677 insertions, 464 deletions
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 03934808b2ed..c5ccca26a408 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -184,8 +184,6 @@ config ARM64 select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE - select HAVE_DYNAMIC_FTRACE_WITH_ARGS \ - if $(cc-option,-fpatchable-function-entry=2) select FTRACE_MCOUNT_USE_PATCHABLE_FUNCTION_ENTRY \ if DYNAMIC_FTRACE_WITH_ARGS select HAVE_EFFICIENT_UNALIGNED_ACCESS @@ -972,6 +970,22 @@ config ARM64_ERRATUM_2457168 If unsure, say Y. +config ARM64_ERRATUM_2645198 + bool "Cortex-A715: 2645198: Workaround possible [ESR|FAR]_ELx corruption" + default y + help + This option adds the workaround for ARM Cortex-A715 erratum 2645198. + + If a Cortex-A715 cpu sees a page mapping permissions change from executable + to non-executable, it may corrupt the ESR_ELx and FAR_ELx registers on the + next instruction abort caused by permission fault. + + Only user-space does executable to non-executable permission transition via + mprotect() system call. Workaround the problem by doing a break-before-make + TLB invalidation, for all changes to executable user space mappings. + + If unsure, say Y. + config CAVIUM_ERRATUM_22375 bool "Cavium erratum 22375, 24313" default y diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h index 0890e4f568fb..cbb3d961123b 100644 --- a/arch/arm64/include/asm/atomic_ll_sc.h +++ b/arch/arm64/include/asm/atomic_ll_sc.h @@ -315,7 +315,7 @@ __ll_sc__cmpxchg_double##name(unsigned long old1, \ " cbnz %w0, 1b\n" \ " " #mb "\n" \ "2:" \ - : "=&r" (tmp), "=&r" (ret), "+Q" (*(unsigned long *)ptr) \ + : "=&r" (tmp), "=&r" (ret), "+Q" (*(__uint128_t *)ptr) \ : "r" (old1), "r" (old2), "r" (new1), "r" (new2) \ : cl); \ \ diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h index 52075e93de6c..a94d6dacc029 100644 --- a/arch/arm64/include/asm/atomic_lse.h +++ b/arch/arm64/include/asm/atomic_lse.h @@ -311,7 +311,7 @@ __lse__cmpxchg_double##name(unsigned long old1, \ " eor %[old2], %[old2], %[oldval2]\n" \ " orr %[old1], %[old1], %[old2]" \ : [old1] "+&r" (x0), [old2] "+&r" (x1), \ - [v] "+Q" (*(unsigned long *)ptr) \ + [v] "+Q" (*(__uint128_t *)ptr) \ : [new1] "r" (x2), [new2] "r" (x3), [ptr] "r" (x4), \ [oldval1] "r" (oldval1), [oldval2] "r" (oldval2) \ : cl); \ diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 4e8b66c74ea2..683ca3af4084 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -124,6 +124,8 @@ #define APPLE_CPU_PART_M1_FIRESTORM_PRO 0x025 #define APPLE_CPU_PART_M1_ICESTORM_MAX 0x028 #define APPLE_CPU_PART_M1_FIRESTORM_MAX 0x029 +#define APPLE_CPU_PART_M2_BLIZZARD 0x032 +#define APPLE_CPU_PART_M2_AVALANCHE 0x033 #define AMPERE_CPU_PART_AMPERE1 0xAC3 @@ -177,6 +179,8 @@ #define MIDR_APPLE_M1_FIRESTORM_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_PRO) #define MIDR_APPLE_M1_ICESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM_MAX) #define MIDR_APPLE_M1_FIRESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_MAX) +#define MIDR_APPLE_M2_BLIZZARD MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD) +#define MIDR_APPLE_M2_AVALANCHE MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE) #define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1) /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 15b34fbfca66..206de10524e3 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -114,6 +114,15 @@ #define ESR_ELx_FSC_ACCESS (0x08) #define ESR_ELx_FSC_FAULT (0x04) #define ESR_ELx_FSC_PERM (0x0C) +#define ESR_ELx_FSC_SEA_TTW0 (0x14) +#define ESR_ELx_FSC_SEA_TTW1 (0x15) +#define ESR_ELx_FSC_SEA_TTW2 (0x16) +#define ESR_ELx_FSC_SEA_TTW3 (0x17) +#define ESR_ELx_FSC_SECC (0x18) +#define ESR_ELx_FSC_SECC_TTW0 (0x1c) +#define ESR_ELx_FSC_SECC_TTW1 (0x1d) +#define ESR_ELx_FSC_SECC_TTW2 (0x1e) +#define ESR_ELx_FSC_SECC_TTW3 (0x1f) /* ISS field definitions for Data Aborts */ #define ESR_ELx_ISV_SHIFT (24) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index d20f5da2d76f..6a4a1ab8eb23 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -49,6 +49,15 @@ extern pte_t huge_ptep_get(pte_t *ptep); void __init arm64_hugetlb_cma_reserve(void); +#define huge_ptep_modify_prot_start huge_ptep_modify_prot_start +extern pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); + +#define huge_ptep_modify_prot_commit huge_ptep_modify_prot_commit +extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t new_pte); + #include <asm-generic/hugetlb.h> #endif /* __ASM_HUGETLB_H */ diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 0df3fc3a0173..26b0c97df986 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -319,21 +319,6 @@ BIT(18) | \ GENMASK(16, 15)) -/* For compatibility with fault code shared with 32-bit */ -#define FSC_FAULT ESR_ELx_FSC_FAULT -#define FSC_ACCESS ESR_ELx_FSC_ACCESS -#define FSC_PERM ESR_ELx_FSC_PERM -#define FSC_SEA ESR_ELx_FSC_EXTABT -#define FSC_SEA_TTW0 (0x14) -#define FSC_SEA_TTW1 (0x15) -#define FSC_SEA_TTW2 (0x16) -#define FSC_SEA_TTW3 (0x17) -#define FSC_SECC (0x18) -#define FSC_SECC_TTW0 (0x1c) -#define FSC_SECC_TTW1 (0x1d) -#define FSC_SECC_TTW2 (0x1e) -#define FSC_SECC_TTW3 (0x1f) - /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */ #define HPFAR_MASK (~UL(0xf)) /* diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index 9bdba47f7e14..193583df2d9c 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -349,16 +349,16 @@ static __always_inline u8 kvm_vcpu_trap_get_fault_level(const struct kvm_vcpu *v static __always_inline bool kvm_vcpu_abt_issea(const struct kvm_vcpu *vcpu) { switch (kvm_vcpu_trap_get_fault(vcpu)) { - case FSC_SEA: - case FSC_SEA_TTW0: - case FSC_SEA_TTW1: - case FSC_SEA_TTW2: - case FSC_SEA_TTW3: - case FSC_SECC: - case FSC_SECC_TTW0: - case FSC_SECC_TTW1: - case FSC_SECC_TTW2: - case FSC_SECC_TTW3: + case ESR_ELx_FSC_EXTABT: + case ESR_ELx_FSC_SEA_TTW0: + case ESR_ELx_FSC_SEA_TTW1: + case ESR_ELx_FSC_SEA_TTW2: + case ESR_ELx_FSC_SEA_TTW3: + case ESR_ELx_FSC_SECC: + case ESR_ELx_FSC_SECC_TTW0: + case ESR_ELx_FSC_SECC_TTW1: + case ESR_ELx_FSC_SECC_TTW2: + case ESR_ELx_FSC_SECC_TTW3: return true; default: return false; @@ -373,8 +373,26 @@ static __always_inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu) static inline bool kvm_is_write_fault(struct kvm_vcpu *vcpu) { - if (kvm_vcpu_abt_iss1tw(vcpu)) - return true; + if (kvm_vcpu_abt_iss1tw(vcpu)) { + /* + * Only a permission fault on a S1PTW should be + * considered as a write. Otherwise, page tables baked + * in a read-only memslot will result in an exception + * being delivered in the guest. + * + * The drawback is that we end-up faulting twice if the + * guest is using any of HW AF/DB: a translation fault + * to map the page containing the PT (read only at + * first), then a permission fault to allow the flags + * to be set. + */ + switch (kvm_vcpu_trap_get_fault_type(vcpu)) { + case ESR_ELx_FSC_PERM: + return true; + default: + return false; + } + } if (kvm_vcpu_trap_is_iabt(vcpu)) return false; diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b4bbeed80fb6..65e78999c75d 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -681,7 +681,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define pud_leaf(pud) (pud_present(pud) && !pud_table(pud)) #define pud_valid(pud) pte_valid(pud_pte(pud)) #define pud_user(pud) pte_user(pud_pte(pud)) - +#define pud_user_exec(pud) pte_user_exec(pud_pte(pud)) static inline void set_pud(pud_t *pudp, pud_t pud) { @@ -730,6 +730,7 @@ static inline pmd_t *pud_pgtable(pud_t pud) #else #define pud_page_paddr(pud) ({ BUILD_BUG(); 0; }) +#define pud_user_exec(pud) pud_user(pud) /* Always 0 with folding */ /* Match pmd_offset folding in <asm/generic/pgtable-nopmd.h> */ #define pmd_set_fixmap(addr) NULL @@ -862,12 +863,12 @@ static inline bool pte_user_accessible_page(pte_t pte) static inline bool pmd_user_accessible_page(pmd_t pmd) { - return pmd_leaf(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd)); + return pmd_leaf(pmd) && !pmd_present_invalid(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd)); } static inline bool pud_user_accessible_page(pud_t pud) { - return pud_leaf(pud) && pud_user(pud); + return pud_leaf(pud) && (pud_user(pud) || pud_user_exec(pud)); } #endif @@ -1093,6 +1094,15 @@ static inline bool pud_sect_supported(void) } +#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION +#define ptep_modify_prot_start ptep_modify_prot_start +extern pte_t ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); + +#define ptep_modify_prot_commit ptep_modify_prot_commit +extern void ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t new_pte); #endif /* !__ASSEMBLY__ */ #endif /* __ASM_PGTABLE_H */ diff --git a/arch/arm64/include/asm/uprobes.h b/arch/arm64/include/asm/uprobes.h index ba4bff5ca674..2b09495499c6 100644 --- a/arch/arm64/include/asm/uprobes.h +++ b/arch/arm64/include/asm/uprobes.h @@ -16,7 +16,7 @@ #define UPROBE_SWBP_INSN_SIZE AARCH64_INSN_SIZE #define UPROBE_XOL_SLOT_BYTES MAX_UINSN_BYTES -typedef u32 uprobe_opcode_t; +typedef __le32 uprobe_opcode_t; struct arch_uprobe_task { }; diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 89ac00084f38..307faa2b4395 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -661,6 +661,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = { CAP_MIDR_RANGE_LIST(trbe_write_out_of_range_cpus), }, #endif +#ifdef CONFIG_ARM64_ERRATUM_2645198 + { + .desc = "ARM erratum 2645198", + .capability = ARM64_WORKAROUND_2645198, + ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A715) + }, +#endif #ifdef CONFIG_ARM64_ERRATUM_2077057 { .desc = "ARM erratum 2077057", diff --git a/arch/arm64/kernel/efi-rt-wrapper.S b/arch/arm64/kernel/efi-rt-wrapper.S index a00886410537..d872d18101d8 100644 --- a/arch/arm64/kernel/efi-rt-wrapper.S +++ b/arch/arm64/kernel/efi-rt-wrapper.S @@ -4,6 +4,7 @@ */ #include <linux/linkage.h> +#include <asm/assembler.h> SYM_FUNC_START(__efi_rt_asm_wrapper) stp x29, x30, [sp, #-112]! diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c index 353009d7f307..2e94d20c4ac7 100644 --- a/arch/arm64/kernel/elfcore.c +++ b/arch/arm64/kernel/elfcore.c @@ -8,28 +8,27 @@ #include <asm/cpufeature.h> #include <asm/mte.h> -#define for_each_mte_vma(vmi, vma) \ +#define for_each_mte_vma(cprm, i, m) \ if (system_supports_mte()) \ - for_each_vma(vmi, vma) \ - if (vma->vm_flags & VM_MTE) + for (i = 0, m = cprm->vma_meta; \ + i < cprm->vma_count; \ + i++, m = cprm->vma_meta + i) \ + if (m->flags & VM_MTE) -static unsigned long mte_vma_tag_dump_size(struct vm_area_struct *vma) +static unsigned long mte_vma_tag_dump_size(struct core_vma_metadata *m) { - if (vma->vm_flags & VM_DONTDUMP) - return 0; - - return vma_pages(vma) * MTE_PAGE_TAG_STORAGE; + return (m->dump_size >> PAGE_SHIFT) * MTE_PAGE_TAG_STORAGE; } /* Derived from dump_user_range(); start/end must be page-aligned */ static int mte_dump_tag_range(struct coredump_params *cprm, - unsigned long start, unsigned long end) + unsigned long start, unsigned long len) { int ret = 1; unsigned long addr; void *tags = NULL; - for (addr = start; addr < end; addr += PAGE_SIZE) { + for (addr = start; addr < start + len; addr += PAGE_SIZE) { struct page *page = get_dump_page(addr); /* @@ -65,7 +64,6 @@ static int mte_dump_tag_range(struct coredump_params *cprm, mte_save_page_tags(page_address(page), tags); put_page(page); if (!dump_emit(cprm, tags, MTE_PAGE_TAG_STORAGE)) { - mte_free_tag_storage(tags); ret = 0; break; } @@ -77,13 +75,13 @@ static int mte_dump_tag_range(struct coredump_params *cprm, return ret; } -Elf_Half elf_core_extra_phdrs(void) +Elf_Half elf_core_extra_phdrs(struct coredump_params *cprm) { - struct vm_area_struct *vma; + int i; + struct core_vma_metadata *m; int vma_count = 0; - VMA_ITERATOR(vmi, current->mm, 0); - for_each_mte_vma(vmi, vma) + for_each_mte_vma(cprm, i, m) vma_count++; return vma_count; @@ -91,18 +89,18 @@ Elf_Half elf_core_extra_phdrs(void) int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) { - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, current->mm, 0); + int i; + struct core_vma_metadata *m; - for_each_mte_vma(vmi, vma) { + for_each_mte_vma(cprm, i, m) { struct elf_phdr phdr; phdr.p_type = PT_AARCH64_MEMTAG_MTE; phdr.p_offset = offset; - phdr.p_vaddr = vma->vm_start; + phdr.p_vaddr = m->start; phdr.p_paddr = 0; - phdr.p_filesz = mte_vma_tag_dump_size(vma); - phdr.p_memsz = vma->vm_end - vma->vm_start; + phdr.p_filesz = mte_vma_tag_dump_size(m); + phdr.p_memsz = m->end - m->start; offset += phdr.p_filesz; phdr.p_flags = 0; phdr.p_align = 0; @@ -114,28 +112,25 @@ int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) return 1; } -size_t elf_core_extra_data_size(void) +size_t elf_core_extra_data_size(struct coredump_params *cprm) { - struct vm_area_struct *vma; + int i; + struct core_vma_metadata *m; size_t data_size = 0; - VMA_ITERATOR(vmi, current->mm, 0); - for_each_mte_vma(vmi, vma) - data_size += mte_vma_tag_dump_size(vma); + for_each_mte_vma(cprm, i, m) + data_size += mte_vma_tag_dump_size(m); return data_size; } int elf_core_write_extra_data(struct coredump_params *cprm) { - struct vm_area_struct *vma; - VMA_ITERATOR(vmi, current->mm, 0); - - for_each_mte_vma(vmi, vma) { - if (vma->vm_flags & VM_DONTDUMP) - continue; + int i; + struct core_vma_metadata *m; - if (!mte_dump_tag_range(cprm, vma->vm_start, vma->vm_end)) + for_each_mte_vma(cprm, i, m) { + if (!mte_dump_tag_range(cprm, m->start, m->dump_size)) return 0; } diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index dcc81e7200d4..b6ef1af0122e 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -385,7 +385,7 @@ static void task_fpsimd_load(void) WARN_ON(!system_supports_fpsimd()); WARN_ON(!have_cpu_fpsimd_context()); - if (system_supports_sve()) { + if (system_supports_sve() || system_supports_sme()) { switch (current->thread.fp_type) { case FP_STATE_FPSIMD: /* Stop tracking SVE for this task until next use. */ diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 2686ab157601..0c321ad23cd3 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -1357,7 +1357,7 @@ enum aarch64_regset { #ifdef CONFIG_ARM64_SVE REGSET_SVE, #endif -#ifdef CONFIG_ARM64_SVE +#ifdef CONFIG_ARM64_SME REGSET_SSVE, REGSET_ZA, #endif diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index e0d09bf5b01b..be279fd48248 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -281,7 +281,12 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) vl = task_get_sme_vl(current); } else { - if (!system_supports_sve()) + /* + * A SME only system use SVE for streaming mode so can + * have a SVE formatted context with a zero VL and no + * payload data. + */ + if (!system_supports_sve() && !system_supports_sme()) return -EINVAL; vl = task_get_sve_vl(current); @@ -732,7 +737,7 @@ static int setup_sigframe_layout(struct rt_sigframe_user_layout *user, return err; } - if (system_supports_sve()) { + if (system_supports_sve() || system_supports_sme()) { unsigned int vq = 0; if (add_all || test_thread_flag(TIF_SVE) || diff --git a/arch/arm64/kvm/hyp/include/hyp/fault.h b/arch/arm64/kvm/hyp/include/hyp/fault.h index 1b8a2dcd712f..9ddcfe2c3e57 100644 --- a/arch/arm64/kvm/hyp/include/hyp/fault.h +++ b/arch/arm64/kvm/hyp/include/hyp/fault.h @@ -60,7 +60,7 @@ static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) */ if (!(esr & ESR_ELx_S1PTW) && (cpus_have_final_cap(ARM64_WORKAROUND_834220) || - (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) { + (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM)) { if (!__translate_far_to_hpfar(far, &hpfar)) return false; } else { diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index 3330d1b76bdd..07d37ff88a3f 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -367,7 +367,7 @@ static bool kvm_hyp_handle_dabt_low(struct kvm_vcpu *vcpu, u64 *exit_code) if (static_branch_unlikely(&vgic_v2_cpuif_trap)) { bool valid; - valid = kvm_vcpu_trap_get_fault_type(vcpu) == FSC_FAULT && + valid = kvm_vcpu_trap_get_fault_type(vcpu) == ESR_ELx_FSC_FAULT && kvm_vcpu_dabt_isvalid(vcpu) && !kvm_vcpu_abt_issea(vcpu) && !kvm_vcpu_abt_iss1tw(vcpu); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 31d7fa4c7c14..a3ee3b605c9b 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1212,7 +1212,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu); VM_BUG_ON(write_fault && exec_fault); - if (fault_status == FSC_PERM && !write_fault && !exec_fault) { + if (fault_status == ESR_ELx_FSC_PERM && !write_fault && !exec_fault) { kvm_err("Unexpected L2 read permission error\n"); return -EFAULT; } @@ -1277,7 +1277,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * only exception to this is when dirty logging is enabled at runtime * and a write fault needs to collapse a block entry into a table. */ - if (fault_status != FSC_PERM || (logging_active && write_fault)) { + if (fault_status != ESR_ELx_FSC_PERM || + (logging_active && write_fault)) { ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm)); if (ret) @@ -1342,7 +1343,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * backed by a THP and thus use block mapping if possible. */ if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) { - if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE) + if (fault_status == ESR_ELx_FSC_PERM && + fault_granule > PAGE_SIZE) vma_pagesize = fault_granule; else vma_pagesize = transparent_hugepage_adjust(kvm, memslot, @@ -1350,7 +1352,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, &fault_ipa); } - if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) { + if (fault_status != ESR_ELx_FSC_PERM && !device && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new disallowed VMA */ if (kvm_vma_mte_allowed(vma)) { sanitise_mte_tags(kvm, pfn, vma_pagesize); @@ -1376,7 +1378,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * permissions only if vma_pagesize equals fault_granule. Otherwise, * kvm_pgtable_stage2_map() should be called to change block size. */ - if (fault_status == FSC_PERM && vma_pagesize == fault_granule) + if (fault_status == ESR_ELx_FSC_PERM && vma_pagesize == fault_granule) ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot); else ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize, @@ -1441,7 +1443,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) fault_ipa = kvm_vcpu_get_fault_ipa(vcpu); is_iabt = kvm_vcpu_trap_is_iabt(vcpu); - if (fault_status == FSC_FAULT) { + if (fault_status == ESR_ELx_FSC_FAULT) { /* Beyond sanitised PARange (which is the IPA limit) */ if (fault_ipa >= BIT_ULL(get_kvm_ipa_limit())) { kvm_inject_size_fault(vcpu); @@ -1476,8 +1478,9 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) kvm_vcpu_get_hfar(vcpu), fault_ipa); /* Check the stage-2 fault is trans. fault or write fault */ - if (fault_status != FSC_FAULT && fault_status != FSC_PERM && - fault_status != FSC_ACCESS) { + if (fault_status != ESR_ELx_FSC_FAULT && + fault_status != ESR_ELx_FSC_PERM && + fault_status != ESR_ELx_FSC_ACCESS) { kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n", kvm_vcpu_trap_get_class(vcpu), (unsigned long)kvm_vcpu_trap_get_fault(vcpu), @@ -1539,7 +1542,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) /* Userspace should not be able to register out-of-bounds IPAs */ VM_BUG_ON(fault_ipa >= kvm_phys_size(vcpu->kvm)); - if (fault_status == FSC_ACCESS) { + if (fault_status == ESR_ELx_FSC_ACCESS) { handle_access_fault(vcpu, fault_ipa); ret = 1; goto out_unlock; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index d5ee52d6bf73..c6cbfe6b854b 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -646,7 +646,7 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) return; /* Only preserve PMCR_EL0.N, and reset the rest to 0 */ - pmcr = read_sysreg(pmcr_el0) & ARMV8_PMU_PMCR_N_MASK; + pmcr = read_sysreg(pmcr_el0) & (ARMV8_PMU_PMCR_N_MASK << ARMV8_PMU_PMCR_N_SHIFT); if (!kvm_supports_32bit_el0()) pmcr |= ARMV8_PMU_PMCR_LC; diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 826ff6f2a4e7..2074521d4a8c 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -616,6 +616,8 @@ static const struct midr_range broken_seis[] = { MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_PRO), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM_MAX), MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM_MAX), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_BLIZZARD), + MIDR_ALL_VERSIONS(MIDR_APPLE_M2_AVALANCHE), {}, }; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 35e9a468d13e..95364e8bdc19 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -559,3 +559,24 @@ bool __init arch_hugetlb_valid_size(unsigned long size) { return __hugetlb_valid_size(size); } + +pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) +{ + if (IS_ENABLED(CONFIG_ARM64_ERRATUM_2645198) && + cpus_have_const_cap(ARM64_WORKAROUND_2645198)) { + /* + * Break-before-make (BBM) is required for all user space mappings + * when the permission changes from executable to non-executable + * in cases where cpu is affected with errata #2645198. + */ + if (pte_user_exec(READ_ONCE(*ptep))) + return huge_ptep_clear_flush(vma, addr, ptep); + } + return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); +} + +void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte) +{ + set_huge_pte_at(vma->vm_mm, addr, ptep, pte); +} diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 14c87e8d69d8..d77c9f56b7b4 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1630,3 +1630,24 @@ static int __init prevent_bootmem_remove_init(void) } early_initcall(prevent_bootmem_remove_init); #endif + +pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) +{ + if (IS_ENABLED(CONFIG_ARM64_ERRATUM_2645198) && + cpus_have_const_cap(ARM64_WORKAROUND_2645198)) { + /* + * Break-before-make (BBM) is required for all user space mappings + * when the permission changes from executable to non-executable + * in cases where cpu is affected with errata #2645198. + */ + if (pte_user_exec(READ_ONCE(*ptep))) + return ptep_clear_flush(vma, addr, ptep); + } + return ptep_get_and_clear(vma->vm_mm, addr, ptep); +} + +void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte) +{ + set_pte_at(vma->vm_mm, addr, ptep, pte); +} diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps index a86ee376920a..dfeb2c51e257 100644 --- a/arch/arm64/tools/cpucaps +++ b/arch/arm64/tools/cpucaps @@ -71,6 +71,7 @@ WORKAROUND_2038923 WORKAROUND_2064142 WORKAROUND_2077057 WORKAROUND_2457168 +WORKAROUND_2645198 WORKAROUND_2658417 WORKAROUND_TRBE_OVERWRITE_FILL_MODE WORKAROUND_TSB_FLUSH_FAILURE diff --git a/arch/ia64/kernel/elfcore.c b/arch/ia64/kernel/elfcore.c index 94680521fbf9..8895df121540 100644 --- a/arch/ia64/kernel/elfcore.c +++ b/arch/ia64/kernel/elfcore.c @@ -7,7 +7,7 @@ #include <asm/elf.h> -Elf64_Half elf_core_extra_phdrs(void) +Elf64_Half elf_core_extra_phdrs(struct coredump_params *cprm) { return GATE_EHDR->e_phnum; } @@ -60,7 +60,7 @@ int elf_core_write_extra_data(struct coredump_params *cprm) return 1; } -size_t elf_core_extra_data_size(void) +size_t elf_core_extra_data_size(struct coredump_params *cprm) { const struct elf_phdr *const gate_phdrs = (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); diff --git a/arch/loongarch/include/asm/ftrace.h b/arch/loongarch/include/asm/ftrace.h index 90f9d3399b2a..3418d32d4fc7 100644 --- a/arch/loongarch/include/asm/ftrace.h +++ b/arch/loongarch/include/asm/ftrace.h @@ -10,8 +10,6 @@ #define FTRACE_REGS_PLT_IDX 1 #define NR_FTRACE_PLTS 2 -#define GRAPH_FAKE_OFFSET (sizeof(struct pt_regs) - offsetof(struct pt_regs, regs[1])) - #ifdef CONFIG_FUNCTION_TRACER #define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ diff --git a/arch/loongarch/include/asm/inst.h b/arch/loongarch/include/asm/inst.h index c00e1512d4fa..7eedd83fd0d7 100644 --- a/arch/loongarch/include/asm/inst.h +++ b/arch/loongarch/include/asm/inst.h @@ -377,14 +377,6 @@ static inline bool unsigned_imm_check(unsigned long val, unsigned int bit) return val < (1UL << bit); } -static inline unsigned long sign_extend(unsigned long val, unsigned int idx) -{ - if (!is_imm_negative(val, idx + 1)) - return ((1UL << idx) - 1) & val; - else - return ~((1UL << idx) - 1) | val; -} - #define DEF_EMIT_REG0I26_FORMAT(NAME, OP) \ static inline void emit_##NAME(union loongarch_instruction *insn, \ int offset) \ @@ -401,6 +393,7 @@ static inline void emit_##NAME(union loongarch_instruction *insn, \ } DEF_EMIT_REG0I26_FORMAT(b, b_op) +DEF_EMIT_REG0I26_FORMAT(bl, bl_op) #define DEF_EMIT_REG1I20_FORMAT(NAME, OP) \ static inline void emit_##NAME(union loongarch_instruction *insn, \ diff --git a/arch/loongarch/include/asm/unwind.h b/arch/loongarch/include/asm/unwind.h index f2b52b9ea93d..b9dce87afd2e 100644 --- a/arch/loongarch/include/asm/unwind.h +++ b/arch/loongarch/include/asm/unwind.h @@ -8,7 +8,9 @@ #define _ASM_UNWIND_H #include <linux/sched.h> +#include <linux/ftrace.h> +#include <asm/ptrace.h> #include <asm/stacktrace.h> enum unwinder_type { @@ -20,11 +22,13 @@ struct unwind_state { char type; /* UNWINDER_XXX */ struct stack_info stack_info; struct task_struct *task; - bool first, error, is_ftrace; + bool first, error, reset; int graph_idx; unsigned long sp, pc, ra; }; +bool default_next_frame(struct unwind_state *state); + void unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs); bool unwind_next_frame(struct unwind_state *state); @@ -40,4 +44,39 @@ static inline bool unwind_error(struct unwind_state *state) return state->error; } +#define GRAPH_FAKE_OFFSET (sizeof(struct pt_regs) - offsetof(struct pt_regs, regs[1])) + +static inline unsigned long unwind_graph_addr(struct unwind_state *state, + unsigned long pc, unsigned long cfa) +{ + return ftrace_graph_ret_addr(state->task, &state->graph_idx, + pc, (unsigned long *)(cfa - GRAPH_FAKE_OFFSET)); +} + +static __always_inline void __unwind_start(struct unwind_state *state, + struct task_struct *task, struct pt_regs *regs) +{ + memset(state, 0, sizeof(*state)); + if (regs) { + state->sp = regs->regs[3]; + state->pc = regs->csr_era; + state->ra = regs->regs[1]; + } else if (task && task != current) { + state->sp = thread_saved_fp(task); + state->pc = thread_saved_ra(task); + state->ra = 0; + } else { + state->sp = (unsigned long)__builtin_frame_address(0); + state->pc = (unsigned long)__builtin_return_address(0); + state->ra = 0; + } + state->task = task; + get_stack_info(state->sp, state->task, &state->stack_info); + state->pc = unwind_graph_addr(state, state->pc, state->sp); +} + +static __always_inline unsigned long __unwind_get_return_address(struct unwind_state *state) +{ + return unwind_done(state) ? 0 : state->pc; +} #endif /* _ASM_UNWIND_H */ diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile index fcaa024a685e..c8cfbd562921 100644 --- a/arch/loongarch/kernel/Makefile +++ b/arch/loongarch/kernel/Makefile @@ -8,7 +8,7 @@ extra-y := vmlinux.lds obj-y += head.o cpu-probe.o cacheinfo.o env.o setup.o entry.o genex.o \ traps.o irq.o idle.o process.o dma.o mem.o io.o reset.o switch.o \ elf.o syscall.o signal.o time.o topology.o inst.o ptrace.o vdso.o \ - alternative.o unaligned.o + alternative.o unaligned.o unwind.o obj-$(CONFIG_ACPI) += acpi.o obj-$(CONFIG_EFI) += efi.o diff --git a/arch/loongarch/kernel/alternative.c b/arch/loongarch/kernel/alternative.c index c5aebeac960b..4ad13847e962 100644 --- a/arch/loongarch/kernel/alternative.c +++ b/arch/loongarch/kernel/alternative.c @@ -74,7 +74,7 @@ static void __init_or_module recompute_jump(union loongarch_instruction *buf, switch (src->reg0i26_format.opcode) { case b_op: case bl_op: - jump_addr = cur_pc + sign_extend((si_h << 16 | si_l) << 2, 27); + jump_addr = cur_pc + sign_extend64((si_h << 16 | si_l) << 2, 27); if (in_alt_jump(jump_addr, start, end)) return; offset = jump_addr - pc; @@ -93,7 +93,7 @@ static void __init_or_module recompute_jump(union loongarch_instruction *buf, fallthrough; case beqz_op: case bnez_op: - jump_addr = cur_pc + sign_extend((si_h << 16 | si_l) << 2, 22); + jump_addr = cur_pc + sign_extend64((si_h << 16 | si_l) << 2, 22); if (in_alt_jump(jump_addr, start, end)) return; offset = jump_addr - pc; @@ -112,7 +112,7 @@ static void __init_or_module recompute_jump(union loongarch_instruction *buf, case bge_op: case bltu_op: case bgeu_op: - jump_addr = cur_pc + sign_extend(si << 2, 17); + jump_addr = cur_pc + sign_extend64(si << 2, 17); if (in_alt_jump(jump_addr, start, end)) return; offset = jump_addr - pc; diff --git a/arch/loongarch/kernel/cpu-probe.c b/arch/loongarch/kernel/cpu-probe.c index 255a09876ef2..3a3fce2d7846 100644 --- a/arch/loongarch/kernel/cpu-probe.c +++ b/arch/loongarch/kernel/cpu-probe.c @@ -94,7 +94,7 @@ static void cpu_probe_common(struct cpuinfo_loongarch *c) c->options = LOONGARCH_CPU_CPUCFG | LOONGARCH_CPU_CSR | LOONGARCH_CPU_TLB | LOONGARCH_CPU_VINT | LOONGARCH_CPU_WATCH; - elf_hwcap |= HWCAP_LOONGARCH_CRC32; + elf_hwcap = HWCAP_LOONGARCH_CPUCFG | HWCAP_LOONGARCH_CRC32; config = read_cpucfg(LOONGARCH_CPUCFG1); if (config & CPUCFG1_UAL) { diff --git a/arch/loongarch/kernel/genex.S b/arch/loongarch/kernel/genex.S index 75e5be807a0d..7e5c293ed89f 100644 --- a/arch/loongarch/kernel/genex.S +++ b/arch/loongarch/kernel/genex.S @@ -67,14 +67,17 @@ SYM_FUNC_END(except_vec_cex) .macro BUILD_HANDLER exception handler prep .align 5 SYM_FUNC_START(handle_\exception) + 666: BACKUP_T0T1 SAVE_ALL build_prep_\prep move a0, sp la.abs t0, do_\handler jirl ra, t0, 0 + 668: RESTORE_ALL_AND_RET SYM_FUNC_END(handle_\exception) + SYM_DATA(unwind_hint_\exception, .word 668b - 666b) .endm BUILD_HANDLER ade ade badv diff --git a/arch/loongarch/kernel/inst.c b/arch/loongarch/kernel/inst.c index 512579d79b22..badc59087042 100644 --- a/arch/loongarch/kernel/inst.c +++ b/arch/loongarch/kernel/inst.c @@ -58,7 +58,6 @@ u32 larch_insn_gen_nop(void) u32 larch_insn_gen_b(unsigned long pc, unsigned long dest) { long offset = dest - pc; - unsigned int immediate_l, immediate_h; union loongarch_instruction insn; if ((offset & 3) || offset < -SZ_128M || offset >= SZ_128M) { @@ -66,15 +65,7 @@ u32 larch_insn_gen_b(unsigned long pc, unsigned long dest) return INSN_BREAK; } - offset >>= 2; - - immediate_l = offset & 0xffff; - offset >>= 16; - immediate_h = offset & 0x3ff; - - insn.reg0i26_format.opcode = b_op; - insn.reg0i26_format.immediate_l = immediate_l; - insn.reg0i26_format.immediate_h = immediate_h; + emit_b(&insn, offset >> 2); return insn.word; } @@ -82,7 +73,6 @@ u32 larch_insn_gen_b(unsigned long pc, unsigned long dest) u32 larch_insn_gen_bl(unsigned long pc, unsigned long dest) { long offset = dest - pc; - unsigned int immediate_l, immediate_h; union loongarch_instruction insn; if ((offset & 3) || offset < -SZ_128M || offset >= SZ_128M) { @@ -90,15 +80,7 @@ u32 larch_insn_gen_bl(unsigned long pc, unsigned long dest) return INSN_BREAK; } - offset >>= 2; - - immediate_l = offset & 0xffff; - offset >>= 16; - immediate_h = offset & 0x3ff; - - insn.reg0i26_format.opcode = bl_op; - insn.reg0i26_format.immediate_l = immediate_l; - insn.reg0i26_format.immediate_h = immediate_h; + emit_bl(&insn, offset >> 2); return insn.word; } @@ -107,10 +89,7 @@ u32 larch_insn_gen_or(enum loongarch_gpr rd, enum loongarch_gpr rj, enum loongar { union loongarch_instruction insn; - insn.reg3_format.opcode = or_op; - insn.reg3_format.rd = rd; - insn.reg3_format.rj = rj; - insn.reg3_format.rk = rk; + emit_or(&insn, rd, rj, rk); return insn.word; } @@ -124,9 +103,7 @@ u32 larch_insn_gen_lu12iw(enum loongarch_gpr rd, int imm) { union loongarch_instruction insn; - insn.reg1i20_format.opcode = lu12iw_op; - insn.reg1i20_format.rd = rd; - insn.reg1i20_format.immediate = imm; + emit_lu12iw(&insn, rd, imm); return insn.word; } @@ -135,9 +112,7 @@ u32 larch_insn_gen_lu32id(enum loongarch_gpr rd, int imm) { union loongarch_instruction insn; - insn.reg1i20_format.opcode = lu32id_op; - insn.reg1i20_format.rd = rd; - insn.reg1i20_format.immediate = imm; + emit_lu32id(&insn, rd, imm); return insn.word; } @@ -146,10 +121,7 @@ u32 larch_insn_gen_lu52id(enum loongarch_gpr rd, enum loongarch_gpr rj, int imm) { union loongarch_instruction insn; - insn.reg2i12_format.opcode = lu52id_op; - insn.reg2i12_format.rd = rd; - insn.reg2i12_format.rj = rj; - insn.reg2i12_format.immediate = imm; + emit_lu52id(&insn, rd, rj, imm); return insn.word; } @@ -158,10 +130,7 @@ u32 larch_insn_gen_jirl(enum loongarch_gpr rd, enum loongarch_gpr rj, unsigned l { union loongarch_instruction insn; - insn.reg2i16_format.opcode = jirl_op; - insn.reg2i16_format.rd = rd; - insn.reg2i16_format.rj = rj; - insn.reg2i16_format.immediate = (dest - pc) >> 2; + emit_jirl(&insn, rj, rd, (dest - pc) >> 2); return insn.word; } diff --git a/arch/loongarch/kernel/process.c b/arch/loongarch/kernel/process.c index c583b1ef1f44..edfd220a3737 100644 --- a/arch/loongarch/kernel/process.c +++ b/arch/loongarch/kernel/process.c @@ -191,20 +191,14 @@ out: unsigned long __get_wchan(struct task_struct *task) { - unsigned long pc; + unsigned long pc = 0; struct unwind_state state; if (!try_get_task_stack(task)) return 0; - unwind_start(&state, task, NULL); - state.sp = thread_saved_fp(task); - get_stack_info(state.sp, state.task, &state.stack_info); - state.pc = thread_saved_ra(task); -#ifdef CONFIG_UNWINDER_PROLOGUE - state.type = UNWINDER_PROLOGUE; -#endif - for (; !unwind_done(&state); unwind_next_frame(&state)) { + for (unwind_start(&state, task, NULL); + !unwind_done(&state); unwind_next_frame(&state)) { pc = unwind_get_return_address(&state); if (!pc) break; diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 7ea62faeeadb..c38a146a973b 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -72,9 +72,6 @@ static void show_backtrace(struct task_struct *task, const struct pt_regs *regs, if (!task) task = current; - if (user_mode(regs)) - state.type = UNWINDER_GUESS; - printk("%sCall Trace:", loglvl); for (unwind_start(&state, task, pregs); !unwind_done(&state); unwind_next_frame(&state)) { diff --git a/arch/loongarch/kernel/unwind.c b/arch/loongarch/kernel/unwind.c new file mode 100644 index 000000000000..a463d6961344 --- /dev/null +++ b/arch/loongarch/kernel/unwind.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2022-2023 Loongson Technology Corporation Limited + */ +#include <linux/kernel.h> +#include <linux/ftrace.h> + +#include <asm/unwind.h> + +bool default_next_frame(struct unwind_state *state) +{ + struct stack_info *info = &state->stack_info; + unsigned long addr; + + if (unwind_done(state)) + return false; + + do { + for (state->sp += sizeof(unsigned long); + state->sp < info->end; state->sp += sizeof(unsigned long)) { + addr = *(unsigned long *)(state->sp); + state->pc = unwind_graph_addr(state, addr, state->sp + 8); + if (__kernel_text_address(state->pc)) + return true; + } + + state->sp = info->next_sp; + + } while (!get_stack_info(state->sp, state->task, info)); + + return false; +} diff --git a/arch/loongarch/kernel/unwind_guess.c b/arch/loongarch/kernel/unwind_guess.c index e2d2e4f3001f..98379b7d4147 100644 --- a/arch/loongarch/kernel/unwind_guess.c +++ b/arch/loongarch/kernel/unwind_guess.c @@ -2,37 +2,18 @@ /* * Copyright (C) 2022 Loongson Technology Corporation Limited */ -#include <linux/kernel.h> -#include <linux/ftrace.h> - #include <asm/unwind.h> unsigned long unwind_get_return_address(struct unwind_state *state) { - if (unwind_done(state)) - return 0; - else if (state->first) - return state->pc; - - return *(unsigned long *)(state->sp); + return __unwind_get_return_address(state); } EXPORT_SYMBOL_GPL(unwind_get_return_address); void unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs) { - memset(state, 0, sizeof(*state)); - - if (regs) { - state->sp = regs->regs[3]; - state->pc = regs->csr_era; - } - - state->task = task; - state->first = true; - - get_stack_info(state->sp, state->task, &state->stack_info); - + __unwind_start(state, task, regs); if (!unwind_done(state) && !__kernel_text_address(state->pc)) unwind_next_frame(state); } @@ -40,30 +21,6 @@ EXPORT_SYMBOL_GPL(unwind_start); bool unwind_next_frame(struct unwind_state *state) { - struct stack_info *info = &state->stack_info; - unsigned long addr; - - if (unwind_done(state)) - return false; - - if (state->first) - state->first = false; - - do { - for (state->sp += sizeof(unsigned long); - state->sp < info->end; - state->sp += sizeof(unsigned long)) { - addr = *(unsigned long *)(state->sp); - state->pc = ftrace_graph_ret_addr(state->task, &state->graph_idx, - addr, (unsigned long *)(state->sp - GRAPH_FAKE_OFFSET)); - if (__kernel_text_address(addr)) - return true; - } - - state->sp = info->next_sp; - - } while (!get_stack_info(state->sp, state->task, info)); - - return false; + return default_next_frame(state); } EXPORT_SYMBOL_GPL(unwind_next_frame); diff --git a/arch/loongarch/kernel/unwind_prologue.c b/arch/loongarch/kernel/unwind_prologue.c index 0f8d1451ebb8..9095fde8e55d 100644 --- a/arch/loongarch/kernel/unwind_prologue.c +++ b/arch/loongarch/kernel/unwind_prologue.c @@ -2,61 +2,116 @@ /* * Copyright (C) 2022 Loongson Technology Corporation Limited */ +#include <linux/cpumask.h> #include <linux/ftrace.h> #include <linux/kallsyms.h> #include <asm/inst.h> +#include <asm/loongson.h> #include <asm/ptrace.h> +#include <asm/setup.h> #include <asm/unwind.h> -static inline void unwind_state_fixup(struct unwind_state *state) -{ -#ifdef CONFIG_DYNAMIC_FTRACE - static unsigned long ftrace = (unsigned long)ftrace_call + 4; - - if (state->pc == ftrace) - state->is_ftrace = true; +extern const int unwind_hint_ade; +extern const int unwind_hint_ale; +extern const int unwind_hint_bp; +extern const int unwind_hint_fpe; +extern const int unwind_hint_fpu; +extern const int unwind_hint_lsx; +extern const int unwind_hint_lasx; +extern const int unwind_hint_lbt; +extern const int unwind_hint_ri; +extern const int unwind_hint_watch; +extern unsigned long eentry; +#ifdef CONFIG_NUMA +extern unsigned long pcpu_handlers[NR_CPUS]; #endif -} -unsigned long unwind_get_return_address(struct unwind_state *state) +static inline bool scan_handlers(unsigned long entry_offset) { + int idx, offset; - if (unwind_done(state)) - return 0; - else if (state->type) - return state->pc; - else if (state->first) - return state->pc; - - return *(unsigned long *)(state->sp); + if (entry_offset >= EXCCODE_INT_START * VECSIZE) + return false; + idx = entry_offset / VECSIZE; + offset = entry_offset % VECSIZE; + switch (idx) { + case EXCCODE_ADE: + return offset == unwind_hint_ade; + case EXCCODE_ALE: + return offset == unwind_hint_ale; + case EXCCODE_BP: + return offset == unwind_hint_bp; + case EXCCODE_FPE: + return offset == unwind_hint_fpe; + case EXCCODE_FPDIS: + return offset == unwind_hint_fpu; + case EXCCODE_LSXDIS: + return offset == unwind_hint_lsx; + case EXCCODE_LASXDIS: + return offset == unwind_hint_lasx; + case EXCCODE_BTDIS: + return offset == unwind_hint_lbt; + case EXCCODE_INE: + return offset == unwind_hint_ri; + case EXCCODE_WATCH: + return offset == unwind_hint_watch; + default: + return false; + } } -EXPORT_SYMBOL_GPL(unwind_get_return_address); -static bool unwind_by_guess(struct unwind_state *state) +static inline bool fix_exception(unsigned long pc) { - struct stack_info *info = &state->stack_info; - unsigned long addr; - - for (state->sp += sizeof(unsigned long); - state->sp < info->end; - state->sp += sizeof(unsigned long)) { - addr = *(unsigned long *)(state->sp); - state->pc = ftrace_graph_ret_addr(state->task, &state->graph_idx, - addr, (unsigned long *)(state->sp - GRAPH_FAKE_OFFSET)); - if (__kernel_text_address(addr)) +#ifdef CONFIG_NUMA + int cpu; + + for_each_possible_cpu(cpu) { + if (!pcpu_handlers[cpu]) + continue; + if (scan_handlers(pc - pcpu_handlers[cpu])) return true; } +#endif + return scan_handlers(pc - eentry); +} +/* + * As we meet ftrace_regs_entry, reset first flag like first doing + * tracing. Prologue analysis will stop soon because PC is at entry. + */ +static inline bool fix_ftrace(unsigned long pc) +{ +#ifdef CONFIG_DYNAMIC_FTRACE + return pc == (unsigned long)ftrace_call + LOONGARCH_INSN_SIZE; +#else return false; +#endif } +static inline bool unwind_state_fixup(struct unwind_state *state) +{ + if (!fix_exception(state->pc) && !fix_ftrace(state->pc)) + return false; + + state->reset = true; + return true; +} + +/* + * LoongArch function prologue is like follows, + * [instructions not use stack var] + * addi.d sp, sp, -imm + * st.d xx, sp, offset <- save callee saved regs and + * st.d yy, sp, offset save ra if function is nest. + * [others instructions] + */ static bool unwind_by_prologue(struct unwind_state *state) { long frame_ra = -1; unsigned long frame_size = 0; - unsigned long size, offset, pc = state->pc; + unsigned long size, offset, pc; struct pt_regs *regs; struct stack_info *info = &state->stack_info; union loongarch_instruction *ip, *ip_end; @@ -64,20 +119,21 @@ static bool unwind_by_prologue(struct unwind_state *state) if (state->sp >= info->end || state->sp < info->begin) return false; - if (state->is_ftrace) { - /* - * As we meet ftrace_regs_entry, reset first flag like first doing - * tracing. Prologue analysis will stop soon because PC is at entry. - */ + if (state->reset) { regs = (struct pt_regs *)state->sp; state->first = true; - state->is_ftrace = false; + state->reset = false; state->pc = regs->csr_era; state->ra = regs->regs[1]; state->sp = regs->regs[3]; return true; } + /* + * When first is not set, the PC is a return address in the previous frame. + * We need to adjust its value in case overflow to the next symbol. + */ + pc = state->pc - (state->first ? 0 : LOONGARCH_INSN_SIZE); if (!kallsyms_lookup_size_offset(pc, &size, &offset)) return false; @@ -93,6 +149,10 @@ static bool unwind_by_prologue(struct unwind_state *state) ip++; } + /* + * Can't find stack alloc action, PC may be in a leaf function. Only the + * first being true is reasonable, otherwise indicate analysis is broken. + */ if (!frame_size) { if (state->first) goto first; @@ -110,6 +170,7 @@ static bool unwind_by_prologue(struct unwind_state *state) ip++; } + /* Can't find save $ra action, PC may be in a leaf function, too. */ if (frame_ra < 0) { if (state->first) { state->sp = state->sp + frame_size; @@ -118,88 +179,47 @@ static bool unwind_by_prologue(struct unwind_state *state) return false; } - if (state->first) - state->first = false; - state->pc = *(unsigned long *)(state->sp + frame_ra); state->sp = state->sp + frame_size; goto out; first: - state->first = false; - if (state->pc == state->ra) - return false; - state->pc = state->ra; out: - unwind_state_fixup(state); - return !!__kernel_text_address(state->pc); -} - -void unwind_start(struct unwind_state *state, struct task_struct *task, - struct pt_regs *regs) -{ - memset(state, 0, sizeof(*state)); - - if (regs && __kernel_text_address(regs->csr_era)) { - state->pc = regs->csr_era; - state->sp = regs->regs[3]; - state->ra = regs->regs[1]; - state->type = UNWINDER_PROLOGUE; - } - - state->task = task; - state->first = true; - - get_stack_info(state->sp, state->task, &state->stack_info); - - if (!unwind_done(state) && !__kernel_text_address(state->pc)) - unwind_next_frame(state); + state->first = false; + return unwind_state_fixup(state) || __kernel_text_address(state->pc); } -EXPORT_SYMBOL_GPL(unwind_start); -bool unwind_next_frame(struct unwind_state *state) +static bool next_frame(struct unwind_state *state) { - struct stack_info *info = &state->stack_info; - struct pt_regs *regs; unsigned long pc; + struct pt_regs *regs; + struct stack_info *info = &state->stack_info; if (unwind_done(state)) return false; do { - switch (state->type) { - case UNWINDER_GUESS: - state->first = false; - if (unwind_by_guess(state)) - return true; - break; + if (unwind_by_prologue(state)) { + state->pc = unwind_graph_addr(state, state->pc, state->sp); + return true; + } + + if (info->type == STACK_TYPE_IRQ && info->end == state->sp) { + regs = (struct pt_regs *)info->next_sp; + pc = regs->csr_era; + + if (user_mode(regs) || !__kernel_text_address(pc)) + return false; + + state->first = true; + state->pc = pc; + state->ra = regs->regs[1]; + state->sp = regs->regs[3]; + get_stack_info(state->sp, state->task, info); - case UNWINDER_PROLOGUE: - if (unwind_by_prologue(state)) { - state->pc = ftrace_graph_ret_addr(state->task, &state->graph_idx, - state->pc, (unsigned long *)(state->sp - GRAPH_FAKE_OFFSET)); - return true; - } - - if (info->type == STACK_TYPE_IRQ && - info->end == state->sp) { - regs = (struct pt_regs *)info->next_sp; - pc = regs->csr_era; - - if (user_mode(regs) || !__kernel_text_address(pc)) - return false; - - state->first = true; - state->ra = regs->regs[1]; - state->sp = regs->regs[3]; - state->pc = ftrace_graph_ret_addr(state->task, &state->graph_idx, - pc, (unsigned long *)(state->sp - GRAPH_FAKE_OFFSET)); - get_stack_info(state->sp, state->task, info); - - return true; - } + return true; } state->sp = info->next_sp; @@ -208,4 +228,36 @@ bool unwind_next_frame(struct unwind_state *state) return false; } + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + return __unwind_get_return_address(state); +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +void unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs) +{ + __unwind_start(state, task, regs); + state->type = UNWINDER_PROLOGUE; + state->first = true; + + /* + * The current PC is not kernel text address, we cannot find its + * relative symbol. Thus, prologue analysis will be broken. Luckily, + * we can use the default_next_frame(). + */ + if (!__kernel_text_address(state->pc)) { + state->type = UNWINDER_GUESS; + if (!unwind_done(state)) + unwind_next_frame(state); + } +} +EXPORT_SYMBOL_GPL(unwind_start); + +bool unwind_next_frame(struct unwind_state *state) +{ + return state->type == UNWINDER_PROLOGUE ? + next_frame(state) : default_next_frame(state); +} EXPORT_SYMBOL_GPL(unwind_next_frame); diff --git a/arch/loongarch/mm/tlb.c b/arch/loongarch/mm/tlb.c index da3681f131c8..8bad6b0cff59 100644 --- a/arch/loongarch/mm/tlb.c +++ b/arch/loongarch/mm/tlb.c @@ -251,7 +251,7 @@ static void output_pgtable_bits_defines(void) } #ifdef CONFIG_NUMA -static unsigned long pcpu_handlers[NR_CPUS]; +unsigned long pcpu_handlers[NR_CPUS]; #endif extern long exception_handlers[VECSIZE * 128 / sizeof(long)]; diff --git a/arch/powerpc/boot/wrapper b/arch/powerpc/boot/wrapper index af04cea82b94..352d7de24018 100755 --- a/arch/powerpc/boot/wrapper +++ b/arch/powerpc/boot/wrapper @@ -210,6 +210,10 @@ ld_version() gsub(".*version ", ""); gsub("-.*", ""); split($1,a, "."); + if( length(a[3]) == "8" ) + # a[3] is probably a date of format yyyymmdd used for release snapshots. We + # can assume it to be zero as it does not signify a new version as such. + a[3] = 0; print a[1]*100000000 + a[2]*1000000 + a[3]*10000; exit }' diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h index 4f897993b710..699a88584ae1 100644 --- a/arch/powerpc/include/asm/imc-pmu.h +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -137,7 +137,7 @@ struct imc_pmu { * are inited. */ struct imc_pmu_ref { - struct mutex lock; + spinlock_t lock; unsigned int id; int refc; }; diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index 80a148c57de8..44a35ed4f686 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1012,7 +1012,7 @@ static void __init hash_init_partition_table(phys_addr_t hash_table, void hpt_clear_stress(void); static struct timer_list stress_hpt_timer; -void stress_hpt_timer_fn(struct timer_list *timer) +static void stress_hpt_timer_fn(struct timer_list *timer) { int next_cpu; diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index d517aba94d1b..100e97daf76b 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -14,6 +14,7 @@ #include <asm/cputhreads.h> #include <asm/smp.h> #include <linux/string.h> +#include <linux/spinlock.h> /* Nest IMC data structures and variables */ @@ -21,7 +22,7 @@ * Used to avoid races in counting the nest-pmu units during hotplug * register and unregister */ -static DEFINE_MUTEX(nest_init_lock); +static DEFINE_SPINLOCK(nest_init_lock); static DEFINE_PER_CPU(struct imc_pmu_ref *, local_nest_imc_refc); static struct imc_pmu **per_nest_pmu_arr; static cpumask_t nest_imc_cpumask; @@ -50,7 +51,7 @@ static int trace_imc_mem_size; * core and trace-imc */ static struct imc_pmu_ref imc_global_refc = { - .lock = __MUTEX_INITIALIZER(imc_global_refc.lock), + .lock = __SPIN_LOCK_INITIALIZER(imc_global_refc.lock), .id = 0, .refc = 0, }; @@ -400,7 +401,7 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu) get_hard_smp_processor_id(cpu)); /* * If this is the last cpu in this chip then, skip the reference - * count mutex lock and make the reference count on this chip zero. + * count lock and make the reference count on this chip zero. */ ref = get_nest_pmu_ref(cpu); if (!ref) @@ -462,15 +463,15 @@ static void nest_imc_counters_release(struct perf_event *event) /* * See if we need to disable the nest PMU. * If no events are currently in use, then we have to take a - * mutex to ensure that we don't race with another task doing + * lock to ensure that we don't race with another task doing * enable or disable the nest counters. */ ref = get_nest_pmu_ref(event->cpu); if (!ref) return; - /* Take the mutex lock for this node and then decrement the reference count */ - mutex_lock(&ref->lock); + /* Take the lock for this node and then decrement the reference count */ + spin_lock(&ref->lock); if (ref->refc == 0) { /* * The scenario where this is true is, when perf session is @@ -482,7 +483,7 @@ static void nest_imc_counters_release(struct perf_event *event) * an OPAL call to disable the engine in that node. * */ - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); return; } ref->refc--; @@ -490,7 +491,7 @@ static void nest_imc_counters_release(struct perf_event *event) rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, get_hard_smp_processor_id(event->cpu)); if (rc) { - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); pr_err("nest-imc: Unable to stop the counters for core %d\n", node_id); return; } @@ -498,7 +499,7 @@ static void nest_imc_counters_release(struct perf_event *event) WARN(1, "nest-imc: Invalid event reference count\n"); ref->refc = 0; } - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); } static int nest_imc_event_init(struct perf_event *event) @@ -557,26 +558,25 @@ static int nest_imc_event_init(struct perf_event *event) /* * Get the imc_pmu_ref struct for this node. - * Take the mutex lock and then increment the count of nest pmu events - * inited. + * Take the lock and then increment the count of nest pmu events inited. */ ref = get_nest_pmu_ref(event->cpu); if (!ref) return -EINVAL; - mutex_lock(&ref->lock); + spin_lock(&ref->lock); if (ref->refc == 0) { rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_NEST, get_hard_smp_processor_id(event->cpu)); if (rc) { - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); pr_err("nest-imc: Unable to start the counters for node %d\n", node_id); return rc; } } ++ref->refc; - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); event->destroy = nest_imc_counters_release; return 0; @@ -612,9 +612,8 @@ static int core_imc_mem_init(int cpu, int size) return -ENOMEM; mem_info->vbase = page_address(page); - /* Init the mutex */ core_imc_refc[core_id].id = core_id; - mutex_init(&core_imc_refc[core_id].lock); + spin_lock_init(&core_imc_refc[core_id].lock); rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_CORE, __pa((void *)mem_info->vbase), @@ -703,9 +702,8 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu) perf_pmu_migrate_context(&core_imc_pmu->pmu, cpu, ncpu); } else { /* - * If this is the last cpu in this core then, skip taking refernce - * count mutex lock for this core and directly zero "refc" for - * this core. + * If this is the last cpu in this core then skip taking reference + * count lock for this core and directly zero "refc" for this core. */ opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, get_hard_smp_processor_id(cpu)); @@ -720,11 +718,11 @@ static int ppc_core_imc_cpu_offline(unsigned int cpu) * last cpu in this core and core-imc event running * in this cpu. */ - mutex_lock(&imc_global_refc.lock); + spin_lock(&imc_global_refc.lock); if (imc_global_refc.id == IMC_DOMAIN_CORE) imc_global_refc.refc--; - mutex_unlock(&imc_global_refc.lock); + spin_unlock(&imc_global_refc.lock); } return 0; } @@ -739,7 +737,7 @@ static int core_imc_pmu_cpumask_init(void) static void reset_global_refc(struct perf_event *event) { - mutex_lock(&imc_global_refc.lock); + spin_lock(&imc_global_refc.lock); imc_global_refc.refc--; /* @@ -751,7 +749,7 @@ static void reset_global_refc(struct perf_event *event) imc_global_refc.refc = 0; imc_global_refc.id = 0; } - mutex_unlock(&imc_global_refc.lock); + spin_unlock(&imc_global_refc.lock); } static void core_imc_counters_release(struct perf_event *event) @@ -764,17 +762,17 @@ static void core_imc_counters_release(struct perf_event *event) /* * See if we need to disable the IMC PMU. * If no events are currently in use, then we have to take a - * mutex to ensure that we don't race with another task doing + * lock to ensure that we don't race with another task doing * enable or disable the core counters. */ core_id = event->cpu / threads_per_core; - /* Take the mutex lock and decrement the refernce count for this core */ + /* Take the lock and decrement the refernce count for this core */ ref = &core_imc_refc[core_id]; if (!ref) return; - mutex_lock(&ref->lock); + spin_lock(&ref->lock); if (ref->refc == 0) { /* * The scenario where this is true is, when perf session is @@ -786,7 +784,7 @@ static void core_imc_counters_release(struct perf_event *event) * an OPAL call to disable the engine in that core. * */ - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); return; } ref->refc--; @@ -794,7 +792,7 @@ static void core_imc_counters_release(struct perf_event *event) rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, get_hard_smp_processor_id(event->cpu)); if (rc) { - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); pr_err("IMC: Unable to stop the counters for core %d\n", core_id); return; } @@ -802,7 +800,7 @@ static void core_imc_counters_release(struct perf_event *event) WARN(1, "core-imc: Invalid event reference count\n"); ref->refc = 0; } - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); reset_global_refc(event); } @@ -840,7 +838,6 @@ static int core_imc_event_init(struct perf_event *event) if ((!pcmi->vbase)) return -ENODEV; - /* Get the core_imc mutex for this core */ ref = &core_imc_refc[core_id]; if (!ref) return -EINVAL; @@ -848,22 +845,22 @@ static int core_imc_event_init(struct perf_event *event) /* * Core pmu units are enabled only when it is used. * See if this is triggered for the first time. - * If yes, take the mutex lock and enable the core counters. + * If yes, take the lock and enable the core counters. * If not, just increment the count in core_imc_refc struct. */ - mutex_lock(&ref->lock); + spin_lock(&ref->lock); if (ref->refc == 0) { rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE, get_hard_smp_processor_id(event->cpu)); if (rc) { - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); pr_err("core-imc: Unable to start the counters for core %d\n", core_id); return rc; } } ++ref->refc; - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); /* * Since the system can run either in accumulation or trace-mode @@ -874,7 +871,7 @@ static int core_imc_event_init(struct perf_event *event) * to know whether any other trace/thread imc * events are running. */ - mutex_lock(&imc_global_refc.lock); + spin_lock(&imc_global_refc.lock); if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_CORE) { /* * No other trace/thread imc events are running in @@ -883,10 +880,10 @@ static int core_imc_event_init(struct perf_event *event) imc_global_refc.id = IMC_DOMAIN_CORE; imc_global_refc.refc++; } else { - mutex_unlock(&imc_global_refc.lock); + spin_unlock(&imc_global_refc.lock); return -EBUSY; } - mutex_unlock(&imc_global_refc.lock); + spin_unlock(&imc_global_refc.lock); event->hw.event_base = (u64)pcmi->vbase + (config & IMC_EVENT_OFFSET_MASK); event->destroy = core_imc_counters_release; @@ -958,10 +955,10 @@ static int ppc_thread_imc_cpu_offline(unsigned int cpu) mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63)))); /* Reduce the refc if thread-imc event running on this cpu */ - mutex_lock(&imc_global_refc.lock); + spin_lock(&imc_global_refc.lock); if (imc_global_refc.id == IMC_DOMAIN_THREAD) imc_global_refc.refc--; - mutex_unlock(&imc_global_refc.lock); + spin_unlock(&imc_global_refc.lock); return 0; } @@ -1001,7 +998,7 @@ static int thread_imc_event_init(struct perf_event *event) if (!target) return -EINVAL; - mutex_lock(&imc_global_refc.lock); + spin_lock(&imc_global_refc.lock); /* * Check if any other trace/core imc events are running in the * system, if not set the global id to thread-imc. @@ -1010,10 +1007,10 @@ static int thread_imc_event_init(struct perf_event *event) imc_global_refc.id = IMC_DOMAIN_THREAD; imc_global_refc.refc++; } else { - mutex_unlock(&imc_global_refc.lock); + spin_unlock(&imc_global_refc.lock); return -EBUSY; } - mutex_unlock(&imc_global_refc.lock); + spin_unlock(&imc_global_refc.lock); event->pmu->task_ctx_nr = perf_sw_context; event->destroy = reset_global_refc; @@ -1135,25 +1132,25 @@ static int thread_imc_event_add(struct perf_event *event, int flags) /* * imc pmus are enabled only when it is used. * See if this is triggered for the first time. - * If yes, take the mutex lock and enable the counters. + * If yes, take the lock and enable the counters. * If not, just increment the count in ref count struct. */ ref = &core_imc_refc[core_id]; if (!ref) return -EINVAL; - mutex_lock(&ref->lock); + spin_lock(&ref->lock); if (ref->refc == 0) { if (opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE, get_hard_smp_processor_id(smp_processor_id()))) { - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); pr_err("thread-imc: Unable to start the counter\ for core %d\n", core_id); return -EINVAL; } } ++ref->refc; - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); return 0; } @@ -1170,12 +1167,12 @@ static void thread_imc_event_del(struct perf_event *event, int flags) return; } - mutex_lock(&ref->lock); + spin_lock(&ref->lock); ref->refc--; if (ref->refc == 0) { if (opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, get_hard_smp_processor_id(smp_processor_id()))) { - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); pr_err("thread-imc: Unable to stop the counters\ for core %d\n", core_id); return; @@ -1183,7 +1180,7 @@ static void thread_imc_event_del(struct perf_event *event, int flags) } else if (ref->refc < 0) { ref->refc = 0; } - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); /* Set bit 0 of LDBAR to zero, to stop posting updates to memory */ mtspr(SPRN_LDBAR, (mfspr(SPRN_LDBAR) & (~(1UL << 63)))); @@ -1224,9 +1221,8 @@ static int trace_imc_mem_alloc(int cpu_id, int size) } } - /* Init the mutex, if not already */ trace_imc_refc[core_id].id = core_id; - mutex_init(&trace_imc_refc[core_id].lock); + spin_lock_init(&trace_imc_refc[core_id].lock); mtspr(SPRN_LDBAR, 0); return 0; @@ -1246,10 +1242,10 @@ static int ppc_trace_imc_cpu_offline(unsigned int cpu) * Reduce the refc if any trace-imc event running * on this cpu. */ - mutex_lock(&imc_global_refc.lock); + spin_lock(&imc_global_refc.lock); if (imc_global_refc.id == IMC_DOMAIN_TRACE) imc_global_refc.refc--; - mutex_unlock(&imc_global_refc.lock); + spin_unlock(&imc_global_refc.lock); return 0; } @@ -1371,17 +1367,17 @@ static int trace_imc_event_add(struct perf_event *event, int flags) } mtspr(SPRN_LDBAR, ldbar_value); - mutex_lock(&ref->lock); + spin_lock(&ref->lock); if (ref->refc == 0) { if (opal_imc_counters_start(OPAL_IMC_COUNTERS_TRACE, get_hard_smp_processor_id(smp_processor_id()))) { - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); pr_err("trace-imc: Unable to start the counters for core %d\n", core_id); return -EINVAL; } } ++ref->refc; - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); return 0; } @@ -1414,19 +1410,19 @@ static void trace_imc_event_del(struct perf_event *event, int flags) return; } - mutex_lock(&ref->lock); + spin_lock(&ref->lock); ref->refc--; if (ref->refc == 0) { if (opal_imc_counters_stop(OPAL_IMC_COUNTERS_TRACE, get_hard_smp_processor_id(smp_processor_id()))) { - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); pr_err("trace-imc: Unable to stop the counters for core %d\n", core_id); return; } } else if (ref->refc < 0) { ref->refc = 0; } - mutex_unlock(&ref->lock); + spin_unlock(&ref->lock); trace_imc_event_stop(event, flags); } @@ -1448,7 +1444,7 @@ static int trace_imc_event_init(struct perf_event *event) * no other thread is running any core/thread imc * events */ - mutex_lock(&imc_global_refc.lock); + spin_lock(&imc_global_refc.lock); if (imc_global_refc.id == 0 || imc_global_refc.id == IMC_DOMAIN_TRACE) { /* * No core/thread imc events are running in the @@ -1457,10 +1453,10 @@ static int trace_imc_event_init(struct perf_event *event) imc_global_refc.id = IMC_DOMAIN_TRACE; imc_global_refc.refc++; } else { - mutex_unlock(&imc_global_refc.lock); + spin_unlock(&imc_global_refc.lock); return -EBUSY; } - mutex_unlock(&imc_global_refc.lock); + spin_unlock(&imc_global_refc.lock); event->hw.idx = -1; @@ -1533,10 +1529,10 @@ static int init_nest_pmu_ref(void) i = 0; for_each_node(nid) { /* - * Mutex lock to avoid races while tracking the number of + * Take the lock to avoid races while tracking the number of * sessions using the chip's nest pmu units. */ - mutex_init(&nest_imc_refc[i].lock); + spin_lock_init(&nest_imc_refc[i].lock); /* * Loop to init the "id" with the node_id. Variable "i" initialized to @@ -1633,7 +1629,7 @@ static void imc_common_mem_free(struct imc_pmu *pmu_ptr) static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) { if (pmu_ptr->domain == IMC_DOMAIN_NEST) { - mutex_lock(&nest_init_lock); + spin_lock(&nest_init_lock); if (nest_pmus == 1) { cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE); kfree(nest_imc_refc); @@ -1643,7 +1639,7 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) if (nest_pmus > 0) nest_pmus--; - mutex_unlock(&nest_init_lock); + spin_unlock(&nest_init_lock); } /* Free core_imc memory */ @@ -1800,11 +1796,11 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id * rest. To handle the cpuhotplug callback unregister, we track * the number of nest pmus in "nest_pmus". */ - mutex_lock(&nest_init_lock); + spin_lock(&nest_init_lock); if (nest_pmus == 0) { ret = init_nest_pmu_ref(); if (ret) { - mutex_unlock(&nest_init_lock); + spin_unlock(&nest_init_lock); kfree(per_nest_pmu_arr); per_nest_pmu_arr = NULL; goto err_free_mem; @@ -1812,7 +1808,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id /* Register for cpu hotplug notification. */ ret = nest_pmu_cpumask_init(); if (ret) { - mutex_unlock(&nest_init_lock); + spin_unlock(&nest_init_lock); kfree(nest_imc_refc); kfree(per_nest_pmu_arr); per_nest_pmu_arr = NULL; @@ -1820,7 +1816,7 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id } } nest_pmus++; - mutex_unlock(&nest_init_lock); + spin_unlock(&nest_init_lock); break; case IMC_DOMAIN_CORE: ret = core_imc_pmu_cpumask_init(); diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 2b6091349daa..696c9e007a36 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -508,6 +508,7 @@ static void __init setup_lowcore_dat_on(void) { struct lowcore *abs_lc; unsigned long flags; + int i; __ctl_clear_bit(0, 28); S390_lowcore.external_new_psw.mask |= PSW_MASK_DAT; @@ -523,8 +524,8 @@ static void __init setup_lowcore_dat_on(void) abs_lc = get_abs_lowcore(&flags); abs_lc->restart_flags = RESTART_FLAG_CTLREGS; abs_lc->program_new_psw = S390_lowcore.program_new_psw; - memcpy(abs_lc->cregs_save_area, S390_lowcore.cregs_save_area, - sizeof(abs_lc->cregs_save_area)); + for (i = 0; i < 16; i++) + abs_lc->cregs_save_area[i] = S390_lowcore.cregs_save_area[i]; put_abs_lowcore(abs_lc, flags); } diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S index 5521ea12f44e..aa9b96457584 100644 --- a/arch/x86/boot/bioscall.S +++ b/arch/x86/boot/bioscall.S @@ -32,7 +32,7 @@ intcall: movw %dx, %si movw %sp, %di movw $11, %cx - rep; movsd + rep; movsl /* Pop full state from the stack */ popal @@ -67,7 +67,7 @@ intcall: jz 4f movw %sp, %si movw $11, %cx - rep; movsd + rep; movsl 4: addw $44, %sp /* Restore state and return */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f35f1ff4427b..6aaae18f1854 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1111,6 +1111,7 @@ struct msr_bitmap_range { /* Xen emulation context */ struct kvm_xen { + struct mutex xen_lock; u32 xen_version; bool long_mode; bool runstate_update_flag; diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index efe0c30d3a12..77538abeb72a 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -146,6 +146,30 @@ static inline struct rmid_entry *__rmid_entry(u32 rmid) return entry; } +static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val) +{ + u64 msr_val; + + /* + * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured + * with a valid event code for supported resource type and the bits + * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, + * IA32_QM_CTR.data (bits 61:0) reports the monitored data. + * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) + * are error bits. + */ + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + rdmsrl(MSR_IA32_QM_CTR, msr_val); + + if (msr_val & RMID_VAL_ERROR) + return -EIO; + if (msr_val & RMID_VAL_UNAVAIL) + return -EINVAL; + + *val = msr_val; + return 0; +} + static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom, u32 rmid, enum resctrl_event_id eventid) @@ -172,8 +196,12 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d, struct arch_mbm_state *am; am = get_arch_mbm_state(hw_dom, rmid, eventid); - if (am) + if (am) { memset(am, 0, sizeof(*am)); + + /* Record any initial, non-zero count value. */ + __rmid_read(rmid, eventid, &am->prev_msr); + } } static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width) @@ -191,25 +219,14 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d, struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d); struct arch_mbm_state *am; u64 msr_val, chunks; + int ret; if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask)) return -EINVAL; - /* - * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured - * with a valid event code for supported resource type and the bits - * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, - * IA32_QM_CTR.data (bits 61:0) reports the monitored data. - * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) - * are error bits. - */ - wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); - rdmsrl(MSR_IA32_QM_CTR, msr_val); - - if (msr_val & RMID_VAL_ERROR) - return -EIO; - if (msr_val & RMID_VAL_UNAVAIL) - return -EINVAL; + ret = __rmid_read(rmid, eventid, &msr_val); + if (ret) + return ret; am = get_arch_mbm_state(hw_dom, rmid, eventid); if (am) { diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index e5a48f05e787..5993da21d822 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -580,8 +580,10 @@ static int __rdtgroup_move_task(struct task_struct *tsk, /* * Ensure the task's closid and rmid are written before determining if * the task is current that will decide if it will be interrupted. + * This pairs with the full barrier between the rq->curr update and + * resctrl_sched_in() during context switch. */ - barrier(); + smp_mb(); /* * By now, the task's closid and rmid are set. If the task is current @@ -2402,6 +2404,14 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, WRITE_ONCE(t->rmid, to->mon.rmid); /* + * Order the closid/rmid stores above before the loads + * in task_curr(). This pairs with the full barrier + * between the rq->curr update and resctrl_sched_in() + * during context switch. + */ + smp_mb(); + + /* * If the task is on a CPU, set the CPU in the mask. * The detection is inaccurate as tasks might move or * schedule before the smp function call takes place. diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b14653b61470..596061c1610e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -770,16 +770,22 @@ struct kvm_cpuid_array { int nent; }; +static struct kvm_cpuid_entry2 *get_next_cpuid(struct kvm_cpuid_array *array) +{ + if (array->nent >= array->maxnent) + return NULL; + + return &array->entries[array->nent++]; +} + static struct kvm_cpuid_entry2 *do_host_cpuid(struct kvm_cpuid_array *array, u32 function, u32 index) { - struct kvm_cpuid_entry2 *entry; + struct kvm_cpuid_entry2 *entry = get_next_cpuid(array); - if (array->nent >= array->maxnent) + if (!entry) return NULL; - entry = &array->entries[array->nent++]; - memset(entry, 0, sizeof(*entry)); entry->function = function; entry->index = index; @@ -956,22 +962,13 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->edx = edx.full; break; } - /* - * Per Intel's SDM, the 0x1f is a superset of 0xb, - * thus they can be handled by common code. - */ case 0x1f: case 0xb: /* - * Populate entries until the level type (ECX[15:8]) of the - * previous entry is zero. Note, CPUID EAX.{0x1f,0xb}.0 is - * the starting entry, filled by the primary do_host_cpuid(). + * No topology; a valid topology is indicated by the presence + * of subleaf 1. */ - for (i = 1; entry->ecx & 0xff00; ++i) { - entry = do_host_cpuid(array, function, i); - if (!entry) - goto out; - } + entry->eax = entry->ebx = entry->ecx = 0; break; case 0xd: { u64 permitted_xcr0 = kvm_caps.supported_xcr0 & xstate_get_guest_group_perm(); @@ -1202,6 +1199,9 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) entry->ebx = entry->ecx = entry->edx = 0; break; case 0x8000001e: + /* Do not return host topology information. */ + entry->eax = entry->ebx = entry->ecx = 0; + entry->edx = 0; /* reserved */ break; case 0x8000001F: if (!kvm_cpu_cap_has(X86_FEATURE_SEV)) { diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index bc9cd7086fa9..add65dd59756 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -138,15 +138,13 @@ void recalc_intercepts(struct vcpu_svm *svm) c->intercepts[i] = h->intercepts[i]; if (g->int_ctl & V_INTR_MASKING_MASK) { - /* We only want the cr8 intercept bits of L1 */ - vmcb_clr_intercept(c, INTERCEPT_CR8_READ); - vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE); - /* - * Once running L2 with HF_VINTR_MASK, EFLAGS.IF does not - * affect any interrupt we may want to inject; therefore, - * interrupt window vmexits are irrelevant to L0. + * Once running L2 with HF_VINTR_MASK, EFLAGS.IF and CR8 + * does not affect any interrupt we may want to inject; + * therefore, writes to CR8 are irrelevant to L0, as are + * interrupt window vmexits. */ + vmcb_clr_intercept(c, INTERCEPT_CR8_WRITE); vmcb_clr_intercept(c, INTERCEPT_VINTR); } diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index 2e29bdc2949c..8fd41f5deae3 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -271,7 +271,15 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * Attempt to obtain the GPC lock on *both* (if there are two) * gfn_to_pfn caches that cover the region. */ - read_lock_irqsave(&gpc1->lock, flags); + if (atomic) { + local_irq_save(flags); + if (!read_trylock(&gpc1->lock)) { + local_irq_restore(flags); + return; + } + } else { + read_lock_irqsave(&gpc1->lock, flags); + } while (!kvm_gpc_check(gpc1, user_len1)) { read_unlock_irqrestore(&gpc1->lock, flags); @@ -304,9 +312,18 @@ static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic) * The guest's runstate_info is split across two pages and we * need to hold and validate both GPCs simultaneously. We can * declare a lock ordering GPC1 > GPC2 because nothing else - * takes them more than one at a time. + * takes them more than one at a time. Set a subclass on the + * gpc1 lock to make lockdep shut up about it. */ - read_lock(&gpc2->lock); + lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_); + if (atomic) { + if (!read_trylock(&gpc2->lock)) { + read_unlock_irqrestore(&gpc1->lock, flags); + return; + } + } else { + read_lock(&gpc2->lock); + } if (!kvm_gpc_check(gpc2, user_len2)) { read_unlock(&gpc2->lock); @@ -590,26 +607,26 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) { r = -EINVAL; } else { - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.long_mode = !!data->u.long_mode; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; } break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); break; case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR: if (data->u.vector && data->u.vector < 0x10) r = -EINVAL; else { - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.upcall_vector = data->u.vector; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; } break; @@ -619,9 +636,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_XEN_VERSION: - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.xen_version = data->u.xen_version; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; break; @@ -630,9 +647,9 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) r = -EOPNOTSUPP; break; } - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag; - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); r = 0; break; @@ -647,7 +664,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) { int r = -ENOENT; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); switch (data->type) { case KVM_XEN_ATTR_TYPE_LONG_MODE: @@ -686,7 +703,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; } - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); return r; } @@ -694,7 +711,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) { int idx, r = -ENOENT; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.xen.xen_lock); idx = srcu_read_lock(&vcpu->kvm->srcu); switch (data->type) { @@ -922,7 +939,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) } srcu_read_unlock(&vcpu->kvm->srcu, idx); - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.xen.xen_lock); return r; } @@ -930,7 +947,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) { int r = -ENOENT; - mutex_lock(&vcpu->kvm->lock); + mutex_lock(&vcpu->kvm->arch.xen.xen_lock); switch (data->type) { case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO: @@ -1013,7 +1030,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data) break; } - mutex_unlock(&vcpu->kvm->lock); + mutex_unlock(&vcpu->kvm->arch.xen.xen_lock); return r; } @@ -1106,7 +1123,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) xhc->blob_size_32 || xhc->blob_size_64)) return -EINVAL; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); if (xhc->msr && !kvm->arch.xen_hvm_config.msr) static_branch_inc(&kvm_xen_enabled.key); @@ -1115,7 +1132,7 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc)); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); return 0; } @@ -1658,15 +1675,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) mm_borrowed = true; } - /* - * For the irqfd workqueue, using the main kvm->lock mutex is - * fine since this function is invoked from kvm_set_irq() with - * no other lock held, no srcu. In future if it will be called - * directly from a vCPU thread (e.g. on hypercall for an IPI) - * then it may need to switch to using a leaf-node mutex for - * serializing the shared_info mapping. - */ - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); /* * It is theoretically possible for the page to be unmapped @@ -1695,7 +1704,7 @@ static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) srcu_read_unlock(&kvm->srcu, idx); } while(!rc); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); if (mm_borrowed) kthread_unuse_mm(kvm->mm); @@ -1811,7 +1820,7 @@ static int kvm_xen_eventfd_update(struct kvm *kvm, int ret; /* Protect writes to evtchnfd as well as the idr lookup. */ - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port); ret = -ENOENT; @@ -1842,7 +1851,7 @@ static int kvm_xen_eventfd_update(struct kvm *kvm, } ret = 0; out_unlock: - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); return ret; } @@ -1905,10 +1914,10 @@ static int kvm_xen_eventfd_assign(struct kvm *kvm, evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; } - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1, GFP_KERNEL); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); if (ret >= 0) return 0; @@ -1926,9 +1935,9 @@ static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port) { struct evtchnfd *evtchnfd; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port); - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); if (!evtchnfd) return -ENOENT; @@ -1946,7 +1955,7 @@ static int kvm_xen_eventfd_reset(struct kvm *kvm) int i; int n = 0; - mutex_lock(&kvm->lock); + mutex_lock(&kvm->arch.xen.xen_lock); /* * Because synchronize_srcu() cannot be called inside the @@ -1958,7 +1967,7 @@ static int kvm_xen_eventfd_reset(struct kvm *kvm) all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL); if (!all_evtchnfds) { - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); return -ENOMEM; } @@ -1967,7 +1976,7 @@ static int kvm_xen_eventfd_reset(struct kvm *kvm) all_evtchnfds[n++] = evtchnfd; idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port); } - mutex_unlock(&kvm->lock); + mutex_unlock(&kvm->arch.xen.xen_lock); synchronize_srcu(&kvm->srcu); @@ -2069,6 +2078,7 @@ void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) void kvm_xen_init_vm(struct kvm *kvm) { + mutex_init(&kvm->arch.xen.xen_lock); idr_init(&kvm->arch.xen.evtchn_ports); kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm, NULL, KVM_HOST_USES_PFN); } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d3987359d441..cb258f58fdc8 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -26,6 +26,7 @@ #include <asm/pti.h> #include <asm/text-patching.h> #include <asm/memtype.h> +#include <asm/paravirt.h> /* * We need to define the tracepoints somewhere, and tlb.c @@ -804,6 +805,9 @@ void __init poking_init(void) poking_mm = mm_alloc(); BUG_ON(!poking_mm); + /* Xen PV guests need the PGD to be pinned. */ + paravirt_arch_dup_mmap(NULL, poking_mm); + /* * Randomize the poking address, but make sure that the following page * will be mapped at the same PMD. We need 2 pages, so find space for 3, diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c index 46de9cf5c91d..fb4b1b5e0dea 100644 --- a/arch/x86/mm/pat/memtype.c +++ b/arch/x86/mm/pat/memtype.c @@ -387,7 +387,8 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, u8 mtrr_type, uniform; mtrr_type = mtrr_type_lookup(start, end, &uniform); - if (mtrr_type != MTRR_TYPE_WRBACK) + if (mtrr_type != MTRR_TYPE_WRBACK && + mtrr_type != MTRR_TYPE_INVALID) return _PAGE_CACHE_MODE_UC_MINUS; return _PAGE_CACHE_MODE_WB; diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 758cbfe55daa..4b3efaa82ab7 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -12,6 +12,7 @@ */ #include <linux/acpi.h> +#include <linux/efi.h> #include <linux/pci.h> #include <linux/init.h> #include <linux/bitmap.h> @@ -442,17 +443,42 @@ static bool is_acpi_reserved(u64 start, u64 end, enum e820_type not_used) return mcfg_res.flags; } +static bool is_efi_mmio(u64 start, u64 end, enum e820_type not_used) +{ +#ifdef CONFIG_EFI + efi_memory_desc_t *md; + u64 size, mmio_start, mmio_end; + + for_each_efi_memory_desc(md) { + if (md->type == EFI_MEMORY_MAPPED_IO) { + size = md->num_pages << EFI_PAGE_SHIFT; + mmio_start = md->phys_addr; + mmio_end = mmio_start + size; + + /* + * N.B. Caller supplies (start, start + size), + * so to match, mmio_end is the first address + * *past* the EFI_MEMORY_MAPPED_IO area. + */ + if (mmio_start <= start && end <= mmio_end) + return true; + } + } +#endif + + return false; +} + typedef bool (*check_reserved_t)(u64 start, u64 end, enum e820_type type); static bool __ref is_mmconf_reserved(check_reserved_t is_reserved, struct pci_mmcfg_region *cfg, - struct device *dev, int with_e820) + struct device *dev, const char *method) { u64 addr = cfg->res.start; u64 size = resource_size(&cfg->res); u64 old_size = size; int num_buses; - char *method = with_e820 ? "E820" : "ACPI motherboard resources"; while (!is_reserved(addr, addr + size, E820_TYPE_RESERVED)) { size >>= 1; @@ -464,10 +490,10 @@ static bool __ref is_mmconf_reserved(check_reserved_t is_reserved, return false; if (dev) - dev_info(dev, "MMCONFIG at %pR reserved in %s\n", + dev_info(dev, "MMCONFIG at %pR reserved as %s\n", &cfg->res, method); else - pr_info(PREFIX "MMCONFIG at %pR reserved in %s\n", + pr_info(PREFIX "MMCONFIG at %pR reserved as %s\n", &cfg->res, method); if (old_size != size) { @@ -500,7 +526,8 @@ static bool __ref pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int early) { if (!early && !acpi_disabled) { - if (is_mmconf_reserved(is_acpi_reserved, cfg, dev, 0)) + if (is_mmconf_reserved(is_acpi_reserved, cfg, dev, + "ACPI motherboard resource")) return true; if (dev) @@ -513,6 +540,10 @@ pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int e "MMCONFIG at %pR not reserved in " "ACPI motherboard resources\n", &cfg->res); + + if (is_mmconf_reserved(is_efi_mmio, cfg, dev, + "EfiMemoryMappedIO")) + return true; } /* @@ -527,7 +558,8 @@ pci_mmcfg_check_reserved(struct device *dev, struct pci_mmcfg_region *cfg, int e /* Don't try to do this check unless configuration type 1 is available. how about type 2 ?*/ if (raw_pci_ops) - return is_mmconf_reserved(e820__mapped_all, cfg, dev, 1); + return is_mmconf_reserved(e820__mapped_all, cfg, dev, + "E820 entry"); return false; } diff --git a/arch/x86/um/elfcore.c b/arch/x86/um/elfcore.c index 48a3eb09d951..650cdbbdaf45 100644 --- a/arch/x86/um/elfcore.c +++ b/arch/x86/um/elfcore.c @@ -7,7 +7,7 @@ #include <asm/elf.h> -Elf32_Half elf_core_extra_phdrs(void) +Elf32_Half elf_core_extra_phdrs(struct coredump_params *cprm) { return vsyscall_ehdr ? (((struct elfhdr *)vsyscall_ehdr)->e_phnum) : 0; } @@ -60,7 +60,7 @@ int elf_core_write_extra_data(struct coredump_params *cprm) return 1; } -size_t elf_core_extra_data_size(void) +size_t elf_core_extra_data_size(struct coredump_params *cprm) { if ( vsyscall_ehdr ) { const struct elfhdr *const ehdrp = |