diff options
Diffstat (limited to 'arch/powerpc')
41 files changed, 577 insertions, 192 deletions
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 36f858c37ca7..81b0031f909f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -199,7 +199,7 @@ config PPC select HAVE_OPTPROBES if PPC64 select HAVE_PERF_EVENTS select HAVE_PERF_EVENTS_NMI if PPC64 - select HAVE_HARDLOCKUP_DETECTOR_PERF if HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH + select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 8d4ed73d5490..e2b3e7a00c9e 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -59,6 +59,19 @@ machine-$(CONFIG_PPC64) += 64 machine-$(CONFIG_CPU_LITTLE_ENDIAN) += le UTS_MACHINE := $(subst $(space),,$(machine-y)) +# XXX This needs to be before we override LD below +ifdef CONFIG_PPC32 +KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o +else +ifeq ($(call ld-ifversion, -ge, 225000000, y),y) +# Have the linker provide sfpr if possible. +# There is a corresponding test in arch/powerpc/lib/Makefile +KBUILD_LDFLAGS_MODULE += --save-restore-funcs +else +KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o +endif +endif + ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y) override LD += -EL LDEMULATION := lppc @@ -190,18 +203,6 @@ else CHECKFLAGS += -D__LITTLE_ENDIAN__ endif -ifdef CONFIG_PPC32 -KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o -else -ifeq ($(call ld-ifversion, -ge, 225000000, y),y) -# Have the linker provide sfpr if possible. -# There is a corresponding test in arch/powerpc/lib/Makefile -KBUILD_LDFLAGS_MODULE += --save-restore-funcs -else -KBUILD_LDFLAGS_MODULE += arch/powerpc/lib/crtsavres.o -endif -endif - ifeq ($(CONFIG_476FPE_ERR46),y) KBUILD_LDFLAGS_MODULE += --ppc476-workaround \ -T $(srctree)/arch/powerpc/platforms/44x/ppc476_modules.lds diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index a7814a7b1523..6f952fe1f084 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -25,12 +25,20 @@ compress-$(CONFIG_KERNEL_XZ) := CONFIG_KERNEL_XZ BOOTCFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ -fno-strict-aliasing -Os -msoft-float -pipe \ -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \ - -isystem $(shell $(CROSS32CC) -print-file-name=include) \ -D$(compress-y) +BOOTCC := $(CC) ifdef CONFIG_PPC64_BOOT_WRAPPER BOOTCFLAGS += -m64 +else +BOOTCFLAGS += -m32 +ifdef CROSS32_COMPILE + BOOTCC := $(CROSS32_COMPILE)gcc +endif endif + +BOOTCFLAGS += -isystem $(shell $(BOOTCC) -print-file-name=include) + ifdef CONFIG_CPU_BIG_ENDIAN BOOTCFLAGS += -mbig-endian else @@ -183,10 +191,10 @@ clean-files := $(zlib-) $(zlibheader-) $(zliblinuxheader-) \ empty.c zImage.coff.lds zImage.ps3.lds zImage.lds quiet_cmd_bootcc = BOOTCC $@ - cmd_bootcc = $(CROSS32CC) -Wp,-MD,$(depfile) $(BOOTCFLAGS) -c -o $@ $< + cmd_bootcc = $(BOOTCC) -Wp,-MD,$(depfile) $(BOOTCFLAGS) -c -o $@ $< quiet_cmd_bootas = BOOTAS $@ - cmd_bootas = $(CROSS32CC) -Wp,-MD,$(depfile) $(BOOTAFLAGS) -c -o $@ $< + cmd_bootas = $(BOOTCC) -Wp,-MD,$(depfile) $(BOOTAFLAGS) -c -o $@ $< quiet_cmd_bootar = BOOTAR $@ cmd_bootar = $(CROSS32AR) -cr$(KBUILD_ARFLAGS) $@.$$$$ $(filter-out FORCE,$^); mv $@.$$$$ $@ diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index 0695ce047d56..34fc9bbfca9e 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -293,7 +293,8 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_DEBUG_STACKOVERFLOW=y -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y +CONFIG_HARDLOCKUP_DETECTOR=y CONFIG_LATENCYTOP=y CONFIG_SCHED_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 5175028c56ce..c5246d29f385 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -324,7 +324,8 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_DEBUG_STACKOVERFLOW=y -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y +CONFIG_HARDLOCKUP_DETECTOR=y CONFIG_DEBUG_MUTEXES=y CONFIG_LATENCYTOP=y CONFIG_SCHED_TRACER=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 1a61aa20dfba..fd5d98a0b95c 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -291,7 +291,8 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_STACK_USAGE=y CONFIG_DEBUG_STACKOVERFLOW=y -CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y +CONFIG_HARDLOCKUP_DETECTOR=y CONFIG_LATENCYTOP=y CONFIG_SCHED_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 0ce513f2926f..36fc7bfe9e11 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -91,6 +91,7 @@ static inline int hash__pgd_bad(pgd_t pgd) } #ifdef CONFIG_STRICT_KERNEL_RWX extern void hash__mark_rodata_ro(void); +extern void hash__mark_initmem_nx(void); #endif extern void hpte_need_flush(struct mm_struct *mm, unsigned long addr, diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 77529a3e3811..5b4023c616f7 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -59,13 +59,14 @@ extern struct patb_entry *partition_tb; #define PRTS_MASK 0x1f /* process table size field */ #define PRTB_MASK 0x0ffffffffffff000UL -/* - * Limit process table to PAGE_SIZE table. This - * also limit the max pid we can support. - * MAX_USER_CONTEXT * 16 bytes of space. - */ -#define PRTB_SIZE_SHIFT (CONTEXT_BITS + 4) -#define PRTB_ENTRIES (1ul << CONTEXT_BITS) +/* Number of supported PID bits */ +extern unsigned int mmu_pid_bits; + +/* Base PID to allocate from */ +extern unsigned int mmu_base_pid; + +#define PRTB_SIZE_SHIFT (mmu_pid_bits + 4) +#define PRTB_ENTRIES (1ul << mmu_pid_bits) /* * Power9 currently only support 64K partition table size. diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index c0737c86a362..818a58fc3f4f 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -608,9 +608,17 @@ static inline pte_t pte_mkdevmap(pte_t pte) return __pte(pte_val(pte) | _PAGE_SPECIAL|_PAGE_DEVMAP); } +/* + * This is potentially called with a pmd as the argument, in which case it's not + * safe to check _PAGE_DEVMAP unless we also confirm that _PAGE_PTE is set. + * That's because the bit we use for _PAGE_DEVMAP is not reserved for software + * use in page directory entries (ie. non-ptes). + */ static inline int pte_devmap(pte_t pte) { - return !!(pte_raw(pte) & cpu_to_be64(_PAGE_DEVMAP)); + u64 mask = cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE); + + return (pte_raw(pte) & mask) == mask; } static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) @@ -1192,5 +1200,6 @@ static inline const int pud_pfn(pud_t pud) BUILD_BUG(); return 0; } + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 487709ff6875..544440b5aff3 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -118,6 +118,7 @@ #ifdef CONFIG_STRICT_KERNEL_RWX extern void radix__mark_rodata_ro(void); +extern void radix__mark_initmem_nx(void); #endif static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr, diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index 0151af6c2a50..87fcc1948817 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -18,7 +18,7 @@ #include <asm/asm-offsets.h> #ifdef CONFIG_DEBUG_BUGVERBOSE .macro EMIT_BUG_ENTRY addr,file,line,flags - .section __bug_table,"a" + .section __bug_table,"aw" 5001: PPC_LONG \addr, 5002f .short \line, \flags .org 5001b+BUG_ENTRY_SIZE @@ -29,7 +29,7 @@ .endm #else .macro EMIT_BUG_ENTRY addr,file,line,flags - .section __bug_table,"a" + .section __bug_table,"aw" 5001: PPC_LONG \addr .short \flags .org 5001b+BUG_ENTRY_SIZE @@ -42,14 +42,14 @@ sizeof(struct bug_entry), respectively */ #ifdef CONFIG_DEBUG_BUGVERBOSE #define _EMIT_BUG_ENTRY \ - ".section __bug_table,\"a\"\n" \ + ".section __bug_table,\"aw\"\n" \ "2:\t" PPC_LONG "1b, %0\n" \ "\t.short %1, %2\n" \ ".org 2b+%3\n" \ ".previous\n" #else #define _EMIT_BUG_ENTRY \ - ".section __bug_table,\"a\"\n" \ + ".section __bug_table,\"aw\"\n" \ "2:\t" PPC_LONG "1b\n" \ "\t.short %2\n" \ ".org 2b+%3\n" \ diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index da7e9432fa8f..35bec1c5bd5a 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -45,7 +45,7 @@ extern void set_context(unsigned long id, pgd_t *pgd); #ifdef CONFIG_PPC_BOOK3S_64 extern void radix__switch_mmu_context(struct mm_struct *prev, - struct mm_struct *next); + struct mm_struct *next); static inline void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) @@ -67,6 +67,12 @@ extern void __destroy_context(unsigned long context_id); extern void mmu_context_init(void); #endif +#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && defined(CONFIG_PPC_RADIX_MMU) +extern void radix_kvm_prefetch_workaround(struct mm_struct *mm); +#else +static inline void radix_kvm_prefetch_workaround(struct mm_struct *mm) { } +#endif + extern void switch_cop(struct mm_struct *next); extern int use_cop(unsigned long acop, struct mm_struct *mm); extern void drop_cop(unsigned long acop, struct mm_struct *mm); @@ -79,10 +85,32 @@ static inline void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { + bool new_on_cpu = false; + /* Mark this context has been used on the new CPU */ - if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) + if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) { cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); + /* + * This full barrier orders the store to the cpumask above vs + * a subsequent operation which allows this CPU to begin loading + * translations for next. + * + * When using the radix MMU that operation is the load of the + * MMU context id, which is then moved to SPRN_PID. + * + * For the hash MMU it is either the first load from slb_cache + * in switch_slb(), and/or the store of paca->mm_ctx_id in + * copy_mm_to_paca(). + * + * On the read side the barrier is in pte_xchg(), which orders + * the store to the PTE vs the load of mm_cpumask. + */ + smp_mb(); + + new_on_cpu = true; + } + /* 32-bit keeps track of the current PGDIR in the thread struct */ #ifdef CONFIG_PPC32 tsk->thread.pgdir = next->pgd; @@ -109,6 +137,10 @@ static inline void switch_mm_irqs_off(struct mm_struct *prev, if (cpu_has_feature(CPU_FTR_ALTIVEC)) asm volatile ("dssall"); #endif /* CONFIG_ALTIVEC */ + + if (new_on_cpu) + radix_kvm_prefetch_workaround(next); + /* * The actual HW switching method differs between the various * sub architectures. Out of line for now diff --git a/arch/powerpc/include/asm/pgtable-be-types.h b/arch/powerpc/include/asm/pgtable-be-types.h index 9c0f5db5cf46..67e7e3d990f4 100644 --- a/arch/powerpc/include/asm/pgtable-be-types.h +++ b/arch/powerpc/include/asm/pgtable-be-types.h @@ -87,6 +87,7 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) unsigned long *p = (unsigned long *)ptep; __be64 prev; + /* See comment in switch_mm_irqs_off() */ prev = (__force __be64)__cmpxchg_u64(p, (__force unsigned long)pte_raw(old), (__force unsigned long)pte_raw(new)); diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h index 8bd3b13fe2fb..369a164b545c 100644 --- a/arch/powerpc/include/asm/pgtable-types.h +++ b/arch/powerpc/include/asm/pgtable-types.h @@ -62,6 +62,7 @@ static inline bool pte_xchg(pte_t *ptep, pte_t old, pte_t new) { unsigned long *p = (unsigned long *)ptep; + /* See comment in switch_mm_irqs_off() */ return pte_val(old) == __cmpxchg_u64(p, pte_val(old), pte_val(new)); } #endif diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index dd01212935ac..afae9a336136 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -80,6 +80,13 @@ unsigned long vmalloc_to_phys(void *vmalloc_addr); void pgtable_cache_add(unsigned shift, void (*ctor)(void *)); void pgtable_cache_init(void); + +#ifdef CONFIG_STRICT_KERNEL_RWX +void mark_initmem_nx(void); +#else +static inline void mark_initmem_nx(void) { } +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ diff --git a/arch/powerpc/include/uapi/asm/ioctls.h b/arch/powerpc/include/uapi/asm/ioctls.h index bfd609a3e928..e3b10469f787 100644 --- a/arch/powerpc/include/uapi/asm/ioctls.h +++ b/arch/powerpc/include/uapi/asm/ioctls.h @@ -100,7 +100,7 @@ #define TIOCGPKT _IOR('T', 0x38, int) /* Get packet mode state */ #define TIOCGPTLCK _IOR('T', 0x39, int) /* Get Pty lock state */ #define TIOCGEXCL _IOR('T', 0x40, int) /* Get exclusive mode state */ -#define TIOCGPTPEER _IOR('T', 0x41, int) /* Safely open the slave */ +#define TIOCGPTPEER _IO('T', 0x41) /* Safely open the slave */ #define TIOCSERCONFIG 0x5453 #define TIOCSERGWILD 0x5454 diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 49d8422767b4..e925c1c99c71 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -223,17 +223,27 @@ system_call_exit: andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) bne- .Lsyscall_exit_work - /* If MSR_FP and MSR_VEC are set in user msr, then no need to restore */ - li r7,MSR_FP + andi. r0,r8,MSR_FP + beq 2f #ifdef CONFIG_ALTIVEC - oris r7,r7,MSR_VEC@h + andis. r0,r8,MSR_VEC@h + bne 3f #endif - and r0,r8,r7 - cmpd r0,r7 - bne .Lsyscall_restore_math -.Lsyscall_restore_math_cont: +2: addi r3,r1,STACK_FRAME_OVERHEAD +#ifdef CONFIG_PPC_BOOK3S + li r10,MSR_RI + mtmsrd r10,1 /* Restore RI */ +#endif + bl restore_math +#ifdef CONFIG_PPC_BOOK3S + li r11,0 + mtmsrd r11,1 +#endif + ld r8,_MSR(r1) + ld r3,RESULT(r1) + li r11,-MAX_ERRNO - cmpld r3,r11 +3: cmpld r3,r11 ld r5,_CCR(r1) bge- .Lsyscall_error .Lsyscall_error_cont: @@ -267,40 +277,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) std r5,_CCR(r1) b .Lsyscall_error_cont -.Lsyscall_restore_math: - /* - * Some initial tests from restore_math to avoid the heavyweight - * C code entry and MSR manipulations. - */ - LOAD_REG_IMMEDIATE(r0, MSR_TS_MASK) - and. r0,r0,r8 - bne 1f - - ld r7,PACACURRENT(r13) - lbz r0,THREAD+THREAD_LOAD_FP(r7) -#ifdef CONFIG_ALTIVEC - lbz r6,THREAD+THREAD_LOAD_VEC(r7) - add r0,r0,r6 -#endif - cmpdi r0,0 - beq .Lsyscall_restore_math_cont - -1: addi r3,r1,STACK_FRAME_OVERHEAD -#ifdef CONFIG_PPC_BOOK3S - li r10,MSR_RI - mtmsrd r10,1 /* Restore RI */ -#endif - bl restore_math -#ifdef CONFIG_PPC_BOOK3S - li r11,0 - mtmsrd r11,1 -#endif - /* Restore volatiles, reload MSR from updated one */ - ld r8,_MSR(r1) - ld r3,RESULT(r1) - li r11,-MAX_ERRNO - b .Lsyscall_restore_math_cont - /* Traced system call support */ .Lsyscall_dotrace: bl save_nvgprs diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index e6d8354d79ef..f14f3c04ec7e 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -824,7 +824,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) * r3 volatile parameter and return value for status * r4-r10 volatile input and output value * r11 volatile hypercall number and output value - * r12 volatile + * r12 volatile input and output value * r13-r31 nonvolatile * LR nonvolatile * CTR volatile @@ -834,25 +834,26 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) * Other registers nonvolatile * * The intersection of volatile registers that don't contain possible - * inputs is: r12, cr0, xer, ctr. We may use these as scratch regs - * upon entry without saving. + * inputs is: cr0, xer, ctr. We may use these as scratch regs upon entry + * without saving, though xer is not a good idea to use, as hardware may + * interpret some bits so it may be costly to change them. */ #ifdef CONFIG_KVM_BOOK3S_64_HANDLER /* * There is a little bit of juggling to get syscall and hcall - * working well. Save r10 in ctr to be restored in case it is a - * hcall. + * working well. Save r13 in ctr to avoid using SPRG scratch + * register. * * Userspace syscalls have already saved the PPR, hcalls must save * it before setting HMT_MEDIUM. */ #define SYSCALL_KVMTEST \ - mr r12,r13; \ + mtctr r13; \ GET_PACA(r13); \ - mtctr r10; \ + std r10,PACA_EXGEN+EX_R10(r13); \ KVMTEST_PR(0xc00); /* uses r10, branch to do_kvm_0xc00_system_call */ \ HMT_MEDIUM; \ - mr r9,r12; \ + mfctr r9; #else #define SYSCALL_KVMTEST \ @@ -935,8 +936,8 @@ EXC_VIRT_END(system_call, 0x4c00, 0x100) * This is a hcall, so register convention is as above, with these * differences: * r13 = PACA - * r12 = orig r13 - * ctr = orig r10 + * ctr = orig r13 + * orig r10 saved in PACA */ TRAMP_KVM_BEGIN(do_kvm_0xc00) /* @@ -944,14 +945,13 @@ TRAMP_KVM_BEGIN(do_kvm_0xc00) * HMT_MEDIUM. That allows the KVM code to save that value into the * guest state (it is the guest's PPR value). */ - OPT_GET_SPR(r0, SPRN_PPR, CPU_FTR_HAS_PPR) + OPT_GET_SPR(r10, SPRN_PPR, CPU_FTR_HAS_PPR) HMT_MEDIUM - OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r0, CPU_FTR_HAS_PPR) + OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r10, CPU_FTR_HAS_PPR) mfctr r10 - SET_SCRATCH0(r12) + SET_SCRATCH0(r10) std r9,PACA_EXGEN+EX_R9(r13) mfcr r9 - std r10,PACA_EXGEN+EX_R10(r13) KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00) #endif @@ -1325,10 +1325,18 @@ EXC_VIRT_NONE(0x5800, 0x100) std r10,PACA_EXGEN+EX_R13(r13); \ EXCEPTION_PROLOG_PSERIES_1(soft_nmi_common, _H) +/* + * Branch to soft_nmi_interrupt using the emergency stack. The emergency + * stack is one that is usable by maskable interrupts so long as MSR_EE + * remains off. It is used for recovery when something has corrupted the + * normal kernel stack, for example. The "soft NMI" must not use the process + * stack because we want irq disabled sections to avoid touching the stack + * at all (other than PMU interrupts), so use the emergency stack for this, + * and run it entirely with interrupts hard disabled. + */ EXC_COMMON_BEGIN(soft_nmi_common) mr r10,r1 ld r1,PACAEMERGSP(r13) - ld r1,PACA_NMI_EMERG_SP(r13) subi r1,r1,INT_FRAME_SIZE EXCEPTION_COMMON_NORET_STACK(PACA_EXGEN, 0x900, system_reset, soft_nmi_interrupt, diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 5adb390e773b..e6252c5a57a4 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -30,6 +30,7 @@ * Use unused space in the interrupt stack to save and restore * registers for winkle support. */ +#define _MMCR0 GPR0 #define _SDR1 GPR3 #define _PTCR GPR3 #define _RPR GPR4 @@ -272,6 +273,14 @@ power_enter_stop: b pnv_wakeup_noloss .Lhandle_esl_ec_set: + /* + * POWER9 DD2 can incorrectly set PMAO when waking up after a + * state-loss idle. Saving and restoring MMCR0 over idle is a + * workaround. + */ + mfspr r4,SPRN_MMCR0 + std r4,_MMCR0(r1) + /* * Check if the requested state is a deep idle state. */ @@ -450,10 +459,20 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) pnv_restore_hyp_resource_arch300: /* * Workaround for POWER9, if we lost resources, the ERAT - * might have been mixed up and needs flushing. + * might have been mixed up and needs flushing. We also need + * to reload MMCR0 (see comment above). We also need to set + * then clear bit 60 in MMCRA to ensure the PMU starts running. */ blt cr3,1f PPC_INVALIDATE_ERAT + ld r1,PACAR1(r13) + mfspr r4,SPRN_MMCRA + ori r4,r4,(1 << (63-60)) + mtspr SPRN_MMCRA,r4 + xori r4,r4,(1 << (63-60)) + mtspr SPRN_MMCRA,r4 + ld r4,_MMCR0(r1) + mtspr SPRN_MMCR0,r4 1: /* * POWER ISA 3. Use PSSCR to determine if we diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 0bcec745a672..f291f7826abc 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -145,6 +145,19 @@ notrace unsigned int __check_irq_replay(void) /* Clear bit 0 which we wouldn't clear otherwise */ local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; + if (happened & PACA_IRQ_HARD_DIS) { + /* + * We may have missed a decrementer interrupt if hard disabled. + * Check the decrementer register in case we had a rollover + * while hard disabled. + */ + if (!(happened & PACA_IRQ_DEC)) { + if (decrementer_check_overflow()) { + local_paca->irq_happened |= PACA_IRQ_DEC; + happened |= PACA_IRQ_DEC; + } + } + } /* * Force the delivery of pending soft-disabled interrupts on PS3. @@ -170,7 +183,7 @@ notrace unsigned int __check_irq_replay(void) * in case we also had a rollover while hard disabled */ local_paca->irq_happened &= ~PACA_IRQ_DEC; - if ((happened & PACA_IRQ_DEC) || decrementer_check_overflow()) + if (happened & PACA_IRQ_DEC) return 0x900; /* Finally check if an external interrupt happened */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 9f3e2c932dcc..1f0fd361e09b 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -362,7 +362,8 @@ void enable_kernel_vsx(void) cpumsr = msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX); - if (current->thread.regs && (current->thread.regs->msr & MSR_VSX)) { + if (current->thread.regs && + (current->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP))) { check_if_tm_restore_required(current); /* * If a thread has already been reclaimed then the @@ -386,7 +387,7 @@ void flush_vsx_to_thread(struct task_struct *tsk) { if (tsk->thread.regs) { preempt_disable(); - if (tsk->thread.regs->msr & MSR_VSX) { + if (tsk->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP)) { BUG_ON(tsk != current); giveup_vsx(tsk); } @@ -511,10 +512,6 @@ void restore_math(struct pt_regs *regs) { unsigned long msr; - /* - * Syscall exit makes a similar initial check before branching - * to restore_math. Keep them in synch. - */ if (!msr_tm_active(regs->msr) && !current->thread.load_fp && !loadvec(current->thread)) return; diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 925a4ef90559..660ed39e9c9a 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -127,12 +127,19 @@ static void flush_tmregs_to_thread(struct task_struct *tsk) * If task is not current, it will have been flushed already to * it's thread_struct during __switch_to(). * - * A reclaim flushes ALL the state. + * A reclaim flushes ALL the state or if not in TM save TM SPRs + * in the appropriate thread structures from live. */ - if (tsk == current && MSR_TM_SUSPENDED(mfmsr())) - tm_reclaim_current(TM_CAUSE_SIGNAL); + if (tsk != current) + return; + if (MSR_TM_SUSPENDED(mfmsr())) { + tm_reclaim_current(TM_CAUSE_SIGNAL); + } else { + tm_enable(); + tm_save_sprs(&(tsk->thread)); + } } #else static inline void flush_tmregs_to_thread(struct task_struct *tsk) { } diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 997c88d54acf..8d3320562c70 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -351,7 +351,7 @@ static void nmi_ipi_lock_start(unsigned long *flags) hard_irq_disable(); while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) { raw_local_irq_restore(*flags); - cpu_relax(); + spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0); raw_local_irq_save(*flags); hard_irq_disable(); } @@ -360,7 +360,7 @@ static void nmi_ipi_lock_start(unsigned long *flags) static void nmi_ipi_lock(void) { while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) - cpu_relax(); + spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0); } static void nmi_ipi_unlock(void) @@ -475,7 +475,7 @@ int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us) nmi_ipi_lock_start(&flags); while (nmi_ipi_busy_count) { nmi_ipi_unlock_end(&flags); - cpu_relax(); + spin_until_cond(nmi_ipi_busy_count == 0); nmi_ipi_lock_start(&flags); } @@ -1003,21 +1003,13 @@ static struct sched_domain_topology_level powerpc_topology[] = { { NULL, }, }; -static __init long smp_setup_cpu_workfn(void *data __always_unused) -{ - smp_ops->setup_cpu(boot_cpuid); - return 0; -} - void __init smp_cpus_done(unsigned int max_cpus) { /* - * We want the setup_cpu() here to be called on the boot CPU, but - * init might run on any CPU, so make sure it's invoked on the boot - * CPU. + * We are running pinned to the boot CPU, see rest_init(). */ if (smp_ops && smp_ops->setup_cpu) - work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, NULL); + smp_ops->setup_cpu(boot_cpuid); if (smp_ops && smp_ops->bringup_done) smp_ops->bringup_done(); diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index b67f8b03a32d..34721a257a77 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -71,15 +71,20 @@ static inline void wd_smp_lock(unsigned long *flags) * This may be called from low level interrupt handlers at some * point in future. */ - local_irq_save(*flags); - while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) - cpu_relax(); + raw_local_irq_save(*flags); + hard_irq_disable(); /* Make it soft-NMI safe */ + while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) { + raw_local_irq_restore(*flags); + spin_until_cond(!test_bit(0, &__wd_smp_lock)); + raw_local_irq_save(*flags); + hard_irq_disable(); + } } static inline void wd_smp_unlock(unsigned long *flags) { clear_bit_unlock(0, &__wd_smp_lock); - local_irq_restore(*flags); + raw_local_irq_restore(*flags); } static void wd_lockup_ipi(struct pt_regs *regs) @@ -96,10 +101,10 @@ static void wd_lockup_ipi(struct pt_regs *regs) nmi_panic(regs, "Hard LOCKUP"); } -static void set_cpu_stuck(int cpu, u64 tb) +static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb) { - cpumask_set_cpu(cpu, &wd_smp_cpus_stuck); - cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); + cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask); + cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask); if (cpumask_empty(&wd_smp_cpus_pending)) { wd_smp_last_reset_tb = tb; cpumask_andnot(&wd_smp_cpus_pending, @@ -107,6 +112,10 @@ static void set_cpu_stuck(int cpu, u64 tb) &wd_smp_cpus_stuck); } } +static void set_cpu_stuck(int cpu, u64 tb) +{ + set_cpumask_stuck(cpumask_of(cpu), tb); +} static void watchdog_smp_panic(int cpu, u64 tb) { @@ -135,11 +144,9 @@ static void watchdog_smp_panic(int cpu, u64 tb) } smp_flush_nmi_ipi(1000000); - /* Take the stuck CPU out of the watch group */ - for_each_cpu(c, &wd_smp_cpus_pending) - set_cpu_stuck(c, tb); + /* Take the stuck CPUs out of the watch group */ + set_cpumask_stuck(&wd_smp_cpus_pending, tb); -out: wd_smp_unlock(&flags); printk_safe_flush(); @@ -152,6 +159,11 @@ out: if (hardlockup_panic) nmi_panic(NULL, "Hard LOCKUP"); + + return; + +out: + wd_smp_unlock(&flags); } static void wd_smp_clear_cpu_pending(int cpu, u64 tb) @@ -258,9 +270,11 @@ static void wd_timer_fn(unsigned long data) void arch_touch_nmi_watchdog(void) { + unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000; int cpu = smp_processor_id(); - watchdog_timer_interrupt(cpu); + if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks) + watchdog_timer_interrupt(cpu); } EXPORT_SYMBOL(arch_touch_nmi_watchdog); @@ -283,6 +297,8 @@ static void stop_watchdog_timer_on(unsigned int cpu) static int start_wd_on_cpu(unsigned int cpu) { + unsigned long flags; + if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) { WARN_ON(1); return 0; @@ -297,12 +313,14 @@ static int start_wd_on_cpu(unsigned int cpu) if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) return 0; + wd_smp_lock(&flags); cpumask_set_cpu(cpu, &wd_cpus_enabled); if (cpumask_weight(&wd_cpus_enabled) == 1) { cpumask_set_cpu(cpu, &wd_smp_cpus_pending); wd_smp_last_reset_tb = get_tb(); } - smp_wmb(); + wd_smp_unlock(&flags); + start_watchdog_timer_on(cpu); return 0; @@ -310,12 +328,17 @@ static int start_wd_on_cpu(unsigned int cpu) static int stop_wd_on_cpu(unsigned int cpu) { + unsigned long flags; + if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) return 0; /* Can happen in CPU unplug case */ stop_watchdog_timer_on(cpu); + wd_smp_lock(&flags); cpumask_clear_cpu(cpu, &wd_cpus_enabled); + wd_smp_unlock(&flags); + wd_smp_clear_cpu_pending(cpu, get_tb()); return 0; diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 8cb0190e2a73..b42812e014c0 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -164,8 +164,10 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order) goto out; } - if (kvm->arch.hpt.virt) + if (kvm->arch.hpt.virt) { kvmppc_free_hpt(&kvm->arch.hpt); + kvmppc_rmap_reset(kvm); + } err = kvmppc_allocate_hpt(&info, order); if (err < 0) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index a160c14304eb..53766e2bc029 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -294,32 +294,26 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args) { struct kvmppc_spapr_tce_table *stt = NULL; + struct kvmppc_spapr_tce_table *siter; unsigned long npages, size; int ret = -ENOMEM; int i; + int fd = -1; if (!args->size) return -EINVAL; - /* Check this LIOBN hasn't been previously allocated */ - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == args->liobn) - return -EBUSY; - } - size = _ALIGN_UP(args->size, PAGE_SIZE >> 3); npages = kvmppc_tce_pages(size); ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); - if (ret) { - stt = NULL; - goto fail; - } + if (ret) + return ret; ret = -ENOMEM; stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL); if (!stt) - goto fail; + goto fail_acct; stt->liobn = args->liobn; stt->page_shift = args->page_shift; @@ -334,24 +328,42 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, goto fail; } - kvm_get_kvm(kvm); + ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, + stt, O_RDWR | O_CLOEXEC); + if (ret < 0) + goto fail; mutex_lock(&kvm->lock); - list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); + + /* Check this LIOBN hasn't been previously allocated */ + ret = 0; + list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) { + if (siter->liobn == args->liobn) { + ret = -EBUSY; + break; + } + } + + if (!ret) { + list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables); + kvm_get_kvm(kvm); + } mutex_unlock(&kvm->lock); - return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, - stt, O_RDWR | O_CLOEXEC); + if (!ret) + return fd; -fail: - if (stt) { - for (i = 0; i < npages; i++) - if (stt->pages[i]) - __free_page(stt->pages[i]); + put_unused_fd(fd); - kfree(stt); - } + fail: + for (i = 0; i < npages; i++) + if (stt->pages[i]) + __free_page(stt->pages[i]); + + kfree(stt); + fail_acct: + kvmppc_account_memlimit(kvmppc_stt_pages(npages), false); return ret; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 0b436df746fc..359c79cdf0cc 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3211,6 +3211,8 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) run->fail_entry.hardware_entry_failure_reason = 0; return -EINVAL; } + /* Enable TM so we can read the TM SPRs */ + mtmsr(mfmsr() | MSR_TM); current->thread.tm_tfhar = mfspr(SPRN_TFHAR); current->thread.tm_tfiar = mfspr(SPRN_TFIAR); current->thread.tm_texasr = mfspr(SPRN_TEXASR); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index cb44065e2946..9c9c983b864f 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1291,6 +1291,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) /* Hypervisor doorbell - exit only if host IPI flag set */ cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL bne 3f +BEGIN_FTR_SECTION + PPC_MSGSYNC +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lbz r0, HSTATE_HOST_IPI(r13) cmpwi r0, 0 beq 4f @@ -1443,12 +1446,14 @@ mc_cont: ori r6,r6,1 mtspr SPRN_CTRLT,r6 4: - /* Read the guest SLB and save it away */ + /* Check if we are running hash or radix and store it in cr2 */ ld r5, VCPU_KVM(r9) lbz r0, KVM_RADIX(r5) - cmpwi r0, 0 + cmpwi cr2,r0,0 + + /* Read the guest SLB and save it away */ li r5, 0 - bne 3f /* for radix, save 0 entries */ + bne cr2, 3f /* for radix, save 0 entries */ lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */ mtctr r0 li r6,0 @@ -1712,11 +1717,6 @@ BEGIN_FTR_SECTION_NESTED(96) END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 22: - /* Clear out SLB */ - li r5,0 - slbmte r5,r5 - slbia - ptesync /* Restore host values of some registers */ BEGIN_FTR_SECTION @@ -1737,10 +1737,56 @@ BEGIN_FTR_SECTION mtspr SPRN_PID, r7 mtspr SPRN_IAMR, r8 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + +#ifdef CONFIG_PPC_RADIX_MMU + /* + * Are we running hash or radix ? + */ + beq cr2,3f + + /* Radix: Handle the case where the guest used an illegal PID */ + LOAD_REG_ADDR(r4, mmu_base_pid) + lwz r3, VCPU_GUEST_PID(r9) + lwz r5, 0(r4) + cmpw cr0,r3,r5 + blt 2f + + /* + * Illegal PID, the HW might have prefetched and cached in the TLB + * some translations for the LPID 0 / guest PID combination which + * Linux doesn't know about, so we need to flush that PID out of + * the TLB. First we need to set LPIDR to 0 so tlbiel applies to + * the right context. + */ + li r0,0 + mtspr SPRN_LPID,r0 + isync + + /* Then do a congruence class local flush */ + ld r6,VCPU_KVM(r9) + lwz r0,KVM_TLB_SETS(r6) + mtctr r0 + li r7,0x400 /* IS field = 0b01 */ + ptesync + sldi r0,r3,32 /* RS has PID */ +1: PPC_TLBIEL(7,0,2,1,1) /* RIC=2, PRS=1, R=1 */ + addi r7,r7,0x1000 + bdnz 1b + ptesync + +2: /* Flush the ERAT on radix P9 DD1 guest exit */ BEGIN_FTR_SECTION PPC_INVALIDATE_ERAT END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) + b 4f +#endif /* CONFIG_PPC_RADIX_MMU */ + /* Hash: clear out SLB */ +3: li r5,0 + slbmte r5,r5 + slbia + ptesync +4: /* * POWER7/POWER8 guest -> host partition switch code. * We don't have to lock against tlbies but we do diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c index 4636ca6e7d38..d1ed2c41b5d2 100644 --- a/arch/powerpc/kvm/book3s_xive_template.c +++ b/arch/powerpc/kvm/book3s_xive_template.c @@ -16,7 +16,22 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc) u8 cppr; u16 ack; - /* XXX DD1 bug workaround: Check PIPR vs. CPPR first ! */ + /* + * Ensure any previous store to CPPR is ordered vs. + * the subsequent loads from PIPR or ACK. + */ + eieio(); + + /* + * DD1 bug workaround: If PIPR is less favored than CPPR + * ignore the interrupt or we might incorrectly lose an IPB + * bit. + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + u8 pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR); + if (pipr >= xc->hw_cppr) + return; + } /* Perform the acknowledge OS to register cycle. */ ack = be16_to_cpu(__x_readw(__x_tima + TM_SPC_ACK_OS_REG)); @@ -235,6 +250,11 @@ skip_ipi: /* * If we found an interrupt, adjust what the guest CPPR should * be as if we had just fetched that interrupt from HW. + * + * Note: This can only make xc->cppr smaller as the previous + * loop will only exit with hirq != 0 if prio is lower than + * the current xc->cppr. Thus we don't need to re-check xc->mfrr + * for pending IPIs. */ if (hirq) xc->cppr = prio; @@ -381,6 +401,12 @@ X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr) xc->cppr = cppr; /* + * Order the above update of xc->cppr with the subsequent + * read of xc->mfrr inside push_pending_to_hw() + */ + smp_mb(); + + /* * We are masking less, we need to look for pending things * to deliver and set VP pending bits accordingly to trigger * a new interrupt otherwise we might miss MFRR changes for @@ -420,21 +446,37 @@ X_STATIC int GLUE(X_PFX,h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr) * used to signal MFRR changes is EOId when fetched from * the queue. */ - if (irq == XICS_IPI || irq == 0) + if (irq == XICS_IPI || irq == 0) { + /* + * This barrier orders the setting of xc->cppr vs. + * subsquent test of xc->mfrr done inside + * scan_interrupts and push_pending_to_hw + */ + smp_mb(); goto bail; + } /* Find interrupt source */ sb = kvmppc_xive_find_source(xive, irq, &src); if (!sb) { pr_devel(" source not found !\n"); rc = H_PARAMETER; + /* Same as above */ + smp_mb(); goto bail; } state = &sb->irq_state[src]; kvmppc_xive_select_irq(state, &hw_num, &xd); state->in_eoi = true; - mb(); + + /* + * This barrier orders both setting of in_eoi above vs, + * subsequent test of guest_priority, and the setting + * of xc->cppr vs. subsquent test of xc->mfrr done inside + * scan_interrupts and push_pending_to_hw + */ + smp_mb(); again: if (state->guest_priority == MASKED) { @@ -461,6 +503,14 @@ again: } + /* + * This barrier orders the above guest_priority check + * and spin_lock/unlock with clearing in_eoi below. + * + * It also has to be a full mb() as it must ensure + * the MMIOs done in source_eoi() are completed before + * state->in_eoi is visible. + */ mb(); state->in_eoi = false; bail: @@ -495,6 +545,18 @@ X_STATIC int GLUE(X_PFX,h_ipi)(struct kvm_vcpu *vcpu, unsigned long server, /* Locklessly write over MFRR */ xc->mfrr = mfrr; + /* + * The load of xc->cppr below and the subsequent MMIO store + * to the IPI must happen after the above mfrr update is + * globally visible so that: + * + * - Synchronize with another CPU doing an H_EOI or a H_CPPR + * updating xc->cppr then reading xc->mfrr. + * + * - The target of the IPI sees the xc->mfrr update + */ + mb(); + /* Shoot the IPI if most favored than target cppr */ if (mfrr < xc->cppr) __x_writeq(0, __x_trig_page(&xc->vp_ipi_data)); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 8541f18694a4..46b4e67d2372 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -402,6 +402,7 @@ void __init mem_init(void) void free_initmem(void) { ppc_md.progress = ppc_printk_progress; + mark_initmem_nx(); free_initmem_default(POISON_FREE_INITMEM); } diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index abed1fe6992f..a75f63833284 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -126,9 +126,10 @@ static int hash__init_new_context(struct mm_struct *mm) static int radix__init_new_context(struct mm_struct *mm) { unsigned long rts_field; - int index; + int index, max_id; - index = alloc_context_id(1, PRTB_ENTRIES - 1); + max_id = (1 << mmu_pid_bits) - 1; + index = alloc_context_id(mmu_base_pid, max_id); if (index < 0) return index; diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index 188b4107584d..443a2c66a304 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -425,33 +425,51 @@ int hash__has_transparent_hugepage(void) #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_STRICT_KERNEL_RWX -void hash__mark_rodata_ro(void) +static bool hash__change_memory_range(unsigned long start, unsigned long end, + unsigned long newpp) { - unsigned long start = (unsigned long)_stext; - unsigned long end = (unsigned long)__init_begin; unsigned long idx; unsigned int step, shift; - unsigned long newpp = PP_RXXX; shift = mmu_psize_defs[mmu_linear_psize].shift; step = 1 << shift; - start = ((start + step - 1) >> shift) << shift; - end = (end >> shift) << shift; + start = ALIGN_DOWN(start, step); + end = ALIGN(end, step); // aligns up - pr_devel("marking ro start %lx, end %lx, step %x\n", - start, end, step); + if (start >= end) + return false; - if (start == end) { - pr_warn("could not set rodata ro, relocate the start" - " of the kernel to a 0x%x boundary\n", step); - return; - } + pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n", + start, end, newpp, step); for (idx = start; idx < end; idx += step) /* Not sure if we can do much with the return value */ mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize, mmu_kernel_ssize); + return true; +} + +void hash__mark_rodata_ro(void) +{ + unsigned long start, end; + + start = (unsigned long)_stext; + end = (unsigned long)__init_begin; + + WARN_ON(!hash__change_memory_range(start, end, PP_RXXX)); +} + +void hash__mark_initmem_nx(void) +{ + unsigned long start, end, pp; + + start = (unsigned long)__init_begin; + end = (unsigned long)__init_end; + + pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL)); + + WARN_ON(!hash__change_memory_range(start, end, pp)); } #endif diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 8c13e4282308..671a45d86c18 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -25,6 +25,9 @@ #include <trace/events/thp.h> +unsigned int mmu_pid_bits; +unsigned int mmu_base_pid; + static int native_register_process_table(unsigned long base, unsigned long pg_sz, unsigned long table_size) { @@ -112,10 +115,9 @@ set_the_pte: } #ifdef CONFIG_STRICT_KERNEL_RWX -void radix__mark_rodata_ro(void) +void radix__change_memory_range(unsigned long start, unsigned long end, + unsigned long clear) { - unsigned long start = (unsigned long)_stext; - unsigned long end = (unsigned long)__init_begin; unsigned long idx; pgd_t *pgdp; pud_t *pudp; @@ -125,7 +127,8 @@ void radix__mark_rodata_ro(void) start = ALIGN_DOWN(start, PAGE_SIZE); end = PAGE_ALIGN(end); // aligns up - pr_devel("marking ro start %lx, end %lx\n", start, end); + pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n", + start, end, clear); for (idx = start; idx < end; idx += PAGE_SIZE) { pgdp = pgd_offset_k(idx); @@ -147,11 +150,29 @@ void radix__mark_rodata_ro(void) if (!ptep) continue; update_the_pte: - radix__pte_update(&init_mm, idx, ptep, _PAGE_WRITE, 0, 0); + radix__pte_update(&init_mm, idx, ptep, clear, 0, 0); } radix__flush_tlb_kernel_range(start, end); } + +void radix__mark_rodata_ro(void) +{ + unsigned long start, end; + + start = (unsigned long)_stext; + end = (unsigned long)__init_begin; + + radix__change_memory_range(start, end, _PAGE_WRITE); +} + +void radix__mark_initmem_nx(void) +{ + unsigned long start = (unsigned long)__init_begin; + unsigned long end = (unsigned long)__init_end; + + radix__change_memory_range(start, end, _PAGE_EXEC); +} #endif /* CONFIG_STRICT_KERNEL_RWX */ static inline void __meminit print_mapping(unsigned long start, @@ -243,11 +264,34 @@ static void __init radix_init_pgtable(void) for_each_memblock(memory, reg) WARN_ON(create_physical_mapping(reg->base, reg->base + reg->size)); + + /* Find out how many PID bits are supported */ + if (cpu_has_feature(CPU_FTR_HVMODE)) { + if (!mmu_pid_bits) + mmu_pid_bits = 20; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + /* + * When KVM is possible, we only use the top half of the + * PID space to avoid collisions between host and guest PIDs + * which can cause problems due to prefetch when exiting the + * guest with AIL=3 + */ + mmu_base_pid = 1 << (mmu_pid_bits - 1); +#else + mmu_base_pid = 1; +#endif + } else { + /* The guest uses the bottom half of the PID space */ + if (!mmu_pid_bits) + mmu_pid_bits = 19; + mmu_base_pid = 1; + } + /* * Allocate Partition table and process table for the * host. */ - BUILD_BUG_ON_MSG((PRTB_SIZE_SHIFT > 36), "Process table size too large."); + BUG_ON(PRTB_SIZE_SHIFT > 36); process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT); /* * Fill in the process table. @@ -321,6 +365,12 @@ static int __init radix_dt_scan_page_sizes(unsigned long node, if (type == NULL || strcmp(type, "cpu") != 0) return 0; + /* Find MMU PID size */ + prop = of_get_flat_dt_prop(node, "ibm,mmu-pid-bits", &size); + if (prop && size == 4) + mmu_pid_bits = be32_to_cpup(prop); + + /* Grab page size encodings */ prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size); if (!prop) return 0; diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 5c0b795d656c..0736e94c7615 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -505,4 +505,12 @@ void mark_rodata_ro(void) else hash__mark_rodata_ro(); } + +void mark_initmem_nx(void) +{ + if (radix_enabled()) + radix__mark_initmem_nx(); + else + hash__mark_initmem_nx(); +} #endif diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index e94fbd4c8845..781532d7bc4d 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -36,7 +36,7 @@ void subpage_prot_free(struct mm_struct *mm) } } addr = 0; - for (i = 0; i < 2; ++i) { + for (i = 0; i < (TASK_SIZE_USER64 >> 43); ++i) { p = spt->protptrs[i]; if (!p) continue; diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c index 744e0164ecf5..16ae1bbe13f0 100644 --- a/arch/powerpc/mm/tlb-radix.c +++ b/arch/powerpc/mm/tlb-radix.c @@ -12,12 +12,12 @@ #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/memblock.h> -#include <asm/ppc-opcode.h> +#include <asm/ppc-opcode.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include <asm/trace.h> - +#include <asm/cputhreads.h> #define RIC_FLUSH_TLB 0 #define RIC_FLUSH_PWC 1 @@ -454,3 +454,44 @@ void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm, else radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize); } + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +extern void radix_kvm_prefetch_workaround(struct mm_struct *mm) +{ + unsigned int pid = mm->context.id; + + if (unlikely(pid == MMU_NO_CONTEXT)) + return; + + /* + * If this context hasn't run on that CPU before and KVM is + * around, there's a slim chance that the guest on another + * CPU just brought in obsolete translation into the TLB of + * this CPU due to a bad prefetch using the guest PID on + * the way into the hypervisor. + * + * We work around this here. If KVM is possible, we check if + * any sibling thread is in KVM. If it is, the window may exist + * and thus we flush that PID from the core. + * + * A potential future improvement would be to mark which PIDs + * have never been used on the system and avoid it if the PID + * is new and the process has no other cpumask bit set. + */ + if (cpu_has_feature(CPU_FTR_HVMODE) && radix_enabled()) { + int cpu = smp_processor_id(); + int sib = cpu_first_thread_sibling(cpu); + bool flush = false; + + for (; sib <= cpu_last_thread_sibling(cpu) && !flush; sib++) { + if (sib == cpu) + continue; + if (paca[sib].kvm_hstate.kvm_vcpu) + flush = true; + } + if (flush) + _tlbiel_pid(pid, RIC_FLUSH_ALL); + } +} +EXPORT_SYMBOL_GPL(radix_kvm_prefetch_workaround); +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ diff --git a/arch/powerpc/platforms/83xx/mpc832x_rdb.c b/arch/powerpc/platforms/83xx/mpc832x_rdb.c index d7c9b186954d..763ffca9628d 100644 --- a/arch/powerpc/platforms/83xx/mpc832x_rdb.c +++ b/arch/powerpc/platforms/83xx/mpc832x_rdb.c @@ -89,7 +89,7 @@ static int __init of_fsl_spi_probe(char *type, char *compatible, u32 sysclk, goto err; ret = of_irq_to_resource(np, 0, &res[1]); - if (!ret) + if (ret <= 0) goto err; pdev = platform_device_alloc("mpc83xx_spi", i); diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 2abee070373f..a553aeea7af6 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -56,6 +56,7 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE; */ static u64 pnv_deepest_stop_psscr_val; static u64 pnv_deepest_stop_psscr_mask; +static u64 pnv_deepest_stop_flag; static bool deepest_stop_found; static int pnv_save_sprs_for_deep_states(void) @@ -185,8 +186,40 @@ static void pnv_alloc_idle_core_states(void) update_subcore_sibling_mask(); - if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) - pnv_save_sprs_for_deep_states(); + if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) { + int rc = pnv_save_sprs_for_deep_states(); + + if (likely(!rc)) + return; + + /* + * The stop-api is unable to restore hypervisor + * resources on wakeup from platform idle states which + * lose full context. So disable such states. + */ + supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT; + pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n"); + pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n"); + + if (cpu_has_feature(CPU_FTR_ARCH_300) && + (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) { + /* + * Use the default stop state for CPU-Hotplug + * if available. + */ + if (default_stop_found) { + pnv_deepest_stop_psscr_val = + pnv_default_stop_val; + pnv_deepest_stop_psscr_mask = + pnv_default_stop_mask; + pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n", + pnv_deepest_stop_psscr_val); + } else { /* Fallback to snooze loop for CPU-Hotplug */ + deepest_stop_found = false; + pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n"); + } + } + } } u32 pnv_get_supported_cpuidle_states(void) @@ -375,7 +408,8 @@ unsigned long pnv_cpu_offline(unsigned int cpu) pnv_deepest_stop_psscr_val; srr1 = power9_idle_stop(psscr); - } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { + } else if ((idle_states & OPAL_PM_WINKLE_ENABLED) && + (idle_states & OPAL_PM_LOSE_FULL_CONTEXT)) { srr1 = power7_idle_insn(PNV_THREAD_WINKLE); } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { @@ -553,6 +587,7 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags, max_residency_ns = residency_ns[i]; pnv_deepest_stop_psscr_val = psscr_val[i]; pnv_deepest_stop_psscr_mask = psscr_mask[i]; + pnv_deepest_stop_flag = flags[i]; deepest_stop_found = true; } diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 9b87abb178f0..cad6b57ce494 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -78,7 +78,7 @@ void opal_configure_cores(void) * ie. Host hash supports hash guests * Host radix supports hash/radix guests */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) { reinit_flags |= OPAL_REINIT_CPUS_MMU_HASH; if (early_radix_enabled()) reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX; diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index 437613588df1..b900eb1d5e17 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1852,6 +1852,14 @@ static int pnv_pci_ioda_dma_set_mask(struct pci_dev *pdev, u64 dma_mask) /* 4GB offset bypasses 32-bit space */ set_dma_offset(&pdev->dev, (1ULL << 32)); set_dma_ops(&pdev->dev, &dma_direct_ops); + } else if (dma_mask >> 32 && dma_mask != DMA_BIT_MASK(64)) { + /* + * Fail the request if a DMA mask between 32 and 64 bits + * was requested but couldn't be fulfilled. Ideally we + * would do this for 64-bits but historically we have + * always fallen back to 32-bits. + */ + return -ENOMEM; } else { dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n"); set_dma_ops(&pdev->dev, &dma_iommu_ops); diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c index e5bf1e84047f..011ef2180fe6 100644 --- a/arch/powerpc/platforms/pseries/reconfig.c +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -82,7 +82,6 @@ static int pSeries_reconfig_remove_node(struct device_node *np) of_detach_node(np); of_node_put(parent); - of_node_put(np); /* Must decrement the refcount */ return 0; } |