Diffstat (limited to 'arch/powerpc/kernel')
64 files changed, 2333 insertions, 1937 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index e132902e1f14..91960f83039c 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -25,8 +25,6 @@ CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) -# timers used by tracing -CFLAGS_REMOVE_time.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) endif obj-y := cputable.o ptrace.o syscalls.o \ @@ -40,6 +38,7 @@ obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o nvram_64.o firmware.o obj-$(CONFIG_VDSO32) += vdso32/ +obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o @@ -84,7 +83,7 @@ extra-y := head_$(BITS).o extra-$(CONFIG_40x) := head_40x.o extra-$(CONFIG_44x) := head_44x.o extra-$(CONFIG_FSL_BOOKE) := head_fsl_booke.o -extra-$(CONFIG_8xx) := head_8xx.o +extra-$(CONFIG_PPC_8xx) := head_8xx.o extra-y += vmlinux.lds obj-$(CONFIG_RELOCATABLE) += reloc_$(BITS).o diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index ec7a8b099dd9..26b9994d27ee 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -27,6 +27,7 @@ #include <asm/switch_to.h> #include <asm/disassemble.h> #include <asm/cpu_has_feature.h> +#include <asm/sstep.h> struct aligninfo { unsigned char len; @@ -40,364 +41,9 @@ struct aligninfo { #define LD 0 /* load */ #define ST 1 /* store */ #define SE 2 /* sign-extend value, or FP ld/st as word */ -#define F 4 /* to/from fp regs */ -#define U 8 /* update index register */ -#define M 0x10 /* multiple load/store */ #define SW 0x20 /* byte swap */ -#define S 0x40 /* single-precision fp or... */ -#define SX 0x40 /* ... byte count in XER */ -#define HARD 0x80 /* string, stwcx. */ #define E4 0x40 /* SPE endianness is word */ #define E8 0x80 /* SPE endianness is double word */ -#define SPLT 0x80 /* VSX SPLAT load */ - -/* DSISR bits reported for a DCBZ instruction: */ -#define DCBZ 0x5f /* 8xx/82xx dcbz faults when cache not enabled */ - -/* - * The PowerPC stores certain bits of the instruction that caused the - * alignment exception in the DSISR register. This array maps those - * bits to information about the operand length and what the - * instruction would do. 
- */ -static struct aligninfo aligninfo[128] = { - { 4, LD }, /* 00 0 0000: lwz / lwarx */ - INVALID, /* 00 0 0001 */ - { 4, ST }, /* 00 0 0010: stw */ - INVALID, /* 00 0 0011 */ - { 2, LD }, /* 00 0 0100: lhz */ - { 2, LD+SE }, /* 00 0 0101: lha */ - { 2, ST }, /* 00 0 0110: sth */ - { 4, LD+M }, /* 00 0 0111: lmw */ - { 4, LD+F+S }, /* 00 0 1000: lfs */ - { 8, LD+F }, /* 00 0 1001: lfd */ - { 4, ST+F+S }, /* 00 0 1010: stfs */ - { 8, ST+F }, /* 00 0 1011: stfd */ - { 16, LD }, /* 00 0 1100: lq */ - { 8, LD }, /* 00 0 1101: ld/ldu/lwa */ - INVALID, /* 00 0 1110 */ - { 8, ST }, /* 00 0 1111: std/stdu */ - { 4, LD+U }, /* 00 1 0000: lwzu */ - INVALID, /* 00 1 0001 */ - { 4, ST+U }, /* 00 1 0010: stwu */ - INVALID, /* 00 1 0011 */ - { 2, LD+U }, /* 00 1 0100: lhzu */ - { 2, LD+SE+U }, /* 00 1 0101: lhau */ - { 2, ST+U }, /* 00 1 0110: sthu */ - { 4, ST+M }, /* 00 1 0111: stmw */ - { 4, LD+F+S+U }, /* 00 1 1000: lfsu */ - { 8, LD+F+U }, /* 00 1 1001: lfdu */ - { 4, ST+F+S+U }, /* 00 1 1010: stfsu */ - { 8, ST+F+U }, /* 00 1 1011: stfdu */ - { 16, LD+F }, /* 00 1 1100: lfdp */ - INVALID, /* 00 1 1101 */ - { 16, ST+F }, /* 00 1 1110: stfdp */ - INVALID, /* 00 1 1111 */ - { 8, LD }, /* 01 0 0000: ldx */ - INVALID, /* 01 0 0001 */ - { 8, ST }, /* 01 0 0010: stdx */ - INVALID, /* 01 0 0011 */ - INVALID, /* 01 0 0100 */ - { 4, LD+SE }, /* 01 0 0101: lwax */ - INVALID, /* 01 0 0110 */ - INVALID, /* 01 0 0111 */ - { 4, LD+M+HARD+SX }, /* 01 0 1000: lswx */ - { 4, LD+M+HARD }, /* 01 0 1001: lswi */ - { 4, ST+M+HARD+SX }, /* 01 0 1010: stswx */ - { 4, ST+M+HARD }, /* 01 0 1011: stswi */ - INVALID, /* 01 0 1100 */ - { 8, LD+U }, /* 01 0 1101: ldu */ - INVALID, /* 01 0 1110 */ - { 8, ST+U }, /* 01 0 1111: stdu */ - { 8, LD+U }, /* 01 1 0000: ldux */ - INVALID, /* 01 1 0001 */ - { 8, ST+U }, /* 01 1 0010: stdux */ - INVALID, /* 01 1 0011 */ - INVALID, /* 01 1 0100 */ - { 4, LD+SE+U }, /* 01 1 0101: lwaux */ - INVALID, /* 01 1 0110 */ - INVALID, /* 01 1 0111 */ - INVALID, /* 01 1 1000 */ - INVALID, /* 01 1 1001 */ - INVALID, /* 01 1 1010 */ - INVALID, /* 01 1 1011 */ - INVALID, /* 01 1 1100 */ - INVALID, /* 01 1 1101 */ - INVALID, /* 01 1 1110 */ - INVALID, /* 01 1 1111 */ - INVALID, /* 10 0 0000 */ - INVALID, /* 10 0 0001 */ - INVALID, /* 10 0 0010: stwcx. 
*/ - INVALID, /* 10 0 0011 */ - INVALID, /* 10 0 0100 */ - INVALID, /* 10 0 0101 */ - INVALID, /* 10 0 0110 */ - INVALID, /* 10 0 0111 */ - { 4, LD+SW }, /* 10 0 1000: lwbrx */ - INVALID, /* 10 0 1001 */ - { 4, ST+SW }, /* 10 0 1010: stwbrx */ - INVALID, /* 10 0 1011 */ - { 2, LD+SW }, /* 10 0 1100: lhbrx */ - { 4, LD+SE }, /* 10 0 1101 lwa */ - { 2, ST+SW }, /* 10 0 1110: sthbrx */ - { 16, ST }, /* 10 0 1111: stq */ - INVALID, /* 10 1 0000 */ - INVALID, /* 10 1 0001 */ - INVALID, /* 10 1 0010 */ - INVALID, /* 10 1 0011 */ - INVALID, /* 10 1 0100 */ - INVALID, /* 10 1 0101 */ - INVALID, /* 10 1 0110 */ - INVALID, /* 10 1 0111 */ - INVALID, /* 10 1 1000 */ - INVALID, /* 10 1 1001 */ - INVALID, /* 10 1 1010 */ - INVALID, /* 10 1 1011 */ - INVALID, /* 10 1 1100 */ - INVALID, /* 10 1 1101 */ - INVALID, /* 10 1 1110 */ - { 0, ST+HARD }, /* 10 1 1111: dcbz */ - { 4, LD }, /* 11 0 0000: lwzx */ - INVALID, /* 11 0 0001 */ - { 4, ST }, /* 11 0 0010: stwx */ - INVALID, /* 11 0 0011 */ - { 2, LD }, /* 11 0 0100: lhzx */ - { 2, LD+SE }, /* 11 0 0101: lhax */ - { 2, ST }, /* 11 0 0110: sthx */ - INVALID, /* 11 0 0111 */ - { 4, LD+F+S }, /* 11 0 1000: lfsx */ - { 8, LD+F }, /* 11 0 1001: lfdx */ - { 4, ST+F+S }, /* 11 0 1010: stfsx */ - { 8, ST+F }, /* 11 0 1011: stfdx */ - { 16, LD+F }, /* 11 0 1100: lfdpx */ - { 4, LD+F+SE }, /* 11 0 1101: lfiwax */ - { 16, ST+F }, /* 11 0 1110: stfdpx */ - { 4, ST+F }, /* 11 0 1111: stfiwx */ - { 4, LD+U }, /* 11 1 0000: lwzux */ - INVALID, /* 11 1 0001 */ - { 4, ST+U }, /* 11 1 0010: stwux */ - INVALID, /* 11 1 0011 */ - { 2, LD+U }, /* 11 1 0100: lhzux */ - { 2, LD+SE+U }, /* 11 1 0101: lhaux */ - { 2, ST+U }, /* 11 1 0110: sthux */ - INVALID, /* 11 1 0111 */ - { 4, LD+F+S+U }, /* 11 1 1000: lfsux */ - { 8, LD+F+U }, /* 11 1 1001: lfdux */ - { 4, ST+F+S+U }, /* 11 1 1010: stfsux */ - { 8, ST+F+U }, /* 11 1 1011: stfdux */ - INVALID, /* 11 1 1100 */ - { 4, LD+F }, /* 11 1 1101: lfiwzx */ - INVALID, /* 11 1 1110 */ - INVALID, /* 11 1 1111 */ -}; - -/* - * The dcbz (data cache block zero) instruction - * gives an alignment fault if used on non-cacheable - * memory. We handle the fault mainly for the - * case when we are running with the cache disabled - * for debugging. - */ -static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr) -{ - long __user *p; - int i, size; - -#ifdef __powerpc64__ - size = ppc64_caches.l1d.block_size; -#else - size = L1_CACHE_BYTES; -#endif - p = (long __user *) (regs->dar & -size); - if (user_mode(regs) && !access_ok(VERIFY_WRITE, p, size)) - return -EFAULT; - for (i = 0; i < size / sizeof(long); ++i) - if (__put_user_inatomic(0, p+i)) - return -EFAULT; - return 1; -} - -/* - * Emulate load & store multiple instructions - * On 64-bit machines, these instructions only affect/use the - * bottom 4 bytes of each register, and the loads clear the - * top 4 bytes of the affected register. 
- */ -#ifdef __BIG_ENDIAN__ -#ifdef CONFIG_PPC64 -#define REG_BYTE(rp, i) *((u8 *)((rp) + ((i) >> 2)) + ((i) & 3) + 4) -#else -#define REG_BYTE(rp, i) *((u8 *)(rp) + (i)) -#endif -#else -#define REG_BYTE(rp, i) (*(((u8 *)((rp) + ((i)>>2)) + ((i)&3)))) -#endif - -#define SWIZ_PTR(p) ((unsigned char __user *)((p) ^ swiz)) - -static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, - unsigned int reg, unsigned int nb, - unsigned int flags, unsigned int instr, - unsigned long swiz) -{ - unsigned long *rptr; - unsigned int nb0, i, bswiz; - unsigned long p; - - /* - * We do not try to emulate 8 bytes multiple as they aren't really - * available in our operating environments and we don't try to - * emulate multiples operations in kernel land as they should never - * be used/generated there at least not on unaligned boundaries - */ - if (unlikely((nb > 4) || !user_mode(regs))) - return 0; - - /* lmw, stmw, lswi/x, stswi/x */ - nb0 = 0; - if (flags & HARD) { - if (flags & SX) { - nb = regs->xer & 127; - if (nb == 0) - return 1; - } else { - unsigned long pc = regs->nip ^ (swiz & 4); - - if (__get_user_inatomic(instr, - (unsigned int __user *)pc)) - return -EFAULT; - if (swiz == 0 && (flags & SW)) - instr = cpu_to_le32(instr); - nb = (instr >> 11) & 0x1f; - if (nb == 0) - nb = 32; - } - if (nb + reg * 4 > 128) { - nb0 = nb + reg * 4 - 128; - nb = 128 - reg * 4; - } -#ifdef __LITTLE_ENDIAN__ - /* - * String instructions are endian neutral but the code - * below is not. Force byte swapping on so that the - * effects of swizzling are undone in the load/store - * loops below. - */ - flags ^= SW; -#endif - } else { - /* lwm, stmw */ - nb = (32 - reg) * 4; - } - - if (!access_ok((flags & ST ? VERIFY_WRITE: VERIFY_READ), addr, nb+nb0)) - return -EFAULT; /* bad address */ - - rptr = ®s->gpr[reg]; - p = (unsigned long) addr; - bswiz = (flags & SW)? 3: 0; - - if (!(flags & ST)) { - /* - * This zeroes the top 4 bytes of the affected registers - * in 64-bit mode, and also zeroes out any remaining - * bytes of the last register for lsw*. - */ - memset(rptr, 0, ((nb + 3) / 4) * sizeof(unsigned long)); - if (nb0 > 0) - memset(®s->gpr[0], 0, - ((nb0 + 3) / 4) * sizeof(unsigned long)); - - for (i = 0; i < nb; ++i, ++p) - if (__get_user_inatomic(REG_BYTE(rptr, i ^ bswiz), - SWIZ_PTR(p))) - return -EFAULT; - if (nb0 > 0) { - rptr = ®s->gpr[0]; - addr += nb; - for (i = 0; i < nb0; ++i, ++p) - if (__get_user_inatomic(REG_BYTE(rptr, - i ^ bswiz), - SWIZ_PTR(p))) - return -EFAULT; - } - - } else { - for (i = 0; i < nb; ++i, ++p) - if (__put_user_inatomic(REG_BYTE(rptr, i ^ bswiz), - SWIZ_PTR(p))) - return -EFAULT; - if (nb0 > 0) { - rptr = ®s->gpr[0]; - addr += nb; - for (i = 0; i < nb0; ++i, ++p) - if (__put_user_inatomic(REG_BYTE(rptr, - i ^ bswiz), - SWIZ_PTR(p))) - return -EFAULT; - } - } - return 1; -} - -/* - * Emulate floating-point pair loads and stores. - * Only POWER6 has these instructions, and it does true little-endian, - * so we don't need the address swizzling. 
- */ -static int emulate_fp_pair(unsigned char __user *addr, unsigned int reg, - unsigned int flags) -{ - char *ptr0 = (char *) ¤t->thread.TS_FPR(reg); - char *ptr1 = (char *) ¤t->thread.TS_FPR(reg+1); - int i, ret, sw = 0; - - if (reg & 1) - return 0; /* invalid form: FRS/FRT must be even */ - if (flags & SW) - sw = 7; - ret = 0; - for (i = 0; i < 8; ++i) { - if (!(flags & ST)) { - ret |= __get_user(ptr0[i^sw], addr + i); - ret |= __get_user(ptr1[i^sw], addr + i + 8); - } else { - ret |= __put_user(ptr0[i^sw], addr + i); - ret |= __put_user(ptr1[i^sw], addr + i + 8); - } - } - if (ret) - return -EFAULT; - return 1; /* exception handled and fixed up */ -} - -#ifdef CONFIG_PPC64 -static int emulate_lq_stq(struct pt_regs *regs, unsigned char __user *addr, - unsigned int reg, unsigned int flags) -{ - char *ptr0 = (char *)®s->gpr[reg]; - char *ptr1 = (char *)®s->gpr[reg+1]; - int i, ret, sw = 0; - - if (reg & 1) - return 0; /* invalid form: GPR must be even */ - if (flags & SW) - sw = 7; - ret = 0; - for (i = 0; i < 8; ++i) { - if (!(flags & ST)) { - ret |= __get_user(ptr0[i^sw], addr + i); - ret |= __get_user(ptr1[i^sw], addr + i + 8); - } else { - ret |= __put_user(ptr0[i^sw], addr + i); - ret |= __put_user(ptr1[i^sw], addr + i + 8); - } - } - if (ret) - return -EFAULT; - return 1; /* exception handled and fixed up */ -} -#endif /* CONFIG_PPC64 */ #ifdef CONFIG_SPE @@ -636,133 +282,21 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg, } #endif /* CONFIG_SPE */ -#ifdef CONFIG_VSX -/* - * Emulate VSX instructions... - */ -static int emulate_vsx(unsigned char __user *addr, unsigned int reg, - unsigned int areg, struct pt_regs *regs, - unsigned int flags, unsigned int length, - unsigned int elsize) -{ - char *ptr; - unsigned long *lptr; - int ret = 0; - int sw = 0; - int i, j; - - /* userland only */ - if (unlikely(!user_mode(regs))) - return 0; - - flush_vsx_to_thread(current); - - if (reg < 32) - ptr = (char *) ¤t->thread.fp_state.fpr[reg][0]; - else - ptr = (char *) ¤t->thread.vr_state.vr[reg - 32]; - - lptr = (unsigned long *) ptr; - -#ifdef __LITTLE_ENDIAN__ - if (flags & SW) { - elsize = length; - sw = length-1; - } else { - /* - * The elements are BE ordered, even in LE mode, so process - * them in reverse order. - */ - addr += length - elsize; - - /* 8 byte memory accesses go in the top 8 bytes of the VR */ - if (length == 8) - ptr += 8; - } -#else - if (flags & SW) - sw = elsize-1; -#endif - - for (j = 0; j < length; j += elsize) { - for (i = 0; i < elsize; ++i) { - if (flags & ST) - ret |= __put_user(ptr[i^sw], addr + i); - else - ret |= __get_user(ptr[i^sw], addr + i); - } - ptr += elsize; -#ifdef __LITTLE_ENDIAN__ - addr -= elsize; -#else - addr += elsize; -#endif - } - -#ifdef __BIG_ENDIAN__ -#define VSX_HI 0 -#define VSX_LO 1 -#else -#define VSX_HI 1 -#define VSX_LO 0 -#endif - - if (!ret) { - if (flags & U) - regs->gpr[areg] = regs->dar; - - /* Splat load copies the same data to top and bottom 8 bytes */ - if (flags & SPLT) - lptr[VSX_LO] = lptr[VSX_HI]; - /* For 8 byte loads, zero the low 8 bytes */ - else if (!(flags & ST) && (8 == length)) - lptr[VSX_LO] = 0; - } else - return -EFAULT; - - return 1; -} -#endif - /* * Called on alignment exception. Attempts to fixup * * Return 1 on success * Return 0 if unable to handle the interrupt * Return -EFAULT if data address is bad + * Other negative return values indicate that the instruction can't + * be emulated, and the process should be given a SIGBUS. 
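[Editorial aside, not part of the patch: the return-value contract spelled out in the comment above is what the interrupt-level caller keys off. The sketch below shows one plausible shape of such a caller; the real handler lives in arch/powerpc/kernel/traps.c and differs in detail, and the function name alignment_interrupt_sketch() is made up for illustration.]

	/*
	 * Sketch only: how an alignment-interrupt handler might consume the
	 * return values documented above.  TM handling, emulation accounting
	 * and other details of the in-tree handler are omitted.
	 */
	void alignment_interrupt_sketch(struct pt_regs *regs)
	{
		int sig, code;
		int fixed = fix_alignment(regs);

		if (fixed == 1) {
			/* Emulated successfully: step past the faulting instruction */
			regs->nip += 4;
			emulate_single_step(regs);
			return;
		}

		if (fixed == -EFAULT) {
			/* Operand data address was bad */
			sig = SIGSEGV;
			code = SEGV_ACCERR;
		} else {
			/* 0 or any other negative value: could not emulate */
			sig = SIGBUS;
			code = BUS_ADRALN;
		}

		if (user_mode(regs))
			_exception(sig, regs, code, regs->dar);
		else
			bad_page_fault(regs, regs->dar, sig);
	}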
*/ int fix_alignment(struct pt_regs *regs) { - unsigned int instr, nb, flags, instruction = 0; - unsigned int reg, areg; - unsigned int dsisr; - unsigned char __user *addr; - unsigned long p, swiz; - int ret, i; - union data { - u64 ll; - double dd; - unsigned char v[8]; - struct { -#ifdef __LITTLE_ENDIAN__ - int low32; - unsigned hi32; -#else - unsigned hi32; - int low32; -#endif - } x32; - struct { -#ifdef __LITTLE_ENDIAN__ - short low16; - unsigned char hi48[6]; -#else - unsigned char hi48[6]; - short low16; -#endif - } x16; - } data; + unsigned int instr; + struct instruction_op op; + int r, type; /* * We require a complete register set, if not, then our assembly @@ -770,121 +304,23 @@ int fix_alignment(struct pt_regs *regs) */ CHECK_FULL_REGS(regs); - dsisr = regs->dsisr; - - /* Some processors don't provide us with a DSISR we can use here, - * let's make one up from the instruction - */ - if (cpu_has_feature(CPU_FTR_NODSISRALIGN)) { - unsigned long pc = regs->nip; - - if (cpu_has_feature(CPU_FTR_PPC_LE) && (regs->msr & MSR_LE)) - pc ^= 4; - if (unlikely(__get_user_inatomic(instr, - (unsigned int __user *)pc))) - return -EFAULT; - if (cpu_has_feature(CPU_FTR_REAL_LE) && (regs->msr & MSR_LE)) - instr = cpu_to_le32(instr); - dsisr = make_dsisr(instr); - instruction = instr; + if (unlikely(__get_user(instr, (unsigned int __user *)regs->nip))) + return -EFAULT; + if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) { + /* We don't handle PPC little-endian any more... */ + if (cpu_has_feature(CPU_FTR_PPC_LE)) + return -EIO; + instr = swab32(instr); } - /* extract the operation and registers from the dsisr */ - reg = (dsisr >> 5) & 0x1f; /* source/dest register */ - areg = dsisr & 0x1f; /* register to update */ - #ifdef CONFIG_SPE if ((instr >> 26) == 0x4) { + int reg = (instr >> 21) & 0x1f; PPC_WARN_ALIGNMENT(spe, regs); return emulate_spe(regs, reg, instr); } #endif - instr = (dsisr >> 10) & 0x7f; - instr |= (dsisr >> 13) & 0x60; - - /* Lookup the operation in our table */ - nb = aligninfo[instr].len; - flags = aligninfo[instr].flags; - - /* - * Handle some cases which give overlaps in the DSISR values. - */ - if (IS_XFORM(instruction)) { - switch (get_xop(instruction)) { - case 532: /* ldbrx */ - nb = 8; - flags = LD+SW; - break; - case 660: /* stdbrx */ - nb = 8; - flags = ST+SW; - break; - case 20: /* lwarx */ - case 84: /* ldarx */ - case 116: /* lharx */ - case 276: /* lqarx */ - return 0; /* not emulated ever */ - } - } - - /* Byteswap little endian loads and stores */ - swiz = 0; - if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) { - flags ^= SW; -#ifdef __BIG_ENDIAN__ - /* - * So-called "PowerPC little endian" mode works by - * swizzling addresses rather than by actually doing - * any byte-swapping. To emulate this, we XOR each - * byte address with 7. We also byte-swap, because - * the processor's address swizzling depends on the - * operand size (it xors the address with 7 for bytes, - * 6 for halfwords, 4 for words, 0 for doublewords) but - * we will xor with 7 and load/store each byte separately. 
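[Editorial aside, not part of the patch: the (deleted) comment above describes how "PPC little-endian" mode swizzles addresses within an aligned doubleword instead of byte-swapping, XORing the address with 7, 6 or 4 for byte, halfword or word accesses. The small user-space check below is an illustration of that equivalence under the assumption of naturally aligned accesses; it verifies that swizzled big-endian accesses return the values a true little-endian load would see.]

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* Doubleword 0x0011223344556677 laid out in big-endian byte order */
		unsigned char be[8] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77 };
		/* The same doubleword as a true little-endian core would lay it out */
		unsigned char le[8] = { 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11, 0x00 };
		unsigned long ea;

		/* Byte loads: swizzle the address with 7 */
		for (ea = 0; ea < 8; ea++)
			assert(be[ea ^ 7] == le[ea]);

		/* Halfword loads (naturally aligned): swizzle with 6, read big-endian */
		for (ea = 0; ea < 8; ea += 2) {
			uint16_t swizzled = (be[ea ^ 6] << 8) | be[(ea + 1) ^ 6];
			uint16_t true_le  = le[ea] | (le[ea + 1] << 8);
			assert(swizzled == true_le);
		}

		/* Word loads (naturally aligned): swizzle with 4, read big-endian */
		for (ea = 0; ea < 8; ea += 4) {
			uint32_t swizzled = ((uint32_t)be[ea ^ 4] << 24) |
					    ((uint32_t)be[(ea + 1) ^ 4] << 16) |
					    ((uint32_t)be[(ea + 2) ^ 4] << 8) |
					    be[(ea + 3) ^ 4];
			uint32_t true_le  = le[ea] | ((uint32_t)le[ea + 1] << 8) |
					    ((uint32_t)le[ea + 2] << 16) |
					    ((uint32_t)le[ea + 3] << 24);
			assert(swizzled == true_le);
		}

		printf("swizzle checks passed\n");
		return 0;
	}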
- */ - if (cpu_has_feature(CPU_FTR_PPC_LE)) - swiz = 7; -#endif - } - - /* DAR has the operand effective address */ - addr = (unsigned char __user *)regs->dar; - -#ifdef CONFIG_VSX - if ((instruction & 0xfc00003e) == 0x7c000018) { - unsigned int elsize; - - /* Additional register addressing bit (64 VSX vs 32 FPR/GPR) */ - reg |= (instruction & 0x1) << 5; - /* Simple inline decoder instead of a table */ - /* VSX has only 8 and 16 byte memory accesses */ - nb = 8; - if (instruction & 0x200) - nb = 16; - - /* Vector stores in little-endian mode swap individual - elements, so process them separately */ - elsize = 4; - if (instruction & 0x80) - elsize = 8; - - flags = 0; - if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) - flags |= SW; - if (instruction & 0x100) - flags |= ST; - if (instruction & 0x040) - flags |= U; - /* splat load needs a special decoder */ - if ((instruction & 0x400) == 0){ - flags |= SPLT; - nb = 8; - } - PPC_WARN_ALIGNMENT(vsx, regs); - return emulate_vsx(addr, reg, areg, regs, flags, nb, elsize); - } -#endif /* * ISA 3.0 (such as P9) copy, copy_first, paste and paste_last alignment @@ -896,173 +332,27 @@ int fix_alignment(struct pt_regs *regs) * when pasting to a co-processor. Furthermore, paste_last is the * synchronisation point for preceding copy/paste sequences. */ - if ((instruction & 0xfc0006fe) == PPC_INST_COPY) + if ((instr & 0xfc0006fe) == PPC_INST_COPY) return -EIO; - /* A size of 0 indicates an instruction we don't support, with - * the exception of DCBZ which is handled as a special case here - */ - if (instr == DCBZ) { - PPC_WARN_ALIGNMENT(dcbz, regs); - return emulate_dcbz(regs, addr); - } - if (unlikely(nb == 0)) - return 0; - - /* Load/Store Multiple instructions are handled in their own - * function - */ - if (flags & M) { - PPC_WARN_ALIGNMENT(multiple, regs); - return emulate_multiple(regs, addr, reg, nb, - flags, instr, swiz); - } - - /* Verify the address of the operand */ - if (unlikely(user_mode(regs) && - !access_ok((flags & ST ? VERIFY_WRITE : VERIFY_READ), - addr, nb))) - return -EFAULT; - - /* Force the fprs into the save area so we can reference them */ - if (flags & F) { - /* userland only */ - if (unlikely(!user_mode(regs))) - return 0; - flush_fp_to_thread(current); - } + r = analyse_instr(&op, regs, instr); + if (r < 0) + return -EINVAL; - if (nb == 16) { - if (flags & F) { - /* Special case for 16-byte FP loads and stores */ - PPC_WARN_ALIGNMENT(fp_pair, regs); - return emulate_fp_pair(addr, reg, flags); - } else { -#ifdef CONFIG_PPC64 - /* Special case for 16-byte loads and stores */ - PPC_WARN_ALIGNMENT(lq_stq, regs); - return emulate_lq_stq(regs, addr, reg, flags); -#else - return 0; -#endif - } - } - - PPC_WARN_ALIGNMENT(unaligned, regs); - - /* If we are loading, get the data from user space, else - * get it from register values - */ - if (!(flags & ST)) { - unsigned int start = 0; - - switch (nb) { - case 4: - start = offsetof(union data, x32.low32); - break; - case 2: - start = offsetof(union data, x16.low16); - break; - } - - data.ll = 0; - ret = 0; - p = (unsigned long)addr; - - for (i = 0; i < nb; i++) - ret |= __get_user_inatomic(data.v[start + i], - SWIZ_PTR(p++)); - - if (unlikely(ret)) - return -EFAULT; - - } else if (flags & F) { - data.ll = current->thread.TS_FPR(reg); - if (flags & S) { - /* Single-precision FP store requires conversion... 
*/ -#ifdef CONFIG_PPC_FPU - preempt_disable(); - enable_kernel_fp(); - cvt_df(&data.dd, (float *)&data.x32.low32); - disable_kernel_fp(); - preempt_enable(); -#else - return 0; -#endif - } - } else - data.ll = regs->gpr[reg]; - - if (flags & SW) { - switch (nb) { - case 8: - data.ll = swab64(data.ll); - break; - case 4: - data.x32.low32 = swab32(data.x32.low32); - break; - case 2: - data.x16.low16 = swab16(data.x16.low16); - break; - } - } - - /* Perform other misc operations like sign extension - * or floating point single precision conversion - */ - switch (flags & ~(U|SW)) { - case LD+SE: /* sign extending integer loads */ - case LD+F+SE: /* sign extend for lfiwax */ - if ( nb == 2 ) - data.ll = data.x16.low16; - else /* nb must be 4 */ - data.ll = data.x32.low32; - break; - - /* Single-precision FP load requires conversion... */ - case LD+F+S: -#ifdef CONFIG_PPC_FPU - preempt_disable(); - enable_kernel_fp(); - cvt_fd((float *)&data.x32.low32, &data.dd); - disable_kernel_fp(); - preempt_enable(); -#else - return 0; -#endif - break; + type = op.type & INSTR_TYPE_MASK; + if (!OP_IS_LOAD_STORE(type)) { + if (type != CACHEOP + DCBZ) + return -EINVAL; + PPC_WARN_ALIGNMENT(dcbz, regs); + r = emulate_dcbz(op.ea, regs); + } else { + if (type == LARX || type == STCX) + return -EIO; + PPC_WARN_ALIGNMENT(unaligned, regs); + r = emulate_loadstore(regs, &op); } - /* Store result to memory or update registers */ - if (flags & ST) { - unsigned int start = 0; - - switch (nb) { - case 4: - start = offsetof(union data, x32.low32); - break; - case 2: - start = offsetof(union data, x16.low16); - break; - } - - ret = 0; - p = (unsigned long)addr; - - for (i = 0; i < nb; i++) - ret |= __put_user_inatomic(data.v[start + i], - SWIZ_PTR(p++)); - - if (unlikely(ret)) - return -EFAULT; - } else if (flags & F) - current->thread.TS_FPR(reg) = data.ll; - else - regs->gpr[reg] = data.ll; - - /* Update RA as needed */ - if (flags & U) - regs->gpr[areg] = regs->dar; - - return 1; + if (!r) + return 1; + return r; } diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 709e23425317..8cfb20e38cfe 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -100,12 +100,12 @@ int main(void) OFFSET(THREAD_NORMSAVES, thread_struct, normsave[0]); #endif OFFSET(THREAD_FPEXC_MODE, thread_struct, fpexc_mode); - OFFSET(THREAD_FPSTATE, thread_struct, fp_state); + OFFSET(THREAD_FPSTATE, thread_struct, fp_state.fpr); OFFSET(THREAD_FPSAVEAREA, thread_struct, fp_save_area); OFFSET(FPSTATE_FPSCR, thread_fp_state, fpscr); OFFSET(THREAD_LOAD_FP, thread_struct, load_fp); #ifdef CONFIG_ALTIVEC - OFFSET(THREAD_VRSTATE, thread_struct, vr_state); + OFFSET(THREAD_VRSTATE, thread_struct, vr_state.vr); OFFSET(THREAD_VRSAVEAREA, thread_struct, vr_save_area); OFFSET(THREAD_VRSAVE, thread_struct, vrsave); OFFSET(THREAD_USED_VR, thread_struct, used_vr); @@ -145,9 +145,9 @@ int main(void) OFFSET(THREAD_TM_PPR, thread_struct, tm_ppr); OFFSET(THREAD_TM_DSCR, thread_struct, tm_dscr); OFFSET(PT_CKPT_REGS, thread_struct, ckpt_regs); - OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state); + OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state.vr); OFFSET(THREAD_CKVRSAVE, thread_struct, ckvrsave); - OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state); + OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state.fpr); /* Local pt_regs on stack for Transactional Memory funcs. 
*/ DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16); @@ -485,6 +485,7 @@ int main(void) OFFSET(KVM_ENABLED_HCALLS, kvm, arch.enabled_hcalls); OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v); OFFSET(KVM_RADIX, kvm, arch.radix); + OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled); OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr); OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar); OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr); @@ -513,6 +514,7 @@ int main(void) OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions); OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded); OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded); + OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request); OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr); OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc); OFFSET(VCPU_SPMC, kvm_vcpu, arch.spmc); @@ -542,6 +544,7 @@ int main(void) OFFSET(VCPU_WORT, kvm_vcpu, arch.wort); OFFSET(VCPU_TID, kvm_vcpu, arch.tid); OFFSET(VCPU_PSSCR, kvm_vcpu, arch.psscr); + OFFSET(VCPU_HFSCR, kvm_vcpu, arch.hfscr); OFFSET(VCORE_ENTRY_EXIT, kvmppc_vcore, entry_exit_map); OFFSET(VCORE_IN_GUEST, kvmppc_vcore, in_guest); OFFSET(VCORE_NAPPING_THREADS, kvmppc_vcore, napping_threads); @@ -742,9 +745,19 @@ int main(void) OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask); OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask); OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas); + OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr); +#define STOP_SPR(x, f) OFFSET(x, paca_struct, stop_sprs.f) + STOP_SPR(STOP_PID, pid); + STOP_SPR(STOP_LDBAR, ldbar); + STOP_SPR(STOP_FSCR, fscr); + STOP_SPR(STOP_HFSCR, hfscr); + STOP_SPR(STOP_MMCR1, mmcr1); + STOP_SPR(STOP_MMCR2, mmcr2); + STOP_SPR(STOP_MMCRA, mmcra); #endif DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); + DEFINE(PPC_DBELL_MSGTYPE, PPC_DBELL_MSGTYPE); #ifdef CONFIG_PPC_8xx DEFINE(VIRT_IMMR_BASE, (u64)__fix_to_virt(FIX_IMMR_BASE)); diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c index 8275858a434d..3f46ca1c59f9 100644 --- a/arch/powerpc/kernel/btext.c +++ b/arch/powerpc/kernel/btext.c @@ -253,7 +253,7 @@ int __init btext_find_display(int allow_nonstdout) for_each_node_by_type(np, "display") { if (of_get_property(np, "linux,opened", NULL)) { - printk("trying %s ...\n", np->full_name); + printk("trying %pOF ...\n", np); rc = btext_initialize(np); printk("result: %d\n", rc); } diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c index c641983bbdd6..a8f20e5928e1 100644 --- a/arch/powerpc/kernel/cacheinfo.c +++ b/arch/powerpc/kernel/cacheinfo.c @@ -167,10 +167,10 @@ static void release_cache_debugcheck(struct cache *cache) list_for_each_entry(iter, &cache_list, list) WARN_ONCE(iter->next_local == cache, - "cache for %s(%s) refers to cache for %s(%s)\n", - iter->ofnode->full_name, + "cache for %pOF(%s) refers to cache for %pOF(%s)\n", + iter->ofnode, cache_type_string(iter), - cache->ofnode->full_name, + cache->ofnode, cache_type_string(cache)); } @@ -179,8 +179,8 @@ static void release_cache(struct cache *cache) if (!cache) return; - pr_debug("freeing L%d %s cache for %s\n", cache->level, - cache_type_string(cache), cache->ofnode->full_name); + pr_debug("freeing L%d %s cache for %pOF\n", cache->level, + cache_type_string(cache), cache->ofnode); release_cache_debugcheck(cache); list_del(&cache->list); @@ -194,8 +194,8 @@ static void cache_cpu_set(struct cache *cache, int cpu) while (next) { WARN_ONCE(cpumask_test_cpu(cpu, &next->shared_cpu_map), - "CPU %i already accounted in %s(%s)\n", - cpu, 
next->ofnode->full_name, + "CPU %i already accounted in %pOF(%s)\n", + cpu, next->ofnode, cache_type_string(next)); cpumask_set_cpu(cpu, &next->shared_cpu_map); next = next->next_local; @@ -355,7 +355,7 @@ static int cache_is_unified_d(const struct device_node *np) */ static struct cache *cache_do_one_devnode_unified(struct device_node *node, int level) { - pr_debug("creating L%d ucache for %s\n", level, node->full_name); + pr_debug("creating L%d ucache for %pOF\n", level, node); return new_cache(cache_is_unified_d(node), level, node); } @@ -365,8 +365,8 @@ static struct cache *cache_do_one_devnode_split(struct device_node *node, { struct cache *dcache, *icache; - pr_debug("creating L%d dcache and icache for %s\n", level, - node->full_name); + pr_debug("creating L%d dcache and icache for %pOF\n", level, + node); dcache = new_cache(CACHE_TYPE_DATA, level, node); icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node); @@ -679,7 +679,6 @@ static struct kobj_type cache_index_type = { static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) { - const char *cache_name; const char *cache_type; struct cache *cache; char *buf; @@ -690,7 +689,6 @@ static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) return; cache = dir->cache; - cache_name = cache->ofnode->full_name; cache_type = cache_type_string(cache); /* We don't want to create an attribute that can't provide a @@ -707,14 +705,14 @@ static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir) rc = attr->show(&dir->kobj, attr, buf); if (rc <= 0) { pr_debug("not creating %s attribute for " - "%s(%s) (rc = %zd)\n", - attr->attr.name, cache_name, + "%pOF(%s) (rc = %zd)\n", + attr->attr.name, cache->ofnode, cache_type, rc); continue; } if (sysfs_create_file(&dir->kobj, &attr->attr)) - pr_debug("could not create %s attribute for %s(%s)\n", - attr->attr.name, cache_name, cache_type); + pr_debug("could not create %s attribute for %pOF(%s)\n", + attr->attr.name, cache->ofnode, cache_type); } kfree(buf); @@ -831,8 +829,8 @@ static void cache_cpu_clear(struct cache *cache, int cpu) struct cache *next = cache->next_local; WARN_ONCE(!cpumask_test_cpu(cpu, &cache->shared_cpu_map), - "CPU %i not accounted in %s(%s)\n", - cpu, cache->ofnode->full_name, + "CPU %i not accounted in %pOF(%s)\n", + cpu, cache->ofnode, cache_type_string(cache)); cpumask_clear_cpu(cpu, &cache->shared_cpu_map); diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S index 10cb2896b2ae..610955fe8b81 100644 --- a/arch/powerpc/kernel/cpu_setup_power.S +++ b/arch/powerpc/kernel/cpu_setup_power.S @@ -218,13 +218,20 @@ __init_tlb_power8: ptesync 1: blr +/* + * Flush the TLB in hash mode. Hash must flush with RIC=2 once for process + * and one for partition scope to clear process and partition table entries. 
+ */ __init_tlb_power9: - li r6,POWER9_TLB_SETS_HASH + li r6,POWER9_TLB_SETS_HASH - 1 mtctr r6 li r7,0xc00 /* IS field = 0b11 */ + li r8,0 ptesync -2: tlbiel r7 - addi r7,r7,0x1000 + PPC_TLBIEL(7, 8, 2, 1, 0) + PPC_TLBIEL(7, 8, 2, 0, 0) +2: addi r7,r7,0x1000 + PPC_TLBIEL(7, 8, 0, 0, 0) bdnz 2b ptesync 1: blr diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index 6f849832a669..760872916013 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -1259,10 +1259,10 @@ static struct cpu_spec __initdata cpu_specs[] = { .platform = "ppc603", }, #endif /* CONFIG_PPC_BOOK3S_32 */ -#ifdef CONFIG_8xx +#ifdef CONFIG_PPC_8xx { /* 8xx */ .pvr_mask = 0xffff0000, - .pvr_value = 0x00500000, + .pvr_value = PVR_8xx, .cpu_name = "8xx", /* CPU_FTR_MAYBE_CAN_DOZE is possible, * if the 8xx code is there.... */ @@ -1274,7 +1274,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_8xx, .platform = "ppc823", }, -#endif /* CONFIG_8xx */ +#endif /* CONFIG_PPC_8xx */ #ifdef CONFIG_40x { /* 403GC */ .pvr_mask = 0xffffff00, @@ -1936,6 +1936,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_440A, .platform = "ppc440", }, +#ifdef CONFIG_PPC_47x { /* 476 DD2 core */ .pvr_mask = 0xffffffff, .pvr_value = 0x11a52080, @@ -1992,6 +1993,7 @@ static struct cpu_spec __initdata cpu_specs[] = { .machine_check = machine_check_47x, .platform = "ppc470", }, +#endif /* CONFIG_PPC_47x */ { /* default match */ .pvr_mask = 0x00000000, .pvr_value = 0x00000000, diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c index fb7cbaa37658..8f7abf9baa63 100644 --- a/arch/powerpc/kernel/dma-iommu.c +++ b/arch/powerpc/kernel/dma-iommu.c @@ -105,6 +105,11 @@ static u64 dma_iommu_get_required_mask(struct device *dev) return mask; } +int dma_iommu_mapping_error(struct device *dev, dma_addr_t dma_addr) +{ + return dma_addr == IOMMU_MAPPING_ERROR; +} + struct dma_map_ops dma_iommu_ops = { .alloc = dma_iommu_alloc_coherent, .free = dma_iommu_free_coherent, @@ -115,5 +120,6 @@ struct dma_map_ops dma_iommu_ops = { .map_page = dma_iommu_map_page, .unmap_page = dma_iommu_unmap_page, .get_required_mask = dma_iommu_get_required_mask, + .mapping_error = dma_iommu_mapping_error, }; EXPORT_SYMBOL(dma_iommu_ops); diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index 41c749586bd2..4194bbbbdb10 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -314,18 +314,6 @@ EXPORT_SYMBOL(dma_set_coherent_mask); #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) -int __dma_set_mask(struct device *dev, u64 dma_mask) -{ - const struct dma_map_ops *dma_ops = get_dma_ops(dev); - - if ((dma_ops != NULL) && (dma_ops->set_dma_mask != NULL)) - return dma_ops->set_dma_mask(dev, dma_mask); - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - return -EIO; - *dev->dma_mask = dma_mask; - return 0; -} - int dma_set_mask(struct device *dev, u64 dma_mask) { if (ppc_md.dma_set_mask) @@ -338,7 +326,10 @@ int dma_set_mask(struct device *dev, u64 dma_mask) return phb->controller_ops.dma_set_mask(pdev, dma_mask); } - return __dma_set_mask(dev, dma_mask); + if (!dev->dma_mask || !dma_supported(dev, dma_mask)) + return -EIO; + *dev->dma_mask = dma_mask; + return 0; } EXPORT_SYMBOL(dma_set_mask); diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 4c7656dc4e04..1df770e8cbe0 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -94,9 +94,6 @@ 
static void (*init_pmu_registers)(void); static void cpufeatures_flush_tlb(void) { - unsigned long rb; - unsigned int i, num_sets; - /* * This is a temporary measure to keep equivalent TLB flush as the * cputable based setup code. @@ -105,24 +102,15 @@ static void cpufeatures_flush_tlb(void) case PVR_POWER8: case PVR_POWER8E: case PVR_POWER8NVL: - num_sets = POWER8_TLB_SETS; + __flush_tlb_power8(POWER8_TLB_SETS); break; case PVR_POWER9: - num_sets = POWER9_TLB_SETS_HASH; + __flush_tlb_power9(POWER9_TLB_SETS_HASH); break; default: - num_sets = 1; pr_err("unknown CPU version for boot TLB flush\n"); break; } - - asm volatile("ptesync" : : : "memory"); - rb = TLBIEL_INVAL_SET; - for (i = 0; i < num_sets; i++) { - asm volatile("tlbiel %0" : : "r" (rb)); - rb += 1 << TLBIEL_INVAL_SET_SHIFT; - } - asm volatile("ptesync" : : : "memory"); } static void __restore_cpu_cpufeatures(void) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 63992b2d8e15..9e816787c0d4 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -44,6 +44,7 @@ #include <asm/machdep.h> #include <asm/ppc-pci.h> #include <asm/rtas.h> +#include <asm/pte-walk.h> /** Overview: @@ -169,10 +170,10 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len) char buffer[128]; n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n", - edev->phb->global_number, pdn->busno, + pdn->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n", - edev->phb->global_number, pdn->busno, + pdn->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); eeh_ops->read_config(pdn, PCI_VENDOR_ID, 4, &cfg); @@ -352,8 +353,7 @@ static inline unsigned long eeh_token_to_phys(unsigned long token) * worried about _PAGE_SPLITTING/collapse. Also we will not hit * page table free, because of init_mm. */ - ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token, - NULL, &hugepage_shift); + ptep = find_init_mm_pte(token, &hugepage_shift); if (!ptep) return token; WARN_ON(hugepage_shift); @@ -435,7 +435,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev) int ret; int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); unsigned long flags; - struct pci_dn *pdn; + struct device_node *dn; struct pci_dev *dev; struct eeh_pe *pe, *parent_pe, *phb_pe; int rc = 0; @@ -493,9 +493,10 @@ int eeh_dev_check_failure(struct eeh_dev *edev) if (pe->state & EEH_PE_ISOLATED) { pe->check_count++; if (pe->check_count % EEH_MAX_FAILS == 0) { - pdn = eeh_dev_to_pdn(edev); - if (pdn->node) - location = of_get_property(pdn->node, "ibm,loc-code", NULL); + dn = pci_device_to_OF_node(dev); + if (dn) + location = of_get_property(dn, "ibm,loc-code", + NULL); printk(KERN_ERR "EEH: %d reads ignored for recovering device at " "location=%s driver=%s pci addr=%s\n", pe->check_count, @@ -1064,7 +1065,7 @@ core_initcall_sync(eeh_init); */ void eeh_add_device_early(struct pci_dn *pdn) { - struct pci_controller *phb; + struct pci_controller *phb = pdn ? 
pdn->phb : NULL; struct eeh_dev *edev = pdn_to_eeh_dev(pdn); if (!edev) @@ -1074,7 +1075,6 @@ void eeh_add_device_early(struct pci_dn *pdn) return; /* USB Bus children of PCI devices will not have BUID's */ - phb = edev->phb; if (NULL == phb || (eeh_has_flag(EEH_PROBE_MODE_DEVTREE) && 0 == phb->buid)) return; diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c index d6b2ca70d14d..ad04ecd63c20 100644 --- a/arch/powerpc/kernel/eeh_dev.c +++ b/arch/powerpc/kernel/eeh_dev.c @@ -50,21 +50,16 @@ */ struct eeh_dev *eeh_dev_init(struct pci_dn *pdn) { - struct pci_controller *phb = pdn->phb; struct eeh_dev *edev; /* Allocate EEH device */ edev = kzalloc(sizeof(*edev), GFP_KERNEL); - if (!edev) { - pr_warn("%s: out of memory\n", - __func__); + if (!edev) return NULL; - } /* Associate EEH device with OF node */ pdn->edev = edev; edev->pdn = pdn; - edev->phb = phb; INIT_LIST_HEAD(&edev->list); INIT_LIST_HEAD(&edev->rmv_list); diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index c405c79e50cd..8b840191df59 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -428,7 +428,7 @@ static void *eeh_add_virt_device(void *data, void *userdata) if (!(edev->physfn)) { pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n", - __func__, edev->phb->global_number, pdn->busno, + __func__, pdn->phb->global_number, pdn->busno, PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn)); return NULL; } diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index cc4b206f77e4..2e8d1b2b5af4 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -230,10 +230,15 @@ void *eeh_pe_dev_traverse(struct eeh_pe *root, * Bus/Device/Function number. The extra data referred by flag * indicates which type of address should be used. */ +struct eeh_pe_get_flag { + int pe_no; + int config_addr; +}; + static void *__eeh_pe_get(void *data, void *flag) { struct eeh_pe *pe = (struct eeh_pe *)data; - struct eeh_dev *edev = (struct eeh_dev *)flag; + struct eeh_pe_get_flag *tmp = (struct eeh_pe_get_flag *) flag; /* Unexpected PHB PE */ if (pe->type & EEH_PE_PHB) @@ -244,17 +249,17 @@ static void *__eeh_pe_get(void *data, void *flag) * have non-zero PE address */ if (eeh_has_flag(EEH_VALID_PE_ZERO)) { - if (edev->pe_config_addr == pe->addr) + if (tmp->pe_no == pe->addr) return pe; } else { - if (edev->pe_config_addr && - (edev->pe_config_addr == pe->addr)) + if (tmp->pe_no && + (tmp->pe_no == pe->addr)) return pe; } /* Try BDF address */ - if (edev->config_addr && - (edev->config_addr == pe->config_addr)) + if (tmp->config_addr && + (tmp->config_addr == pe->config_addr)) return pe; return NULL; @@ -262,7 +267,9 @@ static void *__eeh_pe_get(void *data, void *flag) /** * eeh_pe_get - Search PE based on the given address - * @edev: EEH device + * @phb: PCI controller + * @pe_no: PE number + * @config_addr: Config address * * Search the corresponding PE based on the specified address which * is included in the eeh device. The function is used to check if @@ -271,12 +278,14 @@ static void *__eeh_pe_get(void *data, void *flag) * which is composed of PCI bus/device/function number, or unified * PE address. 
*/ -struct eeh_pe *eeh_pe_get(struct eeh_dev *edev) +struct eeh_pe *eeh_pe_get(struct pci_controller *phb, + int pe_no, int config_addr) { - struct eeh_pe *root = eeh_phb_pe_get(edev->phb); + struct eeh_pe *root = eeh_phb_pe_get(phb); + struct eeh_pe_get_flag tmp = { pe_no, config_addr }; struct eeh_pe *pe; - pe = eeh_pe_traverse(root, __eeh_pe_get, edev); + pe = eeh_pe_traverse(root, __eeh_pe_get, &tmp); return pe; } @@ -330,11 +339,13 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev) int eeh_add_to_parent_pe(struct eeh_dev *edev) { struct eeh_pe *pe, *parent; + struct pci_dn *pdn = eeh_dev_to_pdn(edev); + int config_addr = (pdn->busno << 8) | (pdn->devfn); /* Check if the PE number is valid */ if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) { pr_err("%s: Invalid PE#0 for edev 0x%x on PHB#%x\n", - __func__, edev->config_addr, edev->phb->global_number); + __func__, config_addr, pdn->phb->global_number); return -EINVAL; } @@ -344,7 +355,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) * PE should be composed of PCI bus and its subordinate * components. */ - pe = eeh_pe_get(edev); + pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr); if (pe && !(pe->type & EEH_PE_INVALID)) { /* Mark the PE as type of PCI bus */ pe->type = EEH_PE_BUS; @@ -353,11 +364,11 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) /* Put the edev to PE */ list_add_tail(&edev->list, &pe->edevs); pr_debug("EEH: Add %04x:%02x:%02x.%01x to Bus PE#%x\n", - edev->phb->global_number, - edev->config_addr >> 8, - PCI_SLOT(edev->config_addr & 0xFF), - PCI_FUNC(edev->config_addr & 0xFF), - pe->addr); + pdn->phb->global_number, + pdn->busno, + PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn), + pe->addr); return 0; } else if (pe && (pe->type & EEH_PE_INVALID)) { list_add_tail(&edev->list, &pe->edevs); @@ -376,25 +387,25 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) pr_debug("EEH: Add %04x:%02x:%02x.%01x to Device " "PE#%x, Parent PE#%x\n", - edev->phb->global_number, - edev->config_addr >> 8, - PCI_SLOT(edev->config_addr & 0xFF), - PCI_FUNC(edev->config_addr & 0xFF), - pe->addr, pe->parent->addr); + pdn->phb->global_number, + pdn->busno, + PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn), + pe->addr, pe->parent->addr); return 0; } /* Create a new EEH PE */ if (edev->physfn) - pe = eeh_pe_alloc(edev->phb, EEH_PE_VF); + pe = eeh_pe_alloc(pdn->phb, EEH_PE_VF); else - pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE); + pe = eeh_pe_alloc(pdn->phb, EEH_PE_DEVICE); if (!pe) { pr_err("%s: out of memory!\n", __func__); return -ENOMEM; } pe->addr = edev->pe_config_addr; - pe->config_addr = edev->config_addr; + pe->config_addr = config_addr; /* * Put the new EEH PE into hierarchy tree. 
If the parent @@ -404,10 +415,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) */ parent = eeh_pe_get_parent(edev); if (!parent) { - parent = eeh_phb_pe_get(edev->phb); + parent = eeh_phb_pe_get(pdn->phb); if (!parent) { pr_err("%s: No PHB PE is found (PHB Domain=%d)\n", - __func__, edev->phb->global_number); + __func__, pdn->phb->global_number); edev->pe = NULL; kfree(pe); return -EEXIST; @@ -424,10 +435,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev) edev->pe = pe; pr_debug("EEH: Add %04x:%02x:%02x.%01x to " "Device PE#%x, Parent PE#%x\n", - edev->phb->global_number, - edev->config_addr >> 8, - PCI_SLOT(edev->config_addr & 0xFF), - PCI_FUNC(edev->config_addr & 0xFF), + pdn->phb->global_number, + pdn->busno, + PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn), pe->addr, pe->parent->addr); return 0; @@ -446,13 +457,14 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev) { struct eeh_pe *pe, *parent, *child; int cnt; + struct pci_dn *pdn = eeh_dev_to_pdn(edev); if (!edev->pe) { pr_debug("%s: No PE found for device %04x:%02x:%02x.%01x\n", - __func__, edev->phb->global_number, - edev->config_addr >> 8, - PCI_SLOT(edev->config_addr & 0xFF), - PCI_FUNC(edev->config_addr & 0xFF)); + __func__, pdn->phb->global_number, + pdn->busno, + PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn)); return -EEXIST; } @@ -712,10 +724,10 @@ static void eeh_bridge_check_link(struct eeh_dev *edev) return; pr_debug("%s: Check PCIe link for %04x:%02x:%02x.%01x ...\n", - __func__, edev->phb->global_number, - edev->config_addr >> 8, - PCI_SLOT(edev->config_addr & 0xFF), - PCI_FUNC(edev->config_addr & 0xFF)); + __func__, pdn->phb->global_number, + pdn->busno, + PCI_SLOT(pdn->devfn), + PCI_FUNC(pdn->devfn)); /* Check slot status */ cap = edev->pcie_cap; diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c index 1ceecdda810b..797549289798 100644 --- a/arch/powerpc/kernel/eeh_sysfs.c +++ b/arch/powerpc/kernel/eeh_sysfs.c @@ -51,7 +51,6 @@ static ssize_t eeh_show_##_name(struct device *dev, \ static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL); EEH_SHOW_ATTR(eeh_mode, mode, "0x%x"); -EEH_SHOW_ATTR(eeh_config_addr, config_addr, "0x%x"); EEH_SHOW_ATTR(eeh_pe_config_addr, pe_config_addr, "0x%x"); static ssize_t eeh_pe_state_show(struct device *dev, @@ -103,7 +102,6 @@ void eeh_sysfs_add_device(struct pci_dev *pdev) return; rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode); - rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr); rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_state); @@ -128,7 +126,6 @@ void eeh_sysfs_remove_device(struct pci_dev *pdev) } device_remove_file(&pdev->dev, &dev_attr_eeh_mode); - device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr); device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); device_remove_file(&pdev->dev, &dev_attr_eeh_pe_state); diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 8587059ad848..e780e1fbf6c2 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -43,6 +43,13 @@ #define LOAD_MSR_KERNEL(r, x) li r,(x) #endif +/* + * Align to 4k in order to ensure that all functions modyfing srr0/srr1 + * fit into one page in order to not encounter a TLB miss between the + * modification of srr0/srr1 and the associated rfi. 
+ */ + .align 12 + #ifdef CONFIG_BOOKE .globl mcheck_transfer_to_handler mcheck_transfer_to_handler: @@ -586,6 +593,10 @@ ppc_swapcontext: handle_page_fault: stw r4,_DAR(r1) addi r3,r1,STACK_FRAME_OVERHEAD +#ifdef CONFIG_6xx + andis. r0,r5,DSISR_DABRMATCH@h + bne- handle_dabr_fault +#endif bl do_page_fault cmpwi r3,0 beq+ ret_from_except @@ -599,6 +610,17 @@ handle_page_fault: bl bad_page_fault b ret_from_except_full +#ifdef CONFIG_6xx + /* We have a data breakpoint exception - handle it */ +handle_dabr_fault: + SAVE_NVGPRS(r1) + lwz r0,_TRAP(r1) + clrrwi r0,r0,1 + stw r0,_TRAP(r1) + bl do_break + b ret_from_except_full +#endif + /* * This routine switches between two different tasks. The process * state of one is saved on its kernel stack. Then the state diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index bfbad08a1207..4a0fd4f40245 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -57,7 +57,7 @@ system_call_common: #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION extrdi. r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */ - bne tabort_syscall + bne .Ltabort_syscall END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif andi. r10,r12,MSR_PR @@ -143,6 +143,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) mtmsrd r11,1 #endif /* CONFIG_PPC_BOOK3E */ +system_call: /* label this so stack traces look sane */ /* We do need to set SOFTE in the stack frame or the return * from interrupt will be painful */ @@ -152,11 +153,11 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) CURRENT_THREAD_INFO(r11, r1) ld r10,TI_FLAGS(r11) andi. r11,r10,_TIF_SYSCALL_DOTRACE - bne syscall_dotrace /* does not return */ + bne .Lsyscall_dotrace /* does not return */ cmpldi 0,r0,NR_syscalls - bge- syscall_enosys + bge- .Lsyscall_enosys -system_call: /* label this so stack traces look sane */ +.Lsyscall: /* * Need to vector to 32 Bit or default sys_call_table here, * based on caller's run-mode / personality. @@ -185,8 +186,20 @@ system_call: /* label this so stack traces look sane */ #ifdef CONFIG_PPC_BOOK3S /* No MSR:RI on BookE */ andi. r10,r8,MSR_RI - beq- unrecov_restore + beq- .Lunrecov_restore #endif + +/* + * This is a few instructions into the actual syscall exit path (which actually + * starts at .Lsyscall_exit) to cater to kprobe blacklisting and to reduce the + * number of visible symbols for profiling purposes. + * + * We can probe from system_call until this point as MSR_RI is set. But once it + * is cleared below, we won't be able to take a trap. + * + * This is blacklisted from kprobes further below with _ASM_NOKPROBE_SYMBOL(). + */ +system_call_exit: /* * Disable interrupts so current_thread_info()->flags can't change, * and so that we don't get interrupted after loading SRR0/1. @@ -208,7 +221,7 @@ system_call: /* label this so stack traces look sane */ ld r9,TI_FLAGS(r12) li r11,-MAX_ERRNO andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) - bne- syscall_exit_work + bne- .Lsyscall_exit_work andi. r0,r8,MSR_FP beq 2f @@ -232,7 +245,7 @@ system_call: /* label this so stack traces look sane */ 3: cmpld r3,r11 ld r5,_CCR(r1) - bge- syscall_error + bge- .Lsyscall_error .Lsyscall_error_cont: ld r7,_NIP(r1) BEGIN_FTR_SECTION @@ -258,14 +271,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) RFI b . 
/* prevent speculative execution */ -syscall_error: +.Lsyscall_error: oris r5,r5,0x1000 /* Set SO bit in CR */ neg r3,r3 std r5,_CCR(r1) b .Lsyscall_error_cont - + /* Traced system call support */ -syscall_dotrace: +.Lsyscall_dotrace: bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl do_syscall_trace_enter @@ -286,23 +299,23 @@ syscall_dotrace: ld r7,GPR7(r1) ld r8,GPR8(r1) - /* Repopulate r9 and r10 for the system_call path */ + /* Repopulate r9 and r10 for the syscall path */ addi r9,r1,STACK_FRAME_OVERHEAD CURRENT_THREAD_INFO(r10, r1) ld r10,TI_FLAGS(r10) cmpldi r0,NR_syscalls - blt+ system_call + blt+ .Lsyscall /* Return code is already in r3 thanks to do_syscall_trace_enter() */ b .Lsyscall_exit -syscall_enosys: +.Lsyscall_enosys: li r3,-ENOSYS b .Lsyscall_exit -syscall_exit_work: +.Lsyscall_exit_work: #ifdef CONFIG_PPC_BOOK3S li r10,MSR_RI mtmsrd r10,1 /* Restore RI */ @@ -362,7 +375,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) b ret_from_except #ifdef CONFIG_PPC_TRANSACTIONAL_MEM -tabort_syscall: +.Ltabort_syscall: /* Firstly we need to enable TM in the kernel */ mfmsr r10 li r9, 1 @@ -388,6 +401,8 @@ tabort_syscall: rfid b . /* prevent speculative execution */ #endif +_ASM_NOKPROBE_SYMBOL(system_call_common); +_ASM_NOKPROBE_SYMBOL(system_call_exit); /* Save non-volatile GPRs, if not already saved. */ _GLOBAL(save_nvgprs) @@ -398,6 +413,7 @@ _GLOBAL(save_nvgprs) clrrdi r0,r11,1 std r0,_TRAP(r1) blr +_ASM_NOKPROBE_SYMBOL(save_nvgprs); /* @@ -488,33 +504,30 @@ _GLOBAL(_switch) std r23,_CCR(r1) std r1,KSP(r3) /* Set old stack pointer */ -#ifdef CONFIG_SMP - /* We need a sync somewhere here to make sure that if the - * previous task gets rescheduled on another CPU, it sees all - * stores it has performed on this one. + /* + * On SMP kernels, care must be taken because a task may be + * scheduled off CPUx and on to CPUy. Memory ordering must be + * considered. + * + * Cacheable stores on CPUx will be visible when the task is + * scheduled on CPUy by virtue of the core scheduler barriers + * (see "Notes on Program-Order guarantees on SMP systems." in + * kernel/sched/core.c). + * + * Uncacheable stores in the case of involuntary preemption must + * be taken care of. The smp_mb__before_spin_lock() in __schedule() + * is implemented as hwsync on powerpc, which orders MMIO too. So + * long as there is an hwsync in the context switch path, it will + * be executed on the source CPU after the task has performed + * all MMIO ops on that CPU, and on the destination CPU before the + * task performs any MMIO ops there. */ - sync -#endif /* CONFIG_SMP */ /* - * If we optimise away the clear of the reservation in system - * calls because we know the CPU tracks the address of the - * reservation, then we need to clear it here to cover the - * case that the kernel context switch path has no larx - * instructions. + * The kernel context switch path must contain a spin_lock, + * which contains larx/stcx, which will clear any reservation + * of the task being switched. */ -BEGIN_FTR_SECTION - ldarx r6,0,r1 -END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS) - -BEGIN_FTR_SECTION -/* - * A cp_abort (copy paste abort) here ensures that when context switching, a - * copy from one process can't leak into the paste of another. 
- */ - PPC_CP_ABORT -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - #ifdef CONFIG_PPC_BOOK3S /* Cancel all explict user streams as they will have no use after context * switch and will stop the HW from creating streams itself @@ -583,6 +596,14 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) top of the kernel stack. */ addi r7,r7,THREAD_SIZE-SWITCH_FRAME_SIZE + /* + * PMU interrupts in radix may come in here. They will use r1, not + * PACAKSAVE, so this stack switch will not cause a problem. They + * will store to the process stack, which may then be migrated to + * another CPU. However the rq lock release on this CPU paired with + * the rq lock acquire on the new CPU before the stack becomes + * active on the new CPU, will order those stores. + */ mr r1,r8 /* start using new stack pointer */ std r7,PACAKSAVE(r13) @@ -763,11 +784,11 @@ restore: ld r5,SOFTE(r1) lbz r6,PACASOFTIRQEN(r13) cmpwi cr0,r5,0 - beq restore_irq_off + beq .Lrestore_irq_off /* We are enabling, were we already enabled ? Yes, just return */ cmpwi cr0,r6,1 - beq cr0,do_restore + beq cr0,.Ldo_restore /* * We are about to soft-enable interrupts (we are hard disabled @@ -776,14 +797,14 @@ restore: */ lbz r0,PACAIRQHAPPENED(r13) cmpwi cr0,r0,0 - bne- restore_check_irq_replay + bne- .Lrestore_check_irq_replay /* * Get here when nothing happened while soft-disabled, just * soft-enable and move-on. We will hard-enable as a side * effect of rfi */ -restore_no_replay: +.Lrestore_no_replay: TRACE_ENABLE_INTS li r0,1 stb r0,PACASOFTIRQEN(r13); @@ -791,7 +812,7 @@ restore_no_replay: /* * Final return path. BookE is handled in a different file */ -do_restore: +.Ldo_restore: #ifdef CONFIG_PPC_BOOK3E b exception_return_book3e #else @@ -825,7 +846,7 @@ fast_exception_return: REST_8GPRS(5, r1) andi. r0,r3,MSR_RI - beq- unrecov_restore + beq- .Lunrecov_restore /* Load PPR from thread struct before we clear MSR:RI */ BEGIN_FTR_SECTION @@ -883,7 +904,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) * make sure that in this case, we also clear PACA_IRQ_HARD_DIS * or that bit can get out of sync and bad things will happen */ -restore_irq_off: +.Lrestore_irq_off: ld r3,_MSR(r1) lbz r7,PACAIRQHAPPENED(r13) andi. r0,r3,MSR_EE @@ -893,13 +914,13 @@ restore_irq_off: 1: li r0,0 stb r0,PACASOFTIRQEN(r13); TRACE_DISABLE_INTS - b do_restore + b .Ldo_restore /* * Something did happen, check if a re-emit is needed * (this also clears paca->irq_happened) */ -restore_check_irq_replay: +.Lrestore_check_irq_replay: /* XXX: We could implement a fast path here where we check * for irq_happened being just 0x01, in which case we can * clear it and return. That means that we would potentially @@ -909,7 +930,7 @@ restore_check_irq_replay: */ bl __check_irq_replay cmpwi cr0,r3,0 - beq restore_no_replay + beq .Lrestore_no_replay /* * We need to re-emit an interrupt. We do so by re-using our @@ -945,23 +966,26 @@ restore_check_irq_replay: #ifdef CONFIG_PPC_BOOK3E cmpwi cr0,r3,0x280 #else - BEGIN_FTR_SECTION - cmpwi cr0,r3,0xe80 - FTR_SECTION_ELSE - cmpwi cr0,r3,0xa00 - ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) + cmpwi cr0,r3,0xa00 #endif /* CONFIG_PPC_BOOK3E */ bne 1f addi r3,r1,STACK_FRAME_OVERHEAD; bl doorbell_exception - b ret_from_except #endif /* CONFIG_PPC_DOORBELL */ 1: b ret_from_except /* What else to do here ? 
*/ -unrecov_restore: +.Lunrecov_restore: addi r3,r1,STACK_FRAME_OVERHEAD bl unrecoverable_exception - b unrecov_restore + b .Lunrecov_restore + +_ASM_NOKPROBE_SYMBOL(ret_from_except); +_ASM_NOKPROBE_SYMBOL(ret_from_except_lite); +_ASM_NOKPROBE_SYMBOL(resume_kernel); +_ASM_NOKPROBE_SYMBOL(fast_exc_return_irq); +_ASM_NOKPROBE_SYMBOL(restore); +_ASM_NOKPROBE_SYMBOL(fast_exception_return); + #ifdef CONFIG_PPC_RTAS /* @@ -1038,6 +1062,8 @@ _GLOBAL(enter_rtas) rldicr r9,r9,MSR_SF_LG,(63-MSR_SF_LG) ori r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI|MSR_LE andc r6,r0,r9 + +__enter_rtas: sync /* disable interrupts so SRR0/1 */ mtmsrd r0 /* don't get trashed */ @@ -1074,9 +1100,11 @@ rtas_return_loc: mtspr SPRN_SRR1,r4 rfid b . /* prevent speculative execution */ +_ASM_NOKPROBE_SYMBOL(__enter_rtas) +_ASM_NOKPROBE_SYMBOL(rtas_return_loc) .align 3 -1: .llong rtas_restore_regs +1: .8byte rtas_restore_regs rtas_restore_regs: /* relocation is on at this point */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index b886795060fd..48da0f5d2f7f 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -99,7 +99,11 @@ EXC_VIRT_NONE(0x4000, 0x100) #ifdef CONFIG_PPC_P7_NAP /* * If running native on arch 2.06 or later, check if we are waking up - * from nap/sleep/winkle, and branch to idle handler. + * from nap/sleep/winkle, and branch to idle handler. This tests SRR1 + * bits 46:47. A non-0 value indicates that we are coming from a power + * saving state. The idle wakeup handler initially runs in real mode, + * but we branch to the 0xc000... address so we can turn on relocation + * with mtmsr. */ #define IDLETEST(n) \ BEGIN_FTR_SECTION ; \ @@ -107,7 +111,7 @@ EXC_VIRT_NONE(0x4000, 0x100) rlwinm. r10,r10,47-31,30,31 ; \ beq- 1f ; \ cmpwi cr3,r10,2 ; \ - BRANCH_TO_COMMON(r10, system_reset_idle_common) ; \ + BRANCH_TO_C000(r10, system_reset_idle_common) ; \ 1: \ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #else @@ -128,6 +132,7 @@ EXC_VIRT_NONE(0x4100, 0x100) #ifdef CONFIG_PPC_P7_NAP EXC_COMMON_BEGIN(system_reset_idle_common) + mfspr r12,SPRN_SRR1 b pnv_powersave_wakeup #endif @@ -507,46 +512,22 @@ EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_DAR - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crset 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - /* - * We can't just use a direct branch to slb_miss_realmode - * because the distance from here to there depends on where - * the kernel ends up being put. - */ - mfctr r11 - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_DAR - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crset 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - /* - * We can't just use a direct branch to slb_miss_realmode - * because the distance from here to there depends on where - * the kernel ends up being put. 
- */ - mfctr r11 - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_VIRT_END(data_access_slb, 0x4380, 0x80) TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) @@ -560,7 +541,7 @@ EXC_COMMON_BEGIN(instruction_access_common) RECONCILE_IRQ_STATE(r10, r11) ld r12,_MSR(r1) ld r3,_NIP(r1) - andis. r4,r12,0x5820 + andis. r4,r12,DSISR_BAD_FAULT_64S@h li r5,0x400 std r3,_DAR(r1) std r4,_DSISR(r1) @@ -575,88 +556,82 @@ EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crclr 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - mfctr r11 - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_REAL_END(instruction_access_slb, 0x480, 0x80) EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crclr 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - mfctr r11 - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) TRAMP_KVM(PACA_EXSLB, 0x480) -/* This handler is used by both 0x380 and 0x480 slb miss interrupts */ -EXC_COMMON_BEGIN(slb_miss_realmode) +/* + * This handler is used by the 0x380 and 0x480 SLB miss interrupts, as well as + * the virtual mode 0x4380 and 0x4480 interrupts if AIL is enabled. + */ +EXC_COMMON_BEGIN(slb_miss_common) /* * r13 points to the PACA, r9 contains the saved CR, - * r12 contain the saved SRR1, SRR0 is still ready for return + * r12 contains the saved r3, + * r11 contain the saved SRR1, SRR0 is still ready for return * r3 has the faulting address * r9 - r13 are saved in paca->exslb. - * r3 is saved in paca->slb_r3 * cr6.eq is set for a D-SLB miss, clear for a I-SLB miss * We assume we aren't going to take any exceptions during this * procedure. */ mflr r10 -#ifdef CONFIG_RELOCATABLE - mtctr r11 -#endif - stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ - std r3,PACA_EXSLB+EX_DAR(r13) + + /* + * Test MSR_RI before calling slb_allocate_realmode, because the + * MSR in r11 gets clobbered. However we still want to allocate + * SLB in case MSR_RI=0, to minimise the risk of getting stuck in + * recursive SLB faults. So use cr5 for this, which is preserved. + */ + andi. r11,r11,MSR_RI /* check for unrecoverable exception */ + cmpdi cr5,r11,MSR_RI crset 4*cr0+eq #ifdef CONFIG_PPC_STD_MMU_64 BEGIN_MMU_FTR_SECTION - bl slb_allocate_realmode + bl slb_allocate END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) #endif ld r10,PACA_EXSLB+EX_LR(r13) - ld r3,PACA_EXSLB+EX_R3(r13) lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ mtlr r10 - beq 8f /* if bad address, make full stack frame */ + beq- 8f /* if bad address, make full stack frame */ - andi. r10,r12,MSR_RI /* check for unrecoverable exception */ - beq- 2f + bne- cr5,2f /* if unrecoverable exception, oops */ /* All done -- return from exception. 
*/ .machine push .machine "power4" mtcrf 0x80,r9 + mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ mtcrf 0x02,r9 /* I/D indication is in cr6 */ mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ .machine pop + RESTORE_CTR(r9, PACA_EXSLB) RESTORE_PPR_PACA(PACA_EXSLB, r9) + mr r3,r12 ld r9,PACA_EXSLB+EX_R9(r13) ld r10,PACA_EXSLB+EX_R10(r13) ld r11,PACA_EXSLB+EX_R11(r13) @@ -665,7 +640,10 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) rfid b . /* prevent speculative execution */ -2: mfspr r11,SPRN_SRR0 +2: std r3,PACA_EXSLB+EX_DAR(r13) + mr r3,r12 + mfspr r11,SPRN_SRR0 + mfspr r12,SPRN_SRR1 LOAD_HANDLER(r10,unrecov_slb) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) @@ -673,7 +651,10 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) rfid b . -8: mfspr r11,SPRN_SRR0 +8: std r3,PACA_EXSLB+EX_DAR(r13) + mr r3,r12 + mfspr r11,SPRN_SRR0 + mfspr r12,SPRN_SRR1 LOAD_HANDLER(r10,bad_addr_slb) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) @@ -821,46 +802,81 @@ EXC_VIRT(trap_0b, 0x4b00, 0x100, 0xb00) TRAMP_KVM(PACA_EXGEN, 0xb00) EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) +/* + * system call / hypercall (0xc00, 0x4c00) + * + * The system call exception is invoked with "sc 0" and does not alter HV bit. + * There is support for kernel code to invoke system calls but there are no + * in-tree users. + * + * The hypercall is invoked with "sc 1" and sets HV=1. + * + * In HPT, sc 1 always goes to 0xc00 real mode. In RADIX, sc 1 can go to + * 0x4c00 virtual mode. + * + * Call convention: + * + * syscall register convention is in Documentation/powerpc/syscall64-abi.txt + * + * For hypercalls, the register convention is as follows: + * r0 volatile + * r1-2 nonvolatile + * r3 volatile parameter and return value for status + * r4-r10 volatile input and output value + * r11 volatile hypercall number and output value + * r12 volatile input and output value + * r13-r31 nonvolatile + * LR nonvolatile + * CTR volatile + * XER volatile + * CR0-1 CR5-7 volatile + * CR2-4 nonvolatile + * Other registers nonvolatile + * + * The intersection of volatile registers that don't contain possible + * inputs is: cr0, xer, ctr. We may use these as scratch regs upon entry + * without saving, though xer is not a good idea to use, as hardware may + * interpret some bits so it may be costly to change them. + */ #ifdef CONFIG_KVM_BOOK3S_64_HANDLER - /* - * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems - * that support it) before changing to HMT_MEDIUM. That allows the KVM - * code to save that value into the guest state (it is the guest's PPR - * value). Otherwise just change to HMT_MEDIUM as userspace has - * already saved the PPR. - */ + /* + * There is a little bit of juggling to get syscall and hcall + * working well. Save r13 in ctr to avoid using SPRG scratch + * register. + * + * Userspace syscalls have already saved the PPR, hcalls must save + * it before setting HMT_MEDIUM. 
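
For reference, a minimal user-space sketch (not part of the patch) of the "sc 0" convention referenced above: syscall number in r0, arguments in r3-r8, result in r3, cr0.SO set on error. The __NR_getpid value of 20 and the clobber list are assumptions, not taken from the patch.

/* Illustrative only: raw "sc 0" system call on powerpc64. */
#include <stdio.h>

static long raw_syscall0(long nr)
{
	register long r0 asm("r0") = nr;	/* syscall number */
	register long r3 asm("r3");		/* return value */

	asm volatile("sc"
		     : "=r" (r3), "+r" (r0)
		     :
		     : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12",
		       "cr0", "ctr", "xer", "memory");
	return r3;				/* cr0.SO would signal an error */
}

int main(void)
{
	printf("getpid() via sc 0: %ld\n", raw_syscall0(20));	/* assumed __NR_getpid */
	return 0;
}
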
+ */ #define SYSCALL_KVMTEST \ - SET_SCRATCH0(r13); \ + mtctr r13; \ GET_PACA(r13); \ - std r9,PACA_EXGEN+EX_R9(r13); \ - OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); \ - HMT_MEDIUM; \ std r10,PACA_EXGEN+EX_R10(r13); \ - OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR); \ - mfcr r9; \ - KVMTEST_PR(0xc00); \ - GET_SCRATCH0(r13) + KVMTEST_PR(0xc00); /* uses r10, branch to do_kvm_0xc00_system_call */ \ + HMT_MEDIUM; \ + mfctr r9; #else #define SYSCALL_KVMTEST \ - HMT_MEDIUM + HMT_MEDIUM; \ + mr r9,r13; \ + GET_PACA(r13); #endif #define LOAD_SYSCALL_HANDLER(reg) \ __LOAD_HANDLER(reg, system_call_common) -/* Syscall routine is used twice, in reloc-off and reloc-on paths */ -#define SYSCALL_PSERIES_1 \ +#define SYSCALL_FASTENDIAN_TEST \ BEGIN_FTR_SECTION \ cmpdi r0,0x1ebe ; \ beq- 1f ; \ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ - mr r9,r13 ; \ - GET_PACA(r13) ; \ - mfspr r11,SPRN_SRR0 ; \ -0: -#define SYSCALL_PSERIES_2_RFID \ +/* + * After SYSCALL_KVMTEST, we reach here with PACA in r13, r13 in r9, + * and HMT_MEDIUM. + */ +#define SYSCALL_REAL \ + mfspr r11,SPRN_SRR0 ; \ mfspr r12,SPRN_SRR1 ; \ LOAD_SYSCALL_HANDLER(r10) ; \ mtspr SPRN_SRR0,r10 ; \ @@ -869,11 +885,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ rfid ; \ b . ; /* prevent speculative execution */ -#define SYSCALL_PSERIES_3 \ +#define SYSCALL_FASTENDIAN \ /* Fast LE/BE switch system call */ \ 1: mfspr r12,SPRN_SRR1 ; \ xori r12,r12,MSR_LE ; \ mtspr SPRN_SRR1,r12 ; \ + mr r13,r9 ; \ rfid ; /* return to userspace */ \ b . ; /* prevent speculative execution */ @@ -882,16 +899,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ * We can't branch directly so we do it via the CTR which * is volatile across system calls. */ -#define SYSCALL_PSERIES_2_DIRECT \ - LOAD_SYSCALL_HANDLER(r12) ; \ - mtctr r12 ; \ +#define SYSCALL_VIRT \ + LOAD_SYSCALL_HANDLER(r10) ; \ + mtctr r10 ; \ + mfspr r11,SPRN_SRR0 ; \ mfspr r12,SPRN_SRR1 ; \ li r10,MSR_RI ; \ mtmsrd r10,1 ; \ bctr ; #else /* We can branch directly */ -#define SYSCALL_PSERIES_2_DIRECT \ +#define SYSCALL_VIRT \ + mfspr r11,SPRN_SRR0 ; \ mfspr r12,SPRN_SRR1 ; \ li r10,MSR_RI ; \ mtmsrd r10,1 ; /* Set RI (EE=0) */ \ @@ -899,20 +918,42 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ #endif EXC_REAL_BEGIN(system_call, 0xc00, 0x100) - SYSCALL_KVMTEST - SYSCALL_PSERIES_1 - SYSCALL_PSERIES_2_RFID - SYSCALL_PSERIES_3 + SYSCALL_KVMTEST /* loads PACA into r13, and saves r13 to r9 */ + SYSCALL_FASTENDIAN_TEST + SYSCALL_REAL + SYSCALL_FASTENDIAN EXC_REAL_END(system_call, 0xc00, 0x100) EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100) - SYSCALL_KVMTEST - SYSCALL_PSERIES_1 - SYSCALL_PSERIES_2_DIRECT - SYSCALL_PSERIES_3 + SYSCALL_KVMTEST /* loads PACA into r13, and saves r13 to r9 */ + SYSCALL_FASTENDIAN_TEST + SYSCALL_VIRT + SYSCALL_FASTENDIAN EXC_VIRT_END(system_call, 0x4c00, 0x100) -TRAMP_KVM(PACA_EXGEN, 0xc00) +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER + /* + * This is a hcall, so register convention is as above, with these + * differences: + * r13 = PACA + * ctr = orig r13 + * orig r10 saved in PACA + */ +TRAMP_KVM_BEGIN(do_kvm_0xc00) + /* + * Save the PPR (on systems that support it) before changing to + * HMT_MEDIUM. That allows the KVM code to save that value into the + * guest state (it is the guest's PPR value). 
+ */ + OPT_GET_SPR(r10, SPRN_PPR, CPU_FTR_HAS_PPR) + HMT_MEDIUM + OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r10, CPU_FTR_HAS_PPR) + mfctr r10 + SET_SCRATCH0(r10) + std r9,PACA_EXGEN+EX_R9(r13) + mfcr r9 + KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00) +#endif EXC_REAL(single_step, 0xd00, 0x100) @@ -1273,6 +1314,39 @@ EXC_REAL_NONE(0x1800, 0x100) EXC_VIRT_NONE(0x5800, 0x100) #endif +#ifdef CONFIG_PPC_WATCHDOG + +#define MASKED_DEC_HANDLER_LABEL 3f + +#define MASKED_DEC_HANDLER(_H) \ +3: /* soft-nmi */ \ + std r12,PACA_EXGEN+EX_R12(r13); \ + GET_SCRATCH0(r10); \ + std r10,PACA_EXGEN+EX_R13(r13); \ + EXCEPTION_PROLOG_PSERIES_1(soft_nmi_common, _H) + +/* + * Branch to soft_nmi_interrupt using the emergency stack. The emergency + * stack is one that is usable by maskable interrupts so long as MSR_EE + * remains off. It is used for recovery when something has corrupted the + * normal kernel stack, for example. The "soft NMI" must not use the process + * stack because we want irq disabled sections to avoid touching the stack + * at all (other than PMU interrupts), so use the emergency stack for this, + * and run it entirely with interrupts hard disabled. + */ +EXC_COMMON_BEGIN(soft_nmi_common) + mr r10,r1 + ld r1,PACAEMERGSP(r13) + subi r1,r1,INT_FRAME_SIZE + EXCEPTION_COMMON_NORET_STACK(PACA_EXGEN, 0x900, + system_reset, soft_nmi_interrupt, + ADD_NVGPRS;ADD_RECONCILE) + b ret_from_except + +#else /* CONFIG_PPC_WATCHDOG */ +#define MASKED_DEC_HANDLER_LABEL 2f /* normal return */ +#define MASKED_DEC_HANDLER(_H) +#endif /* CONFIG_PPC_WATCHDOG */ /* * An interrupt came in while soft-disabled. We set paca->irq_happened, then: @@ -1295,22 +1369,20 @@ masked_##_H##interrupt: \ lis r10,0x7fff; \ ori r10,r10,0xffff; \ mtspr SPRN_DEC,r10; \ - b 2f; \ -1: cmpwi r10,PACA_IRQ_DBELL; \ - beq 2f; \ - cmpwi r10,PACA_IRQ_HMI; \ - beq 2f; \ + b MASKED_DEC_HANDLER_LABEL; \ +1: andi. r10,r10,(PACA_IRQ_DBELL|PACA_IRQ_HMI); \ + bne 2f; \ mfspr r10,SPRN_##_H##SRR1; \ - rldicl r10,r10,48,1; /* clear MSR_EE */ \ - rotldi r10,r10,16; \ + xori r10,r10,MSR_EE; /* clear MSR_EE */ \ mtspr SPRN_##_H##SRR1,r10; \ 2: mtcrf 0x80,r9; \ ld r9,PACA_EXGEN+EX_R9(r13); \ ld r10,PACA_EXGEN+EX_R10(r13); \ ld r11,PACA_EXGEN+EX_R11(r13); \ - GET_SCRATCH0(r13); \ + /* returns to kernel where r13 must be set up, so don't restore it */ \ ##_H##rfid; \ - b . + b .; \ + MASKED_DEC_HANDLER(_H) /* * Real mode exceptions actually use this too, but alternate @@ -1410,8 +1482,10 @@ USE_TEXT_SECTION() */ .balign IFETCH_ALIGN_BYTES do_hash_page: -#ifdef CONFIG_PPC_STD_MMU_64 - andis. r0,r4,0xa450 /* weird error? */ + #ifdef CONFIG_PPC_STD_MMU_64 + lis r0,DSISR_BAD_FAULT_64S@h + ori r0,r0,DSISR_BAD_FAULT_64S@l + and. r0,r4,r0 /* weird error? */ bne- handle_page_fault /* if not, try to insert a HPTE */ CURRENT_THREAD_INFO(r11, r1) lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ @@ -1553,6 +1627,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) 1: addi r3,r1,STACK_FRAME_OVERHEAD bl kernel_bad_stack b 1b +_ASM_NOKPROBE_SYMBOL(bad_stack); + +/* + * When doorbell is triggered from system reset wakeup, the message is + * not cleared, so it would fire again when EE is enabled. + * + * When coming from local_irq_enable, there may be the same problem if + * we were hard disabled. + * + * Execute msgclr to clear pending exceptions before handling it. 
+ */ +h_doorbell_common_msgclr: + LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) + PPC_MSGCLR(3) + b h_doorbell_common + +doorbell_super_common_msgclr: + LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) + PPC_MSGCLRP(3) + b doorbell_super_common /* * Called from arch_local_irq_enable when an interrupt needs @@ -1563,6 +1657,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) * Note: While MSR:EE is off, we need to make sure that _MSR * in the generated frame has EE set to 1 or the exception * handler will not properly re-enable them. + * + * Note that we don't specify LR as the NIP (return address) for + * the interrupt because that would unbalance the return branch + * predictor. */ _GLOBAL(__replay_interrupt) /* We are going to jump to the exception common code which @@ -1570,22 +1668,27 @@ _GLOBAL(__replay_interrupt) * we don't give a damn about, so we don't bother storing them. */ mfmsr r12 - mflr r11 + LOAD_REG_ADDR(r11, replay_interrupt_return) mfcr r9 ori r12,r12,MSR_EE cmpwi r3,0x900 beq decrementer_common cmpwi r3,0x500 - beq hardware_interrupt_common BEGIN_FTR_SECTION - cmpwi r3,0xe80 - beq h_doorbell_common - cmpwi r3,0xea0 beq h_virt_irq_common +FTR_SECTION_ELSE + beq hardware_interrupt_common +ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_300) +BEGIN_FTR_SECTION + cmpwi r3,0xa00 + beq h_doorbell_common_msgclr cmpwi r3,0xe60 beq hmi_exception_common FTR_SECTION_ELSE cmpwi r3,0xa00 - beq doorbell_super_common + beq doorbell_super_common_msgclr ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) +replay_interrupt_return: blr + +_ASM_NOKPROBE_SYMBOL(__replay_interrupt) diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 466569e26278..e1431800bfb9 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -113,11 +113,62 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, return 1; } +/* + * If fadump is registered, check if the memory provided + * falls within boot memory area. + */ +int is_fadump_boot_memory_area(u64 addr, ulong size) +{ + if (!fw_dump.dump_registered) + return 0; + + return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size; +} + +int should_fadump_crash(void) +{ + if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr) + return 0; + return 1; +} + int is_fadump_active(void) { return fw_dump.dump_active; } +/* + * Returns 1, if there are no holes in boot memory area, + * 0 otherwise. + */ +static int is_boot_memory_area_contiguous(void) +{ + struct memblock_region *reg; + unsigned long tstart, tend; + unsigned long start_pfn = PHYS_PFN(RMA_START); + unsigned long end_pfn = PHYS_PFN(RMA_START + fw_dump.boot_memory_size); + unsigned int ret = 0; + + for_each_memblock(memory, reg) { + tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); + tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); + if (tstart < tend) { + /* Memory hole from start_pfn to tstart */ + if (tstart > start_pfn) + break; + + if (tend == end_pfn) { + ret = 1; + break; + } + + start_pfn = tend + 1; + } + } + + return ret; +} + /* Print firmware assisted dump configurations for debugging purpose. */ static void fadump_show_config(void) { @@ -212,20 +263,46 @@ static inline unsigned long fadump_calculate_reserve_size(void) int ret; unsigned long long base, size; + if (fw_dump.reserve_bootvar) + pr_warn("'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter.\n"); + /* * Check if the size is specified through crashkernel= cmdline - * option. 
If yes, then use that but ignore base as fadump - * reserves memory at end of RAM. + * option. If yes, then use that but ignore base as fadump reserves + * memory at a predefined offset. */ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &size, &base); if (ret == 0 && size > 0) { + unsigned long max_size; + + if (fw_dump.reserve_bootvar) + pr_info("Using 'crashkernel=' parameter for memory reservation.\n"); + fw_dump.reserve_bootvar = (unsigned long)size; + + /* + * Adjust if the boot memory size specified is above + * the upper limit. + */ + max_size = memblock_phys_mem_size() / MAX_BOOT_MEM_RATIO; + if (fw_dump.reserve_bootvar > max_size) { + fw_dump.reserve_bootvar = max_size; + pr_info("Adjusted boot memory size to %luMB\n", + (fw_dump.reserve_bootvar >> 20)); + } + + return fw_dump.reserve_bootvar; + } else if (fw_dump.reserve_bootvar) { + /* + * 'fadump_reserve_mem=' is being used to reserve memory + * for firmware-assisted dump. + */ return fw_dump.reserve_bootvar; } /* divide by 20 to get 5% of value */ - size = memblock_end_of_DRAM() / 20; + size = memblock_phys_mem_size() / 20; /* round it down in multiples of 256 */ size = size & ~0x0FFFFFFFUL; @@ -377,9 +454,22 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); -static void register_fw_dump(struct fadump_mem_struct *fdm) +/* + * Look for fadump_reserve_mem= cmdline option + * TODO: Remove references to 'fadump_reserve_mem=' parameter, + * the sooner 'crashkernel=' parameter is accustomed to. + */ +static int __init early_fadump_reserve_mem(char *p) { - int rc; + if (p) + fw_dump.reserve_bootvar = memparse(p, &p); + return 0; +} +early_param("fadump_reserve_mem", early_fadump_reserve_mem); + +static int register_fw_dump(struct fadump_mem_struct *fdm) +{ + int rc, err; unsigned int wait_time; pr_debug("Registering for firmware-assisted kernel dump...\n"); @@ -396,26 +486,38 @@ static void register_fw_dump(struct fadump_mem_struct *fdm) } while (wait_time); + err = -EIO; switch (rc) { + default: + pr_err("Failed to register. Unknown Error(%d).\n", rc); + break; case -1: printk(KERN_ERR "Failed to register firmware-assisted kernel" " dump. Hardware Error(%d).\n", rc); break; case -3: + if (!is_boot_memory_area_contiguous()) + pr_err("Can't have holes in boot memory area while " + "registering fadump\n"); + printk(KERN_ERR "Failed to register firmware-assisted kernel" " dump. Parameter Error(%d).\n", rc); + err = -EINVAL; break; case -9: printk(KERN_ERR "firmware-assisted kernel dump is already " " registered."); fw_dump.dump_registered = 1; + err = -EEXIST; break; case 0: printk(KERN_INFO "firmware-assisted kernel dump registration" " is successful\n"); fw_dump.dump_registered = 1; + err = 0; break; } + return err; } void crash_fadump(struct pt_regs *regs, const char *str) @@ -423,7 +525,7 @@ void crash_fadump(struct pt_regs *regs, const char *str) struct fadump_crash_info_header *fdh = NULL; int old_cpu, this_cpu; - if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr) + if (!should_fadump_crash()) return; /* @@ -831,8 +933,19 @@ static void fadump_setup_crash_memory_ranges(void) for_each_memblock(memory, reg) { start = (unsigned long long)reg->base; end = start + (unsigned long long)reg->size; - if (start == RMA_START && end >= fw_dump.boot_memory_size) - start = fw_dump.boot_memory_size; + + /* + * skip the first memory chunk that is already added (RMA_START + * through boot_memory_size). 
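
A minimal C sketch (not part of the patch) of the first-chunk skip that the comment above describes: memblock ranges below boot_memory_size are either clipped or skipped entirely, since that region is already covered by the RMA_START chunk.

#include <stdio.h>

struct range { unsigned long long start, end; };

/* Returns 0 if the block lies entirely inside the first (already added) chunk. */
static int clip_below_boot_mem(struct range *r, unsigned long long boot_memory_size)
{
	if (r->start < boot_memory_size) {
		if (r->end <= boot_memory_size)
			return 0;			/* fully covered: skip */
		r->start = boot_memory_size;		/* keep only the part above it */
	}
	return 1;
}

int main(void)
{
	struct range r = { 0, 0x40000000ULL };		/* 0 .. 1 GiB memblock */
	unsigned long long boot_mem = 0x10000000ULL;	/* 256 MiB boot memory */

	if (clip_below_boot_mem(&r, boot_mem))
		printf("add range 0x%llx .. 0x%llx\n", r.start, r.end);
	return 0;
}
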
This logic needs a relook if and + * when RMA_START changes to a non-zero value. + */ + BUILD_BUG_ON(RMA_START != 0); + if (start < fw_dump.boot_memory_size) { + if (end > fw_dump.boot_memory_size) + start = fw_dump.boot_memory_size; + else + continue; + } /* add this range excluding the reserved dump area. */ fadump_exclude_reserved_area(start, end); @@ -893,8 +1006,7 @@ static int fadump_create_elfcore_headers(char *bufp) phdr->p_paddr = fadump_relocate(paddr_vmcoreinfo_note()); phdr->p_offset = phdr->p_paddr; - phdr->p_memsz = vmcoreinfo_max_size; - phdr->p_filesz = vmcoreinfo_max_size; + phdr->p_memsz = phdr->p_filesz = VMCOREINFO_NOTE_SIZE; /* Increment number of program headers. */ (elf->e_phnum)++; @@ -956,7 +1068,7 @@ static unsigned long init_fadump_header(unsigned long addr) return addr; } -static void register_fadump(void) +static int register_fadump(void) { unsigned long addr; void *vaddr; @@ -966,7 +1078,7 @@ static void register_fadump(void) * assisted dump. */ if (!fw_dump.reserve_dump_area_size) - return; + return -ENODEV; fadump_setup_crash_memory_ranges(); @@ -979,7 +1091,7 @@ static void register_fadump(void) fadump_create_elfcore_headers(vaddr); /* register the future kernel dump with firmware. */ - register_fw_dump(&fdm); + return register_fw_dump(&fdm); } static int fadump_unregister_dump(struct fadump_mem_struct *fdm) @@ -1046,28 +1158,71 @@ void fadump_cleanup(void) } } +static void fadump_free_reserved_memory(unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long pfn; + unsigned long time_limit = jiffies + HZ; + + pr_info("freeing reserved memory (0x%llx - 0x%llx)\n", + PFN_PHYS(start_pfn), PFN_PHYS(end_pfn)); + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + free_reserved_page(pfn_to_page(pfn)); + + if (time_after(jiffies, time_limit)) { + cond_resched(); + time_limit = jiffies + HZ; + } + } +} + +/* + * Skip memory holes and free memory that was actually reserved. + */ +static void fadump_release_reserved_area(unsigned long start, unsigned long end) +{ + struct memblock_region *reg; + unsigned long tstart, tend; + unsigned long start_pfn = PHYS_PFN(start); + unsigned long end_pfn = PHYS_PFN(end); + + for_each_memblock(memory, reg) { + tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); + tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); + if (tstart < tend) { + fadump_free_reserved_memory(tstart, tend); + + if (tend == end_pfn) + break; + + start_pfn = tend + 1; + } + } +} + /* * Release the memory that was reserved in early boot to preserve the memory * contents. The released memory will be available for general use. */ static void fadump_release_memory(unsigned long begin, unsigned long end) { - unsigned long addr; unsigned long ra_start, ra_end; ra_start = fw_dump.reserve_dump_area_start; ra_end = ra_start + fw_dump.reserve_dump_area_size; - for (addr = begin; addr < end; addr += PAGE_SIZE) { - /* - * exclude the dump reserve area. Will reuse it for next - * fadump registration. - */ - if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start)) - continue; - - free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); - } + /* + * exclude the dump reserve area. Will reuse it for next + * fadump registration. 
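
A small stand-alone sketch (not part of the patch) of how fadump_release_memory() now splits a release request around the reserved dump area; release() here stands in for fadump_release_reserved_area().

#include <stdio.h>

static void release(unsigned long start, unsigned long end)
{
	printf("release 0x%lx .. 0x%lx\n", start, end);
}

static void release_excluding(unsigned long begin, unsigned long end,
			      unsigned long ra_start, unsigned long ra_end)
{
	if (begin < ra_end && end > ra_start) {		/* overlaps the reserved area */
		if (begin < ra_start)
			release(begin, ra_start);	/* part below the reserved area */
		if (end > ra_end)
			release(ra_end, end);		/* part above the reserved area */
	} else {
		release(begin, end);			/* no overlap: release it all */
	}
}

int main(void)
{
	release_excluding(0, 0x8000000, 0x1000000, 0x2000000);
	return 0;
}
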
+ */ + if (begin < ra_end && end > ra_start) { + if (begin < ra_start) + fadump_release_reserved_area(begin, ra_start); + if (end > ra_end) + fadump_release_reserved_area(ra_end, end); + } else + fadump_release_reserved_area(begin, end); } static void fadump_invalidate_release_mem(void) @@ -1161,7 +1316,6 @@ static ssize_t fadump_register_store(struct kobject *kobj, switch (buf[0]) { case '0': if (fw_dump.dump_registered == 0) { - ret = -EINVAL; goto unlock_out; } /* Un-register Firmware-assisted dump */ @@ -1169,11 +1323,11 @@ static ssize_t fadump_register_store(struct kobject *kobj, break; case '1': if (fw_dump.dump_registered == 1) { - ret = -EINVAL; + ret = -EEXIST; goto unlock_out; } /* Register Firmware-assisted dump */ - register_fadump(); + ret = register_fadump(); break; default: ret = -EINVAL; @@ -1299,6 +1453,25 @@ static void fadump_init_files(void) return; } +static int fadump_panic_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + /* + * If firmware-assisted dump has been registered then trigger + * firmware-assisted dump and let firmware handle everything + * else. If this returns, then fadump was not registered, so + * go through the rest of the panic path. + */ + crash_fadump(NULL, ptr); + + return NOTIFY_DONE; +} + +static struct notifier_block fadump_panic_block = { + .notifier_call = fadump_panic_event, + .priority = INT_MIN /* may not return; must be done last */ +}; + /* * Prepare for firmware-assisted dump. */ @@ -1331,6 +1504,9 @@ int __init setup_fadump(void) init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); fadump_init_files(); + atomic_notifier_chain_register(&panic_notifier_list, + &fadump_panic_block); + return 1; } subsys_initcall(setup_fadump); diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index e22734278458..8c54166491e7 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -388,7 +388,7 @@ DataAccess: EXCEPTION_PROLOG mfspr r10,SPRN_DSISR stw r10,_DSISR(r11) - andis. r0,r10,0xa470 /* weird error? */ + andis. r0,r10,DSISR_BAD_FAULT_32S@h bne 1f /* if not, try to put a PTE */ mfspr r4,SPRN_DAR /* into the hash table */ rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */ @@ -403,13 +403,13 @@ DataAccess: DO_KVM 0x400 InstructionAccess: EXCEPTION_PROLOG - andis. r0,r9,0x4000 /* no pte found? */ + andis. r0,r9,SRR1_ISI_NOPT@h /* no pte found? */ beq 1f /* if so, try to put a PTE */ li r3,0 /* into the hash table */ mr r4,r12 /* SRR0 is fault address */ bl hash_page 1: mr r4,r12 - mr r5,r9 + andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */ EXC_XFER_LITE(0x400, handle_page_fault) /* External interrupt */ diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S index 0ddc602b33a4..ff8511d6d8ea 100644 --- a/arch/powerpc/kernel/head_64.S +++ b/arch/powerpc/kernel/head_64.S @@ -92,13 +92,13 @@ END_FTR_SECTION(0, 1) .balign 8 .globl __secondary_hold_spinloop __secondary_hold_spinloop: - .llong 0x0 + .8byte 0x0 /* Secondary processors write this value with their cpu # */ /* after they enter the spin loop immediately below. 
*/ .globl __secondary_hold_acknowledge __secondary_hold_acknowledge: - .llong 0x0 + .8byte 0x0 #ifdef CONFIG_RELOCATABLE /* This flag is set to 1 by a loader if the kernel should run @@ -650,7 +650,7 @@ __after_prom_start: bctr .balign 8 -p_end: .llong _end - copy_to_here +p_end: .8byte _end - copy_to_here 4: /* @@ -892,7 +892,7 @@ _GLOBAL(relative_toc) blr .balign 8 -p_toc: .llong __toc_start + 0x8000 - 0b +p_toc: .8byte __toc_start + 0x8000 - 0b /* * This is where the main kernel code starts. diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index c032fe8c2d26..4fee00d414e8 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -50,18 +50,20 @@ mtspr spr, reg #endif -/* Macro to test if an address is a kernel address */ #if CONFIG_TASK_SIZE <= 0x80000000 && CONFIG_PAGE_OFFSET >= 0x80000000 -#define IS_KERNEL(tmp, addr) \ - andis. tmp, addr, 0x8000 /* Address >= 0x80000000 */ -#define BRANCH_UNLESS_KERNEL(label) beq label -#else -#define IS_KERNEL(tmp, addr) \ - rlwinm tmp, addr, 16, 16, 31; \ - cmpli cr0, tmp, PAGE_OFFSET >> 16 -#define BRANCH_UNLESS_KERNEL(label) blt label +/* By simply checking Address >= 0x80000000, we know if its a kernel address */ +#define SIMPLE_KERNEL_ADDRESS 1 #endif +/* + * We need an ITLB miss handler for kernel addresses if: + * - Either we have modules + * - Or we have not pinned the first 8M + */ +#if defined(CONFIG_MODULES) || !defined(CONFIG_PIN_TLB_TEXT) || \ + defined(CONFIG_DEBUG_PAGEALLOC) +#define ITLB_MISS_KERNEL 1 +#endif /* * Value for the bits that have fixed value in RPN entries. @@ -123,7 +125,6 @@ turn_on_mmu: lis r0,start_here@h ori r0,r0,start_here@l mtspr SPRN_SRR0,r0 - SYNC rfi /* enables MMU */ /* @@ -170,7 +171,7 @@ turn_on_mmu: stw r1,0(r11); \ tovirt(r1,r11); /* set new kernel sp */ \ li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR); /* can take exceptions */ \ - MTMSRD(r10); /* (except for mach check in rtas) */ \ + mtmsr r10; \ stw r0,GPR0(r11); \ SAVE_4GPRS(3, r11); \ SAVE_2GPRS(7, r11) @@ -300,7 +301,7 @@ SystemCall: /* On the MPC8xx, this is a software emulation interrupt. It occurs * for all unimplemented and illegal instructions. */ - EXCEPTION(0x1000, SoftEmu, SoftwareEmulation, EXC_XFER_STD) + EXCEPTION(0x1000, SoftEmu, program_check_exception, EXC_XFER_STD) . = 0x1100 /* @@ -325,7 +326,7 @@ SystemCall: #endif InstructionTLBMiss: -#if defined(CONFIG_8xx_CPU6) || defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) || defined (CONFIG_HUGETLB_PAGE) +#if defined(CONFIG_8xx_CPU6) || defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE) mtspr SPRN_SPRG_SCRATCH2, r3 #endif EXCEPTION_PROLOG_0 @@ -343,15 +344,32 @@ InstructionTLBMiss: INVALIDATE_ADJACENT_PAGES_CPU15(r11, r10) /* Only modules will cause ITLB Misses as we always * pin the first 8MB of kernel memory */ -#if defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) || defined (CONFIG_HUGETLB_PAGE) +#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE) mfcr r3 #endif -#if defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) - IS_KERNEL(r11, r10) +#ifdef ITLB_MISS_KERNEL +#if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT) + andis. 
r11, r10, 0x8000 /* Address >= 0x80000000 */ +#else + rlwinm r11, r10, 16, 0xfff8 + cmpli cr0, r11, PAGE_OFFSET@h +#ifndef CONFIG_PIN_TLB_TEXT + /* It is assumed that kernel code fits into the first 8M page */ +_ENTRY(ITLBMiss_cmp) + cmpli cr7, r11, (PAGE_OFFSET + 0x0800000)@h +#endif +#endif #endif mfspr r11, SPRN_M_TW /* Get level 1 table */ -#if defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) - BRANCH_UNLESS_KERNEL(3f) +#ifdef ITLB_MISS_KERNEL +#if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT) + beq+ 3f +#else + blt+ 3f +#endif +#ifndef CONFIG_PIN_TLB_TEXT + blt cr7, ITLBMissLinear +#endif lis r11, (swapper_pg_dir-PAGE_OFFSET)@ha 3: #endif @@ -369,7 +387,7 @@ InstructionTLBMiss: rlwimi r10, r11, 0, 0, 32 - PAGE_SHIFT - 1 /* Add level 2 base */ lwz r10, 0(r10) /* Get the pte */ 4: -#if defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) || defined (CONFIG_HUGETLB_PAGE) +#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE) mtcr r3 #endif /* Insert the APG into the TWC from the Linux PTE. */ @@ -400,7 +418,7 @@ InstructionTLBMiss: MTSPR_CPU6(SPRN_MI_RPN, r10, r3) /* Update TLB entry */ /* Restore registers */ -#if defined(CONFIG_8xx_CPU6) || defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) || defined (CONFIG_HUGETLB_PAGE) +#if defined(CONFIG_8xx_CPU6) || defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE) mfspr r3, SPRN_SPRG_SCRATCH2 #endif EXCEPTION_EPILOG_0 @@ -447,23 +465,23 @@ DataStoreTLBMiss: * kernel page tables. */ mfspr r10, SPRN_MD_EPN - rlwinm r10, r10, 16, 0xfff8 - cmpli cr0, r10, PAGE_OFFSET@h + rlwinm r11, r10, 16, 0xfff8 + cmpli cr0, r11, PAGE_OFFSET@h mfspr r11, SPRN_M_TW /* Get level 1 table */ blt+ 3f + rlwinm r11, r10, 16, 0xfff8 #ifndef CONFIG_PIN_TLB_IMMR - cmpli cr0, r10, VIRT_IMMR_BASE@h + cmpli cr0, r11, VIRT_IMMR_BASE@h #endif _ENTRY(DTLBMiss_cmp) - cmpli cr7, r10, (PAGE_OFFSET + 0x1800000)@h - lis r11, (swapper_pg_dir-PAGE_OFFSET)@ha + cmpli cr7, r11, (PAGE_OFFSET + 0x1800000)@h #ifndef CONFIG_PIN_TLB_IMMR _ENTRY(DTLBMiss_jmp) beq- DTLBMissIMMR #endif blt cr7, DTLBMissLinear + lis r11, (swapper_pg_dir-PAGE_OFFSET)@ha 3: - mfspr r10, SPRN_MD_EPN /* Insert level 1 index */ rlwimi r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, 29 @@ -569,8 +587,8 @@ _ENTRY(DTLBMiss_jmp) InstructionTLBError: EXCEPTION_PROLOG mr r4,r12 - mr r5,r9 - andis. r10,r5,0x4000 + andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */ + andis. r10,r9,SRR1_ISI_NOPT@h beq+ 1f tlbie r4 itlbie: @@ -595,7 +613,7 @@ DARFixed:/* Return from dcbx instruction bug workaround */ mfspr r5,SPRN_DSISR stw r5,_DSISR(r11) mfspr r4,SPRN_DAR - andis. r10,r5,0x4000 + andis. 
r10,r5,DSISR_NOHPTE@h beq+ 1f tlbie r4 dtlbie: @@ -684,7 +702,7 @@ DTLBMissLinear: /* Set 8M byte page and mark it valid */ li r11, MD_PS8MEG | MD_SVALID MTSPR_CPU6(SPRN_MD_TWC, r11, r3) - rlwinm r10, r10, 16, 0x0f800000 /* 8xx supports max 256Mb RAM */ + rlwinm r10, r10, 0, 0x0f800000 /* 8xx supports max 256Mb RAM */ ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SHARED | _PAGE_DIRTY | \ _PAGE_PRESENT MTSPR_CPU6(SPRN_MD_RPN, r10, r11) /* Update TLB entry */ @@ -695,6 +713,22 @@ DTLBMissLinear: EXCEPTION_EPILOG_0 rfi +#ifndef CONFIG_PIN_TLB_TEXT +ITLBMissLinear: + mtcr r3 + /* Set 8M byte page and mark it valid */ + li r11, MI_PS8MEG | MI_SVALID | _PAGE_EXEC + MTSPR_CPU6(SPRN_MI_TWC, r11, r3) + rlwinm r10, r10, 0, 0x0f800000 /* 8xx supports max 256Mb RAM */ + ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_SHARED | _PAGE_DIRTY | \ + _PAGE_PRESENT + MTSPR_CPU6(SPRN_MI_RPN, r10, r11) /* Update TLB entry */ + + mfspr r3, SPRN_SPRG_SCRATCH2 + EXCEPTION_EPILOG_0 + rfi +#endif + /* This is the procedure to calculate the data EA for buggy dcbx,dcbi instructions * by decoding the registers used by the dcbx instruction and adding them. * DAR is set to the calculated address. @@ -705,9 +739,10 @@ FixupDAR:/* Entry point for dcbx workaround. */ mtspr SPRN_SPRG_SCRATCH2, r10 /* fetch instruction from memory. */ mfspr r10, SPRN_SRR0 - IS_KERNEL(r11, r10) + rlwinm r11, r10, 16, 0xfff8 + cmpli cr0, r11, PAGE_OFFSET@h mfspr r11, SPRN_M_TW /* Get level 1 table */ - BRANCH_UNLESS_KERNEL(3f) + blt+ 3f rlwinm r11, r10, 16, 0xfff8 _ENTRY(FixupDAR_cmp) cmpli cr7, r11, (PAGE_OFFSET + 0x1800000)@h @@ -915,10 +950,8 @@ start_here: rfi /* Load up the kernel context */ 2: - SYNC /* Force all PTE updates to finish */ tlbia /* Clear all TLB entries */ sync /* wait for tlbia/tlbie to finish */ - TLBSYNC /* ... on all CPUs */ /* set up the PTE pointers for the Abatron bdiGDB. */ @@ -955,15 +988,14 @@ initial_mmu: mtspr SPRN_MD_CTR, r10 /* remove PINNED DTLB entries */ tlbia /* Invalidate all TLB entries */ -/* Always pin the first 8 MB ITLB to prevent ITLB - misses while mucking around with SRR0/SRR1 in asm -*/ +#ifdef CONFIG_PIN_TLB_TEXT lis r8, MI_RSV4I@h ori r8, r8, 0x1c00 mtspr SPRN_MI_CTR, r8 /* Set instruction MMU control */ +#endif -#ifdef CONFIG_PIN_TLB +#ifdef CONFIG_PIN_TLB_DATA oris r10, r10, MD_RSV4I@h mtspr SPRN_MD_CTR, r10 /* Set data TLB control */ #endif @@ -989,6 +1021,7 @@ initial_mmu: * internal registers (among other things). */ #ifdef CONFIG_PIN_TLB_IMMR + oris r10, r10, MD_RSV4I@h ori r10, r10, 0x1c00 mtspr SPRN_MD_CTR, r10 diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 4898d676dcae..1125c9be9e06 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -30,7 +30,9 @@ * Use unused space in the interrupt stack to save and restore * registers for winkle support. */ +#define _MMCR0 GPR0 #define _SDR1 GPR3 +#define _PTCR GPR3 #define _RPR GPR4 #define _SPURR GPR5 #define _PURR GPR6 @@ -39,7 +41,7 @@ #define _AMOR GPR9 #define _WORT GPR10 #define _WORC GPR11 -#define _PTCR GPR12 +#define _LPCR GPR12 #define PSSCR_EC_ESL_MASK_SHIFTED (PSSCR_EC | PSSCR_ESL) >> 16 @@ -55,12 +57,14 @@ save_sprs_to_stack: * here since any thread in the core might wake up first */ BEGIN_FTR_SECTION - mfspr r3,SPRN_PTCR - std r3,_PTCR(r1) /* * Note - SDR1 is dropped in Power ISA v3. 
Hence not restoring * SDR1 here */ + mfspr r3,SPRN_PTCR + std r3,_PTCR(r1) + mfspr r3,SPRN_LPCR + std r3,_LPCR(r1) FTR_SECTION_ELSE mfspr r3,SPRN_SDR1 std r3,_SDR1(r1) @@ -81,7 +85,61 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) std r3,_WORT(r1) mfspr r3,SPRN_WORC std r3,_WORC(r1) +/* + * On POWER9, there are idle states such as stop4, invoked via cpuidle, + * that lose hypervisor resources. In such cases, we need to save + * additional SPRs before entering those idle states so that they can + * be restored to their older values on wakeup from the idle state. + * + * On POWER8, the only such deep idle state is winkle which is used + * only in the context of CPU-Hotplug, where these additional SPRs are + * reinitiazed to a sane value. Hence there is no need to save/restore + * these SPRs. + */ +BEGIN_FTR_SECTION + blr +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + +power9_save_additional_sprs: + mfspr r3, SPRN_PID + mfspr r4, SPRN_LDBAR + std r3, STOP_PID(r13) + std r4, STOP_LDBAR(r13) + + mfspr r3, SPRN_FSCR + mfspr r4, SPRN_HFSCR + std r3, STOP_FSCR(r13) + std r4, STOP_HFSCR(r13) + + mfspr r3, SPRN_MMCRA + mfspr r4, SPRN_MMCR1 + std r3, STOP_MMCRA(r13) + std r4, STOP_MMCR1(r13) + mfspr r3, SPRN_MMCR2 + std r3, STOP_MMCR2(r13) + blr + +power9_restore_additional_sprs: + ld r3,_LPCR(r1) + ld r4, STOP_PID(r13) + mtspr SPRN_LPCR,r3 + mtspr SPRN_PID, r4 + + ld r3, STOP_LDBAR(r13) + ld r4, STOP_FSCR(r13) + mtspr SPRN_LDBAR, r3 + mtspr SPRN_FSCR, r4 + + ld r3, STOP_HFSCR(r13) + ld r4, STOP_MMCRA(r13) + mtspr SPRN_HFSCR, r3 + mtspr SPRN_MMCRA, r4 + /* We have already restored PACA_MMCR0 */ + ld r3, STOP_MMCR1(r13) + ld r4, STOP_MMCR2(r13) + mtspr SPRN_MMCR1, r3 + mtspr SPRN_MMCR2, r4 blr /* @@ -106,13 +164,9 @@ core_idle_lock_held: /* * Pass requested state in r3: * r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8 - * - Requested STOP state in POWER9 + * - Requested PSSCR value in POWER9 * - * To check IRQ_HAPPENED in r4 - * 0 - don't check - * 1 - check - * - * Address to 'rfid' to in r5 + * Address of idle handler to branch to in realmode in r4 */ pnv_powersave_common: /* Use r3 to pass state nap/sleep/winkle */ @@ -122,37 +176,14 @@ pnv_powersave_common: * need to save PC, some CR bits and the NV GPRs, * but for now an interrupt frame will do. */ + mtctr r4 + mflr r0 std r0,16(r1) stdu r1,-INT_FRAME_SIZE(r1) std r0,_LINK(r1) std r0,_NIP(r1) - /* Hard disable interrupts */ - mfmsr r9 - rldicl r9,r9,48,1 - rotldi r9,r9,16 - mtmsrd r9,1 /* hard-disable interrupts */ - - /* Check if something happened while soft-disabled */ - lbz r0,PACAIRQHAPPENED(r13) - andi. r0,r0,~PACA_IRQ_HARD_DIS@l - beq 1f - cmpwi cr0,r4,0 - beq 1f - addi r1,r1,INT_FRAME_SIZE - ld r0,16(r1) - li r3,0 /* Return 0 (no nap) */ - mtlr r0 - blr - -1: /* We mark irqs hard disabled as this is the state we'll - * be in when returning and we need to tell arch_local_irq_restore() - * about it - */ - li r0,PACA_IRQ_HARD_DIS - stb r0,PACAIRQHAPPENED(r13) - /* We haven't lost state ... yet */ li r0,0 stb r0,PACA_NAPSTATELOST(r13) @@ -160,24 +191,42 @@ pnv_powersave_common: /* Continue saving state */ SAVE_GPR(2, r1) SAVE_NVGPRS(r1) - mfcr r4 - std r4,_CCR(r1) - std r9,_MSR(r1) + mfcr r5 + std r5,_CCR(r1) std r1,PACAR1(r13) +BEGIN_FTR_SECTION /* + * POWER9 does not require real mode to stop, and presently does not + * set hwthread_state for KVM (threads don't share MMU context), so + * we can remain in virtual mode for this. 
+ */ + bctr +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + /* + * POWER8 * Go to real mode to do the nap, as required by the architecture. * Also, we need to be in real mode before setting hwthread_state, * because as soon as we do that, another thread can switch * the MMU context to the guest. */ LOAD_REG_IMMEDIATE(r7, MSR_IDLE) - li r6, MSR_RI - andc r6, r9, r6 - mtmsrd r6, 1 /* clear RI before setting SRR0/1 */ - mtspr SPRN_SRR0, r5 - mtspr SPRN_SRR1, r7 - rfid + mtmsrd r7,0 + bctr + +/* + * This is the sequence required to execute idle instructions, as + * specified in ISA v2.07 (and earlier). MSR[IR] and MSR[DR] must be 0. + */ +#define IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST) \ + /* Magic NAP/SLEEP/WINKLE mode enter sequence */ \ + std r0,0(r1); \ + ptesync; \ + ld r0,0(r1); \ +236: cmpd cr0,r0,r0; \ + bne 236b; \ + IDLE_INST; + .globl pnv_enter_arch207_idle_mode pnv_enter_arch207_idle_mode: @@ -270,24 +319,52 @@ enter_winkle: /* * r3 - PSSCR value corresponding to the requested stop state. */ -power_enter_stop: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - /* Tell KVM we're entering idle */ +power_enter_stop_kvm_rm: + /* + * This is currently unused because POWER9 KVM does not have to + * gather secondary threads into sibling mode, but the code is + * here in case that function is required. + * + * Tell KVM we're entering idle. + */ li r4,KVM_HWTHREAD_IN_IDLE /* DO THIS IN REAL MODE! See comment above. */ stb r4,HSTATE_HWTHREAD_STATE(r13) #endif +power_enter_stop: /* * Check if we are executing the lite variant with ESL=EC=0 */ andis. r4,r3,PSSCR_EC_ESL_MASK_SHIFTED clrldi r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */ bne .Lhandle_esl_ec_set - IDLE_STATE_ENTER_SEQ(PPC_STOP) + PPC_STOP li r3,0 /* Since we didn't lose state, return 0 */ + + /* + * pnv_wakeup_noloss() expects r12 to contain the SRR1 value so + * it can determine if the wakeup reason is an HMI in + * CHECK_HMI_INTERRUPT. + * + * However, when we wakeup with ESL=0, SRR1 will not contain the wakeup + * reason, so there is no point setting r12 to SRR1. + * + * Further, we clear r12 here, so that we don't accidentally enter the + * HMI in pnv_wakeup_noloss() if the value of r12[42:45] == WAKE_HMI. + */ + li r12, 0 b pnv_wakeup_noloss .Lhandle_esl_ec_set: + /* + * POWER9 DD2 can incorrectly set PMAO when waking up after a + * state-loss idle. Saving and restoring MMCR0 over idle is a + * workaround. + */ + mfspr r4,SPRN_MMCR0 + std r4,_MMCR0(r1) + /* * Check if the requested state is a deep idle state. */ @@ -295,7 +372,8 @@ power_enter_stop: ld r4,ADDROFF(pnv_first_deep_stop_state)(r5) cmpd r3,r4 bge .Lhandle_deep_stop - IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP) + PPC_STOP /* Does not return (system reset interrupt) */ + .Lhandle_deep_stop: /* * Entering deep idle state. @@ -317,47 +395,25 @@ lwarx_loop_stop: bl save_sprs_to_stack - IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP) + PPC_STOP /* Does not return (system reset interrupt) */ -_GLOBAL(power7_idle) +/* + * Entered with MSR[EE]=0 and no soft-masked interrupts pending. + * r3 contains desired idle state (PNV_THREAD_NAP/SLEEP/WINKLE). 
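
A hedged caller-side sketch (not part of the patch) of the contract stated above; the prototypes, the PNV_THREAD_NAP value, and the pairing with fini_irq_for_idle_irqsoff() are assumptions inferred from this series, not copied from kernel headers.

#include <stdbool.h>

extern bool prep_irq_for_idle_irqsoff(void);	/* hard disable + lazy_irq_pending() check */
extern void fini_irq_for_idle_irqsoff(void);	/* assumed: tells lockdep IRQs are off again */
extern unsigned long power7_idle_insn(unsigned long type);

#define PNV_THREAD_NAP	1	/* assumed value */

unsigned long try_nap(void)
{
	unsigned long srr1 = 0;

	if (prep_irq_for_idle_irqsoff()) {
		/* wakeup path hands back SRR1 per the pnv_wakeup_* comments */
		srr1 = power7_idle_insn(PNV_THREAD_NAP);
		fini_irq_for_idle_irqsoff();
	}
	return srr1;
}
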
+ */ +_GLOBAL(power7_idle_insn) /* Now check if user or arch enabled NAP mode */ - LOAD_REG_ADDRBASE(r3,powersave_nap) - lwz r4,ADDROFF(powersave_nap)(r3) - cmpwi 0,r4,0 - beqlr - li r3, 1 - /* fall through */ - -_GLOBAL(power7_nap) - mr r4,r3 - li r3,PNV_THREAD_NAP - LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode) + LOAD_REG_ADDR(r4, pnv_enter_arch207_idle_mode) b pnv_powersave_common - /* No return */ - -_GLOBAL(power7_sleep) - li r3,PNV_THREAD_SLEEP - li r4,1 - LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode) - b pnv_powersave_common - /* No return */ - -_GLOBAL(power7_winkle) - li r3,PNV_THREAD_WINKLE - li r4,1 - LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode) - b pnv_powersave_common - /* No return */ #define CHECK_HMI_INTERRUPT \ - mfspr r0,SPRN_SRR1; \ BEGIN_FTR_SECTION_NESTED(66); \ - rlwinm r0,r0,45-31,0xf; /* extract wake reason field (P8) */ \ + rlwinm r0,r12,45-31,0xf; /* extract wake reason field (P8) */ \ FTR_SECTION_ELSE_NESTED(66); \ - rlwinm r0,r0,45-31,0xe; /* P7 wake reason field is 3 bits */ \ + rlwinm r0,r12,45-31,0xe; /* P7 wake reason field is 3 bits */ \ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ cmpwi r0,0xa; /* Hypervisor maintenance ? */ \ - bne 20f; \ + bne+ 20f; \ /* Invoke opal call to handle hmi */ \ ld r2,PACATOC(r13); \ ld r1,PACAR1(r13); \ @@ -369,16 +425,13 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ 20: nop; /* - * r3 - The PSSCR value corresponding to the stop state. - * r4 - The PSSCR mask corrresonding to the stop state. + * Entered with MSR[EE]=0 and no soft-masked interrupts pending. + * r3 contains desired PSSCR register value. */ _GLOBAL(power9_idle_stop) - mfspr r5,SPRN_PSSCR - andc r5,r5,r4 - or r3,r3,r5 + std r3, PACA_REQ_PSSCR(r13) mtspr SPRN_PSSCR,r3 - LOAD_REG_ADDR(r5,power_enter_stop) - li r4,1 + LOAD_REG_ADDR(r4,power_enter_stop) b pnv_powersave_common /* No return */ @@ -436,17 +489,29 @@ pnv_powersave_wakeup_mce: /* * Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake - * reason into SRR1, which allows reuse of the system reset wakeup + * reason into r12, which allows reuse of the system reset wakeup * code without being mistaken for another type of wakeup. */ - oris r3,r3,SRR1_WAKEMCE_RESVD@h - mtspr SPRN_SRR1,r3 + oris r12,r3,SRR1_WAKEMCE_RESVD@h b pnv_powersave_wakeup +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +kvm_start_guest_check: + li r0,KVM_HWTHREAD_IN_KERNEL + stb r0,HSTATE_HWTHREAD_STATE(r13) + /* Order setting hwthread_state vs. testing hwthread_req */ + sync + lbz r0,HSTATE_HWTHREAD_REQ(r13) + cmpwi r0,0 + beqlr + b kvm_start_guest +#endif + /* * Called from reset vector for powersave wakeups. * cr3 - set to gt if waking up with partial/complete hypervisor state loss + * r12 - SRR1 */ .global pnv_powersave_wakeup pnv_powersave_wakeup: @@ -464,20 +529,15 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) li r0,PNV_THREAD_RUNNING stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */ + mr r3,r12 + #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - li r0,KVM_HWTHREAD_IN_KERNEL - stb r0,HSTATE_HWTHREAD_STATE(r13) - /* Order setting hwthread_state vs. 
testing hwthread_req */ - sync - lbz r0,HSTATE_HWTHREAD_REQ(r13) - cmpwi r0,0 - beq 1f - b kvm_start_guest -1: +BEGIN_FTR_SECTION + bl kvm_start_guest_check +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) #endif /* Return SRR1 from power7_nap() */ - mfspr r3,SPRN_SRR1 blt cr3,pnv_wakeup_noloss b pnv_wakeup_loss @@ -489,18 +549,45 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) */ pnv_restore_hyp_resource_arch300: /* + * Workaround for POWER9, if we lost resources, the ERAT + * might have been mixed up and needs flushing. We also need + * to reload MMCR0 (see comment above). We also need to set + * then clear bit 60 in MMCRA to ensure the PMU starts running. + */ + blt cr3,1f + PPC_INVALIDATE_ERAT + ld r1,PACAR1(r13) + mfspr r4,SPRN_MMCRA + ori r4,r4,(1 << (63-60)) + mtspr SPRN_MMCRA,r4 + xori r4,r4,(1 << (63-60)) + mtspr SPRN_MMCRA,r4 + ld r4,_MMCR0(r1) + mtspr SPRN_MMCR0,r4 +1: + /* * POWER ISA 3. Use PSSCR to determine if we * are waking up from deep idle state */ LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state) ld r4,ADDROFF(pnv_first_deep_stop_state)(r5) - mfspr r5,SPRN_PSSCR +BEGIN_FTR_SECTION_NESTED(71) + /* + * Assume that we are waking up from the state + * same as the Requested Level (RL) in the PSSCR + * which are Bits 60-63 + */ + ld r5,PACA_REQ_PSSCR(r13) + rldicl r5,r5,0,60 +FTR_SECTION_ELSE_NESTED(71) /* * 0-3 bits correspond to Power-Saving Level Status * which indicates the idle state we are waking up from */ + mfspr r5, SPRN_PSSCR rldicl r5,r5,4,60 +ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_POWER9_DD1, 71) cmpd cr4,r5,r4 bge cr4,pnv_wakeup_tb_loss /* returns to caller */ @@ -567,9 +654,9 @@ pnv_wakeup_tb_loss: * is required to return back to reset vector after hypervisor state * restore is complete. */ + mr r19,r12 mr r18,r4 mflr r17 - mfspr r16,SPRN_SRR1 BEGIN_FTR_SECTION CHECK_HMI_INTERRUPT END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) @@ -731,13 +818,14 @@ timebase_resync: * Use cr3 which indicates that we are waking up with atleast partial * hypervisor state loss to determine if TIMEBASE RESYNC is needed. */ - ble cr3,clear_lock + ble cr3,.Ltb_resynced /* Time base re-sync */ bl opal_resync_timebase; /* - * If waking up from sleep, per core state is not lost, skip to - * clear_lock. + * If waking up from sleep (POWER8), per core state + * is not lost, skip to clear_lock. */ +.Ltb_resynced: blt cr4,clear_lock /* @@ -812,9 +900,20 @@ no_segments: mtctr r12 bctrl +/* + * On POWER9, we can come here on wakeup from a cpuidle stop state. + * Hence restore the additional SPRs to the saved value. + * + * On POWER8, we come here only on winkle. Since winkle is used + * only in the case of CPU-Hotplug, we don't need to restore + * the additional SPRs. + */ +BEGIN_FTR_SECTION + bl power9_restore_additional_sprs +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) hypervisor_state_restored: - mtspr SPRN_SRR1,r16 + mr r12,r19 mtlr r17 blr /* return to pnv_powersave_wakeup */ @@ -827,6 +926,7 @@ fastsleep_workaround_at_exit: /* * R3 here contains the value that will be returned to the caller * of power7_nap. + * R12 contains SRR1 for CHECK_HMI_INTERRUPT. */ .global pnv_wakeup_loss pnv_wakeup_loss: @@ -836,32 +936,33 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) REST_NVGPRS(r1) REST_GPR(2, r1) + ld r4,PACAKMSR(r13) + ld r5,_LINK(r1) ld r6,_CCR(r1) - ld r4,_MSR(r1) - ld r5,_NIP(r1) addi r1,r1,INT_FRAME_SIZE + mtlr r5 mtcr r6 - mtspr SPRN_SRR1,r4 - mtspr SPRN_SRR0,r5 - rfid + mtmsrd r4 + blr /* * R3 here contains the value that will be returned to the caller * of power7_nap. 
+ * R12 contains SRR1 for CHECK_HMI_INTERRUPT. */ pnv_wakeup_noloss: lbz r0,PACA_NAPSTATELOST(r13) cmpwi r0,0 bne pnv_wakeup_loss + ld r1,PACAR1(r13) BEGIN_FTR_SECTION CHECK_HMI_INTERRUPT END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - ld r1,PACAR1(r13) - ld r6,_CCR(r1) - ld r4,_MSR(r1) + ld r4,PACAKMSR(r13) ld r5,_NIP(r1) + ld r6,_CCR(r1) addi r1,r1,INT_FRAME_SIZE + mtlr r5 mtcr r6 - mtspr SPRN_SRR1,r4 - mtspr SPRN_SRR0,r5 - rfid + mtmsrd r4 + blr diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c index a582e0d42525..aa9f1b8261db 100644 --- a/arch/powerpc/kernel/io-workarounds.c +++ b/arch/powerpc/kernel/io-workarounds.c @@ -19,6 +19,8 @@ #include <asm/pgtable.h> #include <asm/ppc-pci.h> #include <asm/io-workarounds.h> +#include <asm/pte-walk.h> + #define IOWA_MAX_BUS 8 @@ -75,8 +77,7 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr) * We won't find huge pages here (iomem). Also can't hit * a page table free due to init_mm */ - ptep = __find_linux_pte_or_hugepte(init_mm.pgd, vaddr, - NULL, &hugepage_shift); + ptep = find_init_mm_pte(vaddr, &hugepage_shift); if (ptep == NULL) paddr = 0; else { @@ -192,7 +193,7 @@ void iowa_register_bus(struct pci_controller *phb, struct ppc_pci_io *ops, if (iowa_bus_count >= IOWA_MAX_BUS) { pr_err("IOWA:Too many pci bridges, " - "workarounds disabled for %s\n", np->full_name); + "workarounds disabled for %pOF\n", np); return; } @@ -207,6 +208,6 @@ void iowa_register_bus(struct pci_controller *phb, struct ppc_pci_io *ops, iowa_bus_count++; - pr_debug("IOWA:[%d]Add bus, %s.\n", iowa_bus_count-1, np->full_name); + pr_debug("IOWA:[%d]Add bus, %pOF.\n", iowa_bus_count-1, np); } diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index f2b724cd9e64..af7a20dc6e09 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -127,8 +127,7 @@ static ssize_t fail_iommu_store(struct device *dev, return count; } -static DEVICE_ATTR(fail_iommu, S_IRUGO|S_IWUSR, fail_iommu_show, - fail_iommu_store); +static DEVICE_ATTR_RW(fail_iommu); static int fail_iommu_bus_notify(struct notifier_block *nb, unsigned long action, void *data) @@ -190,7 +189,7 @@ static unsigned long iommu_range_alloc(struct device *dev, unsigned int pool_nr; struct iommu_pool *pool; - align_mask = 0xffffffffffffffffl >> (64 - align_order); + align_mask = (1ull << align_order) - 1; /* This allocator was derived from x86_64's bit string search */ @@ -198,17 +197,17 @@ static unsigned long iommu_range_alloc(struct device *dev, if (unlikely(npages == 0)) { if (printk_ratelimit()) WARN_ON(1); - return DMA_ERROR_CODE; + return IOMMU_MAPPING_ERROR; } if (should_fail_iommu(dev)) - return DMA_ERROR_CODE; + return IOMMU_MAPPING_ERROR; /* * We don't need to disable preemption here because any CPU can * safely use any IOMMU pool. 
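
A stand-alone sketch (not part of the patch) of the align_mask change in iommu_range_alloc() above: (1ull << align_order) - 1 avoids the shift by 64 that the old expression performed when align_order was 0, which C leaves undefined.

#include <stdio.h>

static unsigned long long make_align_mask(unsigned int align_order)
{
	return (1ull << align_order) - 1;	/* 0 -> 0x0, 1 -> 0x1, 3 -> 0x7, ... */
}

int main(void)
{
	unsigned int order;

	for (order = 0; order < 5; order++)
		printf("align_order=%u mask=0x%llx\n", order, make_align_mask(order));
	return 0;
}
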
*/ - pool_nr = __this_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1); + pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1); if (largealloc) pool = &(tbl->large_pool); @@ -278,7 +277,7 @@ again: } else { /* Give up */ spin_unlock_irqrestore(&(pool->lock), flags); - return DMA_ERROR_CODE; + return IOMMU_MAPPING_ERROR; } } @@ -310,13 +309,13 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, unsigned long attrs) { unsigned long entry; - dma_addr_t ret = DMA_ERROR_CODE; + dma_addr_t ret = IOMMU_MAPPING_ERROR; int build_fail; entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order); - if (unlikely(entry == DMA_ERROR_CODE)) - return DMA_ERROR_CODE; + if (unlikely(entry == IOMMU_MAPPING_ERROR)) + return IOMMU_MAPPING_ERROR; entry += tbl->it_offset; /* Offset into real TCE table */ ret = entry << tbl->it_page_shift; /* Set the return dma address */ @@ -328,12 +327,12 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, /* tbl->it_ops->set() only returns non-zero for transient errors. * Clean up the table bitmap in this case and return - * DMA_ERROR_CODE. For all other errors the functionality is + * IOMMU_MAPPING_ERROR. For all other errors the functionality is * not altered. */ if (unlikely(build_fail)) { __iommu_free(tbl, ret, npages); - return DMA_ERROR_CODE; + return IOMMU_MAPPING_ERROR; } /* Flush/invalidate TLB caches if necessary */ @@ -478,7 +477,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen); /* Handle failure */ - if (unlikely(entry == DMA_ERROR_CODE)) { + if (unlikely(entry == IOMMU_MAPPING_ERROR)) { if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) dev_info(dev, "iommu_alloc failed, tbl %p " @@ -545,7 +544,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, */ if (outcount < incount) { outs = sg_next(outs); - outs->dma_address = DMA_ERROR_CODE; + outs->dma_address = IOMMU_MAPPING_ERROR; outs->dma_length = 0; } @@ -563,7 +562,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl, npages = iommu_num_pages(s->dma_address, s->dma_length, IOMMU_PAGE_SIZE(tbl)); __iommu_free(tbl, vaddr, npages); - s->dma_address = DMA_ERROR_CODE; + s->dma_address = IOMMU_MAPPING_ERROR; s->dma_length = 0; } if (s == outs) @@ -777,7 +776,7 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl, unsigned long mask, enum dma_data_direction direction, unsigned long attrs) { - dma_addr_t dma_handle = DMA_ERROR_CODE; + dma_addr_t dma_handle = IOMMU_MAPPING_ERROR; void *vaddr; unsigned long uaddr; unsigned int npages, align; @@ -797,7 +796,7 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl, dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction, mask >> tbl->it_page_shift, align, attrs); - if (dma_handle == DMA_ERROR_CODE) { + if (dma_handle == IOMMU_MAPPING_ERROR) { if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) { dev_info(dev, "iommu_alloc failed, tbl %p " @@ -869,7 +868,7 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl, io_order = get_iommu_order(size, tbl); mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL, mask >> tbl->it_page_shift, io_order, 0); - if (mapping == DMA_ERROR_CODE) { + if (mapping == IOMMU_MAPPING_ERROR) { free_pages((unsigned long)ret, order); return NULL; } diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 5c291df30fe3..4e65bf82f5e0 100644 --- a/arch/powerpc/kernel/irq.c +++ 
b/arch/powerpc/kernel/irq.c @@ -24,7 +24,7 @@ * mask register (of which only 16 are defined), hence the weird shifting * and complement of the cached_irq_mask. I want to be able to stuff * this right into the SIU SMASK register. - * Many of the prep/chrp functions are conditional compiled on CONFIG_8xx + * Many of the prep/chrp functions are conditional compiled on CONFIG_PPC_8xx * to reduce code space and undefined function references. */ @@ -143,8 +143,22 @@ notrace unsigned int __check_irq_replay(void) */ unsigned char happened = local_paca->irq_happened; - /* Clear bit 0 which we wouldn't clear otherwise */ - local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; + if (happened & PACA_IRQ_HARD_DIS) { + /* Clear bit 0 which we wouldn't clear otherwise */ + local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS; + + /* + * We may have missed a decrementer interrupt if hard disabled. + * Check the decrementer register in case we had a rollover + * while hard disabled. + */ + if (!(happened & PACA_IRQ_DEC)) { + if (decrementer_check_overflow()) { + local_paca->irq_happened |= PACA_IRQ_DEC; + happened |= PACA_IRQ_DEC; + } + } + } /* * Force the delivery of pending soft-disabled interrupts on PS3. @@ -160,41 +174,39 @@ notrace unsigned int __check_irq_replay(void) * This is a higher priority interrupt than the others, so * replay it first. */ - local_paca->irq_happened &= ~PACA_IRQ_HMI; - if (happened & PACA_IRQ_HMI) + if (happened & PACA_IRQ_HMI) { + local_paca->irq_happened &= ~PACA_IRQ_HMI; return 0xe60; + } - /* - * We may have missed a decrementer interrupt. We check the - * decrementer itself rather than the paca irq_happened field - * in case we also had a rollover while hard disabled - */ - local_paca->irq_happened &= ~PACA_IRQ_DEC; - if ((happened & PACA_IRQ_DEC) || decrementer_check_overflow()) + if (happened & PACA_IRQ_DEC) { + local_paca->irq_happened &= ~PACA_IRQ_DEC; return 0x900; + } - /* Finally check if an external interrupt happened */ - local_paca->irq_happened &= ~PACA_IRQ_EE; - if (happened & PACA_IRQ_EE) + if (happened & PACA_IRQ_EE) { + local_paca->irq_happened &= ~PACA_IRQ_EE; return 0x500; + } #ifdef CONFIG_PPC_BOOK3E - /* Finally check if an EPR external interrupt happened - * this bit is typically set if we need to handle another - * "edge" interrupt from within the MPIC "EPR" handler + /* + * Check if an EPR external interrupt happened this bit is typically + * set if we need to handle another "edge" interrupt from within the + * MPIC "EPR" handler. 
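
A condensed, stand-alone sketch (not part of the patch) of the __check_irq_replay() flow after this change: each pending bit is cleared only when its vector is actually returned, and a missed decrementer rollover is probed only if we had been hard disabled. The PACA_IRQ_* values below are placeholders, not the real kernel constants.

#include <stdbool.h>
#include <stdio.h>

#define PACA_IRQ_HARD_DIS	0x01	/* placeholder values */
#define PACA_IRQ_DEC		0x02
#define PACA_IRQ_EE		0x04
#define PACA_IRQ_HMI		0x08

static bool decrementer_check_overflow(void) { return true; }	/* stub */

static unsigned int check_irq_replay(unsigned char *happened_p)
{
	unsigned char happened = *happened_p;

	if (happened & PACA_IRQ_HARD_DIS) {
		*happened_p &= ~PACA_IRQ_HARD_DIS;
		/* a decrementer may have rolled over while hard disabled */
		if (!(happened & PACA_IRQ_DEC) && decrementer_check_overflow()) {
			*happened_p |= PACA_IRQ_DEC;
			happened |= PACA_IRQ_DEC;
		}
	}
	if (happened & PACA_IRQ_HMI) { *happened_p &= ~PACA_IRQ_HMI; return 0xe60; }
	if (happened & PACA_IRQ_DEC) { *happened_p &= ~PACA_IRQ_DEC; return 0x900; }
	if (happened & PACA_IRQ_EE)  { *happened_p &= ~PACA_IRQ_EE;  return 0x500; }
	return 0;
}

int main(void)
{
	unsigned char happened = PACA_IRQ_HARD_DIS;

	printf("replay vector: 0x%x\n", check_irq_replay(&happened));	/* prints 0x900 */
	return 0;
}
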
*/ - local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE; - if (happened & PACA_IRQ_EE_EDGE) + if (happened & PACA_IRQ_EE_EDGE) { + local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE; return 0x500; + } - local_paca->irq_happened &= ~PACA_IRQ_DBELL; - if (happened & PACA_IRQ_DBELL) + if (happened & PACA_IRQ_DBELL) { + local_paca->irq_happened &= ~PACA_IRQ_DBELL; return 0x280; + } #else - local_paca->irq_happened &= ~PACA_IRQ_DBELL; if (happened & PACA_IRQ_DBELL) { - if (cpu_has_feature(CPU_FTR_HVMODE)) - return 0xe80; + local_paca->irq_happened &= ~PACA_IRQ_DBELL; return 0xa00; } #endif /* CONFIG_PPC_BOOK3E */ @@ -322,7 +334,8 @@ bool prep_irq_for_idle(void) * First we need to hard disable to ensure no interrupt * occurs before we effectively enter the low power state */ - hard_irq_disable(); + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; /* * If anything happened while we were soft-disabled, @@ -347,6 +360,65 @@ bool prep_irq_for_idle(void) return true; } +#ifdef CONFIG_PPC_BOOK3S +/* + * This is for idle sequences that return with IRQs off, but the + * idle state itself wakes on interrupt. Tell the irq tracer that + * IRQs are enabled for the duration of idle so it does not get long + * off times. Must be paired with fini_irq_for_idle_irqsoff. + */ +bool prep_irq_for_idle_irqsoff(void) +{ + WARN_ON(!irqs_disabled()); + + /* + * First we need to hard disable to ensure no interrupt + * occurs before we effectively enter the low power state + */ + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + + /* + * If anything happened while we were soft-disabled, + * we return now and do not enter the low power state. + */ + if (lazy_irq_pending()) + return false; + + /* Tell lockdep we are about to re-enable */ + trace_hardirqs_on(); + + return true; +} + +/* + * Take the SRR1 wakeup reason, index into this table to find the + * appropriate irq_happened bit. + */ +static const u8 srr1_to_lazyirq[0x10] = { + 0, 0, 0, + PACA_IRQ_DBELL, + 0, + PACA_IRQ_DBELL, + PACA_IRQ_DEC, + 0, + PACA_IRQ_EE, + PACA_IRQ_EE, + PACA_IRQ_HMI, + 0, 0, 0, 0, 0 }; + +void irq_set_pending_from_srr1(unsigned long srr1) +{ + unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18; + + /* + * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0, + * so this can be called unconditionally with srr1 wake reason. + */ + local_paca->irq_happened |= srr1_to_lazyirq[idx]; +} +#endif /* CONFIG_PPC_BOOK3S */ + /* * Force a replay of the external interrupt handler on this CPU. 
*/ @@ -410,6 +482,18 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, " Hypervisor Maintenance Interrupts\n"); } + seq_printf(p, "%*s: ", prec, "NMI"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).sreset_irqs); + seq_printf(p, " System Reset interrupts\n"); + +#ifdef CONFIG_PPC_WATCHDOG + seq_printf(p, "%*s: ", prec, "WDG"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat, j).soft_nmi_irqs); + seq_printf(p, " Watchdog soft-NMI interrupts\n"); +#endif + #ifdef CONFIG_PPC_DOORBELL if (cpu_has_feature(CPU_FTR_DBELL)) { seq_printf(p, "%*s: ", prec, "DBL"); @@ -434,6 +518,10 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += per_cpu(irq_stat, cpu).spurious_irqs; sum += per_cpu(irq_stat, cpu).timer_irqs_others; sum += per_cpu(irq_stat, cpu).hmi_exceptions; + sum += per_cpu(irq_stat, cpu).sreset_irqs; +#ifdef CONFIG_PPC_WATCHDOG + sum += per_cpu(irq_stat, cpu).soft_nmi_irqs; +#endif #ifdef CONFIG_PPC_DOORBELL sum += per_cpu(irq_stat, cpu).doorbell_irqs; #endif diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index bb6f8993412e..1df6c74aa731 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -164,7 +164,7 @@ void __init isa_bridge_find_early(struct pci_controller *hose) /* Set the global ISA io base to indicate we have an ISA bridge */ isa_io_base = ISA_IO_BASE; - pr_debug("ISA bridge (early) is %s\n", np->full_name); + pr_debug("ISA bridge (early) is %pOF\n", np); } /** @@ -187,15 +187,15 @@ void __init isa_bridge_init_non_pci(struct device_node *np) pna = of_n_addr_cells(np); if (of_property_read_u32(np, "#address-cells", &na) || of_property_read_u32(np, "#size-cells", &ns)) { - pr_warn("ISA: Non-PCI bridge %s is missing address format\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF is missing address format\n", + np); return; } /* Check it's a supported address format */ if (na != 2 || ns != 1) { - pr_warn("ISA: Non-PCI bridge %s has unsupported address format\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF has unsupported address format\n", + np); return; } rs = na + ns + pna; @@ -203,8 +203,8 @@ void __init isa_bridge_init_non_pci(struct device_node *np) /* Grab the ranges property */ ranges = of_get_property(np, "ranges", &rlen); if (ranges == NULL || rlen < rs) { - pr_warn("ISA: Non-PCI bridge %s has absent or invalid ranges\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF has absent or invalid ranges\n", + np); return; } @@ -220,8 +220,8 @@ void __init isa_bridge_init_non_pci(struct device_node *np) /* Got something ? 
*/ if (!size || !pbasep) { - pr_warn("ISA: Non-PCI bridge %s has no usable IO range\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF has no usable IO range\n", + np); return; } @@ -233,15 +233,15 @@ void __init isa_bridge_init_non_pci(struct device_node *np) /* Map pbase */ pbase = of_translate_address(np, pbasep); if (pbase == OF_BAD_ADDR) { - pr_warn("ISA: Non-PCI bridge %s failed to translate IO base\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF failed to translate IO base\n", + np); return; } /* We need page alignment */ if ((cbase & ~PAGE_MASK) || (pbase & ~PAGE_MASK)) { - pr_warn("ISA: Non-PCI bridge %s has non aligned IO range\n", - np->full_name); + pr_warn("ISA: Non-PCI bridge %pOF has non aligned IO range\n", + np); return; } @@ -255,7 +255,7 @@ void __init isa_bridge_init_non_pci(struct device_node *np) __ioremap_at(pbase, (void *)ISA_IO_BASE, size, pgprot_val(pgprot_noncached(__pgprot(0)))); - pr_debug("ISA: Non-PCI bridge is %s\n", np->full_name); + pr_debug("ISA: Non-PCI bridge is %pOF\n", np); } /** @@ -277,8 +277,8 @@ static void isa_bridge_find_late(struct pci_dev *pdev, /* Set the global ISA io base to indicate we have an ISA bridge */ isa_io_base = ISA_IO_BASE; - pr_debug("ISA bridge (late) is %s on %s\n", - devnode->full_name, pci_name(pdev)); + pr_debug("ISA bridge (late) is %pOF on %s\n", + devnode, pci_name(pdev)); } /** diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index dbf098121ce6..35e240a0a408 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -67,9 +67,9 @@ static struct hard_trap_info #endif #else /* ! (defined(CONFIG_40x) || defined(CONFIG_BOOKE)) */ { 0x0d00, 0x05 /* SIGTRAP */ }, /* single-step */ -#if defined(CONFIG_8xx) +#if defined(CONFIG_PPC_8xx) { 0x1000, 0x04 /* SIGILL */ }, /* software emulation */ -#else /* ! CONFIG_8xx */ +#else /* ! 
CONFIG_PPC_8xx */ { 0x0f00, 0x04 /* SIGILL */ }, /* performance monitor */ { 0x0f20, 0x08 /* SIGFPE */ }, /* altivec unavailable */ { 0x1300, 0x05 /* SIGTRAP */ }, /* instruction address break */ diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 01addfb0ed0a..367494dc67d9 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -164,17 +164,13 @@ NOKPROBE_SYMBOL(arch_prepare_kprobe); void arch_arm_kprobe(struct kprobe *p) { - *p->addr = BREAKPOINT_INSTRUCTION; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + patch_instruction(p->addr, BREAKPOINT_INSTRUCTION); } NOKPROBE_SYMBOL(arch_arm_kprobe); void arch_disarm_kprobe(struct kprobe *p) { - *p->addr = p->opcode; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + patch_instruction(p->addr, p->opcode); } NOKPROBE_SYMBOL(arch_disarm_kprobe); @@ -221,7 +217,7 @@ static nokprobe_inline void set_current_kprobe(struct kprobe *p, struct pt_regs kcb->kprobe_saved_msr = regs->msr; } -bool arch_function_offset_within_entry(unsigned long offset) +bool arch_kprobe_on_func_entry(unsigned long offset) { #ifdef PPC64_ELF_ABI_v2 #ifdef CONFIG_KPROBES_ON_FTRACE diff --git a/arch/powerpc/kernel/l2cr_6xx.S b/arch/powerpc/kernel/l2cr_6xx.S index 97ec8557f974..6408f09dbbd9 100644 --- a/arch/powerpc/kernel/l2cr_6xx.S +++ b/arch/powerpc/kernel/l2cr_6xx.S @@ -181,7 +181,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPEC7450) mtctr r4 li r4,0 1: - lwzx r0,r0,r4 + lwzx r0,0,r4 addi r4,r4,32 /* Go to start of next cache line */ bdnz 1b isync @@ -328,7 +328,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_L3CR) mtctr r4 li r4,0 1: - lwzx r0,r0,r4 + lwzx r0,0,r4 dcbf 0,r4 addi r4,r4,32 /* Go to start of next cache line */ bdnz 1b diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c index 0694d20f85b6..5e5a64a8b4e4 100644 --- a/arch/powerpc/kernel/legacy_serial.c +++ b/arch/powerpc/kernel/legacy_serial.c @@ -147,8 +147,8 @@ static int __init add_legacy_port(struct device_node *np, int want_index, legacy_serial_ports[index].serial_out = tsi_serial_out; } - printk(KERN_DEBUG "Found legacy serial port %d for %s\n", - index, np->full_name); + printk(KERN_DEBUG "Found legacy serial port %d for %pOF\n", + index, np); printk(KERN_DEBUG " %s=%llx, taddr=%llx, irq=%lx, clk=%d, speed=%d\n", (iotype == UPIO_PORT) ? "port" : "mem", (unsigned long long)base, (unsigned long long)taddr, irq, @@ -207,7 +207,7 @@ static int __init add_legacy_isa_port(struct device_node *np, int index = -1; u64 taddr; - DBG(" -> add_legacy_isa_port(%s)\n", np->full_name); + DBG(" -> add_legacy_isa_port(%pOF)\n", np); /* Get the ISA port number */ reg = of_get_property(np, "reg", NULL); @@ -256,7 +256,7 @@ static int __init add_legacy_pci_port(struct device_node *np, unsigned int flags; int iotype, index = -1, lindex = 0; - DBG(" -> add_legacy_pci_port(%s)\n", np->full_name); + DBG(" -> add_legacy_pci_port(%pOF)\n", np); /* We only support ports that have a clock frequency properly * encoded in the device-tree (that is have an fcode). 
Anything @@ -374,7 +374,7 @@ void __init find_legacy_serial_ports(void) if (path != NULL) { stdout = of_find_node_by_path(path); if (stdout) - DBG("stdout is %s\n", stdout->full_name); + DBG("stdout is %pOF\n", stdout); } else { DBG(" no linux,stdout-path !\n"); } @@ -603,7 +603,7 @@ static int __init check_legacy_serial_console(void) DBG(" can't find stdout package %s !\n", name); return -ENODEV; } - DBG("stdout is %s\n", prom_stdout->full_name); + DBG("stdout is %pOF\n", prom_stdout); name = of_get_property(prom_stdout, "name", NULL); if (!name) { diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 5f9eada3519b..9b2ea7e71c06 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -22,11 +22,14 @@ #undef DEBUG #define pr_fmt(fmt) "mce: " fmt +#include <linux/hardirq.h> #include <linux/types.h> #include <linux/ptrace.h> #include <linux/percpu.h> #include <linux/export.h> #include <linux/irq_work.h> + +#include <asm/machdep.h> #include <asm/mce.h> static DEFINE_PER_CPU(int, mce_nest_count); @@ -268,6 +271,7 @@ void machine_check_print_event_info(struct machine_check_event *evt, static const char *mc_ra_types[] = { "Indeterminate", "Instruction fetch (bad)", + "Instruction fetch (foreign)", "Page table walk ifetch (bad)", "Page table walk ifetch (foreign)", "Load (bad)", @@ -405,6 +409,7 @@ void machine_check_print_event_info(struct machine_check_event *evt, break; } } +EXPORT_SYMBOL_GPL(machine_check_print_event_info); uint64_t get_mce_fault_addr(struct machine_check_event *evt) { @@ -444,3 +449,33 @@ uint64_t get_mce_fault_addr(struct machine_check_event *evt) return 0; } EXPORT_SYMBOL(get_mce_fault_addr); + +/* + * This function is called in real mode. Strictly no printk's please. + * + * regs->nip and regs->msr contains srr0 and ssr1. + */ +long machine_check_early(struct pt_regs *regs) +{ + long handled = 0; + + __this_cpu_inc(irq_stat.mce_exceptions); + + if (cur_cpu_spec && cur_cpu_spec->machine_check_early) + handled = cur_cpu_spec->machine_check_early(regs); + return handled; +} + +long hmi_exception_realmode(struct pt_regs *regs) +{ + __this_cpu_inc(irq_stat.hmi_exceptions); + + wait_for_subcore_guest_exit(); + + if (ppc_md.hmi_exception_early) + ppc_md.hmi_exception_early(regs); + + wait_for_tb_resync(); + + return 0; +} diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index f913139bb0c2..b76ca198e09c 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -53,6 +53,60 @@ static void flush_tlb_206(unsigned int num_sets, unsigned int action) asm volatile("ptesync" : : : "memory"); } +static void flush_tlb_300(unsigned int num_sets, unsigned int action) +{ + unsigned long rb; + unsigned int i; + unsigned int r; + + switch (action) { + case TLB_INVAL_SCOPE_GLOBAL: + rb = TLBIEL_INVAL_SET; + break; + case TLB_INVAL_SCOPE_LPID: + rb = TLBIEL_INVAL_SET_LPID; + break; + default: + BUG(); + break; + } + + asm volatile("ptesync" : : : "memory"); + + if (early_radix_enabled()) + r = 1; + else + r = 0; + + /* + * First flush table/PWC caches with set 0, then flush the + * rest of the sets, partition scope. Radix must then do it + * all again with process scope. Hash just has to flush + * process table. 
+ */ + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) : : + "r"(rb), "r"(0), "i"(2), "i"(0), "r"(r)); + for (i = 1; i < num_sets; i++) { + unsigned long set = i * (1<<TLBIEL_INVAL_SET_SHIFT); + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) : : + "r"(rb+set), "r"(0), "i"(2), "i"(0), "r"(r)); + } + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) : : + "r"(rb), "r"(0), "i"(2), "i"(1), "r"(r)); + if (early_radix_enabled()) { + for (i = 1; i < num_sets; i++) { + unsigned long set = i * (1<<TLBIEL_INVAL_SET_SHIFT); + + asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) : : + "r"(rb+set), "r"(0), "i"(2), "i"(1), "r"(r)); + } + } + + asm volatile("ptesync" : : : "memory"); +} + /* * Generic routines to flush TLB on POWER processors. These routines * are used as flush_tlb hook in the cpu_spec. @@ -79,7 +133,7 @@ void __flush_tlb_power9(unsigned int action) else num_sets = POWER9_TLB_SETS_HASH; - flush_tlb_206(num_sets, action); + flush_tlb_300(num_sets, action); } @@ -236,6 +290,9 @@ static const struct mce_ierror_table mce_p9_ierror_table[] = { { 0x00000000081c0000, 0x0000000000180000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x00000000001c0000, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH_FOREIGN, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0x00000000081c0000, 0x0000000008000000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 84db14e435f5..3f7a9a2d2435 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -244,8 +244,7 @@ _GLOBAL(_nmask_and_or_msr) */ _GLOBAL(real_readb) mfmsr r7 - ori r0,r7,MSR_DR - xori r0,r0,MSR_DR + rlwinm r0,r7,0,~MSR_DR sync mtmsr r0 sync @@ -262,8 +261,7 @@ _GLOBAL(real_readb) */ _GLOBAL(real_writeb) mfmsr r7 - ori r0,r7,MSR_DR - xori r0,r0,MSR_DR + rlwinm r0,r7,0,~MSR_DR sync mtmsr r0 sync diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index c119044cad0d..8ac0bd2bddb0 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -614,6 +614,18 @@ _GLOBAL(kexec_sequence) li r0,0 std r0,16(r1) +BEGIN_FTR_SECTION + /* + * This is the best time to turn AMR/IAMR off. + * key 0 is used in radix for supervisor<->user + * protection, but on hash key 0 is reserved + * ideally we want to enter with a clean state. + * NOTE, we rely on r0 being 0 from above. + */ + mtspr SPRN_IAMR,r0 + mtspr SPRN_AMOR,r0 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + /* save regs for local vars on new stack. * yes, we won't go back, but ... 
*/ diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index eae61b044e9e..496d6393bd41 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -792,21 +792,17 @@ static ssize_t dev_nvram_write(struct file *file, const char __user *buf, count = min_t(size_t, count, size - *ppos); count = min(count, PAGE_SIZE); - ret = -ENOMEM; - tmp = kmalloc(count, GFP_KERNEL); - if (!tmp) - goto out; - - ret = -EFAULT; - if (copy_from_user(tmp, buf, count)) + tmp = memdup_user(buf, count); + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); goto out; + } ret = ppc_md.nvram_write(tmp, count, ppos); -out: kfree(tmp); +out: return ret; - } static long dev_nvram_ioctl(struct file *file, unsigned int cmd, diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c index 34aeac54f120..becaec990140 100644 --- a/arch/powerpc/kernel/of_platform.c +++ b/arch/powerpc/kernel/of_platform.c @@ -45,7 +45,7 @@ static int of_pci_phb_probe(struct platform_device *dev) if (ppc_md.pci_setup_phb == NULL) return -ENODEV; - pr_info("Setting up PCI bus %s\n", dev->dev.of_node->full_name); + pr_info("Setting up PCI bus %pOF\n", dev->dev.of_node); /* Alloc and setup PHB data structure */ phb = pcibios_alloc_controller(dev->dev.of_node); diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index ec60ed0d4aad..6f8273f5e988 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -158,12 +158,13 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr) { /* addis r4,0,(insn)@h */ - *addr++ = PPC_INST_ADDIS | ___PPC_RT(4) | - ((val >> 16) & 0xffff); + patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(4) | + ((val >> 16) & 0xffff)); + addr++; /* ori r4,r4,(insn)@l */ - *addr = PPC_INST_ORI | ___PPC_RA(4) | ___PPC_RS(4) | - (val & 0xffff); + patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(4) | + ___PPC_RS(4) | (val & 0xffff)); } /* @@ -173,24 +174,28 @@ void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr) void patch_imm64_load_insns(unsigned long val, kprobe_opcode_t *addr) { /* lis r3,(op)@highest */ - *addr++ = PPC_INST_ADDIS | ___PPC_RT(3) | - ((val >> 48) & 0xffff); + patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(3) | + ((val >> 48) & 0xffff)); + addr++; /* ori r3,r3,(op)@higher */ - *addr++ = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) | - ((val >> 32) & 0xffff); + patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) | + ___PPC_RS(3) | ((val >> 32) & 0xffff)); + addr++; /* rldicr r3,r3,32,31 */ - *addr++ = PPC_INST_RLDICR | ___PPC_RA(3) | ___PPC_RS(3) | - __PPC_SH64(32) | __PPC_ME64(31); + patch_instruction(addr, PPC_INST_RLDICR | ___PPC_RA(3) | + ___PPC_RS(3) | __PPC_SH64(32) | __PPC_ME64(31)); + addr++; /* oris r3,r3,(op)@h */ - *addr++ = PPC_INST_ORIS | ___PPC_RA(3) | ___PPC_RS(3) | - ((val >> 16) & 0xffff); + patch_instruction(addr, PPC_INST_ORIS | ___PPC_RA(3) | + ___PPC_RS(3) | ((val >> 16) & 0xffff)); + addr++; /* ori r3,r3,(op)@l */ - *addr = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) | - (val & 0xffff); + patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) | + ___PPC_RS(3) | (val & 0xffff)); } int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) @@ -198,7 +203,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) kprobe_opcode_t *buff, branch_op_callback, branch_emulate_step; kprobe_opcode_t *op_callback_addr, *emulate_step_addr; long b_offset; - 
unsigned long nip; + unsigned long nip, size; + int rc, i; kprobe_ppc_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; @@ -231,8 +237,14 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) goto error; /* Setup template */ - memcpy(buff, optprobe_template_entry, - TMPL_END_IDX * sizeof(kprobe_opcode_t)); + /* We can optimize this via patch_instruction_window later */ + size = (TMPL_END_IDX * sizeof(kprobe_opcode_t)) / sizeof(int); + pr_devel("Copying template to %p, size %lu\n", buff, size); + for (i = 0; i < size; i++) { + rc = patch_instruction(buff + i, *(optprobe_template_entry + i)); + if (rc < 0) + goto error; + } /* * Fixup the template with instructions to: @@ -261,8 +273,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) if (!branch_op_callback || !branch_emulate_step) goto error; - buff[TMPL_CALL_HDLR_IDX] = branch_op_callback; - buff[TMPL_EMULATE_IDX] = branch_emulate_step; + patch_instruction(buff + TMPL_CALL_HDLR_IDX, branch_op_callback); + patch_instruction(buff + TMPL_EMULATE_IDX, branch_emulate_step); /* * 3. load instruction to be emulated into relevant register, and @@ -272,8 +284,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) /* * 4. branch back from trampoline */ - buff[TMPL_RET_IDX] = create_branch((unsigned int *)buff + TMPL_RET_IDX, - (unsigned long)nip, 0); + patch_branch(buff + TMPL_RET_IDX, (unsigned long)nip, 0); flush_icache_range((unsigned long)buff, (unsigned long)(&buff[TMPL_END_IDX])); diff --git a/arch/powerpc/kernel/optprobes_head.S b/arch/powerpc/kernel/optprobes_head.S index 4937bef7652f..52fc864cdec4 100644 --- a/arch/powerpc/kernel/optprobes_head.S +++ b/arch/powerpc/kernel/optprobes_head.S @@ -60,10 +60,6 @@ optprobe_template_entry: std r5,_CCR(r1) lbz r5,PACASOFTIRQEN(r13) std r5,SOFTE(r1) - mfdar r5 - std r5,_DAR(r1) - mfdsisr r5 - std r5,_DSISR(r1) /* * We may get here from a module, so load the kernel TOC in r2. @@ -122,10 +118,6 @@ optprobe_template_call_emulate: mtxer r5 ld r5,_CCR(r1) mtcr r5 - ld r5,_DAR(r1) - mtdar r5 - ld r5,_DSISR(r1) - mtdsisr r5 REST_GPR(0,r1) REST_10GPRS(2,r1) REST_10GPRS(12,r1) diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c index 8d63627e067f..70f073d6c3b2 100644 --- a/arch/powerpc/kernel/paca.c +++ b/arch/powerpc/kernel/paca.c @@ -99,18 +99,27 @@ static inline void free_lppacas(void) { } * If you make the number of persistent SLB entries dynamic, please also * update PR KVM to flush and restore them accordingly. 
*/ -static struct slb_shadow *slb_shadow; +static struct slb_shadow * __initdata slb_shadow; static void __init allocate_slb_shadows(int nr_cpus, int limit) { int size = PAGE_ALIGN(sizeof(struct slb_shadow) * nr_cpus); + + if (early_radix_enabled()) + return; + slb_shadow = __va(memblock_alloc_base(size, PAGE_SIZE, limit)); memset(slb_shadow, 0, size); } static struct slb_shadow * __init init_slb_shadow(int cpu) { - struct slb_shadow *s = &slb_shadow[cpu]; + struct slb_shadow *s; + + if (early_radix_enabled()) + return NULL; + + s = &slb_shadow[cpu]; /* * When we come through here to initialise boot_paca, the slb_shadow diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c index 341a7469cab8..02831a396419 100644 --- a/arch/powerpc/kernel/pci-common.c +++ b/arch/powerpc/kernel/pci-common.c @@ -373,9 +373,8 @@ static int pci_read_irq_line(struct pci_dev *pci_dev) if (virq) irq_set_irq_type(virq, IRQ_TYPE_LEVEL_LOW); } else { - pr_debug(" Got one, spec %d cells (0x%08x 0x%08x...) on %s\n", - oirq.args_count, oirq.args[0], oirq.args[1], - of_node_full_name(oirq.np)); + pr_debug(" Got one, spec %d cells (0x%08x 0x%08x...) on %pOF\n", + oirq.args_count, oirq.args[0], oirq.args[1], oirq.np); virq = irq_create_of_mapping(&oirq); } @@ -741,8 +740,8 @@ void pci_process_bridge_OF_ranges(struct pci_controller *hose, struct of_pci_range range; struct of_pci_range_parser parser; - printk(KERN_INFO "PCI host bridge %s %s ranges:\n", - dev->full_name, primary ? "(primary)" : ""); + printk(KERN_INFO "PCI host bridge %pOF %s ranges:\n", + dev, primary ? "(primary)" : ""); /* Check for ranges property */ if (of_pci_range_parser_init(&parser, dev)) @@ -1556,8 +1555,8 @@ static void pcibios_setup_phb_resources(struct pci_controller *hose, if (!res->flags) { pr_debug("PCI: I/O resource not set for host" - " bridge %s (domain %d)\n", - hose->dn->full_name, hose->global_number); + " bridge %pOF (domain %d)\n", + hose->dn, hose->global_number); } else { offset = pcibios_io_space_offset(hose); @@ -1668,7 +1667,7 @@ void pcibios_scan_phb(struct pci_controller *hose) struct device_node *node = hose->dn; int mode; - pr_debug("PCI: Scanning PHB %s\n", of_node_full_name(node)); + pr_debug("PCI: Scanning PHB %pOF\n", node); /* Get some IO space for the new PHB */ pcibios_setup_phb_io_space(hose); diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 41c86c6b6e4d..1d817f4d97d9 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -79,8 +79,8 @@ make_one_node_map(struct device_node* node, u8 pci_bus) return; bus_range = of_get_property(node, "bus-range", &len); if (bus_range == NULL || len < 2 * sizeof(int)) { - printk(KERN_WARNING "Can't get bus-range for %s, " - "assuming it starts at 0\n", node->full_name); + printk(KERN_WARNING "Can't get bus-range for %pOF, " + "assuming it starts at 0\n", node); pci_to_OF_bus_map[pci_bus] = 0; } else pci_to_OF_bus_map[pci_bus] = bus_range[0]; diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c index ed5e9ff61a68..932b9741aa8f 100644 --- a/arch/powerpc/kernel/pci_64.c +++ b/arch/powerpc/kernel/pci_64.c @@ -111,7 +111,7 @@ int pcibios_unmap_io_space(struct pci_bus *bus) if (hose->io_base_alloc == NULL) return 0; - pr_debug("IO unmapping for PHB %s\n", hose->dn->full_name); + pr_debug("IO unmapping for PHB %pOF\n", hose->dn); pr_debug(" alloc=0x%p\n", hose->io_base_alloc); /* This is a PHB, we fully unmap the IO area */ @@ -151,7 +151,7 @@ static int pcibios_map_phb_io_space(struct 
pci_controller *hose) hose->io_base_virt = (void __iomem *)(area->addr + hose->io_base_phys - phys_page); - pr_debug("IO mapping for PHB %s\n", hose->dn->full_name); + pr_debug("IO mapping for PHB %pOF\n", hose->dn); pr_debug(" phys=0x%016llx, virt=0x%p (alloc=0x%p)\n", hose->io_base_phys, hose->io_base_virt, hose->io_base_alloc); pr_debug(" size=0x%016llx (alloc=0x%016lx)\n", diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c index 592693437070..0e395afbf0f4 100644 --- a/arch/powerpc/kernel/pci_dn.c +++ b/arch/powerpc/kernel/pci_dn.c @@ -139,7 +139,6 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev) #ifdef CONFIG_PCI_IOV static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, - struct pci_dev *pdev, int vf_index, int busno, int devfn) { @@ -150,10 +149,8 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, return NULL; pdn = kzalloc(sizeof(*pdn), GFP_KERNEL); - if (!pdn) { - dev_warn(&pdev->dev, "%s: Out of memory!\n", __func__); + if (!pdn) return NULL; - } pdn->phb = parent->phb; pdn->parent = parent; @@ -167,13 +164,6 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent, INIT_LIST_HEAD(&pdn->list); list_add_tail(&pdn->list, &parent->child_list); - /* - * If we already have PCI device instance, lets - * bind them. - */ - if (pdev) - pdev->dev.archdata.pci_data = pdn; - return pdn; } #endif @@ -201,7 +191,7 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev) for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) { struct eeh_dev *edev __maybe_unused; - pdn = add_one_dev_pci_data(parent, NULL, i, + pdn = add_one_dev_pci_data(parent, i, pci_iov_virtfn_bus(pdev, i), pci_iov_virtfn_devfn(pdev, i)); if (!pdn) { @@ -303,7 +293,6 @@ struct pci_dn *pci_add_device_node_info(struct pci_controller *hose, if (pdn == NULL) return NULL; dn->data = pdn; - pdn->node = dn; pdn->phb = hose; #ifdef CONFIG_PPC_POWERNV pdn->pe_number = IODA_INVALID_PE; @@ -352,6 +341,7 @@ EXPORT_SYMBOL_GPL(pci_add_device_node_info); void pci_remove_device_node_info(struct device_node *dn) { struct pci_dn *pdn = dn ? 
PCI_DN(dn) : NULL; + struct device_node *parent; #ifdef CONFIG_EEH struct eeh_dev *edev = pdn_to_eeh_dev(pdn); @@ -364,8 +354,10 @@ void pci_remove_device_node_info(struct device_node *dn) WARN_ON(!list_empty(&pdn->child_list)); list_del(&pdn->list); - if (pdn->parent) - of_node_put(pdn->parent->node); + + parent = of_get_parent(dn); + if (parent) + of_node_put(parent); dn->data = NULL; kfree(pdn); diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c index ea3d98115b88..0d790f8432d2 100644 --- a/arch/powerpc/kernel/pci_of_scan.c +++ b/arch/powerpc/kernel/pci_of_scan.c @@ -211,19 +211,19 @@ void of_scan_pci_bridge(struct pci_dev *dev) unsigned int flags; u64 size; - pr_debug("of_scan_pci_bridge(%s)\n", node->full_name); + pr_debug("of_scan_pci_bridge(%pOF)\n", node); /* parse bus-range property */ busrange = of_get_property(node, "bus-range", &len); if (busrange == NULL || len != 8) { - printk(KERN_DEBUG "Can't get bus-range for PCI-PCI bridge %s\n", - node->full_name); + printk(KERN_DEBUG "Can't get bus-range for PCI-PCI bridge %pOF\n", + node); return; } ranges = of_get_property(node, "ranges", &len); if (ranges == NULL) { - printk(KERN_DEBUG "Can't get ranges for PCI-PCI bridge %s\n", - node->full_name); + printk(KERN_DEBUG "Can't get ranges for PCI-PCI bridge %pOF\n", + node); return; } @@ -233,8 +233,8 @@ void of_scan_pci_bridge(struct pci_dev *dev) bus = pci_add_new_bus(dev->bus, dev, of_read_number(busrange, 1)); if (!bus) { - printk(KERN_ERR "Failed to create pci bus for %s\n", - node->full_name); + printk(KERN_ERR "Failed to create pci bus for %pOF\n", + node); return; } } @@ -262,13 +262,13 @@ void of_scan_pci_bridge(struct pci_dev *dev) res = bus->resource[0]; if (res->flags) { printk(KERN_ERR "PCI: ignoring extra I/O range" - " for bridge %s\n", node->full_name); + " for bridge %pOF\n", node); continue; } } else { if (i >= PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES) { printk(KERN_ERR "PCI: too many memory ranges" - " for bridge %s\n", node->full_name); + " for bridge %pOF\n", node); continue; } res = bus->resource[i]; @@ -307,7 +307,7 @@ static struct pci_dev *of_scan_pci_dev(struct pci_bus *bus, struct eeh_dev *edev = pdn_to_eeh_dev(PCI_DN(dn)); #endif - pr_debug(" * %s\n", dn->full_name); + pr_debug(" * %pOF\n", dn); if (!of_device_is_available(dn)) return NULL; @@ -350,8 +350,8 @@ static void __of_scan_bus(struct device_node *node, struct pci_bus *bus, struct device_node *child; struct pci_dev *dev; - pr_debug("of_scan_bus(%s) bus no %d...\n", - node->full_name, bus->number); + pr_debug("of_scan_bus(%pOF) bus no %d...\n", + node, bus->number); /* Scan direct children */ for_each_child_of_node(node, child) { diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 2ad725ef4368..a0c74bbf3454 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -230,7 +230,8 @@ void enable_kernel_fp(void) } EXPORT_SYMBOL(enable_kernel_fp); -static int restore_fp(struct task_struct *tsk) { +static int restore_fp(struct task_struct *tsk) +{ if (tsk->thread.load_fp || msr_tm_active(tsk->thread.regs->msr)) { load_fp_state(¤t->thread.fp_state); current->thread.load_fp++; @@ -330,11 +331,19 @@ static inline int restore_altivec(struct task_struct *tsk) { return 0; } #ifdef CONFIG_VSX static void __giveup_vsx(struct task_struct *tsk) { - if (tsk->thread.regs->msr & MSR_FP) + unsigned long msr = tsk->thread.regs->msr; + + /* + * We should never be ssetting MSR_VSX without also setting + * MSR_FP and MSR_VEC + */ + 
WARN_ON((msr & MSR_VSX) && !((msr & MSR_FP) && (msr & MSR_VEC))); + + /* __giveup_fpu will clear MSR_VSX */ + if (msr & MSR_FP) __giveup_fpu(tsk); - if (tsk->thread.regs->msr & MSR_VEC) + if (msr & MSR_VEC) __giveup_altivec(tsk); - tsk->thread.regs->msr &= ~MSR_VSX; } static void giveup_vsx(struct task_struct *tsk) @@ -346,14 +355,6 @@ static void giveup_vsx(struct task_struct *tsk) msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX); } -static void save_vsx(struct task_struct *tsk) -{ - if (tsk->thread.regs->msr & MSR_FP) - save_fpu(tsk); - if (tsk->thread.regs->msr & MSR_VEC) - save_altivec(tsk); -} - void enable_kernel_vsx(void) { unsigned long cpumsr; @@ -362,7 +363,8 @@ void enable_kernel_vsx(void) cpumsr = msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX); - if (current->thread.regs && (current->thread.regs->msr & MSR_VSX)) { + if (current->thread.regs && + (current->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP))) { check_if_tm_restore_required(current); /* * If a thread has already been reclaimed then the @@ -373,10 +375,6 @@ void enable_kernel_vsx(void) */ if(!msr_tm_active(cpumsr) && msr_tm_active(current->thread.regs->msr)) return; - if (current->thread.regs->msr & MSR_FP) - __giveup_fpu(current); - if (current->thread.regs->msr & MSR_VEC) - __giveup_altivec(current); __giveup_vsx(current); } } @@ -386,7 +384,7 @@ void flush_vsx_to_thread(struct task_struct *tsk) { if (tsk->thread.regs) { preempt_disable(); - if (tsk->thread.regs->msr & MSR_VSX) { + if (tsk->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP)) { BUG_ON(tsk != current); giveup_vsx(tsk); } @@ -406,7 +404,6 @@ static int restore_vsx(struct task_struct *tsk) } #else static inline int restore_vsx(struct task_struct *tsk) { return 0; } -static inline void save_vsx(struct task_struct *tsk) { } #endif /* CONFIG_VSX */ #ifdef CONFIG_SPE @@ -486,6 +483,8 @@ void giveup_all(struct task_struct *tsk) msr_check_and_set(msr_all_available); check_if_tm_restore_required(tsk); + WARN_ON((usermsr & MSR_VSX) && !((usermsr & MSR_FP) && (usermsr & MSR_VEC))); + #ifdef CONFIG_PPC_FPU if (usermsr & MSR_FP) __giveup_fpu(tsk); @@ -494,10 +493,6 @@ void giveup_all(struct task_struct *tsk) if (usermsr & MSR_VEC) __giveup_altivec(tsk); #endif -#ifdef CONFIG_VSX - if (usermsr & MSR_VSX) - __giveup_vsx(tsk); -#endif #ifdef CONFIG_SPE if (usermsr & MSR_SPE) __giveup_spe(tsk); @@ -552,19 +547,13 @@ void save_all(struct task_struct *tsk) msr_check_and_set(msr_all_available); - /* - * Saving the way the register space is in hardware, save_vsx boils - * down to a save_fpu() and save_altivec() - */ - if (usermsr & MSR_VSX) { - save_vsx(tsk); - } else { - if (usermsr & MSR_FP) - save_fpu(tsk); + WARN_ON((usermsr & MSR_VSX) && !((usermsr & MSR_FP) && (usermsr & MSR_VEC))); - if (usermsr & MSR_VEC) - save_altivec(tsk); - } + if (usermsr & MSR_FP) + save_fpu(tsk); + + if (usermsr & MSR_VEC) + save_altivec(tsk); if (usermsr & MSR_SPE) __giveup_spe(tsk); @@ -1133,6 +1122,11 @@ static inline void restore_sprs(struct thread_struct *old_thread, #endif } +#ifdef CONFIG_PPC_BOOK3S_64 +#define CP_SIZE 128 +static const u8 dummy_copy_buffer[CP_SIZE] __attribute__((aligned(CP_SIZE))); +#endif + struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *new) { @@ -1195,12 +1189,14 @@ struct task_struct *__switch_to(struct task_struct *prev, __switch_to_tm(prev, new); - /* - * We can't take a PMU exception inside _switch() since there is a - * window where the kernel stack SLB and the kernel stack are out - * of sync. Hard disable here. 
- */ - hard_irq_disable(); + if (!radix_enabled()) { + /* + * We can't take a PMU exception inside _switch() since there + * is a window where the kernel stack SLB and the kernel stack + * are out of sync. Hard disable here. + */ + hard_irq_disable(); + } /* * Call restore_sprs() before calling _switch(). If we move it after @@ -1220,8 +1216,28 @@ struct task_struct *__switch_to(struct task_struct *prev, batch->active = 1; } - if (current_thread_info()->task->thread.regs) + if (current_thread_info()->task->thread.regs) { restore_math(current_thread_info()->task->thread.regs); + + /* + * The copy-paste buffer can only store into foreign real + * addresses, so unprivileged processes can not see the + * data or use it in any way unless they have foreign real + * mappings. We don't have a VAS driver that allocates those + * yet, so no cpabort is required. + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + /* + * DD1 allows paste into normal system memory, so we + * do an unpaired copy here to clear the buffer and + * prevent a covert channel being set up. + * + * cpabort is not used because it is quite expensive. + */ + asm volatile(PPC_COPY(%0, %1) + : : "r"(dummy_copy_buffer), "r"(0)); + } + } #endif /* CONFIG_PPC_STD_MMU_64 */ return last; @@ -1364,13 +1380,13 @@ void show_regs(struct pt_regs * regs) show_regs_print_info(KERN_DEFAULT); - printk("NIP: "REG" LR: "REG" CTR: "REG"\n", + printk("NIP: "REG" LR: "REG" CTR: "REG"\n", regs->nip, regs->link, regs->ctr); printk("REGS: %p TRAP: %04lx %s (%s)\n", regs, regs->trap, print_tainted(), init_utsname()->release); - printk("MSR: "REG" ", regs->msr); + printk("MSR: "REG" ", regs->msr); print_msr_bits(regs->msr); - printk(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer); + pr_cont(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer); trap = TRAP(regs); if ((regs->trap != 0xc00) && cpu_has_feature(CPU_FTR_CFAR)) pr_cont("CFAR: "REG" ", regs->orig_gpr3); @@ -1963,11 +1979,25 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) void notrace __ppc64_runlatch_on(void) { struct thread_info *ti = current_thread_info(); - unsigned long ctrl; - ctrl = mfspr(SPRN_CTRLF); - ctrl |= CTRL_RUNLATCH; - mtspr(SPRN_CTRLT, ctrl); + if (cpu_has_feature(CPU_FTR_ARCH_206)) { + /* + * Least significant bit (RUN) is the only writable bit of + * the CTRL register, so we can avoid mfspr. 2.06 is not the + * earliest ISA where this is the case, but it's convenient. + */ + mtspr(SPRN_CTRLT, CTRL_RUNLATCH); + } else { + unsigned long ctrl; + + /* + * Some architectures (e.g., Cell) have writable fields other + * than RUN, so do the read-modify-write. 
+ */ + ctrl = mfspr(SPRN_CTRLF); + ctrl |= CTRL_RUNLATCH; + mtspr(SPRN_CTRLT, ctrl); + } ti->local_flags |= _TLF_RUNLATCH; } @@ -1976,13 +2006,18 @@ void notrace __ppc64_runlatch_on(void) void notrace __ppc64_runlatch_off(void) { struct thread_info *ti = current_thread_info(); - unsigned long ctrl; ti->local_flags &= ~_TLF_RUNLATCH; - ctrl = mfspr(SPRN_CTRLF); - ctrl &= ~CTRL_RUNLATCH; - mtspr(SPRN_CTRLT, ctrl); + if (cpu_has_feature(CPU_FTR_ARCH_206)) { + mtspr(SPRN_CTRLT, 0); + } else { + unsigned long ctrl; + + ctrl = mfspr(SPRN_CTRLF); + ctrl &= ~CTRL_RUNLATCH; + mtspr(SPRN_CTRLT, ctrl); + } } #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index dd8a04f3053a..02190e90c7ae 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -15,6 +15,9 @@ #undef DEBUG_PROM +/* we cannot use FORTIFY as it brings in new symbols */ +#define __NO_FORTIFY + #include <stdarg.h> #include <linux/kernel.h> #include <linux/string.h> @@ -174,6 +177,7 @@ struct platform_support { bool hash_mmu; bool radix_mmu; bool radix_gtse; + bool xive; }; /* Platforms codes are now obsolete in the kernel. Now only used within this @@ -1038,6 +1042,27 @@ static void __init prom_parse_mmu_model(u8 val, } } +static void __init prom_parse_xive_model(u8 val, + struct platform_support *support) +{ + switch (val) { + case OV5_FEAT(OV5_XIVE_EITHER): /* Either Available */ + prom_debug("XIVE - either mode supported\n"); + support->xive = true; + break; + case OV5_FEAT(OV5_XIVE_EXPLOIT): /* Only Exploitation mode */ + prom_debug("XIVE - exploitation mode supported\n"); + support->xive = true; + break; + case OV5_FEAT(OV5_XIVE_LEGACY): /* Only Legacy mode */ + prom_debug("XIVE - legacy mode supported\n"); + break; + default: + prom_debug("Unknown xive support option: 0x%x\n", val); + break; + } +} + static void __init prom_parse_platform_support(u8 index, u8 val, struct platform_support *support) { @@ -1051,6 +1076,10 @@ static void __init prom_parse_platform_support(u8 index, u8 val, support->radix_gtse = true; } break; + case OV5_INDX(OV5_XIVE_SUPPORT): /* Interrupt mode */ + prom_parse_xive_model(val & OV5_FEAT(OV5_XIVE_SUPPORT), + support); + break; } } @@ -1059,7 +1088,8 @@ static void __init prom_check_platform_support(void) struct platform_support supported = { .hash_mmu = false, .radix_mmu = false, - .radix_gtse = false + .radix_gtse = false, + .xive = false }; int prop_len = prom_getproplen(prom.chosen, "ibm,arch-vec-5-platform-support"); @@ -1092,6 +1122,11 @@ static void __init prom_check_platform_support(void) /* We're probably on a legacy hypervisor */ prom_debug("Assuming legacy hash support\n"); } + + if (supported.xive) { + prom_debug("Asking for XIVE\n"); + ibm_architecture_vec.vec5.intarch = OV5_FEAT(OV5_XIVE_EXPLOIT); + } } static void __init prom_send_capabilities(void) diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 925a4ef90559..07cd22e35405 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -127,12 +127,19 @@ static void flush_tmregs_to_thread(struct task_struct *tsk) * If task is not current, it will have been flushed already to * it's thread_struct during __switch_to(). * - * A reclaim flushes ALL the state. + * A reclaim flushes ALL the state or if not in TM save TM SPRs + * in the appropriate thread structures from live. 
*/ - if (tsk == current && MSR_TM_SUSPENDED(mfmsr())) - tm_reclaim_current(TM_CAUSE_SIGNAL); + if (tsk != current) + return; + if (MSR_TM_SUSPENDED(mfmsr())) { + tm_reclaim_current(TM_CAUSE_SIGNAL); + } else { + tm_enable(); + tm_save_sprs(&(tsk->thread)); + } } #else static inline void flush_tmregs_to_thread(struct task_struct *tsk) { } @@ -1587,11 +1594,8 @@ static int ppr_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - int ret; - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.ppr, 0, sizeof(u64)); - return ret; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.ppr, 0, sizeof(u64)); } static int ppr_set(struct task_struct *target, @@ -1599,11 +1603,8 @@ static int ppr_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - int ret; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.ppr, 0, sizeof(u64)); - return ret; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.ppr, 0, sizeof(u64)); } static int dscr_get(struct task_struct *target, @@ -1611,22 +1612,16 @@ static int dscr_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - int ret; - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.dscr, 0, sizeof(u64)); - return ret; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.dscr, 0, sizeof(u64)); } static int dscr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - int ret; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.dscr, 0, sizeof(u64)); - return ret; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.dscr, 0, sizeof(u64)); } #endif #ifdef CONFIG_PPC_BOOK3S_64 @@ -1635,22 +1630,16 @@ static int tar_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - int ret; - - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.tar, 0, sizeof(u64)); - return ret; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.tar, 0, sizeof(u64)); } static int tar_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - int ret; - - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.tar, 0, sizeof(u64)); - return ret; + return user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.tar, 0, sizeof(u64)); } static int ebb_active(struct task_struct *target, diff --git a/arch/powerpc/kernel/reloc_64.S b/arch/powerpc/kernel/reloc_64.S index d88736fbece6..e8cfc69f59ae 100644 --- a/arch/powerpc/kernel/reloc_64.S +++ b/arch/powerpc/kernel/reloc_64.S @@ -82,7 +82,7 @@ _GLOBAL(relocate) 6: blr .balign 8 -p_dyn: .llong __dynamic_start - 0b -p_rela: .llong __rela_dyn_start - 0b -p_st: .llong _stext - 0b +p_dyn: .8byte __dynamic_start - 0b +p_rela: .8byte __rela_dyn_start - 0b +p_st: .8byte _stext - 0b diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c index 73f1934582c2..c2b148b1634a 100644 --- a/arch/powerpc/kernel/rtas_pci.c +++ b/arch/powerpc/kernel/rtas_pci.c @@ -91,26 +91,14 @@ static int rtas_pci_read_config(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) { - struct device_node *busdn, *dn; 
struct pci_dn *pdn; - bool found = false; int ret; - /* Search only direct children of the bus */ *val = 0xFFFFFFFF; - busdn = pci_bus_to_OF_node(bus); - for (dn = busdn->child; dn; dn = dn->sibling) { - pdn = PCI_DN(dn); - if (pdn && pdn->devfn == devfn - && of_device_is_available(dn)) { - found = true; - break; - } - } - if (!found) - return PCIBIOS_DEVICE_NOT_FOUND; + pdn = pci_get_pdn_by_devfn(bus, devfn); + /* Validity of pdn is checked in here */ ret = rtas_read_config(pdn, where, size, val); if (*val == EEH_IO_ERROR_VALUE(size) && eeh_dev_check_failure(pdn_to_eeh_dev(pdn))) @@ -153,24 +141,11 @@ static int rtas_pci_write_config(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) { - struct device_node *busdn, *dn; struct pci_dn *pdn; - bool found = false; - - /* Search only direct children of the bus */ - busdn = pci_bus_to_OF_node(bus); - for (dn = busdn->child; dn; dn = dn->sibling) { - pdn = PCI_DN(dn); - if (pdn && pdn->devfn == devfn - && of_device_is_available(dn)) { - found = true; - break; - } - } - if (!found) - return PCIBIOS_DEVICE_NOT_FOUND; + pdn = pci_get_pdn_by_devfn(bus, devfn); + /* Validity of pdn is checked in here. */ return rtas_write_config(pdn, where, size, val); } diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 3650732639ed..0f0b1b2f3b60 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -283,7 +283,7 @@ static void prrn_work_fn(struct work_struct *work) * the RTAS event. */ pseries_devicetree_update(-prrn_update_scope); - arch_update_cpu_topology(); + numa_update_cpu_topology(false); } static DECLARE_WORK(prrn_work, prrn_work_fn); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 857129acf960..7de73589d8e2 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -335,6 +335,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) maj = ((pvr >> 8) & 0xFF) - 1; min = pvr & 0xFF; break; + case 0x004e: /* POWER9 bits 12-15 give chip type */ + maj = (pvr >> 8) & 0x0F; + min = pvr & 0xFF; + break; default: maj = (pvr >> 8) & 0xFF; min = pvr & 0xFF; @@ -477,7 +481,7 @@ void __init smp_setup_cpu_maps(void) __be32 cpu_be; int j, len; - DBG(" * %s...\n", dn->full_name); + DBG(" * %pOF...\n", dn); intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); @@ -700,30 +704,6 @@ int check_legacy_ioport(unsigned long base_port) } EXPORT_SYMBOL(check_legacy_ioport); -static int ppc_panic_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - /* - * If firmware-assisted dump has been registered then trigger - * firmware-assisted dump and let firmware handle everything else. - */ - crash_fadump(NULL, ptr); - ppc_md.panic(ptr); /* May not return */ - return NOTIFY_DONE; -} - -static struct notifier_block ppc_panic_block = { - .notifier_call = ppc_panic_event, - .priority = INT_MIN /* may not return; must be done last */ -}; - -void __init setup_panic(void) -{ - if (!ppc_md.panic) - return; - atomic_notifier_chain_register(&panic_notifier_list, &ppc_panic_block); -} - #ifdef CONFIG_CHECK_CACHE_COHERENCY /* * For platforms that have configurable cache-coherency. This function @@ -868,9 +848,6 @@ void __init setup_arch(char **cmdline_p) /* Probe the machine type, establish ppc_md. */ probe_machine(); - /* Setup panic notifier if requested by the platform. */ - setup_panic(); - /* * Configure ppc_md.power_save (ppc32 only, 64-bit machines do * it from their respective probe() function. 
@@ -912,13 +889,6 @@ void __init setup_arch(char **cmdline_p) /* Reserve large chunks of memory for use by CMA for KVM. */ kvm_cma_reserve(); - /* - * Reserve any gigantic pages requested on the command line. - * memblock needs to have been initialized by the time this is - * called since this will reserve memory. - */ - reserve_hugetlb_gpages(); - klp_init_thread_info(&init_thread_info); init_mm.start_code = (unsigned long)_stext; diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 2f88f6cf1a42..51ebc01fff52 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -98,6 +98,9 @@ extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */ notrace void __init machine_init(u64 dt_ptr) { + unsigned int *addr = &memset_nocache_branch; + unsigned long insn; + /* Configure static keys first, now that we're relocated. */ setup_feature_keys(); @@ -105,7 +108,9 @@ notrace void __init machine_init(u64 dt_ptr) udbg_early_init(); patch_instruction((unsigned int *)&memcpy, PPC_INST_NOP); - patch_instruction(&memset_nocache_branch, PPC_INST_NOP); + + insn = create_cond_branch(addr, branch_target(addr), 0x820000); + patch_instruction(addr, insn); /* replace b by bne cr0 */ /* Do some early initialization based on the flat device tree */ early_init_devtree(__va(dt_ptr)); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 4640f6d64f8b..b89c6aac48c9 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -564,6 +564,9 @@ static __init u64 safe_stack_limit(void) /* Other BookE, we assume the first GB is bolted */ return 1ul << 30; #else + if (early_radix_enabled()) + return ULONG_MAX; + /* BookS, the first segment is bolted */ if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) return 1UL << SID_SHIFT_1T; @@ -578,7 +581,8 @@ void __init irqstack_early_init(void) /* * Interrupt stacks must be in the first segment since we - * cannot afford to take SLB misses on them. + * cannot afford to take SLB misses on them. They are not + * accessed in realmode. */ for_each_possible_cpu(i) { softirq_ctx[i] = (struct thread_info *) @@ -649,8 +653,9 @@ void __init emergency_stack_init(void) * aligned. * * Since we use these as temporary stacks during secondary CPU - * bringup, we need to get at them in real mode. This means they - * must also be within the RMO region. + * bringup, machine check, system reset, and HMI, we need to get + * at them in real mode. This means they must also be within the RMO + * region. * * The IRQ stacks allocated elsewhere in this file are zeroed and * initialized in kernel/irq.c. These are initialized here in order @@ -752,21 +757,30 @@ struct ppc_pci_io ppc_pci_io; EXPORT_SYMBOL(ppc_pci_io); #endif -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF u64 hw_nmi_get_sample_period(int watchdog_thresh) { return ppc_proc_freq * watchdog_thresh; } +#endif /* - * The hardlockup detector breaks PMU event based branches and is likely - * to get false positives in KVM guests, so disable it by default. + * The perf based hardlockup detector breaks PMU event based branches, so + * disable it by default. Book3S has a soft-nmi hardlockup detector based + * on the decrementer interrupt, so it does not suffer from this problem. + * + * It is likely to get false positives in VM guests, so disable it there + * by default too. 
*/ static int __init disable_hardlockup_detector(void) { +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF hardlockup_detector_disable(); +#else + if (firmware_has_feature(FW_FEATURE_LPAR)) + hardlockup_detector_disable(); +#endif return 0; } early_initcall(disable_hardlockup_detector); -#endif diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index df2a41647d8e..e0a4c1f82e25 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -33,6 +33,7 @@ #include <linux/notifier.h> #include <linux/topology.h> #include <linux/profile.h> +#include <linux/processor.h> #include <asm/ptrace.h> #include <linux/atomic.h> @@ -74,9 +75,11 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 }; struct thread_info *secondary_ti; DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); +DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map); DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); +EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map); EXPORT_PER_CPU_SYMBOL(cpu_core_map); /* SMP operations for this machine */ @@ -97,7 +100,7 @@ int smp_generic_cpu_bootable(unsigned int nr) /* Special case - we inhibit secondary thread startup * during boot if the user requests it. */ - if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) { + if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) { if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) return 0; if (smt_enabled_at_boot @@ -112,7 +115,8 @@ int smp_generic_cpu_bootable(unsigned int nr) #ifdef CONFIG_PPC64 int smp_generic_kick_cpu(int nr) { - BUG_ON(nr < 0 || nr >= NR_CPUS); + if (nr < 0 || nr >= nr_cpu_ids) + return -EINVAL; /* * The processor is currently spinning, waiting for the @@ -349,7 +353,7 @@ static void nmi_ipi_lock_start(unsigned long *flags) hard_irq_disable(); while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) { raw_local_irq_restore(*flags); - cpu_relax(); + spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0); raw_local_irq_save(*flags); hard_irq_disable(); } @@ -358,7 +362,7 @@ static void nmi_ipi_lock_start(unsigned long *flags) static void nmi_ipi_lock(void) { while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) - cpu_relax(); + spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0); } static void nmi_ipi_unlock(void) @@ -433,13 +437,31 @@ static void do_smp_send_nmi_ipi(int cpu) } } +void smp_flush_nmi_ipi(u64 delay_us) +{ + unsigned long flags; + + nmi_ipi_lock_start(&flags); + while (nmi_ipi_busy_count) { + nmi_ipi_unlock_end(&flags); + udelay(1); + if (delay_us) { + delay_us--; + if (!delay_us) + return; + } + nmi_ipi_lock_start(&flags); + } + nmi_ipi_unlock_end(&flags); +} + /* * - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS. * - fn is the target callback function. * - delay_us > 0 is the delay before giving up waiting for targets to * enter the handler, == 0 specifies indefinite delay. 
*/ -static int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us) +int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us) { unsigned long flags; int me = raw_smp_processor_id(); @@ -455,7 +477,7 @@ static int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us) nmi_ipi_lock_start(&flags); while (nmi_ipi_busy_count) { nmi_ipi_unlock_end(&flags); - cpu_relax(); + spin_until_cond(nmi_ipi_busy_count == 0); nmi_ipi_lock_start(&flags); } @@ -551,6 +573,26 @@ static void smp_store_cpu_info(int id) #endif } +/* + * Relationships between CPUs are maintained in a set of per-cpu cpumasks so + * rather than just passing around the cpumask we pass around a function that + * returns the that cpumask for the given CPU. + */ +static void set_cpus_related(int i, int j, struct cpumask *(*get_cpumask)(int)) +{ + cpumask_set_cpu(i, get_cpumask(j)); + cpumask_set_cpu(j, get_cpumask(i)); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void set_cpus_unrelated(int i, int j, + struct cpumask *(*get_cpumask)(int)) +{ + cpumask_clear_cpu(i, get_cpumask(j)); + cpumask_clear_cpu(j, get_cpumask(i)); +} +#endif + void __init smp_prepare_cpus(unsigned int max_cpus) { unsigned int cpu; @@ -570,6 +612,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus) for_each_possible_cpu(cpu) { zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu), GFP_KERNEL, cpu_to_node(cpu)); + zalloc_cpumask_var_node(&per_cpu(cpu_l2_cache_map, cpu), + GFP_KERNEL, cpu_to_node(cpu)); zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu), GFP_KERNEL, cpu_to_node(cpu)); /* @@ -582,7 +626,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } } + /* Init the cpumasks so the boot CPU is related to itself */ cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid)); + cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid)); cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid)); if (smp_ops && smp_ops->probe) @@ -766,8 +812,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) smp_ops->give_timebase(); /* Wait until cpu puts itself in the online & active maps */ - while (!cpu_online(cpu)) - cpu_relax(); + spin_until_cond(cpu_online(cpu)); return 0; } @@ -809,33 +854,6 @@ int cpu_first_thread_of_core(int core) } EXPORT_SYMBOL_GPL(cpu_first_thread_of_core); -static void traverse_siblings_chip_id(int cpu, bool add, int chipid) -{ - const struct cpumask *mask; - struct device_node *np; - int i, plen; - const __be32 *prop; - - mask = add ? cpu_online_mask : cpu_present_mask; - for_each_cpu(i, mask) { - np = of_get_cpu_node(i, NULL); - if (!np) - continue; - prop = of_get_property(np, "ibm,chip-id", &plen); - if (prop && plen == sizeof(int) && - of_read_number(prop, 1) == chipid) { - if (add) { - cpumask_set_cpu(cpu, cpu_core_mask(i)); - cpumask_set_cpu(i, cpu_core_mask(cpu)); - } else { - cpumask_clear_cpu(cpu, cpu_core_mask(i)); - cpumask_clear_cpu(i, cpu_core_mask(cpu)); - } - } - of_node_put(np); - } -} - /* Must be called when no change can occur to cpu_present_mask, * i.e. during cpu online or offline. 
*/ @@ -858,52 +876,93 @@ static struct device_node *cpu_to_l2cache(int cpu) return cache; } -static void traverse_core_siblings(int cpu, bool add) +static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int)) { struct device_node *l2_cache, *np; - const struct cpumask *mask; - int i, chip, plen; - const __be32 *prop; - - /* First see if we have ibm,chip-id properties in cpu nodes */ - np = of_get_cpu_node(cpu, NULL); - if (np) { - chip = -1; - prop = of_get_property(np, "ibm,chip-id", &plen); - if (prop && plen == sizeof(int)) - chip = of_read_number(prop, 1); - of_node_put(np); - if (chip >= 0) { - traverse_siblings_chip_id(cpu, add, chip); - return; - } - } + int i; l2_cache = cpu_to_l2cache(cpu); - mask = add ? cpu_online_mask : cpu_present_mask; - for_each_cpu(i, mask) { + if (!l2_cache) + return false; + + for_each_cpu(i, cpu_online_mask) { + /* + * when updating the marks the current CPU has not been marked + * online, but we need to update the cache masks + */ np = cpu_to_l2cache(i); if (!np) continue; - if (np == l2_cache) { - if (add) { - cpumask_set_cpu(cpu, cpu_core_mask(i)); - cpumask_set_cpu(i, cpu_core_mask(cpu)); - } else { - cpumask_clear_cpu(cpu, cpu_core_mask(i)); - cpumask_clear_cpu(i, cpu_core_mask(cpu)); - } - } + + if (np == l2_cache) + set_cpus_related(cpu, i, mask_fn); + of_node_put(np); } of_node_put(l2_cache); + + return true; } +#ifdef CONFIG_HOTPLUG_CPU +static void remove_cpu_from_masks(int cpu) +{ + int i; + + /* NB: cpu_core_mask is a superset of the others */ + for_each_cpu(i, cpu_core_mask(cpu)) { + set_cpus_unrelated(cpu, i, cpu_core_mask); + set_cpus_unrelated(cpu, i, cpu_l2_cache_mask); + set_cpus_unrelated(cpu, i, cpu_sibling_mask); + } +} +#endif + +static void add_cpu_to_masks(int cpu) +{ + int first_thread = cpu_first_thread_sibling(cpu); + int chipid = cpu_to_chip_id(cpu); + int i; + + /* + * This CPU will not be in the online mask yet so we need to manually + * add it to it's own thread sibling mask. + */ + cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); + + for (i = first_thread; i < first_thread + threads_per_core; i++) + if (cpu_online(i)) + set_cpus_related(i, cpu, cpu_sibling_mask); + + /* + * Copy the thread sibling mask into the cache sibling mask + * and mark any CPUs that share an L2 with this CPU. + */ + for_each_cpu(i, cpu_sibling_mask(cpu)) + set_cpus_related(cpu, i, cpu_l2_cache_mask); + update_mask_by_l2(cpu, cpu_l2_cache_mask); + + /* + * Copy the cache sibling mask into core sibling mask and mark + * any CPUs on the same chip as this CPU. + */ + for_each_cpu(i, cpu_l2_cache_mask(cpu)) + set_cpus_related(cpu, i, cpu_core_mask); + + if (chipid == -1) + return; + + for_each_cpu(i, cpu_online_mask) + if (cpu_to_chip_id(i) == chipid) + set_cpus_related(cpu, i, cpu_core_mask); +} + +static bool shared_caches; + /* Activate a secondary processor. 
*/ void start_secondary(void *unused) { unsigned int cpu = smp_processor_id(); - int i, base; mmgrab(&init_mm); current->active_mm = &init_mm; @@ -926,22 +985,15 @@ void start_secondary(void *unused) vdso_getcpu_init(); #endif - /* Update sibling maps */ - base = cpu_first_thread_sibling(cpu); - for (i = 0; i < threads_per_core; i++) { - if (cpu_is_offline(base + i) && (cpu != base + i)) - continue; - cpumask_set_cpu(cpu, cpu_sibling_mask(base + i)); - cpumask_set_cpu(base + i, cpu_sibling_mask(cpu)); + /* Update topology CPU masks */ + add_cpu_to_masks(cpu); - /* cpu_core_map should be a superset of - * cpu_sibling_map even if we don't have cache - * information, so update the former here, too. - */ - cpumask_set_cpu(cpu, cpu_core_mask(base + i)); - cpumask_set_cpu(base + i, cpu_core_mask(cpu)); - } - traverse_core_siblings(cpu, true); + /* + * Check for any shared caches. Note that this must be done on a + * per-core basis because one core in the pair might be disabled. + */ + if (!cpumask_equal(cpu_l2_cache_mask(cpu), cpu_sibling_mask(cpu))) + shared_caches = true; set_numa_node(numa_cpu_lookup_table[cpu]); set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu])); @@ -984,35 +1036,65 @@ static struct sched_domain_topology_level powerpc_topology[] = { { NULL, }, }; -static __init long smp_setup_cpu_workfn(void *data __always_unused) +/* + * P9 has a slightly odd architecture where pairs of cores share an L2 cache. + * This topology makes it *much* cheaper to migrate tasks between adjacent cores + * since the migrated task remains cache hot. We want to take advantage of this + * at the scheduler level so an extra topology level is required. + */ +static int powerpc_shared_cache_flags(void) { - smp_ops->setup_cpu(boot_cpuid); - return 0; + return SD_SHARE_PKG_RESOURCES; +} + +/* + * We can't just pass cpu_l2_cache_mask() directly because + * returns a non-const pointer and the compiler barfs on that. + */ +static const struct cpumask *shared_cache_mask(int cpu) +{ + return cpu_l2_cache_mask(cpu); } +static struct sched_domain_topology_level power9_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) }, +#endif + { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) }, + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + void __init smp_cpus_done(unsigned int max_cpus) { /* - * We want the setup_cpu() here to be called on the boot CPU, but - * init might run on any CPU, so make sure it's invoked on the boot - * CPU. + * We are running pinned to the boot CPU, see rest_init(). */ if (smp_ops && smp_ops->setup_cpu) - work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, NULL); + smp_ops->setup_cpu(boot_cpuid); if (smp_ops && smp_ops->bringup_done) smp_ops->bringup_done(); dump_numa_cpu_topology(); - set_sched_topology(powerpc_topology); + /* + * If any CPU detects that it's sharing a cache with another CPU then + * use the deeper topology that is aware of this sharing. 
+ */ + if (shared_caches) { + pr_info("Using shared cache scheduler topology\n"); + set_sched_topology(power9_topology); + } else { + pr_info("Using standard scheduler topology\n"); + set_sched_topology(powerpc_topology); + } } #ifdef CONFIG_HOTPLUG_CPU int __cpu_disable(void) { int cpu = smp_processor_id(); - int base, i; int err; if (!smp_ops->cpu_disable) @@ -1023,14 +1105,7 @@ int __cpu_disable(void) return err; /* Update sibling maps */ - base = cpu_first_thread_sibling(cpu); - for (i = 0; i < threads_per_core && base + i < nr_cpu_ids; i++) { - cpumask_clear_cpu(cpu, cpu_sibling_mask(base + i)); - cpumask_clear_cpu(base + i, cpu_sibling_mask(cpu)); - cpumask_clear_cpu(cpu, cpu_core_mask(base + i)); - cpumask_clear_cpu(base + i, cpu_core_mask(cpu)); - } - traverse_core_siblings(cpu, false); + remove_cpu_from_masks(cpu); return 0; } diff --git a/arch/powerpc/kernel/swsusp_asm64.S b/arch/powerpc/kernel/swsusp_asm64.S index 988f38dced0f..82d8aae81c6a 100644 --- a/arch/powerpc/kernel/swsusp_asm64.S +++ b/arch/powerpc/kernel/swsusp_asm64.S @@ -179,7 +179,7 @@ nothing_to_copy: sld r3, r3, r0 li r0, 0 1: - dcbf r0,r3 + dcbf 0,r3 addi r3,r3,0x20 bdnz 1b diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S index 4d6b1d3a747f..7ccb7f81f8db 100644 --- a/arch/powerpc/kernel/systbl.S +++ b/arch/powerpc/kernel/systbl.S @@ -17,13 +17,13 @@ #include <asm/ppc_asm.h> #ifdef CONFIG_PPC64 -#define SYSCALL(func) .llong DOTSYM(sys_##func),DOTSYM(sys_##func) -#define COMPAT_SYS(func) .llong DOTSYM(sys_##func),DOTSYM(compat_sys_##func) -#define PPC_SYS(func) .llong DOTSYM(ppc_##func),DOTSYM(ppc_##func) -#define OLDSYS(func) .llong DOTSYM(sys_ni_syscall),DOTSYM(sys_ni_syscall) -#define SYS32ONLY(func) .llong DOTSYM(sys_ni_syscall),DOTSYM(compat_sys_##func) -#define PPC64ONLY(func) .llong DOTSYM(ppc_##func),DOTSYM(sys_ni_syscall) -#define SYSX(f, f3264, f32) .llong DOTSYM(f),DOTSYM(f3264) +#define SYSCALL(func) .8byte DOTSYM(sys_##func),DOTSYM(sys_##func) +#define COMPAT_SYS(func) .8byte DOTSYM(sys_##func),DOTSYM(compat_sys_##func) +#define PPC_SYS(func) .8byte DOTSYM(ppc_##func),DOTSYM(ppc_##func) +#define OLDSYS(func) .8byte DOTSYM(sys_ni_syscall),DOTSYM(sys_ni_syscall) +#define SYS32ONLY(func) .8byte DOTSYM(sys_ni_syscall),DOTSYM(compat_sys_##func) +#define PPC64ONLY(func) .8byte DOTSYM(ppc_##func),DOTSYM(sys_ni_syscall) +#define SYSX(f, f3264, f32) .8byte DOTSYM(f),DOTSYM(f3264) #else #define SYSCALL(func) .long sys_##func #define COMPAT_SYS(func) .long sys_##func diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 2b33cfaac7b8..fe6f3a285455 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -59,10 +59,10 @@ #include <linux/suspend.h> #include <linux/rtc.h> #include <linux/sched/cputime.h> +#include <linux/processor.h> #include <asm/trace.h> #include <asm/io.h> -#include <asm/processor.h> #include <asm/nvram.h> #include <asm/cache.h> #include <asm/machdep.h> @@ -442,6 +442,7 @@ void __delay(unsigned long loops) unsigned long start; int diff; + spin_begin(); if (__USE_RTC()) { start = get_rtcl(); do { @@ -449,13 +450,14 @@ void __delay(unsigned long loops) diff = get_rtcl() - start; if (diff < 0) diff += 1000000000; + spin_cpu_relax(); } while (diff < loops); } else { start = get_tbl(); while (get_tbl() - start < loops) - HMT_low(); - HMT_medium(); + spin_cpu_relax(); } + spin_end(); } EXPORT_SYMBOL(__delay); @@ -675,7 +677,7 @@ EXPORT_SYMBOL_GPL(tb_to_ns); * the high 64 bits of a * b, i.e. 
(a * b) >> 64, where a and b * are 64-bit unsigned numbers. */ -unsigned long long sched_clock(void) +notrace unsigned long long sched_clock(void) { if (__USE_RTC()) return get_rtc(); @@ -739,12 +741,20 @@ static int __init get_freq(char *name, int cells, unsigned long *val) static void start_cpu_decrementer(void) { #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) + unsigned int tcr; + /* Clear any pending timer interrupts */ mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS); - /* Enable decrementer interrupt */ - mtspr(SPRN_TCR, TCR_DIE); -#endif /* defined(CONFIG_BOOKE) || defined(CONFIG_40x) */ + tcr = mfspr(SPRN_TCR); + /* + * The watchdog may have already been enabled by u-boot. So leave + * TRC[WP] (Watchdog Period) alone. + */ + tcr &= TCR_WP_MASK; /* Clear all bits except for TCR[WP] */ + tcr |= TCR_DIE; /* Enable decrementer */ + mtspr(SPRN_TCR, tcr); +#endif } void __init generic_calibrate_decr(void) @@ -823,38 +833,76 @@ void read_persistent_clock(struct timespec *ts) } /* clocksource code */ -static u64 rtc_read(struct clocksource *cs) +static notrace u64 rtc_read(struct clocksource *cs) { return (u64)get_rtc(); } -static u64 timebase_read(struct clocksource *cs) +static notrace u64 timebase_read(struct clocksource *cs) { return (u64)get_tb(); } -void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm, - struct clocksource *clock, u32 mult, u64 cycle_last) + +void update_vsyscall(struct timekeeper *tk) { + struct timespec xt; + struct clocksource *clock = tk->tkr_mono.clock; + u32 mult = tk->tkr_mono.mult; + u32 shift = tk->tkr_mono.shift; + u64 cycle_last = tk->tkr_mono.cycle_last; u64 new_tb_to_xs, new_stamp_xsec; - u32 frac_sec; + u64 frac_sec; if (clock != &clocksource_timebase) return; + xt.tv_sec = tk->xtime_sec; + xt.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); + /* Make userspace gettimeofday spin until we're done. */ ++vdso_data->tb_update_count; smp_mb(); - /* 19342813113834067 ~= 2^(20+64) / 1e9 */ - new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift); - new_stamp_xsec = (u64) wall_time->tv_nsec * XSEC_PER_SEC; - do_div(new_stamp_xsec, 1000000000); - new_stamp_xsec += (u64) wall_time->tv_sec * XSEC_PER_SEC; + /* + * This computes ((2^20 / 1e9) * mult) >> shift as a + * 0.64 fixed-point fraction. + * The computation in the else clause below won't overflow + * (as long as the timebase frequency is >= 1.049 MHz) + * but loses precision because we lose the low bits of the constant + * in the shift. Note that 19342813113834067 ~= 2^(20+64) / 1e9. + * For a shift of 24 the error is about 0.5e-9, or about 0.5ns + * over a second. (Shift values are usually 22, 23 or 24.) + * For high frequency clocks such as the 512MHz timebase clock + * on POWER[6789], the mult value is small (e.g. 32768000) + * and so we can shift the constant by 16 initially + * (295147905179 ~= 2^(20+64-16) / 1e9) and then do the + * remaining shifts after the multiplication, which gives a + * more accurate result (e.g. with mult = 32768000, shift = 24, + * the error is only about 1.2e-12, or 0.7ns over 10 minutes). + */ + if (mult <= 62500000 && clock->shift >= 16) + new_tb_to_xs = ((u64) mult * 295147905179ULL) >> (clock->shift - 16); + else + new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift); + + /* + * Compute the fractional second in units of 2^-32 seconds. + * The fractional second is tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift + * in nanoseconds, so multiplying that by 2^32 / 1e9 gives + * it in units of 2^-32 seconds. 
+ * We assume shift <= 32 because clocks_calc_mult_shift() + * generates shift values in the range 0 - 32. + */ + frac_sec = tk->tkr_mono.xtime_nsec << (32 - shift); + do_div(frac_sec, NSEC_PER_SEC); - BUG_ON(wall_time->tv_nsec >= NSEC_PER_SEC); - /* this is tv_nsec / 1e9 as a 0.32 fraction */ - frac_sec = ((u64) wall_time->tv_nsec * 18446744073ULL) >> 32; + /* + * Work out new stamp_xsec value for any legacy users of systemcfg. + * stamp_xsec is in units of 2^-20 seconds. + */ + new_stamp_xsec = frac_sec >> 12; + new_stamp_xsec += tk->xtime_sec * XSEC_PER_SEC; /* * tb_update_count is used to allow the userspace gettimeofday code @@ -864,15 +912,13 @@ void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm, * the two values of tb_update_count match and are even then the * tb_to_xs and stamp_xsec values are consistent. If not, then it * loops back and reads them again until this criteria is met. - * We expect the caller to have done the first increment of - * vdso_data->tb_update_count already. */ vdso_data->tb_orig_stamp = cycle_last; vdso_data->stamp_xsec = new_stamp_xsec; vdso_data->tb_to_xs = new_tb_to_xs; - vdso_data->wtom_clock_sec = wtm->tv_sec; - vdso_data->wtom_clock_nsec = wtm->tv_nsec; - vdso_data->stamp_xtime = *wall_time; + vdso_data->wtom_clock_sec = tk->wall_to_monotonic.tv_sec; + vdso_data->wtom_clock_nsec = tk->wall_to_monotonic.tv_nsec; + vdso_data->stamp_xtime = xt; vdso_data->stamp_sec_fraction = frac_sec; smp_wmb(); ++(vdso_data->tb_update_count); diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 3a2d04134da9..c4ba37822ba0 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -313,8 +313,8 @@ dont_backup_fp: blr - /* void tm_recheckpoint(struct thread_struct *thread, - * unsigned long orig_msr) + /* void __tm_recheckpoint(struct thread_struct *thread, + * unsigned long orig_msr) * - Restore the checkpointed register state saved by tm_reclaim * when we switch_to a process. * diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index d4e545d27ef9..ec74e203ee04 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -114,6 +114,28 @@ static void pmac_backlight_unblank(void) static inline void pmac_backlight_unblank(void) { } #endif +/* + * If oops/die is expected to crash the machine, return true here. + * + * This should not be expected to be 100% accurate, there may be + * notifiers registered or other unexpected conditions that may bring + * down the kernel. Or if the current process in the kernel is holding + * locks or has other critical state, the kernel may become effectively + * unusable anyway. + */ +bool die_will_crash(void) +{ + if (should_fadump_crash()) + return true; + if (kexec_should_crash(current)) + return true; + if (in_interrupt() || panic_on_oops || + !current->pid || is_global_init(current)) + return true; + + return false; +} + static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; @@ -162,21 +184,9 @@ static void oops_end(unsigned long flags, struct pt_regs *regs, crash_fadump(regs, "die oops"); - /* - * A system reset (0x100) is a request to dump, so we always send - * it through the crashdump code. - */ - if (kexec_should_crash(current) || (TRAP(regs) == 0x100)) { + if (kexec_should_crash(current)) crash_kexec(regs); - /* - * We aren't the primary crash CPU. We need to send it - * to a holding pattern to avoid it ending up in the panic - * code. 
- */ - crash_kexec_secondary(regs); - } - if (!signr) return; @@ -202,18 +212,25 @@ NOKPROBE_SYMBOL(oops_end); static int __die(const char *str, struct pt_regs *regs, long err) { printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP NR_CPUS=%d ", NR_CPUS); -#endif + + if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) + printk("LE "); + else + printk("BE "); + + if (IS_ENABLED(CONFIG_PREEMPT)) + pr_cont("PREEMPT "); + + if (IS_ENABLED(CONFIG_SMP)) + pr_cont("SMP NR_CPUS=%d ", NR_CPUS); + if (debug_pagealloc_enabled()) - printk("DEBUG_PAGEALLOC "); -#ifdef CONFIG_NUMA - printk("NUMA "); -#endif - printk("%s\n", ppc_md.name ? ppc_md.name : ""); + pr_cont("DEBUG_PAGEALLOC "); + + if (IS_ENABLED(CONFIG_NUMA)) + pr_cont("NUMA "); + + pr_cont("%s\n", ppc_md.name ? ppc_md.name : ""); if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) == NOTIFY_STOP) return 1; @@ -237,6 +254,7 @@ void die(const char *str, struct pt_regs *regs, long err) err = 0; oops_end(flags, regs, err); } +NOKPROBE_SYMBOL(die); void user_single_step_siginfo(struct task_struct *tsk, struct pt_regs *regs, siginfo_t *info) @@ -287,23 +305,52 @@ void system_reset_exception(struct pt_regs *regs) if (!nested) nmi_enter(); + __this_cpu_inc(irq_stat.sreset_irqs); + /* See if any machine dependent calls */ if (ppc_md.system_reset_exception) { if (ppc_md.system_reset_exception(regs)) goto out; } - die("System Reset", regs, SIGABRT); + if (debugger(regs)) + goto out; + + /* + * A system reset is a request to dump, so we always send + * it through the crashdump code (if fadump or kdump are + * registered). + */ + crash_fadump(regs, "System Reset"); + + crash_kexec(regs); + + /* + * We aren't the primary crash CPU. We need to send it + * to a holding pattern to avoid it ending up in the panic + * code. + */ + crash_kexec_secondary(regs); + + /* + * No debugger or crash dump registered, print logs then + * panic. + */ + __die("System Reset", regs, SIGABRT); + + mdelay(2*MSEC_PER_SEC); /* Wait a little while for others to print */ + add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE); + nmi_panic(regs, "System Reset"); out: #ifdef CONFIG_PPC_BOOK3S_64 BUG_ON(get_paca()->in_nmi == 0); if (get_paca()->in_nmi > 1) - panic("Unrecoverable nested System Reset"); + nmi_panic(regs, "Unrecoverable nested System Reset"); #endif /* Must die if the interrupt is not recoverable */ if (!(regs->msr & MSR_RI)) - panic("Unrecoverable System Reset"); + nmi_panic(regs, "Unrecoverable System Reset"); if (!nested) nmi_exit(); @@ -311,39 +358,6 @@ out: /* What should we do here? We could issue a shutdown or hard reset. */ } -#ifdef CONFIG_PPC64 -/* - * This function is called in real mode. Strictly no printk's please. - * - * regs->nip and regs->msr contains srr0 and ssr1. - */ -long machine_check_early(struct pt_regs *regs) -{ - long handled = 0; - - __this_cpu_inc(irq_stat.mce_exceptions); - - if (cur_cpu_spec && cur_cpu_spec->machine_check_early) - handled = cur_cpu_spec->machine_check_early(regs); - return handled; -} - -long hmi_exception_realmode(struct pt_regs *regs) -{ - __this_cpu_inc(irq_stat.hmi_exceptions); - - wait_for_subcore_guest_exit(); - - if (ppc_md.hmi_exception_early) - ppc_md.hmi_exception_early(regs); - - wait_for_tb_resync(); - - return 0; -} - -#endif - /* * I/O accesses can cause machine checks on powermacs. 
* Check if the NIP corresponds to the address of a sync @@ -396,11 +410,6 @@ static inline int check_io_access(struct pt_regs *regs) /* On 4xx, the reason for the machine check or program exception is in the ESR. */ #define get_reason(regs) ((regs)->dsisr) -#ifndef CONFIG_FSL_BOOKE -#define get_mc_reason(regs) ((regs)->dsisr) -#else -#define get_mc_reason(regs) (mfspr(SPRN_MCSR)) -#endif #define REASON_FP ESR_FP #define REASON_ILLEGAL (ESR_PIL | ESR_PUO) #define REASON_PRIVILEGED ESR_PPR @@ -414,108 +423,17 @@ static inline int check_io_access(struct pt_regs *regs) /* On non-4xx, the reason for the machine check or program exception is in the MSR. */ #define get_reason(regs) ((regs)->msr) -#define get_mc_reason(regs) ((regs)->msr) -#define REASON_TM 0x200000 -#define REASON_FP 0x100000 -#define REASON_ILLEGAL 0x80000 -#define REASON_PRIVILEGED 0x40000 -#define REASON_TRAP 0x20000 +#define REASON_TM SRR1_PROGTM +#define REASON_FP SRR1_PROGFPE +#define REASON_ILLEGAL SRR1_PROGILL +#define REASON_PRIVILEGED SRR1_PROGPRIV +#define REASON_TRAP SRR1_PROGTRAP #define single_stepping(regs) ((regs)->msr & MSR_SE) #define clear_single_step(regs) ((regs)->msr &= ~MSR_SE) #endif -#if defined(CONFIG_4xx) -int machine_check_4xx(struct pt_regs *regs) -{ - unsigned long reason = get_mc_reason(regs); - - if (reason & ESR_IMCP) { - printk("Instruction"); - mtspr(SPRN_ESR, reason & ~ESR_IMCP); - } else - printk("Data"); - printk(" machine check in kernel mode.\n"); - - return 0; -} - -int machine_check_440A(struct pt_regs *regs) -{ - unsigned long reason = get_mc_reason(regs); - - printk("Machine check in kernel mode.\n"); - if (reason & ESR_IMCP){ - printk("Instruction Synchronous Machine Check exception\n"); - mtspr(SPRN_ESR, reason & ~ESR_IMCP); - } - else { - u32 mcsr = mfspr(SPRN_MCSR); - if (mcsr & MCSR_IB) - printk("Instruction Read PLB Error\n"); - if (mcsr & MCSR_DRB) - printk("Data Read PLB Error\n"); - if (mcsr & MCSR_DWB) - printk("Data Write PLB Error\n"); - if (mcsr & MCSR_TLBP) - printk("TLB Parity Error\n"); - if (mcsr & MCSR_ICP){ - flush_instruction_cache(); - printk("I-Cache Parity Error\n"); - } - if (mcsr & MCSR_DCSP) - printk("D-Cache Search Parity Error\n"); - if (mcsr & MCSR_DCFP) - printk("D-Cache Flush Parity Error\n"); - if (mcsr & MCSR_IMPE) - printk("Machine Check exception is imprecise\n"); - - /* Clear MCSR */ - mtspr(SPRN_MCSR, mcsr); - } - return 0; -} - -int machine_check_47x(struct pt_regs *regs) -{ - unsigned long reason = get_mc_reason(regs); - u32 mcsr; - - printk(KERN_ERR "Machine check in kernel mode.\n"); - if (reason & ESR_IMCP) { - printk(KERN_ERR - "Instruction Synchronous Machine Check exception\n"); - mtspr(SPRN_ESR, reason & ~ESR_IMCP); - return 0; - } - mcsr = mfspr(SPRN_MCSR); - if (mcsr & MCSR_IB) - printk(KERN_ERR "Instruction Read PLB Error\n"); - if (mcsr & MCSR_DRB) - printk(KERN_ERR "Data Read PLB Error\n"); - if (mcsr & MCSR_DWB) - printk(KERN_ERR "Data Write PLB Error\n"); - if (mcsr & MCSR_TLBP) - printk(KERN_ERR "TLB Parity Error\n"); - if (mcsr & MCSR_ICP) { - flush_instruction_cache(); - printk(KERN_ERR "I-Cache Parity Error\n"); - } - if (mcsr & MCSR_DCSP) - printk(KERN_ERR "D-Cache Search Parity Error\n"); - if (mcsr & PPC47x_MCSR_GPR) - printk(KERN_ERR "GPR Parity Error\n"); - if (mcsr & PPC47x_MCSR_FPR) - printk(KERN_ERR "FPR Parity Error\n"); - if (mcsr & PPC47x_MCSR_IPR) - printk(KERN_ERR "Machine Check exception is imprecise\n"); - - /* Clear MCSR */ - mtspr(SPRN_MCSR, mcsr); - - return 0; -} -#elif defined(CONFIG_E500) +#if 
defined(CONFIG_E500) int machine_check_e500mc(struct pt_regs *regs) { unsigned long mcsr = mfspr(SPRN_MCSR); @@ -617,7 +535,7 @@ silent_out: int machine_check_e500(struct pt_regs *regs) { - unsigned long reason = get_mc_reason(regs); + unsigned long reason = mfspr(SPRN_MCSR); if (reason & MCSR_BUS_RBERR) { if (fsl_rio_mcheck_exception(regs)) @@ -664,7 +582,7 @@ int machine_check_generic(struct pt_regs *regs) #elif defined(CONFIG_E200) int machine_check_e200(struct pt_regs *regs) { - unsigned long reason = get_mc_reason(regs); + unsigned long reason = mfspr(SPRN_MCSR); printk("Machine check in kernel mode.\n"); printk("Caused by (from MCSR=%lx): ", reason); @@ -686,35 +604,10 @@ int machine_check_e200(struct pt_regs *regs) return 0; } -#elif defined(CONFIG_PPC_8xx) -int machine_check_8xx(struct pt_regs *regs) -{ - unsigned long reason = get_mc_reason(regs); - - pr_err("Machine check in kernel mode.\n"); - pr_err("Caused by (from SRR1=%lx): ", reason); - if (reason & 0x40000000) - pr_err("Fetch error at address %lx\n", regs->nip); - else - pr_err("Data access error at address %lx\n", regs->dar); - -#ifdef CONFIG_PCI - /* the qspan pci read routines can cause machine checks -- Cort - * - * yuck !!! that totally needs to go away ! There are better ways - * to deal with that than having a wart in the mcheck handler. - * -- BenH - */ - bad_page_fault(regs, regs->dar, SIGBUS); - return 1; -#else - return 0; -#endif -} -#else +#elif defined(CONFIG_PPC32) int machine_check_generic(struct pt_regs *regs) { - unsigned long reason = get_mc_reason(regs); + unsigned long reason = regs->msr; printk("Machine check in kernel mode.\n"); printk("Caused by (from SRR1=%lx): ", reason); @@ -751,10 +644,14 @@ int machine_check_generic(struct pt_regs *regs) void machine_check_exception(struct pt_regs *regs) { - enum ctx_state prev_state = exception_enter(); int recover = 0; + bool nested = in_nmi(); + if (!nested) + nmi_enter(); - __this_cpu_inc(irq_stat.mce_exceptions); + /* 64s accounts the mce in machine_check_early when in HVMODE */ + if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) || !cpu_has_feature(CPU_FTR_HVMODE)) + __this_cpu_inc(irq_stat.mce_exceptions); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); @@ -782,10 +679,11 @@ void machine_check_exception(struct pt_regs *regs) /* Must die if the interrupt is not recoverable */ if (!(regs->msr & MSR_RI)) - panic("Unrecoverable Machine check"); + nmi_panic(regs, "Unrecoverable Machine check"); bail: - exception_exit(prev_state); + if (!nested) + nmi_exit(); } void SMIException(struct pt_regs *regs) @@ -1671,24 +1569,6 @@ void performance_monitor_exception(struct pt_regs *regs) perf_irq(regs); } -#ifdef CONFIG_8xx -void SoftwareEmulation(struct pt_regs *regs) -{ - CHECK_FULL_REGS(regs); - - if (!user_mode(regs)) { - debugger(regs); - die("Kernel Mode Unimplemented Instruction or SW FPU Emulation", - regs, SIGFPE); - } - - if (!emulate_math(regs)) - return; - - _exception(SIGILL, regs, ILL_ILLOPC, regs->nip); -} -#endif /* CONFIG_8xx */ - #ifdef CONFIG_PPC_ADV_DEBUG_REGS static void handle_debug(struct pt_regs *regs, unsigned long debug_status) { @@ -1968,6 +1848,7 @@ void unrecoverable_exception(struct pt_regs *regs) regs->trap, regs->nip); die("Unrecoverable exception", regs, SIGABRT); } +NOKPROBE_SYMBOL(unrecoverable_exception); #if defined(CONFIG_BOOKE_WDT) || defined(CONFIG_40x) /* @@ -1998,6 +1879,7 @@ void kernel_bad_stack(struct pt_regs *regs) regs->gpr[1], regs->nip); die("Bad kernel stack pointer", regs, SIGABRT); } +NOKPROBE_SYMBOL(kernel_bad_stack); 
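[Editor's note] The traps.c hunks above retire get_mc_reason() and re-express the program-check REASON_* masks in terms of the symbolic SRR1_PROG* bits. As a reference, here is a small stand-alone user-space sketch (not part of the patch) that decodes a reason word using the literal values being removed; the bit values are the old #define literals from the hunk, while the decode logic is purely illustrative and only mirrors the kind of bit tests the traps.c handlers perform.

/*
 * Illustrative decoder for a program-check "reason" word.
 * The literals are the values the patch replaces with SRR1_PROG* names.
 */
#include <stdio.h>

#define REASON_TRAP        0x20000UL   /* SRR1_PROGTRAP  */
#define REASON_PRIVILEGED  0x40000UL   /* SRR1_PROGPRIV  */
#define REASON_ILLEGAL     0x80000UL   /* SRR1_PROGILL   */
#define REASON_FP          0x100000UL  /* SRR1_PROGFPE   */
#define REASON_TM          0x200000UL  /* SRR1_PROGTM    */

static void decode_reason(unsigned long reason)
{
	if (reason & REASON_FP)
		printf("floating-point enabled exception\n");
	if (reason & REASON_ILLEGAL)
		printf("illegal instruction\n");
	if (reason & REASON_PRIVILEGED)
		printf("privileged instruction in problem state\n");
	if (reason & REASON_TRAP)
		printf("trap (tw/twi) instruction\n");
	if (reason & REASON_TM)
		printf("transactional memory related exception\n");
}

int main(void)
{
	decode_reason(0x100000UL);	/* hypothetical FP-enabled exception */
	decode_reason(0x20000UL);	/* hypothetical trap instruction */
	return 0;
}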
void __init trap_init(void) { diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c index 003b20964ea0..5d105b8eeece 100644 --- a/arch/powerpc/kernel/uprobes.c +++ b/arch/powerpc/kernel/uprobes.c @@ -205,3 +205,12 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs return orig_ret_vaddr; } + +bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) +{ + if (ctx == RP_CHECK_CHAIN_CALL) + return regs->gpr[1] <= ret->stack; + else + return regs->gpr[1] < ret->stack; +} diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index 6b2b69616e77..769c2624e0a6 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -232,15 +232,9 @@ __do_get_tspec: lwz r6,(CFG_TB_ORIG_STAMP+4)(r9) /* Get a stable TB value */ -#ifdef CONFIG_8xx -2: mftbu r3 - mftbl r4 - mftbu r0 -#else -2: mfspr r3, SPRN_TBRU - mfspr r4, SPRN_TBRL - mfspr r0, SPRN_TBRU -#endif +2: MFTBU(r3) + MFTBL(r4) + MFTBU(r0) cmplw cr0,r3,r0 bne- 2b diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 2f793be3d2b1..882628fa6987 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -8,6 +8,12 @@ #include <asm/cache.h> #include <asm/thread_info.h> +#if defined(CONFIG_STRICT_KERNEL_RWX) && !defined(CONFIG_PPC32) +#define STRICT_ALIGN_SIZE (1 << 24) +#else +#define STRICT_ALIGN_SIZE PAGE_SIZE +#endif + ENTRY(_stext) PHDRS { @@ -58,7 +64,6 @@ SECTIONS #ifdef CONFIG_PPC64 KEEP(*(.head.text.first_256B)); #ifdef CONFIG_PPC_BOOK3E -# define END_FIXED 0x100 #else KEEP(*(.head.text.real_vectors)); *(.head.text.real_trampolines); @@ -66,12 +71,8 @@ SECTIONS *(.head.text.virt_trampolines); # if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) KEEP(*(.head.data.fwnmi_page)); -# define END_FIXED 0x8000 -# else -# define END_FIXED 0x7000 # endif #endif - ASSERT((. == END_FIXED), "vmlinux.lds.S: fixed section overflow error"); #else /* !CONFIG_PPC64 */ HEAD_TEXT #endif @@ -79,23 +80,6 @@ SECTIONS __head_end = .; - /* - * If the build dies here, it's likely code in head_64.S is referencing - * labels it can't reach, and the linker inserting stubs without the - * assembler's knowledge. To debug, remove the above assert and - * rebuild. Look for branch stubs in the fixed section region. - * - * Linker stub generation could be allowed in "trampoline" - * sections if absolutely necessary, but this would require - * some rework of the fixed sections. Before resorting to this, - * consider references that have sufficient addressing range, - * (e.g., hand coded trampolines) so the linker does not have - * to add stubs. - * - * Linker stubs at the top of the main text section are currently not - * detected, and will result in a crash at boot due to offsets being - * wrong. - */ #ifdef CONFIG_PPC64 /* * BLOCK(0) overrides the default output section alignment because @@ -103,18 +87,31 @@ SECTIONS * section placement to work. */ .text BLOCK(0) : AT(ADDR(.text) - LOAD_OFFSET) { +#ifdef CONFIG_LD_HEAD_STUB_CATCH + *(.linker_stub_catch); + . = . ; +#endif + #else .text : AT(ADDR(.text) - LOAD_OFFSET) { ALIGN_FUNCTION(); #endif /* careful! 
__ftr_alt_* sections need to be close to .text */ - *(.text .fixup __ftr_alt_* .ref.text) + *(.text.hot .text .text.fixup .text.unlikely .fixup __ftr_alt_* .ref.text); SCHED_TEXT CPUIDLE_TEXT LOCK_TEXT KPROBES_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT + /* + * -Os builds call FP save/restore functions. The powerpc64 + * linker generates those on demand in the .sfpr section. + * .sfpr gets placed at the beginning of a group of input + * sections, which can break start-of-text offset if it is + * included with the main text sections, so put it by itself. + */ + *(.sfpr); MEM_KEEP(init.text) MEM_KEEP(exit.text) @@ -132,7 +129,7 @@ SECTIONS PROVIDE32 (etext = .); /* Read-only data */ - RODATA + RO_DATA(PAGE_SIZE) EXCEPTION_TABLE(0) @@ -149,7 +146,7 @@ SECTIONS /* * Init sections discarded at runtime */ - . = ALIGN(PAGE_SIZE); + . = ALIGN(STRICT_ALIGN_SIZE); __init_begin = .; INIT_TEXT_SECTION(PAGE_SIZE) :kernel @@ -267,7 +264,9 @@ SECTIONS .data : AT(ADDR(.data) - LOAD_OFFSET) { DATA_DATA *(.sdata) + *(.sdata2) *(.got.plt) *(.got) + *(.plt) } #else .data : AT(ADDR(.data) - LOAD_OFFSET) { @@ -330,6 +329,16 @@ SECTIONS _end = . ; PROVIDE32 (end = .); - /* Sections to be discarded. */ + STABS_DEBUG + + DWARF_DEBUG + DISCARDS + /DISCARD/ : { + *(*.EMB.apuinfo) + *(.glink .iplt .plt .rela* .comment) + *(.gnu.version*) + *(.gnu.attributes) + *(.eh_frame) + } } diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c new file mode 100644 index 000000000000..2f6eadd9408d --- /dev/null +++ b/arch/powerpc/kernel/watchdog.c @@ -0,0 +1,412 @@ +/* + * Watchdog support on powerpc systems. + * + * Copyright 2017, IBM Corporation. + * + * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c + */ +#include <linux/kernel.h> +#include <linux/param.h> +#include <linux/init.h> +#include <linux/percpu.h> +#include <linux/cpu.h> +#include <linux/nmi.h> +#include <linux/module.h> +#include <linux/export.h> +#include <linux/kprobes.h> +#include <linux/hardirq.h> +#include <linux/reboot.h> +#include <linux/slab.h> +#include <linux/kdebug.h> +#include <linux/sched/debug.h> +#include <linux/delay.h> +#include <linux/smp.h> + +#include <asm/paca.h> + +/* + * The watchdog has a simple timer that runs on each CPU, once per timer + * period. This is the heartbeat. + * + * Then there are checks to see if the heartbeat has not triggered on a CPU + * for the panic timeout period. Currently the watchdog only supports an + * SMP check, so the heartbeat only turns on when we have 2 or more CPUs. + * + * This is not an NMI watchdog, but Linux uses that name for a generic + * watchdog in some cases, so NMI gets used in some places. + */ + +static cpumask_t wd_cpus_enabled __read_mostly; + +static u64 wd_panic_timeout_tb __read_mostly; /* timebase ticks until panic */ +static u64 wd_smp_panic_timeout_tb __read_mostly; /* panic other CPUs */ + +static u64 wd_timer_period_ms __read_mostly; /* interval between heartbeat */ + +static DEFINE_PER_CPU(struct timer_list, wd_timer); +static DEFINE_PER_CPU(u64, wd_timer_tb); + +/* + * These are for the SMP checker. CPUs clear their pending bit in their + * heartbeat. If the bitmask becomes empty, the time is noted and the + * bitmask is refilled. + * + * All CPUs clear their bit in the pending mask every timer period. + * Once all have cleared, the time is noted and the bits are reset. + * If the time since all clear was greater than the panic timeout, + * we can panic with the list of stuck CPUs. 
+ * + * This will work best with NMI IPIs for crash code so the stuck CPUs + * can be pulled out to get their backtraces. + */ +static unsigned long __wd_smp_lock; +static cpumask_t wd_smp_cpus_pending; +static cpumask_t wd_smp_cpus_stuck; +static u64 wd_smp_last_reset_tb; + +static inline void wd_smp_lock(unsigned long *flags) +{ + /* + * Avoid locking layers if possible. + * This may be called from low level interrupt handlers at some + * point in future. + */ + raw_local_irq_save(*flags); + hard_irq_disable(); /* Make it soft-NMI safe */ + while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) { + raw_local_irq_restore(*flags); + spin_until_cond(!test_bit(0, &__wd_smp_lock)); + raw_local_irq_save(*flags); + hard_irq_disable(); + } +} + +static inline void wd_smp_unlock(unsigned long *flags) +{ + clear_bit_unlock(0, &__wd_smp_lock); + raw_local_irq_restore(*flags); +} + +static void wd_lockup_ipi(struct pt_regs *regs) +{ + pr_emerg("Watchdog CPU:%d Hard LOCKUP\n", raw_smp_processor_id()); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); +} + +static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb) +{ + cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask); + cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask); + if (cpumask_empty(&wd_smp_cpus_pending)) { + wd_smp_last_reset_tb = tb; + cpumask_andnot(&wd_smp_cpus_pending, + &wd_cpus_enabled, + &wd_smp_cpus_stuck); + } +} +static void set_cpu_stuck(int cpu, u64 tb) +{ + set_cpumask_stuck(cpumask_of(cpu), tb); +} + +static void watchdog_smp_panic(int cpu, u64 tb) +{ + unsigned long flags; + int c; + + wd_smp_lock(&flags); + /* Double check some things under lock */ + if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb) + goto out; + if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) + goto out; + if (cpumask_weight(&wd_smp_cpus_pending) == 0) + goto out; + + pr_emerg("Watchdog CPU:%d detected Hard LOCKUP other CPUS:%*pbl\n", + cpu, cpumask_pr_args(&wd_smp_cpus_pending)); + + /* + * Try to trigger the stuck CPUs. + */ + for_each_cpu(c, &wd_smp_cpus_pending) { + if (c == cpu) + continue; + smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000); + } + smp_flush_nmi_ipi(1000000); + + /* Take the stuck CPUs out of the watch group */ + set_cpumask_stuck(&wd_smp_cpus_pending, tb); + + wd_smp_unlock(&flags); + + printk_safe_flush(); + /* + * printk_safe_flush() seems to require another print + * before anything actually goes out to console. 
+ */ + if (sysctl_hardlockup_all_cpu_backtrace) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + nmi_panic(NULL, "Hard LOCKUP"); + + return; + +out: + wd_smp_unlock(&flags); +} + +static void wd_smp_clear_cpu_pending(int cpu, u64 tb) +{ + if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) { + if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) { + unsigned long flags; + + pr_emerg("Watchdog CPU:%d became unstuck\n", cpu); + wd_smp_lock(&flags); + cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck); + wd_smp_unlock(&flags); + } + return; + } + cpumask_clear_cpu(cpu, &wd_smp_cpus_pending); + if (cpumask_empty(&wd_smp_cpus_pending)) { + unsigned long flags; + + wd_smp_lock(&flags); + if (cpumask_empty(&wd_smp_cpus_pending)) { + wd_smp_last_reset_tb = tb; + cpumask_andnot(&wd_smp_cpus_pending, + &wd_cpus_enabled, + &wd_smp_cpus_stuck); + } + wd_smp_unlock(&flags); + } +} + +static void watchdog_timer_interrupt(int cpu) +{ + u64 tb = get_tb(); + + per_cpu(wd_timer_tb, cpu) = tb; + + wd_smp_clear_cpu_pending(cpu, tb); + + if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb) + watchdog_smp_panic(cpu, tb); +} + +void soft_nmi_interrupt(struct pt_regs *regs) +{ + unsigned long flags; + int cpu = raw_smp_processor_id(); + u64 tb; + + if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) + return; + + nmi_enter(); + + __this_cpu_inc(irq_stat.soft_nmi_irqs); + + tb = get_tb(); + if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) { + per_cpu(wd_timer_tb, cpu) = tb; + + wd_smp_lock(&flags); + if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) { + wd_smp_unlock(&flags); + goto out; + } + set_cpu_stuck(cpu, tb); + + pr_emerg("Watchdog CPU:%d Hard LOCKUP\n", cpu); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + wd_smp_unlock(&flags); + + if (sysctl_hardlockup_all_cpu_backtrace) + trigger_allbutself_cpu_backtrace(); + + if (hardlockup_panic) + nmi_panic(regs, "Hard LOCKUP"); + } + if (wd_panic_timeout_tb < 0x7fffffff) + mtspr(SPRN_DEC, wd_panic_timeout_tb); + +out: + nmi_exit(); +} + +static void wd_timer_reset(unsigned int cpu, struct timer_list *t) +{ + t->expires = jiffies + msecs_to_jiffies(wd_timer_period_ms); + if (wd_timer_period_ms > 1000) + t->expires = __round_jiffies_up(t->expires, cpu); + add_timer_on(t, cpu); +} + +static void wd_timer_fn(unsigned long data) +{ + struct timer_list *t = this_cpu_ptr(&wd_timer); + int cpu = smp_processor_id(); + + watchdog_timer_interrupt(cpu); + + wd_timer_reset(cpu, t); +} + +void arch_touch_nmi_watchdog(void) +{ + unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000; + int cpu = smp_processor_id(); + + if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks) + watchdog_timer_interrupt(cpu); +} +EXPORT_SYMBOL(arch_touch_nmi_watchdog); + +static void start_watchdog_timer_on(unsigned int cpu) +{ + struct timer_list *t = per_cpu_ptr(&wd_timer, cpu); + + per_cpu(wd_timer_tb, cpu) = get_tb(); + + setup_pinned_timer(t, wd_timer_fn, 0); + wd_timer_reset(cpu, t); +} + +static void stop_watchdog_timer_on(unsigned int cpu) +{ + struct timer_list *t = per_cpu_ptr(&wd_timer, cpu); + + del_timer_sync(t); +} + +static int start_wd_on_cpu(unsigned int cpu) +{ + unsigned long flags; + + if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) { + WARN_ON(1); + return 0; + } + + if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) + return 0; + + if (watchdog_suspended) + return 0; + + if (!cpumask_test_cpu(cpu, &watchdog_cpumask)) + return 0; + + wd_smp_lock(&flags); + cpumask_set_cpu(cpu, 
&wd_cpus_enabled); + if (cpumask_weight(&wd_cpus_enabled) == 1) { + cpumask_set_cpu(cpu, &wd_smp_cpus_pending); + wd_smp_last_reset_tb = get_tb(); + } + wd_smp_unlock(&flags); + + start_watchdog_timer_on(cpu); + + return 0; +} + +static int stop_wd_on_cpu(unsigned int cpu) +{ + unsigned long flags; + + if (!cpumask_test_cpu(cpu, &wd_cpus_enabled)) + return 0; /* Can happen in CPU unplug case */ + + stop_watchdog_timer_on(cpu); + + wd_smp_lock(&flags); + cpumask_clear_cpu(cpu, &wd_cpus_enabled); + wd_smp_unlock(&flags); + + wd_smp_clear_cpu_pending(cpu, get_tb()); + + return 0; +} + +static void watchdog_calc_timeouts(void) +{ + wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq; + + /* Have the SMP detector trigger a bit later */ + wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2; + + /* 2/5 is the factor that the perf based detector uses */ + wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5; +} + +void watchdog_nmi_reconfigure(void) +{ + int cpu; + + watchdog_calc_timeouts(); + + for_each_cpu(cpu, &wd_cpus_enabled) + stop_wd_on_cpu(cpu); + + for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask) + start_wd_on_cpu(cpu); +} + +/* + * This runs after lockup_detector_init() which sets up watchdog_cpumask. + */ +static int __init powerpc_watchdog_init(void) +{ + int err; + + watchdog_calc_timeouts(); + + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online", + start_wd_on_cpu, stop_wd_on_cpu); + if (err < 0) + pr_warn("Watchdog could not be initialized"); + + return 0; +} +arch_initcall(powerpc_watchdog_init); + +static void handle_backtrace_ipi(struct pt_regs *regs) +{ + nmi_cpu_backtrace(regs); +} + +static void raise_backtrace_ipi(cpumask_t *mask) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) { + if (cpu == smp_processor_id()) + handle_backtrace_ipi(NULL); + else + smp_send_nmi_ipi(cpu, handle_backtrace_ipi, 1000000); + } +} + +void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self) +{ + nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_backtrace_ipi); +} |
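[Editor's note] watchdog_calc_timeouts() above derives all three watchdog timeouts from watchdog_thresh and the timebase frequency. Below is a stand-alone sketch of that arithmetic, assuming a 10 second threshold and a 512 MHz timebase (typical of recent POWER systems; both values are assumptions, not taken from the patch).

/*
 * Stand-alone sketch of the watchdog_calc_timeouts() arithmetic.
 * watchdog_thresh and ppc_tb_freq are illustrative assumptions.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t watchdog_thresh = 10;		/* seconds, assumed default */
	uint64_t ppc_tb_freq = 512000000;	/* timebase ticks/second, assumed */

	/* Per-CPU hard lockup limit, in timebase ticks */
	uint64_t wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq;

	/* SMP detector fires a bit later than the per-CPU one (3/2 factor) */
	uint64_t wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2;

	/* Heartbeat period: 2/5 of the threshold, as the perf detector uses */
	uint64_t wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;

	printf("panic timeout:     %llu tb ticks\n",
	       (unsigned long long)wd_panic_timeout_tb);
	printf("SMP panic timeout: %llu tb ticks\n",
	       (unsigned long long)wd_smp_panic_timeout_tb);
	printf("heartbeat period:  %llu ms\n",
	       (unsigned long long)wd_timer_period_ms);
	return 0;
}

With those assumed inputs the per-CPU limit works out to 5,120,000,000 timebase ticks, the SMP detector triggers after 7,680,000,000 ticks, and each CPU heartbeats every 4000 ms.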