summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/kernel')
-rw-r--r--arch/powerpc/kernel/Makefile5
-rw-r--r--arch/powerpc/kernel/align.c774
-rw-r--r--arch/powerpc/kernel/asm-offsets.c21
-rw-r--r--arch/powerpc/kernel/btext.c2
-rw-r--r--arch/powerpc/kernel/cacheinfo.c34
-rw-r--r--arch/powerpc/kernel/cpu_setup_power.S13
-rw-r--r--arch/powerpc/kernel/cputable.c8
-rw-r--r--arch/powerpc/kernel/dma-iommu.c6
-rw-r--r--arch/powerpc/kernel/dma.c17
-rw-r--r--arch/powerpc/kernel/dt_cpu_ftrs.c16
-rw-r--r--arch/powerpc/kernel/eeh.c20
-rw-r--r--arch/powerpc/kernel/eeh_dev.c7
-rw-r--r--arch/powerpc/kernel/eeh_driver.c2
-rw-r--r--arch/powerpc/kernel/eeh_pe.c90
-rw-r--r--arch/powerpc/kernel/eeh_sysfs.c3
-rw-r--r--arch/powerpc/kernel/entry_32.S22
-rw-r--r--arch/powerpc/kernel/entry_64.S142
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S349
-rw-r--r--arch/powerpc/kernel/fadump.c230
-rw-r--r--arch/powerpc/kernel/head_32.S6
-rw-r--r--arch/powerpc/kernel/head_64.S8
-rw-r--r--arch/powerpc/kernel/head_8xx.S109
-rw-r--r--arch/powerpc/kernel/idle_book3s.S333
-rw-r--r--arch/powerpc/kernel/io-workarounds.c9
-rw-r--r--arch/powerpc/kernel/iommu.c35
-rw-r--r--arch/powerpc/kernel/irq.c140
-rw-r--r--arch/powerpc/kernel/isa-bridge.c32
-rw-r--r--arch/powerpc/kernel/kgdb.c4
-rw-r--r--arch/powerpc/kernel/kprobes.c10
-rw-r--r--arch/powerpc/kernel/l2cr_6xx.S4
-rw-r--r--arch/powerpc/kernel/legacy_serial.c12
-rw-r--r--arch/powerpc/kernel/mce.c35
-rw-r--r--arch/powerpc/kernel/mce_power.c59
-rw-r--r--arch/powerpc/kernel/misc_32.S6
-rw-r--r--arch/powerpc/kernel/misc_64.S12
-rw-r--r--arch/powerpc/kernel/nvram_64.c14
-rw-r--r--arch/powerpc/kernel/of_platform.c2
-rw-r--r--arch/powerpc/kernel/optprobes.c53
-rw-r--r--arch/powerpc/kernel/optprobes_head.S8
-rw-r--r--arch/powerpc/kernel/paca.c13
-rw-r--r--arch/powerpc/kernel/pci-common.c15
-rw-r--r--arch/powerpc/kernel/pci_32.c4
-rw-r--r--arch/powerpc/kernel/pci_64.c4
-rw-r--r--arch/powerpc/kernel/pci_dn.c22
-rw-r--r--arch/powerpc/kernel/pci_of_scan.c24
-rw-r--r--arch/powerpc/kernel/process.c141
-rw-r--r--arch/powerpc/kernel/prom_init.c37
-rw-r--r--arch/powerpc/kernel/ptrace.c55
-rw-r--r--arch/powerpc/kernel/reloc_64.S6
-rw-r--r--arch/powerpc/kernel/rtas_pci.c33
-rw-r--r--arch/powerpc/kernel/rtasd.c2
-rw-r--r--arch/powerpc/kernel/setup-common.c40
-rw-r--r--arch/powerpc/kernel/setup_32.c7
-rw-r--r--arch/powerpc/kernel/setup_64.c28
-rw-r--r--arch/powerpc/kernel/smp.c269
-rw-r--r--arch/powerpc/kernel/swsusp_asm64.S2
-rw-r--r--arch/powerpc/kernel/systbl.S14
-rw-r--r--arch/powerpc/kernel/time.c96
-rw-r--r--arch/powerpc/kernel/tm.S4
-rw-r--r--arch/powerpc/kernel/traps.c308
-rw-r--r--arch/powerpc/kernel/uprobes.c9
-rw-r--r--arch/powerpc/kernel/vdso32/gettimeofday.S12
-rw-r--r--arch/powerpc/kernel/vmlinux.lds.S61
-rw-r--r--arch/powerpc/kernel/watchdog.c412
64 files changed, 2333 insertions, 1937 deletions
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index e132902e1f14..91960f83039c 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -25,8 +25,6 @@ CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
-# timers used by tracing
-CFLAGS_REMOVE_time.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
endif
obj-y := cputable.o ptrace.o syscalls.o \
@@ -40,6 +38,7 @@ obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \
signal_64.o ptrace32.o \
paca.o nvram_64.o firmware.o
obj-$(CONFIG_VDSO32) += vdso32/
+obj-$(CONFIG_PPC_WATCHDOG) += watchdog.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o
obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o
@@ -84,7 +83,7 @@ extra-y := head_$(BITS).o
extra-$(CONFIG_40x) := head_40x.o
extra-$(CONFIG_44x) := head_44x.o
extra-$(CONFIG_FSL_BOOKE) := head_fsl_booke.o
-extra-$(CONFIG_8xx) := head_8xx.o
+extra-$(CONFIG_PPC_8xx) := head_8xx.o
extra-y += vmlinux.lds
obj-$(CONFIG_RELOCATABLE) += reloc_$(BITS).o
diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c
index ec7a8b099dd9..26b9994d27ee 100644
--- a/arch/powerpc/kernel/align.c
+++ b/arch/powerpc/kernel/align.c
@@ -27,6 +27,7 @@
#include <asm/switch_to.h>
#include <asm/disassemble.h>
#include <asm/cpu_has_feature.h>
+#include <asm/sstep.h>
struct aligninfo {
unsigned char len;
@@ -40,364 +41,9 @@ struct aligninfo {
#define LD 0 /* load */
#define ST 1 /* store */
#define SE 2 /* sign-extend value, or FP ld/st as word */
-#define F 4 /* to/from fp regs */
-#define U 8 /* update index register */
-#define M 0x10 /* multiple load/store */
#define SW 0x20 /* byte swap */
-#define S 0x40 /* single-precision fp or... */
-#define SX 0x40 /* ... byte count in XER */
-#define HARD 0x80 /* string, stwcx. */
#define E4 0x40 /* SPE endianness is word */
#define E8 0x80 /* SPE endianness is double word */
-#define SPLT 0x80 /* VSX SPLAT load */
-
-/* DSISR bits reported for a DCBZ instruction: */
-#define DCBZ 0x5f /* 8xx/82xx dcbz faults when cache not enabled */
-
-/*
- * The PowerPC stores certain bits of the instruction that caused the
- * alignment exception in the DSISR register. This array maps those
- * bits to information about the operand length and what the
- * instruction would do.
- */
-static struct aligninfo aligninfo[128] = {
- { 4, LD }, /* 00 0 0000: lwz / lwarx */
- INVALID, /* 00 0 0001 */
- { 4, ST }, /* 00 0 0010: stw */
- INVALID, /* 00 0 0011 */
- { 2, LD }, /* 00 0 0100: lhz */
- { 2, LD+SE }, /* 00 0 0101: lha */
- { 2, ST }, /* 00 0 0110: sth */
- { 4, LD+M }, /* 00 0 0111: lmw */
- { 4, LD+F+S }, /* 00 0 1000: lfs */
- { 8, LD+F }, /* 00 0 1001: lfd */
- { 4, ST+F+S }, /* 00 0 1010: stfs */
- { 8, ST+F }, /* 00 0 1011: stfd */
- { 16, LD }, /* 00 0 1100: lq */
- { 8, LD }, /* 00 0 1101: ld/ldu/lwa */
- INVALID, /* 00 0 1110 */
- { 8, ST }, /* 00 0 1111: std/stdu */
- { 4, LD+U }, /* 00 1 0000: lwzu */
- INVALID, /* 00 1 0001 */
- { 4, ST+U }, /* 00 1 0010: stwu */
- INVALID, /* 00 1 0011 */
- { 2, LD+U }, /* 00 1 0100: lhzu */
- { 2, LD+SE+U }, /* 00 1 0101: lhau */
- { 2, ST+U }, /* 00 1 0110: sthu */
- { 4, ST+M }, /* 00 1 0111: stmw */
- { 4, LD+F+S+U }, /* 00 1 1000: lfsu */
- { 8, LD+F+U }, /* 00 1 1001: lfdu */
- { 4, ST+F+S+U }, /* 00 1 1010: stfsu */
- { 8, ST+F+U }, /* 00 1 1011: stfdu */
- { 16, LD+F }, /* 00 1 1100: lfdp */
- INVALID, /* 00 1 1101 */
- { 16, ST+F }, /* 00 1 1110: stfdp */
- INVALID, /* 00 1 1111 */
- { 8, LD }, /* 01 0 0000: ldx */
- INVALID, /* 01 0 0001 */
- { 8, ST }, /* 01 0 0010: stdx */
- INVALID, /* 01 0 0011 */
- INVALID, /* 01 0 0100 */
- { 4, LD+SE }, /* 01 0 0101: lwax */
- INVALID, /* 01 0 0110 */
- INVALID, /* 01 0 0111 */
- { 4, LD+M+HARD+SX }, /* 01 0 1000: lswx */
- { 4, LD+M+HARD }, /* 01 0 1001: lswi */
- { 4, ST+M+HARD+SX }, /* 01 0 1010: stswx */
- { 4, ST+M+HARD }, /* 01 0 1011: stswi */
- INVALID, /* 01 0 1100 */
- { 8, LD+U }, /* 01 0 1101: ldu */
- INVALID, /* 01 0 1110 */
- { 8, ST+U }, /* 01 0 1111: stdu */
- { 8, LD+U }, /* 01 1 0000: ldux */
- INVALID, /* 01 1 0001 */
- { 8, ST+U }, /* 01 1 0010: stdux */
- INVALID, /* 01 1 0011 */
- INVALID, /* 01 1 0100 */
- { 4, LD+SE+U }, /* 01 1 0101: lwaux */
- INVALID, /* 01 1 0110 */
- INVALID, /* 01 1 0111 */
- INVALID, /* 01 1 1000 */
- INVALID, /* 01 1 1001 */
- INVALID, /* 01 1 1010 */
- INVALID, /* 01 1 1011 */
- INVALID, /* 01 1 1100 */
- INVALID, /* 01 1 1101 */
- INVALID, /* 01 1 1110 */
- INVALID, /* 01 1 1111 */
- INVALID, /* 10 0 0000 */
- INVALID, /* 10 0 0001 */
- INVALID, /* 10 0 0010: stwcx. */
- INVALID, /* 10 0 0011 */
- INVALID, /* 10 0 0100 */
- INVALID, /* 10 0 0101 */
- INVALID, /* 10 0 0110 */
- INVALID, /* 10 0 0111 */
- { 4, LD+SW }, /* 10 0 1000: lwbrx */
- INVALID, /* 10 0 1001 */
- { 4, ST+SW }, /* 10 0 1010: stwbrx */
- INVALID, /* 10 0 1011 */
- { 2, LD+SW }, /* 10 0 1100: lhbrx */
- { 4, LD+SE }, /* 10 0 1101 lwa */
- { 2, ST+SW }, /* 10 0 1110: sthbrx */
- { 16, ST }, /* 10 0 1111: stq */
- INVALID, /* 10 1 0000 */
- INVALID, /* 10 1 0001 */
- INVALID, /* 10 1 0010 */
- INVALID, /* 10 1 0011 */
- INVALID, /* 10 1 0100 */
- INVALID, /* 10 1 0101 */
- INVALID, /* 10 1 0110 */
- INVALID, /* 10 1 0111 */
- INVALID, /* 10 1 1000 */
- INVALID, /* 10 1 1001 */
- INVALID, /* 10 1 1010 */
- INVALID, /* 10 1 1011 */
- INVALID, /* 10 1 1100 */
- INVALID, /* 10 1 1101 */
- INVALID, /* 10 1 1110 */
- { 0, ST+HARD }, /* 10 1 1111: dcbz */
- { 4, LD }, /* 11 0 0000: lwzx */
- INVALID, /* 11 0 0001 */
- { 4, ST }, /* 11 0 0010: stwx */
- INVALID, /* 11 0 0011 */
- { 2, LD }, /* 11 0 0100: lhzx */
- { 2, LD+SE }, /* 11 0 0101: lhax */
- { 2, ST }, /* 11 0 0110: sthx */
- INVALID, /* 11 0 0111 */
- { 4, LD+F+S }, /* 11 0 1000: lfsx */
- { 8, LD+F }, /* 11 0 1001: lfdx */
- { 4, ST+F+S }, /* 11 0 1010: stfsx */
- { 8, ST+F }, /* 11 0 1011: stfdx */
- { 16, LD+F }, /* 11 0 1100: lfdpx */
- { 4, LD+F+SE }, /* 11 0 1101: lfiwax */
- { 16, ST+F }, /* 11 0 1110: stfdpx */
- { 4, ST+F }, /* 11 0 1111: stfiwx */
- { 4, LD+U }, /* 11 1 0000: lwzux */
- INVALID, /* 11 1 0001 */
- { 4, ST+U }, /* 11 1 0010: stwux */
- INVALID, /* 11 1 0011 */
- { 2, LD+U }, /* 11 1 0100: lhzux */
- { 2, LD+SE+U }, /* 11 1 0101: lhaux */
- { 2, ST+U }, /* 11 1 0110: sthux */
- INVALID, /* 11 1 0111 */
- { 4, LD+F+S+U }, /* 11 1 1000: lfsux */
- { 8, LD+F+U }, /* 11 1 1001: lfdux */
- { 4, ST+F+S+U }, /* 11 1 1010: stfsux */
- { 8, ST+F+U }, /* 11 1 1011: stfdux */
- INVALID, /* 11 1 1100 */
- { 4, LD+F }, /* 11 1 1101: lfiwzx */
- INVALID, /* 11 1 1110 */
- INVALID, /* 11 1 1111 */
-};
-
-/*
- * The dcbz (data cache block zero) instruction
- * gives an alignment fault if used on non-cacheable
- * memory. We handle the fault mainly for the
- * case when we are running with the cache disabled
- * for debugging.
- */
-static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr)
-{
- long __user *p;
- int i, size;
-
-#ifdef __powerpc64__
- size = ppc64_caches.l1d.block_size;
-#else
- size = L1_CACHE_BYTES;
-#endif
- p = (long __user *) (regs->dar & -size);
- if (user_mode(regs) && !access_ok(VERIFY_WRITE, p, size))
- return -EFAULT;
- for (i = 0; i < size / sizeof(long); ++i)
- if (__put_user_inatomic(0, p+i))
- return -EFAULT;
- return 1;
-}
-
-/*
- * Emulate load & store multiple instructions
- * On 64-bit machines, these instructions only affect/use the
- * bottom 4 bytes of each register, and the loads clear the
- * top 4 bytes of the affected register.
- */
-#ifdef __BIG_ENDIAN__
-#ifdef CONFIG_PPC64
-#define REG_BYTE(rp, i) *((u8 *)((rp) + ((i) >> 2)) + ((i) & 3) + 4)
-#else
-#define REG_BYTE(rp, i) *((u8 *)(rp) + (i))
-#endif
-#else
-#define REG_BYTE(rp, i) (*(((u8 *)((rp) + ((i)>>2)) + ((i)&3))))
-#endif
-
-#define SWIZ_PTR(p) ((unsigned char __user *)((p) ^ swiz))
-
-static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr,
- unsigned int reg, unsigned int nb,
- unsigned int flags, unsigned int instr,
- unsigned long swiz)
-{
- unsigned long *rptr;
- unsigned int nb0, i, bswiz;
- unsigned long p;
-
- /*
- * We do not try to emulate 8 bytes multiple as they aren't really
- * available in our operating environments and we don't try to
- * emulate multiples operations in kernel land as they should never
- * be used/generated there at least not on unaligned boundaries
- */
- if (unlikely((nb > 4) || !user_mode(regs)))
- return 0;
-
- /* lmw, stmw, lswi/x, stswi/x */
- nb0 = 0;
- if (flags & HARD) {
- if (flags & SX) {
- nb = regs->xer & 127;
- if (nb == 0)
- return 1;
- } else {
- unsigned long pc = regs->nip ^ (swiz & 4);
-
- if (__get_user_inatomic(instr,
- (unsigned int __user *)pc))
- return -EFAULT;
- if (swiz == 0 && (flags & SW))
- instr = cpu_to_le32(instr);
- nb = (instr >> 11) & 0x1f;
- if (nb == 0)
- nb = 32;
- }
- if (nb + reg * 4 > 128) {
- nb0 = nb + reg * 4 - 128;
- nb = 128 - reg * 4;
- }
-#ifdef __LITTLE_ENDIAN__
- /*
- * String instructions are endian neutral but the code
- * below is not. Force byte swapping on so that the
- * effects of swizzling are undone in the load/store
- * loops below.
- */
- flags ^= SW;
-#endif
- } else {
- /* lwm, stmw */
- nb = (32 - reg) * 4;
- }
-
- if (!access_ok((flags & ST ? VERIFY_WRITE: VERIFY_READ), addr, nb+nb0))
- return -EFAULT; /* bad address */
-
- rptr = &regs->gpr[reg];
- p = (unsigned long) addr;
- bswiz = (flags & SW)? 3: 0;
-
- if (!(flags & ST)) {
- /*
- * This zeroes the top 4 bytes of the affected registers
- * in 64-bit mode, and also zeroes out any remaining
- * bytes of the last register for lsw*.
- */
- memset(rptr, 0, ((nb + 3) / 4) * sizeof(unsigned long));
- if (nb0 > 0)
- memset(&regs->gpr[0], 0,
- ((nb0 + 3) / 4) * sizeof(unsigned long));
-
- for (i = 0; i < nb; ++i, ++p)
- if (__get_user_inatomic(REG_BYTE(rptr, i ^ bswiz),
- SWIZ_PTR(p)))
- return -EFAULT;
- if (nb0 > 0) {
- rptr = &regs->gpr[0];
- addr += nb;
- for (i = 0; i < nb0; ++i, ++p)
- if (__get_user_inatomic(REG_BYTE(rptr,
- i ^ bswiz),
- SWIZ_PTR(p)))
- return -EFAULT;
- }
-
- } else {
- for (i = 0; i < nb; ++i, ++p)
- if (__put_user_inatomic(REG_BYTE(rptr, i ^ bswiz),
- SWIZ_PTR(p)))
- return -EFAULT;
- if (nb0 > 0) {
- rptr = &regs->gpr[0];
- addr += nb;
- for (i = 0; i < nb0; ++i, ++p)
- if (__put_user_inatomic(REG_BYTE(rptr,
- i ^ bswiz),
- SWIZ_PTR(p)))
- return -EFAULT;
- }
- }
- return 1;
-}
-
-/*
- * Emulate floating-point pair loads and stores.
- * Only POWER6 has these instructions, and it does true little-endian,
- * so we don't need the address swizzling.
- */
-static int emulate_fp_pair(unsigned char __user *addr, unsigned int reg,
- unsigned int flags)
-{
- char *ptr0 = (char *) &current->thread.TS_FPR(reg);
- char *ptr1 = (char *) &current->thread.TS_FPR(reg+1);
- int i, ret, sw = 0;
-
- if (reg & 1)
- return 0; /* invalid form: FRS/FRT must be even */
- if (flags & SW)
- sw = 7;
- ret = 0;
- for (i = 0; i < 8; ++i) {
- if (!(flags & ST)) {
- ret |= __get_user(ptr0[i^sw], addr + i);
- ret |= __get_user(ptr1[i^sw], addr + i + 8);
- } else {
- ret |= __put_user(ptr0[i^sw], addr + i);
- ret |= __put_user(ptr1[i^sw], addr + i + 8);
- }
- }
- if (ret)
- return -EFAULT;
- return 1; /* exception handled and fixed up */
-}
-
-#ifdef CONFIG_PPC64
-static int emulate_lq_stq(struct pt_regs *regs, unsigned char __user *addr,
- unsigned int reg, unsigned int flags)
-{
- char *ptr0 = (char *)&regs->gpr[reg];
- char *ptr1 = (char *)&regs->gpr[reg+1];
- int i, ret, sw = 0;
-
- if (reg & 1)
- return 0; /* invalid form: GPR must be even */
- if (flags & SW)
- sw = 7;
- ret = 0;
- for (i = 0; i < 8; ++i) {
- if (!(flags & ST)) {
- ret |= __get_user(ptr0[i^sw], addr + i);
- ret |= __get_user(ptr1[i^sw], addr + i + 8);
- } else {
- ret |= __put_user(ptr0[i^sw], addr + i);
- ret |= __put_user(ptr1[i^sw], addr + i + 8);
- }
- }
- if (ret)
- return -EFAULT;
- return 1; /* exception handled and fixed up */
-}
-#endif /* CONFIG_PPC64 */
#ifdef CONFIG_SPE
@@ -636,133 +282,21 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg,
}
#endif /* CONFIG_SPE */
-#ifdef CONFIG_VSX
-/*
- * Emulate VSX instructions...
- */
-static int emulate_vsx(unsigned char __user *addr, unsigned int reg,
- unsigned int areg, struct pt_regs *regs,
- unsigned int flags, unsigned int length,
- unsigned int elsize)
-{
- char *ptr;
- unsigned long *lptr;
- int ret = 0;
- int sw = 0;
- int i, j;
-
- /* userland only */
- if (unlikely(!user_mode(regs)))
- return 0;
-
- flush_vsx_to_thread(current);
-
- if (reg < 32)
- ptr = (char *) &current->thread.fp_state.fpr[reg][0];
- else
- ptr = (char *) &current->thread.vr_state.vr[reg - 32];
-
- lptr = (unsigned long *) ptr;
-
-#ifdef __LITTLE_ENDIAN__
- if (flags & SW) {
- elsize = length;
- sw = length-1;
- } else {
- /*
- * The elements are BE ordered, even in LE mode, so process
- * them in reverse order.
- */
- addr += length - elsize;
-
- /* 8 byte memory accesses go in the top 8 bytes of the VR */
- if (length == 8)
- ptr += 8;
- }
-#else
- if (flags & SW)
- sw = elsize-1;
-#endif
-
- for (j = 0; j < length; j += elsize) {
- for (i = 0; i < elsize; ++i) {
- if (flags & ST)
- ret |= __put_user(ptr[i^sw], addr + i);
- else
- ret |= __get_user(ptr[i^sw], addr + i);
- }
- ptr += elsize;
-#ifdef __LITTLE_ENDIAN__
- addr -= elsize;
-#else
- addr += elsize;
-#endif
- }
-
-#ifdef __BIG_ENDIAN__
-#define VSX_HI 0
-#define VSX_LO 1
-#else
-#define VSX_HI 1
-#define VSX_LO 0
-#endif
-
- if (!ret) {
- if (flags & U)
- regs->gpr[areg] = regs->dar;
-
- /* Splat load copies the same data to top and bottom 8 bytes */
- if (flags & SPLT)
- lptr[VSX_LO] = lptr[VSX_HI];
- /* For 8 byte loads, zero the low 8 bytes */
- else if (!(flags & ST) && (8 == length))
- lptr[VSX_LO] = 0;
- } else
- return -EFAULT;
-
- return 1;
-}
-#endif
-
/*
* Called on alignment exception. Attempts to fixup
*
* Return 1 on success
* Return 0 if unable to handle the interrupt
* Return -EFAULT if data address is bad
+ * Other negative return values indicate that the instruction can't
+ * be emulated, and the process should be given a SIGBUS.
*/
int fix_alignment(struct pt_regs *regs)
{
- unsigned int instr, nb, flags, instruction = 0;
- unsigned int reg, areg;
- unsigned int dsisr;
- unsigned char __user *addr;
- unsigned long p, swiz;
- int ret, i;
- union data {
- u64 ll;
- double dd;
- unsigned char v[8];
- struct {
-#ifdef __LITTLE_ENDIAN__
- int low32;
- unsigned hi32;
-#else
- unsigned hi32;
- int low32;
-#endif
- } x32;
- struct {
-#ifdef __LITTLE_ENDIAN__
- short low16;
- unsigned char hi48[6];
-#else
- unsigned char hi48[6];
- short low16;
-#endif
- } x16;
- } data;
+ unsigned int instr;
+ struct instruction_op op;
+ int r, type;
/*
* We require a complete register set, if not, then our assembly
@@ -770,121 +304,23 @@ int fix_alignment(struct pt_regs *regs)
*/
CHECK_FULL_REGS(regs);
- dsisr = regs->dsisr;
-
- /* Some processors don't provide us with a DSISR we can use here,
- * let's make one up from the instruction
- */
- if (cpu_has_feature(CPU_FTR_NODSISRALIGN)) {
- unsigned long pc = regs->nip;
-
- if (cpu_has_feature(CPU_FTR_PPC_LE) && (regs->msr & MSR_LE))
- pc ^= 4;
- if (unlikely(__get_user_inatomic(instr,
- (unsigned int __user *)pc)))
- return -EFAULT;
- if (cpu_has_feature(CPU_FTR_REAL_LE) && (regs->msr & MSR_LE))
- instr = cpu_to_le32(instr);
- dsisr = make_dsisr(instr);
- instruction = instr;
+ if (unlikely(__get_user(instr, (unsigned int __user *)regs->nip)))
+ return -EFAULT;
+ if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) {
+ /* We don't handle PPC little-endian any more... */
+ if (cpu_has_feature(CPU_FTR_PPC_LE))
+ return -EIO;
+ instr = swab32(instr);
}
- /* extract the operation and registers from the dsisr */
- reg = (dsisr >> 5) & 0x1f; /* source/dest register */
- areg = dsisr & 0x1f; /* register to update */
-
#ifdef CONFIG_SPE
if ((instr >> 26) == 0x4) {
+ int reg = (instr >> 21) & 0x1f;
PPC_WARN_ALIGNMENT(spe, regs);
return emulate_spe(regs, reg, instr);
}
#endif
- instr = (dsisr >> 10) & 0x7f;
- instr |= (dsisr >> 13) & 0x60;
-
- /* Lookup the operation in our table */
- nb = aligninfo[instr].len;
- flags = aligninfo[instr].flags;
-
- /*
- * Handle some cases which give overlaps in the DSISR values.
- */
- if (IS_XFORM(instruction)) {
- switch (get_xop(instruction)) {
- case 532: /* ldbrx */
- nb = 8;
- flags = LD+SW;
- break;
- case 660: /* stdbrx */
- nb = 8;
- flags = ST+SW;
- break;
- case 20: /* lwarx */
- case 84: /* ldarx */
- case 116: /* lharx */
- case 276: /* lqarx */
- return 0; /* not emulated ever */
- }
- }
-
- /* Byteswap little endian loads and stores */
- swiz = 0;
- if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) {
- flags ^= SW;
-#ifdef __BIG_ENDIAN__
- /*
- * So-called "PowerPC little endian" mode works by
- * swizzling addresses rather than by actually doing
- * any byte-swapping. To emulate this, we XOR each
- * byte address with 7. We also byte-swap, because
- * the processor's address swizzling depends on the
- * operand size (it xors the address with 7 for bytes,
- * 6 for halfwords, 4 for words, 0 for doublewords) but
- * we will xor with 7 and load/store each byte separately.
- */
- if (cpu_has_feature(CPU_FTR_PPC_LE))
- swiz = 7;
-#endif
- }
-
- /* DAR has the operand effective address */
- addr = (unsigned char __user *)regs->dar;
-
-#ifdef CONFIG_VSX
- if ((instruction & 0xfc00003e) == 0x7c000018) {
- unsigned int elsize;
-
- /* Additional register addressing bit (64 VSX vs 32 FPR/GPR) */
- reg |= (instruction & 0x1) << 5;
- /* Simple inline decoder instead of a table */
- /* VSX has only 8 and 16 byte memory accesses */
- nb = 8;
- if (instruction & 0x200)
- nb = 16;
-
- /* Vector stores in little-endian mode swap individual
- elements, so process them separately */
- elsize = 4;
- if (instruction & 0x80)
- elsize = 8;
-
- flags = 0;
- if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE))
- flags |= SW;
- if (instruction & 0x100)
- flags |= ST;
- if (instruction & 0x040)
- flags |= U;
- /* splat load needs a special decoder */
- if ((instruction & 0x400) == 0){
- flags |= SPLT;
- nb = 8;
- }
- PPC_WARN_ALIGNMENT(vsx, regs);
- return emulate_vsx(addr, reg, areg, regs, flags, nb, elsize);
- }
-#endif
/*
* ISA 3.0 (such as P9) copy, copy_first, paste and paste_last alignment
@@ -896,173 +332,27 @@ int fix_alignment(struct pt_regs *regs)
* when pasting to a co-processor. Furthermore, paste_last is the
* synchronisation point for preceding copy/paste sequences.
*/
- if ((instruction & 0xfc0006fe) == PPC_INST_COPY)
+ if ((instr & 0xfc0006fe) == PPC_INST_COPY)
return -EIO;
- /* A size of 0 indicates an instruction we don't support, with
- * the exception of DCBZ which is handled as a special case here
- */
- if (instr == DCBZ) {
- PPC_WARN_ALIGNMENT(dcbz, regs);
- return emulate_dcbz(regs, addr);
- }
- if (unlikely(nb == 0))
- return 0;
-
- /* Load/Store Multiple instructions are handled in their own
- * function
- */
- if (flags & M) {
- PPC_WARN_ALIGNMENT(multiple, regs);
- return emulate_multiple(regs, addr, reg, nb,
- flags, instr, swiz);
- }
-
- /* Verify the address of the operand */
- if (unlikely(user_mode(regs) &&
- !access_ok((flags & ST ? VERIFY_WRITE : VERIFY_READ),
- addr, nb)))
- return -EFAULT;
-
- /* Force the fprs into the save area so we can reference them */
- if (flags & F) {
- /* userland only */
- if (unlikely(!user_mode(regs)))
- return 0;
- flush_fp_to_thread(current);
- }
+ r = analyse_instr(&op, regs, instr);
+ if (r < 0)
+ return -EINVAL;
- if (nb == 16) {
- if (flags & F) {
- /* Special case for 16-byte FP loads and stores */
- PPC_WARN_ALIGNMENT(fp_pair, regs);
- return emulate_fp_pair(addr, reg, flags);
- } else {
-#ifdef CONFIG_PPC64
- /* Special case for 16-byte loads and stores */
- PPC_WARN_ALIGNMENT(lq_stq, regs);
- return emulate_lq_stq(regs, addr, reg, flags);
-#else
- return 0;
-#endif
- }
- }
-
- PPC_WARN_ALIGNMENT(unaligned, regs);
-
- /* If we are loading, get the data from user space, else
- * get it from register values
- */
- if (!(flags & ST)) {
- unsigned int start = 0;
-
- switch (nb) {
- case 4:
- start = offsetof(union data, x32.low32);
- break;
- case 2:
- start = offsetof(union data, x16.low16);
- break;
- }
-
- data.ll = 0;
- ret = 0;
- p = (unsigned long)addr;
-
- for (i = 0; i < nb; i++)
- ret |= __get_user_inatomic(data.v[start + i],
- SWIZ_PTR(p++));
-
- if (unlikely(ret))
- return -EFAULT;
-
- } else if (flags & F) {
- data.ll = current->thread.TS_FPR(reg);
- if (flags & S) {
- /* Single-precision FP store requires conversion... */
-#ifdef CONFIG_PPC_FPU
- preempt_disable();
- enable_kernel_fp();
- cvt_df(&data.dd, (float *)&data.x32.low32);
- disable_kernel_fp();
- preempt_enable();
-#else
- return 0;
-#endif
- }
- } else
- data.ll = regs->gpr[reg];
-
- if (flags & SW) {
- switch (nb) {
- case 8:
- data.ll = swab64(data.ll);
- break;
- case 4:
- data.x32.low32 = swab32(data.x32.low32);
- break;
- case 2:
- data.x16.low16 = swab16(data.x16.low16);
- break;
- }
- }
-
- /* Perform other misc operations like sign extension
- * or floating point single precision conversion
- */
- switch (flags & ~(U|SW)) {
- case LD+SE: /* sign extending integer loads */
- case LD+F+SE: /* sign extend for lfiwax */
- if ( nb == 2 )
- data.ll = data.x16.low16;
- else /* nb must be 4 */
- data.ll = data.x32.low32;
- break;
-
- /* Single-precision FP load requires conversion... */
- case LD+F+S:
-#ifdef CONFIG_PPC_FPU
- preempt_disable();
- enable_kernel_fp();
- cvt_fd((float *)&data.x32.low32, &data.dd);
- disable_kernel_fp();
- preempt_enable();
-#else
- return 0;
-#endif
- break;
+ type = op.type & INSTR_TYPE_MASK;
+ if (!OP_IS_LOAD_STORE(type)) {
+ if (type != CACHEOP + DCBZ)
+ return -EINVAL;
+ PPC_WARN_ALIGNMENT(dcbz, regs);
+ r = emulate_dcbz(op.ea, regs);
+ } else {
+ if (type == LARX || type == STCX)
+ return -EIO;
+ PPC_WARN_ALIGNMENT(unaligned, regs);
+ r = emulate_loadstore(regs, &op);
}
- /* Store result to memory or update registers */
- if (flags & ST) {
- unsigned int start = 0;
-
- switch (nb) {
- case 4:
- start = offsetof(union data, x32.low32);
- break;
- case 2:
- start = offsetof(union data, x16.low16);
- break;
- }
-
- ret = 0;
- p = (unsigned long)addr;
-
- for (i = 0; i < nb; i++)
- ret |= __put_user_inatomic(data.v[start + i],
- SWIZ_PTR(p++));
-
- if (unlikely(ret))
- return -EFAULT;
- } else if (flags & F)
- current->thread.TS_FPR(reg) = data.ll;
- else
- regs->gpr[reg] = data.ll;
-
- /* Update RA as needed */
- if (flags & U)
- regs->gpr[areg] = regs->dar;
-
- return 1;
+ if (!r)
+ return 1;
+ return r;
}
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 709e23425317..8cfb20e38cfe 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -100,12 +100,12 @@ int main(void)
OFFSET(THREAD_NORMSAVES, thread_struct, normsave[0]);
#endif
OFFSET(THREAD_FPEXC_MODE, thread_struct, fpexc_mode);
- OFFSET(THREAD_FPSTATE, thread_struct, fp_state);
+ OFFSET(THREAD_FPSTATE, thread_struct, fp_state.fpr);
OFFSET(THREAD_FPSAVEAREA, thread_struct, fp_save_area);
OFFSET(FPSTATE_FPSCR, thread_fp_state, fpscr);
OFFSET(THREAD_LOAD_FP, thread_struct, load_fp);
#ifdef CONFIG_ALTIVEC
- OFFSET(THREAD_VRSTATE, thread_struct, vr_state);
+ OFFSET(THREAD_VRSTATE, thread_struct, vr_state.vr);
OFFSET(THREAD_VRSAVEAREA, thread_struct, vr_save_area);
OFFSET(THREAD_VRSAVE, thread_struct, vrsave);
OFFSET(THREAD_USED_VR, thread_struct, used_vr);
@@ -145,9 +145,9 @@ int main(void)
OFFSET(THREAD_TM_PPR, thread_struct, tm_ppr);
OFFSET(THREAD_TM_DSCR, thread_struct, tm_dscr);
OFFSET(PT_CKPT_REGS, thread_struct, ckpt_regs);
- OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state);
+ OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state.vr);
OFFSET(THREAD_CKVRSAVE, thread_struct, ckvrsave);
- OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state);
+ OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state.fpr);
/* Local pt_regs on stack for Transactional Memory funcs. */
DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD +
sizeof(struct pt_regs) + 16);
@@ -485,6 +485,7 @@ int main(void)
OFFSET(KVM_ENABLED_HCALLS, kvm, arch.enabled_hcalls);
OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v);
OFFSET(KVM_RADIX, kvm, arch.radix);
+ OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled);
OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr);
OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar);
OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
@@ -513,6 +514,7 @@ int main(void)
OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions);
OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded);
OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded);
+ OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request);
OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr);
OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc);
OFFSET(VCPU_SPMC, kvm_vcpu, arch.spmc);
@@ -542,6 +544,7 @@ int main(void)
OFFSET(VCPU_WORT, kvm_vcpu, arch.wort);
OFFSET(VCPU_TID, kvm_vcpu, arch.tid);
OFFSET(VCPU_PSSCR, kvm_vcpu, arch.psscr);
+ OFFSET(VCPU_HFSCR, kvm_vcpu, arch.hfscr);
OFFSET(VCORE_ENTRY_EXIT, kvmppc_vcore, entry_exit_map);
OFFSET(VCORE_IN_GUEST, kvmppc_vcore, in_guest);
OFFSET(VCORE_NAPPING_THREADS, kvmppc_vcore, napping_threads);
@@ -742,9 +745,19 @@ int main(void)
OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask);
OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask);
OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas);
+ OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr);
+#define STOP_SPR(x, f) OFFSET(x, paca_struct, stop_sprs.f)
+ STOP_SPR(STOP_PID, pid);
+ STOP_SPR(STOP_LDBAR, ldbar);
+ STOP_SPR(STOP_FSCR, fscr);
+ STOP_SPR(STOP_HFSCR, hfscr);
+ STOP_SPR(STOP_MMCR1, mmcr1);
+ STOP_SPR(STOP_MMCR2, mmcr2);
+ STOP_SPR(STOP_MMCRA, mmcra);
#endif
DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
+ DEFINE(PPC_DBELL_MSGTYPE, PPC_DBELL_MSGTYPE);
#ifdef CONFIG_PPC_8xx
DEFINE(VIRT_IMMR_BASE, (u64)__fix_to_virt(FIX_IMMR_BASE));
diff --git a/arch/powerpc/kernel/btext.c b/arch/powerpc/kernel/btext.c
index 8275858a434d..3f46ca1c59f9 100644
--- a/arch/powerpc/kernel/btext.c
+++ b/arch/powerpc/kernel/btext.c
@@ -253,7 +253,7 @@ int __init btext_find_display(int allow_nonstdout)
for_each_node_by_type(np, "display") {
if (of_get_property(np, "linux,opened", NULL)) {
- printk("trying %s ...\n", np->full_name);
+ printk("trying %pOF ...\n", np);
rc = btext_initialize(np);
printk("result: %d\n", rc);
}
diff --git a/arch/powerpc/kernel/cacheinfo.c b/arch/powerpc/kernel/cacheinfo.c
index c641983bbdd6..a8f20e5928e1 100644
--- a/arch/powerpc/kernel/cacheinfo.c
+++ b/arch/powerpc/kernel/cacheinfo.c
@@ -167,10 +167,10 @@ static void release_cache_debugcheck(struct cache *cache)
list_for_each_entry(iter, &cache_list, list)
WARN_ONCE(iter->next_local == cache,
- "cache for %s(%s) refers to cache for %s(%s)\n",
- iter->ofnode->full_name,
+ "cache for %pOF(%s) refers to cache for %pOF(%s)\n",
+ iter->ofnode,
cache_type_string(iter),
- cache->ofnode->full_name,
+ cache->ofnode,
cache_type_string(cache));
}
@@ -179,8 +179,8 @@ static void release_cache(struct cache *cache)
if (!cache)
return;
- pr_debug("freeing L%d %s cache for %s\n", cache->level,
- cache_type_string(cache), cache->ofnode->full_name);
+ pr_debug("freeing L%d %s cache for %pOF\n", cache->level,
+ cache_type_string(cache), cache->ofnode);
release_cache_debugcheck(cache);
list_del(&cache->list);
@@ -194,8 +194,8 @@ static void cache_cpu_set(struct cache *cache, int cpu)
while (next) {
WARN_ONCE(cpumask_test_cpu(cpu, &next->shared_cpu_map),
- "CPU %i already accounted in %s(%s)\n",
- cpu, next->ofnode->full_name,
+ "CPU %i already accounted in %pOF(%s)\n",
+ cpu, next->ofnode,
cache_type_string(next));
cpumask_set_cpu(cpu, &next->shared_cpu_map);
next = next->next_local;
@@ -355,7 +355,7 @@ static int cache_is_unified_d(const struct device_node *np)
*/
static struct cache *cache_do_one_devnode_unified(struct device_node *node, int level)
{
- pr_debug("creating L%d ucache for %s\n", level, node->full_name);
+ pr_debug("creating L%d ucache for %pOF\n", level, node);
return new_cache(cache_is_unified_d(node), level, node);
}
@@ -365,8 +365,8 @@ static struct cache *cache_do_one_devnode_split(struct device_node *node,
{
struct cache *dcache, *icache;
- pr_debug("creating L%d dcache and icache for %s\n", level,
- node->full_name);
+ pr_debug("creating L%d dcache and icache for %pOF\n", level,
+ node);
dcache = new_cache(CACHE_TYPE_DATA, level, node);
icache = new_cache(CACHE_TYPE_INSTRUCTION, level, node);
@@ -679,7 +679,6 @@ static struct kobj_type cache_index_type = {
static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir)
{
- const char *cache_name;
const char *cache_type;
struct cache *cache;
char *buf;
@@ -690,7 +689,6 @@ static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir)
return;
cache = dir->cache;
- cache_name = cache->ofnode->full_name;
cache_type = cache_type_string(cache);
/* We don't want to create an attribute that can't provide a
@@ -707,14 +705,14 @@ static void cacheinfo_create_index_opt_attrs(struct cache_index_dir *dir)
rc = attr->show(&dir->kobj, attr, buf);
if (rc <= 0) {
pr_debug("not creating %s attribute for "
- "%s(%s) (rc = %zd)\n",
- attr->attr.name, cache_name,
+ "%pOF(%s) (rc = %zd)\n",
+ attr->attr.name, cache->ofnode,
cache_type, rc);
continue;
}
if (sysfs_create_file(&dir->kobj, &attr->attr))
- pr_debug("could not create %s attribute for %s(%s)\n",
- attr->attr.name, cache_name, cache_type);
+ pr_debug("could not create %s attribute for %pOF(%s)\n",
+ attr->attr.name, cache->ofnode, cache_type);
}
kfree(buf);
@@ -831,8 +829,8 @@ static void cache_cpu_clear(struct cache *cache, int cpu)
struct cache *next = cache->next_local;
WARN_ONCE(!cpumask_test_cpu(cpu, &cache->shared_cpu_map),
- "CPU %i not accounted in %s(%s)\n",
- cpu, cache->ofnode->full_name,
+ "CPU %i not accounted in %pOF(%s)\n",
+ cpu, cache->ofnode,
cache_type_string(cache));
cpumask_clear_cpu(cpu, &cache->shared_cpu_map);
diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index 10cb2896b2ae..610955fe8b81 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -218,13 +218,20 @@ __init_tlb_power8:
ptesync
1: blr
+/*
+ * Flush the TLB in hash mode. Hash must flush with RIC=2 once for process
+ * and one for partition scope to clear process and partition table entries.
+ */
__init_tlb_power9:
- li r6,POWER9_TLB_SETS_HASH
+ li r6,POWER9_TLB_SETS_HASH - 1
mtctr r6
li r7,0xc00 /* IS field = 0b11 */
+ li r8,0
ptesync
-2: tlbiel r7
- addi r7,r7,0x1000
+ PPC_TLBIEL(7, 8, 2, 1, 0)
+ PPC_TLBIEL(7, 8, 2, 0, 0)
+2: addi r7,r7,0x1000
+ PPC_TLBIEL(7, 8, 0, 0, 0)
bdnz 2b
ptesync
1: blr
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 6f849832a669..760872916013 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -1259,10 +1259,10 @@ static struct cpu_spec __initdata cpu_specs[] = {
.platform = "ppc603",
},
#endif /* CONFIG_PPC_BOOK3S_32 */
-#ifdef CONFIG_8xx
+#ifdef CONFIG_PPC_8xx
{ /* 8xx */
.pvr_mask = 0xffff0000,
- .pvr_value = 0x00500000,
+ .pvr_value = PVR_8xx,
.cpu_name = "8xx",
/* CPU_FTR_MAYBE_CAN_DOZE is possible,
* if the 8xx code is there.... */
@@ -1274,7 +1274,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.machine_check = machine_check_8xx,
.platform = "ppc823",
},
-#endif /* CONFIG_8xx */
+#endif /* CONFIG_PPC_8xx */
#ifdef CONFIG_40x
{ /* 403GC */
.pvr_mask = 0xffffff00,
@@ -1936,6 +1936,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.machine_check = machine_check_440A,
.platform = "ppc440",
},
+#ifdef CONFIG_PPC_47x
{ /* 476 DD2 core */
.pvr_mask = 0xffffffff,
.pvr_value = 0x11a52080,
@@ -1992,6 +1993,7 @@ static struct cpu_spec __initdata cpu_specs[] = {
.machine_check = machine_check_47x,
.platform = "ppc470",
},
+#endif /* CONFIG_PPC_47x */
{ /* default match */
.pvr_mask = 0x00000000,
.pvr_value = 0x00000000,
diff --git a/arch/powerpc/kernel/dma-iommu.c b/arch/powerpc/kernel/dma-iommu.c
index fb7cbaa37658..8f7abf9baa63 100644
--- a/arch/powerpc/kernel/dma-iommu.c
+++ b/arch/powerpc/kernel/dma-iommu.c
@@ -105,6 +105,11 @@ static u64 dma_iommu_get_required_mask(struct device *dev)
return mask;
}
+int dma_iommu_mapping_error(struct device *dev, dma_addr_t dma_addr)
+{
+ return dma_addr == IOMMU_MAPPING_ERROR;
+}
+
struct dma_map_ops dma_iommu_ops = {
.alloc = dma_iommu_alloc_coherent,
.free = dma_iommu_free_coherent,
@@ -115,5 +120,6 @@ struct dma_map_ops dma_iommu_ops = {
.map_page = dma_iommu_map_page,
.unmap_page = dma_iommu_unmap_page,
.get_required_mask = dma_iommu_get_required_mask,
+ .mapping_error = dma_iommu_mapping_error,
};
EXPORT_SYMBOL(dma_iommu_ops);
diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c
index 41c749586bd2..4194bbbbdb10 100644
--- a/arch/powerpc/kernel/dma.c
+++ b/arch/powerpc/kernel/dma.c
@@ -314,18 +314,6 @@ EXPORT_SYMBOL(dma_set_coherent_mask);
#define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)
-int __dma_set_mask(struct device *dev, u64 dma_mask)
-{
- const struct dma_map_ops *dma_ops = get_dma_ops(dev);
-
- if ((dma_ops != NULL) && (dma_ops->set_dma_mask != NULL))
- return dma_ops->set_dma_mask(dev, dma_mask);
- if (!dev->dma_mask || !dma_supported(dev, dma_mask))
- return -EIO;
- *dev->dma_mask = dma_mask;
- return 0;
-}
-
int dma_set_mask(struct device *dev, u64 dma_mask)
{
if (ppc_md.dma_set_mask)
@@ -338,7 +326,10 @@ int dma_set_mask(struct device *dev, u64 dma_mask)
return phb->controller_ops.dma_set_mask(pdev, dma_mask);
}
- return __dma_set_mask(dev, dma_mask);
+ if (!dev->dma_mask || !dma_supported(dev, dma_mask))
+ return -EIO;
+ *dev->dma_mask = dma_mask;
+ return 0;
}
EXPORT_SYMBOL(dma_set_mask);
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 4c7656dc4e04..1df770e8cbe0 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -94,9 +94,6 @@ static void (*init_pmu_registers)(void);
static void cpufeatures_flush_tlb(void)
{
- unsigned long rb;
- unsigned int i, num_sets;
-
/*
* This is a temporary measure to keep equivalent TLB flush as the
* cputable based setup code.
@@ -105,24 +102,15 @@ static void cpufeatures_flush_tlb(void)
case PVR_POWER8:
case PVR_POWER8E:
case PVR_POWER8NVL:
- num_sets = POWER8_TLB_SETS;
+ __flush_tlb_power8(POWER8_TLB_SETS);
break;
case PVR_POWER9:
- num_sets = POWER9_TLB_SETS_HASH;
+ __flush_tlb_power9(POWER9_TLB_SETS_HASH);
break;
default:
- num_sets = 1;
pr_err("unknown CPU version for boot TLB flush\n");
break;
}
-
- asm volatile("ptesync" : : : "memory");
- rb = TLBIEL_INVAL_SET;
- for (i = 0; i < num_sets; i++) {
- asm volatile("tlbiel %0" : : "r" (rb));
- rb += 1 << TLBIEL_INVAL_SET_SHIFT;
- }
- asm volatile("ptesync" : : : "memory");
}
static void __restore_cpu_cpufeatures(void)
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 63992b2d8e15..9e816787c0d4 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -44,6 +44,7 @@
#include <asm/machdep.h>
#include <asm/ppc-pci.h>
#include <asm/rtas.h>
+#include <asm/pte-walk.h>
/** Overview:
@@ -169,10 +170,10 @@ static size_t eeh_dump_dev_log(struct eeh_dev *edev, char *buf, size_t len)
char buffer[128];
n += scnprintf(buf+n, len-n, "%04x:%02x:%02x.%01x\n",
- edev->phb->global_number, pdn->busno,
+ pdn->phb->global_number, pdn->busno,
PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
pr_warn("EEH: of node=%04x:%02x:%02x.%01x\n",
- edev->phb->global_number, pdn->busno,
+ pdn->phb->global_number, pdn->busno,
PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
eeh_ops->read_config(pdn, PCI_VENDOR_ID, 4, &cfg);
@@ -352,8 +353,7 @@ static inline unsigned long eeh_token_to_phys(unsigned long token)
* worried about _PAGE_SPLITTING/collapse. Also we will not hit
* page table free, because of init_mm.
*/
- ptep = __find_linux_pte_or_hugepte(init_mm.pgd, token,
- NULL, &hugepage_shift);
+ ptep = find_init_mm_pte(token, &hugepage_shift);
if (!ptep)
return token;
WARN_ON(hugepage_shift);
@@ -435,7 +435,7 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
int ret;
int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE);
unsigned long flags;
- struct pci_dn *pdn;
+ struct device_node *dn;
struct pci_dev *dev;
struct eeh_pe *pe, *parent_pe, *phb_pe;
int rc = 0;
@@ -493,9 +493,10 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
if (pe->state & EEH_PE_ISOLATED) {
pe->check_count++;
if (pe->check_count % EEH_MAX_FAILS == 0) {
- pdn = eeh_dev_to_pdn(edev);
- if (pdn->node)
- location = of_get_property(pdn->node, "ibm,loc-code", NULL);
+ dn = pci_device_to_OF_node(dev);
+ if (dn)
+ location = of_get_property(dn, "ibm,loc-code",
+ NULL);
printk(KERN_ERR "EEH: %d reads ignored for recovering device at "
"location=%s driver=%s pci addr=%s\n",
pe->check_count,
@@ -1064,7 +1065,7 @@ core_initcall_sync(eeh_init);
*/
void eeh_add_device_early(struct pci_dn *pdn)
{
- struct pci_controller *phb;
+ struct pci_controller *phb = pdn ? pdn->phb : NULL;
struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
if (!edev)
@@ -1074,7 +1075,6 @@ void eeh_add_device_early(struct pci_dn *pdn)
return;
/* USB Bus children of PCI devices will not have BUID's */
- phb = edev->phb;
if (NULL == phb ||
(eeh_has_flag(EEH_PROBE_MODE_DEVTREE) && 0 == phb->buid))
return;
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
index d6b2ca70d14d..ad04ecd63c20 100644
--- a/arch/powerpc/kernel/eeh_dev.c
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -50,21 +50,16 @@
*/
struct eeh_dev *eeh_dev_init(struct pci_dn *pdn)
{
- struct pci_controller *phb = pdn->phb;
struct eeh_dev *edev;
/* Allocate EEH device */
edev = kzalloc(sizeof(*edev), GFP_KERNEL);
- if (!edev) {
- pr_warn("%s: out of memory\n",
- __func__);
+ if (!edev)
return NULL;
- }
/* Associate EEH device with OF node */
pdn->edev = edev;
edev->pdn = pdn;
- edev->phb = phb;
INIT_LIST_HEAD(&edev->list);
INIT_LIST_HEAD(&edev->rmv_list);
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index c405c79e50cd..8b840191df59 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -428,7 +428,7 @@ static void *eeh_add_virt_device(void *data, void *userdata)
if (!(edev->physfn)) {
pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n",
- __func__, edev->phb->global_number, pdn->busno,
+ __func__, pdn->phb->global_number, pdn->busno,
PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
return NULL;
}
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index cc4b206f77e4..2e8d1b2b5af4 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -230,10 +230,15 @@ void *eeh_pe_dev_traverse(struct eeh_pe *root,
* Bus/Device/Function number. The extra data referred by flag
* indicates which type of address should be used.
*/
+struct eeh_pe_get_flag {
+ int pe_no;
+ int config_addr;
+};
+
static void *__eeh_pe_get(void *data, void *flag)
{
struct eeh_pe *pe = (struct eeh_pe *)data;
- struct eeh_dev *edev = (struct eeh_dev *)flag;
+ struct eeh_pe_get_flag *tmp = (struct eeh_pe_get_flag *) flag;
/* Unexpected PHB PE */
if (pe->type & EEH_PE_PHB)
@@ -244,17 +249,17 @@ static void *__eeh_pe_get(void *data, void *flag)
* have non-zero PE address
*/
if (eeh_has_flag(EEH_VALID_PE_ZERO)) {
- if (edev->pe_config_addr == pe->addr)
+ if (tmp->pe_no == pe->addr)
return pe;
} else {
- if (edev->pe_config_addr &&
- (edev->pe_config_addr == pe->addr))
+ if (tmp->pe_no &&
+ (tmp->pe_no == pe->addr))
return pe;
}
/* Try BDF address */
- if (edev->config_addr &&
- (edev->config_addr == pe->config_addr))
+ if (tmp->config_addr &&
+ (tmp->config_addr == pe->config_addr))
return pe;
return NULL;
@@ -262,7 +267,9 @@ static void *__eeh_pe_get(void *data, void *flag)
/**
* eeh_pe_get - Search PE based on the given address
- * @edev: EEH device
+ * @phb: PCI controller
+ * @pe_no: PE number
+ * @config_addr: Config address
*
* Search the corresponding PE based on the specified address which
* is included in the eeh device. The function is used to check if
@@ -271,12 +278,14 @@ static void *__eeh_pe_get(void *data, void *flag)
* which is composed of PCI bus/device/function number, or unified
* PE address.
*/
-struct eeh_pe *eeh_pe_get(struct eeh_dev *edev)
+struct eeh_pe *eeh_pe_get(struct pci_controller *phb,
+ int pe_no, int config_addr)
{
- struct eeh_pe *root = eeh_phb_pe_get(edev->phb);
+ struct eeh_pe *root = eeh_phb_pe_get(phb);
+ struct eeh_pe_get_flag tmp = { pe_no, config_addr };
struct eeh_pe *pe;
- pe = eeh_pe_traverse(root, __eeh_pe_get, edev);
+ pe = eeh_pe_traverse(root, __eeh_pe_get, &tmp);
return pe;
}
@@ -330,11 +339,13 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
int eeh_add_to_parent_pe(struct eeh_dev *edev)
{
struct eeh_pe *pe, *parent;
+ struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+ int config_addr = (pdn->busno << 8) | (pdn->devfn);
/* Check if the PE number is valid */
if (!eeh_has_flag(EEH_VALID_PE_ZERO) && !edev->pe_config_addr) {
pr_err("%s: Invalid PE#0 for edev 0x%x on PHB#%x\n",
- __func__, edev->config_addr, edev->phb->global_number);
+ __func__, config_addr, pdn->phb->global_number);
return -EINVAL;
}
@@ -344,7 +355,7 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
* PE should be composed of PCI bus and its subordinate
* components.
*/
- pe = eeh_pe_get(edev);
+ pe = eeh_pe_get(pdn->phb, edev->pe_config_addr, config_addr);
if (pe && !(pe->type & EEH_PE_INVALID)) {
/* Mark the PE as type of PCI bus */
pe->type = EEH_PE_BUS;
@@ -353,11 +364,11 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
/* Put the edev to PE */
list_add_tail(&edev->list, &pe->edevs);
pr_debug("EEH: Add %04x:%02x:%02x.%01x to Bus PE#%x\n",
- edev->phb->global_number,
- edev->config_addr >> 8,
- PCI_SLOT(edev->config_addr & 0xFF),
- PCI_FUNC(edev->config_addr & 0xFF),
- pe->addr);
+ pdn->phb->global_number,
+ pdn->busno,
+ PCI_SLOT(pdn->devfn),
+ PCI_FUNC(pdn->devfn),
+ pe->addr);
return 0;
} else if (pe && (pe->type & EEH_PE_INVALID)) {
list_add_tail(&edev->list, &pe->edevs);
@@ -376,25 +387,25 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
pr_debug("EEH: Add %04x:%02x:%02x.%01x to Device "
"PE#%x, Parent PE#%x\n",
- edev->phb->global_number,
- edev->config_addr >> 8,
- PCI_SLOT(edev->config_addr & 0xFF),
- PCI_FUNC(edev->config_addr & 0xFF),
- pe->addr, pe->parent->addr);
+ pdn->phb->global_number,
+ pdn->busno,
+ PCI_SLOT(pdn->devfn),
+ PCI_FUNC(pdn->devfn),
+ pe->addr, pe->parent->addr);
return 0;
}
/* Create a new EEH PE */
if (edev->physfn)
- pe = eeh_pe_alloc(edev->phb, EEH_PE_VF);
+ pe = eeh_pe_alloc(pdn->phb, EEH_PE_VF);
else
- pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
+ pe = eeh_pe_alloc(pdn->phb, EEH_PE_DEVICE);
if (!pe) {
pr_err("%s: out of memory!\n", __func__);
return -ENOMEM;
}
pe->addr = edev->pe_config_addr;
- pe->config_addr = edev->config_addr;
+ pe->config_addr = config_addr;
/*
* Put the new EEH PE into hierarchy tree. If the parent
@@ -404,10 +415,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
*/
parent = eeh_pe_get_parent(edev);
if (!parent) {
- parent = eeh_phb_pe_get(edev->phb);
+ parent = eeh_phb_pe_get(pdn->phb);
if (!parent) {
pr_err("%s: No PHB PE is found (PHB Domain=%d)\n",
- __func__, edev->phb->global_number);
+ __func__, pdn->phb->global_number);
edev->pe = NULL;
kfree(pe);
return -EEXIST;
@@ -424,10 +435,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
edev->pe = pe;
pr_debug("EEH: Add %04x:%02x:%02x.%01x to "
"Device PE#%x, Parent PE#%x\n",
- edev->phb->global_number,
- edev->config_addr >> 8,
- PCI_SLOT(edev->config_addr & 0xFF),
- PCI_FUNC(edev->config_addr & 0xFF),
+ pdn->phb->global_number,
+ pdn->busno,
+ PCI_SLOT(pdn->devfn),
+ PCI_FUNC(pdn->devfn),
pe->addr, pe->parent->addr);
return 0;
@@ -446,13 +457,14 @@ int eeh_rmv_from_parent_pe(struct eeh_dev *edev)
{
struct eeh_pe *pe, *parent, *child;
int cnt;
+ struct pci_dn *pdn = eeh_dev_to_pdn(edev);
if (!edev->pe) {
pr_debug("%s: No PE found for device %04x:%02x:%02x.%01x\n",
- __func__, edev->phb->global_number,
- edev->config_addr >> 8,
- PCI_SLOT(edev->config_addr & 0xFF),
- PCI_FUNC(edev->config_addr & 0xFF));
+ __func__, pdn->phb->global_number,
+ pdn->busno,
+ PCI_SLOT(pdn->devfn),
+ PCI_FUNC(pdn->devfn));
return -EEXIST;
}
@@ -712,10 +724,10 @@ static void eeh_bridge_check_link(struct eeh_dev *edev)
return;
pr_debug("%s: Check PCIe link for %04x:%02x:%02x.%01x ...\n",
- __func__, edev->phb->global_number,
- edev->config_addr >> 8,
- PCI_SLOT(edev->config_addr & 0xFF),
- PCI_FUNC(edev->config_addr & 0xFF));
+ __func__, pdn->phb->global_number,
+ pdn->busno,
+ PCI_SLOT(pdn->devfn),
+ PCI_FUNC(pdn->devfn));
/* Check slot status */
cap = edev->pcie_cap;
diff --git a/arch/powerpc/kernel/eeh_sysfs.c b/arch/powerpc/kernel/eeh_sysfs.c
index 1ceecdda810b..797549289798 100644
--- a/arch/powerpc/kernel/eeh_sysfs.c
+++ b/arch/powerpc/kernel/eeh_sysfs.c
@@ -51,7 +51,6 @@ static ssize_t eeh_show_##_name(struct device *dev, \
static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL);
EEH_SHOW_ATTR(eeh_mode, mode, "0x%x");
-EEH_SHOW_ATTR(eeh_config_addr, config_addr, "0x%x");
EEH_SHOW_ATTR(eeh_pe_config_addr, pe_config_addr, "0x%x");
static ssize_t eeh_pe_state_show(struct device *dev,
@@ -103,7 +102,6 @@ void eeh_sysfs_add_device(struct pci_dev *pdev)
return;
rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode);
- rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr);
rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_state);
@@ -128,7 +126,6 @@ void eeh_sysfs_remove_device(struct pci_dev *pdev)
}
device_remove_file(&pdev->dev, &dev_attr_eeh_mode);
- device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr);
device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr);
device_remove_file(&pdev->dev, &dev_attr_eeh_pe_state);
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index 8587059ad848..e780e1fbf6c2 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -43,6 +43,13 @@
#define LOAD_MSR_KERNEL(r, x) li r,(x)
#endif
+/*
+ * Align to 4k in order to ensure that all functions modyfing srr0/srr1
+ * fit into one page in order to not encounter a TLB miss between the
+ * modification of srr0/srr1 and the associated rfi.
+ */
+ .align 12
+
#ifdef CONFIG_BOOKE
.globl mcheck_transfer_to_handler
mcheck_transfer_to_handler:
@@ -586,6 +593,10 @@ ppc_swapcontext:
handle_page_fault:
stw r4,_DAR(r1)
addi r3,r1,STACK_FRAME_OVERHEAD
+#ifdef CONFIG_6xx
+ andis. r0,r5,DSISR_DABRMATCH@h
+ bne- handle_dabr_fault
+#endif
bl do_page_fault
cmpwi r3,0
beq+ ret_from_except
@@ -599,6 +610,17 @@ handle_page_fault:
bl bad_page_fault
b ret_from_except_full
+#ifdef CONFIG_6xx
+ /* We have a data breakpoint exception - handle it */
+handle_dabr_fault:
+ SAVE_NVGPRS(r1)
+ lwz r0,_TRAP(r1)
+ clrrwi r0,r0,1
+ stw r0,_TRAP(r1)
+ bl do_break
+ b ret_from_except_full
+#endif
+
/*
* This routine switches between two different tasks. The process
* state of one is saved on its kernel stack. Then the state
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index bfbad08a1207..4a0fd4f40245 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -57,7 +57,7 @@ system_call_common:
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
extrdi. r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */
- bne tabort_syscall
+ bne .Ltabort_syscall
END_FTR_SECTION_IFSET(CPU_FTR_TM)
#endif
andi. r10,r12,MSR_PR
@@ -143,6 +143,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
mtmsrd r11,1
#endif /* CONFIG_PPC_BOOK3E */
+system_call: /* label this so stack traces look sane */
/* We do need to set SOFTE in the stack frame or the return
* from interrupt will be painful
*/
@@ -152,11 +153,11 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
CURRENT_THREAD_INFO(r11, r1)
ld r10,TI_FLAGS(r11)
andi. r11,r10,_TIF_SYSCALL_DOTRACE
- bne syscall_dotrace /* does not return */
+ bne .Lsyscall_dotrace /* does not return */
cmpldi 0,r0,NR_syscalls
- bge- syscall_enosys
+ bge- .Lsyscall_enosys
-system_call: /* label this so stack traces look sane */
+.Lsyscall:
/*
* Need to vector to 32 Bit or default sys_call_table here,
* based on caller's run-mode / personality.
@@ -185,8 +186,20 @@ system_call: /* label this so stack traces look sane */
#ifdef CONFIG_PPC_BOOK3S
/* No MSR:RI on BookE */
andi. r10,r8,MSR_RI
- beq- unrecov_restore
+ beq- .Lunrecov_restore
#endif
+
+/*
+ * This is a few instructions into the actual syscall exit path (which actually
+ * starts at .Lsyscall_exit) to cater to kprobe blacklisting and to reduce the
+ * number of visible symbols for profiling purposes.
+ *
+ * We can probe from system_call until this point as MSR_RI is set. But once it
+ * is cleared below, we won't be able to take a trap.
+ *
+ * This is blacklisted from kprobes further below with _ASM_NOKPROBE_SYMBOL().
+ */
+system_call_exit:
/*
* Disable interrupts so current_thread_info()->flags can't change,
* and so that we don't get interrupted after loading SRR0/1.
@@ -208,7 +221,7 @@ system_call: /* label this so stack traces look sane */
ld r9,TI_FLAGS(r12)
li r11,-MAX_ERRNO
andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
- bne- syscall_exit_work
+ bne- .Lsyscall_exit_work
andi. r0,r8,MSR_FP
beq 2f
@@ -232,7 +245,7 @@ system_call: /* label this so stack traces look sane */
3: cmpld r3,r11
ld r5,_CCR(r1)
- bge- syscall_error
+ bge- .Lsyscall_error
.Lsyscall_error_cont:
ld r7,_NIP(r1)
BEGIN_FTR_SECTION
@@ -258,14 +271,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
RFI
b . /* prevent speculative execution */
-syscall_error:
+.Lsyscall_error:
oris r5,r5,0x1000 /* Set SO bit in CR */
neg r3,r3
std r5,_CCR(r1)
b .Lsyscall_error_cont
-
+
/* Traced system call support */
-syscall_dotrace:
+.Lsyscall_dotrace:
bl save_nvgprs
addi r3,r1,STACK_FRAME_OVERHEAD
bl do_syscall_trace_enter
@@ -286,23 +299,23 @@ syscall_dotrace:
ld r7,GPR7(r1)
ld r8,GPR8(r1)
- /* Repopulate r9 and r10 for the system_call path */
+ /* Repopulate r9 and r10 for the syscall path */
addi r9,r1,STACK_FRAME_OVERHEAD
CURRENT_THREAD_INFO(r10, r1)
ld r10,TI_FLAGS(r10)
cmpldi r0,NR_syscalls
- blt+ system_call
+ blt+ .Lsyscall
/* Return code is already in r3 thanks to do_syscall_trace_enter() */
b .Lsyscall_exit
-syscall_enosys:
+.Lsyscall_enosys:
li r3,-ENOSYS
b .Lsyscall_exit
-syscall_exit_work:
+.Lsyscall_exit_work:
#ifdef CONFIG_PPC_BOOK3S
li r10,MSR_RI
mtmsrd r10,1 /* Restore RI */
@@ -362,7 +375,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
b ret_from_except
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-tabort_syscall:
+.Ltabort_syscall:
/* Firstly we need to enable TM in the kernel */
mfmsr r10
li r9, 1
@@ -388,6 +401,8 @@ tabort_syscall:
rfid
b . /* prevent speculative execution */
#endif
+_ASM_NOKPROBE_SYMBOL(system_call_common);
+_ASM_NOKPROBE_SYMBOL(system_call_exit);
/* Save non-volatile GPRs, if not already saved. */
_GLOBAL(save_nvgprs)
@@ -398,6 +413,7 @@ _GLOBAL(save_nvgprs)
clrrdi r0,r11,1
std r0,_TRAP(r1)
blr
+_ASM_NOKPROBE_SYMBOL(save_nvgprs);
/*
@@ -488,33 +504,30 @@ _GLOBAL(_switch)
std r23,_CCR(r1)
std r1,KSP(r3) /* Set old stack pointer */
-#ifdef CONFIG_SMP
- /* We need a sync somewhere here to make sure that if the
- * previous task gets rescheduled on another CPU, it sees all
- * stores it has performed on this one.
+ /*
+ * On SMP kernels, care must be taken because a task may be
+ * scheduled off CPUx and on to CPUy. Memory ordering must be
+ * considered.
+ *
+ * Cacheable stores on CPUx will be visible when the task is
+ * scheduled on CPUy by virtue of the core scheduler barriers
+ * (see "Notes on Program-Order guarantees on SMP systems." in
+ * kernel/sched/core.c).
+ *
+ * Uncacheable stores in the case of involuntary preemption must
+ * be taken care of. The smp_mb__before_spin_lock() in __schedule()
+ * is implemented as hwsync on powerpc, which orders MMIO too. So
+ * long as there is an hwsync in the context switch path, it will
+ * be executed on the source CPU after the task has performed
+ * all MMIO ops on that CPU, and on the destination CPU before the
+ * task performs any MMIO ops there.
*/
- sync
-#endif /* CONFIG_SMP */
/*
- * If we optimise away the clear of the reservation in system
- * calls because we know the CPU tracks the address of the
- * reservation, then we need to clear it here to cover the
- * case that the kernel context switch path has no larx
- * instructions.
+ * The kernel context switch path must contain a spin_lock,
+ * which contains larx/stcx, which will clear any reservation
+ * of the task being switched.
*/
-BEGIN_FTR_SECTION
- ldarx r6,0,r1
-END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS)
-
-BEGIN_FTR_SECTION
-/*
- * A cp_abort (copy paste abort) here ensures that when context switching, a
- * copy from one process can't leak into the paste of another.
- */
- PPC_CP_ABORT
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
-
#ifdef CONFIG_PPC_BOOK3S
/* Cancel all explict user streams as they will have no use after context
* switch and will stop the HW from creating streams itself
@@ -583,6 +596,14 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
top of the kernel stack. */
addi r7,r7,THREAD_SIZE-SWITCH_FRAME_SIZE
+ /*
+ * PMU interrupts in radix may come in here. They will use r1, not
+ * PACAKSAVE, so this stack switch will not cause a problem. They
+ * will store to the process stack, which may then be migrated to
+ * another CPU. However the rq lock release on this CPU paired with
+ * the rq lock acquire on the new CPU before the stack becomes
+ * active on the new CPU, will order those stores.
+ */
mr r1,r8 /* start using new stack pointer */
std r7,PACAKSAVE(r13)
@@ -763,11 +784,11 @@ restore:
ld r5,SOFTE(r1)
lbz r6,PACASOFTIRQEN(r13)
cmpwi cr0,r5,0
- beq restore_irq_off
+ beq .Lrestore_irq_off
/* We are enabling, were we already enabled ? Yes, just return */
cmpwi cr0,r6,1
- beq cr0,do_restore
+ beq cr0,.Ldo_restore
/*
* We are about to soft-enable interrupts (we are hard disabled
@@ -776,14 +797,14 @@ restore:
*/
lbz r0,PACAIRQHAPPENED(r13)
cmpwi cr0,r0,0
- bne- restore_check_irq_replay
+ bne- .Lrestore_check_irq_replay
/*
* Get here when nothing happened while soft-disabled, just
* soft-enable and move-on. We will hard-enable as a side
* effect of rfi
*/
-restore_no_replay:
+.Lrestore_no_replay:
TRACE_ENABLE_INTS
li r0,1
stb r0,PACASOFTIRQEN(r13);
@@ -791,7 +812,7 @@ restore_no_replay:
/*
* Final return path. BookE is handled in a different file
*/
-do_restore:
+.Ldo_restore:
#ifdef CONFIG_PPC_BOOK3E
b exception_return_book3e
#else
@@ -825,7 +846,7 @@ fast_exception_return:
REST_8GPRS(5, r1)
andi. r0,r3,MSR_RI
- beq- unrecov_restore
+ beq- .Lunrecov_restore
/* Load PPR from thread struct before we clear MSR:RI */
BEGIN_FTR_SECTION
@@ -883,7 +904,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
* make sure that in this case, we also clear PACA_IRQ_HARD_DIS
* or that bit can get out of sync and bad things will happen
*/
-restore_irq_off:
+.Lrestore_irq_off:
ld r3,_MSR(r1)
lbz r7,PACAIRQHAPPENED(r13)
andi. r0,r3,MSR_EE
@@ -893,13 +914,13 @@ restore_irq_off:
1: li r0,0
stb r0,PACASOFTIRQEN(r13);
TRACE_DISABLE_INTS
- b do_restore
+ b .Ldo_restore
/*
* Something did happen, check if a re-emit is needed
* (this also clears paca->irq_happened)
*/
-restore_check_irq_replay:
+.Lrestore_check_irq_replay:
/* XXX: We could implement a fast path here where we check
* for irq_happened being just 0x01, in which case we can
* clear it and return. That means that we would potentially
@@ -909,7 +930,7 @@ restore_check_irq_replay:
*/
bl __check_irq_replay
cmpwi cr0,r3,0
- beq restore_no_replay
+ beq .Lrestore_no_replay
/*
* We need to re-emit an interrupt. We do so by re-using our
@@ -945,23 +966,26 @@ restore_check_irq_replay:
#ifdef CONFIG_PPC_BOOK3E
cmpwi cr0,r3,0x280
#else
- BEGIN_FTR_SECTION
- cmpwi cr0,r3,0xe80
- FTR_SECTION_ELSE
- cmpwi cr0,r3,0xa00
- ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
+ cmpwi cr0,r3,0xa00
#endif /* CONFIG_PPC_BOOK3E */
bne 1f
addi r3,r1,STACK_FRAME_OVERHEAD;
bl doorbell_exception
- b ret_from_except
#endif /* CONFIG_PPC_DOORBELL */
1: b ret_from_except /* What else to do here ? */
-unrecov_restore:
+.Lunrecov_restore:
addi r3,r1,STACK_FRAME_OVERHEAD
bl unrecoverable_exception
- b unrecov_restore
+ b .Lunrecov_restore
+
+_ASM_NOKPROBE_SYMBOL(ret_from_except);
+_ASM_NOKPROBE_SYMBOL(ret_from_except_lite);
+_ASM_NOKPROBE_SYMBOL(resume_kernel);
+_ASM_NOKPROBE_SYMBOL(fast_exc_return_irq);
+_ASM_NOKPROBE_SYMBOL(restore);
+_ASM_NOKPROBE_SYMBOL(fast_exception_return);
+
#ifdef CONFIG_PPC_RTAS
/*
@@ -1038,6 +1062,8 @@ _GLOBAL(enter_rtas)
rldicr r9,r9,MSR_SF_LG,(63-MSR_SF_LG)
ori r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI|MSR_LE
andc r6,r0,r9
+
+__enter_rtas:
sync /* disable interrupts so SRR0/1 */
mtmsrd r0 /* don't get trashed */
@@ -1074,9 +1100,11 @@ rtas_return_loc:
mtspr SPRN_SRR1,r4
rfid
b . /* prevent speculative execution */
+_ASM_NOKPROBE_SYMBOL(__enter_rtas)
+_ASM_NOKPROBE_SYMBOL(rtas_return_loc)
.align 3
-1: .llong rtas_restore_regs
+1: .8byte rtas_restore_regs
rtas_restore_regs:
/* relocation is on at this point */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index b886795060fd..48da0f5d2f7f 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -99,7 +99,11 @@ EXC_VIRT_NONE(0x4000, 0x100)
#ifdef CONFIG_PPC_P7_NAP
/*
* If running native on arch 2.06 or later, check if we are waking up
- * from nap/sleep/winkle, and branch to idle handler.
+ * from nap/sleep/winkle, and branch to idle handler. This tests SRR1
+ * bits 46:47. A non-0 value indicates that we are coming from a power
+ * saving state. The idle wakeup handler initially runs in real mode,
+ * but we branch to the 0xc000... address so we can turn on relocation
+ * with mtmsr.
*/
#define IDLETEST(n) \
BEGIN_FTR_SECTION ; \
@@ -107,7 +111,7 @@ EXC_VIRT_NONE(0x4000, 0x100)
rlwinm. r10,r10,47-31,30,31 ; \
beq- 1f ; \
cmpwi cr3,r10,2 ; \
- BRANCH_TO_COMMON(r10, system_reset_idle_common) ; \
+ BRANCH_TO_C000(r10, system_reset_idle_common) ; \
1: \
END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
#else
@@ -128,6 +132,7 @@ EXC_VIRT_NONE(0x4100, 0x100)
#ifdef CONFIG_PPC_P7_NAP
EXC_COMMON_BEGIN(system_reset_idle_common)
+ mfspr r12,SPRN_SRR1
b pnv_powersave_wakeup
#endif
@@ -507,46 +512,22 @@ EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80)
SET_SCRATCH0(r13)
EXCEPTION_PROLOG_0(PACA_EXSLB)
EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
- std r3,PACA_EXSLB+EX_R3(r13)
+ mr r12,r3 /* save r3 */
mfspr r3,SPRN_DAR
- mfspr r12,SPRN_SRR1
+ mfspr r11,SPRN_SRR1
crset 4*cr6+eq
-#ifndef CONFIG_RELOCATABLE
- b slb_miss_realmode
-#else
- /*
- * We can't just use a direct branch to slb_miss_realmode
- * because the distance from here to there depends on where
- * the kernel ends up being put.
- */
- mfctr r11
- LOAD_HANDLER(r10, slb_miss_realmode)
- mtctr r10
- bctr
-#endif
+ BRANCH_TO_COMMON(r10, slb_miss_common)
EXC_REAL_END(data_access_slb, 0x380, 0x80)
EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80)
SET_SCRATCH0(r13)
EXCEPTION_PROLOG_0(PACA_EXSLB)
EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380)
- std r3,PACA_EXSLB+EX_R3(r13)
+ mr r12,r3 /* save r3 */
mfspr r3,SPRN_DAR
- mfspr r12,SPRN_SRR1
+ mfspr r11,SPRN_SRR1
crset 4*cr6+eq
-#ifndef CONFIG_RELOCATABLE
- b slb_miss_realmode
-#else
- /*
- * We can't just use a direct branch to slb_miss_realmode
- * because the distance from here to there depends on where
- * the kernel ends up being put.
- */
- mfctr r11
- LOAD_HANDLER(r10, slb_miss_realmode)
- mtctr r10
- bctr
-#endif
+ BRANCH_TO_COMMON(r10, slb_miss_common)
EXC_VIRT_END(data_access_slb, 0x4380, 0x80)
TRAMP_KVM_SKIP(PACA_EXSLB, 0x380)
@@ -560,7 +541,7 @@ EXC_COMMON_BEGIN(instruction_access_common)
RECONCILE_IRQ_STATE(r10, r11)
ld r12,_MSR(r1)
ld r3,_NIP(r1)
- andis. r4,r12,0x5820
+ andis. r4,r12,DSISR_BAD_FAULT_64S@h
li r5,0x400
std r3,_DAR(r1)
std r4,_DSISR(r1)
@@ -575,88 +556,82 @@ EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80)
SET_SCRATCH0(r13)
EXCEPTION_PROLOG_0(PACA_EXSLB)
EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480)
- std r3,PACA_EXSLB+EX_R3(r13)
+ mr r12,r3 /* save r3 */
mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */
- mfspr r12,SPRN_SRR1
+ mfspr r11,SPRN_SRR1
crclr 4*cr6+eq
-#ifndef CONFIG_RELOCATABLE
- b slb_miss_realmode
-#else
- mfctr r11
- LOAD_HANDLER(r10, slb_miss_realmode)
- mtctr r10
- bctr
-#endif
+ BRANCH_TO_COMMON(r10, slb_miss_common)
EXC_REAL_END(instruction_access_slb, 0x480, 0x80)
EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80)
SET_SCRATCH0(r13)
EXCEPTION_PROLOG_0(PACA_EXSLB)
EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480)
- std r3,PACA_EXSLB+EX_R3(r13)
+ mr r12,r3 /* save r3 */
mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */
- mfspr r12,SPRN_SRR1
+ mfspr r11,SPRN_SRR1
crclr 4*cr6+eq
-#ifndef CONFIG_RELOCATABLE
- b slb_miss_realmode
-#else
- mfctr r11
- LOAD_HANDLER(r10, slb_miss_realmode)
- mtctr r10
- bctr
-#endif
+ BRANCH_TO_COMMON(r10, slb_miss_common)
EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80)
TRAMP_KVM(PACA_EXSLB, 0x480)
-/* This handler is used by both 0x380 and 0x480 slb miss interrupts */
-EXC_COMMON_BEGIN(slb_miss_realmode)
+/*
+ * This handler is used by the 0x380 and 0x480 SLB miss interrupts, as well as
+ * the virtual mode 0x4380 and 0x4480 interrupts if AIL is enabled.
+ */
+EXC_COMMON_BEGIN(slb_miss_common)
/*
* r13 points to the PACA, r9 contains the saved CR,
- * r12 contain the saved SRR1, SRR0 is still ready for return
+ * r12 contains the saved r3,
+ * r11 contain the saved SRR1, SRR0 is still ready for return
* r3 has the faulting address
* r9 - r13 are saved in paca->exslb.
- * r3 is saved in paca->slb_r3
* cr6.eq is set for a D-SLB miss, clear for a I-SLB miss
* We assume we aren't going to take any exceptions during this
* procedure.
*/
mflr r10
-#ifdef CONFIG_RELOCATABLE
- mtctr r11
-#endif
-
stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */
std r10,PACA_EXSLB+EX_LR(r13) /* save LR */
- std r3,PACA_EXSLB+EX_DAR(r13)
+
+ /*
+ * Test MSR_RI before calling slb_allocate_realmode, because the
+ * MSR in r11 gets clobbered. However we still want to allocate
+ * SLB in case MSR_RI=0, to minimise the risk of getting stuck in
+ * recursive SLB faults. So use cr5 for this, which is preserved.
+ */
+ andi. r11,r11,MSR_RI /* check for unrecoverable exception */
+ cmpdi cr5,r11,MSR_RI
crset 4*cr0+eq
#ifdef CONFIG_PPC_STD_MMU_64
BEGIN_MMU_FTR_SECTION
- bl slb_allocate_realmode
+ bl slb_allocate
END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
#endif
ld r10,PACA_EXSLB+EX_LR(r13)
- ld r3,PACA_EXSLB+EX_R3(r13)
lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */
mtlr r10
- beq 8f /* if bad address, make full stack frame */
+ beq- 8f /* if bad address, make full stack frame */
- andi. r10,r12,MSR_RI /* check for unrecoverable exception */
- beq- 2f
+ bne- cr5,2f /* if unrecoverable exception, oops */
/* All done -- return from exception. */
.machine push
.machine "power4"
mtcrf 0x80,r9
+ mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */
mtcrf 0x02,r9 /* I/D indication is in cr6 */
mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */
.machine pop
+ RESTORE_CTR(r9, PACA_EXSLB)
RESTORE_PPR_PACA(PACA_EXSLB, r9)
+ mr r3,r12
ld r9,PACA_EXSLB+EX_R9(r13)
ld r10,PACA_EXSLB+EX_R10(r13)
ld r11,PACA_EXSLB+EX_R11(r13)
@@ -665,7 +640,10 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
rfid
b . /* prevent speculative execution */
-2: mfspr r11,SPRN_SRR0
+2: std r3,PACA_EXSLB+EX_DAR(r13)
+ mr r3,r12
+ mfspr r11,SPRN_SRR0
+ mfspr r12,SPRN_SRR1
LOAD_HANDLER(r10,unrecov_slb)
mtspr SPRN_SRR0,r10
ld r10,PACAKMSR(r13)
@@ -673,7 +651,10 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
rfid
b .
-8: mfspr r11,SPRN_SRR0
+8: std r3,PACA_EXSLB+EX_DAR(r13)
+ mr r3,r12
+ mfspr r11,SPRN_SRR0
+ mfspr r12,SPRN_SRR1
LOAD_HANDLER(r10,bad_addr_slb)
mtspr SPRN_SRR0,r10
ld r10,PACAKMSR(r13)
@@ -821,46 +802,81 @@ EXC_VIRT(trap_0b, 0x4b00, 0x100, 0xb00)
TRAMP_KVM(PACA_EXGEN, 0xb00)
EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
+/*
+ * system call / hypercall (0xc00, 0x4c00)
+ *
+ * The system call exception is invoked with "sc 0" and does not alter HV bit.
+ * There is support for kernel code to invoke system calls but there are no
+ * in-tree users.
+ *
+ * The hypercall is invoked with "sc 1" and sets HV=1.
+ *
+ * In HPT, sc 1 always goes to 0xc00 real mode. In RADIX, sc 1 can go to
+ * 0x4c00 virtual mode.
+ *
+ * Call convention:
+ *
+ * syscall register convention is in Documentation/powerpc/syscall64-abi.txt
+ *
+ * For hypercalls, the register convention is as follows:
+ * r0 volatile
+ * r1-2 nonvolatile
+ * r3 volatile parameter and return value for status
+ * r4-r10 volatile input and output value
+ * r11 volatile hypercall number and output value
+ * r12 volatile input and output value
+ * r13-r31 nonvolatile
+ * LR nonvolatile
+ * CTR volatile
+ * XER volatile
+ * CR0-1 CR5-7 volatile
+ * CR2-4 nonvolatile
+ * Other registers nonvolatile
+ *
+ * The intersection of volatile registers that don't contain possible
+ * inputs is: cr0, xer, ctr. We may use these as scratch regs upon entry
+ * without saving, though xer is not a good idea to use, as hardware may
+ * interpret some bits so it may be costly to change them.
+ */
#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
- /*
- * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems
- * that support it) before changing to HMT_MEDIUM. That allows the KVM
- * code to save that value into the guest state (it is the guest's PPR
- * value). Otherwise just change to HMT_MEDIUM as userspace has
- * already saved the PPR.
- */
+ /*
+ * There is a little bit of juggling to get syscall and hcall
+ * working well. Save r13 in ctr to avoid using SPRG scratch
+ * register.
+ *
+ * Userspace syscalls have already saved the PPR, hcalls must save
+ * it before setting HMT_MEDIUM.
+ */
#define SYSCALL_KVMTEST \
- SET_SCRATCH0(r13); \
+ mtctr r13; \
GET_PACA(r13); \
- std r9,PACA_EXGEN+EX_R9(r13); \
- OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); \
- HMT_MEDIUM; \
std r10,PACA_EXGEN+EX_R10(r13); \
- OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR); \
- mfcr r9; \
- KVMTEST_PR(0xc00); \
- GET_SCRATCH0(r13)
+ KVMTEST_PR(0xc00); /* uses r10, branch to do_kvm_0xc00_system_call */ \
+ HMT_MEDIUM; \
+ mfctr r9;
#else
#define SYSCALL_KVMTEST \
- HMT_MEDIUM
+ HMT_MEDIUM; \
+ mr r9,r13; \
+ GET_PACA(r13);
#endif
#define LOAD_SYSCALL_HANDLER(reg) \
__LOAD_HANDLER(reg, system_call_common)
-/* Syscall routine is used twice, in reloc-off and reloc-on paths */
-#define SYSCALL_PSERIES_1 \
+#define SYSCALL_FASTENDIAN_TEST \
BEGIN_FTR_SECTION \
cmpdi r0,0x1ebe ; \
beq- 1f ; \
END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \
- mr r9,r13 ; \
- GET_PACA(r13) ; \
- mfspr r11,SPRN_SRR0 ; \
-0:
-#define SYSCALL_PSERIES_2_RFID \
+/*
+ * After SYSCALL_KVMTEST, we reach here with PACA in r13, r13 in r9,
+ * and HMT_MEDIUM.
+ */
+#define SYSCALL_REAL \
+ mfspr r11,SPRN_SRR0 ; \
mfspr r12,SPRN_SRR1 ; \
LOAD_SYSCALL_HANDLER(r10) ; \
mtspr SPRN_SRR0,r10 ; \
@@ -869,11 +885,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \
rfid ; \
b . ; /* prevent speculative execution */
-#define SYSCALL_PSERIES_3 \
+#define SYSCALL_FASTENDIAN \
/* Fast LE/BE switch system call */ \
1: mfspr r12,SPRN_SRR1 ; \
xori r12,r12,MSR_LE ; \
mtspr SPRN_SRR1,r12 ; \
+ mr r13,r9 ; \
rfid ; /* return to userspace */ \
b . ; /* prevent speculative execution */
@@ -882,16 +899,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \
* We can't branch directly so we do it via the CTR which
* is volatile across system calls.
*/
-#define SYSCALL_PSERIES_2_DIRECT \
- LOAD_SYSCALL_HANDLER(r12) ; \
- mtctr r12 ; \
+#define SYSCALL_VIRT \
+ LOAD_SYSCALL_HANDLER(r10) ; \
+ mtctr r10 ; \
+ mfspr r11,SPRN_SRR0 ; \
mfspr r12,SPRN_SRR1 ; \
li r10,MSR_RI ; \
mtmsrd r10,1 ; \
bctr ;
#else
/* We can branch directly */
-#define SYSCALL_PSERIES_2_DIRECT \
+#define SYSCALL_VIRT \
+ mfspr r11,SPRN_SRR0 ; \
mfspr r12,SPRN_SRR1 ; \
li r10,MSR_RI ; \
mtmsrd r10,1 ; /* Set RI (EE=0) */ \
@@ -899,20 +918,42 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \
#endif
EXC_REAL_BEGIN(system_call, 0xc00, 0x100)
- SYSCALL_KVMTEST
- SYSCALL_PSERIES_1
- SYSCALL_PSERIES_2_RFID
- SYSCALL_PSERIES_3
+ SYSCALL_KVMTEST /* loads PACA into r13, and saves r13 to r9 */
+ SYSCALL_FASTENDIAN_TEST
+ SYSCALL_REAL
+ SYSCALL_FASTENDIAN
EXC_REAL_END(system_call, 0xc00, 0x100)
EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100)
- SYSCALL_KVMTEST
- SYSCALL_PSERIES_1
- SYSCALL_PSERIES_2_DIRECT
- SYSCALL_PSERIES_3
+ SYSCALL_KVMTEST /* loads PACA into r13, and saves r13 to r9 */
+ SYSCALL_FASTENDIAN_TEST
+ SYSCALL_VIRT
+ SYSCALL_FASTENDIAN
EXC_VIRT_END(system_call, 0x4c00, 0x100)
-TRAMP_KVM(PACA_EXGEN, 0xc00)
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+ /*
+ * This is a hcall, so register convention is as above, with these
+ * differences:
+ * r13 = PACA
+ * ctr = orig r13
+ * orig r10 saved in PACA
+ */
+TRAMP_KVM_BEGIN(do_kvm_0xc00)
+ /*
+ * Save the PPR (on systems that support it) before changing to
+ * HMT_MEDIUM. That allows the KVM code to save that value into the
+ * guest state (it is the guest's PPR value).
+ */
+ OPT_GET_SPR(r10, SPRN_PPR, CPU_FTR_HAS_PPR)
+ HMT_MEDIUM
+ OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r10, CPU_FTR_HAS_PPR)
+ mfctr r10
+ SET_SCRATCH0(r10)
+ std r9,PACA_EXGEN+EX_R9(r13)
+ mfcr r9
+ KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00)
+#endif
EXC_REAL(single_step, 0xd00, 0x100)
@@ -1273,6 +1314,39 @@ EXC_REAL_NONE(0x1800, 0x100)
EXC_VIRT_NONE(0x5800, 0x100)
#endif
+#ifdef CONFIG_PPC_WATCHDOG
+
+#define MASKED_DEC_HANDLER_LABEL 3f
+
+#define MASKED_DEC_HANDLER(_H) \
+3: /* soft-nmi */ \
+ std r12,PACA_EXGEN+EX_R12(r13); \
+ GET_SCRATCH0(r10); \
+ std r10,PACA_EXGEN+EX_R13(r13); \
+ EXCEPTION_PROLOG_PSERIES_1(soft_nmi_common, _H)
+
+/*
+ * Branch to soft_nmi_interrupt using the emergency stack. The emergency
+ * stack is one that is usable by maskable interrupts so long as MSR_EE
+ * remains off. It is used for recovery when something has corrupted the
+ * normal kernel stack, for example. The "soft NMI" must not use the process
+ * stack because we want irq disabled sections to avoid touching the stack
+ * at all (other than PMU interrupts), so use the emergency stack for this,
+ * and run it entirely with interrupts hard disabled.
+ */
+EXC_COMMON_BEGIN(soft_nmi_common)
+ mr r10,r1
+ ld r1,PACAEMERGSP(r13)
+ subi r1,r1,INT_FRAME_SIZE
+ EXCEPTION_COMMON_NORET_STACK(PACA_EXGEN, 0x900,
+ system_reset, soft_nmi_interrupt,
+ ADD_NVGPRS;ADD_RECONCILE)
+ b ret_from_except
+
+#else /* CONFIG_PPC_WATCHDOG */
+#define MASKED_DEC_HANDLER_LABEL 2f /* normal return */
+#define MASKED_DEC_HANDLER(_H)
+#endif /* CONFIG_PPC_WATCHDOG */
/*
* An interrupt came in while soft-disabled. We set paca->irq_happened, then:
@@ -1295,22 +1369,20 @@ masked_##_H##interrupt: \
lis r10,0x7fff; \
ori r10,r10,0xffff; \
mtspr SPRN_DEC,r10; \
- b 2f; \
-1: cmpwi r10,PACA_IRQ_DBELL; \
- beq 2f; \
- cmpwi r10,PACA_IRQ_HMI; \
- beq 2f; \
+ b MASKED_DEC_HANDLER_LABEL; \
+1: andi. r10,r10,(PACA_IRQ_DBELL|PACA_IRQ_HMI); \
+ bne 2f; \
mfspr r10,SPRN_##_H##SRR1; \
- rldicl r10,r10,48,1; /* clear MSR_EE */ \
- rotldi r10,r10,16; \
+ xori r10,r10,MSR_EE; /* clear MSR_EE */ \
mtspr SPRN_##_H##SRR1,r10; \
2: mtcrf 0x80,r9; \
ld r9,PACA_EXGEN+EX_R9(r13); \
ld r10,PACA_EXGEN+EX_R10(r13); \
ld r11,PACA_EXGEN+EX_R11(r13); \
- GET_SCRATCH0(r13); \
+ /* returns to kernel where r13 must be set up, so don't restore it */ \
##_H##rfid; \
- b .
+ b .; \
+ MASKED_DEC_HANDLER(_H)
/*
* Real mode exceptions actually use this too, but alternate
@@ -1410,8 +1482,10 @@ USE_TEXT_SECTION()
*/
.balign IFETCH_ALIGN_BYTES
do_hash_page:
-#ifdef CONFIG_PPC_STD_MMU_64
- andis. r0,r4,0xa450 /* weird error? */
+ #ifdef CONFIG_PPC_STD_MMU_64
+ lis r0,DSISR_BAD_FAULT_64S@h
+ ori r0,r0,DSISR_BAD_FAULT_64S@l
+ and. r0,r4,r0 /* weird error? */
bne- handle_page_fault /* if not, try to insert a HPTE */
CURRENT_THREAD_INFO(r11, r1)
lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */
@@ -1553,6 +1627,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
1: addi r3,r1,STACK_FRAME_OVERHEAD
bl kernel_bad_stack
b 1b
+_ASM_NOKPROBE_SYMBOL(bad_stack);
+
+/*
+ * When doorbell is triggered from system reset wakeup, the message is
+ * not cleared, so it would fire again when EE is enabled.
+ *
+ * When coming from local_irq_enable, there may be the same problem if
+ * we were hard disabled.
+ *
+ * Execute msgclr to clear pending exceptions before handling it.
+ */
+h_doorbell_common_msgclr:
+ LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36))
+ PPC_MSGCLR(3)
+ b h_doorbell_common
+
+doorbell_super_common_msgclr:
+ LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36))
+ PPC_MSGCLRP(3)
+ b doorbell_super_common
/*
* Called from arch_local_irq_enable when an interrupt needs
@@ -1563,6 +1657,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
* Note: While MSR:EE is off, we need to make sure that _MSR
* in the generated frame has EE set to 1 or the exception
* handler will not properly re-enable them.
+ *
+ * Note that we don't specify LR as the NIP (return address) for
+ * the interrupt because that would unbalance the return branch
+ * predictor.
*/
_GLOBAL(__replay_interrupt)
/* We are going to jump to the exception common code which
@@ -1570,22 +1668,27 @@ _GLOBAL(__replay_interrupt)
* we don't give a damn about, so we don't bother storing them.
*/
mfmsr r12
- mflr r11
+ LOAD_REG_ADDR(r11, replay_interrupt_return)
mfcr r9
ori r12,r12,MSR_EE
cmpwi r3,0x900
beq decrementer_common
cmpwi r3,0x500
- beq hardware_interrupt_common
BEGIN_FTR_SECTION
- cmpwi r3,0xe80
- beq h_doorbell_common
- cmpwi r3,0xea0
beq h_virt_irq_common
+FTR_SECTION_ELSE
+ beq hardware_interrupt_common
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_300)
+BEGIN_FTR_SECTION
+ cmpwi r3,0xa00
+ beq h_doorbell_common_msgclr
cmpwi r3,0xe60
beq hmi_exception_common
FTR_SECTION_ELSE
cmpwi r3,0xa00
- beq doorbell_super_common
+ beq doorbell_super_common_msgclr
ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
+replay_interrupt_return:
blr
+
+_ASM_NOKPROBE_SYMBOL(__replay_interrupt)
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 466569e26278..e1431800bfb9 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -113,11 +113,62 @@ int __init early_init_dt_scan_fw_dump(unsigned long node,
return 1;
}
+/*
+ * If fadump is registered, check if the memory provided
+ * falls within boot memory area.
+ */
+int is_fadump_boot_memory_area(u64 addr, ulong size)
+{
+ if (!fw_dump.dump_registered)
+ return 0;
+
+ return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size;
+}
+
+int should_fadump_crash(void)
+{
+ if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr)
+ return 0;
+ return 1;
+}
+
int is_fadump_active(void)
{
return fw_dump.dump_active;
}
+/*
+ * Returns 1, if there are no holes in boot memory area,
+ * 0 otherwise.
+ */
+static int is_boot_memory_area_contiguous(void)
+{
+ struct memblock_region *reg;
+ unsigned long tstart, tend;
+ unsigned long start_pfn = PHYS_PFN(RMA_START);
+ unsigned long end_pfn = PHYS_PFN(RMA_START + fw_dump.boot_memory_size);
+ unsigned int ret = 0;
+
+ for_each_memblock(memory, reg) {
+ tstart = max(start_pfn, memblock_region_memory_base_pfn(reg));
+ tend = min(end_pfn, memblock_region_memory_end_pfn(reg));
+ if (tstart < tend) {
+ /* Memory hole from start_pfn to tstart */
+ if (tstart > start_pfn)
+ break;
+
+ if (tend == end_pfn) {
+ ret = 1;
+ break;
+ }
+
+ start_pfn = tend + 1;
+ }
+ }
+
+ return ret;
+}
+
/* Print firmware assisted dump configurations for debugging purpose. */
static void fadump_show_config(void)
{
@@ -212,20 +263,46 @@ static inline unsigned long fadump_calculate_reserve_size(void)
int ret;
unsigned long long base, size;
+ if (fw_dump.reserve_bootvar)
+ pr_warn("'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter.\n");
+
/*
* Check if the size is specified through crashkernel= cmdline
- * option. If yes, then use that but ignore base as fadump
- * reserves memory at end of RAM.
+ * option. If yes, then use that but ignore base as fadump reserves
+ * memory at a predefined offset.
*/
ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
&size, &base);
if (ret == 0 && size > 0) {
+ unsigned long max_size;
+
+ if (fw_dump.reserve_bootvar)
+ pr_info("Using 'crashkernel=' parameter for memory reservation.\n");
+
fw_dump.reserve_bootvar = (unsigned long)size;
+
+ /*
+ * Adjust if the boot memory size specified is above
+ * the upper limit.
+ */
+ max_size = memblock_phys_mem_size() / MAX_BOOT_MEM_RATIO;
+ if (fw_dump.reserve_bootvar > max_size) {
+ fw_dump.reserve_bootvar = max_size;
+ pr_info("Adjusted boot memory size to %luMB\n",
+ (fw_dump.reserve_bootvar >> 20));
+ }
+
+ return fw_dump.reserve_bootvar;
+ } else if (fw_dump.reserve_bootvar) {
+ /*
+ * 'fadump_reserve_mem=' is being used to reserve memory
+ * for firmware-assisted dump.
+ */
return fw_dump.reserve_bootvar;
}
/* divide by 20 to get 5% of value */
- size = memblock_end_of_DRAM() / 20;
+ size = memblock_phys_mem_size() / 20;
/* round it down in multiples of 256 */
size = size & ~0x0FFFFFFFUL;
@@ -377,9 +454,22 @@ static int __init early_fadump_param(char *p)
}
early_param("fadump", early_fadump_param);
-static void register_fw_dump(struct fadump_mem_struct *fdm)
+/*
+ * Look for fadump_reserve_mem= cmdline option
+ * TODO: Remove references to 'fadump_reserve_mem=' parameter,
+ * the sooner 'crashkernel=' parameter is accustomed to.
+ */
+static int __init early_fadump_reserve_mem(char *p)
{
- int rc;
+ if (p)
+ fw_dump.reserve_bootvar = memparse(p, &p);
+ return 0;
+}
+early_param("fadump_reserve_mem", early_fadump_reserve_mem);
+
+static int register_fw_dump(struct fadump_mem_struct *fdm)
+{
+ int rc, err;
unsigned int wait_time;
pr_debug("Registering for firmware-assisted kernel dump...\n");
@@ -396,26 +486,38 @@ static void register_fw_dump(struct fadump_mem_struct *fdm)
} while (wait_time);
+ err = -EIO;
switch (rc) {
+ default:
+ pr_err("Failed to register. Unknown Error(%d).\n", rc);
+ break;
case -1:
printk(KERN_ERR "Failed to register firmware-assisted kernel"
" dump. Hardware Error(%d).\n", rc);
break;
case -3:
+ if (!is_boot_memory_area_contiguous())
+ pr_err("Can't have holes in boot memory area while "
+ "registering fadump\n");
+
printk(KERN_ERR "Failed to register firmware-assisted kernel"
" dump. Parameter Error(%d).\n", rc);
+ err = -EINVAL;
break;
case -9:
printk(KERN_ERR "firmware-assisted kernel dump is already "
" registered.");
fw_dump.dump_registered = 1;
+ err = -EEXIST;
break;
case 0:
printk(KERN_INFO "firmware-assisted kernel dump registration"
" is successful\n");
fw_dump.dump_registered = 1;
+ err = 0;
break;
}
+ return err;
}
void crash_fadump(struct pt_regs *regs, const char *str)
@@ -423,7 +525,7 @@ void crash_fadump(struct pt_regs *regs, const char *str)
struct fadump_crash_info_header *fdh = NULL;
int old_cpu, this_cpu;
- if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr)
+ if (!should_fadump_crash())
return;
/*
@@ -831,8 +933,19 @@ static void fadump_setup_crash_memory_ranges(void)
for_each_memblock(memory, reg) {
start = (unsigned long long)reg->base;
end = start + (unsigned long long)reg->size;
- if (start == RMA_START && end >= fw_dump.boot_memory_size)
- start = fw_dump.boot_memory_size;
+
+ /*
+ * skip the first memory chunk that is already added (RMA_START
+ * through boot_memory_size). This logic needs a relook if and
+ * when RMA_START changes to a non-zero value.
+ */
+ BUILD_BUG_ON(RMA_START != 0);
+ if (start < fw_dump.boot_memory_size) {
+ if (end > fw_dump.boot_memory_size)
+ start = fw_dump.boot_memory_size;
+ else
+ continue;
+ }
/* add this range excluding the reserved dump area. */
fadump_exclude_reserved_area(start, end);
@@ -893,8 +1006,7 @@ static int fadump_create_elfcore_headers(char *bufp)
phdr->p_paddr = fadump_relocate(paddr_vmcoreinfo_note());
phdr->p_offset = phdr->p_paddr;
- phdr->p_memsz = vmcoreinfo_max_size;
- phdr->p_filesz = vmcoreinfo_max_size;
+ phdr->p_memsz = phdr->p_filesz = VMCOREINFO_NOTE_SIZE;
/* Increment number of program headers. */
(elf->e_phnum)++;
@@ -956,7 +1068,7 @@ static unsigned long init_fadump_header(unsigned long addr)
return addr;
}
-static void register_fadump(void)
+static int register_fadump(void)
{
unsigned long addr;
void *vaddr;
@@ -966,7 +1078,7 @@ static void register_fadump(void)
* assisted dump.
*/
if (!fw_dump.reserve_dump_area_size)
- return;
+ return -ENODEV;
fadump_setup_crash_memory_ranges();
@@ -979,7 +1091,7 @@ static void register_fadump(void)
fadump_create_elfcore_headers(vaddr);
/* register the future kernel dump with firmware. */
- register_fw_dump(&fdm);
+ return register_fw_dump(&fdm);
}
static int fadump_unregister_dump(struct fadump_mem_struct *fdm)
@@ -1046,28 +1158,71 @@ void fadump_cleanup(void)
}
}
+static void fadump_free_reserved_memory(unsigned long start_pfn,
+ unsigned long end_pfn)
+{
+ unsigned long pfn;
+ unsigned long time_limit = jiffies + HZ;
+
+ pr_info("freeing reserved memory (0x%llx - 0x%llx)\n",
+ PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ free_reserved_page(pfn_to_page(pfn));
+
+ if (time_after(jiffies, time_limit)) {
+ cond_resched();
+ time_limit = jiffies + HZ;
+ }
+ }
+}
+
+/*
+ * Skip memory holes and free memory that was actually reserved.
+ */
+static void fadump_release_reserved_area(unsigned long start, unsigned long end)
+{
+ struct memblock_region *reg;
+ unsigned long tstart, tend;
+ unsigned long start_pfn = PHYS_PFN(start);
+ unsigned long end_pfn = PHYS_PFN(end);
+
+ for_each_memblock(memory, reg) {
+ tstart = max(start_pfn, memblock_region_memory_base_pfn(reg));
+ tend = min(end_pfn, memblock_region_memory_end_pfn(reg));
+ if (tstart < tend) {
+ fadump_free_reserved_memory(tstart, tend);
+
+ if (tend == end_pfn)
+ break;
+
+ start_pfn = tend + 1;
+ }
+ }
+}
+
/*
* Release the memory that was reserved in early boot to preserve the memory
* contents. The released memory will be available for general use.
*/
static void fadump_release_memory(unsigned long begin, unsigned long end)
{
- unsigned long addr;
unsigned long ra_start, ra_end;
ra_start = fw_dump.reserve_dump_area_start;
ra_end = ra_start + fw_dump.reserve_dump_area_size;
- for (addr = begin; addr < end; addr += PAGE_SIZE) {
- /*
- * exclude the dump reserve area. Will reuse it for next
- * fadump registration.
- */
- if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start))
- continue;
-
- free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
- }
+ /*
+ * exclude the dump reserve area. Will reuse it for next
+ * fadump registration.
+ */
+ if (begin < ra_end && end > ra_start) {
+ if (begin < ra_start)
+ fadump_release_reserved_area(begin, ra_start);
+ if (end > ra_end)
+ fadump_release_reserved_area(ra_end, end);
+ } else
+ fadump_release_reserved_area(begin, end);
}
static void fadump_invalidate_release_mem(void)
@@ -1161,7 +1316,6 @@ static ssize_t fadump_register_store(struct kobject *kobj,
switch (buf[0]) {
case '0':
if (fw_dump.dump_registered == 0) {
- ret = -EINVAL;
goto unlock_out;
}
/* Un-register Firmware-assisted dump */
@@ -1169,11 +1323,11 @@ static ssize_t fadump_register_store(struct kobject *kobj,
break;
case '1':
if (fw_dump.dump_registered == 1) {
- ret = -EINVAL;
+ ret = -EEXIST;
goto unlock_out;
}
/* Register Firmware-assisted dump */
- register_fadump();
+ ret = register_fadump();
break;
default:
ret = -EINVAL;
@@ -1299,6 +1453,25 @@ static void fadump_init_files(void)
return;
}
+static int fadump_panic_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ /*
+ * If firmware-assisted dump has been registered then trigger
+ * firmware-assisted dump and let firmware handle everything
+ * else. If this returns, then fadump was not registered, so
+ * go through the rest of the panic path.
+ */
+ crash_fadump(NULL, ptr);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block fadump_panic_block = {
+ .notifier_call = fadump_panic_event,
+ .priority = INT_MIN /* may not return; must be done last */
+};
+
/*
* Prepare for firmware-assisted dump.
*/
@@ -1331,6 +1504,9 @@ int __init setup_fadump(void)
init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start);
fadump_init_files();
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &fadump_panic_block);
+
return 1;
}
subsys_initcall(setup_fadump);
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index e22734278458..8c54166491e7 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -388,7 +388,7 @@ DataAccess:
EXCEPTION_PROLOG
mfspr r10,SPRN_DSISR
stw r10,_DSISR(r11)
- andis. r0,r10,0xa470 /* weird error? */
+ andis. r0,r10,DSISR_BAD_FAULT_32S@h
bne 1f /* if not, try to put a PTE */
mfspr r4,SPRN_DAR /* into the hash table */
rlwinm r3,r10,32-15,21,21 /* DSISR_STORE -> _PAGE_RW */
@@ -403,13 +403,13 @@ DataAccess:
DO_KVM 0x400
InstructionAccess:
EXCEPTION_PROLOG
- andis. r0,r9,0x4000 /* no pte found? */
+ andis. r0,r9,SRR1_ISI_NOPT@h /* no pte found? */
beq 1f /* if so, try to put a PTE */
li r3,0 /* into the hash table */
mr r4,r12 /* SRR0 is fault address */
bl hash_page
1: mr r4,r12
- mr r5,r9
+ andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */
EXC_XFER_LITE(0x400, handle_page_fault)
/* External interrupt */
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 0ddc602b33a4..ff8511d6d8ea 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -92,13 +92,13 @@ END_FTR_SECTION(0, 1)
.balign 8
.globl __secondary_hold_spinloop
__secondary_hold_spinloop:
- .llong 0x0
+ .8byte 0x0
/* Secondary processors write this value with their cpu # */
/* after they enter the spin loop immediately below. */
.globl __secondary_hold_acknowledge
__secondary_hold_acknowledge:
- .llong 0x0
+ .8byte 0x0
#ifdef CONFIG_RELOCATABLE
/* This flag is set to 1 by a loader if the kernel should run
@@ -650,7 +650,7 @@ __after_prom_start:
bctr
.balign 8
-p_end: .llong _end - copy_to_here
+p_end: .8byte _end - copy_to_here
4:
/*
@@ -892,7 +892,7 @@ _GLOBAL(relative_toc)
blr
.balign 8
-p_toc: .llong __toc_start + 0x8000 - 0b
+p_toc: .8byte __toc_start + 0x8000 - 0b
/*
* This is where the main kernel code starts.
diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S
index c032fe8c2d26..4fee00d414e8 100644
--- a/arch/powerpc/kernel/head_8xx.S
+++ b/arch/powerpc/kernel/head_8xx.S
@@ -50,18 +50,20 @@
mtspr spr, reg
#endif
-/* Macro to test if an address is a kernel address */
#if CONFIG_TASK_SIZE <= 0x80000000 && CONFIG_PAGE_OFFSET >= 0x80000000
-#define IS_KERNEL(tmp, addr) \
- andis. tmp, addr, 0x8000 /* Address >= 0x80000000 */
-#define BRANCH_UNLESS_KERNEL(label) beq label
-#else
-#define IS_KERNEL(tmp, addr) \
- rlwinm tmp, addr, 16, 16, 31; \
- cmpli cr0, tmp, PAGE_OFFSET >> 16
-#define BRANCH_UNLESS_KERNEL(label) blt label
+/* By simply checking Address >= 0x80000000, we know if its a kernel address */
+#define SIMPLE_KERNEL_ADDRESS 1
#endif
+/*
+ * We need an ITLB miss handler for kernel addresses if:
+ * - Either we have modules
+ * - Or we have not pinned the first 8M
+ */
+#if defined(CONFIG_MODULES) || !defined(CONFIG_PIN_TLB_TEXT) || \
+ defined(CONFIG_DEBUG_PAGEALLOC)
+#define ITLB_MISS_KERNEL 1
+#endif
/*
* Value for the bits that have fixed value in RPN entries.
@@ -123,7 +125,6 @@ turn_on_mmu:
lis r0,start_here@h
ori r0,r0,start_here@l
mtspr SPRN_SRR0,r0
- SYNC
rfi /* enables MMU */
/*
@@ -170,7 +171,7 @@ turn_on_mmu:
stw r1,0(r11); \
tovirt(r1,r11); /* set new kernel sp */ \
li r10,MSR_KERNEL & ~(MSR_IR|MSR_DR); /* can take exceptions */ \
- MTMSRD(r10); /* (except for mach check in rtas) */ \
+ mtmsr r10; \
stw r0,GPR0(r11); \
SAVE_4GPRS(3, r11); \
SAVE_2GPRS(7, r11)
@@ -300,7 +301,7 @@ SystemCall:
/* On the MPC8xx, this is a software emulation interrupt. It occurs
* for all unimplemented and illegal instructions.
*/
- EXCEPTION(0x1000, SoftEmu, SoftwareEmulation, EXC_XFER_STD)
+ EXCEPTION(0x1000, SoftEmu, program_check_exception, EXC_XFER_STD)
. = 0x1100
/*
@@ -325,7 +326,7 @@ SystemCall:
#endif
InstructionTLBMiss:
-#if defined(CONFIG_8xx_CPU6) || defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) || defined (CONFIG_HUGETLB_PAGE)
+#if defined(CONFIG_8xx_CPU6) || defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
mtspr SPRN_SPRG_SCRATCH2, r3
#endif
EXCEPTION_PROLOG_0
@@ -343,15 +344,32 @@ InstructionTLBMiss:
INVALIDATE_ADJACENT_PAGES_CPU15(r11, r10)
/* Only modules will cause ITLB Misses as we always
* pin the first 8MB of kernel memory */
-#if defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) || defined (CONFIG_HUGETLB_PAGE)
+#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
mfcr r3
#endif
-#if defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC)
- IS_KERNEL(r11, r10)
+#ifdef ITLB_MISS_KERNEL
+#if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT)
+ andis. r11, r10, 0x8000 /* Address >= 0x80000000 */
+#else
+ rlwinm r11, r10, 16, 0xfff8
+ cmpli cr0, r11, PAGE_OFFSET@h
+#ifndef CONFIG_PIN_TLB_TEXT
+ /* It is assumed that kernel code fits into the first 8M page */
+_ENTRY(ITLBMiss_cmp)
+ cmpli cr7, r11, (PAGE_OFFSET + 0x0800000)@h
+#endif
+#endif
#endif
mfspr r11, SPRN_M_TW /* Get level 1 table */
-#if defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC)
- BRANCH_UNLESS_KERNEL(3f)
+#ifdef ITLB_MISS_KERNEL
+#if defined(SIMPLE_KERNEL_ADDRESS) && defined(CONFIG_PIN_TLB_TEXT)
+ beq+ 3f
+#else
+ blt+ 3f
+#endif
+#ifndef CONFIG_PIN_TLB_TEXT
+ blt cr7, ITLBMissLinear
+#endif
lis r11, (swapper_pg_dir-PAGE_OFFSET)@ha
3:
#endif
@@ -369,7 +387,7 @@ InstructionTLBMiss:
rlwimi r10, r11, 0, 0, 32 - PAGE_SHIFT - 1 /* Add level 2 base */
lwz r10, 0(r10) /* Get the pte */
4:
-#if defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) || defined (CONFIG_HUGETLB_PAGE)
+#if defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
mtcr r3
#endif
/* Insert the APG into the TWC from the Linux PTE. */
@@ -400,7 +418,7 @@ InstructionTLBMiss:
MTSPR_CPU6(SPRN_MI_RPN, r10, r3) /* Update TLB entry */
/* Restore registers */
-#if defined(CONFIG_8xx_CPU6) || defined(CONFIG_MODULES) || defined (CONFIG_DEBUG_PAGEALLOC) || defined (CONFIG_HUGETLB_PAGE)
+#if defined(CONFIG_8xx_CPU6) || defined(ITLB_MISS_KERNEL) || defined(CONFIG_HUGETLB_PAGE)
mfspr r3, SPRN_SPRG_SCRATCH2
#endif
EXCEPTION_EPILOG_0
@@ -447,23 +465,23 @@ DataStoreTLBMiss:
* kernel page tables.
*/
mfspr r10, SPRN_MD_EPN
- rlwinm r10, r10, 16, 0xfff8
- cmpli cr0, r10, PAGE_OFFSET@h
+ rlwinm r11, r10, 16, 0xfff8
+ cmpli cr0, r11, PAGE_OFFSET@h
mfspr r11, SPRN_M_TW /* Get level 1 table */
blt+ 3f
+ rlwinm r11, r10, 16, 0xfff8
#ifndef CONFIG_PIN_TLB_IMMR
- cmpli cr0, r10, VIRT_IMMR_BASE@h
+ cmpli cr0, r11, VIRT_IMMR_BASE@h
#endif
_ENTRY(DTLBMiss_cmp)
- cmpli cr7, r10, (PAGE_OFFSET + 0x1800000)@h
- lis r11, (swapper_pg_dir-PAGE_OFFSET)@ha
+ cmpli cr7, r11, (PAGE_OFFSET + 0x1800000)@h
#ifndef CONFIG_PIN_TLB_IMMR
_ENTRY(DTLBMiss_jmp)
beq- DTLBMissIMMR
#endif
blt cr7, DTLBMissLinear
+ lis r11, (swapper_pg_dir-PAGE_OFFSET)@ha
3:
- mfspr r10, SPRN_MD_EPN
/* Insert level 1 index */
rlwimi r11, r10, 32 - ((PAGE_SHIFT - 2) << 1), (PAGE_SHIFT - 2) << 1, 29
@@ -569,8 +587,8 @@ _ENTRY(DTLBMiss_jmp)
InstructionTLBError:
EXCEPTION_PROLOG
mr r4,r12
- mr r5,r9
- andis. r10,r5,0x4000
+ andis. r5,r9,DSISR_SRR1_MATCH_32S@h /* Filter relevant SRR1 bits */
+ andis. r10,r9,SRR1_ISI_NOPT@h
beq+ 1f
tlbie r4
itlbie:
@@ -595,7 +613,7 @@ DARFixed:/* Return from dcbx instruction bug workaround */
mfspr r5,SPRN_DSISR
stw r5,_DSISR(r11)
mfspr r4,SPRN_DAR
- andis. r10,r5,0x4000
+ andis. r10,r5,DSISR_NOHPTE@h
beq+ 1f
tlbie r4
dtlbie:
@@ -684,7 +702,7 @@ DTLBMissLinear:
/* Set 8M byte page and mark it valid */
li r11, MD_PS8MEG | MD_SVALID
MTSPR_CPU6(SPRN_MD_TWC, r11, r3)
- rlwinm r10, r10, 16, 0x0f800000 /* 8xx supports max 256Mb RAM */
+ rlwinm r10, r10, 0, 0x0f800000 /* 8xx supports max 256Mb RAM */
ori r10, r10, 0xf0 | MD_SPS16K | _PAGE_SHARED | _PAGE_DIRTY | \
_PAGE_PRESENT
MTSPR_CPU6(SPRN_MD_RPN, r10, r11) /* Update TLB entry */
@@ -695,6 +713,22 @@ DTLBMissLinear:
EXCEPTION_EPILOG_0
rfi
+#ifndef CONFIG_PIN_TLB_TEXT
+ITLBMissLinear:
+ mtcr r3
+ /* Set 8M byte page and mark it valid */
+ li r11, MI_PS8MEG | MI_SVALID | _PAGE_EXEC
+ MTSPR_CPU6(SPRN_MI_TWC, r11, r3)
+ rlwinm r10, r10, 0, 0x0f800000 /* 8xx supports max 256Mb RAM */
+ ori r10, r10, 0xf0 | MI_SPS16K | _PAGE_SHARED | _PAGE_DIRTY | \
+ _PAGE_PRESENT
+ MTSPR_CPU6(SPRN_MI_RPN, r10, r11) /* Update TLB entry */
+
+ mfspr r3, SPRN_SPRG_SCRATCH2
+ EXCEPTION_EPILOG_0
+ rfi
+#endif
+
/* This is the procedure to calculate the data EA for buggy dcbx,dcbi instructions
* by decoding the registers used by the dcbx instruction and adding them.
* DAR is set to the calculated address.
@@ -705,9 +739,10 @@ FixupDAR:/* Entry point for dcbx workaround. */
mtspr SPRN_SPRG_SCRATCH2, r10
/* fetch instruction from memory. */
mfspr r10, SPRN_SRR0
- IS_KERNEL(r11, r10)
+ rlwinm r11, r10, 16, 0xfff8
+ cmpli cr0, r11, PAGE_OFFSET@h
mfspr r11, SPRN_M_TW /* Get level 1 table */
- BRANCH_UNLESS_KERNEL(3f)
+ blt+ 3f
rlwinm r11, r10, 16, 0xfff8
_ENTRY(FixupDAR_cmp)
cmpli cr7, r11, (PAGE_OFFSET + 0x1800000)@h
@@ -915,10 +950,8 @@ start_here:
rfi
/* Load up the kernel context */
2:
- SYNC /* Force all PTE updates to finish */
tlbia /* Clear all TLB entries */
sync /* wait for tlbia/tlbie to finish */
- TLBSYNC /* ... on all CPUs */
/* set up the PTE pointers for the Abatron bdiGDB.
*/
@@ -955,15 +988,14 @@ initial_mmu:
mtspr SPRN_MD_CTR, r10 /* remove PINNED DTLB entries */
tlbia /* Invalidate all TLB entries */
-/* Always pin the first 8 MB ITLB to prevent ITLB
- misses while mucking around with SRR0/SRR1 in asm
-*/
+#ifdef CONFIG_PIN_TLB_TEXT
lis r8, MI_RSV4I@h
ori r8, r8, 0x1c00
mtspr SPRN_MI_CTR, r8 /* Set instruction MMU control */
+#endif
-#ifdef CONFIG_PIN_TLB
+#ifdef CONFIG_PIN_TLB_DATA
oris r10, r10, MD_RSV4I@h
mtspr SPRN_MD_CTR, r10 /* Set data TLB control */
#endif
@@ -989,6 +1021,7 @@ initial_mmu:
* internal registers (among other things).
*/
#ifdef CONFIG_PIN_TLB_IMMR
+ oris r10, r10, MD_RSV4I@h
ori r10, r10, 0x1c00
mtspr SPRN_MD_CTR, r10
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 4898d676dcae..1125c9be9e06 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -30,7 +30,9 @@
* Use unused space in the interrupt stack to save and restore
* registers for winkle support.
*/
+#define _MMCR0 GPR0
#define _SDR1 GPR3
+#define _PTCR GPR3
#define _RPR GPR4
#define _SPURR GPR5
#define _PURR GPR6
@@ -39,7 +41,7 @@
#define _AMOR GPR9
#define _WORT GPR10
#define _WORC GPR11
-#define _PTCR GPR12
+#define _LPCR GPR12
#define PSSCR_EC_ESL_MASK_SHIFTED (PSSCR_EC | PSSCR_ESL) >> 16
@@ -55,12 +57,14 @@ save_sprs_to_stack:
* here since any thread in the core might wake up first
*/
BEGIN_FTR_SECTION
- mfspr r3,SPRN_PTCR
- std r3,_PTCR(r1)
/*
* Note - SDR1 is dropped in Power ISA v3. Hence not restoring
* SDR1 here
*/
+ mfspr r3,SPRN_PTCR
+ std r3,_PTCR(r1)
+ mfspr r3,SPRN_LPCR
+ std r3,_LPCR(r1)
FTR_SECTION_ELSE
mfspr r3,SPRN_SDR1
std r3,_SDR1(r1)
@@ -81,7 +85,61 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
std r3,_WORT(r1)
mfspr r3,SPRN_WORC
std r3,_WORC(r1)
+/*
+ * On POWER9, there are idle states such as stop4, invoked via cpuidle,
+ * that lose hypervisor resources. In such cases, we need to save
+ * additional SPRs before entering those idle states so that they can
+ * be restored to their older values on wakeup from the idle state.
+ *
+ * On POWER8, the only such deep idle state is winkle which is used
+ * only in the context of CPU-Hotplug, where these additional SPRs are
+ * reinitiazed to a sane value. Hence there is no need to save/restore
+ * these SPRs.
+ */
+BEGIN_FTR_SECTION
+ blr
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
+
+power9_save_additional_sprs:
+ mfspr r3, SPRN_PID
+ mfspr r4, SPRN_LDBAR
+ std r3, STOP_PID(r13)
+ std r4, STOP_LDBAR(r13)
+
+ mfspr r3, SPRN_FSCR
+ mfspr r4, SPRN_HFSCR
+ std r3, STOP_FSCR(r13)
+ std r4, STOP_HFSCR(r13)
+
+ mfspr r3, SPRN_MMCRA
+ mfspr r4, SPRN_MMCR1
+ std r3, STOP_MMCRA(r13)
+ std r4, STOP_MMCR1(r13)
+ mfspr r3, SPRN_MMCR2
+ std r3, STOP_MMCR2(r13)
+ blr
+
+power9_restore_additional_sprs:
+ ld r3,_LPCR(r1)
+ ld r4, STOP_PID(r13)
+ mtspr SPRN_LPCR,r3
+ mtspr SPRN_PID, r4
+
+ ld r3, STOP_LDBAR(r13)
+ ld r4, STOP_FSCR(r13)
+ mtspr SPRN_LDBAR, r3
+ mtspr SPRN_FSCR, r4
+
+ ld r3, STOP_HFSCR(r13)
+ ld r4, STOP_MMCRA(r13)
+ mtspr SPRN_HFSCR, r3
+ mtspr SPRN_MMCRA, r4
+ /* We have already restored PACA_MMCR0 */
+ ld r3, STOP_MMCR1(r13)
+ ld r4, STOP_MMCR2(r13)
+ mtspr SPRN_MMCR1, r3
+ mtspr SPRN_MMCR2, r4
blr
/*
@@ -106,13 +164,9 @@ core_idle_lock_held:
/*
* Pass requested state in r3:
* r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8
- * - Requested STOP state in POWER9
+ * - Requested PSSCR value in POWER9
*
- * To check IRQ_HAPPENED in r4
- * 0 - don't check
- * 1 - check
- *
- * Address to 'rfid' to in r5
+ * Address of idle handler to branch to in realmode in r4
*/
pnv_powersave_common:
/* Use r3 to pass state nap/sleep/winkle */
@@ -122,37 +176,14 @@ pnv_powersave_common:
* need to save PC, some CR bits and the NV GPRs,
* but for now an interrupt frame will do.
*/
+ mtctr r4
+
mflr r0
std r0,16(r1)
stdu r1,-INT_FRAME_SIZE(r1)
std r0,_LINK(r1)
std r0,_NIP(r1)
- /* Hard disable interrupts */
- mfmsr r9
- rldicl r9,r9,48,1
- rotldi r9,r9,16
- mtmsrd r9,1 /* hard-disable interrupts */
-
- /* Check if something happened while soft-disabled */
- lbz r0,PACAIRQHAPPENED(r13)
- andi. r0,r0,~PACA_IRQ_HARD_DIS@l
- beq 1f
- cmpwi cr0,r4,0
- beq 1f
- addi r1,r1,INT_FRAME_SIZE
- ld r0,16(r1)
- li r3,0 /* Return 0 (no nap) */
- mtlr r0
- blr
-
-1: /* We mark irqs hard disabled as this is the state we'll
- * be in when returning and we need to tell arch_local_irq_restore()
- * about it
- */
- li r0,PACA_IRQ_HARD_DIS
- stb r0,PACAIRQHAPPENED(r13)
-
/* We haven't lost state ... yet */
li r0,0
stb r0,PACA_NAPSTATELOST(r13)
@@ -160,24 +191,42 @@ pnv_powersave_common:
/* Continue saving state */
SAVE_GPR(2, r1)
SAVE_NVGPRS(r1)
- mfcr r4
- std r4,_CCR(r1)
- std r9,_MSR(r1)
+ mfcr r5
+ std r5,_CCR(r1)
std r1,PACAR1(r13)
+BEGIN_FTR_SECTION
/*
+ * POWER9 does not require real mode to stop, and presently does not
+ * set hwthread_state for KVM (threads don't share MMU context), so
+ * we can remain in virtual mode for this.
+ */
+ bctr
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+ /*
+ * POWER8
* Go to real mode to do the nap, as required by the architecture.
* Also, we need to be in real mode before setting hwthread_state,
* because as soon as we do that, another thread can switch
* the MMU context to the guest.
*/
LOAD_REG_IMMEDIATE(r7, MSR_IDLE)
- li r6, MSR_RI
- andc r6, r9, r6
- mtmsrd r6, 1 /* clear RI before setting SRR0/1 */
- mtspr SPRN_SRR0, r5
- mtspr SPRN_SRR1, r7
- rfid
+ mtmsrd r7,0
+ bctr
+
+/*
+ * This is the sequence required to execute idle instructions, as
+ * specified in ISA v2.07 (and earlier). MSR[IR] and MSR[DR] must be 0.
+ */
+#define IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST) \
+ /* Magic NAP/SLEEP/WINKLE mode enter sequence */ \
+ std r0,0(r1); \
+ ptesync; \
+ ld r0,0(r1); \
+236: cmpd cr0,r0,r0; \
+ bne 236b; \
+ IDLE_INST;
+
.globl pnv_enter_arch207_idle_mode
pnv_enter_arch207_idle_mode:
@@ -270,24 +319,52 @@ enter_winkle:
/*
* r3 - PSSCR value corresponding to the requested stop state.
*/
-power_enter_stop:
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
- /* Tell KVM we're entering idle */
+power_enter_stop_kvm_rm:
+ /*
+ * This is currently unused because POWER9 KVM does not have to
+ * gather secondary threads into sibling mode, but the code is
+ * here in case that function is required.
+ *
+ * Tell KVM we're entering idle.
+ */
li r4,KVM_HWTHREAD_IN_IDLE
/* DO THIS IN REAL MODE! See comment above. */
stb r4,HSTATE_HWTHREAD_STATE(r13)
#endif
+power_enter_stop:
/*
* Check if we are executing the lite variant with ESL=EC=0
*/
andis. r4,r3,PSSCR_EC_ESL_MASK_SHIFTED
clrldi r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */
bne .Lhandle_esl_ec_set
- IDLE_STATE_ENTER_SEQ(PPC_STOP)
+ PPC_STOP
li r3,0 /* Since we didn't lose state, return 0 */
+
+ /*
+ * pnv_wakeup_noloss() expects r12 to contain the SRR1 value so
+ * it can determine if the wakeup reason is an HMI in
+ * CHECK_HMI_INTERRUPT.
+ *
+ * However, when we wakeup with ESL=0, SRR1 will not contain the wakeup
+ * reason, so there is no point setting r12 to SRR1.
+ *
+ * Further, we clear r12 here, so that we don't accidentally enter the
+ * HMI in pnv_wakeup_noloss() if the value of r12[42:45] == WAKE_HMI.
+ */
+ li r12, 0
b pnv_wakeup_noloss
.Lhandle_esl_ec_set:
+ /*
+ * POWER9 DD2 can incorrectly set PMAO when waking up after a
+ * state-loss idle. Saving and restoring MMCR0 over idle is a
+ * workaround.
+ */
+ mfspr r4,SPRN_MMCR0
+ std r4,_MMCR0(r1)
+
/*
* Check if the requested state is a deep idle state.
*/
@@ -295,7 +372,8 @@ power_enter_stop:
ld r4,ADDROFF(pnv_first_deep_stop_state)(r5)
cmpd r3,r4
bge .Lhandle_deep_stop
- IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP)
+ PPC_STOP /* Does not return (system reset interrupt) */
+
.Lhandle_deep_stop:
/*
* Entering deep idle state.
@@ -317,47 +395,25 @@ lwarx_loop_stop:
bl save_sprs_to_stack
- IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP)
+ PPC_STOP /* Does not return (system reset interrupt) */
-_GLOBAL(power7_idle)
+/*
+ * Entered with MSR[EE]=0 and no soft-masked interrupts pending.
+ * r3 contains desired idle state (PNV_THREAD_NAP/SLEEP/WINKLE).
+ */
+_GLOBAL(power7_idle_insn)
/* Now check if user or arch enabled NAP mode */
- LOAD_REG_ADDRBASE(r3,powersave_nap)
- lwz r4,ADDROFF(powersave_nap)(r3)
- cmpwi 0,r4,0
- beqlr
- li r3, 1
- /* fall through */
-
-_GLOBAL(power7_nap)
- mr r4,r3
- li r3,PNV_THREAD_NAP
- LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode)
+ LOAD_REG_ADDR(r4, pnv_enter_arch207_idle_mode)
b pnv_powersave_common
- /* No return */
-
-_GLOBAL(power7_sleep)
- li r3,PNV_THREAD_SLEEP
- li r4,1
- LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode)
- b pnv_powersave_common
- /* No return */
-
-_GLOBAL(power7_winkle)
- li r3,PNV_THREAD_WINKLE
- li r4,1
- LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode)
- b pnv_powersave_common
- /* No return */
#define CHECK_HMI_INTERRUPT \
- mfspr r0,SPRN_SRR1; \
BEGIN_FTR_SECTION_NESTED(66); \
- rlwinm r0,r0,45-31,0xf; /* extract wake reason field (P8) */ \
+ rlwinm r0,r12,45-31,0xf; /* extract wake reason field (P8) */ \
FTR_SECTION_ELSE_NESTED(66); \
- rlwinm r0,r0,45-31,0xe; /* P7 wake reason field is 3 bits */ \
+ rlwinm r0,r12,45-31,0xe; /* P7 wake reason field is 3 bits */ \
ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \
cmpwi r0,0xa; /* Hypervisor maintenance ? */ \
- bne 20f; \
+ bne+ 20f; \
/* Invoke opal call to handle hmi */ \
ld r2,PACATOC(r13); \
ld r1,PACAR1(r13); \
@@ -369,16 +425,13 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \
20: nop;
/*
- * r3 - The PSSCR value corresponding to the stop state.
- * r4 - The PSSCR mask corrresonding to the stop state.
+ * Entered with MSR[EE]=0 and no soft-masked interrupts pending.
+ * r3 contains desired PSSCR register value.
*/
_GLOBAL(power9_idle_stop)
- mfspr r5,SPRN_PSSCR
- andc r5,r5,r4
- or r3,r3,r5
+ std r3, PACA_REQ_PSSCR(r13)
mtspr SPRN_PSSCR,r3
- LOAD_REG_ADDR(r5,power_enter_stop)
- li r4,1
+ LOAD_REG_ADDR(r4,power_enter_stop)
b pnv_powersave_common
/* No return */
@@ -436,17 +489,29 @@ pnv_powersave_wakeup_mce:
/*
* Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake
- * reason into SRR1, which allows reuse of the system reset wakeup
+ * reason into r12, which allows reuse of the system reset wakeup
* code without being mistaken for another type of wakeup.
*/
- oris r3,r3,SRR1_WAKEMCE_RESVD@h
- mtspr SPRN_SRR1,r3
+ oris r12,r3,SRR1_WAKEMCE_RESVD@h
b pnv_powersave_wakeup
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+kvm_start_guest_check:
+ li r0,KVM_HWTHREAD_IN_KERNEL
+ stb r0,HSTATE_HWTHREAD_STATE(r13)
+ /* Order setting hwthread_state vs. testing hwthread_req */
+ sync
+ lbz r0,HSTATE_HWTHREAD_REQ(r13)
+ cmpwi r0,0
+ beqlr
+ b kvm_start_guest
+#endif
+
/*
* Called from reset vector for powersave wakeups.
* cr3 - set to gt if waking up with partial/complete hypervisor state loss
+ * r12 - SRR1
*/
.global pnv_powersave_wakeup
pnv_powersave_wakeup:
@@ -464,20 +529,15 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
li r0,PNV_THREAD_RUNNING
stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */
+ mr r3,r12
+
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
- li r0,KVM_HWTHREAD_IN_KERNEL
- stb r0,HSTATE_HWTHREAD_STATE(r13)
- /* Order setting hwthread_state vs. testing hwthread_req */
- sync
- lbz r0,HSTATE_HWTHREAD_REQ(r13)
- cmpwi r0,0
- beq 1f
- b kvm_start_guest
-1:
+BEGIN_FTR_SECTION
+ bl kvm_start_guest_check
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
#endif
/* Return SRR1 from power7_nap() */
- mfspr r3,SPRN_SRR1
blt cr3,pnv_wakeup_noloss
b pnv_wakeup_loss
@@ -489,18 +549,45 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
*/
pnv_restore_hyp_resource_arch300:
/*
+ * Workaround for POWER9, if we lost resources, the ERAT
+ * might have been mixed up and needs flushing. We also need
+ * to reload MMCR0 (see comment above). We also need to set
+ * then clear bit 60 in MMCRA to ensure the PMU starts running.
+ */
+ blt cr3,1f
+ PPC_INVALIDATE_ERAT
+ ld r1,PACAR1(r13)
+ mfspr r4,SPRN_MMCRA
+ ori r4,r4,(1 << (63-60))
+ mtspr SPRN_MMCRA,r4
+ xori r4,r4,(1 << (63-60))
+ mtspr SPRN_MMCRA,r4
+ ld r4,_MMCR0(r1)
+ mtspr SPRN_MMCR0,r4
+1:
+ /*
* POWER ISA 3. Use PSSCR to determine if we
* are waking up from deep idle state
*/
LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
ld r4,ADDROFF(pnv_first_deep_stop_state)(r5)
- mfspr r5,SPRN_PSSCR
+BEGIN_FTR_SECTION_NESTED(71)
+ /*
+ * Assume that we are waking up from the state
+ * same as the Requested Level (RL) in the PSSCR
+ * which are Bits 60-63
+ */
+ ld r5,PACA_REQ_PSSCR(r13)
+ rldicl r5,r5,0,60
+FTR_SECTION_ELSE_NESTED(71)
/*
* 0-3 bits correspond to Power-Saving Level Status
* which indicates the idle state we are waking up from
*/
+ mfspr r5, SPRN_PSSCR
rldicl r5,r5,4,60
+ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_POWER9_DD1, 71)
cmpd cr4,r5,r4
bge cr4,pnv_wakeup_tb_loss /* returns to caller */
@@ -567,9 +654,9 @@ pnv_wakeup_tb_loss:
* is required to return back to reset vector after hypervisor state
* restore is complete.
*/
+ mr r19,r12
mr r18,r4
mflr r17
- mfspr r16,SPRN_SRR1
BEGIN_FTR_SECTION
CHECK_HMI_INTERRUPT
END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
@@ -731,13 +818,14 @@ timebase_resync:
* Use cr3 which indicates that we are waking up with atleast partial
* hypervisor state loss to determine if TIMEBASE RESYNC is needed.
*/
- ble cr3,clear_lock
+ ble cr3,.Ltb_resynced
/* Time base re-sync */
bl opal_resync_timebase;
/*
- * If waking up from sleep, per core state is not lost, skip to
- * clear_lock.
+ * If waking up from sleep (POWER8), per core state
+ * is not lost, skip to clear_lock.
*/
+.Ltb_resynced:
blt cr4,clear_lock
/*
@@ -812,9 +900,20 @@ no_segments:
mtctr r12
bctrl
+/*
+ * On POWER9, we can come here on wakeup from a cpuidle stop state.
+ * Hence restore the additional SPRs to the saved value.
+ *
+ * On POWER8, we come here only on winkle. Since winkle is used
+ * only in the case of CPU-Hotplug, we don't need to restore
+ * the additional SPRs.
+ */
+BEGIN_FTR_SECTION
+ bl power9_restore_additional_sprs
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
hypervisor_state_restored:
- mtspr SPRN_SRR1,r16
+ mr r12,r19
mtlr r17
blr /* return to pnv_powersave_wakeup */
@@ -827,6 +926,7 @@ fastsleep_workaround_at_exit:
/*
* R3 here contains the value that will be returned to the caller
* of power7_nap.
+ * R12 contains SRR1 for CHECK_HMI_INTERRUPT.
*/
.global pnv_wakeup_loss
pnv_wakeup_loss:
@@ -836,32 +936,33 @@ BEGIN_FTR_SECTION
END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
REST_NVGPRS(r1)
REST_GPR(2, r1)
+ ld r4,PACAKMSR(r13)
+ ld r5,_LINK(r1)
ld r6,_CCR(r1)
- ld r4,_MSR(r1)
- ld r5,_NIP(r1)
addi r1,r1,INT_FRAME_SIZE
+ mtlr r5
mtcr r6
- mtspr SPRN_SRR1,r4
- mtspr SPRN_SRR0,r5
- rfid
+ mtmsrd r4
+ blr
/*
* R3 here contains the value that will be returned to the caller
* of power7_nap.
+ * R12 contains SRR1 for CHECK_HMI_INTERRUPT.
*/
pnv_wakeup_noloss:
lbz r0,PACA_NAPSTATELOST(r13)
cmpwi r0,0
bne pnv_wakeup_loss
+ ld r1,PACAR1(r13)
BEGIN_FTR_SECTION
CHECK_HMI_INTERRUPT
END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
- ld r1,PACAR1(r13)
- ld r6,_CCR(r1)
- ld r4,_MSR(r1)
+ ld r4,PACAKMSR(r13)
ld r5,_NIP(r1)
+ ld r6,_CCR(r1)
addi r1,r1,INT_FRAME_SIZE
+ mtlr r5
mtcr r6
- mtspr SPRN_SRR1,r4
- mtspr SPRN_SRR0,r5
- rfid
+ mtmsrd r4
+ blr
diff --git a/arch/powerpc/kernel/io-workarounds.c b/arch/powerpc/kernel/io-workarounds.c
index a582e0d42525..aa9f1b8261db 100644
--- a/arch/powerpc/kernel/io-workarounds.c
+++ b/arch/powerpc/kernel/io-workarounds.c
@@ -19,6 +19,8 @@
#include <asm/pgtable.h>
#include <asm/ppc-pci.h>
#include <asm/io-workarounds.h>
+#include <asm/pte-walk.h>
+
#define IOWA_MAX_BUS 8
@@ -75,8 +77,7 @@ struct iowa_bus *iowa_mem_find_bus(const PCI_IO_ADDR addr)
* We won't find huge pages here (iomem). Also can't hit
* a page table free due to init_mm
*/
- ptep = __find_linux_pte_or_hugepte(init_mm.pgd, vaddr,
- NULL, &hugepage_shift);
+ ptep = find_init_mm_pte(vaddr, &hugepage_shift);
if (ptep == NULL)
paddr = 0;
else {
@@ -192,7 +193,7 @@ void iowa_register_bus(struct pci_controller *phb, struct ppc_pci_io *ops,
if (iowa_bus_count >= IOWA_MAX_BUS) {
pr_err("IOWA:Too many pci bridges, "
- "workarounds disabled for %s\n", np->full_name);
+ "workarounds disabled for %pOF\n", np);
return;
}
@@ -207,6 +208,6 @@ void iowa_register_bus(struct pci_controller *phb, struct ppc_pci_io *ops,
iowa_bus_count++;
- pr_debug("IOWA:[%d]Add bus, %s.\n", iowa_bus_count-1, np->full_name);
+ pr_debug("IOWA:[%d]Add bus, %pOF.\n", iowa_bus_count-1, np);
}
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index f2b724cd9e64..af7a20dc6e09 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -127,8 +127,7 @@ static ssize_t fail_iommu_store(struct device *dev,
return count;
}
-static DEVICE_ATTR(fail_iommu, S_IRUGO|S_IWUSR, fail_iommu_show,
- fail_iommu_store);
+static DEVICE_ATTR_RW(fail_iommu);
static int fail_iommu_bus_notify(struct notifier_block *nb,
unsigned long action, void *data)
@@ -190,7 +189,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
unsigned int pool_nr;
struct iommu_pool *pool;
- align_mask = 0xffffffffffffffffl >> (64 - align_order);
+ align_mask = (1ull << align_order) - 1;
/* This allocator was derived from x86_64's bit string search */
@@ -198,17 +197,17 @@ static unsigned long iommu_range_alloc(struct device *dev,
if (unlikely(npages == 0)) {
if (printk_ratelimit())
WARN_ON(1);
- return DMA_ERROR_CODE;
+ return IOMMU_MAPPING_ERROR;
}
if (should_fail_iommu(dev))
- return DMA_ERROR_CODE;
+ return IOMMU_MAPPING_ERROR;
/*
* We don't need to disable preemption here because any CPU can
* safely use any IOMMU pool.
*/
- pool_nr = __this_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1);
+ pool_nr = raw_cpu_read(iommu_pool_hash) & (tbl->nr_pools - 1);
if (largealloc)
pool = &(tbl->large_pool);
@@ -278,7 +277,7 @@ again:
} else {
/* Give up */
spin_unlock_irqrestore(&(pool->lock), flags);
- return DMA_ERROR_CODE;
+ return IOMMU_MAPPING_ERROR;
}
}
@@ -310,13 +309,13 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
unsigned long attrs)
{
unsigned long entry;
- dma_addr_t ret = DMA_ERROR_CODE;
+ dma_addr_t ret = IOMMU_MAPPING_ERROR;
int build_fail;
entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
- if (unlikely(entry == DMA_ERROR_CODE))
- return DMA_ERROR_CODE;
+ if (unlikely(entry == IOMMU_MAPPING_ERROR))
+ return IOMMU_MAPPING_ERROR;
entry += tbl->it_offset; /* Offset into real TCE table */
ret = entry << tbl->it_page_shift; /* Set the return dma address */
@@ -328,12 +327,12 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
/* tbl->it_ops->set() only returns non-zero for transient errors.
* Clean up the table bitmap in this case and return
- * DMA_ERROR_CODE. For all other errors the functionality is
+ * IOMMU_MAPPING_ERROR. For all other errors the functionality is
* not altered.
*/
if (unlikely(build_fail)) {
__iommu_free(tbl, ret, npages);
- return DMA_ERROR_CODE;
+ return IOMMU_MAPPING_ERROR;
}
/* Flush/invalidate TLB caches if necessary */
@@ -478,7 +477,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
DBG(" - vaddr: %lx, size: %lx\n", vaddr, slen);
/* Handle failure */
- if (unlikely(entry == DMA_ERROR_CODE)) {
+ if (unlikely(entry == IOMMU_MAPPING_ERROR)) {
if (!(attrs & DMA_ATTR_NO_WARN) &&
printk_ratelimit())
dev_info(dev, "iommu_alloc failed, tbl %p "
@@ -545,7 +544,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
*/
if (outcount < incount) {
outs = sg_next(outs);
- outs->dma_address = DMA_ERROR_CODE;
+ outs->dma_address = IOMMU_MAPPING_ERROR;
outs->dma_length = 0;
}
@@ -563,7 +562,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table *tbl,
npages = iommu_num_pages(s->dma_address, s->dma_length,
IOMMU_PAGE_SIZE(tbl));
__iommu_free(tbl, vaddr, npages);
- s->dma_address = DMA_ERROR_CODE;
+ s->dma_address = IOMMU_MAPPING_ERROR;
s->dma_length = 0;
}
if (s == outs)
@@ -777,7 +776,7 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
unsigned long mask, enum dma_data_direction direction,
unsigned long attrs)
{
- dma_addr_t dma_handle = DMA_ERROR_CODE;
+ dma_addr_t dma_handle = IOMMU_MAPPING_ERROR;
void *vaddr;
unsigned long uaddr;
unsigned int npages, align;
@@ -797,7 +796,7 @@ dma_addr_t iommu_map_page(struct device *dev, struct iommu_table *tbl,
dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction,
mask >> tbl->it_page_shift, align,
attrs);
- if (dma_handle == DMA_ERROR_CODE) {
+ if (dma_handle == IOMMU_MAPPING_ERROR) {
if (!(attrs & DMA_ATTR_NO_WARN) &&
printk_ratelimit()) {
dev_info(dev, "iommu_alloc failed, tbl %p "
@@ -869,7 +868,7 @@ void *iommu_alloc_coherent(struct device *dev, struct iommu_table *tbl,
io_order = get_iommu_order(size, tbl);
mapping = iommu_alloc(dev, tbl, ret, nio_pages, DMA_BIDIRECTIONAL,
mask >> tbl->it_page_shift, io_order, 0);
- if (mapping == DMA_ERROR_CODE) {
+ if (mapping == IOMMU_MAPPING_ERROR) {
free_pages((unsigned long)ret, order);
return NULL;
}
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 5c291df30fe3..4e65bf82f5e0 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -24,7 +24,7 @@
* mask register (of which only 16 are defined), hence the weird shifting
* and complement of the cached_irq_mask. I want to be able to stuff
* this right into the SIU SMASK register.
- * Many of the prep/chrp functions are conditional compiled on CONFIG_8xx
+ * Many of the prep/chrp functions are conditional compiled on CONFIG_PPC_8xx
* to reduce code space and undefined function references.
*/
@@ -143,8 +143,22 @@ notrace unsigned int __check_irq_replay(void)
*/
unsigned char happened = local_paca->irq_happened;
- /* Clear bit 0 which we wouldn't clear otherwise */
- local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
+ if (happened & PACA_IRQ_HARD_DIS) {
+ /* Clear bit 0 which we wouldn't clear otherwise */
+ local_paca->irq_happened &= ~PACA_IRQ_HARD_DIS;
+
+ /*
+ * We may have missed a decrementer interrupt if hard disabled.
+ * Check the decrementer register in case we had a rollover
+ * while hard disabled.
+ */
+ if (!(happened & PACA_IRQ_DEC)) {
+ if (decrementer_check_overflow()) {
+ local_paca->irq_happened |= PACA_IRQ_DEC;
+ happened |= PACA_IRQ_DEC;
+ }
+ }
+ }
/*
* Force the delivery of pending soft-disabled interrupts on PS3.
@@ -160,41 +174,39 @@ notrace unsigned int __check_irq_replay(void)
* This is a higher priority interrupt than the others, so
* replay it first.
*/
- local_paca->irq_happened &= ~PACA_IRQ_HMI;
- if (happened & PACA_IRQ_HMI)
+ if (happened & PACA_IRQ_HMI) {
+ local_paca->irq_happened &= ~PACA_IRQ_HMI;
return 0xe60;
+ }
- /*
- * We may have missed a decrementer interrupt. We check the
- * decrementer itself rather than the paca irq_happened field
- * in case we also had a rollover while hard disabled
- */
- local_paca->irq_happened &= ~PACA_IRQ_DEC;
- if ((happened & PACA_IRQ_DEC) || decrementer_check_overflow())
+ if (happened & PACA_IRQ_DEC) {
+ local_paca->irq_happened &= ~PACA_IRQ_DEC;
return 0x900;
+ }
- /* Finally check if an external interrupt happened */
- local_paca->irq_happened &= ~PACA_IRQ_EE;
- if (happened & PACA_IRQ_EE)
+ if (happened & PACA_IRQ_EE) {
+ local_paca->irq_happened &= ~PACA_IRQ_EE;
return 0x500;
+ }
#ifdef CONFIG_PPC_BOOK3E
- /* Finally check if an EPR external interrupt happened
- * this bit is typically set if we need to handle another
- * "edge" interrupt from within the MPIC "EPR" handler
+ /*
+ * Check if an EPR external interrupt happened this bit is typically
+ * set if we need to handle another "edge" interrupt from within the
+ * MPIC "EPR" handler.
*/
- local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE;
- if (happened & PACA_IRQ_EE_EDGE)
+ if (happened & PACA_IRQ_EE_EDGE) {
+ local_paca->irq_happened &= ~PACA_IRQ_EE_EDGE;
return 0x500;
+ }
- local_paca->irq_happened &= ~PACA_IRQ_DBELL;
- if (happened & PACA_IRQ_DBELL)
+ if (happened & PACA_IRQ_DBELL) {
+ local_paca->irq_happened &= ~PACA_IRQ_DBELL;
return 0x280;
+ }
#else
- local_paca->irq_happened &= ~PACA_IRQ_DBELL;
if (happened & PACA_IRQ_DBELL) {
- if (cpu_has_feature(CPU_FTR_HVMODE))
- return 0xe80;
+ local_paca->irq_happened &= ~PACA_IRQ_DBELL;
return 0xa00;
}
#endif /* CONFIG_PPC_BOOK3E */
@@ -322,7 +334,8 @@ bool prep_irq_for_idle(void)
* First we need to hard disable to ensure no interrupt
* occurs before we effectively enter the low power state
*/
- hard_irq_disable();
+ __hard_irq_disable();
+ local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
/*
* If anything happened while we were soft-disabled,
@@ -347,6 +360,65 @@ bool prep_irq_for_idle(void)
return true;
}
+#ifdef CONFIG_PPC_BOOK3S
+/*
+ * This is for idle sequences that return with IRQs off, but the
+ * idle state itself wakes on interrupt. Tell the irq tracer that
+ * IRQs are enabled for the duration of idle so it does not get long
+ * off times. Must be paired with fini_irq_for_idle_irqsoff.
+ */
+bool prep_irq_for_idle_irqsoff(void)
+{
+ WARN_ON(!irqs_disabled());
+
+ /*
+ * First we need to hard disable to ensure no interrupt
+ * occurs before we effectively enter the low power state
+ */
+ __hard_irq_disable();
+ local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+
+ /*
+ * If anything happened while we were soft-disabled,
+ * we return now and do not enter the low power state.
+ */
+ if (lazy_irq_pending())
+ return false;
+
+ /* Tell lockdep we are about to re-enable */
+ trace_hardirqs_on();
+
+ return true;
+}
+
+/*
+ * Take the SRR1 wakeup reason, index into this table to find the
+ * appropriate irq_happened bit.
+ */
+static const u8 srr1_to_lazyirq[0x10] = {
+ 0, 0, 0,
+ PACA_IRQ_DBELL,
+ 0,
+ PACA_IRQ_DBELL,
+ PACA_IRQ_DEC,
+ 0,
+ PACA_IRQ_EE,
+ PACA_IRQ_EE,
+ PACA_IRQ_HMI,
+ 0, 0, 0, 0, 0 };
+
+void irq_set_pending_from_srr1(unsigned long srr1)
+{
+ unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18;
+
+ /*
+ * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0,
+ * so this can be called unconditionally with srr1 wake reason.
+ */
+ local_paca->irq_happened |= srr1_to_lazyirq[idx];
+}
+#endif /* CONFIG_PPC_BOOK3S */
+
/*
* Force a replay of the external interrupt handler on this CPU.
*/
@@ -410,6 +482,18 @@ int arch_show_interrupts(struct seq_file *p, int prec)
seq_printf(p, " Hypervisor Maintenance Interrupts\n");
}
+ seq_printf(p, "%*s: ", prec, "NMI");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", per_cpu(irq_stat, j).sreset_irqs);
+ seq_printf(p, " System Reset interrupts\n");
+
+#ifdef CONFIG_PPC_WATCHDOG
+ seq_printf(p, "%*s: ", prec, "WDG");
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", per_cpu(irq_stat, j).soft_nmi_irqs);
+ seq_printf(p, " Watchdog soft-NMI interrupts\n");
+#endif
+
#ifdef CONFIG_PPC_DOORBELL
if (cpu_has_feature(CPU_FTR_DBELL)) {
seq_printf(p, "%*s: ", prec, "DBL");
@@ -434,6 +518,10 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
sum += per_cpu(irq_stat, cpu).spurious_irqs;
sum += per_cpu(irq_stat, cpu).timer_irqs_others;
sum += per_cpu(irq_stat, cpu).hmi_exceptions;
+ sum += per_cpu(irq_stat, cpu).sreset_irqs;
+#ifdef CONFIG_PPC_WATCHDOG
+ sum += per_cpu(irq_stat, cpu).soft_nmi_irqs;
+#endif
#ifdef CONFIG_PPC_DOORBELL
sum += per_cpu(irq_stat, cpu).doorbell_irqs;
#endif
diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c
index bb6f8993412e..1df6c74aa731 100644
--- a/arch/powerpc/kernel/isa-bridge.c
+++ b/arch/powerpc/kernel/isa-bridge.c
@@ -164,7 +164,7 @@ void __init isa_bridge_find_early(struct pci_controller *hose)
/* Set the global ISA io base to indicate we have an ISA bridge */
isa_io_base = ISA_IO_BASE;
- pr_debug("ISA bridge (early) is %s\n", np->full_name);
+ pr_debug("ISA bridge (early) is %pOF\n", np);
}
/**
@@ -187,15 +187,15 @@ void __init isa_bridge_init_non_pci(struct device_node *np)
pna = of_n_addr_cells(np);
if (of_property_read_u32(np, "#address-cells", &na) ||
of_property_read_u32(np, "#size-cells", &ns)) {
- pr_warn("ISA: Non-PCI bridge %s is missing address format\n",
- np->full_name);
+ pr_warn("ISA: Non-PCI bridge %pOF is missing address format\n",
+ np);
return;
}
/* Check it's a supported address format */
if (na != 2 || ns != 1) {
- pr_warn("ISA: Non-PCI bridge %s has unsupported address format\n",
- np->full_name);
+ pr_warn("ISA: Non-PCI bridge %pOF has unsupported address format\n",
+ np);
return;
}
rs = na + ns + pna;
@@ -203,8 +203,8 @@ void __init isa_bridge_init_non_pci(struct device_node *np)
/* Grab the ranges property */
ranges = of_get_property(np, "ranges", &rlen);
if (ranges == NULL || rlen < rs) {
- pr_warn("ISA: Non-PCI bridge %s has absent or invalid ranges\n",
- np->full_name);
+ pr_warn("ISA: Non-PCI bridge %pOF has absent or invalid ranges\n",
+ np);
return;
}
@@ -220,8 +220,8 @@ void __init isa_bridge_init_non_pci(struct device_node *np)
/* Got something ? */
if (!size || !pbasep) {
- pr_warn("ISA: Non-PCI bridge %s has no usable IO range\n",
- np->full_name);
+ pr_warn("ISA: Non-PCI bridge %pOF has no usable IO range\n",
+ np);
return;
}
@@ -233,15 +233,15 @@ void __init isa_bridge_init_non_pci(struct device_node *np)
/* Map pbase */
pbase = of_translate_address(np, pbasep);
if (pbase == OF_BAD_ADDR) {
- pr_warn("ISA: Non-PCI bridge %s failed to translate IO base\n",
- np->full_name);
+ pr_warn("ISA: Non-PCI bridge %pOF failed to translate IO base\n",
+ np);
return;
}
/* We need page alignment */
if ((cbase & ~PAGE_MASK) || (pbase & ~PAGE_MASK)) {
- pr_warn("ISA: Non-PCI bridge %s has non aligned IO range\n",
- np->full_name);
+ pr_warn("ISA: Non-PCI bridge %pOF has non aligned IO range\n",
+ np);
return;
}
@@ -255,7 +255,7 @@ void __init isa_bridge_init_non_pci(struct device_node *np)
__ioremap_at(pbase, (void *)ISA_IO_BASE,
size, pgprot_val(pgprot_noncached(__pgprot(0))));
- pr_debug("ISA: Non-PCI bridge is %s\n", np->full_name);
+ pr_debug("ISA: Non-PCI bridge is %pOF\n", np);
}
/**
@@ -277,8 +277,8 @@ static void isa_bridge_find_late(struct pci_dev *pdev,
/* Set the global ISA io base to indicate we have an ISA bridge */
isa_io_base = ISA_IO_BASE;
- pr_debug("ISA bridge (late) is %s on %s\n",
- devnode->full_name, pci_name(pdev));
+ pr_debug("ISA bridge (late) is %pOF on %s\n",
+ devnode, pci_name(pdev));
}
/**
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index dbf098121ce6..35e240a0a408 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -67,9 +67,9 @@ static struct hard_trap_info
#endif
#else /* ! (defined(CONFIG_40x) || defined(CONFIG_BOOKE)) */
{ 0x0d00, 0x05 /* SIGTRAP */ }, /* single-step */
-#if defined(CONFIG_8xx)
+#if defined(CONFIG_PPC_8xx)
{ 0x1000, 0x04 /* SIGILL */ }, /* software emulation */
-#else /* ! CONFIG_8xx */
+#else /* ! CONFIG_PPC_8xx */
{ 0x0f00, 0x04 /* SIGILL */ }, /* performance monitor */
{ 0x0f20, 0x08 /* SIGFPE */ }, /* altivec unavailable */
{ 0x1300, 0x05 /* SIGTRAP */ }, /* instruction address break */
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index 01addfb0ed0a..367494dc67d9 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -164,17 +164,13 @@ NOKPROBE_SYMBOL(arch_prepare_kprobe);
void arch_arm_kprobe(struct kprobe *p)
{
- *p->addr = BREAKPOINT_INSTRUCTION;
- flush_icache_range((unsigned long) p->addr,
- (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+ patch_instruction(p->addr, BREAKPOINT_INSTRUCTION);
}
NOKPROBE_SYMBOL(arch_arm_kprobe);
void arch_disarm_kprobe(struct kprobe *p)
{
- *p->addr = p->opcode;
- flush_icache_range((unsigned long) p->addr,
- (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+ patch_instruction(p->addr, p->opcode);
}
NOKPROBE_SYMBOL(arch_disarm_kprobe);
@@ -221,7 +217,7 @@ static nokprobe_inline void set_current_kprobe(struct kprobe *p, struct pt_regs
kcb->kprobe_saved_msr = regs->msr;
}
-bool arch_function_offset_within_entry(unsigned long offset)
+bool arch_kprobe_on_func_entry(unsigned long offset)
{
#ifdef PPC64_ELF_ABI_v2
#ifdef CONFIG_KPROBES_ON_FTRACE
diff --git a/arch/powerpc/kernel/l2cr_6xx.S b/arch/powerpc/kernel/l2cr_6xx.S
index 97ec8557f974..6408f09dbbd9 100644
--- a/arch/powerpc/kernel/l2cr_6xx.S
+++ b/arch/powerpc/kernel/l2cr_6xx.S
@@ -181,7 +181,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPEC7450)
mtctr r4
li r4,0
1:
- lwzx r0,r0,r4
+ lwzx r0,0,r4
addi r4,r4,32 /* Go to start of next cache line */
bdnz 1b
isync
@@ -328,7 +328,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_L3CR)
mtctr r4
li r4,0
1:
- lwzx r0,r0,r4
+ lwzx r0,0,r4
dcbf 0,r4
addi r4,r4,32 /* Go to start of next cache line */
bdnz 1b
diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c
index 0694d20f85b6..5e5a64a8b4e4 100644
--- a/arch/powerpc/kernel/legacy_serial.c
+++ b/arch/powerpc/kernel/legacy_serial.c
@@ -147,8 +147,8 @@ static int __init add_legacy_port(struct device_node *np, int want_index,
legacy_serial_ports[index].serial_out = tsi_serial_out;
}
- printk(KERN_DEBUG "Found legacy serial port %d for %s\n",
- index, np->full_name);
+ printk(KERN_DEBUG "Found legacy serial port %d for %pOF\n",
+ index, np);
printk(KERN_DEBUG " %s=%llx, taddr=%llx, irq=%lx, clk=%d, speed=%d\n",
(iotype == UPIO_PORT) ? "port" : "mem",
(unsigned long long)base, (unsigned long long)taddr, irq,
@@ -207,7 +207,7 @@ static int __init add_legacy_isa_port(struct device_node *np,
int index = -1;
u64 taddr;
- DBG(" -> add_legacy_isa_port(%s)\n", np->full_name);
+ DBG(" -> add_legacy_isa_port(%pOF)\n", np);
/* Get the ISA port number */
reg = of_get_property(np, "reg", NULL);
@@ -256,7 +256,7 @@ static int __init add_legacy_pci_port(struct device_node *np,
unsigned int flags;
int iotype, index = -1, lindex = 0;
- DBG(" -> add_legacy_pci_port(%s)\n", np->full_name);
+ DBG(" -> add_legacy_pci_port(%pOF)\n", np);
/* We only support ports that have a clock frequency properly
* encoded in the device-tree (that is have an fcode). Anything
@@ -374,7 +374,7 @@ void __init find_legacy_serial_ports(void)
if (path != NULL) {
stdout = of_find_node_by_path(path);
if (stdout)
- DBG("stdout is %s\n", stdout->full_name);
+ DBG("stdout is %pOF\n", stdout);
} else {
DBG(" no linux,stdout-path !\n");
}
@@ -603,7 +603,7 @@ static int __init check_legacy_serial_console(void)
DBG(" can't find stdout package %s !\n", name);
return -ENODEV;
}
- DBG("stdout is %s\n", prom_stdout->full_name);
+ DBG("stdout is %pOF\n", prom_stdout);
name = of_get_property(prom_stdout, "name", NULL);
if (!name) {
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 5f9eada3519b..9b2ea7e71c06 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -22,11 +22,14 @@
#undef DEBUG
#define pr_fmt(fmt) "mce: " fmt
+#include <linux/hardirq.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/percpu.h>
#include <linux/export.h>
#include <linux/irq_work.h>
+
+#include <asm/machdep.h>
#include <asm/mce.h>
static DEFINE_PER_CPU(int, mce_nest_count);
@@ -268,6 +271,7 @@ void machine_check_print_event_info(struct machine_check_event *evt,
static const char *mc_ra_types[] = {
"Indeterminate",
"Instruction fetch (bad)",
+ "Instruction fetch (foreign)",
"Page table walk ifetch (bad)",
"Page table walk ifetch (foreign)",
"Load (bad)",
@@ -405,6 +409,7 @@ void machine_check_print_event_info(struct machine_check_event *evt,
break;
}
}
+EXPORT_SYMBOL_GPL(machine_check_print_event_info);
uint64_t get_mce_fault_addr(struct machine_check_event *evt)
{
@@ -444,3 +449,33 @@ uint64_t get_mce_fault_addr(struct machine_check_event *evt)
return 0;
}
EXPORT_SYMBOL(get_mce_fault_addr);
+
+/*
+ * This function is called in real mode. Strictly no printk's please.
+ *
+ * regs->nip and regs->msr contains srr0 and ssr1.
+ */
+long machine_check_early(struct pt_regs *regs)
+{
+ long handled = 0;
+
+ __this_cpu_inc(irq_stat.mce_exceptions);
+
+ if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
+ handled = cur_cpu_spec->machine_check_early(regs);
+ return handled;
+}
+
+long hmi_exception_realmode(struct pt_regs *regs)
+{
+ __this_cpu_inc(irq_stat.hmi_exceptions);
+
+ wait_for_subcore_guest_exit();
+
+ if (ppc_md.hmi_exception_early)
+ ppc_md.hmi_exception_early(regs);
+
+ wait_for_tb_resync();
+
+ return 0;
+}
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index f913139bb0c2..b76ca198e09c 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -53,6 +53,60 @@ static void flush_tlb_206(unsigned int num_sets, unsigned int action)
asm volatile("ptesync" : : : "memory");
}
+static void flush_tlb_300(unsigned int num_sets, unsigned int action)
+{
+ unsigned long rb;
+ unsigned int i;
+ unsigned int r;
+
+ switch (action) {
+ case TLB_INVAL_SCOPE_GLOBAL:
+ rb = TLBIEL_INVAL_SET;
+ break;
+ case TLB_INVAL_SCOPE_LPID:
+ rb = TLBIEL_INVAL_SET_LPID;
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ asm volatile("ptesync" : : : "memory");
+
+ if (early_radix_enabled())
+ r = 1;
+ else
+ r = 0;
+
+ /*
+ * First flush table/PWC caches with set 0, then flush the
+ * rest of the sets, partition scope. Radix must then do it
+ * all again with process scope. Hash just has to flush
+ * process table.
+ */
+ asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) : :
+ "r"(rb), "r"(0), "i"(2), "i"(0), "r"(r));
+ for (i = 1; i < num_sets; i++) {
+ unsigned long set = i * (1<<TLBIEL_INVAL_SET_SHIFT);
+
+ asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) : :
+ "r"(rb+set), "r"(0), "i"(2), "i"(0), "r"(r));
+ }
+
+ asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) : :
+ "r"(rb), "r"(0), "i"(2), "i"(1), "r"(r));
+ if (early_radix_enabled()) {
+ for (i = 1; i < num_sets; i++) {
+ unsigned long set = i * (1<<TLBIEL_INVAL_SET_SHIFT);
+
+ asm volatile(PPC_TLBIEL(%0, %1, %2, %3, %4) : :
+ "r"(rb+set), "r"(0), "i"(2), "i"(1), "r"(r));
+ }
+ }
+
+ asm volatile("ptesync" : : : "memory");
+}
+
/*
* Generic routines to flush TLB on POWER processors. These routines
* are used as flush_tlb hook in the cpu_spec.
@@ -79,7 +133,7 @@ void __flush_tlb_power9(unsigned int action)
else
num_sets = POWER9_TLB_SETS_HASH;
- flush_tlb_206(num_sets, action);
+ flush_tlb_300(num_sets, action);
}
@@ -236,6 +290,9 @@ static const struct mce_ierror_table mce_p9_ierror_table[] = {
{ 0x00000000081c0000, 0x0000000000180000, true,
MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000001c0000, true,
+ MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH_FOREIGN,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
{ 0x00000000081c0000, 0x0000000008000000, true,
MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S
index 84db14e435f5..3f7a9a2d2435 100644
--- a/arch/powerpc/kernel/misc_32.S
+++ b/arch/powerpc/kernel/misc_32.S
@@ -244,8 +244,7 @@ _GLOBAL(_nmask_and_or_msr)
*/
_GLOBAL(real_readb)
mfmsr r7
- ori r0,r7,MSR_DR
- xori r0,r0,MSR_DR
+ rlwinm r0,r7,0,~MSR_DR
sync
mtmsr r0
sync
@@ -262,8 +261,7 @@ _GLOBAL(real_readb)
*/
_GLOBAL(real_writeb)
mfmsr r7
- ori r0,r7,MSR_DR
- xori r0,r0,MSR_DR
+ rlwinm r0,r7,0,~MSR_DR
sync
mtmsr r0
sync
diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S
index c119044cad0d..8ac0bd2bddb0 100644
--- a/arch/powerpc/kernel/misc_64.S
+++ b/arch/powerpc/kernel/misc_64.S
@@ -614,6 +614,18 @@ _GLOBAL(kexec_sequence)
li r0,0
std r0,16(r1)
+BEGIN_FTR_SECTION
+ /*
+ * This is the best time to turn AMR/IAMR off.
+ * key 0 is used in radix for supervisor<->user
+ * protection, but on hash key 0 is reserved
+ * ideally we want to enter with a clean state.
+ * NOTE, we rely on r0 being 0 from above.
+ */
+ mtspr SPRN_IAMR,r0
+ mtspr SPRN_AMOR,r0
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+
/* save regs for local vars on new stack.
* yes, we won't go back, but ...
*/
diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c
index eae61b044e9e..496d6393bd41 100644
--- a/arch/powerpc/kernel/nvram_64.c
+++ b/arch/powerpc/kernel/nvram_64.c
@@ -792,21 +792,17 @@ static ssize_t dev_nvram_write(struct file *file, const char __user *buf,
count = min_t(size_t, count, size - *ppos);
count = min(count, PAGE_SIZE);
- ret = -ENOMEM;
- tmp = kmalloc(count, GFP_KERNEL);
- if (!tmp)
- goto out;
-
- ret = -EFAULT;
- if (copy_from_user(tmp, buf, count))
+ tmp = memdup_user(buf, count);
+ if (IS_ERR(tmp)) {
+ ret = PTR_ERR(tmp);
goto out;
+ }
ret = ppc_md.nvram_write(tmp, count, ppos);
-out:
kfree(tmp);
+out:
return ret;
-
}
static long dev_nvram_ioctl(struct file *file, unsigned int cmd,
diff --git a/arch/powerpc/kernel/of_platform.c b/arch/powerpc/kernel/of_platform.c
index 34aeac54f120..becaec990140 100644
--- a/arch/powerpc/kernel/of_platform.c
+++ b/arch/powerpc/kernel/of_platform.c
@@ -45,7 +45,7 @@ static int of_pci_phb_probe(struct platform_device *dev)
if (ppc_md.pci_setup_phb == NULL)
return -ENODEV;
- pr_info("Setting up PCI bus %s\n", dev->dev.of_node->full_name);
+ pr_info("Setting up PCI bus %pOF\n", dev->dev.of_node);
/* Alloc and setup PHB data structure */
phb = pcibios_alloc_controller(dev->dev.of_node);
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index ec60ed0d4aad..6f8273f5e988 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -158,12 +158,13 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr)
{
/* addis r4,0,(insn)@h */
- *addr++ = PPC_INST_ADDIS | ___PPC_RT(4) |
- ((val >> 16) & 0xffff);
+ patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(4) |
+ ((val >> 16) & 0xffff));
+ addr++;
/* ori r4,r4,(insn)@l */
- *addr = PPC_INST_ORI | ___PPC_RA(4) | ___PPC_RS(4) |
- (val & 0xffff);
+ patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(4) |
+ ___PPC_RS(4) | (val & 0xffff));
}
/*
@@ -173,24 +174,28 @@ void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr)
void patch_imm64_load_insns(unsigned long val, kprobe_opcode_t *addr)
{
/* lis r3,(op)@highest */
- *addr++ = PPC_INST_ADDIS | ___PPC_RT(3) |
- ((val >> 48) & 0xffff);
+ patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(3) |
+ ((val >> 48) & 0xffff));
+ addr++;
/* ori r3,r3,(op)@higher */
- *addr++ = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) |
- ((val >> 32) & 0xffff);
+ patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) |
+ ___PPC_RS(3) | ((val >> 32) & 0xffff));
+ addr++;
/* rldicr r3,r3,32,31 */
- *addr++ = PPC_INST_RLDICR | ___PPC_RA(3) | ___PPC_RS(3) |
- __PPC_SH64(32) | __PPC_ME64(31);
+ patch_instruction(addr, PPC_INST_RLDICR | ___PPC_RA(3) |
+ ___PPC_RS(3) | __PPC_SH64(32) | __PPC_ME64(31));
+ addr++;
/* oris r3,r3,(op)@h */
- *addr++ = PPC_INST_ORIS | ___PPC_RA(3) | ___PPC_RS(3) |
- ((val >> 16) & 0xffff);
+ patch_instruction(addr, PPC_INST_ORIS | ___PPC_RA(3) |
+ ___PPC_RS(3) | ((val >> 16) & 0xffff));
+ addr++;
/* ori r3,r3,(op)@l */
- *addr = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) |
- (val & 0xffff);
+ patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) |
+ ___PPC_RS(3) | (val & 0xffff));
}
int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
@@ -198,7 +203,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
kprobe_opcode_t *buff, branch_op_callback, branch_emulate_step;
kprobe_opcode_t *op_callback_addr, *emulate_step_addr;
long b_offset;
- unsigned long nip;
+ unsigned long nip, size;
+ int rc, i;
kprobe_ppc_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
@@ -231,8 +237,14 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
goto error;
/* Setup template */
- memcpy(buff, optprobe_template_entry,
- TMPL_END_IDX * sizeof(kprobe_opcode_t));
+ /* We can optimize this via patch_instruction_window later */
+ size = (TMPL_END_IDX * sizeof(kprobe_opcode_t)) / sizeof(int);
+ pr_devel("Copying template to %p, size %lu\n", buff, size);
+ for (i = 0; i < size; i++) {
+ rc = patch_instruction(buff + i, *(optprobe_template_entry + i));
+ if (rc < 0)
+ goto error;
+ }
/*
* Fixup the template with instructions to:
@@ -261,8 +273,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
if (!branch_op_callback || !branch_emulate_step)
goto error;
- buff[TMPL_CALL_HDLR_IDX] = branch_op_callback;
- buff[TMPL_EMULATE_IDX] = branch_emulate_step;
+ patch_instruction(buff + TMPL_CALL_HDLR_IDX, branch_op_callback);
+ patch_instruction(buff + TMPL_EMULATE_IDX, branch_emulate_step);
/*
* 3. load instruction to be emulated into relevant register, and
@@ -272,8 +284,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
/*
* 4. branch back from trampoline
*/
- buff[TMPL_RET_IDX] = create_branch((unsigned int *)buff + TMPL_RET_IDX,
- (unsigned long)nip, 0);
+ patch_branch(buff + TMPL_RET_IDX, (unsigned long)nip, 0);
flush_icache_range((unsigned long)buff,
(unsigned long)(&buff[TMPL_END_IDX]));
diff --git a/arch/powerpc/kernel/optprobes_head.S b/arch/powerpc/kernel/optprobes_head.S
index 4937bef7652f..52fc864cdec4 100644
--- a/arch/powerpc/kernel/optprobes_head.S
+++ b/arch/powerpc/kernel/optprobes_head.S
@@ -60,10 +60,6 @@ optprobe_template_entry:
std r5,_CCR(r1)
lbz r5,PACASOFTIRQEN(r13)
std r5,SOFTE(r1)
- mfdar r5
- std r5,_DAR(r1)
- mfdsisr r5
- std r5,_DSISR(r1)
/*
* We may get here from a module, so load the kernel TOC in r2.
@@ -122,10 +118,6 @@ optprobe_template_call_emulate:
mtxer r5
ld r5,_CCR(r1)
mtcr r5
- ld r5,_DAR(r1)
- mtdar r5
- ld r5,_DSISR(r1)
- mtdsisr r5
REST_GPR(0,r1)
REST_10GPRS(2,r1)
REST_10GPRS(12,r1)
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 8d63627e067f..70f073d6c3b2 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -99,18 +99,27 @@ static inline void free_lppacas(void) { }
* If you make the number of persistent SLB entries dynamic, please also
* update PR KVM to flush and restore them accordingly.
*/
-static struct slb_shadow *slb_shadow;
+static struct slb_shadow * __initdata slb_shadow;
static void __init allocate_slb_shadows(int nr_cpus, int limit)
{
int size = PAGE_ALIGN(sizeof(struct slb_shadow) * nr_cpus);
+
+ if (early_radix_enabled())
+ return;
+
slb_shadow = __va(memblock_alloc_base(size, PAGE_SIZE, limit));
memset(slb_shadow, 0, size);
}
static struct slb_shadow * __init init_slb_shadow(int cpu)
{
- struct slb_shadow *s = &slb_shadow[cpu];
+ struct slb_shadow *s;
+
+ if (early_radix_enabled())
+ return NULL;
+
+ s = &slb_shadow[cpu];
/*
* When we come through here to initialise boot_paca, the slb_shadow
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index 341a7469cab8..02831a396419 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -373,9 +373,8 @@ static int pci_read_irq_line(struct pci_dev *pci_dev)
if (virq)
irq_set_irq_type(virq, IRQ_TYPE_LEVEL_LOW);
} else {
- pr_debug(" Got one, spec %d cells (0x%08x 0x%08x...) on %s\n",
- oirq.args_count, oirq.args[0], oirq.args[1],
- of_node_full_name(oirq.np));
+ pr_debug(" Got one, spec %d cells (0x%08x 0x%08x...) on %pOF\n",
+ oirq.args_count, oirq.args[0], oirq.args[1], oirq.np);
virq = irq_create_of_mapping(&oirq);
}
@@ -741,8 +740,8 @@ void pci_process_bridge_OF_ranges(struct pci_controller *hose,
struct of_pci_range range;
struct of_pci_range_parser parser;
- printk(KERN_INFO "PCI host bridge %s %s ranges:\n",
- dev->full_name, primary ? "(primary)" : "");
+ printk(KERN_INFO "PCI host bridge %pOF %s ranges:\n",
+ dev, primary ? "(primary)" : "");
/* Check for ranges property */
if (of_pci_range_parser_init(&parser, dev))
@@ -1556,8 +1555,8 @@ static void pcibios_setup_phb_resources(struct pci_controller *hose,
if (!res->flags) {
pr_debug("PCI: I/O resource not set for host"
- " bridge %s (domain %d)\n",
- hose->dn->full_name, hose->global_number);
+ " bridge %pOF (domain %d)\n",
+ hose->dn, hose->global_number);
} else {
offset = pcibios_io_space_offset(hose);
@@ -1668,7 +1667,7 @@ void pcibios_scan_phb(struct pci_controller *hose)
struct device_node *node = hose->dn;
int mode;
- pr_debug("PCI: Scanning PHB %s\n", of_node_full_name(node));
+ pr_debug("PCI: Scanning PHB %pOF\n", node);
/* Get some IO space for the new PHB */
pcibios_setup_phb_io_space(hose);
diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c
index 41c86c6b6e4d..1d817f4d97d9 100644
--- a/arch/powerpc/kernel/pci_32.c
+++ b/arch/powerpc/kernel/pci_32.c
@@ -79,8 +79,8 @@ make_one_node_map(struct device_node* node, u8 pci_bus)
return;
bus_range = of_get_property(node, "bus-range", &len);
if (bus_range == NULL || len < 2 * sizeof(int)) {
- printk(KERN_WARNING "Can't get bus-range for %s, "
- "assuming it starts at 0\n", node->full_name);
+ printk(KERN_WARNING "Can't get bus-range for %pOF, "
+ "assuming it starts at 0\n", node);
pci_to_OF_bus_map[pci_bus] = 0;
} else
pci_to_OF_bus_map[pci_bus] = bus_range[0];
diff --git a/arch/powerpc/kernel/pci_64.c b/arch/powerpc/kernel/pci_64.c
index ed5e9ff61a68..932b9741aa8f 100644
--- a/arch/powerpc/kernel/pci_64.c
+++ b/arch/powerpc/kernel/pci_64.c
@@ -111,7 +111,7 @@ int pcibios_unmap_io_space(struct pci_bus *bus)
if (hose->io_base_alloc == NULL)
return 0;
- pr_debug("IO unmapping for PHB %s\n", hose->dn->full_name);
+ pr_debug("IO unmapping for PHB %pOF\n", hose->dn);
pr_debug(" alloc=0x%p\n", hose->io_base_alloc);
/* This is a PHB, we fully unmap the IO area */
@@ -151,7 +151,7 @@ static int pcibios_map_phb_io_space(struct pci_controller *hose)
hose->io_base_virt = (void __iomem *)(area->addr +
hose->io_base_phys - phys_page);
- pr_debug("IO mapping for PHB %s\n", hose->dn->full_name);
+ pr_debug("IO mapping for PHB %pOF\n", hose->dn);
pr_debug(" phys=0x%016llx, virt=0x%p (alloc=0x%p)\n",
hose->io_base_phys, hose->io_base_virt, hose->io_base_alloc);
pr_debug(" size=0x%016llx (alloc=0x%016lx)\n",
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index 592693437070..0e395afbf0f4 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -139,7 +139,6 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev)
#ifdef CONFIG_PCI_IOV
static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
- struct pci_dev *pdev,
int vf_index,
int busno, int devfn)
{
@@ -150,10 +149,8 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
return NULL;
pdn = kzalloc(sizeof(*pdn), GFP_KERNEL);
- if (!pdn) {
- dev_warn(&pdev->dev, "%s: Out of memory!\n", __func__);
+ if (!pdn)
return NULL;
- }
pdn->phb = parent->phb;
pdn->parent = parent;
@@ -167,13 +164,6 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
INIT_LIST_HEAD(&pdn->list);
list_add_tail(&pdn->list, &parent->child_list);
- /*
- * If we already have PCI device instance, lets
- * bind them.
- */
- if (pdev)
- pdev->dev.archdata.pci_data = pdn;
-
return pdn;
}
#endif
@@ -201,7 +191,7 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
struct eeh_dev *edev __maybe_unused;
- pdn = add_one_dev_pci_data(parent, NULL, i,
+ pdn = add_one_dev_pci_data(parent, i,
pci_iov_virtfn_bus(pdev, i),
pci_iov_virtfn_devfn(pdev, i));
if (!pdn) {
@@ -303,7 +293,6 @@ struct pci_dn *pci_add_device_node_info(struct pci_controller *hose,
if (pdn == NULL)
return NULL;
dn->data = pdn;
- pdn->node = dn;
pdn->phb = hose;
#ifdef CONFIG_PPC_POWERNV
pdn->pe_number = IODA_INVALID_PE;
@@ -352,6 +341,7 @@ EXPORT_SYMBOL_GPL(pci_add_device_node_info);
void pci_remove_device_node_info(struct device_node *dn)
{
struct pci_dn *pdn = dn ? PCI_DN(dn) : NULL;
+ struct device_node *parent;
#ifdef CONFIG_EEH
struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
@@ -364,8 +354,10 @@ void pci_remove_device_node_info(struct device_node *dn)
WARN_ON(!list_empty(&pdn->child_list));
list_del(&pdn->list);
- if (pdn->parent)
- of_node_put(pdn->parent->node);
+
+ parent = of_get_parent(dn);
+ if (parent)
+ of_node_put(parent);
dn->data = NULL;
kfree(pdn);
diff --git a/arch/powerpc/kernel/pci_of_scan.c b/arch/powerpc/kernel/pci_of_scan.c
index ea3d98115b88..0d790f8432d2 100644
--- a/arch/powerpc/kernel/pci_of_scan.c
+++ b/arch/powerpc/kernel/pci_of_scan.c
@@ -211,19 +211,19 @@ void of_scan_pci_bridge(struct pci_dev *dev)
unsigned int flags;
u64 size;
- pr_debug("of_scan_pci_bridge(%s)\n", node->full_name);
+ pr_debug("of_scan_pci_bridge(%pOF)\n", node);
/* parse bus-range property */
busrange = of_get_property(node, "bus-range", &len);
if (busrange == NULL || len != 8) {
- printk(KERN_DEBUG "Can't get bus-range for PCI-PCI bridge %s\n",
- node->full_name);
+ printk(KERN_DEBUG "Can't get bus-range for PCI-PCI bridge %pOF\n",
+ node);
return;
}
ranges = of_get_property(node, "ranges", &len);
if (ranges == NULL) {
- printk(KERN_DEBUG "Can't get ranges for PCI-PCI bridge %s\n",
- node->full_name);
+ printk(KERN_DEBUG "Can't get ranges for PCI-PCI bridge %pOF\n",
+ node);
return;
}
@@ -233,8 +233,8 @@ void of_scan_pci_bridge(struct pci_dev *dev)
bus = pci_add_new_bus(dev->bus, dev,
of_read_number(busrange, 1));
if (!bus) {
- printk(KERN_ERR "Failed to create pci bus for %s\n",
- node->full_name);
+ printk(KERN_ERR "Failed to create pci bus for %pOF\n",
+ node);
return;
}
}
@@ -262,13 +262,13 @@ void of_scan_pci_bridge(struct pci_dev *dev)
res = bus->resource[0];
if (res->flags) {
printk(KERN_ERR "PCI: ignoring extra I/O range"
- " for bridge %s\n", node->full_name);
+ " for bridge %pOF\n", node);
continue;
}
} else {
if (i >= PCI_NUM_RESOURCES - PCI_BRIDGE_RESOURCES) {
printk(KERN_ERR "PCI: too many memory ranges"
- " for bridge %s\n", node->full_name);
+ " for bridge %pOF\n", node);
continue;
}
res = bus->resource[i];
@@ -307,7 +307,7 @@ static struct pci_dev *of_scan_pci_dev(struct pci_bus *bus,
struct eeh_dev *edev = pdn_to_eeh_dev(PCI_DN(dn));
#endif
- pr_debug(" * %s\n", dn->full_name);
+ pr_debug(" * %pOF\n", dn);
if (!of_device_is_available(dn))
return NULL;
@@ -350,8 +350,8 @@ static void __of_scan_bus(struct device_node *node, struct pci_bus *bus,
struct device_node *child;
struct pci_dev *dev;
- pr_debug("of_scan_bus(%s) bus no %d...\n",
- node->full_name, bus->number);
+ pr_debug("of_scan_bus(%pOF) bus no %d...\n",
+ node, bus->number);
/* Scan direct children */
for_each_child_of_node(node, child) {
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 2ad725ef4368..a0c74bbf3454 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -230,7 +230,8 @@ void enable_kernel_fp(void)
}
EXPORT_SYMBOL(enable_kernel_fp);
-static int restore_fp(struct task_struct *tsk) {
+static int restore_fp(struct task_struct *tsk)
+{
if (tsk->thread.load_fp || msr_tm_active(tsk->thread.regs->msr)) {
load_fp_state(&current->thread.fp_state);
current->thread.load_fp++;
@@ -330,11 +331,19 @@ static inline int restore_altivec(struct task_struct *tsk) { return 0; }
#ifdef CONFIG_VSX
static void __giveup_vsx(struct task_struct *tsk)
{
- if (tsk->thread.regs->msr & MSR_FP)
+ unsigned long msr = tsk->thread.regs->msr;
+
+ /*
+ * We should never be ssetting MSR_VSX without also setting
+ * MSR_FP and MSR_VEC
+ */
+ WARN_ON((msr & MSR_VSX) && !((msr & MSR_FP) && (msr & MSR_VEC)));
+
+ /* __giveup_fpu will clear MSR_VSX */
+ if (msr & MSR_FP)
__giveup_fpu(tsk);
- if (tsk->thread.regs->msr & MSR_VEC)
+ if (msr & MSR_VEC)
__giveup_altivec(tsk);
- tsk->thread.regs->msr &= ~MSR_VSX;
}
static void giveup_vsx(struct task_struct *tsk)
@@ -346,14 +355,6 @@ static void giveup_vsx(struct task_struct *tsk)
msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX);
}
-static void save_vsx(struct task_struct *tsk)
-{
- if (tsk->thread.regs->msr & MSR_FP)
- save_fpu(tsk);
- if (tsk->thread.regs->msr & MSR_VEC)
- save_altivec(tsk);
-}
-
void enable_kernel_vsx(void)
{
unsigned long cpumsr;
@@ -362,7 +363,8 @@ void enable_kernel_vsx(void)
cpumsr = msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
- if (current->thread.regs && (current->thread.regs->msr & MSR_VSX)) {
+ if (current->thread.regs &&
+ (current->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP))) {
check_if_tm_restore_required(current);
/*
* If a thread has already been reclaimed then the
@@ -373,10 +375,6 @@ void enable_kernel_vsx(void)
*/
if(!msr_tm_active(cpumsr) && msr_tm_active(current->thread.regs->msr))
return;
- if (current->thread.regs->msr & MSR_FP)
- __giveup_fpu(current);
- if (current->thread.regs->msr & MSR_VEC)
- __giveup_altivec(current);
__giveup_vsx(current);
}
}
@@ -386,7 +384,7 @@ void flush_vsx_to_thread(struct task_struct *tsk)
{
if (tsk->thread.regs) {
preempt_disable();
- if (tsk->thread.regs->msr & MSR_VSX) {
+ if (tsk->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP)) {
BUG_ON(tsk != current);
giveup_vsx(tsk);
}
@@ -406,7 +404,6 @@ static int restore_vsx(struct task_struct *tsk)
}
#else
static inline int restore_vsx(struct task_struct *tsk) { return 0; }
-static inline void save_vsx(struct task_struct *tsk) { }
#endif /* CONFIG_VSX */
#ifdef CONFIG_SPE
@@ -486,6 +483,8 @@ void giveup_all(struct task_struct *tsk)
msr_check_and_set(msr_all_available);
check_if_tm_restore_required(tsk);
+ WARN_ON((usermsr & MSR_VSX) && !((usermsr & MSR_FP) && (usermsr & MSR_VEC)));
+
#ifdef CONFIG_PPC_FPU
if (usermsr & MSR_FP)
__giveup_fpu(tsk);
@@ -494,10 +493,6 @@ void giveup_all(struct task_struct *tsk)
if (usermsr & MSR_VEC)
__giveup_altivec(tsk);
#endif
-#ifdef CONFIG_VSX
- if (usermsr & MSR_VSX)
- __giveup_vsx(tsk);
-#endif
#ifdef CONFIG_SPE
if (usermsr & MSR_SPE)
__giveup_spe(tsk);
@@ -552,19 +547,13 @@ void save_all(struct task_struct *tsk)
msr_check_and_set(msr_all_available);
- /*
- * Saving the way the register space is in hardware, save_vsx boils
- * down to a save_fpu() and save_altivec()
- */
- if (usermsr & MSR_VSX) {
- save_vsx(tsk);
- } else {
- if (usermsr & MSR_FP)
- save_fpu(tsk);
+ WARN_ON((usermsr & MSR_VSX) && !((usermsr & MSR_FP) && (usermsr & MSR_VEC)));
- if (usermsr & MSR_VEC)
- save_altivec(tsk);
- }
+ if (usermsr & MSR_FP)
+ save_fpu(tsk);
+
+ if (usermsr & MSR_VEC)
+ save_altivec(tsk);
if (usermsr & MSR_SPE)
__giveup_spe(tsk);
@@ -1133,6 +1122,11 @@ static inline void restore_sprs(struct thread_struct *old_thread,
#endif
}
+#ifdef CONFIG_PPC_BOOK3S_64
+#define CP_SIZE 128
+static const u8 dummy_copy_buffer[CP_SIZE] __attribute__((aligned(CP_SIZE)));
+#endif
+
struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *new)
{
@@ -1195,12 +1189,14 @@ struct task_struct *__switch_to(struct task_struct *prev,
__switch_to_tm(prev, new);
- /*
- * We can't take a PMU exception inside _switch() since there is a
- * window where the kernel stack SLB and the kernel stack are out
- * of sync. Hard disable here.
- */
- hard_irq_disable();
+ if (!radix_enabled()) {
+ /*
+ * We can't take a PMU exception inside _switch() since there
+ * is a window where the kernel stack SLB and the kernel stack
+ * are out of sync. Hard disable here.
+ */
+ hard_irq_disable();
+ }
/*
* Call restore_sprs() before calling _switch(). If we move it after
@@ -1220,8 +1216,28 @@ struct task_struct *__switch_to(struct task_struct *prev,
batch->active = 1;
}
- if (current_thread_info()->task->thread.regs)
+ if (current_thread_info()->task->thread.regs) {
restore_math(current_thread_info()->task->thread.regs);
+
+ /*
+ * The copy-paste buffer can only store into foreign real
+ * addresses, so unprivileged processes can not see the
+ * data or use it in any way unless they have foreign real
+ * mappings. We don't have a VAS driver that allocates those
+ * yet, so no cpabort is required.
+ */
+ if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+ /*
+ * DD1 allows paste into normal system memory, so we
+ * do an unpaired copy here to clear the buffer and
+ * prevent a covert channel being set up.
+ *
+ * cpabort is not used because it is quite expensive.
+ */
+ asm volatile(PPC_COPY(%0, %1)
+ : : "r"(dummy_copy_buffer), "r"(0));
+ }
+ }
#endif /* CONFIG_PPC_STD_MMU_64 */
return last;
@@ -1364,13 +1380,13 @@ void show_regs(struct pt_regs * regs)
show_regs_print_info(KERN_DEFAULT);
- printk("NIP: "REG" LR: "REG" CTR: "REG"\n",
+ printk("NIP: "REG" LR: "REG" CTR: "REG"\n",
regs->nip, regs->link, regs->ctr);
printk("REGS: %p TRAP: %04lx %s (%s)\n",
regs, regs->trap, print_tainted(), init_utsname()->release);
- printk("MSR: "REG" ", regs->msr);
+ printk("MSR: "REG" ", regs->msr);
print_msr_bits(regs->msr);
- printk(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer);
+ pr_cont(" CR: %08lx XER: %08lx\n", regs->ccr, regs->xer);
trap = TRAP(regs);
if ((regs->trap != 0xc00) && cpu_has_feature(CPU_FTR_CFAR))
pr_cont("CFAR: "REG" ", regs->orig_gpr3);
@@ -1963,11 +1979,25 @@ void show_stack(struct task_struct *tsk, unsigned long *stack)
void notrace __ppc64_runlatch_on(void)
{
struct thread_info *ti = current_thread_info();
- unsigned long ctrl;
- ctrl = mfspr(SPRN_CTRLF);
- ctrl |= CTRL_RUNLATCH;
- mtspr(SPRN_CTRLT, ctrl);
+ if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+ /*
+ * Least significant bit (RUN) is the only writable bit of
+ * the CTRL register, so we can avoid mfspr. 2.06 is not the
+ * earliest ISA where this is the case, but it's convenient.
+ */
+ mtspr(SPRN_CTRLT, CTRL_RUNLATCH);
+ } else {
+ unsigned long ctrl;
+
+ /*
+ * Some architectures (e.g., Cell) have writable fields other
+ * than RUN, so do the read-modify-write.
+ */
+ ctrl = mfspr(SPRN_CTRLF);
+ ctrl |= CTRL_RUNLATCH;
+ mtspr(SPRN_CTRLT, ctrl);
+ }
ti->local_flags |= _TLF_RUNLATCH;
}
@@ -1976,13 +2006,18 @@ void notrace __ppc64_runlatch_on(void)
void notrace __ppc64_runlatch_off(void)
{
struct thread_info *ti = current_thread_info();
- unsigned long ctrl;
ti->local_flags &= ~_TLF_RUNLATCH;
- ctrl = mfspr(SPRN_CTRLF);
- ctrl &= ~CTRL_RUNLATCH;
- mtspr(SPRN_CTRLT, ctrl);
+ if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+ mtspr(SPRN_CTRLT, 0);
+ } else {
+ unsigned long ctrl;
+
+ ctrl = mfspr(SPRN_CTRLF);
+ ctrl &= ~CTRL_RUNLATCH;
+ mtspr(SPRN_CTRLT, ctrl);
+ }
}
#endif /* CONFIG_PPC64 */
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index dd8a04f3053a..02190e90c7ae 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -15,6 +15,9 @@
#undef DEBUG_PROM
+/* we cannot use FORTIFY as it brings in new symbols */
+#define __NO_FORTIFY
+
#include <stdarg.h>
#include <linux/kernel.h>
#include <linux/string.h>
@@ -174,6 +177,7 @@ struct platform_support {
bool hash_mmu;
bool radix_mmu;
bool radix_gtse;
+ bool xive;
};
/* Platforms codes are now obsolete in the kernel. Now only used within this
@@ -1038,6 +1042,27 @@ static void __init prom_parse_mmu_model(u8 val,
}
}
+static void __init prom_parse_xive_model(u8 val,
+ struct platform_support *support)
+{
+ switch (val) {
+ case OV5_FEAT(OV5_XIVE_EITHER): /* Either Available */
+ prom_debug("XIVE - either mode supported\n");
+ support->xive = true;
+ break;
+ case OV5_FEAT(OV5_XIVE_EXPLOIT): /* Only Exploitation mode */
+ prom_debug("XIVE - exploitation mode supported\n");
+ support->xive = true;
+ break;
+ case OV5_FEAT(OV5_XIVE_LEGACY): /* Only Legacy mode */
+ prom_debug("XIVE - legacy mode supported\n");
+ break;
+ default:
+ prom_debug("Unknown xive support option: 0x%x\n", val);
+ break;
+ }
+}
+
static void __init prom_parse_platform_support(u8 index, u8 val,
struct platform_support *support)
{
@@ -1051,6 +1076,10 @@ static void __init prom_parse_platform_support(u8 index, u8 val,
support->radix_gtse = true;
}
break;
+ case OV5_INDX(OV5_XIVE_SUPPORT): /* Interrupt mode */
+ prom_parse_xive_model(val & OV5_FEAT(OV5_XIVE_SUPPORT),
+ support);
+ break;
}
}
@@ -1059,7 +1088,8 @@ static void __init prom_check_platform_support(void)
struct platform_support supported = {
.hash_mmu = false,
.radix_mmu = false,
- .radix_gtse = false
+ .radix_gtse = false,
+ .xive = false
};
int prop_len = prom_getproplen(prom.chosen,
"ibm,arch-vec-5-platform-support");
@@ -1092,6 +1122,11 @@ static void __init prom_check_platform_support(void)
/* We're probably on a legacy hypervisor */
prom_debug("Assuming legacy hash support\n");
}
+
+ if (supported.xive) {
+ prom_debug("Asking for XIVE\n");
+ ibm_architecture_vec.vec5.intarch = OV5_FEAT(OV5_XIVE_EXPLOIT);
+ }
}
static void __init prom_send_capabilities(void)
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 925a4ef90559..07cd22e35405 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -127,12 +127,19 @@ static void flush_tmregs_to_thread(struct task_struct *tsk)
* If task is not current, it will have been flushed already to
* it's thread_struct during __switch_to().
*
- * A reclaim flushes ALL the state.
+ * A reclaim flushes ALL the state or if not in TM save TM SPRs
+ * in the appropriate thread structures from live.
*/
- if (tsk == current && MSR_TM_SUSPENDED(mfmsr()))
- tm_reclaim_current(TM_CAUSE_SIGNAL);
+ if (tsk != current)
+ return;
+ if (MSR_TM_SUSPENDED(mfmsr())) {
+ tm_reclaim_current(TM_CAUSE_SIGNAL);
+ } else {
+ tm_enable();
+ tm_save_sprs(&(tsk->thread));
+ }
}
#else
static inline void flush_tmregs_to_thread(struct task_struct *tsk) { }
@@ -1587,11 +1594,8 @@ static int ppr_get(struct task_struct *target,
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf)
{
- int ret;
-
- ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.ppr, 0, sizeof(u64));
- return ret;
+ return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+ &target->thread.ppr, 0, sizeof(u64));
}
static int ppr_set(struct task_struct *target,
@@ -1599,11 +1603,8 @@ static int ppr_set(struct task_struct *target,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf)
{
- int ret;
-
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.ppr, 0, sizeof(u64));
- return ret;
+ return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+ &target->thread.ppr, 0, sizeof(u64));
}
static int dscr_get(struct task_struct *target,
@@ -1611,22 +1612,16 @@ static int dscr_get(struct task_struct *target,
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf)
{
- int ret;
-
- ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.dscr, 0, sizeof(u64));
- return ret;
+ return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+ &target->thread.dscr, 0, sizeof(u64));
}
static int dscr_set(struct task_struct *target,
const struct user_regset *regset,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf)
{
- int ret;
-
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.dscr, 0, sizeof(u64));
- return ret;
+ return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+ &target->thread.dscr, 0, sizeof(u64));
}
#endif
#ifdef CONFIG_PPC_BOOK3S_64
@@ -1635,22 +1630,16 @@ static int tar_get(struct task_struct *target,
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf)
{
- int ret;
-
- ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.tar, 0, sizeof(u64));
- return ret;
+ return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+ &target->thread.tar, 0, sizeof(u64));
}
static int tar_set(struct task_struct *target,
const struct user_regset *regset,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf)
{
- int ret;
-
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.tar, 0, sizeof(u64));
- return ret;
+ return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+ &target->thread.tar, 0, sizeof(u64));
}
static int ebb_active(struct task_struct *target,
diff --git a/arch/powerpc/kernel/reloc_64.S b/arch/powerpc/kernel/reloc_64.S
index d88736fbece6..e8cfc69f59ae 100644
--- a/arch/powerpc/kernel/reloc_64.S
+++ b/arch/powerpc/kernel/reloc_64.S
@@ -82,7 +82,7 @@ _GLOBAL(relocate)
6: blr
.balign 8
-p_dyn: .llong __dynamic_start - 0b
-p_rela: .llong __rela_dyn_start - 0b
-p_st: .llong _stext - 0b
+p_dyn: .8byte __dynamic_start - 0b
+p_rela: .8byte __rela_dyn_start - 0b
+p_st: .8byte _stext - 0b
diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c
index 73f1934582c2..c2b148b1634a 100644
--- a/arch/powerpc/kernel/rtas_pci.c
+++ b/arch/powerpc/kernel/rtas_pci.c
@@ -91,26 +91,14 @@ static int rtas_pci_read_config(struct pci_bus *bus,
unsigned int devfn,
int where, int size, u32 *val)
{
- struct device_node *busdn, *dn;
struct pci_dn *pdn;
- bool found = false;
int ret;
- /* Search only direct children of the bus */
*val = 0xFFFFFFFF;
- busdn = pci_bus_to_OF_node(bus);
- for (dn = busdn->child; dn; dn = dn->sibling) {
- pdn = PCI_DN(dn);
- if (pdn && pdn->devfn == devfn
- && of_device_is_available(dn)) {
- found = true;
- break;
- }
- }
- if (!found)
- return PCIBIOS_DEVICE_NOT_FOUND;
+ pdn = pci_get_pdn_by_devfn(bus, devfn);
+ /* Validity of pdn is checked in here */
ret = rtas_read_config(pdn, where, size, val);
if (*val == EEH_IO_ERROR_VALUE(size) &&
eeh_dev_check_failure(pdn_to_eeh_dev(pdn)))
@@ -153,24 +141,11 @@ static int rtas_pci_write_config(struct pci_bus *bus,
unsigned int devfn,
int where, int size, u32 val)
{
- struct device_node *busdn, *dn;
struct pci_dn *pdn;
- bool found = false;
-
- /* Search only direct children of the bus */
- busdn = pci_bus_to_OF_node(bus);
- for (dn = busdn->child; dn; dn = dn->sibling) {
- pdn = PCI_DN(dn);
- if (pdn && pdn->devfn == devfn
- && of_device_is_available(dn)) {
- found = true;
- break;
- }
- }
- if (!found)
- return PCIBIOS_DEVICE_NOT_FOUND;
+ pdn = pci_get_pdn_by_devfn(bus, devfn);
+ /* Validity of pdn is checked in here. */
return rtas_write_config(pdn, where, size, val);
}
diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c
index 3650732639ed..0f0b1b2f3b60 100644
--- a/arch/powerpc/kernel/rtasd.c
+++ b/arch/powerpc/kernel/rtasd.c
@@ -283,7 +283,7 @@ static void prrn_work_fn(struct work_struct *work)
* the RTAS event.
*/
pseries_devicetree_update(-prrn_update_scope);
- arch_update_cpu_topology();
+ numa_update_cpu_topology(false);
}
static DECLARE_WORK(prrn_work, prrn_work_fn);
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 857129acf960..7de73589d8e2 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -335,6 +335,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
maj = ((pvr >> 8) & 0xFF) - 1;
min = pvr & 0xFF;
break;
+ case 0x004e: /* POWER9 bits 12-15 give chip type */
+ maj = (pvr >> 8) & 0x0F;
+ min = pvr & 0xFF;
+ break;
default:
maj = (pvr >> 8) & 0xFF;
min = pvr & 0xFF;
@@ -477,7 +481,7 @@ void __init smp_setup_cpu_maps(void)
__be32 cpu_be;
int j, len;
- DBG(" * %s...\n", dn->full_name);
+ DBG(" * %pOF...\n", dn);
intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s",
&len);
@@ -700,30 +704,6 @@ int check_legacy_ioport(unsigned long base_port)
}
EXPORT_SYMBOL(check_legacy_ioport);
-static int ppc_panic_event(struct notifier_block *this,
- unsigned long event, void *ptr)
-{
- /*
- * If firmware-assisted dump has been registered then trigger
- * firmware-assisted dump and let firmware handle everything else.
- */
- crash_fadump(NULL, ptr);
- ppc_md.panic(ptr); /* May not return */
- return NOTIFY_DONE;
-}
-
-static struct notifier_block ppc_panic_block = {
- .notifier_call = ppc_panic_event,
- .priority = INT_MIN /* may not return; must be done last */
-};
-
-void __init setup_panic(void)
-{
- if (!ppc_md.panic)
- return;
- atomic_notifier_chain_register(&panic_notifier_list, &ppc_panic_block);
-}
-
#ifdef CONFIG_CHECK_CACHE_COHERENCY
/*
* For platforms that have configurable cache-coherency. This function
@@ -868,9 +848,6 @@ void __init setup_arch(char **cmdline_p)
/* Probe the machine type, establish ppc_md. */
probe_machine();
- /* Setup panic notifier if requested by the platform. */
- setup_panic();
-
/*
* Configure ppc_md.power_save (ppc32 only, 64-bit machines do
* it from their respective probe() function.
@@ -912,13 +889,6 @@ void __init setup_arch(char **cmdline_p)
/* Reserve large chunks of memory for use by CMA for KVM. */
kvm_cma_reserve();
- /*
- * Reserve any gigantic pages requested on the command line.
- * memblock needs to have been initialized by the time this is
- * called since this will reserve memory.
- */
- reserve_hugetlb_gpages();
-
klp_init_thread_info(&init_thread_info);
init_mm.start_code = (unsigned long)_stext;
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index 2f88f6cf1a42..51ebc01fff52 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -98,6 +98,9 @@ extern unsigned int memset_nocache_branch; /* Insn to be replaced by NOP */
notrace void __init machine_init(u64 dt_ptr)
{
+ unsigned int *addr = &memset_nocache_branch;
+ unsigned long insn;
+
/* Configure static keys first, now that we're relocated. */
setup_feature_keys();
@@ -105,7 +108,9 @@ notrace void __init machine_init(u64 dt_ptr)
udbg_early_init();
patch_instruction((unsigned int *)&memcpy, PPC_INST_NOP);
- patch_instruction(&memset_nocache_branch, PPC_INST_NOP);
+
+ insn = create_cond_branch(addr, branch_target(addr), 0x820000);
+ patch_instruction(addr, insn); /* replace b by bne cr0 */
/* Do some early initialization based on the flat device tree */
early_init_devtree(__va(dt_ptr));
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 4640f6d64f8b..b89c6aac48c9 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -564,6 +564,9 @@ static __init u64 safe_stack_limit(void)
/* Other BookE, we assume the first GB is bolted */
return 1ul << 30;
#else
+ if (early_radix_enabled())
+ return ULONG_MAX;
+
/* BookS, the first segment is bolted */
if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
return 1UL << SID_SHIFT_1T;
@@ -578,7 +581,8 @@ void __init irqstack_early_init(void)
/*
* Interrupt stacks must be in the first segment since we
- * cannot afford to take SLB misses on them.
+ * cannot afford to take SLB misses on them. They are not
+ * accessed in realmode.
*/
for_each_possible_cpu(i) {
softirq_ctx[i] = (struct thread_info *)
@@ -649,8 +653,9 @@ void __init emergency_stack_init(void)
* aligned.
*
* Since we use these as temporary stacks during secondary CPU
- * bringup, we need to get at them in real mode. This means they
- * must also be within the RMO region.
+ * bringup, machine check, system reset, and HMI, we need to get
+ * at them in real mode. This means they must also be within the RMO
+ * region.
*
* The IRQ stacks allocated elsewhere in this file are zeroed and
* initialized in kernel/irq.c. These are initialized here in order
@@ -752,21 +757,30 @@ struct ppc_pci_io ppc_pci_io;
EXPORT_SYMBOL(ppc_pci_io);
#endif
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
u64 hw_nmi_get_sample_period(int watchdog_thresh)
{
return ppc_proc_freq * watchdog_thresh;
}
+#endif
/*
- * The hardlockup detector breaks PMU event based branches and is likely
- * to get false positives in KVM guests, so disable it by default.
+ * The perf based hardlockup detector breaks PMU event based branches, so
+ * disable it by default. Book3S has a soft-nmi hardlockup detector based
+ * on the decrementer interrupt, so it does not suffer from this problem.
+ *
+ * It is likely to get false positives in VM guests, so disable it there
+ * by default too.
*/
static int __init disable_hardlockup_detector(void)
{
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
hardlockup_detector_disable();
+#else
+ if (firmware_has_feature(FW_FEATURE_LPAR))
+ hardlockup_detector_disable();
+#endif
return 0;
}
early_initcall(disable_hardlockup_detector);
-#endif
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index df2a41647d8e..e0a4c1f82e25 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -33,6 +33,7 @@
#include <linux/notifier.h>
#include <linux/topology.h>
#include <linux/profile.h>
+#include <linux/processor.h>
#include <asm/ptrace.h>
#include <linux/atomic.h>
@@ -74,9 +75,11 @@ static DEFINE_PER_CPU(int, cpu_state) = { 0 };
struct thread_info *secondary_ti;
DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map);
+DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map);
DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
+EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map);
EXPORT_PER_CPU_SYMBOL(cpu_core_map);
/* SMP operations for this machine */
@@ -97,7 +100,7 @@ int smp_generic_cpu_bootable(unsigned int nr)
/* Special case - we inhibit secondary thread startup
* during boot if the user requests it.
*/
- if (system_state == SYSTEM_BOOTING && cpu_has_feature(CPU_FTR_SMT)) {
+ if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) {
if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0)
return 0;
if (smt_enabled_at_boot
@@ -112,7 +115,8 @@ int smp_generic_cpu_bootable(unsigned int nr)
#ifdef CONFIG_PPC64
int smp_generic_kick_cpu(int nr)
{
- BUG_ON(nr < 0 || nr >= NR_CPUS);
+ if (nr < 0 || nr >= nr_cpu_ids)
+ return -EINVAL;
/*
* The processor is currently spinning, waiting for the
@@ -349,7 +353,7 @@ static void nmi_ipi_lock_start(unsigned long *flags)
hard_irq_disable();
while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) {
raw_local_irq_restore(*flags);
- cpu_relax();
+ spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0);
raw_local_irq_save(*flags);
hard_irq_disable();
}
@@ -358,7 +362,7 @@ static void nmi_ipi_lock_start(unsigned long *flags)
static void nmi_ipi_lock(void)
{
while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1)
- cpu_relax();
+ spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0);
}
static void nmi_ipi_unlock(void)
@@ -433,13 +437,31 @@ static void do_smp_send_nmi_ipi(int cpu)
}
}
+void smp_flush_nmi_ipi(u64 delay_us)
+{
+ unsigned long flags;
+
+ nmi_ipi_lock_start(&flags);
+ while (nmi_ipi_busy_count) {
+ nmi_ipi_unlock_end(&flags);
+ udelay(1);
+ if (delay_us) {
+ delay_us--;
+ if (!delay_us)
+ return;
+ }
+ nmi_ipi_lock_start(&flags);
+ }
+ nmi_ipi_unlock_end(&flags);
+}
+
/*
* - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS.
* - fn is the target callback function.
* - delay_us > 0 is the delay before giving up waiting for targets to
* enter the handler, == 0 specifies indefinite delay.
*/
-static int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us)
+int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us)
{
unsigned long flags;
int me = raw_smp_processor_id();
@@ -455,7 +477,7 @@ static int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us)
nmi_ipi_lock_start(&flags);
while (nmi_ipi_busy_count) {
nmi_ipi_unlock_end(&flags);
- cpu_relax();
+ spin_until_cond(nmi_ipi_busy_count == 0);
nmi_ipi_lock_start(&flags);
}
@@ -551,6 +573,26 @@ static void smp_store_cpu_info(int id)
#endif
}
+/*
+ * Relationships between CPUs are maintained in a set of per-cpu cpumasks so
+ * rather than just passing around the cpumask we pass around a function that
+ * returns the that cpumask for the given CPU.
+ */
+static void set_cpus_related(int i, int j, struct cpumask *(*get_cpumask)(int))
+{
+ cpumask_set_cpu(i, get_cpumask(j));
+ cpumask_set_cpu(j, get_cpumask(i));
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void set_cpus_unrelated(int i, int j,
+ struct cpumask *(*get_cpumask)(int))
+{
+ cpumask_clear_cpu(i, get_cpumask(j));
+ cpumask_clear_cpu(j, get_cpumask(i));
+}
+#endif
+
void __init smp_prepare_cpus(unsigned int max_cpus)
{
unsigned int cpu;
@@ -570,6 +612,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
for_each_possible_cpu(cpu) {
zalloc_cpumask_var_node(&per_cpu(cpu_sibling_map, cpu),
GFP_KERNEL, cpu_to_node(cpu));
+ zalloc_cpumask_var_node(&per_cpu(cpu_l2_cache_map, cpu),
+ GFP_KERNEL, cpu_to_node(cpu));
zalloc_cpumask_var_node(&per_cpu(cpu_core_map, cpu),
GFP_KERNEL, cpu_to_node(cpu));
/*
@@ -582,7 +626,9 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
}
}
+ /* Init the cpumasks so the boot CPU is related to itself */
cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid));
+ cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid));
cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid));
if (smp_ops && smp_ops->probe)
@@ -766,8 +812,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
smp_ops->give_timebase();
/* Wait until cpu puts itself in the online & active maps */
- while (!cpu_online(cpu))
- cpu_relax();
+ spin_until_cond(cpu_online(cpu));
return 0;
}
@@ -809,33 +854,6 @@ int cpu_first_thread_of_core(int core)
}
EXPORT_SYMBOL_GPL(cpu_first_thread_of_core);
-static void traverse_siblings_chip_id(int cpu, bool add, int chipid)
-{
- const struct cpumask *mask;
- struct device_node *np;
- int i, plen;
- const __be32 *prop;
-
- mask = add ? cpu_online_mask : cpu_present_mask;
- for_each_cpu(i, mask) {
- np = of_get_cpu_node(i, NULL);
- if (!np)
- continue;
- prop = of_get_property(np, "ibm,chip-id", &plen);
- if (prop && plen == sizeof(int) &&
- of_read_number(prop, 1) == chipid) {
- if (add) {
- cpumask_set_cpu(cpu, cpu_core_mask(i));
- cpumask_set_cpu(i, cpu_core_mask(cpu));
- } else {
- cpumask_clear_cpu(cpu, cpu_core_mask(i));
- cpumask_clear_cpu(i, cpu_core_mask(cpu));
- }
- }
- of_node_put(np);
- }
-}
-
/* Must be called when no change can occur to cpu_present_mask,
* i.e. during cpu online or offline.
*/
@@ -858,52 +876,93 @@ static struct device_node *cpu_to_l2cache(int cpu)
return cache;
}
-static void traverse_core_siblings(int cpu, bool add)
+static bool update_mask_by_l2(int cpu, struct cpumask *(*mask_fn)(int))
{
struct device_node *l2_cache, *np;
- const struct cpumask *mask;
- int i, chip, plen;
- const __be32 *prop;
-
- /* First see if we have ibm,chip-id properties in cpu nodes */
- np = of_get_cpu_node(cpu, NULL);
- if (np) {
- chip = -1;
- prop = of_get_property(np, "ibm,chip-id", &plen);
- if (prop && plen == sizeof(int))
- chip = of_read_number(prop, 1);
- of_node_put(np);
- if (chip >= 0) {
- traverse_siblings_chip_id(cpu, add, chip);
- return;
- }
- }
+ int i;
l2_cache = cpu_to_l2cache(cpu);
- mask = add ? cpu_online_mask : cpu_present_mask;
- for_each_cpu(i, mask) {
+ if (!l2_cache)
+ return false;
+
+ for_each_cpu(i, cpu_online_mask) {
+ /*
+ * when updating the marks the current CPU has not been marked
+ * online, but we need to update the cache masks
+ */
np = cpu_to_l2cache(i);
if (!np)
continue;
- if (np == l2_cache) {
- if (add) {
- cpumask_set_cpu(cpu, cpu_core_mask(i));
- cpumask_set_cpu(i, cpu_core_mask(cpu));
- } else {
- cpumask_clear_cpu(cpu, cpu_core_mask(i));
- cpumask_clear_cpu(i, cpu_core_mask(cpu));
- }
- }
+
+ if (np == l2_cache)
+ set_cpus_related(cpu, i, mask_fn);
+
of_node_put(np);
}
of_node_put(l2_cache);
+
+ return true;
}
+#ifdef CONFIG_HOTPLUG_CPU
+static void remove_cpu_from_masks(int cpu)
+{
+ int i;
+
+ /* NB: cpu_core_mask is a superset of the others */
+ for_each_cpu(i, cpu_core_mask(cpu)) {
+ set_cpus_unrelated(cpu, i, cpu_core_mask);
+ set_cpus_unrelated(cpu, i, cpu_l2_cache_mask);
+ set_cpus_unrelated(cpu, i, cpu_sibling_mask);
+ }
+}
+#endif
+
+static void add_cpu_to_masks(int cpu)
+{
+ int first_thread = cpu_first_thread_sibling(cpu);
+ int chipid = cpu_to_chip_id(cpu);
+ int i;
+
+ /*
+ * This CPU will not be in the online mask yet so we need to manually
+ * add it to it's own thread sibling mask.
+ */
+ cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
+
+ for (i = first_thread; i < first_thread + threads_per_core; i++)
+ if (cpu_online(i))
+ set_cpus_related(i, cpu, cpu_sibling_mask);
+
+ /*
+ * Copy the thread sibling mask into the cache sibling mask
+ * and mark any CPUs that share an L2 with this CPU.
+ */
+ for_each_cpu(i, cpu_sibling_mask(cpu))
+ set_cpus_related(cpu, i, cpu_l2_cache_mask);
+ update_mask_by_l2(cpu, cpu_l2_cache_mask);
+
+ /*
+ * Copy the cache sibling mask into core sibling mask and mark
+ * any CPUs on the same chip as this CPU.
+ */
+ for_each_cpu(i, cpu_l2_cache_mask(cpu))
+ set_cpus_related(cpu, i, cpu_core_mask);
+
+ if (chipid == -1)
+ return;
+
+ for_each_cpu(i, cpu_online_mask)
+ if (cpu_to_chip_id(i) == chipid)
+ set_cpus_related(cpu, i, cpu_core_mask);
+}
+
+static bool shared_caches;
+
/* Activate a secondary processor. */
void start_secondary(void *unused)
{
unsigned int cpu = smp_processor_id();
- int i, base;
mmgrab(&init_mm);
current->active_mm = &init_mm;
@@ -926,22 +985,15 @@ void start_secondary(void *unused)
vdso_getcpu_init();
#endif
- /* Update sibling maps */
- base = cpu_first_thread_sibling(cpu);
- for (i = 0; i < threads_per_core; i++) {
- if (cpu_is_offline(base + i) && (cpu != base + i))
- continue;
- cpumask_set_cpu(cpu, cpu_sibling_mask(base + i));
- cpumask_set_cpu(base + i, cpu_sibling_mask(cpu));
+ /* Update topology CPU masks */
+ add_cpu_to_masks(cpu);
- /* cpu_core_map should be a superset of
- * cpu_sibling_map even if we don't have cache
- * information, so update the former here, too.
- */
- cpumask_set_cpu(cpu, cpu_core_mask(base + i));
- cpumask_set_cpu(base + i, cpu_core_mask(cpu));
- }
- traverse_core_siblings(cpu, true);
+ /*
+ * Check for any shared caches. Note that this must be done on a
+ * per-core basis because one core in the pair might be disabled.
+ */
+ if (!cpumask_equal(cpu_l2_cache_mask(cpu), cpu_sibling_mask(cpu)))
+ shared_caches = true;
set_numa_node(numa_cpu_lookup_table[cpu]);
set_numa_mem(local_memory_node(numa_cpu_lookup_table[cpu]));
@@ -984,35 +1036,65 @@ static struct sched_domain_topology_level powerpc_topology[] = {
{ NULL, },
};
-static __init long smp_setup_cpu_workfn(void *data __always_unused)
+/*
+ * P9 has a slightly odd architecture where pairs of cores share an L2 cache.
+ * This topology makes it *much* cheaper to migrate tasks between adjacent cores
+ * since the migrated task remains cache hot. We want to take advantage of this
+ * at the scheduler level so an extra topology level is required.
+ */
+static int powerpc_shared_cache_flags(void)
{
- smp_ops->setup_cpu(boot_cpuid);
- return 0;
+ return SD_SHARE_PKG_RESOURCES;
+}
+
+/*
+ * We can't just pass cpu_l2_cache_mask() directly because
+ * returns a non-const pointer and the compiler barfs on that.
+ */
+static const struct cpumask *shared_cache_mask(int cpu)
+{
+ return cpu_l2_cache_mask(cpu);
}
+static struct sched_domain_topology_level power9_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+ { cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+ { shared_cache_mask, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+ { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { NULL, },
+};
+
void __init smp_cpus_done(unsigned int max_cpus)
{
/*
- * We want the setup_cpu() here to be called on the boot CPU, but
- * init might run on any CPU, so make sure it's invoked on the boot
- * CPU.
+ * We are running pinned to the boot CPU, see rest_init().
*/
if (smp_ops && smp_ops->setup_cpu)
- work_on_cpu_safe(boot_cpuid, smp_setup_cpu_workfn, NULL);
+ smp_ops->setup_cpu(boot_cpuid);
if (smp_ops && smp_ops->bringup_done)
smp_ops->bringup_done();
dump_numa_cpu_topology();
- set_sched_topology(powerpc_topology);
+ /*
+ * If any CPU detects that it's sharing a cache with another CPU then
+ * use the deeper topology that is aware of this sharing.
+ */
+ if (shared_caches) {
+ pr_info("Using shared cache scheduler topology\n");
+ set_sched_topology(power9_topology);
+ } else {
+ pr_info("Using standard scheduler topology\n");
+ set_sched_topology(powerpc_topology);
+ }
}
#ifdef CONFIG_HOTPLUG_CPU
int __cpu_disable(void)
{
int cpu = smp_processor_id();
- int base, i;
int err;
if (!smp_ops->cpu_disable)
@@ -1023,14 +1105,7 @@ int __cpu_disable(void)
return err;
/* Update sibling maps */
- base = cpu_first_thread_sibling(cpu);
- for (i = 0; i < threads_per_core && base + i < nr_cpu_ids; i++) {
- cpumask_clear_cpu(cpu, cpu_sibling_mask(base + i));
- cpumask_clear_cpu(base + i, cpu_sibling_mask(cpu));
- cpumask_clear_cpu(cpu, cpu_core_mask(base + i));
- cpumask_clear_cpu(base + i, cpu_core_mask(cpu));
- }
- traverse_core_siblings(cpu, false);
+ remove_cpu_from_masks(cpu);
return 0;
}
diff --git a/arch/powerpc/kernel/swsusp_asm64.S b/arch/powerpc/kernel/swsusp_asm64.S
index 988f38dced0f..82d8aae81c6a 100644
--- a/arch/powerpc/kernel/swsusp_asm64.S
+++ b/arch/powerpc/kernel/swsusp_asm64.S
@@ -179,7 +179,7 @@ nothing_to_copy:
sld r3, r3, r0
li r0, 0
1:
- dcbf r0,r3
+ dcbf 0,r3
addi r3,r3,0x20
bdnz 1b
diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S
index 4d6b1d3a747f..7ccb7f81f8db 100644
--- a/arch/powerpc/kernel/systbl.S
+++ b/arch/powerpc/kernel/systbl.S
@@ -17,13 +17,13 @@
#include <asm/ppc_asm.h>
#ifdef CONFIG_PPC64
-#define SYSCALL(func) .llong DOTSYM(sys_##func),DOTSYM(sys_##func)
-#define COMPAT_SYS(func) .llong DOTSYM(sys_##func),DOTSYM(compat_sys_##func)
-#define PPC_SYS(func) .llong DOTSYM(ppc_##func),DOTSYM(ppc_##func)
-#define OLDSYS(func) .llong DOTSYM(sys_ni_syscall),DOTSYM(sys_ni_syscall)
-#define SYS32ONLY(func) .llong DOTSYM(sys_ni_syscall),DOTSYM(compat_sys_##func)
-#define PPC64ONLY(func) .llong DOTSYM(ppc_##func),DOTSYM(sys_ni_syscall)
-#define SYSX(f, f3264, f32) .llong DOTSYM(f),DOTSYM(f3264)
+#define SYSCALL(func) .8byte DOTSYM(sys_##func),DOTSYM(sys_##func)
+#define COMPAT_SYS(func) .8byte DOTSYM(sys_##func),DOTSYM(compat_sys_##func)
+#define PPC_SYS(func) .8byte DOTSYM(ppc_##func),DOTSYM(ppc_##func)
+#define OLDSYS(func) .8byte DOTSYM(sys_ni_syscall),DOTSYM(sys_ni_syscall)
+#define SYS32ONLY(func) .8byte DOTSYM(sys_ni_syscall),DOTSYM(compat_sys_##func)
+#define PPC64ONLY(func) .8byte DOTSYM(ppc_##func),DOTSYM(sys_ni_syscall)
+#define SYSX(f, f3264, f32) .8byte DOTSYM(f),DOTSYM(f3264)
#else
#define SYSCALL(func) .long sys_##func
#define COMPAT_SYS(func) .long sys_##func
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 2b33cfaac7b8..fe6f3a285455 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -59,10 +59,10 @@
#include <linux/suspend.h>
#include <linux/rtc.h>
#include <linux/sched/cputime.h>
+#include <linux/processor.h>
#include <asm/trace.h>
#include <asm/io.h>
-#include <asm/processor.h>
#include <asm/nvram.h>
#include <asm/cache.h>
#include <asm/machdep.h>
@@ -442,6 +442,7 @@ void __delay(unsigned long loops)
unsigned long start;
int diff;
+ spin_begin();
if (__USE_RTC()) {
start = get_rtcl();
do {
@@ -449,13 +450,14 @@ void __delay(unsigned long loops)
diff = get_rtcl() - start;
if (diff < 0)
diff += 1000000000;
+ spin_cpu_relax();
} while (diff < loops);
} else {
start = get_tbl();
while (get_tbl() - start < loops)
- HMT_low();
- HMT_medium();
+ spin_cpu_relax();
}
+ spin_end();
}
EXPORT_SYMBOL(__delay);
@@ -675,7 +677,7 @@ EXPORT_SYMBOL_GPL(tb_to_ns);
* the high 64 bits of a * b, i.e. (a * b) >> 64, where a and b
* are 64-bit unsigned numbers.
*/
-unsigned long long sched_clock(void)
+notrace unsigned long long sched_clock(void)
{
if (__USE_RTC())
return get_rtc();
@@ -739,12 +741,20 @@ static int __init get_freq(char *name, int cells, unsigned long *val)
static void start_cpu_decrementer(void)
{
#if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
+ unsigned int tcr;
+
/* Clear any pending timer interrupts */
mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS);
- /* Enable decrementer interrupt */
- mtspr(SPRN_TCR, TCR_DIE);
-#endif /* defined(CONFIG_BOOKE) || defined(CONFIG_40x) */
+ tcr = mfspr(SPRN_TCR);
+ /*
+ * The watchdog may have already been enabled by u-boot. So leave
+ * TRC[WP] (Watchdog Period) alone.
+ */
+ tcr &= TCR_WP_MASK; /* Clear all bits except for TCR[WP] */
+ tcr |= TCR_DIE; /* Enable decrementer */
+ mtspr(SPRN_TCR, tcr);
+#endif
}
void __init generic_calibrate_decr(void)
@@ -823,38 +833,76 @@ void read_persistent_clock(struct timespec *ts)
}
/* clocksource code */
-static u64 rtc_read(struct clocksource *cs)
+static notrace u64 rtc_read(struct clocksource *cs)
{
return (u64)get_rtc();
}
-static u64 timebase_read(struct clocksource *cs)
+static notrace u64 timebase_read(struct clocksource *cs)
{
return (u64)get_tb();
}
-void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm,
- struct clocksource *clock, u32 mult, u64 cycle_last)
+
+void update_vsyscall(struct timekeeper *tk)
{
+ struct timespec xt;
+ struct clocksource *clock = tk->tkr_mono.clock;
+ u32 mult = tk->tkr_mono.mult;
+ u32 shift = tk->tkr_mono.shift;
+ u64 cycle_last = tk->tkr_mono.cycle_last;
u64 new_tb_to_xs, new_stamp_xsec;
- u32 frac_sec;
+ u64 frac_sec;
if (clock != &clocksource_timebase)
return;
+ xt.tv_sec = tk->xtime_sec;
+ xt.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
+
/* Make userspace gettimeofday spin until we're done. */
++vdso_data->tb_update_count;
smp_mb();
- /* 19342813113834067 ~= 2^(20+64) / 1e9 */
- new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift);
- new_stamp_xsec = (u64) wall_time->tv_nsec * XSEC_PER_SEC;
- do_div(new_stamp_xsec, 1000000000);
- new_stamp_xsec += (u64) wall_time->tv_sec * XSEC_PER_SEC;
+ /*
+ * This computes ((2^20 / 1e9) * mult) >> shift as a
+ * 0.64 fixed-point fraction.
+ * The computation in the else clause below won't overflow
+ * (as long as the timebase frequency is >= 1.049 MHz)
+ * but loses precision because we lose the low bits of the constant
+ * in the shift. Note that 19342813113834067 ~= 2^(20+64) / 1e9.
+ * For a shift of 24 the error is about 0.5e-9, or about 0.5ns
+ * over a second. (Shift values are usually 22, 23 or 24.)
+ * For high frequency clocks such as the 512MHz timebase clock
+ * on POWER[6789], the mult value is small (e.g. 32768000)
+ * and so we can shift the constant by 16 initially
+ * (295147905179 ~= 2^(20+64-16) / 1e9) and then do the
+ * remaining shifts after the multiplication, which gives a
+ * more accurate result (e.g. with mult = 32768000, shift = 24,
+ * the error is only about 1.2e-12, or 0.7ns over 10 minutes).
+ */
+ if (mult <= 62500000 && clock->shift >= 16)
+ new_tb_to_xs = ((u64) mult * 295147905179ULL) >> (clock->shift - 16);
+ else
+ new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift);
+
+ /*
+ * Compute the fractional second in units of 2^-32 seconds.
+ * The fractional second is tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift
+ * in nanoseconds, so multiplying that by 2^32 / 1e9 gives
+ * it in units of 2^-32 seconds.
+ * We assume shift <= 32 because clocks_calc_mult_shift()
+ * generates shift values in the range 0 - 32.
+ */
+ frac_sec = tk->tkr_mono.xtime_nsec << (32 - shift);
+ do_div(frac_sec, NSEC_PER_SEC);
- BUG_ON(wall_time->tv_nsec >= NSEC_PER_SEC);
- /* this is tv_nsec / 1e9 as a 0.32 fraction */
- frac_sec = ((u64) wall_time->tv_nsec * 18446744073ULL) >> 32;
+ /*
+ * Work out new stamp_xsec value for any legacy users of systemcfg.
+ * stamp_xsec is in units of 2^-20 seconds.
+ */
+ new_stamp_xsec = frac_sec >> 12;
+ new_stamp_xsec += tk->xtime_sec * XSEC_PER_SEC;
/*
* tb_update_count is used to allow the userspace gettimeofday code
@@ -864,15 +912,13 @@ void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm,
* the two values of tb_update_count match and are even then the
* tb_to_xs and stamp_xsec values are consistent. If not, then it
* loops back and reads them again until this criteria is met.
- * We expect the caller to have done the first increment of
- * vdso_data->tb_update_count already.
*/
vdso_data->tb_orig_stamp = cycle_last;
vdso_data->stamp_xsec = new_stamp_xsec;
vdso_data->tb_to_xs = new_tb_to_xs;
- vdso_data->wtom_clock_sec = wtm->tv_sec;
- vdso_data->wtom_clock_nsec = wtm->tv_nsec;
- vdso_data->stamp_xtime = *wall_time;
+ vdso_data->wtom_clock_sec = tk->wall_to_monotonic.tv_sec;
+ vdso_data->wtom_clock_nsec = tk->wall_to_monotonic.tv_nsec;
+ vdso_data->stamp_xtime = xt;
vdso_data->stamp_sec_fraction = frac_sec;
smp_wmb();
++(vdso_data->tb_update_count);
diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S
index 3a2d04134da9..c4ba37822ba0 100644
--- a/arch/powerpc/kernel/tm.S
+++ b/arch/powerpc/kernel/tm.S
@@ -313,8 +313,8 @@ dont_backup_fp:
blr
- /* void tm_recheckpoint(struct thread_struct *thread,
- * unsigned long orig_msr)
+ /* void __tm_recheckpoint(struct thread_struct *thread,
+ * unsigned long orig_msr)
* - Restore the checkpointed register state saved by tm_reclaim
* when we switch_to a process.
*
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index d4e545d27ef9..ec74e203ee04 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -114,6 +114,28 @@ static void pmac_backlight_unblank(void)
static inline void pmac_backlight_unblank(void) { }
#endif
+/*
+ * If oops/die is expected to crash the machine, return true here.
+ *
+ * This should not be expected to be 100% accurate, there may be
+ * notifiers registered or other unexpected conditions that may bring
+ * down the kernel. Or if the current process in the kernel is holding
+ * locks or has other critical state, the kernel may become effectively
+ * unusable anyway.
+ */
+bool die_will_crash(void)
+{
+ if (should_fadump_crash())
+ return true;
+ if (kexec_should_crash(current))
+ return true;
+ if (in_interrupt() || panic_on_oops ||
+ !current->pid || is_global_init(current))
+ return true;
+
+ return false;
+}
+
static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static int die_owner = -1;
static unsigned int die_nest_count;
@@ -162,21 +184,9 @@ static void oops_end(unsigned long flags, struct pt_regs *regs,
crash_fadump(regs, "die oops");
- /*
- * A system reset (0x100) is a request to dump, so we always send
- * it through the crashdump code.
- */
- if (kexec_should_crash(current) || (TRAP(regs) == 0x100)) {
+ if (kexec_should_crash(current))
crash_kexec(regs);
- /*
- * We aren't the primary crash CPU. We need to send it
- * to a holding pattern to avoid it ending up in the panic
- * code.
- */
- crash_kexec_secondary(regs);
- }
-
if (!signr)
return;
@@ -202,18 +212,25 @@ NOKPROBE_SYMBOL(oops_end);
static int __die(const char *str, struct pt_regs *regs, long err)
{
printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
-#ifdef CONFIG_PREEMPT
- printk("PREEMPT ");
-#endif
-#ifdef CONFIG_SMP
- printk("SMP NR_CPUS=%d ", NR_CPUS);
-#endif
+
+ if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN))
+ printk("LE ");
+ else
+ printk("BE ");
+
+ if (IS_ENABLED(CONFIG_PREEMPT))
+ pr_cont("PREEMPT ");
+
+ if (IS_ENABLED(CONFIG_SMP))
+ pr_cont("SMP NR_CPUS=%d ", NR_CPUS);
+
if (debug_pagealloc_enabled())
- printk("DEBUG_PAGEALLOC ");
-#ifdef CONFIG_NUMA
- printk("NUMA ");
-#endif
- printk("%s\n", ppc_md.name ? ppc_md.name : "");
+ pr_cont("DEBUG_PAGEALLOC ");
+
+ if (IS_ENABLED(CONFIG_NUMA))
+ pr_cont("NUMA ");
+
+ pr_cont("%s\n", ppc_md.name ? ppc_md.name : "");
if (notify_die(DIE_OOPS, str, regs, err, 255, SIGSEGV) == NOTIFY_STOP)
return 1;
@@ -237,6 +254,7 @@ void die(const char *str, struct pt_regs *regs, long err)
err = 0;
oops_end(flags, regs, err);
}
+NOKPROBE_SYMBOL(die);
void user_single_step_siginfo(struct task_struct *tsk,
struct pt_regs *regs, siginfo_t *info)
@@ -287,23 +305,52 @@ void system_reset_exception(struct pt_regs *regs)
if (!nested)
nmi_enter();
+ __this_cpu_inc(irq_stat.sreset_irqs);
+
/* See if any machine dependent calls */
if (ppc_md.system_reset_exception) {
if (ppc_md.system_reset_exception(regs))
goto out;
}
- die("System Reset", regs, SIGABRT);
+ if (debugger(regs))
+ goto out;
+
+ /*
+ * A system reset is a request to dump, so we always send
+ * it through the crashdump code (if fadump or kdump are
+ * registered).
+ */
+ crash_fadump(regs, "System Reset");
+
+ crash_kexec(regs);
+
+ /*
+ * We aren't the primary crash CPU. We need to send it
+ * to a holding pattern to avoid it ending up in the panic
+ * code.
+ */
+ crash_kexec_secondary(regs);
+
+ /*
+ * No debugger or crash dump registered, print logs then
+ * panic.
+ */
+ __die("System Reset", regs, SIGABRT);
+
+ mdelay(2*MSEC_PER_SEC); /* Wait a little while for others to print */
+ add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
+ nmi_panic(regs, "System Reset");
out:
#ifdef CONFIG_PPC_BOOK3S_64
BUG_ON(get_paca()->in_nmi == 0);
if (get_paca()->in_nmi > 1)
- panic("Unrecoverable nested System Reset");
+ nmi_panic(regs, "Unrecoverable nested System Reset");
#endif
/* Must die if the interrupt is not recoverable */
if (!(regs->msr & MSR_RI))
- panic("Unrecoverable System Reset");
+ nmi_panic(regs, "Unrecoverable System Reset");
if (!nested)
nmi_exit();
@@ -311,39 +358,6 @@ out:
/* What should we do here? We could issue a shutdown or hard reset. */
}
-#ifdef CONFIG_PPC64
-/*
- * This function is called in real mode. Strictly no printk's please.
- *
- * regs->nip and regs->msr contains srr0 and ssr1.
- */
-long machine_check_early(struct pt_regs *regs)
-{
- long handled = 0;
-
- __this_cpu_inc(irq_stat.mce_exceptions);
-
- if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
- handled = cur_cpu_spec->machine_check_early(regs);
- return handled;
-}
-
-long hmi_exception_realmode(struct pt_regs *regs)
-{
- __this_cpu_inc(irq_stat.hmi_exceptions);
-
- wait_for_subcore_guest_exit();
-
- if (ppc_md.hmi_exception_early)
- ppc_md.hmi_exception_early(regs);
-
- wait_for_tb_resync();
-
- return 0;
-}
-
-#endif
-
/*
* I/O accesses can cause machine checks on powermacs.
* Check if the NIP corresponds to the address of a sync
@@ -396,11 +410,6 @@ static inline int check_io_access(struct pt_regs *regs)
/* On 4xx, the reason for the machine check or program exception
is in the ESR. */
#define get_reason(regs) ((regs)->dsisr)
-#ifndef CONFIG_FSL_BOOKE
-#define get_mc_reason(regs) ((regs)->dsisr)
-#else
-#define get_mc_reason(regs) (mfspr(SPRN_MCSR))
-#endif
#define REASON_FP ESR_FP
#define REASON_ILLEGAL (ESR_PIL | ESR_PUO)
#define REASON_PRIVILEGED ESR_PPR
@@ -414,108 +423,17 @@ static inline int check_io_access(struct pt_regs *regs)
/* On non-4xx, the reason for the machine check or program
exception is in the MSR. */
#define get_reason(regs) ((regs)->msr)
-#define get_mc_reason(regs) ((regs)->msr)
-#define REASON_TM 0x200000
-#define REASON_FP 0x100000
-#define REASON_ILLEGAL 0x80000
-#define REASON_PRIVILEGED 0x40000
-#define REASON_TRAP 0x20000
+#define REASON_TM SRR1_PROGTM
+#define REASON_FP SRR1_PROGFPE
+#define REASON_ILLEGAL SRR1_PROGILL
+#define REASON_PRIVILEGED SRR1_PROGPRIV
+#define REASON_TRAP SRR1_PROGTRAP
#define single_stepping(regs) ((regs)->msr & MSR_SE)
#define clear_single_step(regs) ((regs)->msr &= ~MSR_SE)
#endif
-#if defined(CONFIG_4xx)
-int machine_check_4xx(struct pt_regs *regs)
-{
- unsigned long reason = get_mc_reason(regs);
-
- if (reason & ESR_IMCP) {
- printk("Instruction");
- mtspr(SPRN_ESR, reason & ~ESR_IMCP);
- } else
- printk("Data");
- printk(" machine check in kernel mode.\n");
-
- return 0;
-}
-
-int machine_check_440A(struct pt_regs *regs)
-{
- unsigned long reason = get_mc_reason(regs);
-
- printk("Machine check in kernel mode.\n");
- if (reason & ESR_IMCP){
- printk("Instruction Synchronous Machine Check exception\n");
- mtspr(SPRN_ESR, reason & ~ESR_IMCP);
- }
- else {
- u32 mcsr = mfspr(SPRN_MCSR);
- if (mcsr & MCSR_IB)
- printk("Instruction Read PLB Error\n");
- if (mcsr & MCSR_DRB)
- printk("Data Read PLB Error\n");
- if (mcsr & MCSR_DWB)
- printk("Data Write PLB Error\n");
- if (mcsr & MCSR_TLBP)
- printk("TLB Parity Error\n");
- if (mcsr & MCSR_ICP){
- flush_instruction_cache();
- printk("I-Cache Parity Error\n");
- }
- if (mcsr & MCSR_DCSP)
- printk("D-Cache Search Parity Error\n");
- if (mcsr & MCSR_DCFP)
- printk("D-Cache Flush Parity Error\n");
- if (mcsr & MCSR_IMPE)
- printk("Machine Check exception is imprecise\n");
-
- /* Clear MCSR */
- mtspr(SPRN_MCSR, mcsr);
- }
- return 0;
-}
-
-int machine_check_47x(struct pt_regs *regs)
-{
- unsigned long reason = get_mc_reason(regs);
- u32 mcsr;
-
- printk(KERN_ERR "Machine check in kernel mode.\n");
- if (reason & ESR_IMCP) {
- printk(KERN_ERR
- "Instruction Synchronous Machine Check exception\n");
- mtspr(SPRN_ESR, reason & ~ESR_IMCP);
- return 0;
- }
- mcsr = mfspr(SPRN_MCSR);
- if (mcsr & MCSR_IB)
- printk(KERN_ERR "Instruction Read PLB Error\n");
- if (mcsr & MCSR_DRB)
- printk(KERN_ERR "Data Read PLB Error\n");
- if (mcsr & MCSR_DWB)
- printk(KERN_ERR "Data Write PLB Error\n");
- if (mcsr & MCSR_TLBP)
- printk(KERN_ERR "TLB Parity Error\n");
- if (mcsr & MCSR_ICP) {
- flush_instruction_cache();
- printk(KERN_ERR "I-Cache Parity Error\n");
- }
- if (mcsr & MCSR_DCSP)
- printk(KERN_ERR "D-Cache Search Parity Error\n");
- if (mcsr & PPC47x_MCSR_GPR)
- printk(KERN_ERR "GPR Parity Error\n");
- if (mcsr & PPC47x_MCSR_FPR)
- printk(KERN_ERR "FPR Parity Error\n");
- if (mcsr & PPC47x_MCSR_IPR)
- printk(KERN_ERR "Machine Check exception is imprecise\n");
-
- /* Clear MCSR */
- mtspr(SPRN_MCSR, mcsr);
-
- return 0;
-}
-#elif defined(CONFIG_E500)
+#if defined(CONFIG_E500)
int machine_check_e500mc(struct pt_regs *regs)
{
unsigned long mcsr = mfspr(SPRN_MCSR);
@@ -617,7 +535,7 @@ silent_out:
int machine_check_e500(struct pt_regs *regs)
{
- unsigned long reason = get_mc_reason(regs);
+ unsigned long reason = mfspr(SPRN_MCSR);
if (reason & MCSR_BUS_RBERR) {
if (fsl_rio_mcheck_exception(regs))
@@ -664,7 +582,7 @@ int machine_check_generic(struct pt_regs *regs)
#elif defined(CONFIG_E200)
int machine_check_e200(struct pt_regs *regs)
{
- unsigned long reason = get_mc_reason(regs);
+ unsigned long reason = mfspr(SPRN_MCSR);
printk("Machine check in kernel mode.\n");
printk("Caused by (from MCSR=%lx): ", reason);
@@ -686,35 +604,10 @@ int machine_check_e200(struct pt_regs *regs)
return 0;
}
-#elif defined(CONFIG_PPC_8xx)
-int machine_check_8xx(struct pt_regs *regs)
-{
- unsigned long reason = get_mc_reason(regs);
-
- pr_err("Machine check in kernel mode.\n");
- pr_err("Caused by (from SRR1=%lx): ", reason);
- if (reason & 0x40000000)
- pr_err("Fetch error at address %lx\n", regs->nip);
- else
- pr_err("Data access error at address %lx\n", regs->dar);
-
-#ifdef CONFIG_PCI
- /* the qspan pci read routines can cause machine checks -- Cort
- *
- * yuck !!! that totally needs to go away ! There are better ways
- * to deal with that than having a wart in the mcheck handler.
- * -- BenH
- */
- bad_page_fault(regs, regs->dar, SIGBUS);
- return 1;
-#else
- return 0;
-#endif
-}
-#else
+#elif defined(CONFIG_PPC32)
int machine_check_generic(struct pt_regs *regs)
{
- unsigned long reason = get_mc_reason(regs);
+ unsigned long reason = regs->msr;
printk("Machine check in kernel mode.\n");
printk("Caused by (from SRR1=%lx): ", reason);
@@ -751,10 +644,14 @@ int machine_check_generic(struct pt_regs *regs)
void machine_check_exception(struct pt_regs *regs)
{
- enum ctx_state prev_state = exception_enter();
int recover = 0;
+ bool nested = in_nmi();
+ if (!nested)
+ nmi_enter();
- __this_cpu_inc(irq_stat.mce_exceptions);
+ /* 64s accounts the mce in machine_check_early when in HVMODE */
+ if (!IS_ENABLED(CONFIG_PPC_BOOK3S_64) || !cpu_has_feature(CPU_FTR_HVMODE))
+ __this_cpu_inc(irq_stat.mce_exceptions);
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
@@ -782,10 +679,11 @@ void machine_check_exception(struct pt_regs *regs)
/* Must die if the interrupt is not recoverable */
if (!(regs->msr & MSR_RI))
- panic("Unrecoverable Machine check");
+ nmi_panic(regs, "Unrecoverable Machine check");
bail:
- exception_exit(prev_state);
+ if (!nested)
+ nmi_exit();
}
void SMIException(struct pt_regs *regs)
@@ -1671,24 +1569,6 @@ void performance_monitor_exception(struct pt_regs *regs)
perf_irq(regs);
}
-#ifdef CONFIG_8xx
-void SoftwareEmulation(struct pt_regs *regs)
-{
- CHECK_FULL_REGS(regs);
-
- if (!user_mode(regs)) {
- debugger(regs);
- die("Kernel Mode Unimplemented Instruction or SW FPU Emulation",
- regs, SIGFPE);
- }
-
- if (!emulate_math(regs))
- return;
-
- _exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
-}
-#endif /* CONFIG_8xx */
-
#ifdef CONFIG_PPC_ADV_DEBUG_REGS
static void handle_debug(struct pt_regs *regs, unsigned long debug_status)
{
@@ -1968,6 +1848,7 @@ void unrecoverable_exception(struct pt_regs *regs)
regs->trap, regs->nip);
die("Unrecoverable exception", regs, SIGABRT);
}
+NOKPROBE_SYMBOL(unrecoverable_exception);
#if defined(CONFIG_BOOKE_WDT) || defined(CONFIG_40x)
/*
@@ -1998,6 +1879,7 @@ void kernel_bad_stack(struct pt_regs *regs)
regs->gpr[1], regs->nip);
die("Bad kernel stack pointer", regs, SIGABRT);
}
+NOKPROBE_SYMBOL(kernel_bad_stack);
void __init trap_init(void)
{
diff --git a/arch/powerpc/kernel/uprobes.c b/arch/powerpc/kernel/uprobes.c
index 003b20964ea0..5d105b8eeece 100644
--- a/arch/powerpc/kernel/uprobes.c
+++ b/arch/powerpc/kernel/uprobes.c
@@ -205,3 +205,12 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs
return orig_ret_vaddr;
}
+
+bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+ struct pt_regs *regs)
+{
+ if (ctx == RP_CHECK_CHAIN_CALL)
+ return regs->gpr[1] <= ret->stack;
+ else
+ return regs->gpr[1] < ret->stack;
+}
diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S
index 6b2b69616e77..769c2624e0a6 100644
--- a/arch/powerpc/kernel/vdso32/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso32/gettimeofday.S
@@ -232,15 +232,9 @@ __do_get_tspec:
lwz r6,(CFG_TB_ORIG_STAMP+4)(r9)
/* Get a stable TB value */
-#ifdef CONFIG_8xx
-2: mftbu r3
- mftbl r4
- mftbu r0
-#else
-2: mfspr r3, SPRN_TBRU
- mfspr r4, SPRN_TBRL
- mfspr r0, SPRN_TBRU
-#endif
+2: MFTBU(r3)
+ MFTBL(r4)
+ MFTBU(r0)
cmplw cr0,r3,r0
bne- 2b
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 2f793be3d2b1..882628fa6987 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -8,6 +8,12 @@
#include <asm/cache.h>
#include <asm/thread_info.h>
+#if defined(CONFIG_STRICT_KERNEL_RWX) && !defined(CONFIG_PPC32)
+#define STRICT_ALIGN_SIZE (1 << 24)
+#else
+#define STRICT_ALIGN_SIZE PAGE_SIZE
+#endif
+
ENTRY(_stext)
PHDRS {
@@ -58,7 +64,6 @@ SECTIONS
#ifdef CONFIG_PPC64
KEEP(*(.head.text.first_256B));
#ifdef CONFIG_PPC_BOOK3E
-# define END_FIXED 0x100
#else
KEEP(*(.head.text.real_vectors));
*(.head.text.real_trampolines);
@@ -66,12 +71,8 @@ SECTIONS
*(.head.text.virt_trampolines);
# if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
KEEP(*(.head.data.fwnmi_page));
-# define END_FIXED 0x8000
-# else
-# define END_FIXED 0x7000
# endif
#endif
- ASSERT((. == END_FIXED), "vmlinux.lds.S: fixed section overflow error");
#else /* !CONFIG_PPC64 */
HEAD_TEXT
#endif
@@ -79,23 +80,6 @@ SECTIONS
__head_end = .;
- /*
- * If the build dies here, it's likely code in head_64.S is referencing
- * labels it can't reach, and the linker inserting stubs without the
- * assembler's knowledge. To debug, remove the above assert and
- * rebuild. Look for branch stubs in the fixed section region.
- *
- * Linker stub generation could be allowed in "trampoline"
- * sections if absolutely necessary, but this would require
- * some rework of the fixed sections. Before resorting to this,
- * consider references that have sufficient addressing range,
- * (e.g., hand coded trampolines) so the linker does not have
- * to add stubs.
- *
- * Linker stubs at the top of the main text section are currently not
- * detected, and will result in a crash at boot due to offsets being
- * wrong.
- */
#ifdef CONFIG_PPC64
/*
* BLOCK(0) overrides the default output section alignment because
@@ -103,18 +87,31 @@ SECTIONS
* section placement to work.
*/
.text BLOCK(0) : AT(ADDR(.text) - LOAD_OFFSET) {
+#ifdef CONFIG_LD_HEAD_STUB_CATCH
+ *(.linker_stub_catch);
+ . = . ;
+#endif
+
#else
.text : AT(ADDR(.text) - LOAD_OFFSET) {
ALIGN_FUNCTION();
#endif
/* careful! __ftr_alt_* sections need to be close to .text */
- *(.text .fixup __ftr_alt_* .ref.text)
+ *(.text.hot .text .text.fixup .text.unlikely .fixup __ftr_alt_* .ref.text);
SCHED_TEXT
CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
IRQENTRY_TEXT
SOFTIRQENTRY_TEXT
+ /*
+ * -Os builds call FP save/restore functions. The powerpc64
+ * linker generates those on demand in the .sfpr section.
+ * .sfpr gets placed at the beginning of a group of input
+ * sections, which can break start-of-text offset if it is
+ * included with the main text sections, so put it by itself.
+ */
+ *(.sfpr);
MEM_KEEP(init.text)
MEM_KEEP(exit.text)
@@ -132,7 +129,7 @@ SECTIONS
PROVIDE32 (etext = .);
/* Read-only data */
- RODATA
+ RO_DATA(PAGE_SIZE)
EXCEPTION_TABLE(0)
@@ -149,7 +146,7 @@ SECTIONS
/*
* Init sections discarded at runtime
*/
- . = ALIGN(PAGE_SIZE);
+ . = ALIGN(STRICT_ALIGN_SIZE);
__init_begin = .;
INIT_TEXT_SECTION(PAGE_SIZE) :kernel
@@ -267,7 +264,9 @@ SECTIONS
.data : AT(ADDR(.data) - LOAD_OFFSET) {
DATA_DATA
*(.sdata)
+ *(.sdata2)
*(.got.plt) *(.got)
+ *(.plt)
}
#else
.data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -330,6 +329,16 @@ SECTIONS
_end = . ;
PROVIDE32 (end = .);
- /* Sections to be discarded. */
+ STABS_DEBUG
+
+ DWARF_DEBUG
+
DISCARDS
+ /DISCARD/ : {
+ *(*.EMB.apuinfo)
+ *(.glink .iplt .plt .rela* .comment)
+ *(.gnu.version*)
+ *(.gnu.attributes)
+ *(.eh_frame)
+ }
}
diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
new file mode 100644
index 000000000000..2f6eadd9408d
--- /dev/null
+++ b/arch/powerpc/kernel/watchdog.c
@@ -0,0 +1,412 @@
+/*
+ * Watchdog support on powerpc systems.
+ *
+ * Copyright 2017, IBM Corporation.
+ *
+ * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c
+ */
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/kprobes.h>
+#include <linux/hardirq.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
+#include <linux/kdebug.h>
+#include <linux/sched/debug.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+
+#include <asm/paca.h>
+
+/*
+ * The watchdog has a simple timer that runs on each CPU, once per timer
+ * period. This is the heartbeat.
+ *
+ * Then there are checks to see if the heartbeat has not triggered on a CPU
+ * for the panic timeout period. Currently the watchdog only supports an
+ * SMP check, so the heartbeat only turns on when we have 2 or more CPUs.
+ *
+ * This is not an NMI watchdog, but Linux uses that name for a generic
+ * watchdog in some cases, so NMI gets used in some places.
+ */
+
+static cpumask_t wd_cpus_enabled __read_mostly;
+
+static u64 wd_panic_timeout_tb __read_mostly; /* timebase ticks until panic */
+static u64 wd_smp_panic_timeout_tb __read_mostly; /* panic other CPUs */
+
+static u64 wd_timer_period_ms __read_mostly; /* interval between heartbeat */
+
+static DEFINE_PER_CPU(struct timer_list, wd_timer);
+static DEFINE_PER_CPU(u64, wd_timer_tb);
+
+/*
+ * These are for the SMP checker. CPUs clear their pending bit in their
+ * heartbeat. If the bitmask becomes empty, the time is noted and the
+ * bitmask is refilled.
+ *
+ * All CPUs clear their bit in the pending mask every timer period.
+ * Once all have cleared, the time is noted and the bits are reset.
+ * If the time since all clear was greater than the panic timeout,
+ * we can panic with the list of stuck CPUs.
+ *
+ * This will work best with NMI IPIs for crash code so the stuck CPUs
+ * can be pulled out to get their backtraces.
+ */
+static unsigned long __wd_smp_lock;
+static cpumask_t wd_smp_cpus_pending;
+static cpumask_t wd_smp_cpus_stuck;
+static u64 wd_smp_last_reset_tb;
+
+static inline void wd_smp_lock(unsigned long *flags)
+{
+ /*
+ * Avoid locking layers if possible.
+ * This may be called from low level interrupt handlers at some
+ * point in future.
+ */
+ raw_local_irq_save(*flags);
+ hard_irq_disable(); /* Make it soft-NMI safe */
+ while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) {
+ raw_local_irq_restore(*flags);
+ spin_until_cond(!test_bit(0, &__wd_smp_lock));
+ raw_local_irq_save(*flags);
+ hard_irq_disable();
+ }
+}
+
+static inline void wd_smp_unlock(unsigned long *flags)
+{
+ clear_bit_unlock(0, &__wd_smp_lock);
+ raw_local_irq_restore(*flags);
+}
+
+static void wd_lockup_ipi(struct pt_regs *regs)
+{
+ pr_emerg("Watchdog CPU:%d Hard LOCKUP\n", raw_smp_processor_id());
+ print_modules();
+ print_irqtrace_events(current);
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+
+ if (hardlockup_panic)
+ nmi_panic(regs, "Hard LOCKUP");
+}
+
+static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
+{
+ cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
+ cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
+ if (cpumask_empty(&wd_smp_cpus_pending)) {
+ wd_smp_last_reset_tb = tb;
+ cpumask_andnot(&wd_smp_cpus_pending,
+ &wd_cpus_enabled,
+ &wd_smp_cpus_stuck);
+ }
+}
+static void set_cpu_stuck(int cpu, u64 tb)
+{
+ set_cpumask_stuck(cpumask_of(cpu), tb);
+}
+
+static void watchdog_smp_panic(int cpu, u64 tb)
+{
+ unsigned long flags;
+ int c;
+
+ wd_smp_lock(&flags);
+ /* Double check some things under lock */
+ if ((s64)(tb - wd_smp_last_reset_tb) < (s64)wd_smp_panic_timeout_tb)
+ goto out;
+ if (cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
+ goto out;
+ if (cpumask_weight(&wd_smp_cpus_pending) == 0)
+ goto out;
+
+ pr_emerg("Watchdog CPU:%d detected Hard LOCKUP other CPUS:%*pbl\n",
+ cpu, cpumask_pr_args(&wd_smp_cpus_pending));
+
+ /*
+ * Try to trigger the stuck CPUs.
+ */
+ for_each_cpu(c, &wd_smp_cpus_pending) {
+ if (c == cpu)
+ continue;
+ smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
+ }
+ smp_flush_nmi_ipi(1000000);
+
+ /* Take the stuck CPUs out of the watch group */
+ set_cpumask_stuck(&wd_smp_cpus_pending, tb);
+
+ wd_smp_unlock(&flags);
+
+ printk_safe_flush();
+ /*
+ * printk_safe_flush() seems to require another print
+ * before anything actually goes out to console.
+ */
+ if (sysctl_hardlockup_all_cpu_backtrace)
+ trigger_allbutself_cpu_backtrace();
+
+ if (hardlockup_panic)
+ nmi_panic(NULL, "Hard LOCKUP");
+
+ return;
+
+out:
+ wd_smp_unlock(&flags);
+}
+
+static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
+{
+ if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
+ if (unlikely(cpumask_test_cpu(cpu, &wd_smp_cpus_stuck))) {
+ unsigned long flags;
+
+ pr_emerg("Watchdog CPU:%d became unstuck\n", cpu);
+ wd_smp_lock(&flags);
+ cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
+ wd_smp_unlock(&flags);
+ }
+ return;
+ }
+ cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
+ if (cpumask_empty(&wd_smp_cpus_pending)) {
+ unsigned long flags;
+
+ wd_smp_lock(&flags);
+ if (cpumask_empty(&wd_smp_cpus_pending)) {
+ wd_smp_last_reset_tb = tb;
+ cpumask_andnot(&wd_smp_cpus_pending,
+ &wd_cpus_enabled,
+ &wd_smp_cpus_stuck);
+ }
+ wd_smp_unlock(&flags);
+ }
+}
+
+static void watchdog_timer_interrupt(int cpu)
+{
+ u64 tb = get_tb();
+
+ per_cpu(wd_timer_tb, cpu) = tb;
+
+ wd_smp_clear_cpu_pending(cpu, tb);
+
+ if ((s64)(tb - wd_smp_last_reset_tb) >= (s64)wd_smp_panic_timeout_tb)
+ watchdog_smp_panic(cpu, tb);
+}
+
+void soft_nmi_interrupt(struct pt_regs *regs)
+{
+ unsigned long flags;
+ int cpu = raw_smp_processor_id();
+ u64 tb;
+
+ if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
+ return;
+
+ nmi_enter();
+
+ __this_cpu_inc(irq_stat.soft_nmi_irqs);
+
+ tb = get_tb();
+ if (tb - per_cpu(wd_timer_tb, cpu) >= wd_panic_timeout_tb) {
+ per_cpu(wd_timer_tb, cpu) = tb;
+
+ wd_smp_lock(&flags);
+ if (cpumask_test_cpu(cpu, &wd_smp_cpus_stuck)) {
+ wd_smp_unlock(&flags);
+ goto out;
+ }
+ set_cpu_stuck(cpu, tb);
+
+ pr_emerg("Watchdog CPU:%d Hard LOCKUP\n", cpu);
+ print_modules();
+ print_irqtrace_events(current);
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+
+ wd_smp_unlock(&flags);
+
+ if (sysctl_hardlockup_all_cpu_backtrace)
+ trigger_allbutself_cpu_backtrace();
+
+ if (hardlockup_panic)
+ nmi_panic(regs, "Hard LOCKUP");
+ }
+ if (wd_panic_timeout_tb < 0x7fffffff)
+ mtspr(SPRN_DEC, wd_panic_timeout_tb);
+
+out:
+ nmi_exit();
+}
+
+static void wd_timer_reset(unsigned int cpu, struct timer_list *t)
+{
+ t->expires = jiffies + msecs_to_jiffies(wd_timer_period_ms);
+ if (wd_timer_period_ms > 1000)
+ t->expires = __round_jiffies_up(t->expires, cpu);
+ add_timer_on(t, cpu);
+}
+
+static void wd_timer_fn(unsigned long data)
+{
+ struct timer_list *t = this_cpu_ptr(&wd_timer);
+ int cpu = smp_processor_id();
+
+ watchdog_timer_interrupt(cpu);
+
+ wd_timer_reset(cpu, t);
+}
+
+void arch_touch_nmi_watchdog(void)
+{
+ unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
+ int cpu = smp_processor_id();
+
+ if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks)
+ watchdog_timer_interrupt(cpu);
+}
+EXPORT_SYMBOL(arch_touch_nmi_watchdog);
+
+static void start_watchdog_timer_on(unsigned int cpu)
+{
+ struct timer_list *t = per_cpu_ptr(&wd_timer, cpu);
+
+ per_cpu(wd_timer_tb, cpu) = get_tb();
+
+ setup_pinned_timer(t, wd_timer_fn, 0);
+ wd_timer_reset(cpu, t);
+}
+
+static void stop_watchdog_timer_on(unsigned int cpu)
+{
+ struct timer_list *t = per_cpu_ptr(&wd_timer, cpu);
+
+ del_timer_sync(t);
+}
+
+static int start_wd_on_cpu(unsigned int cpu)
+{
+ unsigned long flags;
+
+ if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) {
+ WARN_ON(1);
+ return 0;
+ }
+
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ return 0;
+
+ if (watchdog_suspended)
+ return 0;
+
+ if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
+ return 0;
+
+ wd_smp_lock(&flags);
+ cpumask_set_cpu(cpu, &wd_cpus_enabled);
+ if (cpumask_weight(&wd_cpus_enabled) == 1) {
+ cpumask_set_cpu(cpu, &wd_smp_cpus_pending);
+ wd_smp_last_reset_tb = get_tb();
+ }
+ wd_smp_unlock(&flags);
+
+ start_watchdog_timer_on(cpu);
+
+ return 0;
+}
+
+static int stop_wd_on_cpu(unsigned int cpu)
+{
+ unsigned long flags;
+
+ if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
+ return 0; /* Can happen in CPU unplug case */
+
+ stop_watchdog_timer_on(cpu);
+
+ wd_smp_lock(&flags);
+ cpumask_clear_cpu(cpu, &wd_cpus_enabled);
+ wd_smp_unlock(&flags);
+
+ wd_smp_clear_cpu_pending(cpu, get_tb());
+
+ return 0;
+}
+
+static void watchdog_calc_timeouts(void)
+{
+ wd_panic_timeout_tb = watchdog_thresh * ppc_tb_freq;
+
+ /* Have the SMP detector trigger a bit later */
+ wd_smp_panic_timeout_tb = wd_panic_timeout_tb * 3 / 2;
+
+ /* 2/5 is the factor that the perf based detector uses */
+ wd_timer_period_ms = watchdog_thresh * 1000 * 2 / 5;
+}
+
+void watchdog_nmi_reconfigure(void)
+{
+ int cpu;
+
+ watchdog_calc_timeouts();
+
+ for_each_cpu(cpu, &wd_cpus_enabled)
+ stop_wd_on_cpu(cpu);
+
+ for_each_cpu_and(cpu, cpu_online_mask, &watchdog_cpumask)
+ start_wd_on_cpu(cpu);
+}
+
+/*
+ * This runs after lockup_detector_init() which sets up watchdog_cpumask.
+ */
+static int __init powerpc_watchdog_init(void)
+{
+ int err;
+
+ watchdog_calc_timeouts();
+
+ err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/watchdog:online",
+ start_wd_on_cpu, stop_wd_on_cpu);
+ if (err < 0)
+ pr_warn("Watchdog could not be initialized");
+
+ return 0;
+}
+arch_initcall(powerpc_watchdog_init);
+
+static void handle_backtrace_ipi(struct pt_regs *regs)
+{
+ nmi_cpu_backtrace(regs);
+}
+
+static void raise_backtrace_ipi(cpumask_t *mask)
+{
+ unsigned int cpu;
+
+ for_each_cpu(cpu, mask) {
+ if (cpu == smp_processor_id())
+ handle_backtrace_ipi(NULL);
+ else
+ smp_send_nmi_ipi(cpu, handle_backtrace_ipi, 1000000);
+ }
+}
+
+void arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self)
+{
+ nmi_trigger_cpumask_backtrace(mask, exclude_self, raise_backtrace_ipi);
+}