summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--Documentation/features/debug/kprobes-on-ftrace/arch-support.txt2
-rw-r--r--Documentation/powerpc/cxl.txt15
-rw-r--r--Documentation/powerpc/firmware-assisted-dump.txt34
-rw-r--r--MAINTAINERS3
-rw-r--r--arch/powerpc/Kconfig66
-rw-r--r--arch/powerpc/Makefile13
-rw-r--r--arch/powerpc/Makefile.postlink34
-rw-r--r--arch/powerpc/configs/powernv_defconfig6
-rw-r--r--arch/powerpc/configs/ppc64_defconfig6
-rw-r--r--arch/powerpc/configs/pseries_defconfig6
-rw-r--r--arch/powerpc/include/asm/asm-prototypes.h4
-rw-r--r--arch/powerpc/include/asm/bitops.h8
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash-4k.h2
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash-64k.h10
-rw-r--r--arch/powerpc/include/asm/book3s/64/hash.h16
-rw-r--r--arch/powerpc/include/asm/book3s/64/hugetlb.h2
-rw-r--r--arch/powerpc/include/asm/book3s/64/mmu-hash.h200
-rw-r--r--arch/powerpc/include/asm/book3s/64/mmu.h9
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h58
-rw-r--r--arch/powerpc/include/asm/book3s/64/radix.h8
-rw-r--r--arch/powerpc/include/asm/code-patching.h41
-rw-r--r--arch/powerpc/include/asm/cpuidle.h33
-rw-r--r--arch/powerpc/include/asm/cputable.h5
-rw-r--r--arch/powerpc/include/asm/dbell.h40
-rw-r--r--arch/powerpc/include/asm/debug.h2
-rw-r--r--arch/powerpc/include/asm/debugfs.h17
-rw-r--r--arch/powerpc/include/asm/exception-64s.h87
-rw-r--r--arch/powerpc/include/asm/feature-fixups.h3
-rw-r--r--arch/powerpc/include/asm/head-64.h1
-rw-r--r--arch/powerpc/include/asm/hvcall.h10
-rw-r--r--arch/powerpc/include/asm/io.h102
-rw-r--r--arch/powerpc/include/asm/iommu.h12
-rw-r--r--arch/powerpc/include/asm/kprobes.h63
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h2
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_asm.h2
-rw-r--r--arch/powerpc/include/asm/kvm_ppc.h10
-rw-r--r--arch/powerpc/include/asm/mce.h94
-rw-r--r--arch/powerpc/include/asm/mmu-book3e.h5
-rw-r--r--arch/powerpc/include/asm/mmu.h19
-rw-r--r--arch/powerpc/include/asm/mmu_context.h24
-rw-r--r--arch/powerpc/include/asm/nohash/64/pgtable.h5
-rw-r--r--arch/powerpc/include/asm/opal-api.h77
-rw-r--r--arch/powerpc/include/asm/opal.h41
-rw-r--r--arch/powerpc/include/asm/paca.h38
-rw-r--r--arch/powerpc/include/asm/page_64.h14
-rw-r--r--arch/powerpc/include/asm/perf_event_server.h3
-rw-r--r--arch/powerpc/include/asm/powernv.h22
-rw-r--r--arch/powerpc/include/asm/ppc-opcode.h2
-rw-r--r--arch/powerpc/include/asm/processor.h41
-rw-r--r--arch/powerpc/include/asm/reg.h4
-rw-r--r--arch/powerpc/include/asm/sections.h2
-rw-r--r--arch/powerpc/include/asm/smp.h21
-rw-r--r--arch/powerpc/include/asm/syscalls.h4
-rw-r--r--arch/powerpc/include/asm/thread_info.h10
-rw-r--r--arch/powerpc/include/asm/xics.h2
-rw-r--r--arch/powerpc/include/asm/xive-regs.h97
-rw-r--r--arch/powerpc/include/asm/xive.h163
-rw-r--r--arch/powerpc/include/asm/xmon.h2
-rw-r--r--arch/powerpc/include/uapi/asm/mman.h16
-rw-r--r--arch/powerpc/kernel/Makefile12
-rw-r--r--arch/powerpc/kernel/asm-offsets.c9
-rw-r--r--arch/powerpc/kernel/cpu_setup_power.S36
-rw-r--r--arch/powerpc/kernel/dbell.c58
-rw-r--r--arch/powerpc/kernel/eeh.c3
-rw-r--r--arch/powerpc/kernel/eeh_driver.c55
-rw-r--r--arch/powerpc/kernel/entry_32.S107
-rw-r--r--arch/powerpc/kernel/entry_64.S380
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S186
-rw-r--r--arch/powerpc/kernel/fadump.c36
-rw-r--r--arch/powerpc/kernel/head_32.S16
-rw-r--r--arch/powerpc/kernel/head_64.S3
-rw-r--r--arch/powerpc/kernel/idle_book3s.S285
-rw-r--r--arch/powerpc/kernel/iommu.c54
-rw-r--r--arch/powerpc/kernel/irq.c41
-rw-r--r--arch/powerpc/kernel/kprobes-ftrace.c104
-rw-r--r--arch/powerpc/kernel/kprobes.c214
-rw-r--r--arch/powerpc/kernel/mce.c18
-rw-r--r--arch/powerpc/kernel/mce_power.c780
-rw-r--r--arch/powerpc/kernel/optprobes.c6
-rw-r--r--arch/powerpc/kernel/paca.c21
-rw-r--r--arch/powerpc/kernel/prom.c1
-rw-r--r--arch/powerpc/kernel/prom_init.c2
-rw-r--r--arch/powerpc/kernel/setup-common.c11
-rw-r--r--arch/powerpc/kernel/setup_64.c9
-rw-r--r--arch/powerpc/kernel/smp.c299
-rw-r--r--arch/powerpc/kernel/stacktrace.c9
-rw-r--r--arch/powerpc/kernel/swsusp.c1
-rw-r--r--arch/powerpc/kernel/syscalls.c16
-rw-r--r--arch/powerpc/kernel/sysfs.c12
-rw-r--r--arch/powerpc/kernel/trace/Makefile29
-rw-r--r--arch/powerpc/kernel/trace/ftrace.c (renamed from arch/powerpc/kernel/ftrace.c)1
-rw-r--r--arch/powerpc/kernel/trace/ftrace_32.S118
-rw-r--r--arch/powerpc/kernel/trace/ftrace_64.S85
-rw-r--r--arch/powerpc/kernel/trace/ftrace_64_mprofile.S272
-rw-r--r--arch/powerpc/kernel/trace/ftrace_64_pg.S68
-rw-r--r--arch/powerpc/kernel/trace/trace_clock.c (renamed from arch/powerpc/kernel/trace_clock.c)0
-rw-r--r--arch/powerpc/kernel/traps.c27
-rw-r--r--arch/powerpc/kernel/vmlinux.lds.S2
-rw-r--r--arch/powerpc/kvm/book3s.c8
-rw-r--r--arch/powerpc/kvm/book3s_64_mmu_host.c10
-rw-r--r--arch/powerpc/kvm/book3s_hv.c18
-rw-r--r--arch/powerpc/kvm/book3s_hv_builtin.c29
-rw-r--r--arch/powerpc/kvm/book3s_hv_rm_xics.c5
-rw-r--r--arch/powerpc/kvm/book3s_xics.c5
-rw-r--r--arch/powerpc/lib/code-patching.c4
-rw-r--r--arch/powerpc/lib/sstep.c82
-rw-r--r--arch/powerpc/mm/dump_hashpagetable.c2
-rw-r--r--arch/powerpc/mm/dump_linuxpagetables.c106
-rw-r--r--arch/powerpc/mm/fault.c84
-rw-r--r--arch/powerpc/mm/hash_low_32.S2
-rw-r--r--arch/powerpc/mm/hash_utils_64.c26
-rw-r--r--arch/powerpc/mm/hugetlbpage-book3e.c7
-rw-r--r--arch/powerpc/mm/hugetlbpage-radix.c11
-rw-r--r--arch/powerpc/mm/hugetlbpage.c18
-rw-r--r--arch/powerpc/mm/init_64.c4
-rw-r--r--arch/powerpc/mm/mmap.c53
-rw-r--r--arch/powerpc/mm/mmu_context_book3s64.c116
-rw-r--r--arch/powerpc/mm/mmu_context_iommu.c43
-rw-r--r--arch/powerpc/mm/mmu_context_nohash.c5
-rw-r--r--arch/powerpc/mm/numa.c7
-rw-r--r--arch/powerpc/mm/slb.c4
-rw-r--r--arch/powerpc/mm/slb_low.S82
-rw-r--r--arch/powerpc/mm/slice.c258
-rw-r--r--arch/powerpc/mm/subpage-prot.c3
-rw-r--r--arch/powerpc/mm/tlb-radix.c93
-rw-r--r--arch/powerpc/mm/tlb_nohash.c2
-rw-r--r--arch/powerpc/perf/core-book3s.c8
-rw-r--r--arch/powerpc/perf/isa207-common.c82
-rw-r--r--arch/powerpc/perf/isa207-common.h26
-rw-r--r--arch/powerpc/perf/power8-events-list.h6
-rw-r--r--arch/powerpc/perf/power8-pmu.c4
-rw-r--r--arch/powerpc/perf/power9-pmu.c2
-rw-r--r--arch/powerpc/platforms/44x/sam440ep.c2
-rw-r--r--arch/powerpc/platforms/52xx/Kconfig1
-rw-r--r--arch/powerpc/platforms/85xx/smp.c12
-rw-r--r--arch/powerpc/platforms/86xx/mpc86xx_smp.c1
-rw-r--r--arch/powerpc/platforms/Kconfig.cputype14
-rw-r--r--arch/powerpc/platforms/cell/axon_msi.c2
-rw-r--r--arch/powerpc/platforms/cell/interrupt.c2
-rw-r--r--arch/powerpc/platforms/cell/pervasive.c11
-rw-r--r--arch/powerpc/platforms/chrp/smp.c1
-rw-r--r--arch/powerpc/platforms/pasemi/idle.c11
-rw-r--r--arch/powerpc/platforms/powermac/smp.c3
-rw-r--r--arch/powerpc/platforms/powernv/Kconfig3
-rw-r--r--arch/powerpc/platforms/powernv/eeh-powernv.c7
-rw-r--r--arch/powerpc/platforms/powernv/idle.c109
-rw-r--r--arch/powerpc/platforms/powernv/npu-dma.c470
-rw-r--r--arch/powerpc/platforms/powernv/opal-lpc.c3
-rw-r--r--arch/powerpc/platforms/powernv/opal-sensor.c4
-rw-r--r--arch/powerpc/platforms/powernv/opal-wrappers.S71
-rw-r--r--arch/powerpc/platforms/powernv/opal-xscom.c27
-rw-r--r--arch/powerpc/platforms/powernv/opal.c79
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c71
-rw-r--r--arch/powerpc/platforms/powernv/pci.c6
-rw-r--r--arch/powerpc/platforms/powernv/pci.h17
-rw-r--r--arch/powerpc/platforms/powernv/powernv.h2
-rw-r--r--arch/powerpc/platforms/powernv/rng.c2
-rw-r--r--arch/powerpc/platforms/powernv/setup.c19
-rw-r--r--arch/powerpc/platforms/powernv/smp.c107
-rw-r--r--arch/powerpc/platforms/ps3/smp.c4
-rw-r--r--arch/powerpc/platforms/pseries/Kconfig3
-rw-r--r--arch/powerpc/platforms/pseries/dlpar.c1
-rw-r--r--arch/powerpc/platforms/pseries/dtl.c3
-rw-r--r--arch/powerpc/platforms/pseries/hvCall_inst.c10
-rw-r--r--arch/powerpc/platforms/pseries/iommu.c43
-rw-r--r--arch/powerpc/platforms/pseries/lpar.c61
-rw-r--r--arch/powerpc/platforms/pseries/ras.c4
-rw-r--r--arch/powerpc/platforms/pseries/setup.c4
-rw-r--r--arch/powerpc/platforms/pseries/smp.c49
-rw-r--r--arch/powerpc/platforms/pseries/vio.c2
-rw-r--r--arch/powerpc/sysdev/Kconfig1
-rw-r--r--arch/powerpc/sysdev/Makefile1
-rw-r--r--arch/powerpc/sysdev/scom.c3
-rw-r--r--arch/powerpc/sysdev/xics/icp-hv.c2
-rw-r--r--arch/powerpc/sysdev/xics/icp-native.c20
-rw-r--r--arch/powerpc/sysdev/xics/icp-opal.c2
-rw-r--r--arch/powerpc/sysdev/xics/xics-common.c6
-rw-r--r--arch/powerpc/sysdev/xive/Kconfig11
-rw-r--r--arch/powerpc/sysdev/xive/Makefile4
-rw-r--r--arch/powerpc/sysdev/xive/common.c1302
-rw-r--r--arch/powerpc/sysdev/xive/native.c640
-rw-r--r--arch/powerpc/sysdev/xive/xive-internal.h62
-rwxr-xr-xarch/powerpc/tools/gcc-check-mprofile-kernel.sh (renamed from arch/powerpc/scripts/gcc-check-mprofile-kernel.sh)0
-rwxr-xr-xarch/powerpc/tools/relocs_check.sh (renamed from arch/powerpc/relocs_check.sh)0
-rw-r--r--arch/powerpc/xmon/xmon.c241
-rw-r--r--drivers/misc/cxl/api.c17
-rw-r--r--drivers/misc/cxl/context.c68
-rw-r--r--drivers/misc/cxl/cxl.h263
-rw-r--r--drivers/misc/cxl/debugfs.c41
-rw-r--r--drivers/misc/cxl/fault.c136
-rw-r--r--drivers/misc/cxl/file.c15
-rw-r--r--drivers/misc/cxl/guest.c10
-rw-r--r--drivers/misc/cxl/hcalls.c6
-rw-r--r--drivers/misc/cxl/irq.c53
-rw-r--r--drivers/misc/cxl/main.c12
-rw-r--r--drivers/misc/cxl/native.c339
-rw-r--r--drivers/misc/cxl/pci.c409
-rw-r--r--drivers/misc/cxl/trace.h43
-rw-r--r--drivers/of/base.c31
-rw-r--r--drivers/pcmcia/electra_cf.c4
-rw-r--r--drivers/vfio/vfio_iommu_spapr_tce.c2
-rw-r--r--include/linux/kprobes.h1
-rw-r--r--include/linux/of.h3
-rw-r--r--include/uapi/linux/perf_event.h16
-rw-r--r--kernel/kprobes.c32
-rw-r--r--tools/include/uapi/linux/perf_event.h16
-rw-r--r--tools/testing/selftests/powerpc/Makefile1
-rw-r--r--tools/testing/selftests/powerpc/cache_shape/.gitignore1
-rw-r--r--tools/testing/selftests/powerpc/cache_shape/Makefile10
-rw-r--r--tools/testing/selftests/powerpc/cache_shape/cache_shape.c125
-rw-r--r--tools/testing/selftests/powerpc/include/utils.h6
-rw-r--r--tools/testing/selftests/powerpc/utils.c53
212 files changed, 8698 insertions, 2790 deletions
diff --git a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
index f9133a921d5a..1e84be3c142e 100644
--- a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
+++ b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt
@@ -26,7 +26,7 @@
| nios2: | TODO |
| openrisc: | TODO |
| parisc: | TODO |
- | powerpc: | TODO |
+ | powerpc: | ok |
| s390: | TODO |
| score: | TODO |
| sh: | TODO |
diff --git a/Documentation/powerpc/cxl.txt b/Documentation/powerpc/cxl.txt
index d5506ba0fef7..c5e8d5098ed3 100644
--- a/Documentation/powerpc/cxl.txt
+++ b/Documentation/powerpc/cxl.txt
@@ -21,7 +21,7 @@ Introduction
Hardware overview
=================
- POWER8 FPGA
+ POWER8/9 FPGA
+----------+ +---------+
| | | |
| CPU | | AFU |
@@ -34,7 +34,7 @@ Hardware overview
| | CAPP |<------>| |
+---+------+ PCIE +---------+
- The POWER8 chip has a Coherently Attached Processor Proxy (CAPP)
+ The POWER8/9 chip has a Coherently Attached Processor Proxy (CAPP)
unit which is part of the PCIe Host Bridge (PHB). This is managed
by Linux by calls into OPAL. Linux doesn't directly program the
CAPP.
@@ -59,6 +59,17 @@ Hardware overview
the fault. The context to which this fault is serviced is based on
who owns that acceleration function.
+ POWER8 <-----> PSL Version 8 is compliant to the CAIA Version 1.0.
+ POWER9 <-----> PSL Version 9 is compliant to the CAIA Version 2.0.
+ This PSL Version 9 provides new features such as:
+ * Interaction with the nest MMU on the P9 chip.
+ * Native DMA support.
+ * Supports sending ASB_Notify messages for host thread wakeup.
+ * Supports Atomic operations.
+ * ....
+
+ Cards with a PSL9 won't work on a POWER8 system and cards with a
+ PSL8 won't work on a POWER9 system.
AFU Modes
=========
diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt
index 3007bc98af28..19b1e3d09a19 100644
--- a/Documentation/powerpc/firmware-assisted-dump.txt
+++ b/Documentation/powerpc/firmware-assisted-dump.txt
@@ -105,21 +105,21 @@ memory is held.
If there is no waiting dump data, then only the memory required
to hold CPU state, HPTE region, boot memory dump and elfcore
-header, is reserved at the top of memory (see Fig. 1). This area
-is *not* released: this region will be kept permanently reserved,
-so that it can act as a receptacle for a copy of the boot memory
-content in addition to CPU state and HPTE region, in the case a
-crash does occur.
+header, is usually reserved at an offset greater than boot memory
+size (see Fig. 1). This area is *not* released: this region will
+be kept permanently reserved, so that it can act as a receptacle
+for a copy of the boot memory content in addition to CPU state
+and HPTE region, in the case a crash does occur.
o Memory Reservation during first kernel
- Low memory Top of memory
+ Low memory Top of memory
0 boot memory size |
- | | |<--Reserved dump area -->|
- V V | Permanent Reservation V
- +-----------+----------/ /----------+---+----+-----------+----+
- | | |CPU|HPTE| DUMP |ELF |
- +-----------+----------/ /----------+---+----+-----------+----+
+ | | |<--Reserved dump area -->| |
+ V V | Permanent Reservation | V
+ +-----------+----------/ /---+---+----+-----------+----+------+
+ | | |CPU|HPTE| DUMP |ELF | |
+ +-----------+----------/ /---+---+----+-----------+----+------+
| ^
| |
\ /
@@ -135,12 +135,12 @@ crash does occur.
0 boot memory size |
| |<------------- Reserved dump area ----------- -->|
V V V
- +-----------+----------/ /----------+---+----+-----------+----+
- | | |CPU|HPTE| DUMP |ELF |
- +-----------+----------/ /----------+---+----+-----------+----+
- | |
- V V
- Used by second /proc/vmcore
+ +-----------+----------/ /---+---+----+-----------+----+------+
+ | | |CPU|HPTE| DUMP |ELF | |
+ +-----------+----------/ /---+---+----+-----------+----+------+
+ | |
+ V V
+ Used by second /proc/vmcore
kernel to boot
Fig. 2
diff --git a/MAINTAINERS b/MAINTAINERS
index 5ee3125f8341..b1f9b45f061f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5310,6 +5310,7 @@ M: Scott Wood <oss@buserror.net>
L: linuxppc-dev@lists.ozlabs.org
L: linux-arm-kernel@lists.infradead.org
S: Maintained
+F: Documentation/devicetree/bindings/powerpc/fsl/
F: drivers/soc/fsl/
F: include/linux/fsl/
@@ -7568,7 +7569,7 @@ Q: http://patchwork.ozlabs.org/project/linuxppc-dev/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git
S: Supported
F: Documentation/ABI/stable/sysfs-firmware-opal-*
-F: Documentation/devicetree/bindings/powerpc/opal/
+F: Documentation/devicetree/bindings/powerpc/
F: Documentation/devicetree/bindings/rtc/rtc-opal.txt
F: Documentation/devicetree/bindings/i2c/i2c-opal.txt
F: Documentation/powerpc/
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 053382616533..f07f727cbfd2 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -22,6 +22,48 @@ config MMU
bool
default y
+config ARCH_MMAP_RND_BITS_MAX
+ # On Book3S 64, the default virtual address space for 64-bit processes
+ # is 2^47 (128TB). As a maximum, allow randomisation to consume up to
+ # 32T of address space (2^45), which should ensure a reasonable gap
+ # between bottom-up and top-down allocations for applications that
+ # consume "normal" amounts of address space. Book3S 64 only supports 64K
+ # and 4K page sizes.
+ default 29 if PPC_BOOK3S_64 && PPC_64K_PAGES # 29 = 45 (32T) - 16 (64K)
+ default 33 if PPC_BOOK3S_64 # 33 = 45 (32T) - 12 (4K)
+ #
+ # On all other 64-bit platforms (currently only Book3E), the virtual
+ # address space is 2^46 (64TB). Allow randomisation to consume up to 16T
+ # of address space (2^44). Only 4K page sizes are supported.
+ default 32 if 64BIT # 32 = 44 (16T) - 12 (4K)
+ #
+ # For 32-bit, use the compat values, as they're the same.
+ default ARCH_MMAP_RND_COMPAT_BITS_MAX
+
+config ARCH_MMAP_RND_BITS_MIN
+ # Allow randomisation to consume up to 1GB of address space (2^30).
+ default 14 if 64BIT && PPC_64K_PAGES # 14 = 30 (1GB) - 16 (64K)
+ default 18 if 64BIT # 18 = 30 (1GB) - 12 (4K)
+ #
+ # For 32-bit, use the compat values, as they're the same.
+ default ARCH_MMAP_RND_COMPAT_BITS_MIN
+
+config ARCH_MMAP_RND_COMPAT_BITS_MAX
+ # Total virtual address space for 32-bit processes is 2^31 (2GB).
+ # Allow randomisation to consume up to 512MB of address space (2^29).
+ default 11 if PPC_256K_PAGES # 11 = 29 (512MB) - 18 (256K)
+ default 13 if PPC_64K_PAGES # 13 = 29 (512MB) - 16 (64K)
+ default 15 if PPC_16K_PAGES # 15 = 29 (512MB) - 14 (16K)
+ default 17 # 17 = 29 (512MB) - 12 (4K)
+
+config ARCH_MMAP_RND_COMPAT_BITS_MIN
+ # Total virtual address space for 32-bit processes is 2^31 (2GB).
+ # Allow randomisation to consume up to 8MB of address space (2^23).
+ default 5 if PPC_256K_PAGES # 5 = 23 (8MB) - 18 (256K)
+ default 7 if PPC_64K_PAGES # 7 = 23 (8MB) - 16 (64K)
+ default 9 if PPC_16K_PAGES # 9 = 23 (8MB) - 14 (16K)
+ default 11 # 11 = 23 (8MB) - 12 (4K)
+
config HAVE_SETUP_PER_CPU_AREA
def_bool PPC64
@@ -38,6 +80,11 @@ config NR_IRQS
/proc/interrupts. If you configure your system to have too few,
drivers will fail to load or worse - handle with care.
+config NMI_IPI
+ bool
+ depends on SMP && (DEBUGGER || KEXEC_CORE)
+ default y
+
config STACKTRACE_SUPPORT
bool
default y
@@ -119,6 +166,8 @@ config PPC
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KGDB
+ select HAVE_ARCH_MMAP_RND_BITS
+ select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
select HAVE_ARCH_SECCOMP_FILTER
select HAVE_ARCH_TRACEHOOK
select HAVE_CBPF_JIT if !PPC64
@@ -141,6 +190,7 @@ config PPC
select HAVE_IRQ_EXIT_ON_IRQ_STACK
select HAVE_KERNEL_GZIP
select HAVE_KPROBES
+ select HAVE_KPROBES_ON_FTRACE
select HAVE_KRETPROBES
select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS
select HAVE_MEMBLOCK
@@ -489,7 +539,7 @@ config KEXEC_FILE
config RELOCATABLE
bool "Build a relocatable kernel"
- depends on (PPC64 && !COMPILE_TEST) || (FLATMEM && (44x || FSL_BOOKE))
+ depends on PPC64 || (FLATMEM && (44x || FSL_BOOKE))
select NONSTATIC_KERNEL
select MODULE_REL_CRCS if MODVERSIONS
help
@@ -523,7 +573,7 @@ config RELOCATABLE_TEST
config CRASH_DUMP
bool "Build a kdump crash kernel"
depends on PPC64 || 6xx || FSL_BOOKE || (44x && !SMP)
- select RELOCATABLE if (PPC64 && !COMPILE_TEST) || 44x || FSL_BOOKE
+ select RELOCATABLE if PPC64 || 44x || FSL_BOOKE
help
Build a kernel suitable for use as a kdump capture kernel.
The same kernel binary can be used as production kernel and dump
@@ -585,7 +635,7 @@ config ARCH_SPARSEMEM_ENABLE
config ARCH_SPARSEMEM_DEFAULT
def_bool y
- depends on (SMP && PPC_PSERIES) || PPC_PS3
+ depends on PPC_BOOK3S_64
config SYS_SUPPORTS_HUGETLBFS
bool
@@ -677,6 +727,16 @@ config PPC_256K_PAGES
endchoice
+config THREAD_SHIFT
+ int "Thread shift" if EXPERT
+ range 13 15
+ default "15" if PPC_256K_PAGES
+ default "14" if PPC64
+ default "13"
+ help
+ Used to define the stack size. The default is almost always what you
+ want. Only change this if you know what you are doing.
+
config FORCE_MAX_ZONEORDER
int "Maximum zone order"
range 8 9 if PPC64 && PPC_64K_PAGES
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 19b0d1a81959..3e0f0e1fadef 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -136,7 +136,7 @@ CFLAGS-$(CONFIG_GENERIC_CPU) += -mcpu=powerpc64
endif
ifdef CONFIG_MPROFILE_KERNEL
- ifeq ($(shell $(srctree)/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh $(CC) -I$(srctree)/include -D__KERNEL__),OK)
+ ifeq ($(shell $(srctree)/arch/powerpc/tools/gcc-check-mprofile-kernel.sh $(CC) -I$(srctree)/include -D__KERNEL__),OK)
CC_FLAGS_FTRACE := -pg -mprofile-kernel
KBUILD_CPPFLAGS += -DCC_USING_MPROFILE_KERNEL
else
@@ -274,17 +274,6 @@ PHONY += $(BOOT_TARGETS1) $(BOOT_TARGETS2)
boot := arch/$(ARCH)/boot
-ifeq ($(CONFIG_RELOCATABLE),y)
-quiet_cmd_relocs_check = CALL $<
- cmd_relocs_check = $(CONFIG_SHELL) $< "$(OBJDUMP)" "$(obj)/vmlinux"
-
-PHONY += relocs_check
-relocs_check: arch/powerpc/relocs_check.sh vmlinux
- $(call cmd,relocs_check)
-
-zImage: relocs_check
-endif
-
$(BOOT_TARGETS1): vmlinux
$(Q)$(MAKE) $(build)=$(boot) $(patsubst %,$(boot)/%,$@)
$(BOOT_TARGETS2): vmlinux
diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink
new file mode 100644
index 000000000000..3c22d64b2de9
--- /dev/null
+++ b/arch/powerpc/Makefile.postlink
@@ -0,0 +1,34 @@
+# ===========================================================================
+# Post-link powerpc pass
+# ===========================================================================
+#
+# 1. Check that vmlinux relocations look sane
+
+PHONY := __archpost
+__archpost:
+
+include include/config/auto.conf
+include scripts/Kbuild.include
+
+quiet_cmd_relocs_check = CHKREL $@
+ cmd_relocs_check = $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/relocs_check.sh "$(OBJDUMP)" "$@"
+
+# `@true` prevents complaint when there is nothing to be done
+
+vmlinux: FORCE
+ @true
+ifdef CONFIG_RELOCATABLE
+ $(call if_changed,relocs_check)
+endif
+
+%.ko: FORCE
+ @true
+
+clean:
+ @true
+
+PHONY += FORCE clean
+
+FORCE:
+
+.PHONY: $(PHONY)
diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index ac8b8332ed82..0695ce047d56 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -33,7 +33,7 @@ CONFIG_BLK_DEV_INITRD=y
CONFIG_BPF_SYSCALL=y
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
CONFIG_MODULES=y
@@ -261,7 +261,7 @@ CONFIG_NILFS2_FS=m
CONFIG_AUTOFS4_FS=m
CONFIG_FUSE_FS=m
CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=m
+CONFIG_ISO9660_FS=y
CONFIG_UDF_FS=m
CONFIG_MSDOS_FS=y
CONFIG_VFAT_FS=m
@@ -306,7 +306,7 @@ CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_CCM=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_CRC32C_VPMSUM=m
CONFIG_CRYPTO_MD5_PPC=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_SHA256=y
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index f2e03f032041..5175028c56ce 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -19,7 +19,7 @@ CONFIG_BLK_DEV_INITRD=y
CONFIG_BPF_SYSCALL=y
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
CONFIG_MODULES=y
@@ -290,7 +290,7 @@ CONFIG_NILFS2_FS=m
CONFIG_AUTOFS4_FS=m
CONFIG_FUSE_FS=m
CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=m
+CONFIG_ISO9660_FS=y
CONFIG_UDF_FS=m
CONFIG_MSDOS_FS=y
CONFIG_VFAT_FS=m
@@ -339,7 +339,7 @@ CONFIG_PPC_EARLY_DEBUG=y
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_CRC32C_VPMSUM=m
CONFIG_CRYPTO_MD5_PPC=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_SHA256=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 4ff68b752618..1a61aa20dfba 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -34,7 +34,7 @@ CONFIG_BLK_DEV_INITRD=y
CONFIG_BPF_SYSCALL=y
# CONFIG_COMPAT_BRK is not set
CONFIG_PROFILING=y
-CONFIG_OPROFILE=y
+CONFIG_OPROFILE=m
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
CONFIG_MODULES=y
@@ -259,7 +259,7 @@ CONFIG_NILFS2_FS=m
CONFIG_AUTOFS4_FS=m
CONFIG_FUSE_FS=m
CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=m
+CONFIG_ISO9660_FS=y
CONFIG_UDF_FS=m
CONFIG_MSDOS_FS=y
CONFIG_VFAT_FS=m
@@ -303,7 +303,7 @@ CONFIG_XMON=y
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_HMAC=y
-CONFIG_CRYPT_CRC32C_VPMSUM=m
+CONFIG_CRYPTO_CRC32C_VPMSUM=m
CONFIG_CRYPTO_MD5_PPC=m
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_SHA256=y
diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
index f6c5264287e5..7330150bfe34 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -17,6 +17,8 @@
#include <asm/checksum.h>
#include <linux/uaccess.h>
#include <asm/epapr_hcalls.h>
+#include <asm/dcr.h>
+#include <asm/mmu_context.h>
#include <uapi/asm/ucontext.h>
@@ -120,6 +122,8 @@ extern s64 __ashrdi3(s64, int);
extern int __cmpdi2(s64, s64);
extern int __ucmpdi2(u64, u64);
+/* tracing */
void _mcount(void);
+unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip);
#endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index bc5fdfd22788..33a24fdd7958 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -55,6 +55,14 @@
#define PPC_BITEXTRACT(bits, ppc_bit, dst_bit) \
((((bits) >> PPC_BITLSHIFT(ppc_bit)) & 1) << (dst_bit))
+#define PPC_BITLSHIFT32(be) (32 - 1 - (be))
+#define PPC_BIT32(bit) (1UL << PPC_BITLSHIFT32(bit))
+#define PPC_BITMASK32(bs, be) ((PPC_BIT32(bs) - PPC_BIT32(be))|PPC_BIT32(bs))
+
+#define PPC_BITLSHIFT8(be) (8 - 1 - (be))
+#define PPC_BIT8(bit) (1UL << PPC_BITLSHIFT8(bit))
+#define PPC_BITMASK8(bs, be) ((PPC_BIT8(bs) - PPC_BIT8(be))|PPC_BIT8(bs))
+
#include <asm/barrier.h>
/* Macro for generating the ***_bits() functions */
diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index 0c4e470571ca..b4b5e6b671ca 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -8,7 +8,7 @@
#define H_PTE_INDEX_SIZE 9
#define H_PMD_INDEX_SIZE 7
#define H_PUD_INDEX_SIZE 9
-#define H_PGD_INDEX_SIZE 9
+#define H_PGD_INDEX_SIZE 12
#ifndef __ASSEMBLY__
#define H_PTE_TABLE_SIZE (sizeof(pte_t) << H_PTE_INDEX_SIZE)
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index f3dd21efa2ea..214219dff87c 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -4,10 +4,14 @@
#define H_PTE_INDEX_SIZE 8
#define H_PMD_INDEX_SIZE 5
#define H_PUD_INDEX_SIZE 5
-#define H_PGD_INDEX_SIZE 12
+#define H_PGD_INDEX_SIZE 15
-#define H_PAGE_COMBO 0x00001000 /* this is a combo 4k page */
-#define H_PAGE_4K_PFN 0x00002000 /* PFN is for a single 4k page */
+/*
+ * 64k aligned address free up few of the lower bits of RPN for us
+ * We steal that here. For more deatils look at pte_pfn/pfn_pte()
+ */
+#define H_PAGE_COMBO _RPAGE_RPN0 /* this is a combo 4k page */
+#define H_PAGE_4K_PFN _RPAGE_RPN1 /* PFN is for a single 4k page */
/*
* We need to differentiate between explicit huge page and THP huge
* page, since THP huge page also need to track real subpage details
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index f7b721bbf918..4e957b027fe0 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -6,19 +6,13 @@
* Common bits between 4K and 64K pages in a linux-style PTE.
* Additional bits may be defined in pgtable-hash64-*.h
*
- * Note: We only support user read/write permissions. Supervisor always
- * have full read/write to pages above PAGE_OFFSET (pages below that
- * always use the user access permissions).
- *
- * We could create separate kernel read-only if we used the 3 PP bits
- * combinations that newer processors provide but we currently don't.
*/
-#define H_PAGE_BUSY 0x00800 /* software: PTE & hash are busy */
#define H_PTE_NONE_MASK _PAGE_HPTEFLAGS
-#define H_PAGE_F_GIX_SHIFT 57
-#define H_PAGE_F_GIX (7ul << 57) /* HPTE index within HPTEG */
-#define H_PAGE_F_SECOND (1ul << 60) /* HPTE is in 2ndary HPTEG */
-#define H_PAGE_HASHPTE (1ul << 61) /* PTE has associated HPTE */
+#define H_PAGE_F_GIX_SHIFT 56
+#define H_PAGE_BUSY _RPAGE_RSV1 /* software: PTE & hash are busy */
+#define H_PAGE_F_SECOND _RPAGE_RSV2 /* HPTE is in 2ndary HPTEG */
+#define H_PAGE_F_GIX (_RPAGE_RSV3 | _RPAGE_RSV4 | _RPAGE_RPN44)
+#define H_PAGE_HASHPTE _RPAGE_RPN43 /* PTE has associated HPTE */
#ifdef CONFIG_PPC_64K_PAGES
#include <asm/book3s/64/hash-64k.h>
diff --git a/arch/powerpc/include/asm/book3s/64/hugetlb.h b/arch/powerpc/include/asm/book3s/64/hugetlb.h
index c62f14d0bec1..6666cd366596 100644
--- a/arch/powerpc/include/asm/book3s/64/hugetlb.h
+++ b/arch/powerpc/include/asm/book3s/64/hugetlb.h
@@ -46,7 +46,7 @@ static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
*/
VM_WARN_ON(page_shift == mmu_psize_defs[MMU_PAGE_1G].shift);
if (page_shift == mmu_psize_defs[MMU_PAGE_2M].shift)
- return __pte(pte_val(entry) | _PAGE_LARGE);
+ return __pte(pte_val(entry) | R_PAGE_LARGE);
else
return entry;
}
diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 52d8d1e4b772..6981a52b3887 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -39,6 +39,7 @@
/* Bits in the SLB VSID word */
#define SLB_VSID_SHIFT 12
+#define SLB_VSID_SHIFT_256M SLB_VSID_SHIFT
#define SLB_VSID_SHIFT_1T 24
#define SLB_VSID_SSIZE_SHIFT 62
#define SLB_VSID_B ASM_CONST(0xc000000000000000)
@@ -408,7 +409,7 @@ static inline unsigned long hpt_vpn(unsigned long ea,
static inline unsigned long hpt_hash(unsigned long vpn,
unsigned int shift, int ssize)
{
- int mask;
+ unsigned long mask;
unsigned long hash, vsid;
/* VPN_SHIFT can be atmost 12 */
@@ -491,13 +492,14 @@ extern void slb_set_size(u16 size);
* We first generate a 37-bit "proto-VSID". Proto-VSIDs are generated
* from mmu context id and effective segment id of the address.
*
- * For user processes max context id is limited to ((1ul << 19) - 5)
- * for kernel space, we use the top 4 context ids to map address as below
+ * For user processes max context id is limited to MAX_USER_CONTEXT.
+
+ * For kernel space, we use context ids 1-4 to map addresses as below:
* NOTE: each context only support 64TB now.
- * 0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ]
- * 0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ]
- * 0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ]
- * 0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ]
+ * 0x00001 - [ 0xc000000000000000 - 0xc0003fffffffffff ]
+ * 0x00002 - [ 0xd000000000000000 - 0xd0003fffffffffff ]
+ * 0x00003 - [ 0xe000000000000000 - 0xe0003fffffffffff ]
+ * 0x00004 - [ 0xf000000000000000 - 0xf0003fffffffffff ]
*
* The proto-VSIDs are then scrambled into real VSIDs with the
* multiplicative hash:
@@ -511,20 +513,28 @@ extern void slb_set_size(u16 size);
* robust scattering in the hash table (at least based on some initial
* results).
*
- * We also consider VSID 0 special. We use VSID 0 for slb entries mapping
- * bad address. This enables us to consolidate bad address handling in
- * hash_page.
+ * We use VSID 0 to indicate an invalid VSID. The means we can't use context id
+ * 0, because a context id of 0 and an EA of 0 gives a proto-VSID of 0, which
+ * will produce a VSID of 0.
*
* We also need to avoid the last segment of the last context, because that
* would give a protovsid of 0x1fffffffff. That will result in a VSID 0
- * because of the modulo operation in vsid scramble. But the vmemmap
- * (which is what uses region 0xf) will never be close to 64TB in size
- * (it's 56 bytes per page of system memory).
+ * because of the modulo operation in vsid scramble.
*/
+/*
+ * Max Va bits we support as of now is 68 bits. We want 19 bit
+ * context ID.
+ * Restrictions:
+ * GPU has restrictions of not able to access beyond 128TB
+ * (47 bit effective address). We also cannot do more than 20bit PID.
+ * For p4 and p5 which can only do 65 bit VA, we restrict our CONTEXT_BITS
+ * to 16 bits (ie, we can only have 2^16 pids at the same time).
+ */
+#define VA_BITS 68
#define CONTEXT_BITS 19
-#define ESID_BITS 18
-#define ESID_BITS_1T 6
+#define ESID_BITS (VA_BITS - (SID_SHIFT + CONTEXT_BITS))
+#define ESID_BITS_1T (VA_BITS - (SID_SHIFT_1T + CONTEXT_BITS))
#define ESID_BITS_MASK ((1 << ESID_BITS) - 1)
#define ESID_BITS_1T_MASK ((1 << ESID_BITS_1T) - 1)
@@ -532,63 +542,70 @@ extern void slb_set_size(u16 size);
/*
* 256MB segment
* The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments
- * available for user + kernel mapping. The top 4 contexts are used for
- * kernel mapping. Each segment contains 2^28 bytes. Each
- * context maps 2^46 bytes (64TB) so we can support 2^19-1 contexts
- * (19 == 37 + 28 - 46).
+ * available for user + kernel mapping. VSID 0 is reserved as invalid, contexts
+ * 1-4 are used for kernel mapping. Each segment contains 2^28 bytes. Each
+ * context maps 2^49 bytes (512TB).
+ *
+ * We also need to avoid the last segment of the last context, because that
+ * would give a protovsid of 0x1fffffffff. That will result in a VSID 0
+ * because of the modulo operation in vsid scramble.
+ */
+#define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 2)
+#define MIN_USER_CONTEXT (5)
+
+/* Would be nice to use KERNEL_REGION_ID here */
+#define KERNEL_REGION_CONTEXT_OFFSET (0xc - 1)
+
+/*
+ * For platforms that support on 65bit VA we limit the context bits
*/
-#define MAX_USER_CONTEXT ((ASM_CONST(1) << CONTEXT_BITS) - 5)
+#define MAX_USER_CONTEXT_65BIT_VA ((ASM_CONST(1) << (65 - (SID_SHIFT + ESID_BITS))) - 2)
/*
* This should be computed such that protovosid * vsid_mulitplier
- * doesn't overflow 64 bits. It should also be co-prime to vsid_modulus
+ * doesn't overflow 64 bits. The vsid_mutliplier should also be
+ * co-prime to vsid_modulus. We also need to make sure that number
+ * of bits in multiplied result (dividend) is less than twice the number of
+ * protovsid bits for our modulus optmization to work.
+ *
+ * The below table shows the current values used.
+ * |-------+------------+----------------------+------------+-------------------|
+ * | | Prime Bits | proto VSID_BITS_65VA | Total Bits | 2* prot VSID_BITS |
+ * |-------+------------+----------------------+------------+-------------------|
+ * | 1T | 24 | 25 | 49 | 50 |
+ * |-------+------------+----------------------+------------+-------------------|
+ * | 256MB | 24 | 37 | 61 | 74 |
+ * |-------+------------+----------------------+------------+-------------------|
+ *
+ * |-------+------------+----------------------+------------+--------------------|
+ * | | Prime Bits | proto VSID_BITS_68VA | Total Bits | 2* proto VSID_BITS |
+ * |-------+------------+----------------------+------------+--------------------|
+ * | 1T | 24 | 28 | 52 | 56 |
+ * |-------+------------+----------------------+------------+--------------------|
+ * | 256MB | 24 | 40 | 64 | 80 |
+ * |-------+------------+----------------------+------------+--------------------|
+ *
*/
#define VSID_MULTIPLIER_256M ASM_CONST(12538073) /* 24-bit prime */
-#define VSID_BITS_256M (CONTEXT_BITS + ESID_BITS)
-#define VSID_MODULUS_256M ((1UL<<VSID_BITS_256M)-1)
+#define VSID_BITS_256M (VA_BITS - SID_SHIFT)
+#define VSID_BITS_65_256M (65 - SID_SHIFT)
+/*
+ * Modular multiplicative inverse of VSID_MULTIPLIER under modulo VSID_MODULUS
+ */
+#define VSID_MULINV_256M ASM_CONST(665548017062)
#define VSID_MULTIPLIER_1T ASM_CONST(12538073) /* 24-bit prime */
-#define VSID_BITS_1T (CONTEXT_BITS + ESID_BITS_1T)
-#define VSID_MODULUS_1T ((1UL<<VSID_BITS_1T)-1)
-
+#define VSID_BITS_1T (VA_BITS - SID_SHIFT_1T)
+#define VSID_BITS_65_1T (65 - SID_SHIFT_1T)
+#define VSID_MULINV_1T ASM_CONST(209034062)
+/* 1TB VSID reserved for VRMA */
+#define VRMA_VSID 0x1ffffffUL
#define USER_VSID_RANGE (1UL << (ESID_BITS + SID_SHIFT))
-/*
- * This macro generates asm code to compute the VSID scramble
- * function. Used in slb_allocate() and do_stab_bolted. The function
- * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS
- *
- * rt = register containing the proto-VSID and into which the
- * VSID will be stored
- * rx = scratch register (clobbered)
- *
- * - rt and rx must be different registers
- * - The answer will end up in the low VSID_BITS bits of rt. The higher
- * bits may contain other garbage, so you may need to mask the
- * result.
- */
-#define ASM_VSID_SCRAMBLE(rt, rx, size) \
- lis rx,VSID_MULTIPLIER_##size@h; \
- ori rx,rx,VSID_MULTIPLIER_##size@l; \
- mulld rt,rt,rx; /* rt = rt * MULTIPLIER */ \
- \
- srdi rx,rt,VSID_BITS_##size; \
- clrldi rt,rt,(64-VSID_BITS_##size); \
- add rt,rt,rx; /* add high and low bits */ \
- /* NOTE: explanation based on VSID_BITS_##size = 36 \
- * Now, r3 == VSID (mod 2^36-1), and lies between 0 and \
- * 2^36-1+2^28-1. That in particular means that if r3 >= \
- * 2^36-1, then r3+1 has the 2^36 bit set. So, if r3+1 has \
- * the bit clear, r3 already has the answer we want, if it \
- * doesn't, the answer is the low 36 bits of r3+1. So in all \
- * cases the answer is the low 36 bits of (r3 + ((r3+1) >> 36))*/\
- addi rx,rt,1; \
- srdi rx,rx,VSID_BITS_##size; /* extract 2^VSID_BITS bit */ \
- add rt,rt,rx
-
/* 4 bits per slice and we have one slice per 1TB */
-#define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41)
+#define SLICE_ARRAY_SIZE (H_PGTABLE_RANGE >> 41)
+#define TASK_SLICE_ARRAY_SZ(x) ((x)->context.addr_limit >> 41)
#ifndef __ASSEMBLY__
@@ -634,7 +651,7 @@ static inline void subpage_prot_init_new_context(struct mm_struct *mm) { }
#define vsid_scramble(protovsid, size) \
((((protovsid) * VSID_MULTIPLIER_##size) % VSID_MODULUS_##size))
-#else /* 1 */
+/* simplified form avoiding mod operation */
#define vsid_scramble(protovsid, size) \
({ \
unsigned long x; \
@@ -642,6 +659,21 @@ static inline void subpage_prot_init_new_context(struct mm_struct *mm) { }
x = (x >> VSID_BITS_##size) + (x & VSID_MODULUS_##size); \
(x + ((x+1) >> VSID_BITS_##size)) & VSID_MODULUS_##size; \
})
+
+#else /* 1 */
+static inline unsigned long vsid_scramble(unsigned long protovsid,
+ unsigned long vsid_multiplier, int vsid_bits)
+{
+ unsigned long vsid;
+ unsigned long vsid_modulus = ((1UL << vsid_bits) - 1);
+ /*
+ * We have same multipler for both 256 and 1T segements now
+ */
+ vsid = protovsid * vsid_multiplier;
+ vsid = (vsid >> vsid_bits) + (vsid & vsid_modulus);
+ return (vsid + ((vsid + 1) >> vsid_bits)) & vsid_modulus;
+}
+
#endif /* 1 */
/* Returns the segment size indicator for a user address */
@@ -656,36 +688,56 @@ static inline int user_segment_size(unsigned long addr)
static inline unsigned long get_vsid(unsigned long context, unsigned long ea,
int ssize)
{
+ unsigned long va_bits = VA_BITS;
+ unsigned long vsid_bits;
+ unsigned long protovsid;
+
/*
* Bad address. We return VSID 0 for that
*/
if ((ea & ~REGION_MASK) >= H_PGTABLE_RANGE)
return 0;
- if (ssize == MMU_SEGSIZE_256M)
- return vsid_scramble((context << ESID_BITS)
- | ((ea >> SID_SHIFT) & ESID_BITS_MASK), 256M);
- return vsid_scramble((context << ESID_BITS_1T)
- | ((ea >> SID_SHIFT_1T) & ESID_BITS_1T_MASK), 1T);
+ if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
+ va_bits = 65;
+
+ if (ssize == MMU_SEGSIZE_256M) {
+ vsid_bits = va_bits - SID_SHIFT;
+ protovsid = (context << ESID_BITS) |
+ ((ea >> SID_SHIFT) & ESID_BITS_MASK);
+ return vsid_scramble(protovsid, VSID_MULTIPLIER_256M, vsid_bits);
+ }
+ /* 1T segment */
+ vsid_bits = va_bits - SID_SHIFT_1T;
+ protovsid = (context << ESID_BITS_1T) |
+ ((ea >> SID_SHIFT_1T) & ESID_BITS_1T_MASK);
+ return vsid_scramble(protovsid, VSID_MULTIPLIER_1T, vsid_bits);
}
/*
* This is only valid for addresses >= PAGE_OFFSET
- *
- * For kernel space, we use the top 4 context ids to map address as below
- * 0x7fffc - [ 0xc000000000000000 - 0xc0003fffffffffff ]
- * 0x7fffd - [ 0xd000000000000000 - 0xd0003fffffffffff ]
- * 0x7fffe - [ 0xe000000000000000 - 0xe0003fffffffffff ]
- * 0x7ffff - [ 0xf000000000000000 - 0xf0003fffffffffff ]
*/
static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
{
unsigned long context;
+ if (!is_kernel_addr(ea))
+ return 0;
+
/*
- * kernel take the top 4 context from the available range
+ * For kernel space, we use context ids 1-4 to map the address space as
+ * below:
+ *
+ * 0x00001 - [ 0xc000000000000000 - 0xc0003fffffffffff ]
+ * 0x00002 - [ 0xd000000000000000 - 0xd0003fffffffffff ]
+ * 0x00003 - [ 0xe000000000000000 - 0xe0003fffffffffff ]
+ * 0x00004 - [ 0xf000000000000000 - 0xf0003fffffffffff ]
+ *
+ * So we can compute the context from the region (top nibble) by
+ * subtracting 11, or 0xc - 1.
*/
- context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1;
+ context = (ea >> 60) - KERNEL_REGION_CONTEXT_OFFSET;
+
return get_vsid(context, ea, ssize);
}
diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h
index 805d4105e9bb..77529a3e3811 100644
--- a/arch/powerpc/include/asm/book3s/64/mmu.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu.h
@@ -65,6 +65,8 @@ extern struct patb_entry *partition_tb;
* MAX_USER_CONTEXT * 16 bytes of space.
*/
#define PRTB_SIZE_SHIFT (CONTEXT_BITS + 4)
+#define PRTB_ENTRIES (1ul << CONTEXT_BITS)
+
/*
* Power9 currently only support 64K partition table size.
*/
@@ -73,13 +75,20 @@ extern struct patb_entry *partition_tb;
typedef unsigned long mm_context_id_t;
struct spinlock;
+/* Maximum possible number of NPUs in a system. */
+#define NV_MAX_NPUS 8
+
typedef struct {
mm_context_id_t id;
u16 user_psize; /* page size index */
+ /* NPU NMMU context */
+ struct npu_context *npu_context;
+
#ifdef CONFIG_PPC_MM_SLICES
u64 low_slices_psize; /* SLB page size encodings */
unsigned char high_slices_psize[SLICE_ARRAY_SIZE];
+ unsigned long addr_limit;
#else
u16 sllp; /* SLB page size encoding */
#endif
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 8f4d41936e5a..85bc9875c3be 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -13,6 +13,7 @@
#define _PAGE_BIT_SWAP_TYPE 0
#define _PAGE_RO 0
+#define _PAGE_SHARED 0
#define _PAGE_EXEC 0x00001 /* execute permission */
#define _PAGE_WRITE 0x00002 /* write access allowed */
@@ -37,21 +38,47 @@
#define _RPAGE_RSV3 0x0400000000000000UL
#define _RPAGE_RSV4 0x0200000000000000UL
-#ifdef CONFIG_MEM_SOFT_DIRTY
-#define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */
-#else
-#define _PAGE_SOFT_DIRTY 0x00000
-#endif
-#define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */
+#define _PAGE_PTE 0x4000000000000000UL /* distinguishes PTEs from pointers */
+#define _PAGE_PRESENT 0x8000000000000000UL /* pte contains a translation */
/*
- * For P9 DD1 only, we need to track whether the pte's huge.
+ * Top and bottom bits of RPN which can be used by hash
+ * translation mode, because we expect them to be zero
+ * otherwise.
*/
-#define _PAGE_LARGE _RPAGE_RSV1
+#define _RPAGE_RPN0 0x01000
+#define _RPAGE_RPN1 0x02000
+#define _RPAGE_RPN44 0x0100000000000000UL
+#define _RPAGE_RPN43 0x0080000000000000UL
+#define _RPAGE_RPN42 0x0040000000000000UL
+#define _RPAGE_RPN41 0x0020000000000000UL
+
+/* Max physical address bit as per radix table */
+#define _RPAGE_PA_MAX 57
+/*
+ * Max physical address bit we will use for now.
+ *
+ * This is mostly a hardware limitation and for now Power9 has
+ * a 51 bit limit.
+ *
+ * This is different from the number of physical bit required to address
+ * the last byte of memory. That is defined by MAX_PHYSMEM_BITS.
+ * MAX_PHYSMEM_BITS is a linux limitation imposed by the maximum
+ * number of sections we can support (SECTIONS_SHIFT).
+ *
+ * This is different from Radix page table limitation above and
+ * should always be less than that. The limit is done such that
+ * we can overload the bits between _RPAGE_PA_MAX and _PAGE_PA_MAX
+ * for hash linux page table specific bits.
+ *
+ * In order to be compatible with future hardware generations we keep
+ * some offsets and limit this for now to 53
+ */
+#define _PAGE_PA_MAX 53
-#define _PAGE_PTE (1ul << 62) /* distinguishes PTEs from pointers */
-#define _PAGE_PRESENT (1ul << 63) /* pte contains a translation */
+#define _PAGE_SOFT_DIRTY _RPAGE_SW3 /* software: software dirty tracking */
+#define _PAGE_SPECIAL _RPAGE_SW2 /* software: special page */
/*
* Drivers request for cache inhibited pte mapping using _PAGE_NO_CACHE
* Instead of fixing all of them, add an alternate define which
@@ -59,10 +86,11 @@
*/
#define _PAGE_NO_CACHE _PAGE_TOLERANT
/*
- * We support 57 bit real address in pte. Clear everything above 57, and
- * every thing below PAGE_SHIFT;
+ * We support _RPAGE_PA_MAX bit real address in pte. On the linux side
+ * we are limited by _PAGE_PA_MAX. Clear everything above _PAGE_PA_MAX
+ * and every thing below PAGE_SHIFT;
*/
-#define PTE_RPN_MASK (((1UL << 57) - 1) & (PAGE_MASK))
+#define PTE_RPN_MASK (((1UL << _PAGE_PA_MAX) - 1) & (PAGE_MASK))
/*
* set of bits not changed in pmd_modify. Even though we have hash specific bits
* in here, on radix we expect them to be zero.
@@ -205,10 +233,6 @@ extern unsigned long __pte_frag_nr;
extern unsigned long __pte_frag_size_shift;
#define PTE_FRAG_SIZE_SHIFT __pte_frag_size_shift
#define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT)
-/*
- * Pgtable size used by swapper, init in asm code
- */
-#define MAX_PGD_TABLE_SIZE (sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE)
#define PTRS_PER_PTE (1 << PTE_INDEX_SIZE)
#define PTRS_PER_PMD (1 << PMD_INDEX_SIZE)
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 9e0bb7cd6e22..ac16d1943022 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -11,6 +11,12 @@
#include <asm/book3s/64/radix-4k.h>
#endif
+/*
+ * For P9 DD1 only, we need to track whether the pte's huge.
+ */
+#define R_PAGE_LARGE _RPAGE_RSV1
+
+
#ifndef __ASSEMBLY__
#include <asm/book3s/64/tlbflush-radix.h>
#include <asm/cpu_has_feature.h>
@@ -252,7 +258,7 @@ static inline int radix__pmd_trans_huge(pmd_t pmd)
static inline pmd_t radix__pmd_mkhuge(pmd_t pmd)
{
if (cpu_has_feature(CPU_FTR_POWER9_DD1))
- return __pmd(pmd_val(pmd) | _PAGE_PTE | _PAGE_LARGE);
+ return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE);
return __pmd(pmd_val(pmd) | _PAGE_PTE);
}
static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma,
diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h
index 8ab937771068..abef812de7f8 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -12,6 +12,8 @@
#include <asm/types.h>
#include <asm/ppc-opcode.h>
+#include <linux/string.h>
+#include <linux/kallsyms.h>
/* Flags for create_branch:
* "b" == create_branch(addr, target, 0);
@@ -99,6 +101,45 @@ static inline unsigned long ppc_global_function_entry(void *func)
#endif
}
+/*
+ * Wrapper around kallsyms_lookup() to return function entry address:
+ * - For ABIv1, we lookup the dot variant.
+ * - For ABIv2, we return the local entry point.
+ */
+static inline unsigned long ppc_kallsyms_lookup_name(const char *name)
+{
+ unsigned long addr;
+#ifdef PPC64_ELF_ABI_v1
+ /* check for dot variant */
+ char dot_name[1 + KSYM_NAME_LEN];
+ bool dot_appended = false;
+
+ if (strnlen(name, KSYM_NAME_LEN) >= KSYM_NAME_LEN)
+ return 0;
+
+ if (name[0] != '.') {
+ dot_name[0] = '.';
+ dot_name[1] = '\0';
+ strlcat(dot_name, name, sizeof(dot_name));
+ dot_appended = true;
+ } else {
+ dot_name[0] = '\0';
+ strlcat(dot_name, name, sizeof(dot_name));
+ }
+ addr = kallsyms_lookup_name(dot_name);
+ if (!addr && dot_appended)
+ /* Let's try the original non-dot symbol lookup */
+ addr = kallsyms_lookup_name(name);
+#elif defined(PPC64_ELF_ABI_v2)
+ addr = kallsyms_lookup_name(name);
+ if (addr)
+ addr = ppc_function_entry((void *)addr);
+#else
+ addr = kallsyms_lookup_name(name);
+#endif
+ return addr;
+}
+
#ifdef CONFIG_PPC64
/*
* Some instruction encodings commonly used in dynamic ftracing
diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h
index 155731557c9b..52586f9956bb 100644
--- a/arch/powerpc/include/asm/cpuidle.h
+++ b/arch/powerpc/include/asm/cpuidle.h
@@ -2,13 +2,39 @@
#define _ASM_POWERPC_CPUIDLE_H
#ifdef CONFIG_PPC_POWERNV
-/* Used in powernv idle state management */
+/* Thread state used in powernv idle state management */
#define PNV_THREAD_RUNNING 0
#define PNV_THREAD_NAP 1
#define PNV_THREAD_SLEEP 2
#define PNV_THREAD_WINKLE 3
-#define PNV_CORE_IDLE_LOCK_BIT 0x100
-#define PNV_CORE_IDLE_THREAD_BITS 0x0FF
+
+/*
+ * Core state used in powernv idle for POWER8.
+ *
+ * The lock bit synchronizes updates to the state, as well as parts of the
+ * sleep/wake code (see kernel/idle_book3s.S).
+ *
+ * Bottom 8 bits track the idle state of each thread. Bit is cleared before
+ * the thread executes an idle instruction (nap/sleep/winkle).
+ *
+ * Then there is winkle tracking. A core does not lose complete state
+ * until every thread is in winkle. So the winkle count field counts the
+ * number of threads in winkle (small window of false positives is okay
+ * around the sleep/wake, so long as there are no false negatives).
+ *
+ * When the winkle count reaches 8 (the COUNT_ALL_BIT becomes set), then
+ * the THREAD_WINKLE_BITS are set, which indicate which threads have not
+ * yet woken from the winkle state.
+ */
+#define PNV_CORE_IDLE_LOCK_BIT 0x10000000
+
+#define PNV_CORE_IDLE_WINKLE_COUNT 0x00010000
+#define PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT 0x00080000
+#define PNV_CORE_IDLE_WINKLE_COUNT_BITS 0x000F0000
+#define PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT 8
+#define PNV_CORE_IDLE_THREAD_WINKLE_BITS 0x0000FF00
+
+#define PNV_CORE_IDLE_THREAD_BITS 0x000000FF
/*
* ============================ NOTE =================================
@@ -46,6 +72,7 @@ extern u32 pnv_fastsleep_workaround_at_exit[];
extern u64 pnv_first_deep_stop_state;
+unsigned long pnv_cpu_offline(unsigned int cpu);
int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags);
static inline void report_invalid_psscr_val(u64 psscr_val, int err)
{
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index ab68d0ee7725..1f6847b107e4 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -471,10 +471,11 @@ enum {
CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
CPU_FTR_DSCR | CPU_FTR_SAO | \
CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
- CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
+ CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \
CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300)
-#define CPU_FTRS_POWER9_DD1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1)
+#define CPU_FTRS_POWER9_DD1 ((CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1) & \
+ (~CPU_FTR_SAO))
#define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h
index 378167377065..f70cbfe0ec04 100644
--- a/arch/powerpc/include/asm/dbell.h
+++ b/arch/powerpc/include/asm/dbell.h
@@ -35,33 +35,53 @@ enum ppc_dbell {
#ifdef CONFIG_PPC_BOOK3S
#define PPC_DBELL_MSGTYPE PPC_DBELL_SERVER
-#define SPRN_DOORBELL_CPUTAG SPRN_TIR
-#define PPC_DBELL_TAG_MASK 0x7f
static inline void _ppc_msgsnd(u32 msg)
{
- if (cpu_has_feature(CPU_FTR_HVMODE))
- __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
- else
- __asm__ __volatile__ (PPC_MSGSNDP(%0) : : "r" (msg));
+ __asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGSND(%1), PPC_MSGSNDP(%1), %0)
+ : : "i" (CPU_FTR_HVMODE), "r" (msg));
+}
+
+/* sync before sending message */
+static inline void ppc_msgsnd_sync(void)
+{
+ __asm__ __volatile__ ("sync" : : : "memory");
+}
+
+/* sync after taking message interrupt */
+static inline void ppc_msgsync(void)
+{
+ /* sync is not required when taking messages from the same core */
+ __asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGSYNC " ; lwsync", "", %0)
+ : : "i" (CPU_FTR_HVMODE|CPU_FTR_ARCH_300));
}
#else /* CONFIG_PPC_BOOK3S */
#define PPC_DBELL_MSGTYPE PPC_DBELL
-#define SPRN_DOORBELL_CPUTAG SPRN_PIR
-#define PPC_DBELL_TAG_MASK 0x3fff
static inline void _ppc_msgsnd(u32 msg)
{
__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
}
+/* sync before sending message */
+static inline void ppc_msgsnd_sync(void)
+{
+ __asm__ __volatile__ ("sync" : : : "memory");
+}
+
+/* sync after taking message interrupt */
+static inline void ppc_msgsync(void)
+{
+}
+
#endif /* CONFIG_PPC_BOOK3S */
-extern void doorbell_cause_ipi(int cpu, unsigned long data);
+extern void doorbell_global_ipi(int cpu);
+extern void doorbell_core_ipi(int cpu);
+extern int doorbell_try_core_ipi(int cpu);
extern void doorbell_exception(struct pt_regs *regs);
-extern void doorbell_setup_this_cpu(void);
static inline void ppc_msgsnd(enum ppc_dbell type, u32 flags, u32 tag)
{
diff --git a/arch/powerpc/include/asm/debug.h b/arch/powerpc/include/asm/debug.h
index 86308f177f2d..5d5af3fddfd8 100644
--- a/arch/powerpc/include/asm/debug.h
+++ b/arch/powerpc/include/asm/debug.h
@@ -8,8 +8,6 @@
struct pt_regs;
-extern struct dentry *powerpc_debugfs_root;
-
#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE)
extern int (*__debugger)(struct pt_regs *regs);
diff --git a/arch/powerpc/include/asm/debugfs.h b/arch/powerpc/include/asm/debugfs.h
new file mode 100644
index 000000000000..4f3b39f3e3d2
--- /dev/null
+++ b/arch/powerpc/include/asm/debugfs.h
@@ -0,0 +1,17 @@
+#ifndef _ASM_POWERPC_DEBUGFS_H
+#define _ASM_POWERPC_DEBUGFS_H
+
+/*
+ * Copyright 2017, Michael Ellerman, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/debugfs.h>
+
+extern struct dentry *powerpc_debugfs_root;
+
+#endif /* _ASM_POWERPC_DEBUGFS_H */
diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index ed3beadd2cc5..183d73b6ed99 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -167,17 +167,14 @@ BEGIN_FTR_SECTION_NESTED(943) \
std ra,offset(r13); \
END_FTR_SECTION_NESTED(ftr,ftr,943)
-#define EXCEPTION_PROLOG_0_PACA(area) \
+#define EXCEPTION_PROLOG_0(area) \
+ GET_PACA(r13); \
std r9,area+EX_R9(r13); /* save r9 */ \
OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); \
HMT_MEDIUM; \
std r10,area+EX_R10(r13); /* save r10 - r12 */ \
OPT_GET_SPR(r10, SPRN_CFAR, CPU_FTR_CFAR)
-#define EXCEPTION_PROLOG_0(area) \
- GET_PACA(r13); \
- EXCEPTION_PROLOG_0_PACA(area)
-
#define __EXCEPTION_PROLOG_1(area, extra, vec) \
OPT_SAVE_REG_TO_PACA(area+EX_PPR, r9, CPU_FTR_HAS_PPR); \
OPT_SAVE_REG_TO_PACA(area+EX_CFAR, r10, CPU_FTR_CFAR); \
@@ -203,17 +200,26 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
#define EXCEPTION_PROLOG_PSERIES_1(label, h) \
__EXCEPTION_PROLOG_PSERIES_1(label, h)
+/* _NORI variant keeps MSR_RI clear */
+#define __EXCEPTION_PROLOG_PSERIES_1_NORI(label, h) \
+ ld r10,PACAKMSR(r13); /* get MSR value for kernel */ \
+ xori r10,r10,MSR_RI; /* Clear MSR_RI */ \
+ mfspr r11,SPRN_##h##SRR0; /* save SRR0 */ \
+ LOAD_HANDLER(r12,label) \
+ mtspr SPRN_##h##SRR0,r12; \
+ mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \
+ mtspr SPRN_##h##SRR1,r10; \
+ h##rfid; \
+ b . /* prevent speculative execution */
+
+#define EXCEPTION_PROLOG_PSERIES_1_NORI(label, h) \
+ __EXCEPTION_PROLOG_PSERIES_1_NORI(label, h)
+
#define EXCEPTION_PROLOG_PSERIES(area, label, h, extra, vec) \
EXCEPTION_PROLOG_0(area); \
EXCEPTION_PROLOG_1(area, extra, vec); \
EXCEPTION_PROLOG_PSERIES_1(label, h);
-/* Have the PACA in r13 already */
-#define EXCEPTION_PROLOG_PSERIES_PACA(area, label, h, extra, vec) \
- EXCEPTION_PROLOG_0_PACA(area); \
- EXCEPTION_PROLOG_1(area, extra, vec); \
- EXCEPTION_PROLOG_PSERIES_1(label, h);
-
#define __KVMTEST(h, n) \
lbz r10,HSTATE_IN_GUEST(r13); \
cmpwi r10,0; \
@@ -256,11 +262,6 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
ld r9,area+EX_R9(r13); \
bctr
-#define BRANCH_TO_KVM(reg, label) \
- __LOAD_FAR_HANDLER(reg, label); \
- mtctr reg; \
- bctr
-
#else
#define BRANCH_TO_COMMON(reg, label) \
b label
@@ -268,15 +269,18 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
#define BRANCH_LINK_TO_FAR(label) \
bl label
-#define BRANCH_TO_KVM(reg, label) \
- b label
-
#define __BRANCH_TO_KVM_EXIT(area, label) \
ld r9,area+EX_R9(r13); \
b label
#endif
+/* Do not enable RI */
+#define EXCEPTION_PROLOG_PSERIES_NORI(area, label, h, extra, vec) \
+ EXCEPTION_PROLOG_0(area); \
+ EXCEPTION_PROLOG_1(area, extra, vec); \
+ EXCEPTION_PROLOG_PSERIES_1_NORI(label, h);
+
#define __KVM_HANDLER(area, h, n) \
BEGIN_FTR_SECTION_NESTED(947) \
@@ -325,6 +329,15 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
#define NOTEST(n)
+#define EXCEPTION_PROLOG_COMMON_1() \
+ std r9,_CCR(r1); /* save CR in stackframe */ \
+ std r11,_NIP(r1); /* save SRR0 in stackframe */ \
+ std r12,_MSR(r1); /* save SRR1 in stackframe */ \
+ std r10,0(r1); /* make stack chain pointer */ \
+ std r0,GPR0(r1); /* save r0 in stackframe */ \
+ std r10,GPR1(r1); /* save r1 in stackframe */ \
+
+
/*
* The common exception prolog is used for all except a few exceptions
* such as a segment miss on a kernel address. We have to be prepared
@@ -349,12 +362,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
addi r3,r13,area; /* r3 -> where regs are saved*/ \
RESTORE_CTR(r1, area); \
b bad_stack; \
-3: std r9,_CCR(r1); /* save CR in stackframe */ \
- std r11,_NIP(r1); /* save SRR0 in stackframe */ \
- std r12,_MSR(r1); /* save SRR1 in stackframe */ \
- std r10,0(r1); /* make stack chain pointer */ \
- std r0,GPR0(r1); /* save r0 in stackframe */ \
- std r10,GPR1(r1); /* save r1 in stackframe */ \
+3: EXCEPTION_PROLOG_COMMON_1(); \
beq 4f; /* if from kernel mode */ \
ACCOUNT_CPU_USER_ENTRY(r13, r9, r10); \
SAVE_PPR(area, r9, r10); \
@@ -522,7 +530,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
#define MASKABLE_RELON_EXCEPTION_HV_OOL(vec, label) \
EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_HV, vec); \
- EXCEPTION_PROLOG_PSERIES_1(label, EXC_HV)
+ EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_HV)
/*
* Our exception common code can be passed various "additions"
@@ -547,26 +555,39 @@ BEGIN_FTR_SECTION \
beql ppc64_runlatch_on_trampoline; \
END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
-#define EXCEPTION_COMMON(trap, label, hdlr, ret, additions) \
- EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN); \
+#define EXCEPTION_COMMON(area, trap, label, hdlr, ret, additions) \
+ EXCEPTION_PROLOG_COMMON(trap, area); \
/* Volatile regs are potentially clobbered here */ \
additions; \
addi r3,r1,STACK_FRAME_OVERHEAD; \
bl hdlr; \
b ret
+/*
+ * Exception where stack is already set in r1, r1 is saved in r10, and it
+ * continues rather than returns.
+ */
+#define EXCEPTION_COMMON_NORET_STACK(area, trap, label, hdlr, additions) \
+ EXCEPTION_PROLOG_COMMON_1(); \
+ EXCEPTION_PROLOG_COMMON_2(area); \
+ EXCEPTION_PROLOG_COMMON_3(trap); \
+ /* Volatile regs are potentially clobbered here */ \
+ additions; \
+ addi r3,r1,STACK_FRAME_OVERHEAD; \
+ bl hdlr
+
#define STD_EXCEPTION_COMMON(trap, label, hdlr) \
- EXCEPTION_COMMON(trap, label, hdlr, ret_from_except, \
- ADD_NVGPRS;ADD_RECONCILE)
+ EXCEPTION_COMMON(PACA_EXGEN, trap, label, hdlr, \
+ ret_from_except, ADD_NVGPRS;ADD_RECONCILE)
/*
* Like STD_EXCEPTION_COMMON, but for exceptions that can occur
* in the idle task and therefore need the special idle handling
* (finish nap and runlatch)
*/
-#define STD_EXCEPTION_COMMON_ASYNC(trap, label, hdlr) \
- EXCEPTION_COMMON(trap, label, hdlr, ret_from_except_lite, \
- FINISH_NAP;ADD_RECONCILE;RUNLATCH_ON)
+#define STD_EXCEPTION_COMMON_ASYNC(trap, label, hdlr) \
+ EXCEPTION_COMMON(PACA_EXGEN, trap, label, hdlr, \
+ ret_from_except_lite, FINISH_NAP;ADD_RECONCILE;RUNLATCH_ON)
/*
* When the idle code in power4_idle puts the CPU into NAP mode,
diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h
index ddf54f5bbdd1..2de2319b99e2 100644
--- a/arch/powerpc/include/asm/feature-fixups.h
+++ b/arch/powerpc/include/asm/feature-fixups.h
@@ -66,6 +66,9 @@ label##5: \
#define END_FTR_SECTION(msk, val) \
END_FTR_SECTION_NESTED(msk, val, 97)
+#define END_FTR_SECTION_NESTED_IFSET(msk, label) \
+ END_FTR_SECTION_NESTED((msk), (msk), label)
+
#define END_FTR_SECTION_IFSET(msk) END_FTR_SECTION((msk), (msk))
#define END_FTR_SECTION_IFCLR(msk) END_FTR_SECTION((msk), 0)
diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h
index 5067048daad4..86eb87382031 100644
--- a/arch/powerpc/include/asm/head-64.h
+++ b/arch/powerpc/include/asm/head-64.h
@@ -213,6 +213,7 @@ name:
USE_TEXT_SECTION(); \
.balign IFETCH_ALIGN_BYTES; \
.global name; \
+ _ASM_NOKPROBE_SYMBOL(name); \
DEFINE_FIXED_SYMBOL(name); \
name:
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index 3cc12a86ef5d..d73755fafbb0 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -377,16 +377,6 @@ long plpar_hcall_raw(unsigned long opcode, unsigned long *retbuf, ...);
long plpar_hcall9(unsigned long opcode, unsigned long *retbuf, ...);
long plpar_hcall9_raw(unsigned long opcode, unsigned long *retbuf, ...);
-/* For hcall instrumentation. One structure per-hcall, per-CPU */
-struct hcall_stats {
- unsigned long num_calls; /* number of calls (on this CPU) */
- unsigned long tb_total; /* total wall time (mftb) of calls. */
- unsigned long purr_total; /* total cpu time (PURR) of calls. */
- unsigned long tb_start;
- unsigned long purr_start;
-};
-#define HCALL_STAT_ARRAY_SIZE ((MAX_HCALL_OPCODE >> 2) + 1)
-
struct hvcall_mpp_data {
unsigned long entitled_mem;
unsigned long mapped_mem;
diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 5ed292431b5b..422f99cf9924 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -25,8 +25,6 @@ extern struct pci_dev *isa_bridge_pcidev;
#endif
#include <linux/device.h>
-#include <linux/io.h>
-
#include <linux/compiler.h>
#include <asm/page.h>
#include <asm/byteorder.h>
@@ -192,24 +190,8 @@ DEF_MMIO_OUT_D(out_le32, 32, stw);
#endif /* __BIG_ENDIAN */
-/*
- * Cache inhibitied accessors for use in real mode, you don't want to use these
- * unless you know what you're doing.
- *
- * NB. These use the cpu byte ordering.
- */
-DEF_MMIO_OUT_X(out_rm8, 8, stbcix);
-DEF_MMIO_OUT_X(out_rm16, 16, sthcix);
-DEF_MMIO_OUT_X(out_rm32, 32, stwcix);
-DEF_MMIO_IN_X(in_rm8, 8, lbzcix);
-DEF_MMIO_IN_X(in_rm16, 16, lhzcix);
-DEF_MMIO_IN_X(in_rm32, 32, lwzcix);
-
#ifdef __powerpc64__
-DEF_MMIO_OUT_X(out_rm64, 64, stdcix);
-DEF_MMIO_IN_X(in_rm64, 64, ldcix);
-
#ifdef __BIG_ENDIAN__
DEF_MMIO_OUT_D(out_be64, 64, std);
DEF_MMIO_IN_D(in_be64, 64, ld);
@@ -242,35 +224,6 @@ static inline void out_be64(volatile u64 __iomem *addr, u64 val)
#endif
#endif /* __powerpc64__ */
-
-/*
- * Simple Cache inhibited accessors
- * Unlike the DEF_MMIO_* macros, these don't include any h/w memory
- * barriers, callers need to manage memory barriers on their own.
- * These can only be used in hypervisor real mode.
- */
-
-static inline u32 _lwzcix(unsigned long addr)
-{
- u32 ret;
-
- __asm__ __volatile__("lwzcix %0,0, %1"
- : "=r" (ret) : "r" (addr) : "memory");
- return ret;
-}
-
-static inline void _stbcix(u64 addr, u8 val)
-{
- __asm__ __volatile__("stbcix %0,0,%1"
- : : "r" (val), "r" (addr) : "memory");
-}
-
-static inline void _stwcix(u64 addr, u32 val)
-{
- __asm__ __volatile__("stwcix %0,0,%1"
- : : "r" (val), "r" (addr) : "memory");
-}
-
/*
* Low level IO stream instructions are defined out of line for now
*/
@@ -417,15 +370,64 @@ static inline void __raw_writeq(unsigned long v, volatile void __iomem *addr)
}
/*
- * Real mode version of the above. stdcix is only supposed to be used
- * in hypervisor real mode as per the architecture spec.
+ * Real mode versions of the above. Those instructions are only supposed
+ * to be used in hypervisor real mode as per the architecture spec.
*/
+static inline void __raw_rm_writeb(u8 val, volatile void __iomem *paddr)
+{
+ __asm__ __volatile__("stbcix %0,0,%1"
+ : : "r" (val), "r" (paddr) : "memory");
+}
+
+static inline void __raw_rm_writew(u16 val, volatile void __iomem *paddr)
+{
+ __asm__ __volatile__("sthcix %0,0,%1"
+ : : "r" (val), "r" (paddr) : "memory");
+}
+
+static inline void __raw_rm_writel(u32 val, volatile void __iomem *paddr)
+{
+ __asm__ __volatile__("stwcix %0,0,%1"
+ : : "r" (val), "r" (paddr) : "memory");
+}
+
static inline void __raw_rm_writeq(u64 val, volatile void __iomem *paddr)
{
__asm__ __volatile__("stdcix %0,0,%1"
: : "r" (val), "r" (paddr) : "memory");
}
+static inline u8 __raw_rm_readb(volatile void __iomem *paddr)
+{
+ u8 ret;
+ __asm__ __volatile__("lbzcix %0,0, %1"
+ : "=r" (ret) : "r" (paddr) : "memory");
+ return ret;
+}
+
+static inline u16 __raw_rm_readw(volatile void __iomem *paddr)
+{
+ u16 ret;
+ __asm__ __volatile__("lhzcix %0,0, %1"
+ : "=r" (ret) : "r" (paddr) : "memory");
+ return ret;
+}
+
+static inline u32 __raw_rm_readl(volatile void __iomem *paddr)
+{
+ u32 ret;
+ __asm__ __volatile__("lwzcix %0,0, %1"
+ : "=r" (ret) : "r" (paddr) : "memory");
+ return ret;
+}
+
+static inline u64 __raw_rm_readq(volatile void __iomem *paddr)
+{
+ u64 ret;
+ __asm__ __volatile__("ldcix %0,0, %1"
+ : "=r" (ret) : "r" (paddr) : "memory");
+ return ret;
+}
#endif /* __powerpc64__ */
/*
@@ -757,6 +759,8 @@ extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size,
extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size);
#define ioremap_nocache(addr, size) ioremap((addr), (size))
#define ioremap_uc(addr, size) ioremap((addr), (size))
+#define ioremap_cache(addr, size) \
+ ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL))
extern void iounmap(volatile void __iomem *addr);
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 2c1d50792944..d96142572e6d 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -64,6 +64,11 @@ struct iommu_table_ops {
long index,
unsigned long *hpa,
enum dma_data_direction *direction);
+ /* Real mode */
+ int (*exchange_rm)(struct iommu_table *tbl,
+ long index,
+ unsigned long *hpa,
+ enum dma_data_direction *direction);
#endif
void (*clear)(struct iommu_table *tbl,
long index, long npages);
@@ -114,6 +119,7 @@ struct iommu_table {
struct list_head it_group_list;/* List of iommu_table_group_link */
unsigned long *it_userspace; /* userspace view of the table */
struct iommu_table_ops *it_ops;
+ struct kref it_kref;
};
#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
@@ -146,8 +152,8 @@ static inline void *get_iommu_table_base(struct device *dev)
extern int dma_iommu_dma_supported(struct device *dev, u64 mask);
-/* Frees table for an individual device node */
-extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
+extern struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl);
+extern int iommu_tce_table_put(struct iommu_table *tbl);
/* Initializes an iommu_table based in values set in the passed-in
* structure
@@ -208,6 +214,8 @@ extern void iommu_del_device(struct device *dev);
extern int __init tce_iommu_bus_notifier_init(void);
extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
unsigned long *hpa, enum dma_data_direction *direction);
+extern long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
+ unsigned long *hpa, enum dma_data_direction *direction);
#else
static inline void iommu_register_group(struct iommu_table_group *table_group,
int pci_domain_number,
diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h
index 0503c98b2117..a83821f33ea3 100644
--- a/arch/powerpc/include/asm/kprobes.h
+++ b/arch/powerpc/include/asm/kprobes.h
@@ -61,59 +61,6 @@ extern kprobe_opcode_t optprobe_template_end[];
#define MAX_OPTINSN_SIZE (optprobe_template_end - optprobe_template_entry)
#define RELATIVEJUMP_SIZE sizeof(kprobe_opcode_t) /* 4 bytes */
-#ifdef PPC64_ELF_ABI_v2
-/* PPC64 ABIv2 needs local entry point */
-#define kprobe_lookup_name(name, addr) \
-{ \
- addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); \
- if (addr) \
- addr = (kprobe_opcode_t *)ppc_function_entry(addr); \
-}
-#elif defined(PPC64_ELF_ABI_v1)
-/*
- * 64bit powerpc ABIv1 uses function descriptors:
- * - Check for the dot variant of the symbol first.
- * - If that fails, try looking up the symbol provided.
- *
- * This ensures we always get to the actual symbol and not the descriptor.
- * Also handle <module:symbol> format.
- */
-#define kprobe_lookup_name(name, addr) \
-{ \
- char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN]; \
- const char *modsym; \
- bool dot_appended = false; \
- if ((modsym = strchr(name, ':')) != NULL) { \
- modsym++; \
- if (*modsym != '\0' && *modsym != '.') { \
- /* Convert to <module:.symbol> */ \
- strncpy(dot_name, name, modsym - name); \
- dot_name[modsym - name] = '.'; \
- dot_name[modsym - name + 1] = '\0'; \
- strncat(dot_name, modsym, \
- sizeof(dot_name) - (modsym - name) - 2);\
- dot_appended = true; \
- } else { \
- dot_name[0] = '\0'; \
- strncat(dot_name, name, sizeof(dot_name) - 1); \
- } \
- } else if (name[0] != '.') { \
- dot_name[0] = '.'; \
- dot_name[1] = '\0'; \
- strncat(dot_name, name, KSYM_NAME_LEN - 2); \
- dot_appended = true; \
- } else { \
- dot_name[0] = '\0'; \
- strncat(dot_name, name, KSYM_NAME_LEN - 1); \
- } \
- addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name); \
- if (!addr && dot_appended) { \
- /* Let's try the original non-dot symbol lookup */ \
- addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); \
- } \
-}
-#endif
-
#define flush_insn_slot(p) do { } while (0)
#define kretprobe_blacklist_size 0
@@ -156,6 +103,16 @@ extern int kprobe_exceptions_notify(struct notifier_block *self,
extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
extern int kprobe_handler(struct pt_regs *regs);
extern int kprobe_post_handler(struct pt_regs *regs);
+#ifdef CONFIG_KPROBES_ON_FTRACE
+extern int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb);
+#else
+static inline int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ return 0;
+}
+#endif
#else
static inline int kprobe_handler(struct pt_regs *regs) { return 0; }
static inline int kprobe_post_handler(struct pt_regs *regs) { return 0; }
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index d9b48f5bb606..d55c7f881ce7 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -49,8 +49,6 @@ static inline bool kvm_is_radix(struct kvm *kvm)
#define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */
#endif
-#define VRMA_VSID 0x1ffffffUL /* 1TB VSID reserved for VRMA */
-
/*
* We use a lock bit in HPTE dword 0 to synchronize updates and
* accesses to each HPTE, and another bit to indicate non-present
diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
index d318d432caa9..0593d9479f74 100644
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -110,7 +110,7 @@ struct kvmppc_host_state {
u8 ptid;
struct kvm_vcpu *kvm_vcpu;
struct kvmppc_vcore *kvm_vcore;
- unsigned long xics_phys;
+ void __iomem *xics_phys;
u32 saved_xirr;
u64 dabr;
u64 host_mmcr[7]; /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index dd11c4c8c56a..c3877992eff9 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -409,7 +409,7 @@ struct openpic;
extern void kvm_cma_reserve(void) __init;
static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
{
- paca[cpu].kvm_hstate.xics_phys = addr;
+ paca[cpu].kvm_hstate.xics_phys = (void __iomem *)addr;
}
static inline u32 kvmppc_get_xics_latch(void)
@@ -478,8 +478,6 @@ extern void kvmppc_free_host_rm_ops(void);
extern void kvmppc_free_pimap(struct kvm *kvm);
extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
-extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
-extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd);
extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
@@ -507,12 +505,6 @@ static inline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
{ return 0; }
static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
-static inline int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu,
- unsigned long server)
- { return -EINVAL; }
-static inline int kvm_vm_ioctl_xics_irq(struct kvm *kvm,
- struct kvm_irq_level *args)
- { return -ENOTTY; }
static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
{ return 0; }
#endif
diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index ed62efe01e49..81eff8631434 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -24,97 +24,6 @@
#include <linux/bitops.h>
-/*
- * Machine Check bits on power7 and power8
- */
-#define P7_SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42)) /* P8 too */
-
-/* SRR1 bits for machine check (On Power7 and Power8) */
-#define P7_SRR1_MC_IFETCH(srr1) ((srr1) & PPC_BITMASK(43, 45)) /* P8 too */
-
-#define P7_SRR1_MC_IFETCH_UE (0x1 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_SLB_PARITY (0x2 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_SLB_MULTIHIT (0x3 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_SLB_BOTH (0x4 << PPC_BITLSHIFT(45))
-#define P7_SRR1_MC_IFETCH_TLB_MULTIHIT (0x5 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_UE_TLB_RELOAD (0x6 << PPC_BITLSHIFT(45)) /* P8 too */
-#define P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL (0x7 << PPC_BITLSHIFT(45))
-
-/* SRR1 bits for machine check (On Power8) */
-#define P8_SRR1_MC_IFETCH_ERAT_MULTIHIT (0x4 << PPC_BITLSHIFT(45))
-
-/* DSISR bits for machine check (On Power7 and Power8) */
-#define P7_DSISR_MC_UE (PPC_BIT(48)) /* P8 too */
-#define P7_DSISR_MC_UE_TABLEWALK (PPC_BIT(49)) /* P8 too */
-#define P7_DSISR_MC_ERAT_MULTIHIT (PPC_BIT(52)) /* P8 too */
-#define P7_DSISR_MC_TLB_MULTIHIT_MFTLB (PPC_BIT(53)) /* P8 too */
-#define P7_DSISR_MC_SLB_PARITY_MFSLB (PPC_BIT(55)) /* P8 too */
-#define P7_DSISR_MC_SLB_MULTIHIT (PPC_BIT(56)) /* P8 too */
-#define P7_DSISR_MC_SLB_MULTIHIT_PARITY (PPC_BIT(57)) /* P8 too */
-
-/*
- * DSISR bits for machine check (Power8) in addition to above.
- * Secondary DERAT Multihit
- */
-#define P8_DSISR_MC_ERAT_MULTIHIT_SEC (PPC_BIT(54))
-
-/* SLB error bits */
-#define P7_DSISR_MC_SLB_ERRORS (P7_DSISR_MC_ERAT_MULTIHIT | \
- P7_DSISR_MC_SLB_PARITY_MFSLB | \
- P7_DSISR_MC_SLB_MULTIHIT | \
- P7_DSISR_MC_SLB_MULTIHIT_PARITY)
-
-#define P8_DSISR_MC_SLB_ERRORS (P7_DSISR_MC_SLB_ERRORS | \
- P8_DSISR_MC_ERAT_MULTIHIT_SEC)
-
-/*
- * Machine Check bits on power9
- */
-#define P9_SRR1_MC_LOADSTORE(srr1) (((srr1) >> PPC_BITLSHIFT(42)) & 1)
-
-#define P9_SRR1_MC_IFETCH(srr1) ( \
- PPC_BITEXTRACT(srr1, 45, 0) | \
- PPC_BITEXTRACT(srr1, 44, 1) | \
- PPC_BITEXTRACT(srr1, 43, 2) | \
- PPC_BITEXTRACT(srr1, 36, 3) )
-
-/* 0 is reserved */
-#define P9_SRR1_MC_IFETCH_UE 1
-#define P9_SRR1_MC_IFETCH_SLB_PARITY 2
-#define P9_SRR1_MC_IFETCH_SLB_MULTIHIT 3
-#define P9_SRR1_MC_IFETCH_ERAT_MULTIHIT 4
-#define P9_SRR1_MC_IFETCH_TLB_MULTIHIT 5
-#define P9_SRR1_MC_IFETCH_UE_TLB_RELOAD 6
-/* 7 is reserved */
-#define P9_SRR1_MC_IFETCH_LINK_TIMEOUT 8
-#define P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT 9
-/* 10 ? */
-#define P9_SRR1_MC_IFETCH_RA 11
-#define P9_SRR1_MC_IFETCH_RA_TABLEWALK 12
-#define P9_SRR1_MC_IFETCH_RA_ASYNC_STORE 13
-#define P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT 14
-#define P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN 15
-
-/* DSISR bits for machine check (On Power9) */
-#define P9_DSISR_MC_UE (PPC_BIT(48))
-#define P9_DSISR_MC_UE_TABLEWALK (PPC_BIT(49))
-#define P9_DSISR_MC_LINK_LOAD_TIMEOUT (PPC_BIT(50))
-#define P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT (PPC_BIT(51))
-#define P9_DSISR_MC_ERAT_MULTIHIT (PPC_BIT(52))
-#define P9_DSISR_MC_TLB_MULTIHIT_MFTLB (PPC_BIT(53))
-#define P9_DSISR_MC_USER_TLBIE (PPC_BIT(54))
-#define P9_DSISR_MC_SLB_PARITY_MFSLB (PPC_BIT(55))
-#define P9_DSISR_MC_SLB_MULTIHIT_MFSLB (PPC_BIT(56))
-#define P9_DSISR_MC_RA_LOAD (PPC_BIT(57))
-#define P9_DSISR_MC_RA_TABLEWALK (PPC_BIT(58))
-#define P9_DSISR_MC_RA_TABLEWALK_FOREIGN (PPC_BIT(59))
-#define P9_DSISR_MC_RA_FOREIGN (PPC_BIT(60))
-
-/* SLB error bits */
-#define P9_DSISR_MC_SLB_ERRORS (P9_DSISR_MC_ERAT_MULTIHIT | \
- P9_DSISR_MC_SLB_PARITY_MFSLB | \
- P9_DSISR_MC_SLB_MULTIHIT_MFSLB)
-
enum MCE_Version {
MCE_V1 = 1,
};
@@ -298,7 +207,8 @@ extern void save_mce_event(struct pt_regs *regs, long handled,
extern int get_mce_event(struct machine_check_event *mce, bool release);
extern void release_mce_event(void);
extern void machine_check_queue_event(void);
-extern void machine_check_print_event_info(struct machine_check_event *evt);
+extern void machine_check_print_event_info(struct machine_check_event *evt,
+ bool user_mode);
extern uint64_t get_mce_fault_addr(struct machine_check_event *evt);
#endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
index b62a8d43a06c..7ca8d8e80ffa 100644
--- a/arch/powerpc/include/asm/mmu-book3e.h
+++ b/arch/powerpc/include/asm/mmu-book3e.h
@@ -229,11 +229,6 @@ typedef struct {
unsigned int id;
unsigned int active;
unsigned long vdso_base;
-#ifdef CONFIG_PPC_MM_SLICES
- u64 low_slices_psize; /* SLB page size encodings */
- u64 high_slices_psize; /* 4 bits per slice for now */
- u16 user_psize; /* page size index */
-#endif
#ifdef CONFIG_PPC_64K_PAGES
/* for 4K PTE fragment support */
void *pte_frag;
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 065e762fae85..78260409dc9c 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -29,6 +29,10 @@
*/
/*
+ * Support for 68 bit VA space. We added that from ISA 2.05
+ */
+#define MMU_FTR_68_BIT_VA ASM_CONST(0x00002000)
+/*
* Kernel read only support.
* We added the ppp value 0b110 in ISA 2.04.
*/
@@ -109,10 +113,10 @@
#define MMU_FTRS_POWER4 MMU_FTRS_DEFAULT_HPTE_ARCH_V2
#define MMU_FTRS_PPC970 MMU_FTRS_POWER4 | MMU_FTR_TLBIE_CROP_VA
#define MMU_FTRS_POWER5 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
-#define MMU_FTRS_POWER6 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
-#define MMU_FTRS_POWER7 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
-#define MMU_FTRS_POWER8 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
-#define MMU_FTRS_POWER9 MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_KERNEL_RO
+#define MMU_FTRS_POWER6 MMU_FTRS_POWER5 | MMU_FTR_KERNEL_RO | MMU_FTR_68_BIT_VA
+#define MMU_FTRS_POWER7 MMU_FTRS_POWER6
+#define MMU_FTRS_POWER8 MMU_FTRS_POWER6
+#define MMU_FTRS_POWER9 MMU_FTRS_POWER6
#define MMU_FTRS_CELL MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \
MMU_FTR_CI_LARGE_PAGE
#define MMU_FTRS_PA6T MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \
@@ -136,7 +140,7 @@ enum {
MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL |
MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE |
MMU_FTR_1T_SEGMENT | MMU_FTR_TLBIE_CROP_VA |
- MMU_FTR_KERNEL_RO |
+ MMU_FTR_KERNEL_RO | MMU_FTR_68_BIT_VA |
#ifdef CONFIG_PPC_RADIX_MMU
MMU_FTR_TYPE_RADIX |
#endif
@@ -290,7 +294,10 @@ static inline bool early_radix_enabled(void)
#define MMU_PAGE_16G 14
#define MMU_PAGE_64G 15
-/* N.B. we need to change the type of hpte_page_sizes if this gets to be > 16 */
+/*
+ * N.B. we need to change the type of hpte_page_sizes if this gets to be > 16
+ * Also we need to change he type of mm_context.low/high_slices_psize.
+ */
#define MMU_PAGE_COUNT 16
#ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index ecf9885ab660..da7e9432fa8f 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -29,10 +29,14 @@ extern void mm_iommu_init(struct mm_struct *mm);
extern void mm_iommu_cleanup(struct mm_struct *mm);
extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
unsigned long ua, unsigned long size);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(
+ struct mm_struct *mm, unsigned long ua, unsigned long size);
extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
unsigned long ua, unsigned long entries);
extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
unsigned long ua, unsigned long *hpa);
+extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
+ unsigned long ua, unsigned long *hpa);
extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
#endif
@@ -51,7 +55,8 @@ static inline void switch_mmu_context(struct mm_struct *prev,
return switch_slb(tsk, next);
}
-extern int __init_new_context(void);
+extern int hash__alloc_context_id(void);
+extern void hash__reserve_context_id(int id);
extern void __destroy_context(int context_id);
static inline void mmu_context_init(void) { }
#else
@@ -70,8 +75,9 @@ extern void drop_cop(unsigned long acop, struct mm_struct *mm);
* switch_mm is the entry point called from the architecture independent
* code in kernel/sched/core.c
*/
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
+static inline void switch_mm_irqs_off(struct mm_struct *prev,
+ struct mm_struct *next,
+ struct task_struct *tsk)
{
/* Mark this context has been used on the new CPU */
if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next)))
@@ -110,6 +116,18 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
switch_mmu_context(prev, next, tsk);
}
+static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ switch_mm_irqs_off(prev, next, tsk);
+ local_irq_restore(flags);
+}
+#define switch_mm_irqs_off switch_mm_irqs_off
+
+
#define deactivate_mm(tsk,mm) do { } while (0)
/*
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index c7f927e67d14..f0ff384d4ca5 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -88,11 +88,6 @@
#include <asm/nohash/pte-book3e.h>
#include <asm/pte-common.h>
-#ifdef CONFIG_PPC_MM_SLICES
-#define HAVE_ARCH_UNMAPPED_AREA
-#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
-#endif /* CONFIG_PPC_MM_SLICES */
-
#ifndef __ASSEMBLY__
/* pte_clear moved to later in this file */
diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h
index a0aa285869b5..cb3e6242a78c 100644
--- a/arch/powerpc/include/asm/opal-api.h
+++ b/arch/powerpc/include/asm/opal-api.h
@@ -40,6 +40,8 @@
#define OPAL_I2C_ARBT_LOST -22
#define OPAL_I2C_NACK_RCVD -23
#define OPAL_I2C_STOP_ERR -24
+#define OPAL_XIVE_PROVISIONING -31
+#define OPAL_XIVE_FREE_ACTIVE -32
/* API Tokens (in r0) */
#define OPAL_INVALID_CALL -1
@@ -168,7 +170,27 @@
#define OPAL_INT_SET_MFRR 125
#define OPAL_PCI_TCE_KILL 126
#define OPAL_NMMU_SET_PTCR 127
-#define OPAL_LAST 127
+#define OPAL_XIVE_RESET 128
+#define OPAL_XIVE_GET_IRQ_INFO 129
+#define OPAL_XIVE_GET_IRQ_CONFIG 130
+#define OPAL_XIVE_SET_IRQ_CONFIG 131
+#define OPAL_XIVE_GET_QUEUE_INFO 132
+#define OPAL_XIVE_SET_QUEUE_INFO 133
+#define OPAL_XIVE_DONATE_PAGE 134
+#define OPAL_XIVE_ALLOCATE_VP_BLOCK 135
+#define OPAL_XIVE_FREE_VP_BLOCK 136
+#define OPAL_XIVE_GET_VP_INFO 137
+#define OPAL_XIVE_SET_VP_INFO 138
+#define OPAL_XIVE_ALLOCATE_IRQ 139
+#define OPAL_XIVE_FREE_IRQ 140
+#define OPAL_XIVE_SYNC 141
+#define OPAL_XIVE_DUMP 142
+#define OPAL_XIVE_RESERVED3 143
+#define OPAL_XIVE_RESERVED4 144
+#define OPAL_NPU_INIT_CONTEXT 146
+#define OPAL_NPU_DESTROY_CONTEXT 147
+#define OPAL_NPU_MAP_LPAR 148
+#define OPAL_LAST 148
/* Device tree flags */
@@ -928,6 +950,59 @@ enum {
OPAL_PCI_TCE_KILL_ALL,
};
+/* The xive operation mode indicates the active "API" and
+ * corresponds to the "mode" parameter of the opal_xive_reset()
+ * call
+ */
+enum {
+ OPAL_XIVE_MODE_EMU = 0,
+ OPAL_XIVE_MODE_EXPL = 1,
+};
+
+/* Flags for OPAL_XIVE_GET_IRQ_INFO */
+enum {
+ OPAL_XIVE_IRQ_TRIGGER_PAGE = 0x00000001,
+ OPAL_XIVE_IRQ_STORE_EOI = 0x00000002,
+ OPAL_XIVE_IRQ_LSI = 0x00000004,
+ OPAL_XIVE_IRQ_SHIFT_BUG = 0x00000008,
+ OPAL_XIVE_IRQ_MASK_VIA_FW = 0x00000010,
+ OPAL_XIVE_IRQ_EOI_VIA_FW = 0x00000020,
+};
+
+/* Flags for OPAL_XIVE_GET/SET_QUEUE_INFO */
+enum {
+ OPAL_XIVE_EQ_ENABLED = 0x00000001,
+ OPAL_XIVE_EQ_ALWAYS_NOTIFY = 0x00000002,
+ OPAL_XIVE_EQ_ESCALATE = 0x00000004,
+};
+
+/* Flags for OPAL_XIVE_GET/SET_VP_INFO */
+enum {
+ OPAL_XIVE_VP_ENABLED = 0x00000001,
+};
+
+/* "Any chip" replacement for chip ID for allocation functions */
+enum {
+ OPAL_XIVE_ANY_CHIP = 0xffffffff,
+};
+
+/* Xive sync options */
+enum {
+ /* This bits are cumulative, arg is a girq */
+ XIVE_SYNC_EAS = 0x00000001, /* Sync irq source */
+ XIVE_SYNC_QUEUE = 0x00000002, /* Sync irq target */
+};
+
+/* Dump options */
+enum {
+ XIVE_DUMP_TM_HYP = 0,
+ XIVE_DUMP_TM_POOL = 1,
+ XIVE_DUMP_TM_OS = 2,
+ XIVE_DUMP_TM_USER = 3,
+ XIVE_DUMP_VP = 4,
+ XIVE_DUMP_EMU_STATE = 5,
+};
+
#endif /* __ASSEMBLY__ */
#endif /* __OPAL_API_H */
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 1ff03a6da76e..588fb1c23af9 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -29,6 +29,11 @@ extern struct device_node *opal_node;
/* API functions */
int64_t opal_invalid_call(void);
+int64_t opal_npu_destroy_context(uint64_t phb_id, uint64_t pid, uint64_t bdf);
+int64_t opal_npu_init_context(uint64_t phb_id, int pasid, uint64_t msr,
+ uint64_t bdf);
+int64_t opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid,
+ uint64_t lpcr);
int64_t opal_console_write(int64_t term_number, __be64 *length,
const uint8_t *buffer);
int64_t opal_console_read(int64_t term_number, __be64 *length,
@@ -226,6 +231,42 @@ int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t kill_type,
uint32_t pe_num, uint32_t tce_size,
uint64_t dma_addr, uint32_t npages);
int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr);
+int64_t opal_xive_reset(uint64_t version);
+int64_t opal_xive_get_irq_info(uint32_t girq,
+ __be64 *out_flags,
+ __be64 *out_eoi_page,
+ __be64 *out_trig_page,
+ __be32 *out_esb_shift,
+ __be32 *out_src_chip);
+int64_t opal_xive_get_irq_config(uint32_t girq, __be64 *out_vp,
+ uint8_t *out_prio, __be32 *out_lirq);
+int64_t opal_xive_set_irq_config(uint32_t girq, uint64_t vp, uint8_t prio,
+ uint32_t lirq);
+int64_t opal_xive_get_queue_info(uint64_t vp, uint32_t prio,
+ __be64 *out_qpage,
+ __be64 *out_qsize,
+ __be64 *out_qeoi_page,
+ __be32 *out_escalate_irq,
+ __be64 *out_qflags);
+int64_t opal_xive_set_queue_info(uint64_t vp, uint32_t prio,
+ uint64_t qpage,
+ uint64_t qsize,
+ uint64_t qflags);
+int64_t opal_xive_donate_page(uint32_t chip_id, uint64_t addr);
+int64_t opal_xive_alloc_vp_block(uint32_t alloc_order);
+int64_t opal_xive_free_vp_block(uint64_t vp);
+int64_t opal_xive_get_vp_info(uint64_t vp,
+ __be64 *out_flags,
+ __be64 *out_cam_value,
+ __be64 *out_report_cl_pair,
+ __be32 *out_chip_id);
+int64_t opal_xive_set_vp_info(uint64_t vp,
+ uint64_t flags,
+ uint64_t report_cl_pair);
+int64_t opal_xive_allocate_irq(uint32_t chip_id);
+int64_t opal_xive_free_irq(uint32_t girq);
+int64_t opal_xive_sync(uint32_t type, uint32_t id);
+int64_t opal_xive_dump(uint32_t type, uint32_t id);
/* Internal functions */
extern int early_init_dt_scan_opal(unsigned long node, const char *uname,
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 708c3e592eeb..1c09f8fe2ee8 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -99,7 +99,6 @@ struct paca_struct {
*/
/* used for most interrupts/exceptions */
u64 exgen[13] __attribute__((aligned(0x80)));
- u64 exmc[13]; /* used for machine checks */
u64 exslb[13]; /* used for SLB/segment table misses
* on the linear mapping */
/* SLB related definitions */
@@ -139,6 +138,7 @@ struct paca_struct {
#ifdef CONFIG_PPC_MM_SLICES
u64 mm_ctx_low_slices_psize;
unsigned char mm_ctx_high_slices_psize[SLICE_ARRAY_SIZE];
+ unsigned long addr_limit;
#else
u16 mm_ctx_user_psize;
u16 mm_ctx_sllp;
@@ -172,17 +172,31 @@ struct paca_struct {
u8 thread_mask;
/* Mask to denote subcore sibling threads */
u8 subcore_sibling_mask;
+ /*
+ * Pointer to an array which contains pointer
+ * to the sibling threads' paca.
+ */
+ struct paca_struct **thread_sibling_pacas;
#endif
+#ifdef CONFIG_PPC_STD_MMU_64
+ /* Non-maskable exceptions that are not performance critical */
+ u64 exnmi[13]; /* used for system reset (nmi) */
+ u64 exmc[13]; /* used for machine checks */
+#endif
#ifdef CONFIG_PPC_BOOK3S_64
- /* Exclusive emergency stack pointer for machine check exception. */
+ /* Exclusive stacks for system reset and machine check exception. */
+ void *nmi_emergency_sp;
void *mc_emergency_sp;
+
+ u16 in_nmi; /* In nmi handler */
+
/*
* Flag to check whether we are in machine check early handler
* and already using emergency stack.
*/
u16 in_mce;
- u8 hmi_event_available; /* HMI event is available */
+ u8 hmi_event_available; /* HMI event is available */
#endif
/* Stuff for accurate time accounting */
@@ -206,23 +220,7 @@ struct paca_struct {
#endif
};
-#ifdef CONFIG_PPC_BOOK3S
-static inline void copy_mm_to_paca(mm_context_t *context)
-{
- get_paca()->mm_ctx_id = context->id;
-#ifdef CONFIG_PPC_MM_SLICES
- get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
- memcpy(&get_paca()->mm_ctx_high_slices_psize,
- &context->high_slices_psize, SLICE_ARRAY_SIZE);
-#else
- get_paca()->mm_ctx_user_psize = context->user_psize;
- get_paca()->mm_ctx_sllp = context->sllp;
-#endif
-}
-#else
-static inline void copy_mm_to_paca(mm_context_t *context){}
-#endif
-
+extern void copy_mm_to_paca(struct mm_struct *mm);
extern struct paca_struct *paca;
extern void initialise_paca(struct paca_struct *new_paca, int cpu);
extern void setup_paca(struct paca_struct *new_paca);
diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h
index 3e83d2a20b6f..c4d9654bd637 100644
--- a/arch/powerpc/include/asm/page_64.h
+++ b/arch/powerpc/include/asm/page_64.h
@@ -98,21 +98,7 @@ extern u64 ppc64_pft_size;
#define GET_LOW_SLICE_INDEX(addr) ((addr) >> SLICE_LOW_SHIFT)
#define GET_HIGH_SLICE_INDEX(addr) ((addr) >> SLICE_HIGH_SHIFT)
-/*
- * 1 bit per slice and we have one slice per 1TB
- * Right now we support only 64TB.
- * IF we change this we will have to change the type
- * of high_slices
- */
-#define SLICE_MASK_SIZE 8
-
#ifndef __ASSEMBLY__
-
-struct slice_mask {
- u16 low_slices;
- u64 high_slices;
-};
-
struct mm_struct;
extern unsigned long slice_get_unmapped_area(unsigned long addr,
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index ae0a23091a9b..723bf48e7494 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -38,6 +38,9 @@ struct power_pmu {
unsigned long *valp);
int (*get_alternatives)(u64 event_id, unsigned int flags,
u64 alt[]);
+ void (*get_mem_data_src)(union perf_mem_data_src *dsrc,
+ u32 flags, struct pt_regs *regs);
+ void (*get_mem_weight)(u64 *weight);
u64 (*bhrb_filter_map)(u64 branch_sample_type);
void (*config_bhrb)(u64 pmu_bhrb_filter);
void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]);
diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h
index 0e9c2402dd20..f62797702300 100644
--- a/arch/powerpc/include/asm/powernv.h
+++ b/arch/powerpc/include/asm/powernv.h
@@ -11,9 +11,31 @@
#define _ASM_POWERNV_H
#ifdef CONFIG_PPC_POWERNV
+#define NPU2_WRITE 1
extern void powernv_set_nmmu_ptcr(unsigned long ptcr);
+extern struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
+ unsigned long flags,
+ struct npu_context *(*cb)(struct npu_context *, void *),
+ void *priv);
+extern void pnv_npu2_destroy_context(struct npu_context *context,
+ struct pci_dev *gpdev);
+extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
+ unsigned long *flags, unsigned long *status,
+ int count);
#else
static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { }
+static inline struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
+ unsigned long flags,
+ struct npu_context *(*cb)(struct npu_context *, void *),
+ void *priv) { return ERR_PTR(-ENODEV); }
+static inline void pnv_npu2_destroy_context(struct npu_context *context,
+ struct pci_dev *gpdev) { }
+
+static inline int pnv_npu2_handle_fault(struct npu_context *context,
+ uintptr_t *ea, unsigned long *flags,
+ unsigned long *status, int count) {
+ return -ENODEV;
+}
#endif
#endif /* _ASM_POWERNV_H */
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index e7d6d86563ee..142d78d645f4 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -161,6 +161,7 @@
#define PPC_INST_MFTMR 0x7c0002dc
#define PPC_INST_MSGSND 0x7c00019c
#define PPC_INST_MSGCLR 0x7c0001dc
+#define PPC_INST_MSGSYNC 0x7c0006ec
#define PPC_INST_MSGSNDP 0x7c00011c
#define PPC_INST_MTTMR 0x7c0003dc
#define PPC_INST_NOP 0x60000000
@@ -345,6 +346,7 @@
___PPC_RB(b) | __PPC_EH(eh))
#define PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \
___PPC_RB(b))
+#define PPC_MSGSYNC stringify_in_c(.long PPC_INST_MSGSYNC)
#define PPC_MSGCLR(b) stringify_in_c(.long PPC_INST_MSGCLR | \
___PPC_RB(b))
#define PPC_MSGSNDP(b) stringify_in_c(.long PPC_INST_MSGSNDP | \
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index e0fecbcea2a2..a4b1d8d6b793 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -102,11 +102,25 @@ void release_thread(struct task_struct *);
#endif
#ifdef CONFIG_PPC64
-/* 64-bit user address space is 46-bits (64TB user VM) */
-#define TASK_SIZE_USER64 (0x0000400000000000UL)
+/*
+ * 64-bit user address space can have multiple limits
+ * For now supported values are:
+ */
+#define TASK_SIZE_64TB (0x0000400000000000UL)
+#define TASK_SIZE_128TB (0x0000800000000000UL)
+#define TASK_SIZE_512TB (0x0002000000000000UL)
+
+#ifdef CONFIG_PPC_BOOK3S_64
+/*
+ * Max value currently used:
+ */
+#define TASK_SIZE_USER64 TASK_SIZE_512TB
+#else
+#define TASK_SIZE_USER64 TASK_SIZE_64TB
+#endif
-/*
- * 32-bit user address space is 4GB - 1 page
+/*
+ * 32-bit user address space is 4GB - 1 page
* (this 1 page is needed so referencing of 0xFFFFFFFF generates EFAULT
*/
#define TASK_SIZE_USER32 (0x0000000100000000UL - (1*PAGE_SIZE))
@@ -114,26 +128,37 @@ void release_thread(struct task_struct *);
#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \
TASK_SIZE_USER32 : TASK_SIZE_USER64)
#define TASK_SIZE TASK_SIZE_OF(current)
-
/* This decides where the kernel will search for a free chunk of vm
* space during mmap's.
*/
#define TASK_UNMAPPED_BASE_USER32 (PAGE_ALIGN(TASK_SIZE_USER32 / 4))
-#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(TASK_SIZE_USER64 / 4))
+#define TASK_UNMAPPED_BASE_USER64 (PAGE_ALIGN(TASK_SIZE_128TB / 4))
#define TASK_UNMAPPED_BASE ((is_32bit_task()) ? \
TASK_UNMAPPED_BASE_USER32 : TASK_UNMAPPED_BASE_USER64 )
#endif
+/*
+ * Initial task size value for user applications. For book3s 64 we start
+ * with 128TB and conditionally enable upto 512TB
+ */
+#ifdef CONFIG_PPC_BOOK3S_64
+#define DEFAULT_MAP_WINDOW ((is_32bit_task()) ? \
+ TASK_SIZE_USER32 : TASK_SIZE_128TB)
+#else
+#define DEFAULT_MAP_WINDOW TASK_SIZE
+#endif
+
#ifdef __powerpc64__
-#define STACK_TOP_USER64 TASK_SIZE_USER64
+/* Limit stack to 128TB */
+#define STACK_TOP_USER64 TASK_SIZE_128TB
#define STACK_TOP_USER32 TASK_SIZE_USER32
#define STACK_TOP (is_32bit_task() ? \
STACK_TOP_USER32 : STACK_TOP_USER64)
-#define STACK_TOP_MAX STACK_TOP_USER64
+#define STACK_TOP_MAX TASK_SIZE_USER64
#else /* __powerpc64__ */
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index fc879fd6bdae..d4f653c9259a 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -310,6 +310,7 @@
#define SPRN_PMCR 0x374 /* Power Management Control Register */
/* HFSCR and FSCR bit numbers are the same */
+#define FSCR_SCV_LG 12 /* Enable System Call Vectored */
#define FSCR_MSGP_LG 10 /* Enable MSGP */
#define FSCR_TAR_LG 8 /* Enable Target Address Register */
#define FSCR_EBB_LG 7 /* Enable Event Based Branching */
@@ -320,6 +321,7 @@
#define FSCR_VECVSX_LG 1 /* Enable VMX/VSX */
#define FSCR_FP_LG 0 /* Enable Floating Point */
#define SPRN_FSCR 0x099 /* Facility Status & Control Register */
+#define FSCR_SCV __MASK(FSCR_SCV_LG)
#define FSCR_TAR __MASK(FSCR_TAR_LG)
#define FSCR_EBB __MASK(FSCR_EBB_LG)
#define FSCR_DSCR __MASK(FSCR_DSCR_LG)
@@ -365,6 +367,7 @@
#define LPCR_MER_SH 11
#define LPCR_GTSE ASM_CONST(0x0000000000000400) /* Guest Translation Shootdown Enable */
#define LPCR_TC ASM_CONST(0x0000000000000200) /* Translation control */
+#define LPCR_HEIC ASM_CONST(0x0000000000000010) /* Hypervisor External Interrupt Control */
#define LPCR_LPES 0x0000000c
#define LPCR_LPES0 ASM_CONST(0x0000000000000008) /* LPAR Env selector 0 */
#define LPCR_LPES1 ASM_CONST(0x0000000000000004) /* LPAR Env selector 1 */
@@ -656,6 +659,7 @@
#define SRR1_ISI_PROT 0x08000000 /* ISI: Other protection fault */
#define SRR1_WAKEMASK 0x00380000 /* reason for wakeup */
#define SRR1_WAKEMASK_P8 0x003c0000 /* reason for wakeup on POWER8 and 9 */
+#define SRR1_WAKEMCE_RESVD 0x003c0000 /* Unused/reserved value used by MCE wakeup to indicate cause to idle wakeup handler */
#define SRR1_WAKESYSERR 0x00300000 /* System error */
#define SRR1_WAKEEE 0x00200000 /* External interrupt */
#define SRR1_WAKEHVI 0x00240000 /* Hypervisor Virtualization Interrupt (P9) */
diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h
index 7dc006b58369..7902d6358854 100644
--- a/arch/powerpc/include/asm/sections.h
+++ b/arch/powerpc/include/asm/sections.h
@@ -6,6 +6,8 @@
#include <linux/uaccess.h>
#include <asm-generic/sections.h>
+extern char __head_end[];
+
#ifdef __powerpc64__
extern char __start_interrupts[];
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index 32db16d2e7ad..ebddb2111d87 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -40,10 +40,12 @@ extern int cpu_to_chip_id(int cpu);
struct smp_ops_t {
void (*message_pass)(int cpu, int msg);
#ifdef CONFIG_PPC_SMP_MUXED_IPI
- void (*cause_ipi)(int cpu, unsigned long data);
+ void (*cause_ipi)(int cpu);
#endif
+ int (*cause_nmi_ipi)(int cpu);
void (*probe)(void);
int (*kick_cpu)(int nr);
+ int (*prepare_cpu)(int nr);
void (*setup_cpu)(int nr);
void (*bringup_done)(void);
void (*take_timebase)(void);
@@ -61,7 +63,6 @@ extern void smp_generic_take_timebase(void);
DECLARE_PER_CPU(unsigned int, cpu_pvr);
#ifdef CONFIG_HOTPLUG_CPU
-extern void migrate_irqs(void);
int generic_cpu_disable(void);
void generic_cpu_die(unsigned int cpu);
void generic_set_cpu_dead(unsigned int cpu);
@@ -112,23 +113,31 @@ extern int cpu_to_core_id(int cpu);
*
* Make sure this matches openpic_request_IPIs in open_pic.c, or what shows up
* in /proc/interrupts will be wrong!!! --Troy */
-#define PPC_MSG_CALL_FUNCTION 0
-#define PPC_MSG_RESCHEDULE 1
+#define PPC_MSG_CALL_FUNCTION 0
+#define PPC_MSG_RESCHEDULE 1
#define PPC_MSG_TICK_BROADCAST 2
-#define PPC_MSG_DEBUGGER_BREAK 3
+#define PPC_MSG_NMI_IPI 3
/* This is only used by the powernv kernel */
#define PPC_MSG_RM_HOST_ACTION 4
+#define NMI_IPI_ALL_OTHERS -2
+
+#ifdef CONFIG_NMI_IPI
+extern int smp_handle_nmi_ipi(struct pt_regs *regs);
+#else
+static inline int smp_handle_nmi_ipi(struct pt_regs *regs) { return 0; }
+#endif
+
/* for irq controllers that have dedicated ipis per message (4) */
extern int smp_request_message_ipi(int virq, int message);
extern const char *smp_ipi_name[];
/* for irq controllers with only a single ipi */
-extern void smp_muxed_ipi_set_data(int cpu, unsigned long data);
extern void smp_muxed_ipi_message_pass(int cpu, int msg);
extern void smp_muxed_ipi_set_message(int cpu, int msg);
extern irqreturn_t smp_ipi_demux(void);
+extern irqreturn_t smp_ipi_demux_relaxed(void);
void smp_init_pSeries(void);
void smp_init_cell(void);
diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h
index 23be8f1e7e64..16fab6898240 100644
--- a/arch/powerpc/include/asm/syscalls.h
+++ b/arch/powerpc/include/asm/syscalls.h
@@ -8,10 +8,10 @@
struct rtas_args;
-asmlinkage unsigned long sys_mmap(unsigned long addr, size_t len,
+asmlinkage long sys_mmap(unsigned long addr, size_t len,
unsigned long prot, unsigned long flags,
unsigned long fd, off_t offset);
-asmlinkage unsigned long sys_mmap2(unsigned long addr, size_t len,
+asmlinkage long sys_mmap2(unsigned long addr, size_t len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long pgoff);
asmlinkage long ppc64_personality(unsigned long personality);
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index 6fc6464f7421..a941cc6fc3e9 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -10,15 +10,7 @@
#ifdef __KERNEL__
-/* We have 8k stacks on ppc32 and 16k on ppc64 */
-
-#if defined(CONFIG_PPC64)
-#define THREAD_SHIFT 14
-#elif defined(CONFIG_PPC_256K_PAGES)
-#define THREAD_SHIFT 15
-#else
-#define THREAD_SHIFT 13
-#endif
+#define THREAD_SHIFT CONFIG_THREAD_SHIFT
#define THREAD_SIZE (1 << THREAD_SHIFT)
diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h
index e0b9e576905a..7ce2c3ac2964 100644
--- a/arch/powerpc/include/asm/xics.h
+++ b/arch/powerpc/include/asm/xics.h
@@ -57,7 +57,7 @@ struct icp_ops {
void (*teardown_cpu)(void);
void (*flush_ipi)(void);
#ifdef CONFIG_SMP
- void (*cause_ipi)(int cpu, unsigned long data);
+ void (*cause_ipi)(int cpu);
irq_handler_t ipi_action;
#endif
};
diff --git a/arch/powerpc/include/asm/xive-regs.h b/arch/powerpc/include/asm/xive-regs.h
new file mode 100644
index 000000000000..1d3f2be5ae39
--- /dev/null
+++ b/arch/powerpc/include/asm/xive-regs.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright 2016,2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef _ASM_POWERPC_XIVE_REGS_H
+#define _ASM_POWERPC_XIVE_REGS_H
+
+/*
+ * Thread Management (aka "TM") registers
+ */
+
+/* TM register offsets */
+#define TM_QW0_USER 0x000 /* All rings */
+#define TM_QW1_OS 0x010 /* Ring 0..2 */
+#define TM_QW2_HV_POOL 0x020 /* Ring 0..1 */
+#define TM_QW3_HV_PHYS 0x030 /* Ring 0..1 */
+
+/* Byte offsets inside a QW QW0 QW1 QW2 QW3 */
+#define TM_NSR 0x0 /* + + - + */
+#define TM_CPPR 0x1 /* - + - + */
+#define TM_IPB 0x2 /* - + + + */
+#define TM_LSMFB 0x3 /* - + + + */
+#define TM_ACK_CNT 0x4 /* - + - - */
+#define TM_INC 0x5 /* - + - + */
+#define TM_AGE 0x6 /* - + - + */
+#define TM_PIPR 0x7 /* - + - + */
+
+#define TM_WORD0 0x0
+#define TM_WORD1 0x4
+
+/*
+ * QW word 2 contains the valid bit at the top and other fields
+ * depending on the QW.
+ */
+#define TM_WORD2 0x8
+#define TM_QW0W2_VU PPC_BIT32(0)
+#define TM_QW0W2_LOGIC_SERV PPC_BITMASK32(1,31) // XX 2,31 ?
+#define TM_QW1W2_VO PPC_BIT32(0)
+#define TM_QW1W2_OS_CAM PPC_BITMASK32(8,31)
+#define TM_QW2W2_VP PPC_BIT32(0)
+#define TM_QW2W2_POOL_CAM PPC_BITMASK32(8,31)
+#define TM_QW3W2_VT PPC_BIT32(0)
+#define TM_QW3W2_LP PPC_BIT32(6)
+#define TM_QW3W2_LE PPC_BIT32(7)
+#define TM_QW3W2_T PPC_BIT32(31)
+
+/*
+ * In addition to normal loads to "peek" and writes (only when invalid)
+ * using 4 and 8 bytes accesses, the above registers support these
+ * "special" byte operations:
+ *
+ * - Byte load from QW0[NSR] - User level NSR (EBB)
+ * - Byte store to QW0[NSR] - User level NSR (EBB)
+ * - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access
+ * - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0
+ * otherwise VT||0000000
+ * - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present)
+ *
+ * Then we have all these "special" CI ops at these offset that trigger
+ * all sorts of side effects:
+ */
+#define TM_SPC_ACK_EBB 0x800 /* Load8 ack EBB to reg*/
+#define TM_SPC_ACK_OS_REG 0x810 /* Load16 ack OS irq to reg */
+#define TM_SPC_PUSH_USR_CTX 0x808 /* Store32 Push/Validate user context */
+#define TM_SPC_PULL_USR_CTX 0x808 /* Load32 Pull/Invalidate user context */
+#define TM_SPC_SET_OS_PENDING 0x812 /* Store8 Set OS irq pending bit */
+#define TM_SPC_PULL_OS_CTX 0x818 /* Load32/Load64 Pull/Invalidate OS context to reg */
+#define TM_SPC_PULL_POOL_CTX 0x828 /* Load32/Load64 Pull/Invalidate Pool context to reg*/
+#define TM_SPC_ACK_HV_REG 0x830 /* Load16 ack HV irq to reg */
+#define TM_SPC_PULL_USR_CTX_OL 0xc08 /* Store8 Pull/Inval usr ctx to odd line */
+#define TM_SPC_ACK_OS_EL 0xc10 /* Store8 ack OS irq to even line */
+#define TM_SPC_ACK_HV_POOL_EL 0xc20 /* Store8 ack HV evt pool to even line */
+#define TM_SPC_ACK_HV_EL 0xc30 /* Store8 ack HV irq to even line */
+/* XXX more... */
+
+/* NSR fields for the various QW ack types */
+#define TM_QW0_NSR_EB PPC_BIT8(0)
+#define TM_QW1_NSR_EO PPC_BIT8(0)
+#define TM_QW3_NSR_HE PPC_BITMASK8(0,1)
+#define TM_QW3_NSR_HE_NONE 0
+#define TM_QW3_NSR_HE_POOL 1
+#define TM_QW3_NSR_HE_PHYS 2
+#define TM_QW3_NSR_HE_LSI 3
+#define TM_QW3_NSR_I PPC_BIT8(2)
+#define TM_QW3_NSR_GRP_LVL PPC_BIT8(3,7)
+
+/* Utilities to manipulate these (originaly from OPAL) */
+#define MASK_TO_LSH(m) (__builtin_ffsl(m) - 1)
+#define GETFIELD(m, v) (((v) & (m)) >> MASK_TO_LSH(m))
+#define SETFIELD(m, v, val) \
+ (((v) & ~(m)) | ((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m)))
+
+#endif /* _ASM_POWERPC_XIVE_REGS_H */
diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
new file mode 100644
index 000000000000..3cdbeaeac397
--- /dev/null
+++ b/arch/powerpc/include/asm/xive.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright 2016,2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef _ASM_POWERPC_XIVE_H
+#define _ASM_POWERPC_XIVE_H
+
+#define XIVE_INVALID_VP 0xffffffff
+
+#ifdef CONFIG_PPC_XIVE
+
+/*
+ * Thread Interrupt Management Area (TIMA)
+ *
+ * This is a global MMIO region divided in 4 pages of varying access
+ * permissions, providing access to per-cpu interrupt management
+ * functions. It always identifies the CPU doing the access based
+ * on the PowerBus initiator ID, thus we always access via the
+ * same offset regardless of where the code is executing
+ */
+extern void __iomem *xive_tima;
+
+/*
+ * Offset in the TM area of our current execution level (provided by
+ * the backend)
+ */
+extern u32 xive_tima_offset;
+
+/*
+ * Per-irq data (irq_get_handler_data for normal IRQs), IPIs
+ * have it stored in the xive_cpu structure. We also cache
+ * for normal interrupts the current target CPU.
+ *
+ * This structure is setup by the backend for each interrupt.
+ */
+struct xive_irq_data {
+ u64 flags;
+ u64 eoi_page;
+ void __iomem *eoi_mmio;
+ u64 trig_page;
+ void __iomem *trig_mmio;
+ u32 esb_shift;
+ int src_chip;
+
+ /* Setup/used by frontend */
+ int target;
+ bool saved_p;
+};
+#define XIVE_IRQ_FLAG_STORE_EOI 0x01
+#define XIVE_IRQ_FLAG_LSI 0x02
+#define XIVE_IRQ_FLAG_SHIFT_BUG 0x04
+#define XIVE_IRQ_FLAG_MASK_FW 0x08
+#define XIVE_IRQ_FLAG_EOI_FW 0x10
+
+#define XIVE_INVALID_CHIP_ID -1
+
+/* A queue tracking structure in a CPU */
+struct xive_q {
+ __be32 *qpage;
+ u32 msk;
+ u32 idx;
+ u32 toggle;
+ u64 eoi_phys;
+ u32 esc_irq;
+ atomic_t count;
+ atomic_t pending_count;
+};
+
+/*
+ * "magic" Event State Buffer (ESB) MMIO offsets.
+ *
+ * Each interrupt source has a 2-bit state machine called ESB
+ * which can be controlled by MMIO. It's made of 2 bits, P and
+ * Q. P indicates that an interrupt is pending (has been sent
+ * to a queue and is waiting for an EOI). Q indicates that the
+ * interrupt has been triggered while pending.
+ *
+ * This acts as a coalescing mechanism in order to guarantee
+ * that a given interrupt only occurs at most once in a queue.
+ *
+ * When doing an EOI, the Q bit will indicate if the interrupt
+ * needs to be re-triggered.
+ *
+ * The following offsets into the ESB MMIO allow to read or
+ * manipulate the PQ bits. They must be used with an 8-bytes
+ * load instruction. They all return the previous state of the
+ * interrupt (atomically).
+ *
+ * Additionally, some ESB pages support doing an EOI via a
+ * store at 0 and some ESBs support doing a trigger via a
+ * separate trigger page.
+ */
+#define XIVE_ESB_GET 0x800
+#define XIVE_ESB_SET_PQ_00 0xc00
+#define XIVE_ESB_SET_PQ_01 0xd00
+#define XIVE_ESB_SET_PQ_10 0xe00
+#define XIVE_ESB_SET_PQ_11 0xf00
+#define XIVE_ESB_MASK XIVE_ESB_SET_PQ_01
+
+#define XIVE_ESB_VAL_P 0x2
+#define XIVE_ESB_VAL_Q 0x1
+
+/* Global enable flags for the XIVE support */
+extern bool __xive_enabled;
+
+static inline bool xive_enabled(void) { return __xive_enabled; }
+
+extern bool xive_native_init(void);
+extern void xive_smp_probe(void);
+extern int xive_smp_prepare_cpu(unsigned int cpu);
+extern void xive_smp_setup_cpu(void);
+extern void xive_smp_disable_cpu(void);
+extern void xive_kexec_teardown_cpu(int secondary);
+extern void xive_shutdown(void);
+extern void xive_flush_interrupt(void);
+
+/* xmon hook */
+extern void xmon_xive_do_dump(int cpu);
+
+/* APIs used by KVM */
+extern u32 xive_native_default_eq_shift(void);
+extern u32 xive_native_alloc_vp_block(u32 max_vcpus);
+extern void xive_native_free_vp_block(u32 vp_base);
+extern int xive_native_populate_irq_data(u32 hw_irq,
+ struct xive_irq_data *data);
+extern void xive_cleanup_irq_data(struct xive_irq_data *xd);
+extern u32 xive_native_alloc_irq(void);
+extern void xive_native_free_irq(u32 irq);
+extern int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
+
+extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
+ __be32 *qpage, u32 order, bool can_escalate);
+extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
+
+extern bool __xive_irq_trigger(struct xive_irq_data *xd);
+extern bool __xive_irq_retrigger(struct xive_irq_data *xd);
+extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd);
+
+extern bool is_xive_irq(struct irq_chip *chip);
+
+#else
+
+static inline bool xive_enabled(void) { return false; }
+
+static inline bool xive_native_init(void) { return false; }
+static inline void xive_smp_probe(void) { }
+extern inline int xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; }
+static inline void xive_smp_setup_cpu(void) { }
+static inline void xive_smp_disable_cpu(void) { }
+static inline void xive_kexec_teardown_cpu(int secondary) { }
+static inline void xive_shutdown(void) { }
+static inline void xive_flush_interrupt(void) { }
+
+static inline u32 xive_native_alloc_vp_block(u32 max_vcpus) { return XIVE_INVALID_VP; }
+static inline void xive_native_free_vp_block(u32 vp_base) { }
+
+#endif
+
+#endif /* _ASM_POWERPC_XIVE_H */
diff --git a/arch/powerpc/include/asm/xmon.h b/arch/powerpc/include/asm/xmon.h
index 5eb8e599e5cc..eb42a0c6e1d9 100644
--- a/arch/powerpc/include/asm/xmon.h
+++ b/arch/powerpc/include/asm/xmon.h
@@ -29,5 +29,7 @@ static inline void xmon_register_spus(struct list_head *list) { };
extern int cpus_are_in_xmon(void);
#endif
+extern void xmon_printf(const char *format, ...);
+
#endif /* __KERNEL __ */
#endif /* __ASM_POWERPC_XMON_H */
diff --git a/arch/powerpc/include/uapi/asm/mman.h b/arch/powerpc/include/uapi/asm/mman.h
index 03c06ba7464f..ab45cc2f3101 100644
--- a/arch/powerpc/include/uapi/asm/mman.h
+++ b/arch/powerpc/include/uapi/asm/mman.h
@@ -29,4 +29,20 @@
#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x40000 /* create a huge page mapping */
+/*
+ * When MAP_HUGETLB is set, bits [26:31] of the flags argument to mmap(2),
+ * encode the log2 of the huge page size. A value of zero indicates that the
+ * default huge page size should be used. To use a non-default huge page size,
+ * one of these defines can be used, or the size can be encoded by hand. Note
+ * that on most systems only a subset, or possibly none, of these sizes will be
+ * available.
+ */
+#define MAP_HUGE_512KB (19 << MAP_HUGE_SHIFT) /* 512KB HugeTLB Page */
+#define MAP_HUGE_1MB (20 << MAP_HUGE_SHIFT) /* 1MB HugeTLB Page */
+#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) /* 2MB HugeTLB Page */
+#define MAP_HUGE_8MB (23 << MAP_HUGE_SHIFT) /* 8MB HugeTLB Page */
+#define MAP_HUGE_16MB (24 << MAP_HUGE_SHIFT) /* 16MB HugeTLB Page */
+#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) /* 1GB HugeTLB Page */
+#define MAP_HUGE_16GB (34 << MAP_HUGE_SHIFT) /* 16GB HugeTLB Page */
+
#endif /* _UAPI_ASM_POWERPC_MMAN_H */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 811f441a125f..b9db46ae545b 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -25,8 +25,6 @@ CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
-# do not trace tracer code
-CFLAGS_REMOVE_ftrace.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
# timers used by tracing
CFLAGS_REMOVE_time.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
endif
@@ -97,6 +95,7 @@ obj-$(CONFIG_BOOTX_TEXT) += btext.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_OPTPROBES) += optprobes.o optprobes_head.o
+obj-$(CONFIG_KPROBES_ON_FTRACE) += kprobes-ftrace.o
obj-$(CONFIG_UPROBES) += uprobes.o
obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
@@ -118,10 +117,7 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o
obj-$(CONFIG_PPC_IO_WORKAROUNDS) += io-workarounds.o
-obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
-obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
-obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
-obj-$(CONFIG_TRACING) += trace_clock.o
+obj-y += trace/
ifneq ($(CONFIG_PPC_INDIRECT_PIO),y)
obj-y += iomap.o
@@ -142,14 +138,14 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvm_emul.o
# Disable GCOV & sanitizers in odd or sensitive code
GCOV_PROFILE_prom_init.o := n
UBSAN_SANITIZE_prom_init.o := n
-GCOV_PROFILE_ftrace.o := n
-UBSAN_SANITIZE_ftrace.o := n
GCOV_PROFILE_machine_kexec_64.o := n
UBSAN_SANITIZE_machine_kexec_64.o := n
GCOV_PROFILE_machine_kexec_32.o := n
UBSAN_SANITIZE_machine_kexec_32.o := n
GCOV_PROFILE_kprobes.o := n
UBSAN_SANITIZE_kprobes.o := n
+GCOV_PROFILE_kprobes-ftrace.o := n
+UBSAN_SANITIZE_kprobes-ftrace.o := n
UBSAN_SANITIZE_vdso.o := n
extra-$(CONFIG_PPC_FPU) += fpu.o
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 4367e7df51a1..439c257dec4a 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -185,6 +185,7 @@ int main(void)
#ifdef CONFIG_PPC_MM_SLICES
OFFSET(PACALOWSLICESPSIZE, paca_struct, mm_ctx_low_slices_psize);
OFFSET(PACAHIGHSLICEPSIZE, paca_struct, mm_ctx_high_slices_psize);
+ DEFINE(PACA_ADDR_LIMIT, offsetof(struct paca_struct, addr_limit));
DEFINE(MMUPSIZEDEFSIZE, sizeof(struct mmu_psize_def));
#endif /* CONFIG_PPC_MM_SLICES */
#endif
@@ -219,6 +220,7 @@ int main(void)
OFFSET(PACA_EXGEN, paca_struct, exgen);
OFFSET(PACA_EXMC, paca_struct, exmc);
OFFSET(PACA_EXSLB, paca_struct, exslb);
+ OFFSET(PACA_EXNMI, paca_struct, exnmi);
OFFSET(PACALPPACAPTR, paca_struct, lppaca_ptr);
OFFSET(PACA_SLBSHADOWPTR, paca_struct, slb_shadow_ptr);
OFFSET(SLBSHADOW_STACKVSID, slb_shadow, save_area[SLB_NUM_BOLTED - 1].vsid);
@@ -232,7 +234,9 @@ int main(void)
OFFSET(PACAEMERGSP, paca_struct, emergency_sp);
#ifdef CONFIG_PPC_BOOK3S_64
OFFSET(PACAMCEMERGSP, paca_struct, mc_emergency_sp);
+ OFFSET(PACA_NMI_EMERG_SP, paca_struct, nmi_emergency_sp);
OFFSET(PACA_IN_MCE, paca_struct, in_mce);
+ OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
#endif
OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
OFFSET(PACAKEXECSTATE, paca_struct, kexec_state);
@@ -399,8 +403,8 @@ int main(void)
DEFINE(BUG_ENTRY_SIZE, sizeof(struct bug_entry));
#endif
-#ifdef MAX_PGD_TABLE_SIZE
- DEFINE(PGD_TABLE_SIZE, MAX_PGD_TABLE_SIZE);
+#ifdef CONFIG_PPC_BOOK3S_64
+ DEFINE(PGD_TABLE_SIZE, (sizeof(pgd_t) << max(RADIX_PGD_INDEX_SIZE, H_PGD_INDEX_SIZE)));
#else
DEFINE(PGD_TABLE_SIZE, PGD_TABLE_SIZE);
#endif
@@ -727,6 +731,7 @@ int main(void)
OFFSET(PACA_THREAD_IDLE_STATE, paca_struct, thread_idle_state);
OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask);
OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask);
+ OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas);
#endif
DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index 7fe8c79e6937..10cb2896b2ae 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -29,7 +29,8 @@ _GLOBAL(__setup_cpu_power7)
li r0,0
mtspr SPRN_LPID,r0
mfspr r3,SPRN_LPCR
- bl __init_LPCR
+ li r4,(LPCR_LPES1 >> LPCR_LPES_SH)
+ bl __init_LPCR_ISA206
bl __init_tlb_power7
mtlr r11
blr
@@ -42,7 +43,8 @@ _GLOBAL(__restore_cpu_power7)
li r0,0
mtspr SPRN_LPID,r0
mfspr r3,SPRN_LPCR
- bl __init_LPCR
+ li r4,(LPCR_LPES1 >> LPCR_LPES_SH)
+ bl __init_LPCR_ISA206
bl __init_tlb_power7
mtlr r11
blr
@@ -59,7 +61,8 @@ _GLOBAL(__setup_cpu_power8)
mtspr SPRN_LPID,r0
mfspr r3,SPRN_LPCR
ori r3, r3, LPCR_PECEDH
- bl __init_LPCR
+ li r4,0 /* LPES = 0 */
+ bl __init_LPCR_ISA206
bl __init_HFSCR
bl __init_tlb_power8
bl __init_PMU_HV
@@ -80,7 +83,8 @@ _GLOBAL(__restore_cpu_power8)
mtspr SPRN_LPID,r0
mfspr r3,SPRN_LPCR
ori r3, r3, LPCR_PECEDH
- bl __init_LPCR
+ li r4,0 /* LPES = 0 */
+ bl __init_LPCR_ISA206
bl __init_HFSCR
bl __init_tlb_power8
bl __init_PMU_HV
@@ -99,11 +103,12 @@ _GLOBAL(__setup_cpu_power9)
mtspr SPRN_PSSCR,r0
mtspr SPRN_LPID,r0
mfspr r3,SPRN_LPCR
- LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE)
+ LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC)
or r3, r3, r4
LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR)
andc r3, r3, r4
- bl __init_LPCR
+ li r4,0 /* LPES = 0 */
+ bl __init_LPCR_ISA300
bl __init_HFSCR
bl __init_tlb_power9
bl __init_PMU_HV
@@ -122,11 +127,12 @@ _GLOBAL(__restore_cpu_power9)
mtspr SPRN_PSSCR,r0
mtspr SPRN_LPID,r0
mfspr r3,SPRN_LPCR
- LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE)
+ LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC)
or r3, r3, r4
LOAD_REG_IMMEDIATE(r4, LPCR_UPRT | LPCR_HR)
andc r3, r3, r4
- bl __init_LPCR
+ li r4,0 /* LPES = 0 */
+ bl __init_LPCR_ISA300
bl __init_HFSCR
bl __init_tlb_power9
bl __init_PMU_HV
@@ -144,9 +150,9 @@ __init_hvmode_206:
std r5,CPU_SPEC_FEATURES(r4)
blr
-__init_LPCR:
+__init_LPCR_ISA206:
/* Setup a sane LPCR:
- * Called with initial LPCR in R3
+ * Called with initial LPCR in R3 and desired LPES 2-bit value in R4
*
* LPES = 0b01 (HSRR0/1 used for 0x500)
* PECE = 0b111
@@ -157,16 +163,18 @@ __init_LPCR:
*
* Other bits untouched for now
*/
- li r5,1
- rldimi r3,r5, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
+ li r5,0x10
+ rldimi r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5
+
+ /* POWER9 has no VRMASD */
+__init_LPCR_ISA300:
+ rldimi r3,r4, LPCR_LPES_SH, 64-LPCR_LPES_SH-2
ori r3,r3,(LPCR_PECE0|LPCR_PECE1|LPCR_PECE2)
li r5,4
rldimi r3,r5, LPCR_DPFD_SH, 64-LPCR_DPFD_SH-3
clrrdi r3,r3,1 /* clear HDICE */
li r5,4
rldimi r3,r5, LPCR_VC_SH, 0
- li r5,0x10
- rldimi r3,r5, LPCR_VRMASD_SH, 64-LPCR_VRMASD_SH-5
mtspr SPRN_LPCR,r3
isync
blr
diff --git a/arch/powerpc/kernel/dbell.c b/arch/powerpc/kernel/dbell.c
index 2128f3a96c32..b6fe883b1016 100644
--- a/arch/powerpc/kernel/dbell.c
+++ b/arch/powerpc/kernel/dbell.c
@@ -20,18 +20,60 @@
#include <asm/kvm_ppc.h>
#ifdef CONFIG_SMP
-void doorbell_setup_this_cpu(void)
+
+/*
+ * Doorbells must only be used if CPU_FTR_DBELL is available.
+ * msgsnd is used in HV, and msgsndp is used in !HV.
+ *
+ * These should be used by platform code that is aware of restrictions.
+ * Other arch code should use ->cause_ipi.
+ *
+ * doorbell_global_ipi() sends a dbell to any target CPU.
+ * Must be used only by architectures that address msgsnd target
+ * by PIR/get_hard_smp_processor_id.
+ */
+void doorbell_global_ipi(int cpu)
{
- unsigned long tag = mfspr(SPRN_DOORBELL_CPUTAG) & PPC_DBELL_TAG_MASK;
+ u32 tag = get_hard_smp_processor_id(cpu);
- smp_muxed_ipi_set_data(smp_processor_id(), tag);
+ kvmppc_set_host_ipi(cpu, 1);
+ /* Order previous accesses vs. msgsnd, which is treated as a store */
+ ppc_msgsnd_sync();
+ ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
}
-void doorbell_cause_ipi(int cpu, unsigned long data)
+/*
+ * doorbell_core_ipi() sends a dbell to a target CPU in the same core.
+ * Must be used only by architectures that address msgsnd target
+ * by TIR/cpu_thread_in_core.
+ */
+void doorbell_core_ipi(int cpu)
{
+ u32 tag = cpu_thread_in_core(cpu);
+
+ kvmppc_set_host_ipi(cpu, 1);
/* Order previous accesses vs. msgsnd, which is treated as a store */
- mb();
- ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, data);
+ ppc_msgsnd_sync();
+ ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag);
+}
+
+/*
+ * Attempt to cause a core doorbell if destination is on the same core.
+ * Returns 1 on success, 0 on failure.
+ */
+int doorbell_try_core_ipi(int cpu)
+{
+ int this_cpu = get_cpu();
+ int ret = 0;
+
+ if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu))) {
+ doorbell_core_ipi(cpu);
+ ret = 1;
+ }
+
+ put_cpu();
+
+ return ret;
}
void doorbell_exception(struct pt_regs *regs)
@@ -40,12 +82,14 @@ void doorbell_exception(struct pt_regs *regs)
irq_enter();
+ ppc_msgsync();
+
may_hard_irq_enable();
kvmppc_set_host_ipi(smp_processor_id(), 0);
__this_cpu_inc(irq_stat.doorbell_irqs);
- smp_ipi_demux();
+ smp_ipi_demux_relaxed(); /* already performed the barrier */
irq_exit();
set_irq_regs(old_regs);
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9de7f79e702b..63992b2d8e15 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -22,7 +22,6 @@
*/
#include <linux/delay.h>
-#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/list.h>
@@ -37,7 +36,7 @@
#include <linux/of.h>
#include <linux/atomic.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
#include <asm/eeh.h>
#include <asm/eeh_event.h>
#include <asm/io.h>
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index b94887165a10..c405c79e50cd 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -724,7 +724,16 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
*/
#define MAX_WAIT_FOR_RECOVERY 300
-static void eeh_handle_normal_event(struct eeh_pe *pe)
+/**
+ * eeh_handle_normal_event - Handle EEH events on a specific PE
+ * @pe: EEH PE
+ *
+ * Attempts to recover the given PE. If recovery fails or the PE has failed
+ * too many times, remove the PE.
+ *
+ * Returns true if @pe should no longer be used, else false.
+ */
+static bool eeh_handle_normal_event(struct eeh_pe *pe)
{
struct pci_bus *frozen_bus;
struct eeh_dev *edev, *tmp;
@@ -736,13 +745,18 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
if (!frozen_bus) {
pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
__func__, pe->phb->global_number, pe->addr);
- return;
+ return false;
}
eeh_pe_update_time_stamp(pe);
pe->freeze_count++;
- if (pe->freeze_count > eeh_max_freezes)
- goto excess_failures;
+ if (pe->freeze_count > eeh_max_freezes) {
+ pr_err("EEH: PHB#%x-PE#%x has failed %d times in the\n"
+ "last hour and has been permanently disabled.\n",
+ pe->phb->global_number, pe->addr,
+ pe->freeze_count);
+ goto hard_fail;
+ }
pr_warn("EEH: This PCI device has failed %d times in the last hour\n",
pe->freeze_count);
@@ -870,27 +884,18 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
pr_info("EEH: Notify device driver to resume\n");
eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
- return;
+ return false;
-excess_failures:
+hard_fail:
/*
* About 90% of all real-life EEH failures in the field
* are due to poorly seated PCI cards. Only 10% or so are
* due to actual, failed cards.
*/
- pr_err("EEH: PHB#%x-PE#%x has failed %d times in the\n"
- "last hour and has been permanently disabled.\n"
- "Please try reseating or replacing it.\n",
- pe->phb->global_number, pe->addr,
- pe->freeze_count);
- goto perm_error;
-
-hard_fail:
pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
"Please try reseating or replacing it\n",
pe->phb->global_number, pe->addr);
-perm_error:
eeh_slot_error_detail(pe, EEH_LOG_PERM);
/* Notify all devices that they're about to go down. */
@@ -915,10 +920,21 @@ perm_error:
pci_lock_rescan_remove();
pci_hp_remove_devices(frozen_bus);
pci_unlock_rescan_remove();
+
+ /* The passed PE should no longer be used */
+ return true;
}
}
+ return false;
}
+/**
+ * eeh_handle_special_event - Handle EEH events without a specific failing PE
+ *
+ * Called when an EEH event is detected but can't be narrowed down to a
+ * specific PE. Iterates through possible failures and handles them as
+ * necessary.
+ */
static void eeh_handle_special_event(void)
{
struct eeh_pe *pe, *phb_pe;
@@ -982,7 +998,14 @@ static void eeh_handle_special_event(void)
*/
if (rc == EEH_NEXT_ERR_FROZEN_PE ||
rc == EEH_NEXT_ERR_FENCED_PHB) {
- eeh_handle_normal_event(pe);
+ /*
+ * eeh_handle_normal_event() can make the PE stale if it
+ * determines that the PE cannot possibly be recovered.
+ * Don't modify the PE state if that's the case.
+ */
+ if (eeh_handle_normal_event(pe))
+ continue;
+
eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
} else {
pci_lock_rescan_remove();
diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S
index a38600949f3a..8587059ad848 100644
--- a/arch/powerpc/kernel/entry_32.S
+++ b/arch/powerpc/kernel/entry_32.S
@@ -31,7 +31,6 @@
#include <asm/ppc_asm.h>
#include <asm/asm-offsets.h>
#include <asm/unistd.h>
-#include <asm/ftrace.h>
#include <asm/ptrace.h>
#include <asm/export.h>
@@ -1315,109 +1314,3 @@ machine_check_in_rtas:
/* XXX load up BATs and panic */
#endif /* CONFIG_PPC_RTAS */
-
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-_GLOBAL(mcount)
-_GLOBAL(_mcount)
- /*
- * It is required that _mcount on PPC32 must preserve the
- * link register. But we have r0 to play with. We use r0
- * to push the return address back to the caller of mcount
- * into the ctr register, restore the link register and
- * then jump back using the ctr register.
- */
- mflr r0
- mtctr r0
- lwz r0, 4(r1)
- mtlr r0
- bctr
-
-_GLOBAL(ftrace_caller)
- MCOUNT_SAVE_FRAME
- /* r3 ends up with link register */
- subi r3, r3, MCOUNT_INSN_SIZE
-.globl ftrace_call
-ftrace_call:
- bl ftrace_stub
- nop
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
- b ftrace_graph_stub
-_GLOBAL(ftrace_graph_stub)
-#endif
- MCOUNT_RESTORE_FRAME
- /* old link register ends up in ctr reg */
- bctr
-#else
-_GLOBAL(mcount)
-_GLOBAL(_mcount)
-
- MCOUNT_SAVE_FRAME
-
- subi r3, r3, MCOUNT_INSN_SIZE
- LOAD_REG_ADDR(r5, ftrace_trace_function)
- lwz r5,0(r5)
-
- mtctr r5
- bctrl
- nop
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- b ftrace_graph_caller
-#endif
- MCOUNT_RESTORE_FRAME
- bctr
-#endif
-EXPORT_SYMBOL(_mcount)
-
-_GLOBAL(ftrace_stub)
- blr
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-_GLOBAL(ftrace_graph_caller)
- /* load r4 with local address */
- lwz r4, 44(r1)
- subi r4, r4, MCOUNT_INSN_SIZE
-
- /* Grab the LR out of the caller stack frame */
- lwz r3,52(r1)
-
- bl prepare_ftrace_return
- nop
-
- /*
- * prepare_ftrace_return gives us the address we divert to.
- * Change the LR in the callers stack frame to this.
- */
- stw r3,52(r1)
-
- MCOUNT_RESTORE_FRAME
- /* old link register ends up in ctr reg */
- bctr
-
-_GLOBAL(return_to_handler)
- /* need to save return values */
- stwu r1, -32(r1)
- stw r3, 20(r1)
- stw r4, 16(r1)
- stw r31, 12(r1)
- mr r31, r1
-
- bl ftrace_return_to_handler
- nop
-
- /* return value has real return address */
- mtlr r3
-
- lwz r3, 20(r1)
- lwz r4, 16(r1)
- lwz r31,12(r1)
- lwz r1, 0(r1)
-
- /* Jump back to real return address */
- blr
-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-
-#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 767ef6d68c9e..bfbad08a1207 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -20,7 +20,6 @@
#include <linux/errno.h>
#include <linux/err.h>
-#include <linux/magic.h>
#include <asm/unistd.h>
#include <asm/processor.h>
#include <asm/page.h>
@@ -33,7 +32,6 @@
#include <asm/bug.h>
#include <asm/ptrace.h>
#include <asm/irqflags.h>
-#include <asm/ftrace.h>
#include <asm/hw_irq.h>
#include <asm/context_tracking.h>
#include <asm/tm.h>
@@ -1173,381 +1171,3 @@ _GLOBAL(enter_prom)
ld r0,16(r1)
mtlr r0
blr
-
-#ifdef CONFIG_FUNCTION_TRACER
-#ifdef CONFIG_DYNAMIC_FTRACE
-_GLOBAL(mcount)
-_GLOBAL(_mcount)
-EXPORT_SYMBOL(_mcount)
- mflr r12
- mtctr r12
- mtlr r0
- bctr
-
-#ifndef CC_USING_MPROFILE_KERNEL
-_GLOBAL_TOC(ftrace_caller)
- /* Taken from output of objdump from lib64/glibc */
- mflr r3
- ld r11, 0(r1)
- stdu r1, -112(r1)
- std r3, 128(r1)
- ld r4, 16(r11)
- subi r3, r3, MCOUNT_INSN_SIZE
-.globl ftrace_call
-ftrace_call:
- bl ftrace_stub
- nop
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-.globl ftrace_graph_call
-ftrace_graph_call:
- b ftrace_graph_stub
-_GLOBAL(ftrace_graph_stub)
-#endif
- ld r0, 128(r1)
- mtlr r0
- addi r1, r1, 112
-
-#else /* CC_USING_MPROFILE_KERNEL */
-/*
- *
- * ftrace_caller() is the function that replaces _mcount() when ftrace is
- * active.
- *
- * We arrive here after a function A calls function B, and we are the trace
- * function for B. When we enter r1 points to A's stack frame, B has not yet
- * had a chance to allocate one yet.
- *
- * Additionally r2 may point either to the TOC for A, or B, depending on
- * whether B did a TOC setup sequence before calling us.
- *
- * On entry the LR points back to the _mcount() call site, and r0 holds the
- * saved LR as it was on entry to B, ie. the original return address at the
- * call site in A.
- *
- * Our job is to save the register state into a struct pt_regs (on the stack)
- * and then arrange for the ftrace function to be called.
- */
-_GLOBAL(ftrace_caller)
- /* Save the original return address in A's stack frame */
- std r0,LRSAVE(r1)
-
- /* Create our stack frame + pt_regs */
- stdu r1,-SWITCH_FRAME_SIZE(r1)
-
- /* Save all gprs to pt_regs */
- SAVE_8GPRS(0,r1)
- SAVE_8GPRS(8,r1)
- SAVE_8GPRS(16,r1)
- SAVE_8GPRS(24,r1)
-
- /* Load special regs for save below */
- mfmsr r8
- mfctr r9
- mfxer r10
- mfcr r11
-
- /* Get the _mcount() call site out of LR */
- mflr r7
- /* Save it as pt_regs->nip & pt_regs->link */
- std r7, _NIP(r1)
- std r7, _LINK(r1)
-
- /* Save callee's TOC in the ABI compliant location */
- std r2, 24(r1)
- ld r2,PACATOC(r13) /* get kernel TOC in r2 */
-
- addis r3,r2,function_trace_op@toc@ha
- addi r3,r3,function_trace_op@toc@l
- ld r5,0(r3)
-
-#ifdef CONFIG_LIVEPATCH
- mr r14,r7 /* remember old NIP */
-#endif
- /* Calculate ip from nip-4 into r3 for call below */
- subi r3, r7, MCOUNT_INSN_SIZE
-
- /* Put the original return address in r4 as parent_ip */
- mr r4, r0
-
- /* Save special regs */
- std r8, _MSR(r1)
- std r9, _CTR(r1)
- std r10, _XER(r1)
- std r11, _CCR(r1)
-
- /* Load &pt_regs in r6 for call below */
- addi r6, r1 ,STACK_FRAME_OVERHEAD
-
- /* ftrace_call(r3, r4, r5, r6) */
-.globl ftrace_call
-ftrace_call:
- bl ftrace_stub
- nop
-
- /* Load ctr with the possibly modified NIP */
- ld r3, _NIP(r1)
- mtctr r3
-#ifdef CONFIG_LIVEPATCH
- cmpd r14,r3 /* has NIP been altered? */
-#endif
-
- /* Restore gprs */
- REST_8GPRS(0,r1)
- REST_8GPRS(8,r1)
- REST_8GPRS(16,r1)
- REST_8GPRS(24,r1)
-
- /* Restore callee's TOC */
- ld r2, 24(r1)
-
- /* Pop our stack frame */
- addi r1, r1, SWITCH_FRAME_SIZE
-
- /* Restore original LR for return to B */
- ld r0, LRSAVE(r1)
- mtlr r0
-
-#ifdef CONFIG_LIVEPATCH
- /* Based on the cmpd above, if the NIP was altered handle livepatch */
- bne- livepatch_handler
-#endif
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- stdu r1, -112(r1)
-.globl ftrace_graph_call
-ftrace_graph_call:
- b ftrace_graph_stub
-_GLOBAL(ftrace_graph_stub)
- addi r1, r1, 112
-#endif
-
- ld r0,LRSAVE(r1) /* restore callee's lr at _mcount site */
- mtlr r0
- bctr /* jump after _mcount site */
-#endif /* CC_USING_MPROFILE_KERNEL */
-
-_GLOBAL(ftrace_stub)
- blr
-
-#ifdef CONFIG_LIVEPATCH
- /*
- * This function runs in the mcount context, between two functions. As
- * such it can only clobber registers which are volatile and used in
- * function linkage.
- *
- * We get here when a function A, calls another function B, but B has
- * been live patched with a new function C.
- *
- * On entry:
- * - we have no stack frame and can not allocate one
- * - LR points back to the original caller (in A)
- * - CTR holds the new NIP in C
- * - r0 & r12 are free
- *
- * r0 can't be used as the base register for a DS-form load or store, so
- * we temporarily shuffle r1 (stack pointer) into r0 and then put it back.
- */
-livepatch_handler:
- CURRENT_THREAD_INFO(r12, r1)
-
- /* Save stack pointer into r0 */
- mr r0, r1
-
- /* Allocate 3 x 8 bytes */
- ld r1, TI_livepatch_sp(r12)
- addi r1, r1, 24
- std r1, TI_livepatch_sp(r12)
-
- /* Save toc & real LR on livepatch stack */
- std r2, -24(r1)
- mflr r12
- std r12, -16(r1)
-
- /* Store stack end marker */
- lis r12, STACK_END_MAGIC@h
- ori r12, r12, STACK_END_MAGIC@l
- std r12, -8(r1)
-
- /* Restore real stack pointer */
- mr r1, r0
-
- /* Put ctr in r12 for global entry and branch there */
- mfctr r12
- bctrl
-
- /*
- * Now we are returning from the patched function to the original
- * caller A. We are free to use r0 and r12, and we can use r2 until we
- * restore it.
- */
-
- CURRENT_THREAD_INFO(r12, r1)
-
- /* Save stack pointer into r0 */
- mr r0, r1
-
- ld r1, TI_livepatch_sp(r12)
-
- /* Check stack marker hasn't been trashed */
- lis r2, STACK_END_MAGIC@h
- ori r2, r2, STACK_END_MAGIC@l
- ld r12, -8(r1)
-1: tdne r12, r2
- EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0
-
- /* Restore LR & toc from livepatch stack */
- ld r12, -16(r1)
- mtlr r12
- ld r2, -24(r1)
-
- /* Pop livepatch stack frame */
- CURRENT_THREAD_INFO(r12, r0)
- subi r1, r1, 24
- std r1, TI_livepatch_sp(r12)
-
- /* Restore real stack pointer */
- mr r1, r0
-
- /* Return to original caller of live patched function */
- blr
-#endif
-
-
-#else
-_GLOBAL_TOC(_mcount)
-EXPORT_SYMBOL(_mcount)
- /* Taken from output of objdump from lib64/glibc */
- mflr r3
- ld r11, 0(r1)
- stdu r1, -112(r1)
- std r3, 128(r1)
- ld r4, 16(r11)
-
- subi r3, r3, MCOUNT_INSN_SIZE
- LOAD_REG_ADDR(r5,ftrace_trace_function)
- ld r5,0(r5)
- ld r5,0(r5)
- mtctr r5
- bctrl
- nop
-
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- b ftrace_graph_caller
-#endif
- ld r0, 128(r1)
- mtlr r0
- addi r1, r1, 112
-_GLOBAL(ftrace_stub)
- blr
-
-#endif /* CONFIG_DYNAMIC_FTRACE */
-
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-#ifndef CC_USING_MPROFILE_KERNEL
-_GLOBAL(ftrace_graph_caller)
- /* load r4 with local address */
- ld r4, 128(r1)
- subi r4, r4, MCOUNT_INSN_SIZE
-
- /* Grab the LR out of the caller stack frame */
- ld r11, 112(r1)
- ld r3, 16(r11)
-
- bl prepare_ftrace_return
- nop
-
- /*
- * prepare_ftrace_return gives us the address we divert to.
- * Change the LR in the callers stack frame to this.
- */
- ld r11, 112(r1)
- std r3, 16(r11)
-
- ld r0, 128(r1)
- mtlr r0
- addi r1, r1, 112
- blr
-
-#else /* CC_USING_MPROFILE_KERNEL */
-_GLOBAL(ftrace_graph_caller)
- /* with -mprofile-kernel, parameter regs are still alive at _mcount */
- std r10, 104(r1)
- std r9, 96(r1)
- std r8, 88(r1)
- std r7, 80(r1)
- std r6, 72(r1)
- std r5, 64(r1)
- std r4, 56(r1)
- std r3, 48(r1)
-
- /* Save callee's TOC in the ABI compliant location */
- std r2, 24(r1)
- ld r2, PACATOC(r13) /* get kernel TOC in r2 */
-
- mfctr r4 /* ftrace_caller has moved local addr here */
- std r4, 40(r1)
- mflr r3 /* ftrace_caller has restored LR from stack */
- subi r4, r4, MCOUNT_INSN_SIZE
-
- bl prepare_ftrace_return
- nop
-
- /*
- * prepare_ftrace_return gives us the address we divert to.
- * Change the LR to this.
- */
- mtlr r3
-
- ld r0, 40(r1)
- mtctr r0
- ld r10, 104(r1)
- ld r9, 96(r1)
- ld r8, 88(r1)
- ld r7, 80(r1)
- ld r6, 72(r1)
- ld r5, 64(r1)
- ld r4, 56(r1)
- ld r3, 48(r1)
-
- /* Restore callee's TOC */
- ld r2, 24(r1)
-
- addi r1, r1, 112
- mflr r0
- std r0, LRSAVE(r1)
- bctr
-#endif /* CC_USING_MPROFILE_KERNEL */
-
-_GLOBAL(return_to_handler)
- /* need to save return values */
- std r4, -32(r1)
- std r3, -24(r1)
- /* save TOC */
- std r2, -16(r1)
- std r31, -8(r1)
- mr r31, r1
- stdu r1, -112(r1)
-
- /*
- * We might be called from a module.
- * Switch to our TOC to run inside the core kernel.
- */
- ld r2, PACATOC(r13)
-
- bl ftrace_return_to_handler
- nop
-
- /* return value has real return address */
- mtlr r3
-
- ld r1, 0(r1)
- ld r4, -32(r1)
- ld r3, -24(r1)
- ld r2, -16(r1)
- ld r31, -8(r1)
-
- /* Jump back to real return address */
- blr
-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 6353019966e6..a9312b52fe6f 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -116,9 +116,11 @@ EXC_VIRT_NONE(0x4000, 0x100)
EXC_REAL_BEGIN(system_reset, 0x100, 0x100)
SET_SCRATCH0(r13)
- GET_PACA(r13)
- clrrdi r13,r13,1 /* Last bit of HSPRG0 is set if waking from winkle */
- EXCEPTION_PROLOG_PSERIES_PACA(PACA_EXGEN, system_reset_common, EXC_STD,
+ /*
+ * MSR_RI is not enabled, because PACA_EXNMI and nmi stack is
+ * being used, so a nested NMI exception would corrupt it.
+ */
+ EXCEPTION_PROLOG_PSERIES_NORI(PACA_EXNMI, system_reset_common, EXC_STD,
IDLETEST, 0x100)
EXC_REAL_END(system_reset, 0x100, 0x100)
@@ -126,34 +128,37 @@ EXC_VIRT_NONE(0x4100, 0x100)
#ifdef CONFIG_PPC_P7_NAP
EXC_COMMON_BEGIN(system_reset_idle_common)
-BEGIN_FTR_SECTION
- GET_PACA(r13) /* Restore HSPRG0 to get the winkle bit in r13 */
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
- bl pnv_restore_hyp_resource
+ b pnv_powersave_wakeup
+#endif
- li r0,PNV_THREAD_RUNNING
- stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */
+EXC_COMMON_BEGIN(system_reset_common)
+ /*
+ * Increment paca->in_nmi then enable MSR_RI. SLB or MCE will be able
+ * to recover, but nested NMI will notice in_nmi and not recover
+ * because of the use of the NMI stack. in_nmi reentrancy is tested in
+ * system_reset_exception.
+ */
+ lhz r10,PACA_IN_NMI(r13)
+ addi r10,r10,1
+ sth r10,PACA_IN_NMI(r13)
+ li r10,MSR_RI
+ mtmsrd r10,1
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
- li r0,KVM_HWTHREAD_IN_KERNEL
- stb r0,HSTATE_HWTHREAD_STATE(r13)
- /* Order setting hwthread_state vs. testing hwthread_req */
- sync
- lbz r0,HSTATE_HWTHREAD_REQ(r13)
- cmpwi r0,0
- beq 1f
- BRANCH_TO_KVM(r10, kvm_start_guest)
-1:
-#endif
+ mr r10,r1
+ ld r1,PACA_NMI_EMERG_SP(r13)
+ subi r1,r1,INT_FRAME_SIZE
+ EXCEPTION_COMMON_NORET_STACK(PACA_EXNMI, 0x100,
+ system_reset, system_reset_exception,
+ ADD_NVGPRS;ADD_RECONCILE)
- /* Return SRR1 from power7_nap() */
- mfspr r3,SPRN_SRR1
- blt cr3,2f
- b pnv_wakeup_loss
-2: b pnv_wakeup_noloss
-#endif
+ /*
+ * The stack is no longer in use, decrement in_nmi.
+ */
+ lhz r10,PACA_IN_NMI(r13)
+ subi r10,r10,1
+ sth r10,PACA_IN_NMI(r13)
-EXC_COMMON(system_reset_common, 0x100, system_reset_exception)
+ b ret_from_except
#ifdef CONFIG_PPC_PSERIES
/*
@@ -161,8 +166,9 @@ EXC_COMMON(system_reset_common, 0x100, system_reset_exception)
*/
TRAMP_REAL_BEGIN(system_reset_fwnmi)
SET_SCRATCH0(r13) /* save r13 */
- EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
- NOTEST, 0x100)
+ /* See comment at system_reset exception */
+ EXCEPTION_PROLOG_PSERIES_NORI(PACA_EXNMI, system_reset_common,
+ EXC_STD, NOTEST, 0x100)
#endif /* CONFIG_PPC_PSERIES */
@@ -172,14 +178,6 @@ EXC_REAL_BEGIN(machine_check, 0x200, 0x100)
* vector
*/
SET_SCRATCH0(r13) /* save r13 */
- /*
- * Running native on arch 2.06 or later, we may wakeup from winkle
- * inside machine check. If yes, then last bit of HSPRG0 would be set
- * to 1. Hence clear it unconditionally.
- */
- GET_PACA(r13)
- clrrdi r13,r13,1
- SET_PACA(r13)
EXCEPTION_PROLOG_0(PACA_EXMC)
BEGIN_FTR_SECTION
b machine_check_powernv_early
@@ -212,6 +210,12 @@ BEGIN_FTR_SECTION
* NOTE: We are here with MSR_ME=0 (off), which means we risk a
* checkstop if we get another machine check exception before we do
* rfid with MSR_ME=1.
+ *
+ * This interrupt can wake directly from idle. If that is the case,
+ * the machine check is handled then the idle wakeup code is called
+ * to restore state. In that case, the POWER9 DD1 idle PACA workaround
+ * is not applied in the early machine check code, which will cause
+ * bugs.
*/
mr r11,r1 /* Save r1 */
lhz r10,PACA_IN_MCE(r13)
@@ -268,20 +272,11 @@ machine_check_fwnmi:
machine_check_pSeries_0:
EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
/*
- * The following is essentially EXCEPTION_PROLOG_PSERIES_1 with the
- * difference that MSR_RI is not enabled, because PACA_EXMC is being
- * used, so nested machine check corrupts it. machine_check_common
- * enables MSR_RI.
+ * MSR_RI is not enabled, because PACA_EXMC is being used, so a
+ * nested machine check corrupts it. machine_check_common enables
+ * MSR_RI.
*/
- ld r10,PACAKMSR(r13)
- xori r10,r10,MSR_RI
- mfspr r11,SPRN_SRR0
- LOAD_HANDLER(r12, machine_check_common)
- mtspr SPRN_SRR0,r12
- mfspr r12,SPRN_SRR1
- mtspr SPRN_SRR1,r10
- rfid
- b . /* prevent speculative execution */
+ EXCEPTION_PROLOG_PSERIES_1_NORI(machine_check_common, EXC_STD)
TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
@@ -340,6 +335,37 @@ EXC_COMMON_BEGIN(machine_check_common)
/* restore original r1. */ \
ld r1,GPR1(r1)
+#ifdef CONFIG_PPC_P7_NAP
+/*
+ * This is an idle wakeup. Low level machine check has already been
+ * done. Queue the event then call the idle code to do the wake up.
+ */
+EXC_COMMON_BEGIN(machine_check_idle_common)
+ bl machine_check_queue_event
+
+ /*
+ * We have not used any non-volatile GPRs here, and as a rule
+ * most exception code including machine check does not.
+ * Therefore PACA_NAPSTATELOST does not need to be set. Idle
+ * wakeup will restore volatile registers.
+ *
+ * Load the original SRR1 into r3 for pnv_powersave_wakeup_mce.
+ *
+ * Then decrement MCE nesting after finishing with the stack.
+ */
+ ld r3,_MSR(r1)
+
+ lhz r11,PACA_IN_MCE(r13)
+ subi r11,r11,1
+ sth r11,PACA_IN_MCE(r13)
+
+ /* Turn off the RI bit because SRR1 is used by idle wakeup code. */
+ /* Recoverability could be improved by reducing the use of SRR1. */
+ li r11,0
+ mtmsrd r11,1
+
+ b pnv_powersave_wakeup_mce
+#endif
/*
* Handle machine check early in real mode. We come here with
* ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
@@ -352,6 +378,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
bl machine_check_early
std r3,RESULT(r1) /* Save result */
ld r12,_MSR(r1)
+
#ifdef CONFIG_PPC_P7_NAP
/*
* Check if thread was in power saving mode. We come here when any
@@ -362,48 +389,14 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
*
* Go back to nap/sleep/winkle mode again if (b) is true.
*/
- rlwinm. r11,r12,47-31,30,31 /* Was it in power saving mode? */
- beq 4f /* No, it wasn;t */
- /* Thread was in power saving mode. Go back to nap again. */
- cmpwi r11,2
- blt 3f
- /* Supervisor/Hypervisor state loss */
- li r0,1
- stb r0,PACA_NAPSTATELOST(r13)
-3: bl machine_check_queue_event
- MACHINE_CHECK_HANDLER_WINDUP
- GET_PACA(r13)
- ld r1,PACAR1(r13)
- /*
- * Check what idle state this CPU was in and go back to same mode
- * again.
- */
- lbz r3,PACA_THREAD_IDLE_STATE(r13)
- cmpwi r3,PNV_THREAD_NAP
- bgt 10f
- IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP)
- /* No return */
-10:
- cmpwi r3,PNV_THREAD_SLEEP
- bgt 2f
- IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
- /* No return */
-
-2:
- /*
- * Go back to winkle. Please note that this thread was woken up in
- * machine check from winkle and have not restored the per-subcore
- * state. Hence before going back to winkle, set last bit of HSPRG0
- * to 1. This will make sure that if this thread gets woken up
- * again at reset vector 0x100 then it will get chance to restore
- * the subcore state.
- */
- ori r13,r13,1
- SET_PACA(r13)
- IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
- /* No return */
+ BEGIN_FTR_SECTION
+ rlwinm. r11,r12,47-31,30,31
+ beq- 4f
+ BRANCH_TO_COMMON(r10, machine_check_idle_common)
4:
+ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
#endif
+
/*
* Check if we are coming from hypervisor userspace. If yes then we
* continue in host kernel in V mode to deliver the MC event.
@@ -968,17 +961,12 @@ EXC_VIRT_NONE(0x4e60, 0x20)
TRAMP_KVM_HV(PACA_EXGEN, 0xe60)
TRAMP_REAL_BEGIN(hmi_exception_early)
EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, 0xe60)
- mr r10,r1 /* Save r1 */
- ld r1,PACAEMERGSP(r13) /* Use emergency stack */
+ mr r10,r1 /* Save r1 */
+ ld r1,PACAEMERGSP(r13) /* Use emergency stack for realmode */
subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */
- std r9,_CCR(r1) /* save CR in stackframe */
mfspr r11,SPRN_HSRR0 /* Save HSRR0 */
- std r11,_NIP(r1) /* save HSRR0 in stackframe */
- mfspr r12,SPRN_HSRR1 /* Save SRR1 */
- std r12,_MSR(r1) /* save SRR1 in stackframe */
- std r10,0(r1) /* make stack chain pointer */
- std r0,GPR0(r1) /* save r0 in stackframe */
- std r10,GPR1(r1) /* save r1 in stackframe */
+ mfspr r12,SPRN_HSRR1 /* Save HSRR1 */
+ EXCEPTION_PROLOG_COMMON_1()
EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
EXCEPTION_PROLOG_COMMON_3(0xe60)
addi r3,r1,STACK_FRAME_OVERHEAD
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 8ff0dd4e77a7..243dbef7e926 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -30,17 +30,16 @@
#include <linux/string.h>
#include <linux/memblock.h>
#include <linux/delay.h>
-#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/crash_dump.h>
#include <linux/kobject.h>
#include <linux/sysfs.h>
+#include <asm/debugfs.h>
#include <asm/page.h>
#include <asm/prom.h>
#include <asm/rtas.h>
#include <asm/fadump.h>
-#include <asm/debug.h>
#include <asm/setup.h>
static struct fw_dump fw_dump;
@@ -319,15 +318,34 @@ int __init fadump_reserve_mem(void)
pr_debug("fadumphdr_addr = %p\n",
(void *) fw_dump.fadumphdr_addr);
} else {
- /* Reserve the memory at the top of memory. */
size = get_fadump_area_size();
- base = memory_boundary - size;
- memblock_reserve(base, size);
- printk(KERN_INFO "Reserved %ldMB of memory at %ldMB "
- "for firmware-assisted dump\n",
- (unsigned long)(size >> 20),
- (unsigned long)(base >> 20));
+
+ /*
+ * Reserve memory at an offset closer to bottom of the RAM to
+ * minimize the impact of memory hot-remove operation. We can't
+ * use memblock_find_in_range() here since it doesn't allocate
+ * from bottom to top.
+ */
+ for (base = fw_dump.boot_memory_size;
+ base <= (memory_boundary - size);
+ base += size) {
+ if (memblock_is_region_memory(base, size) &&
+ !memblock_is_region_reserved(base, size))
+ break;
+ }
+ if ((base > (memory_boundary - size)) ||
+ memblock_reserve(base, size)) {
+ pr_err("Failed to reserve memory\n");
+ return 0;
+ }
+
+ pr_info("Reserved %ldMB of memory at %ldMB for firmware-"
+ "assisted dump (System RAM: %ldMB)\n",
+ (unsigned long)(size >> 20),
+ (unsigned long)(base >> 20),
+ (unsigned long)(memblock_phys_mem_size() >> 20));
}
+
fw_dump.reserve_dump_area_start = base;
fw_dump.reserve_dump_area_size = size;
return 1;
diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S
index 1607be7c0ef2..e22734278458 100644
--- a/arch/powerpc/kernel/head_32.S
+++ b/arch/powerpc/kernel/head_32.S
@@ -735,11 +735,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_NEED_DTLB_SW_LRU)
EXCEPTION(0x2c00, Trap_2c, unknown_exception, EXC_XFER_EE)
EXCEPTION(0x2d00, Trap_2d, unknown_exception, EXC_XFER_EE)
EXCEPTION(0x2e00, Trap_2e, unknown_exception, EXC_XFER_EE)
- EXCEPTION(0x2f00, MOLTrampoline, unknown_exception, EXC_XFER_EE_LITE)
-
- .globl mol_trampoline
- .set mol_trampoline, i0x2f00
- EXPORT_SYMBOL(mol_trampoline)
+ EXCEPTION(0x2f00, Trap_2f, unknown_exception, EXC_XFER_EE)
. = 0x3000
@@ -1278,16 +1274,6 @@ EXPORT_SYMBOL(empty_zero_page)
swapper_pg_dir:
.space PGD_TABLE_SIZE
- .globl intercept_table
-intercept_table:
- .long 0, 0, i0x200, i0x300, i0x400, 0, i0x600, i0x700
- .long i0x800, 0, 0, 0, 0, i0xd00, 0, 0
- .long 0, 0, 0, i0x1300, 0, 0, 0, 0
- .long 0, 0, 0, 0, 0, 0, 0, 0
- .long 0, 0, 0, 0, 0, 0, 0, 0
- .long 0, 0, 0, 0, 0, 0, 0, 0
-EXPORT_SYMBOL(intercept_table)
-
/* Room for two PTE pointers, usually the kernel and current user pointers
* to their respective root page table.
*/
diff --git a/arch/powerpc/kernel/head_64.S b/arch/powerpc/kernel/head_64.S
index 1dc5eae2ced3..0ddc602b33a4 100644
--- a/arch/powerpc/kernel/head_64.S
+++ b/arch/powerpc/kernel/head_64.S
@@ -949,7 +949,8 @@ start_here_multiplatform:
LOAD_REG_ADDR(r3,init_thread_union)
/* set up a stack pointer */
- addi r1,r3,THREAD_SIZE
+ LOAD_REG_IMMEDIATE(r1,THREAD_SIZE)
+ add r1,r3,r1
li r0,0
stdu r0,-STACK_FRAME_OVERHEAD(r1)
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 6fd08219248d..07d4e0ad60db 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -20,6 +20,7 @@
#include <asm/kvm_book3s_asm.h>
#include <asm/opal.h>
#include <asm/cpuidle.h>
+#include <asm/exception-64s.h>
#include <asm/book3s/64/mmu-hash.h>
#include <asm/mmu.h>
@@ -94,12 +95,12 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
core_idle_lock_held:
HMT_LOW
3: lwz r15,0(r14)
- andi. r15,r15,PNV_CORE_IDLE_LOCK_BIT
+ andis. r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
bne 3b
HMT_MEDIUM
lwarx r15,0,r14
- andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT
- bne core_idle_lock_held
+ andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+ bne- core_idle_lock_held
blr
/*
@@ -113,7 +114,7 @@ core_idle_lock_held:
*
* Address to 'rfid' to in r5
*/
-_GLOBAL(pnv_powersave_common)
+pnv_powersave_common:
/* Use r3 to pass state nap/sleep/winkle */
/* NAP is a state loss, we create a regs frame on the
* stack, fill it up with the state we care about and
@@ -188,8 +189,8 @@ pnv_enter_arch207_idle_mode:
/* The following store to HSTATE_HWTHREAD_STATE(r13) */
/* MUST occur in real mode, i.e. with the MMU off, */
/* and the MMU must stay off until we clear this flag */
- /* and test HSTATE_HWTHREAD_REQ(r13) in the system */
- /* reset interrupt vector in exceptions-64s.S. */
+ /* and test HSTATE_HWTHREAD_REQ(r13) in */
+ /* pnv_powersave_wakeup in this file. */
/* The reason is that another thread can switch the */
/* MMU to a guest context whenever this flag is set */
/* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on, */
@@ -209,15 +210,20 @@ pnv_enter_arch207_idle_mode:
/* Sleep or winkle */
lbz r7,PACA_THREAD_MASK(r13)
ld r14,PACA_CORE_IDLE_STATE_PTR(r13)
+ li r5,0
+ beq cr3,3f
+ lis r5,PNV_CORE_IDLE_WINKLE_COUNT@h
+3:
lwarx_loop1:
lwarx r15,0,r14
- andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT
- bnel core_idle_lock_held
+ andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+ bnel- core_idle_lock_held
+ add r15,r15,r5 /* Add if winkle */
andc r15,r15,r7 /* Clear thread bit */
- andi. r15,r15,PNV_CORE_IDLE_THREAD_BITS
+ andi. r9,r15,PNV_CORE_IDLE_THREAD_BITS
/*
* If cr0 = 0, then current thread is the last thread of the core entering
@@ -240,7 +246,7 @@ common_enter: /* common code for all the threads entering sleep or winkle */
IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
fastsleep_workaround_at_entry:
- ori r15,r15,PNV_CORE_IDLE_LOCK_BIT
+ oris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
stwcx. r15,0,r14
bne- lwarx_loop1
isync
@@ -250,10 +256,10 @@ fastsleep_workaround_at_entry:
li r4,1
bl opal_config_cpu_idle_state
- /* Clear Lock bit */
- li r0,0
+ /* Unlock */
+ xoris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
lwsync
- stw r0,0(r14)
+ stw r15,0(r14)
b common_enter
enter_winkle:
@@ -301,8 +307,8 @@ power_enter_stop:
lwarx_loop_stop:
lwarx r15,0,r14
- andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT
- bnel core_idle_lock_held
+ andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+ bnel- core_idle_lock_held
andc r15,r15,r7 /* Clear thread bit */
stwcx. r15,0,r14
@@ -375,17 +381,113 @@ _GLOBAL(power9_idle_stop)
li r4,1
b pnv_powersave_common
/* No return */
+
/*
- * Called from reset vector. Check whether we have woken up with
- * hypervisor state loss. If yes, restore hypervisor state and return
- * back to reset vector.
+ * On waking up from stop 0,1,2 with ESL=1 on POWER9 DD1,
+ * HSPRG0 will be set to the HSPRG0 value of one of the
+ * threads in this core. Thus the value we have in r13
+ * may not be this thread's paca pointer.
+ *
+ * Fortunately, the TIR remains invariant. Since this thread's
+ * paca pointer is recorded in all its sibling's paca, we can
+ * correctly recover this thread's paca pointer if we
+ * know the index of this thread in the core.
+ *
+ * This index can be obtained from the TIR.
*
- * r13 - Contents of HSPRG0
+ * i.e, thread's position in the core = TIR.
+ * If this value is i, then this thread's paca is
+ * paca->thread_sibling_pacas[i].
+ */
+power9_dd1_recover_paca:
+ mfspr r4, SPRN_TIR
+ /*
+ * Since each entry in thread_sibling_pacas is 8 bytes
+ * we need to left-shift by 3 bits. Thus r4 = i * 8
+ */
+ sldi r4, r4, 3
+ /* Get &paca->thread_sibling_pacas[0] in r5 */
+ ld r5, PACA_SIBLING_PACA_PTRS(r13)
+ /* Load paca->thread_sibling_pacas[i] into r13 */
+ ldx r13, r4, r5
+ SET_PACA(r13)
+ /*
+ * Indicate that we have lost NVGPR state
+ * which needs to be restored from the stack.
+ */
+ li r3, 1
+ stb r0,PACA_NAPSTATELOST(r13)
+ blr
+
+/*
+ * Called from machine check handler for powersave wakeups.
+ * Low level machine check processing has already been done. Now just
+ * go through the wake up path to get everything in order.
+ *
+ * r3 - The original SRR1 value.
+ * Original SRR[01] have been clobbered.
+ * MSR_RI is clear.
+ */
+.global pnv_powersave_wakeup_mce
+pnv_powersave_wakeup_mce:
+ /* Set cr3 for pnv_powersave_wakeup */
+ rlwinm r11,r3,47-31,30,31
+ cmpwi cr3,r11,2
+
+ /*
+ * Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake
+ * reason into SRR1, which allows reuse of the system reset wakeup
+ * code without being mistaken for another type of wakeup.
+ */
+ oris r3,r3,SRR1_WAKEMCE_RESVD@h
+ mtspr SPRN_SRR1,r3
+
+ b pnv_powersave_wakeup
+
+/*
+ * Called from reset vector for powersave wakeups.
* cr3 - set to gt if waking up with partial/complete hypervisor state loss
*/
-_GLOBAL(pnv_restore_hyp_resource)
+.global pnv_powersave_wakeup
+pnv_powersave_wakeup:
+ ld r2, PACATOC(r13)
+
BEGIN_FTR_SECTION
- ld r2,PACATOC(r13);
+BEGIN_FTR_SECTION_NESTED(70)
+ bl power9_dd1_recover_paca
+END_FTR_SECTION_NESTED_IFSET(CPU_FTR_POWER9_DD1, 70)
+ bl pnv_restore_hyp_resource_arch300
+FTR_SECTION_ELSE
+ bl pnv_restore_hyp_resource_arch207
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
+
+ li r0,PNV_THREAD_RUNNING
+ stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ li r0,KVM_HWTHREAD_IN_KERNEL
+ stb r0,HSTATE_HWTHREAD_STATE(r13)
+ /* Order setting hwthread_state vs. testing hwthread_req */
+ sync
+ lbz r0,HSTATE_HWTHREAD_REQ(r13)
+ cmpwi r0,0
+ beq 1f
+ b kvm_start_guest
+1:
+#endif
+
+ /* Return SRR1 from power7_nap() */
+ mfspr r3,SPRN_SRR1
+ blt cr3,pnv_wakeup_noloss
+ b pnv_wakeup_loss
+
+/*
+ * Check whether we have woken up with hypervisor state loss.
+ * If yes, restore hypervisor state and return back to link.
+ *
+ * cr3 - set to gt if waking up with partial/complete hypervisor state loss
+ */
+pnv_restore_hyp_resource_arch300:
/*
* POWER ISA 3. Use PSSCR to determine if we
* are waking up from deep idle state
@@ -400,31 +502,19 @@ BEGIN_FTR_SECTION
*/
rldicl r5,r5,4,60
cmpd cr4,r5,r4
- bge cr4,pnv_wakeup_tb_loss
- /*
- * Waking up without hypervisor state loss. Return to
- * reset vector
- */
- blr
+ bge cr4,pnv_wakeup_tb_loss /* returns to caller */
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+ blr /* Waking up without hypervisor state loss. */
+/* Same calling convention as arch300 */
+pnv_restore_hyp_resource_arch207:
/*
* POWER ISA 2.07 or less.
- * Check if last bit of HSPGR0 is set. This indicates whether we are
- * waking up from winkle.
+ * Check if we slept with sleep or winkle.
*/
- clrldi r5,r13,63
- clrrdi r13,r13,1
-
- /* Now that we are sure r13 is corrected, load TOC */
- ld r2,PACATOC(r13);
- cmpwi cr4,r5,1
- mtspr SPRN_HSPRG0,r13
-
- lbz r0,PACA_THREAD_IDLE_STATE(r13)
- cmpwi cr2,r0,PNV_THREAD_NAP
- bgt cr2,pnv_wakeup_tb_loss /* Either sleep or Winkle */
+ lbz r4,PACA_THREAD_IDLE_STATE(r13)
+ cmpwi cr2,r4,PNV_THREAD_NAP
+ bgt cr2,pnv_wakeup_tb_loss /* Either sleep or Winkle */
/*
* We fall through here if PACA_THREAD_IDLE_STATE shows we are waking
@@ -433,8 +523,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
*/
bgt cr3,.
- blr /* Return back to System Reset vector from where
- pnv_restore_hyp_resource was invoked */
+ blr /* Waking up without hypervisor state loss */
/*
* Called if waking up from idle state which can cause either partial or
@@ -444,9 +533,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
*
* r13 - PACA
* cr3 - gt if waking up with partial/complete hypervisor state loss
+ *
+ * If ISA300:
* cr4 - gt or eq if waking up from complete hypervisor state loss.
+ *
+ * If ISA207:
+ * r4 - PACA_THREAD_IDLE_STATE
*/
-_GLOBAL(pnv_wakeup_tb_loss)
+pnv_wakeup_tb_loss:
ld r1,PACAR1(r13)
/*
* Before entering any idle state, the NVGPRs are saved in the stack.
@@ -473,18 +567,19 @@ _GLOBAL(pnv_wakeup_tb_loss)
* is required to return back to reset vector after hypervisor state
* restore is complete.
*/
+ mr r18,r4
mflr r17
mfspr r16,SPRN_SRR1
BEGIN_FTR_SECTION
CHECK_HMI_INTERRUPT
END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
- lbz r7,PACA_THREAD_MASK(r13)
ld r14,PACA_CORE_IDLE_STATE_PTR(r13)
-lwarx_loop2:
- lwarx r15,0,r14
- andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT
+ lbz r7,PACA_THREAD_MASK(r13)
+
/*
+ * Take the core lock to synchronize against other threads.
+ *
* Lock bit is set in one of the 2 cases-
* a. In the sleep/winkle enter path, the last thread is executing
* fastsleep workaround code.
@@ -492,23 +587,93 @@ lwarx_loop2:
* workaround undo code or resyncing timebase or restoring context
* In either case loop until the lock bit is cleared.
*/
- bnel core_idle_lock_held
+1:
+ lwarx r15,0,r14
+ andis. r9,r15,PNV_CORE_IDLE_LOCK_BIT@h
+ bnel- core_idle_lock_held
+ oris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
+ stwcx. r15,0,r14
+ bne- 1b
+ isync
- cmpwi cr2,r15,0
+ andi. r9,r15,PNV_CORE_IDLE_THREAD_BITS
+ cmpwi cr2,r9,0
/*
* At this stage
* cr2 - eq if first thread to wakeup in core
* cr3- gt if waking up with partial/complete hypervisor state loss
+ * ISA300:
* cr4 - gt or eq if waking up from complete hypervisor state loss.
*/
- ori r15,r15,PNV_CORE_IDLE_LOCK_BIT
- stwcx. r15,0,r14
- bne- lwarx_loop2
- isync
-
BEGIN_FTR_SECTION
+ /*
+ * Were we in winkle?
+ * If yes, check if all threads were in winkle, decrement our
+ * winkle count, set all thread winkle bits if all were in winkle.
+ * Check if our thread has a winkle bit set, and set cr4 accordingly
+ * (to match ISA300, above). Pseudo-code for core idle state
+ * transitions for ISA207 is as follows (everything happens atomically
+ * due to store conditional and/or lock bit):
+ *
+ * nap_idle() { }
+ * nap_wake() { }
+ *
+ * sleep_idle()
+ * {
+ * core_idle_state &= ~thread_in_core
+ * }
+ *
+ * sleep_wake()
+ * {
+ * bool first_in_core, first_in_subcore;
+ *
+ * first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0;
+ * first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0;
+ *
+ * core_idle_state |= thread_in_core;
+ * }
+ *
+ * winkle_idle()
+ * {
+ * core_idle_state &= ~thread_in_core;
+ * core_idle_state += 1 << WINKLE_COUNT_SHIFT;
+ * }
+ *
+ * winkle_wake()
+ * {
+ * bool first_in_core, first_in_subcore, winkle_state_lost;
+ *
+ * first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0;
+ * first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0;
+ *
+ * core_idle_state |= thread_in_core;
+ *
+ * if ((core_idle_state & WINKLE_MASK) == (8 << WINKLE_COUNT_SIHFT))
+ * core_idle_state |= THREAD_WINKLE_BITS;
+ * core_idle_state -= 1 << WINKLE_COUNT_SHIFT;
+ *
+ * winkle_state_lost = core_idle_state &
+ * (thread_in_core << WINKLE_THREAD_SHIFT);
+ * core_idle_state &= ~(thread_in_core << WINKLE_THREAD_SHIFT);
+ * }
+ *
+ */
+ cmpwi r18,PNV_THREAD_WINKLE
+ bne 2f
+ andis. r9,r15,PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT@h
+ subis r15,r15,PNV_CORE_IDLE_WINKLE_COUNT@h
+ beq 2f
+ ori r15,r15,PNV_CORE_IDLE_THREAD_WINKLE_BITS /* all were winkle */
+2:
+ /* Shift thread bit to winkle mask, then test if this thread is set,
+ * and remove it from the winkle bits */
+ slwi r8,r7,8
+ and r8,r8,r15
+ andc r15,r15,r8
+ cmpwi cr4,r8,1 /* cr4 will be gt if our bit is set, lt if not */
+
lbz r4,PACA_SUBCORE_SIBLING_MASK(r13)
and r4,r4,r15
cmpwi r4,0 /* Check if first in subcore */
@@ -593,7 +758,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
mtspr SPRN_WORC,r4
clear_lock:
- andi. r15,r15,PNV_CORE_IDLE_THREAD_BITS
+ xoris r15,r15,PNV_CORE_IDLE_LOCK_BIT@h
lwsync
stw r15,0(r14)
@@ -651,8 +816,7 @@ hypervisor_state_restored:
mtspr SPRN_SRR1,r16
mtlr r17
- blr /* Return back to System Reset vector from where
- pnv_restore_hyp_resource was invoked */
+ blr /* return to pnv_powersave_wakeup */
fastsleep_workaround_at_exit:
li r3,1
@@ -664,7 +828,8 @@ fastsleep_workaround_at_exit:
* R3 here contains the value that will be returned to the caller
* of power7_nap.
*/
-_GLOBAL(pnv_wakeup_loss)
+.global pnv_wakeup_loss
+pnv_wakeup_loss:
ld r1,PACAR1(r13)
BEGIN_FTR_SECTION
CHECK_HMI_INTERRUPT
@@ -684,7 +849,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
* R3 here contains the value that will be returned to the caller
* of power7_nap.
*/
-_GLOBAL(pnv_wakeup_noloss)
+pnv_wakeup_noloss:
lbz r0,PACA_NAPSTATELOST(r13)
cmpwi r0,0
bne pnv_wakeup_loss
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 5f202a566ec5..5a3231fedf08 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -711,13 +711,16 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
return tbl;
}
-void iommu_free_table(struct iommu_table *tbl, const char *node_name)
+static void iommu_table_free(struct kref *kref)
{
unsigned long bitmap_sz;
unsigned int order;
+ struct iommu_table *tbl;
- if (!tbl)
- return;
+ tbl = container_of(kref, struct iommu_table, it_kref);
+
+ if (tbl->it_ops->free)
+ tbl->it_ops->free(tbl);
if (!tbl->it_map) {
kfree(tbl);
@@ -733,7 +736,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
/* verify that table contains no entries */
if (!bitmap_empty(tbl->it_map, tbl->it_size))
- pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name);
+ pr_warn("%s: Unexpected TCEs\n", __func__);
/* calculate bitmap size in bytes */
bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long);
@@ -746,6 +749,24 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
kfree(tbl);
}
+struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl)
+{
+ if (kref_get_unless_zero(&tbl->it_kref))
+ return tbl;
+
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_table_get);
+
+int iommu_tce_table_put(struct iommu_table *tbl)
+{
+ if (WARN_ON(!tbl))
+ return 0;
+
+ return kref_put(&tbl->it_kref, iommu_table_free);
+}
+EXPORT_SYMBOL_GPL(iommu_tce_table_put);
+
/* Creates TCEs for a user provided buffer. The user buffer must be
* contiguous real kernel storage (not vmalloc). The address passed here
* comprises a page address and offset into that page. The dma_addr_t
@@ -1004,6 +1025,31 @@ long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry,
}
EXPORT_SYMBOL_GPL(iommu_tce_xchg);
+#ifdef CONFIG_PPC_BOOK3S_64
+long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry,
+ unsigned long *hpa, enum dma_data_direction *direction)
+{
+ long ret;
+
+ ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
+
+ if (!ret && ((*direction == DMA_FROM_DEVICE) ||
+ (*direction == DMA_BIDIRECTIONAL))) {
+ struct page *pg = realmode_pfn_to_page(*hpa >> PAGE_SHIFT);
+
+ if (likely(pg)) {
+ SetPageDirty(pg);
+ } else {
+ tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
+ ret = -EFAULT;
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(iommu_tce_xchg_rm);
+#endif
+
int iommu_take_ownership(struct iommu_table *tbl)
{
unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index a018f5cae899..5c291df30fe3 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -65,7 +65,6 @@
#include <asm/machdep.h>
#include <asm/udbg.h>
#include <asm/smp.h>
-#include <asm/debug.h>
#include <asm/livepatch.h>
#include <asm/asm-prototypes.h>
@@ -442,46 +441,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
return sum;
}
-#ifdef CONFIG_HOTPLUG_CPU
-void migrate_irqs(void)
-{
- struct irq_desc *desc;
- unsigned int irq;
- static int warned;
- cpumask_var_t mask;
- const struct cpumask *map = cpu_online_mask;
-
- alloc_cpumask_var(&mask, GFP_KERNEL);
-
- for_each_irq_desc(irq, desc) {
- struct irq_data *data;
- struct irq_chip *chip;
-
- data = irq_desc_get_irq_data(desc);
- if (irqd_is_per_cpu(data))
- continue;
-
- chip = irq_data_get_irq_chip(data);
-
- cpumask_and(mask, irq_data_get_affinity_mask(data), map);
- if (cpumask_any(mask) >= nr_cpu_ids) {
- pr_warn("Breaking affinity for irq %i\n", irq);
- cpumask_copy(mask, map);
- }
- if (chip->irq_set_affinity)
- chip->irq_set_affinity(data, mask, true);
- else if (desc->action && !(warned++))
- pr_err("Cannot set affinity for irq %i\n", irq);
- }
-
- free_cpumask_var(mask);
-
- local_irq_enable();
- mdelay(1);
- local_irq_disable();
-}
-#endif
-
static inline void check_stack_overflow(void)
{
#ifdef CONFIG_DEBUG_STACKOVERFLOW
diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c
new file mode 100644
index 000000000000..6c089d9757c9
--- /dev/null
+++ b/arch/powerpc/kernel/kprobes-ftrace.c
@@ -0,0 +1,104 @@
+/*
+ * Dynamic Ftrace based Kprobes Optimization
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) Hitachi Ltd., 2012
+ * Copyright 2016 Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
+ * IBM Corporation
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/hardirq.h>
+#include <linux/preempt.h>
+#include <linux/ftrace.h>
+
+static nokprobe_inline
+int __skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb, unsigned long orig_nip)
+{
+ /*
+ * Emulate singlestep (and also recover regs->nip)
+ * as if there is a nop
+ */
+ regs->nip = (unsigned long)p->addr + MCOUNT_INSN_SIZE;
+ if (unlikely(p->post_handler)) {
+ kcb->kprobe_status = KPROBE_HIT_SSDONE;
+ p->post_handler(p, regs, 0);
+ }
+ __this_cpu_write(current_kprobe, NULL);
+ if (orig_nip)
+ regs->nip = orig_nip;
+ return 1;
+}
+
+int skip_singlestep(struct kprobe *p, struct pt_regs *regs,
+ struct kprobe_ctlblk *kcb)
+{
+ if (kprobe_ftrace(p))
+ return __skip_singlestep(p, regs, kcb, 0);
+ else
+ return 0;
+}
+NOKPROBE_SYMBOL(skip_singlestep);
+
+/* Ftrace callback handler for kprobes */
+void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip,
+ struct ftrace_ops *ops, struct pt_regs *regs)
+{
+ struct kprobe *p;
+ struct kprobe_ctlblk *kcb;
+ unsigned long flags;
+
+ /* Disable irq for emulating a breakpoint and avoiding preempt */
+ local_irq_save(flags);
+ hard_irq_disable();
+
+ p = get_kprobe((kprobe_opcode_t *)nip);
+ if (unlikely(!p) || kprobe_disabled(p))
+ goto end;
+
+ kcb = get_kprobe_ctlblk();
+ if (kprobe_running()) {
+ kprobes_inc_nmissed_count(p);
+ } else {
+ unsigned long orig_nip = regs->nip;
+
+ /*
+ * On powerpc, NIP is *before* this instruction for the
+ * pre handler
+ */
+ regs->nip -= MCOUNT_INSN_SIZE;
+
+ __this_cpu_write(current_kprobe, p);
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ if (!p->pre_handler || !p->pre_handler(p, regs))
+ __skip_singlestep(p, regs, kcb, orig_nip);
+ /*
+ * If pre_handler returns !0, it sets regs->nip and
+ * resets current kprobe.
+ */
+ }
+end:
+ local_irq_restore(flags);
+}
+NOKPROBE_SYMBOL(kprobe_ftrace_handler);
+
+int arch_prepare_kprobe_ftrace(struct kprobe *p)
+{
+ p->ainsn.insn = NULL;
+ p->ainsn.boostable = -1;
+ return 0;
+}
diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c
index fce05a38851c..160ae0fa7d0d 100644
--- a/arch/powerpc/kernel/kprobes.c
+++ b/arch/powerpc/kernel/kprobes.c
@@ -35,6 +35,7 @@
#include <asm/code-patching.h>
#include <asm/cacheflush.h>
#include <asm/sstep.h>
+#include <asm/sections.h>
#include <linux/uaccess.h>
DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
@@ -42,7 +43,86 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
struct kretprobe_blackpoint kretprobe_blacklist[] = {{NULL, NULL}};
-int __kprobes arch_prepare_kprobe(struct kprobe *p)
+bool arch_within_kprobe_blacklist(unsigned long addr)
+{
+ return (addr >= (unsigned long)__kprobes_text_start &&
+ addr < (unsigned long)__kprobes_text_end) ||
+ (addr >= (unsigned long)_stext &&
+ addr < (unsigned long)__head_end);
+}
+
+kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset)
+{
+ kprobe_opcode_t *addr;
+
+#ifdef PPC64_ELF_ABI_v2
+ /* PPC64 ABIv2 needs local entry point */
+ addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+ if (addr && !offset) {
+#ifdef CONFIG_KPROBES_ON_FTRACE
+ unsigned long faddr;
+ /*
+ * Per livepatch.h, ftrace location is always within the first
+ * 16 bytes of a function on powerpc with -mprofile-kernel.
+ */
+ faddr = ftrace_location_range((unsigned long)addr,
+ (unsigned long)addr + 16);
+ if (faddr)
+ addr = (kprobe_opcode_t *)faddr;
+ else
+#endif
+ addr = (kprobe_opcode_t *)ppc_function_entry(addr);
+ }
+#elif defined(PPC64_ELF_ABI_v1)
+ /*
+ * 64bit powerpc ABIv1 uses function descriptors:
+ * - Check for the dot variant of the symbol first.
+ * - If that fails, try looking up the symbol provided.
+ *
+ * This ensures we always get to the actual symbol and not
+ * the descriptor.
+ *
+ * Also handle <module:symbol> format.
+ */
+ char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN];
+ const char *modsym;
+ bool dot_appended = false;
+ if ((modsym = strchr(name, ':')) != NULL) {
+ modsym++;
+ if (*modsym != '\0' && *modsym != '.') {
+ /* Convert to <module:.symbol> */
+ strncpy(dot_name, name, modsym - name);
+ dot_name[modsym - name] = '.';
+ dot_name[modsym - name + 1] = '\0';
+ strncat(dot_name, modsym,
+ sizeof(dot_name) - (modsym - name) - 2);
+ dot_appended = true;
+ } else {
+ dot_name[0] = '\0';
+ strncat(dot_name, name, sizeof(dot_name) - 1);
+ }
+ } else if (name[0] != '.') {
+ dot_name[0] = '.';
+ dot_name[1] = '\0';
+ strncat(dot_name, name, KSYM_NAME_LEN - 2);
+ dot_appended = true;
+ } else {
+ dot_name[0] = '\0';
+ strncat(dot_name, name, KSYM_NAME_LEN - 1);
+ }
+ addr = (kprobe_opcode_t *)kallsyms_lookup_name(dot_name);
+ if (!addr && dot_appended) {
+ /* Let's try the original non-dot symbol lookup */
+ addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+ }
+#else
+ addr = (kprobe_opcode_t *)kallsyms_lookup_name(name);
+#endif
+
+ return addr;
+}
+
+int arch_prepare_kprobe(struct kprobe *p)
{
int ret = 0;
kprobe_opcode_t insn = *p->addr;
@@ -74,30 +154,34 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p)
p->ainsn.boostable = 0;
return ret;
}
+NOKPROBE_SYMBOL(arch_prepare_kprobe);
-void __kprobes arch_arm_kprobe(struct kprobe *p)
+void arch_arm_kprobe(struct kprobe *p)
{
*p->addr = BREAKPOINT_INSTRUCTION;
flush_icache_range((unsigned long) p->addr,
(unsigned long) p->addr + sizeof(kprobe_opcode_t));
}
+NOKPROBE_SYMBOL(arch_arm_kprobe);
-void __kprobes arch_disarm_kprobe(struct kprobe *p)
+void arch_disarm_kprobe(struct kprobe *p)
{
*p->addr = p->opcode;
flush_icache_range((unsigned long) p->addr,
(unsigned long) p->addr + sizeof(kprobe_opcode_t));
}
+NOKPROBE_SYMBOL(arch_disarm_kprobe);
-void __kprobes arch_remove_kprobe(struct kprobe *p)
+void arch_remove_kprobe(struct kprobe *p)
{
if (p->ainsn.insn) {
free_insn_slot(p->ainsn.insn, 0);
p->ainsn.insn = NULL;
}
}
+NOKPROBE_SYMBOL(arch_remove_kprobe);
-static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
+static nokprobe_inline void prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
{
enable_single_step(regs);
@@ -110,37 +194,80 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
regs->nip = (unsigned long)p->ainsn.insn;
}
-static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
+static nokprobe_inline void save_previous_kprobe(struct kprobe_ctlblk *kcb)
{
kcb->prev_kprobe.kp = kprobe_running();
kcb->prev_kprobe.status = kcb->kprobe_status;
kcb->prev_kprobe.saved_msr = kcb->kprobe_saved_msr;
}
-static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
+static nokprobe_inline void restore_previous_kprobe(struct kprobe_ctlblk *kcb)
{
__this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
kcb->kprobe_status = kcb->prev_kprobe.status;
kcb->kprobe_saved_msr = kcb->prev_kprobe.saved_msr;
}
-static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
+static nokprobe_inline void set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb)
{
__this_cpu_write(current_kprobe, p);
kcb->kprobe_saved_msr = regs->msr;
}
-void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
- struct pt_regs *regs)
+bool arch_function_offset_within_entry(unsigned long offset)
+{
+#ifdef PPC64_ELF_ABI_v2
+#ifdef CONFIG_KPROBES_ON_FTRACE
+ return offset <= 16;
+#else
+ return offset <= 8;
+#endif
+#else
+ return !offset;
+#endif
+}
+
+void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs)
{
ri->ret_addr = (kprobe_opcode_t *)regs->link;
/* Replace the return addr with trampoline addr */
regs->link = (unsigned long)kretprobe_trampoline;
}
+NOKPROBE_SYMBOL(arch_prepare_kretprobe);
-int __kprobes kprobe_handler(struct pt_regs *regs)
+int try_to_emulate(struct kprobe *p, struct pt_regs *regs)
+{
+ int ret;
+ unsigned int insn = *p->ainsn.insn;
+
+ /* regs->nip is also adjusted if emulate_step returns 1 */
+ ret = emulate_step(regs, insn);
+ if (ret > 0) {
+ /*
+ * Once this instruction has been boosted
+ * successfully, set the boostable flag
+ */
+ if (unlikely(p->ainsn.boostable == 0))
+ p->ainsn.boostable = 1;
+ } else if (ret < 0) {
+ /*
+ * We don't allow kprobes on mtmsr(d)/rfi(d), etc.
+ * So, we should never get here... but, its still
+ * good to catch them, just in case...
+ */
+ printk("Can't step on instruction %x\n", insn);
+ BUG();
+ } else if (ret == 0)
+ /* This instruction can't be boosted */
+ p->ainsn.boostable = -1;
+
+ return ret;
+}
+NOKPROBE_SYMBOL(try_to_emulate);
+
+int kprobe_handler(struct pt_regs *regs)
{
struct kprobe *p;
int ret = 0;
@@ -177,10 +304,17 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
*/
save_previous_kprobe(kcb);
set_current_kprobe(p, regs, kcb);
- kcb->kprobe_saved_msr = regs->msr;
kprobes_inc_nmissed_count(p);
prepare_singlestep(p, regs);
kcb->kprobe_status = KPROBE_REENTER;
+ if (p->ainsn.boostable >= 0) {
+ ret = try_to_emulate(p, regs);
+
+ if (ret > 0) {
+ restore_previous_kprobe(kcb);
+ return 1;
+ }
+ }
return 1;
} else {
if (*addr != BREAKPOINT_INSTRUCTION) {
@@ -197,7 +331,9 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
}
p = __this_cpu_read(current_kprobe);
if (p->break_handler && p->break_handler(p, regs)) {
- goto ss_probe;
+ if (!skip_singlestep(p, regs, kcb))
+ goto ss_probe;
+ ret = 1;
}
}
goto no_kprobe;
@@ -235,18 +371,9 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
ss_probe:
if (p->ainsn.boostable >= 0) {
- unsigned int insn = *p->ainsn.insn;
+ ret = try_to_emulate(p, regs);
- /* regs->nip is also adjusted if emulate_step returns 1 */
- ret = emulate_step(regs, insn);
if (ret > 0) {
- /*
- * Once this instruction has been boosted
- * successfully, set the boostable flag
- */
- if (unlikely(p->ainsn.boostable == 0))
- p->ainsn.boostable = 1;
-
if (p->post_handler)
p->post_handler(p, regs, 0);
@@ -254,17 +381,7 @@ ss_probe:
reset_current_kprobe();
preempt_enable_no_resched();
return 1;
- } else if (ret < 0) {
- /*
- * We don't allow kprobes on mtmsr(d)/rfi(d), etc.
- * So, we should never get here... but, its still
- * good to catch them, just in case...
- */
- printk("Can't step on instruction %x\n", insn);
- BUG();
- } else if (ret == 0)
- /* This instruction can't be boosted */
- p->ainsn.boostable = -1;
+ }
}
prepare_singlestep(p, regs);
kcb->kprobe_status = KPROBE_HIT_SS;
@@ -274,6 +391,7 @@ no_kprobe:
preempt_enable_no_resched();
return ret;
}
+NOKPROBE_SYMBOL(kprobe_handler);
/*
* Function return probe trampoline:
@@ -291,8 +409,7 @@ asm(".global kretprobe_trampoline\n"
/*
* Called when the probe at kretprobe trampoline is hit
*/
-static int __kprobes trampoline_probe_handler(struct kprobe *p,
- struct pt_regs *regs)
+static int trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe_instance *ri = NULL;
struct hlist_head *head, empty_rp;
@@ -361,6 +478,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
*/
return 1;
}
+NOKPROBE_SYMBOL(trampoline_probe_handler);
/*
* Called after single-stepping. p->addr is the address of the
@@ -370,7 +488,7 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p,
* single-stepped a copy of the instruction. The address of this
* copy is p->ainsn.insn.
*/
-int __kprobes kprobe_post_handler(struct pt_regs *regs)
+int kprobe_post_handler(struct pt_regs *regs)
{
struct kprobe *cur = kprobe_running();
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -410,8 +528,9 @@ out:
return 1;
}
+NOKPROBE_SYMBOL(kprobe_post_handler);
-int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
+int kprobe_fault_handler(struct pt_regs *regs, int trapnr)
{
struct kprobe *cur = kprobe_running();
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -474,13 +593,15 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr)
}
return 0;
}
+NOKPROBE_SYMBOL(kprobe_fault_handler);
unsigned long arch_deref_entry_point(void *entry)
{
return ppc_global_function_entry(entry);
}
+NOKPROBE_SYMBOL(arch_deref_entry_point);
-int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
+int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct jprobe *jp = container_of(p, struct jprobe, kp);
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -497,17 +618,20 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
return 1;
}
+NOKPROBE_SYMBOL(setjmp_pre_handler);
-void __used __kprobes jprobe_return(void)
+void __used jprobe_return(void)
{
asm volatile("trap" ::: "memory");
}
+NOKPROBE_SYMBOL(jprobe_return);
-static void __used __kprobes jprobe_return_end(void)
+static void __used jprobe_return_end(void)
{
-};
+}
+NOKPROBE_SYMBOL(jprobe_return_end);
-int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
+int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
@@ -520,6 +644,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
preempt_enable_no_resched();
return 1;
}
+NOKPROBE_SYMBOL(longjmp_break_handler);
static struct kprobe trampoline_p = {
.addr = (kprobe_opcode_t *) &kretprobe_trampoline,
@@ -531,10 +656,11 @@ int __init arch_init_kprobes(void)
return register_kprobe(&trampoline_p);
}
-int __kprobes arch_trampoline_kprobe(struct kprobe *p)
+int arch_trampoline_kprobe(struct kprobe *p)
{
if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline)
return 1;
return 0;
}
+NOKPROBE_SYMBOL(arch_trampoline_kprobe);
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index a1475e6aef3a..5f9eada3519b 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -221,6 +221,8 @@ static void machine_check_process_queued_event(struct irq_work *work)
{
int index;
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
/*
* For now just print it to console.
* TODO: log this error event to FSP or nvram.
@@ -228,12 +230,13 @@ static void machine_check_process_queued_event(struct irq_work *work)
while (__this_cpu_read(mce_queue_count) > 0) {
index = __this_cpu_read(mce_queue_count) - 1;
machine_check_print_event_info(
- this_cpu_ptr(&mce_event_queue[index]));
+ this_cpu_ptr(&mce_event_queue[index]), false);
__this_cpu_dec(mce_queue_count);
}
}
-void machine_check_print_event_info(struct machine_check_event *evt)
+void machine_check_print_event_info(struct machine_check_event *evt,
+ bool user_mode)
{
const char *level, *sevstr, *subtype;
static const char *mc_ue_types[] = {
@@ -310,7 +313,16 @@ void machine_check_print_event_info(struct machine_check_event *evt)
printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
evt->disposition == MCE_DISPOSITION_RECOVERED ?
- "Recovered" : "[Not recovered");
+ "Recovered" : "Not recovered");
+
+ if (user_mode) {
+ printk("%s NIP: [%016llx] PID: %d Comm: %s\n", level,
+ evt->srr0, current->pid, current->comm);
+ } else {
+ printk("%s NIP [%016llx]: %pS\n", level, evt->srr0,
+ (void *)evt->srr0);
+ }
+
printk("%s Initiator: %s\n", level,
evt->initiator == MCE_INITIATOR_CPU ? "CPU" : "Unknown");
switch (evt->error_type) {
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 763d6f58caa8..f913139bb0c2 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -72,10 +72,14 @@ void __flush_tlb_power8(unsigned int action)
void __flush_tlb_power9(unsigned int action)
{
+ unsigned int num_sets;
+
if (radix_enabled())
- flush_tlb_206(POWER9_TLB_SETS_RADIX, action);
+ num_sets = POWER9_TLB_SETS_RADIX;
+ else
+ num_sets = POWER9_TLB_SETS_HASH;
- flush_tlb_206(POWER9_TLB_SETS_HASH, action);
+ flush_tlb_206(num_sets, action);
}
@@ -147,159 +151,365 @@ static int mce_flush(int what)
return 0;
}
-static int mce_handle_flush_derrors(uint64_t dsisr, uint64_t slb, uint64_t tlb, uint64_t erat)
-{
- if ((dsisr & slb) && mce_flush(MCE_FLUSH_SLB))
- dsisr &= ~slb;
- if ((dsisr & erat) && mce_flush(MCE_FLUSH_ERAT))
- dsisr &= ~erat;
- if ((dsisr & tlb) && mce_flush(MCE_FLUSH_TLB))
- dsisr &= ~tlb;
- /* Any other errors we don't understand? */
- if (dsisr)
- return 0;
- return 1;
-}
-
-static long mce_handle_derror(uint64_t dsisr, uint64_t slb_error_bits)
+#define SRR1_MC_LOADSTORE(srr1) ((srr1) & PPC_BIT(42))
+
+struct mce_ierror_table {
+ unsigned long srr1_mask;
+ unsigned long srr1_value;
+ bool nip_valid; /* nip is a valid indicator of faulting address */
+ unsigned int error_type;
+ unsigned int error_subtype;
+ unsigned int initiator;
+ unsigned int severity;
+};
+
+static const struct mce_ierror_table mce_p7_ierror_table[] = {
+{ 0x00000000001c0000, 0x0000000000040000, true,
+ MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000080000, true,
+ MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x00000000000c0000, true,
+ MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000100000, true,
+ MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000140000, true,
+ MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x0000000000180000, true,
+ MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000001c0000, 0x00000000001c0000, true,
+ MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0, 0, 0, 0, 0, 0 } };
+
+static const struct mce_ierror_table mce_p8_ierror_table[] = {
+{ 0x00000000081c0000, 0x0000000000040000, true,
+ MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000080000, true,
+ MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000000c0000, true,
+ MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000100000, true,
+ MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000140000, true,
+ MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000180000, true,
+ MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000001c0000, true,
+ MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008000000, true,
+ MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008040000, true,
+ MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0, 0, 0, 0, 0, 0 } };
+
+static const struct mce_ierror_table mce_p9_ierror_table[] = {
+{ 0x00000000081c0000, 0x0000000000040000, true,
+ MCE_ERROR_TYPE_UE, MCE_UE_ERROR_IFETCH,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000080000, true,
+ MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000000c0000, true,
+ MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000100000, true,
+ MCE_ERROR_TYPE_ERAT,MCE_ERAT_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000140000, true,
+ MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000000180000, true,
+ MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008000000, true,
+ MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008040000, true,
+ MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x00000000080c0000, true,
+ MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008100000, true,
+ MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000000081c0000, 0x0000000008140000, false,
+ MCE_ERROR_TYPE_RA, MCE_RA_ERROR_STORE,
+ MCE_INITIATOR_CPU, MCE_SEV_FATAL, }, /* ASYNC is fatal */
+{ 0x00000000081c0000, 0x0000000008180000, false,
+ MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_STORE_TIMEOUT,
+ MCE_INITIATOR_CPU, MCE_SEV_FATAL, }, /* ASYNC is fatal */
+{ 0x00000000081c0000, 0x00000000081c0000, true,
+ MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0, 0, 0, 0, 0, 0 } };
+
+struct mce_derror_table {
+ unsigned long dsisr_value;
+ bool dar_valid; /* dar is a valid indicator of faulting address */
+ unsigned int error_type;
+ unsigned int error_subtype;
+ unsigned int initiator;
+ unsigned int severity;
+};
+
+static const struct mce_derror_table mce_p7_derror_table[] = {
+{ 0x00008000, false,
+ MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00004000, true,
+ MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000800, true,
+ MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000400, true,
+ MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000100, true,
+ MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000080, true,
+ MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000040, true,
+ MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_INDETERMINATE, /* BOTH */
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0, false, 0, 0, 0, 0 } };
+
+static const struct mce_derror_table mce_p8_derror_table[] = {
+{ 0x00008000, false,
+ MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00004000, true,
+ MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00002000, true,
+ MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00001000, true,
+ MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000800, true,
+ MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000400, true,
+ MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000200, true,
+ MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT, /* SECONDARY ERAT */
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000100, true,
+ MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000080, true,
+ MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0, false, 0, 0, 0, 0 } };
+
+static const struct mce_derror_table mce_p9_derror_table[] = {
+{ 0x00008000, false,
+ MCE_ERROR_TYPE_UE, MCE_UE_ERROR_LOAD_STORE,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00004000, true,
+ MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00002000, true,
+ MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_LOAD_TIMEOUT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00001000, true,
+ MCE_ERROR_TYPE_LINK, MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000800, true,
+ MCE_ERROR_TYPE_ERAT, MCE_ERAT_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000400, true,
+ MCE_ERROR_TYPE_TLB, MCE_TLB_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000200, false,
+ MCE_ERROR_TYPE_USER, MCE_USER_ERROR_TLBIE,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000100, true,
+ MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_PARITY,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000080, true,
+ MCE_ERROR_TYPE_SLB, MCE_SLB_ERROR_MULTIHIT,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000040, true,
+ MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000020, false,
+ MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000010, false,
+ MCE_ERROR_TYPE_RA, MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0x00000008, false,
+ MCE_ERROR_TYPE_RA, MCE_RA_ERROR_LOAD_STORE_FOREIGN,
+ MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, },
+{ 0, false, 0, 0, 0, 0 } };
+
+static int mce_handle_ierror(struct pt_regs *regs,
+ const struct mce_ierror_table table[],
+ struct mce_error_info *mce_err, uint64_t *addr)
{
- long handled = 1;
+ uint64_t srr1 = regs->msr;
+ int handled = 0;
+ int i;
+
+ *addr = 0;
+
+ for (i = 0; table[i].srr1_mask; i++) {
+ if ((srr1 & table[i].srr1_mask) != table[i].srr1_value)
+ continue;
+
+ /* attempt to correct the error */
+ switch (table[i].error_type) {
+ case MCE_ERROR_TYPE_SLB:
+ handled = mce_flush(MCE_FLUSH_SLB);
+ break;
+ case MCE_ERROR_TYPE_ERAT:
+ handled = mce_flush(MCE_FLUSH_ERAT);
+ break;
+ case MCE_ERROR_TYPE_TLB:
+ handled = mce_flush(MCE_FLUSH_TLB);
+ break;
+ }
- /*
- * flush and reload SLBs for SLB errors and flush TLBs for TLB errors.
- * reset the error bits whenever we handle them so that at the end
- * we can check whether we handled all of them or not.
- * */
-#ifdef CONFIG_PPC_STD_MMU_64
- if (dsisr & slb_error_bits) {
- flush_and_reload_slb();
- /* reset error bits */
- dsisr &= ~(slb_error_bits);
- }
- if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
- if (cur_cpu_spec && cur_cpu_spec->flush_tlb)
- cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL);
- /* reset error bits */
- dsisr &= ~P7_DSISR_MC_TLB_MULTIHIT_MFTLB;
+ /* now fill in mce_error_info */
+ mce_err->error_type = table[i].error_type;
+ switch (table[i].error_type) {
+ case MCE_ERROR_TYPE_UE:
+ mce_err->u.ue_error_type = table[i].error_subtype;
+ break;
+ case MCE_ERROR_TYPE_SLB:
+ mce_err->u.slb_error_type = table[i].error_subtype;
+ break;
+ case MCE_ERROR_TYPE_ERAT:
+ mce_err->u.erat_error_type = table[i].error_subtype;
+ break;
+ case MCE_ERROR_TYPE_TLB:
+ mce_err->u.tlb_error_type = table[i].error_subtype;
+ break;
+ case MCE_ERROR_TYPE_USER:
+ mce_err->u.user_error_type = table[i].error_subtype;
+ break;
+ case MCE_ERROR_TYPE_RA:
+ mce_err->u.ra_error_type = table[i].error_subtype;
+ break;
+ case MCE_ERROR_TYPE_LINK:
+ mce_err->u.link_error_type = table[i].error_subtype;
+ break;
+ }
+ mce_err->severity = table[i].severity;
+ mce_err->initiator = table[i].initiator;
+ if (table[i].nip_valid)
+ *addr = regs->nip;
+ return handled;
}
-#endif
- /* Any other errors we don't understand? */
- if (dsisr & 0xffffffffUL)
- handled = 0;
- return handled;
-}
+ mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
+ mce_err->severity = MCE_SEV_ERROR_SYNC;
+ mce_err->initiator = MCE_INITIATOR_CPU;
-static long mce_handle_derror_p7(uint64_t dsisr)
-{
- return mce_handle_derror(dsisr, P7_DSISR_MC_SLB_ERRORS);
+ return 0;
}
-static long mce_handle_common_ierror(uint64_t srr1)
+static int mce_handle_derror(struct pt_regs *regs,
+ const struct mce_derror_table table[],
+ struct mce_error_info *mce_err, uint64_t *addr)
{
- long handled = 0;
-
- switch (P7_SRR1_MC_IFETCH(srr1)) {
- case 0:
- break;
-#ifdef CONFIG_PPC_STD_MMU_64
- case P7_SRR1_MC_IFETCH_SLB_PARITY:
- case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
- /* flush and reload SLBs for SLB errors. */
- flush_and_reload_slb();
- handled = 1;
- break;
- case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
- if (cur_cpu_spec && cur_cpu_spec->flush_tlb) {
- cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL);
- handled = 1;
+ uint64_t dsisr = regs->dsisr;
+ int handled = 0;
+ int found = 0;
+ int i;
+
+ *addr = 0;
+
+ for (i = 0; table[i].dsisr_value; i++) {
+ if (!(dsisr & table[i].dsisr_value))
+ continue;
+
+ /* attempt to correct the error */
+ switch (table[i].error_type) {
+ case MCE_ERROR_TYPE_SLB:
+ if (mce_flush(MCE_FLUSH_SLB))
+ handled = 1;
+ break;
+ case MCE_ERROR_TYPE_ERAT:
+ if (mce_flush(MCE_FLUSH_ERAT))
+ handled = 1;
+ break;
+ case MCE_ERROR_TYPE_TLB:
+ if (mce_flush(MCE_FLUSH_TLB))
+ handled = 1;
+ break;
}
- break;
-#endif
- default:
- break;
- }
- return handled;
-}
-
-static long mce_handle_ierror_p7(uint64_t srr1)
-{
- long handled = 0;
-
- handled = mce_handle_common_ierror(srr1);
+ /*
+ * Attempt to handle multiple conditions, but only return
+ * one. Ensure uncorrectable errors are first in the table
+ * to match.
+ */
+ if (found)
+ continue;
+
+ /* now fill in mce_error_info */
+ mce_err->error_type = table[i].error_type;
+ switch (table[i].error_type) {
+ case MCE_ERROR_TYPE_UE:
+ mce_err->u.ue_error_type = table[i].error_subtype;
+ break;
+ case MCE_ERROR_TYPE_SLB:
+ mce_err->u.slb_error_type = table[i].error_subtype;
+ break;
+ case MCE_ERROR_TYPE_ERAT:
+ mce_err->u.erat_error_type = table[i].error_subtype;
+ break;
+ case MCE_ERROR_TYPE_TLB:
+ mce_err->u.tlb_error_type = table[i].error_subtype;
+ break;
+ case MCE_ERROR_TYPE_USER:
+ mce_err->u.user_error_type = table[i].error_subtype;
+ break;
+ case MCE_ERROR_TYPE_RA:
+ mce_err->u.ra_error_type = table[i].error_subtype;
+ break;
+ case MCE_ERROR_TYPE_LINK:
+ mce_err->u.link_error_type = table[i].error_subtype;
+ break;
+ }
+ mce_err->severity = table[i].severity;
+ mce_err->initiator = table[i].initiator;
+ if (table[i].dar_valid)
+ *addr = regs->dar;
-#ifdef CONFIG_PPC_STD_MMU_64
- if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
- flush_and_reload_slb();
- handled = 1;
+ found = 1;
}
-#endif
- return handled;
-}
-static void mce_get_common_ierror(struct mce_error_info *mce_err, uint64_t srr1)
-{
- switch (P7_SRR1_MC_IFETCH(srr1)) {
- case P7_SRR1_MC_IFETCH_SLB_PARITY:
- mce_err->error_type = MCE_ERROR_TYPE_SLB;
- mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
- break;
- case P7_SRR1_MC_IFETCH_SLB_MULTIHIT:
- mce_err->error_type = MCE_ERROR_TYPE_SLB;
- mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
- break;
- case P7_SRR1_MC_IFETCH_TLB_MULTIHIT:
- mce_err->error_type = MCE_ERROR_TYPE_TLB;
- mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
- break;
- case P7_SRR1_MC_IFETCH_UE:
- case P7_SRR1_MC_IFETCH_UE_IFU_INTERNAL:
- mce_err->error_type = MCE_ERROR_TYPE_UE;
- mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH;
- break;
- case P7_SRR1_MC_IFETCH_UE_TLB_RELOAD:
- mce_err->error_type = MCE_ERROR_TYPE_UE;
- mce_err->u.ue_error_type =
- MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
- break;
- }
-}
+ if (found)
+ return handled;
-static void mce_get_ierror_p7(struct mce_error_info *mce_err, uint64_t srr1)
-{
- mce_get_common_ierror(mce_err, srr1);
- if (P7_SRR1_MC_IFETCH(srr1) == P7_SRR1_MC_IFETCH_SLB_BOTH) {
- mce_err->error_type = MCE_ERROR_TYPE_SLB;
- mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
- }
-}
+ mce_err->error_type = MCE_ERROR_TYPE_UNKNOWN;
+ mce_err->severity = MCE_SEV_ERROR_SYNC;
+ mce_err->initiator = MCE_INITIATOR_CPU;
-static void mce_get_derror_p7(struct mce_error_info *mce_err, uint64_t dsisr)
-{
- if (dsisr & P7_DSISR_MC_UE) {
- mce_err->error_type = MCE_ERROR_TYPE_UE;
- mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
- } else if (dsisr & P7_DSISR_MC_UE_TABLEWALK) {
- mce_err->error_type = MCE_ERROR_TYPE_UE;
- mce_err->u.ue_error_type =
- MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
- } else if (dsisr & P7_DSISR_MC_ERAT_MULTIHIT) {
- mce_err->error_type = MCE_ERROR_TYPE_ERAT;
- mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
- } else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT) {
- mce_err->error_type = MCE_ERROR_TYPE_SLB;
- mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
- } else if (dsisr & P7_DSISR_MC_SLB_PARITY_MFSLB) {
- mce_err->error_type = MCE_ERROR_TYPE_SLB;
- mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
- } else if (dsisr & P7_DSISR_MC_TLB_MULTIHIT_MFTLB) {
- mce_err->error_type = MCE_ERROR_TYPE_TLB;
- mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
- } else if (dsisr & P7_DSISR_MC_SLB_MULTIHIT_PARITY) {
- mce_err->error_type = MCE_ERROR_TYPE_SLB;
- mce_err->u.slb_error_type = MCE_SLB_ERROR_INDETERMINATE;
- }
+ return 0;
}
static long mce_handle_ue_error(struct pt_regs *regs)
@@ -320,292 +530,42 @@ static long mce_handle_ue_error(struct pt_regs *regs)
return handled;
}
-long __machine_check_early_realmode_p7(struct pt_regs *regs)
+static long mce_handle_error(struct pt_regs *regs,
+ const struct mce_derror_table dtable[],
+ const struct mce_ierror_table itable[])
{
- uint64_t srr1, nip, addr;
- long handled = 1;
- struct mce_error_info mce_error_info = { 0 };
-
- mce_error_info.severity = MCE_SEV_ERROR_SYNC;
- mce_error_info.initiator = MCE_INITIATOR_CPU;
-
- srr1 = regs->msr;
- nip = regs->nip;
+ struct mce_error_info mce_err = { 0 };
+ uint64_t addr;
+ uint64_t srr1 = regs->msr;
+ long handled;
- /*
- * Handle memory errors depending whether this was a load/store or
- * ifetch exception. Also, populate the mce error_type and
- * type-specific error_type from either SRR1 or DSISR, depending
- * whether this was a load/store or ifetch exception
- */
- if (P7_SRR1_MC_LOADSTORE(srr1)) {
- handled = mce_handle_derror_p7(regs->dsisr);
- mce_get_derror_p7(&mce_error_info, regs->dsisr);
- addr = regs->dar;
- } else {
- handled = mce_handle_ierror_p7(srr1);
- mce_get_ierror_p7(&mce_error_info, srr1);
- addr = regs->nip;
- }
+ if (SRR1_MC_LOADSTORE(srr1))
+ handled = mce_handle_derror(regs, dtable, &mce_err, &addr);
+ else
+ handled = mce_handle_ierror(regs, itable, &mce_err, &addr);
- /* Handle UE error. */
- if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
+ if (!handled && mce_err.error_type == MCE_ERROR_TYPE_UE)
handled = mce_handle_ue_error(regs);
- save_mce_event(regs, handled, &mce_error_info, nip, addr);
- return handled;
-}
-
-static void mce_get_ierror_p8(struct mce_error_info *mce_err, uint64_t srr1)
-{
- mce_get_common_ierror(mce_err, srr1);
- if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
- mce_err->error_type = MCE_ERROR_TYPE_ERAT;
- mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
- }
-}
-
-static void mce_get_derror_p8(struct mce_error_info *mce_err, uint64_t dsisr)
-{
- mce_get_derror_p7(mce_err, dsisr);
- if (dsisr & P8_DSISR_MC_ERAT_MULTIHIT_SEC) {
- mce_err->error_type = MCE_ERROR_TYPE_ERAT;
- mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
- }
-}
-
-static long mce_handle_ierror_p8(uint64_t srr1)
-{
- long handled = 0;
+ save_mce_event(regs, handled, &mce_err, regs->nip, addr);
- handled = mce_handle_common_ierror(srr1);
-
-#ifdef CONFIG_PPC_STD_MMU_64
- if (P7_SRR1_MC_IFETCH(srr1) == P8_SRR1_MC_IFETCH_ERAT_MULTIHIT) {
- flush_and_reload_slb();
- handled = 1;
- }
-#endif
return handled;
}
-static long mce_handle_derror_p8(uint64_t dsisr)
-{
- return mce_handle_derror(dsisr, P8_DSISR_MC_SLB_ERRORS);
-}
-
-long __machine_check_early_realmode_p8(struct pt_regs *regs)
-{
- uint64_t srr1, nip, addr;
- long handled = 1;
- struct mce_error_info mce_error_info = { 0 };
-
- mce_error_info.severity = MCE_SEV_ERROR_SYNC;
- mce_error_info.initiator = MCE_INITIATOR_CPU;
-
- srr1 = regs->msr;
- nip = regs->nip;
-
- if (P7_SRR1_MC_LOADSTORE(srr1)) {
- handled = mce_handle_derror_p8(regs->dsisr);
- mce_get_derror_p8(&mce_error_info, regs->dsisr);
- addr = regs->dar;
- } else {
- handled = mce_handle_ierror_p8(srr1);
- mce_get_ierror_p8(&mce_error_info, srr1);
- addr = regs->nip;
- }
-
- /* Handle UE error. */
- if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
- handled = mce_handle_ue_error(regs);
-
- save_mce_event(regs, handled, &mce_error_info, nip, addr);
- return handled;
-}
-
-static int mce_handle_derror_p9(struct pt_regs *regs)
-{
- uint64_t dsisr = regs->dsisr;
-
- return mce_handle_flush_derrors(dsisr,
- P9_DSISR_MC_SLB_PARITY_MFSLB |
- P9_DSISR_MC_SLB_MULTIHIT_MFSLB,
-
- P9_DSISR_MC_TLB_MULTIHIT_MFTLB,
-
- P9_DSISR_MC_ERAT_MULTIHIT);
-}
-
-static int mce_handle_ierror_p9(struct pt_regs *regs)
+long __machine_check_early_realmode_p7(struct pt_regs *regs)
{
- uint64_t srr1 = regs->msr;
+ /* P7 DD1 leaves top bits of DSISR undefined */
+ regs->dsisr &= 0x0000ffff;
- switch (P9_SRR1_MC_IFETCH(srr1)) {
- case P9_SRR1_MC_IFETCH_SLB_PARITY:
- case P9_SRR1_MC_IFETCH_SLB_MULTIHIT:
- return mce_flush(MCE_FLUSH_SLB);
- case P9_SRR1_MC_IFETCH_TLB_MULTIHIT:
- return mce_flush(MCE_FLUSH_TLB);
- case P9_SRR1_MC_IFETCH_ERAT_MULTIHIT:
- return mce_flush(MCE_FLUSH_ERAT);
- default:
- return 0;
- }
+ return mce_handle_error(regs, mce_p7_derror_table, mce_p7_ierror_table);
}
-static void mce_get_derror_p9(struct pt_regs *regs,
- struct mce_error_info *mce_err, uint64_t *addr)
-{
- uint64_t dsisr = regs->dsisr;
-
- mce_err->severity = MCE_SEV_ERROR_SYNC;
- mce_err->initiator = MCE_INITIATOR_CPU;
-
- if (dsisr & P9_DSISR_MC_USER_TLBIE)
- *addr = regs->nip;
- else
- *addr = regs->dar;
-
- if (dsisr & P9_DSISR_MC_UE) {
- mce_err->error_type = MCE_ERROR_TYPE_UE;
- mce_err->u.ue_error_type = MCE_UE_ERROR_LOAD_STORE;
- } else if (dsisr & P9_DSISR_MC_UE_TABLEWALK) {
- mce_err->error_type = MCE_ERROR_TYPE_UE;
- mce_err->u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
- } else if (dsisr & P9_DSISR_MC_LINK_LOAD_TIMEOUT) {
- mce_err->error_type = MCE_ERROR_TYPE_LINK;
- mce_err->u.link_error_type = MCE_LINK_ERROR_LOAD_TIMEOUT;
- } else if (dsisr & P9_DSISR_MC_LINK_TABLEWALK_TIMEOUT) {
- mce_err->error_type = MCE_ERROR_TYPE_LINK;
- mce_err->u.link_error_type = MCE_LINK_ERROR_PAGE_TABLE_WALK_LOAD_STORE_TIMEOUT;
- } else if (dsisr & P9_DSISR_MC_ERAT_MULTIHIT) {
- mce_err->error_type = MCE_ERROR_TYPE_ERAT;
- mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
- } else if (dsisr & P9_DSISR_MC_TLB_MULTIHIT_MFTLB) {
- mce_err->error_type = MCE_ERROR_TYPE_TLB;
- mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
- } else if (dsisr & P9_DSISR_MC_USER_TLBIE) {
- mce_err->error_type = MCE_ERROR_TYPE_USER;
- mce_err->u.user_error_type = MCE_USER_ERROR_TLBIE;
- } else if (dsisr & P9_DSISR_MC_SLB_PARITY_MFSLB) {
- mce_err->error_type = MCE_ERROR_TYPE_SLB;
- mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
- } else if (dsisr & P9_DSISR_MC_SLB_MULTIHIT_MFSLB) {
- mce_err->error_type = MCE_ERROR_TYPE_SLB;
- mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
- } else if (dsisr & P9_DSISR_MC_RA_LOAD) {
- mce_err->error_type = MCE_ERROR_TYPE_RA;
- mce_err->u.ra_error_type = MCE_RA_ERROR_LOAD;
- } else if (dsisr & P9_DSISR_MC_RA_TABLEWALK) {
- mce_err->error_type = MCE_ERROR_TYPE_RA;
- mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE;
- } else if (dsisr & P9_DSISR_MC_RA_TABLEWALK_FOREIGN) {
- mce_err->error_type = MCE_ERROR_TYPE_RA;
- mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN;
- } else if (dsisr & P9_DSISR_MC_RA_FOREIGN) {
- mce_err->error_type = MCE_ERROR_TYPE_RA;
- mce_err->u.ra_error_type = MCE_RA_ERROR_LOAD_STORE_FOREIGN;
- }
-}
-
-static void mce_get_ierror_p9(struct pt_regs *regs,
- struct mce_error_info *mce_err, uint64_t *addr)
+long __machine_check_early_realmode_p8(struct pt_regs *regs)
{
- uint64_t srr1 = regs->msr;
-
- switch (P9_SRR1_MC_IFETCH(srr1)) {
- case P9_SRR1_MC_IFETCH_RA_ASYNC_STORE:
- case P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT:
- mce_err->severity = MCE_SEV_FATAL;
- break;
- default:
- mce_err->severity = MCE_SEV_ERROR_SYNC;
- break;
- }
-
- mce_err->initiator = MCE_INITIATOR_CPU;
-
- *addr = regs->nip;
-
- switch (P9_SRR1_MC_IFETCH(srr1)) {
- case P9_SRR1_MC_IFETCH_UE:
- mce_err->error_type = MCE_ERROR_TYPE_UE;
- mce_err->u.ue_error_type = MCE_UE_ERROR_IFETCH;
- break;
- case P9_SRR1_MC_IFETCH_SLB_PARITY:
- mce_err->error_type = MCE_ERROR_TYPE_SLB;
- mce_err->u.slb_error_type = MCE_SLB_ERROR_PARITY;
- break;
- case P9_SRR1_MC_IFETCH_SLB_MULTIHIT:
- mce_err->error_type = MCE_ERROR_TYPE_SLB;
- mce_err->u.slb_error_type = MCE_SLB_ERROR_MULTIHIT;
- break;
- case P9_SRR1_MC_IFETCH_ERAT_MULTIHIT:
- mce_err->error_type = MCE_ERROR_TYPE_ERAT;
- mce_err->u.erat_error_type = MCE_ERAT_ERROR_MULTIHIT;
- break;
- case P9_SRR1_MC_IFETCH_TLB_MULTIHIT:
- mce_err->error_type = MCE_ERROR_TYPE_TLB;
- mce_err->u.tlb_error_type = MCE_TLB_ERROR_MULTIHIT;
- break;
- case P9_SRR1_MC_IFETCH_UE_TLB_RELOAD:
- mce_err->error_type = MCE_ERROR_TYPE_UE;
- mce_err->u.ue_error_type = MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH;
- break;
- case P9_SRR1_MC_IFETCH_LINK_TIMEOUT:
- mce_err->error_type = MCE_ERROR_TYPE_LINK;
- mce_err->u.link_error_type = MCE_LINK_ERROR_IFETCH_TIMEOUT;
- break;
- case P9_SRR1_MC_IFETCH_LINK_TABLEWALK_TIMEOUT:
- mce_err->error_type = MCE_ERROR_TYPE_LINK;
- mce_err->u.link_error_type = MCE_LINK_ERROR_PAGE_TABLE_WALK_IFETCH_TIMEOUT;
- break;
- case P9_SRR1_MC_IFETCH_RA:
- mce_err->error_type = MCE_ERROR_TYPE_RA;
- mce_err->u.ra_error_type = MCE_RA_ERROR_IFETCH;
- break;
- case P9_SRR1_MC_IFETCH_RA_TABLEWALK:
- mce_err->error_type = MCE_ERROR_TYPE_RA;
- mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH;
- break;
- case P9_SRR1_MC_IFETCH_RA_ASYNC_STORE:
- mce_err->error_type = MCE_ERROR_TYPE_RA;
- mce_err->u.ra_error_type = MCE_RA_ERROR_STORE;
- break;
- case P9_SRR1_MC_IFETCH_LINK_ASYNC_STORE_TIMEOUT:
- mce_err->error_type = MCE_ERROR_TYPE_LINK;
- mce_err->u.link_error_type = MCE_LINK_ERROR_STORE_TIMEOUT;
- break;
- case P9_SRR1_MC_IFETCH_RA_TABLEWALK_FOREIGN:
- mce_err->error_type = MCE_ERROR_TYPE_RA;
- mce_err->u.ra_error_type = MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN;
- break;
- default:
- break;
- }
+ return mce_handle_error(regs, mce_p8_derror_table, mce_p8_ierror_table);
}
long __machine_check_early_realmode_p9(struct pt_regs *regs)
{
- uint64_t nip, addr;
- long handled;
- struct mce_error_info mce_error_info = { 0 };
-
- nip = regs->nip;
-
- if (P9_SRR1_MC_LOADSTORE(regs->msr)) {
- handled = mce_handle_derror_p9(regs);
- mce_get_derror_p9(regs, &mce_error_info, &addr);
- } else {
- handled = mce_handle_ierror_p9(regs);
- mce_get_ierror_p9(regs, &mce_error_info, &addr);
- }
-
- /* Handle UE error. */
- if (mce_error_info.error_type == MCE_ERROR_TYPE_UE)
- handled = mce_handle_ue_error(regs);
-
- save_mce_event(regs, handled, &mce_error_info, nip, addr);
- return handled;
+ return mce_handle_error(regs, mce_p9_derror_table, mce_p9_ierror_table);
}
diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c
index 2282bf4e63cd..ec60ed0d4aad 100644
--- a/arch/powerpc/kernel/optprobes.c
+++ b/arch/powerpc/kernel/optprobes.c
@@ -243,10 +243,10 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
/*
* 2. branch to optimized_callback() and emulate_step()
*/
- kprobe_lookup_name("optimized_callback", op_callback_addr);
- kprobe_lookup_name("emulate_step", emulate_step_addr);
+ op_callback_addr = (kprobe_opcode_t *)ppc_kallsyms_lookup_name("optimized_callback");
+ emulate_step_addr = (kprobe_opcode_t *)ppc_kallsyms_lookup_name("emulate_step");
if (!op_callback_addr || !emulate_step_addr) {
- WARN(1, "kprobe_lookup_name() failed\n");
+ WARN(1, "Unable to lookup optimized_callback()/emulate_step()\n");
goto error;
}
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index dfc479df9634..8d63627e067f 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -245,3 +245,24 @@ void __init free_unused_pacas(void)
free_lppacas();
}
+
+void copy_mm_to_paca(struct mm_struct *mm)
+{
+#ifdef CONFIG_PPC_BOOK3S
+ mm_context_t *context = &mm->context;
+
+ get_paca()->mm_ctx_id = context->id;
+#ifdef CONFIG_PPC_MM_SLICES
+ VM_BUG_ON(!mm->context.addr_limit);
+ get_paca()->addr_limit = mm->context.addr_limit;
+ get_paca()->mm_ctx_low_slices_psize = context->low_slices_psize;
+ memcpy(&get_paca()->mm_ctx_high_slices_psize,
+ &context->high_slices_psize, TASK_SLICE_ARRAY_SZ(mm));
+#else /* CONFIG_PPC_MM_SLICES */
+ get_paca()->mm_ctx_user_psize = context->user_psize;
+ get_paca()->mm_ctx_sllp = context->sllp;
+#endif
+#else /* CONFIG_PPC_BOOK3S */
+ return;
+#endif
+}
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index f5d399e46193..d2f0afeae5a0 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -55,7 +55,6 @@
#include <asm/kexec.h>
#include <asm/opal.h>
#include <asm/fadump.h>
-#include <asm/debug.h>
#include <asm/epapr_hcalls.h>
#include <asm/firmware.h>
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 1c1b44ec7642..dd8a04f3053a 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -815,7 +815,7 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = {
.virt_base = cpu_to_be32(0xffffffff),
.virt_size = cpu_to_be32(0xffffffff),
.load_base = cpu_to_be32(0xffffffff),
- .min_rma = cpu_to_be32(256), /* 256MB min RMA */
+ .min_rma = cpu_to_be32(512), /* 512MB min RMA */
.min_load = cpu_to_be32(0xffffffff), /* full client load */
.min_rma_percent = 0, /* min RMA percentage of total RAM */
.max_pft_size = 48, /* max log_2(hash table size) */
diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
index 4697da895133..5c10b5925ac2 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -31,11 +31,11 @@
#include <linux/unistd.h>
#include <linux/serial.h>
#include <linux/serial_8250.h>
-#include <linux/debugfs.h>
#include <linux/percpu.h>
#include <linux/memblock.h>
#include <linux/of_platform.h>
#include <linux/hugetlb.h>
+#include <asm/debugfs.h>
#include <asm/io.h>
#include <asm/paca.h>
#include <asm/prom.h>
@@ -920,6 +920,15 @@ void __init setup_arch(char **cmdline_p)
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
init_mm.brk = klimit;
+
+#ifdef CONFIG_PPC_MM_SLICES
+#ifdef CONFIG_PPC64
+ init_mm.context.addr_limit = TASK_SIZE_128TB;
+#else
+#error "context.addr_limit not initialized."
+#endif
+#endif
+
#ifdef CONFIG_PPC_64K_PAGES
init_mm.context.pte_frag = NULL;
#endif
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index f997154dfc41..0d4dcaeaafcb 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -230,8 +230,8 @@ static void cpu_ready_for_interrupts(void)
* If we are not in hypervisor mode the job is done once for
* the whole partition in configure_exceptions().
*/
- if (early_cpu_has_feature(CPU_FTR_HVMODE) &&
- early_cpu_has_feature(CPU_FTR_ARCH_207S)) {
+ if (cpu_has_feature(CPU_FTR_HVMODE) &&
+ cpu_has_feature(CPU_FTR_ARCH_207S)) {
unsigned long lpcr = mfspr(SPRN_LPCR);
mtspr(SPRN_LPCR, lpcr | LPCR_AIL_3);
}
@@ -637,6 +637,11 @@ void __init emergency_stack_init(void)
paca[i].emergency_sp = (void *)ti + THREAD_SIZE;
#ifdef CONFIG_PPC_BOOK3S_64
+ /* emergency stack for NMI exception handling. */
+ ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit));
+ klp_init_thread_info(ti);
+ paca[i].nmi_emergency_sp = (void *)ti + THREAD_SIZE;
+
/* emergency stack for machine check exception handling. */
ti = __va(memblock_alloc_base(THREAD_SIZE, THREAD_SIZE, limit));
klp_init_thread_info(ti);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index d68ed1f004a3..df2a41647d8e 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -39,6 +39,7 @@
#include <asm/irq.h>
#include <asm/hw_irq.h>
#include <asm/kvm_ppc.h>
+#include <asm/dbell.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/prom.h>
@@ -86,8 +87,6 @@ volatile unsigned int cpu_callin_map[NR_CPUS];
int smt_enabled_at_boot = 1;
-static void (*crash_ipi_function_ptr)(struct pt_regs *) = NULL;
-
/*
* Returns 1 if the specified cpu should be brought up during boot.
* Used to inhibit booting threads if they've been disabled or
@@ -158,32 +157,33 @@ static irqreturn_t tick_broadcast_ipi_action(int irq, void *data)
return IRQ_HANDLED;
}
-static irqreturn_t debug_ipi_action(int irq, void *data)
+#ifdef CONFIG_NMI_IPI
+static irqreturn_t nmi_ipi_action(int irq, void *data)
{
- if (crash_ipi_function_ptr) {
- crash_ipi_function_ptr(get_irq_regs());
- return IRQ_HANDLED;
- }
-
-#ifdef CONFIG_DEBUGGER
- debugger_ipi(get_irq_regs());
-#endif /* CONFIG_DEBUGGER */
-
+ smp_handle_nmi_ipi(get_irq_regs());
return IRQ_HANDLED;
}
+#endif
static irq_handler_t smp_ipi_action[] = {
[PPC_MSG_CALL_FUNCTION] = call_function_action,
[PPC_MSG_RESCHEDULE] = reschedule_action,
[PPC_MSG_TICK_BROADCAST] = tick_broadcast_ipi_action,
- [PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
+#ifdef CONFIG_NMI_IPI
+ [PPC_MSG_NMI_IPI] = nmi_ipi_action,
+#endif
};
+/*
+ * The NMI IPI is a fallback and not truly non-maskable. It is simpler
+ * than going through the call function infrastructure, and strongly
+ * serialized, so it is more appropriate for debugging.
+ */
const char *smp_ipi_name[] = {
[PPC_MSG_CALL_FUNCTION] = "ipi call function",
[PPC_MSG_RESCHEDULE] = "ipi reschedule",
[PPC_MSG_TICK_BROADCAST] = "ipi tick-broadcast",
- [PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
+ [PPC_MSG_NMI_IPI] = "nmi ipi",
};
/* optional function to request ipi, for controllers with >= 4 ipis */
@@ -191,14 +191,13 @@ int smp_request_message_ipi(int virq, int msg)
{
int err;
- if (msg < 0 || msg > PPC_MSG_DEBUGGER_BREAK) {
+ if (msg < 0 || msg > PPC_MSG_NMI_IPI)
return -EINVAL;
- }
-#if !defined(CONFIG_DEBUGGER) && !defined(CONFIG_KEXEC_CORE)
- if (msg == PPC_MSG_DEBUGGER_BREAK) {
+#ifndef CONFIG_NMI_IPI
+ if (msg == PPC_MSG_NMI_IPI)
return 1;
- }
#endif
+
err = request_irq(virq, smp_ipi_action[msg],
IRQF_PERCPU | IRQF_NO_THREAD | IRQF_NO_SUSPEND,
smp_ipi_name[msg], NULL);
@@ -211,17 +210,9 @@ int smp_request_message_ipi(int virq, int msg)
#ifdef CONFIG_PPC_SMP_MUXED_IPI
struct cpu_messages {
long messages; /* current messages */
- unsigned long data; /* data for cause ipi */
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message);
-void smp_muxed_ipi_set_data(int cpu, unsigned long data)
-{
- struct cpu_messages *info = &per_cpu(ipi_message, cpu);
-
- info->data = data;
-}
-
void smp_muxed_ipi_set_message(int cpu, int msg)
{
struct cpu_messages *info = &per_cpu(ipi_message, cpu);
@@ -236,14 +227,13 @@ void smp_muxed_ipi_set_message(int cpu, int msg)
void smp_muxed_ipi_message_pass(int cpu, int msg)
{
- struct cpu_messages *info = &per_cpu(ipi_message, cpu);
-
smp_muxed_ipi_set_message(cpu, msg);
+
/*
* cause_ipi functions are required to include a full barrier
* before doing whatever causes the IPI.
*/
- smp_ops->cause_ipi(cpu, info->data);
+ smp_ops->cause_ipi(cpu);
}
#ifdef __BIG_ENDIAN__
@@ -254,11 +244,18 @@ void smp_muxed_ipi_message_pass(int cpu, int msg)
irqreturn_t smp_ipi_demux(void)
{
- struct cpu_messages *info = this_cpu_ptr(&ipi_message);
- unsigned long all;
-
mb(); /* order any irq clear */
+ return smp_ipi_demux_relaxed();
+}
+
+/* sync-free variant. Callers should ensure synchronization */
+irqreturn_t smp_ipi_demux_relaxed(void)
+{
+ struct cpu_messages *info;
+ unsigned long all;
+
+ info = this_cpu_ptr(&ipi_message);
do {
all = xchg(&info->messages, 0);
#if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
@@ -278,8 +275,10 @@ irqreturn_t smp_ipi_demux(void)
scheduler_ipi();
if (all & IPI_MESSAGE(PPC_MSG_TICK_BROADCAST))
tick_broadcast_ipi_handler();
- if (all & IPI_MESSAGE(PPC_MSG_DEBUGGER_BREAK))
- debug_ipi_action(0, NULL);
+#ifdef CONFIG_NMI_IPI
+ if (all & IPI_MESSAGE(PPC_MSG_NMI_IPI))
+ nmi_ipi_action(0, NULL);
+#endif
} while (info->messages);
return IRQ_HANDLED;
@@ -316,6 +315,187 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
}
+#ifdef CONFIG_NMI_IPI
+
+/*
+ * "NMI IPI" system.
+ *
+ * NMI IPIs may not be recoverable, so should not be used as ongoing part of
+ * a running system. They can be used for crash, debug, halt/reboot, etc.
+ *
+ * NMI IPIs are globally single threaded. No more than one in progress at
+ * any time.
+ *
+ * The IPI call waits with interrupts disabled until all targets enter the
+ * NMI handler, then the call returns.
+ *
+ * No new NMI can be initiated until targets exit the handler.
+ *
+ * The IPI call may time out without all targets entering the NMI handler.
+ * In that case, there is some logic to recover (and ignore subsequent
+ * NMI interrupts that may eventually be raised), but the platform interrupt
+ * handler may not be able to distinguish this from other exception causes,
+ * which may cause a crash.
+ */
+
+static atomic_t __nmi_ipi_lock = ATOMIC_INIT(0);
+static struct cpumask nmi_ipi_pending_mask;
+static int nmi_ipi_busy_count = 0;
+static void (*nmi_ipi_function)(struct pt_regs *) = NULL;
+
+static void nmi_ipi_lock_start(unsigned long *flags)
+{
+ raw_local_irq_save(*flags);
+ hard_irq_disable();
+ while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) {
+ raw_local_irq_restore(*flags);
+ cpu_relax();
+ raw_local_irq_save(*flags);
+ hard_irq_disable();
+ }
+}
+
+static void nmi_ipi_lock(void)
+{
+ while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1)
+ cpu_relax();
+}
+
+static void nmi_ipi_unlock(void)
+{
+ smp_mb();
+ WARN_ON(atomic_read(&__nmi_ipi_lock) != 1);
+ atomic_set(&__nmi_ipi_lock, 0);
+}
+
+static void nmi_ipi_unlock_end(unsigned long *flags)
+{
+ nmi_ipi_unlock();
+ raw_local_irq_restore(*flags);
+}
+
+/*
+ * Platform NMI handler calls this to ack
+ */
+int smp_handle_nmi_ipi(struct pt_regs *regs)
+{
+ void (*fn)(struct pt_regs *);
+ unsigned long flags;
+ int me = raw_smp_processor_id();
+ int ret = 0;
+
+ /*
+ * Unexpected NMIs are possible here because the interrupt may not
+ * be able to distinguish NMI IPIs from other types of NMIs, or
+ * because the caller may have timed out.
+ */
+ nmi_ipi_lock_start(&flags);
+ if (!nmi_ipi_busy_count)
+ goto out;
+ if (!cpumask_test_cpu(me, &nmi_ipi_pending_mask))
+ goto out;
+
+ fn = nmi_ipi_function;
+ if (!fn)
+ goto out;
+
+ cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
+ nmi_ipi_busy_count++;
+ nmi_ipi_unlock();
+
+ ret = 1;
+
+ fn(regs);
+
+ nmi_ipi_lock();
+ nmi_ipi_busy_count--;
+out:
+ nmi_ipi_unlock_end(&flags);
+
+ return ret;
+}
+
+static void do_smp_send_nmi_ipi(int cpu)
+{
+ if (smp_ops->cause_nmi_ipi && smp_ops->cause_nmi_ipi(cpu))
+ return;
+
+ if (cpu >= 0) {
+ do_message_pass(cpu, PPC_MSG_NMI_IPI);
+ } else {
+ int c;
+
+ for_each_online_cpu(c) {
+ if (c == raw_smp_processor_id())
+ continue;
+ do_message_pass(c, PPC_MSG_NMI_IPI);
+ }
+ }
+}
+
+/*
+ * - cpu is the target CPU (must not be this CPU), or NMI_IPI_ALL_OTHERS.
+ * - fn is the target callback function.
+ * - delay_us > 0 is the delay before giving up waiting for targets to
+ * enter the handler, == 0 specifies indefinite delay.
+ */
+static int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us)
+{
+ unsigned long flags;
+ int me = raw_smp_processor_id();
+ int ret = 1;
+
+ BUG_ON(cpu == me);
+ BUG_ON(cpu < 0 && cpu != NMI_IPI_ALL_OTHERS);
+
+ if (unlikely(!smp_ops))
+ return 0;
+
+ /* Take the nmi_ipi_busy count/lock with interrupts hard disabled */
+ nmi_ipi_lock_start(&flags);
+ while (nmi_ipi_busy_count) {
+ nmi_ipi_unlock_end(&flags);
+ cpu_relax();
+ nmi_ipi_lock_start(&flags);
+ }
+
+ nmi_ipi_function = fn;
+
+ if (cpu < 0) {
+ /* ALL_OTHERS */
+ cpumask_copy(&nmi_ipi_pending_mask, cpu_online_mask);
+ cpumask_clear_cpu(me, &nmi_ipi_pending_mask);
+ } else {
+ /* cpumask starts clear */
+ cpumask_set_cpu(cpu, &nmi_ipi_pending_mask);
+ }
+ nmi_ipi_busy_count++;
+ nmi_ipi_unlock();
+
+ do_smp_send_nmi_ipi(cpu);
+
+ while (!cpumask_empty(&nmi_ipi_pending_mask)) {
+ udelay(1);
+ if (delay_us) {
+ delay_us--;
+ if (!delay_us)
+ break;
+ }
+ }
+
+ nmi_ipi_lock();
+ if (!cpumask_empty(&nmi_ipi_pending_mask)) {
+ /* Could not gather all CPUs */
+ ret = 0;
+ cpumask_clear(&nmi_ipi_pending_mask);
+ }
+ nmi_ipi_busy_count--;
+ nmi_ipi_unlock_end(&flags);
+
+ return ret;
+}
+#endif /* CONFIG_NMI_IPI */
+
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
void tick_broadcast(const struct cpumask *mask)
{
@@ -326,29 +506,22 @@ void tick_broadcast(const struct cpumask *mask)
}
#endif
-#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC_CORE)
-void smp_send_debugger_break(void)
+#ifdef CONFIG_DEBUGGER
+void debugger_ipi_callback(struct pt_regs *regs)
{
- int cpu;
- int me = raw_smp_processor_id();
-
- if (unlikely(!smp_ops))
- return;
+ debugger_ipi(regs);
+}
- for_each_online_cpu(cpu)
- if (cpu != me)
- do_message_pass(cpu, PPC_MSG_DEBUGGER_BREAK);
+void smp_send_debugger_break(void)
+{
+ smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, debugger_ipi_callback, 1000000);
}
#endif
#ifdef CONFIG_KEXEC_CORE
void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
{
- crash_ipi_function_ptr = crash_ipi_callback;
- if (crash_ipi_callback) {
- mb();
- smp_send_debugger_break();
- }
+ smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_ipi_callback, 1000000);
}
#endif
@@ -439,7 +612,21 @@ int generic_cpu_disable(void)
#ifdef CONFIG_PPC64
vdso_data->processorCount--;
#endif
- migrate_irqs();
+ /* Update affinity of all IRQs previously aimed at this CPU */
+ irq_migrate_all_off_this_cpu();
+
+ /*
+ * Depending on the details of the interrupt controller, it's possible
+ * that one of the interrupts we just migrated away from this CPU is
+ * actually already pending on this CPU. If we leave it in that state
+ * the interrupt will never be EOI'ed, and will never fire again. So
+ * temporarily enable interrupts here, to allow any pending interrupt to
+ * be received (and EOI'ed), before we take this CPU offline.
+ */
+ local_irq_enable();
+ mdelay(1);
+ local_irq_disable();
+
return 0;
}
@@ -521,6 +708,16 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
cpu_idle_thread_init(cpu, tidle);
+ /*
+ * The platform might need to allocate resources prior to bringing
+ * up the CPU
+ */
+ if (smp_ops->prepare_cpu) {
+ rc = smp_ops->prepare_cpu(cpu);
+ if (rc)
+ return rc;
+ }
+
/* Make sure callin-map entry is 0 (can be leftover a CPU
* hotplug
*/
diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c
index 66711958493c..d534ed901538 100644
--- a/arch/powerpc/kernel/stacktrace.c
+++ b/arch/powerpc/kernel/stacktrace.c
@@ -59,7 +59,14 @@ EXPORT_SYMBOL_GPL(save_stack_trace);
void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
{
- save_context_stack(trace, tsk->thread.ksp, tsk, 0);
+ unsigned long sp;
+
+ if (tsk == current)
+ sp = current_stack_pointer();
+ else
+ sp = tsk->thread.ksp;
+
+ save_context_stack(trace, sp, tsk, 0);
}
EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
diff --git a/arch/powerpc/kernel/swsusp.c b/arch/powerpc/kernel/swsusp.c
index 6ae9bd5086a4..0050b2d2ff7a 100644
--- a/arch/powerpc/kernel/swsusp.c
+++ b/arch/powerpc/kernel/swsusp.c
@@ -10,6 +10,7 @@
*/
#include <linux/sched.h>
+#include <linux/suspend.h>
#include <asm/current.h>
#include <asm/mmu_context.h>
#include <asm/switch_to.h>
diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
index de04c9fbb5cd..a877bf8269fe 100644
--- a/arch/powerpc/kernel/syscalls.c
+++ b/arch/powerpc/kernel/syscalls.c
@@ -42,11 +42,11 @@
#include <asm/unistd.h>
#include <asm/asm-prototypes.h>
-static inline unsigned long do_mmap2(unsigned long addr, size_t len,
+static inline long do_mmap2(unsigned long addr, size_t len,
unsigned long prot, unsigned long flags,
unsigned long fd, unsigned long off, int shift)
{
- unsigned long ret = -EINVAL;
+ long ret = -EINVAL;
if (!arch_validate_prot(prot))
goto out;
@@ -62,16 +62,16 @@ out:
return ret;
}
-unsigned long sys_mmap2(unsigned long addr, size_t len,
- unsigned long prot, unsigned long flags,
- unsigned long fd, unsigned long pgoff)
+SYSCALL_DEFINE6(mmap2, unsigned long, addr, size_t, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, unsigned long, pgoff)
{
return do_mmap2(addr, len, prot, flags, fd, pgoff, PAGE_SHIFT-12);
}
-unsigned long sys_mmap(unsigned long addr, size_t len,
- unsigned long prot, unsigned long flags,
- unsigned long fd, off_t offset)
+SYSCALL_DEFINE6(mmap, unsigned long, addr, size_t, len,
+ unsigned long, prot, unsigned long, flags,
+ unsigned long, fd, off_t, offset)
{
return do_mmap2(addr, len, prot, flags, fd, offset, PAGE_SHIFT);
}
diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c
index c1fb255a60d6..4437c70c7c2b 100644
--- a/arch/powerpc/kernel/sysfs.c
+++ b/arch/powerpc/kernel/sysfs.c
@@ -710,6 +710,10 @@ static int register_cpu_online(unsigned int cpu)
struct device_attribute *attrs, *pmc_attrs;
int i, nattrs;
+ /* For cpus present at boot a reference was already grabbed in register_cpu() */
+ if (!s->of_node)
+ s->of_node = of_get_cpu_node(cpu, NULL);
+
#ifdef CONFIG_PPC64
if (cpu_has_feature(CPU_FTR_SMT))
device_create_file(s, &dev_attr_smt_snooze_delay);
@@ -785,9 +789,9 @@ static int register_cpu_online(unsigned int cpu)
return 0;
}
+#ifdef CONFIG_HOTPLUG_CPU
static int unregister_cpu_online(unsigned int cpu)
{
-#ifdef CONFIG_HOTPLUG_CPU
struct cpu *c = &per_cpu(cpu_devices, cpu);
struct device *s = &c->dev;
struct device_attribute *attrs, *pmc_attrs;
@@ -864,9 +868,13 @@ static int unregister_cpu_online(unsigned int cpu)
}
#endif
cacheinfo_cpu_offline(cpu);
-#endif /* CONFIG_HOTPLUG_CPU */
+ of_node_put(s->of_node);
+ s->of_node = NULL;
return 0;
}
+#else /* !CONFIG_HOTPLUG_CPU */
+#define unregister_cpu_online NULL
+#endif
#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
ssize_t arch_cpu_probe(const char *buf, size_t count)
diff --git a/arch/powerpc/kernel/trace/Makefile b/arch/powerpc/kernel/trace/Makefile
new file mode 100644
index 000000000000..729dffc5f7bc
--- /dev/null
+++ b/arch/powerpc/kernel/trace/Makefile
@@ -0,0 +1,29 @@
+#
+# Makefile for the powerpc trace subsystem
+#
+
+subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
+
+ifdef CONFIG_FUNCTION_TRACER
+# do not trace tracer code
+CFLAGS_REMOVE_ftrace.o = -mno-sched-epilog $(CC_FLAGS_FTRACE)
+endif
+
+obj32-$(CONFIG_FUNCTION_TRACER) += ftrace_32.o
+obj64-$(CONFIG_FUNCTION_TRACER) += ftrace_64.o
+ifdef CONFIG_MPROFILE_KERNEL
+obj64-$(CONFIG_FUNCTION_TRACER) += ftrace_64_mprofile.o
+else
+obj64-$(CONFIG_FUNCTION_TRACER) += ftrace_64_pg.o
+endif
+obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
+obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
+obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
+obj-$(CONFIG_TRACING) += trace_clock.o
+
+obj-$(CONFIG_PPC64) += $(obj64-y)
+obj-$(CONFIG_PPC32) += $(obj32-y)
+
+# Disable GCOV & sanitizers in odd or sensitive code
+GCOV_PROFILE_ftrace.o := n
+UBSAN_SANITIZE_ftrace.o := n
diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c
index 5c9f50c1aa99..32509de6ce4c 100644
--- a/arch/powerpc/kernel/ftrace.c
+++ b/arch/powerpc/kernel/trace/ftrace.c
@@ -21,6 +21,7 @@
#include <linux/init.h>
#include <linux/list.h>
+#include <asm/asm-prototypes.h>
#include <asm/cacheflush.h>
#include <asm/code-patching.h>
#include <asm/ftrace.h>
diff --git a/arch/powerpc/kernel/trace/ftrace_32.S b/arch/powerpc/kernel/trace/ftrace_32.S
new file mode 100644
index 000000000000..afef2c076282
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_32.S
@@ -0,0 +1,118 @@
+/*
+ * Split from entry_32.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/reg.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/export.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+ /*
+ * It is required that _mcount on PPC32 must preserve the
+ * link register. But we have r0 to play with. We use r0
+ * to push the return address back to the caller of mcount
+ * into the ctr register, restore the link register and
+ * then jump back using the ctr register.
+ */
+ mflr r0
+ mtctr r0
+ lwz r0, 4(r1)
+ mtlr r0
+ bctr
+
+_GLOBAL(ftrace_caller)
+ MCOUNT_SAVE_FRAME
+ /* r3 ends up with link register */
+ subi r3, r3, MCOUNT_INSN_SIZE
+.globl ftrace_call
+ftrace_call:
+ bl ftrace_stub
+ nop
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ b ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+ MCOUNT_RESTORE_FRAME
+ /* old link register ends up in ctr reg */
+ bctr
+#else
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+
+ MCOUNT_SAVE_FRAME
+
+ subi r3, r3, MCOUNT_INSN_SIZE
+ LOAD_REG_ADDR(r5, ftrace_trace_function)
+ lwz r5,0(r5)
+
+ mtctr r5
+ bctrl
+ nop
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ b ftrace_graph_caller
+#endif
+ MCOUNT_RESTORE_FRAME
+ bctr
+#endif
+EXPORT_SYMBOL(_mcount)
+
+_GLOBAL(ftrace_stub)
+ blr
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(ftrace_graph_caller)
+ /* load r4 with local address */
+ lwz r4, 44(r1)
+ subi r4, r4, MCOUNT_INSN_SIZE
+
+ /* Grab the LR out of the caller stack frame */
+ lwz r3,52(r1)
+
+ bl prepare_ftrace_return
+ nop
+
+ /*
+ * prepare_ftrace_return gives us the address we divert to.
+ * Change the LR in the callers stack frame to this.
+ */
+ stw r3,52(r1)
+
+ MCOUNT_RESTORE_FRAME
+ /* old link register ends up in ctr reg */
+ bctr
+
+_GLOBAL(return_to_handler)
+ /* need to save return values */
+ stwu r1, -32(r1)
+ stw r3, 20(r1)
+ stw r4, 16(r1)
+ stw r31, 12(r1)
+ mr r31, r1
+
+ bl ftrace_return_to_handler
+ nop
+
+ /* return value has real return address */
+ mtlr r3
+
+ lwz r3, 20(r1)
+ lwz r4, 16(r1)
+ lwz r31,12(r1)
+ lwz r1, 0(r1)
+
+ /* Jump back to real return address */
+ blr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/ftrace_64.S b/arch/powerpc/kernel/trace/ftrace_64.S
new file mode 100644
index 000000000000..e5ccea19821e
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_64.S
@@ -0,0 +1,85 @@
+/*
+ * Split from entry_64.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/ppc-opcode.h>
+#include <asm/export.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+_GLOBAL(mcount)
+_GLOBAL(_mcount)
+EXPORT_SYMBOL(_mcount)
+ mflr r12
+ mtctr r12
+ mtlr r0
+ bctr
+
+#else /* CONFIG_DYNAMIC_FTRACE */
+_GLOBAL_TOC(_mcount)
+EXPORT_SYMBOL(_mcount)
+ /* Taken from output of objdump from lib64/glibc */
+ mflr r3
+ ld r11, 0(r1)
+ stdu r1, -112(r1)
+ std r3, 128(r1)
+ ld r4, 16(r11)
+
+ subi r3, r3, MCOUNT_INSN_SIZE
+ LOAD_REG_ADDR(r5,ftrace_trace_function)
+ ld r5,0(r5)
+ ld r5,0(r5)
+ mtctr r5
+ bctrl
+ nop
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ b ftrace_graph_caller
+#endif
+ ld r0, 128(r1)
+ mtlr r0
+ addi r1, r1, 112
+_GLOBAL(ftrace_stub)
+ blr
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(return_to_handler)
+ /* need to save return values */
+ std r4, -32(r1)
+ std r3, -24(r1)
+ /* save TOC */
+ std r2, -16(r1)
+ std r31, -8(r1)
+ mr r31, r1
+ stdu r1, -112(r1)
+
+ /*
+ * We might be called from a module.
+ * Switch to our TOC to run inside the core kernel.
+ */
+ ld r2, PACATOC(r13)
+
+ bl ftrace_return_to_handler
+ nop
+
+ /* return value has real return address */
+ mtlr r3
+
+ ld r1, 0(r1)
+ ld r4, -32(r1)
+ ld r3, -24(r1)
+ ld r2, -16(r1)
+ ld r31, -8(r1)
+
+ /* Jump back to real return address */
+ blr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/ftrace_64_mprofile.S b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
new file mode 100644
index 000000000000..7c933a99f5d5
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_64_mprofile.S
@@ -0,0 +1,272 @@
+/*
+ * Split from ftrace_64.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/ppc-opcode.h>
+#include <asm/export.h>
+#include <asm/thread_info.h>
+#include <asm/bug.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+/*
+ *
+ * ftrace_caller() is the function that replaces _mcount() when ftrace is
+ * active.
+ *
+ * We arrive here after a function A calls function B, and we are the trace
+ * function for B. When we enter r1 points to A's stack frame, B has not yet
+ * had a chance to allocate one yet.
+ *
+ * Additionally r2 may point either to the TOC for A, or B, depending on
+ * whether B did a TOC setup sequence before calling us.
+ *
+ * On entry the LR points back to the _mcount() call site, and r0 holds the
+ * saved LR as it was on entry to B, ie. the original return address at the
+ * call site in A.
+ *
+ * Our job is to save the register state into a struct pt_regs (on the stack)
+ * and then arrange for the ftrace function to be called.
+ */
+_GLOBAL(ftrace_caller)
+ /* Save the original return address in A's stack frame */
+ std r0,LRSAVE(r1)
+
+ /* Create our stack frame + pt_regs */
+ stdu r1,-SWITCH_FRAME_SIZE(r1)
+
+ /* Save all gprs to pt_regs */
+ SAVE_8GPRS(0,r1)
+ SAVE_8GPRS(8,r1)
+ SAVE_8GPRS(16,r1)
+ SAVE_8GPRS(24,r1)
+
+ /* Load special regs for save below */
+ mfmsr r8
+ mfctr r9
+ mfxer r10
+ mfcr r11
+
+ /* Get the _mcount() call site out of LR */
+ mflr r7
+ /* Save it as pt_regs->nip */
+ std r7, _NIP(r1)
+ /* Save the read LR in pt_regs->link */
+ std r0, _LINK(r1)
+
+ /* Save callee's TOC in the ABI compliant location */
+ std r2, 24(r1)
+ ld r2,PACATOC(r13) /* get kernel TOC in r2 */
+
+ addis r3,r2,function_trace_op@toc@ha
+ addi r3,r3,function_trace_op@toc@l
+ ld r5,0(r3)
+
+#ifdef CONFIG_LIVEPATCH
+ mr r14,r7 /* remember old NIP */
+#endif
+ /* Calculate ip from nip-4 into r3 for call below */
+ subi r3, r7, MCOUNT_INSN_SIZE
+
+ /* Put the original return address in r4 as parent_ip */
+ mr r4, r0
+
+ /* Save special regs */
+ std r8, _MSR(r1)
+ std r9, _CTR(r1)
+ std r10, _XER(r1)
+ std r11, _CCR(r1)
+
+ /* Load &pt_regs in r6 for call below */
+ addi r6, r1 ,STACK_FRAME_OVERHEAD
+
+ /* ftrace_call(r3, r4, r5, r6) */
+.globl ftrace_call
+ftrace_call:
+ bl ftrace_stub
+ nop
+
+ /* Load ctr with the possibly modified NIP */
+ ld r3, _NIP(r1)
+ mtctr r3
+#ifdef CONFIG_LIVEPATCH
+ cmpd r14,r3 /* has NIP been altered? */
+#endif
+
+ /* Restore gprs */
+ REST_8GPRS(0,r1)
+ REST_8GPRS(8,r1)
+ REST_8GPRS(16,r1)
+ REST_8GPRS(24,r1)
+
+ /* Restore possibly modified LR */
+ ld r0, _LINK(r1)
+ mtlr r0
+
+ /* Restore callee's TOC */
+ ld r2, 24(r1)
+
+ /* Pop our stack frame */
+ addi r1, r1, SWITCH_FRAME_SIZE
+
+#ifdef CONFIG_LIVEPATCH
+ /* Based on the cmpd above, if the NIP was altered handle livepatch */
+ bne- livepatch_handler
+#endif
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ b ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+
+ bctr /* jump after _mcount site */
+
+_GLOBAL(ftrace_stub)
+ blr
+
+#ifdef CONFIG_LIVEPATCH
+ /*
+ * This function runs in the mcount context, between two functions. As
+ * such it can only clobber registers which are volatile and used in
+ * function linkage.
+ *
+ * We get here when a function A, calls another function B, but B has
+ * been live patched with a new function C.
+ *
+ * On entry:
+ * - we have no stack frame and can not allocate one
+ * - LR points back to the original caller (in A)
+ * - CTR holds the new NIP in C
+ * - r0 & r12 are free
+ *
+ * r0 can't be used as the base register for a DS-form load or store, so
+ * we temporarily shuffle r1 (stack pointer) into r0 and then put it back.
+ */
+livepatch_handler:
+ CURRENT_THREAD_INFO(r12, r1)
+
+ /* Save stack pointer into r0 */
+ mr r0, r1
+
+ /* Allocate 3 x 8 bytes */
+ ld r1, TI_livepatch_sp(r12)
+ addi r1, r1, 24
+ std r1, TI_livepatch_sp(r12)
+
+ /* Save toc & real LR on livepatch stack */
+ std r2, -24(r1)
+ mflr r12
+ std r12, -16(r1)
+
+ /* Store stack end marker */
+ lis r12, STACK_END_MAGIC@h
+ ori r12, r12, STACK_END_MAGIC@l
+ std r12, -8(r1)
+
+ /* Restore real stack pointer */
+ mr r1, r0
+
+ /* Put ctr in r12 for global entry and branch there */
+ mfctr r12
+ bctrl
+
+ /*
+ * Now we are returning from the patched function to the original
+ * caller A. We are free to use r0 and r12, and we can use r2 until we
+ * restore it.
+ */
+
+ CURRENT_THREAD_INFO(r12, r1)
+
+ /* Save stack pointer into r0 */
+ mr r0, r1
+
+ ld r1, TI_livepatch_sp(r12)
+
+ /* Check stack marker hasn't been trashed */
+ lis r2, STACK_END_MAGIC@h
+ ori r2, r2, STACK_END_MAGIC@l
+ ld r12, -8(r1)
+1: tdne r12, r2
+ EMIT_BUG_ENTRY 1b, __FILE__, __LINE__ - 1, 0
+
+ /* Restore LR & toc from livepatch stack */
+ ld r12, -16(r1)
+ mtlr r12
+ ld r2, -24(r1)
+
+ /* Pop livepatch stack frame */
+ CURRENT_THREAD_INFO(r12, r0)
+ subi r1, r1, 24
+ std r1, TI_livepatch_sp(r12)
+
+ /* Restore real stack pointer */
+ mr r1, r0
+
+ /* Return to original caller of live patched function */
+ blr
+#endif /* CONFIG_LIVEPATCH */
+
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(ftrace_graph_caller)
+ stdu r1, -112(r1)
+ /* with -mprofile-kernel, parameter regs are still alive at _mcount */
+ std r10, 104(r1)
+ std r9, 96(r1)
+ std r8, 88(r1)
+ std r7, 80(r1)
+ std r6, 72(r1)
+ std r5, 64(r1)
+ std r4, 56(r1)
+ std r3, 48(r1)
+
+ /* Save callee's TOC in the ABI compliant location */
+ std r2, 24(r1)
+ ld r2, PACATOC(r13) /* get kernel TOC in r2 */
+
+ mfctr r4 /* ftrace_caller has moved local addr here */
+ std r4, 40(r1)
+ mflr r3 /* ftrace_caller has restored LR from stack */
+ subi r4, r4, MCOUNT_INSN_SIZE
+
+ bl prepare_ftrace_return
+ nop
+
+ /*
+ * prepare_ftrace_return gives us the address we divert to.
+ * Change the LR to this.
+ */
+ mtlr r3
+
+ ld r0, 40(r1)
+ mtctr r0
+ ld r10, 104(r1)
+ ld r9, 96(r1)
+ ld r8, 88(r1)
+ ld r7, 80(r1)
+ ld r6, 72(r1)
+ ld r5, 64(r1)
+ ld r4, 56(r1)
+ ld r3, 48(r1)
+
+ /* Restore callee's TOC */
+ ld r2, 24(r1)
+
+ addi r1, r1, 112
+ mflr r0
+ std r0, LRSAVE(r1)
+ bctr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.S b/arch/powerpc/kernel/trace/ftrace_64_pg.S
new file mode 100644
index 000000000000..f095358da96e
--- /dev/null
+++ b/arch/powerpc/kernel/trace/ftrace_64_pg.S
@@ -0,0 +1,68 @@
+/*
+ * Split from ftrace_64.S
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/magic.h>
+#include <asm/ppc_asm.h>
+#include <asm/asm-offsets.h>
+#include <asm/ftrace.h>
+#include <asm/ppc-opcode.h>
+#include <asm/export.h>
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+_GLOBAL_TOC(ftrace_caller)
+ /* Taken from output of objdump from lib64/glibc */
+ mflr r3
+ ld r11, 0(r1)
+ stdu r1, -112(r1)
+ std r3, 128(r1)
+ ld r4, 16(r11)
+ subi r3, r3, MCOUNT_INSN_SIZE
+.globl ftrace_call
+ftrace_call:
+ bl ftrace_stub
+ nop
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+.globl ftrace_graph_call
+ftrace_graph_call:
+ b ftrace_graph_stub
+_GLOBAL(ftrace_graph_stub)
+#endif
+ ld r0, 128(r1)
+ mtlr r0
+ addi r1, r1, 112
+
+_GLOBAL(ftrace_stub)
+ blr
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+_GLOBAL(ftrace_graph_caller)
+ /* load r4 with local address */
+ ld r4, 128(r1)
+ subi r4, r4, MCOUNT_INSN_SIZE
+
+ /* Grab the LR out of the caller stack frame */
+ ld r11, 112(r1)
+ ld r3, 16(r11)
+
+ bl prepare_ftrace_return
+ nop
+
+ /*
+ * prepare_ftrace_return gives us the address we divert to.
+ * Change the LR in the callers stack frame to this.
+ */
+ ld r11, 112(r1)
+ std r3, 16(r11)
+
+ ld r0, 128(r1)
+ mtlr r0
+ addi r1, r1, 112
+ blr
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/arch/powerpc/kernel/trace_clock.c b/arch/powerpc/kernel/trace/trace_clock.c
index 49170690946d..49170690946d 100644
--- a/arch/powerpc/kernel/trace_clock.c
+++ b/arch/powerpc/kernel/trace/trace_clock.c
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index ff365f9de27a..d4e545d27ef9 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -35,13 +35,13 @@
#include <linux/backlight.h>
#include <linux/bug.h>
#include <linux/kdebug.h>
-#include <linux/debugfs.h>
#include <linux/ratelimit.h>
#include <linux/context_tracking.h>
#include <asm/emulated_ops.h>
#include <asm/pgtable.h>
#include <linux/uaccess.h>
+#include <asm/debugfs.h>
#include <asm/io.h>
#include <asm/machdep.h>
#include <asm/rtas.h>
@@ -279,18 +279,35 @@ void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr)
void system_reset_exception(struct pt_regs *regs)
{
+ /*
+ * Avoid crashes in case of nested NMI exceptions. Recoverability
+ * is determined by RI and in_nmi
+ */
+ bool nested = in_nmi();
+ if (!nested)
+ nmi_enter();
+
/* See if any machine dependent calls */
if (ppc_md.system_reset_exception) {
if (ppc_md.system_reset_exception(regs))
- return;
+ goto out;
}
die("System Reset", regs, SIGABRT);
+out:
+#ifdef CONFIG_PPC_BOOK3S_64
+ BUG_ON(get_paca()->in_nmi == 0);
+ if (get_paca()->in_nmi > 1)
+ panic("Unrecoverable nested System Reset");
+#endif
/* Must die if the interrupt is not recoverable */
if (!(regs->msr & MSR_RI))
panic("Unrecoverable System Reset");
+ if (!nested)
+ nmi_exit();
+
/* What should we do here? We could issue a shutdown or hard reset. */
}
@@ -306,8 +323,6 @@ long machine_check_early(struct pt_regs *regs)
__this_cpu_inc(irq_stat.mce_exceptions);
- add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
-
if (cur_cpu_spec && cur_cpu_spec->machine_check_early)
handled = cur_cpu_spec->machine_check_early(regs);
return handled;
@@ -741,6 +756,8 @@ void machine_check_exception(struct pt_regs *regs)
__this_cpu_inc(irq_stat.mce_exceptions);
+ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
+
/* See if any machine dependent calls. In theory, we would want
* to call the CPU first, and call the ppc_md. one if the CPU
* one returns a positive number. However there is existing code
@@ -1440,6 +1457,8 @@ void facility_unavailable_exception(struct pt_regs *regs)
[FSCR_TM_LG] = "TM",
[FSCR_EBB_LG] = "EBB",
[FSCR_TAR_LG] = "TAR",
+ [FSCR_MSGP_LG] = "MSGP",
+ [FSCR_SCV_LG] = "SCV",
};
char *facility = "unknown";
u64 value;
diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S
index 1c24c894c908..2f793be3d2b1 100644
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -77,6 +77,8 @@ SECTIONS
#endif
} :kernel
+ __head_end = .;
+
/*
* If the build dies here, it's likely code in head_64.S is referencing
* labels it can't reach, and the linker inserting stubs without the
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index b6b5c185bd92..aedacefd961d 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -20,6 +20,10 @@
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/miscdevice.h>
+#include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
#include <asm/reg.h>
#include <asm/cputable.h>
@@ -31,10 +35,6 @@
#include <asm/kvm_book3s.h>
#include <asm/mmu_context.h>
#include <asm/page.h>
-#include <linux/gfp.h>
-#include <linux/sched.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
#include "book3s.h"
#include "trace.h"
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index a587e8f4fd26..74b0153780e3 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -229,6 +229,7 @@ void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte)
static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
{
+ unsigned long vsid_bits = VSID_BITS_65_256M;
struct kvmppc_sid_map *map;
struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu);
u16 sid_map_mask;
@@ -257,7 +258,12 @@ static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid)
kvmppc_mmu_pte_flush(vcpu, 0, 0);
kvmppc_mmu_flush_segments(vcpu);
}
- map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++, 256M);
+
+ if (mmu_has_feature(MMU_FTR_68_BIT_VA))
+ vsid_bits = VSID_BITS_256M;
+
+ map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++,
+ VSID_MULTIPLIER_256M, vsid_bits);
map->guest_vsid = gvsid;
map->valid = true;
@@ -390,7 +396,7 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
int err;
- err = __init_new_context();
+ err = hash__alloc_context_id();
if (err < 0)
return -1;
vcpu3s->context_id[0] = err;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 1ec86d9e2a82..fadb75abfe37 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -35,6 +35,15 @@
#include <linux/srcu.h>
#include <linux/miscdevice.h>
#include <linux/debugfs.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
+#include <linux/module.h>
+#include <linux/compiler.h>
+#include <linux/of.h>
#include <asm/reg.h>
#include <asm/cputable.h>
@@ -58,15 +67,6 @@
#include <asm/mmu.h>
#include <asm/opal.h>
#include <asm/xics.h>
-#include <linux/gfp.h>
-#include <linux/vmalloc.h>
-#include <linux/highmem.h>
-#include <linux/hugetlb.h>
-#include <linux/kvm_irqfd.h>
-#include <linux/irqbypass.h>
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/of.h>
#include "book3s.h"
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 4d6c64b3041c..a752e29977e0 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -23,6 +23,7 @@
#include <asm/kvm_book3s.h>
#include <asm/archrandom.h>
#include <asm/xics.h>
+#include <asm/xive.h>
#include <asm/dbell.h>
#include <asm/cputhreads.h>
#include <asm/io.h>
@@ -193,12 +194,6 @@ long kvmppc_h_random(struct kvm_vcpu *vcpu)
return H_HARDWARE;
}
-static inline void rm_writeb(unsigned long paddr, u8 val)
-{
- __asm__ __volatile__("stbcix %0,0,%1"
- : : "r" (val), "r" (paddr) : "memory");
-}
-
/*
* Send an interrupt or message to another CPU.
* The caller needs to include any barrier needed to order writes
@@ -206,7 +201,7 @@ static inline void rm_writeb(unsigned long paddr, u8 val)
*/
void kvmhv_rm_send_ipi(int cpu)
{
- unsigned long xics_phys;
+ void __iomem *xics_phys;
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
/* On POWER9 we can use msgsnd for any destination cpu. */
@@ -224,10 +219,14 @@ void kvmhv_rm_send_ipi(int cpu)
return;
}
+ /* We should never reach this */
+ if (WARN_ON_ONCE(xive_enabled()))
+ return;
+
/* Else poke the target with an IPI */
xics_phys = paca[cpu].kvm_hstate.xics_phys;
if (xics_phys)
- rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+ __raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR);
else
opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
}
@@ -386,6 +385,9 @@ long kvmppc_read_intr(void)
long rc;
bool again;
+ if (xive_enabled())
+ return 1;
+
do {
again = false;
rc = kvmppc_read_one_intr(&again);
@@ -397,7 +399,7 @@ long kvmppc_read_intr(void)
static long kvmppc_read_one_intr(bool *again)
{
- unsigned long xics_phys;
+ void __iomem *xics_phys;
u32 h_xirr;
__be32 xirr;
u32 xisr;
@@ -415,7 +417,7 @@ static long kvmppc_read_one_intr(bool *again)
if (!xics_phys)
rc = opal_int_get_xirr(&xirr, false);
else
- xirr = _lwzcix(xics_phys + XICS_XIRR);
+ xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
if (rc < 0)
return 1;
@@ -445,8 +447,8 @@ static long kvmppc_read_one_intr(bool *again)
if (xisr == XICS_IPI) {
rc = 0;
if (xics_phys) {
- _stbcix(xics_phys + XICS_MFRR, 0xff);
- _stwcix(xics_phys + XICS_XIRR, xirr);
+ __raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
+ __raw_rm_writel(xirr, xics_phys + XICS_XIRR);
} else {
opal_int_set_mfrr(hard_smp_processor_id(), 0xff);
rc = opal_int_eoi(h_xirr);
@@ -471,7 +473,8 @@ static long kvmppc_read_one_intr(bool *again)
* we need to resend that IPI, bummer
*/
if (xics_phys)
- _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
+ __raw_rm_writeb(IPI_PRIORITY,
+ xics_phys + XICS_MFRR);
else
opal_int_set_mfrr(hard_smp_processor_id(),
IPI_PRIORITY);
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index e78542d99cd6..ffde4507ddfd 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -16,7 +16,6 @@
#include <asm/kvm_ppc.h>
#include <asm/hvcall.h>
#include <asm/xics.h>
-#include <asm/debug.h>
#include <asm/synch.h>
#include <asm/cputhreads.h>
#include <asm/pgtable.h>
@@ -766,7 +765,7 @@ unsigned long eoi_rc;
static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
{
- unsigned long xics_phys;
+ void __iomem *xics_phys;
int64_t rc;
rc = pnv_opal_pci_msi_eoi(c, hwirq);
@@ -779,7 +778,7 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
/* EOI it */
xics_phys = local_paca->kvm_hstate.xics_phys;
if (xics_phys) {
- _stwcix(xics_phys + XICS_XIRR, xirr);
+ __raw_rm_writel(xirr, xics_phys + XICS_XIRR);
} else {
rc = opal_int_eoi(be32_to_cpu(xirr));
*again = rc > 0;
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index e48803e2918d..459b72cb617a 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -19,10 +19,9 @@
#include <asm/kvm_ppc.h>
#include <asm/hvcall.h>
#include <asm/xics.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
#include <asm/time.h>
-#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include "book3s_xics.h"
@@ -1084,7 +1083,7 @@ static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
return xics->ics[icsid];
}
-int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
+static int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num)
{
struct kvmppc_icp *icp;
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index 0d3002b7e2b4..500b0f6a0b64 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -8,6 +8,7 @@
*/
#include <linux/kernel.h>
+#include <linux/kprobes.h>
#include <linux/vmalloc.h>
#include <linux/init.h>
#include <linux/mm.h>
@@ -59,7 +60,7 @@ bool is_offset_in_branch_range(long offset)
* Helper to check if a given instruction is a conditional branch
* Derived from the conditional checks in analyse_instr()
*/
-bool __kprobes is_conditional_branch(unsigned int instr)
+bool is_conditional_branch(unsigned int instr)
{
unsigned int opcode = instr >> 26;
@@ -75,6 +76,7 @@ bool __kprobes is_conditional_branch(unsigned int instr)
}
return false;
}
+NOKPROBE_SYMBOL(is_conditional_branch);
unsigned int create_branch(const unsigned int *addr,
unsigned long target, int flags)
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index 9c542ec70c5b..33117f8a0882 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -49,7 +49,8 @@ extern int do_stxvd2x(int rn, unsigned long ea);
/*
* Emulate the truncation of 64 bit values in 32-bit mode.
*/
-static unsigned long truncate_if_32bit(unsigned long msr, unsigned long val)
+static nokprobe_inline unsigned long truncate_if_32bit(unsigned long msr,
+ unsigned long val)
{
#ifdef __powerpc64__
if ((msr & MSR_64BIT) == 0)
@@ -61,7 +62,7 @@ static unsigned long truncate_if_32bit(unsigned long msr, unsigned long val)
/*
* Determine whether a conditional branch instruction would branch.
*/
-static int __kprobes branch_taken(unsigned int instr, struct pt_regs *regs)
+static nokprobe_inline int branch_taken(unsigned int instr, struct pt_regs *regs)
{
unsigned int bo = (instr >> 21) & 0x1f;
unsigned int bi;
@@ -81,8 +82,7 @@ static int __kprobes branch_taken(unsigned int instr, struct pt_regs *regs)
return 1;
}
-
-static long __kprobes address_ok(struct pt_regs *regs, unsigned long ea, int nb)
+static nokprobe_inline long address_ok(struct pt_regs *regs, unsigned long ea, int nb)
{
if (!user_mode(regs))
return 1;
@@ -92,7 +92,7 @@ static long __kprobes address_ok(struct pt_regs *regs, unsigned long ea, int nb)
/*
* Calculate effective address for a D-form instruction
*/
-static unsigned long __kprobes dform_ea(unsigned int instr, struct pt_regs *regs)
+static nokprobe_inline unsigned long dform_ea(unsigned int instr, struct pt_regs *regs)
{
int ra;
unsigned long ea;
@@ -109,7 +109,7 @@ static unsigned long __kprobes dform_ea(unsigned int instr, struct pt_regs *regs
/*
* Calculate effective address for a DS-form instruction
*/
-static unsigned long __kprobes dsform_ea(unsigned int instr, struct pt_regs *regs)
+static nokprobe_inline unsigned long dsform_ea(unsigned int instr, struct pt_regs *regs)
{
int ra;
unsigned long ea;
@@ -126,8 +126,8 @@ static unsigned long __kprobes dsform_ea(unsigned int instr, struct pt_regs *reg
/*
* Calculate effective address for an X-form instruction
*/
-static unsigned long __kprobes xform_ea(unsigned int instr,
- struct pt_regs *regs)
+static nokprobe_inline unsigned long xform_ea(unsigned int instr,
+ struct pt_regs *regs)
{
int ra, rb;
unsigned long ea;
@@ -145,33 +145,33 @@ static unsigned long __kprobes xform_ea(unsigned int instr,
* Return the largest power of 2, not greater than sizeof(unsigned long),
* such that x is a multiple of it.
*/
-static inline unsigned long max_align(unsigned long x)
+static nokprobe_inline unsigned long max_align(unsigned long x)
{
x |= sizeof(unsigned long);
return x & -x; /* isolates rightmost bit */
}
-static inline unsigned long byterev_2(unsigned long x)
+static nokprobe_inline unsigned long byterev_2(unsigned long x)
{
return ((x >> 8) & 0xff) | ((x & 0xff) << 8);
}
-static inline unsigned long byterev_4(unsigned long x)
+static nokprobe_inline unsigned long byterev_4(unsigned long x)
{
return ((x >> 24) & 0xff) | ((x >> 8) & 0xff00) |
((x & 0xff00) << 8) | ((x & 0xff) << 24);
}
#ifdef __powerpc64__
-static inline unsigned long byterev_8(unsigned long x)
+static nokprobe_inline unsigned long byterev_8(unsigned long x)
{
return (byterev_4(x) << 32) | byterev_4(x >> 32);
}
#endif
-static int __kprobes read_mem_aligned(unsigned long *dest, unsigned long ea,
- int nb)
+static nokprobe_inline int read_mem_aligned(unsigned long *dest,
+ unsigned long ea, int nb)
{
int err = 0;
unsigned long x = 0;
@@ -197,8 +197,8 @@ static int __kprobes read_mem_aligned(unsigned long *dest, unsigned long ea,
return err;
}
-static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea,
- int nb, struct pt_regs *regs)
+static nokprobe_inline int read_mem_unaligned(unsigned long *dest,
+ unsigned long ea, int nb, struct pt_regs *regs)
{
int err;
unsigned long x, b, c;
@@ -248,7 +248,7 @@ static int __kprobes read_mem_unaligned(unsigned long *dest, unsigned long ea,
* Read memory at address ea for nb bytes, return 0 for success
* or -EFAULT if an error occurred.
*/
-static int __kprobes read_mem(unsigned long *dest, unsigned long ea, int nb,
+static int read_mem(unsigned long *dest, unsigned long ea, int nb,
struct pt_regs *regs)
{
if (!address_ok(regs, ea, nb))
@@ -257,9 +257,10 @@ static int __kprobes read_mem(unsigned long *dest, unsigned long ea, int nb,
return read_mem_aligned(dest, ea, nb);
return read_mem_unaligned(dest, ea, nb, regs);
}
+NOKPROBE_SYMBOL(read_mem);
-static int __kprobes write_mem_aligned(unsigned long val, unsigned long ea,
- int nb)
+static nokprobe_inline int write_mem_aligned(unsigned long val,
+ unsigned long ea, int nb)
{
int err = 0;
@@ -282,8 +283,8 @@ static int __kprobes write_mem_aligned(unsigned long val, unsigned long ea,
return err;
}
-static int __kprobes write_mem_unaligned(unsigned long val, unsigned long ea,
- int nb, struct pt_regs *regs)
+static nokprobe_inline int write_mem_unaligned(unsigned long val,
+ unsigned long ea, int nb, struct pt_regs *regs)
{
int err;
unsigned long c;
@@ -325,7 +326,7 @@ static int __kprobes write_mem_unaligned(unsigned long val, unsigned long ea,
* Write memory at address ea for nb bytes, return 0 for success
* or -EFAULT if an error occurred.
*/
-static int __kprobes write_mem(unsigned long val, unsigned long ea, int nb,
+static int write_mem(unsigned long val, unsigned long ea, int nb,
struct pt_regs *regs)
{
if (!address_ok(regs, ea, nb))
@@ -334,13 +335,14 @@ static int __kprobes write_mem(unsigned long val, unsigned long ea, int nb,
return write_mem_aligned(val, ea, nb);
return write_mem_unaligned(val, ea, nb, regs);
}
+NOKPROBE_SYMBOL(write_mem);
#ifdef CONFIG_PPC_FPU
/*
* Check the address and alignment, and call func to do the actual
* load or store.
*/
-static int __kprobes do_fp_load(int rn, int (*func)(int, unsigned long),
+static int do_fp_load(int rn, int (*func)(int, unsigned long),
unsigned long ea, int nb,
struct pt_regs *regs)
{
@@ -380,8 +382,9 @@ static int __kprobes do_fp_load(int rn, int (*func)(int, unsigned long),
return err;
return (*func)(rn, ptr);
}
+NOKPROBE_SYMBOL(do_fp_load);
-static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long),
+static int do_fp_store(int rn, int (*func)(int, unsigned long),
unsigned long ea, int nb,
struct pt_regs *regs)
{
@@ -425,11 +428,12 @@ static int __kprobes do_fp_store(int rn, int (*func)(int, unsigned long),
}
return err;
}
+NOKPROBE_SYMBOL(do_fp_store);
#endif
#ifdef CONFIG_ALTIVEC
/* For Altivec/VMX, no need to worry about alignment */
-static int __kprobes do_vec_load(int rn, int (*func)(int, unsigned long),
+static nokprobe_inline int do_vec_load(int rn, int (*func)(int, unsigned long),
unsigned long ea, struct pt_regs *regs)
{
if (!address_ok(regs, ea & ~0xfUL, 16))
@@ -437,7 +441,7 @@ static int __kprobes do_vec_load(int rn, int (*func)(int, unsigned long),
return (*func)(rn, ea);
}
-static int __kprobes do_vec_store(int rn, int (*func)(int, unsigned long),
+static nokprobe_inline int do_vec_store(int rn, int (*func)(int, unsigned long),
unsigned long ea, struct pt_regs *regs)
{
if (!address_ok(regs, ea & ~0xfUL, 16))
@@ -447,7 +451,7 @@ static int __kprobes do_vec_store(int rn, int (*func)(int, unsigned long),
#endif /* CONFIG_ALTIVEC */
#ifdef CONFIG_VSX
-static int __kprobes do_vsx_load(int rn, int (*func)(int, unsigned long),
+static nokprobe_inline int do_vsx_load(int rn, int (*func)(int, unsigned long),
unsigned long ea, struct pt_regs *regs)
{
int err;
@@ -465,7 +469,7 @@ static int __kprobes do_vsx_load(int rn, int (*func)(int, unsigned long),
return err;
}
-static int __kprobes do_vsx_store(int rn, int (*func)(int, unsigned long),
+static nokprobe_inline int do_vsx_store(int rn, int (*func)(int, unsigned long),
unsigned long ea, struct pt_regs *regs)
{
int err;
@@ -522,7 +526,7 @@ static int __kprobes do_vsx_store(int rn, int (*func)(int, unsigned long),
: "=r" (err) \
: "r" (addr), "i" (-EFAULT), "0" (err))
-static void __kprobes set_cr0(struct pt_regs *regs, int rd)
+static nokprobe_inline void set_cr0(struct pt_regs *regs, int rd)
{
long val = regs->gpr[rd];
@@ -539,7 +543,7 @@ static void __kprobes set_cr0(struct pt_regs *regs, int rd)
regs->ccr |= 0x20000000;
}
-static void __kprobes add_with_carry(struct pt_regs *regs, int rd,
+static nokprobe_inline void add_with_carry(struct pt_regs *regs, int rd,
unsigned long val1, unsigned long val2,
unsigned long carry_in)
{
@@ -560,7 +564,7 @@ static void __kprobes add_with_carry(struct pt_regs *regs, int rd,
regs->xer &= ~XER_CA;
}
-static void __kprobes do_cmp_signed(struct pt_regs *regs, long v1, long v2,
+static nokprobe_inline void do_cmp_signed(struct pt_regs *regs, long v1, long v2,
int crfld)
{
unsigned int crval, shift;
@@ -576,7 +580,7 @@ static void __kprobes do_cmp_signed(struct pt_regs *regs, long v1, long v2,
regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift);
}
-static void __kprobes do_cmp_unsigned(struct pt_regs *regs, unsigned long v1,
+static nokprobe_inline void do_cmp_unsigned(struct pt_regs *regs, unsigned long v1,
unsigned long v2, int crfld)
{
unsigned int crval, shift;
@@ -592,7 +596,7 @@ static void __kprobes do_cmp_unsigned(struct pt_regs *regs, unsigned long v1,
regs->ccr = (regs->ccr & ~(0xf << shift)) | (crval << shift);
}
-static int __kprobes trap_compare(long v1, long v2)
+static nokprobe_inline int trap_compare(long v1, long v2)
{
int ret = 0;
@@ -631,7 +635,7 @@ static int __kprobes trap_compare(long v1, long v2)
* Returns 1 if the instruction has been executed, or 0 if not.
* Sets *op to indicate what the instruction does.
*/
-int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs,
+int analyse_instr(struct instruction_op *op, struct pt_regs *regs,
unsigned int instr)
{
unsigned int opcode, ra, rb, rd, spr, u;
@@ -1692,6 +1696,7 @@ int __kprobes analyse_instr(struct instruction_op *op, struct pt_regs *regs,
#endif
}
EXPORT_SYMBOL_GPL(analyse_instr);
+NOKPROBE_SYMBOL(analyse_instr);
/*
* For PPC32 we always use stwu with r1 to change the stack pointer.
@@ -1701,7 +1706,7 @@ EXPORT_SYMBOL_GPL(analyse_instr);
* don't emulate the real store operation. We will do real store
* operation safely in exception return code by checking this flag.
*/
-static __kprobes int handle_stack_update(unsigned long ea, struct pt_regs *regs)
+static nokprobe_inline int handle_stack_update(unsigned long ea, struct pt_regs *regs)
{
#ifdef CONFIG_PPC32
/*
@@ -1721,7 +1726,7 @@ static __kprobes int handle_stack_update(unsigned long ea, struct pt_regs *regs)
return 0;
}
-static __kprobes void do_signext(unsigned long *valp, int size)
+static nokprobe_inline void do_signext(unsigned long *valp, int size)
{
switch (size) {
case 2:
@@ -1733,7 +1738,7 @@ static __kprobes void do_signext(unsigned long *valp, int size)
}
}
-static __kprobes void do_byterev(unsigned long *valp, int size)
+static nokprobe_inline void do_byterev(unsigned long *valp, int size)
{
switch (size) {
case 2:
@@ -1757,7 +1762,7 @@ static __kprobes void do_byterev(unsigned long *valp, int size)
* or -1 if the instruction is one that should not be stepped,
* such as an rfid, or a mtmsrd that would clear MSR_RI.
*/
-int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
+int emulate_step(struct pt_regs *regs, unsigned int instr)
{
struct instruction_op op;
int r, err, size;
@@ -1988,3 +1993,4 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr)
regs->nip = truncate_if_32bit(regs->msr, regs->nip + 4);
return 1;
}
+NOKPROBE_SYMBOL(emulate_step);
diff --git a/arch/powerpc/mm/dump_hashpagetable.c b/arch/powerpc/mm/dump_hashpagetable.c
index d979709a0239..c6b900f54c07 100644
--- a/arch/powerpc/mm/dump_hashpagetable.c
+++ b/arch/powerpc/mm/dump_hashpagetable.c
@@ -468,7 +468,7 @@ static void walk_linearmapping(struct pg_state *st)
unsigned long psize = 1 << mmu_psize_defs[mmu_linear_psize].shift;
for (addr = PAGE_OFFSET; addr < PAGE_OFFSET +
- memblock_phys_mem_size(); addr += psize)
+ memblock_end_of_DRAM(); addr += psize)
hpte_find(st, addr, mmu_linear_psize);
}
diff --git a/arch/powerpc/mm/dump_linuxpagetables.c b/arch/powerpc/mm/dump_linuxpagetables.c
index 49abaf4dc8e3..d659345a98d6 100644
--- a/arch/powerpc/mm/dump_linuxpagetables.c
+++ b/arch/powerpc/mm/dump_linuxpagetables.c
@@ -26,6 +26,10 @@
#include <asm/page.h>
#include <asm/pgalloc.h>
+#ifdef CONFIG_PPC32
+#define KERN_VIRT_START 0
+#endif
+
/*
* To visualise what is happening,
*
@@ -56,6 +60,8 @@ struct pg_state {
struct seq_file *seq;
const struct addr_marker *marker;
unsigned long start_address;
+ unsigned long start_pa;
+ unsigned long last_pa;
unsigned int level;
u64 current_flags;
};
@@ -69,6 +75,7 @@ static struct addr_marker address_markers[] = {
{ 0, "Start of kernel VM" },
{ 0, "vmalloc() Area" },
{ 0, "vmalloc() End" },
+#ifdef CONFIG_PPC64
{ 0, "isa I/O start" },
{ 0, "isa I/O end" },
{ 0, "phb I/O start" },
@@ -76,6 +83,20 @@ static struct addr_marker address_markers[] = {
{ 0, "I/O remap start" },
{ 0, "I/O remap end" },
{ 0, "vmemmap start" },
+#else
+ { 0, "Early I/O remap start" },
+ { 0, "Early I/O remap end" },
+#ifdef CONFIG_NOT_COHERENT_CACHE
+ { 0, "Consistent mem start" },
+ { 0, "Consistent mem end" },
+#endif
+#ifdef CONFIG_HIGHMEM
+ { 0, "Highmem PTEs start" },
+ { 0, "Highmem PTEs end" },
+#endif
+ { 0, "Fixmap start" },
+ { 0, "Fixmap end" },
+#endif
{ -1, NULL },
};
@@ -100,8 +121,13 @@ static const struct flag_info flag_array[] = {
.set = "user",
.clear = " ",
}, {
+#if _PAGE_RO == 0
.mask = _PAGE_RW,
.val = _PAGE_RW,
+#else
+ .mask = _PAGE_RO,
+ .val = 0,
+#endif
.set = "rw",
.clear = "ro",
}, {
@@ -154,11 +180,24 @@ static const struct flag_info flag_array[] = {
.clear = " ",
}, {
#endif
+#ifndef CONFIG_PPC_BOOK3S_64
.mask = _PAGE_NO_CACHE,
.val = _PAGE_NO_CACHE,
.set = "no cache",
.clear = " ",
}, {
+#else
+ .mask = _PAGE_NON_IDEMPOTENT,
+ .val = _PAGE_NON_IDEMPOTENT,
+ .set = "non-idempotent",
+ .clear = " ",
+ }, {
+ .mask = _PAGE_TOLERANT,
+ .val = _PAGE_TOLERANT,
+ .set = "tolerant",
+ .clear = " ",
+ }, {
+#endif
#ifdef CONFIG_PPC_BOOK3S_64
.mask = H_PAGE_BUSY,
.val = H_PAGE_BUSY,
@@ -188,6 +227,10 @@ static const struct flag_info flag_array[] = {
.mask = _PAGE_SPECIAL,
.val = _PAGE_SPECIAL,
.set = "special",
+ }, {
+ .mask = _PAGE_SHARED,
+ .val = _PAGE_SHARED,
+ .set = "shared",
}
};
@@ -252,7 +295,14 @@ static void dump_addr(struct pg_state *st, unsigned long addr)
const char *unit = units;
unsigned long delta;
- seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1);
+#ifdef CONFIG_PPC64
+ seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1);
+ seq_printf(st->seq, "0x%016lx ", st->start_pa);
+#else
+ seq_printf(st->seq, "0x%08lx-0x%08lx ", st->start_address, addr - 1);
+ seq_printf(st->seq, "0x%08lx ", st->start_pa);
+#endif
+
delta = (addr - st->start_address) >> 10;
/* Work out what appropriate unit to use */
while (!(delta & 1023) && unit[1]) {
@@ -267,11 +317,15 @@ static void note_page(struct pg_state *st, unsigned long addr,
unsigned int level, u64 val)
{
u64 flag = val & pg_level[level].mask;
+ u64 pa = val & PTE_RPN_MASK;
+
/* At first no level is set */
if (!st->level) {
st->level = level;
st->current_flags = flag;
st->start_address = addr;
+ st->start_pa = pa;
+ st->last_pa = pa;
seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
/*
* Dump the section of virtual memory when:
@@ -279,9 +333,11 @@ static void note_page(struct pg_state *st, unsigned long addr,
* - we change levels in the tree.
* - the address is in a different section of memory and is thus
* used for a different purpose, regardless of the flags.
+ * - the pa of this page is not adjacent to the last inspected page
*/
} else if (flag != st->current_flags || level != st->level ||
- addr >= st->marker[1].start_address) {
+ addr >= st->marker[1].start_address ||
+ pa != st->last_pa + PAGE_SIZE) {
/* Check the PTE flags */
if (st->current_flags) {
@@ -305,8 +361,12 @@ static void note_page(struct pg_state *st, unsigned long addr,
seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
}
st->start_address = addr;
+ st->start_pa = pa;
+ st->last_pa = pa;
st->current_flags = flag;
st->level = level;
+ } else {
+ st->last_pa = pa;
}
}
@@ -377,20 +437,38 @@ static void walk_pagetables(struct pg_state *st)
static void populate_markers(void)
{
- address_markers[0].start_address = PAGE_OFFSET;
- address_markers[1].start_address = VMALLOC_START;
- address_markers[2].start_address = VMALLOC_END;
- address_markers[3].start_address = ISA_IO_BASE;
- address_markers[4].start_address = ISA_IO_END;
- address_markers[5].start_address = PHB_IO_BASE;
- address_markers[6].start_address = PHB_IO_END;
- address_markers[7].start_address = IOREMAP_BASE;
- address_markers[8].start_address = IOREMAP_END;
+ int i = 0;
+
+ address_markers[i++].start_address = PAGE_OFFSET;
+ address_markers[i++].start_address = VMALLOC_START;
+ address_markers[i++].start_address = VMALLOC_END;
+#ifdef CONFIG_PPC64
+ address_markers[i++].start_address = ISA_IO_BASE;
+ address_markers[i++].start_address = ISA_IO_END;
+ address_markers[i++].start_address = PHB_IO_BASE;
+ address_markers[i++].start_address = PHB_IO_END;
+ address_markers[i++].start_address = IOREMAP_BASE;
+ address_markers[i++].start_address = IOREMAP_END;
#ifdef CONFIG_PPC_STD_MMU_64
- address_markers[9].start_address = H_VMEMMAP_BASE;
+ address_markers[i++].start_address = H_VMEMMAP_BASE;
#else
- address_markers[9].start_address = VMEMMAP_BASE;
+ address_markers[i++].start_address = VMEMMAP_BASE;
+#endif
+#else /* !CONFIG_PPC64 */
+ address_markers[i++].start_address = ioremap_bot;
+ address_markers[i++].start_address = IOREMAP_TOP;
+#ifdef CONFIG_NOT_COHERENT_CACHE
+ address_markers[i++].start_address = IOREMAP_TOP;
+ address_markers[i++].start_address = IOREMAP_TOP +
+ CONFIG_CONSISTENT_SIZE;
+#endif
+#ifdef CONFIG_HIGHMEM
+ address_markers[i++].start_address = PKMAP_BASE;
+ address_markers[i++].start_address = PKMAP_ADDR(LAST_PKMAP);
#endif
+ address_markers[i++].start_address = FIXADDR_START;
+ address_markers[i++].start_address = FIXADDR_TOP;
+#endif /* CONFIG_PPC64 */
}
static int ptdump_show(struct seq_file *m, void *v)
@@ -435,7 +513,7 @@ static int ptdump_init(void)
populate_markers();
build_pgtable_complete_mask();
- debugfs_file = debugfs_create_file("kernel_pagetables", 0400, NULL,
+ debugfs_file = debugfs_create_file("kernel_page_tables", 0400, NULL,
NULL, &ptdump_fops);
return debugfs_file ? 0 : -ENOMEM;
}
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 51def8a515be..3a7d580fdc59 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -120,8 +120,6 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
siginfo_t info;
unsigned int lsb = 0;
- up_read(&current->mm->mmap_sem);
-
if (!user_mode(regs))
return MM_FAULT_ERR(SIGBUS);
@@ -154,13 +152,6 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
* continue the pagefault.
*/
if (fatal_signal_pending(current)) {
- /*
- * If we have retry set, the mmap semaphore will have
- * alrady been released in __lock_page_or_retry(). Else
- * we release it now.
- */
- if (!(fault & VM_FAULT_RETRY))
- up_read(&current->mm->mmap_sem);
/* Coming from kernel, we need to deal with uaccess fixups */
if (user_mode(regs))
return MM_FAULT_RETURN;
@@ -173,8 +164,6 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
/* Out of memory */
if (fault & VM_FAULT_OOM) {
- up_read(&current->mm->mmap_sem);
-
/*
* We ran out of memory, or some other thing happened to us that
* made us unable to handle the page fault gracefully.
@@ -298,7 +287,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
* can result in fault, which will cause a deadlock when called with
* mmap_sem held
*/
- if (user_mode(regs))
+ if (!is_exec && user_mode(regs))
store_update_sp = store_updates_sp(regs);
if (user_mode(regs))
@@ -458,9 +447,30 @@ good_area:
* the fault.
*/
fault = handle_mm_fault(vma, address, flags);
+
+ /*
+ * Handle the retry right now, the mmap_sem has been released in that
+ * case.
+ */
+ if (unlikely(fault & VM_FAULT_RETRY)) {
+ /* We retry only once */
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ /*
+ * Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
+ * of starvation.
+ */
+ flags &= ~FAULT_FLAG_ALLOW_RETRY;
+ flags |= FAULT_FLAG_TRIED;
+ if (!fatal_signal_pending(current))
+ goto retry;
+ }
+ /* We will enter mm_fault_error() below */
+ } else
+ up_read(&current->mm->mmap_sem);
+
if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
if (fault & VM_FAULT_SIGSEGV)
- goto bad_area;
+ goto bad_area_nosemaphore;
rc = mm_fault_error(regs, address, fault);
if (rc >= MM_FAULT_RETURN)
goto bail;
@@ -469,41 +479,29 @@ good_area:
}
/*
- * Major/minor page fault accounting is only done on the
- * initial attempt. If we go through a retry, it is extremely
- * likely that the page will be found in page cache at that point.
+ * Major/minor page fault accounting.
*/
- if (flags & FAULT_FLAG_ALLOW_RETRY) {
- if (fault & VM_FAULT_MAJOR) {
- current->maj_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
- regs, address);
+ if (fault & VM_FAULT_MAJOR) {
+ current->maj_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
+ regs, address);
#ifdef CONFIG_PPC_SMLPAR
- if (firmware_has_feature(FW_FEATURE_CMO)) {
- u32 page_ins;
-
- preempt_disable();
- page_ins = be32_to_cpu(get_lppaca()->page_ins);
- page_ins += 1 << PAGE_FACTOR;
- get_lppaca()->page_ins = cpu_to_be32(page_ins);
- preempt_enable();
- }
-#endif /* CONFIG_PPC_SMLPAR */
- } else {
- current->min_flt++;
- perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
- regs, address);
- }
- if (fault & VM_FAULT_RETRY) {
- /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
- * of starvation. */
- flags &= ~FAULT_FLAG_ALLOW_RETRY;
- flags |= FAULT_FLAG_TRIED;
- goto retry;
+ if (firmware_has_feature(FW_FEATURE_CMO)) {
+ u32 page_ins;
+
+ preempt_disable();
+ page_ins = be32_to_cpu(get_lppaca()->page_ins);
+ page_ins += 1 << PAGE_FACTOR;
+ get_lppaca()->page_ins = cpu_to_be32(page_ins);
+ preempt_enable();
}
+#endif /* CONFIG_PPC_SMLPAR */
+ } else {
+ current->min_flt++;
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
+ regs, address);
}
- up_read(&mm->mmap_sem);
goto bail;
bad_area:
diff --git a/arch/powerpc/mm/hash_low_32.S b/arch/powerpc/mm/hash_low_32.S
index 09cc50c8dace..6f962e5cb5e1 100644
--- a/arch/powerpc/mm/hash_low_32.S
+++ b/arch/powerpc/mm/hash_low_32.S
@@ -31,10 +31,8 @@
#ifdef CONFIG_SMP
.section .bss
.align 2
- .globl mmu_hash_lock
mmu_hash_lock:
.space 4
-EXPORT_SYMBOL(mmu_hash_lock)
#endif /* CONFIG_SMP */
/*
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index c554768b1fa2..f2095ce9d4b0 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -35,9 +35,8 @@
#include <linux/memblock.h>
#include <linux/context_tracking.h>
#include <linux/libfdt.h>
-#include <linux/debugfs.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
#include <asm/processor.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
@@ -927,11 +926,6 @@ static void __init htab_initialize(void)
}
#endif /* CONFIG_DEBUG_PAGEALLOC */
- /* On U3 based machines, we need to reserve the DART area and
- * _NOT_ map it to avoid cache paradoxes as it's remapped non
- * cacheable later on
- */
-
/* create bolted the linear mapping in the hash table */
for_each_memblock(memory, reg) {
base = (unsigned long)__va(reg->base);
@@ -981,6 +975,19 @@ void __init hash__early_init_devtree(void)
void __init hash__early_init_mmu(void)
{
+ /*
+ * We have code in __hash_page_64K() and elsewhere, which assumes it can
+ * do the following:
+ * new_pte |= (slot << H_PAGE_F_GIX_SHIFT) & (H_PAGE_F_SECOND | H_PAGE_F_GIX);
+ *
+ * Where the slot number is between 0-15, and values of 8-15 indicate
+ * the secondary bucket. For that code to work H_PAGE_F_SECOND and
+ * H_PAGE_F_GIX must occupy four contiguous bits in the PTE, and
+ * H_PAGE_F_SECOND must be placed above H_PAGE_F_GIX. Assert that here
+ * with a BUILD_BUG_ON().
+ */
+ BUILD_BUG_ON(H_PAGE_F_SECOND != (1ul << (H_PAGE_F_GIX_SHIFT + 3)));
+
htab_init_page_sizes();
/*
@@ -1120,7 +1127,7 @@ void demote_segment_4k(struct mm_struct *mm, unsigned long addr)
copro_flush_all_slbs(mm);
if ((get_paca_psize(addr) != MMU_PAGE_4K) && (current->mm == mm)) {
- copy_mm_to_paca(&mm->context);
+ copy_mm_to_paca(mm);
slb_flush_and_rebolt();
}
}
@@ -1192,7 +1199,7 @@ static void check_paca_psize(unsigned long ea, struct mm_struct *mm,
{
if (user_region) {
if (psize != get_paca_psize(ea)) {
- copy_mm_to_paca(&mm->context);
+ copy_mm_to_paca(mm);
slb_flush_and_rebolt();
}
} else if (get_paca()->vmalloc_sllp !=
@@ -1855,5 +1862,4 @@ static int __init hash64_debugfs(void)
return 0;
}
machine_device_initcall(pseries, hash64_debugfs);
-
#endif /* CONFIG_DEBUG_FS */
diff --git a/arch/powerpc/mm/hugetlbpage-book3e.c b/arch/powerpc/mm/hugetlbpage-book3e.c
index 83a8be791e06..bfe4e8526b2d 100644
--- a/arch/powerpc/mm/hugetlbpage-book3e.c
+++ b/arch/powerpc/mm/hugetlbpage-book3e.c
@@ -148,16 +148,9 @@ void book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea,
mm = vma->vm_mm;
-#ifdef CONFIG_PPC_MM_SLICES
- psize = get_slice_psize(mm, ea);
- tsize = mmu_get_tsize(psize);
- shift = mmu_psize_defs[psize].shift;
-#else
psize = vma_mmu_pagesize(vma);
shift = __ilog2(psize);
tsize = shift - 10;
-#endif
-
/*
* We can't be interrupted while we're setting up the MAS
* regusters or after we've confirmed that no tlb exists.
diff --git a/arch/powerpc/mm/hugetlbpage-radix.c b/arch/powerpc/mm/hugetlbpage-radix.c
index 35254a678456..6575b9aabef4 100644
--- a/arch/powerpc/mm/hugetlbpage-radix.c
+++ b/arch/powerpc/mm/hugetlbpage-radix.c
@@ -50,9 +50,12 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
struct hstate *h = hstate_file(file);
struct vm_unmapped_area_info info;
+ if (unlikely(addr > mm->context.addr_limit && addr < TASK_SIZE))
+ mm->context.addr_limit = TASK_SIZE;
+
if (len & ~huge_page_mask(h))
return -EINVAL;
- if (len > TASK_SIZE)
+ if (len > mm->task_size)
return -ENOMEM;
if (flags & MAP_FIXED) {
@@ -64,7 +67,7 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
if (addr) {
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (mm->task_size - len >= addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
@@ -78,5 +81,9 @@ radix__hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
info.high_limit = current->mm->mmap_base;
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
+
+ if (addr > DEFAULT_MAP_WINDOW)
+ info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
+
return vm_unmapped_area(&info);
}
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 8c3389cbcd12..a4f33de4008e 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -753,6 +753,24 @@ static int __init add_huge_page_size(unsigned long long size)
if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
return -EINVAL;
+#ifdef CONFIG_PPC_BOOK3S_64
+ /*
+ * We need to make sure that for different page sizes reported by
+ * firmware we only add hugetlb support for page sizes that can be
+ * supported by linux page table layout.
+ * For now we have
+ * Radix: 2M
+ * Hash: 16M and 16G
+ */
+ if (radix_enabled()) {
+ if (mmu_psize != MMU_PAGE_2M)
+ return -EINVAL;
+ } else {
+ if (mmu_psize != MMU_PAGE_16M && mmu_psize != MMU_PAGE_16G)
+ return -EINVAL;
+ }
+#endif
+
BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);
/* Return if huge page size has already been setup */
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index c22f207aa656..ec84b31c6c86 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -71,10 +71,6 @@
#if H_PGTABLE_RANGE > USER_VSID_RANGE
#warning Limited user VSID range means pagetable space is wasted
#endif
-
-#if (TASK_SIZE_USER64 < H_PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
-#warning TASK_SIZE is smaller than it needs to be.
-#endif
#endif /* CONFIG_PPC_STD_MMU_64 */
phys_addr_t memstart_addr = ~0;
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index a5d9ef59debe..9dbd2a733d6b 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -59,13 +59,14 @@ static inline int mmap_is_legacy(void)
unsigned long arch_mmap_rnd(void)
{
- unsigned long rnd;
+ unsigned long shift, rnd;
- /* 8MB for 32bit, 1GB for 64bit */
+ shift = mmap_rnd_bits;
+#ifdef CONFIG_COMPAT
if (is_32bit_task())
- rnd = get_random_long() % (1<<(23-PAGE_SHIFT));
- else
- rnd = get_random_long() % (1UL<<(30-PAGE_SHIFT));
+ shift = mmap_rnd_compat_bits;
+#endif
+ rnd = get_random_long() % (1ul << shift);
return rnd << PAGE_SHIFT;
}
@@ -79,7 +80,7 @@ static inline unsigned long mmap_base(unsigned long rnd)
else if (gap > MAX_GAP)
gap = MAX_GAP;
- return PAGE_ALIGN(TASK_SIZE - gap - rnd);
+ return PAGE_ALIGN(DEFAULT_MAP_WINDOW - gap - rnd);
}
#ifdef CONFIG_PPC_RADIX_MMU
@@ -97,7 +98,11 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_area_struct *vma;
struct vm_unmapped_area_info info;
- if (len > TASK_SIZE - mmap_min_addr)
+ if (unlikely(addr > mm->context.addr_limit &&
+ mm->context.addr_limit != TASK_SIZE))
+ mm->context.addr_limit = TASK_SIZE;
+
+ if (len > mm->task_size - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -106,7 +111,7 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
@@ -114,8 +119,13 @@ radix__arch_get_unmapped_area(struct file *filp, unsigned long addr,
info.flags = 0;
info.length = len;
info.low_limit = mm->mmap_base;
- info.high_limit = TASK_SIZE;
info.align_mask = 0;
+
+ if (unlikely(addr > DEFAULT_MAP_WINDOW))
+ info.high_limit = mm->context.addr_limit;
+ else
+ info.high_limit = DEFAULT_MAP_WINDOW;
+
return vm_unmapped_area(&info);
}
@@ -131,8 +141,12 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
unsigned long addr = addr0;
struct vm_unmapped_area_info info;
+ if (unlikely(addr > mm->context.addr_limit &&
+ mm->context.addr_limit != TASK_SIZE))
+ mm->context.addr_limit = TASK_SIZE;
+
/* requested length too big for entire address space */
- if (len > TASK_SIZE - mmap_min_addr)
+ if (len > mm->task_size - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -142,7 +156,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ if (mm->task_size - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
@@ -152,7 +166,14 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
info.low_limit = max(PAGE_SIZE, mmap_min_addr);
info.high_limit = mm->mmap_base;
info.align_mask = 0;
+
+ if (addr > DEFAULT_MAP_WINDOW)
+ info.high_limit += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
+
addr = vm_unmapped_area(&info);
+ if (!(addr & ~PAGE_MASK))
+ return addr;
+ VM_BUG_ON(addr != -ENOMEM);
/*
* A failed mmap() very likely causes application failure,
@@ -160,15 +181,7 @@ radix__arch_get_unmapped_area_topdown(struct file *filp,
* can happen with large stack limits and large mmap()
* allocations.
*/
- if (addr & ~PAGE_MASK) {
- VM_BUG_ON(addr != -ENOMEM);
- info.flags = 0;
- info.low_limit = TASK_UNMAPPED_BASE;
- info.high_limit = TASK_SIZE;
- addr = vm_unmapped_area(&info);
- }
-
- return addr;
+ return radix__arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
}
static void radix__arch_pick_mmap_layout(struct mm_struct *mm,
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 73bf6e14c3aa..c6dca2ae78ef 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -30,17 +30,16 @@
static DEFINE_SPINLOCK(mmu_context_lock);
static DEFINE_IDA(mmu_context_ida);
-int __init_new_context(void)
+static int alloc_context_id(int min_id, int max_id)
{
- int index;
- int err;
+ int index, err;
again:
if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
return -ENOMEM;
spin_lock(&mmu_context_lock);
- err = ida_get_new_above(&mmu_context_ida, 1, &index);
+ err = ida_get_new_above(&mmu_context_ida, min_id, &index);
spin_unlock(&mmu_context_lock);
if (err == -EAGAIN)
@@ -48,7 +47,7 @@ again:
else if (err)
return err;
- if (index > MAX_USER_CONTEXT) {
+ if (index > max_id) {
spin_lock(&mmu_context_lock);
ida_remove(&mmu_context_ida, index);
spin_unlock(&mmu_context_lock);
@@ -57,48 +56,105 @@ again:
return index;
}
-EXPORT_SYMBOL_GPL(__init_new_context);
-static int radix__init_new_context(struct mm_struct *mm, int index)
+
+void hash__reserve_context_id(int id)
+{
+ int rc, result = 0;
+
+ do {
+ if (!ida_pre_get(&mmu_context_ida, GFP_KERNEL))
+ break;
+
+ spin_lock(&mmu_context_lock);
+ rc = ida_get_new_above(&mmu_context_ida, id, &result);
+ spin_unlock(&mmu_context_lock);
+ } while (rc == -EAGAIN);
+
+ WARN(result != id, "mmu: Failed to reserve context id %d (rc %d)\n", id, result);
+}
+
+int hash__alloc_context_id(void)
+{
+ unsigned long max;
+
+ if (mmu_has_feature(MMU_FTR_68_BIT_VA))
+ max = MAX_USER_CONTEXT;
+ else
+ max = MAX_USER_CONTEXT_65BIT_VA;
+
+ return alloc_context_id(MIN_USER_CONTEXT, max);
+}
+EXPORT_SYMBOL_GPL(hash__alloc_context_id);
+
+static int hash__init_new_context(struct mm_struct *mm)
+{
+ int index;
+
+ index = hash__alloc_context_id();
+ if (index < 0)
+ return index;
+
+ /*
+ * We do switch_slb() early in fork, even before we setup the
+ * mm->context.addr_limit. Default to max task size so that we copy the
+ * default values to paca which will help us to handle slb miss early.
+ */
+ mm->context.addr_limit = TASK_SIZE_128TB;
+
+ /*
+ * The old code would re-promote on fork, we don't do that when using
+ * slices as it could cause problem promoting slices that have been
+ * forced down to 4K.
+ *
+ * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
+ * explicitly against context.id == 0. This ensures that we properly
+ * initialize context slice details for newly allocated mm's (which will
+ * have id == 0) and don't alter context slice inherited via fork (which
+ * will have id != 0).
+ *
+ * We should not be calling init_new_context() on init_mm. Hence a
+ * check against 0 is OK.
+ */
+ if (mm->context.id == 0)
+ slice_set_user_psize(mm, mmu_virtual_psize);
+
+ subpage_prot_init_new_context(mm);
+
+ return index;
+}
+
+static int radix__init_new_context(struct mm_struct *mm)
{
unsigned long rts_field;
+ int index;
+
+ index = alloc_context_id(1, PRTB_ENTRIES - 1);
+ if (index < 0)
+ return index;
/*
* set the process table entry,
*/
rts_field = radix__get_tree_size();
process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE);
- return 0;
+
+ mm->context.npu_context = NULL;
+
+ return index;
}
int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
{
int index;
- index = __init_new_context();
+ if (radix_enabled())
+ index = radix__init_new_context(mm);
+ else
+ index = hash__init_new_context(mm);
+
if (index < 0)
return index;
- if (radix_enabled()) {
- radix__init_new_context(mm, index);
- } else {
-
- /* The old code would re-promote on fork, we don't do that
- * when using slices as it could cause problem promoting slices
- * that have been forced down to 4K
- *
- * For book3s we have MMU_NO_CONTEXT set to be ~0. Hence check
- * explicitly against context.id == 0. This ensures that we
- * properly initialize context slice details for newly allocated
- * mm's (which will have id == 0) and don't alter context slice
- * inherited via fork (which will have id != 0).
- *
- * We should not be calling init_new_context() on init_mm. Hence a
- * check against 0 is ok.
- */
- if (mm->context.id == 0)
- slice_set_user_psize(mm, mmu_virtual_psize);
- subpage_prot_init_new_context(mm);
- }
mm->context.id = index;
#ifdef CONFIG_PPC_ICSWX
mm->context.cop_lockp = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index 497130c5c742..e0a2d8e806ed 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -81,7 +81,7 @@ struct page *new_iommu_non_cma_page(struct page *page, unsigned long private,
gfp_t gfp_mask = GFP_USER;
struct page *new_page;
- if (PageHuge(page) || PageTransHuge(page) || PageCompound(page))
+ if (PageCompound(page))
return NULL;
if (PageHighMem(page))
@@ -100,7 +100,7 @@ static int mm_iommu_move_page_from_cma(struct page *page)
LIST_HEAD(cma_migrate_pages);
/* Ignore huge pages for now */
- if (PageHuge(page) || PageTransHuge(page) || PageCompound(page))
+ if (PageCompound(page))
return -EBUSY;
lru_add_drain();
@@ -314,6 +314,25 @@ struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
}
EXPORT_SYMBOL_GPL(mm_iommu_lookup);
+struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm,
+ unsigned long ua, unsigned long size)
+{
+ struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
+
+ list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list,
+ next) {
+ if ((mem->ua <= ua) &&
+ (ua + size <= mem->ua +
+ (mem->entries << PAGE_SHIFT))) {
+ ret = mem;
+ break;
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_lookup_rm);
+
struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
unsigned long ua, unsigned long entries)
{
@@ -345,6 +364,26 @@ long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
}
EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
+long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem,
+ unsigned long ua, unsigned long *hpa)
+{
+ const long entry = (ua - mem->ua) >> PAGE_SHIFT;
+ void *va = &mem->hpas[entry];
+ unsigned long *pa;
+
+ if (entry >= mem->entries)
+ return -EFAULT;
+
+ pa = (void *) vmalloc_to_phys(va);
+ if (!pa)
+ return -EFAULT;
+
+ *hpa = *pa | (ua & ~PAGE_MASK);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa_rm);
+
long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
{
if (atomic64_inc_not_zero(&mem->mapped))
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index c491f2c8f2b9..4554d6527682 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -333,11 +333,6 @@ int init_new_context(struct task_struct *t, struct mm_struct *mm)
mm->context.id = MMU_NO_CONTEXT;
mm->context.active = 0;
-
-#ifdef CONFIG_PPC_MM_SLICES
- slice_set_user_psize(mm, mmu_virtual_psize);
-#endif
-
return 0;
}
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 9befaee237d6..371792e4418f 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -875,13 +875,6 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
void *nd;
int tnid;
- if (spanned_pages)
- pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n",
- nid, start_pfn << PAGE_SHIFT,
- (end_pfn << PAGE_SHIFT) - 1);
- else
- pr_info("Initmem setup node %d\n", nid);
-
nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
nd = __va(nd_pa);
diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c
index 5e01b2ece1d0..654a0d7ba0e7 100644
--- a/arch/powerpc/mm/slb.c
+++ b/arch/powerpc/mm/slb.c
@@ -131,7 +131,7 @@ static void __slb_flush_and_rebolt(void)
"slbmte %2,%3\n"
"isync"
:: "r"(mk_vsid_data(VMALLOC_START, mmu_kernel_ssize, vflags)),
- "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, 1)),
+ "r"(mk_esid_data(VMALLOC_START, mmu_kernel_ssize, VMALLOC_INDEX)),
"r"(ksp_vsid_data),
"r"(ksp_esid_data)
: "memory");
@@ -229,7 +229,7 @@ void switch_slb(struct task_struct *tsk, struct mm_struct *mm)
asm volatile("slbie %0" : : "r" (slbie_data));
get_paca()->slb_cache_ptr = 0;
- copy_mm_to_paca(&mm->context);
+ copy_mm_to_paca(mm);
/*
* preload some userspace segments into the SLB.
diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S
index a85e06ea6c20..1519617aab36 100644
--- a/arch/powerpc/mm/slb_low.S
+++ b/arch/powerpc/mm/slb_low.S
@@ -23,6 +23,48 @@
#include <asm/pgtable.h>
#include <asm/firmware.h>
+/*
+ * This macro generates asm code to compute the VSID scramble
+ * function. Used in slb_allocate() and do_stab_bolted. The function
+ * computed is: (protovsid*VSID_MULTIPLIER) % VSID_MODULUS
+ *
+ * rt = register containing the proto-VSID and into which the
+ * VSID will be stored
+ * rx = scratch register (clobbered)
+ * rf = flags
+ *
+ * - rt and rx must be different registers
+ * - The answer will end up in the low VSID_BITS bits of rt. The higher
+ * bits may contain other garbage, so you may need to mask the
+ * result.
+ */
+#define ASM_VSID_SCRAMBLE(rt, rx, rf, size) \
+ lis rx,VSID_MULTIPLIER_##size@h; \
+ ori rx,rx,VSID_MULTIPLIER_##size@l; \
+ mulld rt,rt,rx; /* rt = rt * MULTIPLIER */ \
+/* \
+ * powermac get slb fault before feature fixup, so make 65 bit part \
+ * the default part of feature fixup \
+ */ \
+BEGIN_MMU_FTR_SECTION \
+ srdi rx,rt,VSID_BITS_65_##size; \
+ clrldi rt,rt,(64-VSID_BITS_65_##size); \
+ add rt,rt,rx; \
+ addi rx,rt,1; \
+ srdi rx,rx,VSID_BITS_65_##size; \
+ add rt,rt,rx; \
+ rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_65_##size)); \
+MMU_FTR_SECTION_ELSE \
+ srdi rx,rt,VSID_BITS_##size; \
+ clrldi rt,rt,(64-VSID_BITS_##size); \
+ add rt,rt,rx; /* add high and low bits */ \
+ addi rx,rt,1; \
+ srdi rx,rx,VSID_BITS_##size; /* extract 2^VSID_BITS bit */ \
+ add rt,rt,rx; \
+ rldimi rf,rt,SLB_VSID_SHIFT_##size,(64 - (SLB_VSID_SHIFT_##size + VSID_BITS_##size)); \
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA)
+
+
/* void slb_allocate_realmode(unsigned long ea);
*
* Create an SLB entry for the given EA (user or kernel).
@@ -45,13 +87,6 @@ _GLOBAL(slb_allocate_realmode)
/* r3 = address, r10 = esid, cr7 = <> PAGE_OFFSET */
blt cr7,0f /* user or kernel? */
- /* kernel address: proto-VSID = ESID */
- /* WARNING - MAGIC: we don't use the VSID 0xfffffffff, but
- * this code will generate the protoVSID 0xfffffffff for the
- * top segment. That's ok, the scramble below will translate
- * it to VSID 0, which is reserved as a bad VSID - one which
- * will never have any pages in it. */
-
/* Check if hitting the linear mapping or some other kernel space
*/
bne cr7,1f
@@ -63,12 +98,10 @@ _GLOBAL(slb_allocate_realmode)
slb_miss_kernel_load_linear:
li r11,0
/*
- * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+ * context = (ea >> 60) - (0xc - 1)
* r9 = region id.
*/
- addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
- addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
-
+ subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET
BEGIN_FTR_SECTION
b .Lslb_finish_load
@@ -77,9 +110,9 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
1:
#ifdef CONFIG_SPARSEMEM_VMEMMAP
- /* Check virtual memmap region. To be patches at kernel boot */
cmpldi cr0,r9,0xf
bne 1f
+/* Check virtual memmap region. To be patched at kernel boot */
.globl slb_miss_kernel_load_vmemmap
slb_miss_kernel_load_vmemmap:
li r11,0
@@ -102,11 +135,10 @@ slb_miss_kernel_load_io:
li r11,0
6:
/*
- * context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1
+ * context = (ea >> 60) - (0xc - 1)
* r9 = region id.
*/
- addis r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@ha
- addi r9,r9,(MAX_USER_CONTEXT - 0xc + 1)@l
+ subi r9,r9,KERNEL_REGION_CONTEXT_OFFSET
BEGIN_FTR_SECTION
b .Lslb_finish_load
@@ -117,7 +149,13 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_1T_SEGMENT)
* For userspace addresses, make sure this is region 0.
*/
cmpdi r9, 0
- bne 8f
+ bne- 8f
+ /*
+ * user space make sure we are within the allowed limit
+ */
+ ld r11,PACA_ADDR_LIMIT(r13)
+ cmpld r3,r11
+ bge- 8f
/* when using slices, we extract the psize off the slice bitmaps
* and then we need to get the sllp encoding off the mmu_psize_defs
@@ -189,13 +227,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
*/
.Lslb_finish_load:
rldimi r10,r9,ESID_BITS,0
- ASM_VSID_SCRAMBLE(r10,r9,256M)
- /*
- * bits above VSID_BITS_256M need to be ignored from r10
- * also combine VSID and flags
- */
- rldimi r11,r10,SLB_VSID_SHIFT,(64 - (SLB_VSID_SHIFT + VSID_BITS_256M))
-
+ ASM_VSID_SCRAMBLE(r10,r9,r11,256M)
/* r3 = EA, r11 = VSID data */
/*
* Find a slot, round robin. Previously we tried to find a
@@ -259,12 +291,12 @@ slb_compare_rr_to_size:
.Lslb_finish_load_1T:
srdi r10,r10,(SID_SHIFT_1T - SID_SHIFT) /* get 1T ESID */
rldimi r10,r9,ESID_BITS_1T,0
- ASM_VSID_SCRAMBLE(r10,r9,1T)
+ ASM_VSID_SCRAMBLE(r10,r9,r11,1T)
/*
* bits above VSID_BITS_1T need to be ignored from r10
* also combine VSID and flags
*/
- rldimi r11,r10,SLB_VSID_SHIFT_1T,(64 - (SLB_VSID_SHIFT_1T + VSID_BITS_1T))
+
li r10,MMU_SEGSIZE_1T
rldimi r11,r10,SLB_VSID_SSIZE_SHIFT,0 /* insert segment size */
diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c
index 2b27458902ee..966b9fccfa66 100644
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -36,38 +36,29 @@
#include <asm/copro.h>
#include <asm/hugetlb.h>
-/* some sanity checks */
-#if (H_PGTABLE_RANGE >> 43) > SLICE_MASK_SIZE
-#error H_PGTABLE_RANGE exceeds slice_mask high_slices size
-#endif
-
static DEFINE_SPINLOCK(slice_convert_lock);
-
+/*
+ * One bit per slice. We have lower slices which cover 256MB segments
+ * upto 4G range. That gets us 16 low slices. For the rest we track slices
+ * in 1TB size.
+ */
+struct slice_mask {
+ u64 low_slices;
+ DECLARE_BITMAP(high_slices, SLICE_NUM_HIGH);
+};
#ifdef DEBUG
int _slice_debug = 1;
static void slice_print_mask(const char *label, struct slice_mask mask)
{
- char *p, buf[16 + 3 + 64 + 1];
- int i;
-
if (!_slice_debug)
return;
- p = buf;
- for (i = 0; i < SLICE_NUM_LOW; i++)
- *(p++) = (mask.low_slices & (1 << i)) ? '1' : '0';
- *(p++) = ' ';
- *(p++) = '-';
- *(p++) = ' ';
- for (i = 0; i < SLICE_NUM_HIGH; i++)
- *(p++) = (mask.high_slices & (1ul << i)) ? '1' : '0';
- *(p++) = 0;
-
- printk(KERN_DEBUG "%s:%s\n", label, buf);
+ pr_devel("%s low_slice: %*pbl\n", label, (int)SLICE_NUM_LOW, &mask.low_slices);
+ pr_devel("%s high_slice: %*pbl\n", label, (int)SLICE_NUM_HIGH, mask.high_slices);
}
-#define slice_dbg(fmt...) do { if (_slice_debug) pr_debug(fmt); } while(0)
+#define slice_dbg(fmt...) do { if (_slice_debug) pr_devel(fmt); } while (0)
#else
@@ -76,25 +67,28 @@ static void slice_print_mask(const char *label, struct slice_mask mask) {}
#endif
-static struct slice_mask slice_range_to_mask(unsigned long start,
- unsigned long len)
+static void slice_range_to_mask(unsigned long start, unsigned long len,
+ struct slice_mask *ret)
{
unsigned long end = start + len - 1;
- struct slice_mask ret = { 0, 0 };
+
+ ret->low_slices = 0;
+ bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
if (start < SLICE_LOW_TOP) {
- unsigned long mend = min(end, SLICE_LOW_TOP);
- unsigned long mstart = min(start, SLICE_LOW_TOP);
+ unsigned long mend = min(end, (SLICE_LOW_TOP - 1));
- ret.low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
- - (1u << GET_LOW_SLICE_INDEX(mstart));
+ ret->low_slices = (1u << (GET_LOW_SLICE_INDEX(mend) + 1))
+ - (1u << GET_LOW_SLICE_INDEX(start));
}
- if ((start + len) > SLICE_LOW_TOP)
- ret.high_slices = (1ul << (GET_HIGH_SLICE_INDEX(end) + 1))
- - (1ul << GET_HIGH_SLICE_INDEX(start));
+ if ((start + len) > SLICE_LOW_TOP) {
+ unsigned long start_index = GET_HIGH_SLICE_INDEX(start);
+ unsigned long align_end = ALIGN(end, (1UL << SLICE_HIGH_SHIFT));
+ unsigned long count = GET_HIGH_SLICE_INDEX(align_end) - start_index;
- return ret;
+ bitmap_set(ret->high_slices, start_index, count);
+ }
}
static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
@@ -128,53 +122,60 @@ static int slice_high_has_vma(struct mm_struct *mm, unsigned long slice)
return !slice_area_is_free(mm, start, end - start);
}
-static struct slice_mask slice_mask_for_free(struct mm_struct *mm)
+static void slice_mask_for_free(struct mm_struct *mm, struct slice_mask *ret)
{
- struct slice_mask ret = { 0, 0 };
unsigned long i;
+ ret->low_slices = 0;
+ bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
+
for (i = 0; i < SLICE_NUM_LOW; i++)
if (!slice_low_has_vma(mm, i))
- ret.low_slices |= 1u << i;
+ ret->low_slices |= 1u << i;
if (mm->task_size <= SLICE_LOW_TOP)
- return ret;
+ return;
- for (i = 0; i < SLICE_NUM_HIGH; i++)
+ for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++)
if (!slice_high_has_vma(mm, i))
- ret.high_slices |= 1ul << i;
-
- return ret;
+ __set_bit(i, ret->high_slices);
}
-static struct slice_mask slice_mask_for_size(struct mm_struct *mm, int psize)
+static void slice_mask_for_size(struct mm_struct *mm, int psize, struct slice_mask *ret)
{
unsigned char *hpsizes;
int index, mask_index;
- struct slice_mask ret = { 0, 0 };
unsigned long i;
u64 lpsizes;
+ ret->low_slices = 0;
+ bitmap_zero(ret->high_slices, SLICE_NUM_HIGH);
+
lpsizes = mm->context.low_slices_psize;
for (i = 0; i < SLICE_NUM_LOW; i++)
if (((lpsizes >> (i * 4)) & 0xf) == psize)
- ret.low_slices |= 1u << i;
+ ret->low_slices |= 1u << i;
hpsizes = mm->context.high_slices_psize;
- for (i = 0; i < SLICE_NUM_HIGH; i++) {
+ for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) {
mask_index = i & 0x1;
index = i >> 1;
if (((hpsizes[index] >> (mask_index * 4)) & 0xf) == psize)
- ret.high_slices |= 1ul << i;
+ __set_bit(i, ret->high_slices);
}
-
- return ret;
}
-static int slice_check_fit(struct slice_mask mask, struct slice_mask available)
+static int slice_check_fit(struct mm_struct *mm,
+ struct slice_mask mask, struct slice_mask available)
{
+ DECLARE_BITMAP(result, SLICE_NUM_HIGH);
+ unsigned long slice_count = GET_HIGH_SLICE_INDEX(mm->context.addr_limit);
+
+ bitmap_and(result, mask.high_slices,
+ available.high_slices, slice_count);
+
return (mask.low_slices & available.low_slices) == mask.low_slices &&
- (mask.high_slices & available.high_slices) == mask.high_slices;
+ bitmap_equal(result, mask.high_slices, slice_count);
}
static void slice_flush_segments(void *parm)
@@ -185,7 +186,7 @@ static void slice_flush_segments(void *parm)
if (mm != current->active_mm)
return;
- copy_mm_to_paca(&current->active_mm->context);
+ copy_mm_to_paca(current->active_mm);
local_irq_save(flags);
slb_flush_and_rebolt();
@@ -218,18 +219,18 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz
mm->context.low_slices_psize = lpsizes;
hpsizes = mm->context.high_slices_psize;
- for (i = 0; i < SLICE_NUM_HIGH; i++) {
+ for (i = 0; i < GET_HIGH_SLICE_INDEX(mm->context.addr_limit); i++) {
mask_index = i & 0x1;
index = i >> 1;
- if (mask.high_slices & (1ul << i))
+ if (test_bit(i, mask.high_slices))
hpsizes[index] = (hpsizes[index] &
~(0xf << (mask_index * 4))) |
(((unsigned long)psize) << (mask_index * 4));
}
slice_dbg(" lsps=%lx, hsps=%lx\n",
- mm->context.low_slices_psize,
- mm->context.high_slices_psize);
+ (unsigned long)mm->context.low_slices_psize,
+ (unsigned long)mm->context.high_slices_psize);
spin_unlock_irqrestore(&slice_convert_lock, flags);
@@ -257,14 +258,14 @@ static bool slice_scan_available(unsigned long addr,
slice = GET_HIGH_SLICE_INDEX(addr);
*boundary_addr = (slice + end) ?
((slice + end) << SLICE_HIGH_SHIFT) : SLICE_LOW_TOP;
- return !!(available.high_slices & (1ul << slice));
+ return !!test_bit(slice, available.high_slices);
}
}
static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
unsigned long len,
struct slice_mask available,
- int psize)
+ int psize, unsigned long high_limit)
{
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
unsigned long addr, found, next_end;
@@ -276,7 +277,10 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
info.align_offset = 0;
addr = TASK_UNMAPPED_BASE;
- while (addr < TASK_SIZE) {
+ /*
+ * Check till the allow max value for this mmap request
+ */
+ while (addr < high_limit) {
info.low_limit = addr;
if (!slice_scan_available(addr, available, 1, &addr))
continue;
@@ -288,8 +292,8 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
* Check if we need to reduce the range, or if we can
* extend it to cover the next available slice.
*/
- if (addr >= TASK_SIZE)
- addr = TASK_SIZE;
+ if (addr >= high_limit)
+ addr = high_limit;
else if (slice_scan_available(addr, available, 1, &next_end)) {
addr = next_end;
goto next_slice;
@@ -307,7 +311,7 @@ static unsigned long slice_find_area_bottomup(struct mm_struct *mm,
static unsigned long slice_find_area_topdown(struct mm_struct *mm,
unsigned long len,
struct slice_mask available,
- int psize)
+ int psize, unsigned long high_limit)
{
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
unsigned long addr, found, prev;
@@ -319,6 +323,15 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
info.align_offset = 0;
addr = mm->mmap_base;
+ /*
+ * If we are trying to allocate above DEFAULT_MAP_WINDOW
+ * Add the different to the mmap_base.
+ * Only for that request for which high_limit is above
+ * DEFAULT_MAP_WINDOW we should apply this.
+ */
+ if (high_limit > DEFAULT_MAP_WINDOW)
+ addr += mm->context.addr_limit - DEFAULT_MAP_WINDOW;
+
while (addr > PAGE_SIZE) {
info.high_limit = addr;
if (!slice_scan_available(addr - 1, available, 0, &addr))
@@ -350,29 +363,38 @@ static unsigned long slice_find_area_topdown(struct mm_struct *mm,
* can happen with large stack limits and large mmap()
* allocations.
*/
- return slice_find_area_bottomup(mm, len, available, psize);
+ return slice_find_area_bottomup(mm, len, available, psize, high_limit);
}
static unsigned long slice_find_area(struct mm_struct *mm, unsigned long len,
struct slice_mask mask, int psize,
- int topdown)
+ int topdown, unsigned long high_limit)
{
if (topdown)
- return slice_find_area_topdown(mm, len, mask, psize);
+ return slice_find_area_topdown(mm, len, mask, psize, high_limit);
else
- return slice_find_area_bottomup(mm, len, mask, psize);
+ return slice_find_area_bottomup(mm, len, mask, psize, high_limit);
}
-#define or_mask(dst, src) do { \
- (dst).low_slices |= (src).low_slices; \
- (dst).high_slices |= (src).high_slices; \
-} while (0)
+static inline void slice_or_mask(struct slice_mask *dst, struct slice_mask *src)
+{
+ DECLARE_BITMAP(result, SLICE_NUM_HIGH);
+
+ dst->low_slices |= src->low_slices;
+ bitmap_or(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
+ bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH);
+}
-#define andnot_mask(dst, src) do { \
- (dst).low_slices &= ~(src).low_slices; \
- (dst).high_slices &= ~(src).high_slices; \
-} while (0)
+static inline void slice_andnot_mask(struct slice_mask *dst, struct slice_mask *src)
+{
+ DECLARE_BITMAP(result, SLICE_NUM_HIGH);
+
+ dst->low_slices &= ~src->low_slices;
+
+ bitmap_andnot(result, dst->high_slices, src->high_slices, SLICE_NUM_HIGH);
+ bitmap_copy(dst->high_slices, result, SLICE_NUM_HIGH);
+}
#ifdef CONFIG_PPC_64K_PAGES
#define MMU_PAGE_BASE MMU_PAGE_64K
@@ -384,14 +406,43 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
unsigned long flags, unsigned int psize,
int topdown)
{
- struct slice_mask mask = {0, 0};
+ struct slice_mask mask;
struct slice_mask good_mask;
- struct slice_mask potential_mask = {0,0} /* silence stupid warning */;
- struct slice_mask compat_mask = {0, 0};
+ struct slice_mask potential_mask;
+ struct slice_mask compat_mask;
int fixed = (flags & MAP_FIXED);
int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT);
struct mm_struct *mm = current->mm;
unsigned long newaddr;
+ unsigned long high_limit;
+
+ /*
+ * Check if we need to expland slice area.
+ */
+ if (unlikely(addr > mm->context.addr_limit &&
+ mm->context.addr_limit != TASK_SIZE)) {
+ mm->context.addr_limit = TASK_SIZE;
+ on_each_cpu(slice_flush_segments, mm, 1);
+ }
+ /*
+ * This mmap request can allocate upt to 512TB
+ */
+ if (addr > DEFAULT_MAP_WINDOW)
+ high_limit = mm->context.addr_limit;
+ else
+ high_limit = DEFAULT_MAP_WINDOW;
+ /*
+ * init different masks
+ */
+ mask.low_slices = 0;
+ bitmap_zero(mask.high_slices, SLICE_NUM_HIGH);
+
+ /* silence stupid warning */;
+ potential_mask.low_slices = 0;
+ bitmap_zero(potential_mask.high_slices, SLICE_NUM_HIGH);
+
+ compat_mask.low_slices = 0;
+ bitmap_zero(compat_mask.high_slices, SLICE_NUM_HIGH);
/* Sanity checks */
BUG_ON(mm->task_size == 0);
@@ -423,7 +474,7 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
/* First make up a "good" mask of slices that have the right size
* already
*/
- good_mask = slice_mask_for_size(mm, psize);
+ slice_mask_for_size(mm, psize, &good_mask);
slice_print_mask(" good_mask", good_mask);
/*
@@ -448,22 +499,22 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
#ifdef CONFIG_PPC_64K_PAGES
/* If we support combo pages, we can allow 64k pages in 4k slices */
if (psize == MMU_PAGE_64K) {
- compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
+ slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask);
if (fixed)
- or_mask(good_mask, compat_mask);
+ slice_or_mask(&good_mask, &compat_mask);
}
#endif
/* First check hint if it's valid or if we have MAP_FIXED */
if (addr != 0 || fixed) {
/* Build a mask for the requested range */
- mask = slice_range_to_mask(addr, len);
+ slice_range_to_mask(addr, len, &mask);
slice_print_mask(" mask", mask);
/* Check if we fit in the good mask. If we do, we just return,
* nothing else to do
*/
- if (slice_check_fit(mask, good_mask)) {
+ if (slice_check_fit(mm, mask, good_mask)) {
slice_dbg(" fits good !\n");
return addr;
}
@@ -471,7 +522,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
/* Now let's see if we can find something in the existing
* slices for that size
*/
- newaddr = slice_find_area(mm, len, good_mask, psize, topdown);
+ newaddr = slice_find_area(mm, len, good_mask,
+ psize, topdown, high_limit);
if (newaddr != -ENOMEM) {
/* Found within the good mask, we don't have to setup,
* we thus return directly
@@ -484,11 +536,11 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
/* We don't fit in the good mask, check what other slices are
* empty and thus can be converted
*/
- potential_mask = slice_mask_for_free(mm);
- or_mask(potential_mask, good_mask);
+ slice_mask_for_free(mm, &potential_mask);
+ slice_or_mask(&potential_mask, &good_mask);
slice_print_mask(" potential", potential_mask);
- if ((addr != 0 || fixed) && slice_check_fit(mask, potential_mask)) {
+ if ((addr != 0 || fixed) && slice_check_fit(mm, mask, potential_mask)) {
slice_dbg(" fits potential !\n");
goto convert;
}
@@ -503,7 +555,8 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
* anywhere in the good area.
*/
if (addr) {
- addr = slice_find_area(mm, len, good_mask, psize, topdown);
+ addr = slice_find_area(mm, len, good_mask,
+ psize, topdown, high_limit);
if (addr != -ENOMEM) {
slice_dbg(" found area at 0x%lx\n", addr);
return addr;
@@ -513,28 +566,29 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len,
/* Now let's see if we can find something in the existing slices
* for that size plus free slices
*/
- addr = slice_find_area(mm, len, potential_mask, psize, topdown);
+ addr = slice_find_area(mm, len, potential_mask,
+ psize, topdown, high_limit);
#ifdef CONFIG_PPC_64K_PAGES
if (addr == -ENOMEM && psize == MMU_PAGE_64K) {
/* retry the search with 4k-page slices included */
- or_mask(potential_mask, compat_mask);
- addr = slice_find_area(mm, len, potential_mask, psize,
- topdown);
+ slice_or_mask(&potential_mask, &compat_mask);
+ addr = slice_find_area(mm, len, potential_mask,
+ psize, topdown, high_limit);
}
#endif
if (addr == -ENOMEM)
return -ENOMEM;
- mask = slice_range_to_mask(addr, len);
+ slice_range_to_mask(addr, len, &mask);
slice_dbg(" found potential area at 0x%lx\n", addr);
slice_print_mask(" mask", mask);
convert:
- andnot_mask(mask, good_mask);
- andnot_mask(mask, compat_mask);
- if (mask.low_slices || mask.high_slices) {
+ slice_andnot_mask(&mask, &good_mask);
+ slice_andnot_mask(&mask, &compat_mask);
+ if (mask.low_slices || !bitmap_empty(mask.high_slices, SLICE_NUM_HIGH)) {
slice_convert(mm, mask, psize);
if (psize > MMU_PAGE_BASE)
on_each_cpu(slice_flush_segments, mm, 1);
@@ -649,8 +703,8 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
slice_dbg(" lsps=%lx, hsps=%lx\n",
- mm->context.low_slices_psize,
- mm->context.high_slices_psize);
+ (unsigned long)mm->context.low_slices_psize,
+ (unsigned long)mm->context.high_slices_psize);
bail:
spin_unlock_irqrestore(&slice_convert_lock, flags);
@@ -659,9 +713,11 @@ void slice_set_user_psize(struct mm_struct *mm, unsigned int psize)
void slice_set_range_psize(struct mm_struct *mm, unsigned long start,
unsigned long len, unsigned int psize)
{
- struct slice_mask mask = slice_range_to_mask(start, len);
+ struct slice_mask mask;
VM_BUG_ON(radix_enabled());
+
+ slice_range_to_mask(start, len, &mask);
slice_convert(mm, mask, psize);
}
@@ -694,14 +750,14 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
if (radix_enabled())
return 0;
- mask = slice_range_to_mask(addr, len);
- available = slice_mask_for_size(mm, psize);
+ slice_range_to_mask(addr, len, &mask);
+ slice_mask_for_size(mm, psize, &available);
#ifdef CONFIG_PPC_64K_PAGES
/* We need to account for 4k slices too */
if (psize == MMU_PAGE_64K) {
struct slice_mask compat_mask;
- compat_mask = slice_mask_for_size(mm, MMU_PAGE_4K);
- or_mask(available, compat_mask);
+ slice_mask_for_size(mm, MMU_PAGE_4K, &compat_mask);
+ slice_or_mask(&available, &compat_mask);
}
#endif
@@ -711,6 +767,6 @@ int is_hugepage_only_range(struct mm_struct *mm, unsigned long addr,
slice_print_mask(" mask", mask);
slice_print_mask(" available", available);
#endif
- return !slice_check_fit(mask, available);
+ return !slice_check_fit(mm, mask, available);
}
#endif
diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c
index 94210940112f..e94fbd4c8845 100644
--- a/arch/powerpc/mm/subpage-prot.c
+++ b/arch/powerpc/mm/subpage-prot.c
@@ -197,7 +197,8 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map)
/* Check parameters */
if ((addr & ~PAGE_MASK) || (len & ~PAGE_MASK) ||
- addr >= TASK_SIZE || len >= TASK_SIZE || addr + len > TASK_SIZE)
+ addr >= mm->task_size || len >= mm->task_size ||
+ addr + len > mm->task_size)
return -EINVAL;
if (is_hugepage_only_range(mm, addr, len))
diff --git a/arch/powerpc/mm/tlb-radix.c b/arch/powerpc/mm/tlb-radix.c
index 952713d6cf04..02e71402fdd3 100644
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -17,7 +17,6 @@
#include <asm/tlb.h>
#include <asm/tlbflush.h>
-static DEFINE_RAW_SPINLOCK(native_tlbie_lock);
#define RIC_FLUSH_TLB 0
#define RIC_FLUSH_PWC 1
@@ -34,10 +33,8 @@ static inline void __tlbiel_pid(unsigned long pid, int set,
prs = 1; /* process scoped */
r = 1; /* raidx format */
- asm volatile("ptesync": : :"memory");
asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
- asm volatile("ptesync": : :"memory");
}
/*
@@ -47,9 +44,33 @@ static inline void _tlbiel_pid(unsigned long pid, unsigned long ric)
{
int set;
- for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) {
+ asm volatile("ptesync": : :"memory");
+
+ /*
+ * Flush the first set of the TLB, and if we're doing a RIC_FLUSH_ALL,
+ * also flush the entire Page Walk Cache.
+ */
+ __tlbiel_pid(pid, 0, ric);
+
+ if (ric == RIC_FLUSH_ALL)
+ /* For the remaining sets, just flush the TLB */
+ ric = RIC_FLUSH_TLB;
+
+ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++)
__tlbiel_pid(pid, set, ric);
- }
+
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
+}
+
+static inline void tlbiel_pwc(unsigned long pid)
+{
+ asm volatile("ptesync": : :"memory");
+
+ /* For PWC flush, we don't look at set number */
+ __tlbiel_pid(pid, 0, RIC_FLUSH_PWC);
+
+ asm volatile("ptesync": : :"memory");
asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory");
}
@@ -129,12 +150,18 @@ void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
{
unsigned long pid;
struct mm_struct *mm = tlb->mm;
+ /*
+ * If we are doing a full mm flush, we will do a tlb flush
+ * with RIC_FLUSH_ALL later.
+ */
+ if (tlb->fullmm)
+ return;
preempt_disable();
pid = mm->context.id;
if (pid != MMU_NO_CONTEXT)
- _tlbiel_pid(pid, RIC_FLUSH_PWC);
+ tlbiel_pwc(pid);
preempt_enable();
}
@@ -175,15 +202,9 @@ void radix__flush_tlb_mm(struct mm_struct *mm)
if (unlikely(pid == MMU_NO_CONTEXT))
goto no_context;
- if (!mm_is_thread_local(mm)) {
- int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
- if (lock_tlbie)
- raw_spin_lock(&native_tlbie_lock);
+ if (!mm_is_thread_local(mm))
_tlbie_pid(pid, RIC_FLUSH_ALL);
- if (lock_tlbie)
- raw_spin_unlock(&native_tlbie_lock);
- } else
+ else
_tlbiel_pid(pid, RIC_FLUSH_ALL);
no_context:
preempt_enable();
@@ -195,22 +216,22 @@ void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr)
unsigned long pid;
struct mm_struct *mm = tlb->mm;
+ /*
+ * If we are doing a full mm flush, we will do a tlb flush
+ * with RIC_FLUSH_ALL later.
+ */
+ if (tlb->fullmm)
+ return;
preempt_disable();
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
goto no_context;
- if (!mm_is_thread_local(mm)) {
- int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
- if (lock_tlbie)
- raw_spin_lock(&native_tlbie_lock);
+ if (!mm_is_thread_local(mm))
_tlbie_pid(pid, RIC_FLUSH_PWC);
- if (lock_tlbie)
- raw_spin_unlock(&native_tlbie_lock);
- } else
- _tlbiel_pid(pid, RIC_FLUSH_PWC);
+ else
+ tlbiel_pwc(pid);
no_context:
preempt_enable();
}
@@ -226,15 +247,9 @@ void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr,
pid = mm ? mm->context.id : 0;
if (unlikely(pid == MMU_NO_CONTEXT))
goto bail;
- if (!mm_is_thread_local(mm)) {
- int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
- if (lock_tlbie)
- raw_spin_lock(&native_tlbie_lock);
+ if (!mm_is_thread_local(mm))
_tlbie_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
- if (lock_tlbie)
- raw_spin_unlock(&native_tlbie_lock);
- } else
+ else
_tlbiel_va(vmaddr, pid, ap, RIC_FLUSH_TLB);
bail:
preempt_enable();
@@ -255,13 +270,7 @@ EXPORT_SYMBOL(radix__flush_tlb_page);
void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
- int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
-
- if (lock_tlbie)
- raw_spin_lock(&native_tlbie_lock);
_tlbie_pid(0, RIC_FLUSH_ALL);
- if (lock_tlbie)
- raw_spin_unlock(&native_tlbie_lock);
}
EXPORT_SYMBOL(radix__flush_tlb_kernel_range);
@@ -323,7 +332,6 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
unsigned long addr;
int local = mm_is_thread_local(mm);
unsigned long ap = mmu_get_ap(psize);
- int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE);
unsigned long page_size = 1UL << mmu_psize_defs[psize].shift;
@@ -344,13 +352,8 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
if (local)
_tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
- else {
- if (lock_tlbie)
- raw_spin_lock(&native_tlbie_lock);
+ else
_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
- if (lock_tlbie)
- raw_spin_unlock(&native_tlbie_lock);
- }
}
err_out:
preempt_enable();
@@ -437,7 +440,7 @@ void radix__flush_tlb_pte_p9_dd1(unsigned long old_pte, struct mm_struct *mm,
return;
}
- if (old_pte & _PAGE_LARGE)
+ if (old_pte & R_PAGE_LARGE)
radix__flush_tlb_page_psize(mm, address, MMU_PAGE_2M);
else
radix__flush_tlb_page_psize(mm, address, mmu_virtual_psize);
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index ba28fcb98597..bfc4a0869609 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -770,7 +770,7 @@ void setup_initial_memory_limit(phys_addr_t first_memblock_base,
* avoid going over total available memory just in case...
*/
#ifdef CONFIG_PPC_FSL_BOOK3E
- if (mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
+ if (early_mmu_has_feature(MMU_FTR_TYPE_FSL_E)) {
unsigned long linear_sz;
unsigned int num_cams;
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 2ff13249f87a..6c2d4168daec 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2049,6 +2049,14 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
data.br_stack = &cpuhw->bhrb_stack;
}
+ if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC &&
+ ppmu->get_mem_data_src)
+ ppmu->get_mem_data_src(&data.data_src, ppmu->flags, regs);
+
+ if (event->attr.sample_type & PERF_SAMPLE_WEIGHT &&
+ ppmu->get_mem_weight)
+ ppmu->get_mem_weight(&data.weight);
+
if (perf_event_overflow(event, &data, regs))
power_pmu_stop(event, 0);
}
diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c
index cd951fd231c4..8125160be7bc 100644
--- a/arch/powerpc/perf/isa207-common.c
+++ b/arch/powerpc/perf/isa207-common.c
@@ -148,6 +148,88 @@ static bool is_thresh_cmp_valid(u64 event)
return true;
}
+static inline u64 isa207_find_source(u64 idx, u32 sub_idx)
+{
+ u64 ret = PERF_MEM_NA;
+
+ switch(idx) {
+ case 0:
+ /* Nothing to do */
+ break;
+ case 1:
+ ret = PH(LVL, L1);
+ break;
+ case 2:
+ ret = PH(LVL, L2);
+ break;
+ case 3:
+ ret = PH(LVL, L3);
+ break;
+ case 4:
+ if (sub_idx <= 1)
+ ret = PH(LVL, LOC_RAM);
+ else if (sub_idx > 1 && sub_idx <= 2)
+ ret = PH(LVL, REM_RAM1);
+ else
+ ret = PH(LVL, REM_RAM2);
+ ret |= P(SNOOP, HIT);
+ break;
+ case 5:
+ ret = PH(LVL, REM_CCE1);
+ if ((sub_idx == 0) || (sub_idx == 2) || (sub_idx == 4))
+ ret |= P(SNOOP, HIT);
+ else if ((sub_idx == 1) || (sub_idx == 3) || (sub_idx == 5))
+ ret |= P(SNOOP, HITM);
+ break;
+ case 6:
+ ret = PH(LVL, REM_CCE2);
+ if ((sub_idx == 0) || (sub_idx == 2))
+ ret |= P(SNOOP, HIT);
+ else if ((sub_idx == 1) || (sub_idx == 3))
+ ret |= P(SNOOP, HITM);
+ break;
+ case 7:
+ ret = PM(LVL, L1);
+ break;
+ }
+
+ return ret;
+}
+
+void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
+ struct pt_regs *regs)
+{
+ u64 idx;
+ u32 sub_idx;
+ u64 sier;
+ u64 val;
+
+ /* Skip if no SIER support */
+ if (!(flags & PPMU_HAS_SIER)) {
+ dsrc->val = 0;
+ return;
+ }
+
+ sier = mfspr(SPRN_SIER);
+ val = (sier & ISA207_SIER_TYPE_MASK) >> ISA207_SIER_TYPE_SHIFT;
+ if (val == 1 || val == 2) {
+ idx = (sier & ISA207_SIER_LDST_MASK) >> ISA207_SIER_LDST_SHIFT;
+ sub_idx = (sier & ISA207_SIER_DATA_SRC_MASK) >> ISA207_SIER_DATA_SRC_SHIFT;
+
+ dsrc->val = isa207_find_source(idx, sub_idx);
+ dsrc->val |= (val == 1) ? P(OP, LOAD) : P(OP, STORE);
+ }
+}
+
+void isa207_get_mem_weight(u64 *weight)
+{
+ u64 mmcra = mfspr(SPRN_MMCRA);
+ u64 exp = MMCRA_THR_CTR_EXP(mmcra);
+ u64 mantissa = MMCRA_THR_CTR_MANT(mmcra);
+
+ *weight = mantissa << (2 * exp);
+}
+
int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp)
{
unsigned int unit, pmc, cache, ebb;
diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h
index 899210f14ee4..8acbe6e802c7 100644
--- a/arch/powerpc/perf/isa207-common.h
+++ b/arch/powerpc/perf/isa207-common.h
@@ -248,6 +248,15 @@
#define MMCRA_SDAR_MODE_TLB (1ull << MMCRA_SDAR_MODE_SHIFT)
#define MMCRA_SDAR_MODE_NO_UPDATES ~(0x3ull << MMCRA_SDAR_MODE_SHIFT)
#define MMCRA_IFM_SHIFT 30
+#define MMCRA_THR_CTR_MANT_SHIFT 19
+#define MMCRA_THR_CTR_MANT_MASK 0x7Ful
+#define MMCRA_THR_CTR_MANT(v) (((v) >> MMCRA_THR_CTR_MANT_SHIFT) &\
+ MMCRA_THR_CTR_MANT_MASK)
+
+#define MMCRA_THR_CTR_EXP_SHIFT 27
+#define MMCRA_THR_CTR_EXP_MASK 0x7ul
+#define MMCRA_THR_CTR_EXP(v) (((v) >> MMCRA_THR_CTR_EXP_SHIFT) &\
+ MMCRA_THR_CTR_EXP_MASK)
/* MMCR1 Threshold Compare bit constant for power9 */
#define p9_MMCRA_THR_CMP_SHIFT 45
@@ -260,6 +269,19 @@
#define MAX_ALT 2
#define MAX_PMU_COUNTERS 6
+#define ISA207_SIER_TYPE_SHIFT 15
+#define ISA207_SIER_TYPE_MASK (0x7ull << ISA207_SIER_TYPE_SHIFT)
+
+#define ISA207_SIER_LDST_SHIFT 1
+#define ISA207_SIER_LDST_MASK (0x7ull << ISA207_SIER_LDST_SHIFT)
+
+#define ISA207_SIER_DATA_SRC_SHIFT 53
+#define ISA207_SIER_DATA_SRC_MASK (0x7ull << ISA207_SIER_DATA_SRC_SHIFT)
+
+#define P(a, b) PERF_MEM_S(a, b)
+#define PH(a, b) (P(LVL, HIT) | P(a, b))
+#define PM(a, b) (P(LVL, MISS) | P(a, b))
+
int isa207_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp);
int isa207_compute_mmcr(u64 event[], int n_ev,
unsigned int hwc[], unsigned long mmcr[],
@@ -267,6 +289,8 @@ int isa207_compute_mmcr(u64 event[], int n_ev,
void isa207_disable_pmc(unsigned int pmc, unsigned long mmcr[]);
int isa207_get_alternatives(u64 event, u64 alt[],
const unsigned int ev_alt[][MAX_ALT], int size);
-
+void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
+ struct pt_regs *regs);
+void isa207_get_mem_weight(u64 *weight);
#endif
diff --git a/arch/powerpc/perf/power8-events-list.h b/arch/powerpc/perf/power8-events-list.h
index 3a2e6e8ebb92..0f1d184627cc 100644
--- a/arch/powerpc/perf/power8-events-list.h
+++ b/arch/powerpc/perf/power8-events-list.h
@@ -89,3 +89,9 @@ EVENT(PM_MRK_FILT_MATCH, 0x2013c)
EVENT(PM_MRK_FILT_MATCH_ALT, 0x3012e)
/* Alternate event code for PM_LD_MISS_L1 */
EVENT(PM_LD_MISS_L1_ALT, 0x400f0)
+/*
+ * Memory Access Event -- mem_access
+ * Primary PMU event used here is PM_MRK_INST_CMPL, along with
+ * Random Load/Store Facility Sampling (RIS) in Random sampling mode (MMCRA[SM]).
+ */
+EVENT(MEM_ACCESS, 0x10401e0)
diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index ce15b19a7962..5463516e369b 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -90,6 +90,7 @@ GENERIC_EVENT_ATTR(branch-instructions, PM_BRU_FIN);
GENERIC_EVENT_ATTR(branch-misses, PM_BR_MPRED_CMPL);
GENERIC_EVENT_ATTR(cache-references, PM_LD_REF_L1);
GENERIC_EVENT_ATTR(cache-misses, PM_LD_MISS_L1);
+GENERIC_EVENT_ATTR(mem_access, MEM_ACCESS);
CACHE_EVENT_ATTR(L1-dcache-load-misses, PM_LD_MISS_L1);
CACHE_EVENT_ATTR(L1-dcache-loads, PM_LD_REF_L1);
@@ -120,6 +121,7 @@ static struct attribute *power8_events_attr[] = {
GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL),
GENERIC_EVENT_PTR(PM_LD_REF_L1),
GENERIC_EVENT_PTR(PM_LD_MISS_L1),
+ GENERIC_EVENT_PTR(MEM_ACCESS),
CACHE_EVENT_PTR(PM_LD_MISS_L1),
CACHE_EVENT_PTR(PM_LD_REF_L1),
@@ -325,6 +327,8 @@ static struct power_pmu power8_pmu = {
.bhrb_filter_map = power8_bhrb_filter_map,
.get_constraint = isa207_get_constraint,
.get_alternatives = power8_get_alternatives,
+ .get_mem_data_src = isa207_get_mem_data_src,
+ .get_mem_weight = isa207_get_mem_weight,
.disable_pmc = isa207_disable_pmc,
.flags = PPMU_HAS_SIER | PPMU_ARCH_207S,
.n_generic = ARRAY_SIZE(power8_generic_events),
diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 7f6582708e06..018f8e90ac35 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -427,6 +427,8 @@ static struct power_pmu power9_pmu = {
.bhrb_filter_map = power9_bhrb_filter_map,
.get_constraint = isa207_get_constraint,
.get_alternatives = power9_get_alternatives,
+ .get_mem_data_src = isa207_get_mem_data_src,
+ .get_mem_weight = isa207_get_mem_weight,
.disable_pmc = isa207_disable_pmc,
.flags = PPMU_HAS_SIER | PPMU_ARCH_207S,
.n_generic = ARRAY_SIZE(power9_generic_events),
diff --git a/arch/powerpc/platforms/44x/sam440ep.c b/arch/powerpc/platforms/44x/sam440ep.c
index 688ffeab0699..55fed5e4de14 100644
--- a/arch/powerpc/platforms/44x/sam440ep.c
+++ b/arch/powerpc/platforms/44x/sam440ep.c
@@ -70,7 +70,7 @@ static struct i2c_board_info sam440ep_rtc_info = {
.irq = -1,
};
-static int sam440ep_setup_rtc(void)
+static int __init sam440ep_setup_rtc(void)
{
return i2c_register_board_info(0, &sam440ep_rtc_info, 1);
}
diff --git a/arch/powerpc/platforms/52xx/Kconfig b/arch/powerpc/platforms/52xx/Kconfig
index b625a2c6f4f2..e4c745981912 100644
--- a/arch/powerpc/platforms/52xx/Kconfig
+++ b/arch/powerpc/platforms/52xx/Kconfig
@@ -33,7 +33,6 @@ config PPC_EFIKA
bool "bPlan Efika 5k2. MPC5200B based computer"
depends on PPC_MPC52xx
select PPC_RTAS
- select RTAS_PROC
select PPC_NATIVE
config PPC_LITE5200
diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c
index 078097a0b09d..f51fd35f4618 100644
--- a/arch/powerpc/platforms/85xx/smp.c
+++ b/arch/powerpc/platforms/85xx/smp.c
@@ -344,6 +344,7 @@ done:
}
struct smp_ops_t smp_85xx_ops = {
+ .cause_nmi_ipi = NULL,
.kick_cpu = smp_85xx_kick_cpu,
.cpu_bootable = smp_generic_cpu_bootable,
#ifdef CONFIG_HOTPLUG_CPU
@@ -461,16 +462,9 @@ static void mpc85xx_smp_machine_kexec(struct kimage *image)
}
#endif /* CONFIG_KEXEC_CORE */
-static void smp_85xx_basic_setup(int cpu_nr)
-{
- if (cpu_has_feature(CPU_FTR_DBELL))
- doorbell_setup_this_cpu();
-}
-
static void smp_85xx_setup_cpu(int cpu_nr)
{
mpic_setup_this_cpu();
- smp_85xx_basic_setup(cpu_nr);
}
void __init mpc85xx_smp_init(void)
@@ -484,7 +478,7 @@ void __init mpc85xx_smp_init(void)
smp_85xx_ops.setup_cpu = smp_85xx_setup_cpu;
smp_85xx_ops.message_pass = smp_mpic_message_pass;
} else
- smp_85xx_ops.setup_cpu = smp_85xx_basic_setup;
+ smp_85xx_ops.setup_cpu = NULL;
if (cpu_has_feature(CPU_FTR_DBELL)) {
/*
@@ -492,7 +486,7 @@ void __init mpc85xx_smp_init(void)
* smp_muxed_ipi_message_pass
*/
smp_85xx_ops.message_pass = NULL;
- smp_85xx_ops.cause_ipi = doorbell_cause_ipi;
+ smp_85xx_ops.cause_ipi = doorbell_global_ipi;
smp_85xx_ops.probe = NULL;
}
diff --git a/arch/powerpc/platforms/86xx/mpc86xx_smp.c b/arch/powerpc/platforms/86xx/mpc86xx_smp.c
index af09baee22cb..020e84a47a32 100644
--- a/arch/powerpc/platforms/86xx/mpc86xx_smp.c
+++ b/arch/powerpc/platforms/86xx/mpc86xx_smp.c
@@ -105,6 +105,7 @@ smp_86xx_setup_cpu(int cpu_nr)
struct smp_ops_t smp_86xx_ops = {
+ .cause_nmi_ipi = NULL,
.message_pass = smp_mpic_message_pass,
.probe = smp_mpic_probe,
.kick_cpu = smp_86xx_kick_cpu,
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 99b0ae8acb78..684e886eaae4 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -279,7 +279,8 @@ config PPC_ICSWX
This option enables kernel support for the PowerPC Initiate
Coprocessor Store Word (icswx) coprocessor instruction on POWER7
- or newer processors.
+ and POWER8 processors. POWER9 uses new copy/paste instructions
+ to invoke the coprocessor.
This option is only useful if you have a processor that supports
the icswx coprocessor instruction. It does not have any effect
@@ -359,7 +360,7 @@ config PPC_BOOK3E_MMU
config PPC_MM_SLICES
bool
- default y if (!PPC_FSL_BOOK3E && PPC64 && HUGETLB_PAGE) || (PPC_STD_MMU_64 && PPC_64K_PAGES)
+ default y if PPC_STD_MMU_64
default n
config PPC_HAVE_PMU_SUPPORT
@@ -371,9 +372,16 @@ config PPC_PERF_CTRS
help
This enables the powerpc-specific perf_event back-end.
+config FORCE_SMP
+ # Allow platforms to force SMP=y by selecting this
+ bool
+ default n
+ select SMP
+
config SMP
depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE || PPC_47x
- bool "Symmetric multi-processing support"
+ select GENERIC_IRQ_MIGRATION
+ bool "Symmetric multi-processing support" if !FORCE_SMP
---help---
This enables support for systems with more than one CPU. If you have
a system with only one CPU, say N. If you have a system with more
diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c
index 8b55c5f19d4c..8d3ae2cc52bf 100644
--- a/arch/powerpc/platforms/cell/axon_msi.c
+++ b/arch/powerpc/platforms/cell/axon_msi.c
@@ -15,9 +15,9 @@
#include <linux/msi.h>
#include <linux/export.h>
#include <linux/of_platform.h>
-#include <linux/debugfs.h>
#include <linux/slab.h>
+#include <asm/debugfs.h>
#include <asm/dcr.h>
#include <asm/machdep.h>
#include <asm/prom.h>
diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c
index a6bbbaba14a3..871d38479a25 100644
--- a/arch/powerpc/platforms/cell/interrupt.c
+++ b/arch/powerpc/platforms/cell/interrupt.c
@@ -211,7 +211,7 @@ void iic_request_IPIs(void)
iic_request_ipi(PPC_MSG_CALL_FUNCTION);
iic_request_ipi(PPC_MSG_RESCHEDULE);
iic_request_ipi(PPC_MSG_TICK_BROADCAST);
- iic_request_ipi(PPC_MSG_DEBUGGER_BREAK);
+ iic_request_ipi(PPC_MSG_NMI_IPI);
}
#endif /* CONFIG_SMP */
diff --git a/arch/powerpc/platforms/cell/pervasive.c b/arch/powerpc/platforms/cell/pervasive.c
index e7d075077cb0..a88944db9fc3 100644
--- a/arch/powerpc/platforms/cell/pervasive.c
+++ b/arch/powerpc/platforms/cell/pervasive.c
@@ -88,11 +88,14 @@ static void cbe_power_save(void)
static int cbe_system_reset_exception(struct pt_regs *regs)
{
switch (regs->msr & SRR1_WAKEMASK) {
- case SRR1_WAKEEE:
- do_IRQ(regs);
- break;
case SRR1_WAKEDEC:
- timer_interrupt(regs);
+ set_dec(1);
+ case SRR1_WAKEEE:
+ /*
+ * Handle these when interrupts get re-enabled and we take
+ * them as regular exceptions. We are in an NMI context
+ * and can't handle these here.
+ */
break;
case SRR1_WAKEMT:
return cbe_sysreset_hack();
diff --git a/arch/powerpc/platforms/chrp/smp.c b/arch/powerpc/platforms/chrp/smp.c
index b6c9a0dcc924..14515040f7cd 100644
--- a/arch/powerpc/platforms/chrp/smp.c
+++ b/arch/powerpc/platforms/chrp/smp.c
@@ -44,6 +44,7 @@ static void smp_chrp_setup_cpu(int cpu_nr)
/* CHRP with openpic */
struct smp_ops_t chrp_smp_ops = {
+ .cause_nmi_ipi = NULL,
.message_pass = smp_mpic_message_pass,
.probe = smp_mpic_probe,
.kick_cpu = smp_chrp_kick_cpu,
diff --git a/arch/powerpc/platforms/pasemi/idle.c b/arch/powerpc/platforms/pasemi/idle.c
index 75b296bc51af..44e0d9226f0a 100644
--- a/arch/powerpc/platforms/pasemi/idle.c
+++ b/arch/powerpc/platforms/pasemi/idle.c
@@ -53,11 +53,14 @@ static int pasemi_system_reset_exception(struct pt_regs *regs)
regs->nip = regs->link;
switch (regs->msr & SRR1_WAKEMASK) {
- case SRR1_WAKEEE:
- do_IRQ(regs);
- break;
case SRR1_WAKEDEC:
- timer_interrupt(regs);
+ set_dec(1);
+ case SRR1_WAKEEE:
+ /*
+ * Handle these when interrupts get re-enabled and we take
+ * them as regular exceptions. We are in an NMI context
+ * and can't handle these here.
+ */
break;
default:
/* do system reset */
diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c
index 746ca7321b03..2cd99eb30762 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -172,7 +172,7 @@ static irqreturn_t psurge_ipi_intr(int irq, void *d)
return IRQ_HANDLED;
}
-static void smp_psurge_cause_ipi(int cpu, unsigned long data)
+static void smp_psurge_cause_ipi(int cpu)
{
psurge_set_ipi(cpu);
}
@@ -447,6 +447,7 @@ void __init smp_psurge_give_timebase(void)
struct smp_ops_t psurge_smp_ops = {
.message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
.cause_ipi = smp_psurge_cause_ipi,
+ .cause_nmi_ipi = NULL,
.probe = smp_psurge_probe,
.kick_cpu = smp_psurge_kick_cpu,
.setup_cpu = smp_psurge_setup_cpu,
diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
index 3a07e4dcf97c..6a6f4ef46b9e 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -4,6 +4,7 @@ config PPC_POWERNV
select PPC_NATIVE
select PPC_XICS
select PPC_ICP_NATIVE
+ select PPC_XIVE_NATIVE
select PPC_P7_NAP
select PCI
select PCI_MSI
@@ -19,6 +20,8 @@ config PPC_POWERNV
select CPU_FREQ_GOV_ONDEMAND
select CPU_FREQ_GOV_CONSERVATIVE
select PPC_DOORBELL
+ select MMU_NOTIFIER
+ select FORCE_SMP
default y
config OPAL_PRD
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 6fb5522acd70..d2f19821d71d 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -1102,6 +1102,13 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
return -EIO;
}
+ /*
+ * If dealing with the root bus (or the bus underneath the
+ * root port), we reset the bus underneath the root port.
+ *
+ * The cxl driver depends on this behaviour for bi-modal card
+ * switching.
+ */
if (pci_is_root_bus(bus) ||
pci_is_root_bus(bus->parent))
return pnv_eeh_root_reset(hose, option);
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 4ee837e6391a..445f30a2c5ef 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -53,19 +53,6 @@ static int pnv_save_sprs_for_deep_states(void)
uint64_t pir = get_hard_smp_processor_id(cpu);
uint64_t hsprg0_val = (uint64_t)&paca[cpu];
- if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
- /*
- * HSPRG0 is used to store the cpu's pointer to paca.
- * Hence last 3 bits are guaranteed to be 0. Program
- * slw to restore HSPRG0 with 63rd bit set, so that
- * when a thread wakes up at 0x100 we can use this bit
- * to distinguish between fastsleep and deep winkle.
- * This is not necessary with stop/psscr since PLS
- * field of psscr indicates which state we are waking
- * up from.
- */
- hsprg0_val |= 1;
- }
rc = opal_slw_set_reg(pir, SPRN_HSPRG0, hsprg0_val);
if (rc != 0)
return rc;
@@ -122,9 +109,12 @@ static void pnv_alloc_idle_core_states(void)
for (i = 0; i < nr_cores; i++) {
int first_cpu = i * threads_per_core;
int node = cpu_to_node(first_cpu);
+ size_t paca_ptr_array_size;
core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
*core_idle_state = PNV_CORE_IDLE_THREAD_BITS;
+ paca_ptr_array_size = (threads_per_core *
+ sizeof(struct paca_struct *));
for (j = 0; j < threads_per_core; j++) {
int cpu = first_cpu + j;
@@ -132,6 +122,11 @@ static void pnv_alloc_idle_core_states(void)
paca[cpu].core_idle_state_ptr = core_idle_state;
paca[cpu].thread_idle_state = PNV_THREAD_RUNNING;
paca[cpu].thread_mask = 1 << j;
+ if (!cpu_has_feature(CPU_FTR_POWER9_DD1))
+ continue;
+ paca[cpu].thread_sibling_pacas =
+ kmalloc_node(paca_ptr_array_size,
+ GFP_KERNEL, node);
}
}
@@ -147,7 +142,6 @@ u32 pnv_get_supported_cpuidle_states(void)
}
EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states);
-
static void pnv_fastsleep_workaround_apply(void *info)
{
@@ -241,8 +235,9 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
* The default stop state that will be used by ppc_md.power_save
* function on platforms that support stop instruction.
*/
-u64 pnv_default_stop_val;
-u64 pnv_default_stop_mask;
+static u64 pnv_default_stop_val;
+static u64 pnv_default_stop_mask;
+static bool default_stop_found;
/*
* Used for ppc_md.power_save which needs a function with no parameters
@@ -262,8 +257,42 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
* psscr value and mask of the deepest stop idle state.
* Used when a cpu is offlined.
*/
-u64 pnv_deepest_stop_psscr_val;
-u64 pnv_deepest_stop_psscr_mask;
+static u64 pnv_deepest_stop_psscr_val;
+static u64 pnv_deepest_stop_psscr_mask;
+static bool deepest_stop_found;
+
+/*
+ * pnv_cpu_offline: A function that puts the CPU into the deepest
+ * available platform idle state on a CPU-Offline.
+ */
+unsigned long pnv_cpu_offline(unsigned int cpu)
+{
+ unsigned long srr1;
+
+ u32 idle_states = pnv_get_supported_cpuidle_states();
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) {
+ srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val,
+ pnv_deepest_stop_psscr_mask);
+ } else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
+ srr1 = power7_winkle();
+ } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
+ (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
+ srr1 = power7_sleep();
+ } else if (idle_states & OPAL_PM_NAP_ENABLED) {
+ srr1 = power7_nap(1);
+ } else {
+ /* This is the fallback method. We emulate snooze */
+ while (!generic_check_cpu_restart(cpu)) {
+ HMT_low();
+ HMT_very_low();
+ }
+ srr1 = 0;
+ HMT_medium();
+ }
+
+ return srr1;
+}
/*
* Power ISA 3.0 idle initialization.
@@ -352,7 +381,6 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
u32 *residency_ns = NULL;
u64 max_residency_ns = 0;
int rc = 0, i;
- bool default_stop_found = false, deepest_stop_found = false;
psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL);
psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL);
@@ -432,21 +460,24 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
}
}
- if (!default_stop_found) {
- pnv_default_stop_val = PSSCR_HV_DEFAULT_VAL;
- pnv_default_stop_mask = PSSCR_HV_DEFAULT_MASK;
- pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n",
+ if (unlikely(!default_stop_found)) {
+ pr_warn("cpuidle-powernv: No suitable default stop state found. Disabling platform idle.\n");
+ } else {
+ ppc_md.power_save = power9_idle;
+ pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n",
pnv_default_stop_val, pnv_default_stop_mask);
}
- if (!deepest_stop_found) {
- pnv_deepest_stop_psscr_val = PSSCR_HV_DEFAULT_VAL;
- pnv_deepest_stop_psscr_mask = PSSCR_HV_DEFAULT_MASK;
- pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n",
+ if (unlikely(!deepest_stop_found)) {
+ pr_warn("cpuidle-powernv: No suitable stop state for CPU-Hotplug. Offlined CPUs will busy wait");
+ } else {
+ pr_info("cpuidle-powernv: Deepest stop: psscr = 0x%016llx,mask=0x%016llx\n",
pnv_deepest_stop_psscr_val,
pnv_deepest_stop_psscr_mask);
}
+ pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n",
+ pnv_first_deep_stop_state);
out:
kfree(psscr_val);
kfree(psscr_mask);
@@ -524,10 +555,30 @@ static int __init pnv_init_idle_states(void)
pnv_alloc_idle_core_states();
+ /*
+ * For each CPU, record its PACA address in each of it's
+ * sibling thread's PACA at the slot corresponding to this
+ * CPU's index in the core.
+ */
+ if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+ int cpu;
+
+ pr_info("powernv: idle: Saving PACA pointers of all CPUs in their thread sibling PACA\n");
+ for_each_possible_cpu(cpu) {
+ int base_cpu = cpu_first_thread_sibling(cpu);
+ int idx = cpu_thread_in_core(cpu);
+ int i;
+
+ for (i = 0; i < threads_per_core; i++) {
+ int j = base_cpu + i;
+
+ paca[j].thread_sibling_pacas[idx] = &paca[cpu];
+ }
+ }
+ }
+
if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED)
ppc_md.power_save = power7_idle;
- else if (supported_cpuidle_states & OPAL_PM_STOP_INST_FAST)
- ppc_md.power_save = power9_idle;
out:
return 0;
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index 1c383f38031d..067defeea691 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -9,11 +9,20 @@
* License as published by the Free Software Foundation.
*/
+#include <linux/slab.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mmu_context.h>
+#include <linux/of.h>
#include <linux/export.h>
#include <linux/pci.h>
#include <linux/memblock.h>
#include <linux/iommu.h>
+#include <asm/tlb.h>
+#include <asm/powernv.h>
+#include <asm/reg.h>
+#include <asm/opal.h>
+#include <asm/io.h>
#include <asm/iommu.h>
#include <asm/pnv-pci.h>
#include <asm/msi_bitmap.h>
@@ -22,6 +31,8 @@
#include "powernv.h"
#include "pci.h"
+#define npu_to_phb(x) container_of(x, struct pnv_phb, npu)
+
/*
* Other types of TCE cache invalidation are not functional in the
* hardware.
@@ -37,6 +48,12 @@ struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
struct device_node *dn;
struct pci_dev *gpdev;
+ if (WARN_ON(!npdev))
+ return NULL;
+
+ if (WARN_ON(!npdev->dev.of_node))
+ return NULL;
+
/* Get assoicated PCI device */
dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
if (!dn)
@@ -55,6 +72,12 @@ struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
struct device_node *dn;
struct pci_dev *npdev;
+ if (WARN_ON(!gpdev))
+ return NULL;
+
+ if (WARN_ON(!gpdev->dev.of_node))
+ return NULL;
+
/* Get assoicated PCI device */
dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
if (!dn)
@@ -180,7 +203,7 @@ long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
return rc;
}
- pnv_pci_phb3_tce_invalidate_entire(phb, false);
+ pnv_pci_ioda2_tce_invalidate_entire(phb, false);
/* Add the table to the list so its TCE cache will get invalidated */
pnv_pci_link_table_and_group(phb->hose->node, num,
@@ -204,7 +227,7 @@ long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
return rc;
}
- pnv_pci_phb3_tce_invalidate_entire(phb, false);
+ pnv_pci_ioda2_tce_invalidate_entire(phb, false);
pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
&npe->table_group);
@@ -270,7 +293,7 @@ static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
0 /* bypass base */, top);
if (rc == OPAL_SUCCESS)
- pnv_pci_phb3_tce_invalidate_entire(phb, false);
+ pnv_pci_ioda2_tce_invalidate_entire(phb, false);
return rc;
}
@@ -334,7 +357,7 @@ void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
return;
}
- pnv_pci_phb3_tce_invalidate_entire(npe->phb, false);
+ pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
}
struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
@@ -359,3 +382,442 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
return gpe;
}
+
+/* Maximum number of nvlinks per npu */
+#define NV_MAX_LINKS 6
+
+/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */
+static int max_npu2_index;
+
+struct npu_context {
+ struct mm_struct *mm;
+ struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
+ struct mmu_notifier mn;
+ struct kref kref;
+
+ /* Callback to stop translation requests on a given GPU */
+ struct npu_context *(*release_cb)(struct npu_context *, void *);
+
+ /*
+ * Private pointer passed to the above callback for usage by
+ * device drivers.
+ */
+ void *priv;
+};
+
+/*
+ * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
+ * if none are available.
+ */
+static int get_mmio_atsd_reg(struct npu *npu)
+{
+ int i;
+
+ for (i = 0; i < npu->mmio_atsd_count; i++) {
+ if (!test_and_set_bit(i, &npu->mmio_atsd_usage))
+ return i;
+ }
+
+ return -ENOSPC;
+}
+
+static void put_mmio_atsd_reg(struct npu *npu, int reg)
+{
+ clear_bit(reg, &npu->mmio_atsd_usage);
+}
+
+/* MMIO ATSD register offsets */
+#define XTS_ATSD_AVA 1
+#define XTS_ATSD_STAT 2
+
+static int mmio_launch_invalidate(struct npu *npu, unsigned long launch,
+ unsigned long va)
+{
+ int mmio_atsd_reg;
+
+ do {
+ mmio_atsd_reg = get_mmio_atsd_reg(npu);
+ cpu_relax();
+ } while (mmio_atsd_reg < 0);
+
+ __raw_writeq(cpu_to_be64(va),
+ npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA);
+ eieio();
+ __raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]);
+
+ return mmio_atsd_reg;
+}
+
+static int mmio_invalidate_pid(struct npu *npu, unsigned long pid)
+{
+ unsigned long launch;
+
+ /* IS set to invalidate matching PID */
+ launch = PPC_BIT(12);
+
+ /* PRS set to process-scoped */
+ launch |= PPC_BIT(13);
+
+ /* AP */
+ launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+
+ /* PID */
+ launch |= pid << PPC_BITLSHIFT(38);
+
+ /* Invalidating the entire process doesn't use a va */
+ return mmio_launch_invalidate(npu, launch, 0);
+}
+
+static int mmio_invalidate_va(struct npu *npu, unsigned long va,
+ unsigned long pid)
+{
+ unsigned long launch;
+
+ /* IS set to invalidate target VA */
+ launch = 0;
+
+ /* PRS set to process scoped */
+ launch |= PPC_BIT(13);
+
+ /* AP */
+ launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+
+ /* PID */
+ launch |= pid << PPC_BITLSHIFT(38);
+
+ return mmio_launch_invalidate(npu, launch, va);
+}
+
+#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
+
+/*
+ * Invalidate either a single address or an entire PID depending on
+ * the value of va.
+ */
+static void mmio_invalidate(struct npu_context *npu_context, int va,
+ unsigned long address)
+{
+ int i, j, reg;
+ struct npu *npu;
+ struct pnv_phb *nphb;
+ struct pci_dev *npdev;
+ struct {
+ struct npu *npu;
+ int reg;
+ } mmio_atsd_reg[NV_MAX_NPUS];
+ unsigned long pid = npu_context->mm->context.id;
+
+ /*
+ * Loop over all the NPUs this process is active on and launch
+ * an invalidate.
+ */
+ for (i = 0; i <= max_npu2_index; i++) {
+ mmio_atsd_reg[i].reg = -1;
+ for (j = 0; j < NV_MAX_LINKS; j++) {
+ npdev = npu_context->npdev[i][j];
+ if (!npdev)
+ continue;
+
+ nphb = pci_bus_to_host(npdev->bus)->private_data;
+ npu = &nphb->npu;
+ mmio_atsd_reg[i].npu = npu;
+
+ if (va)
+ mmio_atsd_reg[i].reg =
+ mmio_invalidate_va(npu, address, pid);
+ else
+ mmio_atsd_reg[i].reg =
+ mmio_invalidate_pid(npu, pid);
+
+ /*
+ * The NPU hardware forwards the shootdown to all GPUs
+ * so we only have to launch one shootdown per NPU.
+ */
+ break;
+ }
+ }
+
+ /*
+ * Unfortunately the nest mmu does not support flushing specific
+ * addresses so we have to flush the whole mm.
+ */
+ flush_tlb_mm(npu_context->mm);
+
+ /* Wait for all invalidations to complete */
+ for (i = 0; i <= max_npu2_index; i++) {
+ if (mmio_atsd_reg[i].reg < 0)
+ continue;
+
+ /* Wait for completion */
+ npu = mmio_atsd_reg[i].npu;
+ reg = mmio_atsd_reg[i].reg;
+ while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
+ cpu_relax();
+ put_mmio_atsd_reg(npu, reg);
+ }
+}
+
+static void pnv_npu2_mn_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct npu_context *npu_context = mn_to_npu_context(mn);
+
+ /* Call into device driver to stop requests to the NMMU */
+ if (npu_context->release_cb)
+ npu_context->release_cb(npu_context, npu_context->priv);
+
+ /*
+ * There should be no more translation requests for this PID, but we
+ * need to ensure any entries for it are removed from the TLB.
+ */
+ mmio_invalidate(npu_context, 0, 0);
+}
+
+static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address,
+ pte_t pte)
+{
+ struct npu_context *npu_context = mn_to_npu_context(mn);
+
+ mmio_invalidate(npu_context, 1, address);
+}
+
+static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct npu_context *npu_context = mn_to_npu_context(mn);
+
+ mmio_invalidate(npu_context, 1, address);
+}
+
+static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct npu_context *npu_context = mn_to_npu_context(mn);
+ unsigned long address;
+
+ for (address = start; address <= end; address += PAGE_SIZE)
+ mmio_invalidate(npu_context, 1, address);
+}
+
+static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
+ .release = pnv_npu2_mn_release,
+ .change_pte = pnv_npu2_mn_change_pte,
+ .invalidate_page = pnv_npu2_mn_invalidate_page,
+ .invalidate_range = pnv_npu2_mn_invalidate_range,
+};
+
+/*
+ * Call into OPAL to setup the nmmu context for the current task in
+ * the NPU. This must be called to setup the context tables before the
+ * GPU issues ATRs. pdev should be a pointed to PCIe GPU device.
+ *
+ * A release callback should be registered to allow a device driver to
+ * be notified that it should not launch any new translation requests
+ * as the final TLB invalidate is about to occur.
+ *
+ * Returns an error if there no contexts are currently available or a
+ * npu_context which should be passed to pnv_npu2_handle_fault().
+ *
+ * mmap_sem must be held in write mode.
+ */
+struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
+ unsigned long flags,
+ struct npu_context *(*cb)(struct npu_context *, void *),
+ void *priv)
+{
+ int rc;
+ u32 nvlink_index;
+ struct device_node *nvlink_dn;
+ struct mm_struct *mm = current->mm;
+ struct pnv_phb *nphb;
+ struct npu *npu;
+ struct npu_context *npu_context;
+
+ /*
+ * At present we don't support GPUs connected to multiple NPUs and I'm
+ * not sure the hardware does either.
+ */
+ struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+
+ if (!firmware_has_feature(FW_FEATURE_OPAL))
+ return ERR_PTR(-ENODEV);
+
+ if (!npdev)
+ /* No nvlink associated with this GPU device */
+ return ERR_PTR(-ENODEV);
+
+ if (!mm) {
+ /* kernel thread contexts are not supported */
+ return ERR_PTR(-EINVAL);
+ }
+
+ nphb = pci_bus_to_host(npdev->bus)->private_data;
+ npu = &nphb->npu;
+
+ /*
+ * Setup the NPU context table for a particular GPU. These need to be
+ * per-GPU as we need the tables to filter ATSDs when there are no
+ * active contexts on a particular GPU.
+ */
+ rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
+ PCI_DEVID(gpdev->bus->number, gpdev->devfn));
+ if (rc < 0)
+ return ERR_PTR(-ENOSPC);
+
+ /*
+ * We store the npu pci device so we can more easily get at the
+ * associated npus.
+ */
+ npu_context = mm->context.npu_context;
+ if (!npu_context) {
+ npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
+ if (!npu_context)
+ return ERR_PTR(-ENOMEM);
+
+ mm->context.npu_context = npu_context;
+ npu_context->mm = mm;
+ npu_context->mn.ops = &nv_nmmu_notifier_ops;
+ __mmu_notifier_register(&npu_context->mn, mm);
+ kref_init(&npu_context->kref);
+ } else {
+ kref_get(&npu_context->kref);
+ }
+
+ npu_context->release_cb = cb;
+ npu_context->priv = priv;
+ nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
+ if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
+ &nvlink_index)))
+ return ERR_PTR(-ENODEV);
+ npu_context->npdev[npu->index][nvlink_index] = npdev;
+
+ return npu_context;
+}
+EXPORT_SYMBOL(pnv_npu2_init_context);
+
+static void pnv_npu2_release_context(struct kref *kref)
+{
+ struct npu_context *npu_context =
+ container_of(kref, struct npu_context, kref);
+
+ npu_context->mm->context.npu_context = NULL;
+ mmu_notifier_unregister(&npu_context->mn,
+ npu_context->mm);
+
+ kfree(npu_context);
+}
+
+void pnv_npu2_destroy_context(struct npu_context *npu_context,
+ struct pci_dev *gpdev)
+{
+ struct pnv_phb *nphb, *phb;
+ struct npu *npu;
+ struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+ struct device_node *nvlink_dn;
+ u32 nvlink_index;
+
+ if (WARN_ON(!npdev))
+ return;
+
+ if (!firmware_has_feature(FW_FEATURE_OPAL))
+ return;
+
+ nphb = pci_bus_to_host(npdev->bus)->private_data;
+ npu = &nphb->npu;
+ phb = pci_bus_to_host(gpdev->bus)->private_data;
+ nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
+ if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
+ &nvlink_index)))
+ return;
+ npu_context->npdev[npu->index][nvlink_index] = NULL;
+ opal_npu_destroy_context(phb->opal_id, npu_context->mm->context.id,
+ PCI_DEVID(gpdev->bus->number, gpdev->devfn));
+ kref_put(&npu_context->kref, pnv_npu2_release_context);
+}
+EXPORT_SYMBOL(pnv_npu2_destroy_context);
+
+/*
+ * Assumes mmap_sem is held for the contexts associated mm.
+ */
+int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
+ unsigned long *flags, unsigned long *status, int count)
+{
+ u64 rc = 0, result = 0;
+ int i, is_write;
+ struct page *page[1];
+
+ /* mmap_sem should be held so the struct_mm must be present */
+ struct mm_struct *mm = context->mm;
+
+ if (!firmware_has_feature(FW_FEATURE_OPAL))
+ return -ENODEV;
+
+ WARN_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+ for (i = 0; i < count; i++) {
+ is_write = flags[i] & NPU2_WRITE;
+ rc = get_user_pages_remote(NULL, mm, ea[i], 1,
+ is_write ? FOLL_WRITE : 0,
+ page, NULL, NULL);
+
+ /*
+ * To support virtualised environments we will have to do an
+ * access to the page to ensure it gets faulted into the
+ * hypervisor. For the moment virtualisation is not supported in
+ * other areas so leave the access out.
+ */
+ if (rc != 1) {
+ status[i] = rc;
+ result = -EFAULT;
+ continue;
+ }
+
+ status[i] = 0;
+ put_page(page[0]);
+ }
+
+ return result;
+}
+EXPORT_SYMBOL(pnv_npu2_handle_fault);
+
+int pnv_npu2_init(struct pnv_phb *phb)
+{
+ unsigned int i;
+ u64 mmio_atsd;
+ struct device_node *dn;
+ struct pci_dev *gpdev;
+ static int npu_index;
+ uint64_t rc = 0;
+
+ for_each_child_of_node(phb->hose->dn, dn) {
+ gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
+ if (gpdev) {
+ rc = opal_npu_map_lpar(phb->opal_id,
+ PCI_DEVID(gpdev->bus->number, gpdev->devfn),
+ 0, 0);
+ if (rc)
+ dev_err(&gpdev->dev,
+ "Error %lld mapping device to LPAR\n",
+ rc);
+ }
+ }
+
+ for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd",
+ i, &mmio_atsd); i++)
+ phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
+
+ pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i);
+ phb->npu.mmio_atsd_count = i;
+ phb->npu.mmio_atsd_usage = 0;
+ npu_index++;
+ if (WARN_ON(npu_index >= NV_MAX_NPUS))
+ return -ENOSPC;
+ max_npu2_index = npu_index;
+ phb->npu.index = npu_index;
+
+ return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c
index a91d7876fae2..6c7ad1d8b32e 100644
--- a/arch/powerpc/platforms/powernv/opal-lpc.c
+++ b/arch/powerpc/platforms/powernv/opal-lpc.c
@@ -12,7 +12,6 @@
#include <linux/kernel.h>
#include <linux/of.h>
#include <linux/bug.h>
-#include <linux/debugfs.h>
#include <linux/io.h>
#include <linux/slab.h>
@@ -21,7 +20,7 @@
#include <asm/opal.h>
#include <asm/prom.h>
#include <linux/uaccess.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
#include <asm/isa-bridge.h>
static int opal_lpc_chip_id = -1;
diff --git a/arch/powerpc/platforms/powernv/opal-sensor.c b/arch/powerpc/platforms/powernv/opal-sensor.c
index 308efd170c27..aa267f120033 100644
--- a/arch/powerpc/platforms/powernv/opal-sensor.c
+++ b/arch/powerpc/platforms/powernv/opal-sensor.c
@@ -64,6 +64,10 @@ int opal_get_sensor_data(u32 sensor_hndl, u32 *sensor_data)
*sensor_data = be32_to_cpu(data);
break;
+ case OPAL_WRONG_STATE:
+ ret = -EIO;
+ break;
+
default:
ret = opal_error_code(ret);
break;
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index da8a0f7a035c..f620572f891f 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -50,21 +50,13 @@ END_FTR_SECTION(0, 1); \
#define OPAL_BRANCH(LABEL)
#endif
-/* TODO:
- *
- * - Trace irqs in/off (needs saving/restoring all args, argh...)
- * - Get r11 feed up by Dave so I can have better register usage
+/*
+ * DO_OPAL_CALL assumes:
+ * r0 = opal call token
+ * r12 = msr
+ * LR has been saved
*/
-
-#define OPAL_CALL(name, token) \
- _GLOBAL_TOC(name); \
- mfmsr r12; \
- mflr r0; \
- andi. r11,r12,MSR_IR|MSR_DR; \
- std r0,PPC_LR_STKOFF(r1); \
- li r0,token; \
- beq opal_real_call; \
- OPAL_BRANCH(opal_tracepoint_entry) \
+#define DO_OPAL_CALL() \
mfcr r11; \
stw r11,8(r1); \
li r11,0; \
@@ -83,6 +75,18 @@ END_FTR_SECTION(0, 1); \
mtspr SPRN_HSRR0,r12; \
hrfid
+#define OPAL_CALL(name, token) \
+ _GLOBAL_TOC(name); \
+ mfmsr r12; \
+ mflr r0; \
+ andi. r11,r12,MSR_IR|MSR_DR; \
+ std r0,PPC_LR_STKOFF(r1); \
+ li r0,token; \
+ beq opal_real_call; \
+ OPAL_BRANCH(opal_tracepoint_entry) \
+ DO_OPAL_CALL()
+
+
opal_return:
/*
* Fixup endian on OPAL return... we should be able to simplify
@@ -148,26 +152,13 @@ opal_tracepoint_entry:
ld r8,STK_REG(R29)(r1)
ld r9,STK_REG(R30)(r1)
ld r10,STK_REG(R31)(r1)
+
+ /* setup LR so we return via tracepoint_return */
LOAD_REG_ADDR(r11,opal_tracepoint_return)
- mfcr r12
std r11,16(r1)
- stw r12,8(r1)
- li r11,0
+
mfmsr r12
- ori r11,r11,MSR_EE
- std r12,PACASAVEDMSR(r13)
- andc r12,r12,r11
- mtmsrd r12,1
- LOAD_REG_ADDR(r11,opal_return)
- mtlr r11
- li r11,MSR_DR|MSR_IR|MSR_LE
- andc r12,r12,r11
- mtspr SPRN_HSRR1,r12
- LOAD_REG_ADDR(r11,opal)
- ld r12,8(r11)
- ld r2,0(r11)
- mtspr SPRN_HSRR0,r12
- hrfid
+ DO_OPAL_CALL()
opal_tracepoint_return:
std r3,STK_REG(R31)(r1)
@@ -301,3 +292,21 @@ OPAL_CALL(opal_int_eoi, OPAL_INT_EOI);
OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR);
OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL);
OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR);
+OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET);
+OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO);
+OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG);
+OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO);
+OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO);
+OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE);
+OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK);
+OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK);
+OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ);
+OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ);
+OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO);
+OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO);
+OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC);
+OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP);
+OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT);
+OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT);
+OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR);
diff --git a/arch/powerpc/platforms/powernv/opal-xscom.c b/arch/powerpc/platforms/powernv/opal-xscom.c
index d0ac535cf5d7..28651fb25417 100644
--- a/arch/powerpc/platforms/powernv/opal-xscom.c
+++ b/arch/powerpc/platforms/powernv/opal-xscom.c
@@ -73,25 +73,32 @@ static int opal_xscom_err_xlate(int64_t rc)
static u64 opal_scom_unmangle(u64 addr)
{
+ u64 tmp;
+
/*
- * XSCOM indirect addresses have the top bit set. Additionally
- * the rest of the top 3 nibbles is always 0.
+ * XSCOM addresses use the top nibble to set indirect mode and
+ * its form. Bits 4-11 are always 0.
*
* Because the debugfs interface uses signed offsets and shifts
* the address left by 3, we basically cannot use the top 4 bits
* of the 64-bit address, and thus cannot use the indirect bit.
*
- * To deal with that, we support the indirect bit being in bit
- * 4 (IBM notation) instead of bit 0 in this API, we do the
- * conversion here. To leave room for further xscom address
- * expansion, we only clear out the top byte
+ * To deal with that, we support the indirect bits being in
+ * bits 4-7 (IBM notation) instead of bit 0-3 in this API, we
+ * do the conversion here.
*
- * For in-kernel use, we also support the real indirect bit, so
- * we test for any of the top 5 bits
+ * For in-kernel use, we don't need to do this mangling. In
+ * kernel won't have bits 4-7 set.
*
+ * So:
+ * debugfs will always set 0-3 = 0 and clear 4-7
+ * kernel will always clear 0-3 = 0 and set 4-7
*/
- if (addr & (0x1full << 59))
- addr = (addr & ~(0xffull << 56)) | (1ull << 63);
+ tmp = addr;
+ tmp &= 0x0f00000000000000;
+ addr &= 0xf0ffffffffffffff;
+ addr |= tmp << 4;
+
return addr;
}
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index e0f856bfbfe8..7925a9d72cca 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -435,7 +435,7 @@ int opal_machine_check(struct pt_regs *regs)
evt.version);
return 0;
}
- machine_check_print_event_info(&evt);
+ machine_check_print_event_info(&evt, user_mode(regs));
if (opal_recover_mce(regs, &evt))
return 1;
@@ -595,6 +595,80 @@ static void opal_export_symmap(void)
pr_warn("Error %d creating OPAL symbols file\n", rc);
}
+static ssize_t export_attr_read(struct file *fp, struct kobject *kobj,
+ struct bin_attribute *bin_attr, char *buf,
+ loff_t off, size_t count)
+{
+ return memory_read_from_buffer(buf, count, &off, bin_attr->private,
+ bin_attr->size);
+}
+
+/*
+ * opal_export_attrs: creates a sysfs node for each property listed in
+ * the device-tree under /ibm,opal/firmware/exports/
+ * All new sysfs nodes are created under /opal/exports/.
+ * This allows for reserved memory regions (e.g. HDAT) to be read.
+ * The new sysfs nodes are only readable by root.
+ */
+static void opal_export_attrs(void)
+{
+ struct bin_attribute *attr;
+ struct device_node *np;
+ struct property *prop;
+ struct kobject *kobj;
+ u64 vals[2];
+ int rc;
+
+ np = of_find_node_by_path("/ibm,opal/firmware/exports");
+ if (!np)
+ return;
+
+ /* Create new 'exports' directory - /sys/firmware/opal/exports */
+ kobj = kobject_create_and_add("exports", opal_kobj);
+ if (!kobj) {
+ pr_warn("kobject_create_and_add() of exports failed\n");
+ return;
+ }
+
+ for_each_property_of_node(np, prop) {
+ if (!strcmp(prop->name, "name") || !strcmp(prop->name, "phandle"))
+ continue;
+
+ if (of_property_read_u64_array(np, prop->name, &vals[0], 2))
+ continue;
+
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+
+ if (attr == NULL) {
+ pr_warn("Failed kmalloc for bin_attribute!");
+ continue;
+ }
+
+ sysfs_bin_attr_init(attr);
+ attr->attr.name = kstrdup(prop->name, GFP_KERNEL);
+ attr->attr.mode = 0400;
+ attr->read = export_attr_read;
+ attr->private = __va(vals[0]);
+ attr->size = vals[1];
+
+ if (attr->attr.name == NULL) {
+ pr_warn("Failed kstrdup for bin_attribute attr.name");
+ kfree(attr);
+ continue;
+ }
+
+ rc = sysfs_create_bin_file(kobj, attr);
+ if (rc) {
+ pr_warn("Error %d creating OPAL sysfs exports/%s file\n",
+ rc, prop->name);
+ kfree(attr->attr.name);
+ kfree(attr);
+ }
+ }
+
+ of_node_put(np);
+}
+
static void __init opal_dump_region_init(void)
{
void *addr;
@@ -733,6 +807,9 @@ static int __init opal_init(void)
opal_msglog_sysfs_init();
}
+ /* Export all properties */
+ opal_export_attrs();
+
/* Initialize platform devices: IPMI backend, PRD & flash interface */
opal_pdev_init("ibm,opal-ipmi");
opal_pdev_init("ibm,opal-flash");
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index e36738291c32..6fdbd383f676 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -14,7 +14,6 @@
#include <linux/kernel.h>
#include <linux/pci.h>
#include <linux/crash_dump.h>
-#include <linux/debugfs.h>
#include <linux/delay.h>
#include <linux/string.h>
#include <linux/init.h>
@@ -38,7 +37,7 @@
#include <asm/iommu.h>
#include <asm/tce.h>
#include <asm/xics.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
#include <asm/firmware.h>
#include <asm/pnv-pci.h>
#include <asm/mmzone.h>
@@ -1262,6 +1261,8 @@ static void pnv_pci_ioda_setup_PEs(void)
/* PE#0 is needed for error reporting */
pnv_ioda_reserve_pe(phb, 0);
pnv_ioda_setup_npu_PEs(hose->bus);
+ if (phb->model == PNV_PHB_MODEL_NPU2)
+ pnv_npu2_init(phb);
}
}
}
@@ -1424,8 +1425,7 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
iommu_group_put(pe->table_group.group);
BUG_ON(pe->table_group.group);
}
- pnv_pci_ioda2_table_free_pages(tbl);
- iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
+ iommu_tce_table_put(tbl);
}
static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
@@ -1860,6 +1860,17 @@ static int pnv_ioda1_tce_xchg(struct iommu_table *tbl, long index,
return ret;
}
+
+static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index,
+ unsigned long *hpa, enum dma_data_direction *direction)
+{
+ long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+
+ if (!ret)
+ pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true);
+
+ return ret;
+}
#endif
static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index,
@@ -1874,6 +1885,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
.set = pnv_ioda1_tce_build,
#ifdef CONFIG_IOMMU_API
.exchange = pnv_ioda1_tce_xchg,
+ .exchange_rm = pnv_ioda1_tce_xchg_rm,
#endif
.clear = pnv_ioda1_tce_free,
.get = pnv_tce_get,
@@ -1883,7 +1895,7 @@ static struct iommu_table_ops pnv_ioda1_iommu_ops = {
#define PHB3_TCE_KILL_INVAL_PE PPC_BIT(1)
#define PHB3_TCE_KILL_INVAL_ONE PPC_BIT(2)
-void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
+static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
{
__be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm);
const unsigned long val = PHB3_TCE_KILL_INVAL_ALL;
@@ -1948,7 +1960,7 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
{
struct iommu_table_group_link *tgl;
- list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) {
+ list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) {
struct pnv_ioda_pe *pe = container_of(tgl->table_group,
struct pnv_ioda_pe, table_group);
struct pnv_phb *phb = pe->phb;
@@ -1979,6 +1991,14 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl,
}
}
+void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm)
+{
+ if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3)
+ pnv_pci_phb3_tce_invalidate_entire(phb, rm);
+ else
+ opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0);
+}
+
static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index,
long npages, unsigned long uaddr,
enum dma_data_direction direction,
@@ -2004,6 +2024,17 @@ static int pnv_ioda2_tce_xchg(struct iommu_table *tbl, long index,
return ret;
}
+
+static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index,
+ unsigned long *hpa, enum dma_data_direction *direction)
+{
+ long ret = pnv_tce_xchg(tbl, index, hpa, direction);
+
+ if (!ret)
+ pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true);
+
+ return ret;
+}
#endif
static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
@@ -2017,13 +2048,13 @@ static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index,
static void pnv_ioda2_table_free(struct iommu_table *tbl)
{
pnv_pci_ioda2_table_free_pages(tbl);
- iommu_free_table(tbl, "pnv");
}
static struct iommu_table_ops pnv_ioda2_iommu_ops = {
.set = pnv_ioda2_tce_build,
#ifdef CONFIG_IOMMU_API
.exchange = pnv_ioda2_tce_xchg,
+ .exchange_rm = pnv_ioda2_tce_xchg_rm,
#endif
.clear = pnv_ioda2_tce_free,
.get = pnv_tce_get,
@@ -2128,6 +2159,9 @@ static void pnv_pci_ioda1_setup_dma_pe(struct pnv_phb *phb,
found:
tbl = pnv_pci_table_alloc(phb->hose->node);
+ if (WARN_ON(!tbl))
+ return;
+
iommu_register_group(&pe->table_group, phb->hose->global_number,
pe->pe_number);
pnv_pci_link_table_and_group(phb->hose->node, 0, tbl, &pe->table_group);
@@ -2203,7 +2237,7 @@ found:
__free_pages(tce_mem, get_order(tce32_segsz * segs));
if (tbl) {
pnv_pci_unlink_table_and_group(tbl, &pe->table_group);
- iommu_free_table(tbl, "pnv");
+ iommu_tce_table_put(tbl);
}
}
@@ -2293,16 +2327,16 @@ static long pnv_pci_ioda2_create_table(struct iommu_table_group *table_group,
if (!tbl)
return -ENOMEM;
+ tbl->it_ops = &pnv_ioda2_iommu_ops;
+
ret = pnv_pci_ioda2_table_alloc_pages(nid,
bus_offset, page_shift, window_size,
levels, tbl);
if (ret) {
- iommu_free_table(tbl, "pnv");
+ iommu_tce_table_put(tbl);
return ret;
}
- tbl->it_ops = &pnv_ioda2_iommu_ops;
-
*ptbl = tbl;
return 0;
@@ -2343,7 +2377,7 @@ static long pnv_pci_ioda2_setup_default_config(struct pnv_ioda_pe *pe)
if (rc) {
pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n",
rc);
- pnv_ioda2_table_free(tbl);
+ iommu_tce_table_put(tbl);
return rc;
}
@@ -2414,7 +2448,8 @@ static unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
tce_table_size /= direct_table_size;
tce_table_size <<= 3;
- tce_table_size = _ALIGN_UP(tce_table_size, direct_table_size);
+ tce_table_size = max_t(unsigned long,
+ tce_table_size, direct_table_size);
}
return bytes;
@@ -2431,7 +2466,7 @@ static void pnv_ioda2_take_ownership(struct iommu_table_group *table_group)
pnv_pci_ioda2_unset_window(&pe->table_group, 0);
if (pe->pbus)
pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
- pnv_ioda2_table_free(tbl);
+ iommu_tce_table_put(tbl);
}
static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group)
@@ -2735,9 +2770,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
if (rc)
return;
- if (pe->flags & PNV_IODA_PE_DEV)
- iommu_add_device(&pe->pdev->dev);
- else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
+ if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
}
@@ -3406,7 +3439,7 @@ static void pnv_pci_ioda1_release_pe_dma(struct pnv_ioda_pe *pe)
}
free_pages(tbl->it_base, get_order(tbl->it_size << 3));
- iommu_free_table(tbl, "pnv");
+ iommu_tce_table_put(tbl);
}
static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
@@ -3433,7 +3466,7 @@ static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe)
}
pnv_pci_ioda2_table_free_pages(tbl);
- iommu_free_table(tbl, "pnv");
+ iommu_tce_table_put(tbl);
}
static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe,
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index eb835e977e33..935ccb249a8a 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -758,7 +758,7 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
unsigned long pnv_tce_get(struct iommu_table *tbl, long index)
{
- return *(pnv_tce(tbl, index - tbl->it_offset));
+ return be64_to_cpu(*(pnv_tce(tbl, index - tbl->it_offset)));
}
struct iommu_table *pnv_pci_table_alloc(int nid)
@@ -766,7 +766,11 @@ struct iommu_table *pnv_pci_table_alloc(int nid)
struct iommu_table *tbl;
tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid);
+ if (!tbl)
+ return NULL;
+
INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+ kref_init(&tbl->it_kref);
return tbl;
}
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index e1d3e5526b54..18c8a2fa03b8 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -7,6 +7,9 @@
struct pci_dn;
+/* Maximum possible number of ATSD MMIO registers per NPU */
+#define NV_NMMU_ATSD_REGS 8
+
enum pnv_phb_type {
PNV_PHB_IODA1 = 0,
PNV_PHB_IODA2 = 1,
@@ -174,6 +177,16 @@ struct pnv_phb {
struct OpalIoP7IOCErrorData hub_diag;
} diag;
+ /* Nvlink2 data */
+ struct npu {
+ int index;
+ __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS];
+ unsigned int mmio_atsd_count;
+
+ /* Bitmask for MMIO register usage */
+ unsigned long mmio_atsd_usage;
+ } npu;
+
#ifdef CONFIG_CXL_BASE
struct cxl_afu *cxl_afu;
#endif
@@ -229,14 +242,14 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
/* Nvlink functions */
extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
-extern void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
+extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
struct iommu_table *tbl);
extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num);
extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe);
extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe);
-
+extern int pnv_npu2_init(struct pnv_phb *phb);
/* cxl functions */
extern bool pnv_cxl_enable_device_hook(struct pci_dev *dev);
diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h
index 613052232475..6dbc0a1da1f6 100644
--- a/arch/powerpc/platforms/powernv/powernv.h
+++ b/arch/powerpc/platforms/powernv/powernv.h
@@ -18,8 +18,6 @@ static inline void pnv_pci_shutdown(void) { }
#endif
extern u32 pnv_get_supported_cpuidle_states(void);
-extern u64 pnv_deepest_stop_psscr_val;
-extern u64 pnv_deepest_stop_psscr_mask;
extern void pnv_lpc_init(void);
diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c
index 5dcbdea1afac..1a9d84371a4d 100644
--- a/arch/powerpc/platforms/powernv/rng.c
+++ b/arch/powerpc/platforms/powernv/rng.c
@@ -62,7 +62,7 @@ int powernv_get_random_real_mode(unsigned long *v)
rng = raw_cpu_read(powernv_rng);
- *v = rng_whiten(rng, in_rm64(rng->regs_real));
+ *v = rng_whiten(rng, __raw_rm_readq(rng->regs_real));
return 1;
}
diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
index d50c7d99baaf..2dc7e5fb86c3 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -32,6 +32,7 @@
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/xics.h>
+#include <asm/xive.h>
#include <asm/opal.h>
#include <asm/kexec.h>
#include <asm/smp.h>
@@ -76,7 +77,9 @@ static void __init pnv_init(void)
static void __init pnv_init_IRQ(void)
{
- xics_init();
+ /* Try using a XIVE if available, otherwise use a XICS */
+ if (!xive_native_init())
+ xics_init();
WARN_ON(!ppc_md.get_irq);
}
@@ -95,6 +98,10 @@ static void pnv_show_cpuinfo(struct seq_file *m)
else
seq_printf(m, "firmware\t: BML\n");
of_node_put(root);
+ if (radix_enabled())
+ seq_printf(m, "MMU\t\t: Radix\n");
+ else
+ seq_printf(m, "MMU\t\t: Hash\n");
}
static void pnv_prepare_going_down(void)
@@ -218,10 +225,12 @@ static void pnv_kexec_wait_secondaries_down(void)
static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
{
- xics_kexec_teardown_cpu(secondary);
+ if (xive_enabled())
+ xive_kexec_teardown_cpu(secondary);
+ else
+ xics_kexec_teardown_cpu(secondary);
/* On OPAL, we return all CPUs to firmware */
-
if (!firmware_has_feature(FW_FEATURE_OPAL))
return;
@@ -237,6 +246,10 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
/* Primary waits for the secondaries to have reached OPAL */
pnv_kexec_wait_secondaries_down();
+ /* Switch XIVE back to emulation mode */
+ if (xive_enabled())
+ xive_shutdown();
+
/*
* We might be running as little-endian - now that interrupts
* are disabled, reset the HILE bit to big-endian so we don't
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 8b67e1eefb5c..4aff754b6f2c 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -29,12 +29,14 @@
#include <asm/vdso_datapage.h>
#include <asm/cputhreads.h>
#include <asm/xics.h>
+#include <asm/xive.h>
#include <asm/opal.h>
#include <asm/runlatch.h>
#include <asm/code-patching.h>
#include <asm/dbell.h>
#include <asm/kvm_ppc.h>
#include <asm/ppc-opcode.h>
+#include <asm/cpuidle.h>
#include "powernv.h"
@@ -47,13 +49,10 @@
static void pnv_smp_setup_cpu(int cpu)
{
- if (cpu != boot_cpuid)
+ if (xive_enabled())
+ xive_smp_setup_cpu();
+ else if (cpu != boot_cpuid)
xics_setup_cpu();
-
-#ifdef CONFIG_PPC_DOORBELL
- if (cpu_has_feature(CPU_FTR_DBELL))
- doorbell_setup_this_cpu();
-#endif
}
static int pnv_smp_kick_cpu(int nr)
@@ -132,7 +131,10 @@ static int pnv_smp_cpu_disable(void)
vdso_data->processorCount--;
if (cpu == boot_cpuid)
boot_cpuid = cpumask_any(cpu_online_mask);
- xics_migrate_irqs_away();
+ if (xive_enabled())
+ xive_smp_disable_cpu();
+ else
+ xics_migrate_irqs_away();
return 0;
}
@@ -140,7 +142,6 @@ static void pnv_smp_cpu_kill_self(void)
{
unsigned int cpu;
unsigned long srr1, wmask;
- u32 idle_states;
/* Standard hot unplug procedure */
local_irq_disable();
@@ -155,8 +156,6 @@ static void pnv_smp_cpu_kill_self(void)
if (cpu_has_feature(CPU_FTR_ARCH_207S))
wmask = SRR1_WAKEMASK_P8;
- idle_states = pnv_get_supported_cpuidle_states();
-
/* We don't want to take decrementer interrupts while we are offline,
* so clear LPCR:PECE1. We keep PECE2 (and LPCR_PECE_HVEE on P9)
* enabled as to let IPIs in.
@@ -184,19 +183,7 @@ static void pnv_smp_cpu_kill_self(void)
kvmppc_set_host_ipi(cpu, 0);
ppc64_runlatch_off();
-
- if (cpu_has_feature(CPU_FTR_ARCH_300)) {
- srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val,
- pnv_deepest_stop_psscr_mask);
- } else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
- srr1 = power7_winkle();
- } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
- (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
- srr1 = power7_sleep();
- } else {
- srr1 = power7_nap(1);
- }
-
+ srr1 = pnv_cpu_offline(cpu);
ppc64_runlatch_on();
/*
@@ -213,9 +200,12 @@ static void pnv_smp_cpu_kill_self(void)
if (((srr1 & wmask) == SRR1_WAKEEE) ||
((srr1 & wmask) == SRR1_WAKEHVI) ||
(local_paca->irq_happened & PACA_IRQ_EE)) {
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- icp_opal_flush_interrupt();
- else
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ if (xive_enabled())
+ xive_flush_interrupt();
+ else
+ icp_opal_flush_interrupt();
+ } else
icp_native_flush_interrupt();
} else if ((srr1 & wmask) == SRR1_WAKEHDBELL) {
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
@@ -252,10 +242,69 @@ static int pnv_cpu_bootable(unsigned int nr)
return smp_generic_cpu_bootable(nr);
}
+static int pnv_smp_prepare_cpu(int cpu)
+{
+ if (xive_enabled())
+ return xive_smp_prepare_cpu(cpu);
+ return 0;
+}
+
+/* Cause IPI as setup by the interrupt controller (xics or xive) */
+static void (*ic_cause_ipi)(int cpu);
+
+static void pnv_cause_ipi(int cpu)
+{
+ if (doorbell_try_core_ipi(cpu))
+ return;
+
+ ic_cause_ipi(cpu);
+}
+
+static void pnv_p9_dd1_cause_ipi(int cpu)
+{
+ int this_cpu = get_cpu();
+
+ /*
+ * POWER9 DD1 has a global addressed msgsnd, but for now we restrict
+ * IPIs to same core, because it requires additional synchronization
+ * for inter-core doorbells which we do not implement.
+ */
+ if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu)))
+ doorbell_global_ipi(cpu);
+ else
+ ic_cause_ipi(cpu);
+
+ put_cpu();
+}
+
+static void __init pnv_smp_probe(void)
+{
+ if (xive_enabled())
+ xive_smp_probe();
+ else
+ xics_smp_probe();
+
+ if (cpu_has_feature(CPU_FTR_DBELL)) {
+ ic_cause_ipi = smp_ops->cause_ipi;
+ WARN_ON(!ic_cause_ipi);
+
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+ smp_ops->cause_ipi = pnv_p9_dd1_cause_ipi;
+ else
+ smp_ops->cause_ipi = doorbell_global_ipi;
+ } else {
+ smp_ops->cause_ipi = pnv_cause_ipi;
+ }
+ }
+}
+
static struct smp_ops_t pnv_smp_ops = {
- .message_pass = smp_muxed_ipi_message_pass,
- .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */
- .probe = xics_smp_probe,
+ .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
+ .cause_ipi = NULL, /* Filled at runtime by pnv_smp_probe() */
+ .cause_nmi_ipi = NULL,
+ .probe = pnv_smp_probe,
+ .prepare_cpu = pnv_smp_prepare_cpu,
.kick_cpu = pnv_smp_kick_cpu,
.setup_cpu = pnv_smp_setup_cpu,
.cpu_bootable = pnv_cpu_bootable,
diff --git a/arch/powerpc/platforms/ps3/smp.c b/arch/powerpc/platforms/ps3/smp.c
index 60154d08debf..1d1ad5df106f 100644
--- a/arch/powerpc/platforms/ps3/smp.c
+++ b/arch/powerpc/platforms/ps3/smp.c
@@ -77,7 +77,7 @@ static void __init ps3_smp_probe(void)
BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION != 0);
BUILD_BUG_ON(PPC_MSG_RESCHEDULE != 1);
BUILD_BUG_ON(PPC_MSG_TICK_BROADCAST != 2);
- BUILD_BUG_ON(PPC_MSG_DEBUGGER_BREAK != 3);
+ BUILD_BUG_ON(PPC_MSG_NMI_IPI != 3);
for (i = 0; i < MSG_COUNT; i++) {
result = ps3_event_receive_port_setup(cpu, &virqs[i]);
@@ -96,7 +96,7 @@ static void __init ps3_smp_probe(void)
ps3_register_ipi_irq(cpu, virqs[i]);
}
- ps3_register_ipi_debug_brk(cpu, virqs[PPC_MSG_DEBUGGER_BREAK]);
+ ps3_register_ipi_debug_brk(cpu, virqs[PPC_MSG_NMI_IPI]);
DBG(" <- %s:%d: (%d)\n", __func__, __LINE__, cpu);
}
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig
index 30ec04f1c67c..913c54e23eea 100644
--- a/arch/powerpc/platforms/pseries/Kconfig
+++ b/arch/powerpc/platforms/pseries/Kconfig
@@ -17,9 +17,10 @@ config PPC_PSERIES
select PPC_UDBG_16550
select PPC_NATIVE
select PPC_DOORBELL
- select HOTPLUG_CPU if SMP
+ select HOTPLUG_CPU
select ARCH_RANDOM
select PPC_DOORBELL
+ select FORCE_SMP
default y
config PPC_SPLPAR
diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
index 193e052fa0dd..bda18d8e1674 100644
--- a/arch/powerpc/platforms/pseries/dlpar.c
+++ b/arch/powerpc/platforms/pseries/dlpar.c
@@ -288,7 +288,6 @@ int dlpar_detach_node(struct device_node *dn)
if (rc)
return rc;
- of_node_put(dn); /* Must decrement the refcount */
return 0;
}
diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c
index 6b04e3f0f982..18014cdeb590 100644
--- a/arch/powerpc/platforms/pseries/dtl.c
+++ b/arch/powerpc/platforms/pseries/dtl.c
@@ -21,13 +21,12 @@
*/
#include <linux/slab.h>
-#include <linux/debugfs.h>
#include <linux/spinlock.h>
#include <asm/smp.h>
#include <linux/uaccess.h>
#include <asm/firmware.h>
#include <asm/lppaca.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
#include <asm/plpar_wrappers.h>
#include <asm/machdep.h>
diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c
index f02ec3ab428c..957ae347b0b3 100644
--- a/arch/powerpc/platforms/pseries/hvCall_inst.c
+++ b/arch/powerpc/platforms/pseries/hvCall_inst.c
@@ -29,6 +29,16 @@
#include <asm/trace.h>
#include <asm/machdep.h>
+/* For hcall instrumentation. One structure per-hcall, per-CPU */
+struct hcall_stats {
+ unsigned long num_calls; /* number of calls (on this CPU) */
+ unsigned long tb_total; /* total wall time (mftb) of calls. */
+ unsigned long purr_total; /* total cpu time (PURR) of calls. */
+ unsigned long tb_start;
+ unsigned long purr_start;
+};
+#define HCALL_STAT_ARRAY_SIZE ((MAX_HCALL_OPCODE >> 2) + 1)
+
DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats);
/*
diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 4d757eaa46bf..8374adee27e3 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -74,6 +74,7 @@ static struct iommu_table_group *iommu_pseries_alloc_group(int node)
goto fail_exit;
INIT_LIST_HEAD_RCU(&tbl->it_group_list);
+ kref_init(&tbl->it_kref);
tgl->table_group = table_group;
list_add_rcu(&tgl->next, &tbl->it_group_list);
@@ -115,7 +116,7 @@ static void iommu_pseries_free_group(struct iommu_table_group *table_group,
BUG_ON(table_group->group);
}
#endif
- iommu_free_table(tbl, node_name);
+ iommu_tce_table_put(tbl);
kfree(table_group);
}
@@ -550,6 +551,7 @@ static void iommu_table_setparms(struct pci_controller *phb,
static void iommu_table_setparms_lpar(struct pci_controller *phb,
struct device_node *dn,
struct iommu_table *tbl,
+ struct iommu_table_group *table_group,
const __be32 *dma_window)
{
unsigned long offset, size;
@@ -563,6 +565,9 @@ static void iommu_table_setparms_lpar(struct pci_controller *phb,
tbl->it_type = TCE_PCI;
tbl->it_offset = offset >> tbl->it_page_shift;
tbl->it_size = size >> tbl->it_page_shift;
+
+ table_group->tce32_start = offset;
+ table_group->tce32_size = size;
}
struct iommu_table_ops iommu_table_pseries_ops = {
@@ -651,8 +656,38 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size);
}
+#ifdef CONFIG_IOMMU_API
+static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned
+ long *tce, enum dma_data_direction *direction)
+{
+ long rc;
+ unsigned long ioba = (unsigned long) index << tbl->it_page_shift;
+ unsigned long flags, oldtce = 0;
+ u64 proto_tce = iommu_direction_to_tce_perm(*direction);
+ unsigned long newtce = *tce | proto_tce;
+
+ spin_lock_irqsave(&tbl->large_pool.lock, flags);
+
+ rc = plpar_tce_get((u64)tbl->it_index, ioba, &oldtce);
+ if (!rc)
+ rc = plpar_tce_put((u64)tbl->it_index, ioba, newtce);
+
+ if (!rc) {
+ *direction = iommu_tce_direction(oldtce);
+ *tce = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+ }
+
+ spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
+
+ return rc;
+}
+#endif
+
struct iommu_table_ops iommu_table_lpar_multi_ops = {
.set = tce_buildmulti_pSeriesLP,
+#ifdef CONFIG_IOMMU_API
+ .exchange = tce_exchange_pseries,
+#endif
.clear = tce_freemulti_pSeriesLP,
.get = tce_get_pSeriesLP
};
@@ -689,7 +724,8 @@ static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus)
if (!ppci->table_group) {
ppci->table_group = iommu_pseries_alloc_group(ppci->phb->node);
tbl = ppci->table_group->tables[0];
- iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window);
+ iommu_table_setparms_lpar(ppci->phb, pdn, tbl,
+ ppci->table_group, dma_window);
tbl->it_ops = &iommu_table_lpar_multi_ops;
iommu_init_table(tbl, ppci->phb->node);
iommu_register_group(ppci->table_group,
@@ -1143,7 +1179,8 @@ static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev)
if (!pci->table_group) {
pci->table_group = iommu_pseries_alloc_group(pci->phb->node);
tbl = pci->table_group->tables[0];
- iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window);
+ iommu_table_setparms_lpar(pci->phb, pdn, tbl,
+ pci->table_group, dma_window);
tbl->it_ops = &iommu_table_lpar_multi_ops;
iommu_init_table(tbl, pci->phb->node);
iommu_register_group(pci->table_group,
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 8b1fe895daa3..6541d0b03e4c 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -958,3 +958,64 @@ int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data)
return rc;
}
+
+static unsigned long vsid_unscramble(unsigned long vsid, int ssize)
+{
+ unsigned long protovsid;
+ unsigned long va_bits = VA_BITS;
+ unsigned long modinv, vsid_modulus;
+ unsigned long max_mod_inv, tmp_modinv;
+
+ if (!mmu_has_feature(MMU_FTR_68_BIT_VA))
+ va_bits = 65;
+
+ if (ssize == MMU_SEGSIZE_256M) {
+ modinv = VSID_MULINV_256M;
+ vsid_modulus = ((1UL << (va_bits - SID_SHIFT)) - 1);
+ } else {
+ modinv = VSID_MULINV_1T;
+ vsid_modulus = ((1UL << (va_bits - SID_SHIFT_1T)) - 1);
+ }
+
+ /*
+ * vsid outside our range.
+ */
+ if (vsid >= vsid_modulus)
+ return 0;
+
+ /*
+ * If modinv is the modular multiplicate inverse of (x % vsid_modulus)
+ * and vsid = (protovsid * x) % vsid_modulus, then we say:
+ * protovsid = (vsid * modinv) % vsid_modulus
+ */
+
+ /* Check if (vsid * modinv) overflow (63 bits) */
+ max_mod_inv = 0x7fffffffffffffffull / vsid;
+ if (modinv < max_mod_inv)
+ return (vsid * modinv) % vsid_modulus;
+
+ tmp_modinv = modinv/max_mod_inv;
+ modinv %= max_mod_inv;
+
+ protovsid = (((vsid * max_mod_inv) % vsid_modulus) * tmp_modinv) % vsid_modulus;
+ protovsid = (protovsid + vsid * modinv) % vsid_modulus;
+
+ return protovsid;
+}
+
+static int __init reserve_vrma_context_id(void)
+{
+ unsigned long protovsid;
+
+ /*
+ * Reserve context ids which map to reserved virtual addresses. For now
+ * we only reserve the context id which maps to the VRMA VSID. We ignore
+ * the addresses in "ibm,adjunct-virtual-addresses" because we don't
+ * enable adjunct support via the "ibm,client-architecture-support"
+ * interface.
+ */
+ protovsid = vsid_unscramble(VRMA_VSID, MMU_SEGSIZE_1T);
+ hash__reserve_context_id(protovsid >> ESID_BITS_1T);
+ return 0;
+}
+machine_device_initcall(pseries, reserve_vrma_context_id);
diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c
index 904a677208d1..bb70b26334f0 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -386,6 +386,10 @@ int pSeries_system_reset_exception(struct pt_regs *regs)
}
fwnmi_release_errinfo();
}
+
+ if (smp_handle_nmi_ipi(regs))
+ return 1;
+
return 0; /* need to perform reset */
}
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index b4d362ed03a1..b5d86426e97b 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -87,6 +87,10 @@ static void pSeries_show_cpuinfo(struct seq_file *m)
model = of_get_property(root, "model", NULL);
seq_printf(m, "machine\t\t: CHRP %s\n", model);
of_node_put(root);
+ if (radix_enabled())
+ seq_printf(m, "MMU\t\t: Radix\n");
+ else
+ seq_printf(m, "MMU\t\t: Hash\n");
}
/* Initialize firmware assisted non-maskable interrupts if
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index f6f83aeccaaa..52ca6b311d44 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -55,11 +55,6 @@
*/
static cpumask_var_t of_spin_mask;
-/*
- * If we multiplex IPI mechanisms, store the appropriate XICS IPI mechanism here
- */
-static void (*xics_cause_ipi)(int cpu, unsigned long data);
-
/* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */
int smp_query_cpu_stopped(unsigned int pcpu)
{
@@ -143,8 +138,6 @@ static void smp_setup_cpu(int cpu)
{
if (cpu != boot_cpuid)
xics_setup_cpu();
- if (cpu_has_feature(CPU_FTR_DBELL))
- doorbell_setup_this_cpu();
if (firmware_has_feature(FW_FEATURE_SPLPAR))
vpa_init(cpu);
@@ -187,28 +180,50 @@ static int smp_pSeries_kick_cpu(int nr)
return 0;
}
-/* Only used on systems that support multiple IPI mechanisms */
-static void pSeries_cause_ipi_mux(int cpu, unsigned long data)
+static void smp_pseries_cause_ipi(int cpu)
{
- if (cpumask_test_cpu(cpu, cpu_sibling_mask(smp_processor_id())))
- doorbell_cause_ipi(cpu, data);
- else
- xics_cause_ipi(cpu, data);
+ /* POWER9 should not use this handler */
+ if (doorbell_try_core_ipi(cpu))
+ return;
+
+ icp_ops->cause_ipi(cpu);
+}
+
+static int pseries_cause_nmi_ipi(int cpu)
+{
+ int hwcpu;
+
+ if (cpu == NMI_IPI_ALL_OTHERS) {
+ hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
+ } else {
+ if (cpu < 0) {
+ WARN_ONCE(true, "incorrect cpu parameter %d", cpu);
+ return 0;
+ }
+
+ hwcpu = get_hard_smp_processor_id(cpu);
+ }
+
+ if (plapr_signal_sys_reset(hwcpu) == H_SUCCESS)
+ return 1;
+
+ return 0;
}
static __init void pSeries_smp_probe(void)
{
xics_smp_probe();
- if (cpu_has_feature(CPU_FTR_DBELL)) {
- xics_cause_ipi = smp_ops->cause_ipi;
- smp_ops->cause_ipi = pSeries_cause_ipi_mux;
- }
+ if (cpu_has_feature(CPU_FTR_DBELL))
+ smp_ops->cause_ipi = smp_pseries_cause_ipi;
+ else
+ smp_ops->cause_ipi = icp_ops->cause_ipi;
}
static struct smp_ops_t pseries_smp_ops = {
.message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
.cause_ipi = NULL, /* Filled at runtime by pSeries_smp_probe() */
+ .cause_nmi_ipi = pseries_cause_nmi_ipi,
.probe = pSeries_smp_probe,
.kick_cpu = smp_pSeries_kick_cpu,
.setup_cpu = smp_setup_cpu,
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index 720493932486..28b09fd797ec 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -1318,7 +1318,7 @@ static void vio_dev_release(struct device *dev)
struct iommu_table *tbl = get_iommu_table_base(dev);
if (tbl)
- iommu_free_table(tbl, of_node_full_name(dev->of_node));
+ iommu_tce_table_put(tbl);
of_node_put(dev->of_node);
kfree(to_vio_dev(dev));
}
diff --git a/arch/powerpc/sysdev/Kconfig b/arch/powerpc/sysdev/Kconfig
index 52dc165c0efb..caf882e749dc 100644
--- a/arch/powerpc/sysdev/Kconfig
+++ b/arch/powerpc/sysdev/Kconfig
@@ -28,6 +28,7 @@ config PPC_MSI_BITMAP
default y if PPC_POWERNV
source "arch/powerpc/sysdev/xics/Kconfig"
+source "arch/powerpc/sysdev/xive/Kconfig"
config PPC_SCOM
bool
diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
index a254824719f1..c0ae11d4f62f 100644
--- a/arch/powerpc/sysdev/Makefile
+++ b/arch/powerpc/sysdev/Makefile
@@ -71,5 +71,6 @@ obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS) += udbg_memcons.o
subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
obj-$(CONFIG_PPC_XICS) += xics/
+obj-$(CONFIG_PPC_XIVE) += xive/
obj-$(CONFIG_GE_FPGA) += ge/
diff --git a/arch/powerpc/sysdev/scom.c b/arch/powerpc/sysdev/scom.c
index d0e9f178a324..76ea32c1b664 100644
--- a/arch/powerpc/sysdev/scom.c
+++ b/arch/powerpc/sysdev/scom.c
@@ -19,10 +19,9 @@
*/
#include <linux/kernel.h>
-#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/export.h>
-#include <asm/debug.h>
+#include <asm/debugfs.h>
#include <asm/prom.h>
#include <asm/scom.h>
#include <linux/uaccess.h>
diff --git a/arch/powerpc/sysdev/xics/icp-hv.c b/arch/powerpc/sysdev/xics/icp-hv.c
index e7fa26c4ff73..bbc839a98c41 100644
--- a/arch/powerpc/sysdev/xics/icp-hv.c
+++ b/arch/powerpc/sysdev/xics/icp-hv.c
@@ -138,7 +138,7 @@ static void icp_hv_set_cpu_priority(unsigned char cppr)
#ifdef CONFIG_SMP
-static void icp_hv_cause_ipi(int cpu, unsigned long data)
+static void icp_hv_cause_ipi(int cpu)
{
icp_hv_set_qirr(cpu, IPI_PRIORITY);
}
diff --git a/arch/powerpc/sysdev/xics/icp-native.c b/arch/powerpc/sysdev/xics/icp-native.c
index 8a6a043e239b..2bfb9968d562 100644
--- a/arch/powerpc/sysdev/xics/icp-native.c
+++ b/arch/powerpc/sysdev/xics/icp-native.c
@@ -143,19 +143,9 @@ static unsigned int icp_native_get_irq(void)
#ifdef CONFIG_SMP
-static void icp_native_cause_ipi(int cpu, unsigned long data)
+static void icp_native_cause_ipi(int cpu)
{
kvmppc_set_host_ipi(cpu, 1);
-#ifdef CONFIG_PPC_DOORBELL
- if (cpu_has_feature(CPU_FTR_DBELL)) {
- if (cpumask_test_cpu(cpu, cpu_sibling_mask(get_cpu()))) {
- doorbell_cause_ipi(cpu, data);
- put_cpu();
- return;
- }
- put_cpu();
- }
-#endif
icp_native_set_qirr(cpu, IPI_PRIORITY);
}
@@ -168,15 +158,15 @@ void icp_native_cause_ipi_rm(int cpu)
* Need the physical address of the XICS to be
* previously saved in kvm_hstate in the paca.
*/
- unsigned long xics_phys;
+ void __iomem *xics_phys;
/*
* Just like the cause_ipi functions, it is required to
- * include a full barrier (out8 includes a sync) before
- * causing the IPI.
+ * include a full barrier before causing the IPI.
*/
xics_phys = paca[cpu].kvm_hstate.xics_phys;
- out_rm8((u8 *)(xics_phys + XICS_MFRR), IPI_PRIORITY);
+ mb();
+ __raw_rm_writeb(IPI_PRIORITY, xics_phys + XICS_MFRR);
}
#endif
diff --git a/arch/powerpc/sysdev/xics/icp-opal.c b/arch/powerpc/sysdev/xics/icp-opal.c
index b53f80f0b4d8..c71d2ea42627 100644
--- a/arch/powerpc/sysdev/xics/icp-opal.c
+++ b/arch/powerpc/sysdev/xics/icp-opal.c
@@ -126,7 +126,7 @@ static void icp_opal_eoi(struct irq_data *d)
#ifdef CONFIG_SMP
-static void icp_opal_cause_ipi(int cpu, unsigned long data)
+static void icp_opal_cause_ipi(int cpu)
{
int hw_cpu = get_hard_smp_processor_id(cpu);
diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c
index 23efe4e42172..ffe138b8b9dc 100644
--- a/arch/powerpc/sysdev/xics/xics-common.c
+++ b/arch/powerpc/sysdev/xics/xics-common.c
@@ -143,11 +143,11 @@ static void xics_request_ipi(void)
void __init xics_smp_probe(void)
{
- /* Setup cause_ipi callback based on which ICP is used */
- smp_ops->cause_ipi = icp_ops->cause_ipi;
-
/* Register all the IPIs */
xics_request_ipi();
+
+ /* Setup cause_ipi callback based on which ICP is used */
+ smp_ops->cause_ipi = icp_ops->cause_ipi;
}
#endif /* CONFIG_SMP */
diff --git a/arch/powerpc/sysdev/xive/Kconfig b/arch/powerpc/sysdev/xive/Kconfig
new file mode 100644
index 000000000000..12ccd7373d2f
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/Kconfig
@@ -0,0 +1,11 @@
+config PPC_XIVE
+ bool
+ default n
+ select PPC_SMP_MUXED_IPI
+ select HARDIRQS_SW_RESEND
+
+config PPC_XIVE_NATIVE
+ bool
+ default n
+ select PPC_XIVE
+ depends on PPC_POWERNV
diff --git a/arch/powerpc/sysdev/xive/Makefile b/arch/powerpc/sysdev/xive/Makefile
new file mode 100644
index 000000000000..3fab303fc169
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/Makefile
@@ -0,0 +1,4 @@
+subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
+
+obj-y += common.o
+obj-$(CONFIG_PPC_XIVE_NATIVE) += native.o
diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
new file mode 100644
index 000000000000..6a98efb14264
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/common.c
@@ -0,0 +1,1302 @@
+/*
+ * Copyright 2016,2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "xive: " fmt
+
+#include <linux/types.h>
+#include <linux/threads.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/debugfs.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/msi.h>
+
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/machdep.h>
+#include <asm/irq.h>
+#include <asm/errno.h>
+#include <asm/xive.h>
+#include <asm/xive-regs.h>
+#include <asm/xmon.h>
+
+#include "xive-internal.h"
+
+#undef DEBUG_FLUSH
+#undef DEBUG_ALL
+
+#ifdef DEBUG_ALL
+#define DBG_VERBOSE(fmt...) pr_devel(fmt)
+#else
+#define DBG_VERBOSE(fmt...) do { } while(0)
+#endif
+
+bool __xive_enabled;
+bool xive_cmdline_disabled;
+
+/* We use only one priority for now */
+static u8 xive_irq_priority;
+
+/* TIMA */
+void __iomem *xive_tima;
+u32 xive_tima_offset;
+
+/* Backend ops */
+static const struct xive_ops *xive_ops;
+
+/* Our global interrupt domain */
+static struct irq_domain *xive_irq_domain;
+
+#ifdef CONFIG_SMP
+/* The IPIs all use the same logical irq number */
+static u32 xive_ipi_irq;
+#endif
+
+/* Xive state for each CPU */
+static DEFINE_PER_CPU(struct xive_cpu *, xive_cpu);
+
+/*
+ * A "disabled" interrupt should never fire, to catch problems
+ * we set its logical number to this
+ */
+#define XIVE_BAD_IRQ 0x7fffffff
+#define XIVE_MAX_IRQ (XIVE_BAD_IRQ - 1)
+
+/* An invalid CPU target */
+#define XIVE_INVALID_TARGET (-1)
+
+/*
+ * Read the next entry in a queue, return its content if it's valid
+ * or 0 if there is no new entry.
+ *
+ * The queue pointer is moved forward unless "just_peek" is set
+ */
+static u32 xive_read_eq(struct xive_q *q, bool just_peek)
+{
+ u32 cur;
+
+ if (!q->qpage)
+ return 0;
+ cur = be32_to_cpup(q->qpage + q->idx);
+
+ /* Check valid bit (31) vs current toggle polarity */
+ if ((cur >> 31) == q->toggle)
+ return 0;
+
+ /* If consuming from the queue ... */
+ if (!just_peek) {
+ /* Next entry */
+ q->idx = (q->idx + 1) & q->msk;
+
+ /* Wrap around: flip valid toggle */
+ if (q->idx == 0)
+ q->toggle ^= 1;
+ }
+ /* Mask out the valid bit (31) */
+ return cur & 0x7fffffff;
+}
+
+/*
+ * Scans all the queue that may have interrupts in them
+ * (based on "pending_prio") in priority order until an
+ * interrupt is found or all the queues are empty.
+ *
+ * Then updates the CPPR (Current Processor Priority
+ * Register) based on the most favored interrupt found
+ * (0xff if none) and return what was found (0 if none).
+ *
+ * If just_peek is set, return the most favored pending
+ * interrupt if any but don't update the queue pointers.
+ *
+ * Note: This function can operate generically on any number
+ * of queues (up to 8). The current implementation of the XIVE
+ * driver only uses a single queue however.
+ *
+ * Note2: This will also "flush" "the pending_count" of a queue
+ * into the "count" when that queue is observed to be empty.
+ * This is used to keep track of the amount of interrupts
+ * targetting a queue. When an interrupt is moved away from
+ * a queue, we only decrement that queue count once the queue
+ * has been observed empty to avoid races.
+ */
+static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
+{
+ u32 irq = 0;
+ u8 prio;
+
+ /* Find highest pending priority */
+ while (xc->pending_prio != 0) {
+ struct xive_q *q;
+
+ prio = ffs(xc->pending_prio) - 1;
+ DBG_VERBOSE("scan_irq: trying prio %d\n", prio);
+
+ /* Try to fetch */
+ irq = xive_read_eq(&xc->queue[prio], just_peek);
+
+ /* Found something ? That's it */
+ if (irq)
+ break;
+
+ /* Clear pending bits */
+ xc->pending_prio &= ~(1 << prio);
+
+ /*
+ * Check if the queue count needs adjusting due to
+ * interrupts being moved away. See description of
+ * xive_dec_target_count()
+ */
+ q = &xc->queue[prio];
+ if (atomic_read(&q->pending_count)) {
+ int p = atomic_xchg(&q->pending_count, 0);
+ if (p) {
+ WARN_ON(p > atomic_read(&q->count));
+ atomic_sub(p, &q->count);
+ }
+ }
+ }
+
+ /* If nothing was found, set CPPR to 0xff */
+ if (irq == 0)
+ prio = 0xff;
+
+ /* Update HW CPPR to match if necessary */
+ if (prio != xc->cppr) {
+ DBG_VERBOSE("scan_irq: adjusting CPPR to %d\n", prio);
+ xc->cppr = prio;
+ out_8(xive_tima + xive_tima_offset + TM_CPPR, prio);
+ }
+
+ return irq;
+}
+
+/*
+ * This is used to perform the magic loads from an ESB
+ * described in xive.h
+ */
+static u8 xive_poke_esb(struct xive_irq_data *xd, u32 offset)
+{
+ u64 val;
+
+ /* Handle HW errata */
+ if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
+ offset |= offset << 4;
+
+ val = in_be64(xd->eoi_mmio + offset);
+
+ return (u8)val;
+}
+
+#ifdef CONFIG_XMON
+static void xive_dump_eq(const char *name, struct xive_q *q)
+{
+ u32 i0, i1, idx;
+
+ if (!q->qpage)
+ return;
+ idx = q->idx;
+ i0 = be32_to_cpup(q->qpage + idx);
+ idx = (idx + 1) & q->msk;
+ i1 = be32_to_cpup(q->qpage + idx);
+ xmon_printf(" %s Q T=%d %08x %08x ...\n", name,
+ q->toggle, i0, i1);
+}
+
+void xmon_xive_do_dump(int cpu)
+{
+ struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
+
+ xmon_printf("XIVE state for CPU %d:\n", cpu);
+ xmon_printf(" pp=%02x cppr=%02x\n", xc->pending_prio, xc->cppr);
+ xive_dump_eq("IRQ", &xc->queue[xive_irq_priority]);
+#ifdef CONFIG_SMP
+ {
+ u64 val = xive_poke_esb(&xc->ipi_data, XIVE_ESB_GET);
+ xmon_printf(" IPI state: %x:%c%c\n", xc->hw_ipi,
+ val & XIVE_ESB_VAL_P ? 'P' : 'p',
+ val & XIVE_ESB_VAL_P ? 'Q' : 'q');
+ }
+#endif
+}
+#endif /* CONFIG_XMON */
+
+static unsigned int xive_get_irq(void)
+{
+ struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+ u32 irq;
+
+ /*
+ * This can be called either as a result of a HW interrupt or
+ * as a "replay" because EOI decided there was still something
+ * in one of the queues.
+ *
+ * First we perform an ACK cycle in order to update our mask
+ * of pending priorities. This will also have the effect of
+ * updating the CPPR to the most favored pending interrupts.
+ *
+ * In the future, if we have a way to differenciate a first
+ * entry (on HW interrupt) from a replay triggered by EOI,
+ * we could skip this on replays unless we soft-mask tells us
+ * that a new HW interrupt occurred.
+ */
+ xive_ops->update_pending(xc);
+
+ DBG_VERBOSE("get_irq: pending=%02x\n", xc->pending_prio);
+
+ /* Scan our queue(s) for interrupts */
+ irq = xive_scan_interrupts(xc, false);
+
+ DBG_VERBOSE("get_irq: got irq 0x%x, new pending=0x%02x\n",
+ irq, xc->pending_prio);
+
+ /* Return pending interrupt if any */
+ if (irq == XIVE_BAD_IRQ)
+ return 0;
+ return irq;
+}
+
+/*
+ * After EOI'ing an interrupt, we need to re-check the queue
+ * to see if another interrupt is pending since multiple
+ * interrupts can coalesce into a single notification to the
+ * CPU.
+ *
+ * If we find that there is indeed more in there, we call
+ * force_external_irq_replay() to make Linux synthetize an
+ * external interrupt on the next call to local_irq_restore().
+ */
+static void xive_do_queue_eoi(struct xive_cpu *xc)
+{
+ if (xive_scan_interrupts(xc, true) != 0) {
+ DBG_VERBOSE("eoi: pending=0x%02x\n", xc->pending_prio);
+ force_external_irq_replay();
+ }
+}
+
+/*
+ * EOI an interrupt at the source. There are several methods
+ * to do this depending on the HW version and source type
+ */
+void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
+{
+ /* If the XIVE supports the new "store EOI facility, use it */
+ if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
+ out_be64(xd->eoi_mmio, 0);
+ else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) {
+ /*
+ * The FW told us to call it. This happens for some
+ * interrupt sources that need additional HW whacking
+ * beyond the ESB manipulation. For example LPC interrupts
+ * on P9 DD1.0 need a latch to be clared in the LPC bridge
+ * itself. The Firmware will take care of it.
+ */
+ if (WARN_ON_ONCE(!xive_ops->eoi))
+ return;
+ xive_ops->eoi(hw_irq);
+ } else {
+ u8 eoi_val;
+
+ /*
+ * Otherwise for EOI, we use the special MMIO that does
+ * a clear of both P and Q and returns the old Q,
+ * except for LSIs where we use the "EOI cycle" special
+ * load.
+ *
+ * This allows us to then do a re-trigger if Q was set
+ * rather than synthesizing an interrupt in software
+ *
+ * For LSIs, using the HW EOI cycle works around a problem
+ * on P9 DD1 PHBs where the other ESB accesses don't work
+ * properly.
+ */
+ if (xd->flags & XIVE_IRQ_FLAG_LSI)
+ in_be64(xd->eoi_mmio);
+ else {
+ eoi_val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_00);
+ DBG_VERBOSE("eoi_val=%x\n", offset, eoi_val);
+
+ /* Re-trigger if needed */
+ if ((eoi_val & XIVE_ESB_VAL_Q) && xd->trig_mmio)
+ out_be64(xd->trig_mmio, 0);
+ }
+ }
+}
+
+/* irq_chip eoi callback */
+static void xive_irq_eoi(struct irq_data *d)
+{
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+ struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+
+ DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n",
+ d->irq, irqd_to_hwirq(d), xc->pending_prio);
+
+ /* EOI the source if it hasn't been disabled */
+ if (!irqd_irq_disabled(d))
+ xive_do_source_eoi(irqd_to_hwirq(d), xd);
+
+ /*
+ * Clear saved_p to indicate that it's no longer occupying
+ * a queue slot on the target queue
+ */
+ xd->saved_p = false;
+
+ /* Check for more work in the queue */
+ xive_do_queue_eoi(xc);
+}
+
+/*
+ * Helper used to mask and unmask an interrupt source. This
+ * is only called for normal interrupts that do not require
+ * masking/unmasking via firmware.
+ */
+static void xive_do_source_set_mask(struct xive_irq_data *xd,
+ bool mask)
+{
+ u64 val;
+
+ /*
+ * If the interrupt had P set, it may be in a queue.
+ *
+ * We need to make sure we don't re-enable it until it
+ * has been fetched from that queue and EOId. We keep
+ * a copy of that P state and use it to restore the
+ * ESB accordingly on unmask.
+ */
+ if (mask) {
+ val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_01);
+ xd->saved_p = !!(val & XIVE_ESB_VAL_P);
+ } else if (xd->saved_p)
+ xive_poke_esb(xd, XIVE_ESB_SET_PQ_10);
+ else
+ xive_poke_esb(xd, XIVE_ESB_SET_PQ_00);
+}
+
+/*
+ * Try to chose "cpu" as a new interrupt target. Increments
+ * the queue accounting for that target if it's not already
+ * full.
+ */
+static bool xive_try_pick_target(int cpu)
+{
+ struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
+ struct xive_q *q = &xc->queue[xive_irq_priority];
+ int max;
+
+ /*
+ * Calculate max number of interrupts in that queue.
+ *
+ * We leave a gap of 1 just in case...
+ */
+ max = (q->msk + 1) - 1;
+ return !!atomic_add_unless(&q->count, 1, max);
+}
+
+/*
+ * Un-account an interrupt for a target CPU. We don't directly
+ * decrement q->count since the interrupt might still be present
+ * in the queue.
+ *
+ * Instead increment a separate counter "pending_count" which
+ * will be substracted from "count" later when that CPU observes
+ * the queue to be empty.
+ */
+static void xive_dec_target_count(int cpu)
+{
+ struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
+ struct xive_q *q = &xc->queue[xive_irq_priority];
+
+ if (unlikely(WARN_ON(cpu < 0 || !xc))) {
+ pr_err("%s: cpu=%d xc=%p\n", __func__, cpu, xc);
+ return;
+ }
+
+ /*
+ * We increment the "pending count" which will be used
+ * to decrement the target queue count whenever it's next
+ * processed and found empty. This ensure that we don't
+ * decrement while we still have the interrupt there
+ * occupying a slot.
+ */
+ atomic_inc(&q->pending_count);
+}
+
+/* Find a tentative CPU target in a CPU mask */
+static int xive_find_target_in_mask(const struct cpumask *mask,
+ unsigned int fuzz)
+{
+ int cpu, first, num, i;
+
+ /* Pick up a starting point CPU in the mask based on fuzz */
+ num = cpumask_weight(mask);
+ first = fuzz % num;
+
+ /* Locate it */
+ cpu = cpumask_first(mask);
+ for (i = 0; i < first && cpu < nr_cpu_ids; i++)
+ cpu = cpumask_next(cpu, mask);
+
+ /* Sanity check */
+ if (WARN_ON(cpu >= nr_cpu_ids))
+ cpu = cpumask_first(cpu_online_mask);
+
+ /* Remember first one to handle wrap-around */
+ first = cpu;
+
+ /*
+ * Now go through the entire mask until we find a valid
+ * target.
+ */
+ for (;;) {
+ /*
+ * We re-check online as the fallback case passes us
+ * an untested affinity mask
+ */
+ if (cpu_online(cpu) && xive_try_pick_target(cpu))
+ return cpu;
+ cpu = cpumask_next(cpu, mask);
+ if (cpu == first)
+ break;
+ /* Wrap around */
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(mask);
+ }
+ return -1;
+}
+
+/*
+ * Pick a target CPU for an interrupt. This is done at
+ * startup or if the affinity is changed in a way that
+ * invalidates the current target.
+ */
+static int xive_pick_irq_target(struct irq_data *d,
+ const struct cpumask *affinity)
+{
+ static unsigned int fuzz;
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+ cpumask_var_t mask;
+ int cpu = -1;
+
+ /*
+ * If we have chip IDs, first we try to build a mask of
+ * CPUs matching the CPU and find a target in there
+ */
+ if (xd->src_chip != XIVE_INVALID_CHIP_ID &&
+ zalloc_cpumask_var(&mask, GFP_ATOMIC)) {
+ /* Build a mask of matching chip IDs */
+ for_each_cpu_and(cpu, affinity, cpu_online_mask) {
+ struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
+ if (xc->chip_id == xd->src_chip)
+ cpumask_set_cpu(cpu, mask);
+ }
+ /* Try to find a target */
+ if (cpumask_empty(mask))
+ cpu = -1;
+ else
+ cpu = xive_find_target_in_mask(mask, fuzz++);
+ free_cpumask_var(mask);
+ if (cpu >= 0)
+ return cpu;
+ fuzz--;
+ }
+
+ /* No chip IDs, fallback to using the affinity mask */
+ return xive_find_target_in_mask(affinity, fuzz++);
+}
+
+static unsigned int xive_irq_startup(struct irq_data *d)
+{
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+ unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+ int target, rc;
+
+ pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n",
+ d->irq, hw_irq, d);
+
+#ifdef CONFIG_PCI_MSI
+ /*
+ * The generic MSI code returns with the interrupt disabled on the
+ * card, using the MSI mask bits. Firmware doesn't appear to unmask
+ * at that level, so we do it here by hand.
+ */
+ if (irq_data_get_msi_desc(d))
+ pci_msi_unmask_irq(d);
+#endif
+
+ /* Pick a target */
+ target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d));
+ if (target == XIVE_INVALID_TARGET) {
+ /* Try again breaking affinity */
+ target = xive_pick_irq_target(d, cpu_online_mask);
+ if (target == XIVE_INVALID_TARGET)
+ return -ENXIO;
+ pr_warn("irq %d started with broken affinity\n", d->irq);
+ }
+
+ /* Sanity check */
+ if (WARN_ON(target == XIVE_INVALID_TARGET ||
+ target >= nr_cpu_ids))
+ target = smp_processor_id();
+
+ xd->target = target;
+
+ /*
+ * Configure the logical number to be the Linux IRQ number
+ * and set the target queue
+ */
+ rc = xive_ops->configure_irq(hw_irq,
+ get_hard_smp_processor_id(target),
+ xive_irq_priority, d->irq);
+ if (rc)
+ return rc;
+
+ /* Unmask the ESB */
+ xive_do_source_set_mask(xd, false);
+
+ return 0;
+}
+
+static void xive_irq_shutdown(struct irq_data *d)
+{
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+ unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+
+ pr_devel("xive_irq_shutdown: irq %d [0x%x] data @%p\n",
+ d->irq, hw_irq, d);
+
+ if (WARN_ON(xd->target == XIVE_INVALID_TARGET))
+ return;
+
+ /* Mask the interrupt at the source */
+ xive_do_source_set_mask(xd, true);
+
+ /*
+ * The above may have set saved_p. We clear it otherwise it
+ * will prevent re-enabling later on. It is ok to forget the
+ * fact that the interrupt might be in a queue because we are
+ * accounting that already in xive_dec_target_count() and will
+ * be re-routing it to a new queue with proper accounting when
+ * it's started up again
+ */
+ xd->saved_p = false;
+
+ /*
+ * Mask the interrupt in HW in the IVT/EAS and set the number
+ * to be the "bad" IRQ number
+ */
+ xive_ops->configure_irq(hw_irq,
+ get_hard_smp_processor_id(xd->target),
+ 0xff, XIVE_BAD_IRQ);
+
+ xive_dec_target_count(xd->target);
+ xd->target = XIVE_INVALID_TARGET;
+}
+
+static void xive_irq_unmask(struct irq_data *d)
+{
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+ pr_devel("xive_irq_unmask: irq %d data @%p\n", d->irq, xd);
+
+ /*
+ * This is a workaround for PCI LSI problems on P9, for
+ * these, we call FW to set the mask. The problems might
+ * be fixed by P9 DD2.0, if that is the case, firmware
+ * will no longer set that flag.
+ */
+ if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) {
+ unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+ xive_ops->configure_irq(hw_irq,
+ get_hard_smp_processor_id(xd->target),
+ xive_irq_priority, d->irq);
+ return;
+ }
+
+ xive_do_source_set_mask(xd, false);
+}
+
+static void xive_irq_mask(struct irq_data *d)
+{
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+ pr_devel("xive_irq_mask: irq %d data @%p\n", d->irq, xd);
+
+ /*
+ * This is a workaround for PCI LSI problems on P9, for
+ * these, we call OPAL to set the mask. The problems might
+ * be fixed by P9 DD2.0, if that is the case, firmware
+ * will no longer set that flag.
+ */
+ if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) {
+ unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+ xive_ops->configure_irq(hw_irq,
+ get_hard_smp_processor_id(xd->target),
+ 0xff, d->irq);
+ return;
+ }
+
+ xive_do_source_set_mask(xd, true);
+}
+
+static int xive_irq_set_affinity(struct irq_data *d,
+ const struct cpumask *cpumask,
+ bool force)
+{
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+ unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+ u32 target, old_target;
+ int rc = 0;
+
+ pr_devel("xive_irq_set_affinity: irq %d\n", d->irq);
+
+ /* Is this valid ? */
+ if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids)
+ return -EINVAL;
+
+ /*
+ * If existing target is already in the new mask, and is
+ * online then do nothing.
+ */
+ if (xd->target != XIVE_INVALID_TARGET &&
+ cpu_online(xd->target) &&
+ cpumask_test_cpu(xd->target, cpumask))
+ return IRQ_SET_MASK_OK;
+
+ /* Pick a new target */
+ target = xive_pick_irq_target(d, cpumask);
+
+ /* No target found */
+ if (target == XIVE_INVALID_TARGET)
+ return -ENXIO;
+
+ /* Sanity check */
+ if (WARN_ON(target >= nr_cpu_ids))
+ target = smp_processor_id();
+
+ old_target = xd->target;
+
+ rc = xive_ops->configure_irq(hw_irq,
+ get_hard_smp_processor_id(target),
+ xive_irq_priority, d->irq);
+ if (rc < 0) {
+ pr_err("Error %d reconfiguring irq %d\n", rc, d->irq);
+ return rc;
+ }
+
+ pr_devel(" target: 0x%x\n", target);
+ xd->target = target;
+
+ /* Give up previous target */
+ if (old_target != XIVE_INVALID_TARGET)
+ xive_dec_target_count(old_target);
+
+ return IRQ_SET_MASK_OK;
+}
+
+static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type)
+{
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+ /*
+ * We only support these. This has really no effect other than setting
+ * the corresponding descriptor bits mind you but those will in turn
+ * affect the resend function when re-enabling an edge interrupt.
+ *
+ * Set set the default to edge as explained in map().
+ */
+ if (flow_type == IRQ_TYPE_DEFAULT || flow_type == IRQ_TYPE_NONE)
+ flow_type = IRQ_TYPE_EDGE_RISING;
+
+ if (flow_type != IRQ_TYPE_EDGE_RISING &&
+ flow_type != IRQ_TYPE_LEVEL_LOW)
+ return -EINVAL;
+
+ irqd_set_trigger_type(d, flow_type);
+
+ /*
+ * Double check it matches what the FW thinks
+ *
+ * NOTE: We don't know yet if the PAPR interface will provide
+ * the LSI vs MSI information apart from the device-tree so
+ * this check might have to move into an optional backend call
+ * that is specific to the native backend
+ */
+ if ((flow_type == IRQ_TYPE_LEVEL_LOW) !=
+ !!(xd->flags & XIVE_IRQ_FLAG_LSI)) {
+ pr_warn("Interrupt %d (HW 0x%x) type mismatch, Linux says %s, FW says %s\n",
+ d->irq, (u32)irqd_to_hwirq(d),
+ (flow_type == IRQ_TYPE_LEVEL_LOW) ? "Level" : "Edge",
+ (xd->flags & XIVE_IRQ_FLAG_LSI) ? "Level" : "Edge");
+ }
+
+ return IRQ_SET_MASK_OK_NOCOPY;
+}
+
+static int xive_irq_retrigger(struct irq_data *d)
+{
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+ /* This should be only for MSIs */
+ if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI))
+ return 0;
+
+ /*
+ * To perform a retrigger, we first set the PQ bits to
+ * 11, then perform an EOI.
+ */
+ xive_poke_esb(xd, XIVE_ESB_SET_PQ_11);
+
+ /*
+ * Note: We pass "0" to the hw_irq argument in order to
+ * avoid calling into the backend EOI code which we don't
+ * want to do in the case of a re-trigger. Backends typically
+ * only do EOI for LSIs anyway.
+ */
+ xive_do_source_eoi(0, xd);
+
+ return 1;
+}
+
+static struct irq_chip xive_irq_chip = {
+ .name = "XIVE-IRQ",
+ .irq_startup = xive_irq_startup,
+ .irq_shutdown = xive_irq_shutdown,
+ .irq_eoi = xive_irq_eoi,
+ .irq_mask = xive_irq_mask,
+ .irq_unmask = xive_irq_unmask,
+ .irq_set_affinity = xive_irq_set_affinity,
+ .irq_set_type = xive_irq_set_type,
+ .irq_retrigger = xive_irq_retrigger,
+};
+
+bool is_xive_irq(struct irq_chip *chip)
+{
+ return chip == &xive_irq_chip;
+}
+
+void xive_cleanup_irq_data(struct xive_irq_data *xd)
+{
+ if (xd->eoi_mmio) {
+ iounmap(xd->eoi_mmio);
+ if (xd->eoi_mmio == xd->trig_mmio)
+ xd->trig_mmio = NULL;
+ xd->eoi_mmio = NULL;
+ }
+ if (xd->trig_mmio) {
+ iounmap(xd->trig_mmio);
+ xd->trig_mmio = NULL;
+ }
+}
+
+static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw)
+{
+ struct xive_irq_data *xd;
+ int rc;
+
+ xd = kzalloc(sizeof(struct xive_irq_data), GFP_KERNEL);
+ if (!xd)
+ return -ENOMEM;
+ rc = xive_ops->populate_irq_data(hw, xd);
+ if (rc) {
+ kfree(xd);
+ return rc;
+ }
+ xd->target = XIVE_INVALID_TARGET;
+ irq_set_handler_data(virq, xd);
+
+ return 0;
+}
+
+static void xive_irq_free_data(unsigned int virq)
+{
+ struct xive_irq_data *xd = irq_get_handler_data(virq);
+
+ if (!xd)
+ return;
+ irq_set_handler_data(virq, NULL);
+ xive_cleanup_irq_data(xd);
+ kfree(xd);
+}
+
+#ifdef CONFIG_SMP
+
+static void xive_cause_ipi(int cpu)
+{
+ struct xive_cpu *xc;
+ struct xive_irq_data *xd;
+
+ xc = per_cpu(xive_cpu, cpu);
+
+ DBG_VERBOSE("IPI CPU %d -> %d (HW IRQ 0x%x)\n",
+ smp_processor_id(), cpu, xc->hw_ipi);
+
+ xd = &xc->ipi_data;
+ if (WARN_ON(!xd->trig_mmio))
+ return;
+ out_be64(xd->trig_mmio, 0);
+}
+
+static irqreturn_t xive_muxed_ipi_action(int irq, void *dev_id)
+{
+ return smp_ipi_demux();
+}
+
+static void xive_ipi_eoi(struct irq_data *d)
+{
+ struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+
+ /* Handle possible race with unplug and drop stale IPIs */
+ if (!xc)
+ return;
+ xive_do_source_eoi(xc->hw_ipi, &xc->ipi_data);
+ xive_do_queue_eoi(xc);
+}
+
+static void xive_ipi_do_nothing(struct irq_data *d)
+{
+ /*
+ * Nothing to do, we never mask/unmask IPIs, but the callback
+ * has to exist for the struct irq_chip.
+ */
+}
+
+static struct irq_chip xive_ipi_chip = {
+ .name = "XIVE-IPI",
+ .irq_eoi = xive_ipi_eoi,
+ .irq_mask = xive_ipi_do_nothing,
+ .irq_unmask = xive_ipi_do_nothing,
+};
+
+static void __init xive_request_ipi(void)
+{
+ unsigned int virq;
+
+ /*
+ * Initialization failed, move on, we might manage to
+ * reach the point where we display our errors before
+ * the system falls appart
+ */
+ if (!xive_irq_domain)
+ return;
+
+ /* Initialize it */
+ virq = irq_create_mapping(xive_irq_domain, 0);
+ xive_ipi_irq = virq;
+
+ WARN_ON(request_irq(virq, xive_muxed_ipi_action,
+ IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
+}
+
+static int xive_setup_cpu_ipi(unsigned int cpu)
+{
+ struct xive_cpu *xc;
+ int rc;
+
+ pr_debug("Setting up IPI for CPU %d\n", cpu);
+
+ xc = per_cpu(xive_cpu, cpu);
+
+ /* Check if we are already setup */
+ if (xc->hw_ipi != 0)
+ return 0;
+
+ /* Grab an IPI from the backend, this will populate xc->hw_ipi */
+ if (xive_ops->get_ipi(cpu, xc))
+ return -EIO;
+
+ /*
+ * Populate the IRQ data in the xive_cpu structure and
+ * configure the HW / enable the IPIs.
+ */
+ rc = xive_ops->populate_irq_data(xc->hw_ipi, &xc->ipi_data);
+ if (rc) {
+ pr_err("Failed to populate IPI data on CPU %d\n", cpu);
+ return -EIO;
+ }
+ rc = xive_ops->configure_irq(xc->hw_ipi,
+ get_hard_smp_processor_id(cpu),
+ xive_irq_priority, xive_ipi_irq);
+ if (rc) {
+ pr_err("Failed to map IPI CPU %d\n", cpu);
+ return -EIO;
+ }
+ pr_devel("CPU %d HW IPI %x, virq %d, trig_mmio=%p\n", cpu,
+ xc->hw_ipi, xive_ipi_irq, xc->ipi_data.trig_mmio);
+
+ /* Unmask it */
+ xive_do_source_set_mask(&xc->ipi_data, false);
+
+ return 0;
+}
+
+static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc)
+{
+ /* Disable the IPI and free the IRQ data */
+
+ /* Already cleaned up ? */
+ if (xc->hw_ipi == 0)
+ return;
+
+ /* Mask the IPI */
+ xive_do_source_set_mask(&xc->ipi_data, true);
+
+ /*
+ * Note: We don't call xive_cleanup_irq_data() to free
+ * the mappings as this is called from an IPI on kexec
+ * which is not a safe environment to call iounmap()
+ */
+
+ /* Deconfigure/mask in the backend */
+ xive_ops->configure_irq(xc->hw_ipi, hard_smp_processor_id(),
+ 0xff, xive_ipi_irq);
+
+ /* Free the IPIs in the backend */
+ xive_ops->put_ipi(cpu, xc);
+}
+
+void __init xive_smp_probe(void)
+{
+ smp_ops->cause_ipi = xive_cause_ipi;
+
+ /* Register the IPI */
+ xive_request_ipi();
+
+ /* Allocate and setup IPI for the boot CPU */
+ xive_setup_cpu_ipi(smp_processor_id());
+}
+
+#endif /* CONFIG_SMP */
+
+static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq,
+ irq_hw_number_t hw)
+{
+ int rc;
+
+ /*
+ * Mark interrupts as edge sensitive by default so that resend
+ * actually works. Will fix that up below if needed.
+ */
+ irq_clear_status_flags(virq, IRQ_LEVEL);
+
+#ifdef CONFIG_SMP
+ /* IPIs are special and come up with HW number 0 */
+ if (hw == 0) {
+ /*
+ * IPIs are marked per-cpu. We use separate HW interrupts under
+ * the hood but associated with the same "linux" interrupt
+ */
+ irq_set_chip_and_handler(virq, &xive_ipi_chip,
+ handle_percpu_irq);
+ return 0;
+ }
+#endif
+
+ rc = xive_irq_alloc_data(virq, hw);
+ if (rc)
+ return rc;
+
+ irq_set_chip_and_handler(virq, &xive_irq_chip, handle_fasteoi_irq);
+
+ return 0;
+}
+
+static void xive_irq_domain_unmap(struct irq_domain *d, unsigned int virq)
+{
+ struct irq_data *data = irq_get_irq_data(virq);
+ unsigned int hw_irq;
+
+ /* XXX Assign BAD number */
+ if (!data)
+ return;
+ hw_irq = (unsigned int)irqd_to_hwirq(data);
+ if (hw_irq)
+ xive_irq_free_data(virq);
+}
+
+static int xive_irq_domain_xlate(struct irq_domain *h, struct device_node *ct,
+ const u32 *intspec, unsigned int intsize,
+ irq_hw_number_t *out_hwirq, unsigned int *out_flags)
+
+{
+ *out_hwirq = intspec[0];
+
+ /*
+ * If intsize is at least 2, we look for the type in the second cell,
+ * we assume the LSB indicates a level interrupt.
+ */
+ if (intsize > 1) {
+ if (intspec[1] & 1)
+ *out_flags = IRQ_TYPE_LEVEL_LOW;
+ else
+ *out_flags = IRQ_TYPE_EDGE_RISING;
+ } else
+ *out_flags = IRQ_TYPE_LEVEL_LOW;
+
+ return 0;
+}
+
+static int xive_irq_domain_match(struct irq_domain *h, struct device_node *node,
+ enum irq_domain_bus_token bus_token)
+{
+ return xive_ops->match(node);
+}
+
+static const struct irq_domain_ops xive_irq_domain_ops = {
+ .match = xive_irq_domain_match,
+ .map = xive_irq_domain_map,
+ .unmap = xive_irq_domain_unmap,
+ .xlate = xive_irq_domain_xlate,
+};
+
+static void __init xive_init_host(void)
+{
+ xive_irq_domain = irq_domain_add_nomap(NULL, XIVE_MAX_IRQ,
+ &xive_irq_domain_ops, NULL);
+ if (WARN_ON(xive_irq_domain == NULL))
+ return;
+ irq_set_default_host(xive_irq_domain);
+}
+
+static void xive_cleanup_cpu_queues(unsigned int cpu, struct xive_cpu *xc)
+{
+ if (xc->queue[xive_irq_priority].qpage)
+ xive_ops->cleanup_queue(cpu, xc, xive_irq_priority);
+}
+
+static int xive_setup_cpu_queues(unsigned int cpu, struct xive_cpu *xc)
+{
+ int rc = 0;
+
+ /* We setup 1 queues for now with a 64k page */
+ if (!xc->queue[xive_irq_priority].qpage)
+ rc = xive_ops->setup_queue(cpu, xc, xive_irq_priority);
+
+ return rc;
+}
+
+static int xive_prepare_cpu(unsigned int cpu)
+{
+ struct xive_cpu *xc;
+
+ xc = per_cpu(xive_cpu, cpu);
+ if (!xc) {
+ struct device_node *np;
+
+ xc = kzalloc_node(sizeof(struct xive_cpu),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!xc)
+ return -ENOMEM;
+ np = of_get_cpu_node(cpu, NULL);
+ if (np)
+ xc->chip_id = of_get_ibm_chip_id(np);
+ of_node_put(np);
+
+ per_cpu(xive_cpu, cpu) = xc;
+ }
+
+ /* Setup EQs if not already */
+ return xive_setup_cpu_queues(cpu, xc);
+}
+
+static void xive_setup_cpu(void)
+{
+ struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+
+ /* Debug: Dump the TM state */
+ pr_devel("CPU %d [HW 0x%02x] VT=%02x\n",
+ smp_processor_id(), hard_smp_processor_id(),
+ in_8(xive_tima + xive_tima_offset + TM_WORD2));
+
+ /* The backend might have additional things to do */
+ if (xive_ops->setup_cpu)
+ xive_ops->setup_cpu(smp_processor_id(), xc);
+
+ /* Set CPPR to 0xff to enable flow of interrupts */
+ xc->cppr = 0xff;
+ out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff);
+}
+
+#ifdef CONFIG_SMP
+void xive_smp_setup_cpu(void)
+{
+ pr_devel("SMP setup CPU %d\n", smp_processor_id());
+
+ /* This will have already been done on the boot CPU */
+ if (smp_processor_id() != boot_cpuid)
+ xive_setup_cpu();
+
+}
+
+int xive_smp_prepare_cpu(unsigned int cpu)
+{
+ int rc;
+
+ /* Allocate per-CPU data and queues */
+ rc = xive_prepare_cpu(cpu);
+ if (rc)
+ return rc;
+
+ /* Allocate and setup IPI for the new CPU */
+ return xive_setup_cpu_ipi(cpu);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc)
+{
+ u32 irq;
+
+ /* We assume local irqs are disabled */
+ WARN_ON(!irqs_disabled());
+
+ /* Check what's already in the CPU queue */
+ while ((irq = xive_scan_interrupts(xc, false)) != 0) {
+ /*
+ * We need to re-route that interrupt to its new destination.
+ * First get and lock the descriptor
+ */
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_data *d = irq_desc_get_irq_data(desc);
+ struct xive_irq_data *xd;
+ unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+
+ /*
+ * Ignore anything that isn't a XIVE irq and ignore
+ * IPIs, so can just be dropped.
+ */
+ if (d->domain != xive_irq_domain || hw_irq == 0)
+ continue;
+
+ /*
+ * The IRQ should have already been re-routed, it's just a
+ * stale in the old queue, so re-trigger it in order to make
+ * it reach is new destination.
+ */
+#ifdef DEBUG_FLUSH
+ pr_info("CPU %d: Got irq %d while offline, re-sending...\n",
+ cpu, irq);
+#endif
+ raw_spin_lock(&desc->lock);
+ xd = irq_desc_get_handler_data(desc);
+
+ /*
+ * For LSIs, we EOI, this will cause a resend if it's
+ * still asserted. Otherwise do an MSI retrigger.
+ */
+ if (xd->flags & XIVE_IRQ_FLAG_LSI)
+ xive_do_source_eoi(irqd_to_hwirq(d), xd);
+ else
+ xive_irq_retrigger(d);
+
+ raw_spin_unlock(&desc->lock);
+ }
+}
+
+void xive_smp_disable_cpu(void)
+{
+ struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+ unsigned int cpu = smp_processor_id();
+
+ /* Migrate interrupts away from the CPU */
+ irq_migrate_all_off_this_cpu();
+
+ /* Set CPPR to 0 to disable flow of interrupts */
+ xc->cppr = 0;
+ out_8(xive_tima + xive_tima_offset + TM_CPPR, 0);
+
+ /* Flush everything still in the queue */
+ xive_flush_cpu_queue(cpu, xc);
+
+ /* Re-enable CPPR */
+ xc->cppr = 0xff;
+ out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff);
+}
+
+void xive_flush_interrupt(void)
+{
+ struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+ unsigned int cpu = smp_processor_id();
+
+ /* Called if an interrupt occurs while the CPU is hot unplugged */
+ xive_flush_cpu_queue(cpu, xc);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+#endif /* CONFIG_SMP */
+
+void xive_kexec_teardown_cpu(int secondary)
+{
+ struct xive_cpu *xc = __this_cpu_read(xive_cpu);
+ unsigned int cpu = smp_processor_id();
+
+ /* Set CPPR to 0 to disable flow of interrupts */
+ xc->cppr = 0;
+ out_8(xive_tima + xive_tima_offset + TM_CPPR, 0);
+
+ /* Backend cleanup if any */
+ if (xive_ops->teardown_cpu)
+ xive_ops->teardown_cpu(cpu, xc);
+
+#ifdef CONFIG_SMP
+ /* Get rid of IPI */
+ xive_cleanup_cpu_ipi(cpu, xc);
+#endif
+
+ /* Disable and free the queues */
+ xive_cleanup_cpu_queues(cpu, xc);
+}
+
+void xive_shutdown(void)
+{
+ xive_ops->shutdown();
+}
+
+bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset,
+ u8 max_prio)
+{
+ xive_tima = area;
+ xive_tima_offset = offset;
+ xive_ops = ops;
+ xive_irq_priority = max_prio;
+
+ ppc_md.get_irq = xive_get_irq;
+ __xive_enabled = true;
+
+ pr_devel("Initializing host..\n");
+ xive_init_host();
+
+ pr_devel("Initializing boot CPU..\n");
+
+ /* Allocate per-CPU data and queues */
+ xive_prepare_cpu(smp_processor_id());
+
+ /* Get ready for interrupts */
+ xive_setup_cpu();
+
+ pr_info("Interrupt handling intialized with %s backend\n",
+ xive_ops->name);
+ pr_info("Using priority %d for all interrupts\n", max_prio);
+
+ return true;
+}
+
+static int __init xive_off(char *arg)
+{
+ xive_cmdline_disabled = true;
+ return 0;
+}
+__setup("xive=off", xive_off);
diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
new file mode 100644
index 000000000000..1a726229a427
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/native.c
@@ -0,0 +1,640 @@
+/*
+ * Copyright 2016,2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#define pr_fmt(fmt) "xive: " fmt
+
+#include <linux/types.h>
+#include <linux/irq.h>
+#include <linux/debugfs.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/of.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/cpumask.h>
+#include <linux/mm.h>
+
+#include <asm/prom.h>
+#include <asm/io.h>
+#include <asm/smp.h>
+#include <asm/irq.h>
+#include <asm/errno.h>
+#include <asm/xive.h>
+#include <asm/xive-regs.h>
+#include <asm/opal.h>
+
+#include "xive-internal.h"
+
+
+static u32 xive_provision_size;
+static u32 *xive_provision_chips;
+static u32 xive_provision_chip_count;
+static u32 xive_queue_shift;
+static u32 xive_pool_vps = XIVE_INVALID_VP;
+static struct kmem_cache *xive_provision_cache;
+
+int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
+{
+ __be64 flags, eoi_page, trig_page;
+ __be32 esb_shift, src_chip;
+ u64 opal_flags;
+ s64 rc;
+
+ memset(data, 0, sizeof(*data));
+
+ rc = opal_xive_get_irq_info(hw_irq, &flags, &eoi_page, &trig_page,
+ &esb_shift, &src_chip);
+ if (rc) {
+ pr_err("opal_xive_get_irq_info(0x%x) returned %lld\n",
+ hw_irq, rc);
+ return -EINVAL;
+ }
+
+ opal_flags = be64_to_cpu(flags);
+ if (opal_flags & OPAL_XIVE_IRQ_STORE_EOI)
+ data->flags |= XIVE_IRQ_FLAG_STORE_EOI;
+ if (opal_flags & OPAL_XIVE_IRQ_LSI)
+ data->flags |= XIVE_IRQ_FLAG_LSI;
+ if (opal_flags & OPAL_XIVE_IRQ_SHIFT_BUG)
+ data->flags |= XIVE_IRQ_FLAG_SHIFT_BUG;
+ if (opal_flags & OPAL_XIVE_IRQ_MASK_VIA_FW)
+ data->flags |= XIVE_IRQ_FLAG_MASK_FW;
+ if (opal_flags & OPAL_XIVE_IRQ_EOI_VIA_FW)
+ data->flags |= XIVE_IRQ_FLAG_EOI_FW;
+ data->eoi_page = be64_to_cpu(eoi_page);
+ data->trig_page = be64_to_cpu(trig_page);
+ data->esb_shift = be32_to_cpu(esb_shift);
+ data->src_chip = be32_to_cpu(src_chip);
+
+ data->eoi_mmio = ioremap(data->eoi_page, 1u << data->esb_shift);
+ if (!data->eoi_mmio) {
+ pr_err("Failed to map EOI page for irq 0x%x\n", hw_irq);
+ return -ENOMEM;
+ }
+
+ if (!data->trig_page)
+ return 0;
+ if (data->trig_page == data->eoi_page) {
+ data->trig_mmio = data->eoi_mmio;
+ return 0;
+ }
+
+ data->trig_mmio = ioremap(data->trig_page, 1u << data->esb_shift);
+ if (!data->trig_mmio) {
+ pr_err("Failed to map trigger page for irq 0x%x\n", hw_irq);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
+{
+ s64 rc;
+
+ for (;;) {
+ rc = opal_xive_set_irq_config(hw_irq, target, prio, sw_irq);
+ if (rc != OPAL_BUSY)
+ break;
+ msleep(1);
+ }
+ return rc == 0 ? 0 : -ENXIO;
+}
+
+/* This can be called multiple time to change a queue configuration */
+int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
+ __be32 *qpage, u32 order, bool can_escalate)
+{
+ s64 rc = 0;
+ __be64 qeoi_page_be;
+ __be32 esc_irq_be;
+ u64 flags, qpage_phys;
+
+ /* If there's an actual queue page, clean it */
+ if (order) {
+ if (WARN_ON(!qpage))
+ return -EINVAL;
+ qpage_phys = __pa(qpage);
+ } else
+ qpage_phys = 0;
+
+ /* Initialize the rest of the fields */
+ q->msk = order ? ((1u << (order - 2)) - 1) : 0;
+ q->idx = 0;
+ q->toggle = 0;
+
+ rc = opal_xive_get_queue_info(vp_id, prio, NULL, NULL,
+ &qeoi_page_be,
+ &esc_irq_be,
+ NULL);
+ if (rc) {
+ pr_err("Error %lld getting queue info prio %d\n", rc, prio);
+ rc = -EIO;
+ goto fail;
+ }
+ q->eoi_phys = be64_to_cpu(qeoi_page_be);
+
+ /* Default flags */
+ flags = OPAL_XIVE_EQ_ALWAYS_NOTIFY | OPAL_XIVE_EQ_ENABLED;
+
+ /* Escalation needed ? */
+ if (can_escalate) {
+ q->esc_irq = be32_to_cpu(esc_irq_be);
+ flags |= OPAL_XIVE_EQ_ESCALATE;
+ }
+
+ /* Configure and enable the queue in HW */
+ for (;;) {
+ rc = opal_xive_set_queue_info(vp_id, prio, qpage_phys, order, flags);
+ if (rc != OPAL_BUSY)
+ break;
+ msleep(1);
+ }
+ if (rc) {
+ pr_err("Error %lld setting queue for prio %d\n", rc, prio);
+ rc = -EIO;
+ } else {
+ /*
+ * KVM code requires all of the above to be visible before
+ * q->qpage is set due to how it manages IPI EOIs
+ */
+ wmb();
+ q->qpage = qpage;
+ }
+fail:
+ return rc;
+}
+
+static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
+{
+ s64 rc;
+
+ /* Disable the queue in HW */
+ for (;;) {
+ rc = opal_xive_set_queue_info(vp_id, prio, 0, 0, 0);
+ if (rc != OPAL_BUSY)
+ break;
+ msleep(1);
+ }
+ if (rc)
+ pr_err("Error %lld disabling queue for prio %d\n", rc, prio);
+}
+
+void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
+{
+ __xive_native_disable_queue(vp_id, q, prio);
+}
+
+static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio)
+{
+ struct xive_q *q = &xc->queue[prio];
+ unsigned int alloc_order;
+ struct page *pages;
+ __be32 *qpage;
+
+ alloc_order = (xive_queue_shift > PAGE_SHIFT) ?
+ (xive_queue_shift - PAGE_SHIFT) : 0;
+ pages = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, alloc_order);
+ if (!pages)
+ return -ENOMEM;
+ qpage = (__be32 *)page_address(pages);
+ memset(qpage, 0, 1 << xive_queue_shift);
+ return xive_native_configure_queue(get_hard_smp_processor_id(cpu),
+ q, prio, qpage, xive_queue_shift, false);
+}
+
+static void xive_native_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio)
+{
+ struct xive_q *q = &xc->queue[prio];
+ unsigned int alloc_order;
+
+ /*
+ * We use the variant with no iounmap as this is called on exec
+ * from an IPI and iounmap isn't safe
+ */
+ __xive_native_disable_queue(get_hard_smp_processor_id(cpu), q, prio);
+ alloc_order = (xive_queue_shift > PAGE_SHIFT) ?
+ (xive_queue_shift - PAGE_SHIFT) : 0;
+ free_pages((unsigned long)q->qpage, alloc_order);
+ q->qpage = NULL;
+}
+
+static bool xive_native_match(struct device_node *node)
+{
+ return of_device_is_compatible(node, "ibm,opal-xive-vc");
+}
+
+#ifdef CONFIG_SMP
+static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc)
+{
+ struct device_node *np;
+ unsigned int chip_id;
+ s64 irq;
+
+ /* Find the chip ID */
+ np = of_get_cpu_node(cpu, NULL);
+ if (np) {
+ if (of_property_read_u32(np, "ibm,chip-id", &chip_id) < 0)
+ chip_id = 0;
+ }
+
+ /* Allocate an IPI and populate info about it */
+ for (;;) {
+ irq = opal_xive_allocate_irq(chip_id);
+ if (irq == OPAL_BUSY) {
+ msleep(1);
+ continue;
+ }
+ if (irq < 0) {
+ pr_err("Failed to allocate IPI on CPU %d\n", cpu);
+ return -ENXIO;
+ }
+ xc->hw_ipi = irq;
+ break;
+ }
+ return 0;
+}
+
+u32 xive_native_alloc_irq(void)
+{
+ s64 rc;
+
+ for (;;) {
+ rc = opal_xive_allocate_irq(OPAL_XIVE_ANY_CHIP);
+ if (rc != OPAL_BUSY)
+ break;
+ msleep(1);
+ }
+ if (rc < 0)
+ return 0;
+ return rc;
+}
+
+void xive_native_free_irq(u32 irq)
+{
+ for (;;) {
+ s64 rc = opal_xive_free_irq(irq);
+ if (rc != OPAL_BUSY)
+ break;
+ msleep(1);
+ }
+}
+
+static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc)
+{
+ s64 rc;
+
+ /* Free the IPI */
+ if (!xc->hw_ipi)
+ return;
+ for (;;) {
+ rc = opal_xive_free_irq(xc->hw_ipi);
+ if (rc == OPAL_BUSY) {
+ msleep(1);
+ continue;
+ }
+ xc->hw_ipi = 0;
+ break;
+ }
+}
+#endif /* CONFIG_SMP */
+
+static void xive_native_shutdown(void)
+{
+ /* Switch the XIVE to emulation mode */
+ opal_xive_reset(OPAL_XIVE_MODE_EMU);
+}
+
+/*
+ * Perform an "ack" cycle on the current thread, thus
+ * grabbing the pending active priorities and updating
+ * the CPPR to the most favored one.
+ */
+static void xive_native_update_pending(struct xive_cpu *xc)
+{
+ u8 he, cppr;
+ u16 ack;
+
+ /* Perform the acknowledge hypervisor to register cycle */
+ ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_HV_REG));
+
+ /* Synchronize subsequent queue accesses */
+ mb();
+
+ /*
+ * Grab the CPPR and the "HE" field which indicates the source
+ * of the hypervisor interrupt (if any)
+ */
+ cppr = ack & 0xff;
+ he = GETFIELD(TM_QW3_NSR_HE, (ack >> 8));
+ switch(he) {
+ case TM_QW3_NSR_HE_NONE: /* Nothing to see here */
+ break;
+ case TM_QW3_NSR_HE_PHYS: /* Physical thread interrupt */
+ if (cppr == 0xff)
+ return;
+ /* Mark the priority pending */
+ xc->pending_prio |= 1 << cppr;
+
+ /*
+ * A new interrupt should never have a CPPR less favored
+ * than our current one.
+ */
+ if (cppr >= xc->cppr)
+ pr_err("CPU %d odd ack CPPR, got %d at %d\n",
+ smp_processor_id(), cppr, xc->cppr);
+
+ /* Update our idea of what the CPPR is */
+ xc->cppr = cppr;
+ break;
+ case TM_QW3_NSR_HE_POOL: /* HV Pool interrupt (unused) */
+ case TM_QW3_NSR_HE_LSI: /* Legacy FW LSI (unused) */
+ pr_err("CPU %d got unexpected interrupt type HE=%d\n",
+ smp_processor_id(), he);
+ return;
+ }
+}
+
+static void xive_native_eoi(u32 hw_irq)
+{
+ /*
+ * Not normally used except if specific interrupts need
+ * a workaround on EOI.
+ */
+ opal_int_eoi(hw_irq);
+}
+
+static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc)
+{
+ s64 rc;
+ u32 vp;
+ __be64 vp_cam_be;
+ u64 vp_cam;
+
+ if (xive_pool_vps == XIVE_INVALID_VP)
+ return;
+
+ /* Enable the pool VP */
+ vp = xive_pool_vps + get_hard_smp_processor_id(cpu);
+ pr_debug("CPU %d setting up pool VP 0x%x\n", cpu, vp);
+ for (;;) {
+ rc = opal_xive_set_vp_info(vp, OPAL_XIVE_VP_ENABLED, 0);
+ if (rc != OPAL_BUSY)
+ break;
+ msleep(1);
+ }
+ if (rc) {
+ pr_err("Failed to enable pool VP on CPU %d\n", cpu);
+ return;
+ }
+
+ /* Grab it's CAM value */
+ rc = opal_xive_get_vp_info(vp, NULL, &vp_cam_be, NULL, NULL);
+ if (rc) {
+ pr_err("Failed to get pool VP info CPU %d\n", cpu);
+ return;
+ }
+ vp_cam = be64_to_cpu(vp_cam_be);
+
+ pr_debug("VP CAM = %llx\n", vp_cam);
+
+ /* Push it on the CPU (set LSMFB to 0xff to skip backlog scan) */
+ pr_debug("(Old HW value: %08x)\n",
+ in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2));
+ out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD0, 0xff);
+ out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2,
+ TM_QW2W2_VP | vp_cam);
+ pr_debug("(New HW value: %08x)\n",
+ in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2));
+}
+
+static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc)
+{
+ s64 rc;
+ u32 vp;
+
+ if (xive_pool_vps == XIVE_INVALID_VP)
+ return;
+
+ /* Pull the pool VP from the CPU */
+ in_be64(xive_tima + TM_SPC_PULL_POOL_CTX);
+
+ /* Disable it */
+ vp = xive_pool_vps + get_hard_smp_processor_id(cpu);
+ for (;;) {
+ rc = opal_xive_set_vp_info(vp, 0, 0);
+ if (rc != OPAL_BUSY)
+ break;
+ msleep(1);
+ }
+}
+
+static void xive_native_sync_source(u32 hw_irq)
+{
+ opal_xive_sync(XIVE_SYNC_EAS, hw_irq);
+}
+
+static const struct xive_ops xive_native_ops = {
+ .populate_irq_data = xive_native_populate_irq_data,
+ .configure_irq = xive_native_configure_irq,
+ .setup_queue = xive_native_setup_queue,
+ .cleanup_queue = xive_native_cleanup_queue,
+ .match = xive_native_match,
+ .shutdown = xive_native_shutdown,
+ .update_pending = xive_native_update_pending,
+ .eoi = xive_native_eoi,
+ .setup_cpu = xive_native_setup_cpu,
+ .teardown_cpu = xive_native_teardown_cpu,
+ .sync_source = xive_native_sync_source,
+#ifdef CONFIG_SMP
+ .get_ipi = xive_native_get_ipi,
+ .put_ipi = xive_native_put_ipi,
+#endif /* CONFIG_SMP */
+ .name = "native",
+};
+
+static bool xive_parse_provisioning(struct device_node *np)
+{
+ int rc;
+
+ if (of_property_read_u32(np, "ibm,xive-provision-page-size",
+ &xive_provision_size) < 0)
+ return true;
+ rc = of_property_count_elems_of_size(np, "ibm,xive-provision-chips", 4);
+ if (rc < 0) {
+ pr_err("Error %d getting provision chips array\n", rc);
+ return false;
+ }
+ xive_provision_chip_count = rc;
+ if (rc == 0)
+ return true;
+
+ xive_provision_chips = kzalloc(4 * xive_provision_chip_count,
+ GFP_KERNEL);
+ if (WARN_ON(!xive_provision_chips))
+ return false;
+
+ rc = of_property_read_u32_array(np, "ibm,xive-provision-chips",
+ xive_provision_chips,
+ xive_provision_chip_count);
+ if (rc < 0) {
+ pr_err("Error %d reading provision chips array\n", rc);
+ return false;
+ }
+
+ xive_provision_cache = kmem_cache_create("xive-provision",
+ xive_provision_size,
+ xive_provision_size,
+ 0, NULL);
+ if (!xive_provision_cache) {
+ pr_err("Failed to allocate provision cache\n");
+ return false;
+ }
+ return true;
+}
+
+u32 xive_native_default_eq_shift(void)
+{
+ return xive_queue_shift;
+}
+
+bool xive_native_init(void)
+{
+ struct device_node *np;
+ struct resource r;
+ void __iomem *tima;
+ struct property *prop;
+ u8 max_prio = 7;
+ const __be32 *p;
+ u32 val;
+ s64 rc;
+
+ if (xive_cmdline_disabled)
+ return false;
+
+ pr_devel("xive_native_init()\n");
+ np = of_find_compatible_node(NULL, NULL, "ibm,opal-xive-pe");
+ if (!np) {
+ pr_devel("not found !\n");
+ return false;
+ }
+ pr_devel("Found %s\n", np->full_name);
+
+ /* Resource 1 is HV window */
+ if (of_address_to_resource(np, 1, &r)) {
+ pr_err("Failed to get thread mgmnt area resource\n");
+ return false;
+ }
+ tima = ioremap(r.start, resource_size(&r));
+ if (!tima) {
+ pr_err("Failed to map thread mgmnt area\n");
+ return false;
+ }
+
+ /* Read number of priorities */
+ if (of_property_read_u32(np, "ibm,xive-#priorities", &val) == 0)
+ max_prio = val - 1;
+
+ /* Iterate the EQ sizes and pick one */
+ of_property_for_each_u32(np, "ibm,xive-eq-sizes", prop, p, val) {
+ xive_queue_shift = val;
+ if (val == PAGE_SHIFT)
+ break;
+ }
+
+ /* Grab size of provisioning pages */
+ xive_parse_provisioning(np);
+
+ /* Switch the XIVE to exploitation mode */
+ rc = opal_xive_reset(OPAL_XIVE_MODE_EXPL);
+ if (rc) {
+ pr_err("Switch to exploitation mode failed with error %lld\n", rc);
+ return false;
+ }
+
+ /* Initialize XIVE core with our backend */
+ if (!xive_core_init(&xive_native_ops, tima, TM_QW3_HV_PHYS,
+ max_prio)) {
+ opal_xive_reset(OPAL_XIVE_MODE_EMU);
+ return false;
+ }
+ pr_info("Using %dkB queues\n", 1 << (xive_queue_shift - 10));
+ return true;
+}
+
+static bool xive_native_provision_pages(void)
+{
+ u32 i;
+ void *p;
+
+ for (i = 0; i < xive_provision_chip_count; i++) {
+ u32 chip = xive_provision_chips[i];
+
+ /*
+ * XXX TODO: Try to make the allocation local to the node where
+ * the chip resides.
+ */
+ p = kmem_cache_alloc(xive_provision_cache, GFP_KERNEL);
+ if (!p) {
+ pr_err("Failed to allocate provisioning page\n");
+ return false;
+ }
+ opal_xive_donate_page(chip, __pa(p));
+ }
+ return true;
+}
+
+u32 xive_native_alloc_vp_block(u32 max_vcpus)
+{
+ s64 rc;
+ u32 order;
+
+ order = fls(max_vcpus) - 1;
+ if (max_vcpus > (1 << order))
+ order++;
+
+ pr_info("VP block alloc, for max VCPUs %d use order %d\n",
+ max_vcpus, order);
+
+ for (;;) {
+ rc = opal_xive_alloc_vp_block(order);
+ switch (rc) {
+ case OPAL_BUSY:
+ msleep(1);
+ break;
+ case OPAL_XIVE_PROVISIONING:
+ if (!xive_native_provision_pages())
+ return XIVE_INVALID_VP;
+ break;
+ default:
+ if (rc < 0) {
+ pr_err("OPAL failed to allocate VCPUs order %d, err %lld\n",
+ order, rc);
+ return XIVE_INVALID_VP;
+ }
+ return rc;
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(xive_native_alloc_vp_block);
+
+void xive_native_free_vp_block(u32 vp_base)
+{
+ s64 rc;
+
+ if (vp_base == XIVE_INVALID_VP)
+ return;
+
+ rc = opal_xive_free_vp_block(vp_base);
+ if (rc < 0)
+ pr_warn("OPAL error %lld freeing VP block\n", rc);
+}
+EXPORT_SYMBOL_GPL(xive_native_free_vp_block);
diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h
new file mode 100644
index 000000000000..d07ef2d29caf
--- /dev/null
+++ b/arch/powerpc/sysdev/xive/xive-internal.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2016,2017 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef __XIVE_INTERNAL_H
+#define __XIVE_INTERNAL_H
+
+/* Each CPU carry one of these with various per-CPU state */
+struct xive_cpu {
+#ifdef CONFIG_SMP
+ /* HW irq number and data of IPI */
+ u32 hw_ipi;
+ struct xive_irq_data ipi_data;
+#endif /* CONFIG_SMP */
+
+ int chip_id;
+
+ /* Queue datas. Only one is populated */
+#define XIVE_MAX_QUEUES 8
+ struct xive_q queue[XIVE_MAX_QUEUES];
+
+ /*
+ * Pending mask. Each bit corresponds to a priority that
+ * potentially has pending interrupts.
+ */
+ u8 pending_prio;
+
+ /* Cache of HW CPPR */
+ u8 cppr;
+};
+
+/* Backend ops */
+struct xive_ops {
+ int (*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data);
+ int (*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
+ int (*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
+ void (*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio);
+ void (*setup_cpu)(unsigned int cpu, struct xive_cpu *xc);
+ void (*teardown_cpu)(unsigned int cpu, struct xive_cpu *xc);
+ bool (*match)(struct device_node *np);
+ void (*shutdown)(void);
+
+ void (*update_pending)(struct xive_cpu *xc);
+ void (*eoi)(u32 hw_irq);
+ void (*sync_source)(u32 hw_irq);
+#ifdef CONFIG_SMP
+ int (*get_ipi)(unsigned int cpu, struct xive_cpu *xc);
+ void (*put_ipi)(unsigned int cpu, struct xive_cpu *xc);
+#endif
+ const char *name;
+};
+
+bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset,
+ u8 max_prio);
+
+extern bool xive_cmdline_disabled;
+
+#endif /* __XIVE_INTERNAL_H */
diff --git a/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh b/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
index c658d8cf760b..c658d8cf760b 100755
--- a/arch/powerpc/scripts/gcc-check-mprofile-kernel.sh
+++ b/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
diff --git a/arch/powerpc/relocs_check.sh b/arch/powerpc/tools/relocs_check.sh
index ec2d5c835170..ec2d5c835170 100755
--- a/arch/powerpc/relocs_check.sh
+++ b/arch/powerpc/tools/relocs_check.sh
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 16321ad9e70c..f11f65634aab 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -29,7 +29,9 @@
#include <linux/nmi.h>
#include <linux/ctype.h>
+#include <asm/debugfs.h>
#include <asm/ptrace.h>
+#include <asm/smp.h>
#include <asm/string.h>
#include <asm/prom.h>
#include <asm/machdep.h>
@@ -48,7 +50,7 @@
#include <asm/reg.h>
#include <asm/debug.h>
#include <asm/hw_breakpoint.h>
-
+#include <asm/xive.h>
#include <asm/opal.h>
#include <asm/firmware.h>
@@ -76,6 +78,7 @@ static int xmon_gate;
#endif /* CONFIG_SMP */
static unsigned long in_xmon __read_mostly = 0;
+static int xmon_on = IS_ENABLED(CONFIG_XMON_DEFAULT);
static unsigned long adrs;
static int size = 1;
@@ -184,8 +187,6 @@ static void dump_tlb_44x(void);
static void dump_tlb_book3e(void);
#endif
-static int xmon_no_auto_backtrace;
-
#ifdef CONFIG_PPC64
#define REG "%.16lx"
#else
@@ -232,7 +233,13 @@ Commands:\n\
"\
dr dump stream of raw bytes\n\
dt dump the tracing buffers (uses printk)\n\
- e print exception information\n\
+"
+#ifdef CONFIG_PPC_POWERNV
+" dx# dump xive on CPU #\n\
+ dxi# dump xive irq state #\n\
+ dxa dump xive on all CPUs\n"
+#endif
+" e print exception information\n\
f flush cache\n\
la lookup symbol+offset of specified address\n\
ls lookup address of specified symbol\n\
@@ -411,7 +418,22 @@ int cpus_are_in_xmon(void)
{
return !cpumask_empty(&cpus_in_xmon);
}
-#endif
+
+static bool wait_for_other_cpus(int ncpus)
+{
+ unsigned long timeout;
+
+ /* We wait for 2s, which is a metric "little while" */
+ for (timeout = 20000; timeout != 0; --timeout) {
+ if (cpumask_weight(&cpus_in_xmon) >= ncpus)
+ return true;
+ udelay(100);
+ barrier();
+ }
+
+ return false;
+}
+#endif /* CONFIG_SMP */
static inline int unrecoverable_excp(struct pt_regs *regs)
{
@@ -433,7 +455,6 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
#ifdef CONFIG_SMP
int cpu;
int secondary;
- unsigned long timeout;
#endif
local_irq_save(flags);
@@ -520,13 +541,17 @@ static int xmon_core(struct pt_regs *regs, int fromipi)
xmon_owner = cpu;
mb();
if (ncpus > 1) {
- smp_send_debugger_break();
- /* wait for other cpus to come in */
- for (timeout = 100000000; timeout != 0; --timeout) {
- if (cpumask_weight(&cpus_in_xmon) >= ncpus)
- break;
- barrier();
- }
+ /*
+ * A system reset (trap == 0x100) can be triggered on
+ * all CPUs, so when we come in via 0x100 try waiting
+ * for the other CPUs to come in before we send the
+ * debugger break (IPI). This is similar to
+ * crash_kexec_secondary().
+ */
+ if (TRAP(regs) != 0x100 || !wait_for_other_cpus(ncpus))
+ smp_send_debugger_break();
+
+ wait_for_other_cpus(ncpus);
}
remove_bpts();
disable_surveillance();
@@ -884,10 +909,7 @@ cmds(struct pt_regs *excp)
last_cmd = NULL;
xmon_regs = excp;
- if (!xmon_no_auto_backtrace) {
- xmon_no_auto_backtrace = 1;
- xmon_show_stack(excp->gpr[1], excp->link, excp->nip);
- }
+ xmon_show_stack(excp->gpr[1], excp->link, excp->nip);
for(;;) {
#ifdef CONFIG_SMP
@@ -1347,9 +1369,19 @@ const char *getvecname(unsigned long vec)
case 0x100: ret = "(System Reset)"; break;
case 0x200: ret = "(Machine Check)"; break;
case 0x300: ret = "(Data Access)"; break;
- case 0x380: ret = "(Data SLB Access)"; break;
+ case 0x380:
+ if (radix_enabled())
+ ret = "(Data Access Out of Range)";
+ else
+ ret = "(Data SLB Access)";
+ break;
case 0x400: ret = "(Instruction Access)"; break;
- case 0x480: ret = "(Instruction SLB Access)"; break;
+ case 0x480:
+ if (radix_enabled())
+ ret = "(Instruction Access Out of Range)";
+ else
+ ret = "(Instruction SLB Access)";
+ break;
case 0x500: ret = "(Hardware Interrupt)"; break;
case 0x600: ret = "(Alignment)"; break;
case 0x700: ret = "(Program Check)"; break;
@@ -2231,7 +2263,9 @@ static void dump_one_paca(int cpu)
DUMP(p, kernel_msr, "lx");
DUMP(p, emergency_sp, "p");
#ifdef CONFIG_PPC_BOOK3S_64
+ DUMP(p, nmi_emergency_sp, "p");
DUMP(p, mc_emergency_sp, "p");
+ DUMP(p, in_nmi, "x");
DUMP(p, in_mce, "x");
DUMP(p, hmi_event_available, "x");
#endif
@@ -2338,6 +2372,81 @@ static void dump_pacas(void)
}
#endif
+#ifdef CONFIG_PPC_POWERNV
+static void dump_one_xive(int cpu)
+{
+ unsigned int hwid = get_hard_smp_processor_id(cpu);
+
+ opal_xive_dump(XIVE_DUMP_TM_HYP, hwid);
+ opal_xive_dump(XIVE_DUMP_TM_POOL, hwid);
+ opal_xive_dump(XIVE_DUMP_TM_OS, hwid);
+ opal_xive_dump(XIVE_DUMP_TM_USER, hwid);
+ opal_xive_dump(XIVE_DUMP_VP, hwid);
+ opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid);
+
+ if (setjmp(bus_error_jmp) != 0) {
+ catch_memory_errors = 0;
+ printf("*** Error dumping xive on cpu %d\n", cpu);
+ return;
+ }
+
+ catch_memory_errors = 1;
+ sync();
+ xmon_xive_do_dump(cpu);
+ sync();
+ __delay(200);
+ catch_memory_errors = 0;
+}
+
+static void dump_all_xives(void)
+{
+ int cpu;
+
+ if (num_possible_cpus() == 0) {
+ printf("No possible cpus, use 'dx #' to dump individual cpus\n");
+ return;
+ }
+
+ for_each_possible_cpu(cpu)
+ dump_one_xive(cpu);
+}
+
+static void dump_one_xive_irq(u32 num)
+{
+ s64 rc;
+ __be64 vp;
+ u8 prio;
+ __be32 lirq;
+
+ rc = opal_xive_get_irq_config(num, &vp, &prio, &lirq);
+ xmon_printf("IRQ 0x%x config: vp=0x%llx prio=%d lirq=0x%x (rc=%lld)\n",
+ num, be64_to_cpu(vp), prio, be32_to_cpu(lirq), rc);
+}
+
+static void dump_xives(void)
+{
+ unsigned long num;
+ int c;
+
+ c = inchar();
+ if (c == 'a') {
+ dump_all_xives();
+ return;
+ } else if (c == 'i') {
+ if (scanhex(&num))
+ dump_one_xive_irq(num);
+ return;
+ }
+
+ termch = c; /* Put c back, it wasn't 'a' */
+
+ if (scanhex(&num))
+ dump_one_xive(num);
+ else
+ dump_one_xive(xmon_owner);
+}
+#endif /* CONFIG_PPC_POWERNV */
+
static void dump_by_size(unsigned long addr, long count, int size)
{
unsigned char temp[16];
@@ -2386,6 +2495,14 @@ dump(void)
return;
}
#endif
+#ifdef CONFIG_PPC_POWERNV
+ if (c == 'x') {
+ xmon_start_pagination();
+ dump_xives();
+ xmon_end_pagination();
+ return;
+ }
+#endif
if (c == '\n')
termch = c;
@@ -3070,23 +3187,28 @@ void dump_segments(void)
for (i = 0; i < mmu_slb_size; i++) {
asm volatile("slbmfee %0,%1" : "=r" (esid) : "r" (i));
asm volatile("slbmfev %0,%1" : "=r" (vsid) : "r" (i));
- if (esid || vsid) {
- printf("%02d %016lx %016lx", i, esid, vsid);
- if (esid & SLB_ESID_V) {
- llp = vsid & SLB_VSID_LLP;
- if (vsid & SLB_VSID_B_1T) {
- printf(" 1T ESID=%9lx VSID=%13lx LLP:%3lx \n",
- GET_ESID_1T(esid),
- (vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T,
- llp);
- } else {
- printf(" 256M ESID=%9lx VSID=%13lx LLP:%3lx \n",
- GET_ESID(esid),
- (vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT,
- llp);
- }
- } else
- printf("\n");
+
+ if (!esid && !vsid)
+ continue;
+
+ printf("%02d %016lx %016lx", i, esid, vsid);
+
+ if (!(esid & SLB_ESID_V)) {
+ printf("\n");
+ continue;
+ }
+
+ llp = vsid & SLB_VSID_LLP;
+ if (vsid & SLB_VSID_B_1T) {
+ printf(" 1T ESID=%9lx VSID=%13lx LLP:%3lx \n",
+ GET_ESID_1T(esid),
+ (vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T,
+ llp);
+ } else {
+ printf(" 256M ESID=%9lx VSID=%13lx LLP:%3lx \n",
+ GET_ESID(esid),
+ (vsid & ~SLB_VSID_B) >> SLB_VSID_SHIFT,
+ llp);
}
}
}
@@ -3302,6 +3424,8 @@ static void sysrq_handle_xmon(int key)
/* ensure xmon is enabled */
xmon_init(1);
debugger(get_irq_regs());
+ if (!xmon_on)
+ xmon_init(0);
}
static struct sysrq_key_op sysrq_xmon_op = {
@@ -3315,10 +3439,37 @@ static int __init setup_xmon_sysrq(void)
register_sysrq_key('x', &sysrq_xmon_op);
return 0;
}
-__initcall(setup_xmon_sysrq);
+device_initcall(setup_xmon_sysrq);
#endif /* CONFIG_MAGIC_SYSRQ */
-static int __initdata xmon_early, xmon_off;
+#ifdef CONFIG_DEBUG_FS
+static int xmon_dbgfs_set(void *data, u64 val)
+{
+ xmon_on = !!val;
+ xmon_init(xmon_on);
+
+ return 0;
+}
+
+static int xmon_dbgfs_get(void *data, u64 *val)
+{
+ *val = xmon_on;
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(xmon_dbgfs_ops, xmon_dbgfs_get,
+ xmon_dbgfs_set, "%llu\n");
+
+static int __init setup_xmon_dbgfs(void)
+{
+ debugfs_create_file("xmon", 0600, powerpc_debugfs_root, NULL,
+ &xmon_dbgfs_ops);
+ return 0;
+}
+device_initcall(setup_xmon_dbgfs);
+#endif /* CONFIG_DEBUG_FS */
+
+static int xmon_early __initdata;
static int __init early_parse_xmon(char *p)
{
@@ -3326,12 +3477,12 @@ static int __init early_parse_xmon(char *p)
/* just "xmon" is equivalent to "xmon=early" */
xmon_init(1);
xmon_early = 1;
- } else if (strncmp(p, "on", 2) == 0)
+ xmon_on = 1;
+ } else if (strncmp(p, "on", 2) == 0) {
xmon_init(1);
- else if (strncmp(p, "off", 3) == 0)
- xmon_off = 1;
- else if (strncmp(p, "nobt", 4) == 0)
- xmon_no_auto_backtrace = 1;
+ xmon_on = 1;
+ } else if (strncmp(p, "off", 3) == 0)
+ xmon_on = 0;
else
return 1;
@@ -3341,10 +3492,8 @@ early_param("xmon", early_parse_xmon);
void __init xmon_setup(void)
{
-#ifdef CONFIG_XMON_DEFAULT
- if (!xmon_off)
+ if (xmon_on)
xmon_init(1);
-#endif
if (xmon_early)
debugger(NULL);
}
diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index bcc030eacab7..1a138c83f877 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -14,6 +14,7 @@
#include <linux/msi.h>
#include <linux/module.h>
#include <linux/mount.h>
+#include <linux/sched/mm.h>
#include "cxl.h"
@@ -321,19 +322,29 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed,
if (task) {
ctx->pid = get_task_pid(task, PIDTYPE_PID);
- ctx->glpid = get_task_pid(task->group_leader, PIDTYPE_PID);
kernel = false;
ctx->real_mode = false;
+
+ /* acquire a reference to the task's mm */
+ ctx->mm = get_task_mm(current);
+
+ /* ensure this mm_struct can't be freed */
+ cxl_context_mm_count_get(ctx);
+
+ /* decrement the use count */
+ if (ctx->mm)
+ mmput(ctx->mm);
}
cxl_ctx_get();
if ((rc = cxl_ops->attach_process(ctx, kernel, wed, 0))) {
- put_pid(ctx->glpid);
put_pid(ctx->pid);
- ctx->glpid = ctx->pid = NULL;
+ ctx->pid = NULL;
cxl_adapter_context_put(ctx->afu->adapter);
cxl_ctx_put();
+ if (task)
+ cxl_context_mm_count_put(ctx);
goto out;
}
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 062bf6ca2625..4472ce11f98d 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -17,6 +17,7 @@
#include <linux/debugfs.h>
#include <linux/slab.h>
#include <linux/idr.h>
+#include <linux/sched/mm.h>
#include <asm/cputable.h>
#include <asm/current.h>
#include <asm/copro.h>
@@ -38,23 +39,26 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master)
{
int i;
- spin_lock_init(&ctx->sste_lock);
ctx->afu = afu;
ctx->master = master;
- ctx->pid = ctx->glpid = NULL; /* Set in start work ioctl */
+ ctx->pid = NULL; /* Set in start work ioctl */
mutex_init(&ctx->mapping_lock);
ctx->mapping = NULL;
- /*
- * Allocate the segment table before we put it in the IDR so that we
- * can always access it when dereferenced from IDR. For the same
- * reason, the segment table is only destroyed after the context is
- * removed from the IDR. Access to this in the IOCTL is protected by
- * Linux filesytem symantics (can't IOCTL until open is complete).
- */
- i = cxl_alloc_sst(ctx);
- if (i)
- return i;
+ if (cxl_is_psl8(afu)) {
+ spin_lock_init(&ctx->sste_lock);
+
+ /*
+ * Allocate the segment table before we put it in the IDR so that we
+ * can always access it when dereferenced from IDR. For the same
+ * reason, the segment table is only destroyed after the context is
+ * removed from the IDR. Access to this in the IOCTL is protected by
+ * Linux filesytem symantics (can't IOCTL until open is complete).
+ */
+ i = cxl_alloc_sst(ctx);
+ if (i)
+ return i;
+ }
INIT_WORK(&ctx->fault_work, cxl_handle_fault);
@@ -184,13 +188,26 @@ int cxl_context_iomap(struct cxl_context *ctx, struct vm_area_struct *vma)
if (ctx->afu->current_mode == CXL_MODE_DEDICATED) {
if (start + len > ctx->afu->adapter->ps_size)
return -EINVAL;
+
+ if (cxl_is_psl9(ctx->afu)) {
+ /*
+ * Make sure there is a valid problem state
+ * area space for this AFU.
+ */
+ if (ctx->master && !ctx->afu->psa) {
+ pr_devel("AFU doesn't support mmio space\n");
+ return -EINVAL;
+ }
+
+ /* Can't mmap until the AFU is enabled */
+ if (!ctx->afu->enabled)
+ return -EBUSY;
+ }
} else {
if (start + len > ctx->psn_size)
return -EINVAL;
- }
- if (ctx->afu->current_mode != CXL_MODE_DEDICATED) {
- /* make sure there is a valid per process space for this AFU */
+ /* Make sure there is a valid per process space for this AFU */
if ((ctx->master && !ctx->afu->psa) || (!ctx->afu->pp_psa)) {
pr_devel("AFU doesn't support mmio space\n");
return -EINVAL;
@@ -242,12 +259,16 @@ int __detach_context(struct cxl_context *ctx)
/* release the reference to the group leader and mm handling pid */
put_pid(ctx->pid);
- put_pid(ctx->glpid);
cxl_ctx_put();
/* Decrease the attached context count on the adapter */
cxl_adapter_context_put(ctx->afu->adapter);
+
+ /* Decrease the mm count on the context */
+ cxl_context_mm_count_put(ctx);
+ ctx->mm = NULL;
+
return 0;
}
@@ -303,7 +324,8 @@ static void reclaim_ctx(struct rcu_head *rcu)
{
struct cxl_context *ctx = container_of(rcu, struct cxl_context, rcu);
- free_page((u64)ctx->sstp);
+ if (cxl_is_psl8(ctx->afu))
+ free_page((u64)ctx->sstp);
if (ctx->ff_page)
__free_page(ctx->ff_page);
ctx->sstp = NULL;
@@ -325,3 +347,15 @@ void cxl_context_free(struct cxl_context *ctx)
mutex_unlock(&ctx->afu->contexts_lock);
call_rcu(&ctx->rcu, reclaim_ctx);
}
+
+void cxl_context_mm_count_get(struct cxl_context *ctx)
+{
+ if (ctx->mm)
+ atomic_inc(&ctx->mm->mm_count);
+}
+
+void cxl_context_mm_count_put(struct cxl_context *ctx)
+{
+ if (ctx->mm)
+ mmdrop(ctx->mm);
+}
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index 79e60ec70bd3..c8568ea7c518 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -63,7 +63,7 @@ typedef struct {
/* Memory maps. Ref CXL Appendix A */
/* PSL Privilege 1 Memory Map */
-/* Configuration and Control area */
+/* Configuration and Control area - CAIA 1&2 */
static const cxl_p1_reg_t CXL_PSL_CtxTime = {0x0000};
static const cxl_p1_reg_t CXL_PSL_ErrIVTE = {0x0008};
static const cxl_p1_reg_t CXL_PSL_KEY1 = {0x0010};
@@ -73,7 +73,7 @@ static const cxl_p1_reg_t CXL_PSL_Control = {0x0020};
static const cxl_p1_reg_t CXL_PSL_DLCNTL = {0x0060};
static const cxl_p1_reg_t CXL_PSL_DLADDR = {0x0068};
-/* PSL Lookaside Buffer Management Area */
+/* PSL Lookaside Buffer Management Area - CAIA 1 */
static const cxl_p1_reg_t CXL_PSL_LBISEL = {0x0080};
static const cxl_p1_reg_t CXL_PSL_SLBIE = {0x0088};
static const cxl_p1_reg_t CXL_PSL_SLBIA = {0x0090};
@@ -82,7 +82,7 @@ static const cxl_p1_reg_t CXL_PSL_TLBIA = {0x00A8};
static const cxl_p1_reg_t CXL_PSL_AFUSEL = {0x00B0};
/* 0x00C0:7EFF Implementation dependent area */
-/* PSL registers */
+/* PSL registers - CAIA 1 */
static const cxl_p1_reg_t CXL_PSL_FIR1 = {0x0100};
static const cxl_p1_reg_t CXL_PSL_FIR2 = {0x0108};
static const cxl_p1_reg_t CXL_PSL_Timebase = {0x0110};
@@ -98,61 +98,83 @@ static const cxl_p1_reg_t CXL_XSL_Timebase = {0x0100};
static const cxl_p1_reg_t CXL_XSL_TB_CTLSTAT = {0x0108};
static const cxl_p1_reg_t CXL_XSL_FEC = {0x0158};
static const cxl_p1_reg_t CXL_XSL_DSNCTL = {0x0168};
+/* PSL registers - CAIA 2 */
+static const cxl_p1_reg_t CXL_PSL9_CONTROL = {0x0020};
+static const cxl_p1_reg_t CXL_XSL9_DSNCTL = {0x0168};
+static const cxl_p1_reg_t CXL_PSL9_FIR1 = {0x0300};
+static const cxl_p1_reg_t CXL_PSL9_FIR2 = {0x0308};
+static const cxl_p1_reg_t CXL_PSL9_Timebase = {0x0310};
+static const cxl_p1_reg_t CXL_PSL9_DEBUG = {0x0320};
+static const cxl_p1_reg_t CXL_PSL9_FIR_CNTL = {0x0348};
+static const cxl_p1_reg_t CXL_PSL9_DSNDCTL = {0x0350};
+static const cxl_p1_reg_t CXL_PSL9_TB_CTLSTAT = {0x0340};
+static const cxl_p1_reg_t CXL_PSL9_TRACECFG = {0x0368};
+static const cxl_p1_reg_t CXL_PSL9_APCDEDALLOC = {0x0378};
+static const cxl_p1_reg_t CXL_PSL9_APCDEDTYPE = {0x0380};
+static const cxl_p1_reg_t CXL_PSL9_TNR_ADDR = {0x0388};
+static const cxl_p1_reg_t CXL_PSL9_GP_CT = {0x0398};
+static const cxl_p1_reg_t CXL_XSL9_IERAT = {0x0588};
+static const cxl_p1_reg_t CXL_XSL9_ILPP = {0x0590};
+
/* 0x7F00:7FFF Reserved PCIe MSI-X Pending Bit Array area */
/* 0x8000:FFFF Reserved PCIe MSI-X Table Area */
/* PSL Slice Privilege 1 Memory Map */
-/* Configuration Area */
+/* Configuration Area - CAIA 1&2 */
static const cxl_p1n_reg_t CXL_PSL_SR_An = {0x00};
static const cxl_p1n_reg_t CXL_PSL_LPID_An = {0x08};
static const cxl_p1n_reg_t CXL_PSL_AMBAR_An = {0x10};
static const cxl_p1n_reg_t CXL_PSL_SPOffset_An = {0x18};
static const cxl_p1n_reg_t CXL_PSL_ID_An = {0x20};
static const cxl_p1n_reg_t CXL_PSL_SERR_An = {0x28};
-/* Memory Management and Lookaside Buffer Management */
+/* Memory Management and Lookaside Buffer Management - CAIA 1*/
static const cxl_p1n_reg_t CXL_PSL_SDR_An = {0x30};
+/* Memory Management and Lookaside Buffer Management - CAIA 1&2 */
static const cxl_p1n_reg_t CXL_PSL_AMOR_An = {0x38};
-/* Pointer Area */
+/* Pointer Area - CAIA 1&2 */
static const cxl_p1n_reg_t CXL_HAURP_An = {0x80};
static const cxl_p1n_reg_t CXL_PSL_SPAP_An = {0x88};
static const cxl_p1n_reg_t CXL_PSL_LLCMD_An = {0x90};
-/* Control Area */
+/* Control Area - CAIA 1&2 */
static const cxl_p1n_reg_t CXL_PSL_SCNTL_An = {0xA0};
static const cxl_p1n_reg_t CXL_PSL_CtxTime_An = {0xA8};
static const cxl_p1n_reg_t CXL_PSL_IVTE_Offset_An = {0xB0};
static const cxl_p1n_reg_t CXL_PSL_IVTE_Limit_An = {0xB8};
-/* 0xC0:FF Implementation Dependent Area */
+/* 0xC0:FF Implementation Dependent Area - CAIA 1&2 */
static const cxl_p1n_reg_t CXL_PSL_FIR_SLICE_An = {0xC0};
static const cxl_p1n_reg_t CXL_AFU_DEBUG_An = {0xC8};
+/* 0xC0:FF Implementation Dependent Area - CAIA 1 */
static const cxl_p1n_reg_t CXL_PSL_APCALLOC_A = {0xD0};
static const cxl_p1n_reg_t CXL_PSL_COALLOC_A = {0xD8};
static const cxl_p1n_reg_t CXL_PSL_RXCTL_A = {0xE0};
static const cxl_p1n_reg_t CXL_PSL_SLICE_TRACE = {0xE8};
/* PSL Slice Privilege 2 Memory Map */
-/* Configuration and Control Area */
+/* Configuration and Control Area - CAIA 1&2 */
static const cxl_p2n_reg_t CXL_PSL_PID_TID_An = {0x000};
static const cxl_p2n_reg_t CXL_CSRP_An = {0x008};
+/* Configuration and Control Area - CAIA 1 */
static const cxl_p2n_reg_t CXL_AURP0_An = {0x010};
static const cxl_p2n_reg_t CXL_AURP1_An = {0x018};
static const cxl_p2n_reg_t CXL_SSTP0_An = {0x020};
static const cxl_p2n_reg_t CXL_SSTP1_An = {0x028};
+/* Configuration and Control Area - CAIA 1 */
static const cxl_p2n_reg_t CXL_PSL_AMR_An = {0x030};
-/* Segment Lookaside Buffer Management */
+/* Segment Lookaside Buffer Management - CAIA 1 */
static const cxl_p2n_reg_t CXL_SLBIE_An = {0x040};
static const cxl_p2n_reg_t CXL_SLBIA_An = {0x048};
static const cxl_p2n_reg_t CXL_SLBI_Select_An = {0x050};
-/* Interrupt Registers */
+/* Interrupt Registers - CAIA 1&2 */
static const cxl_p2n_reg_t CXL_PSL_DSISR_An = {0x060};
static const cxl_p2n_reg_t CXL_PSL_DAR_An = {0x068};
static const cxl_p2n_reg_t CXL_PSL_DSR_An = {0x070};
static const cxl_p2n_reg_t CXL_PSL_TFC_An = {0x078};
static const cxl_p2n_reg_t CXL_PSL_PEHandle_An = {0x080};
static const cxl_p2n_reg_t CXL_PSL_ErrStat_An = {0x088};
-/* AFU Registers */
+/* AFU Registers - CAIA 1&2 */
static const cxl_p2n_reg_t CXL_AFU_Cntl_An = {0x090};
static const cxl_p2n_reg_t CXL_AFU_ERR_An = {0x098};
-/* Work Element Descriptor */
+/* Work Element Descriptor - CAIA 1&2 */
static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0};
/* 0x0C0:FFF Implementation Dependent Area */
@@ -179,6 +201,10 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0};
#define CXL_PSL_SR_An_SF MSR_SF /* 64bit */
#define CXL_PSL_SR_An_TA (1ull << (63-1)) /* Tags active, GA1: 0 */
#define CXL_PSL_SR_An_HV MSR_HV /* Hypervisor, GA1: 0 */
+#define CXL_PSL_SR_An_XLAT_hpt (0ull << (63-6))/* Hashed page table (HPT) mode */
+#define CXL_PSL_SR_An_XLAT_roh (2ull << (63-6))/* Radix on HPT mode */
+#define CXL_PSL_SR_An_XLAT_ror (3ull << (63-6))/* Radix on Radix mode */
+#define CXL_PSL_SR_An_BOT (1ull << (63-10)) /* Use the in-memory segment table */
#define CXL_PSL_SR_An_PR MSR_PR /* Problem state, GA1: 1 */
#define CXL_PSL_SR_An_ISL (1ull << (63-53)) /* Ignore Segment Large Page */
#define CXL_PSL_SR_An_TC (1ull << (63-54)) /* Page Table secondary hash */
@@ -202,6 +228,24 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0};
#define CXL_PSL_SERR_An_llcmdto (1ull << (63-6))
#define CXL_PSL_SERR_An_afupar (1ull << (63-7))
#define CXL_PSL_SERR_An_afudup (1ull << (63-8))
+#define CXL_PSL_SERR_An_IRQS ( \
+ CXL_PSL_SERR_An_afuto | CXL_PSL_SERR_An_afudis | CXL_PSL_SERR_An_afuov | \
+ CXL_PSL_SERR_An_badsrc | CXL_PSL_SERR_An_badctx | CXL_PSL_SERR_An_llcmdis | \
+ CXL_PSL_SERR_An_llcmdto | CXL_PSL_SERR_An_afupar | CXL_PSL_SERR_An_afudup)
+#define CXL_PSL_SERR_An_afuto_mask (1ull << (63-32))
+#define CXL_PSL_SERR_An_afudis_mask (1ull << (63-33))
+#define CXL_PSL_SERR_An_afuov_mask (1ull << (63-34))
+#define CXL_PSL_SERR_An_badsrc_mask (1ull << (63-35))
+#define CXL_PSL_SERR_An_badctx_mask (1ull << (63-36))
+#define CXL_PSL_SERR_An_llcmdis_mask (1ull << (63-37))
+#define CXL_PSL_SERR_An_llcmdto_mask (1ull << (63-38))
+#define CXL_PSL_SERR_An_afupar_mask (1ull << (63-39))
+#define CXL_PSL_SERR_An_afudup_mask (1ull << (63-40))
+#define CXL_PSL_SERR_An_IRQ_MASKS ( \
+ CXL_PSL_SERR_An_afuto_mask | CXL_PSL_SERR_An_afudis_mask | CXL_PSL_SERR_An_afuov_mask | \
+ CXL_PSL_SERR_An_badsrc_mask | CXL_PSL_SERR_An_badctx_mask | CXL_PSL_SERR_An_llcmdis_mask | \
+ CXL_PSL_SERR_An_llcmdto_mask | CXL_PSL_SERR_An_afupar_mask | CXL_PSL_SERR_An_afudup_mask)
+
#define CXL_PSL_SERR_An_AE (1ull << (63-30))
/****** CXL_PSL_SCNTL_An ****************************************************/
@@ -257,7 +301,7 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0};
#define CXL_SSTP1_An_STVA_L_MASK (~((1ull << (63-55))-1))
#define CXL_SSTP1_An_V (1ull << (63-63))
-/****** CXL_PSL_SLBIE_[An] **************************************************/
+/****** CXL_PSL_SLBIE_[An] - CAIA 1 **************************************************/
/* write: */
#define CXL_SLBIE_C PPC_BIT(36) /* Class */
#define CXL_SLBIE_SS PPC_BITMASK(37, 38) /* Segment Size */
@@ -267,10 +311,10 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0};
#define CXL_SLBIE_MAX PPC_BITMASK(24, 31)
#define CXL_SLBIE_PENDING PPC_BITMASK(56, 63)
-/****** Common to all CXL_TLBIA/SLBIA_[An] **********************************/
+/****** Common to all CXL_TLBIA/SLBIA_[An] - CAIA 1 **********************************/
#define CXL_TLB_SLB_P (1ull) /* Pending (read) */
-/****** Common to all CXL_TLB/SLB_IA/IE_[An] registers **********************/
+/****** Common to all CXL_TLB/SLB_IA/IE_[An] registers - CAIA 1 **********************/
#define CXL_TLB_SLB_IQ_ALL (0ull) /* Inv qualifier */
#define CXL_TLB_SLB_IQ_LPID (1ull) /* Inv qualifier */
#define CXL_TLB_SLB_IQ_LPIDPID (3ull) /* Inv qualifier */
@@ -278,7 +322,7 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0};
/****** CXL_PSL_AFUSEL ******************************************************/
#define CXL_PSL_AFUSEL_A (1ull << (63-55)) /* Adapter wide invalidates affect all AFUs */
-/****** CXL_PSL_DSISR_An ****************************************************/
+/****** CXL_PSL_DSISR_An - CAIA 1 ****************************************************/
#define CXL_PSL_DSISR_An_DS (1ull << (63-0)) /* Segment not found */
#define CXL_PSL_DSISR_An_DM (1ull << (63-1)) /* PTE not found (See also: M) or protection fault */
#define CXL_PSL_DSISR_An_ST (1ull << (63-2)) /* Segment Table PTE not found */
@@ -295,12 +339,39 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An = {0x0A0};
#define CXL_PSL_DSISR_An_S DSISR_ISSTORE /* Access was afu_wr or afu_zero */
#define CXL_PSL_DSISR_An_K DSISR_KEYFAULT /* Access not permitted by virtual page class key protection */
+/****** CXL_PSL_DSISR_An - CAIA 2 ****************************************************/
+#define CXL_PSL9_DSISR_An_TF (1ull << (63-3)) /* Translation fault */
+#define CXL_PSL9_DSISR_An_PE (1ull << (63-4)) /* PSL Error (implementation specific) */
+#define CXL_PSL9_DSISR_An_AE (1ull << (63-5)) /* AFU Error */
+#define CXL_PSL9_DSISR_An_OC (1ull << (63-6)) /* OS Context Warning */
+#define CXL_PSL9_DSISR_An_S (1ull << (63-38)) /* TF for a write operation */
+#define CXL_PSL9_DSISR_PENDING (CXL_PSL9_DSISR_An_TF | CXL_PSL9_DSISR_An_PE | CXL_PSL9_DSISR_An_AE | CXL_PSL9_DSISR_An_OC)
+/*
+ * NOTE: Bits 56:63 (Checkout Response Status) are valid when DSISR_An[TF] = 1
+ * Status (0:7) Encoding
+ */
+#define CXL_PSL9_DSISR_An_CO_MASK 0x00000000000000ffULL
+#define CXL_PSL9_DSISR_An_SF 0x0000000000000080ULL /* Segment Fault 0b10000000 */
+#define CXL_PSL9_DSISR_An_PF_SLR 0x0000000000000088ULL /* PTE not found (Single Level Radix) 0b10001000 */
+#define CXL_PSL9_DSISR_An_PF_RGC 0x000000000000008CULL /* PTE not found (Radix Guest (child)) 0b10001100 */
+#define CXL_PSL9_DSISR_An_PF_RGP 0x0000000000000090ULL /* PTE not found (Radix Guest (parent)) 0b10010000 */
+#define CXL_PSL9_DSISR_An_PF_HRH 0x0000000000000094ULL /* PTE not found (HPT/Radix Host) 0b10010100 */
+#define CXL_PSL9_DSISR_An_PF_STEG 0x000000000000009CULL /* PTE not found (STEG VA) 0b10011100 */
+
/****** CXL_PSL_TFC_An ******************************************************/
#define CXL_PSL_TFC_An_A (1ull << (63-28)) /* Acknowledge non-translation fault */
#define CXL_PSL_TFC_An_C (1ull << (63-29)) /* Continue (abort transaction) */
#define CXL_PSL_TFC_An_AE (1ull << (63-30)) /* Restart PSL with address error */
#define CXL_PSL_TFC_An_R (1ull << (63-31)) /* Restart PSL transaction */
+/****** CXL_XSL9_IERAT_ERAT - CAIA 2 **********************************/
+#define CXL_XSL9_IERAT_MLPID (1ull << (63-0)) /* Match LPID */
+#define CXL_XSL9_IERAT_MPID (1ull << (63-1)) /* Match PID */
+#define CXL_XSL9_IERAT_PRS (1ull << (63-4)) /* PRS bit for Radix invalidations */
+#define CXL_XSL9_IERAT_INVR (1ull << (63-3)) /* Invalidate Radix */
+#define CXL_XSL9_IERAT_IALL (1ull << (63-8)) /* Invalidate All */
+#define CXL_XSL9_IERAT_IINPROG (1ull << (63-63)) /* Invalidate in progress */
+
/* cxl_process_element->software_status */
#define CXL_PE_SOFTWARE_STATE_V (1ul << (31 - 0)) /* Valid */
#define CXL_PE_SOFTWARE_STATE_C (1ul << (31 - 29)) /* Complete */
@@ -482,8 +553,6 @@ struct cxl_context {
unsigned int sst_size, sst_lru;
wait_queue_head_t wq;
- /* pid of the group leader associated with the pid */
- struct pid *glpid;
/* use mm context associated with this pid for ds faults */
struct pid *pid;
spinlock_t lock; /* Protects pending_irq_mask, pending_fault and fault_addr */
@@ -551,15 +620,27 @@ struct cxl_context {
* CX4 only:
*/
struct list_head extra_irq_contexts;
+
+ struct mm_struct *mm;
};
+struct cxl_irq_info;
+
struct cxl_service_layer_ops {
int (*adapter_regs_init)(struct cxl *adapter, struct pci_dev *dev);
+ int (*invalidate_all)(struct cxl *adapter);
int (*afu_regs_init)(struct cxl_afu *afu);
+ int (*sanitise_afu_regs)(struct cxl_afu *afu);
int (*register_serr_irq)(struct cxl_afu *afu);
void (*release_serr_irq)(struct cxl_afu *afu);
- void (*debugfs_add_adapter_sl_regs)(struct cxl *adapter, struct dentry *dir);
- void (*debugfs_add_afu_sl_regs)(struct cxl_afu *afu, struct dentry *dir);
+ irqreturn_t (*handle_interrupt)(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info);
+ irqreturn_t (*fail_irq)(struct cxl_afu *afu, struct cxl_irq_info *irq_info);
+ int (*activate_dedicated_process)(struct cxl_afu *afu);
+ int (*attach_afu_directed)(struct cxl_context *ctx, u64 wed, u64 amr);
+ int (*attach_dedicated_process)(struct cxl_context *ctx, u64 wed, u64 amr);
+ void (*update_dedicated_ivtes)(struct cxl_context *ctx);
+ void (*debugfs_add_adapter_regs)(struct cxl *adapter, struct dentry *dir);
+ void (*debugfs_add_afu_regs)(struct cxl_afu *afu, struct dentry *dir);
void (*psl_irq_dump_registers)(struct cxl_context *ctx);
void (*err_irq_dump_registers)(struct cxl *adapter);
void (*debugfs_stop_trace)(struct cxl *adapter);
@@ -641,25 +722,38 @@ int cxl_pci_reset(struct cxl *adapter);
void cxl_pci_release_afu(struct device *dev);
ssize_t cxl_pci_read_adapter_vpd(struct cxl *adapter, void *buf, size_t len);
-/* common == phyp + powernv */
+/* common == phyp + powernv - CAIA 1&2 */
struct cxl_process_element_common {
__be32 tid;
__be32 pid;
__be64 csrp;
- __be64 aurp0;
- __be64 aurp1;
- __be64 sstp0;
- __be64 sstp1;
+ union {
+ struct {
+ __be64 aurp0;
+ __be64 aurp1;
+ __be64 sstp0;
+ __be64 sstp1;
+ } psl8; /* CAIA 1 */
+ struct {
+ u8 reserved2[8];
+ u8 reserved3[8];
+ u8 reserved4[8];
+ u8 reserved5[8];
+ } psl9; /* CAIA 2 */
+ } u;
__be64 amr;
- u8 reserved3[4];
+ u8 reserved6[4];
__be64 wed;
} __packed;
-/* just powernv */
+/* just powernv - CAIA 1&2 */
struct cxl_process_element {
__be64 sr;
__be64 SPOffset;
- __be64 sdr;
+ union {
+ __be64 sdr; /* CAIA 1 */
+ u8 reserved1[8]; /* CAIA 2 */
+ } u;
__be64 haurp;
__be32 ctxtime;
__be16 ivte_offsets[4];
@@ -739,6 +833,39 @@ static inline u64 cxl_p2n_read(struct cxl_afu *afu, cxl_p2n_reg_t reg)
return ~0ULL;
}
+static inline bool cxl_is_power8(void)
+{
+ if ((pvr_version_is(PVR_POWER8E)) ||
+ (pvr_version_is(PVR_POWER8NVL)) ||
+ (pvr_version_is(PVR_POWER8)))
+ return true;
+ return false;
+}
+
+static inline bool cxl_is_power9(void)
+{
+ /* intermediate solution */
+ if (!cxl_is_power8() &&
+ (cpu_has_feature(CPU_FTRS_POWER9) ||
+ cpu_has_feature(CPU_FTR_POWER9_DD1)))
+ return true;
+ return false;
+}
+
+static inline bool cxl_is_psl8(struct cxl_afu *afu)
+{
+ if (afu->adapter->caia_major == 1)
+ return true;
+ return false;
+}
+
+static inline bool cxl_is_psl9(struct cxl_afu *afu)
+{
+ if (afu->adapter->caia_major == 2)
+ return true;
+ return false;
+}
+
ssize_t cxl_pci_afu_read_err_buffer(struct cxl_afu *afu, char *buf,
loff_t off, size_t count);
@@ -765,7 +892,6 @@ int cxl_update_properties(struct device_node *dn, struct property *new_prop);
void cxl_remove_adapter_nr(struct cxl *adapter);
-int cxl_alloc_spa(struct cxl_afu *afu);
void cxl_release_spa(struct cxl_afu *afu);
dev_t cxl_get_dev(void);
@@ -803,6 +929,15 @@ int afu_register_irqs(struct cxl_context *ctx, u32 count);
void afu_release_irqs(struct cxl_context *ctx, void *cookie);
void afu_irq_name_free(struct cxl_context *ctx);
+int cxl_attach_afu_directed_psl9(struct cxl_context *ctx, u64 wed, u64 amr);
+int cxl_attach_afu_directed_psl8(struct cxl_context *ctx, u64 wed, u64 amr);
+int cxl_activate_dedicated_process_psl9(struct cxl_afu *afu);
+int cxl_activate_dedicated_process_psl8(struct cxl_afu *afu);
+int cxl_attach_dedicated_process_psl9(struct cxl_context *ctx, u64 wed, u64 amr);
+int cxl_attach_dedicated_process_psl8(struct cxl_context *ctx, u64 wed, u64 amr);
+void cxl_update_dedicated_ivtes_psl9(struct cxl_context *ctx);
+void cxl_update_dedicated_ivtes_psl8(struct cxl_context *ctx);
+
#ifdef CONFIG_DEBUG_FS
int cxl_debugfs_init(void);
@@ -811,10 +946,13 @@ int cxl_debugfs_adapter_add(struct cxl *adapter);
void cxl_debugfs_adapter_remove(struct cxl *adapter);
int cxl_debugfs_afu_add(struct cxl_afu *afu);
void cxl_debugfs_afu_remove(struct cxl_afu *afu);
-void cxl_stop_trace(struct cxl *cxl);
-void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir);
-void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter, struct dentry *dir);
-void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir);
+void cxl_stop_trace_psl9(struct cxl *cxl);
+void cxl_stop_trace_psl8(struct cxl *cxl);
+void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter, struct dentry *dir);
+void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter, struct dentry *dir);
+void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter, struct dentry *dir);
+void cxl_debugfs_add_afu_regs_psl9(struct cxl_afu *afu, struct dentry *dir);
+void cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct dentry *dir);
#else /* CONFIG_DEBUG_FS */
@@ -845,21 +983,34 @@ static inline void cxl_debugfs_afu_remove(struct cxl_afu *afu)
{
}
-static inline void cxl_stop_trace(struct cxl *cxl)
+static inline void cxl_stop_trace_psl9(struct cxl *cxl)
{
}
-static inline void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter,
+static inline void cxl_stop_trace_psl8(struct cxl *cxl)
+{
+}
+
+static inline void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter,
struct dentry *dir)
{
}
-static inline void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter,
+static inline void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter,
struct dentry *dir)
{
}
-static inline void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir)
+static inline void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter,
+ struct dentry *dir)
+{
+}
+
+static inline void cxl_debugfs_add_afu_regs_psl9(struct cxl_afu *afu, struct dentry *dir)
+{
+}
+
+static inline void cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct dentry *dir)
{
}
@@ -888,27 +1039,15 @@ int __detach_context(struct cxl_context *ctx);
/*
* This must match the layout of the H_COLLECT_CA_INT_INFO retbuf defined
* in PAPR.
- * A word about endianness: a pointer to this structure is passed when
- * calling the hcall. However, it is not a block of memory filled up by
- * the hypervisor. The return values are found in registers, and copied
- * one by one when returning from the hcall. See the end of the call to
- * plpar_hcall9() in hvCall.S
- * As a consequence:
- * - we don't need to do any endianness conversion
- * - the pid and tid are an exception. They are 32-bit values returned in
- * the same 64-bit register. So we do need to worry about byte ordering.
+ * Field pid_tid is now 'reserved' because it's no more used on bare-metal.
+ * On a guest environment, PSL_PID_An is located on the upper 32 bits and
+ * PSL_TID_An register in the lower 32 bits.
*/
struct cxl_irq_info {
u64 dsisr;
u64 dar;
u64 dsr;
-#ifndef CONFIG_CPU_LITTLE_ENDIAN
- u32 pid;
- u32 tid;
-#else
- u32 tid;
- u32 pid;
-#endif
+ u64 reserved;
u64 afu_err;
u64 errstat;
u64 proc_handle;
@@ -916,19 +1055,23 @@ struct cxl_irq_info {
};
void cxl_assign_psn_space(struct cxl_context *ctx);
-irqreturn_t cxl_irq(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info);
+int cxl_invalidate_all_psl9(struct cxl *adapter);
+int cxl_invalidate_all_psl8(struct cxl *adapter);
+irqreturn_t cxl_irq_psl9(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info);
+irqreturn_t cxl_irq_psl8(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info);
+irqreturn_t cxl_fail_irq_psl(struct cxl_afu *afu, struct cxl_irq_info *irq_info);
int cxl_register_one_irq(struct cxl *adapter, irq_handler_t handler,
void *cookie, irq_hw_number_t *dest_hwirq,
unsigned int *dest_virq, const char *name);
int cxl_check_error(struct cxl_afu *afu);
int cxl_afu_slbia(struct cxl_afu *afu);
-int cxl_tlb_slb_invalidate(struct cxl *adapter);
int cxl_data_cache_flush(struct cxl *adapter);
int cxl_afu_disable(struct cxl_afu *afu);
int cxl_psl_purge(struct cxl_afu *afu);
-void cxl_native_psl_irq_dump_regs(struct cxl_context *ctx);
+void cxl_native_irq_dump_regs_psl9(struct cxl_context *ctx);
+void cxl_native_irq_dump_regs_psl8(struct cxl_context *ctx);
void cxl_native_err_irq_dump_regs(struct cxl *adapter);
int cxl_pci_vphb_add(struct cxl_afu *afu);
void cxl_pci_vphb_remove(struct cxl_afu *afu);
@@ -1024,4 +1167,10 @@ int cxl_adapter_context_lock(struct cxl *adapter);
/* Unlock the contexts-lock if taken. Warn and force unlock otherwise */
void cxl_adapter_context_unlock(struct cxl *adapter);
+/* Increases the reference count to "struct mm_struct" */
+void cxl_context_mm_count_get(struct cxl_context *ctx);
+
+/* Decrements the reference count to "struct mm_struct" */
+void cxl_context_mm_count_put(struct cxl_context *ctx);
+
#endif
diff --git a/drivers/misc/cxl/debugfs.c b/drivers/misc/cxl/debugfs.c
index 9c06ac8fa5ac..eae9d749f967 100644
--- a/drivers/misc/cxl/debugfs.c
+++ b/drivers/misc/cxl/debugfs.c
@@ -15,7 +15,13 @@
static struct dentry *cxl_debugfs;
-void cxl_stop_trace(struct cxl *adapter)
+void cxl_stop_trace_psl9(struct cxl *adapter)
+{
+ /* Stop the trace */
+ cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x4480000000000000ULL);
+}
+
+void cxl_stop_trace_psl8(struct cxl *adapter)
{
int slice;
@@ -53,7 +59,15 @@ static struct dentry *debugfs_create_io_x64(const char *name, umode_t mode,
(void __force *)value, &fops_io_x64);
}
-void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir)
+void cxl_debugfs_add_adapter_regs_psl9(struct cxl *adapter, struct dentry *dir)
+{
+ debugfs_create_io_x64("fir1", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR1));
+ debugfs_create_io_x64("fir2", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR2));
+ debugfs_create_io_x64("fir_cntl", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_FIR_CNTL));
+ debugfs_create_io_x64("trace", S_IRUSR | S_IWUSR, dir, _cxl_p1_addr(adapter, CXL_PSL9_TRACECFG));
+}
+
+void cxl_debugfs_add_adapter_regs_psl8(struct cxl *adapter, struct dentry *dir)
{
debugfs_create_io_x64("fir1", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_FIR1));
debugfs_create_io_x64("fir2", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_FIR2));
@@ -61,7 +75,7 @@ void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir)
debugfs_create_io_x64("trace", S_IRUSR | S_IWUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_TRACE));
}
-void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter, struct dentry *dir)
+void cxl_debugfs_add_adapter_regs_xsl(struct cxl *adapter, struct dentry *dir)
{
debugfs_create_io_x64("fec", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_XSL_FEC));
}
@@ -82,8 +96,8 @@ int cxl_debugfs_adapter_add(struct cxl *adapter)
debugfs_create_io_x64("err_ivte", S_IRUSR, dir, _cxl_p1_addr(adapter, CXL_PSL_ErrIVTE));
- if (adapter->native->sl_ops->debugfs_add_adapter_sl_regs)
- adapter->native->sl_ops->debugfs_add_adapter_sl_regs(adapter, dir);
+ if (adapter->native->sl_ops->debugfs_add_adapter_regs)
+ adapter->native->sl_ops->debugfs_add_adapter_regs(adapter, dir);
return 0;
}
@@ -92,8 +106,16 @@ void cxl_debugfs_adapter_remove(struct cxl *adapter)
debugfs_remove_recursive(adapter->debugfs);
}
-void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir)
+void cxl_debugfs_add_afu_regs_psl9(struct cxl_afu *afu, struct dentry *dir)
{
+ debugfs_create_io_x64("serr", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_SERR_An));
+}
+
+void cxl_debugfs_add_afu_regs_psl8(struct cxl_afu *afu, struct dentry *dir)
+{
+ debugfs_create_io_x64("sstp0", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_SSTP0_An));
+ debugfs_create_io_x64("sstp1", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_SSTP1_An));
+
debugfs_create_io_x64("fir", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_FIR_SLICE_An));
debugfs_create_io_x64("serr", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_SERR_An));
debugfs_create_io_x64("afu_debug", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_AFU_DEBUG_An));
@@ -117,12 +139,11 @@ int cxl_debugfs_afu_add(struct cxl_afu *afu)
debugfs_create_io_x64("sr", S_IRUSR, dir, _cxl_p1n_addr(afu, CXL_PSL_SR_An));
debugfs_create_io_x64("dsisr", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_PSL_DSISR_An));
debugfs_create_io_x64("dar", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_PSL_DAR_An));
- debugfs_create_io_x64("sstp0", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_SSTP0_An));
- debugfs_create_io_x64("sstp1", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_SSTP1_An));
+
debugfs_create_io_x64("err_status", S_IRUSR, dir, _cxl_p2n_addr(afu, CXL_PSL_ErrStat_An));
- if (afu->adapter->native->sl_ops->debugfs_add_afu_sl_regs)
- afu->adapter->native->sl_ops->debugfs_add_afu_sl_regs(afu, dir);
+ if (afu->adapter->native->sl_ops->debugfs_add_afu_regs)
+ afu->adapter->native->sl_ops->debugfs_add_afu_regs(afu, dir);
return 0;
}
diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
index 2fa015c05561..5344448f514e 100644
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -146,108 +146,67 @@ static void cxl_handle_page_fault(struct cxl_context *ctx,
return cxl_ack_ae(ctx);
}
- /*
- * update_mmu_cache() will not have loaded the hash since current->trap
- * is not a 0x400 or 0x300, so just call hash_page_mm() here.
- */
- access = _PAGE_PRESENT | _PAGE_READ;
- if (dsisr & CXL_PSL_DSISR_An_S)
- access |= _PAGE_WRITE;
-
- access |= _PAGE_PRIVILEGED;
- if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID))
- access &= ~_PAGE_PRIVILEGED;
-
- if (dsisr & DSISR_NOHPTE)
- inv_flags |= HPTE_NOHPTE_UPDATE;
-
- local_irq_save(flags);
- hash_page_mm(mm, dar, access, 0x300, inv_flags);
- local_irq_restore(flags);
-
+ if (!radix_enabled()) {
+ /*
+ * update_mmu_cache() will not have loaded the hash since current->trap
+ * is not a 0x400 or 0x300, so just call hash_page_mm() here.
+ */
+ access = _PAGE_PRESENT | _PAGE_READ;
+ if (dsisr & CXL_PSL_DSISR_An_S)
+ access |= _PAGE_WRITE;
+
+ access |= _PAGE_PRIVILEGED;
+ if ((!ctx->kernel) || (REGION_ID(dar) == USER_REGION_ID))
+ access &= ~_PAGE_PRIVILEGED;
+
+ if (dsisr & DSISR_NOHPTE)
+ inv_flags |= HPTE_NOHPTE_UPDATE;
+
+ local_irq_save(flags);
+ hash_page_mm(mm, dar, access, 0x300, inv_flags);
+ local_irq_restore(flags);
+ }
pr_devel("Page fault successfully handled for pe: %i!\n", ctx->pe);
cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_R, 0);
}
/*
- * Returns the mm_struct corresponding to the context ctx via ctx->pid
- * In case the task has exited we use the task group leader accessible
- * via ctx->glpid to find the next task in the thread group that has a
- * valid mm_struct associated with it. If a task with valid mm_struct
- * is found the ctx->pid is updated to use the task struct for subsequent
- * translations. In case no valid mm_struct is found in the task group to
- * service the fault a NULL is returned.
+ * Returns the mm_struct corresponding to the context ctx.
+ * mm_users == 0, the context may be in the process of being closed.
*/
static struct mm_struct *get_mem_context(struct cxl_context *ctx)
{
- struct task_struct *task = NULL;
- struct mm_struct *mm = NULL;
- struct pid *old_pid = ctx->pid;
-
- if (old_pid == NULL) {
- pr_warn("%s: Invalid context for pe=%d\n",
- __func__, ctx->pe);
+ if (ctx->mm == NULL)
return NULL;
- }
- task = get_pid_task(old_pid, PIDTYPE_PID);
+ if (!atomic_inc_not_zero(&ctx->mm->mm_users))
+ return NULL;
- /*
- * pid_alive may look racy but this saves us from costly
- * get_task_mm when the task is a zombie. In worst case
- * we may think a task is alive, which is about to die
- * but get_task_mm will return NULL.
- */
- if (task != NULL && pid_alive(task))
- mm = get_task_mm(task);
+ return ctx->mm;
+}
- /* release the task struct that was taken earlier */
- if (task)
- put_task_struct(task);
- else
- pr_devel("%s: Context owning pid=%i for pe=%i dead\n",
- __func__, pid_nr(old_pid), ctx->pe);
-
- /*
- * If we couldn't find the mm context then use the group
- * leader to iterate over the task group and find a task
- * that gives us mm_struct.
- */
- if (unlikely(mm == NULL && ctx->glpid != NULL)) {
-
- rcu_read_lock();
- task = pid_task(ctx->glpid, PIDTYPE_PID);
- if (task)
- do {
- mm = get_task_mm(task);
- if (mm) {
- ctx->pid = get_task_pid(task,
- PIDTYPE_PID);
- break;
- }
- task = next_thread(task);
- } while (task && !thread_group_leader(task));
- rcu_read_unlock();
-
- /* check if we switched pid */
- if (ctx->pid != old_pid) {
- if (mm)
- pr_devel("%s:pe=%i switch pid %i->%i\n",
- __func__, ctx->pe, pid_nr(old_pid),
- pid_nr(ctx->pid));
- else
- pr_devel("%s:Cannot find mm for pid=%i\n",
- __func__, pid_nr(old_pid));
-
- /* drop the reference to older pid */
- put_pid(old_pid);
- }
- }
+static bool cxl_is_segment_miss(struct cxl_context *ctx, u64 dsisr)
+{
+ if ((cxl_is_psl8(ctx->afu)) && (dsisr & CXL_PSL_DSISR_An_DS))
+ return true;
- return mm;
+ return false;
}
+static bool cxl_is_page_fault(struct cxl_context *ctx, u64 dsisr)
+{
+ if ((cxl_is_psl8(ctx->afu)) && (dsisr & CXL_PSL_DSISR_An_DM))
+ return true;
+
+ if ((cxl_is_psl9(ctx->afu)) &&
+ ((dsisr & CXL_PSL9_DSISR_An_CO_MASK) &
+ (CXL_PSL9_DSISR_An_PF_SLR | CXL_PSL9_DSISR_An_PF_RGC |
+ CXL_PSL9_DSISR_An_PF_RGP | CXL_PSL9_DSISR_An_PF_HRH |
+ CXL_PSL9_DSISR_An_PF_STEG)))
+ return true;
+ return false;
+}
void cxl_handle_fault(struct work_struct *fault_work)
{
@@ -282,7 +241,6 @@ void cxl_handle_fault(struct work_struct *fault_work)
if (!ctx->kernel) {
mm = get_mem_context(ctx);
- /* indicates all the thread in task group have exited */
if (mm == NULL) {
pr_devel("%s: unable to get mm for pe=%d pid=%i\n",
__func__, ctx->pe, pid_nr(ctx->pid));
@@ -294,9 +252,9 @@ void cxl_handle_fault(struct work_struct *fault_work)
}
}
- if (dsisr & CXL_PSL_DSISR_An_DS)
+ if (cxl_is_segment_miss(ctx, dsisr))
cxl_handle_segment_miss(ctx, mm, dar);
- else if (dsisr & CXL_PSL_DSISR_An_DM)
+ else if (cxl_is_page_fault(ctx, dsisr))
cxl_handle_page_fault(ctx, mm, dsisr, dar);
else
WARN(1, "cxl_handle_fault has nothing to handle\n");
diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
index e7139c76f961..17b433f1ce23 100644
--- a/drivers/misc/cxl/file.c
+++ b/drivers/misc/cxl/file.c
@@ -18,6 +18,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/sched/mm.h>
#include <asm/cputable.h>
#include <asm/current.h>
#include <asm/copro.h>
@@ -216,8 +217,16 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
* process is still accessible.
*/
ctx->pid = get_task_pid(current, PIDTYPE_PID);
- ctx->glpid = get_task_pid(current->group_leader, PIDTYPE_PID);
+ /* acquire a reference to the task's mm */
+ ctx->mm = get_task_mm(current);
+
+ /* ensure this mm_struct can't be freed */
+ cxl_context_mm_count_get(ctx);
+
+ /* decrement the use count */
+ if (ctx->mm)
+ mmput(ctx->mm);
trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr);
@@ -225,9 +234,9 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
amr))) {
afu_release_irqs(ctx, ctx);
cxl_adapter_context_put(ctx->afu->adapter);
- put_pid(ctx->glpid);
put_pid(ctx->pid);
- ctx->glpid = ctx->pid = NULL;
+ ctx->pid = NULL;
+ cxl_context_mm_count_put(ctx);
goto out;
}
diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c
index e04bc4ddfd74..f58b4b6c79f2 100644
--- a/drivers/misc/cxl/guest.c
+++ b/drivers/misc/cxl/guest.c
@@ -169,7 +169,7 @@ static irqreturn_t guest_psl_irq(int irq, void *data)
return IRQ_HANDLED;
}
- rc = cxl_irq(irq, ctx, &irq_info);
+ rc = cxl_irq_psl8(irq, ctx, &irq_info);
return rc;
}
@@ -551,13 +551,13 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
elem->common.tid = cpu_to_be32(0); /* Unused */
elem->common.pid = cpu_to_be32(pid);
elem->common.csrp = cpu_to_be64(0); /* disable */
- elem->common.aurp0 = cpu_to_be64(0); /* disable */
- elem->common.aurp1 = cpu_to_be64(0); /* disable */
+ elem->common.u.psl8.aurp0 = cpu_to_be64(0); /* disable */
+ elem->common.u.psl8.aurp1 = cpu_to_be64(0); /* disable */
cxl_prefault(ctx, wed);
- elem->common.sstp0 = cpu_to_be64(ctx->sstp0);
- elem->common.sstp1 = cpu_to_be64(ctx->sstp1);
+ elem->common.u.psl8.sstp0 = cpu_to_be64(ctx->sstp0);
+ elem->common.u.psl8.sstp1 = cpu_to_be64(ctx->sstp1);
/*
* Ensure we have at least one interrupt allocated to take faults for
diff --git a/drivers/misc/cxl/hcalls.c b/drivers/misc/cxl/hcalls.c
index d6d11f4056d7..9b8bb0f80c3b 100644
--- a/drivers/misc/cxl/hcalls.c
+++ b/drivers/misc/cxl/hcalls.c
@@ -413,9 +413,9 @@ long cxl_h_collect_int_info(u64 unit_address, u64 process_token,
switch (rc) {
case H_SUCCESS: /* The interrupt info is returned in return registers. */
- pr_devel("dsisr:%#llx, dar:%#llx, dsr:%#llx, pid:%u, tid:%u, afu_err:%#llx, errstat:%#llx\n",
- info->dsisr, info->dar, info->dsr, info->pid,
- info->tid, info->afu_err, info->errstat);
+ pr_devel("dsisr:%#llx, dar:%#llx, dsr:%#llx, pid_tid:%#llx, afu_err:%#llx, errstat:%#llx\n",
+ info->dsisr, info->dar, info->dsr, info->reserved,
+ info->afu_err, info->errstat);
return 0;
case H_PARAMETER: /* An incorrect parameter was supplied. */
return -EINVAL;
diff --git a/drivers/misc/cxl/irq.c b/drivers/misc/cxl/irq.c
index 1a402bbed687..ce08a9f22308 100644
--- a/drivers/misc/cxl/irq.c
+++ b/drivers/misc/cxl/irq.c
@@ -34,7 +34,58 @@ static irqreturn_t schedule_cxl_fault(struct cxl_context *ctx, u64 dsisr, u64 da
return IRQ_HANDLED;
}
-irqreturn_t cxl_irq(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info)
+irqreturn_t cxl_irq_psl9(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info)
+{
+ u64 dsisr, dar;
+
+ dsisr = irq_info->dsisr;
+ dar = irq_info->dar;
+
+ trace_cxl_psl9_irq(ctx, irq, dsisr, dar);
+
+ pr_devel("CXL interrupt %i for afu pe: %i DSISR: %#llx DAR: %#llx\n", irq, ctx->pe, dsisr, dar);
+
+ if (dsisr & CXL_PSL9_DSISR_An_TF) {
+ pr_devel("CXL interrupt: Scheduling translation fault handling for later (pe: %i)\n", ctx->pe);
+ return schedule_cxl_fault(ctx, dsisr, dar);
+ }
+
+ if (dsisr & CXL_PSL9_DSISR_An_PE)
+ return cxl_ops->handle_psl_slice_error(ctx, dsisr,
+ irq_info->errstat);
+ if (dsisr & CXL_PSL9_DSISR_An_AE) {
+ pr_devel("CXL interrupt: AFU Error 0x%016llx\n", irq_info->afu_err);
+
+ if (ctx->pending_afu_err) {
+ /*
+ * This shouldn't happen - the PSL treats these errors
+ * as fatal and will have reset the AFU, so there's not
+ * much point buffering multiple AFU errors.
+ * OTOH if we DO ever see a storm of these come in it's
+ * probably best that we log them somewhere:
+ */
+ dev_err_ratelimited(&ctx->afu->dev, "CXL AFU Error undelivered to pe %i: 0x%016llx\n",
+ ctx->pe, irq_info->afu_err);
+ } else {
+ spin_lock(&ctx->lock);
+ ctx->afu_err = irq_info->afu_err;
+ ctx->pending_afu_err = 1;
+ spin_unlock(&ctx->lock);
+
+ wake_up_all(&ctx->wq);
+ }
+
+ cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_A, 0);
+ return IRQ_HANDLED;
+ }
+ if (dsisr & CXL_PSL9_DSISR_An_OC)
+ pr_devel("CXL interrupt: OS Context Warning\n");
+
+ WARN(1, "Unhandled CXL PSL IRQ\n");
+ return IRQ_HANDLED;
+}
+
+irqreturn_t cxl_irq_psl8(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info)
{
u64 dsisr, dar;
diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c
index b0b6ed31918e..1703655072b1 100644
--- a/drivers/misc/cxl/main.c
+++ b/drivers/misc/cxl/main.c
@@ -59,16 +59,10 @@ int cxl_afu_slbia(struct cxl_afu *afu)
static inline void _cxl_slbia(struct cxl_context *ctx, struct mm_struct *mm)
{
- struct task_struct *task;
unsigned long flags;
- if (!(task = get_pid_task(ctx->pid, PIDTYPE_PID))) {
- pr_devel("%s unable to get task %i\n",
- __func__, pid_nr(ctx->pid));
- return;
- }
- if (task->mm != mm)
- goto out_put;
+ if (ctx->mm != mm)
+ return;
pr_devel("%s matched mm - card: %i afu: %i pe: %i\n", __func__,
ctx->afu->adapter->adapter_num, ctx->afu->slice, ctx->pe);
@@ -79,8 +73,6 @@ static inline void _cxl_slbia(struct cxl_context *ctx, struct mm_struct *mm)
spin_unlock_irqrestore(&ctx->sste_lock, flags);
mb();
cxl_afu_slbia(ctx->afu);
-out_put:
- put_task_struct(task);
}
static inline void cxl_slbia_core(struct mm_struct *mm)
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index 7ae710585267..871a2f09c718 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -95,12 +95,23 @@ int cxl_afu_disable(struct cxl_afu *afu)
/* This will disable as well as reset */
static int native_afu_reset(struct cxl_afu *afu)
{
+ int rc;
+ u64 serr;
+
pr_devel("AFU reset request\n");
- return afu_control(afu, CXL_AFU_Cntl_An_RA, 0,
+ rc = afu_control(afu, CXL_AFU_Cntl_An_RA, 0,
CXL_AFU_Cntl_An_RS_Complete | CXL_AFU_Cntl_An_ES_Disabled,
CXL_AFU_Cntl_An_RS_MASK | CXL_AFU_Cntl_An_ES_MASK,
false);
+
+ /* Re-enable any masked interrupts */
+ serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
+ serr &= ~CXL_PSL_SERR_An_IRQ_MASKS;
+ cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
+
+
+ return rc;
}
static int native_afu_check_and_enable(struct cxl_afu *afu)
@@ -120,6 +131,7 @@ int cxl_psl_purge(struct cxl_afu *afu)
u64 AFU_Cntl = cxl_p2n_read(afu, CXL_AFU_Cntl_An);
u64 dsisr, dar;
u64 start, end;
+ u64 trans_fault = 0x0ULL;
unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT);
int rc = 0;
@@ -127,6 +139,11 @@ int cxl_psl_purge(struct cxl_afu *afu)
pr_devel("PSL purge request\n");
+ if (cxl_is_psl8(afu))
+ trans_fault = CXL_PSL_DSISR_TRANS;
+ if (cxl_is_psl9(afu))
+ trans_fault = CXL_PSL9_DSISR_An_TF;
+
if (!cxl_ops->link_ok(afu->adapter, afu)) {
dev_warn(&afu->dev, "PSL Purge called with link down, ignoring\n");
rc = -EIO;
@@ -155,13 +172,17 @@ int cxl_psl_purge(struct cxl_afu *afu)
}
dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
- pr_devel_ratelimited("PSL purging... PSL_CNTL: 0x%016llx PSL_DSISR: 0x%016llx\n", PSL_CNTL, dsisr);
- if (dsisr & CXL_PSL_DSISR_TRANS) {
+ pr_devel_ratelimited("PSL purging... PSL_CNTL: 0x%016llx PSL_DSISR: 0x%016llx\n",
+ PSL_CNTL, dsisr);
+
+ if (dsisr & trans_fault) {
dar = cxl_p2n_read(afu, CXL_PSL_DAR_An);
- dev_notice(&afu->dev, "PSL purge terminating pending translation, DSISR: 0x%016llx, DAR: 0x%016llx\n", dsisr, dar);
+ dev_notice(&afu->dev, "PSL purge terminating pending translation, DSISR: 0x%016llx, DAR: 0x%016llx\n",
+ dsisr, dar);
cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_AE);
} else if (dsisr) {
- dev_notice(&afu->dev, "PSL purge acknowledging pending non-translation fault, DSISR: 0x%016llx\n", dsisr);
+ dev_notice(&afu->dev, "PSL purge acknowledging pending non-translation fault, DSISR: 0x%016llx\n",
+ dsisr);
cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_A);
} else {
cpu_relax();
@@ -196,7 +217,7 @@ static int spa_max_procs(int spa_size)
return ((spa_size / 8) - 96) / 17;
}
-int cxl_alloc_spa(struct cxl_afu *afu)
+static int cxl_alloc_spa(struct cxl_afu *afu, int mode)
{
unsigned spa_size;
@@ -209,7 +230,8 @@ int cxl_alloc_spa(struct cxl_afu *afu)
if (spa_size > 0x100000) {
dev_warn(&afu->dev, "num_of_processes too large for the SPA, limiting to %i (0x%x)\n",
afu->native->spa_max_procs, afu->native->spa_size);
- afu->num_procs = afu->native->spa_max_procs;
+ if (mode != CXL_MODE_DEDICATED)
+ afu->num_procs = afu->native->spa_max_procs;
break;
}
@@ -258,7 +280,37 @@ void cxl_release_spa(struct cxl_afu *afu)
}
}
-int cxl_tlb_slb_invalidate(struct cxl *adapter)
+/*
+ * Invalidation of all ERAT entries is no longer required by CAIA2. Use
+ * only for debug.
+ */
+int cxl_invalidate_all_psl9(struct cxl *adapter)
+{
+ unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT);
+ u64 ierat;
+
+ pr_devel("CXL adapter - invalidation of all ERAT entries\n");
+
+ /* Invalidates all ERAT entries for Radix or HPT */
+ ierat = CXL_XSL9_IERAT_IALL;
+ if (radix_enabled())
+ ierat |= CXL_XSL9_IERAT_INVR;
+ cxl_p1_write(adapter, CXL_XSL9_IERAT, ierat);
+
+ while (cxl_p1_read(adapter, CXL_XSL9_IERAT) & CXL_XSL9_IERAT_IINPROG) {
+ if (time_after_eq(jiffies, timeout)) {
+ dev_warn(&adapter->dev,
+ "WARNING: CXL adapter invalidation of all ERAT entries timed out!\n");
+ return -EBUSY;
+ }
+ if (!cxl_ops->link_ok(adapter, NULL))
+ return -EIO;
+ cpu_relax();
+ }
+ return 0;
+}
+
+int cxl_invalidate_all_psl8(struct cxl *adapter)
{
unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT);
@@ -466,7 +518,8 @@ static int remove_process_element(struct cxl_context *ctx)
if (!rc)
ctx->pe_inserted = false;
- slb_invalid(ctx);
+ if (cxl_is_power8())
+ slb_invalid(ctx);
pr_devel("%s Remove pe: %i finished\n", __func__, ctx->pe);
mutex_unlock(&ctx->afu->native->spa_mutex);
@@ -493,13 +546,14 @@ static int activate_afu_directed(struct cxl_afu *afu)
afu->num_procs = afu->max_procs_virtualised;
if (afu->native->spa == NULL) {
- if (cxl_alloc_spa(afu))
+ if (cxl_alloc_spa(afu, CXL_MODE_DIRECTED))
return -ENOMEM;
}
attach_spa(afu);
cxl_p1n_write(afu, CXL_PSL_SCNTL_An, CXL_PSL_SCNTL_An_PM_AFU);
- cxl_p1n_write(afu, CXL_PSL_AMOR_An, 0xFFFFFFFFFFFFFFFFULL);
+ if (cxl_is_power8())
+ cxl_p1n_write(afu, CXL_PSL_AMOR_An, 0xFFFFFFFFFFFFFFFFULL);
cxl_p1n_write(afu, CXL_PSL_ID_An, CXL_PSL_ID_An_F | CXL_PSL_ID_An_L);
afu->current_mode = CXL_MODE_DIRECTED;
@@ -542,10 +596,19 @@ static u64 calculate_sr(struct cxl_context *ctx)
sr |= (mfmsr() & MSR_SF) | CXL_PSL_SR_An_HV;
} else {
sr |= CXL_PSL_SR_An_PR | CXL_PSL_SR_An_R;
- sr &= ~(CXL_PSL_SR_An_HV);
+ if (radix_enabled())
+ sr |= CXL_PSL_SR_An_HV;
+ else
+ sr &= ~(CXL_PSL_SR_An_HV);
if (!test_tsk_thread_flag(current, TIF_32BIT))
sr |= CXL_PSL_SR_An_SF;
}
+ if (cxl_is_psl9(ctx->afu)) {
+ if (radix_enabled())
+ sr |= CXL_PSL_SR_An_XLAT_ror;
+ else
+ sr |= CXL_PSL_SR_An_XLAT_hpt;
+ }
return sr;
}
@@ -578,7 +641,71 @@ static void update_ivtes_directed(struct cxl_context *ctx)
WARN_ON(add_process_element(ctx));
}
-static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
+static int process_element_entry_psl9(struct cxl_context *ctx, u64 wed, u64 amr)
+{
+ u32 pid;
+
+ cxl_assign_psn_space(ctx);
+
+ ctx->elem->ctxtime = 0; /* disable */
+ ctx->elem->lpid = cpu_to_be32(mfspr(SPRN_LPID));
+ ctx->elem->haurp = 0; /* disable */
+
+ if (ctx->kernel)
+ pid = 0;
+ else {
+ if (ctx->mm == NULL) {
+ pr_devel("%s: unable to get mm for pe=%d pid=%i\n",
+ __func__, ctx->pe, pid_nr(ctx->pid));
+ return -EINVAL;
+ }
+ pid = ctx->mm->context.id;
+ }
+
+ ctx->elem->common.tid = 0;
+ ctx->elem->common.pid = cpu_to_be32(pid);
+
+ ctx->elem->sr = cpu_to_be64(calculate_sr(ctx));
+
+ ctx->elem->common.csrp = 0; /* disable */
+
+ cxl_prefault(ctx, wed);
+
+ /*
+ * Ensure we have the multiplexed PSL interrupt set up to take faults
+ * for kernel contexts that may not have allocated any AFU IRQs at all:
+ */
+ if (ctx->irqs.range[0] == 0) {
+ ctx->irqs.offset[0] = ctx->afu->native->psl_hwirq;
+ ctx->irqs.range[0] = 1;
+ }
+
+ ctx->elem->common.amr = cpu_to_be64(amr);
+ ctx->elem->common.wed = cpu_to_be64(wed);
+
+ return 0;
+}
+
+int cxl_attach_afu_directed_psl9(struct cxl_context *ctx, u64 wed, u64 amr)
+{
+ int result;
+
+ /* fill the process element entry */
+ result = process_element_entry_psl9(ctx, wed, amr);
+ if (result)
+ return result;
+
+ update_ivtes_directed(ctx);
+
+ /* first guy needs to enable */
+ result = cxl_ops->afu_check_and_enable(ctx->afu);
+ if (result)
+ return result;
+
+ return add_process_element(ctx);
+}
+
+int cxl_attach_afu_directed_psl8(struct cxl_context *ctx, u64 wed, u64 amr)
{
u32 pid;
int result;
@@ -588,7 +715,7 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
ctx->elem->ctxtime = 0; /* disable */
ctx->elem->lpid = cpu_to_be32(mfspr(SPRN_LPID));
ctx->elem->haurp = 0; /* disable */
- ctx->elem->sdr = cpu_to_be64(mfspr(SPRN_SDR1));
+ ctx->elem->u.sdr = cpu_to_be64(mfspr(SPRN_SDR1));
pid = current->pid;
if (ctx->kernel)
@@ -599,13 +726,13 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
ctx->elem->sr = cpu_to_be64(calculate_sr(ctx));
ctx->elem->common.csrp = 0; /* disable */
- ctx->elem->common.aurp0 = 0; /* disable */
- ctx->elem->common.aurp1 = 0; /* disable */
+ ctx->elem->common.u.psl8.aurp0 = 0; /* disable */
+ ctx->elem->common.u.psl8.aurp1 = 0; /* disable */
cxl_prefault(ctx, wed);
- ctx->elem->common.sstp0 = cpu_to_be64(ctx->sstp0);
- ctx->elem->common.sstp1 = cpu_to_be64(ctx->sstp1);
+ ctx->elem->common.u.psl8.sstp0 = cpu_to_be64(ctx->sstp0);
+ ctx->elem->common.u.psl8.sstp1 = cpu_to_be64(ctx->sstp1);
/*
* Ensure we have the multiplexed PSL interrupt set up to take faults
@@ -671,7 +798,33 @@ static int deactivate_afu_directed(struct cxl_afu *afu)
return 0;
}
-static int activate_dedicated_process(struct cxl_afu *afu)
+int cxl_activate_dedicated_process_psl9(struct cxl_afu *afu)
+{
+ dev_info(&afu->dev, "Activating dedicated process mode\n");
+
+ /*
+ * If XSL is set to dedicated mode (Set in PSL_SCNTL reg), the
+ * XSL and AFU are programmed to work with a single context.
+ * The context information should be configured in the SPA area
+ * index 0 (so PSL_SPAP must be configured before enabling the
+ * AFU).
+ */
+ afu->num_procs = 1;
+ if (afu->native->spa == NULL) {
+ if (cxl_alloc_spa(afu, CXL_MODE_DEDICATED))
+ return -ENOMEM;
+ }
+ attach_spa(afu);
+
+ cxl_p1n_write(afu, CXL_PSL_SCNTL_An, CXL_PSL_SCNTL_An_PM_Process);
+ cxl_p1n_write(afu, CXL_PSL_ID_An, CXL_PSL_ID_An_F | CXL_PSL_ID_An_L);
+
+ afu->current_mode = CXL_MODE_DEDICATED;
+
+ return cxl_chardev_d_afu_add(afu);
+}
+
+int cxl_activate_dedicated_process_psl8(struct cxl_afu *afu)
{
dev_info(&afu->dev, "Activating dedicated process mode\n");
@@ -694,7 +847,17 @@ static int activate_dedicated_process(struct cxl_afu *afu)
return cxl_chardev_d_afu_add(afu);
}
-static void update_ivtes_dedicated(struct cxl_context *ctx)
+void cxl_update_dedicated_ivtes_psl9(struct cxl_context *ctx)
+{
+ int r;
+
+ for (r = 0; r < CXL_IRQ_RANGES; r++) {
+ ctx->elem->ivte_offsets[r] = cpu_to_be16(ctx->irqs.offset[r]);
+ ctx->elem->ivte_ranges[r] = cpu_to_be16(ctx->irqs.range[r]);
+ }
+}
+
+void cxl_update_dedicated_ivtes_psl8(struct cxl_context *ctx)
{
struct cxl_afu *afu = ctx->afu;
@@ -710,7 +873,27 @@ static void update_ivtes_dedicated(struct cxl_context *ctx)
((u64)ctx->irqs.range[3] & 0xffff));
}
-static int attach_dedicated(struct cxl_context *ctx, u64 wed, u64 amr)
+int cxl_attach_dedicated_process_psl9(struct cxl_context *ctx, u64 wed, u64 amr)
+{
+ struct cxl_afu *afu = ctx->afu;
+ int result;
+
+ /* fill the process element entry */
+ result = process_element_entry_psl9(ctx, wed, amr);
+ if (result)
+ return result;
+
+ if (ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes)
+ afu->adapter->native->sl_ops->update_dedicated_ivtes(ctx);
+
+ result = cxl_ops->afu_reset(afu);
+ if (result)
+ return result;
+
+ return afu_enable(afu);
+}
+
+int cxl_attach_dedicated_process_psl8(struct cxl_context *ctx, u64 wed, u64 amr)
{
struct cxl_afu *afu = ctx->afu;
u64 pid;
@@ -728,7 +911,8 @@ static int attach_dedicated(struct cxl_context *ctx, u64 wed, u64 amr)
cxl_prefault(ctx, wed);
- update_ivtes_dedicated(ctx);
+ if (ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes)
+ afu->adapter->native->sl_ops->update_dedicated_ivtes(ctx);
cxl_p2n_write(afu, CXL_PSL_AMR_An, amr);
@@ -778,8 +962,9 @@ static int native_afu_activate_mode(struct cxl_afu *afu, int mode)
if (mode == CXL_MODE_DIRECTED)
return activate_afu_directed(afu);
- if (mode == CXL_MODE_DEDICATED)
- return activate_dedicated_process(afu);
+ if ((mode == CXL_MODE_DEDICATED) &&
+ (afu->adapter->native->sl_ops->activate_dedicated_process))
+ return afu->adapter->native->sl_ops->activate_dedicated_process(afu);
return -EINVAL;
}
@@ -793,11 +978,13 @@ static int native_attach_process(struct cxl_context *ctx, bool kernel,
}
ctx->kernel = kernel;
- if (ctx->afu->current_mode == CXL_MODE_DIRECTED)
- return attach_afu_directed(ctx, wed, amr);
+ if ((ctx->afu->current_mode == CXL_MODE_DIRECTED) &&
+ (ctx->afu->adapter->native->sl_ops->attach_afu_directed))
+ return ctx->afu->adapter->native->sl_ops->attach_afu_directed(ctx, wed, amr);
- if (ctx->afu->current_mode == CXL_MODE_DEDICATED)
- return attach_dedicated(ctx, wed, amr);
+ if ((ctx->afu->current_mode == CXL_MODE_DEDICATED) &&
+ (ctx->afu->adapter->native->sl_ops->attach_dedicated_process))
+ return ctx->afu->adapter->native->sl_ops->attach_dedicated_process(ctx, wed, amr);
return -EINVAL;
}
@@ -830,8 +1017,9 @@ static void native_update_ivtes(struct cxl_context *ctx)
{
if (ctx->afu->current_mode == CXL_MODE_DIRECTED)
return update_ivtes_directed(ctx);
- if (ctx->afu->current_mode == CXL_MODE_DEDICATED)
- return update_ivtes_dedicated(ctx);
+ if ((ctx->afu->current_mode == CXL_MODE_DEDICATED) &&
+ (ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes))
+ return ctx->afu->adapter->native->sl_ops->update_dedicated_ivtes(ctx);
WARN(1, "native_update_ivtes: Bad mode\n");
}
@@ -859,8 +1047,6 @@ static int native_detach_process(struct cxl_context *ctx)
static int native_get_irq_info(struct cxl_afu *afu, struct cxl_irq_info *info)
{
- u64 pidtid;
-
/* If the adapter has gone away, we can't get any meaningful
* information.
*/
@@ -869,10 +1055,8 @@ static int native_get_irq_info(struct cxl_afu *afu, struct cxl_irq_info *info)
info->dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
info->dar = cxl_p2n_read(afu, CXL_PSL_DAR_An);
- info->dsr = cxl_p2n_read(afu, CXL_PSL_DSR_An);
- pidtid = cxl_p2n_read(afu, CXL_PSL_PID_TID_An);
- info->pid = pidtid >> 32;
- info->tid = pidtid & 0xffffffff;
+ if (cxl_is_power8())
+ info->dsr = cxl_p2n_read(afu, CXL_PSL_DSR_An);
info->afu_err = cxl_p2n_read(afu, CXL_AFU_ERR_An);
info->errstat = cxl_p2n_read(afu, CXL_PSL_ErrStat_An);
info->proc_handle = 0;
@@ -880,7 +1064,22 @@ static int native_get_irq_info(struct cxl_afu *afu, struct cxl_irq_info *info)
return 0;
}
-void cxl_native_psl_irq_dump_regs(struct cxl_context *ctx)
+void cxl_native_irq_dump_regs_psl9(struct cxl_context *ctx)
+{
+ u64 fir1, fir2, serr;
+
+ fir1 = cxl_p1_read(ctx->afu->adapter, CXL_PSL9_FIR1);
+ fir2 = cxl_p1_read(ctx->afu->adapter, CXL_PSL9_FIR2);
+
+ dev_crit(&ctx->afu->dev, "PSL_FIR1: 0x%016llx\n", fir1);
+ dev_crit(&ctx->afu->dev, "PSL_FIR2: 0x%016llx\n", fir2);
+ if (ctx->afu->adapter->native->sl_ops->register_serr_irq) {
+ serr = cxl_p1n_read(ctx->afu, CXL_PSL_SERR_An);
+ cxl_afu_decode_psl_serr(ctx->afu, serr);
+ }
+}
+
+void cxl_native_irq_dump_regs_psl8(struct cxl_context *ctx)
{
u64 fir1, fir2, fir_slice, serr, afu_debug;
@@ -916,9 +1115,20 @@ static irqreturn_t native_handle_psl_slice_error(struct cxl_context *ctx,
return cxl_ops->ack_irq(ctx, 0, errstat);
}
-static irqreturn_t fail_psl_irq(struct cxl_afu *afu, struct cxl_irq_info *irq_info)
+static bool cxl_is_translation_fault(struct cxl_afu *afu, u64 dsisr)
+{
+ if ((cxl_is_psl8(afu)) && (dsisr & CXL_PSL_DSISR_TRANS))
+ return true;
+
+ if ((cxl_is_psl9(afu)) && (dsisr & CXL_PSL9_DSISR_An_TF))
+ return true;
+
+ return false;
+}
+
+irqreturn_t cxl_fail_irq_psl(struct cxl_afu *afu, struct cxl_irq_info *irq_info)
{
- if (irq_info->dsisr & CXL_PSL_DSISR_TRANS)
+ if (cxl_is_translation_fault(afu, irq_info->dsisr))
cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_AE);
else
cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_A);
@@ -932,7 +1142,7 @@ static irqreturn_t native_irq_multiplexed(int irq, void *data)
struct cxl_context *ctx;
struct cxl_irq_info irq_info;
u64 phreg = cxl_p2n_read(afu, CXL_PSL_PEHandle_An);
- int ph, ret;
+ int ph, ret = IRQ_HANDLED, res;
/* check if eeh kicked in while the interrupt was in flight */
if (unlikely(phreg == ~0ULL)) {
@@ -943,15 +1153,18 @@ static irqreturn_t native_irq_multiplexed(int irq, void *data)
}
/* Mask the pe-handle from register value */
ph = phreg & 0xffff;
- if ((ret = native_get_irq_info(afu, &irq_info))) {
- WARN(1, "Unable to get CXL IRQ Info: %i\n", ret);
- return fail_psl_irq(afu, &irq_info);
+ if ((res = native_get_irq_info(afu, &irq_info))) {
+ WARN(1, "Unable to get CXL IRQ Info: %i\n", res);
+ if (afu->adapter->native->sl_ops->fail_irq)
+ return afu->adapter->native->sl_ops->fail_irq(afu, &irq_info);
+ return ret;
}
rcu_read_lock();
ctx = idr_find(&afu->contexts_idr, ph);
if (ctx) {
- ret = cxl_irq(irq, ctx, &irq_info);
+ if (afu->adapter->native->sl_ops->handle_interrupt)
+ ret = afu->adapter->native->sl_ops->handle_interrupt(irq, ctx, &irq_info);
rcu_read_unlock();
return ret;
}
@@ -961,7 +1174,9 @@ static irqreturn_t native_irq_multiplexed(int irq, void *data)
" %016llx\n(Possible AFU HW issue - was a term/remove acked"
" with outstanding transactions?)\n", ph, irq_info.dsisr,
irq_info.dar);
- return fail_psl_irq(afu, &irq_info);
+ if (afu->adapter->native->sl_ops->fail_irq)
+ ret = afu->adapter->native->sl_ops->fail_irq(afu, &irq_info);
+ return ret;
}
static void native_irq_wait(struct cxl_context *ctx)
@@ -979,7 +1194,11 @@ static void native_irq_wait(struct cxl_context *ctx)
if (ph != ctx->pe)
return;
dsisr = cxl_p2n_read(ctx->afu, CXL_PSL_DSISR_An);
- if ((dsisr & CXL_PSL_DSISR_PENDING) == 0)
+ if (cxl_is_psl8(ctx->afu) &&
+ ((dsisr & CXL_PSL_DSISR_PENDING) == 0))
+ return;
+ if (cxl_is_psl9(ctx->afu) &&
+ ((dsisr & CXL_PSL9_DSISR_PENDING) == 0))
return;
/*
* We are waiting for the workqueue to process our
@@ -996,25 +1215,33 @@ static void native_irq_wait(struct cxl_context *ctx)
static irqreturn_t native_slice_irq_err(int irq, void *data)
{
struct cxl_afu *afu = data;
- u64 fir_slice, errstat, serr, afu_debug, afu_error, dsisr;
+ u64 errstat, serr, afu_error, dsisr;
+ u64 fir_slice, afu_debug, irq_mask;
/*
* slice err interrupt is only used with full PSL (no XSL)
*/
serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
- fir_slice = cxl_p1n_read(afu, CXL_PSL_FIR_SLICE_An);
errstat = cxl_p2n_read(afu, CXL_PSL_ErrStat_An);
- afu_debug = cxl_p1n_read(afu, CXL_AFU_DEBUG_An);
afu_error = cxl_p2n_read(afu, CXL_AFU_ERR_An);
dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
cxl_afu_decode_psl_serr(afu, serr);
- dev_crit(&afu->dev, "PSL_FIR_SLICE_An: 0x%016llx\n", fir_slice);
+
+ if (cxl_is_power8()) {
+ fir_slice = cxl_p1n_read(afu, CXL_PSL_FIR_SLICE_An);
+ afu_debug = cxl_p1n_read(afu, CXL_AFU_DEBUG_An);
+ dev_crit(&afu->dev, "PSL_FIR_SLICE_An: 0x%016llx\n", fir_slice);
+ dev_crit(&afu->dev, "CXL_PSL_AFU_DEBUG_An: 0x%016llx\n", afu_debug);
+ }
dev_crit(&afu->dev, "CXL_PSL_ErrStat_An: 0x%016llx\n", errstat);
- dev_crit(&afu->dev, "CXL_PSL_AFU_DEBUG_An: 0x%016llx\n", afu_debug);
dev_crit(&afu->dev, "AFU_ERR_An: 0x%.16llx\n", afu_error);
dev_crit(&afu->dev, "PSL_DSISR_An: 0x%.16llx\n", dsisr);
+ /* mask off the IRQ so it won't retrigger until the AFU is reset */
+ irq_mask = (serr & CXL_PSL_SERR_An_IRQS) >> 32;
+ serr |= irq_mask;
cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
+ dev_info(&afu->dev, "Further such interrupts will be masked until the AFU is reset\n");
return IRQ_HANDLED;
}
@@ -1103,7 +1330,15 @@ int cxl_native_register_serr_irq(struct cxl_afu *afu)
}
serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
- serr = (serr & 0x00ffffffffff0000ULL) | (afu->serr_hwirq & 0xffff);
+ if (cxl_is_power8())
+ serr = (serr & 0x00ffffffffff0000ULL) | (afu->serr_hwirq & 0xffff);
+ if (cxl_is_power9()) {
+ /*
+ * By default, all errors are masked. So don't set all masks.
+ * Slice errors will be transfered.
+ */
+ serr = (serr & ~0xff0000007fffffffULL) | (afu->serr_hwirq & 0xffff);
+ }
cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
return 0;
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index b27ea98b781f..6dc1ee5b92c9 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -60,7 +60,7 @@
#define CXL_VSEC_PROTOCOL_MASK 0xe0
#define CXL_VSEC_PROTOCOL_1024TB 0x80
#define CXL_VSEC_PROTOCOL_512TB 0x40
-#define CXL_VSEC_PROTOCOL_256TB 0x20 /* Power 8 uses this */
+#define CXL_VSEC_PROTOCOL_256TB 0x20 /* Power 8/9 uses this */
#define CXL_VSEC_PROTOCOL_ENABLE 0x01
#define CXL_READ_VSEC_PSL_REVISION(dev, vsec, dest) \
@@ -123,6 +123,8 @@ static const struct pci_device_id cxl_pci_tbl[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x044b), },
{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x04cf), },
{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0601), },
+ { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0623), },
+ { PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0628), },
{ PCI_DEVICE_CLASS(0x120000, ~0), },
{ }
@@ -324,38 +326,59 @@ static void dump_afu_descriptor(struct cxl_afu *afu)
#undef show_reg
}
-#define CAPP_UNIT0_ID 0xBA
-#define CAPP_UNIT1_ID 0XBE
+#define P8_CAPP_UNIT0_ID 0xBA
+#define P8_CAPP_UNIT1_ID 0XBE
+#define P9_CAPP_UNIT0_ID 0xC0
+#define P9_CAPP_UNIT1_ID 0xE0
-static u64 get_capp_unit_id(struct device_node *np)
+static int get_phb_index(struct device_node *np, u32 *phb_index)
{
- u32 phb_index;
+ if (of_property_read_u32(np, "ibm,phb-index", phb_index))
+ return -ENODEV;
+ return 0;
+}
+static u64 get_capp_unit_id(struct device_node *np, u32 phb_index)
+{
/*
- * For chips other than POWER8NVL, we only have CAPP 0,
- * irrespective of which PHB is used.
+ * POWER 8:
+ * - For chips other than POWER8NVL, we only have CAPP 0,
+ * irrespective of which PHB is used.
+ * - For POWER8NVL, assume CAPP 0 is attached to PHB0 and
+ * CAPP 1 is attached to PHB1.
*/
- if (!pvr_version_is(PVR_POWER8NVL))
- return CAPP_UNIT0_ID;
+ if (cxl_is_power8()) {
+ if (!pvr_version_is(PVR_POWER8NVL))
+ return P8_CAPP_UNIT0_ID;
+
+ if (phb_index == 0)
+ return P8_CAPP_UNIT0_ID;
+
+ if (phb_index == 1)
+ return P8_CAPP_UNIT1_ID;
+ }
/*
- * For POWER8NVL, assume CAPP 0 is attached to PHB0 and
- * CAPP 1 is attached to PHB1.
+ * POWER 9:
+ * PEC0 (PHB0). Capp ID = CAPP0 (0b1100_0000)
+ * PEC1 (PHB1 - PHB2). No capi mode
+ * PEC2 (PHB3 - PHB4 - PHB5): Capi mode on PHB3 only. Capp ID = CAPP1 (0b1110_0000)
*/
- if (of_property_read_u32(np, "ibm,phb-index", &phb_index))
- return 0;
+ if (cxl_is_power9()) {
+ if (phb_index == 0)
+ return P9_CAPP_UNIT0_ID;
- if (phb_index == 0)
- return CAPP_UNIT0_ID;
-
- if (phb_index == 1)
- return CAPP_UNIT1_ID;
+ if (phb_index == 3)
+ return P9_CAPP_UNIT1_ID;
+ }
return 0;
}
-static int calc_capp_routing(struct pci_dev *dev, u64 *chipid, u64 *capp_unit_id)
+static int calc_capp_routing(struct pci_dev *dev, u64 *chipid,
+ u32 *phb_index, u64 *capp_unit_id)
{
+ int rc;
struct device_node *np;
const __be32 *prop;
@@ -366,8 +389,16 @@ static int calc_capp_routing(struct pci_dev *dev, u64 *chipid, u64 *capp_unit_id
np = of_get_next_parent(np);
if (!np)
return -ENODEV;
+
*chipid = be32_to_cpup(prop);
- *capp_unit_id = get_capp_unit_id(np);
+
+ rc = get_phb_index(np, phb_index);
+ if (rc) {
+ pr_err("cxl: invalid phb index\n");
+ return rc;
+ }
+
+ *capp_unit_id = get_capp_unit_id(np, *phb_index);
of_node_put(np);
if (!*capp_unit_id) {
pr_err("cxl: invalid capp unit id\n");
@@ -377,14 +408,104 @@ static int calc_capp_routing(struct pci_dev *dev, u64 *chipid, u64 *capp_unit_id
return 0;
}
-static int init_implementation_adapter_psl_regs(struct cxl *adapter, struct pci_dev *dev)
+static int init_implementation_adapter_regs_psl9(struct cxl *adapter, struct pci_dev *dev)
+{
+ u64 xsl_dsnctl, psl_fircntl;
+ u64 chipid;
+ u32 phb_index;
+ u64 capp_unit_id;
+ int rc;
+
+ rc = calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id);
+ if (rc)
+ return rc;
+
+ /*
+ * CAPI Identifier bits [0:7]
+ * bit 61:60 MSI bits --> 0
+ * bit 59 TVT selector --> 0
+ */
+
+ /*
+ * Tell XSL where to route data to.
+ * The field chipid should match the PHB CAPI_CMPM register
+ */
+ xsl_dsnctl = ((u64)0x2 << (63-7)); /* Bit 57 */
+ xsl_dsnctl |= (capp_unit_id << (63-15));
+
+ /* nMMU_ID Defaults to: b’000001001’*/
+ xsl_dsnctl |= ((u64)0x09 << (63-28));
+
+ if (cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+ /*
+ * Used to identify CAPI packets which should be sorted into
+ * the Non-Blocking queues by the PHB. This field should match
+ * the PHB PBL_NBW_CMPM register
+ * nbwind=0x03, bits [57:58], must include capi indicator.
+ * Not supported on P9 DD1.
+ */
+ xsl_dsnctl |= ((u64)0x03 << (63-47));
+
+ /*
+ * Upper 16b address bits of ASB_Notify messages sent to the
+ * system. Need to match the PHB’s ASN Compare/Mask Register.
+ * Not supported on P9 DD1.
+ */
+ xsl_dsnctl |= ((u64)0x04 << (63-55));
+ }
+
+ cxl_p1_write(adapter, CXL_XSL9_DSNCTL, xsl_dsnctl);
+
+ /* Set fir_cntl to recommended value for production env */
+ psl_fircntl = (0x2ULL << (63-3)); /* ce_report */
+ psl_fircntl |= (0x1ULL << (63-6)); /* FIR_report */
+ psl_fircntl |= 0x1ULL; /* ce_thresh */
+ cxl_p1_write(adapter, CXL_PSL9_FIR_CNTL, psl_fircntl);
+
+ /* vccredits=0x1 pcklat=0x4 */
+ cxl_p1_write(adapter, CXL_PSL9_DSNDCTL, 0x0000000000001810ULL);
+
+ /*
+ * For debugging with trace arrays.
+ * Configure RX trace 0 segmented mode.
+ * Configure CT trace 0 segmented mode.
+ * Configure LA0 trace 0 segmented mode.
+ * Configure LA1 trace 0 segmented mode.
+ */
+ cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000000ULL);
+ cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000003ULL);
+ cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000005ULL);
+ cxl_p1_write(adapter, CXL_PSL9_TRACECFG, 0x8040800080000006ULL);
+
+ /*
+ * A response to an ASB_Notify request is returned by the
+ * system as an MMIO write to the address defined in
+ * the PSL_TNR_ADDR register
+ */
+ /* PSL_TNR_ADDR */
+
+ /* NORST */
+ cxl_p1_write(adapter, CXL_PSL9_DEBUG, 0x8000000000000000ULL);
+
+ /* allocate the apc machines */
+ cxl_p1_write(adapter, CXL_PSL9_APCDEDTYPE, 0x40000003FFFF0000ULL);
+
+ /* Disable vc dd1 fix */
+ if ((cxl_is_power9() && cpu_has_feature(CPU_FTR_POWER9_DD1)))
+ cxl_p1_write(adapter, CXL_PSL9_GP_CT, 0x0400000000000001ULL);
+
+ return 0;
+}
+
+static int init_implementation_adapter_regs_psl8(struct cxl *adapter, struct pci_dev *dev)
{
u64 psl_dsnctl, psl_fircntl;
u64 chipid;
+ u32 phb_index;
u64 capp_unit_id;
int rc;
- rc = calc_capp_routing(dev, &chipid, &capp_unit_id);
+ rc = calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id);
if (rc)
return rc;
@@ -409,14 +530,15 @@ static int init_implementation_adapter_psl_regs(struct cxl *adapter, struct pci_
return 0;
}
-static int init_implementation_adapter_xsl_regs(struct cxl *adapter, struct pci_dev *dev)
+static int init_implementation_adapter_regs_xsl(struct cxl *adapter, struct pci_dev *dev)
{
u64 xsl_dsnctl;
u64 chipid;
+ u32 phb_index;
u64 capp_unit_id;
int rc;
- rc = calc_capp_routing(dev, &chipid, &capp_unit_id);
+ rc = calc_capp_routing(dev, &chipid, &phb_index, &capp_unit_id);
if (rc)
return rc;
@@ -434,7 +556,13 @@ static int init_implementation_adapter_xsl_regs(struct cxl *adapter, struct pci_
/* For the PSL this is a multiple for 0 < n <= 7: */
#define PSL_2048_250MHZ_CYCLES 1
-static void write_timebase_ctrl_psl(struct cxl *adapter)
+static void write_timebase_ctrl_psl9(struct cxl *adapter)
+{
+ cxl_p1_write(adapter, CXL_PSL9_TB_CTLSTAT,
+ TBSYNC_CNT(2 * PSL_2048_250MHZ_CYCLES));
+}
+
+static void write_timebase_ctrl_psl8(struct cxl *adapter)
{
cxl_p1_write(adapter, CXL_PSL_TB_CTLSTAT,
TBSYNC_CNT(2 * PSL_2048_250MHZ_CYCLES));
@@ -455,7 +583,12 @@ static void write_timebase_ctrl_xsl(struct cxl *adapter)
TBSYNC_CNT(XSL_4000_CLOCKS));
}
-static u64 timebase_read_psl(struct cxl *adapter)
+static u64 timebase_read_psl9(struct cxl *adapter)
+{
+ return cxl_p1_read(adapter, CXL_PSL9_Timebase);
+}
+
+static u64 timebase_read_psl8(struct cxl *adapter)
{
return cxl_p1_read(adapter, CXL_PSL_Timebase);
}
@@ -513,7 +646,12 @@ static void cxl_setup_psl_timebase(struct cxl *adapter, struct pci_dev *dev)
return;
}
-static int init_implementation_afu_psl_regs(struct cxl_afu *afu)
+static int init_implementation_afu_regs_psl9(struct cxl_afu *afu)
+{
+ return 0;
+}
+
+static int init_implementation_afu_regs_psl8(struct cxl_afu *afu)
{
/* read/write masks for this slice */
cxl_p1n_write(afu, CXL_PSL_APCALLOC_A, 0xFFFFFFFEFEFEFEFEULL);
@@ -611,7 +749,7 @@ static int setup_cxl_bars(struct pci_dev *dev)
/*
* BAR 4/5 has a special meaning for CXL and must be programmed with a
* special value corresponding to the CXL protocol address range.
- * For POWER 8 that means bits 48:49 must be set to 10
+ * For POWER 8/9 that means bits 48:49 must be set to 10
*/
pci_write_config_dword(dev, PCI_BASE_ADDRESS_4, 0x00000000);
pci_write_config_dword(dev, PCI_BASE_ADDRESS_5, 0x00020000);
@@ -968,7 +1106,7 @@ static int cxl_afu_descriptor_looks_ok(struct cxl_afu *afu)
}
if (afu->pp_psa && (afu->pp_size < PAGE_SIZE))
- dev_warn(&afu->dev, "AFU uses < PAGE_SIZE per-process PSA!");
+ dev_warn(&afu->dev, "AFU uses pp_size(%#016llx) < PAGE_SIZE per-process PSA!\n", afu->pp_size);
for (i = 0; i < afu->crs_num; i++) {
rc = cxl_ops->afu_cr_read32(afu, i, 0, &val);
@@ -996,7 +1134,53 @@ static int cxl_afu_descriptor_looks_ok(struct cxl_afu *afu)
return 0;
}
-static int sanitise_afu_regs(struct cxl_afu *afu)
+static int sanitise_afu_regs_psl9(struct cxl_afu *afu)
+{
+ u64 reg;
+
+ /*
+ * Clear out any regs that contain either an IVTE or address or may be
+ * waiting on an acknowledgment to try to be a bit safer as we bring
+ * it online
+ */
+ reg = cxl_p2n_read(afu, CXL_AFU_Cntl_An);
+ if ((reg & CXL_AFU_Cntl_An_ES_MASK) != CXL_AFU_Cntl_An_ES_Disabled) {
+ dev_warn(&afu->dev, "WARNING: AFU was not disabled: %#016llx\n", reg);
+ if (cxl_ops->afu_reset(afu))
+ return -EIO;
+ if (cxl_afu_disable(afu))
+ return -EIO;
+ if (cxl_psl_purge(afu))
+ return -EIO;
+ }
+ cxl_p1n_write(afu, CXL_PSL_SPAP_An, 0x0000000000000000);
+ cxl_p1n_write(afu, CXL_PSL_AMBAR_An, 0x0000000000000000);
+ reg = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
+ if (reg) {
+ dev_warn(&afu->dev, "AFU had pending DSISR: %#016llx\n", reg);
+ if (reg & CXL_PSL9_DSISR_An_TF)
+ cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_AE);
+ else
+ cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_A);
+ }
+ if (afu->adapter->native->sl_ops->register_serr_irq) {
+ reg = cxl_p1n_read(afu, CXL_PSL_SERR_An);
+ if (reg) {
+ if (reg & ~0x000000007fffffff)
+ dev_warn(&afu->dev, "AFU had pending SERR: %#016llx\n", reg);
+ cxl_p1n_write(afu, CXL_PSL_SERR_An, reg & ~0xffff);
+ }
+ }
+ reg = cxl_p2n_read(afu, CXL_PSL_ErrStat_An);
+ if (reg) {
+ dev_warn(&afu->dev, "AFU had pending error status: %#016llx\n", reg);
+ cxl_p2n_write(afu, CXL_PSL_ErrStat_An, reg);
+ }
+
+ return 0;
+}
+
+static int sanitise_afu_regs_psl8(struct cxl_afu *afu)
{
u64 reg;
@@ -1102,8 +1286,11 @@ static int pci_configure_afu(struct cxl_afu *afu, struct cxl *adapter, struct pc
if ((rc = pci_map_slice_regs(afu, adapter, dev)))
return rc;
- if ((rc = sanitise_afu_regs(afu)))
- goto err1;
+ if (adapter->native->sl_ops->sanitise_afu_regs) {
+ rc = adapter->native->sl_ops->sanitise_afu_regs(afu);
+ if (rc)
+ goto err1;
+ }
/* We need to reset the AFU before we can read the AFU descriptor */
if ((rc = cxl_ops->afu_reset(afu)))
@@ -1248,8 +1435,13 @@ int cxl_pci_reset(struct cxl *adapter)
dev_info(&dev->dev, "CXL reset\n");
- /* the adapter is about to be reset, so ignore errors */
- cxl_data_cache_flush(adapter);
+ /*
+ * The adapter is about to be reset, so ignore errors.
+ * Not supported on P9 DD1
+ */
+ if ((cxl_is_power8()) ||
+ ((cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1))))
+ cxl_data_cache_flush(adapter);
/* pcie_warm_reset requests a fundamental pci reset which includes a
* PERST assert/deassert. PERST triggers a loading of the image
@@ -1332,6 +1524,7 @@ static int cxl_read_vsec(struct cxl *adapter, struct pci_dev *dev)
CXL_READ_VSEC_IMAGE_STATE(dev, vsec, &image_state);
adapter->user_image_loaded = !!(image_state & CXL_VSEC_USER_IMAGE_LOADED);
adapter->perst_select_user = !!(image_state & CXL_VSEC_USER_IMAGE_LOADED);
+ adapter->perst_loads_image = !!(image_state & CXL_VSEC_PERST_LOADS_IMAGE);
CXL_READ_VSEC_NAFUS(dev, vsec, &adapter->slices);
CXL_READ_VSEC_AFU_DESC_OFF(dev, vsec, &afu_desc_off);
@@ -1378,6 +1571,17 @@ static void cxl_fixup_malformed_tlp(struct cxl *adapter, struct pci_dev *dev)
pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, data);
}
+static bool cxl_compatible_caia_version(struct cxl *adapter)
+{
+ if (cxl_is_power8() && (adapter->caia_major == 1))
+ return true;
+
+ if (cxl_is_power9() && (adapter->caia_major == 2))
+ return true;
+
+ return false;
+}
+
static int cxl_vsec_looks_ok(struct cxl *adapter, struct pci_dev *dev)
{
if (adapter->vsec_status & CXL_STATUS_SECOND_PORT)
@@ -1388,6 +1592,12 @@ static int cxl_vsec_looks_ok(struct cxl *adapter, struct pci_dev *dev)
return -EINVAL;
}
+ if (!cxl_compatible_caia_version(adapter)) {
+ dev_info(&dev->dev, "Ignoring card. PSL type is not supported (caia version: %d)\n",
+ adapter->caia_major);
+ return -ENODEV;
+ }
+
if (!adapter->slices) {
/* Once we support dynamic reprogramming we can use the card if
* it supports loadable AFUs */
@@ -1431,9 +1641,19 @@ static void cxl_release_adapter(struct device *dev)
static int sanitise_adapter_regs(struct cxl *adapter)
{
+ int rc = 0;
+
/* Clear PSL tberror bit by writing 1 to it */
cxl_p1_write(adapter, CXL_PSL_ErrIVTE, CXL_PSL_ErrIVTE_tberror);
- return cxl_tlb_slb_invalidate(adapter);
+
+ if (adapter->native->sl_ops->invalidate_all) {
+ /* do not invalidate ERAT entries when not reloading on PERST */
+ if (cxl_is_power9() && (adapter->perst_loads_image))
+ return 0;
+ rc = adapter->native->sl_ops->invalidate_all(adapter);
+ }
+
+ return rc;
}
/* This should contain *only* operations that can safely be done in
@@ -1496,8 +1716,6 @@ static int cxl_configure_adapter(struct cxl *adapter, struct pci_dev *dev)
if ((rc = cxl_native_register_psl_err_irq(adapter)))
goto err;
- /* Release the context lock as adapter is configured */
- cxl_adapter_context_unlock(adapter);
return 0;
err:
@@ -1516,25 +1734,65 @@ static void cxl_deconfigure_adapter(struct cxl *adapter)
pci_disable_device(pdev);
}
-static const struct cxl_service_layer_ops psl_ops = {
- .adapter_regs_init = init_implementation_adapter_psl_regs,
- .afu_regs_init = init_implementation_afu_psl_regs,
+static const struct cxl_service_layer_ops psl9_ops = {
+ .adapter_regs_init = init_implementation_adapter_regs_psl9,
+ .invalidate_all = cxl_invalidate_all_psl9,
+ .afu_regs_init = init_implementation_afu_regs_psl9,
+ .sanitise_afu_regs = sanitise_afu_regs_psl9,
.register_serr_irq = cxl_native_register_serr_irq,
.release_serr_irq = cxl_native_release_serr_irq,
- .debugfs_add_adapter_sl_regs = cxl_debugfs_add_adapter_psl_regs,
- .debugfs_add_afu_sl_regs = cxl_debugfs_add_afu_psl_regs,
- .psl_irq_dump_registers = cxl_native_psl_irq_dump_regs,
+ .handle_interrupt = cxl_irq_psl9,
+ .fail_irq = cxl_fail_irq_psl,
+ .activate_dedicated_process = cxl_activate_dedicated_process_psl9,
+ .attach_afu_directed = cxl_attach_afu_directed_psl9,
+ .attach_dedicated_process = cxl_attach_dedicated_process_psl9,
+ .update_dedicated_ivtes = cxl_update_dedicated_ivtes_psl9,
+ .debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl9,
+ .debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl9,
+ .psl_irq_dump_registers = cxl_native_irq_dump_regs_psl9,
.err_irq_dump_registers = cxl_native_err_irq_dump_regs,
- .debugfs_stop_trace = cxl_stop_trace,
- .write_timebase_ctrl = write_timebase_ctrl_psl,
- .timebase_read = timebase_read_psl,
+ .debugfs_stop_trace = cxl_stop_trace_psl9,
+ .write_timebase_ctrl = write_timebase_ctrl_psl9,
+ .timebase_read = timebase_read_psl9,
+ .capi_mode = OPAL_PHB_CAPI_MODE_CAPI,
+ .needs_reset_before_disable = true,
+};
+
+static const struct cxl_service_layer_ops psl8_ops = {
+ .adapter_regs_init = init_implementation_adapter_regs_psl8,
+ .invalidate_all = cxl_invalidate_all_psl8,
+ .afu_regs_init = init_implementation_afu_regs_psl8,
+ .sanitise_afu_regs = sanitise_afu_regs_psl8,
+ .register_serr_irq = cxl_native_register_serr_irq,
+ .release_serr_irq = cxl_native_release_serr_irq,
+ .handle_interrupt = cxl_irq_psl8,
+ .fail_irq = cxl_fail_irq_psl,
+ .activate_dedicated_process = cxl_activate_dedicated_process_psl8,
+ .attach_afu_directed = cxl_attach_afu_directed_psl8,
+ .attach_dedicated_process = cxl_attach_dedicated_process_psl8,
+ .update_dedicated_ivtes = cxl_update_dedicated_ivtes_psl8,
+ .debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_psl8,
+ .debugfs_add_afu_regs = cxl_debugfs_add_afu_regs_psl8,
+ .psl_irq_dump_registers = cxl_native_irq_dump_regs_psl8,
+ .err_irq_dump_registers = cxl_native_err_irq_dump_regs,
+ .debugfs_stop_trace = cxl_stop_trace_psl8,
+ .write_timebase_ctrl = write_timebase_ctrl_psl8,
+ .timebase_read = timebase_read_psl8,
.capi_mode = OPAL_PHB_CAPI_MODE_CAPI,
.needs_reset_before_disable = true,
};
static const struct cxl_service_layer_ops xsl_ops = {
- .adapter_regs_init = init_implementation_adapter_xsl_regs,
- .debugfs_add_adapter_sl_regs = cxl_debugfs_add_adapter_xsl_regs,
+ .adapter_regs_init = init_implementation_adapter_regs_xsl,
+ .invalidate_all = cxl_invalidate_all_psl8,
+ .sanitise_afu_regs = sanitise_afu_regs_psl8,
+ .handle_interrupt = cxl_irq_psl8,
+ .fail_irq = cxl_fail_irq_psl,
+ .activate_dedicated_process = cxl_activate_dedicated_process_psl8,
+ .attach_afu_directed = cxl_attach_afu_directed_psl8,
+ .attach_dedicated_process = cxl_attach_dedicated_process_psl8,
+ .update_dedicated_ivtes = cxl_update_dedicated_ivtes_psl8,
+ .debugfs_add_adapter_regs = cxl_debugfs_add_adapter_regs_xsl,
.write_timebase_ctrl = write_timebase_ctrl_xsl,
.timebase_read = timebase_read_xsl,
.capi_mode = OPAL_PHB_CAPI_MODE_DMA,
@@ -1548,8 +1806,13 @@ static void set_sl_ops(struct cxl *adapter, struct pci_dev *dev)
adapter->native->sl_ops = &xsl_ops;
adapter->min_pe = 1; /* Workaround for CX-4 hardware bug */
} else {
- dev_info(&dev->dev, "Device uses a PSL\n");
- adapter->native->sl_ops = &psl_ops;
+ if (cxl_is_power8()) {
+ dev_info(&dev->dev, "Device uses a PSL8\n");
+ adapter->native->sl_ops = &psl8_ops;
+ } else {
+ dev_info(&dev->dev, "Device uses a PSL9\n");
+ adapter->native->sl_ops = &psl9_ops;
+ }
}
}
@@ -1596,6 +1859,9 @@ static struct cxl *cxl_pci_init_adapter(struct pci_dev *dev)
if ((rc = cxl_sysfs_adapter_add(adapter)))
goto err_put1;
+ /* Release the context lock as adapter is configured */
+ cxl_adapter_context_unlock(adapter);
+
return adapter;
err_put1:
@@ -1619,8 +1885,13 @@ static void cxl_pci_remove_adapter(struct cxl *adapter)
cxl_sysfs_adapter_remove(adapter);
cxl_debugfs_adapter_remove(adapter);
- /* Flush adapter datacache as its about to be removed */
- cxl_data_cache_flush(adapter);
+ /*
+ * Flush adapter datacache as its about to be removed.
+ * Not supported on P9 DD1.
+ */
+ if ((cxl_is_power8()) ||
+ ((cxl_is_power9() && !cpu_has_feature(CPU_FTR_POWER9_DD1))))
+ cxl_data_cache_flush(adapter);
cxl_deconfigure_adapter(adapter);
@@ -1704,6 +1975,11 @@ static int cxl_probe(struct pci_dev *dev, const struct pci_device_id *id)
return -ENODEV;
}
+ if (cxl_is_power9() && !radix_enabled()) {
+ dev_info(&dev->dev, "Only Radix mode supported\n");
+ return -ENODEV;
+ }
+
if (cxl_verbose)
dump_cxl_config_space(dev);
@@ -1781,7 +2057,7 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
{
struct cxl *adapter = pci_get_drvdata(pdev);
struct cxl_afu *afu;
- pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET;
+ pci_ers_result_t result = PCI_ERS_RESULT_NEED_RESET, afu_result;
int i;
/* At this point, we could still have an interrupt pending.
@@ -1885,16 +2161,26 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
for (i = 0; i < adapter->slices; i++) {
afu = adapter->afu[i];
- result = cxl_vphb_error_detected(afu, state);
-
- /* Only continue if everyone agrees on NEED_RESET */
- if (result != PCI_ERS_RESULT_NEED_RESET)
- return result;
+ afu_result = cxl_vphb_error_detected(afu, state);
cxl_context_detach_all(afu);
cxl_ops->afu_deactivate_mode(afu, afu->current_mode);
pci_deconfigure_afu(afu);
+
+ /* Disconnect trumps all, NONE trumps NEED_RESET */
+ if (afu_result == PCI_ERS_RESULT_DISCONNECT)
+ result = PCI_ERS_RESULT_DISCONNECT;
+ else if ((afu_result == PCI_ERS_RESULT_NONE) &&
+ (result == PCI_ERS_RESULT_NEED_RESET))
+ result = PCI_ERS_RESULT_NONE;
}
+
+ /* should take the context lock here */
+ if (cxl_adapter_context_lock(adapter) != 0)
+ dev_warn(&adapter->dev,
+ "Couldn't take context lock with %d active-contexts\n",
+ atomic_read(&adapter->contexts_num));
+
cxl_deconfigure_adapter(adapter);
return result;
@@ -1913,6 +2199,13 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
if (cxl_configure_adapter(adapter, pdev))
goto err;
+ /*
+ * Unlock context activation for the adapter. Ideally this should be
+ * done in cxl_pci_resume but cxlflash module tries to activate the
+ * master context as part of slot_reset callback.
+ */
+ cxl_adapter_context_unlock(adapter);
+
for (i = 0; i < adapter->slices; i++) {
afu = adapter->afu[i];
diff --git a/drivers/misc/cxl/trace.h b/drivers/misc/cxl/trace.h
index 751d6119683e..b8e300af0e55 100644
--- a/drivers/misc/cxl/trace.h
+++ b/drivers/misc/cxl/trace.h
@@ -17,6 +17,15 @@
#include "cxl.h"
+#define dsisr_psl9_flags(flags) \
+ __print_flags(flags, "|", \
+ { CXL_PSL9_DSISR_An_CO_MASK, "FR" }, \
+ { CXL_PSL9_DSISR_An_TF, "TF" }, \
+ { CXL_PSL9_DSISR_An_PE, "PE" }, \
+ { CXL_PSL9_DSISR_An_AE, "AE" }, \
+ { CXL_PSL9_DSISR_An_OC, "OC" }, \
+ { CXL_PSL9_DSISR_An_S, "S" })
+
#define DSISR_FLAGS \
{ CXL_PSL_DSISR_An_DS, "DS" }, \
{ CXL_PSL_DSISR_An_DM, "DM" }, \
@@ -154,6 +163,40 @@ TRACE_EVENT(cxl_afu_irq,
)
);
+TRACE_EVENT(cxl_psl9_irq,
+ TP_PROTO(struct cxl_context *ctx, int irq, u64 dsisr, u64 dar),
+
+ TP_ARGS(ctx, irq, dsisr, dar),
+
+ TP_STRUCT__entry(
+ __field(u8, card)
+ __field(u8, afu)
+ __field(u16, pe)
+ __field(int, irq)
+ __field(u64, dsisr)
+ __field(u64, dar)
+ ),
+
+ TP_fast_assign(
+ __entry->card = ctx->afu->adapter->adapter_num;
+ __entry->afu = ctx->afu->slice;
+ __entry->pe = ctx->pe;
+ __entry->irq = irq;
+ __entry->dsisr = dsisr;
+ __entry->dar = dar;
+ ),
+
+ TP_printk("afu%i.%i pe=%i irq=%i dsisr=0x%016llx dsisr=%s dar=0x%016llx",
+ __entry->card,
+ __entry->afu,
+ __entry->pe,
+ __entry->irq,
+ __entry->dsisr,
+ dsisr_psl9_flags(__entry->dsisr),
+ __entry->dar
+ )
+);
+
TRACE_EVENT(cxl_psl_irq,
TP_PROTO(struct cxl_context *ctx, int irq, u64 dsisr, u64 dar),
diff --git a/drivers/of/base.c b/drivers/of/base.c
index d7c4629a3a2d..0ea16bd3c8f1 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -1213,6 +1213,37 @@ int of_property_read_u32_index(const struct device_node *np,
EXPORT_SYMBOL_GPL(of_property_read_u32_index);
/**
+ * of_property_read_u64_index - Find and read a u64 from a multi-value property.
+ *
+ * @np: device node from which the property value is to be read.
+ * @propname: name of the property to be searched.
+ * @index: index of the u64 in the list of values
+ * @out_value: pointer to return value, modified only if no error.
+ *
+ * Search for a property in a device node and read nth 64-bit value from
+ * it. Returns 0 on success, -EINVAL if the property does not exist,
+ * -ENODATA if property does not have a value, and -EOVERFLOW if the
+ * property data isn't large enough.
+ *
+ * The out_value is modified only if a valid u64 value can be decoded.
+ */
+int of_property_read_u64_index(const struct device_node *np,
+ const char *propname,
+ u32 index, u64 *out_value)
+{
+ const u64 *val = of_find_property_value_of_size(np, propname,
+ ((index + 1) * sizeof(*out_value)),
+ 0, NULL);
+
+ if (IS_ERR(val))
+ return PTR_ERR(val);
+
+ *out_value = be64_to_cpup(((__be64 *)val) + index);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(of_property_read_u64_index);
+
+/**
* of_property_read_variable_u8_array - Find and read an array of u8 from a
* property, with bounds on the minimum and maximum array size.
*
diff --git a/drivers/pcmcia/electra_cf.c b/drivers/pcmcia/electra_cf.c
index 4d7bc3f4124a..c6fe2a4a7a6a 100644
--- a/drivers/pcmcia/electra_cf.c
+++ b/drivers/pcmcia/electra_cf.c
@@ -207,7 +207,7 @@ static int electra_cf_probe(struct platform_device *ofdev)
return -ENOMEM;
setup_timer(&cf->timer, electra_cf_timer, (unsigned long)cf);
- cf->irq = NO_IRQ;
+ cf->irq = 0;
cf->ofdev = ofdev;
cf->mem_phys = mem.start;
@@ -313,7 +313,7 @@ fail3:
fail2:
release_mem_region(cf->mem_phys, cf->mem_size);
fail1:
- if (cf->irq != NO_IRQ)
+ if (cf->irq)
free_irq(cf->irq, cf);
if (cf->io_virt)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 78dca1aa6410..63112c36ab2d 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -685,7 +685,7 @@ static void tce_iommu_free_table(struct tce_container *container,
unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
tce_iommu_userspace_view_free(tbl, container->mm);
- tbl->it_ops->free(tbl);
+ iommu_tce_table_put(tbl);
decrement_locked_vm(container->mm, pages);
}
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 47e4da5b4fa2..30f90c1a0aaf 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -381,6 +381,7 @@ static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void)
return this_cpu_ptr(&kprobe_ctlblk);
}
+kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset);
int register_kprobe(struct kprobe *p);
void unregister_kprobe(struct kprobe *p);
int register_kprobes(struct kprobe **kps, int num);
diff --git a/include/linux/of.h b/include/linux/of.h
index e5d4225fda35..50fcdb54087f 100644
--- a/include/linux/of.h
+++ b/include/linux/of.h
@@ -294,6 +294,9 @@ extern int of_property_count_elems_of_size(const struct device_node *np,
extern int of_property_read_u32_index(const struct device_node *np,
const char *propname,
u32 index, u32 *out_value);
+extern int of_property_read_u64_index(const struct device_node *np,
+ const char *propname,
+ u32 index, u64 *out_value);
extern int of_property_read_variable_u8_array(const struct device_node *np,
const char *propname, u8 *out_values,
size_t sz_min, size_t sz_max);
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index d09a9cd021b1..b1c0b187acfe 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -922,6 +922,7 @@ enum perf_callchain_context {
#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */
#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
union perf_mem_data_src {
__u64 val;
struct {
@@ -933,6 +934,21 @@ union perf_mem_data_src {
mem_rsvd:31;
};
};
+#elif defined(__BIG_ENDIAN_BITFIELD)
+union perf_mem_data_src {
+ __u64 val;
+ struct {
+ __u64 mem_rsvd:31,
+ mem_dtlb:7, /* tlb access */
+ mem_lock:2, /* lock instr */
+ mem_snoop:5, /* snoop mode */
+ mem_lvl:14, /* memory hierarchy level */
+ mem_op:5; /* type of opcode */
+ };
+};
+#else
+#error "Unknown endianness"
+#endif
/* type of opcode (load/store/prefetch,code) */
#define PERF_MEM_OP_NA 0x01 /* not available */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d733479a10ee..7367e0ec6f81 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -58,15 +58,6 @@
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
-/*
- * Some oddball architectures like 64bit powerpc have function descriptors
- * so this must be overridable.
- */
-#ifndef kprobe_lookup_name
-#define kprobe_lookup_name(name, addr) \
- addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
-#endif
-
static int kprobes_initialized;
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
@@ -81,6 +72,12 @@ static struct {
raw_spinlock_t lock ____cacheline_aligned_in_smp;
} kretprobe_table_locks[KPROBE_TABLE_SIZE];
+kprobe_opcode_t * __weak kprobe_lookup_name(const char *name,
+ unsigned int __unused)
+{
+ return ((kprobe_opcode_t *)(kallsyms_lookup_name(name)));
+}
+
static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
{
return &(kretprobe_table_locks[hash].lock);
@@ -746,13 +743,20 @@ static void kill_optimized_kprobe(struct kprobe *p)
arch_remove_optimized_kprobe(op);
}
+static inline
+void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p)
+{
+ if (!kprobe_ftrace(p))
+ arch_prepare_optimized_kprobe(op, p);
+}
+
/* Try to prepare optimized instructions */
static void prepare_optimized_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
op = container_of(p, struct optimized_kprobe, kp);
- arch_prepare_optimized_kprobe(op, p);
+ __prepare_optimized_kprobe(op, p);
}
/* Allocate new optimized_kprobe and try to prepare optimized instructions */
@@ -766,7 +770,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
INIT_LIST_HEAD(&op->list);
op->kp.addr = p->addr;
- arch_prepare_optimized_kprobe(op, p);
+ __prepare_optimized_kprobe(op, p);
return &op->kp;
}
@@ -1398,7 +1402,7 @@ static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr,
goto invalid;
if (symbol_name) {
- kprobe_lookup_name(symbol_name, addr);
+ addr = kprobe_lookup_name(symbol_name, offset);
if (!addr)
return ERR_PTR(-ENOENT);
}
@@ -2218,8 +2222,8 @@ static int __init init_kprobes(void)
if (kretprobe_blacklist_size) {
/* lookup the function address from its name */
for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
- kprobe_lookup_name(kretprobe_blacklist[i].name,
- kretprobe_blacklist[i].addr);
+ kretprobe_blacklist[i].addr =
+ kprobe_lookup_name(kretprobe_blacklist[i].name, 0);
if (!kretprobe_blacklist[i].addr)
printk("kretprobe: lookup failed: %s\n",
kretprobe_blacklist[i].name);
diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h
index d09a9cd021b1..b1c0b187acfe 100644
--- a/tools/include/uapi/linux/perf_event.h
+++ b/tools/include/uapi/linux/perf_event.h
@@ -922,6 +922,7 @@ enum perf_callchain_context {
#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */
#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */
+#if defined(__LITTLE_ENDIAN_BITFIELD)
union perf_mem_data_src {
__u64 val;
struct {
@@ -933,6 +934,21 @@ union perf_mem_data_src {
mem_rsvd:31;
};
};
+#elif defined(__BIG_ENDIAN_BITFIELD)
+union perf_mem_data_src {
+ __u64 val;
+ struct {
+ __u64 mem_rsvd:31,
+ mem_dtlb:7, /* tlb access */
+ mem_lock:2, /* lock instr */
+ mem_snoop:5, /* snoop mode */
+ mem_lvl:14, /* memory hierarchy level */
+ mem_op:5; /* type of opcode */
+ };
+};
+#else
+#error "Unknown endianness"
+#endif
/* type of opcode (load/store/prefetch,code) */
#define PERF_MEM_OP_NA 0x01 /* not available */
diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
index 2132ff8eb4e7..03e1617367d3 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -14,6 +14,7 @@ export CFLAGS
SUB_DIRS = alignment \
benchmarks \
+ cache_shape \
copyloops \
context_switch \
dscr \
diff --git a/tools/testing/selftests/powerpc/cache_shape/.gitignore b/tools/testing/selftests/powerpc/cache_shape/.gitignore
new file mode 100644
index 000000000000..ec1848434be5
--- /dev/null
+++ b/tools/testing/selftests/powerpc/cache_shape/.gitignore
@@ -0,0 +1 @@
+cache_shape
diff --git a/tools/testing/selftests/powerpc/cache_shape/Makefile b/tools/testing/selftests/powerpc/cache_shape/Makefile
new file mode 100644
index 000000000000..b24485ab30e2
--- /dev/null
+++ b/tools/testing/selftests/powerpc/cache_shape/Makefile
@@ -0,0 +1,10 @@
+TEST_PROGS := cache_shape
+
+all: $(TEST_PROGS)
+
+$(TEST_PROGS): ../harness.c ../utils.c
+
+include ../../lib.mk
+
+clean:
+ rm -f $(TEST_PROGS) *.o
diff --git a/tools/testing/selftests/powerpc/cache_shape/cache_shape.c b/tools/testing/selftests/powerpc/cache_shape/cache_shape.c
new file mode 100644
index 000000000000..29ec07eba7f9
--- /dev/null
+++ b/tools/testing/selftests/powerpc/cache_shape/cache_shape.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright 2017, Michael Ellerman, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <elf.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <link.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "utils.h"
+
+#ifndef AT_L1I_CACHESIZE
+#define AT_L1I_CACHESIZE 40
+#define AT_L1I_CACHEGEOMETRY 41
+#define AT_L1D_CACHESIZE 42
+#define AT_L1D_CACHEGEOMETRY 43
+#define AT_L2_CACHESIZE 44
+#define AT_L2_CACHEGEOMETRY 45
+#define AT_L3_CACHESIZE 46
+#define AT_L3_CACHEGEOMETRY 47
+#endif
+
+static void print_size(const char *label, uint32_t val)
+{
+ printf("%s cache size: %#10x %10dB %10dK\n", label, val, val, val / 1024);
+}
+
+static void print_geo(const char *label, uint32_t val)
+{
+ uint16_t assoc;
+
+ printf("%s line size: %#10x ", label, val & 0xFFFF);
+
+ assoc = val >> 16;
+ if (assoc)
+ printf("%u-way", assoc);
+ else
+ printf("fully");
+
+ printf(" associative\n");
+}
+
+static int test_cache_shape()
+{
+ static char buffer[4096];
+ ElfW(auxv_t) *p;
+ int found;
+
+ FAIL_IF(read_auxv(buffer, sizeof(buffer)));
+
+ found = 0;
+
+ p = find_auxv_entry(AT_L1I_CACHESIZE, buffer);
+ if (p) {
+ found++;
+ print_size("L1I ", (uint32_t)p->a_un.a_val);
+ }
+
+ p = find_auxv_entry(AT_L1I_CACHEGEOMETRY, buffer);
+ if (p) {
+ found++;
+ print_geo("L1I ", (uint32_t)p->a_un.a_val);
+ }
+
+ p = find_auxv_entry(AT_L1D_CACHESIZE, buffer);
+ if (p) {
+ found++;
+ print_size("L1D ", (uint32_t)p->a_un.a_val);
+ }
+
+ p = find_auxv_entry(AT_L1D_CACHEGEOMETRY, buffer);
+ if (p) {
+ found++;
+ print_geo("L1D ", (uint32_t)p->a_un.a_val);
+ }
+
+ p = find_auxv_entry(AT_L2_CACHESIZE, buffer);
+ if (p) {
+ found++;
+ print_size("L2 ", (uint32_t)p->a_un.a_val);
+ }
+
+ p = find_auxv_entry(AT_L2_CACHEGEOMETRY, buffer);
+ if (p) {
+ found++;
+ print_geo("L2 ", (uint32_t)p->a_un.a_val);
+ }
+
+ p = find_auxv_entry(AT_L3_CACHESIZE, buffer);
+ if (p) {
+ found++;
+ print_size("L3 ", (uint32_t)p->a_un.a_val);
+ }
+
+ p = find_auxv_entry(AT_L3_CACHEGEOMETRY, buffer);
+ if (p) {
+ found++;
+ print_geo("L3 ", (uint32_t)p->a_un.a_val);
+ }
+
+ /* If we found none we're probably on a system where they don't exist */
+ SKIP_IF(found == 0);
+
+ /* But if we found any, we expect to find them all */
+ FAIL_IF(found != 8);
+
+ return 0;
+}
+
+int main(void)
+{
+ return test_harness(test_cache_shape, "cache_shape");
+}
diff --git a/tools/testing/selftests/powerpc/include/utils.h b/tools/testing/selftests/powerpc/include/utils.h
index 53405e8a52ab..735815b3ad7f 100644
--- a/tools/testing/selftests/powerpc/include/utils.h
+++ b/tools/testing/selftests/powerpc/include/utils.h
@@ -24,7 +24,11 @@ typedef uint8_t u8;
void test_harness_set_timeout(uint64_t time);
int test_harness(int (test_function)(void), char *name);
-extern void *get_auxv_entry(int type);
+
+int read_auxv(char *buf, ssize_t buf_size);
+void *find_auxv_entry(int type, char *auxv);
+void *get_auxv_entry(int type);
+
int pick_online_cpu(void);
static inline bool have_hwcap(unsigned long ftr)
diff --git a/tools/testing/selftests/powerpc/utils.c b/tools/testing/selftests/powerpc/utils.c
index dcf74184bfd0..d46916867a6f 100644
--- a/tools/testing/selftests/powerpc/utils.c
+++ b/tools/testing/selftests/powerpc/utils.c
@@ -19,45 +19,64 @@
static char auxv[4096];
-void *get_auxv_entry(int type)
+int read_auxv(char *buf, ssize_t buf_size)
{
- ElfW(auxv_t) *p;
- void *result;
ssize_t num;
- int fd;
+ int rc, fd;
fd = open("/proc/self/auxv", O_RDONLY);
if (fd == -1) {
perror("open");
- return NULL;
+ return -errno;
}
- result = NULL;
-
- num = read(fd, auxv, sizeof(auxv));
+ num = read(fd, buf, buf_size);
if (num < 0) {
perror("read");
+ rc = -EIO;
goto out;
}
- if (num > sizeof(auxv)) {
- printf("Overflowed auxv buffer\n");
+ if (num > buf_size) {
+ printf("overflowed auxv buffer\n");
+ rc = -EOVERFLOW;
goto out;
}
+ rc = 0;
+out:
+ close(fd);
+ return rc;
+}
+
+void *find_auxv_entry(int type, char *auxv)
+{
+ ElfW(auxv_t) *p;
+
p = (ElfW(auxv_t) *)auxv;
while (p->a_type != AT_NULL) {
- if (p->a_type == type) {
- result = (void *)p->a_un.a_val;
- break;
- }
+ if (p->a_type == type)
+ return p;
p++;
}
-out:
- close(fd);
- return result;
+
+ return NULL;
+}
+
+void *get_auxv_entry(int type)
+{
+ ElfW(auxv_t) *p;
+
+ if (read_auxv(auxv, sizeof(auxv)))
+ return NULL;
+
+ p = find_auxv_entry(type, auxv);
+ if (p)
+ return (void *)p->a_un.a_val;
+
+ return NULL;
}
int pick_online_cpu(void)